xref: /linux/kernel/sched/ext.c (revision 06e51be3d5e7a07aea5c9012773df8d5de01db6c)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
4  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
5  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
6  */
7 #define SCX_OP_IDX(op)		(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
8 
9 enum scx_consts {
10 	SCX_DSP_DFL_MAX_BATCH		= 32,
11 	SCX_DSP_MAX_LOOPS		= 32,
12 	SCX_WATCHDOG_MAX_TIMEOUT	= 30 * HZ,
13 
14 	SCX_EXIT_BT_LEN			= 64,
15 	SCX_EXIT_MSG_LEN		= 1024,
16 	SCX_EXIT_DUMP_DFL_LEN		= 32768,
17 };
18 
19 enum scx_exit_kind {
20 	SCX_EXIT_NONE,
21 	SCX_EXIT_DONE,
22 
23 	SCX_EXIT_UNREG = 64,	/* user-space initiated unregistration */
24 	SCX_EXIT_UNREG_BPF,	/* BPF-initiated unregistration */
25 	SCX_EXIT_UNREG_KERN,	/* kernel-initiated unregistration */
26 	SCX_EXIT_SYSRQ,		/* requested by 'S' sysrq */
27 
28 	SCX_EXIT_ERROR = 1024,	/* runtime error, error msg contains details */
29 	SCX_EXIT_ERROR_BPF,	/* ERROR but triggered through scx_bpf_error() */
30 	SCX_EXIT_ERROR_STALL,	/* watchdog detected stalled runnable tasks */
31 };
32 
33 /*
34  * An exit code can be specified when exiting with scx_bpf_exit() or
35  * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
36  * respectively. The codes are 64bit values with the following format:
37  *
38  *   Bits: [63  ..  48 47   ..  32 31 .. 0]
39  *         [ SYS ACT ] [ SYS RSN ] [ USR  ]
40  *
41  *   SYS ACT: System-defined exit actions
42  *   SYS RSN: System-defined exit reasons
43  *   USR    : User-defined exit codes and reasons
44  *
45  * Using the above, users may communicate intention and context by ORing system
46  * actions and/or system reasons with a user-defined exit code.
47  */
48 enum scx_exit_code {
49 	/* Reasons */
50 	SCX_ECODE_RSN_HOTPLUG	= 1LLU << 32,
51 
52 	/* Actions */
53 	SCX_ECODE_ACT_RESTART	= 1LLU << 48,
54 };
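/*
 * A hedged sketch of how a BPF scheduler might compose such an exit code when
 * shutting itself down, e.g. to ask its userspace loader to restart it after a
 * CPU hotplug event. scx_bpf_exit() is the wrapper provided by the example
 * schedulers' BPF headers; the user-defined code 42 is purely illustrative.
 *
 *	scx_bpf_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG | 42,
 *		     "cpu %d came online, restarting", cpu);
 *
 * The loader can then test the action/reason bits of scx_exit_info.exit_code
 * to decide whether to reload the scheduler.
 */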
55 
56 /*
57  * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
58  * being disabled.
59  */
60 struct scx_exit_info {
61 	/* %SCX_EXIT_* - broad category of the exit reason */
62 	enum scx_exit_kind	kind;
63 
64 	/* exit code if gracefully exiting */
65 	s64			exit_code;
66 
67 	/* textual representation of the above */
68 	const char		*reason;
69 
70 	/* backtrace if exiting due to an error */
71 	unsigned long		*bt;
72 	u32			bt_len;
73 
74 	/* informational message */
75 	char			*msg;
76 
77 	/* debug dump */
78 	char			*dump;
79 };
80 
81 /* sched_ext_ops.flags */
82 enum scx_ops_flags {
83 	/*
84 	 * Keep built-in idle tracking even if ops.update_idle() is implemented.
85 	 */
86 	SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
87 
88 	/*
89 	 * By default, if there are no other tasks to run on the CPU, the ext core
90 	 * keeps running the current task even after its slice expires. If this
91 	 * flag is specified, such tasks are passed to ops.enqueue() with
92 	 * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
93 	 */
94 	SCX_OPS_ENQ_LAST	= 1LLU << 1,
95 
96 	/*
97 	 * An exiting task may schedule after PF_EXITING is set. In such cases,
98 	 * bpf_task_from_pid() may not be able to find the task and if the BPF
99 	 * scheduler depends on pid lookup for dispatching, the task will be
100 	 * lost leading to various issues including RCU grace period stalls.
101 	 *
102 	 * To mask this problem, by default, unhashed tasks are automatically
103 	 * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
104 	 * depend on pid lookups and wants to handle these tasks directly, the
105 	 * following flag can be used.
106 	 */
107 	SCX_OPS_ENQ_EXITING	= 1LLU << 2,
108 
109 	/*
110 	 * If set, only tasks with policy set to SCHED_EXT are attached to
111 	 * sched_ext. If clear, SCHED_NORMAL tasks are also included.
112 	 */
113 	SCX_OPS_SWITCH_PARTIAL	= 1LLU << 3,
114 
115 	SCX_OPS_ALL_FLAGS	= SCX_OPS_KEEP_BUILTIN_IDLE |
116 				  SCX_OPS_ENQ_LAST |
117 				  SCX_OPS_ENQ_EXITING |
118 				  SCX_OPS_SWITCH_PARTIAL,
119 };
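/*
 * As an illustration, a BPF scheduler requests a combination of the above by
 * ORing them into the .flags member of its ops table (shown in full further
 * below), e.g.:
 *
 *	.flags = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_SWITCH_PARTIAL,
 */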
120 
121 /* argument container for ops.init_task() */
122 struct scx_init_task_args {
123 	/*
124 	 * Set if ops.init_task() is being invoked on the fork path, as opposed
125 	 * to the scheduler transition path.
126 	 */
127 	bool			fork;
128 };
129 
130 /* argument container for ops.exit_task() */
131 struct scx_exit_task_args {
132 	/* Whether the task exited before running on sched_ext. */
133 	bool cancelled;
134 };
135 
136 enum scx_cpu_preempt_reason {
137 	/* next task is being scheduled by &sched_class_rt */
138 	SCX_CPU_PREEMPT_RT,
139 	/* next task is being scheduled by &sched_class_dl */
140 	SCX_CPU_PREEMPT_DL,
141 	/* next task is being scheduled by &sched_class_stop */
142 	SCX_CPU_PREEMPT_STOP,
143 	/* unknown reason for SCX being preempted */
144 	SCX_CPU_PREEMPT_UNKNOWN,
145 };
146 
147 /*
148  * Argument container for ops->cpu_acquire(). Currently empty, but may be
149  * expanded in the future.
150  */
151 struct scx_cpu_acquire_args {};
152 
153 /* argument container for ops->cpu_release() */
154 struct scx_cpu_release_args {
155 	/* the reason the CPU was preempted */
156 	enum scx_cpu_preempt_reason reason;
157 
158 	/* the task that's going to be scheduled on the CPU */
159 	struct task_struct	*task;
160 };
161 
162 /*
163  * Informational context provided to dump operations.
164  */
165 struct scx_dump_ctx {
166 	enum scx_exit_kind	kind;
167 	s64			exit_code;
168 	const char		*reason;
169 	u64			at_ns;
170 	u64			at_jiffies;
171 };
172 
173 /**
174  * struct sched_ext_ops - Operation table for BPF scheduler implementation
175  *
176  * Userland can implement an arbitrary scheduling policy by implementing and
177  * loading operations in this table.
178  */
179 struct sched_ext_ops {
180 	/**
181 	 * select_cpu - Pick the target CPU for a task which is being woken up
182 	 * @p: task being woken up
183 	 * @prev_cpu: the cpu @p was on before sleeping
184 	 * @wake_flags: SCX_WAKE_*
185 	 *
186 	 * The decision made here isn't final. @p may be moved to any CPU while it
187 	 * is getting dispatched for execution later. However, as @p is not on
188 	 * the rq at this point, getting the eventual execution CPU right here
189 	 * saves a small bit of overhead down the line.
190 	 *
191 	 * If an idle CPU is returned, the CPU is kicked and will try to
192 	 * dispatch. While an explicit custom mechanism can be added,
193 	 * select_cpu() serves as the default way to wake up idle CPUs.
194 	 *
195 	 * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p
196 	 * is dispatched, the ops.enqueue() callback will be skipped. Finally,
197 	 * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the
198 	 * local DSQ of whatever CPU is returned by this callback.
199 	 */
200 	s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
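	/*
	 * A minimal sketch of an ops.select_cpu() implementation relying on the
	 * built-in idle tracking and direct dispatch, mirroring what the
	 * default behavior does. BPF_STRUCT_OPS() is a convenience macro from
	 * the example schedulers' headers and only illustrative here:
	 *
	 *	s32 BPF_STRUCT_OPS(my_select_cpu, struct task_struct *p,
	 *			   s32 prev_cpu, u64 wake_flags)
	 *	{
	 *		bool is_idle = false;
	 *		s32 cpu;
	 *
	 *		cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags,
	 *					     &is_idle);
	 *		if (is_idle)
	 *			// ops.enqueue() will be skipped for @p
	 *			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	 *		return cpu;
	 *	}
	 */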
201 
202 	/**
203 	 * enqueue - Enqueue a task on the BPF scheduler
204 	 * @p: task being enqueued
205 	 * @enq_flags: %SCX_ENQ_*
206 	 *
207 	 * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch()
208 	 * or enqueue @p on the BPF scheduler. If not directly dispatched, the BPF
209 	 * scheduler owns @p and if it fails to dispatch @p, the task will
210 	 * stall.
211 	 *
212 	 * If @p was dispatched from ops.select_cpu(), this callback is
213 	 * skipped.
214 	 */
215 	void (*enqueue)(struct task_struct *p, u64 enq_flags);
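	/*
	 * A hedged sketch of a global-FIFO ops.enqueue() which queues every
	 * task on the built-in global DSQ with the default slice. A real
	 * scheduler would more likely use custom DSQs created with
	 * scx_bpf_create_dsq():
	 *
	 *	void BPF_STRUCT_OPS(my_enqueue, struct task_struct *p,
	 *			    u64 enq_flags)
	 *	{
	 *		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL,
	 *				 enq_flags);
	 *	}
	 */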
216 
217 	/**
218 	 * dequeue - Remove a task from the BPF scheduler
219 	 * @p: task being dequeued
220 	 * @deq_flags: %SCX_DEQ_*
221 	 *
222 	 * Remove @p from the BPF scheduler. This is usually called to isolate
223 	 * the task while updating its scheduling properties (e.g. priority).
224 	 *
225 	 * The ext core keeps track of whether the BPF side owns a given task or
226 	 * not and can gracefully ignore spurious dispatches from BPF side,
227 	 * which makes it safe to not implement this method. However, depending
228 	 * on the scheduling logic, this can lead to confusing behaviors - e.g.
229 	 * scheduling position not being updated across a priority change.
230 	 */
231 	void (*dequeue)(struct task_struct *p, u64 deq_flags);
232 
233 	/**
234 	 * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs
235 	 * @cpu: CPU to dispatch tasks for
236 	 * @prev: previous task being switched out
237 	 *
238 	 * Called when a CPU's local dsq is empty. The operation should dispatch
239 	 * one or more tasks from the BPF scheduler into the DSQs using
240 	 * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using
241 	 * scx_bpf_consume().
242 	 *
243 	 * The maximum number of times scx_bpf_dispatch() can be called without
244 	 * an intervening scx_bpf_consume() is specified by
245 	 * ops.dispatch_max_batch. See the comments on top of the two functions
246 	 * for more details.
247 	 *
248 	 * When not %NULL, @prev is an SCX task with its slice depleted. If
249 	 * @prev is still runnable, as indicated by %SCX_TASK_QUEUED being set in
250 	 * @prev->scx.flags, it has not been enqueued yet and will be enqueued after
251 	 * ops.dispatch() returns. To keep executing @prev, return without
252 	 * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST.
253 	 */
254 	void (*dispatch)(s32 cpu, struct task_struct *prev);
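	/*
	 * A sketch of an ops.dispatch() which refills the local DSQ from a
	 * single shared user DSQ. MY_DSQ_ID is an illustrative constant the
	 * scheduler would have passed to scx_bpf_create_dsq() in ops.init().
	 * If the DSQ is empty, nothing is consumed and a still-runnable @prev
	 * keeps the CPU as described above:
	 *
	 *	void BPF_STRUCT_OPS(my_dispatch, s32 cpu, struct task_struct *prev)
	 *	{
	 *		scx_bpf_consume(MY_DSQ_ID);
	 *	}
	 */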
255 
256 	/**
257 	 * tick - Periodic tick
258 	 * @p: task running currently
259 	 *
260 	 * This operation is called every 1/HZ seconds on CPUs which are
261 	 * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
262 	 * immediate dispatch cycle on the CPU.
263 	 */
264 	void (*tick)(struct task_struct *p);
265 
266 	/**
267 	 * runnable - A task is becoming runnable on its associated CPU
268 	 * @p: task becoming runnable
269 	 * @enq_flags: %SCX_ENQ_*
270 	 *
271 	 * This and the following three functions can be used to track a task's
272 	 * execution state transitions. A task becomes ->runnable() on a CPU,
273 	 * and then goes through one or more ->running() and ->stopping() pairs
274 	 * as it runs on the CPU, and eventually becomes ->quiescent() when it's
275 	 * done running on the CPU.
276 	 *
277 	 * @p is becoming runnable on the CPU because it's
278 	 *
279 	 * - waking up (%SCX_ENQ_WAKEUP)
280 	 * - being moved from another CPU
281 	 * - being restored after having been temporarily taken off the queue for an
282 	 *   attribute change.
283 	 *
284 	 * This and ->enqueue() are related but not coupled. This operation
285 	 * notifies @p's state transition and may not be followed by ->enqueue()
286 	 * e.g. when @p is being dispatched to a remote CPU, or when @p is
287 	 * being enqueued on a CPU experiencing a hotplug event. Likewise, a
288 	 * task may be ->enqueue()'d without being preceded by this operation
289 	 * e.g. after exhausting its slice.
290 	 */
291 	void (*runnable)(struct task_struct *p, u64 enq_flags);
292 
293 	/**
294 	 * running - A task is starting to run on its associated CPU
295 	 * @p: task starting to run
296 	 *
297 	 * See ->runnable() for explanation on the task state notifiers.
298 	 */
299 	void (*running)(struct task_struct *p);
300 
301 	/**
302 	 * stopping - A task is stopping execution
303 	 * @p: task stopping execution
304 	 * @runnable: is task @p still runnable?
305 	 *
306 	 * See ->runnable() for explanation on the task state notifiers. If
307 	 * !@runnable, ->quiescent() will be invoked after this operation
308 	 * returns.
309 	 */
310 	void (*stopping)(struct task_struct *p, bool runnable);
311 
312 	/**
313 	 * quiescent - A task is becoming not runnable on its associated CPU
314 	 * @p: task becoming not runnable
315 	 * @deq_flags: %SCX_DEQ_*
316 	 *
317 	 * See ->runnable() for explanation on the task state notifiers.
318 	 *
319 	 * @p is becoming quiescent on the CPU because it's
320 	 *
321 	 * - sleeping (%SCX_DEQ_SLEEP)
322 	 * - being moved to another CPU
323 	 * - being temporarily taken off the queue for an attribute change
324 	 *   (%SCX_DEQ_SAVE)
325 	 *
326 	 * This and ->dequeue() are related but not coupled. This operation
327 	 * notifies @p's state transition and may not be preceded by ->dequeue()
328 	 * e.g. when @p is being dispatched to a remote CPU.
329 	 */
330 	void (*quiescent)(struct task_struct *p, u64 deq_flags);
331 
332 	/**
333 	 * yield - Yield CPU
334 	 * @from: yielding task
335 	 * @to: optional yield target task
336 	 *
337 	 * If @to is NULL, @from is yielding the CPU to other runnable tasks.
338 	 * The BPF scheduler should ensure that other available tasks are
339 	 * dispatched before the yielding task. Return value is ignored in this
340 	 * case.
341 	 *
342 	 * If @to is non-NULL, @from wants to yield the CPU to @to. If the BPF
343 	 * scheduler can implement the request, return %true; otherwise, %false.
344 	 */
345 	bool (*yield)(struct task_struct *from, struct task_struct *to);
346 
347 	/**
348 	 * core_sched_before - Task ordering for core-sched
349 	 * @a: task A
350 	 * @b: task B
351 	 *
352 	 * Used by core-sched to determine the ordering between two tasks. See
353 	 * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
354 	 * core-sched.
355 	 *
356 	 * Both @a and @b are runnable and may or may not currently be queued on
357 	 * the BPF scheduler. Should return %true if @a should run before @b.
358 	 * %false if there's no required ordering or @b should run before @a.
359 	 *
360 	 * If not specified, the default is ordering them according to when they
361 	 * became runnable.
362 	 */
363 	bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
364 
365 	/**
366 	 * set_weight - Set task weight
367 	 * @p: task to set weight for
368 	 * @weight: new weight [1..10000]
369 	 *
370 	 * Update @p's weight to @weight.
371 	 */
372 	void (*set_weight)(struct task_struct *p, u32 weight);
373 
374 	/**
375 	 * set_cpumask - Set CPU affinity
376 	 * @p: task to set CPU affinity for
377 	 * @cpumask: cpumask of cpus that @p can run on
378 	 *
379 	 * Update @p's CPU affinity to @cpumask.
380 	 */
381 	void (*set_cpumask)(struct task_struct *p,
382 			    const struct cpumask *cpumask);
383 
384 	/**
385 	 * update_idle - Update the idle state of a CPU
386 	 * @cpu: CPU to update the idle state for
387 	 * @idle: whether entering or exiting the idle state
388 	 *
389 	 * This operation is called when @cpu enters or leaves the idle
390 	 * state. By default, implementing this operation disables the built-in
391 	 * idle CPU tracking and the following helpers become unavailable:
392 	 *
393 	 * - scx_bpf_select_cpu_dfl()
394 	 * - scx_bpf_test_and_clear_cpu_idle()
395 	 * - scx_bpf_pick_idle_cpu()
396 	 *
397 	 * The user also must implement ops.select_cpu() as the default
398 	 * implementation relies on scx_bpf_select_cpu_dfl().
399 	 *
400 	 * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
401 	 * tracking.
402 	 */
403 	void (*update_idle)(s32 cpu, bool idle);
404 
405 	/**
406 	 * cpu_acquire - A CPU is becoming available to the BPF scheduler
407 	 * @cpu: The CPU being acquired by the BPF scheduler.
408 	 * @args: Acquire arguments, see the struct definition.
409 	 *
410 	 * A CPU that was previously released from the BPF scheduler is now once
411 	 * again under its control.
412 	 */
413 	void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
414 
415 	/**
416 	 * cpu_release - A CPU is taken away from the BPF scheduler
417 	 * @cpu: The CPU being released by the BPF scheduler.
418 	 * @args: Release arguments, see the struct definition.
419 	 *
420 	 * The specified CPU is no longer under the control of the BPF
421 	 * scheduler. This could be because it was preempted by a higher
422 	 * priority sched_class, though there may be other reasons as well. The
423 	 * caller should consult @args->reason to determine the cause.
424 	 */
425 	void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
426 
427 	/**
428 	 * init_task - Initialize a task to run in a BPF scheduler
429 	 * @p: task to initialize for BPF scheduling
430 	 * @args: init arguments, see the struct definition
431 	 *
432 	 * Either we're loading a BPF scheduler or a new task is being forked.
433 	 * Initialize @p for BPF scheduling. This operation may block and can
434 	 * be used for allocations, and is called exactly once for a task.
435 	 *
436 	 * Return 0 for success, -errno for failure. An error return while
437 	 * loading will abort loading of the BPF scheduler. During a fork, it
438 	 * will abort that specific fork.
439 	 */
440 	s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
441 
442 	/**
443 	 * exit_task - Exit a previously-running task from the system
444 	 * @p: task to exit
445 	 *
446 	 * @p is exiting or the BPF scheduler is being unloaded. Perform any
447 	 * necessary cleanup for @p.
448 	 */
449 	void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
450 
451 	/**
452 	 * enable - Enable BPF scheduling for a task
453 	 * @p: task to enable BPF scheduling for
454 	 *
455 	 * Enable @p for BPF scheduling. enable() is called on @p any time it
456 	 * enters SCX, and is always paired with a matching disable().
457 	 */
458 	void (*enable)(struct task_struct *p);
459 
460 	/**
461 	 * disable - Disable BPF scheduling for a task
462 	 * @p: task to disable BPF scheduling for
463 	 *
464 	 * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
465 	 * Disable BPF scheduling for @p. A disable() call is always matched
466 	 * with a prior enable() call.
467 	 */
468 	void (*disable)(struct task_struct *p);
469 
470 	/**
471 	 * dump - Dump BPF scheduler state on error
472 	 * @ctx: debug dump context
473 	 *
474 	 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
475 	 */
476 	void (*dump)(struct scx_dump_ctx *ctx);
477 
478 	/**
479 	 * dump_cpu - Dump BPF scheduler state for a CPU on error
480 	 * @ctx: debug dump context
481 	 * @cpu: CPU to generate debug dump for
482 	 * @idle: @cpu is currently idle without any runnable tasks
483 	 *
484 	 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
485 	 * @cpu. If @idle is %true and this operation doesn't produce any
486 	 * output, @cpu is skipped for dump.
487 	 */
488 	void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
489 
490 	/**
491 	 * dump_task - Dump BPF scheduler state for a runnable task on error
492 	 * @ctx: debug dump context
493 	 * @p: runnable task to generate debug dump for
494 	 *
495 	 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
496 	 * @p.
497 	 */
498 	void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
499 
500 	/*
501 	 * All online ops must come before ops.cpu_online().
502 	 */
503 
504 	/**
505 	 * cpu_online - A CPU became online
506 	 * @cpu: CPU which just came up
507 	 *
508 	 * @cpu just came online. @cpu will not call ops.enqueue() or
509 	 * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
510 	 */
511 	void (*cpu_online)(s32 cpu);
512 
513 	/**
514 	 * cpu_offline - A CPU is going offline
515 	 * @cpu: CPU which is going offline
516 	 *
517 	 * @cpu is going offline. @cpu will not call ops.enqueue() or
518 	 * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
519 	 */
520 	void (*cpu_offline)(s32 cpu);
521 
522 	/*
523 	 * All CPU hotplug ops must come before ops.init().
524 	 */
525 
526 	/**
527 	 * init - Initialize the BPF scheduler
528 	 */
529 	s32 (*init)(void);
530 
531 	/**
532 	 * exit - Clean up after the BPF scheduler
533 	 * @info: Exit info
534 	 */
535 	void (*exit)(struct scx_exit_info *info);
536 
537 	/**
538 	 * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch
539 	 */
540 	u32 dispatch_max_batch;
541 
542 	/**
543 	 * flags - %SCX_OPS_* flags
544 	 */
545 	u64 flags;
546 
547 	/**
548 	 * timeout_ms - The maximum amount of time, in milliseconds, that a
549 	 * runnable task should be able to wait before being scheduled. The
550 	 * maximum timeout may not exceed the default timeout of 30 seconds.
551 	 *
552 	 * Defaults to the maximum allowed timeout value of 30 seconds.
553 	 */
554 	u32 timeout_ms;
555 
556 	/**
557 	 * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default
558 	 * value of 32768 is used.
559 	 */
560 	u32 exit_dump_len;
561 
562 	/**
563 	 * hotplug_seq - A sequence number that may be set by the scheduler to
564 	 * detect when a hotplug event has occurred during the loading process.
565 	 * If 0, no detection occurs. Otherwise, the scheduler will fail to
566 	 * load if the sequence number does not match @scx_hotplug_seq on the
567 	 * enable path.
568 	 */
569 	u64 hotplug_seq;
570 
571 	/**
572 	 * name - BPF scheduler's name
573 	 *
574 	 * Must be a non-zero valid BPF object name including only isalnum(),
575 	 * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
576 	 * BPF scheduler is enabled.
577 	 */
578 	char name[SCX_OPS_NAME_LEN];
579 };
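/*
 * A hedged sketch of how a complete (if minimal) ops table is defined on the
 * BPF side and registered through the struct_ops link mechanism. The callback
 * names are illustrative and .name must satisfy the constraint documented
 * above:
 *
 *	SEC(".struct_ops.link")
 *	struct sched_ext_ops minimal_ops = {
 *		.select_cpu	= (void *)my_select_cpu,
 *		.enqueue	= (void *)my_enqueue,
 *		.dispatch	= (void *)my_dispatch,
 *		.init		= (void *)my_init,
 *		.exit		= (void *)my_exit,
 *		.name		= "minimal",
 *	};
 *
 * Userspace would then attach it with bpf_map__attach_struct_ops(); the
 * scheduler stays enabled until the link is destroyed or the kernel disables
 * it with the information described by struct scx_exit_info above.
 */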
580 
581 enum scx_opi {
582 	SCX_OPI_BEGIN			= 0,
583 	SCX_OPI_NORMAL_BEGIN		= 0,
584 	SCX_OPI_NORMAL_END		= SCX_OP_IDX(cpu_online),
585 	SCX_OPI_CPU_HOTPLUG_BEGIN	= SCX_OP_IDX(cpu_online),
586 	SCX_OPI_CPU_HOTPLUG_END		= SCX_OP_IDX(init),
587 	SCX_OPI_END			= SCX_OP_IDX(init),
588 };
589 
590 enum scx_wake_flags {
591 	/* expose select WF_* flags as enums */
592 	SCX_WAKE_FORK		= WF_FORK,
593 	SCX_WAKE_TTWU		= WF_TTWU,
594 	SCX_WAKE_SYNC		= WF_SYNC,
595 };
596 
597 enum scx_enq_flags {
598 	/* expose select ENQUEUE_* flags as enums */
599 	SCX_ENQ_WAKEUP		= ENQUEUE_WAKEUP,
600 	SCX_ENQ_HEAD		= ENQUEUE_HEAD,
601 
602 	/* high 32bits are SCX specific */
603 
604 	/*
605 	 * Set the following to trigger preemption when calling
606 	 * scx_bpf_dispatch() with a local dsq as the target. The slice of the
607 	 * current task is cleared to zero and the CPU is kicked into the
608 	 * scheduling path. Implies %SCX_ENQ_HEAD.
609 	 */
610 	SCX_ENQ_PREEMPT		= 1LLU << 32,
611 
612 	/*
613 	 * The task being enqueued was previously enqueued on the current CPU's
614 	 * %SCX_DSQ_LOCAL, but was removed from it in a call to the
615 	 * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was
616 	 * invoked in a ->cpu_release() callback, and the task is again
617 	 * dispatched back to %SCX_DSQ_LOCAL by the current ->enqueue(), the
618 	 * task will not be scheduled on the CPU until at least the next invocation
619 	 * of the ->cpu_acquire() callback.
620 	 */
621 	SCX_ENQ_REENQ		= 1LLU << 40,
622 
623 	/*
624 	 * The task being enqueued is the only task available for the CPU. By
625 	 * default, the ext core keeps executing such tasks, but when
626 	 * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
627 	 * %SCX_ENQ_LAST flag set.
628 	 *
629 	 * If the BPF scheduler wants to continue executing the task,
630 	 * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately.
631 	 * If the task gets queued on a different dsq or the BPF side, the BPF
632 	 * scheduler is responsible for triggering a follow-up scheduling event.
633 	 * Otherwise, execution may stall.
634 	 */
635 	SCX_ENQ_LAST		= 1LLU << 41,
636 
637 	/* high 8 bits are internal */
638 	__SCX_ENQ_INTERNAL_MASK	= 0xffLLU << 56,
639 
640 	SCX_ENQ_CLEAR_OPSS	= 1LLU << 56,
641 	SCX_ENQ_DSQ_PRIQ	= 1LLU << 57,
642 };
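/*
 * For example, to have a newly woken task preempt whatever SCX task currently
 * occupies the target CPU, a BPF scheduler could dispatch it from
 * ops.select_cpu() or ops.enqueue() as follows (a sketch, not the only way):
 *
 *	scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, SCX_ENQ_PREEMPT);
 *
 * which clears the current task's slice and kicks the CPU into the scheduling
 * path as described for %SCX_ENQ_PREEMPT above.
 */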
643 
644 enum scx_deq_flags {
645 	/* expose select DEQUEUE_* flags as enums */
646 	SCX_DEQ_SLEEP		= DEQUEUE_SLEEP,
647 
648 	/* high 32bits are SCX specific */
649 
650 	/*
651 	 * The generic core-sched layer decided to execute the task even though
652 	 * it hasn't been dispatched yet. Dequeue from the BPF side.
653 	 */
654 	SCX_DEQ_CORE_SCHED_EXEC	= 1LLU << 32,
655 };
656 
657 enum scx_pick_idle_cpu_flags {
658 	SCX_PICK_IDLE_CORE	= 1LLU << 0,	/* pick a CPU whose SMT siblings are also idle */
659 };
660 
661 enum scx_kick_flags {
662 	/*
663 	 * Kick the target CPU if idle. Guarantees that the target CPU goes
664 	 * through at least one full scheduling cycle before going idle. If the
665 	 * target CPU can be determined to be currently not idle and about to go
666 	 * through a scheduling cycle before going idle, this is a noop.
667 	 */
668 	SCX_KICK_IDLE		= 1LLU << 0,
669 
670 	/*
671 	 * Preempt the current task and execute the dispatch path. If the
672 	 * current task of the target CPU is an SCX task, its ->scx.slice is
673 	 * cleared to zero before the scheduling path is invoked so that the
674 	 * task expires and the dispatch path is invoked.
675 	 */
676 	SCX_KICK_PREEMPT	= 1LLU << 1,
677 
678 	/*
679 	 * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
680 	 * return after the target CPU finishes picking the next task.
681 	 */
682 	SCX_KICK_WAIT		= 1LLU << 2,
683 };
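/*
 * As an illustration, after queueing a task on a shared DSQ a BPF scheduler
 * might wake an idle CPU so the task doesn't wait for the next tick. MY_DSQ_ID
 * and target_cpu are illustrative:
 *
 *	scx_bpf_dispatch(p, MY_DSQ_ID, SCX_SLICE_DFL, enq_flags);
 *	scx_bpf_kick_cpu(target_cpu, SCX_KICK_IDLE);
 *
 * SCX_KICK_PREEMPT would instead force the target CPU through the dispatch
 * path even while it is busy running another SCX task.
 */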
684 
685 enum scx_ops_enable_state {
686 	SCX_OPS_PREPPING,
687 	SCX_OPS_ENABLING,
688 	SCX_OPS_ENABLED,
689 	SCX_OPS_DISABLING,
690 	SCX_OPS_DISABLED,
691 };
692 
693 static const char *scx_ops_enable_state_str[] = {
694 	[SCX_OPS_PREPPING]	= "prepping",
695 	[SCX_OPS_ENABLING]	= "enabling",
696 	[SCX_OPS_ENABLED]	= "enabled",
697 	[SCX_OPS_DISABLING]	= "disabling",
698 	[SCX_OPS_DISABLED]	= "disabled",
699 };
700 
701 /*
702  * sched_ext_entity->ops_state
703  *
704  * Used to track the task ownership between the SCX core and the BPF scheduler.
705  * State transitions look as follows:
706  *
707  * NONE -> QUEUEING -> QUEUED -> DISPATCHING
708  *   ^              |                 |
709  *   |              v                 v
710  *   \-------------------------------/
711  *
712  * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
713  * sites for explanations on the conditions being waited upon and why they are
714  * safe. Transitions out of them into NONE or QUEUED must store_release and the
715  * waiters should load_acquire.
716  *
717  * Tracking scx_ops_state enables sched_ext core to reliably determine whether
718  * any given task can be dispatched by the BPF scheduler at all times and thus
719  * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
720  * to try to dispatch any task anytime regardless of its state as the SCX core
721  * can safely reject invalid dispatches.
722  */
723 enum scx_ops_state {
724 	SCX_OPSS_NONE,		/* owned by the SCX core */
725 	SCX_OPSS_QUEUEING,	/* in transit to the BPF scheduler */
726 	SCX_OPSS_QUEUED,	/* owned by the BPF scheduler */
727 	SCX_OPSS_DISPATCHING,	/* in transit back to the SCX core */
728 
729 	/*
730 	 * QSEQ brands each QUEUED instance so that, when dispatch races
731 	 * dequeue/requeue, the dispatcher can tell whether it still has a claim
732 	 * on the task being dispatched.
733 	 *
734 	 * As some 32bit archs can't do 64bit store_release/load_acquire,
735 	 * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
736 	 * 32bit machines. The dispatch race window QSEQ protects is very narrow
737 	 * and runs with IRQs disabled. 30 bits should be sufficient.
738 	 */
739 	SCX_OPSS_QSEQ_SHIFT	= 2,
740 };
741 
742 /* Use macros to ensure that the type is unsigned long for the masks */
743 #define SCX_OPSS_STATE_MASK	((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
744 #define SCX_OPSS_QSEQ_MASK	(~SCX_OPSS_STATE_MASK)
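/*
 * With the above layout, a p->scx.ops_state snapshot decomposes as in the
 * following sketch, which is how the enqueue and dispatch paths below
 * interpret it:
 *
 *	unsigned long opss = atomic_long_read(&p->scx.ops_state);
 *	enum scx_ops_state state = opss & SCX_OPSS_STATE_MASK;
 *	unsigned long qseq = opss & SCX_OPSS_QSEQ_MASK;
 *
 * e.g. do_enqueue_task() stores SCX_OPSS_QUEUEING | qseq before calling
 * ops.enqueue() and releases SCX_OPSS_QUEUED | qseq afterwards.
 */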
745 
746 /*
747  * During exit, a task may schedule after losing its PIDs. When disabling the
748  * BPF scheduler, we need to be able to iterate tasks in every state to
749  * guarantee system safety. Maintain a dedicated task list which contains every
750  * task between its fork and eventual free.
751  */
752 static DEFINE_SPINLOCK(scx_tasks_lock);
753 static LIST_HEAD(scx_tasks);
754 
755 /* ops enable/disable */
756 static struct kthread_worker *scx_ops_helper;
757 static DEFINE_MUTEX(scx_ops_enable_mutex);
758 DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
759 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
760 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
761 static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
762 static bool scx_switching_all;
763 DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
764 
765 static struct sched_ext_ops scx_ops;
766 static bool scx_warned_zero_slice;
767 
768 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
769 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
770 DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
771 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
772 
773 struct static_key_false scx_has_op[SCX_OPI_END] =
774 	{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
775 
776 static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
777 static struct scx_exit_info *scx_exit_info;
778 
779 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
780 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
781 
782 /*
783  * The maximum amount of time in jiffies that a task may be runnable without
784  * being scheduled on a CPU. If this timeout is exceeded, it will trigger
785  * scx_ops_error().
786  */
787 static unsigned long scx_watchdog_timeout;
788 
789 /*
790  * The last time the delayed work was run. This delayed work relies on
791  * ksoftirqd being able to run to service timer interrupts, so it's possible
792  * that this work itself could get wedged. To account for this, we check that
793  * it's not stalled in the timer tick, and trigger an error if it is.
794  */
795 static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
796 
797 static struct delayed_work scx_watchdog_work;
798 
799 /* idle tracking */
800 #ifdef CONFIG_SMP
801 #ifdef CONFIG_CPUMASK_OFFSTACK
802 #define CL_ALIGNED_IF_ONSTACK
803 #else
804 #define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
805 #endif
806 
807 static struct {
808 	cpumask_var_t cpu;
809 	cpumask_var_t smt;
810 } idle_masks CL_ALIGNED_IF_ONSTACK;
811 
812 #endif	/* CONFIG_SMP */
813 
814 /* for %SCX_KICK_WAIT */
815 static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
816 
817 /*
818  * Direct dispatch marker.
819  *
820  * Non-NULL values are used for direct dispatch from enqueue path. A valid
821  * pointer points to the task currently being enqueued. An ERR_PTR value is used
822  * to indicate that direct dispatch has already happened.
823  */
824 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
825 
826 /* dispatch queues */
827 static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global;
828 
829 static const struct rhashtable_params dsq_hash_params = {
830 	.key_len		= 8,
831 	.key_offset		= offsetof(struct scx_dispatch_q, id),
832 	.head_offset		= offsetof(struct scx_dispatch_q, hash_node),
833 };
834 
835 static struct rhashtable dsq_hash;
836 static LLIST_HEAD(dsqs_to_free);
837 
838 /* dispatch buf */
839 struct scx_dsp_buf_ent {
840 	struct task_struct	*task;
841 	unsigned long		qseq;
842 	u64			dsq_id;
843 	u64			enq_flags;
844 };
845 
846 static u32 scx_dsp_max_batch;
847 
848 struct scx_dsp_ctx {
849 	struct rq		*rq;
850 	struct rq_flags		*rf;
851 	u32			cursor;
852 	u32			nr_tasks;
853 	struct scx_dsp_buf_ent	buf[];
854 };
855 
856 static struct scx_dsp_ctx __percpu *scx_dsp_ctx;
857 
858 /* string formatting from BPF */
859 struct scx_bstr_buf {
860 	u64			data[MAX_BPRINTF_VARARGS];
861 	char			line[SCX_EXIT_MSG_LEN];
862 };
863 
864 static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock);
865 static struct scx_bstr_buf scx_exit_bstr_buf;
866 
867 /* ops debug dump */
868 struct scx_dump_data {
869 	s32			cpu;
870 	bool			first;
871 	s32			cursor;
872 	struct seq_buf		*s;
873 	const char		*prefix;
874 	struct scx_bstr_buf	buf;
875 };
876 
877 struct scx_dump_data scx_dump_data = {
878 	.cpu			= -1,
879 };
880 
881 /* /sys/kernel/sched_ext interface */
882 static struct kset *scx_kset;
883 static struct kobject *scx_root_kobj;
884 
885 #define CREATE_TRACE_POINTS
886 #include <trace/events/sched_ext.h>
887 
888 static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
889 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
890 					     s64 exit_code,
891 					     const char *fmt, ...);
892 
893 #define scx_ops_error_kind(err, fmt, args...)					\
894 	scx_ops_exit_kind((err), 0, fmt, ##args)
895 
896 #define scx_ops_exit(code, fmt, args...)					\
897 	scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args)
898 
899 #define scx_ops_error(fmt, args...)						\
900 	scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
901 
902 #define SCX_HAS_OP(op)	static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
903 
904 static long jiffies_delta_msecs(unsigned long at, unsigned long now)
905 {
906 	if (time_after(at, now))
907 		return jiffies_to_msecs(at - now);
908 	else
909 		return -(long)jiffies_to_msecs(now - at);
910 }
911 
912 /* if the highest set bit is N, return a mask with bits [N+1, 31] set */
913 static u32 higher_bits(u32 flags)
914 {
915 	return ~((1 << fls(flags)) - 1);
916 }
917 
918 /* return the mask with only the highest bit set */
919 static u32 highest_bit(u32 flags)
920 {
921 	int bit = fls(flags);
922 	return ((u64)1 << bit) >> 1;
923 }
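/*
 * For example, with flags == 0x5 (bits 0 and 2 set), fls() returns 3, so
 * higher_bits(0x5) == 0xfffffff8 (bits 3-31 set) and highest_bit(0x5) == 0x4
 * (bit 2 alone).
 */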
924 
925 /*
926  * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
927  * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
928  * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check
929  * whether it's running from an allowed context.
930  *
931  * @mask is constant, always inline to cull the mask calculations.
932  */
933 static __always_inline void scx_kf_allow(u32 mask)
934 {
935 	/* nesting is allowed only in increasing scx_kf_mask order */
936 	WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
937 		  "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
938 		  current->scx.kf_mask, mask);
939 	current->scx.kf_mask |= mask;
940 	barrier();
941 }
942 
943 static void scx_kf_disallow(u32 mask)
944 {
945 	barrier();
946 	current->scx.kf_mask &= ~mask;
947 }
948 
949 #define SCX_CALL_OP(mask, op, args...)						\
950 do {										\
951 	if (mask) {								\
952 		scx_kf_allow(mask);						\
953 		scx_ops.op(args);						\
954 		scx_kf_disallow(mask);						\
955 	} else {								\
956 		scx_ops.op(args);						\
957 	}									\
958 } while (0)
959 
960 #define SCX_CALL_OP_RET(mask, op, args...)					\
961 ({										\
962 	__typeof__(scx_ops.op(args)) __ret;					\
963 	if (mask) {								\
964 		scx_kf_allow(mask);						\
965 		__ret = scx_ops.op(args);					\
966 		scx_kf_disallow(mask);						\
967 	} else {								\
968 		__ret = scx_ops.op(args);					\
969 	}									\
970 	__ret;									\
971 })
972 
973 /*
974  * Some kfuncs are allowed only on the tasks that are subjects of the
975  * in-progress scx_ops operation, e.g. for locking guarantees. To enforce such
976  * restrictions, the following SCX_CALL_OP_*() variants should be used when
977  * invoking scx_ops operations that take task arguments. These can only be used
978  * for non-nesting operations due to the way the tasks are tracked.
979  *
980  * kfuncs which can only operate on such tasks can in turn use
981  * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
982  * the specific task.
983  */
984 #define SCX_CALL_OP_TASK(mask, op, task, args...)				\
985 do {										\
986 	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
987 	current->scx.kf_tasks[0] = task;					\
988 	SCX_CALL_OP(mask, op, task, ##args);					\
989 	current->scx.kf_tasks[0] = NULL;					\
990 } while (0)
991 
992 #define SCX_CALL_OP_TASK_RET(mask, op, task, args...)				\
993 ({										\
994 	__typeof__(scx_ops.op(task, ##args)) __ret;				\
995 	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
996 	current->scx.kf_tasks[0] = task;					\
997 	__ret = SCX_CALL_OP_RET(mask, op, task, ##args);			\
998 	current->scx.kf_tasks[0] = NULL;					\
999 	__ret;									\
1000 })
1001 
1002 #define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...)			\
1003 ({										\
1004 	__typeof__(scx_ops.op(task0, task1, ##args)) __ret;			\
1005 	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
1006 	current->scx.kf_tasks[0] = task0;					\
1007 	current->scx.kf_tasks[1] = task1;					\
1008 	__ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args);		\
1009 	current->scx.kf_tasks[0] = NULL;					\
1010 	current->scx.kf_tasks[1] = NULL;					\
1011 	__ret;									\
1012 })
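/*
 * For instance, the enqueue path later in this file invokes the callback as
 *
 *	SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
 *
 * so that kfuncs gated on SCX_KF_ENQUEUE pass scx_kf_allowed() and kfuncs
 * taking @p as an argument pass scx_kf_allowed_on_arg_tasks() for the duration
 * of the call.
 */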
1013 
1014 /* @mask is constant, always inline to cull unnecessary branches */
1015 static __always_inline bool scx_kf_allowed(u32 mask)
1016 {
1017 	if (unlikely(!(current->scx.kf_mask & mask))) {
1018 		scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
1019 			      mask, current->scx.kf_mask);
1020 		return false;
1021 	}
1022 
1023 	if (unlikely((mask & SCX_KF_SLEEPABLE) && in_interrupt())) {
1024 		scx_ops_error("sleepable kfunc called from non-sleepable context");
1025 		return false;
1026 	}
1027 
1028 	/*
1029 	 * Enforce nesting boundaries. e.g. A kfunc which can be called from
1030 	 * DISPATCH must not be called if we're running DEQUEUE which is nested
1031 	 * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE
1032 	 * boundary thanks to the above in_interrupt() check.
1033 	 */
1034 	if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
1035 		     (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
1036 		scx_ops_error("cpu_release kfunc called from a nested operation");
1037 		return false;
1038 	}
1039 
1040 	if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
1041 		     (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
1042 		scx_ops_error("dispatch kfunc called from a nested operation");
1043 		return false;
1044 	}
1045 
1046 	return true;
1047 }
1048 
1049 /* see SCX_CALL_OP_TASK() */
1050 static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
1051 							struct task_struct *p)
1052 {
1053 	if (!scx_kf_allowed(mask))
1054 		return false;
1055 
1056 	if (unlikely((p != current->scx.kf_tasks[0] &&
1057 		      p != current->scx.kf_tasks[1]))) {
1058 		scx_ops_error("called on a task not being operated on");
1059 		return false;
1060 	}
1061 
1062 	return true;
1063 }
1064 
1065 
1066 /*
1067  * SCX task iterator.
1068  */
1069 struct scx_task_iter {
1070 	struct sched_ext_entity		cursor;
1071 	struct task_struct		*locked;
1072 	struct rq			*rq;
1073 	struct rq_flags			rf;
1074 };
1075 
1076 /**
1077  * scx_task_iter_init - Initialize a task iterator
1078  * @iter: iterator to init
1079  *
1080  * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized,
1081  * @iter must eventually be exited with scx_task_iter_exit().
1082  *
1083  * scx_tasks_lock may be released between this and the first next() call or
1084  * between any two next() calls. If scx_tasks_lock is released between two
1085  * next() calls, the caller is responsible for ensuring that the task being
1086  * iterated remains accessible either through RCU read lock or obtaining a
1087  * reference count.
1088  *
1089  * All tasks which existed when the iteration started are guaranteed to be
1090  * visited as long as they still exist.
1091  */
1092 static void scx_task_iter_init(struct scx_task_iter *iter)
1093 {
1094 	lockdep_assert_held(&scx_tasks_lock);
1095 
1096 	iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
1097 	list_add(&iter->cursor.tasks_node, &scx_tasks);
1098 	iter->locked = NULL;
1099 }
1100 
1101 /**
1102  * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator
1103  * @iter: iterator to unlock rq for
1104  *
1105  * If @iter is in the middle of a locked iteration, it may be locking the rq of
1106  * the task currently being visited. Unlock the rq if so. This function can be
1107  * safely called anytime during an iteration.
1108  *
1109  * Returns %true if the rq @iter was locking has been unlocked, %false if @iter was
1110  * not locking an rq.
1111  */
1112 static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter)
1113 {
1114 	if (iter->locked) {
1115 		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
1116 		iter->locked = NULL;
1117 		return true;
1118 	} else {
1119 		return false;
1120 	}
1121 }
1122 
1123 /**
1124  * scx_task_iter_exit - Exit a task iterator
1125  * @iter: iterator to exit
1126  *
1127  * Exit a previously initialized @iter. Must be called with scx_tasks_lock held.
1128  * If the iterator holds a task's rq lock, that rq lock is released. See
1129  * scx_task_iter_init() for details.
1130  */
1131 static void scx_task_iter_exit(struct scx_task_iter *iter)
1132 {
1133 	lockdep_assert_held(&scx_tasks_lock);
1134 
1135 	scx_task_iter_rq_unlock(iter);
1136 	list_del_init(&iter->cursor.tasks_node);
1137 }
1138 
1139 /**
1140  * scx_task_iter_next - Next task
1141  * @iter: iterator to walk
1142  *
1143  * Visit the next task. See scx_task_iter_init() for details.
1144  */
1145 static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
1146 {
1147 	struct list_head *cursor = &iter->cursor.tasks_node;
1148 	struct sched_ext_entity *pos;
1149 
1150 	lockdep_assert_held(&scx_tasks_lock);
1151 
1152 	list_for_each_entry(pos, cursor, tasks_node) {
1153 		if (&pos->tasks_node == &scx_tasks)
1154 			return NULL;
1155 		if (!(pos->flags & SCX_TASK_CURSOR)) {
1156 			list_move(cursor, &pos->tasks_node);
1157 			return container_of(pos, struct task_struct, scx);
1158 		}
1159 	}
1160 
1161 	/* can't happen, should always terminate at scx_tasks above */
1162 	BUG();
1163 }
1164 
1165 /**
1166  * scx_task_iter_next_locked - Next non-idle task with its rq locked
1167  * @iter: iterator to walk
1168  * @include_dead: Whether we should include dead tasks in the iteration
1169  *
1170  * Visit the next non-idle task with its rq lock held. Allows callers to specify
1171  * whether they would like to filter out dead tasks. See scx_task_iter_init()
1172  * for details.
1173  */
1174 static struct task_struct *
1175 scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead)
1176 {
1177 	struct task_struct *p;
1178 retry:
1179 	scx_task_iter_rq_unlock(iter);
1180 
1181 	while ((p = scx_task_iter_next(iter))) {
1182 		/*
1183 		 * is_idle_task() tests %PF_IDLE which may not be set for CPUs
1184 		 * which haven't yet been onlined. Test sched_class directly.
1185 		 */
1186 		if (p->sched_class != &idle_sched_class)
1187 			break;
1188 	}
1189 	if (!p)
1190 		return NULL;
1191 
1192 	iter->rq = task_rq_lock(p, &iter->rf);
1193 	iter->locked = p;
1194 
1195 	/*
1196 	 * If we see %TASK_DEAD, @p already disabled preemption, is about to do
1197 	 * the final __schedule(), won't ever need to be scheduled again and can
1198 	 * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter
1199 	 * the final __schedule() while we're locking its rq and thus will stay
1200 	 * alive until the rq is unlocked.
1201 	 */
1202 	if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD)
1203 		goto retry;
1204 
1205 	return p;
1206 }
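/*
 * A sketch of the iteration pattern these helpers support, as used by the
 * enable/disable paths later in the file:
 *
 *	struct scx_task_iter sti;
 *	struct task_struct *p;
 *
 *	spin_lock_irq(&scx_tasks_lock);
 *	scx_task_iter_init(&sti);
 *	while ((p = scx_task_iter_next_locked(&sti, false))) {
 *		// p's rq is locked here; safe to inspect its sched state
 *	}
 *	scx_task_iter_exit(&sti);
 *	spin_unlock_irq(&scx_tasks_lock);
 */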
1207 
1208 static enum scx_ops_enable_state scx_ops_enable_state(void)
1209 {
1210 	return atomic_read(&scx_ops_enable_state_var);
1211 }
1212 
1213 static enum scx_ops_enable_state
1214 scx_ops_set_enable_state(enum scx_ops_enable_state to)
1215 {
1216 	return atomic_xchg(&scx_ops_enable_state_var, to);
1217 }
1218 
1219 static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
1220 					enum scx_ops_enable_state from)
1221 {
1222 	int from_v = from;
1223 
1224 	return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
1225 }
1226 
1227 static bool scx_ops_bypassing(void)
1228 {
1229 	return unlikely(atomic_read(&scx_ops_bypass_depth));
1230 }
1231 
1232 /**
1233  * wait_ops_state - Busy-wait the specified ops state to end
1234  * @p: target task
1235  * @opss: state to wait the end of
1236  *
1237  * Busy-wait for @p to transition out of @opss. This can only be used when the
1238  * state part of @opss is %SCX_OPSS_QUEUEING or %SCX_OPSS_DISPATCHING. This function also
1239  * has load_acquire semantics to ensure that the caller can see the updates made
1240  * in the enqueueing and dispatching paths.
1241  */
1242 static void wait_ops_state(struct task_struct *p, unsigned long opss)
1243 {
1244 	do {
1245 		cpu_relax();
1246 	} while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
1247 }
1248 
1249 /**
1250  * ops_cpu_valid - Verify a cpu number
1251  * @cpu: cpu number which came from a BPF ops
1252  * @where: extra information reported on error
1253  *
1254  * @cpu is a cpu number which came from the BPF scheduler and can be any value.
1255  * Verify that it is in range and one of the possible cpus. If invalid, trigger
1256  * an ops error.
1257  */
1258 static bool ops_cpu_valid(s32 cpu, const char *where)
1259 {
1260 	if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) {
1261 		return true;
1262 	} else {
1263 		scx_ops_error("invalid CPU %d%s%s", cpu,
1264 			      where ? " " : "", where ?: "");
1265 		return false;
1266 	}
1267 }
1268 
1269 /**
1270  * ops_sanitize_err - Sanitize a -errno value
1271  * @ops_name: operation to blame on failure
1272  * @err: -errno value to sanitize
1273  *
1274  * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return
1275  * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
1276  * cause misbehaviors. For example, a large negative return from
1277  * ops.init_task() triggers an oops when passed up the call chain because the
1278  * value fails IS_ERR() test after being encoded with ERR_PTR() and then is
1279  * handled as a pointer.
1280  */
1281 static int ops_sanitize_err(const char *ops_name, s32 err)
1282 {
1283 	if (err < 0 && err >= -MAX_ERRNO)
1284 		return err;
1285 
1286 	scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err);
1287 	return -EPROTO;
1288 }
1289 
1290 /**
1291  * touch_core_sched - Update timestamp used for core-sched task ordering
1292  * @rq: rq to read clock from, must be locked
1293  * @p: task to update the timestamp for
1294  *
1295  * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
1296  * implement global or local-DSQ FIFO ordering for core-sched. Should be called
1297  * when a task becomes runnable and its turn on the CPU ends (e.g. slice
1298  * exhaustion).
1299  */
1300 static void touch_core_sched(struct rq *rq, struct task_struct *p)
1301 {
1302 #ifdef CONFIG_SCHED_CORE
1303 	/*
1304 	 * It's okay to update the timestamp spuriously. Use
1305 	 * sched_core_disabled() which is cheaper than enabled().
1306 	 */
1307 	if (!sched_core_disabled())
1308 		p->scx.core_sched_at = rq_clock_task(rq);
1309 #endif
1310 }
1311 
1312 /**
1313  * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
1314  * @rq: rq to read clock from, must be locked
1315  * @p: task being dispatched
1316  *
1317  * If the BPF scheduler implements custom core-sched ordering via
1318  * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
1319  * ordering within each local DSQ. This function is called from dispatch paths
1320  * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
1321  */
1322 static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
1323 {
1324 	lockdep_assert_rq_held(rq);
1325 	assert_clock_updated(rq);
1326 
1327 #ifdef CONFIG_SCHED_CORE
1328 	if (SCX_HAS_OP(core_sched_before))
1329 		touch_core_sched(rq, p);
1330 #endif
1331 }
1332 
1333 static void update_curr_scx(struct rq *rq)
1334 {
1335 	struct task_struct *curr = rq->curr;
1336 	u64 now = rq_clock_task(rq);
1337 	u64 delta_exec;
1338 
1339 	if (time_before_eq64(now, curr->se.exec_start))
1340 		return;
1341 
1342 	delta_exec = now - curr->se.exec_start;
1343 	curr->se.exec_start = now;
1344 	curr->se.sum_exec_runtime += delta_exec;
1345 	account_group_exec_runtime(curr, delta_exec);
1346 	cgroup_account_cputime(curr, delta_exec);
1347 
1348 	if (curr->scx.slice != SCX_SLICE_INF) {
1349 		curr->scx.slice -= min(curr->scx.slice, delta_exec);
1350 		if (!curr->scx.slice)
1351 			touch_core_sched(rq, curr);
1352 	}
1353 }
1354 
1355 static bool scx_dsq_priq_less(struct rb_node *node_a,
1356 			      const struct rb_node *node_b)
1357 {
1358 	const struct task_struct *a =
1359 		container_of(node_a, struct task_struct, scx.dsq_node.priq);
1360 	const struct task_struct *b =
1361 		container_of(node_b, struct task_struct, scx.dsq_node.priq);
1362 
1363 	return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime);
1364 }
1365 
1366 static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
1367 {
1368 	/* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
1369 	WRITE_ONCE(dsq->nr, dsq->nr + delta);
1370 }
1371 
1372 static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
1373 			     u64 enq_flags)
1374 {
1375 	bool is_local = dsq->id == SCX_DSQ_LOCAL;
1376 
1377 	WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.list));
1378 	WARN_ON_ONCE((p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) ||
1379 		     !RB_EMPTY_NODE(&p->scx.dsq_node.priq));
1380 
1381 	if (!is_local) {
1382 		raw_spin_lock(&dsq->lock);
1383 		if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
1384 			scx_ops_error("attempting to dispatch to a destroyed dsq");
1385 			/* fall back to the global dsq */
1386 			raw_spin_unlock(&dsq->lock);
1387 			dsq = &scx_dsq_global;
1388 			raw_spin_lock(&dsq->lock);
1389 		}
1390 	}
1391 
1392 	if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
1393 		     (enq_flags & SCX_ENQ_DSQ_PRIQ))) {
1394 		/*
1395 		 * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from
1396 		 * their FIFO queues. To avoid confusion and accidentally
1397 		 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we
1398 		 * disallow any internal DSQ from doing vtime ordering of
1399 		 * tasks.
1400 		 */
1401 		scx_ops_error("cannot use vtime ordering for built-in DSQs");
1402 		enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
1403 	}
1404 
1405 	if (enq_flags & SCX_ENQ_DSQ_PRIQ) {
1406 		struct rb_node *rbp;
1407 
1408 		/*
1409 		 * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are
1410 		 * linked to both the rbtree and list on PRIQs, this can only be
1411 		 * tested easily when adding the first task.
1412 		 */
1413 		if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
1414 			     !list_empty(&dsq->list)))
1415 			scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
1416 				      dsq->id);
1417 
1418 		p->scx.dsq_node.flags |= SCX_TASK_DSQ_ON_PRIQ;
1419 		rb_add(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less);
1420 
1421 		/*
1422 		 * Find the previous task and insert after it on the list so
1423 		 * that @dsq->list is vtime ordered.
1424 		 */
1425 		rbp = rb_prev(&p->scx.dsq_node.priq);
1426 		if (rbp) {
1427 			struct task_struct *prev =
1428 				container_of(rbp, struct task_struct,
1429 					     scx.dsq_node.priq);
1430 			list_add(&p->scx.dsq_node.list, &prev->scx.dsq_node.list);
1431 		} else {
1432 			list_add(&p->scx.dsq_node.list, &dsq->list);
1433 		}
1434 	} else {
1435 		/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
1436 		if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
1437 			scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
1438 				      dsq->id);
1439 
1440 		if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
1441 			list_add(&p->scx.dsq_node.list, &dsq->list);
1442 		else
1443 			list_add_tail(&p->scx.dsq_node.list, &dsq->list);
1444 	}
1445 
1446 	dsq_mod_nr(dsq, 1);
1447 	p->scx.dsq = dsq;
1448 
1449 	/*
1450 	 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
1451 	 * direct dispatch path, but we clear them here because the direct
1452 	 * dispatch verdict may be overridden on the enqueue path during e.g.
1453 	 * bypass.
1454 	 */
1455 	p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
1456 	p->scx.ddsp_enq_flags = 0;
1457 
1458 	/*
1459 	 * We're transitioning out of QUEUEING or DISPATCHING. store_release to
1460 	 * match waiters' load_acquire.
1461 	 */
1462 	if (enq_flags & SCX_ENQ_CLEAR_OPSS)
1463 		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1464 
1465 	if (is_local) {
1466 		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
1467 		bool preempt = false;
1468 
1469 		if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
1470 		    rq->curr->sched_class == &ext_sched_class) {
1471 			rq->curr->scx.slice = 0;
1472 			preempt = true;
1473 		}
1474 
1475 		if (preempt || sched_class_above(&ext_sched_class,
1476 						 rq->curr->sched_class))
1477 			resched_curr(rq);
1478 	} else {
1479 		raw_spin_unlock(&dsq->lock);
1480 	}
1481 }
1482 
1483 static void task_unlink_from_dsq(struct task_struct *p,
1484 				 struct scx_dispatch_q *dsq)
1485 {
1486 	if (p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) {
1487 		rb_erase(&p->scx.dsq_node.priq, &dsq->priq);
1488 		RB_CLEAR_NODE(&p->scx.dsq_node.priq);
1489 		p->scx.dsq_node.flags &= ~SCX_TASK_DSQ_ON_PRIQ;
1490 	}
1491 
1492 	list_del_init(&p->scx.dsq_node.list);
1493 }
1494 
1495 static bool task_linked_on_dsq(struct task_struct *p)
1496 {
1497 	return !list_empty(&p->scx.dsq_node.list);
1498 }
1499 
1500 static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
1501 {
1502 	struct scx_dispatch_q *dsq = p->scx.dsq;
1503 	bool is_local = dsq == &rq->scx.local_dsq;
1504 
1505 	if (!dsq) {
1506 		WARN_ON_ONCE(task_linked_on_dsq(p));
1507 		/*
1508 		 * When dispatching directly from the BPF scheduler to a local
1509 		 * DSQ, the task isn't associated with any DSQ but
1510 		 * @p->scx.holding_cpu may be set under the protection of
1511 		 * %SCX_OPSS_DISPATCHING.
1512 		 */
1513 		if (p->scx.holding_cpu >= 0)
1514 			p->scx.holding_cpu = -1;
1515 		return;
1516 	}
1517 
1518 	if (!is_local)
1519 		raw_spin_lock(&dsq->lock);
1520 
1521 	/*
1522 	 * Now that we hold @dsq->lock, @p->scx.holding_cpu and @p->scx.dsq_node
1523 	 * can't change underneath us.
1524 	 */
1525 	if (p->scx.holding_cpu < 0) {
1526 		/* @p must still be on @dsq, dequeue */
1527 		WARN_ON_ONCE(!task_linked_on_dsq(p));
1528 		task_unlink_from_dsq(p, dsq);
1529 		dsq_mod_nr(dsq, -1);
1530 	} else {
1531 		/*
1532 		 * We're racing against dispatch_to_local_dsq() which already
1533 		 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
1534 		 * holding_cpu which tells dispatch_to_local_dsq() that it lost
1535 		 * the race.
1536 		 */
1537 		WARN_ON_ONCE(task_linked_on_dsq(p));
1538 		p->scx.holding_cpu = -1;
1539 	}
1540 	p->scx.dsq = NULL;
1541 
1542 	if (!is_local)
1543 		raw_spin_unlock(&dsq->lock);
1544 }
1545 
1546 static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
1547 {
1548 	return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
1549 }
1550 
1551 static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id)
1552 {
1553 	lockdep_assert(rcu_read_lock_any_held());
1554 
1555 	if (dsq_id == SCX_DSQ_GLOBAL)
1556 		return &scx_dsq_global;
1557 	else
1558 		return find_user_dsq(dsq_id);
1559 }
1560 
1561 static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
1562 						    struct task_struct *p)
1563 {
1564 	struct scx_dispatch_q *dsq;
1565 
1566 	if (dsq_id == SCX_DSQ_LOCAL)
1567 		return &rq->scx.local_dsq;
1568 
1569 	dsq = find_non_local_dsq(dsq_id);
1570 	if (unlikely(!dsq)) {
1571 		scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
1572 			      dsq_id, p->comm, p->pid);
1573 		return &scx_dsq_global;
1574 	}
1575 
1576 	return dsq;
1577 }
1578 
1579 static void mark_direct_dispatch(struct task_struct *ddsp_task,
1580 				 struct task_struct *p, u64 dsq_id,
1581 				 u64 enq_flags)
1582 {
1583 	/*
1584 	 * Mark that dispatch already happened from ops.select_cpu() or
1585 	 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value
1586 	 * which can never match a valid task pointer.
1587 	 */
1588 	__this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH));
1589 
1590 	/* @p must match the task on the enqueue path */
1591 	if (unlikely(p != ddsp_task)) {
1592 		if (IS_ERR(ddsp_task))
1593 			scx_ops_error("%s[%d] already direct-dispatched",
1594 				      p->comm, p->pid);
1595 		else
1596 			scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
1597 				      ddsp_task->comm, ddsp_task->pid,
1598 				      p->comm, p->pid);
1599 		return;
1600 	}
1601 
1602 	/*
1603 	 * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because
1604 	 * dispatching to the local DSQ of a different CPU requires unlocking
1605 	 * the current rq which isn't allowed in the enqueue path. Use
1606 	 * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL.
1607 	 */
1608 	if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) {
1609 		scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch");
1610 		return;
1611 	}
1612 
1613 	WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
1614 	WARN_ON_ONCE(p->scx.ddsp_enq_flags);
1615 
1616 	p->scx.ddsp_dsq_id = dsq_id;
1617 	p->scx.ddsp_enq_flags = enq_flags;
1618 }
1619 
1620 static void direct_dispatch(struct task_struct *p, u64 enq_flags)
1621 {
1622 	struct scx_dispatch_q *dsq;
1623 
1624 	touch_core_sched_dispatch(task_rq(p), p);
1625 
1626 	enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
1627 	dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p);
1628 	dispatch_enqueue(dsq, p, enq_flags);
1629 }
1630 
1631 static bool scx_rq_online(struct rq *rq)
1632 {
1633 	return likely(rq->scx.flags & SCX_RQ_ONLINE);
1634 }
1635 
1636 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
1637 			    int sticky_cpu)
1638 {
1639 	struct task_struct **ddsp_taskp;
1640 	unsigned long qseq;
1641 
1642 	WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
1643 
1644 	/* rq migration */
1645 	if (sticky_cpu == cpu_of(rq))
1646 		goto local_norefill;
1647 
1648 	/*
1649 	 * If !scx_rq_online(), we already told the BPF scheduler that the CPU
1650 	 * is offline and are just running the hotplug path. Don't bother the
1651 	 * BPF scheduler.
1652 	 */
1653 	if (!scx_rq_online(rq))
1654 		goto local;
1655 
1656 	if (scx_ops_bypassing()) {
1657 		if (enq_flags & SCX_ENQ_LAST)
1658 			goto local;
1659 		else
1660 			goto global;
1661 	}
1662 
1663 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
1664 		goto direct;
1665 
1666 	/* see %SCX_OPS_ENQ_EXITING */
1667 	if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
1668 	    unlikely(p->flags & PF_EXITING))
1669 		goto local;
1670 
1671 	/* see %SCX_OPS_ENQ_LAST */
1672 	if (!static_branch_unlikely(&scx_ops_enq_last) &&
1673 	    (enq_flags & SCX_ENQ_LAST))
1674 		goto local;
1675 
1676 	if (!SCX_HAS_OP(enqueue))
1677 		goto global;
1678 
1679 	/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
1680 	qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;
1681 
1682 	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
1683 	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
1684 
1685 	ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
1686 	WARN_ON_ONCE(*ddsp_taskp);
1687 	*ddsp_taskp = p;
1688 
1689 	SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
1690 
1691 	*ddsp_taskp = NULL;
1692 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
1693 		goto direct;
1694 
1695 	/*
1696 	 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or
1697 	 * dequeue may be waiting. The store_release matches their load_acquire.
1698 	 */
1699 	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
1700 	return;
1701 
1702 direct:
1703 	direct_dispatch(p, enq_flags);
1704 	return;
1705 
1706 local:
1707 	/*
1708 	 * For task-ordering, slice refill must be treated as implying the end
1709 	 * of the current slice. Otherwise, the longer @p stays on the CPU, the
1710 	 * higher priority it becomes from scx_prio_less()'s POV.
1711 	 */
1712 	touch_core_sched(rq, p);
1713 	p->scx.slice = SCX_SLICE_DFL;
1714 local_norefill:
1715 	dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
1716 	return;
1717 
1718 global:
1719 	touch_core_sched(rq, p);	/* see the comment in local: */
1720 	p->scx.slice = SCX_SLICE_DFL;
1721 	dispatch_enqueue(&scx_dsq_global, p, enq_flags);
1722 }
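
/*
 * Illustrative sketch (not part of this file): unless one of the bypass
 * conditions above routes the task to a local or the global DSQ, the task is
 * handed to ops.enqueue(). The simplest implementation is a global FIFO
 * which pushes every task onto the global DSQ and lets the dispatch path
 * consume it later. Struct_ops wiring is omitted.
 *
 *	void example_enqueue(struct task_struct *p, u64 enq_flags)
 *	{
 *		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
 *	}
 */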
1723 
1724 static bool task_runnable(const struct task_struct *p)
1725 {
1726 	return !list_empty(&p->scx.runnable_node);
1727 }
1728 
1729 static void set_task_runnable(struct rq *rq, struct task_struct *p)
1730 {
1731 	lockdep_assert_rq_held(rq);
1732 
1733 	if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
1734 		p->scx.runnable_at = jiffies;
1735 		p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
1736 	}
1737 
1738 	/*
1739 	 * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
1740 	 * appended to the runnable_list.
1741 	 */
1742 	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
1743 }
1744 
1745 static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
1746 {
1747 	list_del_init(&p->scx.runnable_node);
1748 	if (reset_runnable_at)
1749 		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
1750 }
1751 
1752 static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
1753 {
1754 	int sticky_cpu = p->scx.sticky_cpu;
1755 
1756 	enq_flags |= rq->scx.extra_enq_flags;
1757 
1758 	if (sticky_cpu >= 0)
1759 		p->scx.sticky_cpu = -1;
1760 
1761 	/*
1762 	 * Restoring a running task will be immediately followed by
1763 	 * set_next_task_scx() which expects the task to not be on the BPF
1764 	 * scheduler as tasks can only start running through local DSQs. Force
1765 	 * direct-dispatch into the local DSQ by setting the sticky_cpu.
1766 	 */
1767 	if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
1768 		sticky_cpu = cpu_of(rq);
1769 
1770 	if (p->scx.flags & SCX_TASK_QUEUED) {
1771 		WARN_ON_ONCE(!task_runnable(p));
1772 		return;
1773 	}
1774 
1775 	set_task_runnable(rq, p);
1776 	p->scx.flags |= SCX_TASK_QUEUED;
1777 	rq->scx.nr_running++;
1778 	add_nr_running(rq, 1);
1779 
1780 	if (SCX_HAS_OP(runnable))
1781 		SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);
1782 
1783 	if (enq_flags & SCX_ENQ_WAKEUP)
1784 		touch_core_sched(rq, p);
1785 
1786 	do_enqueue_task(rq, p, enq_flags, sticky_cpu);
1787 }
1788 
1789 static void ops_dequeue(struct task_struct *p, u64 deq_flags)
1790 {
1791 	unsigned long opss;
1792 
1793 	/* dequeue is always temporary, don't reset runnable_at */
1794 	clr_task_runnable(p, false);
1795 
1796 	/* acquire ensures that we see the preceding updates on QUEUED */
1797 	opss = atomic_long_read_acquire(&p->scx.ops_state);
1798 
1799 	switch (opss & SCX_OPSS_STATE_MASK) {
1800 	case SCX_OPSS_NONE:
1801 		break;
1802 	case SCX_OPSS_QUEUEING:
1803 		/*
1804 		 * QUEUEING is started and finished while holding @p's rq lock.
1805 		 * As we're holding the rq lock now, we shouldn't see QUEUEING.
1806 		 */
1807 		BUG();
1808 	case SCX_OPSS_QUEUED:
1809 		if (SCX_HAS_OP(dequeue))
1810 			SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags);
1811 
1812 		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
1813 					    SCX_OPSS_NONE))
1814 			break;
1815 		fallthrough;
1816 	case SCX_OPSS_DISPATCHING:
1817 		/*
1818 		 * If @p is being dispatched from the BPF scheduler to a DSQ,
1819 		 * wait for the transfer to complete so that @p doesn't get
1820 		 * added to its DSQ after dequeueing is complete.
1821 		 *
1822 		 * As we're waiting on DISPATCHING with the rq locked, the
1823 		 * dispatching side shouldn't try to lock the rq while
1824 		 * DISPATCHING is set. See dispatch_to_local_dsq().
1825 		 *
1826 		 * DISPATCHING shouldn't have qseq set and control can reach
1827 		 * here with NONE @opss from the above QUEUED case block.
1828 		 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss.
1829 		 */
1830 		wait_ops_state(p, SCX_OPSS_DISPATCHING);
1831 		BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
1832 		break;
1833 	}
1834 }
1835 
1836 static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
1837 {
1838 	if (!(p->scx.flags & SCX_TASK_QUEUED)) {
1839 		WARN_ON_ONCE(task_runnable(p));
1840 		return;
1841 	}
1842 
1843 	ops_dequeue(p, deq_flags);
1844 
1845 	/*
1846 	 * A currently running task which is going off @rq first gets dequeued
1847 	 * and then stops running. As we want running <-> stopping transitions
1848 	 * to be contained within runnable <-> quiescent transitions, trigger
1849 	 * ->stopping() early here instead of in put_prev_task_scx().
1850 	 *
1851 	 * @p may go through multiple stopping <-> running transitions between
1852 	 * here and put_prev_task_scx() if task attribute changes occur while
1853 	 * balance_scx() leaves @rq unlocked. However, they don't contain any
1854 	 * information meaningful to the BPF scheduler and can be suppressed by
1855 	 * skipping the callbacks if the task is !QUEUED.
1856 	 */
1857 	if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
1858 		update_curr_scx(rq);
1859 		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
1860 	}
1861 
1862 	if (SCX_HAS_OP(quiescent))
1863 		SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
1864 
1865 	if (deq_flags & SCX_DEQ_SLEEP)
1866 		p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
1867 	else
1868 		p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
1869 
1870 	p->scx.flags &= ~SCX_TASK_QUEUED;
1871 	rq->scx.nr_running--;
1872 	sub_nr_running(rq, 1);
1873 
1874 	dispatch_dequeue(rq, p);
1875 }
1876 
1877 static void yield_task_scx(struct rq *rq)
1878 {
1879 	struct task_struct *p = rq->curr;
1880 
1881 	if (SCX_HAS_OP(yield))
1882 		SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL);
1883 	else
1884 		p->scx.slice = 0;
1885 }
1886 
1887 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
1888 {
1889 	struct task_struct *from = rq->curr;
1890 
1891 	if (SCX_HAS_OP(yield))
1892 		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to);
1893 	else
1894 		return false;
1895 }
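
/*
 * Illustrative sketch (not part of this file): a minimal ops.yield() that
 * mirrors the built-in behavior above - expire the yielding task's slice and
 * report yield_to (@to != NULL) as unsupported. Struct_ops wiring is omitted.
 *
 *	bool example_yield(struct task_struct *from, struct task_struct *to)
 *	{
 *		if (to)
 *			return false;	// yield_to not implemented
 *		from->scx.slice = 0;	// give up the remaining slice
 *		return true;
 *	}
 */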
1896 
1897 #ifdef CONFIG_SMP
1898 /**
1899  * move_task_to_local_dsq - Move a task from a different rq to a local DSQ
1900  * @rq: rq to move the task into, currently locked
1901  * @p: task to move
1902  * @enq_flags: %SCX_ENQ_*
1903  *
1904  * Move @p which is currently on a different rq to @rq's local DSQ. The caller
1905  * must:
1906  *
1907  * 1. Start with exclusive access to @p either through its DSQ lock or
1908  *    %SCX_OPSS_DISPATCHING flag.
1909  *
1910  * 2. Set @p->scx.holding_cpu to raw_smp_processor_id().
1911  *
1912  * 3. Remember task_rq(@p). Release the exclusive access so that we don't
1913  *    deadlock with dequeue.
1914  *
1915  * 4. Lock @rq and the task_rq from #3.
1916  *
1917  * 5. Call this function.
1918  *
1919  * Returns %true if @p was successfully moved. %false after racing dequeue and
1920  * losing.
1921  */
1922 static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p,
1923 				   u64 enq_flags)
1924 {
1925 	struct rq *task_rq;
1926 
1927 	lockdep_assert_rq_held(rq);
1928 
1929 	/*
1930 	 * If dequeue got to @p while we were trying to lock both rq's, it'd
1931 	 * have cleared @p->scx.holding_cpu to -1. Other CPUs may have updated
1932 	 * it to different values afterwards but, as this operation can neither
1933 	 * be preempted nor recurse, @p->scx.holding_cpu can never become
1934 	 * raw_smp_processor_id() again before we're done. Thus, we can tell
1935 	 * whether we lost to dequeue by testing whether @p->scx.holding_cpu is
1936 	 * still raw_smp_processor_id().
1937 	 *
1938 	 * See dispatch_dequeue() for the counterpart.
1939 	 */
1940 	if (unlikely(p->scx.holding_cpu != raw_smp_processor_id()))
1941 		return false;
1942 
1943 	/* @p->rq couldn't have changed if we're still the holding cpu */
1944 	task_rq = task_rq(p);
1945 	lockdep_assert_rq_held(task_rq);
1946 
1947 	WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr));
1948 	deactivate_task(task_rq, p, 0);
1949 	set_task_cpu(p, cpu_of(rq));
1950 	p->scx.sticky_cpu = cpu_of(rq);
1951 
1952 	/*
1953 	 * We want to pass scx-specific enq_flags but activate_task() will
1954 	 * truncate the upper 32 bits. As we own @rq, we can pass them through
1955 	 * @rq->scx.extra_enq_flags instead.
1956 	 */
1957 	WARN_ON_ONCE(rq->scx.extra_enq_flags);
1958 	rq->scx.extra_enq_flags = enq_flags;
1959 	activate_task(rq, p, 0);
1960 	rq->scx.extra_enq_flags = 0;
1961 
1962 	return true;
1963 }
1964 
1965 /**
1966  * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked
1967  * @rq: current rq which is locked
1968  * @rf: rq_flags to use when unlocking @rq
1969  * @src_rq: rq to move task from
1970  * @dst_rq: rq to move task to
1971  *
1972  * We're holding @rq lock and trying to dispatch a task from @src_rq to
1973  * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether
1974  * @rq stays locked isn't important as long as the state is restored after
1975  * dispatch_to_local_dsq_unlock().
1976  */
1977 static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf,
1978 				       struct rq *src_rq, struct rq *dst_rq)
1979 {
1980 	rq_unpin_lock(rq, rf);
1981 
1982 	if (src_rq == dst_rq) {
1983 		raw_spin_rq_unlock(rq);
1984 		raw_spin_rq_lock(dst_rq);
1985 	} else if (rq == src_rq) {
1986 		double_lock_balance(rq, dst_rq);
1987 		rq_repin_lock(rq, rf);
1988 	} else if (rq == dst_rq) {
1989 		double_lock_balance(rq, src_rq);
1990 		rq_repin_lock(rq, rf);
1991 	} else {
1992 		raw_spin_rq_unlock(rq);
1993 		double_rq_lock(src_rq, dst_rq);
1994 	}
1995 }
1996 
1997 /**
1998  * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock()
1999  * @rq: current rq which is locked
2000  * @rf: rq_flags to use when unlocking @rq
2001  * @src_rq: rq to move task from
2002  * @dst_rq: rq to move task to
2003  *
2004  * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return.
2005  */
2006 static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf,
2007 					 struct rq *src_rq, struct rq *dst_rq)
2008 {
2009 	if (src_rq == dst_rq) {
2010 		raw_spin_rq_unlock(dst_rq);
2011 		raw_spin_rq_lock(rq);
2012 		rq_repin_lock(rq, rf);
2013 	} else if (rq == src_rq) {
2014 		double_unlock_balance(rq, dst_rq);
2015 	} else if (rq == dst_rq) {
2016 		double_unlock_balance(rq, src_rq);
2017 	} else {
2018 		double_rq_unlock(src_rq, dst_rq);
2019 		raw_spin_rq_lock(rq);
2020 		rq_repin_lock(rq, rf);
2021 	}
2022 }
2023 #endif	/* CONFIG_SMP */
2024 
2025 static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq,
2026 			       struct task_struct *p)
2027 {
2028 	lockdep_assert_held(&dsq->lock);	/* released on return */
2029 
2030 	/* @dsq is locked and @p is on this rq */
2031 	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
2032 	task_unlink_from_dsq(p, dsq);
2033 	list_add_tail(&p->scx.dsq_node.list, &rq->scx.local_dsq.list);
2034 	dsq_mod_nr(dsq, -1);
2035 	dsq_mod_nr(&rq->scx.local_dsq, 1);
2036 	p->scx.dsq = &rq->scx.local_dsq;
2037 	raw_spin_unlock(&dsq->lock);
2038 }
2039 
2040 #ifdef CONFIG_SMP
2041 /*
2042  * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p
2043  * can be pulled to @rq.
2044  */
2045 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq)
2046 {
2047 	int cpu = cpu_of(rq);
2048 
2049 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
2050 		return false;
2051 	if (unlikely(is_migration_disabled(p)))
2052 		return false;
2053 	if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p)))
2054 		return false;
2055 	if (!scx_rq_online(rq))
2056 		return false;
2057 	return true;
2058 }
2059 
2060 static bool consume_remote_task(struct rq *rq, struct rq_flags *rf,
2061 				struct scx_dispatch_q *dsq,
2062 				struct task_struct *p, struct rq *task_rq)
2063 {
2064 	bool moved = false;
2065 
2066 	lockdep_assert_held(&dsq->lock);	/* released on return */
2067 
2068 	/*
2069 	 * @dsq is locked and @p is on a remote rq. @p is currently protected by
2070 	 * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab
2071 	 * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the
2072 	 * rq lock or fail, do a little dancing from our side. See
2073 	 * move_task_to_local_dsq().
2074 	 */
2075 	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
2076 	task_unlink_from_dsq(p, dsq);
2077 	dsq_mod_nr(dsq, -1);
2078 	p->scx.holding_cpu = raw_smp_processor_id();
2079 	raw_spin_unlock(&dsq->lock);
2080 
2081 	rq_unpin_lock(rq, rf);
2082 	double_lock_balance(rq, task_rq);
2083 	rq_repin_lock(rq, rf);
2084 
2085 	moved = move_task_to_local_dsq(rq, p, 0);
2086 
2087 	double_unlock_balance(rq, task_rq);
2088 
2089 	return moved;
2090 }
2091 #else	/* CONFIG_SMP */
2092 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) { return false; }
2093 static bool consume_remote_task(struct rq *rq, struct rq_flags *rf,
2094 				struct scx_dispatch_q *dsq,
2095 				struct task_struct *p, struct rq *task_rq) { return false; }
2096 #endif	/* CONFIG_SMP */
2097 
2098 static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf,
2099 			       struct scx_dispatch_q *dsq)
2100 {
2101 	struct task_struct *p;
2102 retry:
2103 	if (list_empty(&dsq->list))
2104 		return false;
2105 
2106 	raw_spin_lock(&dsq->lock);
2107 
2108 	list_for_each_entry(p, &dsq->list, scx.dsq_node.list) {
2109 		struct rq *task_rq = task_rq(p);
2110 
2111 		if (rq == task_rq) {
2112 			consume_local_task(rq, dsq, p);
2113 			return true;
2114 		}
2115 
2116 		if (task_can_run_on_remote_rq(p, rq)) {
2117 			if (likely(consume_remote_task(rq, rf, dsq, p, task_rq)))
2118 				return true;
2119 			goto retry;
2120 		}
2121 	}
2122 
2123 	raw_spin_unlock(&dsq->lock);
2124 	return false;
2125 }
2126 
2127 enum dispatch_to_local_dsq_ret {
2128 	DTL_DISPATCHED,		/* successfully dispatched */
2129 	DTL_LOST,		/* lost race to dequeue */
2130 	DTL_NOT_LOCAL,		/* destination is not a local DSQ */
2131 	DTL_INVALID,		/* invalid local dsq_id */
2132 };
2133 
2134 /**
2135  * dispatch_to_local_dsq - Dispatch a task to a local dsq
2136  * @rq: current rq which is locked
2137  * @rf: rq_flags to use when unlocking @rq
2138  * @dsq_id: destination dsq ID
2139  * @p: task to dispatch
2140  * @enq_flags: %SCX_ENQ_*
2141  *
2142  * We're holding @rq lock and want to dispatch @p to the local DSQ identified by
2143  * @dsq_id. This function performs all the synchronization dancing needed
2144  * because local DSQs are protected with rq locks.
2145  *
2146  * The caller must have exclusive ownership of @p (e.g. through
2147  * %SCX_OPSS_DISPATCHING).
2148  */
2149 static enum dispatch_to_local_dsq_ret
2150 dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id,
2151 		      struct task_struct *p, u64 enq_flags)
2152 {
2153 	struct rq *src_rq = task_rq(p);
2154 	struct rq *dst_rq;
2155 
2156 	/*
2157 	 * We're synchronized against dequeue through DISPATCHING. As @p can't
2158 	 * be dequeued, its task_rq and cpus_allowed are stable too.
2159 	 */
2160 	if (dsq_id == SCX_DSQ_LOCAL) {
2161 		dst_rq = rq;
2162 	} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
2163 		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
2164 
2165 		if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
2166 			return DTL_INVALID;
2167 		dst_rq = cpu_rq(cpu);
2168 	} else {
2169 		return DTL_NOT_LOCAL;
2170 	}
2171 
2172 	/* if dispatching to @rq that @p is already on, no lock dancing needed */
2173 	if (rq == src_rq && rq == dst_rq) {
2174 		dispatch_enqueue(&dst_rq->scx.local_dsq, p,
2175 				 enq_flags | SCX_ENQ_CLEAR_OPSS);
2176 		return DTL_DISPATCHED;
2177 	}
2178 
2179 #ifdef CONFIG_SMP
2180 	if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) {
2181 		struct rq *locked_dst_rq = dst_rq;
2182 		bool dsp;
2183 
2184 		/*
2185 		 * @p is on a possibly remote @src_rq which we need to lock to
2186 		 * move the task. If dequeue is in progress, it'd be locking
2187 		 * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq
2188 		 * lock while holding DISPATCHING.
2189 		 *
2190 		 * As DISPATCHING guarantees that @p is wholly ours, we can
2191 		 * pretend that we're moving from a DSQ and use the same
2192 		 * mechanism - mark the task under transfer with holding_cpu,
2193 		 * release DISPATCHING and then follow the same protocol.
2194 		 */
2195 		p->scx.holding_cpu = raw_smp_processor_id();
2196 
2197 		/* store_release ensures that dequeue sees the above */
2198 		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
2199 
2200 		dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq);
2201 
2202 		/*
2203 		 * We don't require the BPF scheduler to avoid dispatching to
2204 		 * offline CPUs mostly for convenience but also because CPUs can
2205 		 * go offline between scx_bpf_dispatch() calls and here. If @p
2206 		 * is destined to an offline CPU, queue it on its current CPU
2207 		 * instead, which should always be safe. As this is an allowed
2208 		 * behavior, don't trigger an ops error.
2209 		 */
2210 		if (!scx_rq_online(dst_rq))
2211 			dst_rq = src_rq;
2212 
2213 		if (src_rq == dst_rq) {
2214 			/*
2215 			 * As @p is staying on the same rq, there's no need to
2216 			 * go through the full deactivate/activate cycle.
2217 			 * Optimize by abbreviating the operations in
2218 			 * move_task_to_local_dsq().
2219 			 */
2220 			dsp = p->scx.holding_cpu == raw_smp_processor_id();
2221 			if (likely(dsp)) {
2222 				p->scx.holding_cpu = -1;
2223 				dispatch_enqueue(&dst_rq->scx.local_dsq, p,
2224 						 enq_flags);
2225 			}
2226 		} else {
2227 			dsp = move_task_to_local_dsq(dst_rq, p, enq_flags);
2228 		}
2229 
2230 		/* if the destination CPU is idle, wake it up */
2231 		if (dsp && sched_class_above(p->sched_class,
2232 					     dst_rq->curr->sched_class))
2233 			resched_curr(dst_rq);
2234 
2235 		dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq);
2236 
2237 		return dsp ? DTL_DISPATCHED : DTL_LOST;
2238 	}
2239 #endif	/* CONFIG_SMP */
2240 
2241 	scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
2242 		      cpu_of(dst_rq), p->comm, p->pid);
2243 	return DTL_INVALID;
2244 }
2245 
2246 /**
2247  * finish_dispatch - Asynchronously finish dispatching a task
2248  * @rq: current rq which is locked
2249  * @rf: rq_flags to use when unlocking @rq
2250  * @p: task to finish dispatching
2251  * @qseq_at_dispatch: qseq when @p started getting dispatched
2252  * @dsq_id: destination DSQ ID
2253  * @enq_flags: %SCX_ENQ_*
2254  *
2255  * Dispatching to local DSQs may need to wait for queueing to complete or
2256  * require rq lock dancing. As we don't wanna do either while inside
2257  * ops.dispatch() to avoid locking order inversion, we split dispatching into
2258  * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the
2259  * task and its qseq. Once ops.dispatch() returns, this function is called to
2260  * finish up.
2261  *
2262  * There is no guarantee that @p is still valid for dispatching or even that it
2263  * was valid in the first place. Make sure that the task is still owned by the
2264  * BPF scheduler and claim the ownership before dispatching.
2265  */
2266 static void finish_dispatch(struct rq *rq, struct rq_flags *rf,
2267 			    struct task_struct *p,
2268 			    unsigned long qseq_at_dispatch,
2269 			    u64 dsq_id, u64 enq_flags)
2270 {
2271 	struct scx_dispatch_q *dsq;
2272 	unsigned long opss;
2273 
2274 	touch_core_sched_dispatch(rq, p);
2275 retry:
2276 	/*
2277 	 * No need for _acquire here. @p is accessed only after a successful
2278 	 * try_cmpxchg to DISPATCHING.
2279 	 */
2280 	opss = atomic_long_read(&p->scx.ops_state);
2281 
2282 	switch (opss & SCX_OPSS_STATE_MASK) {
2283 	case SCX_OPSS_DISPATCHING:
2284 	case SCX_OPSS_NONE:
2285 		/* someone else already got to it */
2286 		return;
2287 	case SCX_OPSS_QUEUED:
2288 		/*
2289 		 * If qseq doesn't match, @p has gone through at least one
2290 		 * dispatch/dequeue and re-enqueue cycle between
2291 		 * scx_bpf_dispatch() and here and we have no claim on it.
2292 		 */
2293 		if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
2294 			return;
2295 
2296 		/*
2297 		 * While we know @p is accessible, we don't yet have a claim on
2298 		 * it - the BPF scheduler is allowed to dispatch tasks
2299 		 * spuriously and there can be a racing dequeue attempt. Let's
2300 		 * claim @p by atomically transitioning it from QUEUED to
2301 		 * DISPATCHING.
2302 		 */
2303 		if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
2304 						   SCX_OPSS_DISPATCHING)))
2305 			break;
2306 		goto retry;
2307 	case SCX_OPSS_QUEUEING:
2308 		/*
2309 		 * do_enqueue_task() is in the process of transferring the task
2310 		 * to the BPF scheduler while holding @p's rq lock. As we aren't
2311 		 * holding any kernel or BPF resource that the enqueue path may
2312 		 * depend upon, it's safe to wait.
2313 		 */
2314 		wait_ops_state(p, opss);
2315 		goto retry;
2316 	}
2317 
2318 	BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
2319 
2320 	switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) {
2321 	case DTL_DISPATCHED:
2322 		break;
2323 	case DTL_LOST:
2324 		break;
2325 	case DTL_INVALID:
2326 		dsq_id = SCX_DSQ_GLOBAL;
2327 		fallthrough;
2328 	case DTL_NOT_LOCAL:
2329 		dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()),
2330 					    dsq_id, p);
2331 		dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
2332 		break;
2333 	}
2334 }
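
/*
 * Illustrative sketch (not part of this file): from ops.dispatch(),
 * scx_bpf_dispatch() only records the task in the per-CPU dispatch buffer;
 * the actual transfer handled above happens once ops.dispatch() returns (or
 * the buffer is flushed). A typical dispatch callback just moves work from a
 * shared DSQ into the local one, here assuming scx_bpf_consume() and the
 * made-up MY_DSQ_ID from the earlier sketch. Struct_ops wiring is omitted.
 *
 *	void example_dispatch(s32 cpu, struct task_struct *prev)
 *	{
 *		// Pull one task from the shared DSQ onto this CPU's local DSQ.
 *		scx_bpf_consume(MY_DSQ_ID);
 *	}
 */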
2335 
2336 static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf)
2337 {
2338 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
2339 	u32 u;
2340 
2341 	for (u = 0; u < dspc->cursor; u++) {
2342 		struct scx_dsp_buf_ent *ent = &dspc->buf[u];
2343 
2344 		finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id,
2345 				ent->enq_flags);
2346 	}
2347 
2348 	dspc->nr_tasks += dspc->cursor;
2349 	dspc->cursor = 0;
2350 }
2351 
2352 static int balance_one(struct rq *rq, struct task_struct *prev,
2353 		       struct rq_flags *rf, bool local)
2354 {
2355 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
2356 	bool prev_on_scx = prev->sched_class == &ext_sched_class;
2357 	int nr_loops = SCX_DSP_MAX_LOOPS;
2358 	bool has_tasks = false;
2359 
2360 	lockdep_assert_rq_held(rq);
2361 	rq->scx.flags |= SCX_RQ_BALANCING;
2362 
2363 	if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
2364 	    unlikely(rq->scx.cpu_released)) {
2365 		/*
2366 		 * If the previous sched_class for the current CPU was not SCX,
2367 		 * notify the BPF scheduler that it again has control of the
2368 		 * core. This callback complements ->cpu_release(), which is
2369 		 * emitted in scx_next_task_picked().
2370 		 */
2371 		if (SCX_HAS_OP(cpu_acquire))
2372 			SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL);
2373 		rq->scx.cpu_released = false;
2374 	}
2375 
2376 	if (prev_on_scx) {
2377 		WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP));
2378 		update_curr_scx(rq);
2379 
2380 		/*
2381 		 * If @prev is runnable & has slice left, it has priority and
2382 		 * fetching more just increases latency for the fetched tasks.
2383 		 * Tell put_prev_task_scx() to put @prev on local_dsq. If the
2384 		 * BPF scheduler wants to handle this explicitly, it should
2385 		 * implement ->cpu_released().
2386 		 *
2387 		 * See scx_ops_disable_workfn() for the explanation on the
2388 		 * bypassing test.
2389 		 *
2390 		 * When balancing a remote CPU for core-sched, there won't be a
2391 		 * following put_prev_task_scx() call and we don't own
2392 		 * %SCX_TASK_BAL_KEEP. Instead, pick_task_scx() will test the
2393 		 * same conditions later and pick @rq->curr accordingly.
2394 		 */
2395 		if ((prev->scx.flags & SCX_TASK_QUEUED) &&
2396 		    prev->scx.slice && !scx_ops_bypassing()) {
2397 			if (local)
2398 				prev->scx.flags |= SCX_TASK_BAL_KEEP;
2399 			goto has_tasks;
2400 		}
2401 	}
2402 
2403 	/* if there already are tasks to run, nothing to do */
2404 	if (rq->scx.local_dsq.nr)
2405 		goto has_tasks;
2406 
2407 	if (consume_dispatch_q(rq, rf, &scx_dsq_global))
2408 		goto has_tasks;
2409 
2410 	if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq))
2411 		goto out;
2412 
2413 	dspc->rq = rq;
2414 	dspc->rf = rf;
2415 
2416 	/*
2417 	 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
2418 	 * the local DSQ might still end up empty after a successful
2419 	 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
2420 	 * produced some tasks, retry. The BPF scheduler may depend on this
2421 	 * looping behavior to simplify its implementation.
2422 	 */
2423 	do {
2424 		dspc->nr_tasks = 0;
2425 
2426 		SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
2427 			    prev_on_scx ? prev : NULL);
2428 
2429 		flush_dispatch_buf(rq, rf);
2430 
2431 		if (rq->scx.local_dsq.nr)
2432 			goto has_tasks;
2433 		if (consume_dispatch_q(rq, rf, &scx_dsq_global))
2434 			goto has_tasks;
2435 
2436 		/*
2437 		 * ops.dispatch() can trap us in this loop by repeatedly
2438 		 * dispatching ineligible tasks. Break out once in a while to
2439 		 * allow the watchdog to run. As IRQ can't be enabled in
2440 		 * balance(), we want to complete this scheduling cycle and then
2441 		 * start a new one. IOW, we want to call resched_curr() on the
2442 		 * next, most likely idle, task, not the current one. Use
2443 		 * scx_bpf_kick_cpu() for deferred kicking.
2444 		 */
2445 		if (unlikely(!--nr_loops)) {
2446 			scx_bpf_kick_cpu(cpu_of(rq), 0);
2447 			break;
2448 		}
2449 	} while (dspc->nr_tasks);
2450 
2451 	goto out;
2452 
2453 has_tasks:
2454 	has_tasks = true;
2455 out:
2456 	rq->scx.flags &= ~SCX_RQ_BALANCING;
2457 	return has_tasks;
2458 }
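
/*
 * Illustrative sketch (not part of this file): the loop above keeps calling
 * ops.dispatch() as long as it produced tasks but the local DSQ is still
 * empty, so a dispatch callback can top up from its DSQs in priority order
 * and rely on being invoked again if the fetched tasks were lost. HI_DSQ and
 * LO_DSQ are made-up example DSQ ids; scx_bpf_consume() is assumed.
 *
 *	void example_dispatch(s32 cpu, struct task_struct *prev)
 *	{
 *		// Prefer the high-priority DSQ, fall back to the low one.
 *		if (scx_bpf_consume(HI_DSQ))
 *			return;
 *		scx_bpf_consume(LO_DSQ);
 *	}
 */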
2459 
2460 static int balance_scx(struct rq *rq, struct task_struct *prev,
2461 		       struct rq_flags *rf)
2462 {
2463 	int ret;
2464 
2465 	ret = balance_one(rq, prev, rf, true);
2466 
2467 #ifdef CONFIG_SCHED_SMT
2468 	/*
2469 	 * When core-sched is enabled, this ops.balance() call will be followed
2470 	 * by put_prev_scx() and pick_task_scx() on this CPU and pick_task_scx()
2471 	 * on the SMT siblings. Balance the siblings too.
2472 	 */
2473 	if (sched_core_enabled(rq)) {
2474 		const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
2475 		int scpu;
2476 
2477 		for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
2478 			struct rq *srq = cpu_rq(scpu);
2479 			struct rq_flags srf;
2480 			struct task_struct *sprev = srq->curr;
2481 
2482 			/*
2483 			 * While core-scheduling, rq lock is shared among
2484 			 * siblings but the debug annotations and rq clock
2485 			 * aren't. Do pinning dance to transfer the ownership.
2486 			 */
2487 			WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq));
2488 			rq_unpin_lock(rq, rf);
2489 			rq_pin_lock(srq, &srf);
2490 
2491 			update_rq_clock(srq);
2492 			balance_one(srq, sprev, &srf, false);
2493 
2494 			rq_unpin_lock(srq, &srf);
2495 			rq_repin_lock(rq, rf);
2496 		}
2497 	}
2498 #endif
2499 	return ret;
2500 }
2501 
2502 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
2503 {
2504 	if (p->scx.flags & SCX_TASK_QUEUED) {
2505 		/*
2506 		 * Core-sched might decide to execute @p before it is
2507 		 * dispatched. Call ops_dequeue() to notify the BPF scheduler.
2508 		 */
2509 		ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC);
2510 		dispatch_dequeue(rq, p);
2511 	}
2512 
2513 	p->se.exec_start = rq_clock_task(rq);
2514 
2515 	/* see dequeue_task_scx() on why we skip when !QUEUED */
2516 	if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
2517 		SCX_CALL_OP_TASK(SCX_KF_REST, running, p);
2518 
2519 	clr_task_runnable(p, true);
2520 
2521 	/*
2522 	 * @p is getting newly scheduled or got kicked after someone updated its
2523 	 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick().
2524 	 */
2525 	if ((p->scx.slice == SCX_SLICE_INF) !=
2526 	    (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
2527 		if (p->scx.slice == SCX_SLICE_INF)
2528 			rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
2529 		else
2530 			rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;
2531 
2532 		sched_update_tick_dependency(rq);
2533 
2534 		/*
2535 		 * For now, let's refresh the load_avgs just when transitioning
2536 		 * in and out of nohz. In the future, we might want to add a
2537 		 * mechanism which calls the following periodically on
2538 		 * tick-stopped CPUs.
2539 		 */
2540 		update_other_load_avgs(rq);
2541 	}
2542 }
2543 
2544 static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
2545 {
2546 #ifndef CONFIG_SMP
2547 	/*
2548 	 * UP workaround.
2549 	 *
2550 	 * Because SCX may transfer tasks across CPUs during dispatch, dispatch
2551 	 * is performed from its balance operation which isn't called in UP.
2552 	 * Let's work around by calling it from the operations which come right
2553 	 * after.
2554 	 *
2555 	 * 1. If the prev task is on SCX, pick_next_task() calls
2556 	 *    .put_prev_task() right after. As .put_prev_task() is also called
2557 	 *    from other places, we need to distinguish these calls, which can be
2558 	 *    done by looking at the previous task's state - if still queued or
2559 	 *    dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task().
2560 	 *    This case is handled here.
2561 	 *
2562 	 * 2. If the prev task is not on SCX, the first following call into SCX
2563 	 *    will be .pick_next_task(), which is covered by calling
2564 	 *    balance_scx() from pick_next_task_scx().
2565 	 *
2566 	 * Note that we can't merge the first case into the second as
2567 	 * balance_scx() must be called before the previous SCX task goes
2568 	 * through put_prev_task_scx().
2569 	 *
2570 	 * As UP doesn't transfer tasks around, balance_scx() doesn't need @rf.
2571 	 * Pass in %NULL.
2572 	 */
2573 	if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP))
2574 		balance_scx(rq, p, NULL);
2575 #endif
2576 
2577 	update_curr_scx(rq);
2578 
2579 	/* see dequeue_task_scx() on why we skip when !QUEUED */
2580 	if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
2581 		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true);
2582 
2583 	/*
2584 	 * If we're being called from put_prev_task_balance(), balance_scx() may
2585 	 * have decided that @p should keep running.
2586 	 */
2587 	if (p->scx.flags & SCX_TASK_BAL_KEEP) {
2588 		p->scx.flags &= ~SCX_TASK_BAL_KEEP;
2589 		set_task_runnable(rq, p);
2590 		dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
2591 		return;
2592 	}
2593 
2594 	if (p->scx.flags & SCX_TASK_QUEUED) {
2595 		set_task_runnable(rq, p);
2596 
2597 		/*
2598 		 * If @p has slice left and balance_scx() didn't tag it for
2599 		 * keeping, @p is getting preempted by a higher priority
2600 		 * scheduler class or core-sched forcing a different task. Leave
2601 		 * it at the head of the local DSQ.
2602 		 */
2603 		if (p->scx.slice && !scx_ops_bypassing()) {
2604 			dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
2605 			return;
2606 		}
2607 
2608 		/*
2609 		 * If we're in the pick_next_task path, balance_scx() should
2610 		 * have already populated the local DSQ if there are any other
2611 		 * available tasks. If empty, tell ops.enqueue() that @p is the
2612 		 * only one available for this cpu. ops.enqueue() should put it
2613 		 * on the local DSQ so that the subsequent pick_next_task_scx()
2614 		 * can find the task unless it wants to trigger a separate
2615 		 * follow-up scheduling event.
2616 		 */
2617 		if (list_empty(&rq->scx.local_dsq.list))
2618 			do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
2619 		else
2620 			do_enqueue_task(rq, p, 0, -1);
2621 	}
2622 }
2623 
2624 static struct task_struct *first_local_task(struct rq *rq)
2625 {
2626 	return list_first_entry_or_null(&rq->scx.local_dsq.list,
2627 					struct task_struct, scx.dsq_node.list);
2628 }
2629 
2630 static struct task_struct *pick_next_task_scx(struct rq *rq)
2631 {
2632 	struct task_struct *p;
2633 
2634 #ifndef CONFIG_SMP
2635 	/* UP workaround - see the comment at the head of put_prev_task_scx() */
2636 	if (unlikely(rq->curr->sched_class != &ext_sched_class))
2637 		balance_scx(rq, rq->curr, NULL);
2638 #endif
2639 
2640 	p = first_local_task(rq);
2641 	if (!p)
2642 		return NULL;
2643 
2644 	set_next_task_scx(rq, p, true);
2645 
2646 	if (unlikely(!p->scx.slice)) {
2647 		if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
2648 			printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
2649 					p->comm, p->pid);
2650 			scx_warned_zero_slice = true;
2651 		}
2652 		p->scx.slice = SCX_SLICE_DFL;
2653 	}
2654 
2655 	return p;
2656 }
2657 
2658 #ifdef CONFIG_SCHED_CORE
2659 /**
2660  * scx_prio_less - Task ordering for core-sched
2661  * @a: task A
2662  * @b: task B
2663  *
2664  * Core-sched is implemented as an additional scheduling layer on top of the
2665  * usual sched_class'es and needs to find out the expected task ordering. For
2666  * SCX, core-sched calls this function to interrogate the task ordering.
2667  *
2668  * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
2669  * to implement the default task ordering. The older the timestamp, the higher
2670  * priority the task - the global FIFO ordering matching the default scheduling
2671  * behavior.
2672  *
2673  * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
2674  * implement FIFO ordering within each local DSQ. See pick_task_scx().
2675  */
2676 bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
2677 		   bool in_fi)
2678 {
2679 	/*
2680 	 * The const qualifiers are dropped from task_struct pointers when
2681 	 * calling ops.core_sched_before(). Accesses are controlled by the
2682 	 * verifier.
2683 	 */
2684 	if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing())
2685 		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
2686 					      (struct task_struct *)a,
2687 					      (struct task_struct *)b);
2688 	else
2689 		return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
2690 }
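
/*
 * Illustrative sketch (not part of this file): ops.core_sched_before()
 * overrides the default core_sched_at ordering above. A vtime-based BPF
 * scheduler might order siblings by the virtual time it already maintains,
 * here assumed to be kept in p->scx.dsq_vtime. Struct_ops wiring is omitted.
 *
 *	bool example_core_sched_before(struct task_struct *a, struct task_struct *b)
 *	{
 *		// Smaller vtime == higher priority, mirroring dispatch order.
 *		return (s64)(a->scx.dsq_vtime - b->scx.dsq_vtime) < 0;
 *	}
 */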
2691 
2692 /**
2693  * pick_task_scx - Pick a candidate task for core-sched
2694  * @rq: rq to pick the candidate task from
2695  *
2696  * Core-sched calls this function on each SMT sibling to determine the next
2697  * tasks to run on the SMT siblings. balance_one() has been called on all
2698  * siblings and put_prev_task_scx() has been called only for the current CPU.
2699  *
2700  * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look
2701  * at the first task in the local DSQ. @rq->curr has to be considered explicitly
2702  * to mimic %SCX_TASK_BAL_KEEP.
2703  */
2704 static struct task_struct *pick_task_scx(struct rq *rq)
2705 {
2706 	struct task_struct *curr = rq->curr;
2707 	struct task_struct *first = first_local_task(rq);
2708 
2709 	if (curr->scx.flags & SCX_TASK_QUEUED) {
2710 		/* is curr the only runnable task? */
2711 		if (!first)
2712 			return curr;
2713 
2714 		/*
2715 		 * Does curr trump first? We can always go by core_sched_at for
2716 		 * this comparison as it represents global FIFO ordering when
2717 		 * the default core-sched ordering is used and local-DSQ FIFO
2718 		 * ordering otherwise.
2719 		 *
2720 		 * We can have a task with an earlier timestamp on the DSQ. For
2721 		 * example, when a current task is preempted by a sibling
2722 		 * picking a different cookie, the task would be requeued at the
2723 		 * head of the local DSQ with an earlier timestamp than the
2724 		 * core-sched picked next task. Besides, the BPF scheduler may
2725 		 * dispatch any tasks to the local DSQ anytime.
2726 		 */
2727 		if (curr->scx.slice && time_before64(curr->scx.core_sched_at,
2728 						     first->scx.core_sched_at))
2729 			return curr;
2730 	}
2731 
2732 	return first;	/* this may be %NULL */
2733 }
2734 #endif	/* CONFIG_SCHED_CORE */
2735 
2736 static enum scx_cpu_preempt_reason
2737 preempt_reason_from_class(const struct sched_class *class)
2738 {
2739 #ifdef CONFIG_SMP
2740 	if (class == &stop_sched_class)
2741 		return SCX_CPU_PREEMPT_STOP;
2742 #endif
2743 	if (class == &dl_sched_class)
2744 		return SCX_CPU_PREEMPT_DL;
2745 	if (class == &rt_sched_class)
2746 		return SCX_CPU_PREEMPT_RT;
2747 	return SCX_CPU_PREEMPT_UNKNOWN;
2748 }
2749 
2750 void scx_next_task_picked(struct rq *rq, struct task_struct *p,
2751 			  const struct sched_class *active)
2752 {
2753 	lockdep_assert_rq_held(rq);
2754 
2755 	if (!scx_enabled())
2756 		return;
2757 #ifdef CONFIG_SMP
2758 	/*
2759 	 * Pairs with the smp_load_acquire() issued by a CPU in
2760 	 * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
2761 	 * resched.
2762 	 */
2763 	smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
2764 #endif
2765 	if (!static_branch_unlikely(&scx_ops_cpu_preempt))
2766 		return;
2767 
2768 	/*
2769 	 * The callback is conceptually meant to convey that the CPU is no
2770 	 * longer under the control of SCX. Therefore, don't invoke the
2771 	 * callback if the CPU is staying on SCX, or going idle (in which
2772 	 * case the SCX scheduler has actively decided not to schedule any
2773 	 * tasks on the CPU).
2774 	 */
2775 	if (likely(active >= &ext_sched_class))
2776 		return;
2777 
2778 	/*
2779 	 * At this point we know that SCX was preempted by a higher priority
2780 	 * sched_class, so invoke the ->cpu_release() callback if we have not
2781 	 * done so already. We only send the callback once between SCX being
2782 	 * preempted, and it regaining control of the CPU.
2783 	 *
2784 	 * ->cpu_release() complements ->cpu_acquire(), which is emitted the
2785 	 *  next time that balance_scx() is invoked.
2786 	 */
2787 	if (!rq->scx.cpu_released) {
2788 		if (SCX_HAS_OP(cpu_release)) {
2789 			struct scx_cpu_release_args args = {
2790 				.reason = preempt_reason_from_class(active),
2791 				.task = p,
2792 			};
2793 
2794 			SCX_CALL_OP(SCX_KF_CPU_RELEASE,
2795 				    cpu_release, cpu_of(rq), &args);
2796 		}
2797 		rq->scx.cpu_released = true;
2798 	}
2799 }
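
/*
 * Illustrative sketch (not part of this file): a common ops.cpu_release()
 * response is to pull the tasks already queued on the released CPU's local
 * DSQ back into the scheduler so they can be placed elsewhere, here using
 * scx_bpf_reenqueue_local(), assumed to be available in the cpu_release
 * context. Struct_ops wiring is omitted.
 *
 *	void example_cpu_release(s32 cpu, struct scx_cpu_release_args *args)
 *	{
 *		// Re-enqueue tasks stranded on @cpu's local DSQ.
 *		scx_bpf_reenqueue_local();
 *	}
 */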
2800 
2801 #ifdef CONFIG_SMP
2802 
2803 static bool test_and_clear_cpu_idle(int cpu)
2804 {
2805 #ifdef CONFIG_SCHED_SMT
2806 	/*
2807 	 * SMT mask should be cleared whether we can claim @cpu or not. The SMT
2808 	 * cluster is not wholly idle either way. This also prevents
2809 	 * scx_pick_idle_cpu() from getting caught in an infinite loop.
2810 	 */
2811 	if (sched_smt_active()) {
2812 		const struct cpumask *smt = cpu_smt_mask(cpu);
2813 
2814 		/*
2815 		 * If offline, @cpu is not its own sibling and
2816 		 * scx_pick_idle_cpu() can get caught in an infinite loop as
2817 		 * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
2818 		 * is eventually cleared.
2819 		 */
2820 		if (cpumask_intersects(smt, idle_masks.smt))
2821 			cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
2822 		else if (cpumask_test_cpu(cpu, idle_masks.smt))
2823 			__cpumask_clear_cpu(cpu, idle_masks.smt);
2824 	}
2825 #endif
2826 	return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
2827 }
2828 
2829 static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
2830 {
2831 	int cpu;
2832 
2833 retry:
2834 	if (sched_smt_active()) {
2835 		cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
2836 		if (cpu < nr_cpu_ids)
2837 			goto found;
2838 
2839 		if (flags & SCX_PICK_IDLE_CORE)
2840 			return -EBUSY;
2841 	}
2842 
2843 	cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
2844 	if (cpu >= nr_cpu_ids)
2845 		return -EBUSY;
2846 
2847 found:
2848 	if (test_and_clear_cpu_idle(cpu))
2849 		return cpu;
2850 	else
2851 		goto retry;
2852 }
2853 
2854 static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
2855 			      u64 wake_flags, bool *found)
2856 {
2857 	s32 cpu;
2858 
2859 	*found = false;
2860 
2861 	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
2862 		scx_ops_error("built-in idle tracking is disabled");
2863 		return prev_cpu;
2864 	}
2865 
2866 	/*
2867 	 * If WAKE_SYNC, the waker's local DSQ is empty, and the system is
2868 	 * underutilized, wake up @p to the local DSQ of the waker. Checking
2869 	 * only for an empty local DSQ is insufficient as it could give the
2870 	 * wakee an unfair advantage when the system is oversaturated.
2871 	 * Checking only for the presence of idle CPUs is also insufficient as
2872 	 * the local DSQ of the waker could have tasks piled up on it even if
2873 	 * there is an idle core elsewhere on the system.
2874 	 */
2875 	cpu = smp_processor_id();
2876 	if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 &&
2877 	    !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) &&
2878 	    cpu_rq(cpu)->scx.local_dsq.nr == 0) {
2879 		if (cpumask_test_cpu(cpu, p->cpus_ptr))
2880 			goto cpu_found;
2881 	}
2882 
2883 	if (p->nr_cpus_allowed == 1) {
2884 		if (test_and_clear_cpu_idle(prev_cpu)) {
2885 			cpu = prev_cpu;
2886 			goto cpu_found;
2887 		} else {
2888 			return prev_cpu;
2889 		}
2890 	}
2891 
2892 	/*
2893 	 * If CPU has SMT, any wholly idle CPU is likely a better pick than
2894 	 * partially idle @prev_cpu.
2895 	 */
2896 	if (sched_smt_active()) {
2897 		if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
2898 		    test_and_clear_cpu_idle(prev_cpu)) {
2899 			cpu = prev_cpu;
2900 			goto cpu_found;
2901 		}
2902 
2903 		cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
2904 		if (cpu >= 0)
2905 			goto cpu_found;
2906 	}
2907 
2908 	if (test_and_clear_cpu_idle(prev_cpu)) {
2909 		cpu = prev_cpu;
2910 		goto cpu_found;
2911 	}
2912 
2913 	cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
2914 	if (cpu >= 0)
2915 		goto cpu_found;
2916 
2917 	return prev_cpu;
2918 
2919 cpu_found:
2920 	*found = true;
2921 	return cpu;
2922 }
2923 
2924 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
2925 {
2926 	/*
2927 	 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
2928 	 * can be a good migration opportunity with low cache and memory
2929 	 * footprint. Returning a CPU different than @prev_cpu triggers
2930 	 * immediate rq migration. However, for SCX, as the current rq
2931 	 * association doesn't dictate where the task is going to run, this
2932 	 * doesn't fit well. If necessary, we can later add a dedicated method
2933 	 * which can decide to preempt self to force it through the regular
2934 	 * scheduling path.
2935 	 */
2936 	if (unlikely(wake_flags & WF_EXEC))
2937 		return prev_cpu;
2938 
2939 	if (SCX_HAS_OP(select_cpu)) {
2940 		s32 cpu;
2941 		struct task_struct **ddsp_taskp;
2942 
2943 		ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
2944 		WARN_ON_ONCE(*ddsp_taskp);
2945 		*ddsp_taskp = p;
2946 
2947 		cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
2948 					   select_cpu, p, prev_cpu, wake_flags);
2949 		*ddsp_taskp = NULL;
2950 		if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
2951 			return cpu;
2952 		else
2953 			return prev_cpu;
2954 	} else {
2955 		bool found;
2956 		s32 cpu;
2957 
2958 		cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found);
2959 		if (found) {
2960 			p->scx.slice = SCX_SLICE_DFL;
2961 			p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
2962 		}
2963 		return cpu;
2964 	}
2965 }
2966 
2967 static void set_cpus_allowed_scx(struct task_struct *p,
2968 				 struct affinity_context *ac)
2969 {
2970 	set_cpus_allowed_common(p, ac);
2971 
2972 	/*
2973 	 * The effective cpumask is stored in @p->cpus_ptr which may temporarily
2974 	 * differ from the configured one in @p->cpus_mask. Always tell the bpf
2975 	 * scheduler the effective one.
2976 	 *
2977 	 * Fine-grained memory write control is enforced by BPF making the const
2978 	 * designation pointless. Cast it away when calling the operation.
2979 	 */
2980 	if (SCX_HAS_OP(set_cpumask))
2981 		SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
2982 				 (struct cpumask *)p->cpus_ptr);
2983 }
2984 
2985 static void reset_idle_masks(void)
2986 {
2987 	/*
2988 	 * Consider all online cpus idle. Should converge to the actual state
2989 	 * quickly.
2990 	 */
2991 	cpumask_copy(idle_masks.cpu, cpu_online_mask);
2992 	cpumask_copy(idle_masks.smt, cpu_online_mask);
2993 }
2994 
2995 void __scx_update_idle(struct rq *rq, bool idle)
2996 {
2997 	int cpu = cpu_of(rq);
2998 
2999 	if (SCX_HAS_OP(update_idle)) {
3000 		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
3001 		if (!static_branch_unlikely(&scx_builtin_idle_enabled))
3002 			return;
3003 	}
3004 
3005 	if (idle)
3006 		cpumask_set_cpu(cpu, idle_masks.cpu);
3007 	else
3008 		cpumask_clear_cpu(cpu, idle_masks.cpu);
3009 
3010 #ifdef CONFIG_SCHED_SMT
3011 	if (sched_smt_active()) {
3012 		const struct cpumask *smt = cpu_smt_mask(cpu);
3013 
3014 		if (idle) {
3015 			/*
3016 			 * idle_masks.smt handling is racy but that's fine as
3017 			 * it's only for optimization and self-correcting.
3018 			 */
3019 			for_each_cpu(cpu, smt) {
3020 				if (!cpumask_test_cpu(cpu, idle_masks.cpu))
3021 					return;
3022 			}
3023 			cpumask_or(idle_masks.smt, idle_masks.smt, smt);
3024 		} else {
3025 			cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
3026 		}
3027 	}
3028 #endif
3029 }
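
/*
 * Illustrative sketch (not part of this file): with ops.update_idle()
 * implemented and built-in idle tracking disabled, the BPF scheduler tracks
 * idleness itself, e.g. in a simple per-CPU flag array sized by a made-up
 * MAX_CPUS constant. Struct_ops wiring is omitted.
 *
 *	bool cpu_idle[MAX_CPUS];	// indexed by CPU id, BPF-side state
 *
 *	void example_update_idle(s32 cpu, bool idle)
 *	{
 *		if (cpu >= 0 && cpu < MAX_CPUS)
 *			cpu_idle[cpu] = idle;
 *	}
 */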
3030 
3031 static void handle_hotplug(struct rq *rq, bool online)
3032 {
3033 	int cpu = cpu_of(rq);
3034 
3035 	atomic_long_inc(&scx_hotplug_seq);
3036 
3037 	if (online && SCX_HAS_OP(cpu_online))
3038 		SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu);
3039 	else if (!online && SCX_HAS_OP(cpu_offline))
3040 		SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu);
3041 	else
3042 		scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
3043 			     "cpu %d going %s, exiting scheduler", cpu,
3044 			     online ? "online" : "offline");
3045 }
3046 
3047 void scx_rq_activate(struct rq *rq)
3048 {
3049 	handle_hotplug(rq, true);
3050 }
3051 
3052 void scx_rq_deactivate(struct rq *rq)
3053 {
3054 	handle_hotplug(rq, false);
3055 }
3056 
3057 static void rq_online_scx(struct rq *rq)
3058 {
3059 	rq->scx.flags |= SCX_RQ_ONLINE;
3060 }
3061 
3062 static void rq_offline_scx(struct rq *rq)
3063 {
3064 	rq->scx.flags &= ~SCX_RQ_ONLINE;
3065 }
3066 
3067 #else	/* CONFIG_SMP */
3068 
3069 static bool test_and_clear_cpu_idle(int cpu) { return false; }
3070 static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; }
3071 static void reset_idle_masks(void) {}
3072 
3073 #endif	/* CONFIG_SMP */
3074 
3075 static bool check_rq_for_timeouts(struct rq *rq)
3076 {
3077 	struct task_struct *p;
3078 	struct rq_flags rf;
3079 	bool timed_out = false;
3080 
3081 	rq_lock_irqsave(rq, &rf);
3082 	list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
3083 		unsigned long last_runnable = p->scx.runnable_at;
3084 
3085 		if (unlikely(time_after(jiffies,
3086 					last_runnable + scx_watchdog_timeout))) {
3087 			u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
3088 
3089 			scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
3090 					   "%s[%d] failed to run for %u.%03us",
3091 					   p->comm, p->pid,
3092 					   dur_ms / 1000, dur_ms % 1000);
3093 			timed_out = true;
3094 			break;
3095 		}
3096 	}
3097 	rq_unlock_irqrestore(rq, &rf);
3098 
3099 	return timed_out;
3100 }
3101 
3102 static void scx_watchdog_workfn(struct work_struct *work)
3103 {
3104 	int cpu;
3105 
3106 	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
3107 
3108 	for_each_online_cpu(cpu) {
3109 		if (unlikely(check_rq_for_timeouts(cpu_rq(cpu))))
3110 			break;
3111 
3112 		cond_resched();
3113 	}
3114 	queue_delayed_work(system_unbound_wq, to_delayed_work(work),
3115 			   scx_watchdog_timeout / 2);
3116 }
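
/*
 * Illustrative sketch (not part of this file): the watchdog interval checked
 * above derives from the timeout the BPF scheduler requests through the
 * timeout_ms field of its sched_ext_ops, e.g.:
 *
 *	SEC(".struct_ops.link")
 *	struct sched_ext_ops example_ops = {
 *		.enqueue	= (void *)example_enqueue,
 *		.dispatch	= (void *)example_dispatch,
 *		.timeout_ms	= 5000,		// declare a stall after 5s
 *		.name		= "example",
 *	};
 *
 * The section annotation and callback names follow the in-tree example
 * schedulers and are illustrative only.
 */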
3117 
3118 void scx_tick(struct rq *rq)
3119 {
3120 	unsigned long last_check;
3121 
3122 	if (!scx_enabled())
3123 		return;
3124 
3125 	last_check = READ_ONCE(scx_watchdog_timestamp);
3126 	if (unlikely(time_after(jiffies,
3127 				last_check + READ_ONCE(scx_watchdog_timeout)))) {
3128 		u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
3129 
3130 		scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
3131 				   "watchdog failed to check in for %u.%03us",
3132 				   dur_ms / 1000, dur_ms % 1000);
3133 	}
3134 
3135 	update_other_load_avgs(rq);
3136 }
3137 
3138 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
3139 {
3140 	update_curr_scx(rq);
3141 
3142 	/*
3143 	 * While disabling, always resched and refresh core-sched timestamp as
3144 	 * we can't trust the slice management or ops.core_sched_before().
3145 	 */
3146 	if (scx_ops_bypassing()) {
3147 		curr->scx.slice = 0;
3148 		touch_core_sched(rq, curr);
3149 	} else if (SCX_HAS_OP(tick)) {
3150 		SCX_CALL_OP(SCX_KF_REST, tick, curr);
3151 	}
3152 
3153 	if (!curr->scx.slice)
3154 		resched_curr(rq);
3155 }
3156 
3157 static enum scx_task_state scx_get_task_state(const struct task_struct *p)
3158 {
3159 	return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT;
3160 }
3161 
3162 static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
3163 {
3164 	enum scx_task_state prev_state = scx_get_task_state(p);
3165 	bool warn = false;
3166 
3167 	BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS));
3168 
3169 	switch (state) {
3170 	case SCX_TASK_NONE:
3171 		break;
3172 	case SCX_TASK_INIT:
3173 		warn = prev_state != SCX_TASK_NONE;
3174 		break;
3175 	case SCX_TASK_READY:
3176 		warn = prev_state == SCX_TASK_NONE;
3177 		break;
3178 	case SCX_TASK_ENABLED:
3179 		warn = prev_state != SCX_TASK_READY;
3180 		break;
3181 	default:
3182 		warn = true;
3183 		return;
3184 	}
3185 
3186 	WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]",
3187 		  prev_state, state, p->comm, p->pid);
3188 
3189 	p->scx.flags &= ~SCX_TASK_STATE_MASK;
3190 	p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
3191 }
3192 
3193 static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork)
3194 {
3195 	int ret;
3196 
3197 	p->scx.disallow = false;
3198 
3199 	if (SCX_HAS_OP(init_task)) {
3200 		struct scx_init_task_args args = {
3201 			.fork = fork,
3202 		};
3203 
3204 		ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args);
3205 		if (unlikely(ret)) {
3206 			ret = ops_sanitize_err("init_task", ret);
3207 			return ret;
3208 		}
3209 	}
3210 
3211 	scx_set_task_state(p, SCX_TASK_INIT);
3212 
3213 	if (p->scx.disallow) {
3214 		struct rq *rq;
3215 		struct rq_flags rf;
3216 
3217 		rq = task_rq_lock(p, &rf);
3218 
3219 		/*
3220 		 * We're either in fork or load path and @p->policy will be
3221 		 * applied right after. Reverting @p->policy here and rejecting
3222 		 * %SCHED_EXT transitions from scx_check_setscheduler()
3223 		 * guarantees that if ops.init_task() sets @p->disallow, @p can
3224 		 * never be in SCX.
3225 		 */
3226 		if (p->policy == SCHED_EXT) {
3227 			p->policy = SCHED_NORMAL;
3228 			atomic_long_inc(&scx_nr_rejected);
3229 		}
3230 
3231 		task_rq_unlock(rq, p, &rf);
3232 	}
3233 
3234 	p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
3235 	return 0;
3236 }
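
/*
 * Illustrative sketch (not part of this file): ops.init_task() can set
 * @p->scx.disallow to keep a task from ever entering SCX; the policy revert
 * above together with the scx_check_setscheduler() rejection then guarantees
 * the task stays out. is_excluded() is a hypothetical predicate and
 * struct_ops wiring is omitted.
 *
 *	s32 example_init_task(struct task_struct *p, struct scx_init_task_args *args)
 *	{
 *		if (is_excluded(p))
 *			p->scx.disallow = true;
 *		return 0;
 *	}
 */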
3237 
3238 static void set_task_scx_weight(struct task_struct *p)
3239 {
3240 	u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO];
3241 
3242 	p->scx.weight = sched_weight_to_cgroup(weight);
3243 }
3244 
3245 static void scx_ops_enable_task(struct task_struct *p)
3246 {
3247 	lockdep_assert_rq_held(task_rq(p));
3248 
3249 	/*
3250 	 * Set the weight before calling ops.enable() so that the scheduler
3251 	 * doesn't see a stale value if they inspect the task struct.
3252 	 */
3253 	set_task_scx_weight(p);
3254 	if (SCX_HAS_OP(enable))
3255 		SCX_CALL_OP_TASK(SCX_KF_REST, enable, p);
3256 	scx_set_task_state(p, SCX_TASK_ENABLED);
3257 
3258 	if (SCX_HAS_OP(set_weight))
3259 		SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight);
3260 }
3261 
3262 static void scx_ops_disable_task(struct task_struct *p)
3263 {
3264 	lockdep_assert_rq_held(task_rq(p));
3265 	WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
3266 
3267 	if (SCX_HAS_OP(disable))
3268 		SCX_CALL_OP(SCX_KF_REST, disable, p);
3269 	scx_set_task_state(p, SCX_TASK_READY);
3270 }
3271 
3272 static void scx_ops_exit_task(struct task_struct *p)
3273 {
3274 	struct scx_exit_task_args args = {
3275 		.cancelled = false,
3276 	};
3277 
3278 	lockdep_assert_rq_held(task_rq(p));
3279 
3280 	switch (scx_get_task_state(p)) {
3281 	case SCX_TASK_NONE:
3282 		return;
3283 	case SCX_TASK_INIT:
3284 		args.cancelled = true;
3285 		break;
3286 	case SCX_TASK_READY:
3287 		break;
3288 	case SCX_TASK_ENABLED:
3289 		scx_ops_disable_task(p);
3290 		break;
3291 	default:
3292 		WARN_ON_ONCE(true);
3293 		return;
3294 	}
3295 
3296 	if (SCX_HAS_OP(exit_task))
3297 		SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
3298 	scx_set_task_state(p, SCX_TASK_NONE);
3299 }
3300 
3301 void init_scx_entity(struct sched_ext_entity *scx)
3302 {
3303 	/*
3304 	 * init_idle() calls this function again after the fork sequence is
3305 	 * complete. Don't touch ->tasks_node as it's already linked.
3306 	 */
3307 	memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node));
3308 
3309 	INIT_LIST_HEAD(&scx->dsq_node.list);
3310 	RB_CLEAR_NODE(&scx->dsq_node.priq);
3311 	scx->sticky_cpu = -1;
3312 	scx->holding_cpu = -1;
3313 	INIT_LIST_HEAD(&scx->runnable_node);
3314 	scx->runnable_at = jiffies;
3315 	scx->ddsp_dsq_id = SCX_DSQ_INVALID;
3316 	scx->slice = SCX_SLICE_DFL;
3317 }
3318 
3319 void scx_pre_fork(struct task_struct *p)
3320 {
3321 	/*
3322 	 * BPF scheduler enable/disable paths want to be able to iterate and
3323 	 * update all tasks which can become complex when racing forks. As
3324 	 * enable/disable are very cold paths, let's use a percpu_rwsem to
3325 	 * exclude forks.
3326 	 */
3327 	percpu_down_read(&scx_fork_rwsem);
3328 }
3329 
3330 int scx_fork(struct task_struct *p)
3331 {
3332 	percpu_rwsem_assert_held(&scx_fork_rwsem);
3333 
3334 	if (scx_enabled())
3335 		return scx_ops_init_task(p, task_group(p), true);
3336 	else
3337 		return 0;
3338 }
3339 
3340 void scx_post_fork(struct task_struct *p)
3341 {
3342 	if (scx_enabled()) {
3343 		scx_set_task_state(p, SCX_TASK_READY);
3344 
3345 		/*
3346 		 * Enable the task immediately if it's running on sched_ext.
3347 		 * Otherwise, it'll be enabled in switching_to_scx() if and
3348 		 * when it's ever configured to run with a SCHED_EXT policy.
3349 		 */
3350 		if (p->sched_class == &ext_sched_class) {
3351 			struct rq_flags rf;
3352 			struct rq *rq;
3353 
3354 			rq = task_rq_lock(p, &rf);
3355 			scx_ops_enable_task(p);
3356 			task_rq_unlock(rq, p, &rf);
3357 		}
3358 	}
3359 
3360 	spin_lock_irq(&scx_tasks_lock);
3361 	list_add_tail(&p->scx.tasks_node, &scx_tasks);
3362 	spin_unlock_irq(&scx_tasks_lock);
3363 
3364 	percpu_up_read(&scx_fork_rwsem);
3365 }
3366 
3367 void scx_cancel_fork(struct task_struct *p)
3368 {
3369 	if (scx_enabled()) {
3370 		struct rq *rq;
3371 		struct rq_flags rf;
3372 
3373 		rq = task_rq_lock(p, &rf);
3374 		WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
3375 		scx_ops_exit_task(p);
3376 		task_rq_unlock(rq, p, &rf);
3377 	}
3378 
3379 	percpu_up_read(&scx_fork_rwsem);
3380 }
3381 
3382 void sched_ext_free(struct task_struct *p)
3383 {
3384 	unsigned long flags;
3385 
3386 	spin_lock_irqsave(&scx_tasks_lock, flags);
3387 	list_del_init(&p->scx.tasks_node);
3388 	spin_unlock_irqrestore(&scx_tasks_lock, flags);
3389 
3390 	/*
3391 	 * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY ->
3392 	 * ENABLED transitions can't race us. Disable ops for @p.
3393 	 */
3394 	if (scx_get_task_state(p) != SCX_TASK_NONE) {
3395 		struct rq_flags rf;
3396 		struct rq *rq;
3397 
3398 		rq = task_rq_lock(p, &rf);
3399 		scx_ops_exit_task(p);
3400 		task_rq_unlock(rq, p, &rf);
3401 	}
3402 }
3403 
3404 static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio)
3405 {
3406 	lockdep_assert_rq_held(task_rq(p));
3407 
3408 	set_task_scx_weight(p);
3409 	if (SCX_HAS_OP(set_weight))
3410 		SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
3411 }
3412 
3413 static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
3414 {
3415 }
3416 
3417 static void switching_to_scx(struct rq *rq, struct task_struct *p)
3418 {
3419 	scx_ops_enable_task(p);
3420 
3421 	/*
3422 	 * set_cpus_allowed_scx() is not called while @p is associated with a
3423 	 * different scheduler class. Keep the BPF scheduler up-to-date.
3424 	 */
3425 	if (SCX_HAS_OP(set_cpumask))
3426 		SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
3427 				 (struct cpumask *)p->cpus_ptr);
3428 }
3429 
3430 static void switched_from_scx(struct rq *rq, struct task_struct *p)
3431 {
3432 	scx_ops_disable_task(p);
3433 }
3434 
3435 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
3436 static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
3437 
3438 int scx_check_setscheduler(struct task_struct *p, int policy)
3439 {
3440 	lockdep_assert_rq_held(task_rq(p));
3441 
3442 	/* if disallow, reject transitioning into SCX */
3443 	if (scx_enabled() && READ_ONCE(p->scx.disallow) &&
3444 	    p->policy != policy && policy == SCHED_EXT)
3445 		return -EACCES;
3446 
3447 	return 0;
3448 }
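
/*
 * Illustrative sketch (not part of this file): the @p->scx.disallow flag that
 * the check above enforces is typically set from the BPF side in
 * ops.init_task(). Assuming the BPF_STRUCT_OPS convenience macro used by the
 * example schedulers, a scheduler that wants to keep kernel threads off of
 * SCHED_EXT might do something like:
 *
 *	s32 BPF_STRUCT_OPS(myscx_init_task, struct task_struct *p,
 *			   struct scx_init_task_args *args)
 *	{
 *		if (p->flags & PF_KTHREAD)
 *			p->scx.disallow = true;
 *		return 0;
 *	}
 *
 * Once set, scx_check_setscheduler() rejects later attempts to switch such a
 * task to SCHED_EXT with -EACCES.
 */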
3449 
3450 #ifdef CONFIG_NO_HZ_FULL
3451 bool scx_can_stop_tick(struct rq *rq)
3452 {
3453 	struct task_struct *p = rq->curr;
3454 
3455 	if (scx_ops_bypassing())
3456 		return false;
3457 
3458 	if (p->sched_class != &ext_sched_class)
3459 		return true;
3460 
3461 	/*
3462 	 * @rq can dispatch from different DSQs, so we can't tell whether it
3463 	 * needs the tick or not by looking at nr_running. Allow stopping ticks
3464 	 * iff the BPF scheduler indicated so. See set_next_task_scx().
3465 	 */
3466 	return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
3467 }
3468 #endif
3469 
3470 /*
3471  * Omitted operations:
3472  *
3473  * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task
3474  *   isn't tied to the CPU at that point. Preemption is implemented by resetting
3475  *   the victim task's slice to 0 and triggering reschedule on the target CPU.
3476  *
3477  * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
3478  *
3479  * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
3480  *   their current sched_class. Call them directly from sched core instead.
3481  *
3482  * - task_woken: Unnecessary.
3483  */
3484 DEFINE_SCHED_CLASS(ext) = {
3485 	.enqueue_task		= enqueue_task_scx,
3486 	.dequeue_task		= dequeue_task_scx,
3487 	.yield_task		= yield_task_scx,
3488 	.yield_to_task		= yield_to_task_scx,
3489 
3490 	.wakeup_preempt		= wakeup_preempt_scx,
3491 
3492 	.pick_next_task		= pick_next_task_scx,
3493 
3494 	.put_prev_task		= put_prev_task_scx,
3495 	.set_next_task		= set_next_task_scx,
3496 
3497 #ifdef CONFIG_SMP
3498 	.balance		= balance_scx,
3499 	.select_task_rq		= select_task_rq_scx,
3500 	.set_cpus_allowed	= set_cpus_allowed_scx,
3501 
3502 	.rq_online		= rq_online_scx,
3503 	.rq_offline		= rq_offline_scx,
3504 #endif
3505 
3506 #ifdef CONFIG_SCHED_CORE
3507 	.pick_task		= pick_task_scx,
3508 #endif
3509 
3510 	.task_tick		= task_tick_scx,
3511 
3512 	.switching_to		= switching_to_scx,
3513 	.switched_from		= switched_from_scx,
3514 	.switched_to		= switched_to_scx,
3515 	.reweight_task		= reweight_task_scx,
3516 	.prio_changed		= prio_changed_scx,
3517 
3518 	.update_curr		= update_curr_scx,
3519 
3520 #ifdef CONFIG_UCLAMP_TASK
3521 	.uclamp_enabled		= 0,
3522 #endif
3523 };
3524 
3525 static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
3526 {
3527 	memset(dsq, 0, sizeof(*dsq));
3528 
3529 	raw_spin_lock_init(&dsq->lock);
3530 	INIT_LIST_HEAD(&dsq->list);
3531 	dsq->id = dsq_id;
3532 }
3533 
3534 static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node)
3535 {
3536 	struct scx_dispatch_q *dsq;
3537 	int ret;
3538 
3539 	if (dsq_id & SCX_DSQ_FLAG_BUILTIN)
3540 		return ERR_PTR(-EINVAL);
3541 
3542 	dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
3543 	if (!dsq)
3544 		return ERR_PTR(-ENOMEM);
3545 
3546 	init_dsq(dsq, dsq_id);
3547 
3548 	ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node,
3549 				     dsq_hash_params);
3550 	if (ret) {
3551 		kfree(dsq);
3552 		return ERR_PTR(ret);
3553 	}
3554 	return dsq;
3555 }
3556 
3557 static void free_dsq_irq_workfn(struct irq_work *irq_work)
3558 {
3559 	struct llist_node *to_free = llist_del_all(&dsqs_to_free);
3560 	struct scx_dispatch_q *dsq, *tmp_dsq;
3561 
3562 	llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
3563 		kfree_rcu(dsq, rcu);
3564 }
3565 
3566 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
3567 
3568 static void destroy_dsq(u64 dsq_id)
3569 {
3570 	struct scx_dispatch_q *dsq;
3571 	unsigned long flags;
3572 
3573 	rcu_read_lock();
3574 
3575 	dsq = find_user_dsq(dsq_id);
3576 	if (!dsq)
3577 		goto out_unlock_rcu;
3578 
3579 	raw_spin_lock_irqsave(&dsq->lock, flags);
3580 
3581 	if (dsq->nr) {
3582 		scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)",
3583 			      dsq->id, dsq->nr);
3584 		goto out_unlock_dsq;
3585 	}
3586 
3587 	if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params))
3588 		goto out_unlock_dsq;
3589 
3590 	/*
3591 	 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from
3592 	 * queueing more tasks. As this function can be called from anywhere,
3593 	 * freeing is bounced through an irq work to avoid nesting RCU
3594 	 * operations inside scheduler locks.
3595 	 */
3596 	dsq->id = SCX_DSQ_INVALID;
3597 	llist_add(&dsq->free_node, &dsqs_to_free);
3598 	irq_work_queue(&free_dsq_irq_work);
3599 
3600 out_unlock_dsq:
3601 	raw_spin_unlock_irqrestore(&dsq->lock, flags);
3602 out_unlock_rcu:
3603 	rcu_read_unlock();
3604 }
3605 
3606 
3607 /********************************************************************************
3608  * Sysfs interface and ops enable/disable.
3609  */
3610 
3611 #define SCX_ATTR(_name)								\
3612 	static struct kobj_attribute scx_attr_##_name = {			\
3613 		.attr = { .name = __stringify(_name), .mode = 0444 },		\
3614 		.show = scx_attr_##_name##_show,				\
3615 	}
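
/*
 * For reference, SCX_ATTR(state) below expands to roughly:
 *
 *	static struct kobj_attribute scx_attr_state = {
 *		.attr = { .name = "state", .mode = 0444 },
 *		.show = scx_attr_state_show,
 *	};
 *
 * i.e. a read-only sysfs attribute whose ->show() is the matching
 * scx_attr_*_show() function defined right above each SCX_ATTR() invocation.
 */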
3616 
3617 static ssize_t scx_attr_state_show(struct kobject *kobj,
3618 				   struct kobj_attribute *ka, char *buf)
3619 {
3620 	return sysfs_emit(buf, "%s\n",
3621 			  scx_ops_enable_state_str[scx_ops_enable_state()]);
3622 }
3623 SCX_ATTR(state);
3624 
3625 static ssize_t scx_attr_switch_all_show(struct kobject *kobj,
3626 					struct kobj_attribute *ka, char *buf)
3627 {
3628 	return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all));
3629 }
3630 SCX_ATTR(switch_all);
3631 
3632 static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj,
3633 					 struct kobj_attribute *ka, char *buf)
3634 {
3635 	return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected));
3636 }
3637 SCX_ATTR(nr_rejected);
3638 
3639 static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
3640 					 struct kobj_attribute *ka, char *buf)
3641 {
3642 	return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq));
3643 }
3644 SCX_ATTR(hotplug_seq);
3645 
3646 static struct attribute *scx_global_attrs[] = {
3647 	&scx_attr_state.attr,
3648 	&scx_attr_switch_all.attr,
3649 	&scx_attr_nr_rejected.attr,
3650 	&scx_attr_hotplug_seq.attr,
3651 	NULL,
3652 };
3653 
3654 static const struct attribute_group scx_global_attr_group = {
3655 	.attrs = scx_global_attrs,
3656 };
3657 
3658 static void scx_kobj_release(struct kobject *kobj)
3659 {
3660 	kfree(kobj);
3661 }
3662 
3663 static ssize_t scx_attr_ops_show(struct kobject *kobj,
3664 				 struct kobj_attribute *ka, char *buf)
3665 {
3666 	return sysfs_emit(buf, "%s\n", scx_ops.name);
3667 }
3668 SCX_ATTR(ops);
3669 
3670 static struct attribute *scx_sched_attrs[] = {
3671 	&scx_attr_ops.attr,
3672 	NULL,
3673 };
3674 ATTRIBUTE_GROUPS(scx_sched);
3675 
3676 static const struct kobj_type scx_ktype = {
3677 	.release = scx_kobj_release,
3678 	.sysfs_ops = &kobj_sysfs_ops,
3679 	.default_groups = scx_sched_groups,
3680 };
3681 
3682 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
3683 {
3684 	return add_uevent_var(env, "SCXOPS=%s", scx_ops.name);
3685 }
3686 
3687 static const struct kset_uevent_ops scx_uevent_ops = {
3688 	.uevent = scx_uevent,
3689 };
3690 
3691 /*
3692  * Used by sched_fork() and __setscheduler_prio() to pick the matching
3693  * sched_class. dl/rt are already handled.
3694  */
3695 bool task_should_scx(struct task_struct *p)
3696 {
3697 	if (!scx_enabled() ||
3698 	    unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
3699 		return false;
3700 	if (READ_ONCE(scx_switching_all))
3701 		return true;
3702 	return p->policy == SCHED_EXT;
3703 }
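
/*
 * A minimal sketch of how the above is consumed (the real call sites live in
 * kernel/sched/core.c and may differ in detail): when picking a class for a
 * non-dl, non-rt task, sched core does roughly
 *
 *	if (task_should_scx(p))
 *		p->sched_class = &ext_sched_class;
 *	else
 *		p->sched_class = &fair_sched_class;
 *
 * so a task lands on the BPF scheduler either because its policy is
 * %SCHED_EXT or because scx_switching_all is in effect.
 */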
3704 
3705 /**
3706  * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
3707  *
3708  * Bypassing guarantees that all runnable tasks make forward progress without
3709  * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might
3710  * be held by tasks that the BPF scheduler is forgetting to run, which
3711  * unfortunately also excludes toggling the static branches.
3712  *
3713  * Let's work around it by overriding a couple of ops and modifying behaviors
3714  * based on the bypass state, and then cycling the queued tasks through
3715  * dequeue/enqueue to force global FIFO scheduling.
3716  *
3717  * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
3718  *
3719  * b. ops.dispatch() is ignored.
3720  *
3721  * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be
3722  *    trusted. Whenever a tick triggers, the running task is rotated to the tail
3723  *    of the queue with core_sched_at touched.
3724  *
3725  * d. pick_next_task() suppresses zero slice warning.
3726  *
3727  * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
3728  *    operations.
3729  *
3730  * f. scx_prio_less() reverts to the default core_sched_at order.
3731  */
3732 static void scx_ops_bypass(bool bypass)
3733 {
3734 	int depth, cpu;
3735 
3736 	if (bypass) {
3737 		depth = atomic_inc_return(&scx_ops_bypass_depth);
3738 		WARN_ON_ONCE(depth <= 0);
3739 		if (depth != 1)
3740 			return;
3741 	} else {
3742 		depth = atomic_dec_return(&scx_ops_bypass_depth);
3743 		WARN_ON_ONCE(depth < 0);
3744 		if (depth != 0)
3745 			return;
3746 	}
3747 
3748 	/*
3749 	 * We need to guarantee that no tasks are on the BPF scheduler while
3750 	 * bypassing. Either we see the enabled state here, or the enable path
3751 	 * sees the increased bypass_depth before moving tasks to SCX.
3752 	 */
3753 	if (!scx_enabled())
3754 		return;
3755 
3756 	/*
3757 	 * No task property is changing. We just need to make sure all currently
3758 	 * queued tasks are re-queued according to the new scx_ops_bypassing()
3759 	 * state. As an optimization, walk each rq's runnable_list instead of
3760 	 * the scx_tasks list.
3761 	 *
3762 	 * This function can't trust the scheduler and thus can't use
3763 	 * cpus_read_lock(). Walk all possible CPUs instead of online.
3764 	 */
3765 	for_each_possible_cpu(cpu) {
3766 		struct rq *rq = cpu_rq(cpu);
3767 		struct rq_flags rf;
3768 		struct task_struct *p, *n;
3769 
3770 		rq_lock_irqsave(rq, &rf);
3771 
3772 		/*
3773 		 * The use of list_for_each_entry_safe_reverse() is required
3774 		 * because each task is going to be removed from and added back
3775 		 * to the runnable_list during iteration. Because they're added
3776 		 * to the tail of the list, safe reverse iteration can still
3777 		 * visit all nodes.
3778 		 */
3779 		list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
3780 						 scx.runnable_node) {
3781 			struct sched_enq_and_set_ctx ctx;
3782 
3783 			/* cycling deq/enq is enough, see the function comment */
3784 			sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
3785 			sched_enq_and_set_task(&ctx);
3786 		}
3787 
3788 		rq_unlock_irqrestore(rq, &rf);
3789 
3790 		/* kick to restore ticks */
3791 		resched_cpu(cpu);
3792 	}
3793 }
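
/*
 * Callers of scx_ops_bypass() bracket a region during which forward progress
 * must not depend on the BPF scheduler, e.g. (illustrative; see
 * scx_pm_handler() below for a real caller):
 *
 *	scx_ops_bypass(true);
 *	... operation that must not trust the BPF scheduler ...
 *	scx_ops_bypass(false);
 *
 * Invocations nest via scx_ops_bypass_depth; only the 0 -> 1 and 1 -> 0
 * transitions actually cycle the runnable tasks above.
 */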
3794 
3795 static void free_exit_info(struct scx_exit_info *ei)
3796 {
3797 	kfree(ei->dump);
3798 	kfree(ei->msg);
3799 	kfree(ei->bt);
3800 	kfree(ei);
3801 }
3802 
3803 static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
3804 {
3805 	struct scx_exit_info *ei;
3806 
3807 	ei = kzalloc(sizeof(*ei), GFP_KERNEL);
3808 	if (!ei)
3809 		return NULL;
3810 
3811 	ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
3812 	ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
3813 	ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
3814 
3815 	if (!ei->bt || !ei->msg || !ei->dump) {
3816 		free_exit_info(ei);
3817 		return NULL;
3818 	}
3819 
3820 	return ei;
3821 }
3822 
3823 static const char *scx_exit_reason(enum scx_exit_kind kind)
3824 {
3825 	switch (kind) {
3826 	case SCX_EXIT_UNREG:
3827 		return "Scheduler unregistered from user space";
3828 	case SCX_EXIT_UNREG_BPF:
3829 		return "Scheduler unregistered from BPF";
3830 	case SCX_EXIT_UNREG_KERN:
3831 		return "Scheduler unregistered from the main kernel";
3832 	case SCX_EXIT_SYSRQ:
3833 		return "disabled by sysrq-S";
3834 	case SCX_EXIT_ERROR:
3835 		return "runtime error";
3836 	case SCX_EXIT_ERROR_BPF:
3837 		return "scx_bpf_error";
3838 	case SCX_EXIT_ERROR_STALL:
3839 		return "runnable task stall";
3840 	default:
3841 		return "<UNKNOWN>";
3842 	}
3843 }
3844 
3845 static void scx_ops_disable_workfn(struct kthread_work *work)
3846 {
3847 	struct scx_exit_info *ei = scx_exit_info;
3848 	struct scx_task_iter sti;
3849 	struct task_struct *p;
3850 	struct rhashtable_iter rht_iter;
3851 	struct scx_dispatch_q *dsq;
3852 	int i, kind;
3853 
3854 	kind = atomic_read(&scx_exit_kind);
3855 	while (true) {
3856 		/*
3857 		 * NONE indicates that a new scx_ops has been registered since
3858 		 * disable was scheduled - don't kill the new ops. DONE
3859 		 * indicates that the ops has already been disabled.
3860 		 */
3861 		if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
3862 			return;
3863 		if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE))
3864 			break;
3865 	}
3866 	ei->kind = kind;
3867 	ei->reason = scx_exit_reason(ei->kind);
3868 
3869 	/* guarantee forward progress by bypassing scx_ops */
3870 	scx_ops_bypass(true);
3871 
3872 	switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) {
3873 	case SCX_OPS_DISABLING:
3874 		WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
3875 		break;
3876 	case SCX_OPS_DISABLED:
3877 		pr_warn("sched_ext: ops error detected without ops (%s)\n",
3878 			scx_exit_info->msg);
3879 		WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
3880 			     SCX_OPS_DISABLING);
3881 		goto done;
3882 	default:
3883 		break;
3884 	}
3885 
3886 	/*
3887 	 * Here, every runnable task is guaranteed to make forward progress and
3888 	 * we can safely use blocking synchronization constructs. Actually
3889 	 * disable ops.
3890 	 */
3891 	mutex_lock(&scx_ops_enable_mutex);
3892 
3893 	static_branch_disable(&__scx_switched_all);
3894 	WRITE_ONCE(scx_switching_all, false);
3895 
3896 	/*
3897 	 * Avoid racing against fork. See scx_ops_enable() for explanation on
3898 	 * the locking order.
3899 	 */
3900 	percpu_down_write(&scx_fork_rwsem);
3901 	cpus_read_lock();
3902 
3903 	spin_lock_irq(&scx_tasks_lock);
3904 	scx_task_iter_init(&sti);
3905 	/*
3906 	 * Invoke scx_ops_exit_task() on all non-idle tasks, including
3907 	 * TASK_DEAD tasks. Because dead tasks may have a nonzero refcount,
3908 	 * we may not have invoked sched_ext_free() on them by the time a
3909 	 * scheduler is disabled. We must therefore exit the task here, or we'd
3910 	 * fail to invoke ops.exit_task(), as the scheduler will have been
3911 	 * unloaded by the time the task is subsequently exited on the
3912 	 * sched_ext_free() path.
3913 	 */
3914 	while ((p = scx_task_iter_next_locked(&sti, true))) {
3915 		const struct sched_class *old_class = p->sched_class;
3916 		struct sched_enq_and_set_ctx ctx;
3917 
3918 		if (READ_ONCE(p->__state) != TASK_DEAD) {
3919 			sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE,
3920 					       &ctx);
3921 
3922 			p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
3923 			__setscheduler_prio(p, p->prio);
3924 			check_class_changing(task_rq(p), p, old_class);
3925 
3926 			sched_enq_and_set_task(&ctx);
3927 
3928 			check_class_changed(task_rq(p), p, old_class, p->prio);
3929 		}
3930 		scx_ops_exit_task(p);
3931 	}
3932 	scx_task_iter_exit(&sti);
3933 	spin_unlock_irq(&scx_tasks_lock);
3934 
3935 	/* no task is on scx, turn off all the switches and flush in-progress calls */
3936 	static_branch_disable_cpuslocked(&__scx_ops_enabled);
3937 	for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
3938 		static_branch_disable_cpuslocked(&scx_has_op[i]);
3939 	static_branch_disable_cpuslocked(&scx_ops_enq_last);
3940 	static_branch_disable_cpuslocked(&scx_ops_enq_exiting);
3941 	static_branch_disable_cpuslocked(&scx_ops_cpu_preempt);
3942 	static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
3943 	synchronize_rcu();
3944 
3945 	cpus_read_unlock();
3946 	percpu_up_write(&scx_fork_rwsem);
3947 
3948 	if (ei->kind >= SCX_EXIT_ERROR) {
3949 		printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name);
3950 
3951 		if (ei->msg[0] == '\0')
3952 			printk(KERN_ERR "sched_ext: %s\n", ei->reason);
3953 		else
3954 			printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg);
3955 
3956 		stack_trace_print(ei->bt, ei->bt_len, 2);
3957 	}
3958 
3959 	if (scx_ops.exit)
3960 		SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
3961 
3962 	cancel_delayed_work_sync(&scx_watchdog_work);
3963 
3964 	/*
3965 	 * Delete the kobject from the hierarchy eagerly in addition to just
3966 	 * dropping a reference. Otherwise, if the object is deleted
3967 	 * asynchronously, sysfs could observe an object of the same name still
3968 	 * in the hierarchy when another scheduler is loaded.
3969 	 */
3970 	kobject_del(scx_root_kobj);
3971 	kobject_put(scx_root_kobj);
3972 	scx_root_kobj = NULL;
3973 
3974 	memset(&scx_ops, 0, sizeof(scx_ops));
3975 
3976 	rhashtable_walk_enter(&dsq_hash, &rht_iter);
3977 	do {
3978 		rhashtable_walk_start(&rht_iter);
3979 
3980 		while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
3981 			destroy_dsq(dsq->id);
3982 
3983 		rhashtable_walk_stop(&rht_iter);
3984 	} while (dsq == ERR_PTR(-EAGAIN));
3985 	rhashtable_walk_exit(&rht_iter);
3986 
3987 	free_percpu(scx_dsp_ctx);
3988 	scx_dsp_ctx = NULL;
3989 	scx_dsp_max_batch = 0;
3990 
3991 	free_exit_info(scx_exit_info);
3992 	scx_exit_info = NULL;
3993 
3994 	mutex_unlock(&scx_ops_enable_mutex);
3995 
3996 	WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
3997 		     SCX_OPS_DISABLING);
3998 done:
3999 	scx_ops_bypass(false);
4000 }
4001 
4002 static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn);
4003 
4004 static void schedule_scx_ops_disable_work(void)
4005 {
4006 	struct kthread_worker *helper = READ_ONCE(scx_ops_helper);
4007 
4008 	/*
4009 	 * We may be called spuriously before the first bpf_sched_ext_reg(). If
4010 	 * scx_ops_helper isn't set up yet, there's nothing to do.
4011 	 */
4012 	if (helper)
4013 		kthread_queue_work(helper, &scx_ops_disable_work);
4014 }
4015 
4016 static void scx_ops_disable(enum scx_exit_kind kind)
4017 {
4018 	int none = SCX_EXIT_NONE;
4019 
4020 	if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
4021 		kind = SCX_EXIT_ERROR;
4022 
4023 	atomic_try_cmpxchg(&scx_exit_kind, &none, kind);
4024 
4025 	schedule_scx_ops_disable_work();
4026 }
4027 
4028 static void dump_newline(struct seq_buf *s)
4029 {
4030 	trace_sched_ext_dump("");
4031 
4032 	/* @s may be zero sized and seq_buf triggers WARN if so */
4033 	if (s->size)
4034 		seq_buf_putc(s, '\n');
4035 }
4036 
4037 static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...)
4038 {
4039 	va_list args;
4040 
4041 #ifdef CONFIG_TRACEPOINTS
4042 	if (trace_sched_ext_dump_enabled()) {
4043 		/* protected by scx_dump_state()::dump_lock */
4044 		static char line_buf[SCX_EXIT_MSG_LEN];
4045 
4046 		va_start(args, fmt);
4047 		vscnprintf(line_buf, sizeof(line_buf), fmt, args);
4048 		va_end(args);
4049 
4050 		trace_sched_ext_dump(line_buf);
4051 	}
4052 #endif
4053 	/* @s may be zero sized and seq_buf triggers WARN if so */
4054 	if (s->size) {
4055 		va_start(args, fmt);
4056 		seq_buf_vprintf(s, fmt, args);
4057 		va_end(args);
4058 
4059 		seq_buf_putc(s, '\n');
4060 	}
4061 }
4062 
4063 static void dump_stack_trace(struct seq_buf *s, const char *prefix,
4064 			     const unsigned long *bt, unsigned int len)
4065 {
4066 	unsigned int i;
4067 
4068 	for (i = 0; i < len; i++)
4069 		dump_line(s, "%s%pS", prefix, (void *)bt[i]);
4070 }
4071 
4072 static void ops_dump_init(struct seq_buf *s, const char *prefix)
4073 {
4074 	struct scx_dump_data *dd = &scx_dump_data;
4075 
4076 	lockdep_assert_irqs_disabled();
4077 
4078 	dd->cpu = smp_processor_id();		/* allow scx_bpf_dump() */
4079 	dd->first = true;
4080 	dd->cursor = 0;
4081 	dd->s = s;
4082 	dd->prefix = prefix;
4083 }
4084 
4085 static void ops_dump_flush(void)
4086 {
4087 	struct scx_dump_data *dd = &scx_dump_data;
4088 	char *line = dd->buf.line;
4089 
4090 	if (!dd->cursor)
4091 		return;
4092 
4093 	/*
4094 	 * There's something to flush. If this is the first line, insert a
4095 	 * blank line to set the ops dump apart.
4096 	 */
4097 	if (dd->first) {
4098 		dump_newline(dd->s);
4099 		dd->first = false;
4100 	}
4101 
4102 	/*
4103 	 * There may be multiple lines in $line. Scan and emit each line
4104 	 * separately.
4105 	 */
4106 	while (true) {
4107 		char *end = line;
4108 		char c;
4109 
4110 		while (*end != '\n' && *end != '\0')
4111 			end++;
4112 
4113 		/*
4114 		 * If $line overflowed, it may not have a newline at the end.
4115 		 * Always emit with a newline.
4116 		 */
4117 		c = *end;
4118 		*end = '\0';
4119 		dump_line(dd->s, "%s%s", dd->prefix, line);
4120 		if (c == '\0')
4121 			break;
4122 
4123 		/* move to the next line */
4124 		end++;
4125 		if (*end == '\0')
4126 			break;
4127 		line = end;
4128 	}
4129 
4130 	dd->cursor = 0;
4131 }
4132 
4133 static void ops_dump_exit(void)
4134 {
4135 	ops_dump_flush();
4136 	scx_dump_data.cpu = -1;
4137 }
4138 
4139 static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
4140 			  struct task_struct *p, char marker)
4141 {
4142 	static unsigned long bt[SCX_EXIT_BT_LEN];
4143 	char dsq_id_buf[19] = "(n/a)";
4144 	unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
4145 	unsigned int bt_len;
4146 
4147 	if (p->scx.dsq)
4148 		scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
4149 			  (unsigned long long)p->scx.dsq->id);
4150 
4151 	dump_newline(s);
4152 	dump_line(s, " %c%c %s[%d] %+ldms",
4153 		  marker, task_state_to_char(p), p->comm, p->pid,
4154 		  jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
4155 	dump_line(s, "      scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu",
4156 		  scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
4157 		  p->scx.dsq_node.flags, ops_state & SCX_OPSS_STATE_MASK,
4158 		  ops_state >> SCX_OPSS_QSEQ_SHIFT);
4159 	dump_line(s, "      sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu",
4160 		  p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf,
4161 		  p->scx.dsq_vtime);
4162 	dump_line(s, "      cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
4163 
4164 	if (SCX_HAS_OP(dump_task)) {
4165 		ops_dump_init(s, "    ");
4166 		SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p);
4167 		ops_dump_exit();
4168 	}
4169 
4170 	bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
4171 	if (bt_len) {
4172 		dump_newline(s);
4173 		dump_stack_trace(s, "    ", bt, bt_len);
4174 	}
4175 }
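
/*
 * The ops.dump_task() call above lets the BPF scheduler append its own
 * per-task state to the debug dump. A minimal sketch of the BPF side,
 * assuming the scx_bpf_dump() helper macro from the BPF-side sched_ext
 * headers and a hypothetical per-task context @taskc:
 *
 *	void BPF_STRUCT_OPS(myscx_dump_task, struct scx_dump_ctx *dctx,
 *			    struct task_struct *p)
 *	{
 *		struct task_ctx *taskc = lookup_task_ctx(p);
 *
 *		if (taskc)
 *			scx_bpf_dump("vtime_base=%llu", taskc->vtime_base);
 *	}
 *
 * Whatever it emits is prefixed and flushed through ops_dump_flush() above.
 */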
4176 
4177 static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
4178 {
4179 	static DEFINE_SPINLOCK(dump_lock);
4180 	static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
4181 	struct scx_dump_ctx dctx = {
4182 		.kind = ei->kind,
4183 		.exit_code = ei->exit_code,
4184 		.reason = ei->reason,
4185 		.at_ns = ktime_get_ns(),
4186 		.at_jiffies = jiffies,
4187 	};
4188 	struct seq_buf s;
4189 	unsigned long flags;
4190 	char *buf;
4191 	int cpu;
4192 
4193 	spin_lock_irqsave(&dump_lock, flags);
4194 
4195 	seq_buf_init(&s, ei->dump, dump_len);
4196 
4197 	if (ei->kind == SCX_EXIT_NONE) {
4198 		dump_line(&s, "Debug dump triggered by %s", ei->reason);
4199 	} else {
4200 		dump_line(&s, "%s[%d] triggered exit kind %d:",
4201 			  current->comm, current->pid, ei->kind);
4202 		dump_line(&s, "  %s (%s)", ei->reason, ei->msg);
4203 		dump_newline(&s);
4204 		dump_line(&s, "Backtrace:");
4205 		dump_stack_trace(&s, "  ", ei->bt, ei->bt_len);
4206 	}
4207 
4208 	if (SCX_HAS_OP(dump)) {
4209 		ops_dump_init(&s, "");
4210 		SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx);
4211 		ops_dump_exit();
4212 	}
4213 
4214 	dump_newline(&s);
4215 	dump_line(&s, "CPU states");
4216 	dump_line(&s, "----------");
4217 
4218 	for_each_possible_cpu(cpu) {
4219 		struct rq *rq = cpu_rq(cpu);
4220 		struct rq_flags rf;
4221 		struct task_struct *p;
4222 		struct seq_buf ns;
4223 		size_t avail, used;
4224 		bool idle;
4225 
4226 		rq_lock(rq, &rf);
4227 
4228 		idle = list_empty(&rq->scx.runnable_list) &&
4229 			rq->curr->sched_class == &idle_sched_class;
4230 
4231 		if (idle && !SCX_HAS_OP(dump_cpu))
4232 			goto next;
4233 
4234 		/*
4235 		 * We don't yet know whether ops.dump_cpu() will produce output
4236 		 * and we may want to skip the default CPU dump if it doesn't.
4237 		 * Use a nested seq_buf to generate the standard dump so that we
4238 		 * can decide whether to commit later.
4239 		 */
4240 		avail = seq_buf_get_buf(&s, &buf);
4241 		seq_buf_init(&ns, buf, avail);
4242 
4243 		dump_newline(&ns);
4244 		dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu",
4245 			  cpu, rq->scx.nr_running, rq->scx.flags,
4246 			  rq->scx.cpu_released, rq->scx.ops_qseq,
4247 			  rq->scx.pnt_seq);
4248 		dump_line(&ns, "          curr=%s[%d] class=%ps",
4249 			  rq->curr->comm, rq->curr->pid,
4250 			  rq->curr->sched_class);
4251 		if (!cpumask_empty(rq->scx.cpus_to_kick))
4252 			dump_line(&ns, "  cpus_to_kick   : %*pb",
4253 				  cpumask_pr_args(rq->scx.cpus_to_kick));
4254 		if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
4255 			dump_line(&ns, "  idle_to_kick   : %*pb",
4256 				  cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
4257 		if (!cpumask_empty(rq->scx.cpus_to_preempt))
4258 			dump_line(&ns, "  cpus_to_preempt: %*pb",
4259 				  cpumask_pr_args(rq->scx.cpus_to_preempt));
4260 		if (!cpumask_empty(rq->scx.cpus_to_wait))
4261 			dump_line(&ns, "  cpus_to_wait   : %*pb",
4262 				  cpumask_pr_args(rq->scx.cpus_to_wait));
4263 
4264 		used = seq_buf_used(&ns);
4265 		if (SCX_HAS_OP(dump_cpu)) {
4266 			ops_dump_init(&ns, "  ");
4267 			SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle);
4268 			ops_dump_exit();
4269 		}
4270 
4271 		/*
4272 		 * If idle && nothing generated by ops.dump_cpu(), there's
4273 		 * nothing interesting. Skip.
4274 		 */
4275 		if (idle && used == seq_buf_used(&ns))
4276 			goto next;
4277 
4278 		/*
4279 		 * $s may already have overflowed when $ns was created. If so,
4280 		 * calling commit on it will trigger BUG.
4281 		 */
4282 		if (avail) {
4283 			seq_buf_commit(&s, seq_buf_used(&ns));
4284 			if (seq_buf_has_overflowed(&ns))
4285 				seq_buf_set_overflow(&s);
4286 		}
4287 
4288 		if (rq->curr->sched_class == &ext_sched_class)
4289 			scx_dump_task(&s, &dctx, rq->curr, '*');
4290 
4291 		list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
4292 			scx_dump_task(&s, &dctx, p, ' ');
4293 	next:
4294 		rq_unlock(rq, &rf);
4295 	}
4296 
4297 	if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
4298 		memcpy(ei->dump + dump_len - sizeof(trunc_marker),
4299 		       trunc_marker, sizeof(trunc_marker));
4300 
4301 	spin_unlock_irqrestore(&dump_lock, flags);
4302 }
4303 
4304 static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
4305 {
4306 	struct scx_exit_info *ei = scx_exit_info;
4307 
4308 	if (ei->kind >= SCX_EXIT_ERROR)
4309 		scx_dump_state(ei, scx_ops.exit_dump_len);
4310 
4311 	schedule_scx_ops_disable_work();
4312 }
4313 
4314 static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn);
4315 
4316 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
4317 					     s64 exit_code,
4318 					     const char *fmt, ...)
4319 {
4320 	struct scx_exit_info *ei = scx_exit_info;
4321 	int none = SCX_EXIT_NONE;
4322 	va_list args;
4323 
4324 	if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind))
4325 		return;
4326 
4327 	ei->exit_code = exit_code;
4328 
4329 	if (kind >= SCX_EXIT_ERROR)
4330 		ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
4331 
4332 	va_start(args, fmt);
4333 	vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
4334 	va_end(args);
4335 
4336 	/*
4337 	 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
4338 	 * in scx_ops_disable_workfn().
4339 	 */
4340 	ei->kind = kind;
4341 	ei->reason = scx_exit_reason(ei->kind);
4342 
4343 	irq_work_queue(&scx_ops_error_irq_work);
4344 }
4345 
4346 static struct kthread_worker *scx_create_rt_helper(const char *name)
4347 {
4348 	struct kthread_worker *helper;
4349 
4350 	helper = kthread_create_worker(0, name);
4351 	if (helper)
4352 		sched_set_fifo(helper->task);
4353 	return helper;
4354 }
4355 
4356 static void check_hotplug_seq(const struct sched_ext_ops *ops)
4357 {
4358 	unsigned long long global_hotplug_seq;
4359 
4360 	/*
4361 	 * If a hotplug event has occurred between when the scheduler was
4362 	 * initialized and when we were able to attach, exit and notify user
4363 	 * space about it.
4364 	 */
4365 	if (ops->hotplug_seq) {
4366 		global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
4367 		if (ops->hotplug_seq != global_hotplug_seq) {
4368 			scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
4369 				     "expected hotplug seq %llu did not match actual %llu",
4370 				     ops->hotplug_seq, global_hotplug_seq);
4371 		}
4372 	}
4373 }
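
/*
 * Illustrative sketch (helper name hypothetical) of how a user space loader
 * might react to the exit code produced above:
 *
 *	if (ecode & SCX_ECODE_ACT_RESTART)
 *		restart_scheduler();
 *
 * i.e. an %SCX_ECODE_ACT_RESTART exit is a request to simply re-register the
 * scheduler rather than a fatal error.
 */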
4374 
4375 static int validate_ops(const struct sched_ext_ops *ops)
4376 {
4377 	/*
4378 	 * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the
4379 	 * ops.enqueue() callback isn't implemented.
4380 	 */
4381 	if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
4382 		scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
4383 		return -EINVAL;
4384 	}
4385 
4386 	return 0;
4387 }
4388 
4389 static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
4390 {
4391 	struct scx_task_iter sti;
4392 	struct task_struct *p;
4393 	unsigned long timeout;
4394 	int i, ret;
4395 
4396 	mutex_lock(&scx_ops_enable_mutex);
4397 
4398 	if (!scx_ops_helper) {
4399 		WRITE_ONCE(scx_ops_helper,
4400 			   scx_create_rt_helper("sched_ext_ops_helper"));
4401 		if (!scx_ops_helper) {
4402 			ret = -ENOMEM;
4403 			goto err_unlock;
4404 		}
4405 	}
4406 
4407 	if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
4408 		ret = -EBUSY;
4409 		goto err_unlock;
4410 	}
4411 
4412 	scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL);
4413 	if (!scx_root_kobj) {
4414 		ret = -ENOMEM;
4415 		goto err_unlock;
4416 	}
4417 
4418 	scx_root_kobj->kset = scx_kset;
4419 	ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root");
4420 	if (ret < 0)
4421 		goto err;
4422 
4423 	scx_exit_info = alloc_exit_info(ops->exit_dump_len);
4424 	if (!scx_exit_info) {
4425 		ret = -ENOMEM;
4426 		goto err_del;
4427 	}
4428 
4429 	/*
4430 	 * Set scx_ops, transition to PREPPING and clear exit info to arm the
4431 	 * disable path. Failure triggers full disabling from here on.
4432 	 */
4433 	scx_ops = *ops;
4434 
4435 	WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) !=
4436 		     SCX_OPS_DISABLED);
4437 
4438 	atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
4439 	scx_warned_zero_slice = false;
4440 
4441 	atomic_long_set(&scx_nr_rejected, 0);
4442 
4443 	/*
4444 	 * Keep CPUs stable during enable so that the BPF scheduler can track
4445 	 * online CPUs by watching ->on/offline_cpu() after ->init().
4446 	 */
4447 	cpus_read_lock();
4448 
4449 	if (scx_ops.init) {
4450 		ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init);
4451 		if (ret) {
4452 			ret = ops_sanitize_err("init", ret);
4453 			goto err_disable_unlock_cpus;
4454 		}
4455 	}
4456 
4457 	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
4458 		if (((void (**)(void))ops)[i])
4459 			static_branch_enable_cpuslocked(&scx_has_op[i]);
4460 
4461 	cpus_read_unlock();
4462 
4463 	ret = validate_ops(ops);
4464 	if (ret)
4465 		goto err_disable;
4466 
4467 	WARN_ON_ONCE(scx_dsp_ctx);
4468 	scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
4469 	scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf,
4470 						   scx_dsp_max_batch),
4471 				     __alignof__(struct scx_dsp_ctx));
4472 	if (!scx_dsp_ctx) {
4473 		ret = -ENOMEM;
4474 		goto err_disable;
4475 	}
4476 
4477 	if (ops->timeout_ms)
4478 		timeout = msecs_to_jiffies(ops->timeout_ms);
4479 	else
4480 		timeout = SCX_WATCHDOG_MAX_TIMEOUT;
4481 
4482 	WRITE_ONCE(scx_watchdog_timeout, timeout);
4483 	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
4484 	queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
4485 			   scx_watchdog_timeout / 2);
4486 
4487 	/*
4488 	 * Lock out forks before opening the floodgate so that they don't wander
4489 	 * into the operations prematurely.
4490 	 *
4491 	 * We don't need to keep the CPUs stable but grab cpus_read_lock() to
4492 	 * ease future locking changes for cgroup support.
4493 	 *
4494 	 * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the
4495 	 * following dependency chain:
4496 	 *
4497 	 *   scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock
4498 	 */
4499 	percpu_down_write(&scx_fork_rwsem);
4500 	cpus_read_lock();
4501 
4502 	check_hotplug_seq(ops);
4503 
4504 	for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
4505 		if (((void (**)(void))ops)[i])
4506 			static_branch_enable_cpuslocked(&scx_has_op[i]);
4507 
4508 	if (ops->flags & SCX_OPS_ENQ_LAST)
4509 		static_branch_enable_cpuslocked(&scx_ops_enq_last);
4510 
4511 	if (ops->flags & SCX_OPS_ENQ_EXITING)
4512 		static_branch_enable_cpuslocked(&scx_ops_enq_exiting);
4513 	if (scx_ops.cpu_acquire || scx_ops.cpu_release)
4514 		static_branch_enable_cpuslocked(&scx_ops_cpu_preempt);
4515 
4516 	if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
4517 		reset_idle_masks();
4518 		static_branch_enable_cpuslocked(&scx_builtin_idle_enabled);
4519 	} else {
4520 		static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
4521 	}
4522 
4523 	static_branch_enable_cpuslocked(&__scx_ops_enabled);
4524 
4525 	/*
4526 	 * Enable ops for every task. Fork is excluded by scx_fork_rwsem
4527 	 * preventing new tasks from being added. No need to exclude tasks
4528 	 * leaving as sched_ext_free() can handle both prepped and enabled
4529 	 * tasks. Prep all tasks first and then enable them with preemption
4530 	 * disabled.
4531 	 */
4532 	spin_lock_irq(&scx_tasks_lock);
4533 
4534 	scx_task_iter_init(&sti);
4535 	while ((p = scx_task_iter_next_locked(&sti, false))) {
4536 		get_task_struct(p);
4537 		scx_task_iter_rq_unlock(&sti);
4538 		spin_unlock_irq(&scx_tasks_lock);
4539 
4540 		ret = scx_ops_init_task(p, task_group(p), false);
4541 		if (ret) {
4542 			put_task_struct(p);
4543 			spin_lock_irq(&scx_tasks_lock);
4544 			scx_task_iter_exit(&sti);
4545 			spin_unlock_irq(&scx_tasks_lock);
4546 			pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
4547 			       ret, p->comm, p->pid);
4548 			goto err_disable_unlock_all;
4549 		}
4550 
4551 		put_task_struct(p);
4552 		spin_lock_irq(&scx_tasks_lock);
4553 	}
4554 	scx_task_iter_exit(&sti);
4555 
4556 	/*
4557 	 * All tasks are prepped but are still ops-disabled. Ensure that
4558 	 * %current can't be scheduled out and switch everyone.
4559 	 * preempt_disable() is necessary because we can't guarantee that
4560 	 * %current won't be starved if scheduled out while switching.
4561 	 */
4562 	preempt_disable();
4563 
4564 	/*
4565 	 * From here on, the disable path must assume that tasks have ops
4566 	 * enabled and need to be recovered.
4567 	 *
4568 	 * Transition to ENABLING fails iff the BPF scheduler has already
4569 	 * triggered scx_bpf_error(). Returning an error code here would lose
4570 	 * the recorded error information. Exit indicating success so that the
4571 	 * error is notified through ops.exit() with all the details.
4572 	 */
4573 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) {
4574 		preempt_enable();
4575 		spin_unlock_irq(&scx_tasks_lock);
4576 		WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
4577 		ret = 0;
4578 		goto err_disable_unlock_all;
4579 	}
4580 
4581 	/*
4582 	 * We're fully committed and can't fail. The PREPPED -> ENABLED
4583 	 * transitions here are synchronized against sched_ext_free() through
4584 	 * scx_tasks_lock.
4585 	 */
4586 	WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
4587 
4588 	scx_task_iter_init(&sti);
4589 	while ((p = scx_task_iter_next_locked(&sti, false))) {
4590 		const struct sched_class *old_class = p->sched_class;
4591 		struct sched_enq_and_set_ctx ctx;
4592 
4593 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
4594 
4595 		scx_set_task_state(p, SCX_TASK_READY);
4596 		__setscheduler_prio(p, p->prio);
4597 		check_class_changing(task_rq(p), p, old_class);
4598 
4599 		sched_enq_and_set_task(&ctx);
4600 
4601 		check_class_changed(task_rq(p), p, old_class, p->prio);
4602 	}
4603 	scx_task_iter_exit(&sti);
4604 
4605 	spin_unlock_irq(&scx_tasks_lock);
4606 	preempt_enable();
4607 	cpus_read_unlock();
4608 	percpu_up_write(&scx_fork_rwsem);
4609 
4610 	/* see above ENABLING transition for the explanation on exiting with 0 */
4611 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
4612 		WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
4613 		ret = 0;
4614 		goto err_disable;
4615 	}
4616 
4617 	if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
4618 		static_branch_enable(&__scx_switched_all);
4619 
4620 	kobject_uevent(scx_root_kobj, KOBJ_ADD);
4621 	mutex_unlock(&scx_ops_enable_mutex);
4622 
4623 	return 0;
4624 
4625 err_del:
4626 	kobject_del(scx_root_kobj);
4627 err:
4628 	kobject_put(scx_root_kobj);
4629 	scx_root_kobj = NULL;
4630 	if (scx_exit_info) {
4631 		free_exit_info(scx_exit_info);
4632 		scx_exit_info = NULL;
4633 	}
4634 err_unlock:
4635 	mutex_unlock(&scx_ops_enable_mutex);
4636 	return ret;
4637 
4638 err_disable_unlock_all:
4639 	percpu_up_write(&scx_fork_rwsem);
4640 err_disable_unlock_cpus:
4641 	cpus_read_unlock();
4642 err_disable:
4643 	mutex_unlock(&scx_ops_enable_mutex);
4644 	/* must be fully disabled before returning */
4645 	scx_ops_disable(SCX_EXIT_ERROR);
4646 	kthread_flush_work(&scx_ops_disable_work);
4647 	return ret;
4648 }
4649 
4650 
4651 /********************************************************************************
4652  * bpf_struct_ops plumbing.
4653  */
4654 #include <linux/bpf_verifier.h>
4655 #include <linux/bpf.h>
4656 #include <linux/btf.h>
4657 
4658 extern struct btf *btf_vmlinux;
4659 static const struct btf_type *task_struct_type;
4660 static u32 task_struct_type_id;
4661 
4662 static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size,
4663 			       enum bpf_access_type type,
4664 			       const struct bpf_prog *prog,
4665 			       struct bpf_insn_access_aux *info)
4666 {
4667 	struct btf *btf = bpf_get_btf_vmlinux();
4668 	const struct bpf_struct_ops_desc *st_ops_desc;
4669 	const struct btf_member *member;
4670 	const struct btf_type *t;
4671 	u32 btf_id, member_idx;
4672 	const char *mname;
4673 
4674 	/* struct_ops op args are all sequential, 64-bit numbers */
4675 	if (off != arg_n * sizeof(__u64))
4676 		return false;
4677 
4678 	/* btf_id should be the type id of struct sched_ext_ops */
4679 	btf_id = prog->aux->attach_btf_id;
4680 	st_ops_desc = bpf_struct_ops_find(btf, btf_id);
4681 	if (!st_ops_desc)
4682 		return false;
4683 
4684 	/* BTF type of struct sched_ext_ops */
4685 	t = st_ops_desc->type;
4686 
4687 	member_idx = prog->expected_attach_type;
4688 	if (member_idx >= btf_type_vlen(t))
4689 		return false;
4690 
4691 	/*
4692 	 * Get the member name of this struct_ops program, which corresponds to
4693 	 * a field in struct sched_ext_ops. For example, the member name of the
4694 	 * dispatch struct_ops program (callback) is "dispatch".
4695 	 */
4696 	member = &btf_type_member(t)[member_idx];
4697 	mname = btf_name_by_offset(btf_vmlinux, member->name_off);
4698 
4699 	if (!strcmp(mname, op)) {
4700 		/*
4701 		 * The value is a pointer to a type (struct task_struct) given
4702 		 * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED);
4703 		 * however, it can be NULL (PTR_MAYBE_NULL). The BPF program
4704 		 * should check the pointer to make sure it is not NULL before
4705 		 * using it, or the verifier will reject the program.
4706 		 *
4707 		 * Longer term, this is something that should be addressed by
4708 		 * BTF, and be fully contained within the verifier.
4709 		 */
4710 		info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED;
4711 		info->btf = btf_vmlinux;
4712 		info->btf_id = task_struct_type_id;
4713 
4714 		return true;
4715 	}
4716 
4717 	return false;
4718 }
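
/*
 * Consequence for the BPF side (illustrative, assuming the BPF_STRUCT_OPS
 * macro from the example schedulers): because the @prev argument of
 * ops.dispatch() and the @to argument of ops.yield() are PTR_MAYBE_NULL, the
 * program must test the pointer before dereferencing it or the verifier
 * rejects the load:
 *
 *	void BPF_STRUCT_OPS(myscx_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		if (prev && !prev->scx.slice)
 *			... refill @prev's slice or dispatch someone else ...
 *	}
 */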
4719 
4720 static bool bpf_scx_is_valid_access(int off, int size,
4721 				    enum bpf_access_type type,
4722 				    const struct bpf_prog *prog,
4723 				    struct bpf_insn_access_aux *info)
4724 {
4725 	if (type != BPF_READ)
4726 		return false;
4727 	if (set_arg_maybe_null("dispatch", 1, off, size, type, prog, info) ||
4728 	    set_arg_maybe_null("yield", 1, off, size, type, prog, info))
4729 		return true;
4730 	if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
4731 		return false;
4732 	if (off % size != 0)
4733 		return false;
4734 
4735 	return btf_ctx_access(off, size, type, prog, info);
4736 }
4737 
4738 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
4739 				     const struct bpf_reg_state *reg, int off,
4740 				     int size)
4741 {
4742 	const struct btf_type *t;
4743 
4744 	t = btf_type_by_id(reg->btf, reg->btf_id);
4745 	if (t == task_struct_type) {
4746 		if (off >= offsetof(struct task_struct, scx.slice) &&
4747 		    off + size <= offsetofend(struct task_struct, scx.slice))
4748 			return SCALAR_VALUE;
4749 		if (off >= offsetof(struct task_struct, scx.dsq_vtime) &&
4750 		    off + size <= offsetofend(struct task_struct, scx.dsq_vtime))
4751 			return SCALAR_VALUE;
4752 		if (off >= offsetof(struct task_struct, scx.disallow) &&
4753 		    off + size <= offsetofend(struct task_struct, scx.disallow))
4754 			return SCALAR_VALUE;
4755 	}
4756 
4757 	return -EACCES;
4758 }
4759 
4760 static const struct bpf_func_proto *
4761 bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4762 {
4763 	switch (func_id) {
4764 	case BPF_FUNC_task_storage_get:
4765 		return &bpf_task_storage_get_proto;
4766 	case BPF_FUNC_task_storage_delete:
4767 		return &bpf_task_storage_delete_proto;
4768 	default:
4769 		return bpf_base_func_proto(func_id, prog);
4770 	}
4771 }
4772 
4773 static const struct bpf_verifier_ops bpf_scx_verifier_ops = {
4774 	.get_func_proto = bpf_scx_get_func_proto,
4775 	.is_valid_access = bpf_scx_is_valid_access,
4776 	.btf_struct_access = bpf_scx_btf_struct_access,
4777 };
4778 
4779 static int bpf_scx_init_member(const struct btf_type *t,
4780 			       const struct btf_member *member,
4781 			       void *kdata, const void *udata)
4782 {
4783 	const struct sched_ext_ops *uops = udata;
4784 	struct sched_ext_ops *ops = kdata;
4785 	u32 moff = __btf_member_bit_offset(t, member) / 8;
4786 	int ret;
4787 
4788 	switch (moff) {
4789 	case offsetof(struct sched_ext_ops, dispatch_max_batch):
4790 		if (*(u32 *)(udata + moff) > INT_MAX)
4791 			return -E2BIG;
4792 		ops->dispatch_max_batch = *(u32 *)(udata + moff);
4793 		return 1;
4794 	case offsetof(struct sched_ext_ops, flags):
4795 		if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS)
4796 			return -EINVAL;
4797 		ops->flags = *(u64 *)(udata + moff);
4798 		return 1;
4799 	case offsetof(struct sched_ext_ops, name):
4800 		ret = bpf_obj_name_cpy(ops->name, uops->name,
4801 				       sizeof(ops->name));
4802 		if (ret < 0)
4803 			return ret;
4804 		if (ret == 0)
4805 			return -EINVAL;
4806 		return 1;
4807 	case offsetof(struct sched_ext_ops, timeout_ms):
4808 		if (msecs_to_jiffies(*(u32 *)(udata + moff)) >
4809 		    SCX_WATCHDOG_MAX_TIMEOUT)
4810 			return -E2BIG;
4811 		ops->timeout_ms = *(u32 *)(udata + moff);
4812 		return 1;
4813 	case offsetof(struct sched_ext_ops, exit_dump_len):
4814 		ops->exit_dump_len =
4815 			*(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN;
4816 		return 1;
4817 	case offsetof(struct sched_ext_ops, hotplug_seq):
4818 		ops->hotplug_seq = *(u64 *)(udata + moff);
4819 		return 1;
4820 	}
4821 
4822 	return 0;
4823 }
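
/*
 * bpf_scx_init_member() above consumes the scalar fields of the struct_ops
 * map that a scheduler loads. A minimal sketch of the BPF-side definition
 * those values come from (names and values illustrative only):
 *
 *	SEC(".struct_ops.link")
 *	struct sched_ext_ops my_ops = {
 *		.select_cpu		= (void *)myscx_select_cpu,
 *		.enqueue		= (void *)myscx_enqueue,
 *		.dispatch_max_batch	= 64,
 *		.timeout_ms		= 5000,
 *		.name			= "my_sched",
 *	};
 *
 * Out-of-range values, e.g. a timeout_ms beyond SCX_WATCHDOG_MAX_TIMEOUT,
 * are rejected here at load time.
 */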
4824 
4825 static int bpf_scx_check_member(const struct btf_type *t,
4826 				const struct btf_member *member,
4827 				const struct bpf_prog *prog)
4828 {
4829 	u32 moff = __btf_member_bit_offset(t, member) / 8;
4830 
4831 	switch (moff) {
4832 	case offsetof(struct sched_ext_ops, init_task):
4833 	case offsetof(struct sched_ext_ops, cpu_online):
4834 	case offsetof(struct sched_ext_ops, cpu_offline):
4835 	case offsetof(struct sched_ext_ops, init):
4836 	case offsetof(struct sched_ext_ops, exit):
4837 		break;
4838 	default:
4839 		if (prog->sleepable)
4840 			return -EINVAL;
4841 	}
4842 
4843 	return 0;
4844 }
4845 
4846 static int bpf_scx_reg(void *kdata, struct bpf_link *link)
4847 {
4848 	return scx_ops_enable(kdata, link);
4849 }
4850 
4851 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
4852 {
4853 	scx_ops_disable(SCX_EXIT_UNREG);
4854 	kthread_flush_work(&scx_ops_disable_work);
4855 }
4856 
4857 static int bpf_scx_init(struct btf *btf)
4858 {
4859 	s32 type_id;
4860 
4861 	type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT);
4862 	if (type_id < 0)
4863 		return -EINVAL;
4864 	task_struct_type = btf_type_by_id(btf, type_id);
4865 	task_struct_type_id = type_id;
4866 
4867 	return 0;
4868 }
4869 
4870 static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link)
4871 {
4872 	/*
4873 	 * sched_ext does not support updating the actively-loaded BPF
4874 	 * scheduler, as registering a BPF scheduler can always fail if the
4875 	 * scheduler returns an error code for e.g. ops.init(), ops.init_task(),
4876 	 * etc. Similarly, we can always race with unregistration happening
4877 	 * elsewhere, such as with sysrq.
4878 	 */
4879 	return -EOPNOTSUPP;
4880 }
4881 
4882 static int bpf_scx_validate(void *kdata)
4883 {
4884 	return 0;
4885 }
4886 
4887 static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
4888 static void enqueue_stub(struct task_struct *p, u64 enq_flags) {}
4889 static void dequeue_stub(struct task_struct *p, u64 enq_flags) {}
4890 static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {}
4891 static void runnable_stub(struct task_struct *p, u64 enq_flags) {}
4892 static void running_stub(struct task_struct *p) {}
4893 static void stopping_stub(struct task_struct *p, bool runnable) {}
4894 static void quiescent_stub(struct task_struct *p, u64 deq_flags) {}
4895 static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; }
4896 static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; }
4897 static void set_weight_stub(struct task_struct *p, u32 weight) {}
4898 static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {}
4899 static void update_idle_stub(s32 cpu, bool idle) {}
4900 static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {}
4901 static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {}
4902 static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
4903 static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {}
4904 static void enable_stub(struct task_struct *p) {}
4905 static void disable_stub(struct task_struct *p) {}
4906 static void cpu_online_stub(s32 cpu) {}
4907 static void cpu_offline_stub(s32 cpu) {}
4908 static s32 init_stub(void) { return -EINVAL; }
4909 static void exit_stub(struct scx_exit_info *info) {}
4910 
4911 static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
4912 	.select_cpu = select_cpu_stub,
4913 	.enqueue = enqueue_stub,
4914 	.dequeue = dequeue_stub,
4915 	.dispatch = dispatch_stub,
4916 	.runnable = runnable_stub,
4917 	.running = running_stub,
4918 	.stopping = stopping_stub,
4919 	.quiescent = quiescent_stub,
4920 	.yield = yield_stub,
4921 	.core_sched_before = core_sched_before_stub,
4922 	.set_weight = set_weight_stub,
4923 	.set_cpumask = set_cpumask_stub,
4924 	.update_idle = update_idle_stub,
4925 	.cpu_acquire = cpu_acquire_stub,
4926 	.cpu_release = cpu_release_stub,
4927 	.init_task = init_task_stub,
4928 	.exit_task = exit_task_stub,
4929 	.enable = enable_stub,
4930 	.disable = disable_stub,
4931 	.cpu_online = cpu_online_stub,
4932 	.cpu_offline = cpu_offline_stub,
4933 	.init = init_stub,
4934 	.exit = exit_stub,
4935 };
4936 
4937 static struct bpf_struct_ops bpf_sched_ext_ops = {
4938 	.verifier_ops = &bpf_scx_verifier_ops,
4939 	.reg = bpf_scx_reg,
4940 	.unreg = bpf_scx_unreg,
4941 	.check_member = bpf_scx_check_member,
4942 	.init_member = bpf_scx_init_member,
4943 	.init = bpf_scx_init,
4944 	.update = bpf_scx_update,
4945 	.validate = bpf_scx_validate,
4946 	.name = "sched_ext_ops",
4947 	.owner = THIS_MODULE,
4948 	.cfi_stubs = &__bpf_ops_sched_ext_ops
4949 };
4950 
4951 
4952 /********************************************************************************
4953  * System integration and init.
4954  */
4955 
4956 static void sysrq_handle_sched_ext_reset(u8 key)
4957 {
4958 	if (scx_ops_helper)
4959 		scx_ops_disable(SCX_EXIT_SYSRQ);
4960 	else
4961 		pr_info("sched_ext: BPF scheduler not yet used\n");
4962 }
4963 
4964 static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
4965 	.handler	= sysrq_handle_sched_ext_reset,
4966 	.help_msg	= "reset-sched-ext(S)",
4967 	.action_msg	= "Disable sched_ext and revert all tasks to CFS",
4968 	.enable_mask	= SYSRQ_ENABLE_RTNICE,
4969 };
4970 
4971 static void sysrq_handle_sched_ext_dump(u8 key)
4972 {
4973 	struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
4974 
4975 	if (scx_enabled())
4976 		scx_dump_state(&ei, 0);
4977 }
4978 
4979 static const struct sysrq_key_op sysrq_sched_ext_dump_op = {
4980 	.handler	= sysrq_handle_sched_ext_dump,
4981 	.help_msg	= "dump-sched-ext(D)",
4982 	.action_msg	= "Trigger sched_ext debug dump",
4983 	.enable_mask	= SYSRQ_ENABLE_RTNICE,
4984 };
4985 
4986 static bool can_skip_idle_kick(struct rq *rq)
4987 {
4988 	lockdep_assert_rq_held(rq);
4989 
4990 	/*
4991 	 * We can skip idle kicking if @rq is going to go through at least one
4992 	 * full SCX scheduling cycle before going idle. Just checking whether
4993 	 * curr is not idle is insufficient because we could be racing
4994 	 * balance_one() trying to pull the next task from a remote rq, which
4995 	 * may fail, and @rq may become idle afterwards.
4996 	 *
4997 	 * The race window is small and we don't and can't guarantee that @rq is
4998 	 * only kicked while idle anyway. Skip only when sure.
4999 	 */
5000 	return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_BALANCING);
5001 }
5002 
5003 static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs)
5004 {
5005 	struct rq *rq = cpu_rq(cpu);
5006 	struct scx_rq *this_scx = &this_rq->scx;
5007 	bool should_wait = false;
5008 	unsigned long flags;
5009 
5010 	raw_spin_rq_lock_irqsave(rq, flags);
5011 
5012 	/*
5013 	 * During CPU hotplug, a CPU may depend on kicking itself to make
5014 	 * forward progress. Allow kicking self regardless of online state.
5015 	 */
5016 	if (cpu_online(cpu) || cpu == cpu_of(this_rq)) {
5017 		if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
5018 			if (rq->curr->sched_class == &ext_sched_class)
5019 				rq->curr->scx.slice = 0;
5020 			cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
5021 		}
5022 
5023 		if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
5024 			pseqs[cpu] = rq->scx.pnt_seq;
5025 			should_wait = true;
5026 		}
5027 
5028 		resched_curr(rq);
5029 	} else {
5030 		cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
5031 		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
5032 	}
5033 
5034 	raw_spin_rq_unlock_irqrestore(rq, flags);
5035 
5036 	return should_wait;
5037 }
5038 
5039 static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq)
5040 {
5041 	struct rq *rq = cpu_rq(cpu);
5042 	unsigned long flags;
5043 
5044 	raw_spin_rq_lock_irqsave(rq, flags);
5045 
5046 	if (!can_skip_idle_kick(rq) &&
5047 	    (cpu_online(cpu) || cpu == cpu_of(this_rq)))
5048 		resched_curr(rq);
5049 
5050 	raw_spin_rq_unlock_irqrestore(rq, flags);
5051 }
5052 
5053 static void kick_cpus_irq_workfn(struct irq_work *irq_work)
5054 {
5055 	struct rq *this_rq = this_rq();
5056 	struct scx_rq *this_scx = &this_rq->scx;
5057 	unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
5058 	bool should_wait = false;
5059 	s32 cpu;
5060 
5061 	for_each_cpu(cpu, this_scx->cpus_to_kick) {
5062 		should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
5063 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
5064 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
5065 	}
5066 
5067 	for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) {
5068 		kick_one_cpu_if_idle(cpu, this_rq);
5069 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
5070 	}
5071 
5072 	if (!should_wait)
5073 		return;
5074 
5075 	for_each_cpu(cpu, this_scx->cpus_to_wait) {
5076 		unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq;
5077 
5078 		if (cpu != cpu_of(this_rq)) {
5079 			/*
5080 			 * Pairs with smp_store_release() issued by this CPU in
5081 			 * scx_next_task_picked() on the resched path.
5082 			 *
5083 			 * We busy-wait here to guarantee that no other task can
5084 			 * be scheduled on our core before the target CPU has
5085 			 * entered the resched path.
5086 			 */
5087 			while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu])
5088 				cpu_relax();
5089 		}
5090 
5091 		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
5092 	}
5093 }
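
/*
 * The cpus_to_kick / cpus_to_preempt / cpus_to_wait masks consumed above are
 * populated by the scx_bpf_kick_cpu() kfunc. From the BPF scheduler that
 * typically looks like (illustrative):
 *
 *	scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
 *
 * which, via kick_one_cpu() above, clears the remote CPU's current slice and
 * forces it through the scheduling path.
 */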
5094 
5095 /**
5096  * print_scx_info - print out sched_ext scheduler state
5097  * @log_lvl: the log level to use when printing
5098  * @p: target task
5099  *
5100  * If a sched_ext scheduler is enabled, print the name and state of the
5101  * scheduler. If @p is on sched_ext, print further information about the task.
5102  *
5103  * This function can be safely called on any task as long as the task_struct
5104  * itself is accessible. While safe, this function isn't synchronized and may
5105  * print mixed-up or garbled output of limited length.
5106  */
5107 void print_scx_info(const char *log_lvl, struct task_struct *p)
5108 {
5109 	enum scx_ops_enable_state state = scx_ops_enable_state();
5110 	const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
5111 	char runnable_at_buf[22] = "?";
5112 	struct sched_class *class;
5113 	unsigned long runnable_at;
5114 
5115 	if (state == SCX_OPS_DISABLED)
5116 		return;
5117 
5118 	/*
5119 	 * Carefully check if the task was running on sched_ext, and then
5120 	 * carefully copy the time it's been runnable, and its state.
5121 	 */
5122 	if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
5123 	    class != &ext_sched_class) {
5124 		printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
5125 		       scx_ops_enable_state_str[state], all);
5126 		return;
5127 	}
5128 
5129 	if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at,
5130 				      sizeof(runnable_at)))
5131 		scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms",
5132 			  jiffies_delta_msecs(runnable_at, jiffies));
5133 
5134 	/* print everything onto one line to conserve console space */
5135 	printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
5136 	       log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all,
5137 	       runnable_at_buf);
5138 }
5139 
5140 static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr)
5141 {
5142 	/*
5143 	 * SCX schedulers often have userspace components which are sometimes
5144 	 * involved in critical scheduling paths. PM operations involve freezing
5145 	 * userspace, which can lead to scheduling misbehaviors including stalls.
5146 	 * Let's bypass while PM operations are in progress.
5147 	 */
5148 	switch (event) {
5149 	case PM_HIBERNATION_PREPARE:
5150 	case PM_SUSPEND_PREPARE:
5151 	case PM_RESTORE_PREPARE:
5152 		scx_ops_bypass(true);
5153 		break;
5154 	case PM_POST_HIBERNATION:
5155 	case PM_POST_SUSPEND:
5156 	case PM_POST_RESTORE:
5157 		scx_ops_bypass(false);
5158 		break;
5159 	}
5160 
5161 	return NOTIFY_OK;
5162 }
5163 
5164 static struct notifier_block scx_pm_notifier = {
5165 	.notifier_call = scx_pm_handler,
5166 };
5167 
5168 void __init init_sched_ext_class(void)
5169 {
5170 	s32 cpu, v;
5171 
5172 	/*
5173 	 * The following is to prevent the compiler from optimizing out the enum
5174 	 * definitions so that BPF scheduler implementations can use them
5175 	 * through the generated vmlinux.h.
5176 	 */
5177 	WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT);
5178 
5179 	BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
5180 	init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL);
5181 #ifdef CONFIG_SMP
5182 	BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
5183 	BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
5184 #endif
5185 	scx_kick_cpus_pnt_seqs =
5186 		__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
5187 			       __alignof__(scx_kick_cpus_pnt_seqs[0]));
5188 	BUG_ON(!scx_kick_cpus_pnt_seqs);
5189 
5190 	for_each_possible_cpu(cpu) {
5191 		struct rq *rq = cpu_rq(cpu);
5192 
5193 		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
5194 		INIT_LIST_HEAD(&rq->scx.runnable_list);
5195 
5196 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
5197 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
5198 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
5199 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
5200 		init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
5201 
5202 		if (cpu_online(cpu))
5203 			cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
5204 	}
5205 
5206 	register_sysrq_key('S', &sysrq_sched_ext_reset_op);
5207 	register_sysrq_key('D', &sysrq_sched_ext_dump_op);
5208 	INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
5209 }
5210 
5211 
5212 /********************************************************************************
5213  * Helpers that can be called from the BPF scheduler.
5214  */
5215 #include <linux/btf_ids.h>
5216 
5217 __bpf_kfunc_start_defs();
5218 
5219 /**
5220  * scx_bpf_create_dsq - Create a custom DSQ
5221  * @dsq_id: DSQ to create
5222  * @node: NUMA node to allocate from
5223  *
5224  * Create a custom DSQ identified by @dsq_id. Can be called from ops.init() and
5225  * ops.init_task().
5226  */
5227 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
5228 {
5229 	if (!scx_kf_allowed(SCX_KF_SLEEPABLE))
5230 		return -EINVAL;
5231 
5232 	if (unlikely(node >= (int)nr_node_ids ||
5233 		     (node < 0 && node != NUMA_NO_NODE)))
5234 		return -EINVAL;
5235 	return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
5236 }
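
/*
 * Illustrative sketch (editor's example, not part of this file): custom DSQs
 * are typically created from ops.init(). BPF_STRUCT_OPS_SLEEPABLE() and
 * EXAMPLE_DSQ are assumed to come from the scheduler's own BPF sources and the
 * scx BPF-side headers, not from this file. Passing -1 as @node allocates from
 * any NUMA node.
 *
 *	s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
 *	{
 *		return scx_bpf_create_dsq(EXAMPLE_DSQ, -1);
 *	}
 */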
5237 
5238 __bpf_kfunc_end_defs();
5239 
5240 BTF_KFUNCS_START(scx_kfunc_ids_sleepable)
5241 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
5242 BTF_KFUNCS_END(scx_kfunc_ids_sleepable)
5243 
5244 static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = {
5245 	.owner			= THIS_MODULE,
5246 	.set			= &scx_kfunc_ids_sleepable,
5247 };
5248 
5249 __bpf_kfunc_start_defs();
5250 
5251 /**
5252  * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
5253  * @p: task_struct to select a CPU for
5254  * @prev_cpu: CPU @p was on previously
5255  * @wake_flags: %SCX_WAKE_* flags
5256  * @is_idle: out parameter indicating whether the returned CPU is idle
5257  *
5258  * Can only be called from ops.select_cpu() if the built-in CPU selection is
5259  * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
5260  * @p, @prev_cpu and @wake_flags match ops.select_cpu().
5261  *
5262  * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
5263  * currently idle and thus a good candidate for direct dispatching.
5264  */
5265 __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
5266 				       u64 wake_flags, bool *is_idle)
5267 {
5268 	if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) {
5269 		*is_idle = false;
5270 		return prev_cpu;
5271 	}
5272 #ifdef CONFIG_SMP
5273 	return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
5274 #else
5275 	*is_idle = false;
5276 	return prev_cpu;
5277 #endif
5278 }
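
/*
 * Illustrative sketch (editor's example, not part of this file): using the
 * default CPU selection from ops.select_cpu() and direct-dispatching when an
 * idle CPU was found. BPF_STRUCT_OPS() is an assumed convenience macro from
 * the scheduler-side headers; SCX_SLICE_DFL is the default slice constant.
 *
 *	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
 *			   s32 prev_cpu, u64 wake_flags)
 *	{
 *		bool is_idle = false;
 *		s32 cpu;
 *
 *		cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
 *		if (is_idle)
 *			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 *		return cpu;
 *	}
 */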
5279 
5280 __bpf_kfunc_end_defs();
5281 
5282 BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
5283 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
5284 BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
5285 
5286 static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
5287 	.owner			= THIS_MODULE,
5288 	.set			= &scx_kfunc_ids_select_cpu,
5289 };
5290 
5291 static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags)
5292 {
5293 	if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
5294 		return false;
5295 
5296 	lockdep_assert_irqs_disabled();
5297 
5298 	if (unlikely(!p)) {
5299 		scx_ops_error("called with NULL task");
5300 		return false;
5301 	}
5302 
5303 	if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
5304 		scx_ops_error("invalid enq_flags 0x%llx", enq_flags);
5305 		return false;
5306 	}
5307 
5308 	return true;
5309 }
5310 
5311 static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags)
5312 {
5313 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
5314 	struct task_struct *ddsp_task;
5315 
5316 	ddsp_task = __this_cpu_read(direct_dispatch_task);
5317 	if (ddsp_task) {
5318 		mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags);
5319 		return;
5320 	}
5321 
5322 	if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
5323 		scx_ops_error("dispatch buffer overflow");
5324 		return;
5325 	}
5326 
5327 	dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){
5328 		.task = p,
5329 		.qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK,
5330 		.dsq_id = dsq_id,
5331 		.enq_flags = enq_flags,
5332 	};
5333 }
5334 
5335 __bpf_kfunc_start_defs();
5336 
5337 /**
5338  * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ
5339  * @p: task_struct to dispatch
5340  * @dsq_id: DSQ to dispatch to
5341  * @slice: duration @p can run for in nsecs
5342  * @enq_flags: SCX_ENQ_*
5343  *
5344  * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe
5345  * to call this function spuriously. Can be called from ops.enqueue(),
5346  * ops.select_cpu(), and ops.dispatch().
5347  *
5348  * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
5349  * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be
5350  * used to target the local DSQ of a CPU other than the enqueueing one. Use
5351  * ops.select_cpu() to be on the target CPU in the first place.
5352  *
5353  * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p
5354  * will be directly dispatched to the corresponding dispatch queue after
5355  * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be
5356  * dispatched to the local DSQ of the CPU returned by ops.select_cpu().
5357  * @enq_flags are OR'd with the enqueue flags on the enqueue path before the
5358  * task is dispatched.
5359  *
5360  * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
5361  * and this function can be called up to ops.dispatch_max_batch times to dispatch
5362  * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of remaining
5363  * slots. scx_bpf_consume() flushes the batch and resets the counter.
5364  *
5365  * This function doesn't have any locking restrictions and may be called under
5366  * BPF locks (in the future when BPF introduces more flexible locking).
5367  *
5368  * @p is allowed to run for @slice. The scheduling path is triggered on slice
5369  * exhaustion. If zero, the current residual slice is maintained. If
5370  * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with
5371  * scx_bpf_kick_cpu() to trigger scheduling.
5372  */
5373 __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
5374 				  u64 enq_flags)
5375 {
5376 	if (!scx_dispatch_preamble(p, enq_flags))
5377 		return;
5378 
5379 	if (slice)
5380 		p->scx.slice = slice;
5381 	else
5382 		p->scx.slice = p->scx.slice ?: 1;
5383 
5384 	scx_dispatch_commit(p, dsq_id, enq_flags);
5385 }
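
/*
 * Illustrative sketch (editor's example, not part of this file): a minimal
 * ops.enqueue() that forwards every task to the global DSQ with the default
 * slice, preserving the incoming @enq_flags. BPF_STRUCT_OPS() is an assumed
 * convenience macro from the scheduler-side headers.
 *
 *	void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
 *	}
 */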
5386 
5387 /**
5388  * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ
5389  * @p: task_struct to dispatch
5390  * @dsq_id: DSQ to dispatch to
5391  * @slice: duration @p can run for in nsecs
5392  * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
5393  * @enq_flags: SCX_ENQ_*
5394  *
5395  * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id.
5396  * Tasks queued into the priority queue are ordered by @vtime and always
5397  * consumed after the tasks in the FIFO queue. All other aspects are identical
5398  * to scx_bpf_dispatch().
5399  *
5400  * @vtime ordering is according to time_before64() which considers wrapping. A
5401  * numerically larger vtime may indicate an earlier position in the ordering and
5402  * vice-versa.
5403  */
5404 __bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
5405 					u64 slice, u64 vtime, u64 enq_flags)
5406 {
5407 	if (!scx_dispatch_preamble(p, enq_flags))
5408 		return;
5409 
5410 	if (slice)
5411 		p->scx.slice = slice;
5412 	else
5413 		p->scx.slice = p->scx.slice ?: 1;
5414 
5415 	p->scx.dsq_vtime = vtime;
5416 
5417 	scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
5418 }
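
/*
 * Illustrative sketch (editor's example, not part of this file): enqueueing
 * into a custom DSQ ordered by a per-task virtual time. EXAMPLE_DSQ and the
 * vtime bookkeeping are assumptions; scx_bpf_dispatch_vtime() and
 * SCX_SLICE_DFL are the kernel-provided pieces.
 *
 *	void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		scx_bpf_dispatch_vtime(p, EXAMPLE_DSQ, SCX_SLICE_DFL,
 *				       p->scx.dsq_vtime, enq_flags);
 *	}
 */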
5419 
5420 __bpf_kfunc_end_defs();
5421 
5422 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
5423 BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
5424 BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU)
5425 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
5426 
5427 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
5428 	.owner			= THIS_MODULE,
5429 	.set			= &scx_kfunc_ids_enqueue_dispatch,
5430 };
5431 
5432 __bpf_kfunc_start_defs();
5433 
5434 /**
5435  * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
5436  *
5437  * Can only be called from ops.dispatch().
5438  */
5439 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
5440 {
5441 	if (!scx_kf_allowed(SCX_KF_DISPATCH))
5442 		return 0;
5443 
5444 	return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
5445 }
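
/*
 * Illustrative sketch (editor's example, not part of this file): bounding a
 * dispatch loop in ops.dispatch() by the remaining batch slots. example_queue
 * stands in for whatever BPF queue map the scheduler maintains;
 * bpf_map_pop_elem(), bpf_task_from_pid() and bpf_task_release() are generic
 * BPF helpers/kfuncs, not defined here.
 *
 *	void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		s32 pid;
 *
 *		while (scx_bpf_dispatch_nr_slots() &&
 *		       !bpf_map_pop_elem(&example_queue, &pid)) {
 *			struct task_struct *p = bpf_task_from_pid(pid);
 *
 *			if (!p)
 *				continue;
 *			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 *			bpf_task_release(p);
 *		}
 *	}
 */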
5446 
5447 /**
5448  * scx_bpf_dispatch_cancel - Cancel the latest dispatch
5449  *
5450  * Cancel the latest dispatch. Can be called multiple times to cancel further
5451  * dispatches. Can only be called from ops.dispatch().
5452  */
5453 __bpf_kfunc void scx_bpf_dispatch_cancel(void)
5454 {
5455 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
5456 
5457 	if (!scx_kf_allowed(SCX_KF_DISPATCH))
5458 		return;
5459 
5460 	if (dspc->cursor > 0)
5461 		dspc->cursor--;
5462 	else
5463 		scx_ops_error("dispatch buffer underflow");
5464 }
5465 
5466 /**
5467  * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ
5468  * @dsq_id: DSQ to consume
5469  *
5470  * Consume a task from the non-local DSQ identified by @dsq_id and transfer it
5471  * to the current CPU's local DSQ for execution. Can only be called from
5472  * ops.dispatch().
5473  *
5474  * This function flushes the in-flight dispatches from scx_bpf_dispatch() before
5475  * trying to consume the specified DSQ. It may also grab rq locks and thus can't
5476  * be called under any BPF locks.
5477  *
5478  * Returns %true if a task has been consumed, %false if there isn't any task to
5479  * consume.
5480  */
5481 __bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
5482 {
5483 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
5484 	struct scx_dispatch_q *dsq;
5485 
5486 	if (!scx_kf_allowed(SCX_KF_DISPATCH))
5487 		return false;
5488 
5489 	flush_dispatch_buf(dspc->rq, dspc->rf);
5490 
5491 	dsq = find_non_local_dsq(dsq_id);
5492 	if (unlikely(!dsq)) {
5493 		scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id);
5494 		return false;
5495 	}
5496 
5497 	if (consume_dispatch_q(dspc->rq, dspc->rf, dsq)) {
5498 		/*
5499 		 * A successfully consumed task can be dequeued before it starts
5500 		 * running while the CPU is trying to migrate other dispatched
5501 		 * tasks. Bump nr_tasks to tell balance_scx() to retry on empty
5502 		 * local DSQ.
5503 		 */
5504 		dspc->nr_tasks++;
5505 		return true;
5506 	} else {
5507 		return false;
5508 	}
5509 }
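
/*
 * Illustrative sketch (editor's example, not part of this file): the common
 * ops.dispatch() pattern of refilling the local DSQ from a single shared
 * custom DSQ. EXAMPLE_DSQ is an assumed scheduler-defined DSQ ID.
 *
 *	void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		scx_bpf_consume(EXAMPLE_DSQ);
 *	}
 */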
5510 
5511 __bpf_kfunc_end_defs();
5512 
5513 BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
5514 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
5515 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
5516 BTF_ID_FLAGS(func, scx_bpf_consume)
5517 BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
5518 
5519 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
5520 	.owner			= THIS_MODULE,
5521 	.set			= &scx_kfunc_ids_dispatch,
5522 };
5523 
5524 __bpf_kfunc_start_defs();
5525 
5526 /**
5527  * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
5528  *
5529  * Iterate over all of the tasks currently enqueued on the local DSQ of the
5530  * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
5531  * processed tasks. Can only be called from ops.cpu_release().
5532  */
5533 __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
5534 {
5535 	u32 nr_enqueued, i;
5536 	struct rq *rq;
5537 
5538 	if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
5539 		return 0;
5540 
5541 	rq = cpu_rq(smp_processor_id());
5542 	lockdep_assert_rq_held(rq);
5543 
5544 	/*
5545 	 * Get the number of tasks on the local DSQ before iterating over it to
5546 	 * pull off tasks. The enqueue callback below can signal that it wants
5547 	 * the task to stay on the local DSQ, and we want to prevent the BPF
5548 	 * scheduler from causing us to loop indefinitely.
5549 	 */
5550 	nr_enqueued = rq->scx.local_dsq.nr;
5551 	for (i = 0; i < nr_enqueued; i++) {
5552 		struct task_struct *p;
5553 
5554 		p = first_local_task(rq);
5555 		WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) !=
5556 			     SCX_OPSS_NONE);
5557 		WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
5558 		WARN_ON_ONCE(p->scx.holding_cpu != -1);
5559 		dispatch_dequeue(rq, p);
5560 		do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
5561 	}
5562 
5563 	return nr_enqueued;
5564 }
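
/*
 * Illustrative sketch (editor's example, not part of this file): when a CPU is
 * taken over by a higher priority sched class, ops.cpu_release() can punt the
 * tasks still sitting on that CPU's local DSQ back to the BPF scheduler so
 * they can be placed elsewhere.
 *
 *	void BPF_STRUCT_OPS(example_cpu_release, s32 cpu,
 *			    struct scx_cpu_release_args *args)
 *	{
 *		scx_bpf_reenqueue_local();
 *	}
 */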
5565 
5566 __bpf_kfunc_end_defs();
5567 
5568 BTF_KFUNCS_START(scx_kfunc_ids_cpu_release)
5569 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local)
5570 BTF_KFUNCS_END(scx_kfunc_ids_cpu_release)
5571 
5572 static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = {
5573 	.owner			= THIS_MODULE,
5574 	.set			= &scx_kfunc_ids_cpu_release,
5575 };
5576 
5577 __bpf_kfunc_start_defs();
5578 
5579 /**
5580  * scx_bpf_kick_cpu - Trigger reschedule on a CPU
5581  * @cpu: cpu to kick
5582  * @flags: %SCX_KICK_* flags
5583  *
5584  * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
5585  * trigger rescheduling on a busy CPU. This can be called from any online
5586  * scx_ops operation and the actual kicking is performed asynchronously through
5587  * an irq work.
5588  */
5589 __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
5590 {
5591 	struct rq *this_rq;
5592 	unsigned long irq_flags;
5593 
5594 	if (!ops_cpu_valid(cpu, NULL))
5595 		return;
5596 
5597 	/*
5598 	 * While bypassing for PM ops, IRQ handling may not be online, which can
5599 	 * lead to irq_work_queue() malfunctioning, such as an infinite busy wait
5600 	 * for an IRQ status update. Suppress kicking.
5601 	 */
5602 	if (scx_ops_bypassing())
5603 		return;
5604 
5605 	local_irq_save(irq_flags);
5606 
5607 	this_rq = this_rq();
5608 
5609 	/*
5610 	 * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting
5611 	 * rq locks. We can probably be smarter and avoid bouncing if called
5612 	 * from ops which don't hold a rq lock.
5613 	 */
5614 	if (flags & SCX_KICK_IDLE) {
5615 		struct rq *target_rq = cpu_rq(cpu);
5616 
5617 		if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
5618 			scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
5619 
5620 		if (raw_spin_rq_trylock(target_rq)) {
5621 			if (can_skip_idle_kick(target_rq)) {
5622 				raw_spin_rq_unlock(target_rq);
5623 				goto out;
5624 			}
5625 			raw_spin_rq_unlock(target_rq);
5626 		}
5627 		cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle);
5628 	} else {
5629 		cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
5630 
5631 		if (flags & SCX_KICK_PREEMPT)
5632 			cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
5633 		if (flags & SCX_KICK_WAIT)
5634 			cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait);
5635 	}
5636 
5637 	irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
5638 out:
5639 	local_irq_restore(irq_flags);
5640 }
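
/*
 * Illustrative sketch (editor's example, not part of this file): after queueing
 * @p on a custom DSQ from ops.enqueue(), wake up a CPU so the task is picked up
 * promptly. example_pick_target_cpu() is hypothetical; SCX_KICK_IDLE makes the
 * kick a no-op unless the target is idle.
 *
 *	void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		s32 cpu = example_pick_target_cpu(p);
 *
 *		scx_bpf_dispatch(p, EXAMPLE_DSQ, SCX_SLICE_DFL, enq_flags);
 *		scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
 *	}
 */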
5641 
5642 /**
5643  * scx_bpf_dsq_nr_queued - Return the number of queued tasks
5644  * @dsq_id: id of the DSQ
5645  *
5646  * Return the number of tasks in the DSQ matching @dsq_id. If not found,
5647  * -%ENOENT is returned.
5648  */
5649 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
5650 {
5651 	struct scx_dispatch_q *dsq;
5652 	s32 ret;
5653 
5654 	preempt_disable();
5655 
5656 	if (dsq_id == SCX_DSQ_LOCAL) {
5657 		ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
5658 		goto out;
5659 	} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
5660 		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
5661 
5662 		if (ops_cpu_valid(cpu, NULL)) {
5663 			ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
5664 			goto out;
5665 		}
5666 	} else {
5667 		dsq = find_non_local_dsq(dsq_id);
5668 		if (dsq) {
5669 			ret = READ_ONCE(dsq->nr);
5670 			goto out;
5671 		}
5672 	}
5673 	ret = -ENOENT;
5674 out:
5675 	preempt_enable();
5676 	return ret;
5677 }
5678 
5679 /**
5680  * scx_bpf_destroy_dsq - Destroy a custom DSQ
5681  * @dsq_id: DSQ to destroy
5682  *
5683  * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with
5684  * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is
5685  * empty and no further tasks are dispatched to it. Ignored if called on a DSQ
5686  * which doesn't exist. Can be called from any online scx_ops operation.
5687  */
5688 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
5689 {
5690 	destroy_dsq(dsq_id);
5691 }
5692 
5693 __bpf_kfunc_end_defs();
5694 
5695 static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
5696 			 char *fmt, unsigned long long *data, u32 data__sz)
5697 {
5698 	struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
5699 	s32 ret;
5700 
5701 	if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
5702 	    (data__sz && !data)) {
5703 		scx_ops_error("invalid data=%p and data__sz=%u",
5704 			      (void *)data, data__sz);
5705 		return -EINVAL;
5706 	}
5707 
5708 	ret = copy_from_kernel_nofault(data_buf, data, data__sz);
5709 	if (ret < 0) {
5710 		scx_ops_error("failed to read data fields (%d)", ret);
5711 		return ret;
5712 	}
5713 
5714 	ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
5715 				  &bprintf_data);
5716 	if (ret < 0) {
5717 		scx_ops_error("format preparation failed (%d)", ret);
5718 		return ret;
5719 	}
5720 
5721 	ret = bstr_printf(line_buf, line_size, fmt,
5722 			  bprintf_data.bin_args);
5723 	bpf_bprintf_cleanup(&bprintf_data);
5724 	if (ret < 0) {
5725 		scx_ops_error("(\"%s\", %p, %u) failed to format",
5726 			      fmt, data, data__sz);
5727 		return ret;
5728 	}
5729 
5730 	return ret;
5731 }
5732 
5733 static s32 bstr_format(struct scx_bstr_buf *buf,
5734 		       char *fmt, unsigned long long *data, u32 data__sz)
5735 {
5736 	return __bstr_format(buf->data, buf->line, sizeof(buf->line),
5737 			     fmt, data, data__sz);
5738 }
5739 
5740 __bpf_kfunc_start_defs();
5741 
5742 /**
5743  * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
5744  * @exit_code: Exit value to pass to user space via struct scx_exit_info.
5745  * @fmt: error message format string
5746  * @data: format string parameters packaged using ___bpf_fill() macro
5747  * @data__sz: @data len, must end in '__sz' for the verifier
5748  *
5749  * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
5750  * disabling.
5751  */
5752 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
5753 				   unsigned long long *data, u32 data__sz)
5754 {
5755 	unsigned long flags;
5756 
5757 	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
5758 	if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
5759 		scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s",
5760 				  scx_exit_bstr_buf.line);
5761 	raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
5762 }
5763 
5764 /**
5765  * scx_bpf_error_bstr - Indicate fatal error
5766  * @fmt: error message format string
5767  * @data: format string parameters packaged using ___bpf_fill() macro
5768  * @data__sz: @data len, must end in '__sz' for the verifier
5769  *
5770  * Indicate that the BPF scheduler encountered a fatal error and initiate ops
5771  * disabling.
5772  */
5773 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
5774 				    u32 data__sz)
5775 {
5776 	unsigned long flags;
5777 
5778 	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
5779 	if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
5780 		scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s",
5781 				  scx_exit_bstr_buf.line);
5782 	raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
5783 }
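
/*
 * Illustrative sketch (editor's example, not part of this file): BPF schedulers
 * normally reach scx_bpf_exit_bstr() and scx_bpf_error_bstr() through the
 * scx_bpf_exit() and scx_bpf_error() wrappers, which are assumed to be provided
 * by the scx BPF-side headers and to package the varargs into the
 * (@data, @data__sz) pair expected here.
 *
 *	if (scx_bpf_create_dsq(EXAMPLE_DSQ, -1))
 *		scx_bpf_error("failed to create DSQ 0x%llx", EXAMPLE_DSQ);
 */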
5784 
5785 /**
5786  * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler
5787  * @fmt: format string
5788  * @data: format string parameters packaged using ___bpf_fill() macro
5789  * @data__sz: @data len, must end in '__sz' for the verifier
5790  *
5791  * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and
5792  * dump_task() to generate extra debug dump specific to the BPF scheduler.
5793  *
5794  * The extra dump may be multiple lines. A single line may be split over
5795  * multiple calls. The last line is automatically terminated.
5796  */
5797 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
5798 				   u32 data__sz)
5799 {
5800 	struct scx_dump_data *dd = &scx_dump_data;
5801 	struct scx_bstr_buf *buf = &dd->buf;
5802 	s32 ret;
5803 
5804 	if (raw_smp_processor_id() != dd->cpu) {
5805 		scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends");
5806 		return;
5807 	}
5808 
5809 	/* append the formatted string to the line buf */
5810 	ret = __bstr_format(buf->data, buf->line + dd->cursor,
5811 			    sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
5812 	if (ret < 0) {
5813 		dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
5814 			  dd->prefix, fmt, data, data__sz, ret);
5815 		return;
5816 	}
5817 
5818 	dd->cursor += ret;
5819 	dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line));
5820 
5821 	if (!dd->cursor)
5822 		return;
5823 
5824 	/*
5825 	 * If the line buf overflowed or ends in a newline, flush it into the
5826 	 * dump. This is to allow the caller to generate a single line over
5827 	 * multiple calls. As ops_dump_flush() can also handle multiple lines in
5828 	 * the line buf, the only case which can lead to unexpected truncation
5829 	 * is when the caller keeps generating newlines in the middle of the
5830 	 * buffer instead of at the end. Don't do that.
5831 	 */
5832 	if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n')
5833 		ops_dump_flush();
5834 }
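
/*
 * Illustrative sketch (editor's example, not part of this file): ops.dump()
 * typically goes through the scx_bpf_dump() wrapper (assumed to be provided by
 * the scx BPF-side headers) to append scheduler-specific state to the debug
 * dump. example_nr_queued is a hypothetical scheduler-side counter.
 *
 *	void BPF_STRUCT_OPS(example_dump, struct scx_dump_ctx *dctx)
 *	{
 *		scx_bpf_dump("example: nr_queued=%llu\n", example_nr_queued);
 *	}
 */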
5835 
5836 /**
5837  * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
5838  *
5839  * All valid CPU IDs in the system are smaller than the returned value.
5840  */
5841 __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
5842 {
5843 	return nr_cpu_ids;
5844 }
5845 
5846 /**
5847  * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
5848  */
5849 __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
5850 {
5851 	return cpu_possible_mask;
5852 }
5853 
5854 /**
5855  * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
5856  */
5857 __bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
5858 {
5859 	return cpu_online_mask;
5860 }
5861 
5862 /**
5863  * scx_bpf_put_cpumask - Release a possible/online cpumask
5864  * @cpumask: cpumask to release
5865  */
5866 __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
5867 {
5868 	/*
5869 	 * Empty function body because we aren't actually acquiring or releasing
5870 	 * a reference to a global cpumask, which is read-only in the caller and
5871 	 * is never released. The acquire / release semantics here are just used
5872 	 * to make the cpumask a trusted pointer in the caller.
5873 	 */
5874 }
5875 
5876 /**
5877  * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
5878  * per-CPU cpumask.
5879  *
5880  * Returns an empty mask if idle tracking is not enabled, or running on a UP kernel.
5881  */
5882 __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
5883 {
5884 	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
5885 		scx_ops_error("built-in idle tracking is disabled");
5886 		return cpu_none_mask;
5887 	}
5888 
5889 #ifdef CONFIG_SMP
5890 	return idle_masks.cpu;
5891 #else
5892 	return cpu_none_mask;
5893 #endif
5894 }
5895 
5896 /**
5897  * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
5898  * per-physical-core cpumask. Can be used to determine if an entire physical
5899  * core is free.
5900  *
5901  * Returns an empty mask if idle tracking is not enabled, or running on a UP kernel.
5902  */
5903 __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
5904 {
5905 	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
5906 		scx_ops_error("built-in idle tracking is disabled");
5907 		return cpu_none_mask;
5908 	}
5909 
5910 #ifdef CONFIG_SMP
5911 	if (sched_smt_active())
5912 		return idle_masks.smt;
5913 	else
5914 		return idle_masks.cpu;
5915 #else
5916 	return cpu_none_mask;
5917 #endif
5918 }
5919 
5920 /**
5921  * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
5922  * either the per-CPU or SMT idle-tracking cpumask.
5923  */
5924 __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
5925 {
5926 	/*
5927 	 * Empty function body because we aren't actually acquiring or releasing
5928 	 * a reference to a global idle cpumask, which is read-only in the
5929 	 * caller and is never released. The acquire / release semantics here
5930 	 * are just used to make the cpumask a trusted pointer in the caller.
5931 	 */
5932 }
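
/*
 * Illustrative sketch (editor's example, not part of this file): the idle
 * cpumask getters pair with scx_bpf_put_idle_cpumask() in acquire/release
 * fashion even though no reference is actually taken.
 * bpf_cpumask_any_distribute() is assumed to be available among the generic
 * BPF cpumask kfuncs.
 *
 *	const struct cpumask *idle = scx_bpf_get_idle_cpumask();
 *	s32 cpu = bpf_cpumask_any_distribute(idle);
 *
 *	scx_bpf_put_idle_cpumask(idle);
 *	if (cpu < scx_bpf_nr_cpu_ids())
 *		scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
 */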
5933 
5934 /**
5935  * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
5936  * @cpu: cpu to test and clear idle for
5937  *
5938  * Returns %true if @cpu was idle and its idle state was successfully cleared.
5939  * %false otherwise.
5940  *
5941  * Unavailable if ops.update_idle() is implemented and
5942  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
5943  */
5944 __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
5945 {
5946 	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
5947 		scx_ops_error("built-in idle tracking is disabled");
5948 		return false;
5949 	}
5950 
5951 	if (ops_cpu_valid(cpu, NULL))
5952 		return test_and_clear_cpu_idle(cpu);
5953 	else
5954 		return false;
5955 }
5956 
5957 /**
5958  * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
5959  * @cpus_allowed: Allowed cpumask
5960  * @flags: %SCX_PICK_IDLE_CPU_* flags
5961  *
5962  * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
5963  * number on success. -%EBUSY if no matching cpu was found.
5964  *
5965  * Idle CPU tracking may race against CPU scheduling state transitions. For
5966  * example, this function may return -%EBUSY as CPUs are transitioning into the
5967  * idle state. If the caller then assumes that there will be dispatch events on
5968  * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
5969  * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
5970  * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
5971  * event in the near future.
5972  *
5973  * Unavailable if ops.update_idle() is implemented and
5974  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
5975  */
5976 __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
5977 				      u64 flags)
5978 {
5979 	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
5980 		scx_ops_error("built-in idle tracking is disabled");
5981 		return -EBUSY;
5982 	}
5983 
5984 	return scx_pick_idle_cpu(cpus_allowed, flags);
5985 }
5986 
5987 /**
5988  * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
5989  * @cpus_allowed: Allowed cpumask
5990  * @flags: %SCX_PICK_IDLE_CPU_* flags
5991  *
5992  * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
5993  * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked CPU
5994  * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
5995  * empty.
5996  *
5997  * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
5998  * set, this function can't tell which CPUs are idle and will always pick any
5999  * CPU.
6000  */
6001 __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
6002 				     u64 flags)
6003 {
6004 	s32 cpu;
6005 
6006 	if (static_branch_likely(&scx_builtin_idle_enabled)) {
6007 		cpu = scx_pick_idle_cpu(cpus_allowed, flags);
6008 		if (cpu >= 0)
6009 			return cpu;
6010 	}
6011 
6012 	cpu = cpumask_any_distribute(cpus_allowed);
6013 	if (cpu < nr_cpu_ids)
6014 		return cpu;
6015 	else
6016 		return -EBUSY;
6017 }
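
/*
 * Illustrative sketch (editor's example, not part of this file): as suggested
 * in the scx_bpf_pick_idle_cpu() comment above, falling back to
 * scx_bpf_pick_any_cpu() plus a kick guarantees a future dispatch event even
 * when idle tracking races with CPU state transitions.
 *
 *	s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
 *
 *	if (cpu < 0)
 *		cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
 *	if (cpu >= 0)
 *		scx_bpf_kick_cpu(cpu, 0);
 */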
6018 
6019 /**
6020  * scx_bpf_task_running - Is task currently running?
6021  * @p: task of interest
6022  */
6023 __bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
6024 {
6025 	return task_rq(p)->curr == p;
6026 }
6027 
6028 /**
6029  * scx_bpf_task_cpu - CPU a task is currently associated with
6030  * @p: task of interest
6031  */
6032 __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
6033 {
6034 	return task_cpu(p);
6035 }
6036 
6037 __bpf_kfunc_end_defs();
6038 
6039 BTF_KFUNCS_START(scx_kfunc_ids_any)
6040 BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
6041 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
6042 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
6043 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
6044 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
6045 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
6046 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
6047 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
6048 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
6049 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
6050 BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
6051 BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
6052 BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
6053 BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
6054 BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
6055 BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
6056 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
6057 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
6058 BTF_KFUNCS_END(scx_kfunc_ids_any)
6059 
6060 static const struct btf_kfunc_id_set scx_kfunc_set_any = {
6061 	.owner			= THIS_MODULE,
6062 	.set			= &scx_kfunc_ids_any,
6063 };
6064 
6065 static int __init scx_init(void)
6066 {
6067 	int ret;
6068 
6069 	/*
6070 	 * kfunc registration can't be done from init_sched_ext_class() as
6071 	 * register_btf_kfunc_id_set() needs most of the system to be up.
6072 	 *
6073 	 * Some kfuncs are context-sensitive and can only be called from
6074 	 * specific SCX ops. They are grouped into BTF sets accordingly.
6075 	 * Unfortunately, BPF currently doesn't have a way of enforcing such
6076 	 * restrictions. Eventually, the verifier should be able to enforce
6077 	 * them. For now, register them the same and make each kfunc explicitly
6078 	 * check using scx_kf_allowed().
6079 	 */
6080 	if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
6081 					     &scx_kfunc_set_sleepable)) ||
6082 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
6083 					     &scx_kfunc_set_select_cpu)) ||
6084 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
6085 					     &scx_kfunc_set_enqueue_dispatch)) ||
6086 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
6087 					     &scx_kfunc_set_dispatch)) ||
6088 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
6089 					     &scx_kfunc_set_cpu_release)) ||
6090 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
6091 					     &scx_kfunc_set_any)) ||
6092 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
6093 					     &scx_kfunc_set_any)) ||
6094 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
6095 					     &scx_kfunc_set_any))) {
6096 		pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret);
6097 		return ret;
6098 	}
6099 
6100 	ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
6101 	if (ret) {
6102 		pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
6103 		return ret;
6104 	}
6105 
6106 	ret = register_pm_notifier(&scx_pm_notifier);
6107 	if (ret) {
6108 		pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret);
6109 		return ret;
6110 	}
6111 
6112 	scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj);
6113 	if (!scx_kset) {
6114 		pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n");
6115 		return -ENOMEM;
6116 	}
6117 
6118 	ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group);
6119 	if (ret < 0) {
6120 		pr_err("sched_ext: Failed to add global attributes\n");
6121 		return ret;
6122 	}
6123 
6124 	return 0;
6125 }
6126 __initcall(scx_init);
6127