1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst 4 * 5 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 6 * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 7 * Copyright (c) 2022 David Vernet <dvernet@meta.com> 8 */ 9 #include <linux/btf_ids.h> 10 #include "ext_idle.h" 11 12 #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) 13 14 enum scx_consts { 15 SCX_DSP_DFL_MAX_BATCH = 32, 16 SCX_DSP_MAX_LOOPS = 32, 17 SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, 18 19 SCX_EXIT_BT_LEN = 64, 20 SCX_EXIT_MSG_LEN = 1024, 21 SCX_EXIT_DUMP_DFL_LEN = 32768, 22 23 SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, 24 25 /* 26 * Iterating all tasks may take a while. Periodically drop 27 * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. 28 */ 29 SCX_TASK_ITER_BATCH = 32, 30 }; 31 32 enum scx_exit_kind { 33 SCX_EXIT_NONE, 34 SCX_EXIT_DONE, 35 36 SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ 37 SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ 38 SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ 39 SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ 40 41 SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ 42 SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ 43 SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ 44 }; 45 46 /* 47 * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), 48 * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes 49 * are 64bit of the format: 50 * 51 * Bits: [63 .. 48 47 .. 32 31 .. 0] 52 * [ SYS ACT ] [ SYS RSN ] [ USR ] 53 * 54 * SYS ACT: System-defined exit actions 55 * SYS RSN: System-defined exit reasons 56 * USR : User-defined exit codes and reasons 57 * 58 * Using the above, users may communicate intention and context by ORing system 59 * actions and/or system reasons with a user-defined exit code. 60 */ 61 enum scx_exit_code { 62 /* Reasons */ 63 SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, 64 65 /* Actions */ 66 SCX_ECODE_ACT_RESTART = 1LLU << 48, 67 }; 68 69 /* 70 * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is 71 * being disabled. 72 */ 73 struct scx_exit_info { 74 /* %SCX_EXIT_* - broad category of the exit reason */ 75 enum scx_exit_kind kind; 76 77 /* exit code if gracefully exiting */ 78 s64 exit_code; 79 80 /* textual representation of the above */ 81 const char *reason; 82 83 /* backtrace if exiting due to an error */ 84 unsigned long *bt; 85 u32 bt_len; 86 87 /* informational message */ 88 char *msg; 89 90 /* debug dump */ 91 char *dump; 92 }; 93 94 /* sched_ext_ops.flags */ 95 enum scx_ops_flags { 96 /* 97 * Keep built-in idle tracking even if ops.update_idle() is implemented. 98 */ 99 SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, 100 101 /* 102 * By default, if there are no other task to run on the CPU, ext core 103 * keeps running the current task even after its slice expires. If this 104 * flag is specified, such tasks are passed to ops.enqueue() with 105 * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. 106 */ 107 SCX_OPS_ENQ_LAST = 1LLU << 1, 108 109 /* 110 * An exiting task may schedule after PF_EXITING is set. In such cases, 111 * bpf_task_from_pid() may not be able to find the task and if the BPF 112 * scheduler depends on pid lookup for dispatching, the task will be 113 * lost leading to various issues including RCU grace period stalls. 
114 * 115 * To mask this problem, by default, unhashed tasks are automatically 116 * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't 117 * depend on pid lookups and wants to handle these tasks directly, the 118 * following flag can be used. 119 */ 120 SCX_OPS_ENQ_EXITING = 1LLU << 2, 121 122 /* 123 * If set, only tasks with policy set to SCHED_EXT are attached to 124 * sched_ext. If clear, SCHED_NORMAL tasks are also included. 125 */ 126 SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, 127 128 /* 129 * A migration disabled task can only execute on its current CPU. By 130 * default, such tasks are automatically put on the CPU's local DSQ with 131 * the default slice on enqueue. If this ops flag is set, they also go 132 * through ops.enqueue(). 133 * 134 * A migration disabled task never invokes ops.select_cpu() as it can 135 * only select the current CPU. Also, p->cpus_ptr will only contain its 136 * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr 137 * and thus may disagree with cpumask_weight(p->cpus_ptr). 138 */ 139 SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, 140 141 /* 142 * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes 143 * ops.enqueue() on the ops.select_cpu() selected or the wakee's 144 * previous CPU via IPI (inter-processor interrupt) to reduce cacheline 145 * transfers. When this optimization is enabled, ops.select_cpu() is 146 * skipped in some cases (when racing against the wakee switching out). 147 * As the BPF scheduler may depend on ops.select_cpu() being invoked 148 * during wakeups, queued wakeup is disabled by default. 149 * 150 * If this ops flag is set, queued wakeup optimization is enabled and 151 * the BPF scheduler must be able to handle ops.enqueue() invoked on the 152 * wakee's CPU without preceding ops.select_cpu() even for tasks which 153 * may be executed on multiple CPUs. 154 */ 155 SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, 156 157 /* 158 * If set, enable per-node idle cpumasks. If clear, use a single global 159 * flat idle cpumask. 160 */ 161 SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, 162 163 /* 164 * CPU cgroup support flags 165 */ 166 SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ 167 168 SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | 169 SCX_OPS_ENQ_LAST | 170 SCX_OPS_ENQ_EXITING | 171 SCX_OPS_ENQ_MIGRATION_DISABLED | 172 SCX_OPS_ALLOW_QUEUED_WAKEUP | 173 SCX_OPS_SWITCH_PARTIAL | 174 SCX_OPS_BUILTIN_IDLE_PER_NODE | 175 SCX_OPS_HAS_CGROUP_WEIGHT, 176 177 /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ 178 __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, 179 180 SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, 181 }; 182 183 /* argument container for ops.init_task() */ 184 struct scx_init_task_args { 185 /* 186 * Set if ops.init_task() is being invoked on the fork path, as opposed 187 * to the scheduler transition path. 188 */ 189 bool fork; 190 #ifdef CONFIG_EXT_GROUP_SCHED 191 /* the cgroup the task is joining */ 192 struct cgroup *cgroup; 193 #endif 194 }; 195 196 /* argument container for ops.exit_task() */ 197 struct scx_exit_task_args { 198 /* Whether the task exited before running on sched_ext. 
*/ 199 bool cancelled; 200 }; 201 202 /* argument container for ops->cgroup_init() */ 203 struct scx_cgroup_init_args { 204 /* the weight of the cgroup [1..10000] */ 205 u32 weight; 206 207 /* bandwidth control parameters from cpu.max and cpu.max.burst */ 208 u64 bw_period_us; 209 u64 bw_quota_us; 210 u64 bw_burst_us; 211 }; 212 213 enum scx_cpu_preempt_reason { 214 /* next task is being scheduled by &sched_class_rt */ 215 SCX_CPU_PREEMPT_RT, 216 /* next task is being scheduled by &sched_class_dl */ 217 SCX_CPU_PREEMPT_DL, 218 /* next task is being scheduled by &sched_class_stop */ 219 SCX_CPU_PREEMPT_STOP, 220 /* unknown reason for SCX being preempted */ 221 SCX_CPU_PREEMPT_UNKNOWN, 222 }; 223 224 /* 225 * Argument container for ops->cpu_acquire(). Currently empty, but may be 226 * expanded in the future. 227 */ 228 struct scx_cpu_acquire_args {}; 229 230 /* argument container for ops->cpu_release() */ 231 struct scx_cpu_release_args { 232 /* the reason the CPU was preempted */ 233 enum scx_cpu_preempt_reason reason; 234 235 /* the task that's going to be scheduled on the CPU */ 236 struct task_struct *task; 237 }; 238 239 /* 240 * Informational context provided to dump operations. 241 */ 242 struct scx_dump_ctx { 243 enum scx_exit_kind kind; 244 s64 exit_code; 245 const char *reason; 246 u64 at_ns; 247 u64 at_jiffies; 248 }; 249 250 /** 251 * struct sched_ext_ops - Operation table for BPF scheduler implementation 252 * 253 * A BPF scheduler can implement an arbitrary scheduling policy by 254 * implementing and loading operations in this table. Note that a userland 255 * scheduling policy can also be implemented using the BPF scheduler 256 * as a shim layer. 257 */ 258 struct sched_ext_ops { 259 /** 260 * @select_cpu: Pick the target CPU for a task which is being woken up 261 * @p: task being woken up 262 * @prev_cpu: the cpu @p was on before sleeping 263 * @wake_flags: SCX_WAKE_* 264 * 265 * Decision made here isn't final. @p may be moved to any CPU while it 266 * is getting dispatched for execution later. However, as @p is not on 267 * the rq at this point, getting the eventual execution CPU right here 268 * saves a small bit of overhead down the line. 269 * 270 * If an idle CPU is returned, the CPU is kicked and will try to 271 * dispatch. While an explicit custom mechanism can be added, 272 * select_cpu() serves as the default way to wake up idle CPUs. 273 * 274 * @p may be inserted into a DSQ directly by calling 275 * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. 276 * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ 277 * of the CPU returned by this operation. 278 * 279 * Note that select_cpu() is never called for tasks that can only run 280 * on a single CPU or tasks with migration disabled, as they don't have 281 * the option to select a different CPU. See select_task_rq() for 282 * details. 283 */ 284 s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); 285 286 /** 287 * @enqueue: Enqueue a task on the BPF scheduler 288 * @p: task being enqueued 289 * @enq_flags: %SCX_ENQ_* 290 * 291 * @p is ready to run. Insert directly into a DSQ by calling 292 * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly 293 * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, 294 * the task will stall. 295 * 296 * If @p was inserted into a DSQ from ops.select_cpu(), this callback is 297 * skipped. 
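	 *
	 * For illustration only, a minimal BPF-side implementation (using the
	 * BPF_STRUCT_OPS() convenience macro from the example schedulers'
	 * headers under tools/sched_ext) could insert every runnable task into
	 * the global DSQ, which CPUs consume automatically when they run out
	 * of local work:
	 *
	 *	void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p,
	 *			    u64 enq_flags)
	 *	{
	 *		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL,
	 *				   enq_flags);
	 *	}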
298 */ 299 void (*enqueue)(struct task_struct *p, u64 enq_flags); 300 301 /** 302 * @dequeue: Remove a task from the BPF scheduler 303 * @p: task being dequeued 304 * @deq_flags: %SCX_DEQ_* 305 * 306 * Remove @p from the BPF scheduler. This is usually called to isolate 307 * the task while updating its scheduling properties (e.g. priority). 308 * 309 * The ext core keeps track of whether the BPF side owns a given task or 310 * not and can gracefully ignore spurious dispatches from BPF side, 311 * which makes it safe to not implement this method. However, depending 312 * on the scheduling logic, this can lead to confusing behaviors - e.g. 313 * scheduling position not being updated across a priority change. 314 */ 315 void (*dequeue)(struct task_struct *p, u64 deq_flags); 316 317 /** 318 * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs 319 * @cpu: CPU to dispatch tasks for 320 * @prev: previous task being switched out 321 * 322 * Called when a CPU's local dsq is empty. The operation should dispatch 323 * one or more tasks from the BPF scheduler into the DSQs using 324 * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ 325 * using scx_bpf_dsq_move_to_local(). 326 * 327 * The maximum number of times scx_bpf_dsq_insert() can be called 328 * without an intervening scx_bpf_dsq_move_to_local() is specified by 329 * ops.dispatch_max_batch. See the comments on top of the two functions 330 * for more details. 331 * 332 * When not %NULL, @prev is an SCX task with its slice depleted. If 333 * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in 334 * @prev->scx.flags, it is not enqueued yet and will be enqueued after 335 * ops.dispatch() returns. To keep executing @prev, return without 336 * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. 337 */ 338 void (*dispatch)(s32 cpu, struct task_struct *prev); 339 340 /** 341 * @tick: Periodic tick 342 * @p: task running currently 343 * 344 * This operation is called every 1/HZ seconds on CPUs which are 345 * executing an SCX task. Setting @p->scx.slice to 0 will trigger an 346 * immediate dispatch cycle on the CPU. 347 */ 348 void (*tick)(struct task_struct *p); 349 350 /** 351 * @runnable: A task is becoming runnable on its associated CPU 352 * @p: task becoming runnable 353 * @enq_flags: %SCX_ENQ_* 354 * 355 * This and the following three functions can be used to track a task's 356 * execution state transitions. A task becomes ->runnable() on a CPU, 357 * and then goes through one or more ->running() and ->stopping() pairs 358 * as it runs on the CPU, and eventually becomes ->quiescent() when it's 359 * done running on the CPU. 360 * 361 * @p is becoming runnable on the CPU because it's 362 * 363 * - waking up (%SCX_ENQ_WAKEUP) 364 * - being moved from another CPU 365 * - being restored after temporarily taken off the queue for an 366 * attribute change. 367 * 368 * This and ->enqueue() are related but not coupled. This operation 369 * notifies @p's state transition and may not be followed by ->enqueue() 370 * e.g. when @p is being dispatched to a remote CPU, or when @p is 371 * being enqueued on a CPU experiencing a hotplug event. Likewise, a 372 * task may be ->enqueue()'d without being preceded by this operation 373 * e.g. after exhausting its slice. 
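	 *
	 * As an illustrative sketch of how these notifiers are typically used
	 * together, a scheduler accounting per-task CPU time might stamp
	 * ->running() and accumulate the delta in ->stopping(). task_ctx()
	 * below is a placeholder for whatever per-task storage (e.g. BPF task
	 * local storage) the scheduler maintains:
	 *
	 *	void BPF_STRUCT_OPS(foo_running, struct task_struct *p)
	 *	{
	 *		task_ctx(p)->running_at = bpf_ktime_get_ns();
	 *	}
	 *
	 *	void BPF_STRUCT_OPS(foo_stopping, struct task_struct *p,
	 *			    bool runnable)
	 *	{
	 *		task_ctx(p)->cpu_ns +=
	 *			bpf_ktime_get_ns() - task_ctx(p)->running_at;
	 *	}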
374 */ 375 void (*runnable)(struct task_struct *p, u64 enq_flags); 376 377 /** 378 * @running: A task is starting to run on its associated CPU 379 * @p: task starting to run 380 * 381 * Note that this callback may be called from a CPU other than the 382 * one the task is going to run on. This can happen when a task 383 * property is changed (i.e., affinity), since scx_next_task_scx(), 384 * which triggers this callback, may run on a CPU different from 385 * the task's assigned CPU. 386 * 387 * Therefore, always use scx_bpf_task_cpu(@p) to determine the 388 * target CPU the task is going to use. 389 * 390 * See ->runnable() for explanation on the task state notifiers. 391 */ 392 void (*running)(struct task_struct *p); 393 394 /** 395 * @stopping: A task is stopping execution 396 * @p: task stopping to run 397 * @runnable: is task @p still runnable? 398 * 399 * Note that this callback may be called from a CPU other than the 400 * one the task was running on. This can happen when a task 401 * property is changed (i.e., affinity), since dequeue_task_scx(), 402 * which triggers this callback, may run on a CPU different from 403 * the task's assigned CPU. 404 * 405 * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU 406 * the task was running on. 407 * 408 * See ->runnable() for explanation on the task state notifiers. If 409 * !@runnable, ->quiescent() will be invoked after this operation 410 * returns. 411 */ 412 void (*stopping)(struct task_struct *p, bool runnable); 413 414 /** 415 * @quiescent: A task is becoming not runnable on its associated CPU 416 * @p: task becoming not runnable 417 * @deq_flags: %SCX_DEQ_* 418 * 419 * See ->runnable() for explanation on the task state notifiers. 420 * 421 * @p is becoming quiescent on the CPU because it's 422 * 423 * - sleeping (%SCX_DEQ_SLEEP) 424 * - being moved to another CPU 425 * - being temporarily taken off the queue for an attribute change 426 * (%SCX_DEQ_SAVE) 427 * 428 * This and ->dequeue() are related but not coupled. This operation 429 * notifies @p's state transition and may not be preceded by ->dequeue() 430 * e.g. when @p is being dispatched to a remote CPU. 431 */ 432 void (*quiescent)(struct task_struct *p, u64 deq_flags); 433 434 /** 435 * @yield: Yield CPU 436 * @from: yielding task 437 * @to: optional yield target task 438 * 439 * If @to is NULL, @from is yielding the CPU to other runnable tasks. 440 * The BPF scheduler should ensure that other available tasks are 441 * dispatched before the yielding task. Return value is ignored in this 442 * case. 443 * 444 * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf 445 * scheduler can implement the request, return %true; otherwise, %false. 446 */ 447 bool (*yield)(struct task_struct *from, struct task_struct *to); 448 449 /** 450 * @core_sched_before: Task ordering for core-sched 451 * @a: task A 452 * @b: task B 453 * 454 * Used by core-sched to determine the ordering between two tasks. See 455 * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on 456 * core-sched. 457 * 458 * Both @a and @b are runnable and may or may not currently be queued on 459 * the BPF scheduler. Should return %true if @a should run before @b. 460 * %false if there's no required ordering or @b should run before @a. 461 * 462 * If not specified, the default is ordering them according to when they 463 * became runnable. 
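	 *
	 * For example, a vtime based scheduler might order the two tasks by
	 * their vtimes (sketch only, assuming @a and @b carry comparable
	 * p->scx.dsq_vtime values):
	 *
	 *	bool BPF_STRUCT_OPS(foo_core_sched_before, struct task_struct *a,
	 *			    struct task_struct *b)
	 *	{
	 *		return (s64)(a->scx.dsq_vtime - b->scx.dsq_vtime) < 0;
	 *	}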
464 */ 465 bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); 466 467 /** 468 * @set_weight: Set task weight 469 * @p: task to set weight for 470 * @weight: new weight [1..10000] 471 * 472 * Update @p's weight to @weight. 473 */ 474 void (*set_weight)(struct task_struct *p, u32 weight); 475 476 /** 477 * @set_cpumask: Set CPU affinity 478 * @p: task to set CPU affinity for 479 * @cpumask: cpumask of cpus that @p can run on 480 * 481 * Update @p's CPU affinity to @cpumask. 482 */ 483 void (*set_cpumask)(struct task_struct *p, 484 const struct cpumask *cpumask); 485 486 /** 487 * @update_idle: Update the idle state of a CPU 488 * @cpu: CPU to update the idle state for 489 * @idle: whether entering or exiting the idle state 490 * 491 * This operation is called when @rq's CPU goes or leaves the idle 492 * state. By default, implementing this operation disables the built-in 493 * idle CPU tracking and the following helpers become unavailable: 494 * 495 * - scx_bpf_select_cpu_dfl() 496 * - scx_bpf_select_cpu_and() 497 * - scx_bpf_test_and_clear_cpu_idle() 498 * - scx_bpf_pick_idle_cpu() 499 * 500 * The user also must implement ops.select_cpu() as the default 501 * implementation relies on scx_bpf_select_cpu_dfl(). 502 * 503 * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle 504 * tracking. 505 */ 506 void (*update_idle)(s32 cpu, bool idle); 507 508 /** 509 * @cpu_acquire: A CPU is becoming available to the BPF scheduler 510 * @cpu: The CPU being acquired by the BPF scheduler. 511 * @args: Acquire arguments, see the struct definition. 512 * 513 * A CPU that was previously released from the BPF scheduler is now once 514 * again under its control. 515 */ 516 void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); 517 518 /** 519 * @cpu_release: A CPU is taken away from the BPF scheduler 520 * @cpu: The CPU being released by the BPF scheduler. 521 * @args: Release arguments, see the struct definition. 522 * 523 * The specified CPU is no longer under the control of the BPF 524 * scheduler. This could be because it was preempted by a higher 525 * priority sched_class, though there may be other reasons as well. The 526 * caller should consult @args->reason to determine the cause. 527 */ 528 void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); 529 530 /** 531 * @init_task: Initialize a task to run in a BPF scheduler 532 * @p: task to initialize for BPF scheduling 533 * @args: init arguments, see the struct definition 534 * 535 * Either we're loading a BPF scheduler or a new task is being forked. 536 * Initialize @p for BPF scheduling. This operation may block and can 537 * be used for allocations, and is called exactly once for a task. 538 * 539 * Return 0 for success, -errno for failure. An error return while 540 * loading will abort loading of the BPF scheduler. During a fork, it 541 * will abort that specific fork. 542 */ 543 s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); 544 545 /** 546 * @exit_task: Exit a previously-running task from the system 547 * @p: task to exit 548 * @args: exit arguments, see the struct definition 549 * 550 * @p is exiting or the BPF scheduler is being unloaded. Perform any 551 * necessary cleanup for @p. 552 */ 553 void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); 554 555 /** 556 * @enable: Enable BPF scheduling for a task 557 * @p: task to enable BPF scheduling for 558 * 559 * Enable @p for BPF scheduling. 
enable() is called on @p any time it 560 * enters SCX, and is always paired with a matching disable(). 561 */ 562 void (*enable)(struct task_struct *p); 563 564 /** 565 * @disable: Disable BPF scheduling for a task 566 * @p: task to disable BPF scheduling for 567 * 568 * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. 569 * Disable BPF scheduling for @p. A disable() call is always matched 570 * with a prior enable() call. 571 */ 572 void (*disable)(struct task_struct *p); 573 574 /** 575 * @dump: Dump BPF scheduler state on error 576 * @ctx: debug dump context 577 * 578 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. 579 */ 580 void (*dump)(struct scx_dump_ctx *ctx); 581 582 /** 583 * @dump_cpu: Dump BPF scheduler state for a CPU on error 584 * @ctx: debug dump context 585 * @cpu: CPU to generate debug dump for 586 * @idle: @cpu is currently idle without any runnable tasks 587 * 588 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for 589 * @cpu. If @idle is %true and this operation doesn't produce any 590 * output, @cpu is skipped for dump. 591 */ 592 void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); 593 594 /** 595 * @dump_task: Dump BPF scheduler state for a runnable task on error 596 * @ctx: debug dump context 597 * @p: runnable task to generate debug dump for 598 * 599 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for 600 * @p. 601 */ 602 void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); 603 604 #ifdef CONFIG_EXT_GROUP_SCHED 605 /** 606 * @cgroup_init: Initialize a cgroup 607 * @cgrp: cgroup being initialized 608 * @args: init arguments, see the struct definition 609 * 610 * Either the BPF scheduler is being loaded or @cgrp created, initialize 611 * @cgrp for sched_ext. This operation may block. 612 * 613 * Return 0 for success, -errno for failure. An error return while 614 * loading will abort loading of the BPF scheduler. During cgroup 615 * creation, it will abort the specific cgroup creation. 616 */ 617 s32 (*cgroup_init)(struct cgroup *cgrp, 618 struct scx_cgroup_init_args *args); 619 620 /** 621 * @cgroup_exit: Exit a cgroup 622 * @cgrp: cgroup being exited 623 * 624 * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit 625 * @cgrp for sched_ext. This operation may block. 626 */ 627 void (*cgroup_exit)(struct cgroup *cgrp); 628 629 /** 630 * @cgroup_prep_move: Prepare a task to be moved to a different cgroup 631 * @p: task being moved 632 * @from: cgroup @p is being moved from 633 * @to: cgroup @p is being moved to 634 * 635 * Prepare @p for move from cgroup @from to @to. This operation may 636 * block and can be used for allocations. 637 * 638 * Return 0 for success, -errno for failure. An error return aborts the 639 * migration. 640 */ 641 s32 (*cgroup_prep_move)(struct task_struct *p, 642 struct cgroup *from, struct cgroup *to); 643 644 /** 645 * @cgroup_move: Commit cgroup move 646 * @p: task being moved 647 * @from: cgroup @p is being moved from 648 * @to: cgroup @p is being moved to 649 * 650 * Commit the move. @p is dequeued during this operation. 651 */ 652 void (*cgroup_move)(struct task_struct *p, 653 struct cgroup *from, struct cgroup *to); 654 655 /** 656 * @cgroup_cancel_move: Cancel cgroup move 657 * @p: task whose cgroup move is being canceled 658 * @from: cgroup @p was being moved from 659 * @to: cgroup @p was being moved to 660 * 661 * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). 662 * Undo the preparation.
663 */ 664 void (*cgroup_cancel_move)(struct task_struct *p, 665 struct cgroup *from, struct cgroup *to); 666 667 /** 668 * @cgroup_set_weight: A cgroup's weight is being changed 669 * @cgrp: cgroup whose weight is being updated 670 * @weight: new weight [1..10000] 671 * 672 * Update @cgrp's weight to @weight. 673 */ 674 void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); 675 676 /** 677 * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed 678 * @cgrp: cgroup whose bandwidth is being updated 679 * @period_us: bandwidth control period 680 * @quota_us: bandwidth control quota 681 * @burst_us: bandwidth control burst 682 * 683 * Update @cgrp's bandwidth control parameters. This is from the cpu.max 684 * cgroup interface. 685 * 686 * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled 687 * to. For example, if @period_us is 1_000_000 and @quota_us is 688 * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be 689 * interpreted in the same fashion and specifies how much @cgrp can 690 * burst temporarily. The specific control mechanism and thus the 691 * interpretation of @period_us and burstiness is up to the BPF 692 * scheduler. 693 */ 694 void (*cgroup_set_bandwidth)(struct cgroup *cgrp, 695 u64 period_us, u64 quota_us, u64 burst_us); 696 697 #endif /* CONFIG_EXT_GROUP_SCHED */ 698 699 /* 700 * All online ops must come before ops.cpu_online(). 701 */ 702 703 /** 704 * @cpu_online: A CPU became online 705 * @cpu: CPU which just came up 706 * 707 * @cpu just came online. @cpu will not call ops.enqueue() or 708 * ops.dispatch(), nor run tasks associated with other CPUs beforehand. 709 */ 710 void (*cpu_online)(s32 cpu); 711 712 /** 713 * @cpu_offline: A CPU is going offline 714 * @cpu: CPU which is going offline 715 * 716 * @cpu is going offline. @cpu will not call ops.enqueue() or 717 * ops.dispatch(), nor run tasks associated with other CPUs afterwards. 718 */ 719 void (*cpu_offline)(s32 cpu); 720 721 /* 722 * All CPU hotplug ops must come before ops.init(). 723 */ 724 725 /** 726 * @init: Initialize the BPF scheduler 727 */ 728 s32 (*init)(void); 729 730 /** 731 * @exit: Clean up after the BPF scheduler 732 * @info: Exit info 733 * 734 * ops.exit() is also called on ops.init() failure, which is a bit 735 * unusual. This is to allow rich reporting through @info on how 736 * ops.init() failed. 737 */ 738 void (*exit)(struct scx_exit_info *info); 739 740 /** 741 * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch 742 */ 743 u32 dispatch_max_batch; 744 745 /** 746 * @flags: %SCX_OPS_* flags 747 */ 748 u64 flags; 749 750 /** 751 * @timeout_ms: The maximum amount of time, in milliseconds, that a 752 * runnable task should be able to wait before being scheduled. The 753 * maximum timeout may not exceed the default timeout of 30 seconds. 754 * 755 * Defaults to the maximum allowed timeout value of 30 seconds. 756 */ 757 u32 timeout_ms; 758 759 /** 760 * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default 761 * value of 32768 is used. 762 */ 763 u32 exit_dump_len; 764 765 /** 766 * @hotplug_seq: A sequence number that may be set by the scheduler to 767 * detect when a hotplug event has occurred during the loading process. 768 * If 0, no detection occurs. Otherwise, the scheduler will fail to 769 * load if the sequence number does not match @scx_hotplug_seq on the 770 * enable path.
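	 *
	 * For illustration, a user space loader would typically snapshot the
	 * current value published through the sched_ext sysfs interface into
	 * this field just before attaching the struct_ops map, e.g.
	 *
	 *	skel->struct_ops.my_ops->hotplug_seq = read_hotplug_seq();
	 *
	 * where my_ops and read_hotplug_seq() are placeholders for the
	 * scheduler's own names. A hotplug event between the snapshot and the
	 * attach then fails the load with an exit code carrying
	 * %SCX_ECODE_ACT_RESTART and %SCX_ECODE_RSN_HOTPLUG so the loader can
	 * retry with fresh topology information.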
771 */ 772 u64 hotplug_seq; 773 774 /** 775 * @name: BPF scheduler's name 776 * 777 * Must be a non-zero valid BPF object name including only isalnum(), 778 * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the 779 * BPF scheduler is enabled. 780 */ 781 char name[SCX_OPS_NAME_LEN]; 782 783 /* internal use only, must be NULL */ 784 void *priv; 785 }; 786 787 enum scx_opi { 788 SCX_OPI_BEGIN = 0, 789 SCX_OPI_NORMAL_BEGIN = 0, 790 SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), 791 SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), 792 SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), 793 SCX_OPI_END = SCX_OP_IDX(init), 794 }; 795 796 /* 797 * Collection of event counters. Event types are placed in descending order. 798 */ 799 struct scx_event_stats { 800 /* 801 * If ops.select_cpu() returns a CPU which can't be used by the task, 802 * the core scheduler code silently picks a fallback CPU. 803 */ 804 s64 SCX_EV_SELECT_CPU_FALLBACK; 805 806 /* 807 * When dispatching to a local DSQ, the CPU may have gone offline in 808 * the meantime. In this case, the task is bounced to the global DSQ. 809 */ 810 s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; 811 812 /* 813 * If SCX_OPS_ENQ_LAST is not set, the number of times that a task 814 * continued to run because there were no other tasks on the CPU. 815 */ 816 s64 SCX_EV_DISPATCH_KEEP_LAST; 817 818 /* 819 * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task 820 * is dispatched to a local DSQ when exiting. 821 */ 822 s64 SCX_EV_ENQ_SKIP_EXITING; 823 824 /* 825 * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a 826 * migration disabled task skips ops.enqueue() and is dispatched to its 827 * local DSQ. 828 */ 829 s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; 830 831 /* 832 * Total number of times a task's time slice was refilled with the 833 * default value (SCX_SLICE_DFL). 834 */ 835 s64 SCX_EV_REFILL_SLICE_DFL; 836 837 /* 838 * The total duration of bypass modes in nanoseconds. 839 */ 840 s64 SCX_EV_BYPASS_DURATION; 841 842 /* 843 * The number of tasks dispatched in the bypassing mode. 844 */ 845 s64 SCX_EV_BYPASS_DISPATCH; 846 847 /* 848 * The number of times the bypassing mode has been activated. 849 */ 850 s64 SCX_EV_BYPASS_ACTIVATE; 851 }; 852 853 struct scx_sched { 854 struct sched_ext_ops ops; 855 DECLARE_BITMAP(has_op, SCX_OPI_END); 856 857 /* 858 * Dispatch queues. 859 * 860 * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. 861 * This is to avoid live-locking in bypass mode where all tasks are 862 * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If 863 * per-node split isn't sufficient, it can be further split. 864 */ 865 struct rhashtable dsq_hash; 866 struct scx_dispatch_q **global_dsqs; 867 868 /* 869 * The event counters are in a per-CPU variable to minimize the 870 * accounting overhead. A system-wide view on the event counter is 871 * constructed when requested by scx_bpf_events(). 
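	 *
	 * For illustration, a BPF scheduler can snapshot the aggregated
	 * counters roughly as follows (sketch only):
	 *
	 *	struct scx_event_stats ev;
	 *
	 *	scx_bpf_events(&ev, sizeof(ev));
	 *	bpf_printk("select_cpu fallbacks: %lld",
	 *		   ev.SCX_EV_SELECT_CPU_FALLBACK);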
872 */ 873 struct scx_event_stats __percpu *event_stats_cpu; 874 875 bool warned_zero_slice; 876 877 atomic_t exit_kind; 878 struct scx_exit_info *exit_info; 879 880 struct kobject kobj; 881 882 struct kthread_worker *helper; 883 struct irq_work error_irq_work; 884 struct kthread_work disable_work; 885 struct rcu_work rcu_work; 886 }; 887 888 enum scx_wake_flags { 889 /* expose select WF_* flags as enums */ 890 SCX_WAKE_FORK = WF_FORK, 891 SCX_WAKE_TTWU = WF_TTWU, 892 SCX_WAKE_SYNC = WF_SYNC, 893 }; 894 895 enum scx_enq_flags { 896 /* expose select ENQUEUE_* flags as enums */ 897 SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, 898 SCX_ENQ_HEAD = ENQUEUE_HEAD, 899 SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, 900 901 /* high 32bits are SCX specific */ 902 903 /* 904 * Set the following to trigger preemption when calling 905 * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the 906 * current task is cleared to zero and the CPU is kicked into the 907 * scheduling path. Implies %SCX_ENQ_HEAD. 908 */ 909 SCX_ENQ_PREEMPT = 1LLU << 32, 910 911 /* 912 * The task being enqueued was previously enqueued on the current CPU's 913 * %SCX_DSQ_LOCAL, but was removed from it in a call to the 914 * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was 915 * invoked in a ->cpu_release() callback, and the task is again 916 * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the 917 * task will not be scheduled on the CPU until at least the next invocation 918 * of the ->cpu_acquire() callback. 919 */ 920 SCX_ENQ_REENQ = 1LLU << 40, 921 922 /* 923 * The task being enqueued is the only task available for the cpu. By 924 * default, ext core keeps executing such tasks but when 925 * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the 926 * %SCX_ENQ_LAST flag set. 927 * 928 * The BPF scheduler is responsible for triggering a follow-up 929 * scheduling event. Otherwise, execution may stall. 930 */ 931 SCX_ENQ_LAST = 1LLU << 41, 932 933 /* high 8 bits are internal */ 934 __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, 935 936 SCX_ENQ_CLEAR_OPSS = 1LLU << 56, 937 SCX_ENQ_DSQ_PRIQ = 1LLU << 57, 938 }; 939 940 enum scx_deq_flags { 941 /* expose select DEQUEUE_* flags as enums */ 942 SCX_DEQ_SLEEP = DEQUEUE_SLEEP, 943 944 /* high 32bits are SCX specific */ 945 946 /* 947 * The generic core-sched layer decided to execute the task even though 948 * it hasn't been dispatched yet. Dequeue from the BPF side. 949 */ 950 SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, 951 }; 952 953 enum scx_pick_idle_cpu_flags { 954 SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ 955 SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ 956 }; 957 958 enum scx_kick_flags { 959 /* 960 * Kick the target CPU if idle. Guarantees that the target CPU goes 961 * through at least one full scheduling cycle before going idle. If the 962 * target CPU can be determined to be currently not idle and going to go 963 * through a scheduling cycle before going idle, noop. 964 */ 965 SCX_KICK_IDLE = 1LLU << 0, 966 967 /* 968 * Preempt the current task and execute the dispatch path. If the 969 * current task of the target CPU is an SCX task, its ->scx.slice is 970 * cleared to zero before the scheduling path is invoked so that the 971 * task expires and the dispatch path is invoked. 972 */ 973 SCX_KICK_PREEMPT = 1LLU << 1, 974 975 /* 976 * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will 977 * return after the target CPU finishes picking the next task.
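	 *
	 * Regardless of the flag used, a kick is commonly paired with a remote
	 * insertion so that an idle target re-enters the scheduling path and
	 * notices the new work. An illustrative sketch from an ops.enqueue()
	 * implementation, with @cpu chosen by scheduler-specific logic:
	 *
	 *	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL,
	 *			   enq_flags);
	 *	scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);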
978 */ 979 SCX_KICK_WAIT = 1LLU << 2, 980 }; 981 982 enum scx_tg_flags { 983 SCX_TG_ONLINE = 1U << 0, 984 SCX_TG_INITED = 1U << 1, 985 }; 986 987 enum scx_enable_state { 988 SCX_ENABLING, 989 SCX_ENABLED, 990 SCX_DISABLING, 991 SCX_DISABLED, 992 }; 993 994 static const char *scx_enable_state_str[] = { 995 [SCX_ENABLING] = "enabling", 996 [SCX_ENABLED] = "enabled", 997 [SCX_DISABLING] = "disabling", 998 [SCX_DISABLED] = "disabled", 999 }; 1000 1001 /* 1002 * sched_ext_entity->ops_state 1003 * 1004 * Used to track the task ownership between the SCX core and the BPF scheduler. 1005 * State transitions look as follows: 1006 * 1007 * NONE -> QUEUEING -> QUEUED -> DISPATCHING 1008 * ^ | | 1009 * | v v 1010 * \-------------------------------/ 1011 * 1012 * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call 1013 * sites for explanations on the conditions being waited upon and why they are 1014 * safe. Transitions out of them into NONE or QUEUED must store_release and the 1015 * waiters should load_acquire. 1016 * 1017 * Tracking scx_ops_state enables sched_ext core to reliably determine whether 1018 * any given task can be dispatched by the BPF scheduler at all times and thus 1019 * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler 1020 * to try to dispatch any task anytime regardless of its state as the SCX core 1021 * can safely reject invalid dispatches. 1022 */ 1023 enum scx_ops_state { 1024 SCX_OPSS_NONE, /* owned by the SCX core */ 1025 SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ 1026 SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ 1027 SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ 1028 1029 /* 1030 * QSEQ brands each QUEUED instance so that, when dispatch races 1031 * dequeue/requeue, the dispatcher can tell whether it still has a claim 1032 * on the task being dispatched. 1033 * 1034 * As some 32bit archs can't do 64bit store_release/load_acquire, 1035 * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on 1036 * 32bit machines. The dispatch race window QSEQ protects is very narrow 1037 * and runs with IRQ disabled. 30 bits should be sufficient. 1038 */ 1039 SCX_OPSS_QSEQ_SHIFT = 2, 1040 }; 1041 1042 /* Use macros to ensure that the type is unsigned long for the masks */ 1043 #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) 1044 #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) 1045 1046 /* 1047 * NOTE: sched_ext is in the process of growing multiple scheduler support and 1048 * scx_root usage is in a transitional state. Naked dereferences are safe if the 1049 * caller is one of the tasks attached to SCX and explicit RCU dereference is 1050 * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but 1051 * are used as temporary markers to indicate that the dereferences need to be 1052 * updated to point to the associated scheduler instances rather than scx_root. 1053 */ 1054 static struct scx_sched __rcu *scx_root; 1055 1056 /* 1057 * During exit, a task may schedule after losing its PIDs. When disabling the 1058 * BPF scheduler, we need to be able to iterate tasks in every state to 1059 * guarantee system safety. Maintain a dedicated task list which contains every 1060 * task between its fork and eventual free. 
1061 */ 1062 static DEFINE_SPINLOCK(scx_tasks_lock); 1063 static LIST_HEAD(scx_tasks); 1064 1065 /* ops enable/disable */ 1066 static DEFINE_MUTEX(scx_enable_mutex); 1067 DEFINE_STATIC_KEY_FALSE(__scx_enabled); 1068 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 1069 static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); 1070 static unsigned long scx_in_softlockup; 1071 static atomic_t scx_breather_depth = ATOMIC_INIT(0); 1072 static int scx_bypass_depth; 1073 static bool scx_init_task_enabled; 1074 static bool scx_switching_all; 1075 DEFINE_STATIC_KEY_FALSE(__scx_switched_all); 1076 1077 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); 1078 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); 1079 1080 /* 1081 * A monotonically increasing sequence number that is incremented every time a 1082 * scheduler is enabled. This can be used to check if any custom sched_ext 1083 * scheduler has ever been used in the system. 1084 */ 1085 static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); 1086 1087 /* 1088 * The maximum amount of time in jiffies that a task may be runnable without 1089 * being scheduled on a CPU. If this timeout is exceeded, it will trigger 1090 * scx_error(). 1091 */ 1092 static unsigned long scx_watchdog_timeout; 1093 1094 /* 1095 * The last time the delayed work was run. This delayed work relies on 1096 * ksoftirqd being able to run to service timer interrupts, so it's possible 1097 * that this work itself could get wedged. To account for this, we check that 1098 * it's not stalled in the timer tick, and trigger an error if it is. 1099 */ 1100 static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; 1101 1102 static struct delayed_work scx_watchdog_work; 1103 1104 /* for %SCX_KICK_WAIT */ 1105 static unsigned long __percpu *scx_kick_cpus_pnt_seqs; 1106 1107 /* 1108 * Direct dispatch marker. 1109 * 1110 * Non-NULL values are used for direct dispatch from the enqueue path. A valid 1111 * pointer points to the task currently being enqueued. An ERR_PTR value is used 1112 * to indicate that direct dispatch has already happened.
1113 */ 1114 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); 1115 1116 static const struct rhashtable_params dsq_hash_params = { 1117 .key_len = sizeof_field(struct scx_dispatch_q, id), 1118 .key_offset = offsetof(struct scx_dispatch_q, id), 1119 .head_offset = offsetof(struct scx_dispatch_q, hash_node), 1120 }; 1121 1122 static LLIST_HEAD(dsqs_to_free); 1123 1124 /* dispatch buf */ 1125 struct scx_dsp_buf_ent { 1126 struct task_struct *task; 1127 unsigned long qseq; 1128 u64 dsq_id; 1129 u64 enq_flags; 1130 }; 1131 1132 static u32 scx_dsp_max_batch; 1133 1134 struct scx_dsp_ctx { 1135 struct rq *rq; 1136 u32 cursor; 1137 u32 nr_tasks; 1138 struct scx_dsp_buf_ent buf[]; 1139 }; 1140 1141 static struct scx_dsp_ctx __percpu *scx_dsp_ctx; 1142 1143 /* string formatting from BPF */ 1144 struct scx_bstr_buf { 1145 u64 data[MAX_BPRINTF_VARARGS]; 1146 char line[SCX_EXIT_MSG_LEN]; 1147 }; 1148 1149 static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); 1150 static struct scx_bstr_buf scx_exit_bstr_buf; 1151 1152 /* ops debug dump */ 1153 struct scx_dump_data { 1154 s32 cpu; 1155 bool first; 1156 s32 cursor; 1157 struct seq_buf *s; 1158 const char *prefix; 1159 struct scx_bstr_buf buf; 1160 }; 1161 1162 static struct scx_dump_data scx_dump_data = { 1163 .cpu = -1, 1164 }; 1165 1166 /* /sys/kernel/sched_ext interface */ 1167 static struct kset *scx_kset; 1168 1169 #define CREATE_TRACE_POINTS 1170 #include <trace/events/sched_ext.h> 1171 1172 static void process_ddsp_deferred_locals(struct rq *rq); 1173 static void scx_bpf_kick_cpu(s32 cpu, u64 flags); 1174 static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, 1175 s64 exit_code, const char *fmt, va_list args); 1176 1177 static __printf(4, 5) void scx_exit(struct scx_sched *sch, 1178 enum scx_exit_kind kind, s64 exit_code, 1179 const char *fmt, ...) 1180 { 1181 va_list args; 1182 1183 va_start(args, fmt); 1184 scx_vexit(sch, kind, exit_code, fmt, args); 1185 va_end(args); 1186 } 1187 1188 static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code, 1189 const char *fmt, ...) 1190 { 1191 struct scx_sched *sch; 1192 va_list args; 1193 1194 rcu_read_lock(); 1195 sch = rcu_dereference(scx_root); 1196 if (sch) { 1197 va_start(args, fmt); 1198 scx_vexit(sch, kind, exit_code, fmt, args); 1199 va_end(args); 1200 } 1201 rcu_read_unlock(); 1202 } 1203 1204 #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) 1205 #define scx_kf_error(fmt, args...) 
scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args) 1206 1207 #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) 1208 1209 static long jiffies_delta_msecs(unsigned long at, unsigned long now) 1210 { 1211 if (time_after(at, now)) 1212 return jiffies_to_msecs(at - now); 1213 else 1214 return -(long)jiffies_to_msecs(now - at); 1215 } 1216 1217 /* if the highest set bit is N, return a mask with bits [N+1, 31] set */ 1218 static u32 higher_bits(u32 flags) 1219 { 1220 return ~((1 << fls(flags)) - 1); 1221 } 1222 1223 /* return the mask with only the highest bit set */ 1224 static u32 highest_bit(u32 flags) 1225 { 1226 int bit = fls(flags); 1227 return ((u64)1 << bit) >> 1; 1228 } 1229 1230 static bool u32_before(u32 a, u32 b) 1231 { 1232 return (s32)(a - b) < 0; 1233 } 1234 1235 static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) 1236 { 1237 struct scx_sched *sch = scx_root; 1238 1239 return sch->global_dsqs[cpu_to_node(task_cpu(p))]; 1240 } 1241 1242 static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) 1243 { 1244 return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params); 1245 } 1246 1247 /* 1248 * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX 1249 * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate 1250 * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check 1251 * whether it's running from an allowed context. 1252 * 1253 * @mask is constant, always inline to cull the mask calculations. 1254 */ 1255 static __always_inline void scx_kf_allow(u32 mask) 1256 { 1257 /* nesting is allowed only in increasing scx_kf_mask order */ 1258 WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, 1259 "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", 1260 current->scx.kf_mask, mask); 1261 current->scx.kf_mask |= mask; 1262 barrier(); 1263 } 1264 1265 static void scx_kf_disallow(u32 mask) 1266 { 1267 barrier(); 1268 current->scx.kf_mask &= ~mask; 1269 } 1270 1271 /* 1272 * Track the rq currently locked. 1273 * 1274 * This allows kfuncs to safely operate on rq from any scx ops callback, 1275 * knowing which rq is already locked. 1276 */ 1277 DEFINE_PER_CPU(struct rq *, scx_locked_rq_state); 1278 1279 static inline void update_locked_rq(struct rq *rq) 1280 { 1281 /* 1282 * Check whether @rq is actually locked. This can help expose bugs 1283 * or incorrect assumptions about the context in which a kfunc or 1284 * callback is executed. 1285 */ 1286 if (rq) 1287 lockdep_assert_rq_held(rq); 1288 __this_cpu_write(scx_locked_rq_state, rq); 1289 } 1290 1291 #define SCX_CALL_OP(sch, mask, op, rq, args...) \ 1292 do { \ 1293 if (rq) \ 1294 update_locked_rq(rq); \ 1295 if (mask) { \ 1296 scx_kf_allow(mask); \ 1297 (sch)->ops.op(args); \ 1298 scx_kf_disallow(mask); \ 1299 } else { \ 1300 (sch)->ops.op(args); \ 1301 } \ 1302 if (rq) \ 1303 update_locked_rq(NULL); \ 1304 } while (0) 1305 1306 #define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \ 1307 ({ \ 1308 __typeof__((sch)->ops.op(args)) __ret; \ 1309 \ 1310 if (rq) \ 1311 update_locked_rq(rq); \ 1312 if (mask) { \ 1313 scx_kf_allow(mask); \ 1314 __ret = (sch)->ops.op(args); \ 1315 scx_kf_disallow(mask); \ 1316 } else { \ 1317 __ret = (sch)->ops.op(args); \ 1318 } \ 1319 if (rq) \ 1320 update_locked_rq(NULL); \ 1321 __ret; \ 1322 }) 1323 1324 /* 1325 * Some kfuncs are allowed only on the tasks that are subjects of the 1326 * in-progress scx_ops operation for, e.g., locking guarantees. 
To enforce such 1327 * restrictions, the following SCX_CALL_OP_*() variants should be used when 1328 * invoking scx_ops operations that take task arguments. These can only be used 1329 * for non-nesting operations due to the way the tasks are tracked. 1330 * 1331 * kfuncs which can only operate on such tasks can in turn use 1332 * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on 1333 * the specific task. 1334 */ 1335 #define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \ 1336 do { \ 1337 BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ 1338 current->scx.kf_tasks[0] = task; \ 1339 SCX_CALL_OP((sch), mask, op, rq, task, ##args); \ 1340 current->scx.kf_tasks[0] = NULL; \ 1341 } while (0) 1342 1343 #define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) \ 1344 ({ \ 1345 __typeof__((sch)->ops.op(task, ##args)) __ret; \ 1346 BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ 1347 current->scx.kf_tasks[0] = task; \ 1348 __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \ 1349 current->scx.kf_tasks[0] = NULL; \ 1350 __ret; \ 1351 }) 1352 1353 #define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \ 1354 ({ \ 1355 __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ 1356 BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ 1357 current->scx.kf_tasks[0] = task0; \ 1358 current->scx.kf_tasks[1] = task1; \ 1359 __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \ 1360 current->scx.kf_tasks[0] = NULL; \ 1361 current->scx.kf_tasks[1] = NULL; \ 1362 __ret; \ 1363 }) 1364 1365 /* @mask is constant, always inline to cull unnecessary branches */ 1366 static __always_inline bool scx_kf_allowed(u32 mask) 1367 { 1368 if (unlikely(!(current->scx.kf_mask & mask))) { 1369 scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", 1370 mask, current->scx.kf_mask); 1371 return false; 1372 } 1373 1374 /* 1375 * Enforce nesting boundaries. e.g. A kfunc which can be called from 1376 * DISPATCH must not be called if we're running DEQUEUE which is nested 1377 * inside ops.dispatch(). We don't need to check boundaries for any 1378 * blocking kfuncs as the verifier ensures they're only called from 1379 * sleepable progs. 1380 */ 1381 if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && 1382 (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { 1383 scx_kf_error("cpu_release kfunc called from a nested operation"); 1384 return false; 1385 } 1386 1387 if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && 1388 (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { 1389 scx_kf_error("dispatch kfunc called from a nested operation"); 1390 return false; 1391 } 1392 1393 return true; 1394 } 1395 1396 /* see SCX_CALL_OP_TASK() */ 1397 static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, 1398 struct task_struct *p) 1399 { 1400 if (!scx_kf_allowed(mask)) 1401 return false; 1402 1403 if (unlikely((p != current->scx.kf_tasks[0] && 1404 p != current->scx.kf_tasks[1]))) { 1405 scx_kf_error("called on a task not being operated on"); 1406 return false; 1407 } 1408 1409 return true; 1410 } 1411 1412 /** 1413 * nldsq_next_task - Iterate to the next task in a non-local DSQ 1414 * @dsq: user dsq being iterated 1415 * @cur: current position, %NULL to start iteration 1416 * @rev: walk backwards 1417 * 1418 * Returns %NULL when iteration is finished. 
1419 */ 1420 static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, 1421 struct task_struct *cur, bool rev) 1422 { 1423 struct list_head *list_node; 1424 struct scx_dsq_list_node *dsq_lnode; 1425 1426 lockdep_assert_held(&dsq->lock); 1427 1428 if (cur) 1429 list_node = &cur->scx.dsq_list.node; 1430 else 1431 list_node = &dsq->list; 1432 1433 /* find the next task, need to skip BPF iteration cursors */ 1434 do { 1435 if (rev) 1436 list_node = list_node->prev; 1437 else 1438 list_node = list_node->next; 1439 1440 if (list_node == &dsq->list) 1441 return NULL; 1442 1443 dsq_lnode = container_of(list_node, struct scx_dsq_list_node, 1444 node); 1445 } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); 1446 1447 return container_of(dsq_lnode, struct task_struct, scx.dsq_list); 1448 } 1449 1450 #define nldsq_for_each_task(p, dsq) \ 1451 for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ 1452 (p) = nldsq_next_task((dsq), (p), false)) 1453 1454 1455 /* 1456 * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] 1457 * dispatch order. BPF-visible iterator is opaque and larger to allow future 1458 * changes without breaking backward compatibility. Can be used with 1459 * bpf_for_each(). See bpf_iter_scx_dsq_*(). 1460 */ 1461 enum scx_dsq_iter_flags { 1462 /* iterate in the reverse dispatch order */ 1463 SCX_DSQ_ITER_REV = 1U << 16, 1464 1465 __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, 1466 __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, 1467 1468 __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, 1469 __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | 1470 __SCX_DSQ_ITER_HAS_SLICE | 1471 __SCX_DSQ_ITER_HAS_VTIME, 1472 }; 1473 1474 struct bpf_iter_scx_dsq_kern { 1475 struct scx_dsq_list_node cursor; 1476 struct scx_dispatch_q *dsq; 1477 u64 slice; 1478 u64 vtime; 1479 } __attribute__((aligned(8))); 1480 1481 struct bpf_iter_scx_dsq { 1482 u64 __opaque[6]; 1483 } __attribute__((aligned(8))); 1484 1485 1486 /* 1487 * SCX task iterator. 1488 */ 1489 struct scx_task_iter { 1490 struct sched_ext_entity cursor; 1491 struct task_struct *locked; 1492 struct rq *rq; 1493 struct rq_flags rf; 1494 u32 cnt; 1495 }; 1496 1497 /** 1498 * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration 1499 * @iter: iterator to init 1500 * 1501 * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter 1502 * must eventually be stopped with scx_task_iter_stop(). 1503 * 1504 * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() 1505 * between this and the first next() call or between any two next() calls. If 1506 * the locks are released between two next() calls, the caller is responsible 1507 * for ensuring that the task being iterated remains accessible either through 1508 * RCU read lock or obtaining a reference count. 1509 * 1510 * All tasks which existed when the iteration started are guaranteed to be 1511 * visited as long as they still exist. 
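 *
 * The expected usage pattern, as used by the enable and disable paths in this
 * file, looks roughly like:
 *
 *	scx_task_iter_start(&iter);
 *	while ((p = scx_task_iter_next_locked(&iter))) {
 *		... operate on @p with its rq locked ...
 *	}
 *	scx_task_iter_stop(&iter);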
1512 */ 1513 static void scx_task_iter_start(struct scx_task_iter *iter) 1514 { 1515 BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & 1516 ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); 1517 1518 spin_lock_irq(&scx_tasks_lock); 1519 1520 iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; 1521 list_add(&iter->cursor.tasks_node, &scx_tasks); 1522 iter->locked = NULL; 1523 iter->cnt = 0; 1524 } 1525 1526 static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) 1527 { 1528 if (iter->locked) { 1529 task_rq_unlock(iter->rq, iter->locked, &iter->rf); 1530 iter->locked = NULL; 1531 } 1532 } 1533 1534 /** 1535 * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator 1536 * @iter: iterator to unlock 1537 * 1538 * If @iter is in the middle of a locked iteration, it may be locking the rq of 1539 * the task currently being visited in addition to scx_tasks_lock. Unlock both. 1540 * This function can be safely called anytime during an iteration. 1541 */ 1542 static void scx_task_iter_unlock(struct scx_task_iter *iter) 1543 { 1544 __scx_task_iter_rq_unlock(iter); 1545 spin_unlock_irq(&scx_tasks_lock); 1546 } 1547 1548 /** 1549 * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() 1550 * @iter: iterator to re-lock 1551 * 1552 * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it 1553 * doesn't re-lock the rq lock. Must be called before other iterator operations. 1554 */ 1555 static void scx_task_iter_relock(struct scx_task_iter *iter) 1556 { 1557 spin_lock_irq(&scx_tasks_lock); 1558 } 1559 1560 /** 1561 * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock 1562 * @iter: iterator to exit 1563 * 1564 * Exit a previously initialized @iter. Must be called with scx_tasks_lock held 1565 * which is released on return. If the iterator holds a task's rq lock, that rq 1566 * lock is also released. See scx_task_iter_start() for details. 1567 */ 1568 static void scx_task_iter_stop(struct scx_task_iter *iter) 1569 { 1570 list_del_init(&iter->cursor.tasks_node); 1571 scx_task_iter_unlock(iter); 1572 } 1573 1574 /** 1575 * scx_task_iter_next - Next task 1576 * @iter: iterator to walk 1577 * 1578 * Visit the next task. See scx_task_iter_start() for details. Locks are dropped 1579 * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls 1580 * by holding scx_tasks_lock for too long. 1581 */ 1582 static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) 1583 { 1584 struct list_head *cursor = &iter->cursor.tasks_node; 1585 struct sched_ext_entity *pos; 1586 1587 if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { 1588 scx_task_iter_unlock(iter); 1589 cond_resched(); 1590 scx_task_iter_relock(iter); 1591 } 1592 1593 list_for_each_entry(pos, cursor, tasks_node) { 1594 if (&pos->tasks_node == &scx_tasks) 1595 return NULL; 1596 if (!(pos->flags & SCX_TASK_CURSOR)) { 1597 list_move(cursor, &pos->tasks_node); 1598 return container_of(pos, struct task_struct, scx); 1599 } 1600 } 1601 1602 /* can't happen, should always terminate at scx_tasks above */ 1603 BUG(); 1604 } 1605 1606 /** 1607 * scx_task_iter_next_locked - Next non-idle task with its rq locked 1608 * @iter: iterator to walk 1609 * 1610 * Visit the non-idle task with its rq lock held. Allows callers to specify 1611 * whether they would like to filter out dead tasks. See scx_task_iter_start() 1612 * for details. 
1613 */ 1614 static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) 1615 { 1616 struct task_struct *p; 1617 1618 __scx_task_iter_rq_unlock(iter); 1619 1620 while ((p = scx_task_iter_next(iter))) { 1621 /* 1622 * scx_task_iter is used to prepare and move tasks into SCX 1623 * while loading the BPF scheduler and vice-versa while 1624 * unloading. The init_tasks ("swappers") should be excluded 1625 * from the iteration because: 1626 * 1627 * - It's unsafe to use __setscheduler_prio() on an init_task to 1628 * determine the sched_class to use as it won't preserve its 1629 * idle_sched_class. 1630 * 1631 * - ops.init/exit_task() can easily be confused if called with 1632 * init_tasks as they, e.g., share PID 0. 1633 * 1634 * As init_tasks are never scheduled through SCX, they can be 1635 * skipped safely. Note that is_idle_task() which tests %PF_IDLE 1636 * doesn't work here: 1637 * 1638 * - %PF_IDLE may not be set for an init_task whose CPU hasn't 1639 * yet been onlined. 1640 * 1641 * - %PF_IDLE can be set on tasks that are not init_tasks. See 1642 * play_idle_precise() used by CONFIG_IDLE_INJECT. 1643 * 1644 * Test for idle_sched_class as only init_tasks are on it. 1645 */ 1646 if (p->sched_class != &idle_sched_class) 1647 break; 1648 } 1649 if (!p) 1650 return NULL; 1651 1652 iter->rq = task_rq_lock(p, &iter->rf); 1653 iter->locked = p; 1654 1655 return p; 1656 } 1657 1658 /** 1659 * scx_add_event - Increase an event counter for 'name' by 'cnt' 1660 * @sch: scx_sched to account events for 1661 * @name: an event name defined in struct scx_event_stats 1662 * @cnt: the number of times the event occurred 1663 * 1664 * This can be used when preemption is not disabled. 1665 */ 1666 #define scx_add_event(sch, name, cnt) do { \ 1667 this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ 1668 trace_sched_ext_event(#name, (cnt)); \ 1669 } while(0) 1670 1671 /** 1672 * __scx_add_event - Increase an event counter for 'name' by 'cnt' 1673 * @sch: scx_sched to account events for 1674 * @name: an event name defined in struct scx_event_stats 1675 * @cnt: the number of times the event occurred 1676 * 1677 * This should be used only when preemption is disabled.
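 *
 * For example, an enqueue-path caller that already holds the rq lock (and is
 * therefore non-preemptible) would account an event as:
 *
 *	__scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);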
1678 */ 1679 #define __scx_add_event(sch, name, cnt) do { \ 1680 __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ 1681 trace_sched_ext_event(#name, cnt); \ 1682 } while(0) 1683 1684 /** 1685 * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e' 1686 * @dst_e: destination event stats 1687 * @src_e: source event stats 1688 * @kind: a kind of event to be aggregated 1689 */ 1690 #define scx_agg_event(dst_e, src_e, kind) do { \ 1691 (dst_e)->kind += READ_ONCE((src_e)->kind); \ 1692 } while(0) 1693 1694 /** 1695 * scx_dump_event - Dump an event 'kind' in 'events' to 's' 1696 * @s: output seq_buf 1697 * @events: event stats 1698 * @kind: a kind of event to dump 1699 */ 1700 #define scx_dump_event(s, events, kind) do { \ 1701 dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \ 1702 } while (0) 1703 1704 1705 static void scx_read_events(struct scx_sched *sch, 1706 struct scx_event_stats *events); 1707 1708 static enum scx_enable_state scx_enable_state(void) 1709 { 1710 return atomic_read(&scx_enable_state_var); 1711 } 1712 1713 static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to) 1714 { 1715 return atomic_xchg(&scx_enable_state_var, to); 1716 } 1717 1718 static bool scx_tryset_enable_state(enum scx_enable_state to, 1719 enum scx_enable_state from) 1720 { 1721 int from_v = from; 1722 1723 return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to); 1724 } 1725 1726 /** 1727 * wait_ops_state - Busy-wait the specified ops state to end 1728 * @p: target task 1729 * @opss: state to wait the end of 1730 * 1731 * Busy-wait for @p to transition out of @opss. This can only be used when the 1732 * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also 1733 * has load_acquire semantics to ensure that the caller can see the updates made 1734 * in the enqueueing and dispatching paths. 1735 */ 1736 static void wait_ops_state(struct task_struct *p, unsigned long opss) 1737 { 1738 do { 1739 cpu_relax(); 1740 } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); 1741 } 1742 1743 static inline bool __cpu_valid(s32 cpu) 1744 { 1745 return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); 1746 } 1747 1748 /** 1749 * ops_cpu_valid - Verify a cpu number, to be used on ops input args 1750 * @sch: scx_sched to abort on error 1751 * @cpu: cpu number which came from a BPF ops 1752 * @where: extra information reported on error 1753 * 1754 * @cpu is a cpu number which came from the BPF scheduler and can be any value. 1755 * Verify that it is in range and one of the possible cpus. If invalid, trigger 1756 * an ops error. 1757 */ 1758 static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) 1759 { 1760 if (__cpu_valid(cpu)) { 1761 return true; 1762 } else { 1763 scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); 1764 return false; 1765 } 1766 } 1767 1768 /** 1769 * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args 1770 * @cpu: cpu number which came from a BPF ops 1771 * @where: extra information reported on error 1772 * 1773 * The same as ops_cpu_valid() but @sch is implicit. 1774 */ 1775 static bool kf_cpu_valid(u32 cpu, const char *where) 1776 { 1777 if (__cpu_valid(cpu)) { 1778 return true; 1779 } else { 1780 scx_kf_error("invalid CPU %d%s%s", cpu, where ? 
" " : "", where ?: ""); 1781 return false; 1782 } 1783 } 1784 1785 /** 1786 * ops_sanitize_err - Sanitize a -errno value 1787 * @sch: scx_sched to error out on error 1788 * @ops_name: operation to blame on failure 1789 * @err: -errno value to sanitize 1790 * 1791 * Verify @err is a valid -errno. If not, trigger scx_error() and return 1792 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can 1793 * cause misbehaviors. For an example, a large negative return from 1794 * ops.init_task() triggers an oops when passed up the call chain because the 1795 * value fails IS_ERR() test after being encoded with ERR_PTR() and then is 1796 * handled as a pointer. 1797 */ 1798 static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err) 1799 { 1800 if (err < 0 && err >= -MAX_ERRNO) 1801 return err; 1802 1803 scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err); 1804 return -EPROTO; 1805 } 1806 1807 static void run_deferred(struct rq *rq) 1808 { 1809 process_ddsp_deferred_locals(rq); 1810 } 1811 1812 static void deferred_bal_cb_workfn(struct rq *rq) 1813 { 1814 run_deferred(rq); 1815 } 1816 1817 static void deferred_irq_workfn(struct irq_work *irq_work) 1818 { 1819 struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work); 1820 1821 raw_spin_rq_lock(rq); 1822 run_deferred(rq); 1823 raw_spin_rq_unlock(rq); 1824 } 1825 1826 /** 1827 * schedule_deferred - Schedule execution of deferred actions on an rq 1828 * @rq: target rq 1829 * 1830 * Schedule execution of deferred actions on @rq. Must be called with @rq 1831 * locked. Deferred actions are executed with @rq locked but unpinned, and thus 1832 * can unlock @rq to e.g. migrate tasks to other rqs. 1833 */ 1834 static void schedule_deferred(struct rq *rq) 1835 { 1836 lockdep_assert_rq_held(rq); 1837 1838 /* 1839 * If in the middle of waking up a task, task_woken_scx() will be called 1840 * afterwards which will then run the deferred actions, no need to 1841 * schedule anything. 1842 */ 1843 if (rq->scx.flags & SCX_RQ_IN_WAKEUP) 1844 return; 1845 1846 /* 1847 * If in balance, the balance callbacks will be called before rq lock is 1848 * released. Schedule one. 1849 */ 1850 if (rq->scx.flags & SCX_RQ_IN_BALANCE) { 1851 queue_balance_callback(rq, &rq->scx.deferred_bal_cb, 1852 deferred_bal_cb_workfn); 1853 return; 1854 } 1855 1856 /* 1857 * No scheduler hooks available. Queue an irq work. They are executed on 1858 * IRQ re-enable which may take a bit longer than the scheduler hooks. 1859 * The above WAKEUP and BALANCE paths should cover most of the cases and 1860 * the time to IRQ re-enable shouldn't be long. 1861 */ 1862 irq_work_queue(&rq->scx.deferred_irq_work); 1863 } 1864 1865 /** 1866 * touch_core_sched - Update timestamp used for core-sched task ordering 1867 * @rq: rq to read clock from, must be locked 1868 * @p: task to update the timestamp for 1869 * 1870 * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to 1871 * implement global or local-DSQ FIFO ordering for core-sched. Should be called 1872 * when a task becomes runnable and its turn on the CPU ends (e.g. slice 1873 * exhaustion). 1874 */ 1875 static void touch_core_sched(struct rq *rq, struct task_struct *p) 1876 { 1877 lockdep_assert_rq_held(rq); 1878 1879 #ifdef CONFIG_SCHED_CORE 1880 /* 1881 * It's okay to update the timestamp spuriously. Use 1882 * sched_core_disabled() which is cheaper than enabled(). 
1883 * 1884 * As this is used to determine ordering between tasks of sibling CPUs, 1885 * it may be better to use per-core dispatch sequence instead. 1886 */ 1887 if (!sched_core_disabled()) 1888 p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq)); 1889 #endif 1890 } 1891 1892 /** 1893 * touch_core_sched_dispatch - Update core-sched timestamp on dispatch 1894 * @rq: rq to read clock from, must be locked 1895 * @p: task being dispatched 1896 * 1897 * If the BPF scheduler implements custom core-sched ordering via 1898 * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO 1899 * ordering within each local DSQ. This function is called from dispatch paths 1900 * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. 1901 */ 1902 static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) 1903 { 1904 lockdep_assert_rq_held(rq); 1905 1906 #ifdef CONFIG_SCHED_CORE 1907 if (unlikely(SCX_HAS_OP(scx_root, core_sched_before))) 1908 touch_core_sched(rq, p); 1909 #endif 1910 } 1911 1912 static void update_curr_scx(struct rq *rq) 1913 { 1914 struct task_struct *curr = rq->curr; 1915 s64 delta_exec; 1916 1917 delta_exec = update_curr_common(rq); 1918 if (unlikely(delta_exec <= 0)) 1919 return; 1920 1921 if (curr->scx.slice != SCX_SLICE_INF) { 1922 curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec); 1923 if (!curr->scx.slice) 1924 touch_core_sched(rq, curr); 1925 } 1926 } 1927 1928 static bool scx_dsq_priq_less(struct rb_node *node_a, 1929 const struct rb_node *node_b) 1930 { 1931 const struct task_struct *a = 1932 container_of(node_a, struct task_struct, scx.dsq_priq); 1933 const struct task_struct *b = 1934 container_of(node_b, struct task_struct, scx.dsq_priq); 1935 1936 return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); 1937 } 1938 1939 static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) 1940 { 1941 /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ 1942 WRITE_ONCE(dsq->nr, dsq->nr + delta); 1943 } 1944 1945 static void refill_task_slice_dfl(struct task_struct *p) 1946 { 1947 p->scx.slice = SCX_SLICE_DFL; 1948 __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1); 1949 } 1950 1951 static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, 1952 struct task_struct *p, u64 enq_flags) 1953 { 1954 bool is_local = dsq->id == SCX_DSQ_LOCAL; 1955 1956 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); 1957 WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || 1958 !RB_EMPTY_NODE(&p->scx.dsq_priq)); 1959 1960 if (!is_local) { 1961 raw_spin_lock(&dsq->lock); 1962 if (unlikely(dsq->id == SCX_DSQ_INVALID)) { 1963 scx_error(sch, "attempting to dispatch to a destroyed dsq"); 1964 /* fall back to the global dsq */ 1965 raw_spin_unlock(&dsq->lock); 1966 dsq = find_global_dsq(p); 1967 raw_spin_lock(&dsq->lock); 1968 } 1969 } 1970 1971 if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && 1972 (enq_flags & SCX_ENQ_DSQ_PRIQ))) { 1973 /* 1974 * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from 1975 * their FIFO queues. To avoid confusion and accidentally 1976 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we 1977 * disallow any internal DSQ from doing vtime ordering of 1978 * tasks. 1979 */ 1980 scx_error(sch, "cannot use vtime ordering for built-in DSQs"); 1981 enq_flags &= ~SCX_ENQ_DSQ_PRIQ; 1982 } 1983 1984 if (enq_flags & SCX_ENQ_DSQ_PRIQ) { 1985 struct rb_node *rbp; 1986 1987 /* 1988 * A PRIQ DSQ shouldn't be using FIFO enqueueing. 
As tasks are 1989 * linked to both the rbtree and list on PRIQs, this can only be 1990 * tested easily when adding the first task. 1991 */ 1992 if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && 1993 nldsq_next_task(dsq, NULL, false))) 1994 scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks", 1995 dsq->id); 1996 1997 p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; 1998 rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less); 1999 2000 /* 2001 * Find the previous task and insert after it on the list so 2002 * that @dsq->list is vtime ordered. 2003 */ 2004 rbp = rb_prev(&p->scx.dsq_priq); 2005 if (rbp) { 2006 struct task_struct *prev = 2007 container_of(rbp, struct task_struct, 2008 scx.dsq_priq); 2009 list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); 2010 } else { 2011 list_add(&p->scx.dsq_list.node, &dsq->list); 2012 } 2013 } else { 2014 /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ 2015 if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) 2016 scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", 2017 dsq->id); 2018 2019 if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) 2020 list_add(&p->scx.dsq_list.node, &dsq->list); 2021 else 2022 list_add_tail(&p->scx.dsq_list.node, &dsq->list); 2023 } 2024 2025 /* seq records the order tasks are queued, used by BPF DSQ iterator */ 2026 dsq->seq++; 2027 p->scx.dsq_seq = dsq->seq; 2028 2029 dsq_mod_nr(dsq, 1); 2030 p->scx.dsq = dsq; 2031 2032 /* 2033 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the 2034 * direct dispatch path, but we clear them here because the direct 2035 * dispatch verdict may be overridden on the enqueue path during e.g. 2036 * bypass. 2037 */ 2038 p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; 2039 p->scx.ddsp_enq_flags = 0; 2040 2041 /* 2042 * We're transitioning out of QUEUEING or DISPATCHING. store_release to 2043 * match waiters' load_acquire. 2044 */ 2045 if (enq_flags & SCX_ENQ_CLEAR_OPSS) 2046 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 2047 2048 if (is_local) { 2049 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 2050 bool preempt = false; 2051 2052 if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && 2053 rq->curr->sched_class == &ext_sched_class) { 2054 rq->curr->scx.slice = 0; 2055 preempt = true; 2056 } 2057 2058 if (preempt || sched_class_above(&ext_sched_class, 2059 rq->curr->sched_class)) 2060 resched_curr(rq); 2061 } else { 2062 raw_spin_unlock(&dsq->lock); 2063 } 2064 } 2065 2066 static void task_unlink_from_dsq(struct task_struct *p, 2067 struct scx_dispatch_q *dsq) 2068 { 2069 WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node)); 2070 2071 if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { 2072 rb_erase(&p->scx.dsq_priq, &dsq->priq); 2073 RB_CLEAR_NODE(&p->scx.dsq_priq); 2074 p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; 2075 } 2076 2077 list_del_init(&p->scx.dsq_list.node); 2078 dsq_mod_nr(dsq, -1); 2079 } 2080 2081 static void dispatch_dequeue(struct rq *rq, struct task_struct *p) 2082 { 2083 struct scx_dispatch_q *dsq = p->scx.dsq; 2084 bool is_local = dsq == &rq->scx.local_dsq; 2085 2086 if (!dsq) { 2087 /* 2088 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. 2089 * Unlinking is all that's needed to cancel. 2090 */ 2091 if (unlikely(!list_empty(&p->scx.dsq_list.node))) 2092 list_del_init(&p->scx.dsq_list.node); 2093 2094 /* 2095 * When dispatching directly from the BPF scheduler to a local 2096 * DSQ, the task isn't associated with any DSQ but 2097 * @p->scx.holding_cpu may be set under the protection of 2098 * %SCX_OPSS_DISPATCHING. 
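	 * Clearing it below tells dispatch_to_local_dsq() that it lost the
	 * race, mirroring the holding_cpu handling further down in this
	 * function.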
2099 */ 2100 if (p->scx.holding_cpu >= 0) 2101 p->scx.holding_cpu = -1; 2102 2103 return; 2104 } 2105 2106 if (!is_local) 2107 raw_spin_lock(&dsq->lock); 2108 2109 /* 2110 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't 2111 * change underneath us. 2112 */ 2113 if (p->scx.holding_cpu < 0) { 2114 /* @p must still be on @dsq, dequeue */ 2115 task_unlink_from_dsq(p, dsq); 2116 } else { 2117 /* 2118 * We're racing against dispatch_to_local_dsq() which already 2119 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the 2120 * holding_cpu which tells dispatch_to_local_dsq() that it lost 2121 * the race. 2122 */ 2123 WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node)); 2124 p->scx.holding_cpu = -1; 2125 } 2126 p->scx.dsq = NULL; 2127 2128 if (!is_local) 2129 raw_spin_unlock(&dsq->lock); 2130 } 2131 2132 static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, 2133 struct rq *rq, u64 dsq_id, 2134 struct task_struct *p) 2135 { 2136 struct scx_dispatch_q *dsq; 2137 2138 if (dsq_id == SCX_DSQ_LOCAL) 2139 return &rq->scx.local_dsq; 2140 2141 if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 2142 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 2143 2144 if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) 2145 return find_global_dsq(p); 2146 2147 return &cpu_rq(cpu)->scx.local_dsq; 2148 } 2149 2150 if (dsq_id == SCX_DSQ_GLOBAL) 2151 dsq = find_global_dsq(p); 2152 else 2153 dsq = find_user_dsq(sch, dsq_id); 2154 2155 if (unlikely(!dsq)) { 2156 scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]", 2157 dsq_id, p->comm, p->pid); 2158 return find_global_dsq(p); 2159 } 2160 2161 return dsq; 2162 } 2163 2164 static void mark_direct_dispatch(struct task_struct *ddsp_task, 2165 struct task_struct *p, u64 dsq_id, 2166 u64 enq_flags) 2167 { 2168 /* 2169 * Mark that dispatch already happened from ops.select_cpu() or 2170 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value 2171 * which can never match a valid task pointer. 2172 */ 2173 __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); 2174 2175 /* @p must match the task on the enqueue path */ 2176 if (unlikely(p != ddsp_task)) { 2177 if (IS_ERR(ddsp_task)) 2178 scx_kf_error("%s[%d] already direct-dispatched", 2179 p->comm, p->pid); 2180 else 2181 scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", 2182 ddsp_task->comm, ddsp_task->pid, 2183 p->comm, p->pid); 2184 return; 2185 } 2186 2187 WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); 2188 WARN_ON_ONCE(p->scx.ddsp_enq_flags); 2189 2190 p->scx.ddsp_dsq_id = dsq_id; 2191 p->scx.ddsp_enq_flags = enq_flags; 2192 } 2193 2194 static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, 2195 u64 enq_flags) 2196 { 2197 struct rq *rq = task_rq(p); 2198 struct scx_dispatch_q *dsq = 2199 find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); 2200 2201 touch_core_sched_dispatch(rq, p); 2202 2203 p->scx.ddsp_enq_flags |= enq_flags; 2204 2205 /* 2206 * We are in the enqueue path with @rq locked and pinned, and thus can't 2207 * double lock a remote rq and enqueue to its local DSQ. For 2208 * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer 2209 * the enqueue so that it's executed when @rq can be unlocked. 
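	 * The deferred entries are drained by process_ddsp_deferred_locals(),
	 * which schedule_deferred() arranges to run from task_woken_scx(), a
	 * balance callback or an irq work depending on the context.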
2210 */ 2211 if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { 2212 unsigned long opss; 2213 2214 opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; 2215 2216 switch (opss & SCX_OPSS_STATE_MASK) { 2217 case SCX_OPSS_NONE: 2218 break; 2219 case SCX_OPSS_QUEUEING: 2220 /* 2221 * As @p was never passed to the BPF side, _release is 2222 * not strictly necessary. Still do it for consistency. 2223 */ 2224 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 2225 break; 2226 default: 2227 WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()", 2228 p->comm, p->pid, opss); 2229 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 2230 break; 2231 } 2232 2233 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); 2234 list_add_tail(&p->scx.dsq_list.node, 2235 &rq->scx.ddsp_deferred_locals); 2236 schedule_deferred(rq); 2237 return; 2238 } 2239 2240 dispatch_enqueue(sch, dsq, p, 2241 p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); 2242 } 2243 2244 static bool scx_rq_online(struct rq *rq) 2245 { 2246 /* 2247 * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates 2248 * the online state as seen from the BPF scheduler. cpu_active() test 2249 * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will 2250 * stay set until the current scheduling operation is complete even if 2251 * we aren't locking @rq. 2252 */ 2253 return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq))); 2254 } 2255 2256 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, 2257 int sticky_cpu) 2258 { 2259 struct scx_sched *sch = scx_root; 2260 struct task_struct **ddsp_taskp; 2261 unsigned long qseq; 2262 2263 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); 2264 2265 /* rq migration */ 2266 if (sticky_cpu == cpu_of(rq)) 2267 goto local_norefill; 2268 2269 /* 2270 * If !scx_rq_online(), we already told the BPF scheduler that the CPU 2271 * is offline and are just running the hotplug path. Don't bother the 2272 * BPF scheduler. 2273 */ 2274 if (!scx_rq_online(rq)) 2275 goto local; 2276 2277 if (scx_rq_bypassing(rq)) { 2278 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 2279 goto global; 2280 } 2281 2282 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 2283 goto direct; 2284 2285 /* see %SCX_OPS_ENQ_EXITING */ 2286 if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) && 2287 unlikely(p->flags & PF_EXITING)) { 2288 __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1); 2289 goto local; 2290 } 2291 2292 /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ 2293 if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) && 2294 is_migration_disabled(p)) { 2295 __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); 2296 goto local; 2297 } 2298 2299 if (unlikely(!SCX_HAS_OP(sch, enqueue))) 2300 goto global; 2301 2302 /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ 2303 qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; 2304 2305 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 2306 atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); 2307 2308 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 2309 WARN_ON_ONCE(*ddsp_taskp); 2310 *ddsp_taskp = p; 2311 2312 SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags); 2313 2314 *ddsp_taskp = NULL; 2315 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 2316 goto direct; 2317 2318 /* 2319 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or 2320 * dequeue may be waiting. The store_release matches their load_acquire. 
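	 * The matching acquires include the atomic_long_read_acquire()s in
	 * ops_dequeue() and wait_ops_state().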
2321 */ 2322 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); 2323 return; 2324 2325 direct: 2326 direct_dispatch(sch, p, enq_flags); 2327 return; 2328 2329 local: 2330 /* 2331 * For task-ordering, slice refill must be treated as implying the end 2332 * of the current slice. Otherwise, the longer @p stays on the CPU, the 2333 * higher priority it becomes from scx_prio_less()'s POV. 2334 */ 2335 touch_core_sched(rq, p); 2336 refill_task_slice_dfl(p); 2337 local_norefill: 2338 dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); 2339 return; 2340 2341 global: 2342 touch_core_sched(rq, p); /* see the comment in local: */ 2343 refill_task_slice_dfl(p); 2344 dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags); 2345 } 2346 2347 static bool task_runnable(const struct task_struct *p) 2348 { 2349 return !list_empty(&p->scx.runnable_node); 2350 } 2351 2352 static void set_task_runnable(struct rq *rq, struct task_struct *p) 2353 { 2354 lockdep_assert_rq_held(rq); 2355 2356 if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { 2357 p->scx.runnable_at = jiffies; 2358 p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; 2359 } 2360 2361 /* 2362 * list_add_tail() must be used. scx_bypass() depends on tasks being 2363 * appended to the runnable_list. 2364 */ 2365 list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); 2366 } 2367 2368 static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) 2369 { 2370 list_del_init(&p->scx.runnable_node); 2371 if (reset_runnable_at) 2372 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 2373 } 2374 2375 static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) 2376 { 2377 struct scx_sched *sch = scx_root; 2378 int sticky_cpu = p->scx.sticky_cpu; 2379 2380 if (enq_flags & ENQUEUE_WAKEUP) 2381 rq->scx.flags |= SCX_RQ_IN_WAKEUP; 2382 2383 enq_flags |= rq->scx.extra_enq_flags; 2384 2385 if (sticky_cpu >= 0) 2386 p->scx.sticky_cpu = -1; 2387 2388 /* 2389 * Restoring a running task will be immediately followed by 2390 * set_next_task_scx() which expects the task to not be on the BPF 2391 * scheduler as tasks can only start running through local DSQs. Force 2392 * direct-dispatch into the local DSQ by setting the sticky_cpu. 
2393 */ 2394 if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) 2395 sticky_cpu = cpu_of(rq); 2396 2397 if (p->scx.flags & SCX_TASK_QUEUED) { 2398 WARN_ON_ONCE(!task_runnable(p)); 2399 goto out; 2400 } 2401 2402 set_task_runnable(rq, p); 2403 p->scx.flags |= SCX_TASK_QUEUED; 2404 rq->scx.nr_running++; 2405 add_nr_running(rq, 1); 2406 2407 if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p)) 2408 SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags); 2409 2410 if (enq_flags & SCX_ENQ_WAKEUP) 2411 touch_core_sched(rq, p); 2412 2413 do_enqueue_task(rq, p, enq_flags, sticky_cpu); 2414 out: 2415 rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; 2416 2417 if ((enq_flags & SCX_ENQ_CPU_SELECTED) && 2418 unlikely(cpu_of(rq) != p->scx.selected_cpu)) 2419 __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1); 2420 } 2421 2422 static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) 2423 { 2424 struct scx_sched *sch = scx_root; 2425 unsigned long opss; 2426 2427 /* dequeue is always temporary, don't reset runnable_at */ 2428 clr_task_runnable(p, false); 2429 2430 /* acquire ensures that we see the preceding updates on QUEUED */ 2431 opss = atomic_long_read_acquire(&p->scx.ops_state); 2432 2433 switch (opss & SCX_OPSS_STATE_MASK) { 2434 case SCX_OPSS_NONE: 2435 break; 2436 case SCX_OPSS_QUEUEING: 2437 /* 2438 * QUEUEING is started and finished while holding @p's rq lock. 2439 * As we're holding the rq lock now, we shouldn't see QUEUEING. 2440 */ 2441 BUG(); 2442 case SCX_OPSS_QUEUED: 2443 if (SCX_HAS_OP(sch, dequeue)) 2444 SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, 2445 p, deq_flags); 2446 2447 if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 2448 SCX_OPSS_NONE)) 2449 break; 2450 fallthrough; 2451 case SCX_OPSS_DISPATCHING: 2452 /* 2453 * If @p is being dispatched from the BPF scheduler to a DSQ, 2454 * wait for the transfer to complete so that @p doesn't get 2455 * added to its DSQ after dequeueing is complete. 2456 * 2457 * As we're waiting on DISPATCHING with the rq locked, the 2458 * dispatching side shouldn't try to lock the rq while 2459 * DISPATCHING is set. See dispatch_to_local_dsq(). 2460 * 2461 * DISPATCHING shouldn't have qseq set and control can reach 2462 * here with NONE @opss from the above QUEUED case block. 2463 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 2464 */ 2465 wait_ops_state(p, SCX_OPSS_DISPATCHING); 2466 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 2467 break; 2468 } 2469 } 2470 2471 static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) 2472 { 2473 struct scx_sched *sch = scx_root; 2474 2475 if (!(p->scx.flags & SCX_TASK_QUEUED)) { 2476 WARN_ON_ONCE(task_runnable(p)); 2477 return true; 2478 } 2479 2480 ops_dequeue(rq, p, deq_flags); 2481 2482 /* 2483 * A currently running task which is going off @rq first gets dequeued 2484 * and then stops running. As we want running <-> stopping transitions 2485 * to be contained within runnable <-> quiescent transitions, trigger 2486 * ->stopping() early here instead of in put_prev_task_scx(). 2487 * 2488 * @p may go through multiple stopping <-> running transitions between 2489 * here and put_prev_task_scx() if task attribute changes occur while 2490 * balance_scx() leaves @rq unlocked. However, they don't contain any 2491 * information meaningful to the BPF scheduler and can be suppressed by 2492 * skipping the callbacks if the task is !QUEUED. 
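	 * set_next_task_scx() and put_prev_task_scx() apply the same !QUEUED
	 * test before invoking ops.running() and ops.stopping().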
2493 */ 2494 if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) { 2495 update_curr_scx(rq); 2496 SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false); 2497 } 2498 2499 if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p)) 2500 SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags); 2501 2502 if (deq_flags & SCX_DEQ_SLEEP) 2503 p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; 2504 else 2505 p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; 2506 2507 p->scx.flags &= ~SCX_TASK_QUEUED; 2508 rq->scx.nr_running--; 2509 sub_nr_running(rq, 1); 2510 2511 dispatch_dequeue(rq, p); 2512 return true; 2513 } 2514 2515 static void yield_task_scx(struct rq *rq) 2516 { 2517 struct scx_sched *sch = scx_root; 2518 struct task_struct *p = rq->curr; 2519 2520 if (SCX_HAS_OP(sch, yield)) 2521 SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); 2522 else 2523 p->scx.slice = 0; 2524 } 2525 2526 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) 2527 { 2528 struct scx_sched *sch = scx_root; 2529 struct task_struct *from = rq->curr; 2530 2531 if (SCX_HAS_OP(sch, yield)) 2532 return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, 2533 from, to); 2534 else 2535 return false; 2536 } 2537 2538 static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, 2539 struct scx_dispatch_q *src_dsq, 2540 struct rq *dst_rq) 2541 { 2542 struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq; 2543 2544 /* @dsq is locked and @p is on @dst_rq */ 2545 lockdep_assert_held(&src_dsq->lock); 2546 lockdep_assert_rq_held(dst_rq); 2547 2548 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 2549 2550 if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) 2551 list_add(&p->scx.dsq_list.node, &dst_dsq->list); 2552 else 2553 list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); 2554 2555 dsq_mod_nr(dst_dsq, 1); 2556 p->scx.dsq = dst_dsq; 2557 } 2558 2559 /** 2560 * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ 2561 * @p: task to move 2562 * @enq_flags: %SCX_ENQ_* 2563 * @src_rq: rq to move the task from, locked on entry, released on return 2564 * @dst_rq: rq to move the task into, locked on return 2565 * 2566 * Move @p which is currently on @src_rq to @dst_rq's local DSQ. 2567 */ 2568 static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, 2569 struct rq *src_rq, struct rq *dst_rq) 2570 { 2571 lockdep_assert_rq_held(src_rq); 2572 2573 /* the following marks @p MIGRATING which excludes dequeue */ 2574 deactivate_task(src_rq, p, 0); 2575 set_task_cpu(p, cpu_of(dst_rq)); 2576 p->scx.sticky_cpu = cpu_of(dst_rq); 2577 2578 raw_spin_rq_unlock(src_rq); 2579 raw_spin_rq_lock(dst_rq); 2580 2581 /* 2582 * We want to pass scx-specific enq_flags but activate_task() will 2583 * truncate the upper 32 bit. As we own @rq, we can pass them through 2584 * @rq->scx.extra_enq_flags instead. 2585 */ 2586 WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); 2587 WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); 2588 dst_rq->scx.extra_enq_flags = enq_flags; 2589 activate_task(dst_rq, p, 0); 2590 dst_rq->scx.extra_enq_flags = 0; 2591 } 2592 2593 /* 2594 * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two 2595 * differences: 2596 * 2597 * - is_cpu_allowed() asks "Can this task run on this CPU?" while 2598 * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to 2599 * this CPU?". 
2600 * 2601 * While migration is disabled, is_cpu_allowed() has to say "yes" as the task 2602 * must be allowed to finish on the CPU that it's currently on regardless of 2603 * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the 2604 * BPF scheduler shouldn't attempt to migrate a task which has migration 2605 * disabled. 2606 * 2607 * - The BPF scheduler is bypassed while the rq is offline and we can always say 2608 * no to the BPF scheduler initiated migrations while offline. 2609 * 2610 * The caller must ensure that @p and @rq are on different CPUs. 2611 */ 2612 static bool task_can_run_on_remote_rq(struct scx_sched *sch, 2613 struct task_struct *p, struct rq *rq, 2614 bool enforce) 2615 { 2616 int cpu = cpu_of(rq); 2617 2618 WARN_ON_ONCE(task_cpu(p) == cpu); 2619 2620 /* 2621 * If @p has migration disabled, @p->cpus_ptr is updated to contain only 2622 * the pinned CPU in migrate_disable_switch() while @p is being switched 2623 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is 2624 * updated and thus another CPU may see @p on a DSQ inbetween leading to 2625 * @p passing the below task_allowed_on_cpu() check while migration is 2626 * disabled. 2627 * 2628 * Test the migration disabled state first as the race window is narrow 2629 * and the BPF scheduler failing to check migration disabled state can 2630 * easily be masked if task_allowed_on_cpu() is done first. 2631 */ 2632 if (unlikely(is_migration_disabled(p))) { 2633 if (enforce) 2634 scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", 2635 p->comm, p->pid, task_cpu(p), cpu); 2636 return false; 2637 } 2638 2639 /* 2640 * We don't require the BPF scheduler to avoid dispatching to offline 2641 * CPUs mostly for convenience but also because CPUs can go offline 2642 * between scx_bpf_dsq_insert() calls and here. Trigger error iff the 2643 * picked CPU is outside the allowed mask. 2644 */ 2645 if (!task_allowed_on_cpu(p, cpu)) { 2646 if (enforce) 2647 scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", 2648 cpu, p->comm, p->pid); 2649 return false; 2650 } 2651 2652 if (!scx_rq_online(rq)) { 2653 if (enforce) 2654 __scx_add_event(scx_root, 2655 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); 2656 return false; 2657 } 2658 2659 return true; 2660 } 2661 2662 /** 2663 * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq 2664 * @p: target task 2665 * @dsq: locked DSQ @p is currently on 2666 * @src_rq: rq @p is currently on, stable with @dsq locked 2667 * 2668 * Called with @dsq locked but no rq's locked. We want to move @p to a different 2669 * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is 2670 * required when transferring into a local DSQ. Even when transferring into a 2671 * non-local DSQ, it's better to use the same mechanism to protect against 2672 * dequeues and maintain the invariant that @p->scx.dsq can only change while 2673 * @src_rq is locked, which e.g. scx_dump_task() depends on. 2674 * 2675 * We want to grab @src_rq but that can deadlock if we try while locking @dsq, 2676 * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As 2677 * this may race with dequeue, which can't drop the rq lock or fail, do a little 2678 * dancing from our side. 2679 * 2680 * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets 2681 * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu 2682 * would be cleared to -1. 
While other cpus may have updated it to different 2683 * values afterwards, as this operation can't be preempted or recurse, the 2684 * holding_cpu can never become this CPU again before we're done. Thus, we can 2685 * tell whether we lost to dequeue by testing whether the holding_cpu still 2686 * points to this CPU. See dispatch_dequeue() for the counterpart. 2687 * 2688 * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is 2689 * still valid. %false if lost to dequeue. 2690 */ 2691 static bool unlink_dsq_and_lock_src_rq(struct task_struct *p, 2692 struct scx_dispatch_q *dsq, 2693 struct rq *src_rq) 2694 { 2695 s32 cpu = raw_smp_processor_id(); 2696 2697 lockdep_assert_held(&dsq->lock); 2698 2699 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 2700 task_unlink_from_dsq(p, dsq); 2701 p->scx.holding_cpu = cpu; 2702 2703 raw_spin_unlock(&dsq->lock); 2704 raw_spin_rq_lock(src_rq); 2705 2706 /* task_rq couldn't have changed if we're still the holding cpu */ 2707 return likely(p->scx.holding_cpu == cpu) && 2708 !WARN_ON_ONCE(src_rq != task_rq(p)); 2709 } 2710 2711 static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, 2712 struct scx_dispatch_q *dsq, struct rq *src_rq) 2713 { 2714 raw_spin_rq_unlock(this_rq); 2715 2716 if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) { 2717 move_remote_task_to_local_dsq(p, 0, src_rq, this_rq); 2718 return true; 2719 } else { 2720 raw_spin_rq_unlock(src_rq); 2721 raw_spin_rq_lock(this_rq); 2722 return false; 2723 } 2724 } 2725 2726 /** 2727 * move_task_between_dsqs() - Move a task from one DSQ to another 2728 * @sch: scx_sched being operated on 2729 * @p: target task 2730 * @enq_flags: %SCX_ENQ_* 2731 * @src_dsq: DSQ @p is currently on, must not be a local DSQ 2732 * @dst_dsq: DSQ @p is being moved to, can be any DSQ 2733 * 2734 * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local 2735 * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq 2736 * will change. As @p's task_rq is locked, this function doesn't need to use the 2737 * holding_cpu mechanism. 2738 * 2739 * On return, @src_dsq is unlocked and only @p's new task_rq, which is the 2740 * return value, is locked. 2741 */ 2742 static struct rq *move_task_between_dsqs(struct scx_sched *sch, 2743 struct task_struct *p, u64 enq_flags, 2744 struct scx_dispatch_q *src_dsq, 2745 struct scx_dispatch_q *dst_dsq) 2746 { 2747 struct rq *src_rq = task_rq(p), *dst_rq; 2748 2749 BUG_ON(src_dsq->id == SCX_DSQ_LOCAL); 2750 lockdep_assert_held(&src_dsq->lock); 2751 lockdep_assert_rq_held(src_rq); 2752 2753 if (dst_dsq->id == SCX_DSQ_LOCAL) { 2754 dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); 2755 if (src_rq != dst_rq && 2756 unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { 2757 dst_dsq = find_global_dsq(p); 2758 dst_rq = src_rq; 2759 } 2760 } else { 2761 /* no need to migrate if destination is a non-local DSQ */ 2762 dst_rq = src_rq; 2763 } 2764 2765 /* 2766 * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different 2767 * CPU, @p will be migrated. 
2768 */ 2769 if (dst_dsq->id == SCX_DSQ_LOCAL) { 2770 /* @p is going from a non-local DSQ to a local DSQ */ 2771 if (src_rq == dst_rq) { 2772 task_unlink_from_dsq(p, src_dsq); 2773 move_local_task_to_local_dsq(p, enq_flags, 2774 src_dsq, dst_rq); 2775 raw_spin_unlock(&src_dsq->lock); 2776 } else { 2777 raw_spin_unlock(&src_dsq->lock); 2778 move_remote_task_to_local_dsq(p, enq_flags, 2779 src_rq, dst_rq); 2780 } 2781 } else { 2782 /* 2783 * @p is going from a non-local DSQ to a non-local DSQ. As 2784 * $src_dsq is already locked, do an abbreviated dequeue. 2785 */ 2786 task_unlink_from_dsq(p, src_dsq); 2787 p->scx.dsq = NULL; 2788 raw_spin_unlock(&src_dsq->lock); 2789 2790 dispatch_enqueue(sch, dst_dsq, p, enq_flags); 2791 } 2792 2793 return dst_rq; 2794 } 2795 2796 /* 2797 * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly 2798 * banging on the same DSQ on a large NUMA system to the point where switching 2799 * to the bypass mode can take a long time. Inject artificial delays while the 2800 * bypass mode is switching to guarantee timely completion. 2801 */ 2802 static void scx_breather(struct rq *rq) 2803 { 2804 u64 until; 2805 2806 lockdep_assert_rq_held(rq); 2807 2808 if (likely(!atomic_read(&scx_breather_depth))) 2809 return; 2810 2811 raw_spin_rq_unlock(rq); 2812 2813 until = ktime_get_ns() + NSEC_PER_MSEC; 2814 2815 do { 2816 int cnt = 1024; 2817 while (atomic_read(&scx_breather_depth) && --cnt) 2818 cpu_relax(); 2819 } while (atomic_read(&scx_breather_depth) && 2820 time_before64(ktime_get_ns(), until)); 2821 2822 raw_spin_rq_lock(rq); 2823 } 2824 2825 static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, 2826 struct scx_dispatch_q *dsq) 2827 { 2828 struct task_struct *p; 2829 retry: 2830 /* 2831 * This retry loop can repeatedly race against scx_bypass() dequeueing 2832 * tasks from @dsq trying to put the system into the bypass mode. On 2833 * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock 2834 * the machine into soft lockups. Give a breather. 2835 */ 2836 scx_breather(rq); 2837 2838 /* 2839 * The caller can't expect to successfully consume a task if the task's 2840 * addition to @dsq isn't guaranteed to be visible somehow. Test 2841 * @dsq->list without locking and skip if it seems empty. 2842 */ 2843 if (list_empty(&dsq->list)) 2844 return false; 2845 2846 raw_spin_lock(&dsq->lock); 2847 2848 nldsq_for_each_task(p, dsq) { 2849 struct rq *task_rq = task_rq(p); 2850 2851 if (rq == task_rq) { 2852 task_unlink_from_dsq(p, dsq); 2853 move_local_task_to_local_dsq(p, 0, dsq, rq); 2854 raw_spin_unlock(&dsq->lock); 2855 return true; 2856 } 2857 2858 if (task_can_run_on_remote_rq(sch, p, rq, false)) { 2859 if (likely(consume_remote_task(rq, p, dsq, task_rq))) 2860 return true; 2861 goto retry; 2862 } 2863 } 2864 2865 raw_spin_unlock(&dsq->lock); 2866 return false; 2867 } 2868 2869 static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq) 2870 { 2871 int node = cpu_to_node(cpu_of(rq)); 2872 2873 return consume_dispatch_q(sch, rq, sch->global_dsqs[node]); 2874 } 2875 2876 /** 2877 * dispatch_to_local_dsq - Dispatch a task to a local dsq 2878 * @sch: scx_sched being operated on 2879 * @rq: current rq which is locked 2880 * @dst_dsq: destination DSQ 2881 * @p: task to dispatch 2882 * @enq_flags: %SCX_ENQ_* 2883 * 2884 * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local 2885 * DSQ. This function performs all the synchronization dancing needed because 2886 * local DSQs are protected with rq locks. 
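 * Both finish_dispatch() and process_ddsp_deferred_locals() use this helper to
 * insert into a possibly remote CPU's local DSQ.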
2887 * 2888 * The caller must have exclusive ownership of @p (e.g. through 2889 * %SCX_OPSS_DISPATCHING). 2890 */ 2891 static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, 2892 struct scx_dispatch_q *dst_dsq, 2893 struct task_struct *p, u64 enq_flags) 2894 { 2895 struct rq *src_rq = task_rq(p); 2896 struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); 2897 struct rq *locked_rq = rq; 2898 2899 /* 2900 * We're synchronized against dequeue through DISPATCHING. As @p can't 2901 * be dequeued, its task_rq and cpus_allowed are stable too. 2902 * 2903 * If dispatching to @rq that @p is already on, no lock dancing needed. 2904 */ 2905 if (rq == src_rq && rq == dst_rq) { 2906 dispatch_enqueue(sch, dst_dsq, p, 2907 enq_flags | SCX_ENQ_CLEAR_OPSS); 2908 return; 2909 } 2910 2911 if (src_rq != dst_rq && 2912 unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { 2913 dispatch_enqueue(sch, find_global_dsq(p), p, 2914 enq_flags | SCX_ENQ_CLEAR_OPSS); 2915 return; 2916 } 2917 2918 /* 2919 * @p is on a possibly remote @src_rq which we need to lock to move the 2920 * task. If dequeue is in progress, it'd be locking @src_rq and waiting 2921 * on DISPATCHING, so we can't grab @src_rq lock while holding 2922 * DISPATCHING. 2923 * 2924 * As DISPATCHING guarantees that @p is wholly ours, we can pretend that 2925 * we're moving from a DSQ and use the same mechanism - mark the task 2926 * under transfer with holding_cpu, release DISPATCHING and then follow 2927 * the same protocol. See unlink_dsq_and_lock_src_rq(). 2928 */ 2929 p->scx.holding_cpu = raw_smp_processor_id(); 2930 2931 /* store_release ensures that dequeue sees the above */ 2932 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 2933 2934 /* switch to @src_rq lock */ 2935 if (locked_rq != src_rq) { 2936 raw_spin_rq_unlock(locked_rq); 2937 locked_rq = src_rq; 2938 raw_spin_rq_lock(src_rq); 2939 } 2940 2941 /* task_rq couldn't have changed if we're still the holding cpu */ 2942 if (likely(p->scx.holding_cpu == raw_smp_processor_id()) && 2943 !WARN_ON_ONCE(src_rq != task_rq(p))) { 2944 /* 2945 * If @p is staying on the same rq, there's no need to go 2946 * through the full deactivate/activate cycle. Optimize by 2947 * abbreviating move_remote_task_to_local_dsq(). 2948 */ 2949 if (src_rq == dst_rq) { 2950 p->scx.holding_cpu = -1; 2951 dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p, 2952 enq_flags); 2953 } else { 2954 move_remote_task_to_local_dsq(p, enq_flags, 2955 src_rq, dst_rq); 2956 /* task has been moved to dst_rq, which is now locked */ 2957 locked_rq = dst_rq; 2958 } 2959 2960 /* if the destination CPU is idle, wake it up */ 2961 if (sched_class_above(p->sched_class, dst_rq->curr->sched_class)) 2962 resched_curr(dst_rq); 2963 } 2964 2965 /* switch back to @rq lock */ 2966 if (locked_rq != rq) { 2967 raw_spin_rq_unlock(locked_rq); 2968 raw_spin_rq_lock(rq); 2969 } 2970 } 2971 2972 /** 2973 * finish_dispatch - Asynchronously finish dispatching a task 2974 * @rq: current rq which is locked 2975 * @p: task to finish dispatching 2976 * @qseq_at_dispatch: qseq when @p started getting dispatched 2977 * @dsq_id: destination DSQ ID 2978 * @enq_flags: %SCX_ENQ_* 2979 * 2980 * Dispatching to local DSQs may need to wait for queueing to complete or 2981 * require rq lock dancing. As we don't wanna do either while inside 2982 * ops.dispatch() to avoid locking order inversion, we split dispatching into 2983 * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the 2984 * task and its qseq. 
Once ops.dispatch() returns, this function is called to 2985 * finish up. 2986 * 2987 * There is no guarantee that @p is still valid for dispatching or even that it 2988 * was valid in the first place. Make sure that the task is still owned by the 2989 * BPF scheduler and claim the ownership before dispatching. 2990 */ 2991 static void finish_dispatch(struct scx_sched *sch, struct rq *rq, 2992 struct task_struct *p, 2993 unsigned long qseq_at_dispatch, 2994 u64 dsq_id, u64 enq_flags) 2995 { 2996 struct scx_dispatch_q *dsq; 2997 unsigned long opss; 2998 2999 touch_core_sched_dispatch(rq, p); 3000 retry: 3001 /* 3002 * No need for _acquire here. @p is accessed only after a successful 3003 * try_cmpxchg to DISPATCHING. 3004 */ 3005 opss = atomic_long_read(&p->scx.ops_state); 3006 3007 switch (opss & SCX_OPSS_STATE_MASK) { 3008 case SCX_OPSS_DISPATCHING: 3009 case SCX_OPSS_NONE: 3010 /* someone else already got to it */ 3011 return; 3012 case SCX_OPSS_QUEUED: 3013 /* 3014 * If qseq doesn't match, @p has gone through at least one 3015 * dispatch/dequeue and re-enqueue cycle between 3016 * scx_bpf_dsq_insert() and here and we have no claim on it. 3017 */ 3018 if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) 3019 return; 3020 3021 /* 3022 * While we know @p is accessible, we don't yet have a claim on 3023 * it - the BPF scheduler is allowed to dispatch tasks 3024 * spuriously and there can be a racing dequeue attempt. Let's 3025 * claim @p by atomically transitioning it from QUEUED to 3026 * DISPATCHING. 3027 */ 3028 if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 3029 SCX_OPSS_DISPATCHING))) 3030 break; 3031 goto retry; 3032 case SCX_OPSS_QUEUEING: 3033 /* 3034 * do_enqueue_task() is in the process of transferring the task 3035 * to the BPF scheduler while holding @p's rq lock. As we aren't 3036 * holding any kernel or BPF resource that the enqueue path may 3037 * depend upon, it's safe to wait. 3038 */ 3039 wait_ops_state(p, opss); 3040 goto retry; 3041 } 3042 3043 BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); 3044 3045 dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p); 3046 3047 if (dsq->id == SCX_DSQ_LOCAL) 3048 dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); 3049 else 3050 dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); 3051 } 3052 3053 static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq) 3054 { 3055 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 3056 u32 u; 3057 3058 for (u = 0; u < dspc->cursor; u++) { 3059 struct scx_dsp_buf_ent *ent = &dspc->buf[u]; 3060 3061 finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id, 3062 ent->enq_flags); 3063 } 3064 3065 dspc->nr_tasks += dspc->cursor; 3066 dspc->cursor = 0; 3067 } 3068 3069 static int balance_one(struct rq *rq, struct task_struct *prev) 3070 { 3071 struct scx_sched *sch = scx_root; 3072 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 3073 bool prev_on_scx = prev->sched_class == &ext_sched_class; 3074 bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED; 3075 int nr_loops = SCX_DSP_MAX_LOOPS; 3076 3077 lockdep_assert_rq_held(rq); 3078 rq->scx.flags |= SCX_RQ_IN_BALANCE; 3079 rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); 3080 3081 if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && 3082 unlikely(rq->scx.cpu_released)) { 3083 /* 3084 * If the previous sched_class for the current CPU was not SCX, 3085 * notify the BPF scheduler that it again has control of the 3086 * core. This callback complements ->cpu_release(), which is 3087 * emitted in switch_class(). 
3088 */ 3089 if (SCX_HAS_OP(sch, cpu_acquire)) 3090 SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq, 3091 cpu_of(rq), NULL); 3092 rq->scx.cpu_released = false; 3093 } 3094 3095 if (prev_on_scx) { 3096 update_curr_scx(rq); 3097 3098 /* 3099 * If @prev is runnable & has slice left, it has priority and 3100 * fetching more just increases latency for the fetched tasks. 3101 * Tell pick_task_scx() to keep running @prev. If the BPF 3102 * scheduler wants to handle this explicitly, it should 3103 * implement ->cpu_release(). 3104 * 3105 * See scx_disable_workfn() for the explanation on the bypassing 3106 * test. 3107 */ 3108 if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) { 3109 rq->scx.flags |= SCX_RQ_BAL_KEEP; 3110 goto has_tasks; 3111 } 3112 } 3113 3114 /* if there already are tasks to run, nothing to do */ 3115 if (rq->scx.local_dsq.nr) 3116 goto has_tasks; 3117 3118 if (consume_global_dsq(sch, rq)) 3119 goto has_tasks; 3120 3121 if (unlikely(!SCX_HAS_OP(sch, dispatch)) || 3122 scx_rq_bypassing(rq) || !scx_rq_online(rq)) 3123 goto no_tasks; 3124 3125 dspc->rq = rq; 3126 3127 /* 3128 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, 3129 * the local DSQ might still end up empty after a successful 3130 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() 3131 * produced some tasks, retry. The BPF scheduler may depend on this 3132 * looping behavior to simplify its implementation. 3133 */ 3134 do { 3135 dspc->nr_tasks = 0; 3136 3137 SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, 3138 cpu_of(rq), prev_on_scx ? prev : NULL); 3139 3140 flush_dispatch_buf(sch, rq); 3141 3142 if (prev_on_rq && prev->scx.slice) { 3143 rq->scx.flags |= SCX_RQ_BAL_KEEP; 3144 goto has_tasks; 3145 } 3146 if (rq->scx.local_dsq.nr) 3147 goto has_tasks; 3148 if (consume_global_dsq(sch, rq)) 3149 goto has_tasks; 3150 3151 /* 3152 * ops.dispatch() can trap us in this loop by repeatedly 3153 * dispatching ineligible tasks. Break out once in a while to 3154 * allow the watchdog to run. As IRQ can't be enabled in 3155 * balance(), we want to complete this scheduling cycle and then 3156 * start a new one. IOW, we want to call resched_curr() on the 3157 * next, most likely idle, task, not the current one. Use 3158 * scx_bpf_kick_cpu() for deferred kicking. 3159 */ 3160 if (unlikely(!--nr_loops)) { 3161 scx_bpf_kick_cpu(cpu_of(rq), 0); 3162 break; 3163 } 3164 } while (dspc->nr_tasks); 3165 3166 no_tasks: 3167 /* 3168 * Didn't find another task to run. Keep running @prev unless 3169 * %SCX_OPS_ENQ_LAST is in effect. 3170 */ 3171 if (prev_on_rq && 3172 (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) { 3173 rq->scx.flags |= SCX_RQ_BAL_KEEP; 3174 __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1); 3175 goto has_tasks; 3176 } 3177 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; 3178 return false; 3179 3180 has_tasks: 3181 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; 3182 return true; 3183 } 3184 3185 static int balance_scx(struct rq *rq, struct task_struct *prev, 3186 struct rq_flags *rf) 3187 { 3188 int ret; 3189 3190 rq_unpin_lock(rq, rf); 3191 3192 ret = balance_one(rq, prev); 3193 3194 #ifdef CONFIG_SCHED_SMT 3195 /* 3196 * When core-sched is enabled, this ops.balance() call will be followed 3197 * by pick_task_scx() on this CPU and the SMT siblings. Balance the 3198 * siblings too. 
3199 */ 3200 if (sched_core_enabled(rq)) { 3201 const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); 3202 int scpu; 3203 3204 for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { 3205 struct rq *srq = cpu_rq(scpu); 3206 struct task_struct *sprev = srq->curr; 3207 3208 WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); 3209 update_rq_clock(srq); 3210 balance_one(srq, sprev); 3211 } 3212 } 3213 #endif 3214 rq_repin_lock(rq, rf); 3215 3216 return ret; 3217 } 3218 3219 static void process_ddsp_deferred_locals(struct rq *rq) 3220 { 3221 struct task_struct *p; 3222 3223 lockdep_assert_rq_held(rq); 3224 3225 /* 3226 * Now that @rq can be unlocked, execute the deferred enqueueing of 3227 * tasks directly dispatched to the local DSQs of other CPUs. See 3228 * direct_dispatch(). Keep popping from the head instead of using 3229 * list_for_each_entry_safe() as dispatch_to_local_dsq() may unlock @rq 3230 * temporarily. 3231 */ 3232 while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, 3233 struct task_struct, scx.dsq_list.node))) { 3234 struct scx_sched *sch = scx_root; 3235 struct scx_dispatch_q *dsq; 3236 3237 list_del_init(&p->scx.dsq_list.node); 3238 3239 dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); 3240 if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) 3241 dispatch_to_local_dsq(sch, rq, dsq, p, 3242 p->scx.ddsp_enq_flags); 3243 } 3244 } 3245 3246 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) 3247 { 3248 struct scx_sched *sch = scx_root; 3249 3250 if (p->scx.flags & SCX_TASK_QUEUED) { 3251 /* 3252 * Core-sched might decide to execute @p before it is 3253 * dispatched. Call ops_dequeue() to notify the BPF scheduler. 3254 */ 3255 ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC); 3256 dispatch_dequeue(rq, p); 3257 } 3258 3259 p->se.exec_start = rq_clock_task(rq); 3260 3261 /* see dequeue_task_scx() on why we skip when !QUEUED */ 3262 if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED)) 3263 SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p); 3264 3265 clr_task_runnable(p, true); 3266 3267 /* 3268 * @p is getting newly scheduled or got kicked after someone updated its 3269 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). 3270 */ 3271 if ((p->scx.slice == SCX_SLICE_INF) != 3272 (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { 3273 if (p->scx.slice == SCX_SLICE_INF) 3274 rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; 3275 else 3276 rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; 3277 3278 sched_update_tick_dependency(rq); 3279 3280 /* 3281 * For now, let's refresh the load_avgs just when transitioning 3282 * in and out of nohz. In the future, we might want to add a 3283 * mechanism which calls the following periodically on 3284 * tick-stopped CPUs. 3285 */ 3286 update_other_load_avgs(rq); 3287 } 3288 } 3289 3290 static enum scx_cpu_preempt_reason 3291 preempt_reason_from_class(const struct sched_class *class) 3292 { 3293 if (class == &stop_sched_class) 3294 return SCX_CPU_PREEMPT_STOP; 3295 if (class == &dl_sched_class) 3296 return SCX_CPU_PREEMPT_DL; 3297 if (class == &rt_sched_class) 3298 return SCX_CPU_PREEMPT_RT; 3299 return SCX_CPU_PREEMPT_UNKNOWN; 3300 } 3301 3302 static void switch_class(struct rq *rq, struct task_struct *next) 3303 { 3304 struct scx_sched *sch = scx_root; 3305 const struct sched_class *next_class = next->sched_class; 3306 3307 /* 3308 * Pairs with the smp_load_acquire() issued by a CPU in 3309 * kick_cpus_irq_workfn() who is waiting for this CPU to perform a 3310 * resched.
3311 */ 3312 smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); 3313 if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) 3314 return; 3315 3316 /* 3317 * The callback is conceptually meant to convey that the CPU is no 3318 * longer under the control of SCX. Therefore, don't invoke the callback 3319 * if the next class is below SCX (in which case the BPF scheduler has 3320 * actively decided not to schedule any tasks on the CPU). 3321 */ 3322 if (sched_class_above(&ext_sched_class, next_class)) 3323 return; 3324 3325 /* 3326 * At this point we know that SCX was preempted by a higher priority 3327 * sched_class, so invoke the ->cpu_release() callback if we have not 3328 * done so already. We only send the callback once between SCX being 3329 * preempted, and it regaining control of the CPU. 3330 * 3331 * ->cpu_release() complements ->cpu_acquire(), which is emitted the 3332 * next time that balance_scx() is invoked. 3333 */ 3334 if (!rq->scx.cpu_released) { 3335 if (SCX_HAS_OP(sch, cpu_release)) { 3336 struct scx_cpu_release_args args = { 3337 .reason = preempt_reason_from_class(next_class), 3338 .task = next, 3339 }; 3340 3341 SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq, 3342 cpu_of(rq), &args); 3343 } 3344 rq->scx.cpu_released = true; 3345 } 3346 } 3347 3348 static void put_prev_task_scx(struct rq *rq, struct task_struct *p, 3349 struct task_struct *next) 3350 { 3351 struct scx_sched *sch = scx_root; 3352 update_curr_scx(rq); 3353 3354 /* see dequeue_task_scx() on why we skip when !QUEUED */ 3355 if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED)) 3356 SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true); 3357 3358 if (p->scx.flags & SCX_TASK_QUEUED) { 3359 set_task_runnable(rq, p); 3360 3361 /* 3362 * If @p has slice left and is being put, @p is getting 3363 * preempted by a higher priority scheduler class or core-sched 3364 * forcing a different task. Leave it at the head of the local 3365 * DSQ. 3366 */ 3367 if (p->scx.slice && !scx_rq_bypassing(rq)) { 3368 dispatch_enqueue(sch, &rq->scx.local_dsq, p, 3369 SCX_ENQ_HEAD); 3370 goto switch_class; 3371 } 3372 3373 /* 3374 * If @p is runnable but we're about to enter a lower 3375 * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell 3376 * ops.enqueue() that @p is the only one available for this cpu, 3377 * which should trigger an explicit follow-up scheduling event. 3378 */ 3379 if (sched_class_above(&ext_sched_class, next->sched_class)) { 3380 WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); 3381 do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); 3382 } else { 3383 do_enqueue_task(rq, p, 0, -1); 3384 } 3385 } 3386 3387 switch_class: 3388 if (next && next->sched_class != &ext_sched_class) 3389 switch_class(rq, next); 3390 } 3391 3392 static struct task_struct *first_local_task(struct rq *rq) 3393 { 3394 return list_first_entry_or_null(&rq->scx.local_dsq.list, 3395 struct task_struct, scx.dsq_list.node); 3396 } 3397 3398 static struct task_struct *pick_task_scx(struct rq *rq) 3399 { 3400 struct task_struct *prev = rq->curr; 3401 struct task_struct *p; 3402 bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; 3403 bool kick_idle = false; 3404 3405 /* 3406 * WORKAROUND: 3407 * 3408 * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just 3409 * have gone through balance_scx(). Unfortunately, there currently is a 3410 * bug where fair could say yes on balance() but no on pick_task(), 3411 * which then ends up calling pick_task_scx() without preceding 3412 * balance_scx(). 
3413 * 3414 * Keep running @prev if possible and avoid stalling from entering idle 3415 * without balancing. 3416 * 3417 * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() 3418 * if pick_task_scx() is called without preceding balance_scx(). 3419 */ 3420 if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { 3421 if (prev->scx.flags & SCX_TASK_QUEUED) { 3422 keep_prev = true; 3423 } else { 3424 keep_prev = false; 3425 kick_idle = true; 3426 } 3427 } else if (unlikely(keep_prev && 3428 prev->sched_class != &ext_sched_class)) { 3429 /* 3430 * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is 3431 * conditional on scx_enabled() and may have been skipped. 3432 */ 3433 WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); 3434 keep_prev = false; 3435 } 3436 3437 /* 3438 * If balance_scx() is telling us to keep running @prev, replenish slice 3439 * if necessary and keep running @prev. Otherwise, pop the first one 3440 * from the local DSQ. 3441 */ 3442 if (keep_prev) { 3443 p = prev; 3444 if (!p->scx.slice) 3445 refill_task_slice_dfl(p); 3446 } else { 3447 p = first_local_task(rq); 3448 if (!p) { 3449 if (kick_idle) 3450 scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); 3451 return NULL; 3452 } 3453 3454 if (unlikely(!p->scx.slice)) { 3455 struct scx_sched *sch = scx_root; 3456 3457 if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { 3458 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", 3459 p->comm, p->pid, __func__); 3460 sch->warned_zero_slice = true; 3461 } 3462 refill_task_slice_dfl(p); 3463 } 3464 } 3465 3466 return p; 3467 } 3468 3469 #ifdef CONFIG_SCHED_CORE 3470 /** 3471 * scx_prio_less - Task ordering for core-sched 3472 * @a: task A 3473 * @b: task B 3474 * @in_fi: in forced idle state 3475 * 3476 * Core-sched is implemented as an additional scheduling layer on top of the 3477 * usual sched_class'es and needs to find out the expected task ordering. For 3478 * SCX, core-sched calls this function to interrogate the task ordering. 3479 * 3480 * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used 3481 * to implement the default task ordering. The older the timestamp, the higher 3482 * priority the task - the global FIFO ordering matching the default scheduling 3483 * behavior. 3484 * 3485 * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to 3486 * implement FIFO ordering within each local DSQ. See pick_task_scx(). 3487 */ 3488 bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, 3489 bool in_fi) 3490 { 3491 struct scx_sched *sch = scx_root; 3492 3493 /* 3494 * The const qualifiers are dropped from task_struct pointers when 3495 * calling ops.core_sched_before(). Accesses are controlled by the 3496 * verifier. 3497 */ 3498 if (SCX_HAS_OP(sch, core_sched_before) && 3499 !scx_rq_bypassing(task_rq(a))) 3500 return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before, 3501 NULL, 3502 (struct task_struct *)a, 3503 (struct task_struct *)b); 3504 else 3505 return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); 3506 } 3507 #endif /* CONFIG_SCHED_CORE */ 3508 3509 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) 3510 { 3511 struct scx_sched *sch = scx_root; 3512 bool rq_bypass; 3513 3514 /* 3515 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it 3516 * can be a good migration opportunity with low cache and memory 3517 * footprint. Returning a CPU different than @prev_cpu triggers 3518 * immediate rq migration. 
However, for SCX, as the current rq 3519 * association doesn't dictate where the task is going to run, this 3520 * doesn't fit well. If necessary, we can later add a dedicated method 3521 * which can decide to preempt self to force it through the regular 3522 * scheduling path. 3523 */ 3524 if (unlikely(wake_flags & WF_EXEC)) 3525 return prev_cpu; 3526 3527 rq_bypass = scx_rq_bypassing(task_rq(p)); 3528 if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) { 3529 s32 cpu; 3530 struct task_struct **ddsp_taskp; 3531 3532 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 3533 WARN_ON_ONCE(*ddsp_taskp); 3534 *ddsp_taskp = p; 3535 3536 cpu = SCX_CALL_OP_TASK_RET(sch, 3537 SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, 3538 select_cpu, NULL, p, prev_cpu, 3539 wake_flags); 3540 p->scx.selected_cpu = cpu; 3541 *ddsp_taskp = NULL; 3542 if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()")) 3543 return cpu; 3544 else 3545 return prev_cpu; 3546 } else { 3547 s32 cpu; 3548 3549 cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); 3550 if (cpu >= 0) { 3551 refill_task_slice_dfl(p); 3552 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; 3553 } else { 3554 cpu = prev_cpu; 3555 } 3556 p->scx.selected_cpu = cpu; 3557 3558 if (rq_bypass) 3559 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 3560 return cpu; 3561 } 3562 } 3563 3564 static void task_woken_scx(struct rq *rq, struct task_struct *p) 3565 { 3566 run_deferred(rq); 3567 } 3568 3569 static void set_cpus_allowed_scx(struct task_struct *p, 3570 struct affinity_context *ac) 3571 { 3572 struct scx_sched *sch = scx_root; 3573 3574 set_cpus_allowed_common(p, ac); 3575 3576 /* 3577 * The effective cpumask is stored in @p->cpus_ptr which may temporarily 3578 * differ from the configured one in @p->cpus_mask. Always tell the bpf 3579 * scheduler the effective one. 3580 * 3581 * Fine-grained memory write control is enforced by BPF making the const 3582 * designation pointless. Cast it away when calling the operation. 3583 */ 3584 if (SCX_HAS_OP(sch, set_cpumask)) 3585 SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL, 3586 p, (struct cpumask *)p->cpus_ptr); 3587 } 3588 3589 static void handle_hotplug(struct rq *rq, bool online) 3590 { 3591 struct scx_sched *sch = scx_root; 3592 int cpu = cpu_of(rq); 3593 3594 atomic_long_inc(&scx_hotplug_seq); 3595 3596 /* 3597 * scx_root updates are protected by cpus_read_lock() and will stay 3598 * stable here. Note that we can't depend on scx_enabled() test as the 3599 * hotplug ops need to be enabled before __scx_enabled is set. 3600 */ 3601 if (unlikely(!sch)) 3602 return; 3603 3604 if (scx_enabled()) 3605 scx_idle_update_selcpu_topology(&sch->ops); 3606 3607 if (online && SCX_HAS_OP(sch, cpu_online)) 3608 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu); 3609 else if (!online && SCX_HAS_OP(sch, cpu_offline)) 3610 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu); 3611 else 3612 scx_exit(sch, SCX_EXIT_UNREG_KERN, 3613 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 3614 "cpu %d going %s, exiting scheduler", cpu, 3615 online ? 
"online" : "offline"); 3616 } 3617 3618 void scx_rq_activate(struct rq *rq) 3619 { 3620 handle_hotplug(rq, true); 3621 } 3622 3623 void scx_rq_deactivate(struct rq *rq) 3624 { 3625 handle_hotplug(rq, false); 3626 } 3627 3628 static void rq_online_scx(struct rq *rq) 3629 { 3630 rq->scx.flags |= SCX_RQ_ONLINE; 3631 } 3632 3633 static void rq_offline_scx(struct rq *rq) 3634 { 3635 rq->scx.flags &= ~SCX_RQ_ONLINE; 3636 } 3637 3638 3639 static bool check_rq_for_timeouts(struct rq *rq) 3640 { 3641 struct scx_sched *sch; 3642 struct task_struct *p; 3643 struct rq_flags rf; 3644 bool timed_out = false; 3645 3646 rq_lock_irqsave(rq, &rf); 3647 sch = rcu_dereference_bh(scx_root); 3648 if (unlikely(!sch)) 3649 goto out_unlock; 3650 3651 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { 3652 unsigned long last_runnable = p->scx.runnable_at; 3653 3654 if (unlikely(time_after(jiffies, 3655 last_runnable + scx_watchdog_timeout))) { 3656 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); 3657 3658 scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, 3659 "%s[%d] failed to run for %u.%03us", 3660 p->comm, p->pid, dur_ms / 1000, dur_ms % 1000); 3661 timed_out = true; 3662 break; 3663 } 3664 } 3665 out_unlock: 3666 rq_unlock_irqrestore(rq, &rf); 3667 return timed_out; 3668 } 3669 3670 static void scx_watchdog_workfn(struct work_struct *work) 3671 { 3672 int cpu; 3673 3674 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 3675 3676 for_each_online_cpu(cpu) { 3677 if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) 3678 break; 3679 3680 cond_resched(); 3681 } 3682 queue_delayed_work(system_unbound_wq, to_delayed_work(work), 3683 scx_watchdog_timeout / 2); 3684 } 3685 3686 void scx_tick(struct rq *rq) 3687 { 3688 struct scx_sched *sch; 3689 unsigned long last_check; 3690 3691 if (!scx_enabled()) 3692 return; 3693 3694 sch = rcu_dereference_bh(scx_root); 3695 if (unlikely(!sch)) 3696 return; 3697 3698 last_check = READ_ONCE(scx_watchdog_timestamp); 3699 if (unlikely(time_after(jiffies, 3700 last_check + READ_ONCE(scx_watchdog_timeout)))) { 3701 u32 dur_ms = jiffies_to_msecs(jiffies - last_check); 3702 3703 scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, 3704 "watchdog failed to check in for %u.%03us", 3705 dur_ms / 1000, dur_ms % 1000); 3706 } 3707 3708 update_other_load_avgs(rq); 3709 } 3710 3711 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) 3712 { 3713 struct scx_sched *sch = scx_root; 3714 3715 update_curr_scx(rq); 3716 3717 /* 3718 * While disabling, always resched and refresh core-sched timestamp as 3719 * we can't trust the slice management or ops.core_sched_before(). 3720 */ 3721 if (scx_rq_bypassing(rq)) { 3722 curr->scx.slice = 0; 3723 touch_core_sched(rq, curr); 3724 } else if (SCX_HAS_OP(sch, tick)) { 3725 SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr); 3726 } 3727 3728 if (!curr->scx.slice) 3729 resched_curr(rq); 3730 } 3731 3732 #ifdef CONFIG_EXT_GROUP_SCHED 3733 static struct cgroup *tg_cgrp(struct task_group *tg) 3734 { 3735 /* 3736 * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, 3737 * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the 3738 * root cgroup. 
3739 */ 3740 if (tg && tg->css.cgroup) 3741 return tg->css.cgroup; 3742 else 3743 return &cgrp_dfl_root.cgrp; 3744 } 3745 3746 #define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), 3747 3748 #else /* CONFIG_EXT_GROUP_SCHED */ 3749 3750 #define SCX_INIT_TASK_ARGS_CGROUP(tg) 3751 3752 #endif /* CONFIG_EXT_GROUP_SCHED */ 3753 3754 static enum scx_task_state scx_get_task_state(const struct task_struct *p) 3755 { 3756 return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; 3757 } 3758 3759 static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) 3760 { 3761 enum scx_task_state prev_state = scx_get_task_state(p); 3762 bool warn = false; 3763 3764 BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); 3765 3766 switch (state) { 3767 case SCX_TASK_NONE: 3768 break; 3769 case SCX_TASK_INIT: 3770 warn = prev_state != SCX_TASK_NONE; 3771 break; 3772 case SCX_TASK_READY: 3773 warn = prev_state == SCX_TASK_NONE; 3774 break; 3775 case SCX_TASK_ENABLED: 3776 warn = prev_state != SCX_TASK_READY; 3777 break; 3778 default: 3779 warn = true; 3780 return; 3781 } 3782 3783 WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", 3784 prev_state, state, p->comm, p->pid); 3785 3786 p->scx.flags &= ~SCX_TASK_STATE_MASK; 3787 p->scx.flags |= state << SCX_TASK_STATE_SHIFT; 3788 } 3789 3790 static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork) 3791 { 3792 struct scx_sched *sch = scx_root; 3793 int ret; 3794 3795 p->scx.disallow = false; 3796 3797 if (SCX_HAS_OP(sch, init_task)) { 3798 struct scx_init_task_args args = { 3799 SCX_INIT_TASK_ARGS_CGROUP(tg) 3800 .fork = fork, 3801 }; 3802 3803 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL, 3804 p, &args); 3805 if (unlikely(ret)) { 3806 ret = ops_sanitize_err(sch, "init_task", ret); 3807 return ret; 3808 } 3809 } 3810 3811 scx_set_task_state(p, SCX_TASK_INIT); 3812 3813 if (p->scx.disallow) { 3814 if (!fork) { 3815 struct rq *rq; 3816 struct rq_flags rf; 3817 3818 rq = task_rq_lock(p, &rf); 3819 3820 /* 3821 * We're in the load path and @p->policy will be applied 3822 * right after. Reverting @p->policy here and rejecting 3823 * %SCHED_EXT transitions from scx_check_setscheduler() 3824 * guarantees that if ops.init_task() sets @p->disallow, 3825 * @p can never be in SCX. 3826 */ 3827 if (p->policy == SCHED_EXT) { 3828 p->policy = SCHED_NORMAL; 3829 atomic_long_inc(&scx_nr_rejected); 3830 } 3831 3832 task_rq_unlock(rq, p, &rf); 3833 } else if (p->policy == SCHED_EXT) { 3834 scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", 3835 p->comm, p->pid); 3836 } 3837 } 3838 3839 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 3840 return 0; 3841 } 3842 3843 static void scx_enable_task(struct task_struct *p) 3844 { 3845 struct scx_sched *sch = scx_root; 3846 struct rq *rq = task_rq(p); 3847 u32 weight; 3848 3849 lockdep_assert_rq_held(rq); 3850 3851 /* 3852 * Set the weight before calling ops.enable() so that the scheduler 3853 * doesn't see a stale value if they inspect the task struct. 
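 *
 * As a worked example (assuming the usual nice-level tables): a nice-0
 * task has a sched_prio_to_weight[] weight of 1024, which
 * sched_weight_to_cgroup() maps to the cgroup-scale default
 * (CGROUP_WEIGHT_DFL == 100), while SCHED_IDLE tasks start from
 * WEIGHT_IDLEPRIO instead.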
3854 */ 3855 if (task_has_idle_policy(p)) 3856 weight = WEIGHT_IDLEPRIO; 3857 else 3858 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; 3859 3860 p->scx.weight = sched_weight_to_cgroup(weight); 3861 3862 if (SCX_HAS_OP(sch, enable)) 3863 SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p); 3864 scx_set_task_state(p, SCX_TASK_ENABLED); 3865 3866 if (SCX_HAS_OP(sch, set_weight)) 3867 SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, 3868 p, p->scx.weight); 3869 } 3870 3871 static void scx_disable_task(struct task_struct *p) 3872 { 3873 struct scx_sched *sch = scx_root; 3874 struct rq *rq = task_rq(p); 3875 3876 lockdep_assert_rq_held(rq); 3877 WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); 3878 3879 if (SCX_HAS_OP(sch, disable)) 3880 SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p); 3881 scx_set_task_state(p, SCX_TASK_READY); 3882 } 3883 3884 static void scx_exit_task(struct task_struct *p) 3885 { 3886 struct scx_sched *sch = scx_root; 3887 struct scx_exit_task_args args = { 3888 .cancelled = false, 3889 }; 3890 3891 lockdep_assert_rq_held(task_rq(p)); 3892 3893 switch (scx_get_task_state(p)) { 3894 case SCX_TASK_NONE: 3895 return; 3896 case SCX_TASK_INIT: 3897 args.cancelled = true; 3898 break; 3899 case SCX_TASK_READY: 3900 break; 3901 case SCX_TASK_ENABLED: 3902 scx_disable_task(p); 3903 break; 3904 default: 3905 WARN_ON_ONCE(true); 3906 return; 3907 } 3908 3909 if (SCX_HAS_OP(sch, exit_task)) 3910 SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p), 3911 p, &args); 3912 scx_set_task_state(p, SCX_TASK_NONE); 3913 } 3914 3915 void init_scx_entity(struct sched_ext_entity *scx) 3916 { 3917 memset(scx, 0, sizeof(*scx)); 3918 INIT_LIST_HEAD(&scx->dsq_list.node); 3919 RB_CLEAR_NODE(&scx->dsq_priq); 3920 scx->sticky_cpu = -1; 3921 scx->holding_cpu = -1; 3922 INIT_LIST_HEAD(&scx->runnable_node); 3923 scx->runnable_at = jiffies; 3924 scx->ddsp_dsq_id = SCX_DSQ_INVALID; 3925 scx->slice = SCX_SLICE_DFL; 3926 } 3927 3928 void scx_pre_fork(struct task_struct *p) 3929 { 3930 /* 3931 * BPF scheduler enable/disable paths want to be able to iterate and 3932 * update all tasks which can become complex when racing forks. As 3933 * enable/disable are very cold paths, let's use a percpu_rwsem to 3934 * exclude forks. 3935 */ 3936 percpu_down_read(&scx_fork_rwsem); 3937 } 3938 3939 int scx_fork(struct task_struct *p) 3940 { 3941 percpu_rwsem_assert_held(&scx_fork_rwsem); 3942 3943 if (scx_init_task_enabled) 3944 return scx_init_task(p, task_group(p), true); 3945 else 3946 return 0; 3947 } 3948 3949 void scx_post_fork(struct task_struct *p) 3950 { 3951 if (scx_init_task_enabled) { 3952 scx_set_task_state(p, SCX_TASK_READY); 3953 3954 /* 3955 * Enable the task immediately if it's running on sched_ext. 3956 * Otherwise, it'll be enabled in switching_to_scx() if and 3957 * when it's ever configured to run with a SCHED_EXT policy. 
3958 */ 3959 if (p->sched_class == &ext_sched_class) { 3960 struct rq_flags rf; 3961 struct rq *rq; 3962 3963 rq = task_rq_lock(p, &rf); 3964 scx_enable_task(p); 3965 task_rq_unlock(rq, p, &rf); 3966 } 3967 } 3968 3969 spin_lock_irq(&scx_tasks_lock); 3970 list_add_tail(&p->scx.tasks_node, &scx_tasks); 3971 spin_unlock_irq(&scx_tasks_lock); 3972 3973 percpu_up_read(&scx_fork_rwsem); 3974 } 3975 3976 void scx_cancel_fork(struct task_struct *p) 3977 { 3978 if (scx_enabled()) { 3979 struct rq *rq; 3980 struct rq_flags rf; 3981 3982 rq = task_rq_lock(p, &rf); 3983 WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); 3984 scx_exit_task(p); 3985 task_rq_unlock(rq, p, &rf); 3986 } 3987 3988 percpu_up_read(&scx_fork_rwsem); 3989 } 3990 3991 void sched_ext_free(struct task_struct *p) 3992 { 3993 unsigned long flags; 3994 3995 spin_lock_irqsave(&scx_tasks_lock, flags); 3996 list_del_init(&p->scx.tasks_node); 3997 spin_unlock_irqrestore(&scx_tasks_lock, flags); 3998 3999 /* 4000 * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED 4001 * transitions can't race us. Disable ops for @p. 4002 */ 4003 if (scx_get_task_state(p) != SCX_TASK_NONE) { 4004 struct rq_flags rf; 4005 struct rq *rq; 4006 4007 rq = task_rq_lock(p, &rf); 4008 scx_exit_task(p); 4009 task_rq_unlock(rq, p, &rf); 4010 } 4011 } 4012 4013 static void reweight_task_scx(struct rq *rq, struct task_struct *p, 4014 const struct load_weight *lw) 4015 { 4016 struct scx_sched *sch = scx_root; 4017 4018 lockdep_assert_rq_held(task_rq(p)); 4019 4020 p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); 4021 if (SCX_HAS_OP(sch, set_weight)) 4022 SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, 4023 p, p->scx.weight); 4024 } 4025 4026 static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) 4027 { 4028 } 4029 4030 static void switching_to_scx(struct rq *rq, struct task_struct *p) 4031 { 4032 struct scx_sched *sch = scx_root; 4033 4034 scx_enable_task(p); 4035 4036 /* 4037 * set_cpus_allowed_scx() is not called while @p is associated with a 4038 * different scheduler class. Keep the BPF scheduler up-to-date. 4039 */ 4040 if (SCX_HAS_OP(sch, set_cpumask)) 4041 SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq, 4042 p, (struct cpumask *)p->cpus_ptr); 4043 } 4044 4045 static void switched_from_scx(struct rq *rq, struct task_struct *p) 4046 { 4047 scx_disable_task(p); 4048 } 4049 4050 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} 4051 static void switched_to_scx(struct rq *rq, struct task_struct *p) {} 4052 4053 int scx_check_setscheduler(struct task_struct *p, int policy) 4054 { 4055 lockdep_assert_rq_held(task_rq(p)); 4056 4057 /* if disallow, reject transitioning into SCX */ 4058 if (scx_enabled() && READ_ONCE(p->scx.disallow) && 4059 p->policy != policy && policy == SCHED_EXT) 4060 return -EACCES; 4061 4062 return 0; 4063 } 4064 4065 #ifdef CONFIG_NO_HZ_FULL 4066 bool scx_can_stop_tick(struct rq *rq) 4067 { 4068 struct task_struct *p = rq->curr; 4069 4070 if (scx_rq_bypassing(rq)) 4071 return false; 4072 4073 if (p->sched_class != &ext_sched_class) 4074 return true; 4075 4076 /* 4077 * @rq can dispatch from different DSQs, so we can't tell whether it 4078 * needs the tick or not by looking at nr_running. Allow stopping ticks 4079 * iff the BPF scheduler indicated so. See set_next_task_scx(). 
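 *
 * The indication is the slice: the BPF scheduler dispatches the running
 * task with an infinite slice, i.e. p->scx.slice == SCX_SLICE_INF, which
 * set_next_task_scx() translates into %SCX_RQ_CAN_STOP_TICK.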
4080 */ 4081 return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; 4082 } 4083 #endif 4084 4085 #ifdef CONFIG_EXT_GROUP_SCHED 4086 4087 DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); 4088 static bool scx_cgroup_enabled; 4089 4090 void scx_tg_init(struct task_group *tg) 4091 { 4092 tg->scx.weight = CGROUP_WEIGHT_DFL; 4093 tg->scx.bw_period_us = default_bw_period_us(); 4094 tg->scx.bw_quota_us = RUNTIME_INF; 4095 } 4096 4097 int scx_tg_online(struct task_group *tg) 4098 { 4099 struct scx_sched *sch = scx_root; 4100 int ret = 0; 4101 4102 WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); 4103 4104 percpu_down_read(&scx_cgroup_rwsem); 4105 4106 if (scx_cgroup_enabled) { 4107 if (SCX_HAS_OP(sch, cgroup_init)) { 4108 struct scx_cgroup_init_args args = 4109 { .weight = tg->scx.weight, 4110 .bw_period_us = tg->scx.bw_period_us, 4111 .bw_quota_us = tg->scx.bw_quota_us, 4112 .bw_burst_us = tg->scx.bw_burst_us }; 4113 4114 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, 4115 NULL, tg->css.cgroup, &args); 4116 if (ret) 4117 ret = ops_sanitize_err(sch, "cgroup_init", ret); 4118 } 4119 if (ret == 0) 4120 tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED; 4121 } else { 4122 tg->scx.flags |= SCX_TG_ONLINE; 4123 } 4124 4125 percpu_up_read(&scx_cgroup_rwsem); 4126 return ret; 4127 } 4128 4129 void scx_tg_offline(struct task_group *tg) 4130 { 4131 struct scx_sched *sch = scx_root; 4132 4133 WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); 4134 4135 percpu_down_read(&scx_cgroup_rwsem); 4136 4137 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && 4138 (tg->scx.flags & SCX_TG_INITED)) 4139 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, 4140 tg->css.cgroup); 4141 tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); 4142 4143 percpu_up_read(&scx_cgroup_rwsem); 4144 } 4145 4146 int scx_cgroup_can_attach(struct cgroup_taskset *tset) 4147 { 4148 struct scx_sched *sch = scx_root; 4149 struct cgroup_subsys_state *css; 4150 struct task_struct *p; 4151 int ret; 4152 4153 /* released in scx_finish/cancel_attach() */ 4154 percpu_down_read(&scx_cgroup_rwsem); 4155 4156 if (!scx_cgroup_enabled) 4157 return 0; 4158 4159 cgroup_taskset_for_each(p, css, tset) { 4160 struct cgroup *from = tg_cgrp(task_group(p)); 4161 struct cgroup *to = tg_cgrp(css_tg(css)); 4162 4163 WARN_ON_ONCE(p->scx.cgrp_moving_from); 4164 4165 /* 4166 * sched_move_task() omits identity migrations. Let's match the 4167 * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() 4168 * always match one-to-one. 4169 */ 4170 if (from == to) 4171 continue; 4172 4173 if (SCX_HAS_OP(sch, cgroup_prep_move)) { 4174 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, 4175 cgroup_prep_move, NULL, 4176 p, from, css->cgroup); 4177 if (ret) 4178 goto err; 4179 } 4180 4181 p->scx.cgrp_moving_from = from; 4182 } 4183 4184 return 0; 4185 4186 err: 4187 cgroup_taskset_for_each(p, css, tset) { 4188 if (SCX_HAS_OP(sch, cgroup_cancel_move) && 4189 p->scx.cgrp_moving_from) 4190 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, 4191 p, p->scx.cgrp_moving_from, css->cgroup); 4192 p->scx.cgrp_moving_from = NULL; 4193 } 4194 4195 percpu_up_read(&scx_cgroup_rwsem); 4196 return ops_sanitize_err(sch, "cgroup_prep_move", ret); 4197 } 4198 4199 void scx_cgroup_move_task(struct task_struct *p) 4200 { 4201 struct scx_sched *sch = scx_root; 4202 4203 if (!scx_cgroup_enabled) 4204 return; 4205 4206 /* 4207 * @p must have ops.cgroup_prep_move() called on it and thus 4208 * cgrp_moving_from set. 
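 * scx_cgroup_can_attach() is what sets it and scx_cgroup_cancel_attach()
 * clears it again if the migration is aborted.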
4209 */ 4210 if (SCX_HAS_OP(sch, cgroup_move) && 4211 !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) 4212 SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL, 4213 p, p->scx.cgrp_moving_from, 4214 tg_cgrp(task_group(p))); 4215 p->scx.cgrp_moving_from = NULL; 4216 } 4217 4218 void scx_cgroup_finish_attach(void) 4219 { 4220 percpu_up_read(&scx_cgroup_rwsem); 4221 } 4222 4223 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) 4224 { 4225 struct scx_sched *sch = scx_root; 4226 struct cgroup_subsys_state *css; 4227 struct task_struct *p; 4228 4229 if (!scx_cgroup_enabled) 4230 goto out_unlock; 4231 4232 cgroup_taskset_for_each(p, css, tset) { 4233 if (SCX_HAS_OP(sch, cgroup_cancel_move) && 4234 p->scx.cgrp_moving_from) 4235 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, 4236 p, p->scx.cgrp_moving_from, css->cgroup); 4237 p->scx.cgrp_moving_from = NULL; 4238 } 4239 out_unlock: 4240 percpu_up_read(&scx_cgroup_rwsem); 4241 } 4242 4243 void scx_group_set_weight(struct task_group *tg, unsigned long weight) 4244 { 4245 struct scx_sched *sch = scx_root; 4246 4247 percpu_down_read(&scx_cgroup_rwsem); 4248 4249 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && 4250 tg->scx.weight != weight) 4251 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, 4252 tg_cgrp(tg), weight); 4253 4254 tg->scx.weight = weight; 4255 4256 percpu_up_read(&scx_cgroup_rwsem); 4257 } 4258 4259 void scx_group_set_idle(struct task_group *tg, bool idle) 4260 { 4261 /* TODO: Implement ops->cgroup_set_idle() */ 4262 } 4263 4264 void scx_group_set_bandwidth(struct task_group *tg, 4265 u64 period_us, u64 quota_us, u64 burst_us) 4266 { 4267 struct scx_sched *sch = scx_root; 4268 4269 percpu_down_read(&scx_cgroup_rwsem); 4270 4271 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && 4272 (tg->scx.bw_period_us != period_us || 4273 tg->scx.bw_quota_us != quota_us || 4274 tg->scx.bw_burst_us != burst_us)) 4275 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL, 4276 tg_cgrp(tg), period_us, quota_us, burst_us); 4277 4278 tg->scx.bw_period_us = period_us; 4279 tg->scx.bw_quota_us = quota_us; 4280 tg->scx.bw_burst_us = burst_us; 4281 4282 percpu_up_read(&scx_cgroup_rwsem); 4283 } 4284 4285 static void scx_cgroup_lock(void) 4286 { 4287 percpu_down_write(&scx_cgroup_rwsem); 4288 } 4289 4290 static void scx_cgroup_unlock(void) 4291 { 4292 percpu_up_write(&scx_cgroup_rwsem); 4293 } 4294 4295 #else /* CONFIG_EXT_GROUP_SCHED */ 4296 4297 static inline void scx_cgroup_lock(void) {} 4298 static inline void scx_cgroup_unlock(void) {} 4299 4300 #endif /* CONFIG_EXT_GROUP_SCHED */ 4301 4302 /* 4303 * Omitted operations: 4304 * 4305 * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task 4306 * isn't tied to the CPU at that point. Preemption is implemented by resetting 4307 * the victim task's slice to 0 and triggering reschedule on the target CPU. 4308 * 4309 * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. 4310 * 4311 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of 4312 * their current sched_class. Call them directly from sched core instead. 
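 *   (See scx_fork()/scx_post_fork()/scx_cancel_fork() and sched_ext_free()
 *   above for the corresponding direct hooks.)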
4313 */ 4314 DEFINE_SCHED_CLASS(ext) = { 4315 .enqueue_task = enqueue_task_scx, 4316 .dequeue_task = dequeue_task_scx, 4317 .yield_task = yield_task_scx, 4318 .yield_to_task = yield_to_task_scx, 4319 4320 .wakeup_preempt = wakeup_preempt_scx, 4321 4322 .balance = balance_scx, 4323 .pick_task = pick_task_scx, 4324 4325 .put_prev_task = put_prev_task_scx, 4326 .set_next_task = set_next_task_scx, 4327 4328 .select_task_rq = select_task_rq_scx, 4329 .task_woken = task_woken_scx, 4330 .set_cpus_allowed = set_cpus_allowed_scx, 4331 4332 .rq_online = rq_online_scx, 4333 .rq_offline = rq_offline_scx, 4334 4335 .task_tick = task_tick_scx, 4336 4337 .switching_to = switching_to_scx, 4338 .switched_from = switched_from_scx, 4339 .switched_to = switched_to_scx, 4340 .reweight_task = reweight_task_scx, 4341 .prio_changed = prio_changed_scx, 4342 4343 .update_curr = update_curr_scx, 4344 4345 #ifdef CONFIG_UCLAMP_TASK 4346 .uclamp_enabled = 1, 4347 #endif 4348 }; 4349 4350 static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) 4351 { 4352 memset(dsq, 0, sizeof(*dsq)); 4353 4354 raw_spin_lock_init(&dsq->lock); 4355 INIT_LIST_HEAD(&dsq->list); 4356 dsq->id = dsq_id; 4357 } 4358 4359 static void free_dsq_irq_workfn(struct irq_work *irq_work) 4360 { 4361 struct llist_node *to_free = llist_del_all(&dsqs_to_free); 4362 struct scx_dispatch_q *dsq, *tmp_dsq; 4363 4364 llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) 4365 kfree_rcu(dsq, rcu); 4366 } 4367 4368 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); 4369 4370 static void destroy_dsq(struct scx_sched *sch, u64 dsq_id) 4371 { 4372 struct scx_dispatch_q *dsq; 4373 unsigned long flags; 4374 4375 rcu_read_lock(); 4376 4377 dsq = find_user_dsq(sch, dsq_id); 4378 if (!dsq) 4379 goto out_unlock_rcu; 4380 4381 raw_spin_lock_irqsave(&dsq->lock, flags); 4382 4383 if (dsq->nr) { 4384 scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)", 4385 dsq->id, dsq->nr); 4386 goto out_unlock_dsq; 4387 } 4388 4389 if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node, 4390 dsq_hash_params)) 4391 goto out_unlock_dsq; 4392 4393 /* 4394 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from 4395 * queueing more tasks. As this function can be called from anywhere, 4396 * freeing is bounced through an irq work to avoid nesting RCU 4397 * operations inside scheduler locks. 4398 */ 4399 dsq->id = SCX_DSQ_INVALID; 4400 llist_add(&dsq->free_node, &dsqs_to_free); 4401 irq_work_queue(&free_dsq_irq_work); 4402 4403 out_unlock_dsq: 4404 raw_spin_unlock_irqrestore(&dsq->lock, flags); 4405 out_unlock_rcu: 4406 rcu_read_unlock(); 4407 } 4408 4409 #ifdef CONFIG_EXT_GROUP_SCHED 4410 static void scx_cgroup_exit(struct scx_sched *sch) 4411 { 4412 struct cgroup_subsys_state *css; 4413 4414 percpu_rwsem_assert_held(&scx_cgroup_rwsem); 4415 4416 scx_cgroup_enabled = false; 4417 4418 /* 4419 * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk 4420 * cgroups and exit all the inited ones, all online cgroups are exited. 
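 *
 * The walk itself runs under rcu_read_lock() and drops it around each
 * callback invocation, which is why css_tryget()/css_put() are used to
 * keep the current position alive across the unlocked section.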
4421 */ 4422 rcu_read_lock(); 4423 css_for_each_descendant_post(css, &root_task_group.css) { 4424 struct task_group *tg = css_tg(css); 4425 4426 if (!(tg->scx.flags & SCX_TG_INITED)) 4427 continue; 4428 tg->scx.flags &= ~SCX_TG_INITED; 4429 4430 if (!sch->ops.cgroup_exit) 4431 continue; 4432 4433 if (WARN_ON_ONCE(!css_tryget(css))) 4434 continue; 4435 rcu_read_unlock(); 4436 4437 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, 4438 css->cgroup); 4439 4440 rcu_read_lock(); 4441 css_put(css); 4442 } 4443 rcu_read_unlock(); 4444 } 4445 4446 static int scx_cgroup_init(struct scx_sched *sch) 4447 { 4448 struct cgroup_subsys_state *css; 4449 int ret; 4450 4451 percpu_rwsem_assert_held(&scx_cgroup_rwsem); 4452 4453 /* 4454 * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk 4455 * cgroups and init, all online cgroups are initialized. 4456 */ 4457 rcu_read_lock(); 4458 css_for_each_descendant_pre(css, &root_task_group.css) { 4459 struct task_group *tg = css_tg(css); 4460 struct scx_cgroup_init_args args = { 4461 .weight = tg->scx.weight, 4462 .bw_period_us = tg->scx.bw_period_us, 4463 .bw_quota_us = tg->scx.bw_quota_us, 4464 .bw_burst_us = tg->scx.bw_burst_us, 4465 }; 4466 4467 if ((tg->scx.flags & 4468 (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) 4469 continue; 4470 4471 if (!sch->ops.cgroup_init) { 4472 tg->scx.flags |= SCX_TG_INITED; 4473 continue; 4474 } 4475 4476 if (WARN_ON_ONCE(!css_tryget(css))) 4477 continue; 4478 rcu_read_unlock(); 4479 4480 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, 4481 css->cgroup, &args); 4482 if (ret) { 4483 css_put(css); 4484 scx_error(sch, "ops.cgroup_init() failed (%d)", ret); 4485 return ret; 4486 } 4487 tg->scx.flags |= SCX_TG_INITED; 4488 4489 rcu_read_lock(); 4490 css_put(css); 4491 } 4492 rcu_read_unlock(); 4493 4494 WARN_ON_ONCE(scx_cgroup_enabled); 4495 scx_cgroup_enabled = true; 4496 4497 return 0; 4498 } 4499 4500 #else 4501 static void scx_cgroup_exit(struct scx_sched *sch) {} 4502 static int scx_cgroup_init(struct scx_sched *sch) { return 0; } 4503 #endif 4504 4505 4506 /******************************************************************************** 4507 * Sysfs interface and ops enable/disable. 
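 *
 * Assuming the usual "sched_ext" kset setup, the global attributes below
 * are expected to show up under /sys/kernel/sched_ext/ (state, switch_all,
 * nr_rejected, hotplug_seq, enable_seq) and the per-scheduler ones under
 * /sys/kernel/sched_ext/root/ (ops, events).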
4508 */ 4509 4510 #define SCX_ATTR(_name) \ 4511 static struct kobj_attribute scx_attr_##_name = { \ 4512 .attr = { .name = __stringify(_name), .mode = 0444 }, \ 4513 .show = scx_attr_##_name##_show, \ 4514 } 4515 4516 static ssize_t scx_attr_state_show(struct kobject *kobj, 4517 struct kobj_attribute *ka, char *buf) 4518 { 4519 return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]); 4520 } 4521 SCX_ATTR(state); 4522 4523 static ssize_t scx_attr_switch_all_show(struct kobject *kobj, 4524 struct kobj_attribute *ka, char *buf) 4525 { 4526 return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); 4527 } 4528 SCX_ATTR(switch_all); 4529 4530 static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, 4531 struct kobj_attribute *ka, char *buf) 4532 { 4533 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); 4534 } 4535 SCX_ATTR(nr_rejected); 4536 4537 static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, 4538 struct kobj_attribute *ka, char *buf) 4539 { 4540 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); 4541 } 4542 SCX_ATTR(hotplug_seq); 4543 4544 static ssize_t scx_attr_enable_seq_show(struct kobject *kobj, 4545 struct kobj_attribute *ka, char *buf) 4546 { 4547 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq)); 4548 } 4549 SCX_ATTR(enable_seq); 4550 4551 static struct attribute *scx_global_attrs[] = { 4552 &scx_attr_state.attr, 4553 &scx_attr_switch_all.attr, 4554 &scx_attr_nr_rejected.attr, 4555 &scx_attr_hotplug_seq.attr, 4556 &scx_attr_enable_seq.attr, 4557 NULL, 4558 }; 4559 4560 static const struct attribute_group scx_global_attr_group = { 4561 .attrs = scx_global_attrs, 4562 }; 4563 4564 static void free_exit_info(struct scx_exit_info *ei); 4565 4566 static void scx_sched_free_rcu_work(struct work_struct *work) 4567 { 4568 struct rcu_work *rcu_work = to_rcu_work(work); 4569 struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work); 4570 struct rhashtable_iter rht_iter; 4571 struct scx_dispatch_q *dsq; 4572 int node; 4573 4574 kthread_stop(sch->helper->task); 4575 free_percpu(sch->event_stats_cpu); 4576 4577 for_each_node_state(node, N_POSSIBLE) 4578 kfree(sch->global_dsqs[node]); 4579 kfree(sch->global_dsqs); 4580 4581 rhashtable_walk_enter(&sch->dsq_hash, &rht_iter); 4582 do { 4583 rhashtable_walk_start(&rht_iter); 4584 4585 while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) 4586 destroy_dsq(sch, dsq->id); 4587 4588 rhashtable_walk_stop(&rht_iter); 4589 } while (dsq == ERR_PTR(-EAGAIN)); 4590 rhashtable_walk_exit(&rht_iter); 4591 4592 rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); 4593 free_exit_info(sch->exit_info); 4594 kfree(sch); 4595 } 4596 4597 static void scx_kobj_release(struct kobject *kobj) 4598 { 4599 struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 4600 4601 INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work); 4602 queue_rcu_work(system_unbound_wq, &sch->rcu_work); 4603 } 4604 4605 static ssize_t scx_attr_ops_show(struct kobject *kobj, 4606 struct kobj_attribute *ka, char *buf) 4607 { 4608 return sysfs_emit(buf, "%s\n", scx_root->ops.name); 4609 } 4610 SCX_ATTR(ops); 4611 4612 #define scx_attr_event_show(buf, at, events, kind) ({ \ 4613 sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \ 4614 }) 4615 4616 static ssize_t scx_attr_events_show(struct kobject *kobj, 4617 struct kobj_attribute *ka, char *buf) 4618 { 4619 struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 4620 struct scx_event_stats 
events; 4621 int at = 0; 4622 4623 scx_read_events(sch, &events); 4624 at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); 4625 at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 4626 at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); 4627 at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); 4628 at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 4629 at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL); 4630 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); 4631 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); 4632 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); 4633 return at; 4634 } 4635 SCX_ATTR(events); 4636 4637 static struct attribute *scx_sched_attrs[] = { 4638 &scx_attr_ops.attr, 4639 &scx_attr_events.attr, 4640 NULL, 4641 }; 4642 ATTRIBUTE_GROUPS(scx_sched); 4643 4644 static const struct kobj_type scx_ktype = { 4645 .release = scx_kobj_release, 4646 .sysfs_ops = &kobj_sysfs_ops, 4647 .default_groups = scx_sched_groups, 4648 }; 4649 4650 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) 4651 { 4652 return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name); 4653 } 4654 4655 static const struct kset_uevent_ops scx_uevent_ops = { 4656 .uevent = scx_uevent, 4657 }; 4658 4659 /* 4660 * Used by sched_fork() and __setscheduler_prio() to pick the matching 4661 * sched_class. dl/rt are already handled. 4662 */ 4663 bool task_should_scx(int policy) 4664 { 4665 if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING)) 4666 return false; 4667 if (READ_ONCE(scx_switching_all)) 4668 return true; 4669 return policy == SCHED_EXT; 4670 } 4671 4672 bool scx_allow_ttwu_queue(const struct task_struct *p) 4673 { 4674 return !scx_enabled() || 4675 (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) || 4676 p->sched_class != &ext_sched_class; 4677 } 4678 4679 /** 4680 * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler 4681 * 4682 * While there are various reasons why RCU CPU stalls can occur on a system 4683 * that may not be caused by the current BPF scheduler, try kicking out the 4684 * current scheduler in an attempt to recover the system to a good state before 4685 * issuing panics. 4686 */ 4687 bool scx_rcu_cpu_stall(void) 4688 { 4689 struct scx_sched *sch; 4690 4691 rcu_read_lock(); 4692 4693 sch = rcu_dereference(scx_root); 4694 if (unlikely(!sch)) { 4695 rcu_read_unlock(); 4696 return false; 4697 } 4698 4699 switch (scx_enable_state()) { 4700 case SCX_ENABLING: 4701 case SCX_ENABLED: 4702 break; 4703 default: 4704 rcu_read_unlock(); 4705 return false; 4706 } 4707 4708 scx_error(sch, "RCU CPU stall detected!"); 4709 rcu_read_unlock(); 4710 4711 return true; 4712 } 4713 4714 /** 4715 * scx_softlockup - sched_ext softlockup handler 4716 * @dur_s: number of seconds of CPU stuck due to soft lockup 4717 * 4718 * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can 4719 * live-lock the system by making many CPUs target the same DSQ to the point 4720 * where soft-lockup detection triggers. This function is called from 4721 * soft-lockup watchdog when the triggering point is close and tries to unjam 4722 * the system by enabling the breather and aborting the BPF scheduler. 
4723 */
4724 void scx_softlockup(u32 dur_s)
4725 {
4726 struct scx_sched *sch;
4727
4728 rcu_read_lock();
4729
4730 sch = rcu_dereference(scx_root);
4731 if (unlikely(!sch))
4732 goto out_unlock;
4733
4734 switch (scx_enable_state()) {
4735 case SCX_ENABLING:
4736 case SCX_ENABLED:
4737 break;
4738 default:
4739 goto out_unlock;
4740 }
4741
4742 /* allow only one instance, cleared at the end of scx_bypass() */
4743 if (test_and_set_bit(0, &scx_in_softlockup))
4744 goto out_unlock;
4745
4746 printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
4747 smp_processor_id(), dur_s, scx_root->ops.name);
4748
4749 /*
4750 * Some CPUs may be trapped in the dispatch paths. Enable breather
4751 * immediately; otherwise, we might not even be able to get to scx_bypass().
4752 */
4753 atomic_inc(&scx_breather_depth);
4754
4755 scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s);
4756 out_unlock:
4757 rcu_read_unlock();
4758 }
4759
4760 static void scx_clear_softlockup(void)
4761 {
4762 if (test_and_clear_bit(0, &scx_in_softlockup))
4763 atomic_dec(&scx_breather_depth);
4764 }
4765
4766 /**
4767 * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
4768 * @bypass: true for bypass, false for unbypass
4769 *
4770 * Bypassing guarantees that all runnable tasks make forward progress without
4771 * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might
4772 * be held by tasks that the BPF scheduler is forgetting to run, which
4773 * unfortunately also excludes toggling the static branches.
4774 *
4775 * Let's work around it by overriding a couple of ops and modifying behaviors based on
4776 * the DISABLING state and then cycling the queued tasks through dequeue/enqueue
4777 * to force global FIFO scheduling.
4778 *
4779 * - ops.select_cpu() is ignored and the default select_cpu() is used.
4780 *
4781 * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
4782 * %SCX_OPS_ENQ_LAST is also ignored.
4783 *
4784 * - ops.dispatch() is ignored.
4785 *
4786 * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
4787 * can't be trusted. Whenever a tick triggers, the running task is rotated to
4788 * the tail of the queue with core_sched_at touched.
4789 *
4790 * - pick_task_scx() suppresses the zero slice warning.
4791 *
4792 * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
4793 * operations.
4794 *
4795 * - scx_prio_less() reverts to the default core_sched_at order.
4796 */
4797 static void scx_bypass(bool bypass)
4798 {
4799 static DEFINE_RAW_SPINLOCK(bypass_lock);
4800 static unsigned long bypass_timestamp;
4801 struct scx_sched *sch;
4802 unsigned long flags;
4803 int cpu;
4804
4805 raw_spin_lock_irqsave(&bypass_lock, flags);
4806 sch = rcu_dereference_bh(scx_root);
4807
4808 if (bypass) {
4809 scx_bypass_depth++;
4810 WARN_ON_ONCE(scx_bypass_depth <= 0);
4811 if (scx_bypass_depth != 1)
4812 goto unlock;
4813 bypass_timestamp = ktime_get_ns();
4814 if (sch)
4815 scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
4816 } else {
4817 scx_bypass_depth--;
4818 WARN_ON_ONCE(scx_bypass_depth < 0);
4819 if (scx_bypass_depth != 0)
4820 goto unlock;
4821 if (sch)
4822 scx_add_event(sch, SCX_EV_BYPASS_DURATION,
4823 ktime_get_ns() - bypass_timestamp);
4824 }
4825
4826 atomic_inc(&scx_breather_depth);
4827
4828 /*
4829 * No task property is changing. We just need to make sure all currently
4830 * queued tasks are re-queued according to the new scx_rq_bypassing()
4831 * state.
As an optimization, walk each rq's runnable_list instead of 4832 * the scx_tasks list. 4833 * 4834 * This function can't trust the scheduler and thus can't use 4835 * cpus_read_lock(). Walk all possible CPUs instead of online. 4836 */ 4837 for_each_possible_cpu(cpu) { 4838 struct rq *rq = cpu_rq(cpu); 4839 struct task_struct *p, *n; 4840 4841 raw_spin_rq_lock(rq); 4842 4843 if (bypass) { 4844 WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); 4845 rq->scx.flags |= SCX_RQ_BYPASSING; 4846 } else { 4847 WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING)); 4848 rq->scx.flags &= ~SCX_RQ_BYPASSING; 4849 } 4850 4851 /* 4852 * We need to guarantee that no tasks are on the BPF scheduler 4853 * while bypassing. Either we see enabled or the enable path 4854 * sees scx_rq_bypassing() before moving tasks to SCX. 4855 */ 4856 if (!scx_enabled()) { 4857 raw_spin_rq_unlock(rq); 4858 continue; 4859 } 4860 4861 /* 4862 * The use of list_for_each_entry_safe_reverse() is required 4863 * because each task is going to be removed from and added back 4864 * to the runnable_list during iteration. Because they're added 4865 * to the tail of the list, safe reverse iteration can still 4866 * visit all nodes. 4867 */ 4868 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, 4869 scx.runnable_node) { 4870 struct sched_enq_and_set_ctx ctx; 4871 4872 /* cycling deq/enq is enough, see the function comment */ 4873 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 4874 sched_enq_and_set_task(&ctx); 4875 } 4876 4877 /* resched to restore ticks and idle state */ 4878 if (cpu_online(cpu) || cpu == smp_processor_id()) 4879 resched_curr(rq); 4880 4881 raw_spin_rq_unlock(rq); 4882 } 4883 4884 atomic_dec(&scx_breather_depth); 4885 unlock: 4886 raw_spin_unlock_irqrestore(&bypass_lock, flags); 4887 scx_clear_softlockup(); 4888 } 4889 4890 static void free_exit_info(struct scx_exit_info *ei) 4891 { 4892 kvfree(ei->dump); 4893 kfree(ei->msg); 4894 kfree(ei->bt); 4895 kfree(ei); 4896 } 4897 4898 static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) 4899 { 4900 struct scx_exit_info *ei; 4901 4902 ei = kzalloc(sizeof(*ei), GFP_KERNEL); 4903 if (!ei) 4904 return NULL; 4905 4906 ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL); 4907 ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); 4908 ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); 4909 4910 if (!ei->bt || !ei->msg || !ei->dump) { 4911 free_exit_info(ei); 4912 return NULL; 4913 } 4914 4915 return ei; 4916 } 4917 4918 static const char *scx_exit_reason(enum scx_exit_kind kind) 4919 { 4920 switch (kind) { 4921 case SCX_EXIT_UNREG: 4922 return "unregistered from user space"; 4923 case SCX_EXIT_UNREG_BPF: 4924 return "unregistered from BPF"; 4925 case SCX_EXIT_UNREG_KERN: 4926 return "unregistered from the main kernel"; 4927 case SCX_EXIT_SYSRQ: 4928 return "disabled by sysrq-S"; 4929 case SCX_EXIT_ERROR: 4930 return "runtime error"; 4931 case SCX_EXIT_ERROR_BPF: 4932 return "scx_bpf_error"; 4933 case SCX_EXIT_ERROR_STALL: 4934 return "runnable task stall"; 4935 default: 4936 return "<UNKNOWN>"; 4937 } 4938 } 4939 4940 static void scx_disable_workfn(struct kthread_work *work) 4941 { 4942 struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); 4943 struct scx_exit_info *ei = sch->exit_info; 4944 struct scx_task_iter sti; 4945 struct task_struct *p; 4946 int kind, cpu; 4947 4948 kind = atomic_read(&sch->exit_kind); 4949 while (true) { 4950 if (kind == SCX_EXIT_DONE) /* already disabled? 
*/ 4951 return; 4952 WARN_ON_ONCE(kind == SCX_EXIT_NONE); 4953 if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) 4954 break; 4955 } 4956 ei->kind = kind; 4957 ei->reason = scx_exit_reason(ei->kind); 4958 4959 /* guarantee forward progress by bypassing scx_ops */ 4960 scx_bypass(true); 4961 4962 switch (scx_set_enable_state(SCX_DISABLING)) { 4963 case SCX_DISABLING: 4964 WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); 4965 break; 4966 case SCX_DISABLED: 4967 pr_warn("sched_ext: ops error detected without ops (%s)\n", 4968 sch->exit_info->msg); 4969 WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); 4970 goto done; 4971 default: 4972 break; 4973 } 4974 4975 /* 4976 * Here, every runnable task is guaranteed to make forward progress and 4977 * we can safely use blocking synchronization constructs. Actually 4978 * disable ops. 4979 */ 4980 mutex_lock(&scx_enable_mutex); 4981 4982 static_branch_disable(&__scx_switched_all); 4983 WRITE_ONCE(scx_switching_all, false); 4984 4985 /* 4986 * Shut down cgroup support before tasks so that the cgroup attach path 4987 * doesn't race against scx_exit_task(). 4988 */ 4989 scx_cgroup_lock(); 4990 scx_cgroup_exit(sch); 4991 scx_cgroup_unlock(); 4992 4993 /* 4994 * The BPF scheduler is going away. All tasks including %TASK_DEAD ones 4995 * must be switched out and exited synchronously. 4996 */ 4997 percpu_down_write(&scx_fork_rwsem); 4998 4999 scx_init_task_enabled = false; 5000 5001 scx_task_iter_start(&sti); 5002 while ((p = scx_task_iter_next_locked(&sti))) { 5003 const struct sched_class *old_class = p->sched_class; 5004 const struct sched_class *new_class = 5005 __setscheduler_class(p->policy, p->prio); 5006 struct sched_enq_and_set_ctx ctx; 5007 5008 if (old_class != new_class && p->se.sched_delayed) 5009 dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 5010 5011 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 5012 5013 p->sched_class = new_class; 5014 check_class_changing(task_rq(p), p, old_class); 5015 5016 sched_enq_and_set_task(&ctx); 5017 5018 check_class_changed(task_rq(p), p, old_class, p->prio); 5019 scx_exit_task(p); 5020 } 5021 scx_task_iter_stop(&sti); 5022 percpu_up_write(&scx_fork_rwsem); 5023 5024 /* 5025 * Invalidate all the rq clocks to prevent getting outdated 5026 * rq clocks from a previous scx scheduler. 5027 */ 5028 for_each_possible_cpu(cpu) { 5029 struct rq *rq = cpu_rq(cpu); 5030 scx_rq_clock_invalidate(rq); 5031 } 5032 5033 /* no task is on scx, turn off all the switches and flush in-progress calls */ 5034 static_branch_disable(&__scx_enabled); 5035 bitmap_zero(sch->has_op, SCX_OPI_END); 5036 scx_idle_disable(); 5037 synchronize_rcu(); 5038 5039 if (ei->kind >= SCX_EXIT_ERROR) { 5040 pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", 5041 sch->ops.name, ei->reason); 5042 5043 if (ei->msg[0] != '\0') 5044 pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg); 5045 #ifdef CONFIG_STACKTRACE 5046 stack_trace_print(ei->bt, ei->bt_len, 2); 5047 #endif 5048 } else { 5049 pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", 5050 sch->ops.name, ei->reason); 5051 } 5052 5053 if (sch->ops.exit) 5054 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei); 5055 5056 cancel_delayed_work_sync(&scx_watchdog_work); 5057 5058 /* 5059 * scx_root clearing must be inside cpus_read_lock(). See 5060 * handle_hotplug(). 
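 * (handle_hotplug() runs from the hotplug path with cpu_hotplug_lock
 * held, so clearing the pointer under cpus_read_lock() is what keeps the
 * two from racing.)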
5061 */ 5062 cpus_read_lock(); 5063 RCU_INIT_POINTER(scx_root, NULL); 5064 cpus_read_unlock(); 5065 5066 /* 5067 * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs 5068 * could observe an object of the same name still in the hierarchy when 5069 * the next scheduler is loaded. 5070 */ 5071 kobject_del(&sch->kobj); 5072 5073 free_percpu(scx_dsp_ctx); 5074 scx_dsp_ctx = NULL; 5075 scx_dsp_max_batch = 0; 5076 5077 mutex_unlock(&scx_enable_mutex); 5078 5079 WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); 5080 done: 5081 scx_bypass(false); 5082 } 5083 5084 static void scx_disable(enum scx_exit_kind kind) 5085 { 5086 int none = SCX_EXIT_NONE; 5087 struct scx_sched *sch; 5088 5089 if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) 5090 kind = SCX_EXIT_ERROR; 5091 5092 rcu_read_lock(); 5093 sch = rcu_dereference(scx_root); 5094 if (sch) { 5095 atomic_try_cmpxchg(&sch->exit_kind, &none, kind); 5096 kthread_queue_work(sch->helper, &sch->disable_work); 5097 } 5098 rcu_read_unlock(); 5099 } 5100 5101 static void dump_newline(struct seq_buf *s) 5102 { 5103 trace_sched_ext_dump(""); 5104 5105 /* @s may be zero sized and seq_buf triggers WARN if so */ 5106 if (s->size) 5107 seq_buf_putc(s, '\n'); 5108 } 5109 5110 static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) 5111 { 5112 va_list args; 5113 5114 #ifdef CONFIG_TRACEPOINTS 5115 if (trace_sched_ext_dump_enabled()) { 5116 /* protected by scx_dump_state()::dump_lock */ 5117 static char line_buf[SCX_EXIT_MSG_LEN]; 5118 5119 va_start(args, fmt); 5120 vscnprintf(line_buf, sizeof(line_buf), fmt, args); 5121 va_end(args); 5122 5123 trace_sched_ext_dump(line_buf); 5124 } 5125 #endif 5126 /* @s may be zero sized and seq_buf triggers WARN if so */ 5127 if (s->size) { 5128 va_start(args, fmt); 5129 seq_buf_vprintf(s, fmt, args); 5130 va_end(args); 5131 5132 seq_buf_putc(s, '\n'); 5133 } 5134 } 5135 5136 static void dump_stack_trace(struct seq_buf *s, const char *prefix, 5137 const unsigned long *bt, unsigned int len) 5138 { 5139 unsigned int i; 5140 5141 for (i = 0; i < len; i++) 5142 dump_line(s, "%s%pS", prefix, (void *)bt[i]); 5143 } 5144 5145 static void ops_dump_init(struct seq_buf *s, const char *prefix) 5146 { 5147 struct scx_dump_data *dd = &scx_dump_data; 5148 5149 lockdep_assert_irqs_disabled(); 5150 5151 dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ 5152 dd->first = true; 5153 dd->cursor = 0; 5154 dd->s = s; 5155 dd->prefix = prefix; 5156 } 5157 5158 static void ops_dump_flush(void) 5159 { 5160 struct scx_dump_data *dd = &scx_dump_data; 5161 char *line = dd->buf.line; 5162 5163 if (!dd->cursor) 5164 return; 5165 5166 /* 5167 * There's something to flush and this is the first line. Insert a blank 5168 * line to distinguish ops dump. 5169 */ 5170 if (dd->first) { 5171 dump_newline(dd->s); 5172 dd->first = false; 5173 } 5174 5175 /* 5176 * There may be multiple lines in $line. Scan and emit each line 5177 * separately. 5178 */ 5179 while (true) { 5180 char *end = line; 5181 char c; 5182 5183 while (*end != '\n' && *end != '\0') 5184 end++; 5185 5186 /* 5187 * If $line overflowed, it may not have newline at the end. 5188 * Always emit with a newline. 
5189 */ 5190 c = *end; 5191 *end = '\0'; 5192 dump_line(dd->s, "%s%s", dd->prefix, line); 5193 if (c == '\0') 5194 break; 5195 5196 /* move to the next line */ 5197 end++; 5198 if (*end == '\0') 5199 break; 5200 line = end; 5201 } 5202 5203 dd->cursor = 0; 5204 } 5205 5206 static void ops_dump_exit(void) 5207 { 5208 ops_dump_flush(); 5209 scx_dump_data.cpu = -1; 5210 } 5211 5212 static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, 5213 struct task_struct *p, char marker) 5214 { 5215 static unsigned long bt[SCX_EXIT_BT_LEN]; 5216 struct scx_sched *sch = scx_root; 5217 char dsq_id_buf[19] = "(n/a)"; 5218 unsigned long ops_state = atomic_long_read(&p->scx.ops_state); 5219 unsigned int bt_len = 0; 5220 5221 if (p->scx.dsq) 5222 scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", 5223 (unsigned long long)p->scx.dsq->id); 5224 5225 dump_newline(s); 5226 dump_line(s, " %c%c %s[%d] %+ldms", 5227 marker, task_state_to_char(p), p->comm, p->pid, 5228 jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); 5229 dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", 5230 scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, 5231 p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, 5232 ops_state >> SCX_OPSS_QSEQ_SHIFT); 5233 dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", 5234 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); 5235 dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", 5236 p->scx.dsq_vtime, p->scx.slice, p->scx.weight); 5237 dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); 5238 5239 if (SCX_HAS_OP(sch, dump_task)) { 5240 ops_dump_init(s, " "); 5241 SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p); 5242 ops_dump_exit(); 5243 } 5244 5245 #ifdef CONFIG_STACKTRACE 5246 bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); 5247 #endif 5248 if (bt_len) { 5249 dump_newline(s); 5250 dump_stack_trace(s, " ", bt, bt_len); 5251 } 5252 } 5253 5254 static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) 5255 { 5256 static DEFINE_SPINLOCK(dump_lock); 5257 static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; 5258 struct scx_sched *sch = scx_root; 5259 struct scx_dump_ctx dctx = { 5260 .kind = ei->kind, 5261 .exit_code = ei->exit_code, 5262 .reason = ei->reason, 5263 .at_ns = ktime_get_ns(), 5264 .at_jiffies = jiffies, 5265 }; 5266 struct seq_buf s; 5267 struct scx_event_stats events; 5268 unsigned long flags; 5269 char *buf; 5270 int cpu; 5271 5272 spin_lock_irqsave(&dump_lock, flags); 5273 5274 seq_buf_init(&s, ei->dump, dump_len); 5275 5276 if (ei->kind == SCX_EXIT_NONE) { 5277 dump_line(&s, "Debug dump triggered by %s", ei->reason); 5278 } else { 5279 dump_line(&s, "%s[%d] triggered exit kind %d:", 5280 current->comm, current->pid, ei->kind); 5281 dump_line(&s, " %s (%s)", ei->reason, ei->msg); 5282 dump_newline(&s); 5283 dump_line(&s, "Backtrace:"); 5284 dump_stack_trace(&s, " ", ei->bt, ei->bt_len); 5285 } 5286 5287 if (SCX_HAS_OP(sch, dump)) { 5288 ops_dump_init(&s, ""); 5289 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx); 5290 ops_dump_exit(); 5291 } 5292 5293 dump_newline(&s); 5294 dump_line(&s, "CPU states"); 5295 dump_line(&s, "----------"); 5296 5297 for_each_possible_cpu(cpu) { 5298 struct rq *rq = cpu_rq(cpu); 5299 struct rq_flags rf; 5300 struct task_struct *p; 5301 struct seq_buf ns; 5302 size_t avail, used; 5303 bool idle; 5304 5305 rq_lock(rq, &rf); 5306 5307 idle = list_empty(&rq->scx.runnable_list) && 5308 rq->curr->sched_class == &idle_sched_class; 5309 5310 if (idle && 
!SCX_HAS_OP(sch, dump_cpu)) 5311 goto next; 5312 5313 /* 5314 * We don't yet know whether ops.dump_cpu() will produce output 5315 * and we may want to skip the default CPU dump if it doesn't. 5316 * Use a nested seq_buf to generate the standard dump so that we 5317 * can decide whether to commit later. 5318 */ 5319 avail = seq_buf_get_buf(&s, &buf); 5320 seq_buf_init(&ns, buf, avail); 5321 5322 dump_newline(&ns); 5323 dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", 5324 cpu, rq->scx.nr_running, rq->scx.flags, 5325 rq->scx.cpu_released, rq->scx.ops_qseq, 5326 rq->scx.pnt_seq); 5327 dump_line(&ns, " curr=%s[%d] class=%ps", 5328 rq->curr->comm, rq->curr->pid, 5329 rq->curr->sched_class); 5330 if (!cpumask_empty(rq->scx.cpus_to_kick)) 5331 dump_line(&ns, " cpus_to_kick : %*pb", 5332 cpumask_pr_args(rq->scx.cpus_to_kick)); 5333 if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) 5334 dump_line(&ns, " idle_to_kick : %*pb", 5335 cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); 5336 if (!cpumask_empty(rq->scx.cpus_to_preempt)) 5337 dump_line(&ns, " cpus_to_preempt: %*pb", 5338 cpumask_pr_args(rq->scx.cpus_to_preempt)); 5339 if (!cpumask_empty(rq->scx.cpus_to_wait)) 5340 dump_line(&ns, " cpus_to_wait : %*pb", 5341 cpumask_pr_args(rq->scx.cpus_to_wait)); 5342 5343 used = seq_buf_used(&ns); 5344 if (SCX_HAS_OP(sch, dump_cpu)) { 5345 ops_dump_init(&ns, " "); 5346 SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL, 5347 &dctx, cpu, idle); 5348 ops_dump_exit(); 5349 } 5350 5351 /* 5352 * If idle && nothing generated by ops.dump_cpu(), there's 5353 * nothing interesting. Skip. 5354 */ 5355 if (idle && used == seq_buf_used(&ns)) 5356 goto next; 5357 5358 /* 5359 * $s may already have overflowed when $ns was created. If so, 5360 * calling commit on it will trigger BUG. 
5361 */ 5362 if (avail) { 5363 seq_buf_commit(&s, seq_buf_used(&ns)); 5364 if (seq_buf_has_overflowed(&ns)) 5365 seq_buf_set_overflow(&s); 5366 } 5367 5368 if (rq->curr->sched_class == &ext_sched_class) 5369 scx_dump_task(&s, &dctx, rq->curr, '*'); 5370 5371 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) 5372 scx_dump_task(&s, &dctx, p, ' '); 5373 next: 5374 rq_unlock(rq, &rf); 5375 } 5376 5377 dump_newline(&s); 5378 dump_line(&s, "Event counters"); 5379 dump_line(&s, "--------------"); 5380 5381 scx_read_events(sch, &events); 5382 scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); 5383 scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 5384 scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); 5385 scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); 5386 scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 5387 scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL); 5388 scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); 5389 scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); 5390 scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); 5391 5392 if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) 5393 memcpy(ei->dump + dump_len - sizeof(trunc_marker), 5394 trunc_marker, sizeof(trunc_marker)); 5395 5396 spin_unlock_irqrestore(&dump_lock, flags); 5397 } 5398 5399 static void scx_error_irq_workfn(struct irq_work *irq_work) 5400 { 5401 struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work); 5402 struct scx_exit_info *ei = sch->exit_info; 5403 5404 if (ei->kind >= SCX_EXIT_ERROR) 5405 scx_dump_state(ei, sch->ops.exit_dump_len); 5406 5407 kthread_queue_work(sch->helper, &sch->disable_work); 5408 } 5409 5410 static void scx_vexit(struct scx_sched *sch, 5411 enum scx_exit_kind kind, s64 exit_code, 5412 const char *fmt, va_list args) 5413 { 5414 struct scx_exit_info *ei = sch->exit_info; 5415 int none = SCX_EXIT_NONE; 5416 5417 if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) 5418 return; 5419 5420 ei->exit_code = exit_code; 5421 #ifdef CONFIG_STACKTRACE 5422 if (kind >= SCX_EXIT_ERROR) 5423 ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); 5424 #endif 5425 vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); 5426 5427 /* 5428 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again 5429 * in scx_disable_workfn(). 
5430 */ 5431 ei->kind = kind; 5432 ei->reason = scx_exit_reason(ei->kind); 5433 5434 irq_work_queue(&sch->error_irq_work); 5435 } 5436 5437 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) 5438 { 5439 struct scx_sched *sch; 5440 int node, ret; 5441 5442 sch = kzalloc(sizeof(*sch), GFP_KERNEL); 5443 if (!sch) 5444 return ERR_PTR(-ENOMEM); 5445 5446 sch->exit_info = alloc_exit_info(ops->exit_dump_len); 5447 if (!sch->exit_info) { 5448 ret = -ENOMEM; 5449 goto err_free_sch; 5450 } 5451 5452 ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params); 5453 if (ret < 0) 5454 goto err_free_ei; 5455 5456 sch->global_dsqs = kcalloc(nr_node_ids, sizeof(sch->global_dsqs[0]), 5457 GFP_KERNEL); 5458 if (!sch->global_dsqs) { 5459 ret = -ENOMEM; 5460 goto err_free_hash; 5461 } 5462 5463 for_each_node_state(node, N_POSSIBLE) { 5464 struct scx_dispatch_q *dsq; 5465 5466 dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); 5467 if (!dsq) { 5468 ret = -ENOMEM; 5469 goto err_free_gdsqs; 5470 } 5471 5472 init_dsq(dsq, SCX_DSQ_GLOBAL); 5473 sch->global_dsqs[node] = dsq; 5474 } 5475 5476 sch->event_stats_cpu = alloc_percpu(struct scx_event_stats); 5477 if (!sch->event_stats_cpu) 5478 goto err_free_gdsqs; 5479 5480 sch->helper = kthread_run_worker(0, "sched_ext_helper"); 5481 if (!sch->helper) 5482 goto err_free_event_stats; 5483 sched_set_fifo(sch->helper->task); 5484 5485 atomic_set(&sch->exit_kind, SCX_EXIT_NONE); 5486 init_irq_work(&sch->error_irq_work, scx_error_irq_workfn); 5487 kthread_init_work(&sch->disable_work, scx_disable_workfn); 5488 sch->ops = *ops; 5489 ops->priv = sch; 5490 5491 sch->kobj.kset = scx_kset; 5492 ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); 5493 if (ret < 0) 5494 goto err_stop_helper; 5495 5496 return sch; 5497 5498 err_stop_helper: 5499 kthread_stop(sch->helper->task); 5500 err_free_event_stats: 5501 free_percpu(sch->event_stats_cpu); 5502 err_free_gdsqs: 5503 for_each_node_state(node, N_POSSIBLE) 5504 kfree(sch->global_dsqs[node]); 5505 kfree(sch->global_dsqs); 5506 err_free_hash: 5507 rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); 5508 err_free_ei: 5509 free_exit_info(sch->exit_info); 5510 err_free_sch: 5511 kfree(sch); 5512 return ERR_PTR(ret); 5513 } 5514 5515 static void check_hotplug_seq(struct scx_sched *sch, 5516 const struct sched_ext_ops *ops) 5517 { 5518 unsigned long long global_hotplug_seq; 5519 5520 /* 5521 * If a hotplug event has occurred between when a scheduler was 5522 * initialized, and when we were able to attach, exit and notify user 5523 * space about it. 5524 */ 5525 if (ops->hotplug_seq) { 5526 global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); 5527 if (ops->hotplug_seq != global_hotplug_seq) { 5528 scx_exit(sch, SCX_EXIT_UNREG_KERN, 5529 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 5530 "expected hotplug seq %llu did not match actual %llu", 5531 ops->hotplug_seq, global_hotplug_seq); 5532 } 5533 } 5534 } 5535 5536 static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) 5537 { 5538 /* 5539 * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the 5540 * ops.enqueue() callback isn't implemented. 5541 */ 5542 if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { 5543 scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); 5544 return -EINVAL; 5545 } 5546 5547 /* 5548 * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle 5549 * selection policy to be enabled. 
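 * That is, implementing ops.update_idle() without %SCX_OPS_KEEP_BUILTIN_IDLE
 * disables the built-in idle tracking which the per-node idle cpumasks are
 * built on, so the combination is rejected below.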
5550 */ 5551 if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && 5552 (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { 5553 scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); 5554 return -EINVAL; 5555 } 5556 5557 if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) 5558 pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); 5559 5560 return 0; 5561 } 5562 5563 static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) 5564 { 5565 struct scx_sched *sch; 5566 struct scx_task_iter sti; 5567 struct task_struct *p; 5568 unsigned long timeout; 5569 int i, cpu, ret; 5570 5571 if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), 5572 cpu_possible_mask)) { 5573 pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); 5574 return -EINVAL; 5575 } 5576 5577 mutex_lock(&scx_enable_mutex); 5578 5579 if (scx_enable_state() != SCX_DISABLED) { 5580 ret = -EBUSY; 5581 goto err_unlock; 5582 } 5583 5584 sch = scx_alloc_and_add_sched(ops); 5585 if (IS_ERR(sch)) { 5586 ret = PTR_ERR(sch); 5587 goto err_unlock; 5588 } 5589 5590 /* 5591 * Transition to ENABLING and clear exit info to arm the disable path. 5592 * Failure triggers full disabling from here on. 5593 */ 5594 WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); 5595 WARN_ON_ONCE(scx_root); 5596 5597 atomic_long_set(&scx_nr_rejected, 0); 5598 5599 for_each_possible_cpu(cpu) 5600 cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; 5601 5602 /* 5603 * Keep CPUs stable during enable so that the BPF scheduler can track 5604 * online CPUs by watching ->on/offline_cpu() after ->init(). 5605 */ 5606 cpus_read_lock(); 5607 5608 /* 5609 * Make the scheduler instance visible. Must be inside cpus_read_lock(). 5610 * See handle_hotplug(). 5611 */ 5612 rcu_assign_pointer(scx_root, sch); 5613 5614 scx_idle_enable(ops); 5615 5616 if (sch->ops.init) { 5617 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL); 5618 if (ret) { 5619 ret = ops_sanitize_err(sch, "init", ret); 5620 cpus_read_unlock(); 5621 scx_error(sch, "ops.init() failed (%d)", ret); 5622 goto err_disable; 5623 } 5624 } 5625 5626 for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) 5627 if (((void (**)(void))ops)[i]) 5628 set_bit(i, sch->has_op); 5629 5630 check_hotplug_seq(sch, ops); 5631 scx_idle_update_selcpu_topology(ops); 5632 5633 cpus_read_unlock(); 5634 5635 ret = validate_ops(sch, ops); 5636 if (ret) 5637 goto err_disable; 5638 5639 WARN_ON_ONCE(scx_dsp_ctx); 5640 scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; 5641 scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf, 5642 scx_dsp_max_batch), 5643 __alignof__(struct scx_dsp_ctx)); 5644 if (!scx_dsp_ctx) { 5645 ret = -ENOMEM; 5646 goto err_disable; 5647 } 5648 5649 if (ops->timeout_ms) 5650 timeout = msecs_to_jiffies(ops->timeout_ms); 5651 else 5652 timeout = SCX_WATCHDOG_MAX_TIMEOUT; 5653 5654 WRITE_ONCE(scx_watchdog_timeout, timeout); 5655 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 5656 queue_delayed_work(system_unbound_wq, &scx_watchdog_work, 5657 scx_watchdog_timeout / 2); 5658 5659 /* 5660 * Once __scx_enabled is set, %current can be switched to SCX anytime. 5661 * This can lead to stalls as some BPF schedulers (e.g. userspace 5662 * scheduling) may not function correctly before all tasks are switched. 5663 * Init in bypass mode to guarantee forward progress. 
5664 */ 5665 scx_bypass(true); 5666 5667 for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) 5668 if (((void (**)(void))ops)[i]) 5669 set_bit(i, sch->has_op); 5670 5671 if (sch->ops.cpu_acquire || sch->ops.cpu_release) 5672 sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT; 5673 5674 /* 5675 * Lock out forks, cgroup on/offlining and moves before opening the 5676 * floodgate so that they don't wander into the operations prematurely. 5677 */ 5678 percpu_down_write(&scx_fork_rwsem); 5679 5680 WARN_ON_ONCE(scx_init_task_enabled); 5681 scx_init_task_enabled = true; 5682 5683 /* 5684 * Enable ops for every task. Fork is excluded by scx_fork_rwsem 5685 * preventing new tasks from being added. No need to exclude tasks 5686 * leaving as sched_ext_free() can handle both prepped and enabled 5687 * tasks. Prep all tasks first and then enable them with preemption 5688 * disabled. 5689 * 5690 * All cgroups should be initialized before scx_init_task() so that the 5691 * BPF scheduler can reliably track each task's cgroup membership from 5692 * scx_init_task(). Lock out cgroup on/offlining and task migrations 5693 * while tasks are being initialized so that scx_cgroup_can_attach() 5694 * never sees uninitialized tasks. 5695 */ 5696 scx_cgroup_lock(); 5697 ret = scx_cgroup_init(sch); 5698 if (ret) 5699 goto err_disable_unlock_all; 5700 5701 scx_task_iter_start(&sti); 5702 while ((p = scx_task_iter_next_locked(&sti))) { 5703 /* 5704 * @p may already be dead, have lost all its usages counts and 5705 * be waiting for RCU grace period before being freed. @p can't 5706 * be initialized for SCX in such cases and should be ignored. 5707 */ 5708 if (!tryget_task_struct(p)) 5709 continue; 5710 5711 scx_task_iter_unlock(&sti); 5712 5713 ret = scx_init_task(p, task_group(p), false); 5714 if (ret) { 5715 put_task_struct(p); 5716 scx_task_iter_relock(&sti); 5717 scx_task_iter_stop(&sti); 5718 scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", 5719 ret, p->comm, p->pid); 5720 goto err_disable_unlock_all; 5721 } 5722 5723 scx_set_task_state(p, SCX_TASK_READY); 5724 5725 put_task_struct(p); 5726 scx_task_iter_relock(&sti); 5727 } 5728 scx_task_iter_stop(&sti); 5729 scx_cgroup_unlock(); 5730 percpu_up_write(&scx_fork_rwsem); 5731 5732 /* 5733 * All tasks are READY. It's safe to turn on scx_enabled() and switch 5734 * all eligible tasks. 5735 */ 5736 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 5737 static_branch_enable(&__scx_enabled); 5738 5739 /* 5740 * We're fully committed and can't fail. The task READY -> ENABLED 5741 * transitions here are synchronized against sched_ext_free() through 5742 * scx_tasks_lock. 
5743 */ 5744 percpu_down_write(&scx_fork_rwsem); 5745 scx_task_iter_start(&sti); 5746 while ((p = scx_task_iter_next_locked(&sti))) { 5747 const struct sched_class *old_class = p->sched_class; 5748 const struct sched_class *new_class = 5749 __setscheduler_class(p->policy, p->prio); 5750 struct sched_enq_and_set_ctx ctx; 5751 5752 if (old_class != new_class && p->se.sched_delayed) 5753 dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 5754 5755 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 5756 5757 p->scx.slice = SCX_SLICE_DFL; 5758 p->sched_class = new_class; 5759 check_class_changing(task_rq(p), p, old_class); 5760 5761 sched_enq_and_set_task(&ctx); 5762 5763 check_class_changed(task_rq(p), p, old_class, p->prio); 5764 } 5765 scx_task_iter_stop(&sti); 5766 percpu_up_write(&scx_fork_rwsem); 5767 5768 scx_bypass(false); 5769 5770 if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { 5771 WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); 5772 goto err_disable; 5773 } 5774 5775 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) 5776 static_branch_enable(&__scx_switched_all); 5777 5778 pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", 5779 sch->ops.name, scx_switched_all() ? "" : " (partial)"); 5780 kobject_uevent(&sch->kobj, KOBJ_ADD); 5781 mutex_unlock(&scx_enable_mutex); 5782 5783 atomic_long_inc(&scx_enable_seq); 5784 5785 return 0; 5786 5787 err_unlock: 5788 mutex_unlock(&scx_enable_mutex); 5789 return ret; 5790 5791 err_disable_unlock_all: 5792 scx_cgroup_unlock(); 5793 percpu_up_write(&scx_fork_rwsem); 5794 scx_bypass(false); 5795 err_disable: 5796 mutex_unlock(&scx_enable_mutex); 5797 /* 5798 * Returning an error code here would not pass all the error information 5799 * to userspace. Record errno using scx_error() for cases scx_error() 5800 * wasn't already invoked and exit indicating success so that the error 5801 * is notified through ops.exit() with all the details. 5802 * 5803 * Flush scx_disable_work to ensure that error is reported before init 5804 * completion. sch's base reference will be put by bpf_scx_unreg(). 5805 */ 5806 scx_error(sch, "scx_enable() failed (%d)", ret); 5807 kthread_flush_work(&sch->disable_work); 5808 return 0; 5809 } 5810 5811 5812 /******************************************************************************** 5813 * bpf_struct_ops plumbing. 
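 *
 * On the BPF side, a scheduler implements this struct_ops by defining a
 * struct sched_ext_ops instance in a ".struct_ops.link" section. An
 * illustrative sketch in the style of the example schedulers under
 * tools/sched_ext/ (the example_* callbacks are placeholders, not part of
 * this file):
 *
 *	SEC(".struct_ops.link")
 *	struct sched_ext_ops example_ops = {
 *		.select_cpu	= (void *)example_select_cpu,
 *		.enqueue	= (void *)example_enqueue,
 *		.dispatch	= (void *)example_dispatch,
 *		.init		= (void *)example_init,
 *		.name		= "example",
 *	};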
5814 */ 5815 #include <linux/bpf_verifier.h> 5816 #include <linux/bpf.h> 5817 #include <linux/btf.h> 5818 5819 static const struct btf_type *task_struct_type; 5820 5821 static bool bpf_scx_is_valid_access(int off, int size, 5822 enum bpf_access_type type, 5823 const struct bpf_prog *prog, 5824 struct bpf_insn_access_aux *info) 5825 { 5826 if (type != BPF_READ) 5827 return false; 5828 if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) 5829 return false; 5830 if (off % size != 0) 5831 return false; 5832 5833 return btf_ctx_access(off, size, type, prog, info); 5834 } 5835 5836 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, 5837 const struct bpf_reg_state *reg, int off, 5838 int size) 5839 { 5840 const struct btf_type *t; 5841 5842 t = btf_type_by_id(reg->btf, reg->btf_id); 5843 if (t == task_struct_type) { 5844 if (off >= offsetof(struct task_struct, scx.slice) && 5845 off + size <= offsetofend(struct task_struct, scx.slice)) 5846 return SCALAR_VALUE; 5847 if (off >= offsetof(struct task_struct, scx.dsq_vtime) && 5848 off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) 5849 return SCALAR_VALUE; 5850 if (off >= offsetof(struct task_struct, scx.disallow) && 5851 off + size <= offsetofend(struct task_struct, scx.disallow)) 5852 return SCALAR_VALUE; 5853 } 5854 5855 return -EACCES; 5856 } 5857 5858 static const struct bpf_verifier_ops bpf_scx_verifier_ops = { 5859 .get_func_proto = bpf_base_func_proto, 5860 .is_valid_access = bpf_scx_is_valid_access, 5861 .btf_struct_access = bpf_scx_btf_struct_access, 5862 }; 5863 5864 static int bpf_scx_init_member(const struct btf_type *t, 5865 const struct btf_member *member, 5866 void *kdata, const void *udata) 5867 { 5868 const struct sched_ext_ops *uops = udata; 5869 struct sched_ext_ops *ops = kdata; 5870 u32 moff = __btf_member_bit_offset(t, member) / 8; 5871 int ret; 5872 5873 switch (moff) { 5874 case offsetof(struct sched_ext_ops, dispatch_max_batch): 5875 if (*(u32 *)(udata + moff) > INT_MAX) 5876 return -E2BIG; 5877 ops->dispatch_max_batch = *(u32 *)(udata + moff); 5878 return 1; 5879 case offsetof(struct sched_ext_ops, flags): 5880 if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) 5881 return -EINVAL; 5882 ops->flags = *(u64 *)(udata + moff); 5883 return 1; 5884 case offsetof(struct sched_ext_ops, name): 5885 ret = bpf_obj_name_cpy(ops->name, uops->name, 5886 sizeof(ops->name)); 5887 if (ret < 0) 5888 return ret; 5889 if (ret == 0) 5890 return -EINVAL; 5891 return 1; 5892 case offsetof(struct sched_ext_ops, timeout_ms): 5893 if (msecs_to_jiffies(*(u32 *)(udata + moff)) > 5894 SCX_WATCHDOG_MAX_TIMEOUT) 5895 return -E2BIG; 5896 ops->timeout_ms = *(u32 *)(udata + moff); 5897 return 1; 5898 case offsetof(struct sched_ext_ops, exit_dump_len): 5899 ops->exit_dump_len = 5900 *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; 5901 return 1; 5902 case offsetof(struct sched_ext_ops, hotplug_seq): 5903 ops->hotplug_seq = *(u64 *)(udata + moff); 5904 return 1; 5905 } 5906 5907 return 0; 5908 } 5909 5910 static int bpf_scx_check_member(const struct btf_type *t, 5911 const struct btf_member *member, 5912 const struct bpf_prog *prog) 5913 { 5914 u32 moff = __btf_member_bit_offset(t, member) / 8; 5915 5916 switch (moff) { 5917 case offsetof(struct sched_ext_ops, init_task): 5918 #ifdef CONFIG_EXT_GROUP_SCHED 5919 case offsetof(struct sched_ext_ops, cgroup_init): 5920 case offsetof(struct sched_ext_ops, cgroup_exit): 5921 case offsetof(struct sched_ext_ops, cgroup_prep_move): 5922 #endif 5923 case offsetof(struct sched_ext_ops, 
cpu_online): 5924 case offsetof(struct sched_ext_ops, cpu_offline): 5925 case offsetof(struct sched_ext_ops, init): 5926 case offsetof(struct sched_ext_ops, exit): 5927 break; 5928 default: 5929 if (prog->sleepable) 5930 return -EINVAL; 5931 } 5932 5933 return 0; 5934 } 5935 5936 static int bpf_scx_reg(void *kdata, struct bpf_link *link) 5937 { 5938 return scx_enable(kdata, link); 5939 } 5940 5941 static void bpf_scx_unreg(void *kdata, struct bpf_link *link) 5942 { 5943 struct sched_ext_ops *ops = kdata; 5944 struct scx_sched *sch = ops->priv; 5945 5946 scx_disable(SCX_EXIT_UNREG); 5947 kthread_flush_work(&sch->disable_work); 5948 kobject_put(&sch->kobj); 5949 } 5950 5951 static int bpf_scx_init(struct btf *btf) 5952 { 5953 task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]); 5954 5955 return 0; 5956 } 5957 5958 static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) 5959 { 5960 /* 5961 * sched_ext does not support updating the actively-loaded BPF 5962 * scheduler, as registering a BPF scheduler can always fail if the 5963 * scheduler returns an error code for e.g. ops.init(), ops.init_task(), 5964 * etc. Similarly, we can always race with unregistration happening 5965 * elsewhere, such as with sysrq. 5966 */ 5967 return -EOPNOTSUPP; 5968 } 5969 5970 static int bpf_scx_validate(void *kdata) 5971 { 5972 return 0; 5973 } 5974 5975 static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } 5976 static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {} 5977 static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {} 5978 static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {} 5979 static void sched_ext_ops__tick(struct task_struct *p) {} 5980 static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {} 5981 static void sched_ext_ops__running(struct task_struct *p) {} 5982 static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {} 5983 static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {} 5984 static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; } 5985 static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; } 5986 static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {} 5987 static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {} 5988 static void sched_ext_ops__update_idle(s32 cpu, bool idle) {} 5989 static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {} 5990 static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {} 5991 static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } 5992 static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {} 5993 static void sched_ext_ops__enable(struct task_struct *p) {} 5994 static void sched_ext_ops__disable(struct task_struct *p) {} 5995 #ifdef CONFIG_EXT_GROUP_SCHED 5996 static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } 5997 static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {} 5998 static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } 5999 static void sched_ext_ops__cgroup_move(struct 
task_struct *p, struct cgroup *from, struct cgroup *to) {} 6000 static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 6001 static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} 6002 static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} 6003 #endif 6004 static void sched_ext_ops__cpu_online(s32 cpu) {} 6005 static void sched_ext_ops__cpu_offline(s32 cpu) {} 6006 static s32 sched_ext_ops__init(void) { return -EINVAL; } 6007 static void sched_ext_ops__exit(struct scx_exit_info *info) {} 6008 static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {} 6009 static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} 6010 static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {} 6011 6012 static struct sched_ext_ops __bpf_ops_sched_ext_ops = { 6013 .select_cpu = sched_ext_ops__select_cpu, 6014 .enqueue = sched_ext_ops__enqueue, 6015 .dequeue = sched_ext_ops__dequeue, 6016 .dispatch = sched_ext_ops__dispatch, 6017 .tick = sched_ext_ops__tick, 6018 .runnable = sched_ext_ops__runnable, 6019 .running = sched_ext_ops__running, 6020 .stopping = sched_ext_ops__stopping, 6021 .quiescent = sched_ext_ops__quiescent, 6022 .yield = sched_ext_ops__yield, 6023 .core_sched_before = sched_ext_ops__core_sched_before, 6024 .set_weight = sched_ext_ops__set_weight, 6025 .set_cpumask = sched_ext_ops__set_cpumask, 6026 .update_idle = sched_ext_ops__update_idle, 6027 .cpu_acquire = sched_ext_ops__cpu_acquire, 6028 .cpu_release = sched_ext_ops__cpu_release, 6029 .init_task = sched_ext_ops__init_task, 6030 .exit_task = sched_ext_ops__exit_task, 6031 .enable = sched_ext_ops__enable, 6032 .disable = sched_ext_ops__disable, 6033 #ifdef CONFIG_EXT_GROUP_SCHED 6034 .cgroup_init = sched_ext_ops__cgroup_init, 6035 .cgroup_exit = sched_ext_ops__cgroup_exit, 6036 .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, 6037 .cgroup_move = sched_ext_ops__cgroup_move, 6038 .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, 6039 .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, 6040 .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, 6041 #endif 6042 .cpu_online = sched_ext_ops__cpu_online, 6043 .cpu_offline = sched_ext_ops__cpu_offline, 6044 .init = sched_ext_ops__init, 6045 .exit = sched_ext_ops__exit, 6046 .dump = sched_ext_ops__dump, 6047 .dump_cpu = sched_ext_ops__dump_cpu, 6048 .dump_task = sched_ext_ops__dump_task, 6049 }; 6050 6051 static struct bpf_struct_ops bpf_sched_ext_ops = { 6052 .verifier_ops = &bpf_scx_verifier_ops, 6053 .reg = bpf_scx_reg, 6054 .unreg = bpf_scx_unreg, 6055 .check_member = bpf_scx_check_member, 6056 .init_member = bpf_scx_init_member, 6057 .init = bpf_scx_init, 6058 .update = bpf_scx_update, 6059 .validate = bpf_scx_validate, 6060 .name = "sched_ext_ops", 6061 .owner = THIS_MODULE, 6062 .cfi_stubs = &__bpf_ops_sched_ext_ops 6063 }; 6064 6065 6066 /******************************************************************************** 6067 * System integration and init. 
6068 */ 6069 6070 static void sysrq_handle_sched_ext_reset(u8 key) 6071 { 6072 scx_disable(SCX_EXIT_SYSRQ); 6073 } 6074 6075 static const struct sysrq_key_op sysrq_sched_ext_reset_op = { 6076 .handler = sysrq_handle_sched_ext_reset, 6077 .help_msg = "reset-sched-ext(S)", 6078 .action_msg = "Disable sched_ext and revert all tasks to CFS", 6079 .enable_mask = SYSRQ_ENABLE_RTNICE, 6080 }; 6081 6082 static void sysrq_handle_sched_ext_dump(u8 key) 6083 { 6084 struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; 6085 6086 if (scx_enabled()) 6087 scx_dump_state(&ei, 0); 6088 } 6089 6090 static const struct sysrq_key_op sysrq_sched_ext_dump_op = { 6091 .handler = sysrq_handle_sched_ext_dump, 6092 .help_msg = "dump-sched-ext(D)", 6093 .action_msg = "Trigger sched_ext debug dump", 6094 .enable_mask = SYSRQ_ENABLE_RTNICE, 6095 }; 6096 6097 static bool can_skip_idle_kick(struct rq *rq) 6098 { 6099 lockdep_assert_rq_held(rq); 6100 6101 /* 6102 * We can skip idle kicking if @rq is going to go through at least one 6103 * full SCX scheduling cycle before going idle. Just checking whether 6104 * curr is not idle is insufficient because we could be racing 6105 * balance_one() trying to pull the next task from a remote rq, which 6106 * may fail, and @rq may become idle afterwards. 6107 * 6108 * The race window is small and we don't and can't guarantee that @rq is 6109 * only kicked while idle anyway. Skip only when sure. 6110 */ 6111 return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); 6112 } 6113 6114 static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) 6115 { 6116 struct rq *rq = cpu_rq(cpu); 6117 struct scx_rq *this_scx = &this_rq->scx; 6118 bool should_wait = false; 6119 unsigned long flags; 6120 6121 raw_spin_rq_lock_irqsave(rq, flags); 6122 6123 /* 6124 * During CPU hotplug, a CPU may depend on kicking itself to make 6125 * forward progress. Allow kicking self regardless of online state. 
6126 */ 6127 if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { 6128 if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { 6129 if (rq->curr->sched_class == &ext_sched_class) 6130 rq->curr->scx.slice = 0; 6131 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 6132 } 6133 6134 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { 6135 pseqs[cpu] = rq->scx.pnt_seq; 6136 should_wait = true; 6137 } 6138 6139 resched_curr(rq); 6140 } else { 6141 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 6142 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 6143 } 6144 6145 raw_spin_rq_unlock_irqrestore(rq, flags); 6146 6147 return should_wait; 6148 } 6149 6150 static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) 6151 { 6152 struct rq *rq = cpu_rq(cpu); 6153 unsigned long flags; 6154 6155 raw_spin_rq_lock_irqsave(rq, flags); 6156 6157 if (!can_skip_idle_kick(rq) && 6158 (cpu_online(cpu) || cpu == cpu_of(this_rq))) 6159 resched_curr(rq); 6160 6161 raw_spin_rq_unlock_irqrestore(rq, flags); 6162 } 6163 6164 static void kick_cpus_irq_workfn(struct irq_work *irq_work) 6165 { 6166 struct rq *this_rq = this_rq(); 6167 struct scx_rq *this_scx = &this_rq->scx; 6168 unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); 6169 bool should_wait = false; 6170 s32 cpu; 6171 6172 for_each_cpu(cpu, this_scx->cpus_to_kick) { 6173 should_wait |= kick_one_cpu(cpu, this_rq, pseqs); 6174 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); 6175 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 6176 } 6177 6178 for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { 6179 kick_one_cpu_if_idle(cpu, this_rq); 6180 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 6181 } 6182 6183 if (!should_wait) 6184 return; 6185 6186 for_each_cpu(cpu, this_scx->cpus_to_wait) { 6187 unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; 6188 6189 if (cpu != cpu_of(this_rq)) { 6190 /* 6191 * Pairs with smp_store_release() issued by this CPU in 6192 * switch_class() on the resched path. 6193 * 6194 * We busy-wait here to guarantee that no other task can 6195 * be scheduled on our core before the target CPU has 6196 * entered the resched path. 6197 */ 6198 while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) 6199 cpu_relax(); 6200 } 6201 6202 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 6203 } 6204 } 6205 6206 /** 6207 * print_scx_info - print out sched_ext scheduler state 6208 * @log_lvl: the log level to use when printing 6209 * @p: target task 6210 * 6211 * If a sched_ext scheduler is enabled, print the name and state of the 6212 * scheduler. If @p is on sched_ext, print further information about the task. 6213 * 6214 * This function can be safely called on any task as long as the task_struct 6215 * itself is accessible. While safe, this function isn't synchronized and may 6216 * print out mixups or garbages of limited length. 6217 */ 6218 void print_scx_info(const char *log_lvl, struct task_struct *p) 6219 { 6220 struct scx_sched *sch = scx_root; 6221 enum scx_enable_state state = scx_enable_state(); 6222 const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; 6223 char runnable_at_buf[22] = "?"; 6224 struct sched_class *class; 6225 unsigned long runnable_at; 6226 6227 if (state == SCX_DISABLED) 6228 return; 6229 6230 /* 6231 * Carefully check if the task was running on sched_ext, and then 6232 * carefully copy the time it's been runnable, and its state. 
6233 */ 6234 if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || 6235 class != &ext_sched_class) { 6236 printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name, 6237 scx_enable_state_str[state], all); 6238 return; 6239 } 6240 6241 if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, 6242 sizeof(runnable_at))) 6243 scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", 6244 jiffies_delta_msecs(runnable_at, jiffies)); 6245 6246 /* print everything onto one line to conserve console space */ 6247 printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", 6248 log_lvl, sch->ops.name, scx_enable_state_str[state], all, 6249 runnable_at_buf); 6250 } 6251 6252 static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) 6253 { 6254 /* 6255 * SCX schedulers often have userspace components which are sometimes 6256 * involved in critial scheduling paths. PM operations involve freezing 6257 * userspace which can lead to scheduling misbehaviors including stalls. 6258 * Let's bypass while PM operations are in progress. 6259 */ 6260 switch (event) { 6261 case PM_HIBERNATION_PREPARE: 6262 case PM_SUSPEND_PREPARE: 6263 case PM_RESTORE_PREPARE: 6264 scx_bypass(true); 6265 break; 6266 case PM_POST_HIBERNATION: 6267 case PM_POST_SUSPEND: 6268 case PM_POST_RESTORE: 6269 scx_bypass(false); 6270 break; 6271 } 6272 6273 return NOTIFY_OK; 6274 } 6275 6276 static struct notifier_block scx_pm_notifier = { 6277 .notifier_call = scx_pm_handler, 6278 }; 6279 6280 void __init init_sched_ext_class(void) 6281 { 6282 s32 cpu, v; 6283 6284 /* 6285 * The following is to prevent the compiler from optimizing out the enum 6286 * definitions so that BPF scheduler implementations can use them 6287 * through the generated vmlinux.h. 6288 */ 6289 WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | 6290 SCX_TG_ONLINE); 6291 6292 scx_idle_init_masks(); 6293 6294 scx_kick_cpus_pnt_seqs = 6295 __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, 6296 __alignof__(scx_kick_cpus_pnt_seqs[0])); 6297 BUG_ON(!scx_kick_cpus_pnt_seqs); 6298 6299 for_each_possible_cpu(cpu) { 6300 struct rq *rq = cpu_rq(cpu); 6301 int n = cpu_to_node(cpu); 6302 6303 init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); 6304 INIT_LIST_HEAD(&rq->scx.runnable_list); 6305 INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); 6306 6307 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); 6308 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); 6309 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); 6310 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); 6311 init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); 6312 init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); 6313 6314 if (cpu_online(cpu)) 6315 cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; 6316 } 6317 6318 register_sysrq_key('S', &sysrq_sched_ext_reset_op); 6319 register_sysrq_key('D', &sysrq_sched_ext_dump_op); 6320 INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); 6321 } 6322 6323 6324 /******************************************************************************** 6325 * Helpers that can be called from the BPF scheduler. 
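 *
 * As an illustrative overview (not part of this file; BPF_STRUCT_OPS*() and
 * the EXAMPLE_DSQ id are assumptions borrowed from the example schedulers
 * under tools/sched_ext/), a minimal scheduler built on these kfuncs creates
 * a custom DSQ from ops.init(), feeds it from ops.enqueue() and drains it
 * into the local DSQ from ops.dispatch():
 *
 *	s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
 *	{
 *		return scx_bpf_create_dsq(EXAMPLE_DSQ, -1);
 *	}
 *
 *	void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		scx_bpf_dsq_insert(p, EXAMPLE_DSQ, SCX_SLICE_DFL, enq_flags);
 *	}
 *
 *	void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		scx_bpf_dsq_move_to_local(EXAMPLE_DSQ);
 *	}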
6326 */ 6327 static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) 6328 { 6329 if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) 6330 return false; 6331 6332 lockdep_assert_irqs_disabled(); 6333 6334 if (unlikely(!p)) { 6335 scx_kf_error("called with NULL task"); 6336 return false; 6337 } 6338 6339 if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { 6340 scx_kf_error("invalid enq_flags 0x%llx", enq_flags); 6341 return false; 6342 } 6343 6344 return true; 6345 } 6346 6347 static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, 6348 u64 enq_flags) 6349 { 6350 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 6351 struct task_struct *ddsp_task; 6352 6353 ddsp_task = __this_cpu_read(direct_dispatch_task); 6354 if (ddsp_task) { 6355 mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); 6356 return; 6357 } 6358 6359 if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { 6360 scx_kf_error("dispatch buffer overflow"); 6361 return; 6362 } 6363 6364 dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ 6365 .task = p, 6366 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, 6367 .dsq_id = dsq_id, 6368 .enq_flags = enq_flags, 6369 }; 6370 } 6371 6372 __bpf_kfunc_start_defs(); 6373 6374 /** 6375 * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ 6376 * @p: task_struct to insert 6377 * @dsq_id: DSQ to insert into 6378 * @slice: duration @p can run for in nsecs, 0 to keep the current value 6379 * @enq_flags: SCX_ENQ_* 6380 * 6381 * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to 6382 * call this function spuriously. Can be called from ops.enqueue(), 6383 * ops.select_cpu(), and ops.dispatch(). 6384 * 6385 * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch 6386 * and @p must match the task being enqueued. 6387 * 6388 * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p 6389 * will be directly inserted into the corresponding dispatch queue after 6390 * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be 6391 * inserted into the local DSQ of the CPU returned by ops.select_cpu(). 6392 * @enq_flags are OR'd with the enqueue flags on the enqueue path before the 6393 * task is inserted. 6394 * 6395 * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id 6396 * and this function can be called upto ops.dispatch_max_batch times to insert 6397 * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the 6398 * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the 6399 * counter. 6400 * 6401 * This function doesn't have any locking restrictions and may be called under 6402 * BPF locks (in the future when BPF introduces more flexible locking). 6403 * 6404 * @p is allowed to run for @slice. The scheduling path is triggered on slice 6405 * exhaustion. If zero, the current residual slice is maintained. If 6406 * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with 6407 * scx_bpf_kick_cpu() to trigger scheduling. 
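 *
 * An illustrative direct-dispatch sketch from ops.select_cpu()
 * (BPF_STRUCT_OPS() is assumed from the example scheduler headers under
 * tools/sched_ext/; scx_bpf_select_cpu_dfl() is the built-in idle CPU
 * selection kfunc):
 *
 *	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
 *			   s32 prev_cpu, u64 wake_flags)
 *	{
 *		bool is_idle = false;
 *		s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
 *
 *		if (is_idle)
 *			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 *		return cpu;
 *	}
 *
 * When an idle CPU is found, @p is queued directly on that CPU's local DSQ
 * and ops.enqueue() is skipped for that wakeup.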
6408 */ 6409 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, 6410 u64 enq_flags) 6411 { 6412 if (!scx_dsq_insert_preamble(p, enq_flags)) 6413 return; 6414 6415 if (slice) 6416 p->scx.slice = slice; 6417 else 6418 p->scx.slice = p->scx.slice ?: 1; 6419 6420 scx_dsq_insert_commit(p, dsq_id, enq_flags); 6421 } 6422 6423 /** 6424 * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ 6425 * @p: task_struct to insert 6426 * @dsq_id: DSQ to insert into 6427 * @slice: duration @p can run for in nsecs, 0 to keep the current value 6428 * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ 6429 * @enq_flags: SCX_ENQ_* 6430 * 6431 * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id. 6432 * Tasks queued into the priority queue are ordered by @vtime. All other aspects 6433 * are identical to scx_bpf_dsq_insert(). 6434 * 6435 * @vtime ordering is according to time_before64() which considers wrapping. A 6436 * numerically larger vtime may indicate an earlier position in the ordering and 6437 * vice-versa. 6438 * 6439 * A DSQ can only be used as a FIFO or priority queue at any given time and this 6440 * function must not be called on a DSQ which already has one or more FIFO tasks 6441 * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and 6442 * SCX_DSQ_GLOBAL) cannot be used as priority queues. 6443 */ 6444 __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, 6445 u64 slice, u64 vtime, u64 enq_flags) 6446 { 6447 if (!scx_dsq_insert_preamble(p, enq_flags)) 6448 return; 6449 6450 if (slice) 6451 p->scx.slice = slice; 6452 else 6453 p->scx.slice = p->scx.slice ?: 1; 6454 6455 p->scx.dsq_vtime = vtime; 6456 6457 scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 6458 } 6459 6460 __bpf_kfunc_end_defs(); 6461 6462 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) 6463 BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) 6464 BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) 6465 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) 6466 6467 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { 6468 .owner = THIS_MODULE, 6469 .set = &scx_kfunc_ids_enqueue_dispatch, 6470 }; 6471 6472 static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, 6473 struct task_struct *p, u64 dsq_id, u64 enq_flags) 6474 { 6475 struct scx_sched *sch = scx_root; 6476 struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; 6477 struct rq *this_rq, *src_rq, *locked_rq; 6478 bool dispatched = false; 6479 bool in_balance; 6480 unsigned long flags; 6481 6482 if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) 6483 return false; 6484 6485 /* 6486 * Can be called from either ops.dispatch() locking this_rq() or any 6487 * context where no rq lock is held. If latter, lock @p's task_rq which 6488 * we'll likely need anyway. 6489 */ 6490 src_rq = task_rq(p); 6491 6492 local_irq_save(flags); 6493 this_rq = this_rq(); 6494 in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; 6495 6496 if (in_balance) { 6497 if (this_rq != src_rq) { 6498 raw_spin_rq_unlock(this_rq); 6499 raw_spin_rq_lock(src_rq); 6500 } 6501 } else { 6502 raw_spin_rq_lock(src_rq); 6503 } 6504 6505 /* 6506 * If the BPF scheduler keeps calling this function repeatedly, it can 6507 * cause similar live-lock conditions as consume_dispatch_q(). Insert a 6508 * breather if necessary. 
6509 */ 6510 scx_breather(src_rq); 6511 6512 locked_rq = src_rq; 6513 raw_spin_lock(&src_dsq->lock); 6514 6515 /* 6516 * Did someone else get to it? @p could have already left $src_dsq, got 6517 * re-enqueud, or be in the process of being consumed by someone else. 6518 */ 6519 if (unlikely(p->scx.dsq != src_dsq || 6520 u32_before(kit->cursor.priv, p->scx.dsq_seq) || 6521 p->scx.holding_cpu >= 0) || 6522 WARN_ON_ONCE(src_rq != task_rq(p))) { 6523 raw_spin_unlock(&src_dsq->lock); 6524 goto out; 6525 } 6526 6527 /* @p is still on $src_dsq and stable, determine the destination */ 6528 dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p); 6529 6530 /* 6531 * Apply vtime and slice updates before moving so that the new time is 6532 * visible before inserting into $dst_dsq. @p is still on $src_dsq but 6533 * this is safe as we're locking it. 6534 */ 6535 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) 6536 p->scx.dsq_vtime = kit->vtime; 6537 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) 6538 p->scx.slice = kit->slice; 6539 6540 /* execute move */ 6541 locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq); 6542 dispatched = true; 6543 out: 6544 if (in_balance) { 6545 if (this_rq != locked_rq) { 6546 raw_spin_rq_unlock(locked_rq); 6547 raw_spin_rq_lock(this_rq); 6548 } 6549 } else { 6550 raw_spin_rq_unlock_irqrestore(locked_rq, flags); 6551 } 6552 6553 kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | 6554 __SCX_DSQ_ITER_HAS_VTIME); 6555 return dispatched; 6556 } 6557 6558 __bpf_kfunc_start_defs(); 6559 6560 /** 6561 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots 6562 * 6563 * Can only be called from ops.dispatch(). 6564 */ 6565 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) 6566 { 6567 if (!scx_kf_allowed(SCX_KF_DISPATCH)) 6568 return 0; 6569 6570 return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); 6571 } 6572 6573 /** 6574 * scx_bpf_dispatch_cancel - Cancel the latest dispatch 6575 * 6576 * Cancel the latest dispatch. Can be called multiple times to cancel further 6577 * dispatches. Can only be called from ops.dispatch(). 6578 */ 6579 __bpf_kfunc void scx_bpf_dispatch_cancel(void) 6580 { 6581 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 6582 6583 if (!scx_kf_allowed(SCX_KF_DISPATCH)) 6584 return; 6585 6586 if (dspc->cursor > 0) 6587 dspc->cursor--; 6588 else 6589 scx_kf_error("dispatch buffer underflow"); 6590 } 6591 6592 /** 6593 * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ 6594 * @dsq_id: DSQ to move task from 6595 * 6596 * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's 6597 * local DSQ for execution. Can only be called from ops.dispatch(). 6598 * 6599 * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() 6600 * before trying to move from the specified DSQ. It may also grab rq locks and 6601 * thus can't be called under any BPF locks. 6602 * 6603 * Returns %true if a task has been moved, %false if there isn't any task to 6604 * move. 
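 *
 * The return value lets ops.dispatch() fall back across multiple DSQs. An
 * illustrative sketch (HIPRI_DSQ and LOPRI_DSQ are hypothetical DSQ ids;
 * BPF_STRUCT_OPS() comes from the example scheduler headers):
 *
 *	void BPF_STRUCT_OPS(example_prio_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		if (!scx_bpf_dsq_move_to_local(HIPRI_DSQ))
 *			scx_bpf_dsq_move_to_local(LOPRI_DSQ);
 *	}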
6605 */ 6606 __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) 6607 { 6608 struct scx_sched *sch = scx_root; 6609 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 6610 struct scx_dispatch_q *dsq; 6611 6612 if (!scx_kf_allowed(SCX_KF_DISPATCH)) 6613 return false; 6614 6615 flush_dispatch_buf(sch, dspc->rq); 6616 6617 dsq = find_user_dsq(sch, dsq_id); 6618 if (unlikely(!dsq)) { 6619 scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id); 6620 return false; 6621 } 6622 6623 if (consume_dispatch_q(sch, dspc->rq, dsq)) { 6624 /* 6625 * A successfully consumed task can be dequeued before it starts 6626 * running while the CPU is trying to migrate other dispatched 6627 * tasks. Bump nr_tasks to tell balance_scx() to retry on empty 6628 * local DSQ. 6629 */ 6630 dspc->nr_tasks++; 6631 return true; 6632 } else { 6633 return false; 6634 } 6635 } 6636 6637 /** 6638 * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs 6639 * @it__iter: DSQ iterator in progress 6640 * @slice: duration the moved task can run for in nsecs 6641 * 6642 * Override the slice of the next task that will be moved from @it__iter using 6643 * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous 6644 * slice duration is kept. 6645 */ 6646 __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, 6647 u64 slice) 6648 { 6649 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 6650 6651 kit->slice = slice; 6652 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; 6653 } 6654 6655 /** 6656 * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs 6657 * @it__iter: DSQ iterator in progress 6658 * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ 6659 * 6660 * Override the vtime of the next task that will be moved from @it__iter using 6661 * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice 6662 * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the 6663 * override is ignored and cleared. 6664 */ 6665 __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, 6666 u64 vtime) 6667 { 6668 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 6669 6670 kit->vtime = vtime; 6671 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; 6672 } 6673 6674 /** 6675 * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ 6676 * @it__iter: DSQ iterator in progress 6677 * @p: task to transfer 6678 * @dsq_id: DSQ to move @p to 6679 * @enq_flags: SCX_ENQ_* 6680 * 6681 * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ 6682 * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can 6683 * be the destination. 6684 * 6685 * For the transfer to be successful, @p must still be on the DSQ and have been 6686 * queued before the DSQ iteration started. This function doesn't care whether 6687 * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have 6688 * been queued before the iteration started. 6689 * 6690 * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update. 6691 * 6692 * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq 6693 * lock (e.g. BPF timers or SYSCALL programs). 6694 * 6695 * Returns %true if @p has been consumed, %false if @p had already been consumed 6696 * or dequeued. 
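 *
 * An illustrative sketch pairing this with the DSQ iterator, e.g. from
 * ops.dispatch() (bpf_for_each() and BPF_FOR_EACH_ITER come from the example
 * scheduler headers under tools/sched_ext/; EXAMPLE_DSQ, target_pid and
 * target_cpu are hypothetical):
 *
 *	struct task_struct *p;
 *
 *	bpf_for_each(scx_dsq, p, EXAMPLE_DSQ, 0) {
 *		if (p->pid != target_pid)
 *			continue;
 *		scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, SCX_SLICE_DFL);
 *		scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
 *				 SCX_DSQ_LOCAL_ON | target_cpu, 0);
 *		break;
 *	}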
6697 */ 6698 __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, 6699 struct task_struct *p, u64 dsq_id, 6700 u64 enq_flags) 6701 { 6702 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 6703 p, dsq_id, enq_flags); 6704 } 6705 6706 /** 6707 * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ 6708 * @it__iter: DSQ iterator in progress 6709 * @p: task to transfer 6710 * @dsq_id: DSQ to move @p to 6711 * @enq_flags: SCX_ENQ_* 6712 * 6713 * Transfer @p which is on the DSQ currently iterated by @it__iter to the 6714 * priority queue of the DSQ specified by @dsq_id. The destination must be a 6715 * user DSQ as only user DSQs support priority queue. 6716 * 6717 * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice() 6718 * and scx_bpf_dsq_move_set_vtime() to update. 6719 * 6720 * All other aspects are identical to scx_bpf_dsq_move(). See 6721 * scx_bpf_dsq_insert_vtime() for more information on @vtime. 6722 */ 6723 __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, 6724 struct task_struct *p, u64 dsq_id, 6725 u64 enq_flags) 6726 { 6727 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 6728 p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 6729 } 6730 6731 __bpf_kfunc_end_defs(); 6732 6733 BTF_KFUNCS_START(scx_kfunc_ids_dispatch) 6734 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) 6735 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) 6736 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) 6737 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) 6738 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) 6739 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 6740 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 6741 BTF_KFUNCS_END(scx_kfunc_ids_dispatch) 6742 6743 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { 6744 .owner = THIS_MODULE, 6745 .set = &scx_kfunc_ids_dispatch, 6746 }; 6747 6748 __bpf_kfunc_start_defs(); 6749 6750 /** 6751 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 6752 * 6753 * Iterate over all of the tasks currently enqueued on the local DSQ of the 6754 * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of 6755 * processed tasks. Can only be called from ops.cpu_release(). 6756 */ 6757 __bpf_kfunc u32 scx_bpf_reenqueue_local(void) 6758 { 6759 LIST_HEAD(tasks); 6760 u32 nr_enqueued = 0; 6761 struct rq *rq; 6762 struct task_struct *p, *n; 6763 6764 if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) 6765 return 0; 6766 6767 rq = cpu_rq(smp_processor_id()); 6768 lockdep_assert_rq_held(rq); 6769 6770 /* 6771 * The BPF scheduler may choose to dispatch tasks back to 6772 * @rq->scx.local_dsq. Move all candidate tasks off to a private list 6773 * first to avoid processing the same tasks repeatedly. 6774 */ 6775 list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, 6776 scx.dsq_list.node) { 6777 /* 6778 * If @p is being migrated, @p's current CPU may not agree with 6779 * its allowed CPUs and the migration_cpu_stop is about to 6780 * deactivate and re-activate @p anyway. Skip re-enqueueing. 6781 * 6782 * While racing sched property changes may also dequeue and 6783 * re-enqueue a migrating task while its current CPU and allowed 6784 * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to 6785 * the current local DSQ for running tasks and thus are not 6786 * visible to the BPF scheduler. 6787 * 6788 * Also skip re-enqueueing tasks that can only run on this 6789 * CPU, as they would just be re-added to the same local 6790 * DSQ without any benefit. 
6791 */ 6792 if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) 6793 continue; 6794 6795 dispatch_dequeue(rq, p); 6796 list_add_tail(&p->scx.dsq_list.node, &tasks); 6797 } 6798 6799 list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { 6800 list_del_init(&p->scx.dsq_list.node); 6801 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); 6802 nr_enqueued++; 6803 } 6804 6805 return nr_enqueued; 6806 } 6807 6808 __bpf_kfunc_end_defs(); 6809 6810 BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) 6811 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) 6812 BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) 6813 6814 static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { 6815 .owner = THIS_MODULE, 6816 .set = &scx_kfunc_ids_cpu_release, 6817 }; 6818 6819 __bpf_kfunc_start_defs(); 6820 6821 /** 6822 * scx_bpf_create_dsq - Create a custom DSQ 6823 * @dsq_id: DSQ to create 6824 * @node: NUMA node to allocate from 6825 * 6826 * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable 6827 * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. 6828 */ 6829 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) 6830 { 6831 struct scx_dispatch_q *dsq; 6832 struct scx_sched *sch; 6833 s32 ret; 6834 6835 if (unlikely(node >= (int)nr_node_ids || 6836 (node < 0 && node != NUMA_NO_NODE))) 6837 return -EINVAL; 6838 6839 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) 6840 return -EINVAL; 6841 6842 dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); 6843 if (!dsq) 6844 return -ENOMEM; 6845 6846 init_dsq(dsq, dsq_id); 6847 6848 rcu_read_lock(); 6849 6850 sch = rcu_dereference(scx_root); 6851 if (sch) 6852 ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, 6853 dsq_hash_params); 6854 else 6855 ret = -ENODEV; 6856 6857 rcu_read_unlock(); 6858 if (ret) 6859 kfree(dsq); 6860 return ret; 6861 } 6862 6863 __bpf_kfunc_end_defs(); 6864 6865 BTF_KFUNCS_START(scx_kfunc_ids_unlocked) 6866 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) 6867 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) 6868 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) 6869 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 6870 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 6871 BTF_KFUNCS_END(scx_kfunc_ids_unlocked) 6872 6873 static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { 6874 .owner = THIS_MODULE, 6875 .set = &scx_kfunc_ids_unlocked, 6876 }; 6877 6878 __bpf_kfunc_start_defs(); 6879 6880 /** 6881 * scx_bpf_kick_cpu - Trigger reschedule on a CPU 6882 * @cpu: cpu to kick 6883 * @flags: %SCX_KICK_* flags 6884 * 6885 * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or 6886 * trigger rescheduling on a busy CPU. This can be called from any online 6887 * scx_ops operation and the actual kicking is performed asynchronously through 6888 * an irq work. 6889 */ 6890 __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) 6891 { 6892 struct rq *this_rq; 6893 unsigned long irq_flags; 6894 6895 if (!kf_cpu_valid(cpu, NULL)) 6896 return; 6897 6898 local_irq_save(irq_flags); 6899 6900 this_rq = this_rq(); 6901 6902 /* 6903 * While bypassing for PM ops, IRQ handling may not be online which can 6904 * lead to irq_work_queue() malfunction such as infinite busy wait for 6905 * IRQ status update. Suppress kicking. 6906 */ 6907 if (scx_rq_bypassing(this_rq)) 6908 goto out; 6909 6910 /* 6911 * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting 6912 * rq locks. We can probably be smarter and avoid bouncing if called 6913 * from ops which don't hold a rq lock. 
6914 */ 6915 if (flags & SCX_KICK_IDLE) { 6916 struct rq *target_rq = cpu_rq(cpu); 6917 6918 if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) 6919 scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); 6920 6921 if (raw_spin_rq_trylock(target_rq)) { 6922 if (can_skip_idle_kick(target_rq)) { 6923 raw_spin_rq_unlock(target_rq); 6924 goto out; 6925 } 6926 raw_spin_rq_unlock(target_rq); 6927 } 6928 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); 6929 } else { 6930 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); 6931 6932 if (flags & SCX_KICK_PREEMPT) 6933 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); 6934 if (flags & SCX_KICK_WAIT) 6935 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); 6936 } 6937 6938 irq_work_queue(&this_rq->scx.kick_cpus_irq_work); 6939 out: 6940 local_irq_restore(irq_flags); 6941 } 6942 6943 /** 6944 * scx_bpf_dsq_nr_queued - Return the number of queued tasks 6945 * @dsq_id: id of the DSQ 6946 * 6947 * Return the number of tasks in the DSQ matching @dsq_id. If not found, 6948 * -%ENOENT is returned. 6949 */ 6950 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) 6951 { 6952 struct scx_sched *sch; 6953 struct scx_dispatch_q *dsq; 6954 s32 ret; 6955 6956 preempt_disable(); 6957 6958 sch = rcu_dereference_sched(scx_root); 6959 if (unlikely(!sch)) { 6960 ret = -ENODEV; 6961 goto out; 6962 } 6963 6964 if (dsq_id == SCX_DSQ_LOCAL) { 6965 ret = READ_ONCE(this_rq()->scx.local_dsq.nr); 6966 goto out; 6967 } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 6968 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 6969 6970 if (ops_cpu_valid(sch, cpu, NULL)) { 6971 ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); 6972 goto out; 6973 } 6974 } else { 6975 dsq = find_user_dsq(sch, dsq_id); 6976 if (dsq) { 6977 ret = READ_ONCE(dsq->nr); 6978 goto out; 6979 } 6980 } 6981 ret = -ENOENT; 6982 out: 6983 preempt_enable(); 6984 return ret; 6985 } 6986 6987 /** 6988 * scx_bpf_destroy_dsq - Destroy a custom DSQ 6989 * @dsq_id: DSQ to destroy 6990 * 6991 * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with 6992 * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is 6993 * empty and no further tasks are dispatched to it. Ignored if called on a DSQ 6994 * which doesn't exist. Can be called from any online scx_ops operations. 6995 */ 6996 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) 6997 { 6998 struct scx_sched *sch; 6999 7000 rcu_read_lock(); 7001 sch = rcu_dereference(scx_root); 7002 if (sch) 7003 destroy_dsq(sch, dsq_id); 7004 rcu_read_unlock(); 7005 } 7006 7007 /** 7008 * bpf_iter_scx_dsq_new - Create a DSQ iterator 7009 * @it: iterator to initialize 7010 * @dsq_id: DSQ to iterate 7011 * @flags: %SCX_DSQ_ITER_* 7012 * 7013 * Initialize BPF iterator @it which can be used with bpf_for_each() to walk 7014 * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes 7015 * tasks which are already queued when this function is invoked. 7016 */ 7017 __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, 7018 u64 flags) 7019 { 7020 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 7021 struct scx_sched *sch; 7022 7023 BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > 7024 sizeof(struct bpf_iter_scx_dsq)); 7025 BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != 7026 __alignof__(struct bpf_iter_scx_dsq)); 7027 7028 /* 7029 * next() and destroy() will be called regardless of the return value. 7030 * Always clear $kit->dsq. 
7031 */ 7032 kit->dsq = NULL; 7033 7034 sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held()); 7035 if (unlikely(!sch)) 7036 return -ENODEV; 7037 7038 if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) 7039 return -EINVAL; 7040 7041 kit->dsq = find_user_dsq(sch, dsq_id); 7042 if (!kit->dsq) 7043 return -ENOENT; 7044 7045 INIT_LIST_HEAD(&kit->cursor.node); 7046 kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; 7047 kit->cursor.priv = READ_ONCE(kit->dsq->seq); 7048 7049 return 0; 7050 } 7051 7052 /** 7053 * bpf_iter_scx_dsq_next - Progress a DSQ iterator 7054 * @it: iterator to progress 7055 * 7056 * Return the next task. See bpf_iter_scx_dsq_new(). 7057 */ 7058 __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) 7059 { 7060 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 7061 bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV; 7062 struct task_struct *p; 7063 unsigned long flags; 7064 7065 if (!kit->dsq) 7066 return NULL; 7067 7068 raw_spin_lock_irqsave(&kit->dsq->lock, flags); 7069 7070 if (list_empty(&kit->cursor.node)) 7071 p = NULL; 7072 else 7073 p = container_of(&kit->cursor, struct task_struct, scx.dsq_list); 7074 7075 /* 7076 * Only tasks which were queued before the iteration started are 7077 * visible. This bounds BPF iterations and guarantees that vtime never 7078 * jumps in the other direction while iterating. 7079 */ 7080 do { 7081 p = nldsq_next_task(kit->dsq, p, rev); 7082 } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq))); 7083 7084 if (p) { 7085 if (rev) 7086 list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node); 7087 else 7088 list_move(&kit->cursor.node, &p->scx.dsq_list.node); 7089 } else { 7090 list_del_init(&kit->cursor.node); 7091 } 7092 7093 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); 7094 7095 return p; 7096 } 7097 7098 /** 7099 * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator 7100 * @it: iterator to destroy 7101 * 7102 * Undo bpf_iter_scx_dsq_new(). 
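 *
 * An open-coded walk which counts the tasks currently queued on a DSQ looks
 * roughly as follows (illustrative sketch; EXAMPLE_DSQ is hypothetical). The
 * bpf_for_each(scx_dsq, ...) macro in the example scheduler headers wraps
 * this same new/next/destroy sequence:
 *
 *	struct bpf_iter_scx_dsq it;
 *	struct task_struct *p;
 *	u32 nr = 0;
 *
 *	if (!bpf_iter_scx_dsq_new(&it, EXAMPLE_DSQ, 0))
 *		while ((p = bpf_iter_scx_dsq_next(&it)))
 *			nr++;
 *	bpf_iter_scx_dsq_destroy(&it);
 *
 * As noted in bpf_iter_scx_dsq_new(), destroy() is safe to call even when
 * new() returned an error.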
 */
__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
{
	struct bpf_iter_scx_dsq_kern *kit = (void *)it;

	if (!kit->dsq)
		return;

	if (!list_empty(&kit->cursor.node)) {
		unsigned long flags;

		raw_spin_lock_irqsave(&kit->dsq->lock, flags);
		list_del_init(&kit->cursor.node);
		raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
	}
	kit->dsq = NULL;
}

__bpf_kfunc_end_defs();

static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
			 char *fmt, unsigned long long *data, u32 data__sz)
{
	struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
	s32 ret;

	if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
	    (data__sz && !data)) {
		scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz);
		return -EINVAL;
	}

	ret = copy_from_kernel_nofault(data_buf, data, data__sz);
	if (ret < 0) {
		scx_kf_error("failed to read data fields (%d)", ret);
		return ret;
	}

	ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
				  &bprintf_data);
	if (ret < 0) {
		scx_kf_error("format preparation failed (%d)", ret);
		return ret;
	}

	ret = bstr_printf(line_buf, line_size, fmt,
			  bprintf_data.bin_args);
	bpf_bprintf_cleanup(&bprintf_data);
	if (ret < 0) {
		scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
		return ret;
	}

	return ret;
}

static s32 bstr_format(struct scx_bstr_buf *buf,
		       char *fmt, unsigned long long *data, u32 data__sz)
{
	return __bstr_format(buf->data, buf->line, sizeof(buf->line),
			     fmt, data, data__sz);
}

__bpf_kfunc_start_defs();

/**
 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
 * @exit_code: Exit value to pass to user space via struct scx_exit_info.
 * @fmt: error message format string
 * @data: format string parameters packaged using ___bpf_fill() macro
 * @data__sz: @data len, must end in '__sz' for the verifier
 *
 * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
 * disabling.
 */
__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
				   unsigned long long *data, u32 data__sz)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
	if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
		scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
	raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}

/**
 * scx_bpf_error_bstr - Indicate fatal error
 * @fmt: error message format string
 * @data: format string parameters packaged using ___bpf_fill() macro
 * @data__sz: @data len, must end in '__sz' for the verifier
 *
 * Indicate that the BPF scheduler encountered a fatal error and initiate ops
 * disabling.
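 *
 * BPF schedulers typically don't call this kfunc directly; the scx tooling
 * ships a convenience wrapper (e.g. the scx_bpf_error() macro) which packs
 * the varargs into @data using ___bpf_fill() before calling it.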
 */
__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
				    u32 data__sz)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
	if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
		scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
	raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}

/**
 * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler
 * @fmt: format string
 * @data: format string parameters packaged using ___bpf_fill() macro
 * @data__sz: @data len, must end in '__sz' for the verifier
 *
 * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and
 * dump_task() to generate extra debug dump specific to the BPF scheduler.
 *
 * The extra dump may be multiple lines. A single line may be split over
 * multiple calls. The last line is automatically terminated.
 */
__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
				   u32 data__sz)
{
	struct scx_dump_data *dd = &scx_dump_data;
	struct scx_bstr_buf *buf = &dd->buf;
	s32 ret;

	if (raw_smp_processor_id() != dd->cpu) {
		scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends");
		return;
	}

	/* append the formatted string to the line buf */
	ret = __bstr_format(buf->data, buf->line + dd->cursor,
			    sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
	if (ret < 0) {
		dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
			  dd->prefix, fmt, data, data__sz, ret);
		return;
	}

	dd->cursor += ret;
	dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line));

	if (!dd->cursor)
		return;

	/*
	 * If the line buf overflowed or ends in a newline, flush it into the
	 * dump. This is to allow the caller to generate a single line over
	 * multiple calls. As ops_dump_flush() can also handle multiple lines in
	 * the line buf, the only case which can lead to an unexpected
	 * truncation is when the caller keeps generating consecutive newlines
	 * in the middle of a line instead of at the end. Don't do that.
	 */
	if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n')
		ops_dump_flush();
}

/**
 * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
 * @cpu: CPU of interest
 *
 * Return the maximum relative capacity of @cpu in relation to the most
 * performant CPU in the system. The return value is in the range [1,
 * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur().
 */
__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
{
	if (kf_cpu_valid(cpu, NULL))
		return arch_scale_cpu_capacity(cpu);
	else
		return SCX_CPUPERF_ONE;
}

/**
 * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
 * @cpu: CPU of interest
 *
 * Return the current relative performance of @cpu in relation to its maximum.
 * The return value is in the range [1, %SCX_CPUPERF_ONE].
 *
 * The current performance level of a CPU in relation to the maximum performance
 * available in the system can be calculated as follows:
 *
 *   scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE
 *
 * The result is in the range [1, %SCX_CPUPERF_ONE].
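 *
 * For example, a BPF scheduler could compute the absolute performance level
 * as follows (illustrative only):
 *
 *	u64 abs_perf = scx_bpf_cpuperf_cap(cpu) * scx_bpf_cpuperf_cur(cpu) /
 *		       SCX_CPUPERF_ONE;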
 */
__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
{
	if (kf_cpu_valid(cpu, NULL))
		return arch_scale_freq_capacity(cpu);
	else
		return SCX_CPUPERF_ONE;
}

/**
 * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
 * @cpu: CPU of interest
 * @perf: target performance level [0, %SCX_CPUPERF_ONE]
 *
 * Set the target performance level of @cpu to @perf. @perf is in linear
 * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
 * schedutil cpufreq governor chooses the target frequency.
 *
 * The actual performance level chosen, CPU grouping, and the overhead and
 * latency of the operations are dependent on the hardware and cpufreq driver in
 * use. Consult hardware and cpufreq documentation for more information. The
 * current performance level can be monitored using scx_bpf_cpuperf_cur().
 */
__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{
	if (unlikely(perf > SCX_CPUPERF_ONE)) {
		scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
		return;
	}

	if (kf_cpu_valid(cpu, NULL)) {
		struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
		struct rq_flags rf;

		/*
		 * When called with an rq lock held, restrict the operation
		 * to the corresponding CPU to prevent ABBA deadlocks.
		 */
		if (locked_rq && rq != locked_rq) {
			scx_kf_error("Invalid target CPU %d", cpu);
			return;
		}

		/*
		 * If no rq lock is held, allow to operate on any CPU by
		 * acquiring the corresponding rq lock.
		 */
		if (!locked_rq) {
			rq_lock_irqsave(rq, &rf);
			update_rq_clock(rq);
		}

		rq->scx.cpuperf_target = perf;
		cpufreq_update_util(rq, 0);

		if (!locked_rq)
			rq_unlock_irqrestore(rq, &rf);
	}
}

/**
 * scx_bpf_nr_node_ids - Return the number of possible node IDs
 *
 * All valid node IDs in the system are smaller than the returned value.
 */
__bpf_kfunc u32 scx_bpf_nr_node_ids(void)
{
	return nr_node_ids;
}

/**
 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
 *
 * All valid CPU IDs in the system are smaller than the returned value.
 */
__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
{
	return nr_cpu_ids;
}

/**
 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
 */
__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
{
	return cpu_possible_mask;
}

/**
 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
 */
__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
{
	return cpu_online_mask;
}

/**
 * scx_bpf_put_cpumask - Release a possible/online cpumask
 * @cpumask: cpumask to release
 */
__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
{
	/*
	 * Empty function body because we aren't actually acquiring or releasing
	 * a reference to a global cpumask, which is read-only in the caller and
	 * is never released. The acquire / release semantics here are just used
	 * to make the cpumask a trusted pointer in the caller.
	 */
}
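/*
 * Illustrative BPF-side pairing (not part of this file): a mask obtained from
 * one of the getters above must be released with scx_bpf_put_cpumask() before
 * the program returns, e.g.:
 *
 *	const struct cpumask *online = scx_bpf_get_online_cpumask();
 *
 *	... inspect the mask ...
 *
 *	scx_bpf_put_cpumask(online);
 */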

/**
 * scx_bpf_task_running - Is task currently running?
 * @p: task of interest
 */
__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
{
	return task_rq(p)->curr == p;
}

/**
 * scx_bpf_task_cpu - CPU a task is currently associated with
 * @p: task of interest
 */
__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
{
	return task_cpu(p);
}

/**
 * scx_bpf_cpu_rq - Fetch the rq of a CPU
 * @cpu: CPU of the rq
 */
__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
{
	if (!kf_cpu_valid(cpu, NULL))
		return NULL;

	return cpu_rq(cpu);
}

/**
 * scx_bpf_task_cgroup - Return the sched cgroup of a task
 * @p: task of interest
 *
 * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
 * from the scheduler's POV. SCX operations should use this function to
 * determine @p's current cgroup as, unlike following @p->cgroups,
 * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
 * rq-locked operations. Can be called on the parameter tasks of rq-locked
 * operations. The restriction guarantees that @p's rq is locked by the caller.
 */
#ifdef CONFIG_CGROUP_SCHED
__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
{
	struct task_group *tg = p->sched_task_group;
	struct cgroup *cgrp = &cgrp_dfl_root.cgrp;

	if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
		goto out;

	cgrp = tg_cgrp(tg);

out:
	cgroup_get(cgrp);
	return cgrp;
}
#endif

/**
 * scx_bpf_now - Returns a high-performance monotonically non-decreasing
 * clock for the current CPU. The clock returned is in nanoseconds.
 *
 * It provides the following properties:
 *
 * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently
 *  to account for execution time and track tasks' runtime properties.
 *  Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which
 *  eventually reads a hardware timestamp counter -- is neither performant nor
 *  scalable. scx_bpf_now() aims to provide a high-performance clock by
 *  using the rq clock in the scheduler core whenever possible.
 *
 * 2) High enough resolution for the BPF scheduler use cases: In most BPF
 *  scheduler use cases, the required clock resolution is lower than the most
 *  accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically
 *  uses the rq clock in the scheduler core whenever it is valid. It considers
 *  that the rq clock is valid from the time the rq clock is updated
 *  (update_rq_clock) until the rq is unlocked (rq_unpin_lock).
 *
 * 3) Monotonically non-decreasing clock on the same CPU: scx_bpf_now()
 *  guarantees that the clock never goes backward when comparing values read
 *  on the same CPU. When comparing clocks across different CPUs, there is no
 *  such guarantee -- the clock can appear to go backward. The clock is
 *  monotonically *non-decreasing* in that two scx_bpf_now() calls on the same
 *  CPU return the same value for as long as the same rq clock remains valid.
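 *
 * A typical use is therefore per-CPU timestamping, e.g. reading scx_bpf_now()
 * when a task starts running on a CPU and again on the same CPU when it stops
 * to estimate how long it ran; comparing values across CPUs is not meaningful.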
 */
__bpf_kfunc u64 scx_bpf_now(void)
{
	struct rq *rq;
	u64 clock;

	preempt_disable();

	rq = this_rq();
	if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) {
		/*
		 * If the rq clock is valid, use the cached rq clock.
		 *
		 * Note that scx_bpf_now() is re-entrant between a process
		 * context and an interrupt context (e.g., timer interrupt).
		 * However, we don't need to consider the race between them
		 * because such a race is not observable from a caller.
		 */
		clock = READ_ONCE(rq->scx.clock);
	} else {
		/*
		 * Otherwise, return a fresh rq clock.
		 *
		 * The rq clock is updated outside of the rq lock.
		 * In this case, keep the updated rq clock invalid so the next
		 * kfunc call outside the rq lock gets a fresh rq clock.
		 */
		clock = sched_clock_cpu(cpu_of(rq));
	}

	preempt_enable();

	return clock;
}

static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
{
	struct scx_event_stats *e_cpu;
	int cpu;

	/* Aggregate per-CPU event counters into @events. */
	memset(events, 0, sizeof(*events));
	for_each_possible_cpu(cpu) {
		e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
		scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
		scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
		scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
		scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
		scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
		scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL);
		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
	}
}

/*
 * scx_bpf_events - Copy the aggregated system-wide event counters to @events
 * @events: output buffer from a BPF program
 * @events__sz: @events len, must end in '__sz' for the verifier
 */
__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
				size_t events__sz)
{
	struct scx_sched *sch;
	struct scx_event_stats e_sys;

	rcu_read_lock();
	sch = rcu_dereference(scx_root);
	if (sch)
		scx_read_events(sch, &e_sys);
	else
		memset(&e_sys, 0, sizeof(e_sys));
	rcu_read_unlock();

	/*
	 * We cannot entirely trust a BPF-provided size since a BPF program
	 * might be compiled against a different vmlinux.h, of which
	 * scx_event_stats would be larger (a newer vmlinux.h) or smaller
	 * (an older vmlinux.h). Hence, we use the smaller size to avoid
	 * memory corruption.
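	 * Copying only min(events__sz, sizeof(*events)) bytes means an older
	 * program simply gets the counters it knows about, while the trailing
	 * fields of a newer program's struct are left untouched.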
	 */
	events__sz = min(events__sz, sizeof(*events));
	memcpy(events, &e_sys, events__sz);
}

__bpf_kfunc_end_defs();

BTF_KFUNCS_START(scx_kfunc_ids_any)
BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
BTF_ID_FLAGS(func, scx_bpf_nr_node_ids)
BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
#ifdef CONFIG_CGROUP_SCHED
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
BTF_ID_FLAGS(func, scx_bpf_now)
BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(scx_kfunc_ids_any)

static const struct btf_kfunc_id_set scx_kfunc_set_any = {
	.owner = THIS_MODULE,
	.set = &scx_kfunc_ids_any,
};

static int __init scx_init(void)
{
	int ret;

	/*
	 * kfunc registration can't be done from init_sched_ext_class() as
	 * register_btf_kfunc_id_set() needs most of the system to be up.
	 *
	 * Some kfuncs are context-sensitive and can only be called from
	 * specific SCX ops. They are grouped into BTF sets accordingly.
	 * Unfortunately, BPF currently doesn't have a way of enforcing such
	 * restrictions. Eventually, the verifier should be able to enforce
	 * them. For now, register them the same and make each kfunc explicitly
	 * check using scx_kf_allowed().
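	 *
	 * Note that the same set may be registered for multiple program types
	 * below (struct_ops, tracing, syscall) so that the kfuncs are also
	 * available to tracing and syscall programs.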
	 */
	if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
					     &scx_kfunc_set_enqueue_dispatch)) ||
	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
					     &scx_kfunc_set_dispatch)) ||
	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
					     &scx_kfunc_set_cpu_release)) ||
	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
					     &scx_kfunc_set_unlocked)) ||
	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
					     &scx_kfunc_set_unlocked)) ||
	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
					     &scx_kfunc_set_any)) ||
	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					     &scx_kfunc_set_any)) ||
	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
					     &scx_kfunc_set_any))) {
		pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret);
		return ret;
	}

	ret = scx_idle_init();
	if (ret) {
		pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret);
		return ret;
	}

	ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
	if (ret) {
		pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
		return ret;
	}

	ret = register_pm_notifier(&scx_pm_notifier);
	if (ret) {
		pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret);
		return ret;
	}

	scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj);
	if (!scx_kset) {
		pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n");
		return -ENOMEM;
	}

	ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group);
	if (ret < 0) {
		pr_err("sched_ext: Failed to add global attributes\n");
		return ret;
	}

	return 0;
}
__initcall(scx_init);