/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#define SCX_OP_IDX(op)	(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))

enum scx_consts {
	SCX_DSP_DFL_MAX_BATCH		= 32,
	SCX_DSP_MAX_LOOPS		= 32,
	SCX_WATCHDOG_MAX_TIMEOUT	= 30 * HZ,

	SCX_EXIT_BT_LEN			= 64,
	SCX_EXIT_MSG_LEN		= 1024,
	SCX_EXIT_DUMP_DFL_LEN		= 32768,
};

enum scx_exit_kind {
	SCX_EXIT_NONE,
	SCX_EXIT_DONE,

	SCX_EXIT_UNREG = 64,	/* user-space initiated unregistration */
	SCX_EXIT_UNREG_BPF,	/* BPF-initiated unregistration */
	SCX_EXIT_UNREG_KERN,	/* kernel-initiated unregistration */
	SCX_EXIT_SYSRQ,		/* requested by 'S' sysrq */

	SCX_EXIT_ERROR = 1024,	/* runtime error, error msg contains details */
	SCX_EXIT_ERROR_BPF,	/* ERROR but triggered through scx_bpf_error() */
	SCX_EXIT_ERROR_STALL,	/* watchdog detected stalled runnable tasks */
};

/*
 * An exit code can be specified when exiting with scx_bpf_exit() or
 * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
 * respectively. The codes are 64-bit values of the format:
 *
 *   Bits: [63  ..  48 47   ..  32 31 .. 0]
 *         [ SYS ACT ] [ SYS RSN ] [ USR  ]
 *
 *   SYS ACT: System-defined exit actions
 *   SYS RSN: System-defined exit reasons
 *   USR    : User-defined exit codes and reasons
 *
 * Using the above, users may communicate intention and context by ORing system
 * actions and/or system reasons with a user-defined exit code.
 */
enum scx_exit_code {
	/* Reasons */
	SCX_ECODE_RSN_HOTPLUG	= 1LLU << 32,

	/* Actions */
	SCX_ECODE_ACT_RESTART	= 1LLU << 48,
};

/*
 * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
 * being disabled.
 */
struct scx_exit_info {
	/* %SCX_EXIT_* - broad category of the exit reason */
	enum scx_exit_kind	kind;

	/* exit code if gracefully exiting */
	s64			exit_code;

	/* textual representation of the above */
	const char		*reason;

	/* backtrace if exiting due to an error */
	unsigned long		*bt;
	u32			bt_len;

	/* informational message */
	char			*msg;

	/* debug dump */
	char			*dump;
};

/* sched_ext_ops.flags */
enum scx_ops_flags {
	/*
	 * Keep built-in idle tracking even if ops.update_idle() is implemented.
	 */
	SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,

	/*
	 * By default, if there are no other tasks to run on the CPU, the ext
	 * core keeps running the current task even after its slice expires. If
	 * this flag is specified, such tasks are passed to ops.enqueue() with
	 * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
	 */
	SCX_OPS_ENQ_LAST = 1LLU << 1,

	/*
	 * An exiting task may schedule after PF_EXITING is set. In such cases,
	 * bpf_task_from_pid() may not be able to find the task and if the BPF
	 * scheduler depends on pid lookup for dispatching, the task will be
	 * lost, leading to various issues including RCU grace period stalls.
	 *
	 * To mask this problem, by default, unhashed tasks are automatically
	 * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
	 * depend on pid lookups and wants to handle these tasks directly, the
	 * following flag can be used.
	 */
	SCX_OPS_ENQ_EXITING = 1LLU << 2,

	/*
	 * If set, only tasks with policy set to SCHED_EXT are attached to
	 * sched_ext.
If clear, SCHED_NORMAL tasks are also included. 112 */ 113 SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, 114 115 SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | 116 SCX_OPS_ENQ_LAST | 117 SCX_OPS_ENQ_EXITING | 118 SCX_OPS_SWITCH_PARTIAL, 119 }; 120 121 /* argument container for ops.init_task() */ 122 struct scx_init_task_args { 123 /* 124 * Set if ops.init_task() is being invoked on the fork path, as opposed 125 * to the scheduler transition path. 126 */ 127 bool fork; 128 }; 129 130 /* argument container for ops.exit_task() */ 131 struct scx_exit_task_args { 132 /* Whether the task exited before running on sched_ext. */ 133 bool cancelled; 134 }; 135 136 enum scx_cpu_preempt_reason { 137 /* next task is being scheduled by &sched_class_rt */ 138 SCX_CPU_PREEMPT_RT, 139 /* next task is being scheduled by &sched_class_dl */ 140 SCX_CPU_PREEMPT_DL, 141 /* next task is being scheduled by &sched_class_stop */ 142 SCX_CPU_PREEMPT_STOP, 143 /* unknown reason for SCX being preempted */ 144 SCX_CPU_PREEMPT_UNKNOWN, 145 }; 146 147 /* 148 * Argument container for ops->cpu_acquire(). Currently empty, but may be 149 * expanded in the future. 150 */ 151 struct scx_cpu_acquire_args {}; 152 153 /* argument container for ops->cpu_release() */ 154 struct scx_cpu_release_args { 155 /* the reason the CPU was preempted */ 156 enum scx_cpu_preempt_reason reason; 157 158 /* the task that's going to be scheduled on the CPU */ 159 struct task_struct *task; 160 }; 161 162 /* 163 * Informational context provided to dump operations. 164 */ 165 struct scx_dump_ctx { 166 enum scx_exit_kind kind; 167 s64 exit_code; 168 const char *reason; 169 u64 at_ns; 170 u64 at_jiffies; 171 }; 172 173 /** 174 * struct sched_ext_ops - Operation table for BPF scheduler implementation 175 * 176 * Userland can implement an arbitrary scheduling policy by implementing and 177 * loading operations in this table. 178 */ 179 struct sched_ext_ops { 180 /** 181 * select_cpu - Pick the target CPU for a task which is being woken up 182 * @p: task being woken up 183 * @prev_cpu: the cpu @p was on before sleeping 184 * @wake_flags: SCX_WAKE_* 185 * 186 * Decision made here isn't final. @p may be moved to any CPU while it 187 * is getting dispatched for execution later. However, as @p is not on 188 * the rq at this point, getting the eventual execution CPU right here 189 * saves a small bit of overhead down the line. 190 * 191 * If an idle CPU is returned, the CPU is kicked and will try to 192 * dispatch. While an explicit custom mechanism can be added, 193 * select_cpu() serves as the default way to wake up idle CPUs. 194 * 195 * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p 196 * is dispatched, the ops.enqueue() callback will be skipped. Finally, 197 * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the 198 * local DSQ of whatever CPU is returned by this callback. 199 */ 200 s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); 201 202 /** 203 * enqueue - Enqueue a task on the BPF scheduler 204 * @p: task being enqueued 205 * @enq_flags: %SCX_ENQ_* 206 * 207 * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch() 208 * or enqueue on the BPF scheduler. If not directly dispatched, the bpf 209 * scheduler owns @p and if it fails to dispatch @p, the task will 210 * stall. 211 * 212 * If @p was dispatched from ops.select_cpu(), this callback is 213 * skipped. 
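	 *
	 * For illustration only, a minimal ops.enqueue() might hand every task
	 * to the global DSQ with the default slice (a sketch assuming the
	 * BPF_STRUCT_OPS() convenience macro from the example schedulers'
	 * headers, which is not defined in this file):
	 *
	 *	void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p,
	 *			    u64 enq_flags)
	 *	{
	 *		// every task goes to the shared global DSQ
	 *		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL,
	 *				 enq_flags);
	 *	}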
214 */ 215 void (*enqueue)(struct task_struct *p, u64 enq_flags); 216 217 /** 218 * dequeue - Remove a task from the BPF scheduler 219 * @p: task being dequeued 220 * @deq_flags: %SCX_DEQ_* 221 * 222 * Remove @p from the BPF scheduler. This is usually called to isolate 223 * the task while updating its scheduling properties (e.g. priority). 224 * 225 * The ext core keeps track of whether the BPF side owns a given task or 226 * not and can gracefully ignore spurious dispatches from BPF side, 227 * which makes it safe to not implement this method. However, depending 228 * on the scheduling logic, this can lead to confusing behaviors - e.g. 229 * scheduling position not being updated across a priority change. 230 */ 231 void (*dequeue)(struct task_struct *p, u64 deq_flags); 232 233 /** 234 * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs 235 * @cpu: CPU to dispatch tasks for 236 * @prev: previous task being switched out 237 * 238 * Called when a CPU's local dsq is empty. The operation should dispatch 239 * one or more tasks from the BPF scheduler into the DSQs using 240 * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using 241 * scx_bpf_consume(). 242 * 243 * The maximum number of times scx_bpf_dispatch() can be called without 244 * an intervening scx_bpf_consume() is specified by 245 * ops.dispatch_max_batch. See the comments on top of the two functions 246 * for more details. 247 * 248 * When not %NULL, @prev is an SCX task with its slice depleted. If 249 * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in 250 * @prev->scx.flags, it is not enqueued yet and will be enqueued after 251 * ops.dispatch() returns. To keep executing @prev, return without 252 * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. 253 */ 254 void (*dispatch)(s32 cpu, struct task_struct *prev); 255 256 /** 257 * tick - Periodic tick 258 * @p: task running currently 259 * 260 * This operation is called every 1/HZ seconds on CPUs which are 261 * executing an SCX task. Setting @p->scx.slice to 0 will trigger an 262 * immediate dispatch cycle on the CPU. 263 */ 264 void (*tick)(struct task_struct *p); 265 266 /** 267 * runnable - A task is becoming runnable on its associated CPU 268 * @p: task becoming runnable 269 * @enq_flags: %SCX_ENQ_* 270 * 271 * This and the following three functions can be used to track a task's 272 * execution state transitions. A task becomes ->runnable() on a CPU, 273 * and then goes through one or more ->running() and ->stopping() pairs 274 * as it runs on the CPU, and eventually becomes ->quiescent() when it's 275 * done running on the CPU. 276 * 277 * @p is becoming runnable on the CPU because it's 278 * 279 * - waking up (%SCX_ENQ_WAKEUP) 280 * - being moved from another CPU 281 * - being restored after temporarily taken off the queue for an 282 * attribute change. 283 * 284 * This and ->enqueue() are related but not coupled. This operation 285 * notifies @p's state transition and may not be followed by ->enqueue() 286 * e.g. when @p is being dispatched to a remote CPU, or when @p is 287 * being enqueued on a CPU experiencing a hotplug event. Likewise, a 288 * task may be ->enqueue()'d without being preceded by this operation 289 * e.g. after exhausting its slice. 290 */ 291 void (*runnable)(struct task_struct *p, u64 enq_flags); 292 293 /** 294 * running - A task is starting to run on its associated CPU 295 * @p: task starting to run 296 * 297 * See ->runnable() for explanation on the task state notifiers. 
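	 *
	 * As a sketch, ->running() and ->stopping() can be paired to measure
	 * how long @p spends on the CPU. lookup_started_at() and
	 * account_cpu_time() below are hypothetical helpers (e.g. backed by a
	 * BPF task storage map), and BPF_STRUCT_OPS() is assumed from the
	 * example schedulers' headers:
	 *
	 *	void BPF_STRUCT_OPS(sketch_running, struct task_struct *p)
	 *	{
	 *		u64 *at = lookup_started_at(p);	// hypothetical helper
	 *
	 *		if (at)
	 *			*at = bpf_ktime_get_ns();
	 *	}
	 *
	 *	void BPF_STRUCT_OPS(sketch_stopping, struct task_struct *p,
	 *			    bool runnable)
	 *	{
	 *		u64 *at = lookup_started_at(p);
	 *
	 *		if (at && *at)
	 *			// hypothetical accounting helper
	 *			account_cpu_time(p, bpf_ktime_get_ns() - *at);
	 *	}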
	 */
	void (*running)(struct task_struct *p);

	/**
	 * stopping - A task is stopping execution
	 * @p: task that is stopping
	 * @runnable: is task @p still runnable?
	 *
	 * See ->runnable() for explanation on the task state notifiers. If
	 * !@runnable, ->quiescent() will be invoked after this operation
	 * returns.
	 */
	void (*stopping)(struct task_struct *p, bool runnable);

	/**
	 * quiescent - A task is becoming not runnable on its associated CPU
	 * @p: task becoming not runnable
	 * @deq_flags: %SCX_DEQ_*
	 *
	 * See ->runnable() for explanation on the task state notifiers.
	 *
	 * @p is becoming quiescent on the CPU because it's
	 *
	 * - sleeping (%SCX_DEQ_SLEEP)
	 * - being moved to another CPU
	 * - being temporarily taken off the queue for an attribute change
	 *   (%SCX_DEQ_SAVE)
	 *
	 * This and ->dequeue() are related but not coupled. This operation
	 * notifies @p's state transition and may not be preceded by ->dequeue()
	 * e.g. when @p is being dispatched to a remote CPU.
	 */
	void (*quiescent)(struct task_struct *p, u64 deq_flags);

	/**
	 * yield - Yield CPU
	 * @from: yielding task
	 * @to: optional yield target task
	 *
	 * If @to is NULL, @from is yielding the CPU to other runnable tasks.
	 * The BPF scheduler should ensure that other available tasks are
	 * dispatched before the yielding task. Return value is ignored in this
	 * case.
	 *
	 * If @to is not NULL, @from wants to yield the CPU to @to. If the BPF
	 * scheduler can implement the request, return %true; otherwise, %false.
	 */
	bool (*yield)(struct task_struct *from, struct task_struct *to);

	/**
	 * set_weight - Set task weight
	 * @p: task to set weight for
	 * @weight: new weight [1..10000]
	 *
	 * Update @p's weight to @weight.
	 */
	void (*set_weight)(struct task_struct *p, u32 weight);

	/**
	 * set_cpumask - Set CPU affinity
	 * @p: task to set CPU affinity for
	 * @cpumask: cpumask of cpus that @p can run on
	 *
	 * Update @p's CPU affinity to @cpumask.
	 */
	void (*set_cpumask)(struct task_struct *p,
			    const struct cpumask *cpumask);

	/**
	 * update_idle - Update the idle state of a CPU
	 * @cpu: CPU to update the idle state for
	 * @idle: whether entering or exiting the idle state
	 *
	 * This operation is called when @rq's CPU enters or leaves the idle
	 * state. By default, implementing this operation disables the built-in
	 * idle CPU tracking and the following helpers become unavailable:
	 *
	 * - scx_bpf_select_cpu_dfl()
	 * - scx_bpf_test_and_clear_cpu_idle()
	 * - scx_bpf_pick_idle_cpu()
	 *
	 * The user must also implement ops.select_cpu() as the default
	 * implementation relies on scx_bpf_select_cpu_dfl().
	 *
	 * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
	 * tracking.
	 */
	void (*update_idle)(s32 cpu, bool idle);

	/**
	 * cpu_acquire - A CPU is becoming available to the BPF scheduler
	 * @cpu: The CPU being acquired by the BPF scheduler.
	 * @args: Acquire arguments, see the struct definition.
	 *
	 * A CPU that was previously released from the BPF scheduler is now once
	 * again under its control.
	 */
	void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);

	/**
	 * cpu_release - A CPU is taken away from the BPF scheduler
	 * @cpu: The CPU being released by the BPF scheduler.
400 * @args: Release arguments, see the struct definition. 401 * 402 * The specified CPU is no longer under the control of the BPF 403 * scheduler. This could be because it was preempted by a higher 404 * priority sched_class, though there may be other reasons as well. The 405 * caller should consult @args->reason to determine the cause. 406 */ 407 void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); 408 409 /** 410 * init_task - Initialize a task to run in a BPF scheduler 411 * @p: task to initialize for BPF scheduling 412 * @args: init arguments, see the struct definition 413 * 414 * Either we're loading a BPF scheduler or a new task is being forked. 415 * Initialize @p for BPF scheduling. This operation may block and can 416 * be used for allocations, and is called exactly once for a task. 417 * 418 * Return 0 for success, -errno for failure. An error return while 419 * loading will abort loading of the BPF scheduler. During a fork, it 420 * will abort that specific fork. 421 */ 422 s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); 423 424 /** 425 * exit_task - Exit a previously-running task from the system 426 * @p: task to exit 427 * 428 * @p is exiting or the BPF scheduler is being unloaded. Perform any 429 * necessary cleanup for @p. 430 */ 431 void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); 432 433 /** 434 * enable - Enable BPF scheduling for a task 435 * @p: task to enable BPF scheduling for 436 * 437 * Enable @p for BPF scheduling. enable() is called on @p any time it 438 * enters SCX, and is always paired with a matching disable(). 439 */ 440 void (*enable)(struct task_struct *p); 441 442 /** 443 * disable - Disable BPF scheduling for a task 444 * @p: task to disable BPF scheduling for 445 * 446 * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. 447 * Disable BPF scheduling for @p. A disable() call is always matched 448 * with a prior enable() call. 449 */ 450 void (*disable)(struct task_struct *p); 451 452 /** 453 * dump - Dump BPF scheduler state on error 454 * @ctx: debug dump context 455 * 456 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. 457 */ 458 void (*dump)(struct scx_dump_ctx *ctx); 459 460 /** 461 * dump_cpu - Dump BPF scheduler state for a CPU on error 462 * @ctx: debug dump context 463 * @cpu: CPU to generate debug dump for 464 * @idle: @cpu is currently idle without any runnable tasks 465 * 466 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for 467 * @cpu. If @idle is %true and this operation doesn't produce any 468 * output, @cpu is skipped for dump. 469 */ 470 void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); 471 472 /** 473 * dump_task - Dump BPF scheduler state for a runnable task on error 474 * @ctx: debug dump context 475 * @p: runnable task to generate debug dump for 476 * 477 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for 478 * @p. 479 */ 480 void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); 481 482 /* 483 * All online ops must come before ops.cpu_online(). 484 */ 485 486 /** 487 * cpu_online - A CPU became online 488 * @cpu: CPU which just came up 489 * 490 * @cpu just came online. @cpu will not call ops.enqueue() or 491 * ops.dispatch(), nor run tasks associated with other CPUs beforehand. 492 */ 493 void (*cpu_online)(s32 cpu); 494 495 /** 496 * cpu_offline - A CPU is going offline 497 * @cpu: CPU which is going offline 498 * 499 * @cpu is going offline. 
@cpu will not call ops.enqueue() or 500 * ops.dispatch(), nor run tasks associated with other CPUs afterwards. 501 */ 502 void (*cpu_offline)(s32 cpu); 503 504 /* 505 * All CPU hotplug ops must come before ops.init(). 506 */ 507 508 /** 509 * init - Initialize the BPF scheduler 510 */ 511 s32 (*init)(void); 512 513 /** 514 * exit - Clean up after the BPF scheduler 515 * @info: Exit info 516 */ 517 void (*exit)(struct scx_exit_info *info); 518 519 /** 520 * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch 521 */ 522 u32 dispatch_max_batch; 523 524 /** 525 * flags - %SCX_OPS_* flags 526 */ 527 u64 flags; 528 529 /** 530 * timeout_ms - The maximum amount of time, in milliseconds, that a 531 * runnable task should be able to wait before being scheduled. The 532 * maximum timeout may not exceed the default timeout of 30 seconds. 533 * 534 * Defaults to the maximum allowed timeout value of 30 seconds. 535 */ 536 u32 timeout_ms; 537 538 /** 539 * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default 540 * value of 32768 is used. 541 */ 542 u32 exit_dump_len; 543 544 /** 545 * hotplug_seq - A sequence number that may be set by the scheduler to 546 * detect when a hotplug event has occurred during the loading process. 547 * If 0, no detection occurs. Otherwise, the scheduler will fail to 548 * load if the sequence number does not match @scx_hotplug_seq on the 549 * enable path. 550 */ 551 u64 hotplug_seq; 552 553 /** 554 * name - BPF scheduler's name 555 * 556 * Must be a non-zero valid BPF object name including only isalnum(), 557 * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the 558 * BPF scheduler is enabled. 559 */ 560 char name[SCX_OPS_NAME_LEN]; 561 }; 562 563 enum scx_opi { 564 SCX_OPI_BEGIN = 0, 565 SCX_OPI_NORMAL_BEGIN = 0, 566 SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), 567 SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), 568 SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), 569 SCX_OPI_END = SCX_OP_IDX(init), 570 }; 571 572 enum scx_wake_flags { 573 /* expose select WF_* flags as enums */ 574 SCX_WAKE_FORK = WF_FORK, 575 SCX_WAKE_TTWU = WF_TTWU, 576 SCX_WAKE_SYNC = WF_SYNC, 577 }; 578 579 enum scx_enq_flags { 580 /* expose select ENQUEUE_* flags as enums */ 581 SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, 582 SCX_ENQ_HEAD = ENQUEUE_HEAD, 583 584 /* high 32bits are SCX specific */ 585 586 /* 587 * Set the following to trigger preemption when calling 588 * scx_bpf_dispatch() with a local dsq as the target. The slice of the 589 * current task is cleared to zero and the CPU is kicked into the 590 * scheduling path. Implies %SCX_ENQ_HEAD. 591 */ 592 SCX_ENQ_PREEMPT = 1LLU << 32, 593 594 /* 595 * The task being enqueued was previously enqueued on the current CPU's 596 * %SCX_DSQ_LOCAL, but was removed from it in a call to the 597 * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was 598 * invoked in a ->cpu_release() callback, and the task is again 599 * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the 600 * task will not be scheduled on the CPU until at least the next invocation 601 * of the ->cpu_acquire() callback. 602 */ 603 SCX_ENQ_REENQ = 1LLU << 40, 604 605 /* 606 * The task being enqueued is the only task available for the cpu. By 607 * default, ext core keeps executing such tasks but when 608 * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the 609 * %SCX_ENQ_LAST flag set. 
	 *
	 * If the BPF scheduler wants to continue executing the task,
	 * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately.
	 * If the task gets queued on a different DSQ or the BPF side, the BPF
	 * scheduler is responsible for triggering a follow-up scheduling event.
	 * Otherwise, execution may stall.
	 */
	SCX_ENQ_LAST		= 1LLU << 41,

	/* high 8 bits are internal */
	__SCX_ENQ_INTERNAL_MASK	= 0xffLLU << 56,

	SCX_ENQ_CLEAR_OPSS	= 1LLU << 56,
};

enum scx_deq_flags {
	/* expose select DEQUEUE_* flags as enums */
	SCX_DEQ_SLEEP		= DEQUEUE_SLEEP,
};

enum scx_pick_idle_cpu_flags {
	SCX_PICK_IDLE_CORE	= 1LLU << 0,	/* pick a CPU whose SMT siblings are also idle */
};

enum scx_kick_flags {
	/*
	 * Kick the target CPU if idle. Guarantees that the target CPU goes
	 * through at least one full scheduling cycle before going idle. If the
	 * target CPU can be determined to be currently not idle and going to go
	 * through a scheduling cycle before going idle, noop.
	 */
	SCX_KICK_IDLE		= 1LLU << 0,

	/*
	 * Preempt the current task and execute the dispatch path. If the
	 * current task of the target CPU is an SCX task, its ->scx.slice is
	 * cleared to zero before the scheduling path is invoked so that the
	 * task expires and the dispatch path is invoked.
	 */
	SCX_KICK_PREEMPT	= 1LLU << 1,

	/*
	 * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
	 * return after the target CPU finishes picking the next task.
	 */
	SCX_KICK_WAIT		= 1LLU << 2,
};

enum scx_ops_enable_state {
	SCX_OPS_PREPPING,
	SCX_OPS_ENABLING,
	SCX_OPS_ENABLED,
	SCX_OPS_DISABLING,
	SCX_OPS_DISABLED,
};

static const char *scx_ops_enable_state_str[] = {
	[SCX_OPS_PREPPING]	= "prepping",
	[SCX_OPS_ENABLING]	= "enabling",
	[SCX_OPS_ENABLED]	= "enabled",
	[SCX_OPS_DISABLING]	= "disabling",
	[SCX_OPS_DISABLED]	= "disabled",
};

/*
 * sched_ext_entity->ops_state
 *
 * Used to track the task ownership between the SCX core and the BPF scheduler.
 * State transitions look as follows:
 *
 * NONE -> QUEUEING -> QUEUED -> DISPATCHING
 *   ^              |                 |
 *   |              v                 v
 *   \-------------------------------/
 *
 * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
 * sites for explanations on the conditions being waited upon and why they are
 * safe. Transitions out of them into NONE or QUEUED must store_release and the
 * waiters should load_acquire.
 *
 * Tracking scx_ops_state enables the sched_ext core to reliably determine
 * whether any given task can be dispatched by the BPF scheduler at all times
 * and thus relaxes the requirements on the BPF scheduler. This allows the BPF
 * scheduler to try to dispatch any task anytime regardless of its state as the
 * SCX core can safely reject invalid dispatches.
 */
enum scx_ops_state {
	SCX_OPSS_NONE,		/* owned by the SCX core */
	SCX_OPSS_QUEUEING,	/* in transit to the BPF scheduler */
	SCX_OPSS_QUEUED,	/* owned by the BPF scheduler */
	SCX_OPSS_DISPATCHING,	/* in transit back to the SCX core */

	/*
	 * QSEQ brands each QUEUED instance so that, when dispatch races
	 * dequeue/requeue, the dispatcher can tell whether it still has a claim
	 * on the task being dispatched.
706 * 707 * As some 32bit archs can't do 64bit store_release/load_acquire, 708 * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on 709 * 32bit machines. The dispatch race window QSEQ protects is very narrow 710 * and runs with IRQ disabled. 30 bits should be sufficient. 711 */ 712 SCX_OPSS_QSEQ_SHIFT = 2, 713 }; 714 715 /* Use macros to ensure that the type is unsigned long for the masks */ 716 #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) 717 #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) 718 719 /* 720 * During exit, a task may schedule after losing its PIDs. When disabling the 721 * BPF scheduler, we need to be able to iterate tasks in every state to 722 * guarantee system safety. Maintain a dedicated task list which contains every 723 * task between its fork and eventual free. 724 */ 725 static DEFINE_SPINLOCK(scx_tasks_lock); 726 static LIST_HEAD(scx_tasks); 727 728 /* ops enable/disable */ 729 static struct kthread_worker *scx_ops_helper; 730 static DEFINE_MUTEX(scx_ops_enable_mutex); 731 DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); 732 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 733 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); 734 static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); 735 static bool scx_switching_all; 736 DEFINE_STATIC_KEY_FALSE(__scx_switched_all); 737 738 static struct sched_ext_ops scx_ops; 739 static bool scx_warned_zero_slice; 740 741 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); 742 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); 743 DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); 744 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); 745 746 struct static_key_false scx_has_op[SCX_OPI_END] = 747 { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; 748 749 static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); 750 static struct scx_exit_info *scx_exit_info; 751 752 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); 753 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); 754 755 /* 756 * The maximum amount of time in jiffies that a task may be runnable without 757 * being scheduled on a CPU. If this timeout is exceeded, it will trigger 758 * scx_ops_error(). 759 */ 760 static unsigned long scx_watchdog_timeout; 761 762 /* 763 * The last time the delayed work was run. This delayed work relies on 764 * ksoftirqd being able to run to service timer interrupts, so it's possible 765 * that this work itself could get wedged. To account for this, we check that 766 * it's not stalled in the timer tick, and trigger an error if it is. 767 */ 768 static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; 769 770 static struct delayed_work scx_watchdog_work; 771 772 /* idle tracking */ 773 #ifdef CONFIG_SMP 774 #ifdef CONFIG_CPUMASK_OFFSTACK 775 #define CL_ALIGNED_IF_ONSTACK 776 #else 777 #define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp 778 #endif 779 780 static struct { 781 cpumask_var_t cpu; 782 cpumask_var_t smt; 783 } idle_masks CL_ALIGNED_IF_ONSTACK; 784 785 #endif /* CONFIG_SMP */ 786 787 /* for %SCX_KICK_WAIT */ 788 static unsigned long __percpu *scx_kick_cpus_pnt_seqs; 789 790 /* 791 * Direct dispatch marker. 792 * 793 * Non-NULL values are used for direct dispatch from enqueue path. A valid 794 * pointer points to the task currently being enqueued. An ERR_PTR value is used 795 * to indicate that direct dispatch has already happened. 
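 *
 * For reference, the BPF-side pattern which exercises this path is a direct
 * dispatch from ops.select_cpu() or ops.enqueue(), e.g. (a sketch assuming the
 * BPF_STRUCT_OPS() macro from the example schedulers' headers):
 *
 *	s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p,
 *			   s32 prev_cpu, u64 wake_flags)
 *	{
 *		bool is_idle = false;
 *		s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags,
 *						 &is_idle);
 *
 *		if (is_idle)
 *			// direct dispatch; ops.enqueue() is skipped for @p
 *			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 *		return cpu;
 *	}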
796 */ 797 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); 798 799 /* dispatch queues */ 800 static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; 801 802 static const struct rhashtable_params dsq_hash_params = { 803 .key_len = 8, 804 .key_offset = offsetof(struct scx_dispatch_q, id), 805 .head_offset = offsetof(struct scx_dispatch_q, hash_node), 806 }; 807 808 static struct rhashtable dsq_hash; 809 static LLIST_HEAD(dsqs_to_free); 810 811 /* dispatch buf */ 812 struct scx_dsp_buf_ent { 813 struct task_struct *task; 814 unsigned long qseq; 815 u64 dsq_id; 816 u64 enq_flags; 817 }; 818 819 static u32 scx_dsp_max_batch; 820 821 struct scx_dsp_ctx { 822 struct rq *rq; 823 struct rq_flags *rf; 824 u32 cursor; 825 u32 nr_tasks; 826 struct scx_dsp_buf_ent buf[]; 827 }; 828 829 static struct scx_dsp_ctx __percpu *scx_dsp_ctx; 830 831 /* string formatting from BPF */ 832 struct scx_bstr_buf { 833 u64 data[MAX_BPRINTF_VARARGS]; 834 char line[SCX_EXIT_MSG_LEN]; 835 }; 836 837 static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); 838 static struct scx_bstr_buf scx_exit_bstr_buf; 839 840 /* ops debug dump */ 841 struct scx_dump_data { 842 s32 cpu; 843 bool first; 844 s32 cursor; 845 struct seq_buf *s; 846 const char *prefix; 847 struct scx_bstr_buf buf; 848 }; 849 850 struct scx_dump_data scx_dump_data = { 851 .cpu = -1, 852 }; 853 854 /* /sys/kernel/sched_ext interface */ 855 static struct kset *scx_kset; 856 static struct kobject *scx_root_kobj; 857 858 #define CREATE_TRACE_POINTS 859 #include <trace/events/sched_ext.h> 860 861 static void scx_bpf_kick_cpu(s32 cpu, u64 flags); 862 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, 863 s64 exit_code, 864 const char *fmt, ...); 865 866 #define scx_ops_error_kind(err, fmt, args...) \ 867 scx_ops_exit_kind((err), 0, fmt, ##args) 868 869 #define scx_ops_exit(code, fmt, args...) \ 870 scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args) 871 872 #define scx_ops_error(fmt, args...) \ 873 scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) 874 875 #define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) 876 877 static long jiffies_delta_msecs(unsigned long at, unsigned long now) 878 { 879 if (time_after(at, now)) 880 return jiffies_to_msecs(at - now); 881 else 882 return -(long)jiffies_to_msecs(now - at); 883 } 884 885 /* if the highest set bit is N, return a mask with bits [N+1, 31] set */ 886 static u32 higher_bits(u32 flags) 887 { 888 return ~((1 << fls(flags)) - 1); 889 } 890 891 /* return the mask with only the highest bit set */ 892 static u32 highest_bit(u32 flags) 893 { 894 int bit = fls(flags); 895 return ((u64)1 << bit) >> 1; 896 } 897 898 /* 899 * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX 900 * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate 901 * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check 902 * whether it's running from an allowed context. 903 * 904 * @mask is constant, always inline to cull the mask calculations. 
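 *
 * For example, the dispatch path later in this file invokes ops.dispatch()
 * roughly as follows (a simplified sketch), which makes the dispatch-only
 * kfuncs callable for the duration of the callback:
 *
 *	SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), prev);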
905 */ 906 static __always_inline void scx_kf_allow(u32 mask) 907 { 908 /* nesting is allowed only in increasing scx_kf_mask order */ 909 WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, 910 "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", 911 current->scx.kf_mask, mask); 912 current->scx.kf_mask |= mask; 913 barrier(); 914 } 915 916 static void scx_kf_disallow(u32 mask) 917 { 918 barrier(); 919 current->scx.kf_mask &= ~mask; 920 } 921 922 #define SCX_CALL_OP(mask, op, args...) \ 923 do { \ 924 if (mask) { \ 925 scx_kf_allow(mask); \ 926 scx_ops.op(args); \ 927 scx_kf_disallow(mask); \ 928 } else { \ 929 scx_ops.op(args); \ 930 } \ 931 } while (0) 932 933 #define SCX_CALL_OP_RET(mask, op, args...) \ 934 ({ \ 935 __typeof__(scx_ops.op(args)) __ret; \ 936 if (mask) { \ 937 scx_kf_allow(mask); \ 938 __ret = scx_ops.op(args); \ 939 scx_kf_disallow(mask); \ 940 } else { \ 941 __ret = scx_ops.op(args); \ 942 } \ 943 __ret; \ 944 }) 945 946 /* 947 * Some kfuncs are allowed only on the tasks that are subjects of the 948 * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such 949 * restrictions, the following SCX_CALL_OP_*() variants should be used when 950 * invoking scx_ops operations that take task arguments. These can only be used 951 * for non-nesting operations due to the way the tasks are tracked. 952 * 953 * kfuncs which can only operate on such tasks can in turn use 954 * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on 955 * the specific task. 956 */ 957 #define SCX_CALL_OP_TASK(mask, op, task, args...) \ 958 do { \ 959 BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ 960 current->scx.kf_tasks[0] = task; \ 961 SCX_CALL_OP(mask, op, task, ##args); \ 962 current->scx.kf_tasks[0] = NULL; \ 963 } while (0) 964 965 #define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ 966 ({ \ 967 __typeof__(scx_ops.op(task, ##args)) __ret; \ 968 BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ 969 current->scx.kf_tasks[0] = task; \ 970 __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ 971 current->scx.kf_tasks[0] = NULL; \ 972 __ret; \ 973 }) 974 975 #define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \ 976 ({ \ 977 __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ 978 BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ 979 current->scx.kf_tasks[0] = task0; \ 980 current->scx.kf_tasks[1] = task1; \ 981 __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ 982 current->scx.kf_tasks[0] = NULL; \ 983 current->scx.kf_tasks[1] = NULL; \ 984 __ret; \ 985 }) 986 987 /* @mask is constant, always inline to cull unnecessary branches */ 988 static __always_inline bool scx_kf_allowed(u32 mask) 989 { 990 if (unlikely(!(current->scx.kf_mask & mask))) { 991 scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", 992 mask, current->scx.kf_mask); 993 return false; 994 } 995 996 if (unlikely((mask & SCX_KF_SLEEPABLE) && in_interrupt())) { 997 scx_ops_error("sleepable kfunc called from non-sleepable context"); 998 return false; 999 } 1000 1001 /* 1002 * Enforce nesting boundaries. e.g. A kfunc which can be called from 1003 * DISPATCH must not be called if we're running DEQUEUE which is nested 1004 * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE 1005 * boundary thanks to the above in_interrupt() check. 
1006 */ 1007 if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && 1008 (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { 1009 scx_ops_error("cpu_release kfunc called from a nested operation"); 1010 return false; 1011 } 1012 1013 if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && 1014 (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { 1015 scx_ops_error("dispatch kfunc called from a nested operation"); 1016 return false; 1017 } 1018 1019 return true; 1020 } 1021 1022 /* see SCX_CALL_OP_TASK() */ 1023 static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, 1024 struct task_struct *p) 1025 { 1026 if (!scx_kf_allowed(mask)) 1027 return false; 1028 1029 if (unlikely((p != current->scx.kf_tasks[0] && 1030 p != current->scx.kf_tasks[1]))) { 1031 scx_ops_error("called on a task not being operated on"); 1032 return false; 1033 } 1034 1035 return true; 1036 } 1037 1038 1039 /* 1040 * SCX task iterator. 1041 */ 1042 struct scx_task_iter { 1043 struct sched_ext_entity cursor; 1044 struct task_struct *locked; 1045 struct rq *rq; 1046 struct rq_flags rf; 1047 }; 1048 1049 /** 1050 * scx_task_iter_init - Initialize a task iterator 1051 * @iter: iterator to init 1052 * 1053 * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, 1054 * @iter must eventually be exited with scx_task_iter_exit(). 1055 * 1056 * scx_tasks_lock may be released between this and the first next() call or 1057 * between any two next() calls. If scx_tasks_lock is released between two 1058 * next() calls, the caller is responsible for ensuring that the task being 1059 * iterated remains accessible either through RCU read lock or obtaining a 1060 * reference count. 1061 * 1062 * All tasks which existed when the iteration started are guaranteed to be 1063 * visited as long as they still exist. 1064 */ 1065 static void scx_task_iter_init(struct scx_task_iter *iter) 1066 { 1067 lockdep_assert_held(&scx_tasks_lock); 1068 1069 iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; 1070 list_add(&iter->cursor.tasks_node, &scx_tasks); 1071 iter->locked = NULL; 1072 } 1073 1074 /** 1075 * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator 1076 * @iter: iterator to unlock rq for 1077 * 1078 * If @iter is in the middle of a locked iteration, it may be locking the rq of 1079 * the task currently being visited. Unlock the rq if so. This function can be 1080 * safely called anytime during an iteration. 1081 * 1082 * Returns %true if the rq @iter was locking is unlocked. %false if @iter was 1083 * not locking an rq. 1084 */ 1085 static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) 1086 { 1087 if (iter->locked) { 1088 task_rq_unlock(iter->rq, iter->locked, &iter->rf); 1089 iter->locked = NULL; 1090 return true; 1091 } else { 1092 return false; 1093 } 1094 } 1095 1096 /** 1097 * scx_task_iter_exit - Exit a task iterator 1098 * @iter: iterator to exit 1099 * 1100 * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. 1101 * If the iterator holds a task's rq lock, that rq lock is released. See 1102 * scx_task_iter_init() for details. 1103 */ 1104 static void scx_task_iter_exit(struct scx_task_iter *iter) 1105 { 1106 lockdep_assert_held(&scx_tasks_lock); 1107 1108 scx_task_iter_rq_unlock(iter); 1109 list_del_init(&iter->cursor.tasks_node); 1110 } 1111 1112 /** 1113 * scx_task_iter_next - Next task 1114 * @iter: iterator to walk 1115 * 1116 * Visit the next task. See scx_task_iter_init() for details. 
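 *
 * A typical walk looks as follows (a sketch; the actual call sites later in
 * this file mostly use scx_task_iter_next_locked() and may drop
 * scx_tasks_lock between iterations):
 *
 *	struct scx_task_iter iter;
 *	struct task_struct *p;
 *
 *	spin_lock_irq(&scx_tasks_lock);	// irq-disabled locking assumed
 *	scx_task_iter_init(&iter);
 *	while ((p = scx_task_iter_next(&iter))) {
 *		// inspect @p; scx_tasks_lock is held throughout
 *	}
 *	scx_task_iter_exit(&iter);
 *	spin_unlock_irq(&scx_tasks_lock);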
 */
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
	struct list_head *cursor = &iter->cursor.tasks_node;
	struct sched_ext_entity *pos;

	lockdep_assert_held(&scx_tasks_lock);

	list_for_each_entry(pos, cursor, tasks_node) {
		if (&pos->tasks_node == &scx_tasks)
			return NULL;
		if (!(pos->flags & SCX_TASK_CURSOR)) {
			list_move(cursor, &pos->tasks_node);
			return container_of(pos, struct task_struct, scx);
		}
	}

	/* can't happen, should always terminate at scx_tasks above */
	BUG();
}

/**
 * scx_task_iter_next_locked - Next non-idle task with its rq locked
 * @iter: iterator to walk
 * @include_dead: Whether we should include dead tasks in the iteration
 *
 * Visit the next non-idle task with its rq lock held. Allows callers to
 * specify whether they would like to filter out dead tasks. See
 * scx_task_iter_init() for details.
 */
static struct task_struct *
scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead)
{
	struct task_struct *p;
retry:
	scx_task_iter_rq_unlock(iter);

	while ((p = scx_task_iter_next(iter))) {
		/*
		 * is_idle_task() tests %PF_IDLE which may not be set for CPUs
		 * which haven't yet been onlined. Test sched_class directly.
		 */
		if (p->sched_class != &idle_sched_class)
			break;
	}
	if (!p)
		return NULL;

	iter->rq = task_rq_lock(p, &iter->rf);
	iter->locked = p;

	/*
	 * If we see %TASK_DEAD, @p already disabled preemption, is about to do
	 * the final __schedule(), won't ever need to be scheduled again and can
	 * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter
	 * the final __schedule() while we're locking its rq and thus will stay
	 * alive until the rq is unlocked.
	 */
	if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD)
		goto retry;

	return p;
}

static enum scx_ops_enable_state scx_ops_enable_state(void)
{
	return atomic_read(&scx_ops_enable_state_var);
}

static enum scx_ops_enable_state
scx_ops_set_enable_state(enum scx_ops_enable_state to)
{
	return atomic_xchg(&scx_ops_enable_state_var, to);
}

static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
					enum scx_ops_enable_state from)
{
	int from_v = from;

	return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
}

static bool scx_ops_bypassing(void)
{
	return unlikely(atomic_read(&scx_ops_bypass_depth));
}

/**
 * wait_ops_state - Busy-wait the specified ops state to end
 * @p: target task
 * @opss: state to wait the end of
 *
 * Busy-wait for @p to transition out of @opss. This can only be used when the
 * state part of @opss is %SCX_OPSS_QUEUEING or %SCX_OPSS_DISPATCHING. This
 * function also has load_acquire semantics to ensure that the caller can see
 * the updates made in the enqueueing and dispatching paths.
1214 */ 1215 static void wait_ops_state(struct task_struct *p, unsigned long opss) 1216 { 1217 do { 1218 cpu_relax(); 1219 } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); 1220 } 1221 1222 /** 1223 * ops_cpu_valid - Verify a cpu number 1224 * @cpu: cpu number which came from a BPF ops 1225 * @where: extra information reported on error 1226 * 1227 * @cpu is a cpu number which came from the BPF scheduler and can be any value. 1228 * Verify that it is in range and one of the possible cpus. If invalid, trigger 1229 * an ops error. 1230 */ 1231 static bool ops_cpu_valid(s32 cpu, const char *where) 1232 { 1233 if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) { 1234 return true; 1235 } else { 1236 scx_ops_error("invalid CPU %d%s%s", cpu, 1237 where ? " " : "", where ?: ""); 1238 return false; 1239 } 1240 } 1241 1242 /** 1243 * ops_sanitize_err - Sanitize a -errno value 1244 * @ops_name: operation to blame on failure 1245 * @err: -errno value to sanitize 1246 * 1247 * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return 1248 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can 1249 * cause misbehaviors. For an example, a large negative return from 1250 * ops.init_task() triggers an oops when passed up the call chain because the 1251 * value fails IS_ERR() test after being encoded with ERR_PTR() and then is 1252 * handled as a pointer. 1253 */ 1254 static int ops_sanitize_err(const char *ops_name, s32 err) 1255 { 1256 if (err < 0 && err >= -MAX_ERRNO) 1257 return err; 1258 1259 scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); 1260 return -EPROTO; 1261 } 1262 1263 static void update_curr_scx(struct rq *rq) 1264 { 1265 struct task_struct *curr = rq->curr; 1266 u64 now = rq_clock_task(rq); 1267 u64 delta_exec; 1268 1269 if (time_before_eq64(now, curr->se.exec_start)) 1270 return; 1271 1272 delta_exec = now - curr->se.exec_start; 1273 curr->se.exec_start = now; 1274 curr->se.sum_exec_runtime += delta_exec; 1275 account_group_exec_runtime(curr, delta_exec); 1276 cgroup_account_cputime(curr, delta_exec); 1277 1278 if (curr->scx.slice != SCX_SLICE_INF) 1279 curr->scx.slice -= min(curr->scx.slice, delta_exec); 1280 } 1281 1282 static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) 1283 { 1284 /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ 1285 WRITE_ONCE(dsq->nr, dsq->nr + delta); 1286 } 1287 1288 static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, 1289 u64 enq_flags) 1290 { 1291 bool is_local = dsq->id == SCX_DSQ_LOCAL; 1292 1293 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node)); 1294 1295 if (!is_local) { 1296 raw_spin_lock(&dsq->lock); 1297 if (unlikely(dsq->id == SCX_DSQ_INVALID)) { 1298 scx_ops_error("attempting to dispatch to a destroyed dsq"); 1299 /* fall back to the global dsq */ 1300 raw_spin_unlock(&dsq->lock); 1301 dsq = &scx_dsq_global; 1302 raw_spin_lock(&dsq->lock); 1303 } 1304 } 1305 1306 if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) 1307 list_add(&p->scx.dsq_node, &dsq->list); 1308 else 1309 list_add_tail(&p->scx.dsq_node, &dsq->list); 1310 1311 dsq_mod_nr(dsq, 1); 1312 p->scx.dsq = dsq; 1313 1314 /* 1315 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the 1316 * direct dispatch path, but we clear them here because the direct 1317 * dispatch verdict may be overridden on the enqueue path during e.g. 1318 * bypass. 
1319 */ 1320 p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; 1321 p->scx.ddsp_enq_flags = 0; 1322 1323 /* 1324 * We're transitioning out of QUEUEING or DISPATCHING. store_release to 1325 * match waiters' load_acquire. 1326 */ 1327 if (enq_flags & SCX_ENQ_CLEAR_OPSS) 1328 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1329 1330 if (is_local) { 1331 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 1332 bool preempt = false; 1333 1334 if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && 1335 rq->curr->sched_class == &ext_sched_class) { 1336 rq->curr->scx.slice = 0; 1337 preempt = true; 1338 } 1339 1340 if (preempt || sched_class_above(&ext_sched_class, 1341 rq->curr->sched_class)) 1342 resched_curr(rq); 1343 } else { 1344 raw_spin_unlock(&dsq->lock); 1345 } 1346 } 1347 1348 static void dispatch_dequeue(struct rq *rq, struct task_struct *p) 1349 { 1350 struct scx_dispatch_q *dsq = p->scx.dsq; 1351 bool is_local = dsq == &rq->scx.local_dsq; 1352 1353 if (!dsq) { 1354 WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); 1355 /* 1356 * When dispatching directly from the BPF scheduler to a local 1357 * DSQ, the task isn't associated with any DSQ but 1358 * @p->scx.holding_cpu may be set under the protection of 1359 * %SCX_OPSS_DISPATCHING. 1360 */ 1361 if (p->scx.holding_cpu >= 0) 1362 p->scx.holding_cpu = -1; 1363 return; 1364 } 1365 1366 if (!is_local) 1367 raw_spin_lock(&dsq->lock); 1368 1369 /* 1370 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_node 1371 * can't change underneath us. 1372 */ 1373 if (p->scx.holding_cpu < 0) { 1374 /* @p must still be on @dsq, dequeue */ 1375 WARN_ON_ONCE(list_empty(&p->scx.dsq_node)); 1376 list_del_init(&p->scx.dsq_node); 1377 dsq_mod_nr(dsq, -1); 1378 } else { 1379 /* 1380 * We're racing against dispatch_to_local_dsq() which already 1381 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the 1382 * holding_cpu which tells dispatch_to_local_dsq() that it lost 1383 * the race. 1384 */ 1385 WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); 1386 p->scx.holding_cpu = -1; 1387 } 1388 p->scx.dsq = NULL; 1389 1390 if (!is_local) 1391 raw_spin_unlock(&dsq->lock); 1392 } 1393 1394 static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) 1395 { 1396 return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); 1397 } 1398 1399 static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) 1400 { 1401 lockdep_assert(rcu_read_lock_any_held()); 1402 1403 if (dsq_id == SCX_DSQ_GLOBAL) 1404 return &scx_dsq_global; 1405 else 1406 return find_user_dsq(dsq_id); 1407 } 1408 1409 static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, 1410 struct task_struct *p) 1411 { 1412 struct scx_dispatch_q *dsq; 1413 1414 if (dsq_id == SCX_DSQ_LOCAL) 1415 return &rq->scx.local_dsq; 1416 1417 dsq = find_non_local_dsq(dsq_id); 1418 if (unlikely(!dsq)) { 1419 scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", 1420 dsq_id, p->comm, p->pid); 1421 return &scx_dsq_global; 1422 } 1423 1424 return dsq; 1425 } 1426 1427 static void mark_direct_dispatch(struct task_struct *ddsp_task, 1428 struct task_struct *p, u64 dsq_id, 1429 u64 enq_flags) 1430 { 1431 /* 1432 * Mark that dispatch already happened from ops.select_cpu() or 1433 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value 1434 * which can never match a valid task pointer. 
1435 */ 1436 __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); 1437 1438 /* @p must match the task on the enqueue path */ 1439 if (unlikely(p != ddsp_task)) { 1440 if (IS_ERR(ddsp_task)) 1441 scx_ops_error("%s[%d] already direct-dispatched", 1442 p->comm, p->pid); 1443 else 1444 scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", 1445 ddsp_task->comm, ddsp_task->pid, 1446 p->comm, p->pid); 1447 return; 1448 } 1449 1450 /* 1451 * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because 1452 * dispatching to the local DSQ of a different CPU requires unlocking 1453 * the current rq which isn't allowed in the enqueue path. Use 1454 * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL. 1455 */ 1456 if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) { 1457 scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch"); 1458 return; 1459 } 1460 1461 WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); 1462 WARN_ON_ONCE(p->scx.ddsp_enq_flags); 1463 1464 p->scx.ddsp_dsq_id = dsq_id; 1465 p->scx.ddsp_enq_flags = enq_flags; 1466 } 1467 1468 static void direct_dispatch(struct task_struct *p, u64 enq_flags) 1469 { 1470 struct scx_dispatch_q *dsq; 1471 1472 enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); 1473 dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p); 1474 dispatch_enqueue(dsq, p, enq_flags); 1475 } 1476 1477 static bool scx_rq_online(struct rq *rq) 1478 { 1479 return likely(rq->scx.flags & SCX_RQ_ONLINE); 1480 } 1481 1482 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, 1483 int sticky_cpu) 1484 { 1485 struct task_struct **ddsp_taskp; 1486 unsigned long qseq; 1487 1488 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); 1489 1490 /* rq migration */ 1491 if (sticky_cpu == cpu_of(rq)) 1492 goto local_norefill; 1493 1494 /* 1495 * If !scx_rq_online(), we already told the BPF scheduler that the CPU 1496 * is offline and are just running the hotplug path. Don't bother the 1497 * BPF scheduler. 1498 */ 1499 if (!scx_rq_online(rq)) 1500 goto local; 1501 1502 if (scx_ops_bypassing()) { 1503 if (enq_flags & SCX_ENQ_LAST) 1504 goto local; 1505 else 1506 goto global; 1507 } 1508 1509 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 1510 goto direct; 1511 1512 /* see %SCX_OPS_ENQ_EXITING */ 1513 if (!static_branch_unlikely(&scx_ops_enq_exiting) && 1514 unlikely(p->flags & PF_EXITING)) 1515 goto local; 1516 1517 /* see %SCX_OPS_ENQ_LAST */ 1518 if (!static_branch_unlikely(&scx_ops_enq_last) && 1519 (enq_flags & SCX_ENQ_LAST)) 1520 goto local; 1521 1522 if (!SCX_HAS_OP(enqueue)) 1523 goto global; 1524 1525 /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ 1526 qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; 1527 1528 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 1529 atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); 1530 1531 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 1532 WARN_ON_ONCE(*ddsp_taskp); 1533 *ddsp_taskp = p; 1534 1535 SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); 1536 1537 *ddsp_taskp = NULL; 1538 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 1539 goto direct; 1540 1541 /* 1542 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or 1543 * dequeue may be waiting. The store_release matches their load_acquire. 
	 */
	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
	return;

direct:
	direct_dispatch(p, enq_flags);
	return;

local:
	p->scx.slice = SCX_SLICE_DFL;
local_norefill:
	dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
	return;

global:
	p->scx.slice = SCX_SLICE_DFL;
	dispatch_enqueue(&scx_dsq_global, p, enq_flags);
}

static bool task_runnable(const struct task_struct *p)
{
	return !list_empty(&p->scx.runnable_node);
}

static void set_task_runnable(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

	if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
		p->scx.runnable_at = jiffies;
		p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
	}

	/*
	 * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
	 * appended to the runnable_list.
	 */
	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
}

static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
{
	list_del_init(&p->scx.runnable_node);
	if (reset_runnable_at)
		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
}

static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
{
	int sticky_cpu = p->scx.sticky_cpu;

	enq_flags |= rq->scx.extra_enq_flags;

	if (sticky_cpu >= 0)
		p->scx.sticky_cpu = -1;

	/*
	 * Restoring a running task will be immediately followed by
	 * set_next_task_scx() which expects the task to not be on the BPF
	 * scheduler as tasks can only start running through local DSQs. Force
	 * direct-dispatch into the local DSQ by setting the sticky_cpu.
	 */
	if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
		sticky_cpu = cpu_of(rq);

	if (p->scx.flags & SCX_TASK_QUEUED) {
		WARN_ON_ONCE(!task_runnable(p));
		return;
	}

	set_task_runnable(rq, p);
	p->scx.flags |= SCX_TASK_QUEUED;
	rq->scx.nr_running++;
	add_nr_running(rq, 1);

	if (SCX_HAS_OP(runnable))
		SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);

	do_enqueue_task(rq, p, enq_flags, sticky_cpu);
}

static void ops_dequeue(struct task_struct *p, u64 deq_flags)
{
	unsigned long opss;

	/* dequeue is always temporary, don't reset runnable_at */
	clr_task_runnable(p, false);

	/* acquire ensures that we see the preceding updates on QUEUED */
	opss = atomic_long_read_acquire(&p->scx.ops_state);

	switch (opss & SCX_OPSS_STATE_MASK) {
	case SCX_OPSS_NONE:
		break;
	case SCX_OPSS_QUEUEING:
		/*
		 * QUEUEING is started and finished while holding @p's rq lock.
		 * As we're holding the rq lock now, we shouldn't see QUEUEING.
		 */
		BUG();
	case SCX_OPSS_QUEUED:
		if (SCX_HAS_OP(dequeue))
			SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags);

		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
					    SCX_OPSS_NONE))
			break;
		fallthrough;
	case SCX_OPSS_DISPATCHING:
		/*
		 * If @p is being dispatched from the BPF scheduler to a DSQ,
		 * wait for the transfer to complete so that @p doesn't get
		 * added to its DSQ after dequeueing is complete.
		 *
		 * As we're waiting on DISPATCHING with the rq locked, the
		 * dispatching side shouldn't try to lock the rq while
		 * DISPATCHING is set. See dispatch_to_local_dsq().
1661 * 1662 * DISPATCHING shouldn't have qseq set and control can reach 1663 * here with NONE @opss from the above QUEUED case block. 1664 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 1665 */ 1666 wait_ops_state(p, SCX_OPSS_DISPATCHING); 1667 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 1668 break; 1669 } 1670 } 1671 1672 static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) 1673 { 1674 if (!(p->scx.flags & SCX_TASK_QUEUED)) { 1675 WARN_ON_ONCE(task_runnable(p)); 1676 return; 1677 } 1678 1679 ops_dequeue(p, deq_flags); 1680 1681 /* 1682 * A currently running task which is going off @rq first gets dequeued 1683 * and then stops running. As we want running <-> stopping transitions 1684 * to be contained within runnable <-> quiescent transitions, trigger 1685 * ->stopping() early here instead of in put_prev_task_scx(). 1686 * 1687 * @p may go through multiple stopping <-> running transitions between 1688 * here and put_prev_task_scx() if task attribute changes occur while 1689 * balance_scx() leaves @rq unlocked. However, they don't contain any 1690 * information meaningful to the BPF scheduler and can be suppressed by 1691 * skipping the callbacks if the task is !QUEUED. 1692 */ 1693 if (SCX_HAS_OP(stopping) && task_current(rq, p)) { 1694 update_curr_scx(rq); 1695 SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); 1696 } 1697 1698 if (SCX_HAS_OP(quiescent)) 1699 SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); 1700 1701 if (deq_flags & SCX_DEQ_SLEEP) 1702 p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; 1703 else 1704 p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; 1705 1706 p->scx.flags &= ~SCX_TASK_QUEUED; 1707 rq->scx.nr_running--; 1708 sub_nr_running(rq, 1); 1709 1710 dispatch_dequeue(rq, p); 1711 } 1712 1713 static void yield_task_scx(struct rq *rq) 1714 { 1715 struct task_struct *p = rq->curr; 1716 1717 if (SCX_HAS_OP(yield)) 1718 SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); 1719 else 1720 p->scx.slice = 0; 1721 } 1722 1723 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) 1724 { 1725 struct task_struct *from = rq->curr; 1726 1727 if (SCX_HAS_OP(yield)) 1728 return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); 1729 else 1730 return false; 1731 } 1732 1733 #ifdef CONFIG_SMP 1734 /** 1735 * move_task_to_local_dsq - Move a task from a different rq to a local DSQ 1736 * @rq: rq to move the task into, currently locked 1737 * @p: task to move 1738 * @enq_flags: %SCX_ENQ_* 1739 * 1740 * Move @p which is currently on a different rq to @rq's local DSQ. The caller 1741 * must: 1742 * 1743 * 1. Start with exclusive access to @p either through its DSQ lock or 1744 * %SCX_OPSS_DISPATCHING flag. 1745 * 1746 * 2. Set @p->scx.holding_cpu to raw_smp_processor_id(). 1747 * 1748 * 3. Remember task_rq(@p). Release the exclusive access so that we don't 1749 * deadlock with dequeue. 1750 * 1751 * 4. Lock @rq and the task_rq from #3. 1752 * 1753 * 5. Call this function. 1754 * 1755 * Returns %true if @p was successfully moved. %false after racing dequeue and 1756 * losing. 1757 */ 1758 static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, 1759 u64 enq_flags) 1760 { 1761 struct rq *task_rq; 1762 1763 lockdep_assert_rq_held(rq); 1764 1765 /* 1766 * If dequeue got to @p while we were trying to lock both rq's, it'd 1767 * have cleared @p->scx.holding_cpu to -1. 
While other cpus may have 1768 * updated it to different values afterwards, as this operation can't be 1769 * preempted or recurse, @p->scx.holding_cpu can never become 1770 * raw_smp_processor_id() again before we're done. Thus, we can tell 1771 * whether we lost to dequeue by testing whether @p->scx.holding_cpu is 1772 * still raw_smp_processor_id(). 1773 * 1774 * See dispatch_dequeue() for the counterpart. 1775 */ 1776 if (unlikely(p->scx.holding_cpu != raw_smp_processor_id())) 1777 return false; 1778 1779 /* @p->rq couldn't have changed if we're still the holding cpu */ 1780 task_rq = task_rq(p); 1781 lockdep_assert_rq_held(task_rq); 1782 1783 WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr)); 1784 deactivate_task(task_rq, p, 0); 1785 set_task_cpu(p, cpu_of(rq)); 1786 p->scx.sticky_cpu = cpu_of(rq); 1787 1788 /* 1789 * We want to pass scx-specific enq_flags but activate_task() will 1790 * truncate the upper 32 bit. As we own @rq, we can pass them through 1791 * @rq->scx.extra_enq_flags instead. 1792 */ 1793 WARN_ON_ONCE(rq->scx.extra_enq_flags); 1794 rq->scx.extra_enq_flags = enq_flags; 1795 activate_task(rq, p, 0); 1796 rq->scx.extra_enq_flags = 0; 1797 1798 return true; 1799 } 1800 1801 /** 1802 * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked 1803 * @rq: current rq which is locked 1804 * @rf: rq_flags to use when unlocking @rq 1805 * @src_rq: rq to move task from 1806 * @dst_rq: rq to move task to 1807 * 1808 * We're holding @rq lock and trying to dispatch a task from @src_rq to 1809 * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether 1810 * @rq stays locked isn't important as long as the state is restored after 1811 * dispatch_to_local_dsq_unlock(). 1812 */ 1813 static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, 1814 struct rq *src_rq, struct rq *dst_rq) 1815 { 1816 rq_unpin_lock(rq, rf); 1817 1818 if (src_rq == dst_rq) { 1819 raw_spin_rq_unlock(rq); 1820 raw_spin_rq_lock(dst_rq); 1821 } else if (rq == src_rq) { 1822 double_lock_balance(rq, dst_rq); 1823 rq_repin_lock(rq, rf); 1824 } else if (rq == dst_rq) { 1825 double_lock_balance(rq, src_rq); 1826 rq_repin_lock(rq, rf); 1827 } else { 1828 raw_spin_rq_unlock(rq); 1829 double_rq_lock(src_rq, dst_rq); 1830 } 1831 } 1832 1833 /** 1834 * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock() 1835 * @rq: current rq which is locked 1836 * @rf: rq_flags to use when unlocking @rq 1837 * @src_rq: rq to move task from 1838 * @dst_rq: rq to move task to 1839 * 1840 * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. 
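 *
 * Each branch below simply reverses the corresponding locking case in
 * dispatch_to_local_dsq_lock(), so the two functions need to be kept in
 * sync if the locking scheme ever changes.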
1841 */ 1842 static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, 1843 struct rq *src_rq, struct rq *dst_rq) 1844 { 1845 if (src_rq == dst_rq) { 1846 raw_spin_rq_unlock(dst_rq); 1847 raw_spin_rq_lock(rq); 1848 rq_repin_lock(rq, rf); 1849 } else if (rq == src_rq) { 1850 double_unlock_balance(rq, dst_rq); 1851 } else if (rq == dst_rq) { 1852 double_unlock_balance(rq, src_rq); 1853 } else { 1854 double_rq_unlock(src_rq, dst_rq); 1855 raw_spin_rq_lock(rq); 1856 rq_repin_lock(rq, rf); 1857 } 1858 } 1859 #endif /* CONFIG_SMP */ 1860 1861 static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq, 1862 struct task_struct *p) 1863 { 1864 lockdep_assert_held(&dsq->lock); /* released on return */ 1865 1866 /* @dsq is locked and @p is on this rq */ 1867 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 1868 list_move_tail(&p->scx.dsq_node, &rq->scx.local_dsq.list); 1869 dsq_mod_nr(dsq, -1); 1870 dsq_mod_nr(&rq->scx.local_dsq, 1); 1871 p->scx.dsq = &rq->scx.local_dsq; 1872 raw_spin_unlock(&dsq->lock); 1873 } 1874 1875 #ifdef CONFIG_SMP 1876 /* 1877 * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p 1878 * can be pulled to @rq. 1879 */ 1880 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) 1881 { 1882 int cpu = cpu_of(rq); 1883 1884 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 1885 return false; 1886 if (unlikely(is_migration_disabled(p))) 1887 return false; 1888 if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p))) 1889 return false; 1890 if (!scx_rq_online(rq)) 1891 return false; 1892 return true; 1893 } 1894 1895 static bool consume_remote_task(struct rq *rq, struct rq_flags *rf, 1896 struct scx_dispatch_q *dsq, 1897 struct task_struct *p, struct rq *task_rq) 1898 { 1899 bool moved = false; 1900 1901 lockdep_assert_held(&dsq->lock); /* released on return */ 1902 1903 /* 1904 * @dsq is locked and @p is on a remote rq. @p is currently protected by 1905 * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab 1906 * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the 1907 * rq lock or fail, do a little dancing from our side. See 1908 * move_task_to_local_dsq(). 
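 *
 * Concretely: mark @p with our CPU in ->holding_cpu, drop @dsq->lock,
 * double-lock @rq and @task_rq, and let move_task_to_local_dsq()
 * re-check ->holding_cpu to detect whether dequeue won the race in the
 * meantime.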
1909 */ 1910 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 1911 list_del_init(&p->scx.dsq_node); 1912 dsq_mod_nr(dsq, -1); 1913 p->scx.holding_cpu = raw_smp_processor_id(); 1914 raw_spin_unlock(&dsq->lock); 1915 1916 rq_unpin_lock(rq, rf); 1917 double_lock_balance(rq, task_rq); 1918 rq_repin_lock(rq, rf); 1919 1920 moved = move_task_to_local_dsq(rq, p, 0); 1921 1922 double_unlock_balance(rq, task_rq); 1923 1924 return moved; 1925 } 1926 #else /* CONFIG_SMP */ 1927 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) { return false; } 1928 static bool consume_remote_task(struct rq *rq, struct rq_flags *rf, 1929 struct scx_dispatch_q *dsq, 1930 struct task_struct *p, struct rq *task_rq) { return false; } 1931 #endif /* CONFIG_SMP */ 1932 1933 static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, 1934 struct scx_dispatch_q *dsq) 1935 { 1936 struct task_struct *p; 1937 retry: 1938 if (list_empty(&dsq->list)) 1939 return false; 1940 1941 raw_spin_lock(&dsq->lock); 1942 1943 list_for_each_entry(p, &dsq->list, scx.dsq_node) { 1944 struct rq *task_rq = task_rq(p); 1945 1946 if (rq == task_rq) { 1947 consume_local_task(rq, dsq, p); 1948 return true; 1949 } 1950 1951 if (task_can_run_on_remote_rq(p, rq)) { 1952 if (likely(consume_remote_task(rq, rf, dsq, p, task_rq))) 1953 return true; 1954 goto retry; 1955 } 1956 } 1957 1958 raw_spin_unlock(&dsq->lock); 1959 return false; 1960 } 1961 1962 enum dispatch_to_local_dsq_ret { 1963 DTL_DISPATCHED, /* successfully dispatched */ 1964 DTL_LOST, /* lost race to dequeue */ 1965 DTL_NOT_LOCAL, /* destination is not a local DSQ */ 1966 DTL_INVALID, /* invalid local dsq_id */ 1967 }; 1968 1969 /** 1970 * dispatch_to_local_dsq - Dispatch a task to a local dsq 1971 * @rq: current rq which is locked 1972 * @rf: rq_flags to use when unlocking @rq 1973 * @dsq_id: destination dsq ID 1974 * @p: task to dispatch 1975 * @enq_flags: %SCX_ENQ_* 1976 * 1977 * We're holding @rq lock and want to dispatch @p to the local DSQ identified by 1978 * @dsq_id. This function performs all the synchronization dancing needed 1979 * because local DSQs are protected with rq locks. 1980 * 1981 * The caller must have exclusive ownership of @p (e.g. through 1982 * %SCX_OPSS_DISPATCHING). 1983 */ 1984 static enum dispatch_to_local_dsq_ret 1985 dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, 1986 struct task_struct *p, u64 enq_flags) 1987 { 1988 struct rq *src_rq = task_rq(p); 1989 struct rq *dst_rq; 1990 1991 /* 1992 * We're synchronized against dequeue through DISPATCHING. As @p can't 1993 * be dequeued, its task_rq and cpus_allowed are stable too. 1994 */ 1995 if (dsq_id == SCX_DSQ_LOCAL) { 1996 dst_rq = rq; 1997 } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 1998 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 1999 2000 if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) 2001 return DTL_INVALID; 2002 dst_rq = cpu_rq(cpu); 2003 } else { 2004 return DTL_NOT_LOCAL; 2005 } 2006 2007 /* if dispatching to @rq that @p is already on, no lock dancing needed */ 2008 if (rq == src_rq && rq == dst_rq) { 2009 dispatch_enqueue(&dst_rq->scx.local_dsq, p, 2010 enq_flags | SCX_ENQ_CLEAR_OPSS); 2011 return DTL_DISPATCHED; 2012 } 2013 2014 #ifdef CONFIG_SMP 2015 if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { 2016 struct rq *locked_dst_rq = dst_rq; 2017 bool dsp; 2018 2019 /* 2020 * @p is on a possibly remote @src_rq which we need to lock to 2021 * move the task. 
If dequeue is in progress, it'd be locking 2022 * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq 2023 * lock while holding DISPATCHING. 2024 * 2025 * As DISPATCHING guarantees that @p is wholly ours, we can 2026 * pretend that we're moving from a DSQ and use the same 2027 * mechanism - mark the task under transfer with holding_cpu, 2028 * release DISPATCHING and then follow the same protocol. 2029 */ 2030 p->scx.holding_cpu = raw_smp_processor_id(); 2031 2032 /* store_release ensures that dequeue sees the above */ 2033 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 2034 2035 dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq); 2036 2037 /* 2038 * We don't require the BPF scheduler to avoid dispatching to 2039 * offline CPUs mostly for convenience but also because CPUs can 2040 * go offline between scx_bpf_dispatch() calls and here. If @p 2041 * is destined to an offline CPU, queue it on its current CPU 2042 * instead, which should always be safe. As this is an allowed 2043 * behavior, don't trigger an ops error. 2044 */ 2045 if (!scx_rq_online(dst_rq)) 2046 dst_rq = src_rq; 2047 2048 if (src_rq == dst_rq) { 2049 /* 2050 * As @p is staying on the same rq, there's no need to 2051 * go through the full deactivate/activate cycle. 2052 * Optimize by abbreviating the operations in 2053 * move_task_to_local_dsq(). 2054 */ 2055 dsp = p->scx.holding_cpu == raw_smp_processor_id(); 2056 if (likely(dsp)) { 2057 p->scx.holding_cpu = -1; 2058 dispatch_enqueue(&dst_rq->scx.local_dsq, p, 2059 enq_flags); 2060 } 2061 } else { 2062 dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); 2063 } 2064 2065 /* if the destination CPU is idle, wake it up */ 2066 if (dsp && sched_class_above(p->sched_class, 2067 dst_rq->curr->sched_class)) 2068 resched_curr(dst_rq); 2069 2070 dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq); 2071 2072 return dsp ? DTL_DISPATCHED : DTL_LOST; 2073 } 2074 #endif /* CONFIG_SMP */ 2075 2076 scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", 2077 cpu_of(dst_rq), p->comm, p->pid); 2078 return DTL_INVALID; 2079 } 2080 2081 /** 2082 * finish_dispatch - Asynchronously finish dispatching a task 2083 * @rq: current rq which is locked 2084 * @rf: rq_flags to use when unlocking @rq 2085 * @p: task to finish dispatching 2086 * @qseq_at_dispatch: qseq when @p started getting dispatched 2087 * @dsq_id: destination DSQ ID 2088 * @enq_flags: %SCX_ENQ_* 2089 * 2090 * Dispatching to local DSQs may need to wait for queueing to complete or 2091 * require rq lock dancing. As we don't wanna do either while inside 2092 * ops.dispatch() to avoid locking order inversion, we split dispatching into 2093 * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the 2094 * task and its qseq. Once ops.dispatch() returns, this function is called to 2095 * finish up. 2096 * 2097 * There is no guarantee that @p is still valid for dispatching or even that it 2098 * was valid in the first place. Make sure that the task is still owned by the 2099 * BPF scheduler and claim the ownership before dispatching. 2100 */ 2101 static void finish_dispatch(struct rq *rq, struct rq_flags *rf, 2102 struct task_struct *p, 2103 unsigned long qseq_at_dispatch, 2104 u64 dsq_id, u64 enq_flags) 2105 { 2106 struct scx_dispatch_q *dsq; 2107 unsigned long opss; 2108 2109 retry: 2110 /* 2111 * No need for _acquire here. @p is accessed only after a successful 2112 * try_cmpxchg to DISPATCHING. 
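 *
 * Roughly, the ops_state transitions handled below are:
 *
 *   NONE -> QUEUEING -> QUEUED(qseq) -> DISPATCHING -> NONE
 *
 * Only the QUEUED -> DISPATCHING step is claimed here via try_cmpxchg();
 * NONE and DISPATCHING mean someone else already finished the job and
 * QUEUEING is simply waited out before retrying.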
2113 */ 2114 opss = atomic_long_read(&p->scx.ops_state); 2115 2116 switch (opss & SCX_OPSS_STATE_MASK) { 2117 case SCX_OPSS_DISPATCHING: 2118 case SCX_OPSS_NONE: 2119 /* someone else already got to it */ 2120 return; 2121 case SCX_OPSS_QUEUED: 2122 /* 2123 * If qseq doesn't match, @p has gone through at least one 2124 * dispatch/dequeue and re-enqueue cycle between 2125 * scx_bpf_dispatch() and here and we have no claim on it. 2126 */ 2127 if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) 2128 return; 2129 2130 /* 2131 * While we know @p is accessible, we don't yet have a claim on 2132 * it - the BPF scheduler is allowed to dispatch tasks 2133 * spuriously and there can be a racing dequeue attempt. Let's 2134 * claim @p by atomically transitioning it from QUEUED to 2135 * DISPATCHING. 2136 */ 2137 if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 2138 SCX_OPSS_DISPATCHING))) 2139 break; 2140 goto retry; 2141 case SCX_OPSS_QUEUEING: 2142 /* 2143 * do_enqueue_task() is in the process of transferring the task 2144 * to the BPF scheduler while holding @p's rq lock. As we aren't 2145 * holding any kernel or BPF resource that the enqueue path may 2146 * depend upon, it's safe to wait. 2147 */ 2148 wait_ops_state(p, opss); 2149 goto retry; 2150 } 2151 2152 BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); 2153 2154 switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) { 2155 case DTL_DISPATCHED: 2156 break; 2157 case DTL_LOST: 2158 break; 2159 case DTL_INVALID: 2160 dsq_id = SCX_DSQ_GLOBAL; 2161 fallthrough; 2162 case DTL_NOT_LOCAL: 2163 dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), 2164 dsq_id, p); 2165 dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); 2166 break; 2167 } 2168 } 2169 2170 static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) 2171 { 2172 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 2173 u32 u; 2174 2175 for (u = 0; u < dspc->cursor; u++) { 2176 struct scx_dsp_buf_ent *ent = &dspc->buf[u]; 2177 2178 finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id, 2179 ent->enq_flags); 2180 } 2181 2182 dspc->nr_tasks += dspc->cursor; 2183 dspc->cursor = 0; 2184 } 2185 2186 static int balance_scx(struct rq *rq, struct task_struct *prev, 2187 struct rq_flags *rf) 2188 { 2189 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 2190 bool prev_on_scx = prev->sched_class == &ext_sched_class; 2191 int nr_loops = SCX_DSP_MAX_LOOPS; 2192 bool has_tasks = false; 2193 2194 lockdep_assert_rq_held(rq); 2195 rq->scx.flags |= SCX_RQ_BALANCING; 2196 2197 if (static_branch_unlikely(&scx_ops_cpu_preempt) && 2198 unlikely(rq->scx.cpu_released)) { 2199 /* 2200 * If the previous sched_class for the current CPU was not SCX, 2201 * notify the BPF scheduler that it again has control of the 2202 * core. This callback complements ->cpu_release(), which is 2203 * emitted in scx_next_task_picked(). 2204 */ 2205 if (SCX_HAS_OP(cpu_acquire)) 2206 SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL); 2207 rq->scx.cpu_released = false; 2208 } 2209 2210 if (prev_on_scx) { 2211 WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP); 2212 update_curr_scx(rq); 2213 2214 /* 2215 * If @prev is runnable & has slice left, it has priority and 2216 * fetching more just increases latency for the fetched tasks. 2217 * Tell put_prev_task_scx() to put @prev on local_dsq. If the 2218 * BPF scheduler wants to handle this explicitly, it should 2219 * implement ->cpu_released(). 2220 * 2221 * See scx_ops_disable_workfn() for the explanation on the 2222 * bypassing test. 
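 *
 * %SCX_TASK_BAL_KEEP set below is consumed by put_prev_task_scx(),
 * which requeues @prev at the head of the local DSQ instead of handing
 * it back to the BPF scheduler.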
2223 */ 2224 if ((prev->scx.flags & SCX_TASK_QUEUED) && 2225 prev->scx.slice && !scx_ops_bypassing()) { 2226 prev->scx.flags |= SCX_TASK_BAL_KEEP; 2227 goto has_tasks; 2228 } 2229 } 2230 2231 /* if there already are tasks to run, nothing to do */ 2232 if (rq->scx.local_dsq.nr) 2233 goto has_tasks; 2234 2235 if (consume_dispatch_q(rq, rf, &scx_dsq_global)) 2236 goto has_tasks; 2237 2238 if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq)) 2239 goto out; 2240 2241 dspc->rq = rq; 2242 dspc->rf = rf; 2243 2244 /* 2245 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, 2246 * the local DSQ might still end up empty after a successful 2247 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() 2248 * produced some tasks, retry. The BPF scheduler may depend on this 2249 * looping behavior to simplify its implementation. 2250 */ 2251 do { 2252 dspc->nr_tasks = 0; 2253 2254 SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), 2255 prev_on_scx ? prev : NULL); 2256 2257 flush_dispatch_buf(rq, rf); 2258 2259 if (rq->scx.local_dsq.nr) 2260 goto has_tasks; 2261 if (consume_dispatch_q(rq, rf, &scx_dsq_global)) 2262 goto has_tasks; 2263 2264 /* 2265 * ops.dispatch() can trap us in this loop by repeatedly 2266 * dispatching ineligible tasks. Break out once in a while to 2267 * allow the watchdog to run. As IRQ can't be enabled in 2268 * balance(), we want to complete this scheduling cycle and then 2269 * start a new one. IOW, we want to call resched_curr() on the 2270 * next, most likely idle, task, not the current one. Use 2271 * scx_bpf_kick_cpu() for deferred kicking. 2272 */ 2273 if (unlikely(!--nr_loops)) { 2274 scx_bpf_kick_cpu(cpu_of(rq), 0); 2275 break; 2276 } 2277 } while (dspc->nr_tasks); 2278 2279 goto out; 2280 2281 has_tasks: 2282 has_tasks = true; 2283 out: 2284 rq->scx.flags &= ~SCX_RQ_BALANCING; 2285 return has_tasks; 2286 } 2287 2288 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) 2289 { 2290 if (p->scx.flags & SCX_TASK_QUEUED) { 2291 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 2292 dispatch_dequeue(rq, p); 2293 } 2294 2295 p->se.exec_start = rq_clock_task(rq); 2296 2297 /* see dequeue_task_scx() on why we skip when !QUEUED */ 2298 if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) 2299 SCX_CALL_OP_TASK(SCX_KF_REST, running, p); 2300 2301 clr_task_runnable(p, true); 2302 2303 /* 2304 * @p is getting newly scheduled or got kicked after someone updated its 2305 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). 2306 */ 2307 if ((p->scx.slice == SCX_SLICE_INF) != 2308 (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { 2309 if (p->scx.slice == SCX_SLICE_INF) 2310 rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; 2311 else 2312 rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; 2313 2314 sched_update_tick_dependency(rq); 2315 2316 /* 2317 * For now, let's refresh the load_avgs just when transitioning 2318 * in and out of nohz. In the future, we might want to add a 2319 * mechanism which calls the following periodically on 2320 * tick-stopped CPUs. 2321 */ 2322 update_other_load_avgs(rq); 2323 } 2324 } 2325 2326 static void put_prev_task_scx(struct rq *rq, struct task_struct *p) 2327 { 2328 #ifndef CONFIG_SMP 2329 /* 2330 * UP workaround. 2331 * 2332 * Because SCX may transfer tasks across CPUs during dispatch, dispatch 2333 * is performed from its balance operation which isn't called in UP. 2334 * Let's work around by calling it from the operations which come right 2335 * after. 
2336 * 2337 * 1. If the prev task is on SCX, pick_next_task() calls 2338 * .put_prev_task() right after. As .put_prev_task() is also called 2339 * from other places, we need to distinguish the calls which can be 2340 * done by looking at the previous task's state - if still queued or 2341 * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). 2342 * This case is handled here. 2343 * 2344 * 2. If the prev task is not on SCX, the first following call into SCX 2345 * will be .pick_next_task(), which is covered by calling 2346 * balance_scx() from pick_next_task_scx(). 2347 * 2348 * Note that we can't merge the first case into the second as 2349 * balance_scx() must be called before the previous SCX task goes 2350 * through put_prev_task_scx(). 2351 * 2352 * As UP doesn't transfer tasks around, balance_scx() doesn't need @rf. 2353 * Pass in %NULL. 2354 */ 2355 if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) 2356 balance_scx(rq, p, NULL); 2357 #endif 2358 2359 update_curr_scx(rq); 2360 2361 /* see dequeue_task_scx() on why we skip when !QUEUED */ 2362 if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) 2363 SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); 2364 2365 /* 2366 * If we're being called from put_prev_task_balance(), balance_scx() may 2367 * have decided that @p should keep running. 2368 */ 2369 if (p->scx.flags & SCX_TASK_BAL_KEEP) { 2370 p->scx.flags &= ~SCX_TASK_BAL_KEEP; 2371 set_task_runnable(rq, p); 2372 dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); 2373 return; 2374 } 2375 2376 if (p->scx.flags & SCX_TASK_QUEUED) { 2377 set_task_runnable(rq, p); 2378 2379 /* 2380 * If @p has slice left and balance_scx() didn't tag it for 2381 * keeping, @p is getting preempted by a higher priority 2382 * scheduler class. Leave it at the head of the local DSQ. 2383 */ 2384 if (p->scx.slice && !scx_ops_bypassing()) { 2385 dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); 2386 return; 2387 } 2388 2389 /* 2390 * If we're in the pick_next_task path, balance_scx() should 2391 * have already populated the local DSQ if there are any other 2392 * available tasks. If empty, tell ops.enqueue() that @p is the 2393 * only one available for this cpu. ops.enqueue() should put it 2394 * on the local DSQ so that the subsequent pick_next_task_scx() 2395 * can find the task unless it wants to trigger a separate 2396 * follow-up scheduling event. 
2397 */
2398 if (list_empty(&rq->scx.local_dsq.list))
2399 do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
2400 else
2401 do_enqueue_task(rq, p, 0, -1);
2402 }
2403 }
2404
2405 static struct task_struct *first_local_task(struct rq *rq)
2406 {
2407 return list_first_entry_or_null(&rq->scx.local_dsq.list,
2408 struct task_struct, scx.dsq_node);
2409 }
2410
2411 static struct task_struct *pick_next_task_scx(struct rq *rq)
2412 {
2413 struct task_struct *p;
2414
2415 #ifndef CONFIG_SMP
2416 /* UP workaround - see the comment at the head of put_prev_task_scx() */
2417 if (unlikely(rq->curr->sched_class != &ext_sched_class))
2418 balance_scx(rq, rq->curr, NULL);
2419 #endif
2420
2421 p = first_local_task(rq);
2422 if (!p)
2423 return NULL;
2424
2425 set_next_task_scx(rq, p, true);
2426
2427 if (unlikely(!p->scx.slice)) {
2428 if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
2429 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
2430 p->comm, p->pid);
2431 scx_warned_zero_slice = true;
2432 }
2433 p->scx.slice = SCX_SLICE_DFL;
2434 }
2435
2436 return p;
2437 }
2438
2439 static enum scx_cpu_preempt_reason
2440 preempt_reason_from_class(const struct sched_class *class)
2441 {
2442 #ifdef CONFIG_SMP
2443 if (class == &stop_sched_class)
2444 return SCX_CPU_PREEMPT_STOP;
2445 #endif
2446 if (class == &dl_sched_class)
2447 return SCX_CPU_PREEMPT_DL;
2448 if (class == &rt_sched_class)
2449 return SCX_CPU_PREEMPT_RT;
2450 return SCX_CPU_PREEMPT_UNKNOWN;
2451 }
2452
2453 void scx_next_task_picked(struct rq *rq, struct task_struct *p,
2454 const struct sched_class *active)
2455 {
2456 lockdep_assert_rq_held(rq);
2457
2458 if (!scx_enabled())
2459 return;
2460 #ifdef CONFIG_SMP
2461 /*
2462 * Pairs with the smp_load_acquire() issued by a CPU in
2463 * kick_cpus_irq_workfn() which is waiting for this CPU to perform a
2464 * resched.
2465 */
2466 smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
2467 #endif
2468 if (!static_branch_unlikely(&scx_ops_cpu_preempt))
2469 return;
2470
2471 /*
2472 * The callback is conceptually meant to convey that the CPU is no
2473 * longer under the control of SCX. Therefore, don't invoke the
2474 * callback if the CPU is staying on SCX, or going idle (in which
2475 * case the SCX scheduler has actively decided not to schedule any
2476 * tasks on the CPU).
2477 */
2478 if (likely(active >= &ext_sched_class))
2479 return;
2480
2481 /*
2482 * At this point we know that SCX was preempted by a higher priority
2483 * sched_class, so invoke the ->cpu_release() callback if we have not
2484 * done so already. We only send the callback once between SCX being
2485 * preempted and it regaining control of the CPU.
2486 *
2487 * ->cpu_release() complements ->cpu_acquire(), which is emitted the
2488 * next time that balance_scx() is invoked.
2489 */
2490 if (!rq->scx.cpu_released) {
2491 if (SCX_HAS_OP(cpu_release)) {
2492 struct scx_cpu_release_args args = {
2493 .reason = preempt_reason_from_class(active),
2494 .task = p,
2495 };
2496
2497 SCX_CALL_OP(SCX_KF_CPU_RELEASE,
2498 cpu_release, cpu_of(rq), &args);
2499 }
2500 rq->scx.cpu_released = true;
2501 }
2502 }
2503
2504 #ifdef CONFIG_SMP
2505
2506 static bool test_and_clear_cpu_idle(int cpu)
2507 {
2508 #ifdef CONFIG_SCHED_SMT
2509 /*
2510 * SMT mask should be cleared whether we can claim @cpu or not. The SMT
2511 * cluster is not wholly idle either way. This also prevents
2512 * scx_pick_idle_cpu() from getting caught in an infinite loop.
2513 */ 2514 if (sched_smt_active()) { 2515 const struct cpumask *smt = cpu_smt_mask(cpu); 2516 2517 /* 2518 * If offline, @cpu is not its own sibling and 2519 * scx_pick_idle_cpu() can get caught in an infinite loop as 2520 * @cpu is never cleared from idle_masks.smt. Ensure that @cpu 2521 * is eventually cleared. 2522 */ 2523 if (cpumask_intersects(smt, idle_masks.smt)) 2524 cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); 2525 else if (cpumask_test_cpu(cpu, idle_masks.smt)) 2526 __cpumask_clear_cpu(cpu, idle_masks.smt); 2527 } 2528 #endif 2529 return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); 2530 } 2531 2532 static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) 2533 { 2534 int cpu; 2535 2536 retry: 2537 if (sched_smt_active()) { 2538 cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); 2539 if (cpu < nr_cpu_ids) 2540 goto found; 2541 2542 if (flags & SCX_PICK_IDLE_CORE) 2543 return -EBUSY; 2544 } 2545 2546 cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); 2547 if (cpu >= nr_cpu_ids) 2548 return -EBUSY; 2549 2550 found: 2551 if (test_and_clear_cpu_idle(cpu)) 2552 return cpu; 2553 else 2554 goto retry; 2555 } 2556 2557 static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, 2558 u64 wake_flags, bool *found) 2559 { 2560 s32 cpu; 2561 2562 *found = false; 2563 2564 if (!static_branch_likely(&scx_builtin_idle_enabled)) { 2565 scx_ops_error("built-in idle tracking is disabled"); 2566 return prev_cpu; 2567 } 2568 2569 /* 2570 * If WAKE_SYNC, the waker's local DSQ is empty, and the system is 2571 * under utilized, wake up @p to the local DSQ of the waker. Checking 2572 * only for an empty local DSQ is insufficient as it could give the 2573 * wakee an unfair advantage when the system is oversaturated. 2574 * Checking only for the presence of idle CPUs is also insufficient as 2575 * the local DSQ of the waker could have tasks piled up on it even if 2576 * there is an idle core elsewhere on the system. 2577 */ 2578 cpu = smp_processor_id(); 2579 if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && 2580 !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && 2581 cpu_rq(cpu)->scx.local_dsq.nr == 0) { 2582 if (cpumask_test_cpu(cpu, p->cpus_ptr)) 2583 goto cpu_found; 2584 } 2585 2586 if (p->nr_cpus_allowed == 1) { 2587 if (test_and_clear_cpu_idle(prev_cpu)) { 2588 cpu = prev_cpu; 2589 goto cpu_found; 2590 } else { 2591 return prev_cpu; 2592 } 2593 } 2594 2595 /* 2596 * If CPU has SMT, any wholly idle CPU is likely a better pick than 2597 * partially idle @prev_cpu. 2598 */ 2599 if (sched_smt_active()) { 2600 if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && 2601 test_and_clear_cpu_idle(prev_cpu)) { 2602 cpu = prev_cpu; 2603 goto cpu_found; 2604 } 2605 2606 cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); 2607 if (cpu >= 0) 2608 goto cpu_found; 2609 } 2610 2611 if (test_and_clear_cpu_idle(prev_cpu)) { 2612 cpu = prev_cpu; 2613 goto cpu_found; 2614 } 2615 2616 cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); 2617 if (cpu >= 0) 2618 goto cpu_found; 2619 2620 return prev_cpu; 2621 2622 cpu_found: 2623 *found = true; 2624 return cpu; 2625 } 2626 2627 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) 2628 { 2629 /* 2630 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it 2631 * can be a good migration opportunity with low cache and memory 2632 * footprint. Returning a CPU different than @prev_cpu triggers 2633 * immediate rq migration. 
However, for SCX, as the current rq 2634 * association doesn't dictate where the task is going to run, this 2635 * doesn't fit well. If necessary, we can later add a dedicated method 2636 * which can decide to preempt self to force it through the regular 2637 * scheduling path. 2638 */ 2639 if (unlikely(wake_flags & WF_EXEC)) 2640 return prev_cpu; 2641 2642 if (SCX_HAS_OP(select_cpu)) { 2643 s32 cpu; 2644 struct task_struct **ddsp_taskp; 2645 2646 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 2647 WARN_ON_ONCE(*ddsp_taskp); 2648 *ddsp_taskp = p; 2649 2650 cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, 2651 select_cpu, p, prev_cpu, wake_flags); 2652 *ddsp_taskp = NULL; 2653 if (ops_cpu_valid(cpu, "from ops.select_cpu()")) 2654 return cpu; 2655 else 2656 return prev_cpu; 2657 } else { 2658 bool found; 2659 s32 cpu; 2660 2661 cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); 2662 if (found) { 2663 p->scx.slice = SCX_SLICE_DFL; 2664 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; 2665 } 2666 return cpu; 2667 } 2668 } 2669 2670 static void set_cpus_allowed_scx(struct task_struct *p, 2671 struct affinity_context *ac) 2672 { 2673 set_cpus_allowed_common(p, ac); 2674 2675 /* 2676 * The effective cpumask is stored in @p->cpus_ptr which may temporarily 2677 * differ from the configured one in @p->cpus_mask. Always tell the bpf 2678 * scheduler the effective one. 2679 * 2680 * Fine-grained memory write control is enforced by BPF making the const 2681 * designation pointless. Cast it away when calling the operation. 2682 */ 2683 if (SCX_HAS_OP(set_cpumask)) 2684 SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, 2685 (struct cpumask *)p->cpus_ptr); 2686 } 2687 2688 static void reset_idle_masks(void) 2689 { 2690 /* 2691 * Consider all online cpus idle. Should converge to the actual state 2692 * quickly. 2693 */ 2694 cpumask_copy(idle_masks.cpu, cpu_online_mask); 2695 cpumask_copy(idle_masks.smt, cpu_online_mask); 2696 } 2697 2698 void __scx_update_idle(struct rq *rq, bool idle) 2699 { 2700 int cpu = cpu_of(rq); 2701 2702 if (SCX_HAS_OP(update_idle)) { 2703 SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); 2704 if (!static_branch_unlikely(&scx_builtin_idle_enabled)) 2705 return; 2706 } 2707 2708 if (idle) 2709 cpumask_set_cpu(cpu, idle_masks.cpu); 2710 else 2711 cpumask_clear_cpu(cpu, idle_masks.cpu); 2712 2713 #ifdef CONFIG_SCHED_SMT 2714 if (sched_smt_active()) { 2715 const struct cpumask *smt = cpu_smt_mask(cpu); 2716 2717 if (idle) { 2718 /* 2719 * idle_masks.smt handling is racy but that's fine as 2720 * it's only for optimization and self-correcting. 2721 */ 2722 for_each_cpu(cpu, smt) { 2723 if (!cpumask_test_cpu(cpu, idle_masks.cpu)) 2724 return; 2725 } 2726 cpumask_or(idle_masks.smt, idle_masks.smt, smt); 2727 } else { 2728 cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); 2729 } 2730 } 2731 #endif 2732 } 2733 2734 static void handle_hotplug(struct rq *rq, bool online) 2735 { 2736 int cpu = cpu_of(rq); 2737 2738 atomic_long_inc(&scx_hotplug_seq); 2739 2740 if (online && SCX_HAS_OP(cpu_online)) 2741 SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu); 2742 else if (!online && SCX_HAS_OP(cpu_offline)) 2743 SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu); 2744 else 2745 scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 2746 "cpu %d going %s, exiting scheduler", cpu, 2747 online ? 
"online" : "offline"); 2748 } 2749 2750 void scx_rq_activate(struct rq *rq) 2751 { 2752 handle_hotplug(rq, true); 2753 } 2754 2755 void scx_rq_deactivate(struct rq *rq) 2756 { 2757 handle_hotplug(rq, false); 2758 } 2759 2760 static void rq_online_scx(struct rq *rq) 2761 { 2762 rq->scx.flags |= SCX_RQ_ONLINE; 2763 } 2764 2765 static void rq_offline_scx(struct rq *rq) 2766 { 2767 rq->scx.flags &= ~SCX_RQ_ONLINE; 2768 } 2769 2770 #else /* CONFIG_SMP */ 2771 2772 static bool test_and_clear_cpu_idle(int cpu) { return false; } 2773 static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } 2774 static void reset_idle_masks(void) {} 2775 2776 #endif /* CONFIG_SMP */ 2777 2778 static bool check_rq_for_timeouts(struct rq *rq) 2779 { 2780 struct task_struct *p; 2781 struct rq_flags rf; 2782 bool timed_out = false; 2783 2784 rq_lock_irqsave(rq, &rf); 2785 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { 2786 unsigned long last_runnable = p->scx.runnable_at; 2787 2788 if (unlikely(time_after(jiffies, 2789 last_runnable + scx_watchdog_timeout))) { 2790 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); 2791 2792 scx_ops_error_kind(SCX_EXIT_ERROR_STALL, 2793 "%s[%d] failed to run for %u.%03us", 2794 p->comm, p->pid, 2795 dur_ms / 1000, dur_ms % 1000); 2796 timed_out = true; 2797 break; 2798 } 2799 } 2800 rq_unlock_irqrestore(rq, &rf); 2801 2802 return timed_out; 2803 } 2804 2805 static void scx_watchdog_workfn(struct work_struct *work) 2806 { 2807 int cpu; 2808 2809 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 2810 2811 for_each_online_cpu(cpu) { 2812 if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) 2813 break; 2814 2815 cond_resched(); 2816 } 2817 queue_delayed_work(system_unbound_wq, to_delayed_work(work), 2818 scx_watchdog_timeout / 2); 2819 } 2820 2821 void scx_tick(struct rq *rq) 2822 { 2823 unsigned long last_check; 2824 2825 if (!scx_enabled()) 2826 return; 2827 2828 last_check = READ_ONCE(scx_watchdog_timestamp); 2829 if (unlikely(time_after(jiffies, 2830 last_check + READ_ONCE(scx_watchdog_timeout)))) { 2831 u32 dur_ms = jiffies_to_msecs(jiffies - last_check); 2832 2833 scx_ops_error_kind(SCX_EXIT_ERROR_STALL, 2834 "watchdog failed to check in for %u.%03us", 2835 dur_ms / 1000, dur_ms % 1000); 2836 } 2837 2838 update_other_load_avgs(rq); 2839 } 2840 2841 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) 2842 { 2843 update_curr_scx(rq); 2844 2845 /* 2846 * While bypassing, always resched as we can't trust the slice 2847 * management. 
2848 */ 2849 if (scx_ops_bypassing()) 2850 curr->scx.slice = 0; 2851 else if (SCX_HAS_OP(tick)) 2852 SCX_CALL_OP(SCX_KF_REST, tick, curr); 2853 2854 if (!curr->scx.slice) 2855 resched_curr(rq); 2856 } 2857 2858 static enum scx_task_state scx_get_task_state(const struct task_struct *p) 2859 { 2860 return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; 2861 } 2862 2863 static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) 2864 { 2865 enum scx_task_state prev_state = scx_get_task_state(p); 2866 bool warn = false; 2867 2868 BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); 2869 2870 switch (state) { 2871 case SCX_TASK_NONE: 2872 break; 2873 case SCX_TASK_INIT: 2874 warn = prev_state != SCX_TASK_NONE; 2875 break; 2876 case SCX_TASK_READY: 2877 warn = prev_state == SCX_TASK_NONE; 2878 break; 2879 case SCX_TASK_ENABLED: 2880 warn = prev_state != SCX_TASK_READY; 2881 break; 2882 default: 2883 warn = true; 2884 return; 2885 } 2886 2887 WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", 2888 prev_state, state, p->comm, p->pid); 2889 2890 p->scx.flags &= ~SCX_TASK_STATE_MASK; 2891 p->scx.flags |= state << SCX_TASK_STATE_SHIFT; 2892 } 2893 2894 static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork) 2895 { 2896 int ret; 2897 2898 p->scx.disallow = false; 2899 2900 if (SCX_HAS_OP(init_task)) { 2901 struct scx_init_task_args args = { 2902 .fork = fork, 2903 }; 2904 2905 ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args); 2906 if (unlikely(ret)) { 2907 ret = ops_sanitize_err("init_task", ret); 2908 return ret; 2909 } 2910 } 2911 2912 scx_set_task_state(p, SCX_TASK_INIT); 2913 2914 if (p->scx.disallow) { 2915 struct rq *rq; 2916 struct rq_flags rf; 2917 2918 rq = task_rq_lock(p, &rf); 2919 2920 /* 2921 * We're either in fork or load path and @p->policy will be 2922 * applied right after. Reverting @p->policy here and rejecting 2923 * %SCHED_EXT transitions from scx_check_setscheduler() 2924 * guarantees that if ops.init_task() sets @p->disallow, @p can 2925 * never be in SCX. 2926 */ 2927 if (p->policy == SCHED_EXT) { 2928 p->policy = SCHED_NORMAL; 2929 atomic_long_inc(&scx_nr_rejected); 2930 } 2931 2932 task_rq_unlock(rq, p, &rf); 2933 } 2934 2935 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 2936 return 0; 2937 } 2938 2939 static void set_task_scx_weight(struct task_struct *p) 2940 { 2941 u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; 2942 2943 p->scx.weight = sched_weight_to_cgroup(weight); 2944 } 2945 2946 static void scx_ops_enable_task(struct task_struct *p) 2947 { 2948 lockdep_assert_rq_held(task_rq(p)); 2949 2950 /* 2951 * Set the weight before calling ops.enable() so that the scheduler 2952 * doesn't see a stale value if they inspect the task struct. 
2953 */ 2954 set_task_scx_weight(p); 2955 if (SCX_HAS_OP(enable)) 2956 SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); 2957 scx_set_task_state(p, SCX_TASK_ENABLED); 2958 2959 if (SCX_HAS_OP(set_weight)) 2960 SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight); 2961 } 2962 2963 static void scx_ops_disable_task(struct task_struct *p) 2964 { 2965 lockdep_assert_rq_held(task_rq(p)); 2966 WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); 2967 2968 if (SCX_HAS_OP(disable)) 2969 SCX_CALL_OP(SCX_KF_REST, disable, p); 2970 scx_set_task_state(p, SCX_TASK_READY); 2971 } 2972 2973 static void scx_ops_exit_task(struct task_struct *p) 2974 { 2975 struct scx_exit_task_args args = { 2976 .cancelled = false, 2977 }; 2978 2979 lockdep_assert_rq_held(task_rq(p)); 2980 2981 switch (scx_get_task_state(p)) { 2982 case SCX_TASK_NONE: 2983 return; 2984 case SCX_TASK_INIT: 2985 args.cancelled = true; 2986 break; 2987 case SCX_TASK_READY: 2988 break; 2989 case SCX_TASK_ENABLED: 2990 scx_ops_disable_task(p); 2991 break; 2992 default: 2993 WARN_ON_ONCE(true); 2994 return; 2995 } 2996 2997 if (SCX_HAS_OP(exit_task)) 2998 SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); 2999 scx_set_task_state(p, SCX_TASK_NONE); 3000 } 3001 3002 void init_scx_entity(struct sched_ext_entity *scx) 3003 { 3004 /* 3005 * init_idle() calls this function again after fork sequence is 3006 * complete. Don't touch ->tasks_node as it's already linked. 3007 */ 3008 memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); 3009 3010 INIT_LIST_HEAD(&scx->dsq_node); 3011 scx->sticky_cpu = -1; 3012 scx->holding_cpu = -1; 3013 INIT_LIST_HEAD(&scx->runnable_node); 3014 scx->runnable_at = jiffies; 3015 scx->ddsp_dsq_id = SCX_DSQ_INVALID; 3016 scx->slice = SCX_SLICE_DFL; 3017 } 3018 3019 void scx_pre_fork(struct task_struct *p) 3020 { 3021 /* 3022 * BPF scheduler enable/disable paths want to be able to iterate and 3023 * update all tasks which can become complex when racing forks. As 3024 * enable/disable are very cold paths, let's use a percpu_rwsem to 3025 * exclude forks. 3026 */ 3027 percpu_down_read(&scx_fork_rwsem); 3028 } 3029 3030 int scx_fork(struct task_struct *p) 3031 { 3032 percpu_rwsem_assert_held(&scx_fork_rwsem); 3033 3034 if (scx_enabled()) 3035 return scx_ops_init_task(p, task_group(p), true); 3036 else 3037 return 0; 3038 } 3039 3040 void scx_post_fork(struct task_struct *p) 3041 { 3042 if (scx_enabled()) { 3043 scx_set_task_state(p, SCX_TASK_READY); 3044 3045 /* 3046 * Enable the task immediately if it's running on sched_ext. 3047 * Otherwise, it'll be enabled in switching_to_scx() if and 3048 * when it's ever configured to run with a SCHED_EXT policy. 
3049 */ 3050 if (p->sched_class == &ext_sched_class) { 3051 struct rq_flags rf; 3052 struct rq *rq; 3053 3054 rq = task_rq_lock(p, &rf); 3055 scx_ops_enable_task(p); 3056 task_rq_unlock(rq, p, &rf); 3057 } 3058 } 3059 3060 spin_lock_irq(&scx_tasks_lock); 3061 list_add_tail(&p->scx.tasks_node, &scx_tasks); 3062 spin_unlock_irq(&scx_tasks_lock); 3063 3064 percpu_up_read(&scx_fork_rwsem); 3065 } 3066 3067 void scx_cancel_fork(struct task_struct *p) 3068 { 3069 if (scx_enabled()) { 3070 struct rq *rq; 3071 struct rq_flags rf; 3072 3073 rq = task_rq_lock(p, &rf); 3074 WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); 3075 scx_ops_exit_task(p); 3076 task_rq_unlock(rq, p, &rf); 3077 } 3078 3079 percpu_up_read(&scx_fork_rwsem); 3080 } 3081 3082 void sched_ext_free(struct task_struct *p) 3083 { 3084 unsigned long flags; 3085 3086 spin_lock_irqsave(&scx_tasks_lock, flags); 3087 list_del_init(&p->scx.tasks_node); 3088 spin_unlock_irqrestore(&scx_tasks_lock, flags); 3089 3090 /* 3091 * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> 3092 * ENABLED transitions can't race us. Disable ops for @p. 3093 */ 3094 if (scx_get_task_state(p) != SCX_TASK_NONE) { 3095 struct rq_flags rf; 3096 struct rq *rq; 3097 3098 rq = task_rq_lock(p, &rf); 3099 scx_ops_exit_task(p); 3100 task_rq_unlock(rq, p, &rf); 3101 } 3102 } 3103 3104 static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) 3105 { 3106 lockdep_assert_rq_held(task_rq(p)); 3107 3108 set_task_scx_weight(p); 3109 if (SCX_HAS_OP(set_weight)) 3110 SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); 3111 } 3112 3113 static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) 3114 { 3115 } 3116 3117 static void switching_to_scx(struct rq *rq, struct task_struct *p) 3118 { 3119 scx_ops_enable_task(p); 3120 3121 /* 3122 * set_cpus_allowed_scx() is not called while @p is associated with a 3123 * different scheduler class. Keep the BPF scheduler up-to-date. 3124 */ 3125 if (SCX_HAS_OP(set_cpumask)) 3126 SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, 3127 (struct cpumask *)p->cpus_ptr); 3128 } 3129 3130 static void switched_from_scx(struct rq *rq, struct task_struct *p) 3131 { 3132 scx_ops_disable_task(p); 3133 } 3134 3135 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} 3136 static void switched_to_scx(struct rq *rq, struct task_struct *p) {} 3137 3138 int scx_check_setscheduler(struct task_struct *p, int policy) 3139 { 3140 lockdep_assert_rq_held(task_rq(p)); 3141 3142 /* if disallow, reject transitioning into SCX */ 3143 if (scx_enabled() && READ_ONCE(p->scx.disallow) && 3144 p->policy != policy && policy == SCHED_EXT) 3145 return -EACCES; 3146 3147 return 0; 3148 } 3149 3150 #ifdef CONFIG_NO_HZ_FULL 3151 bool scx_can_stop_tick(struct rq *rq) 3152 { 3153 struct task_struct *p = rq->curr; 3154 3155 if (scx_ops_bypassing()) 3156 return false; 3157 3158 if (p->sched_class != &ext_sched_class) 3159 return true; 3160 3161 /* 3162 * @rq can dispatch from different DSQs, so we can't tell whether it 3163 * needs the tick or not by looking at nr_running. Allow stopping ticks 3164 * iff the BPF scheduler indicated so. See set_next_task_scx(). 3165 */ 3166 return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; 3167 } 3168 #endif 3169 3170 /* 3171 * Omitted operations: 3172 * 3173 * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task 3174 * isn't tied to the CPU at that point. 
Preemption is implemented by resetting 3175 * the victim task's slice to 0 and triggering reschedule on the target CPU. 3176 * 3177 * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. 3178 * 3179 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of 3180 * their current sched_class. Call them directly from sched core instead. 3181 * 3182 * - task_woken: Unnecessary. 3183 */ 3184 DEFINE_SCHED_CLASS(ext) = { 3185 .enqueue_task = enqueue_task_scx, 3186 .dequeue_task = dequeue_task_scx, 3187 .yield_task = yield_task_scx, 3188 .yield_to_task = yield_to_task_scx, 3189 3190 .wakeup_preempt = wakeup_preempt_scx, 3191 3192 .pick_next_task = pick_next_task_scx, 3193 3194 .put_prev_task = put_prev_task_scx, 3195 .set_next_task = set_next_task_scx, 3196 3197 #ifdef CONFIG_SMP 3198 .balance = balance_scx, 3199 .select_task_rq = select_task_rq_scx, 3200 .set_cpus_allowed = set_cpus_allowed_scx, 3201 3202 .rq_online = rq_online_scx, 3203 .rq_offline = rq_offline_scx, 3204 #endif 3205 3206 .task_tick = task_tick_scx, 3207 3208 .switching_to = switching_to_scx, 3209 .switched_from = switched_from_scx, 3210 .switched_to = switched_to_scx, 3211 .reweight_task = reweight_task_scx, 3212 .prio_changed = prio_changed_scx, 3213 3214 .update_curr = update_curr_scx, 3215 3216 #ifdef CONFIG_UCLAMP_TASK 3217 .uclamp_enabled = 0, 3218 #endif 3219 }; 3220 3221 static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) 3222 { 3223 memset(dsq, 0, sizeof(*dsq)); 3224 3225 raw_spin_lock_init(&dsq->lock); 3226 INIT_LIST_HEAD(&dsq->list); 3227 dsq->id = dsq_id; 3228 } 3229 3230 static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) 3231 { 3232 struct scx_dispatch_q *dsq; 3233 int ret; 3234 3235 if (dsq_id & SCX_DSQ_FLAG_BUILTIN) 3236 return ERR_PTR(-EINVAL); 3237 3238 dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); 3239 if (!dsq) 3240 return ERR_PTR(-ENOMEM); 3241 3242 init_dsq(dsq, dsq_id); 3243 3244 ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, 3245 dsq_hash_params); 3246 if (ret) { 3247 kfree(dsq); 3248 return ERR_PTR(ret); 3249 } 3250 return dsq; 3251 } 3252 3253 static void free_dsq_irq_workfn(struct irq_work *irq_work) 3254 { 3255 struct llist_node *to_free = llist_del_all(&dsqs_to_free); 3256 struct scx_dispatch_q *dsq, *tmp_dsq; 3257 3258 llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) 3259 kfree_rcu(dsq, rcu); 3260 } 3261 3262 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); 3263 3264 static void destroy_dsq(u64 dsq_id) 3265 { 3266 struct scx_dispatch_q *dsq; 3267 unsigned long flags; 3268 3269 rcu_read_lock(); 3270 3271 dsq = find_user_dsq(dsq_id); 3272 if (!dsq) 3273 goto out_unlock_rcu; 3274 3275 raw_spin_lock_irqsave(&dsq->lock, flags); 3276 3277 if (dsq->nr) { 3278 scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", 3279 dsq->id, dsq->nr); 3280 goto out_unlock_dsq; 3281 } 3282 3283 if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) 3284 goto out_unlock_dsq; 3285 3286 /* 3287 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from 3288 * queueing more tasks. As this function can be called from anywhere, 3289 * freeing is bounced through an irq work to avoid nesting RCU 3290 * operations inside scheduler locks. 
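 *
 * Racing dispatch_enqueue() callers will see ->id == %SCX_DSQ_INVALID
 * and treat the DSQ as dead; the memory itself is only reclaimed via
 * kfree_rcu() from the irq work once in-flight RCU readers are done.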
3291 */ 3292 dsq->id = SCX_DSQ_INVALID; 3293 llist_add(&dsq->free_node, &dsqs_to_free); 3294 irq_work_queue(&free_dsq_irq_work); 3295 3296 out_unlock_dsq: 3297 raw_spin_unlock_irqrestore(&dsq->lock, flags); 3298 out_unlock_rcu: 3299 rcu_read_unlock(); 3300 } 3301 3302 3303 /******************************************************************************** 3304 * Sysfs interface and ops enable/disable. 3305 */ 3306 3307 #define SCX_ATTR(_name) \ 3308 static struct kobj_attribute scx_attr_##_name = { \ 3309 .attr = { .name = __stringify(_name), .mode = 0444 }, \ 3310 .show = scx_attr_##_name##_show, \ 3311 } 3312 3313 static ssize_t scx_attr_state_show(struct kobject *kobj, 3314 struct kobj_attribute *ka, char *buf) 3315 { 3316 return sysfs_emit(buf, "%s\n", 3317 scx_ops_enable_state_str[scx_ops_enable_state()]); 3318 } 3319 SCX_ATTR(state); 3320 3321 static ssize_t scx_attr_switch_all_show(struct kobject *kobj, 3322 struct kobj_attribute *ka, char *buf) 3323 { 3324 return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); 3325 } 3326 SCX_ATTR(switch_all); 3327 3328 static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, 3329 struct kobj_attribute *ka, char *buf) 3330 { 3331 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); 3332 } 3333 SCX_ATTR(nr_rejected); 3334 3335 static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, 3336 struct kobj_attribute *ka, char *buf) 3337 { 3338 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); 3339 } 3340 SCX_ATTR(hotplug_seq); 3341 3342 static struct attribute *scx_global_attrs[] = { 3343 &scx_attr_state.attr, 3344 &scx_attr_switch_all.attr, 3345 &scx_attr_nr_rejected.attr, 3346 &scx_attr_hotplug_seq.attr, 3347 NULL, 3348 }; 3349 3350 static const struct attribute_group scx_global_attr_group = { 3351 .attrs = scx_global_attrs, 3352 }; 3353 3354 static void scx_kobj_release(struct kobject *kobj) 3355 { 3356 kfree(kobj); 3357 } 3358 3359 static ssize_t scx_attr_ops_show(struct kobject *kobj, 3360 struct kobj_attribute *ka, char *buf) 3361 { 3362 return sysfs_emit(buf, "%s\n", scx_ops.name); 3363 } 3364 SCX_ATTR(ops); 3365 3366 static struct attribute *scx_sched_attrs[] = { 3367 &scx_attr_ops.attr, 3368 NULL, 3369 }; 3370 ATTRIBUTE_GROUPS(scx_sched); 3371 3372 static const struct kobj_type scx_ktype = { 3373 .release = scx_kobj_release, 3374 .sysfs_ops = &kobj_sysfs_ops, 3375 .default_groups = scx_sched_groups, 3376 }; 3377 3378 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) 3379 { 3380 return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); 3381 } 3382 3383 static const struct kset_uevent_ops scx_uevent_ops = { 3384 .uevent = scx_uevent, 3385 }; 3386 3387 /* 3388 * Used by sched_fork() and __setscheduler_prio() to pick the matching 3389 * sched_class. dl/rt are already handled. 3390 */ 3391 bool task_should_scx(struct task_struct *p) 3392 { 3393 if (!scx_enabled() || 3394 unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) 3395 return false; 3396 if (READ_ONCE(scx_switching_all)) 3397 return true; 3398 return p->policy == SCHED_EXT; 3399 } 3400 3401 /** 3402 * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress 3403 * 3404 * Bypassing guarantees that all runnable tasks make forward progress without 3405 * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might 3406 * be held by tasks that the BPF scheduler is forgetting to run, which 3407 * unfortunately also excludes toggling the static branches. 
3408 * 3409 * Let's work around by overriding a couple ops and modifying behaviors based on 3410 * the DISABLING state and then cycling the queued tasks through dequeue/enqueue 3411 * to force global FIFO scheduling. 3412 * 3413 * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. 3414 * 3415 * b. ops.dispatch() is ignored. 3416 * 3417 * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be 3418 * trusted. Whenever a tick triggers, the running task is rotated to the tail 3419 * of the queue. 3420 * 3421 * d. pick_next_task() suppresses zero slice warning. 3422 * 3423 * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM 3424 * operations. 3425 */ 3426 static void scx_ops_bypass(bool bypass) 3427 { 3428 int depth, cpu; 3429 3430 if (bypass) { 3431 depth = atomic_inc_return(&scx_ops_bypass_depth); 3432 WARN_ON_ONCE(depth <= 0); 3433 if (depth != 1) 3434 return; 3435 } else { 3436 depth = atomic_dec_return(&scx_ops_bypass_depth); 3437 WARN_ON_ONCE(depth < 0); 3438 if (depth != 0) 3439 return; 3440 } 3441 3442 /* 3443 * We need to guarantee that no tasks are on the BPF scheduler while 3444 * bypassing. Either we see enabled or the enable path sees the 3445 * increased bypass_depth before moving tasks to SCX. 3446 */ 3447 if (!scx_enabled()) 3448 return; 3449 3450 /* 3451 * No task property is changing. We just need to make sure all currently 3452 * queued tasks are re-queued according to the new scx_ops_bypassing() 3453 * state. As an optimization, walk each rq's runnable_list instead of 3454 * the scx_tasks list. 3455 * 3456 * This function can't trust the scheduler and thus can't use 3457 * cpus_read_lock(). Walk all possible CPUs instead of online. 3458 */ 3459 for_each_possible_cpu(cpu) { 3460 struct rq *rq = cpu_rq(cpu); 3461 struct rq_flags rf; 3462 struct task_struct *p, *n; 3463 3464 rq_lock_irqsave(rq, &rf); 3465 3466 /* 3467 * The use of list_for_each_entry_safe_reverse() is required 3468 * because each task is going to be removed from and added back 3469 * to the runnable_list during iteration. Because they're added 3470 * to the tail of the list, safe reverse iteration can still 3471 * visit all nodes. 
3472 */ 3473 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, 3474 scx.runnable_node) { 3475 struct sched_enq_and_set_ctx ctx; 3476 3477 /* cycling deq/enq is enough, see the function comment */ 3478 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 3479 sched_enq_and_set_task(&ctx); 3480 } 3481 3482 rq_unlock_irqrestore(rq, &rf); 3483 3484 /* kick to restore ticks */ 3485 resched_cpu(cpu); 3486 } 3487 } 3488 3489 static void free_exit_info(struct scx_exit_info *ei) 3490 { 3491 kfree(ei->dump); 3492 kfree(ei->msg); 3493 kfree(ei->bt); 3494 kfree(ei); 3495 } 3496 3497 static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) 3498 { 3499 struct scx_exit_info *ei; 3500 3501 ei = kzalloc(sizeof(*ei), GFP_KERNEL); 3502 if (!ei) 3503 return NULL; 3504 3505 ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL); 3506 ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); 3507 ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); 3508 3509 if (!ei->bt || !ei->msg || !ei->dump) { 3510 free_exit_info(ei); 3511 return NULL; 3512 } 3513 3514 return ei; 3515 } 3516 3517 static const char *scx_exit_reason(enum scx_exit_kind kind) 3518 { 3519 switch (kind) { 3520 case SCX_EXIT_UNREG: 3521 return "Scheduler unregistered from user space"; 3522 case SCX_EXIT_UNREG_BPF: 3523 return "Scheduler unregistered from BPF"; 3524 case SCX_EXIT_UNREG_KERN: 3525 return "Scheduler unregistered from the main kernel"; 3526 case SCX_EXIT_SYSRQ: 3527 return "disabled by sysrq-S"; 3528 case SCX_EXIT_ERROR: 3529 return "runtime error"; 3530 case SCX_EXIT_ERROR_BPF: 3531 return "scx_bpf_error"; 3532 case SCX_EXIT_ERROR_STALL: 3533 return "runnable task stall"; 3534 default: 3535 return "<UNKNOWN>"; 3536 } 3537 } 3538 3539 static void scx_ops_disable_workfn(struct kthread_work *work) 3540 { 3541 struct scx_exit_info *ei = scx_exit_info; 3542 struct scx_task_iter sti; 3543 struct task_struct *p; 3544 struct rhashtable_iter rht_iter; 3545 struct scx_dispatch_q *dsq; 3546 int i, kind; 3547 3548 kind = atomic_read(&scx_exit_kind); 3549 while (true) { 3550 /* 3551 * NONE indicates that a new scx_ops has been registered since 3552 * disable was scheduled - don't kill the new ops. DONE 3553 * indicates that the ops has already been disabled. 3554 */ 3555 if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) 3556 return; 3557 if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) 3558 break; 3559 } 3560 ei->kind = kind; 3561 ei->reason = scx_exit_reason(ei->kind); 3562 3563 /* guarantee forward progress by bypassing scx_ops */ 3564 scx_ops_bypass(true); 3565 3566 switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { 3567 case SCX_OPS_DISABLING: 3568 WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); 3569 break; 3570 case SCX_OPS_DISABLED: 3571 pr_warn("sched_ext: ops error detected without ops (%s)\n", 3572 scx_exit_info->msg); 3573 WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != 3574 SCX_OPS_DISABLING); 3575 goto done; 3576 default: 3577 break; 3578 } 3579 3580 /* 3581 * Here, every runnable task is guaranteed to make forward progress and 3582 * we can safely use blocking synchronization constructs. Actually 3583 * disable ops. 3584 */ 3585 mutex_lock(&scx_ops_enable_mutex); 3586 3587 static_branch_disable(&__scx_switched_all); 3588 WRITE_ONCE(scx_switching_all, false); 3589 3590 /* 3591 * Avoid racing against fork. See scx_ops_enable() for explanation on 3592 * the locking order. 
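 *
 * The order below is scx_fork_rwsem, then cpus_read_lock(), then
 * scx_tasks_lock; keeping it consistent with the enable path avoids
 * lock order inversions between enable and disable.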
3593 */ 3594 percpu_down_write(&scx_fork_rwsem); 3595 cpus_read_lock(); 3596 3597 spin_lock_irq(&scx_tasks_lock); 3598 scx_task_iter_init(&sti); 3599 /* 3600 * Invoke scx_ops_exit_task() on all non-idle tasks, including 3601 * TASK_DEAD tasks. Because dead tasks may have a nonzero refcount, 3602 * we may not have invoked sched_ext_free() on them by the time a 3603 * scheduler is disabled. We must therefore exit the task here, or we'd 3604 * fail to invoke ops.exit_task(), as the scheduler will have been 3605 * unloaded by the time the task is subsequently exited on the 3606 * sched_ext_free() path. 3607 */ 3608 while ((p = scx_task_iter_next_locked(&sti, true))) { 3609 const struct sched_class *old_class = p->sched_class; 3610 struct sched_enq_and_set_ctx ctx; 3611 3612 if (READ_ONCE(p->__state) != TASK_DEAD) { 3613 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, 3614 &ctx); 3615 3616 p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); 3617 __setscheduler_prio(p, p->prio); 3618 check_class_changing(task_rq(p), p, old_class); 3619 3620 sched_enq_and_set_task(&ctx); 3621 3622 check_class_changed(task_rq(p), p, old_class, p->prio); 3623 } 3624 scx_ops_exit_task(p); 3625 } 3626 scx_task_iter_exit(&sti); 3627 spin_unlock_irq(&scx_tasks_lock); 3628 3629 /* no task is on scx, turn off all the switches and flush in-progress calls */ 3630 static_branch_disable_cpuslocked(&__scx_ops_enabled); 3631 for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) 3632 static_branch_disable_cpuslocked(&scx_has_op[i]); 3633 static_branch_disable_cpuslocked(&scx_ops_enq_last); 3634 static_branch_disable_cpuslocked(&scx_ops_enq_exiting); 3635 static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); 3636 static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); 3637 synchronize_rcu(); 3638 3639 cpus_read_unlock(); 3640 percpu_up_write(&scx_fork_rwsem); 3641 3642 if (ei->kind >= SCX_EXIT_ERROR) { 3643 printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); 3644 3645 if (ei->msg[0] == '\0') 3646 printk(KERN_ERR "sched_ext: %s\n", ei->reason); 3647 else 3648 printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); 3649 3650 stack_trace_print(ei->bt, ei->bt_len, 2); 3651 } 3652 3653 if (scx_ops.exit) 3654 SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); 3655 3656 cancel_delayed_work_sync(&scx_watchdog_work); 3657 3658 /* 3659 * Delete the kobject from the hierarchy eagerly in addition to just 3660 * dropping a reference. Otherwise, if the object is deleted 3661 * asynchronously, sysfs could observe an object of the same name still 3662 * in the hierarchy when another scheduler is loaded. 
3663 */ 3664 kobject_del(scx_root_kobj); 3665 kobject_put(scx_root_kobj); 3666 scx_root_kobj = NULL; 3667 3668 memset(&scx_ops, 0, sizeof(scx_ops)); 3669 3670 rhashtable_walk_enter(&dsq_hash, &rht_iter); 3671 do { 3672 rhashtable_walk_start(&rht_iter); 3673 3674 while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) 3675 destroy_dsq(dsq->id); 3676 3677 rhashtable_walk_stop(&rht_iter); 3678 } while (dsq == ERR_PTR(-EAGAIN)); 3679 rhashtable_walk_exit(&rht_iter); 3680 3681 free_percpu(scx_dsp_ctx); 3682 scx_dsp_ctx = NULL; 3683 scx_dsp_max_batch = 0; 3684 3685 free_exit_info(scx_exit_info); 3686 scx_exit_info = NULL; 3687 3688 mutex_unlock(&scx_ops_enable_mutex); 3689 3690 WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != 3691 SCX_OPS_DISABLING); 3692 done: 3693 scx_ops_bypass(false); 3694 } 3695 3696 static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); 3697 3698 static void schedule_scx_ops_disable_work(void) 3699 { 3700 struct kthread_worker *helper = READ_ONCE(scx_ops_helper); 3701 3702 /* 3703 * We may be called spuriously before the first bpf_sched_ext_reg(). If 3704 * scx_ops_helper isn't set up yet, there's nothing to do. 3705 */ 3706 if (helper) 3707 kthread_queue_work(helper, &scx_ops_disable_work); 3708 } 3709 3710 static void scx_ops_disable(enum scx_exit_kind kind) 3711 { 3712 int none = SCX_EXIT_NONE; 3713 3714 if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) 3715 kind = SCX_EXIT_ERROR; 3716 3717 atomic_try_cmpxchg(&scx_exit_kind, &none, kind); 3718 3719 schedule_scx_ops_disable_work(); 3720 } 3721 3722 static void dump_newline(struct seq_buf *s) 3723 { 3724 trace_sched_ext_dump(""); 3725 3726 /* @s may be zero sized and seq_buf triggers WARN if so */ 3727 if (s->size) 3728 seq_buf_putc(s, '\n'); 3729 } 3730 3731 static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) 3732 { 3733 va_list args; 3734 3735 #ifdef CONFIG_TRACEPOINTS 3736 if (trace_sched_ext_dump_enabled()) { 3737 /* protected by scx_dump_state()::dump_lock */ 3738 static char line_buf[SCX_EXIT_MSG_LEN]; 3739 3740 va_start(args, fmt); 3741 vscnprintf(line_buf, sizeof(line_buf), fmt, args); 3742 va_end(args); 3743 3744 trace_sched_ext_dump(line_buf); 3745 } 3746 #endif 3747 /* @s may be zero sized and seq_buf triggers WARN if so */ 3748 if (s->size) { 3749 va_start(args, fmt); 3750 seq_buf_vprintf(s, fmt, args); 3751 va_end(args); 3752 3753 seq_buf_putc(s, '\n'); 3754 } 3755 } 3756 3757 static void dump_stack_trace(struct seq_buf *s, const char *prefix, 3758 const unsigned long *bt, unsigned int len) 3759 { 3760 unsigned int i; 3761 3762 for (i = 0; i < len; i++) 3763 dump_line(s, "%s%pS", prefix, (void *)bt[i]); 3764 } 3765 3766 static void ops_dump_init(struct seq_buf *s, const char *prefix) 3767 { 3768 struct scx_dump_data *dd = &scx_dump_data; 3769 3770 lockdep_assert_irqs_disabled(); 3771 3772 dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ 3773 dd->first = true; 3774 dd->cursor = 0; 3775 dd->s = s; 3776 dd->prefix = prefix; 3777 } 3778 3779 static void ops_dump_flush(void) 3780 { 3781 struct scx_dump_data *dd = &scx_dump_data; 3782 char *line = dd->buf.line; 3783 3784 if (!dd->cursor) 3785 return; 3786 3787 /* 3788 * There's something to flush and this is the first line. Insert a blank 3789 * line to distinguish ops dump. 3790 */ 3791 if (dd->first) { 3792 dump_newline(dd->s); 3793 dd->first = false; 3794 } 3795 3796 /* 3797 * There may be multiple lines in $line. Scan and emit each line 3798 * separately. 
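 *
 * For example, if the line buf holds "foo\nbar", two dump_line() calls are
 * made, emitting "foo" and then "bar", each prefixed with dd->prefix.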
3799 */ 3800 while (true) { 3801 char *end = line; 3802 char c; 3803 3804 while (*end != '\n' && *end != '\0') 3805 end++; 3806 3807 /* 3808 * If $line overflowed, it may not have newline at the end. 3809 * Always emit with a newline. 3810 */ 3811 c = *end; 3812 *end = '\0'; 3813 dump_line(dd->s, "%s%s", dd->prefix, line); 3814 if (c == '\0') 3815 break; 3816 3817 /* move to the next line */ 3818 end++; 3819 if (*end == '\0') 3820 break; 3821 line = end; 3822 } 3823 3824 dd->cursor = 0; 3825 } 3826 3827 static void ops_dump_exit(void) 3828 { 3829 ops_dump_flush(); 3830 scx_dump_data.cpu = -1; 3831 } 3832 3833 static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, 3834 struct task_struct *p, char marker) 3835 { 3836 static unsigned long bt[SCX_EXIT_BT_LEN]; 3837 char dsq_id_buf[19] = "(n/a)"; 3838 unsigned long ops_state = atomic_long_read(&p->scx.ops_state); 3839 unsigned int bt_len; 3840 3841 if (p->scx.dsq) 3842 scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", 3843 (unsigned long long)p->scx.dsq->id); 3844 3845 dump_newline(s); 3846 dump_line(s, " %c%c %s[%d] %+ldms", 3847 marker, task_state_to_char(p), p->comm, p->pid, 3848 jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); 3849 dump_line(s, " scx_state/flags=%u/0x%x ops_state/qseq=%lu/%lu", 3850 scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, 3851 ops_state & SCX_OPSS_STATE_MASK, 3852 ops_state >> SCX_OPSS_QSEQ_SHIFT); 3853 dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", 3854 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); 3855 dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); 3856 3857 if (SCX_HAS_OP(dump_task)) { 3858 ops_dump_init(s, " "); 3859 SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); 3860 ops_dump_exit(); 3861 } 3862 3863 bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); 3864 if (bt_len) { 3865 dump_newline(s); 3866 dump_stack_trace(s, " ", bt, bt_len); 3867 } 3868 } 3869 3870 static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) 3871 { 3872 static DEFINE_SPINLOCK(dump_lock); 3873 static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; 3874 struct scx_dump_ctx dctx = { 3875 .kind = ei->kind, 3876 .exit_code = ei->exit_code, 3877 .reason = ei->reason, 3878 .at_ns = ktime_get_ns(), 3879 .at_jiffies = jiffies, 3880 }; 3881 struct seq_buf s; 3882 unsigned long flags; 3883 char *buf; 3884 int cpu; 3885 3886 spin_lock_irqsave(&dump_lock, flags); 3887 3888 seq_buf_init(&s, ei->dump, dump_len); 3889 3890 if (ei->kind == SCX_EXIT_NONE) { 3891 dump_line(&s, "Debug dump triggered by %s", ei->reason); 3892 } else { 3893 dump_line(&s, "%s[%d] triggered exit kind %d:", 3894 current->comm, current->pid, ei->kind); 3895 dump_line(&s, " %s (%s)", ei->reason, ei->msg); 3896 dump_newline(&s); 3897 dump_line(&s, "Backtrace:"); 3898 dump_stack_trace(&s, " ", ei->bt, ei->bt_len); 3899 } 3900 3901 if (SCX_HAS_OP(dump)) { 3902 ops_dump_init(&s, ""); 3903 SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); 3904 ops_dump_exit(); 3905 } 3906 3907 dump_newline(&s); 3908 dump_line(&s, "CPU states"); 3909 dump_line(&s, "----------"); 3910 3911 for_each_possible_cpu(cpu) { 3912 struct rq *rq = cpu_rq(cpu); 3913 struct rq_flags rf; 3914 struct task_struct *p; 3915 struct seq_buf ns; 3916 size_t avail, used; 3917 bool idle; 3918 3919 rq_lock(rq, &rf); 3920 3921 idle = list_empty(&rq->scx.runnable_list) && 3922 rq->curr->sched_class == &idle_sched_class; 3923 3924 if (idle && !SCX_HAS_OP(dump_cpu)) 3925 goto next; 3926 3927 /* 3928 * We don't yet know whether ops.dump_cpu() will 
produce output 3929 * and we may want to skip the default CPU dump if it doesn't. 3930 * Use a nested seq_buf to generate the standard dump so that we 3931 * can decide whether to commit later. 3932 */ 3933 avail = seq_buf_get_buf(&s, &buf); 3934 seq_buf_init(&ns, buf, avail); 3935 3936 dump_newline(&ns); 3937 dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", 3938 cpu, rq->scx.nr_running, rq->scx.flags, 3939 rq->scx.cpu_released, rq->scx.ops_qseq, 3940 rq->scx.pnt_seq); 3941 dump_line(&ns, " curr=%s[%d] class=%ps", 3942 rq->curr->comm, rq->curr->pid, 3943 rq->curr->sched_class); 3944 if (!cpumask_empty(rq->scx.cpus_to_kick)) 3945 dump_line(&ns, " cpus_to_kick : %*pb", 3946 cpumask_pr_args(rq->scx.cpus_to_kick)); 3947 if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) 3948 dump_line(&ns, " idle_to_kick : %*pb", 3949 cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); 3950 if (!cpumask_empty(rq->scx.cpus_to_preempt)) 3951 dump_line(&ns, " cpus_to_preempt: %*pb", 3952 cpumask_pr_args(rq->scx.cpus_to_preempt)); 3953 if (!cpumask_empty(rq->scx.cpus_to_wait)) 3954 dump_line(&ns, " cpus_to_wait : %*pb", 3955 cpumask_pr_args(rq->scx.cpus_to_wait)); 3956 3957 used = seq_buf_used(&ns); 3958 if (SCX_HAS_OP(dump_cpu)) { 3959 ops_dump_init(&ns, " "); 3960 SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); 3961 ops_dump_exit(); 3962 } 3963 3964 /* 3965 * If idle && nothing generated by ops.dump_cpu(), there's 3966 * nothing interesting. Skip. 3967 */ 3968 if (idle && used == seq_buf_used(&ns)) 3969 goto next; 3970 3971 /* 3972 * $s may already have overflowed when $ns was created. If so, 3973 * calling commit on it will trigger BUG. 3974 */ 3975 if (avail) { 3976 seq_buf_commit(&s, seq_buf_used(&ns)); 3977 if (seq_buf_has_overflowed(&ns)) 3978 seq_buf_set_overflow(&s); 3979 } 3980 3981 if (rq->curr->sched_class == &ext_sched_class) 3982 scx_dump_task(&s, &dctx, rq->curr, '*'); 3983 3984 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) 3985 scx_dump_task(&s, &dctx, p, ' '); 3986 next: 3987 rq_unlock(rq, &rf); 3988 } 3989 3990 if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) 3991 memcpy(ei->dump + dump_len - sizeof(trunc_marker), 3992 trunc_marker, sizeof(trunc_marker)); 3993 3994 spin_unlock_irqrestore(&dump_lock, flags); 3995 } 3996 3997 static void scx_ops_error_irq_workfn(struct irq_work *irq_work) 3998 { 3999 struct scx_exit_info *ei = scx_exit_info; 4000 4001 if (ei->kind >= SCX_EXIT_ERROR) 4002 scx_dump_state(ei, scx_ops.exit_dump_len); 4003 4004 schedule_scx_ops_disable_work(); 4005 } 4006 4007 static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); 4008 4009 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, 4010 s64 exit_code, 4011 const char *fmt, ...) 4012 { 4013 struct scx_exit_info *ei = scx_exit_info; 4014 int none = SCX_EXIT_NONE; 4015 va_list args; 4016 4017 if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) 4018 return; 4019 4020 ei->exit_code = exit_code; 4021 4022 if (kind >= SCX_EXIT_ERROR) 4023 ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); 4024 4025 va_start(args, fmt); 4026 vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); 4027 va_end(args); 4028 4029 /* 4030 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again 4031 * in scx_ops_disable_workfn(). 
4032 */ 4033 ei->kind = kind; 4034 ei->reason = scx_exit_reason(ei->kind); 4035 4036 irq_work_queue(&scx_ops_error_irq_work); 4037 } 4038 4039 static struct kthread_worker *scx_create_rt_helper(const char *name) 4040 { 4041 struct kthread_worker *helper; 4042 4043 helper = kthread_create_worker(0, name); 4044 if (helper) 4045 sched_set_fifo(helper->task); 4046 return helper; 4047 } 4048 4049 static void check_hotplug_seq(const struct sched_ext_ops *ops) 4050 { 4051 unsigned long long global_hotplug_seq; 4052 4053 /* 4054 * If a hotplug event has occurred between when a scheduler was 4055 * initialized, and when we were able to attach, exit and notify user 4056 * space about it. 4057 */ 4058 if (ops->hotplug_seq) { 4059 global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); 4060 if (ops->hotplug_seq != global_hotplug_seq) { 4061 scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 4062 "expected hotplug seq %llu did not match actual %llu", 4063 ops->hotplug_seq, global_hotplug_seq); 4064 } 4065 } 4066 } 4067 4068 static int validate_ops(const struct sched_ext_ops *ops) 4069 { 4070 /* 4071 * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the 4072 * ops.enqueue() callback isn't implemented. 4073 */ 4074 if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { 4075 scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); 4076 return -EINVAL; 4077 } 4078 4079 return 0; 4080 } 4081 4082 static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) 4083 { 4084 struct scx_task_iter sti; 4085 struct task_struct *p; 4086 unsigned long timeout; 4087 int i, ret; 4088 4089 mutex_lock(&scx_ops_enable_mutex); 4090 4091 if (!scx_ops_helper) { 4092 WRITE_ONCE(scx_ops_helper, 4093 scx_create_rt_helper("sched_ext_ops_helper")); 4094 if (!scx_ops_helper) { 4095 ret = -ENOMEM; 4096 goto err_unlock; 4097 } 4098 } 4099 4100 if (scx_ops_enable_state() != SCX_OPS_DISABLED) { 4101 ret = -EBUSY; 4102 goto err_unlock; 4103 } 4104 4105 scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); 4106 if (!scx_root_kobj) { 4107 ret = -ENOMEM; 4108 goto err_unlock; 4109 } 4110 4111 scx_root_kobj->kset = scx_kset; 4112 ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); 4113 if (ret < 0) 4114 goto err; 4115 4116 scx_exit_info = alloc_exit_info(ops->exit_dump_len); 4117 if (!scx_exit_info) { 4118 ret = -ENOMEM; 4119 goto err_del; 4120 } 4121 4122 /* 4123 * Set scx_ops, transition to PREPPING and clear exit info to arm the 4124 * disable path. Failure triggers full disabling from here on. 4125 */ 4126 scx_ops = *ops; 4127 4128 WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != 4129 SCX_OPS_DISABLED); 4130 4131 atomic_set(&scx_exit_kind, SCX_EXIT_NONE); 4132 scx_warned_zero_slice = false; 4133 4134 atomic_long_set(&scx_nr_rejected, 0); 4135 4136 /* 4137 * Keep CPUs stable during enable so that the BPF scheduler can track 4138 * online CPUs by watching ->on/offline_cpu() after ->init(). 
4139 */ 4140 cpus_read_lock(); 4141 4142 if (scx_ops.init) { 4143 ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init); 4144 if (ret) { 4145 ret = ops_sanitize_err("init", ret); 4146 goto err_disable_unlock_cpus; 4147 } 4148 } 4149 4150 for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) 4151 if (((void (**)(void))ops)[i]) 4152 static_branch_enable_cpuslocked(&scx_has_op[i]); 4153 4154 cpus_read_unlock(); 4155 4156 ret = validate_ops(ops); 4157 if (ret) 4158 goto err_disable; 4159 4160 WARN_ON_ONCE(scx_dsp_ctx); 4161 scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; 4162 scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf, 4163 scx_dsp_max_batch), 4164 __alignof__(struct scx_dsp_ctx)); 4165 if (!scx_dsp_ctx) { 4166 ret = -ENOMEM; 4167 goto err_disable; 4168 } 4169 4170 if (ops->timeout_ms) 4171 timeout = msecs_to_jiffies(ops->timeout_ms); 4172 else 4173 timeout = SCX_WATCHDOG_MAX_TIMEOUT; 4174 4175 WRITE_ONCE(scx_watchdog_timeout, timeout); 4176 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 4177 queue_delayed_work(system_unbound_wq, &scx_watchdog_work, 4178 scx_watchdog_timeout / 2); 4179 4180 /* 4181 * Lock out forks before opening the floodgate so that they don't wander 4182 * into the operations prematurely. 4183 * 4184 * We don't need to keep the CPUs stable but grab cpus_read_lock() to 4185 * ease future locking changes for cgroup support. 4186 * 4187 * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the 4188 * following dependency chain: 4189 * 4190 * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock 4191 */ 4192 percpu_down_write(&scx_fork_rwsem); 4193 cpus_read_lock(); 4194 4195 check_hotplug_seq(ops); 4196 4197 for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) 4198 if (((void (**)(void))ops)[i]) 4199 static_branch_enable_cpuslocked(&scx_has_op[i]); 4200 4201 if (ops->flags & SCX_OPS_ENQ_LAST) 4202 static_branch_enable_cpuslocked(&scx_ops_enq_last); 4203 4204 if (ops->flags & SCX_OPS_ENQ_EXITING) 4205 static_branch_enable_cpuslocked(&scx_ops_enq_exiting); 4206 if (scx_ops.cpu_acquire || scx_ops.cpu_release) 4207 static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); 4208 4209 if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { 4210 reset_idle_masks(); 4211 static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); 4212 } else { 4213 static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); 4214 } 4215 4216 static_branch_enable_cpuslocked(&__scx_ops_enabled); 4217 4218 /* 4219 * Enable ops for every task. Fork is excluded by scx_fork_rwsem 4220 * preventing new tasks from being added. No need to exclude tasks 4221 * leaving, as sched_ext_free() can handle both prepped and enabled 4222 * tasks. Prep all tasks first and then enable them with preemption 4223 * disabled.
4224 */ 4225 spin_lock_irq(&scx_tasks_lock); 4226 4227 scx_task_iter_init(&sti); 4228 while ((p = scx_task_iter_next_locked(&sti, false))) { 4229 get_task_struct(p); 4230 scx_task_iter_rq_unlock(&sti); 4231 spin_unlock_irq(&scx_tasks_lock); 4232 4233 ret = scx_ops_init_task(p, task_group(p), false); 4234 if (ret) { 4235 put_task_struct(p); 4236 spin_lock_irq(&scx_tasks_lock); 4237 scx_task_iter_exit(&sti); 4238 spin_unlock_irq(&scx_tasks_lock); 4239 pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", 4240 ret, p->comm, p->pid); 4241 goto err_disable_unlock_all; 4242 } 4243 4244 put_task_struct(p); 4245 spin_lock_irq(&scx_tasks_lock); 4246 } 4247 scx_task_iter_exit(&sti); 4248 4249 /* 4250 * All tasks are prepped but are still ops-disabled. Ensure that 4251 * %current can't be scheduled out and switch everyone. 4252 * preempt_disable() is necessary because we can't guarantee that 4253 * %current won't be starved if scheduled out while switching. 4254 */ 4255 preempt_disable(); 4256 4257 /* 4258 * From here on, the disable path must assume that tasks have ops 4259 * enabled and need to be recovered. 4260 * 4261 * Transition to ENABLING fails iff the BPF scheduler has already 4262 * triggered scx_bpf_error(). Returning an error code here would lose 4263 * the recorded error information. Exit indicating success so that the 4264 * error is notified through ops.exit() with all the details. 4265 */ 4266 if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { 4267 preempt_enable(); 4268 spin_unlock_irq(&scx_tasks_lock); 4269 WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); 4270 ret = 0; 4271 goto err_disable_unlock_all; 4272 } 4273 4274 /* 4275 * We're fully committed and can't fail. The PREPPED -> ENABLED 4276 * transitions here are synchronized against sched_ext_free() through 4277 * scx_tasks_lock. 
4278 */ 4279 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 4280 4281 scx_task_iter_init(&sti); 4282 while ((p = scx_task_iter_next_locked(&sti, false))) { 4283 const struct sched_class *old_class = p->sched_class; 4284 struct sched_enq_and_set_ctx ctx; 4285 4286 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 4287 4288 scx_set_task_state(p, SCX_TASK_READY); 4289 __setscheduler_prio(p, p->prio); 4290 check_class_changing(task_rq(p), p, old_class); 4291 4292 sched_enq_and_set_task(&ctx); 4293 4294 check_class_changed(task_rq(p), p, old_class, p->prio); 4295 } 4296 scx_task_iter_exit(&sti); 4297 4298 spin_unlock_irq(&scx_tasks_lock); 4299 preempt_enable(); 4300 cpus_read_unlock(); 4301 percpu_up_write(&scx_fork_rwsem); 4302 4303 /* see above ENABLING transition for the explanation on exiting with 0 */ 4304 if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { 4305 WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); 4306 ret = 0; 4307 goto err_disable; 4308 } 4309 4310 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) 4311 static_branch_enable(&__scx_switched_all); 4312 4313 kobject_uevent(scx_root_kobj, KOBJ_ADD); 4314 mutex_unlock(&scx_ops_enable_mutex); 4315 4316 return 0; 4317 4318 err_del: 4319 kobject_del(scx_root_kobj); 4320 err: 4321 kobject_put(scx_root_kobj); 4322 scx_root_kobj = NULL; 4323 if (scx_exit_info) { 4324 free_exit_info(scx_exit_info); 4325 scx_exit_info = NULL; 4326 } 4327 err_unlock: 4328 mutex_unlock(&scx_ops_enable_mutex); 4329 return ret; 4330 4331 err_disable_unlock_all: 4332 percpu_up_write(&scx_fork_rwsem); 4333 err_disable_unlock_cpus: 4334 cpus_read_unlock(); 4335 err_disable: 4336 mutex_unlock(&scx_ops_enable_mutex); 4337 /* must be fully disabled before returning */ 4338 scx_ops_disable(SCX_EXIT_ERROR); 4339 kthread_flush_work(&scx_ops_disable_work); 4340 return ret; 4341 } 4342 4343 4344 /******************************************************************************** 4345 * bpf_struct_ops plumbing. 4346 */ 4347 #include <linux/bpf_verifier.h> 4348 #include <linux/bpf.h> 4349 #include <linux/btf.h> 4350 4351 extern struct btf *btf_vmlinux; 4352 static const struct btf_type *task_struct_type; 4353 static u32 task_struct_type_id; 4354 4355 static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size, 4356 enum bpf_access_type type, 4357 const struct bpf_prog *prog, 4358 struct bpf_insn_access_aux *info) 4359 { 4360 struct btf *btf = bpf_get_btf_vmlinux(); 4361 const struct bpf_struct_ops_desc *st_ops_desc; 4362 const struct btf_member *member; 4363 const struct btf_type *t; 4364 u32 btf_id, member_idx; 4365 const char *mname; 4366 4367 /* struct_ops op args are all sequential, 64-bit numbers */ 4368 if (off != arg_n * sizeof(__u64)) 4369 return false; 4370 4371 /* btf_id should be the type id of struct sched_ext_ops */ 4372 btf_id = prog->aux->attach_btf_id; 4373 st_ops_desc = bpf_struct_ops_find(btf, btf_id); 4374 if (!st_ops_desc) 4375 return false; 4376 4377 /* BTF type of struct sched_ext_ops */ 4378 t = st_ops_desc->type; 4379 4380 member_idx = prog->expected_attach_type; 4381 if (member_idx >= btf_type_vlen(t)) 4382 return false; 4383 4384 /* 4385 * Get the member name of this struct_ops program, which corresponds to 4386 * a field in struct sched_ext_ops. For example, the member name of the 4387 * dispatch struct_ops program (callback) is "dispatch". 
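 *
 * bpf_scx_is_valid_access() below applies this to the task argument of
 * ops.dispatch() and ops.yield(). As that argument may be NULL (see the
 * PTR_MAYBE_NULL note further down), a BPF scheduler is expected to check it
 * before use. A minimal illustrative sketch of a hypothetical ops.dispatch()
 * implementation (example_dispatch() is not part of this file):
 *
 *	void example_dispatch(s32 cpu, struct task_struct *prev)
 *	{
 *		if (!prev)
 *			return;
 *		bpf_printk("prev pid %d", prev->pid);
 *	}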
4388 */ 4389 member = &btf_type_member(t)[member_idx]; 4390 mname = btf_name_by_offset(btf_vmlinux, member->name_off); 4391 4392 if (!strcmp(mname, op)) { 4393 /* 4394 * The value is a pointer to a type (struct task_struct) given 4395 * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED), 4396 * however, can be a NULL (PTR_MAYBE_NULL). The BPF program 4397 * should check the pointer to make sure it is not NULL before 4398 * using it, or the verifier will reject the program. 4399 * 4400 * Longer term, this is something that should be addressed by 4401 * BTF, and be fully contained within the verifier. 4402 */ 4403 info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED; 4404 info->btf = btf_vmlinux; 4405 info->btf_id = task_struct_type_id; 4406 4407 return true; 4408 } 4409 4410 return false; 4411 } 4412 4413 static bool bpf_scx_is_valid_access(int off, int size, 4414 enum bpf_access_type type, 4415 const struct bpf_prog *prog, 4416 struct bpf_insn_access_aux *info) 4417 { 4418 if (type != BPF_READ) 4419 return false; 4420 if (set_arg_maybe_null("dispatch", 1, off, size, type, prog, info) || 4421 set_arg_maybe_null("yield", 1, off, size, type, prog, info)) 4422 return true; 4423 if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) 4424 return false; 4425 if (off % size != 0) 4426 return false; 4427 4428 return btf_ctx_access(off, size, type, prog, info); 4429 } 4430 4431 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, 4432 const struct bpf_reg_state *reg, int off, 4433 int size) 4434 { 4435 const struct btf_type *t; 4436 4437 t = btf_type_by_id(reg->btf, reg->btf_id); 4438 if (t == task_struct_type) { 4439 if (off >= offsetof(struct task_struct, scx.slice) && 4440 off + size <= offsetofend(struct task_struct, scx.slice)) 4441 return SCALAR_VALUE; 4442 if (off >= offsetof(struct task_struct, scx.disallow) && 4443 off + size <= offsetofend(struct task_struct, scx.disallow)) 4444 return SCALAR_VALUE; 4445 } 4446 4447 return -EACCES; 4448 } 4449 4450 static const struct bpf_func_proto * 4451 bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 4452 { 4453 switch (func_id) { 4454 case BPF_FUNC_task_storage_get: 4455 return &bpf_task_storage_get_proto; 4456 case BPF_FUNC_task_storage_delete: 4457 return &bpf_task_storage_delete_proto; 4458 default: 4459 return bpf_base_func_proto(func_id, prog); 4460 } 4461 } 4462 4463 static const struct bpf_verifier_ops bpf_scx_verifier_ops = { 4464 .get_func_proto = bpf_scx_get_func_proto, 4465 .is_valid_access = bpf_scx_is_valid_access, 4466 .btf_struct_access = bpf_scx_btf_struct_access, 4467 }; 4468 4469 static int bpf_scx_init_member(const struct btf_type *t, 4470 const struct btf_member *member, 4471 void *kdata, const void *udata) 4472 { 4473 const struct sched_ext_ops *uops = udata; 4474 struct sched_ext_ops *ops = kdata; 4475 u32 moff = __btf_member_bit_offset(t, member) / 8; 4476 int ret; 4477 4478 switch (moff) { 4479 case offsetof(struct sched_ext_ops, dispatch_max_batch): 4480 if (*(u32 *)(udata + moff) > INT_MAX) 4481 return -E2BIG; 4482 ops->dispatch_max_batch = *(u32 *)(udata + moff); 4483 return 1; 4484 case offsetof(struct sched_ext_ops, flags): 4485 if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) 4486 return -EINVAL; 4487 ops->flags = *(u64 *)(udata + moff); 4488 return 1; 4489 case offsetof(struct sched_ext_ops, name): 4490 ret = bpf_obj_name_cpy(ops->name, uops->name, 4491 sizeof(ops->name)); 4492 if (ret < 0) 4493 return ret; 4494 if (ret == 0) 4495 return -EINVAL; 4496 return 1; 
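	/*
	 * Note: per the usual bpf_struct_ops convention, a positive return
	 * from the cases above tells the struct_ops core that the member has
	 * been handled here, while the final return 0 defers the member to
	 * the generic handling.
	 */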
4497 case offsetof(struct sched_ext_ops, timeout_ms): 4498 if (msecs_to_jiffies(*(u32 *)(udata + moff)) > 4499 SCX_WATCHDOG_MAX_TIMEOUT) 4500 return -E2BIG; 4501 ops->timeout_ms = *(u32 *)(udata + moff); 4502 return 1; 4503 case offsetof(struct sched_ext_ops, exit_dump_len): 4504 ops->exit_dump_len = 4505 *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; 4506 return 1; 4507 case offsetof(struct sched_ext_ops, hotplug_seq): 4508 ops->hotplug_seq = *(u64 *)(udata + moff); 4509 return 1; 4510 } 4511 4512 return 0; 4513 } 4514 4515 static int bpf_scx_check_member(const struct btf_type *t, 4516 const struct btf_member *member, 4517 const struct bpf_prog *prog) 4518 { 4519 u32 moff = __btf_member_bit_offset(t, member) / 8; 4520 4521 switch (moff) { 4522 case offsetof(struct sched_ext_ops, init_task): 4523 case offsetof(struct sched_ext_ops, cpu_online): 4524 case offsetof(struct sched_ext_ops, cpu_offline): 4525 case offsetof(struct sched_ext_ops, init): 4526 case offsetof(struct sched_ext_ops, exit): 4527 break; 4528 default: 4529 if (prog->sleepable) 4530 return -EINVAL; 4531 } 4532 4533 return 0; 4534 } 4535 4536 static int bpf_scx_reg(void *kdata, struct bpf_link *link) 4537 { 4538 return scx_ops_enable(kdata, link); 4539 } 4540 4541 static void bpf_scx_unreg(void *kdata, struct bpf_link *link) 4542 { 4543 scx_ops_disable(SCX_EXIT_UNREG); 4544 kthread_flush_work(&scx_ops_disable_work); 4545 } 4546 4547 static int bpf_scx_init(struct btf *btf) 4548 { 4549 u32 type_id; 4550 4551 type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); 4552 if (type_id < 0) 4553 return -EINVAL; 4554 task_struct_type = btf_type_by_id(btf, type_id); 4555 task_struct_type_id = type_id; 4556 4557 return 0; 4558 } 4559 4560 static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) 4561 { 4562 /* 4563 * sched_ext does not support updating the actively-loaded BPF 4564 * scheduler, as registering a BPF scheduler can always fail if the 4565 * scheduler returns an error code for e.g. ops.init(), ops.init_task(), 4566 * etc. Similarly, we can always race with unregistration happening 4567 * elsewhere, such as with sysrq. 
4568 */ 4569 return -EOPNOTSUPP; 4570 } 4571 4572 static int bpf_scx_validate(void *kdata) 4573 { 4574 return 0; 4575 } 4576 4577 static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } 4578 static void enqueue_stub(struct task_struct *p, u64 enq_flags) {} 4579 static void dequeue_stub(struct task_struct *p, u64 enq_flags) {} 4580 static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {} 4581 static void runnable_stub(struct task_struct *p, u64 enq_flags) {} 4582 static void running_stub(struct task_struct *p) {} 4583 static void stopping_stub(struct task_struct *p, bool runnable) {} 4584 static void quiescent_stub(struct task_struct *p, u64 deq_flags) {} 4585 static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; } 4586 static void set_weight_stub(struct task_struct *p, u32 weight) {} 4587 static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {} 4588 static void update_idle_stub(s32 cpu, bool idle) {} 4589 static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {} 4590 static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {} 4591 static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } 4592 static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} 4593 static void enable_stub(struct task_struct *p) {} 4594 static void disable_stub(struct task_struct *p) {} 4595 static void cpu_online_stub(s32 cpu) {} 4596 static void cpu_offline_stub(s32 cpu) {} 4597 static s32 init_stub(void) { return -EINVAL; } 4598 static void exit_stub(struct scx_exit_info *info) {} 4599 4600 static struct sched_ext_ops __bpf_ops_sched_ext_ops = { 4601 .select_cpu = select_cpu_stub, 4602 .enqueue = enqueue_stub, 4603 .dequeue = dequeue_stub, 4604 .dispatch = dispatch_stub, 4605 .runnable = runnable_stub, 4606 .running = running_stub, 4607 .stopping = stopping_stub, 4608 .quiescent = quiescent_stub, 4609 .yield = yield_stub, 4610 .set_weight = set_weight_stub, 4611 .set_cpumask = set_cpumask_stub, 4612 .update_idle = update_idle_stub, 4613 .cpu_acquire = cpu_acquire_stub, 4614 .cpu_release = cpu_release_stub, 4615 .init_task = init_task_stub, 4616 .exit_task = exit_task_stub, 4617 .enable = enable_stub, 4618 .disable = disable_stub, 4619 .cpu_online = cpu_online_stub, 4620 .cpu_offline = cpu_offline_stub, 4621 .init = init_stub, 4622 .exit = exit_stub, 4623 }; 4624 4625 static struct bpf_struct_ops bpf_sched_ext_ops = { 4626 .verifier_ops = &bpf_scx_verifier_ops, 4627 .reg = bpf_scx_reg, 4628 .unreg = bpf_scx_unreg, 4629 .check_member = bpf_scx_check_member, 4630 .init_member = bpf_scx_init_member, 4631 .init = bpf_scx_init, 4632 .update = bpf_scx_update, 4633 .validate = bpf_scx_validate, 4634 .name = "sched_ext_ops", 4635 .owner = THIS_MODULE, 4636 .cfi_stubs = &__bpf_ops_sched_ext_ops 4637 }; 4638 4639 4640 /******************************************************************************** 4641 * System integration and init. 
4642 */ 4643 4644 static void sysrq_handle_sched_ext_reset(u8 key) 4645 { 4646 if (scx_ops_helper) 4647 scx_ops_disable(SCX_EXIT_SYSRQ); 4648 else 4649 pr_info("sched_ext: BPF scheduler not yet used\n"); 4650 } 4651 4652 static const struct sysrq_key_op sysrq_sched_ext_reset_op = { 4653 .handler = sysrq_handle_sched_ext_reset, 4654 .help_msg = "reset-sched-ext(S)", 4655 .action_msg = "Disable sched_ext and revert all tasks to CFS", 4656 .enable_mask = SYSRQ_ENABLE_RTNICE, 4657 }; 4658 4659 static void sysrq_handle_sched_ext_dump(u8 key) 4660 { 4661 struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; 4662 4663 if (scx_enabled()) 4664 scx_dump_state(&ei, 0); 4665 } 4666 4667 static const struct sysrq_key_op sysrq_sched_ext_dump_op = { 4668 .handler = sysrq_handle_sched_ext_dump, 4669 .help_msg = "dump-sched-ext(D)", 4670 .action_msg = "Trigger sched_ext debug dump", 4671 .enable_mask = SYSRQ_ENABLE_RTNICE, 4672 }; 4673 4674 static bool can_skip_idle_kick(struct rq *rq) 4675 { 4676 lockdep_assert_rq_held(rq); 4677 4678 /* 4679 * We can skip idle kicking if @rq is going to go through at least one 4680 * full SCX scheduling cycle before going idle. Just checking whether 4681 * curr is not idle is insufficient because we could be racing 4682 * balance_one() trying to pull the next task from a remote rq, which 4683 * may fail, and @rq may become idle afterwards. 4684 * 4685 * The race window is small and we don't and can't guarantee that @rq is 4686 * only kicked while idle anyway. Skip only when sure. 4687 */ 4688 return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_BALANCING); 4689 } 4690 4691 static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) 4692 { 4693 struct rq *rq = cpu_rq(cpu); 4694 struct scx_rq *this_scx = &this_rq->scx; 4695 bool should_wait = false; 4696 unsigned long flags; 4697 4698 raw_spin_rq_lock_irqsave(rq, flags); 4699 4700 /* 4701 * During CPU hotplug, a CPU may depend on kicking itself to make 4702 * forward progress. Allow kicking self regardless of online state. 
4703 */ 4704 if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { 4705 if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { 4706 if (rq->curr->sched_class == &ext_sched_class) 4707 rq->curr->scx.slice = 0; 4708 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 4709 } 4710 4711 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { 4712 pseqs[cpu] = rq->scx.pnt_seq; 4713 should_wait = true; 4714 } 4715 4716 resched_curr(rq); 4717 } else { 4718 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 4719 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 4720 } 4721 4722 raw_spin_rq_unlock_irqrestore(rq, flags); 4723 4724 return should_wait; 4725 } 4726 4727 static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) 4728 { 4729 struct rq *rq = cpu_rq(cpu); 4730 unsigned long flags; 4731 4732 raw_spin_rq_lock_irqsave(rq, flags); 4733 4734 if (!can_skip_idle_kick(rq) && 4735 (cpu_online(cpu) || cpu == cpu_of(this_rq))) 4736 resched_curr(rq); 4737 4738 raw_spin_rq_unlock_irqrestore(rq, flags); 4739 } 4740 4741 static void kick_cpus_irq_workfn(struct irq_work *irq_work) 4742 { 4743 struct rq *this_rq = this_rq(); 4744 struct scx_rq *this_scx = &this_rq->scx; 4745 unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); 4746 bool should_wait = false; 4747 s32 cpu; 4748 4749 for_each_cpu(cpu, this_scx->cpus_to_kick) { 4750 should_wait |= kick_one_cpu(cpu, this_rq, pseqs); 4751 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); 4752 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 4753 } 4754 4755 for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { 4756 kick_one_cpu_if_idle(cpu, this_rq); 4757 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 4758 } 4759 4760 if (!should_wait) 4761 return; 4762 4763 for_each_cpu(cpu, this_scx->cpus_to_wait) { 4764 unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; 4765 4766 if (cpu != cpu_of(this_rq)) { 4767 /* 4768 * Pairs with smp_store_release() issued by this CPU in 4769 * scx_next_task_picked() on the resched path. 4770 * 4771 * We busy-wait here to guarantee that no other task can 4772 * be scheduled on our core before the target CPU has 4773 * entered the resched path. 4774 */ 4775 while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) 4776 cpu_relax(); 4777 } 4778 4779 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 4780 } 4781 } 4782 4783 /** 4784 * print_scx_info - print out sched_ext scheduler state 4785 * @log_lvl: the log level to use when printing 4786 * @p: target task 4787 * 4788 * If a sched_ext scheduler is enabled, print the name and state of the 4789 * scheduler. If @p is on sched_ext, print further information about the task. 4790 * 4791 * This function can be safely called on any task as long as the task_struct 4792 * itself is accessible. While safe, this function isn't synchronized and may 4793 * print out mixups or garbages of limited length. 4794 */ 4795 void print_scx_info(const char *log_lvl, struct task_struct *p) 4796 { 4797 enum scx_ops_enable_state state = scx_ops_enable_state(); 4798 const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; 4799 char runnable_at_buf[22] = "?"; 4800 struct sched_class *class; 4801 unsigned long runnable_at; 4802 4803 if (state == SCX_OPS_DISABLED) 4804 return; 4805 4806 /* 4807 * Carefully check if the task was running on sched_ext, and then 4808 * carefully copy the time it's been runnable, and its state. 
4809 */ 4810 if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || 4811 class != &ext_sched_class) { 4812 printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, 4813 scx_ops_enable_state_str[state], all); 4814 return; 4815 } 4816 4817 if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, 4818 sizeof(runnable_at))) 4819 scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", 4820 jiffies_delta_msecs(runnable_at, jiffies)); 4821 4822 /* print everything onto one line to conserve console space */ 4823 printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", 4824 log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, 4825 runnable_at_buf); 4826 } 4827 4828 void __init init_sched_ext_class(void) 4829 { 4830 s32 cpu, v; 4831 4832 /* 4833 * The following is to prevent the compiler from optimizing out the enum 4834 * definitions so that BPF scheduler implementations can use them 4835 * through the generated vmlinux.h. 4836 */ 4837 WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT); 4838 4839 BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); 4840 init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); 4841 #ifdef CONFIG_SMP 4842 BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); 4843 BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); 4844 #endif 4845 scx_kick_cpus_pnt_seqs = 4846 __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, 4847 __alignof__(scx_kick_cpus_pnt_seqs[0])); 4848 BUG_ON(!scx_kick_cpus_pnt_seqs); 4849 4850 for_each_possible_cpu(cpu) { 4851 struct rq *rq = cpu_rq(cpu); 4852 4853 init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); 4854 INIT_LIST_HEAD(&rq->scx.runnable_list); 4855 4856 BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); 4857 BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); 4858 BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); 4859 BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); 4860 init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); 4861 4862 if (cpu_online(cpu)) 4863 cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; 4864 } 4865 4866 register_sysrq_key('S', &sysrq_sched_ext_reset_op); 4867 register_sysrq_key('D', &sysrq_sched_ext_dump_op); 4868 INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); 4869 } 4870 4871 4872 /******************************************************************************** 4873 * Helpers that can be called from the BPF scheduler. 4874 */ 4875 #include <linux/btf_ids.h> 4876 4877 __bpf_kfunc_start_defs(); 4878 4879 /** 4880 * scx_bpf_create_dsq - Create a custom DSQ 4881 * @dsq_id: DSQ to create 4882 * @node: NUMA node to allocate from 4883 * 4884 * Create a custom DSQ identified by @dsq_id. Can be called from ops.init() and 4885 * ops.init_task(). 
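 *
 * A minimal illustrative sketch of usage from ops.init() (MY_DSQ_ID is a
 * hypothetical scheduler-defined constant; -1 is NUMA_NO_NODE, i.e. no node
 * preference):
 *
 *	s32 example_init(void)
 *	{
 *		return scx_bpf_create_dsq(MY_DSQ_ID, -1);
 *	}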
4886 */ 4887 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) 4888 { 4889 if (!scx_kf_allowed(SCX_KF_SLEEPABLE)) 4890 return -EINVAL; 4891 4892 if (unlikely(node >= (int)nr_node_ids || 4893 (node < 0 && node != NUMA_NO_NODE))) 4894 return -EINVAL; 4895 return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); 4896 } 4897 4898 __bpf_kfunc_end_defs(); 4899 4900 BTF_KFUNCS_START(scx_kfunc_ids_sleepable) 4901 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) 4902 BTF_KFUNCS_END(scx_kfunc_ids_sleepable) 4903 4904 static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { 4905 .owner = THIS_MODULE, 4906 .set = &scx_kfunc_ids_sleepable, 4907 }; 4908 4909 __bpf_kfunc_start_defs(); 4910 4911 /** 4912 * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() 4913 * @p: task_struct to select a CPU for 4914 * @prev_cpu: CPU @p was on previously 4915 * @wake_flags: %SCX_WAKE_* flags 4916 * @is_idle: out parameter indicating whether the returned CPU is idle 4917 * 4918 * Can only be called from ops.select_cpu() if the built-in CPU selection is 4919 * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. 4920 * @p, @prev_cpu and @wake_flags match ops.select_cpu(). 4921 * 4922 * Returns the picked CPU with *@is_idle indicating whether the picked CPU is 4923 * currently idle and thus a good candidate for direct dispatching. 4924 */ 4925 __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, 4926 u64 wake_flags, bool *is_idle) 4927 { 4928 if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { 4929 *is_idle = false; 4930 return prev_cpu; 4931 } 4932 #ifdef CONFIG_SMP 4933 return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); 4934 #else 4935 *is_idle = false; 4936 return prev_cpu; 4937 #endif 4938 } 4939 4940 __bpf_kfunc_end_defs(); 4941 4942 BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) 4943 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) 4944 BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) 4945 4946 static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { 4947 .owner = THIS_MODULE, 4948 .set = &scx_kfunc_ids_select_cpu, 4949 }; 4950 4951 static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) 4952 { 4953 if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) 4954 return false; 4955 4956 lockdep_assert_irqs_disabled(); 4957 4958 if (unlikely(!p)) { 4959 scx_ops_error("called with NULL task"); 4960 return false; 4961 } 4962 4963 if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { 4964 scx_ops_error("invalid enq_flags 0x%llx", enq_flags); 4965 return false; 4966 } 4967 4968 return true; 4969 } 4970 4971 static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) 4972 { 4973 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 4974 struct task_struct *ddsp_task; 4975 4976 ddsp_task = __this_cpu_read(direct_dispatch_task); 4977 if (ddsp_task) { 4978 mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); 4979 return; 4980 } 4981 4982 if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { 4983 scx_ops_error("dispatch buffer overflow"); 4984 return; 4985 } 4986 4987 dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ 4988 .task = p, 4989 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, 4990 .dsq_id = dsq_id, 4991 .enq_flags = enq_flags, 4992 }; 4993 } 4994 4995 __bpf_kfunc_start_defs(); 4996 4997 /** 4998 * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ 4999 * @p: task_struct to dispatch 5000 * @dsq_id: DSQ to dispatch to 5001 * @slice: duration @p can run for in nsecs 5002 * 
@enq_flags: SCX_ENQ_* 5003 * 5004 * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe 5005 * to call this function spuriously. Can be called from ops.enqueue(), 5006 * ops.select_cpu(), and ops.dispatch(). 5007 * 5008 * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch 5009 * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be 5010 * used to target the local DSQ of a CPU other than the enqueueing one. Use 5011 * ops.select_cpu() to be on the target CPU in the first place. 5012 * 5013 * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p 5014 * will be directly dispatched to the corresponding dispatch queue after 5015 * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be 5016 * dispatched to the local DSQ of the CPU returned by ops.select_cpu(). 5017 * @enq_flags are OR'd with the enqueue flags on the enqueue path before the 5018 * task is dispatched. 5019 * 5020 * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id 5021 * and this function can be called up to ops.dispatch_max_batch times to dispatch 5022 * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the 5023 * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. 5024 * 5025 * This function doesn't have any locking restrictions and may be called under 5026 * BPF locks (in the future when BPF introduces more flexible locking). 5027 * 5028 * @p is allowed to run for @slice. The scheduling path is triggered on slice 5029 * exhaustion. If zero, the current residual slice is maintained. If 5030 * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with 5031 * scx_bpf_kick_cpu() to trigger scheduling. 5032 */ 5033 __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, 5034 u64 enq_flags) 5035 { 5036 if (!scx_dispatch_preamble(p, enq_flags)) 5037 return; 5038 5039 if (slice) 5040 p->scx.slice = slice; 5041 else 5042 p->scx.slice = p->scx.slice ?: 1; 5043 5044 scx_dispatch_commit(p, dsq_id, enq_flags); 5045 } 5046 5047 __bpf_kfunc_end_defs(); 5048 5049 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) 5050 BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) 5051 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) 5052 5053 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { 5054 .owner = THIS_MODULE, 5055 .set = &scx_kfunc_ids_enqueue_dispatch, 5056 }; 5057 5058 __bpf_kfunc_start_defs(); 5059 5060 /** 5061 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots 5062 * 5063 * Can only be called from ops.dispatch(). 5064 */ 5065 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) 5066 { 5067 if (!scx_kf_allowed(SCX_KF_DISPATCH)) 5068 return 0; 5069 5070 return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); 5071 } 5072 5073 /** 5074 * scx_bpf_dispatch_cancel - Cancel the latest dispatch 5075 * 5076 * Cancel the latest dispatch. Can be called multiple times to cancel further 5077 * dispatches. Can only be called from ops.dispatch().
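 *
 * An illustrative sketch (MY_DSQ and should_skip() are hypothetical): while
 * filling the batch from ops.dispatch(), a dispatch that turns out to be
 * unwanted can be undone before the batch is flushed:
 *
 *	scx_bpf_dispatch(p, MY_DSQ, SCX_SLICE_DFL, 0);
 *	if (should_skip(p))
 *		scx_bpf_dispatch_cancel();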
5078 */ 5079 __bpf_kfunc void scx_bpf_dispatch_cancel(void) 5080 { 5081 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 5082 5083 if (!scx_kf_allowed(SCX_KF_DISPATCH)) 5084 return; 5085 5086 if (dspc->cursor > 0) 5087 dspc->cursor--; 5088 else 5089 scx_ops_error("dispatch buffer underflow"); 5090 } 5091 5092 /** 5093 * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ 5094 * @dsq_id: DSQ to consume 5095 * 5096 * Consume a task from the non-local DSQ identified by @dsq_id and transfer it 5097 * to the current CPU's local DSQ for execution. Can only be called from 5098 * ops.dispatch(). 5099 * 5100 * This function flushes the in-flight dispatches from scx_bpf_dispatch() before 5101 * trying to consume the specified DSQ. It may also grab rq locks and thus can't 5102 * be called under any BPF locks. 5103 * 5104 * Returns %true if a task has been consumed, %false if there isn't any task to 5105 * consume. 5106 */ 5107 __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) 5108 { 5109 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 5110 struct scx_dispatch_q *dsq; 5111 5112 if (!scx_kf_allowed(SCX_KF_DISPATCH)) 5113 return false; 5114 5115 flush_dispatch_buf(dspc->rq, dspc->rf); 5116 5117 dsq = find_non_local_dsq(dsq_id); 5118 if (unlikely(!dsq)) { 5119 scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); 5120 return false; 5121 } 5122 5123 if (consume_dispatch_q(dspc->rq, dspc->rf, dsq)) { 5124 /* 5125 * A successfully consumed task can be dequeued before it starts 5126 * running while the CPU is trying to migrate other dispatched 5127 * tasks. Bump nr_tasks to tell balance_scx() to retry on empty 5128 * local DSQ. 5129 */ 5130 dspc->nr_tasks++; 5131 return true; 5132 } else { 5133 return false; 5134 } 5135 } 5136 5137 __bpf_kfunc_end_defs(); 5138 5139 BTF_KFUNCS_START(scx_kfunc_ids_dispatch) 5140 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) 5141 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) 5142 BTF_ID_FLAGS(func, scx_bpf_consume) 5143 BTF_KFUNCS_END(scx_kfunc_ids_dispatch) 5144 5145 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { 5146 .owner = THIS_MODULE, 5147 .set = &scx_kfunc_ids_dispatch, 5148 }; 5149 5150 __bpf_kfunc_start_defs(); 5151 5152 /** 5153 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 5154 * 5155 * Iterate over all of the tasks currently enqueued on the local DSQ of the 5156 * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of 5157 * processed tasks. Can only be called from ops.cpu_release(). 5158 */ 5159 __bpf_kfunc u32 scx_bpf_reenqueue_local(void) 5160 { 5161 u32 nr_enqueued, i; 5162 struct rq *rq; 5163 5164 if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) 5165 return 0; 5166 5167 rq = cpu_rq(smp_processor_id()); 5168 lockdep_assert_rq_held(rq); 5169 5170 /* 5171 * Get the number of tasks on the local DSQ before iterating over it to 5172 * pull off tasks. The enqueue callback below can signal that it wants 5173 * the task to stay on the local DSQ, and we want to prevent the BPF 5174 * scheduler from causing us to loop indefinitely. 
5175 */ 5176 nr_enqueued = rq->scx.local_dsq.nr; 5177 for (i = 0; i < nr_enqueued; i++) { 5178 struct task_struct *p; 5179 5180 p = first_local_task(rq); 5181 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != 5182 SCX_OPSS_NONE); 5183 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); 5184 WARN_ON_ONCE(p->scx.holding_cpu != -1); 5185 dispatch_dequeue(rq, p); 5186 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); 5187 } 5188 5189 return nr_enqueued; 5190 } 5191 5192 __bpf_kfunc_end_defs(); 5193 5194 BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) 5195 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) 5196 BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) 5197 5198 static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { 5199 .owner = THIS_MODULE, 5200 .set = &scx_kfunc_ids_cpu_release, 5201 }; 5202 5203 __bpf_kfunc_start_defs(); 5204 5205 /** 5206 * scx_bpf_kick_cpu - Trigger reschedule on a CPU 5207 * @cpu: cpu to kick 5208 * @flags: %SCX_KICK_* flags 5209 * 5210 * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or 5211 * trigger rescheduling on a busy CPU. This can be called from any online 5212 * scx_ops operation and the actual kicking is performed asynchronously through 5213 * an irq work. 5214 */ 5215 __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) 5216 { 5217 struct rq *this_rq; 5218 unsigned long irq_flags; 5219 5220 if (!ops_cpu_valid(cpu, NULL)) 5221 return; 5222 5223 /* 5224 * While bypassing for PM ops, IRQ handling may not be online which can 5225 * lead to irq_work_queue() malfunction such as infinite busy wait for 5226 * IRQ status update. Suppress kicking. 5227 */ 5228 if (scx_ops_bypassing()) 5229 return; 5230 5231 local_irq_save(irq_flags); 5232 5233 this_rq = this_rq(); 5234 5235 /* 5236 * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting 5237 * rq locks. We can probably be smarter and avoid bouncing if called 5238 * from ops which don't hold a rq lock. 5239 */ 5240 if (flags & SCX_KICK_IDLE) { 5241 struct rq *target_rq = cpu_rq(cpu); 5242 5243 if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) 5244 scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); 5245 5246 if (raw_spin_rq_trylock(target_rq)) { 5247 if (can_skip_idle_kick(target_rq)) { 5248 raw_spin_rq_unlock(target_rq); 5249 goto out; 5250 } 5251 raw_spin_rq_unlock(target_rq); 5252 } 5253 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); 5254 } else { 5255 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); 5256 5257 if (flags & SCX_KICK_PREEMPT) 5258 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); 5259 if (flags & SCX_KICK_WAIT) 5260 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); 5261 } 5262 5263 irq_work_queue(&this_rq->scx.kick_cpus_irq_work); 5264 out: 5265 local_irq_restore(irq_flags); 5266 } 5267 5268 /** 5269 * scx_bpf_dsq_nr_queued - Return the number of queued tasks 5270 * @dsq_id: id of the DSQ 5271 * 5272 * Return the number of tasks in the DSQ matching @dsq_id. If not found, 5273 * -%ENOENT is returned. 
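 *
 * Illustrative sketch (MY_DSQ and target_cpu are hypothetical): a scheduler
 * may use the count to decide whether to kick another CPU while work is still
 * queued on a shared DSQ:
 *
 *	if (scx_bpf_dsq_nr_queued(MY_DSQ) > 0)
 *		scx_bpf_kick_cpu(target_cpu, 0);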
5274 */ 5275 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) 5276 { 5277 struct scx_dispatch_q *dsq; 5278 s32 ret; 5279 5280 preempt_disable(); 5281 5282 if (dsq_id == SCX_DSQ_LOCAL) { 5283 ret = READ_ONCE(this_rq()->scx.local_dsq.nr); 5284 goto out; 5285 } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 5286 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 5287 5288 if (ops_cpu_valid(cpu, NULL)) { 5289 ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); 5290 goto out; 5291 } 5292 } else { 5293 dsq = find_non_local_dsq(dsq_id); 5294 if (dsq) { 5295 ret = READ_ONCE(dsq->nr); 5296 goto out; 5297 } 5298 } 5299 ret = -ENOENT; 5300 out: 5301 preempt_enable(); 5302 return ret; 5303 } 5304 5305 /** 5306 * scx_bpf_destroy_dsq - Destroy a custom DSQ 5307 * @dsq_id: DSQ to destroy 5308 * 5309 * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with 5310 * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is 5311 * empty and no further tasks are dispatched to it. Ignored if called on a DSQ 5312 * which doesn't exist. Can be called from any online scx_ops operations. 5313 */ 5314 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) 5315 { 5316 destroy_dsq(dsq_id); 5317 } 5318 5319 __bpf_kfunc_end_defs(); 5320 5321 static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, 5322 char *fmt, unsigned long long *data, u32 data__sz) 5323 { 5324 struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; 5325 s32 ret; 5326 5327 if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || 5328 (data__sz && !data)) { 5329 scx_ops_error("invalid data=%p and data__sz=%u", 5330 (void *)data, data__sz); 5331 return -EINVAL; 5332 } 5333 5334 ret = copy_from_kernel_nofault(data_buf, data, data__sz); 5335 if (ret < 0) { 5336 scx_ops_error("failed to read data fields (%d)", ret); 5337 return ret; 5338 } 5339 5340 ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, 5341 &bprintf_data); 5342 if (ret < 0) { 5343 scx_ops_error("format preparation failed (%d)", ret); 5344 return ret; 5345 } 5346 5347 ret = bstr_printf(line_buf, line_size, fmt, 5348 bprintf_data.bin_args); 5349 bpf_bprintf_cleanup(&bprintf_data); 5350 if (ret < 0) { 5351 scx_ops_error("(\"%s\", %p, %u) failed to format", 5352 fmt, data, data__sz); 5353 return ret; 5354 } 5355 5356 return ret; 5357 } 5358 5359 static s32 bstr_format(struct scx_bstr_buf *buf, 5360 char *fmt, unsigned long long *data, u32 data__sz) 5361 { 5362 return __bstr_format(buf->data, buf->line, sizeof(buf->line), 5363 fmt, data, data__sz); 5364 } 5365 5366 __bpf_kfunc_start_defs(); 5367 5368 /** 5369 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. 5370 * @exit_code: Exit value to pass to user space via struct scx_exit_info. 5371 * @fmt: error message format string 5372 * @data: format string parameters packaged using ___bpf_fill() macro 5373 * @data__sz: @data len, must end in '__sz' for the verifier 5374 * 5375 * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops 5376 * disabling. 
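 *
 * BPF schedulers usually reach this kfunc through a convenience wrapper such
 * as the scx_bpf_exit() macro shipped with the example schedulers' headers,
 * which packages the format arguments into @data/@data__sz. A hypothetical
 * use (qlen is an arbitrary scheduler-side variable):
 *
 *	scx_bpf_exit(0, "shutting down, %u tasks left queued", qlen);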
5377 */ 5378 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, 5379 unsigned long long *data, u32 data__sz) 5380 { 5381 unsigned long flags; 5382 5383 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 5384 if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 5385 scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s", 5386 scx_exit_bstr_buf.line); 5387 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 5388 } 5389 5390 /** 5391 * scx_bpf_error_bstr - Indicate fatal error 5392 * @fmt: error message format string 5393 * @data: format string parameters packaged using ___bpf_fill() macro 5394 * @data__sz: @data len, must end in '__sz' for the verifier 5395 * 5396 * Indicate that the BPF scheduler encountered a fatal error and initiate ops 5397 * disabling. 5398 */ 5399 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, 5400 u32 data__sz) 5401 { 5402 unsigned long flags; 5403 5404 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 5405 if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 5406 scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s", 5407 scx_exit_bstr_buf.line); 5408 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 5409 } 5410 5411 /** 5412 * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler 5413 * @fmt: format string 5414 * @data: format string parameters packaged using ___bpf_fill() macro 5415 * @data__sz: @data len, must end in '__sz' for the verifier 5416 * 5417 * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and 5418 * dump_task() to generate extra debug dump specific to the BPF scheduler. 5419 * 5420 * The extra dump may be multiple lines. A single line may be split over 5421 * multiple calls. The last line is automatically terminated. 5422 */ 5423 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, 5424 u32 data__sz) 5425 { 5426 struct scx_dump_data *dd = &scx_dump_data; 5427 struct scx_bstr_buf *buf = &dd->buf; 5428 s32 ret; 5429 5430 if (raw_smp_processor_id() != dd->cpu) { 5431 scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends"); 5432 return; 5433 } 5434 5435 /* append the formatted string to the line buf */ 5436 ret = __bstr_format(buf->data, buf->line + dd->cursor, 5437 sizeof(buf->line) - dd->cursor, fmt, data, data__sz); 5438 if (ret < 0) { 5439 dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", 5440 dd->prefix, fmt, data, data__sz, ret); 5441 return; 5442 } 5443 5444 dd->cursor += ret; 5445 dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); 5446 5447 if (!dd->cursor) 5448 return; 5449 5450 /* 5451 * If the line buf overflowed or ends in a newline, flush it into the 5452 * dump. This is to allow the caller to generate a single line over 5453 * multiple calls. As ops_dump_flush() can also handle multiple lines in 5454 * the line buf, the only case which can lead to an unexpected 5455 * truncation is when the caller keeps generating newlines in the middle 5456 * instead of the end consecutively. Don't do that. 5457 */ 5458 if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') 5459 ops_dump_flush(); 5460 } 5461 5462 /** 5463 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs 5464 * 5465 * All valid CPU IDs in the system are smaller than the returned value. 
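 *
 * For example, the returned value can be used to bound per-CPU indexing in a
 * BPF scheduler (my_cpu_data_map is a hypothetical BPF map):
 *
 *	if (cpu < scx_bpf_nr_cpu_ids())
 *		data = bpf_map_lookup_elem(&my_cpu_data_map, &cpu);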
 */
__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
{
        return nr_cpu_ids;
}

/**
 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
 */
__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
{
        return cpu_possible_mask;
}

/**
 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
 */
__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
{
        return cpu_online_mask;
}

/**
 * scx_bpf_put_cpumask - Release a possible/online cpumask
 * @cpumask: cpumask to release
 */
__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
{
        /*
         * Empty function body because we aren't actually acquiring or
         * releasing a reference to a global cpumask, which is read-only in
         * the caller and is never released. The acquire / release semantics
         * here are just used to make the cpumask a trusted pointer in the
         * caller.
         */
}

/**
 * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
 * per-CPU cpumask.
 *
 * Returns an empty mask if idle tracking is not enabled, or running on a UP
 * kernel.
 */
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
        if (!static_branch_likely(&scx_builtin_idle_enabled)) {
                scx_ops_error("built-in idle tracking is disabled");
                return cpu_none_mask;
        }

#ifdef CONFIG_SMP
        return idle_masks.cpu;
#else
        return cpu_none_mask;
#endif
}

/**
 * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
 * per-physical-core cpumask. Can be used to determine if an entire physical
 * core is free.
 *
 * Returns an empty mask if idle tracking is not enabled, or running on a UP
 * kernel.
 */
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
        if (!static_branch_likely(&scx_builtin_idle_enabled)) {
                scx_ops_error("built-in idle tracking is disabled");
                return cpu_none_mask;
        }

#ifdef CONFIG_SMP
        if (sched_smt_active())
                return idle_masks.smt;
        else
                return idle_masks.cpu;
#else
        return cpu_none_mask;
#endif
}

/**
 * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
 * either the per-CPU or SMT idle-tracking cpumask.
 * @idle_mask: cpumask to release
 */
__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
{
        /*
         * Empty function body because we aren't actually acquiring or
         * releasing a reference to a global idle cpumask, which is read-only
         * in the caller and is never released. The acquire / release
         * semantics here are just used to make the cpumask a trusted pointer
         * in the caller.
         */
}
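/*
 * The get/put pairs above exist only to satisfy the verifier's acquire/release
 * tracking (see the KF_ACQUIRE/KF_RELEASE flags further down). A minimal
 * sketch of the expected pairing, assuming the bpf_cpumask_test_cpu() kfunc is
 * available and @prev_cpu comes from the surrounding callback; @core_idle can
 * then drive the CPU selection:
 *
 *      const struct cpumask *idle = scx_bpf_get_idle_smtmask();
 *      bool core_idle = bpf_cpumask_test_cpu(prev_cpu, idle);
 *
 *      // The acquired kptr must be released before the program returns.
 *      scx_bpf_put_idle_cpumask(idle);
 */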
 */
__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
{
        if (!static_branch_likely(&scx_builtin_idle_enabled)) {
                scx_ops_error("built-in idle tracking is disabled");
                return false;
        }

        if (ops_cpu_valid(cpu, NULL))
                return test_and_clear_cpu_idle(cpu);
        else
                return false;
}

/**
 * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
 * @cpus_allowed: Allowed cpumask
 * @flags: %SCX_PICK_IDLE_CPU_* flags
 *
 * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
 * number on success. -%EBUSY if no matching cpu was found.
 *
 * Idle CPU tracking may race against CPU scheduling state transitions. For
 * example, this function may return -%EBUSY as CPUs are transitioning into the
 * idle state. If the caller then assumes that there will be dispatch events on
 * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
 * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
 * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
 * event in the near future.
 *
 * Unavailable if ops.update_idle() is implemented and
 * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
 */
__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
                                      u64 flags)
{
        if (!static_branch_likely(&scx_builtin_idle_enabled)) {
                scx_ops_error("built-in idle tracking is disabled");
                return -EBUSY;
        }

        return scx_pick_idle_cpu(cpus_allowed, flags);
}

/**
 * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
 * @cpus_allowed: Allowed cpumask
 * @flags: %SCX_PICK_IDLE_CPU_* flags
 *
 * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
 * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked CPU
 * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed
 * is empty.
 *
 * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
 * set, this function can't tell which CPUs are idle and will always pick any
 * CPU.
 */
__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
                                     u64 flags)
{
        s32 cpu;

        if (static_branch_likely(&scx_builtin_idle_enabled)) {
                cpu = scx_pick_idle_cpu(cpus_allowed, flags);
                if (cpu >= 0)
                        return cpu;
        }

        cpu = cpumask_any_distribute(cpus_allowed);
        if (cpu < nr_cpu_ids)
                return cpu;
        else
                return -EBUSY;
}
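/*
 * A sketch of the fallback recommended in the scx_bpf_pick_idle_cpu() comment:
 * when no idle CPU can be claimed, pick any allowed CPU and kick it so that a
 * dispatch event is guaranteed in the near future. @p is the task being
 * placed, and the 0 flags arguments are assumed to be acceptable defaults for
 * both the pick and kick kfuncs.
 *
 *      s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
 *
 *      if (cpu < 0) {
 *              cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
 *              if (cpu >= 0)
 *                      scx_bpf_kick_cpu(cpu, 0);       // force a dispatch event
 *      }
 */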
/**
 * scx_bpf_task_running - Is task currently running?
 * @p: task of interest
 */
__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
{
        return task_rq(p)->curr == p;
}

/**
 * scx_bpf_task_cpu - CPU a task is currently associated with
 * @p: task of interest
 */
__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
{
        return task_cpu(p);
}

__bpf_kfunc_end_defs();

BTF_KFUNCS_START(scx_kfunc_ids_any)
BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_any)

static const struct btf_kfunc_id_set scx_kfunc_set_any = {
        .owner          = THIS_MODULE,
        .set            = &scx_kfunc_ids_any,
};
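/*
 * Registering scx_kfunc_set_any below is what makes these helpers visible to
 * BPF programs, which then declare them as BTF-resolved externs. A minimal
 * sketch of the BPF-side declarations (normally provided by shared
 * scheduler-side headers; shown here only as an illustration):
 *
 *      s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
 *      u32 scx_bpf_nr_cpu_ids(void) __ksym;
 *      const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym;
 *      void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym;
 */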
static int __init scx_init(void)
{
        int ret;

        /*
         * kfunc registration can't be done from init_sched_ext_class() as
         * register_btf_kfunc_id_set() needs most of the system to be up.
         *
         * Some kfuncs are context-sensitive and can only be called from
         * specific SCX ops. They are grouped into BTF sets accordingly.
         * Unfortunately, BPF currently doesn't have a way of enforcing such
         * restrictions. Eventually, the verifier should be able to enforce
         * them. For now, register them the same and make each kfunc explicitly
         * check using scx_kf_allowed().
         */
        if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
                                             &scx_kfunc_set_sleepable)) ||
            (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
                                             &scx_kfunc_set_select_cpu)) ||
            (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
                                             &scx_kfunc_set_enqueue_dispatch)) ||
            (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
                                             &scx_kfunc_set_dispatch)) ||
            (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
                                             &scx_kfunc_set_cpu_release)) ||
            (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
                                             &scx_kfunc_set_any)) ||
            (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
                                             &scx_kfunc_set_any)) ||
            (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
                                             &scx_kfunc_set_any))) {
                pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret);
                return ret;
        }

        ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
        if (ret) {
                pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
                return ret;
        }

        scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj);
        if (!scx_kset) {
                pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n");
                return -ENOMEM;
        }

        ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group);
        if (ret < 0) {
                pr_err("sched_ext: Failed to add global attributes\n");
                return ret;
        }

        return 0;
}
__initcall(scx_init);