/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#define SCX_OP_IDX(op)		(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))

enum scx_consts {
	SCX_DSP_DFL_MAX_BATCH		= 32,
	SCX_WATCHDOG_MAX_TIMEOUT	= 30 * HZ,

	SCX_EXIT_BT_LEN			= 64,
	SCX_EXIT_MSG_LEN		= 1024,
	SCX_EXIT_DUMP_DFL_LEN		= 32768,
};

enum scx_exit_kind {
	SCX_EXIT_NONE,
	SCX_EXIT_DONE,

	SCX_EXIT_UNREG = 64,	/* user-space initiated unregistration */
	SCX_EXIT_UNREG_BPF,	/* BPF-initiated unregistration */
	SCX_EXIT_UNREG_KERN,	/* kernel-initiated unregistration */
	SCX_EXIT_SYSRQ,		/* requested by 'S' sysrq */

	SCX_EXIT_ERROR = 1024,	/* runtime error, error msg contains details */
	SCX_EXIT_ERROR_BPF,	/* ERROR but triggered through scx_bpf_error() */
	SCX_EXIT_ERROR_STALL,	/* watchdog detected stalled runnable tasks */
};

/*
 * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
 * being disabled.
 */
struct scx_exit_info {
	/* %SCX_EXIT_* - broad category of the exit reason */
	enum scx_exit_kind	kind;

	/* exit code if gracefully exiting */
	s64			exit_code;

	/* textual representation of the above */
	const char		*reason;

	/* backtrace if exiting due to an error */
	unsigned long		*bt;
	u32			bt_len;

	/* informational message */
	char			*msg;

	/* debug dump */
	char			*dump;
};

/* sched_ext_ops.flags */
enum scx_ops_flags {
	/*
	 * Keep built-in idle tracking even if ops.update_idle() is implemented.
	 */
	SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,

	/*
	 * By default, if there are no other tasks to run on the CPU, ext core
	 * keeps running the current task even after its slice expires. If this
	 * flag is specified, such tasks are passed to ops.enqueue() with
	 * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
	 */
	SCX_OPS_ENQ_LAST	= 1LLU << 1,

	/*
	 * An exiting task may schedule after PF_EXITING is set. In such cases,
	 * bpf_task_from_pid() may not be able to find the task and if the BPF
	 * scheduler depends on pid lookup for dispatching, the task will be
	 * lost leading to various issues including RCU grace period stalls.
	 *
	 * To mask this problem, by default, unhashed tasks are automatically
	 * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
	 * depend on pid lookups and wants to handle these tasks directly, the
	 * following flag can be used.
	 */
	SCX_OPS_ENQ_EXITING	= 1LLU << 2,

	/*
	 * If set, only tasks with policy set to SCHED_EXT are attached to
	 * sched_ext. If clear, SCHED_NORMAL tasks are also included.
	 */
	SCX_OPS_SWITCH_PARTIAL	= 1LLU << 3,

	SCX_OPS_ALL_FLAGS	= SCX_OPS_KEEP_BUILTIN_IDLE |
				  SCX_OPS_ENQ_LAST |
				  SCX_OPS_ENQ_EXITING |
				  SCX_OPS_SWITCH_PARTIAL,
};

/* argument container for ops.init_task() */
struct scx_init_task_args {
	/*
	 * Set if ops.init_task() is being invoked on the fork path, as opposed
	 * to the scheduler transition path.
	 */
	bool			fork;
};

/* argument container for ops.exit_task() */
struct scx_exit_task_args {
	/* Whether the task exited before running on sched_ext. */
	bool			cancelled;
};

/*
 * Informational context provided to dump operations.
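 *
 * For illustration only: an ops.dump() implementation could combine these
 * fields with scheduler-private state via scx_bpf_dump() (see the ops.dump
 * documentation below; BPF_STRUCT_OPS and the nr_enqueued counter are assumed
 * to come from the scheduler's own BPF code and tooling headers, not from this
 * file):
 *
 *	void BPF_STRUCT_OPS(example_dump, struct scx_dump_ctx *dctx)
 *	{
 *		scx_bpf_dump("exit kind %d at %llu ns, nr_enqueued=%llu",
 *			     dctx->kind, dctx->at_ns, nr_enqueued);
 *	}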
 */
struct scx_dump_ctx {
	enum scx_exit_kind	kind;
	s64			exit_code;
	const char		*reason;
	u64			at_ns;
	u64			at_jiffies;
};

/**
 * struct sched_ext_ops - Operation table for BPF scheduler implementation
 *
 * Userland can implement an arbitrary scheduling policy by implementing and
 * loading operations in this table.
 */
struct sched_ext_ops {
	/**
	 * select_cpu - Pick the target CPU for a task which is being woken up
	 * @p: task being woken up
	 * @prev_cpu: the cpu @p was on before sleeping
	 * @wake_flags: SCX_WAKE_*
	 *
	 * Decision made here isn't final. @p may be moved to any CPU while it
	 * is getting dispatched for execution later. However, as @p is not on
	 * the rq at this point, getting the eventual execution CPU right here
	 * saves a small bit of overhead down the line.
	 *
	 * If an idle CPU is returned, the CPU is kicked and will try to
	 * dispatch. While an explicit custom mechanism can be added,
	 * select_cpu() serves as the default way to wake up idle CPUs.
	 *
	 * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p
	 * is dispatched, the ops.enqueue() callback will be skipped. Finally,
	 * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the
	 * local DSQ of whatever CPU is returned by this callback.
	 */
	s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);

	/**
	 * enqueue - Enqueue a task on the BPF scheduler
	 * @p: task being enqueued
	 * @enq_flags: %SCX_ENQ_*
	 *
	 * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch()
	 * or enqueue on the BPF scheduler. If not directly dispatched, the bpf
	 * scheduler owns @p and if it fails to dispatch @p, the task will
	 * stall.
	 *
	 * If @p was dispatched from ops.select_cpu(), this callback is
	 * skipped.
	 */
	void (*enqueue)(struct task_struct *p, u64 enq_flags);

	/**
	 * dequeue - Remove a task from the BPF scheduler
	 * @p: task being dequeued
	 * @deq_flags: %SCX_DEQ_*
	 *
	 * Remove @p from the BPF scheduler. This is usually called to isolate
	 * the task while updating its scheduling properties (e.g. priority).
	 *
	 * The ext core keeps track of whether the BPF side owns a given task or
	 * not and can gracefully ignore spurious dispatches from BPF side,
	 * which makes it safe to not implement this method. However, depending
	 * on the scheduling logic, this can lead to confusing behaviors - e.g.
	 * scheduling position not being updated across a priority change.
	 */
	void (*dequeue)(struct task_struct *p, u64 deq_flags);

	/**
	 * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs
	 * @cpu: CPU to dispatch tasks for
	 * @prev: previous task being switched out
	 *
	 * Called when a CPU's local dsq is empty. The operation should dispatch
	 * one or more tasks from the BPF scheduler into the DSQs using
	 * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using
	 * scx_bpf_consume().
	 *
	 * The maximum number of times scx_bpf_dispatch() can be called without
	 * an intervening scx_bpf_consume() is specified by
	 * ops.dispatch_max_batch. See the comments on top of the two functions
	 * for more details.
	 *
	 * When not %NULL, @prev is an SCX task with its slice depleted. If
	 * @prev is still runnable as indicated by %SCX_TASK_QUEUED being set in
	 * @prev->scx.flags, it is not enqueued yet and will be enqueued after
	 * ops.dispatch() returns. To keep executing @prev, return without
	 * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST.
	 */
	void (*dispatch)(s32 cpu, struct task_struct *prev);

	/**
	 * tick - Periodic tick
	 * @p: task running currently
	 *
	 * This operation is called every 1/HZ seconds on CPUs which are
	 * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
	 * immediate dispatch cycle on the CPU.
	 */
	void (*tick)(struct task_struct *p);

	/**
	 * yield - Yield CPU
	 * @from: yielding task
	 * @to: optional yield target task
	 *
	 * If @to is NULL, @from is yielding the CPU to other runnable tasks.
	 * The BPF scheduler should ensure that other available tasks are
	 * dispatched before the yielding task. Return value is ignored in this
	 * case.
	 *
	 * If @to is not NULL, @from wants to yield the CPU to @to. If the bpf
	 * scheduler can implement the request, return %true; otherwise, %false.
	 */
	bool (*yield)(struct task_struct *from, struct task_struct *to);

	/**
	 * set_weight - Set task weight
	 * @p: task to set weight for
	 * @weight: new weight [1..10000]
	 *
	 * Update @p's weight to @weight.
	 */
	void (*set_weight)(struct task_struct *p, u32 weight);

	/**
	 * set_cpumask - Set CPU affinity
	 * @p: task to set CPU affinity for
	 * @cpumask: cpumask of cpus that @p can run on
	 *
	 * Update @p's CPU affinity to @cpumask.
	 */
	void (*set_cpumask)(struct task_struct *p,
			    const struct cpumask *cpumask);

	/**
	 * update_idle - Update the idle state of a CPU
	 * @cpu: CPU to update the idle state for
	 * @idle: whether entering or exiting the idle state
	 *
	 * This operation is called when @cpu enters or leaves the idle state.
	 * By default, implementing this operation disables the built-in
	 * idle CPU tracking and the following helpers become unavailable:
	 *
	 * - scx_bpf_select_cpu_dfl()
	 * - scx_bpf_test_and_clear_cpu_idle()
	 * - scx_bpf_pick_idle_cpu()
	 *
	 * The user also must implement ops.select_cpu() as the default
	 * implementation relies on scx_bpf_select_cpu_dfl().
	 *
	 * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
	 * tracking.
	 */
	void (*update_idle)(s32 cpu, bool idle);

	/**
	 * init_task - Initialize a task to run in a BPF scheduler
	 * @p: task to initialize for BPF scheduling
	 * @args: init arguments, see the struct definition
	 *
	 * Either we're loading a BPF scheduler or a new task is being forked.
	 * Initialize @p for BPF scheduling. This operation may block and can
	 * be used for allocations, and is called exactly once for a task.
	 *
	 * Return 0 for success, -errno for failure. An error return while
	 * loading will abort loading of the BPF scheduler. During a fork, it
	 * will abort that specific fork.
	 */
	s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);

	/**
	 * exit_task - Exit a previously-running task from the system
	 * @p: task to exit
	 * @args: exit arguments, see the struct definition
	 *
	 * @p is exiting or the BPF scheduler is being unloaded. Perform any
	 * necessary cleanup for @p.
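	 *
	 * As a hedged sketch (the helper macro and map names below are from
	 * the scheduler's own BPF code and tooling headers, not this file), a
	 * scheduler which allocated per-task storage from ops.init_task()
	 * would release it here:
	 *
	 *	void BPF_STRUCT_OPS(example_exit_task, struct task_struct *p,
	 *			    struct scx_exit_task_args *args)
	 *	{
	 *		bpf_task_storage_delete(&task_ctx_stor, p);
	 *	}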
292 */ 293 void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); 294 295 /** 296 * enable - Enable BPF scheduling for a task 297 * @p: task to enable BPF scheduling for 298 * 299 * Enable @p for BPF scheduling. enable() is called on @p any time it 300 * enters SCX, and is always paired with a matching disable(). 301 */ 302 void (*enable)(struct task_struct *p); 303 304 /** 305 * disable - Disable BPF scheduling for a task 306 * @p: task to disable BPF scheduling for 307 * 308 * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. 309 * Disable BPF scheduling for @p. A disable() call is always matched 310 * with a prior enable() call. 311 */ 312 void (*disable)(struct task_struct *p); 313 314 /** 315 * dump - Dump BPF scheduler state on error 316 * @ctx: debug dump context 317 * 318 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. 319 */ 320 void (*dump)(struct scx_dump_ctx *ctx); 321 322 /** 323 * dump_cpu - Dump BPF scheduler state for a CPU on error 324 * @ctx: debug dump context 325 * @cpu: CPU to generate debug dump for 326 * @idle: @cpu is currently idle without any runnable tasks 327 * 328 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for 329 * @cpu. If @idle is %true and this operation doesn't produce any 330 * output, @cpu is skipped for dump. 331 */ 332 void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); 333 334 /** 335 * dump_task - Dump BPF scheduler state for a runnable task on error 336 * @ctx: debug dump context 337 * @p: runnable task to generate debug dump for 338 * 339 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for 340 * @p. 341 */ 342 void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); 343 344 /* 345 * All online ops must come before ops.init(). 346 */ 347 348 /** 349 * init - Initialize the BPF scheduler 350 */ 351 s32 (*init)(void); 352 353 /** 354 * exit - Clean up after the BPF scheduler 355 * @info: Exit info 356 */ 357 void (*exit)(struct scx_exit_info *info); 358 359 /** 360 * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch 361 */ 362 u32 dispatch_max_batch; 363 364 /** 365 * flags - %SCX_OPS_* flags 366 */ 367 u64 flags; 368 369 /** 370 * timeout_ms - The maximum amount of time, in milliseconds, that a 371 * runnable task should be able to wait before being scheduled. The 372 * maximum timeout may not exceed the default timeout of 30 seconds. 373 * 374 * Defaults to the maximum allowed timeout value of 30 seconds. 375 */ 376 u32 timeout_ms; 377 378 /** 379 * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default 380 * value of 32768 is used. 381 */ 382 u32 exit_dump_len; 383 384 /** 385 * name - BPF scheduler's name 386 * 387 * Must be a non-zero valid BPF object name including only isalnum(), 388 * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the 389 * BPF scheduler is enabled. 
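	 *
	 * For instance, a minimal scheduler might wire the whole table up
	 * roughly as below (illustrative sketch; SEC(), BPF_STRUCT_OPS and
	 * friends come from libbpf and the example tooling headers, not from
	 * this file). Every runnable task is sent to the global DSQ, which the
	 * core consumes automatically:
	 *
	 *	void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p,
	 *			    u64 enq_flags)
	 *	{
	 *		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL,
	 *				 enq_flags);
	 *	}
	 *
	 *	SEC(".struct_ops.link")
	 *	struct sched_ext_ops minimal_ops = {
	 *		.enqueue	= (void *)minimal_enqueue,
	 *		.name		= "minimal",
	 *	};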
	 */
	char name[SCX_OPS_NAME_LEN];
};

enum scx_opi {
	SCX_OPI_BEGIN		= 0,
	SCX_OPI_NORMAL_BEGIN	= 0,
	SCX_OPI_NORMAL_END	= SCX_OP_IDX(init),
	SCX_OPI_END		= SCX_OP_IDX(init),
};

enum scx_wake_flags {
	/* expose select WF_* flags as enums */
	SCX_WAKE_FORK		= WF_FORK,
	SCX_WAKE_TTWU		= WF_TTWU,
	SCX_WAKE_SYNC		= WF_SYNC,
};

enum scx_enq_flags {
	/* expose select ENQUEUE_* flags as enums */
	SCX_ENQ_WAKEUP		= ENQUEUE_WAKEUP,
	SCX_ENQ_HEAD		= ENQUEUE_HEAD,

	/* high 32bits are SCX specific */

	/*
	 * The task being enqueued is the only task available for the cpu. By
	 * default, ext core keeps executing such tasks but when
	 * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
	 * %SCX_ENQ_LAST flag set.
	 *
	 * If the BPF scheduler wants to continue executing the task,
	 * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately.
	 * If the task gets queued on a different dsq or the BPF side, the BPF
	 * scheduler is responsible for triggering a follow-up scheduling event.
	 * Otherwise, execution may stall.
	 */
	SCX_ENQ_LAST		= 1LLU << 41,

	/* high 8 bits are internal */
	__SCX_ENQ_INTERNAL_MASK	= 0xffLLU << 56,

	SCX_ENQ_CLEAR_OPSS	= 1LLU << 56,
};

enum scx_deq_flags {
	/* expose select DEQUEUE_* flags as enums */
	SCX_DEQ_SLEEP		= DEQUEUE_SLEEP,
};

enum scx_pick_idle_cpu_flags {
	SCX_PICK_IDLE_CORE	= 1LLU << 0,	/* pick a CPU whose SMT siblings are also idle */
};

enum scx_ops_enable_state {
	SCX_OPS_PREPPING,
	SCX_OPS_ENABLING,
	SCX_OPS_ENABLED,
	SCX_OPS_DISABLING,
	SCX_OPS_DISABLED,
};

static const char *scx_ops_enable_state_str[] = {
	[SCX_OPS_PREPPING]	= "prepping",
	[SCX_OPS_ENABLING]	= "enabling",
	[SCX_OPS_ENABLED]	= "enabled",
	[SCX_OPS_DISABLING]	= "disabling",
	[SCX_OPS_DISABLED]	= "disabled",
};

/*
 * sched_ext_entity->ops_state
 *
 * Used to track the task ownership between the SCX core and the BPF scheduler.
 * State transitions look as follows:
 *
 * NONE -> QUEUEING -> QUEUED -> DISPATCHING
 *   ^               |                 |
 *   |               v                 v
 *   \-------------------------------/
 *
 * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
 * sites for explanations on the conditions being waited upon and why they are
 * safe. Transitions out of them into NONE or QUEUED must store_release and the
 * waiters should load_acquire.
 *
 * Tracking scx_ops_state enables sched_ext core to reliably determine whether
 * any given task can be dispatched by the BPF scheduler at all times and thus
 * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
 * to try to dispatch any task anytime regardless of its state as the SCX core
 * can safely reject invalid dispatches.
 */
enum scx_ops_state {
	SCX_OPSS_NONE,		/* owned by the SCX core */
	SCX_OPSS_QUEUEING,	/* in transit to the BPF scheduler */
	SCX_OPSS_QUEUED,	/* owned by the BPF scheduler */
	SCX_OPSS_DISPATCHING,	/* in transit back to the SCX core */

	/*
	 * QSEQ brands each QUEUED instance so that, when dispatch races
	 * dequeue/requeue, the dispatcher can tell whether it still has a claim
	 * on the task being dispatched.
	 *
	 * As some 32bit archs can't do 64bit store_release/load_acquire,
	 * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
	 * 32bit machines.
The dispatch race window QSEQ protects is very narrow 496 * and runs with IRQ disabled. 30 bits should be sufficient. 497 */ 498 SCX_OPSS_QSEQ_SHIFT = 2, 499 }; 500 501 /* Use macros to ensure that the type is unsigned long for the masks */ 502 #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) 503 #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) 504 505 /* 506 * During exit, a task may schedule after losing its PIDs. When disabling the 507 * BPF scheduler, we need to be able to iterate tasks in every state to 508 * guarantee system safety. Maintain a dedicated task list which contains every 509 * task between its fork and eventual free. 510 */ 511 static DEFINE_SPINLOCK(scx_tasks_lock); 512 static LIST_HEAD(scx_tasks); 513 514 /* ops enable/disable */ 515 static struct kthread_worker *scx_ops_helper; 516 static DEFINE_MUTEX(scx_ops_enable_mutex); 517 DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); 518 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 519 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); 520 static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); 521 static bool scx_switching_all; 522 DEFINE_STATIC_KEY_FALSE(__scx_switched_all); 523 524 static struct sched_ext_ops scx_ops; 525 static bool scx_warned_zero_slice; 526 527 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); 528 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); 529 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); 530 531 struct static_key_false scx_has_op[SCX_OPI_END] = 532 { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; 533 534 static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); 535 static struct scx_exit_info *scx_exit_info; 536 537 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); 538 539 /* 540 * The maximum amount of time in jiffies that a task may be runnable without 541 * being scheduled on a CPU. If this timeout is exceeded, it will trigger 542 * scx_ops_error(). 543 */ 544 static unsigned long scx_watchdog_timeout; 545 546 /* 547 * The last time the delayed work was run. This delayed work relies on 548 * ksoftirqd being able to run to service timer interrupts, so it's possible 549 * that this work itself could get wedged. To account for this, we check that 550 * it's not stalled in the timer tick, and trigger an error if it is. 551 */ 552 static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; 553 554 static struct delayed_work scx_watchdog_work; 555 556 /* idle tracking */ 557 #ifdef CONFIG_SMP 558 #ifdef CONFIG_CPUMASK_OFFSTACK 559 #define CL_ALIGNED_IF_ONSTACK 560 #else 561 #define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp 562 #endif 563 564 static struct { 565 cpumask_var_t cpu; 566 cpumask_var_t smt; 567 } idle_masks CL_ALIGNED_IF_ONSTACK; 568 569 #endif /* CONFIG_SMP */ 570 571 /* 572 * Direct dispatch marker. 573 * 574 * Non-NULL values are used for direct dispatch from enqueue path. A valid 575 * pointer points to the task currently being enqueued. An ERR_PTR value is used 576 * to indicate that direct dispatch has already happened. 
577 */ 578 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); 579 580 /* dispatch queues */ 581 static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; 582 583 static const struct rhashtable_params dsq_hash_params = { 584 .key_len = 8, 585 .key_offset = offsetof(struct scx_dispatch_q, id), 586 .head_offset = offsetof(struct scx_dispatch_q, hash_node), 587 }; 588 589 static struct rhashtable dsq_hash; 590 static LLIST_HEAD(dsqs_to_free); 591 592 /* dispatch buf */ 593 struct scx_dsp_buf_ent { 594 struct task_struct *task; 595 unsigned long qseq; 596 u64 dsq_id; 597 u64 enq_flags; 598 }; 599 600 static u32 scx_dsp_max_batch; 601 602 struct scx_dsp_ctx { 603 struct rq *rq; 604 struct rq_flags *rf; 605 u32 cursor; 606 u32 nr_tasks; 607 struct scx_dsp_buf_ent buf[]; 608 }; 609 610 static struct scx_dsp_ctx __percpu *scx_dsp_ctx; 611 612 /* string formatting from BPF */ 613 struct scx_bstr_buf { 614 u64 data[MAX_BPRINTF_VARARGS]; 615 char line[SCX_EXIT_MSG_LEN]; 616 }; 617 618 static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); 619 static struct scx_bstr_buf scx_exit_bstr_buf; 620 621 /* ops debug dump */ 622 struct scx_dump_data { 623 s32 cpu; 624 bool first; 625 s32 cursor; 626 struct seq_buf *s; 627 const char *prefix; 628 struct scx_bstr_buf buf; 629 }; 630 631 struct scx_dump_data scx_dump_data = { 632 .cpu = -1, 633 }; 634 635 /* /sys/kernel/sched_ext interface */ 636 static struct kset *scx_kset; 637 static struct kobject *scx_root_kobj; 638 639 #define CREATE_TRACE_POINTS 640 #include <trace/events/sched_ext.h> 641 642 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, 643 s64 exit_code, 644 const char *fmt, ...); 645 646 #define scx_ops_error_kind(err, fmt, args...) \ 647 scx_ops_exit_kind((err), 0, fmt, ##args) 648 649 #define scx_ops_exit(code, fmt, args...) \ 650 scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args) 651 652 #define scx_ops_error(fmt, args...) \ 653 scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) 654 655 #define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) 656 657 static long jiffies_delta_msecs(unsigned long at, unsigned long now) 658 { 659 if (time_after(at, now)) 660 return jiffies_to_msecs(at - now); 661 else 662 return -(long)jiffies_to_msecs(now - at); 663 } 664 665 /* if the highest set bit is N, return a mask with bits [N+1, 31] set */ 666 static u32 higher_bits(u32 flags) 667 { 668 return ~((1 << fls(flags)) - 1); 669 } 670 671 /* return the mask with only the highest bit set */ 672 static u32 highest_bit(u32 flags) 673 { 674 int bit = fls(flags); 675 return ((u64)1 << bit) >> 1; 676 } 677 678 /* 679 * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX 680 * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate 681 * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check 682 * whether it's running from an allowed context. 683 * 684 * @mask is constant, always inline to cull the mask calculations. 685 */ 686 static __always_inline void scx_kf_allow(u32 mask) 687 { 688 /* nesting is allowed only in increasing scx_kf_mask order */ 689 WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, 690 "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", 691 current->scx.kf_mask, mask); 692 current->scx.kf_mask |= mask; 693 barrier(); 694 } 695 696 static void scx_kf_disallow(u32 mask) 697 { 698 barrier(); 699 current->scx.kf_mask &= ~mask; 700 } 701 702 #define SCX_CALL_OP(mask, op, args...) 
\ 703 do { \ 704 if (mask) { \ 705 scx_kf_allow(mask); \ 706 scx_ops.op(args); \ 707 scx_kf_disallow(mask); \ 708 } else { \ 709 scx_ops.op(args); \ 710 } \ 711 } while (0) 712 713 #define SCX_CALL_OP_RET(mask, op, args...) \ 714 ({ \ 715 __typeof__(scx_ops.op(args)) __ret; \ 716 if (mask) { \ 717 scx_kf_allow(mask); \ 718 __ret = scx_ops.op(args); \ 719 scx_kf_disallow(mask); \ 720 } else { \ 721 __ret = scx_ops.op(args); \ 722 } \ 723 __ret; \ 724 }) 725 726 /* @mask is constant, always inline to cull unnecessary branches */ 727 static __always_inline bool scx_kf_allowed(u32 mask) 728 { 729 if (unlikely(!(current->scx.kf_mask & mask))) { 730 scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", 731 mask, current->scx.kf_mask); 732 return false; 733 } 734 735 if (unlikely((mask & SCX_KF_SLEEPABLE) && in_interrupt())) { 736 scx_ops_error("sleepable kfunc called from non-sleepable context"); 737 return false; 738 } 739 740 /* 741 * Enforce nesting boundaries. e.g. A kfunc which can be called from 742 * DISPATCH must not be called if we're running DEQUEUE which is nested 743 * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE 744 * boundary thanks to the above in_interrupt() check. 745 */ 746 if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && 747 (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { 748 scx_ops_error("dispatch kfunc called from a nested operation"); 749 return false; 750 } 751 752 return true; 753 } 754 755 756 /* 757 * SCX task iterator. 758 */ 759 struct scx_task_iter { 760 struct sched_ext_entity cursor; 761 struct task_struct *locked; 762 struct rq *rq; 763 struct rq_flags rf; 764 }; 765 766 /** 767 * scx_task_iter_init - Initialize a task iterator 768 * @iter: iterator to init 769 * 770 * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, 771 * @iter must eventually be exited with scx_task_iter_exit(). 772 * 773 * scx_tasks_lock may be released between this and the first next() call or 774 * between any two next() calls. If scx_tasks_lock is released between two 775 * next() calls, the caller is responsible for ensuring that the task being 776 * iterated remains accessible either through RCU read lock or obtaining a 777 * reference count. 778 * 779 * All tasks which existed when the iteration started are guaranteed to be 780 * visited as long as they still exist. 781 */ 782 static void scx_task_iter_init(struct scx_task_iter *iter) 783 { 784 lockdep_assert_held(&scx_tasks_lock); 785 786 iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; 787 list_add(&iter->cursor.tasks_node, &scx_tasks); 788 iter->locked = NULL; 789 } 790 791 /** 792 * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator 793 * @iter: iterator to unlock rq for 794 * 795 * If @iter is in the middle of a locked iteration, it may be locking the rq of 796 * the task currently being visited. Unlock the rq if so. This function can be 797 * safely called anytime during an iteration. 798 * 799 * Returns %true if the rq @iter was locking is unlocked. %false if @iter was 800 * not locking an rq. 801 */ 802 static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) 803 { 804 if (iter->locked) { 805 task_rq_unlock(iter->rq, iter->locked, &iter->rf); 806 iter->locked = NULL; 807 return true; 808 } else { 809 return false; 810 } 811 } 812 813 /** 814 * scx_task_iter_exit - Exit a task iterator 815 * @iter: iterator to exit 816 * 817 * Exit a previously initialized @iter. 
Must be called with scx_tasks_lock held.
 * If the iterator holds a task's rq lock, that rq lock is released. See
 * scx_task_iter_init() for details.
 */
static void scx_task_iter_exit(struct scx_task_iter *iter)
{
	lockdep_assert_held(&scx_tasks_lock);

	scx_task_iter_rq_unlock(iter);
	list_del_init(&iter->cursor.tasks_node);
}

/**
 * scx_task_iter_next - Next task
 * @iter: iterator to walk
 *
 * Visit the next task. See scx_task_iter_init() for details.
 */
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
	struct list_head *cursor = &iter->cursor.tasks_node;
	struct sched_ext_entity *pos;

	lockdep_assert_held(&scx_tasks_lock);

	list_for_each_entry(pos, cursor, tasks_node) {
		if (&pos->tasks_node == &scx_tasks)
			return NULL;
		if (!(pos->flags & SCX_TASK_CURSOR)) {
			list_move(cursor, &pos->tasks_node);
			return container_of(pos, struct task_struct, scx);
		}
	}

	/* can't happen, should always terminate at scx_tasks above */
	BUG();
}

/**
 * scx_task_iter_next_locked - Next non-idle task with its rq locked
 * @iter: iterator to walk
 * @include_dead: Whether we should include dead tasks in the iteration
 *
 * Visit the next non-idle task with its rq lock held. Allows callers to
 * specify whether they would like to filter out dead tasks. See
 * scx_task_iter_init() for details.
 */
static struct task_struct *
scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead)
{
	struct task_struct *p;
retry:
	scx_task_iter_rq_unlock(iter);

	while ((p = scx_task_iter_next(iter))) {
		/*
		 * is_idle_task() tests %PF_IDLE which may not be set for CPUs
		 * which haven't yet been onlined. Test sched_class directly.
		 */
		if (p->sched_class != &idle_sched_class)
			break;
	}
	if (!p)
		return NULL;

	iter->rq = task_rq_lock(p, &iter->rf);
	iter->locked = p;

	/*
	 * If we see %TASK_DEAD, @p already disabled preemption, is about to do
	 * the final __schedule(), won't ever need to be scheduled again and can
	 * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter
	 * the final __schedule() while we're locking its rq and thus will stay
	 * alive until the rq is unlocked.
	 */
	if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD)
		goto retry;

	return p;
}

static enum scx_ops_enable_state scx_ops_enable_state(void)
{
	return atomic_read(&scx_ops_enable_state_var);
}

static enum scx_ops_enable_state
scx_ops_set_enable_state(enum scx_ops_enable_state to)
{
	return atomic_xchg(&scx_ops_enable_state_var, to);
}

static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
					enum scx_ops_enable_state from)
{
	int from_v = from;

	return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
}

static bool scx_ops_bypassing(void)
{
	return unlikely(atomic_read(&scx_ops_bypass_depth));
}

/**
 * wait_ops_state - Busy-wait the specified ops state to end
 * @p: target task
 * @opss: state to wait the end of
 *
 * Busy-wait for @p to transition out of @opss. This can only be used when the
 * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also
 * has load_acquire semantics to ensure that the caller can see the updates made
 * in the enqueueing and dispatching paths.
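 *
 * Callers typically claim-or-wait in a retry loop; e.g. finish_dispatch()
 * below does roughly:
 *
 *	opss = atomic_long_read(&p->scx.ops_state);
 *	if ((opss & SCX_OPSS_STATE_MASK) == SCX_OPSS_QUEUEING) {
 *		wait_ops_state(p, opss);
 *		goto retry;
 *	}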
931 */ 932 static void wait_ops_state(struct task_struct *p, unsigned long opss) 933 { 934 do { 935 cpu_relax(); 936 } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); 937 } 938 939 /** 940 * ops_cpu_valid - Verify a cpu number 941 * @cpu: cpu number which came from a BPF ops 942 * @where: extra information reported on error 943 * 944 * @cpu is a cpu number which came from the BPF scheduler and can be any value. 945 * Verify that it is in range and one of the possible cpus. If invalid, trigger 946 * an ops error. 947 */ 948 static bool ops_cpu_valid(s32 cpu, const char *where) 949 { 950 if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) { 951 return true; 952 } else { 953 scx_ops_error("invalid CPU %d%s%s", cpu, 954 where ? " " : "", where ?: ""); 955 return false; 956 } 957 } 958 959 /** 960 * ops_sanitize_err - Sanitize a -errno value 961 * @ops_name: operation to blame on failure 962 * @err: -errno value to sanitize 963 * 964 * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return 965 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can 966 * cause misbehaviors. For an example, a large negative return from 967 * ops.init_task() triggers an oops when passed up the call chain because the 968 * value fails IS_ERR() test after being encoded with ERR_PTR() and then is 969 * handled as a pointer. 970 */ 971 static int ops_sanitize_err(const char *ops_name, s32 err) 972 { 973 if (err < 0 && err >= -MAX_ERRNO) 974 return err; 975 976 scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); 977 return -EPROTO; 978 } 979 980 static void update_curr_scx(struct rq *rq) 981 { 982 struct task_struct *curr = rq->curr; 983 u64 now = rq_clock_task(rq); 984 u64 delta_exec; 985 986 if (time_before_eq64(now, curr->se.exec_start)) 987 return; 988 989 delta_exec = now - curr->se.exec_start; 990 curr->se.exec_start = now; 991 curr->se.sum_exec_runtime += delta_exec; 992 account_group_exec_runtime(curr, delta_exec); 993 cgroup_account_cputime(curr, delta_exec); 994 995 curr->scx.slice -= min(curr->scx.slice, delta_exec); 996 } 997 998 static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) 999 { 1000 /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ 1001 WRITE_ONCE(dsq->nr, dsq->nr + delta); 1002 } 1003 1004 static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, 1005 u64 enq_flags) 1006 { 1007 bool is_local = dsq->id == SCX_DSQ_LOCAL; 1008 1009 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node)); 1010 1011 if (!is_local) { 1012 raw_spin_lock(&dsq->lock); 1013 if (unlikely(dsq->id == SCX_DSQ_INVALID)) { 1014 scx_ops_error("attempting to dispatch to a destroyed dsq"); 1015 /* fall back to the global dsq */ 1016 raw_spin_unlock(&dsq->lock); 1017 dsq = &scx_dsq_global; 1018 raw_spin_lock(&dsq->lock); 1019 } 1020 } 1021 1022 if (enq_flags & SCX_ENQ_HEAD) 1023 list_add(&p->scx.dsq_node, &dsq->list); 1024 else 1025 list_add_tail(&p->scx.dsq_node, &dsq->list); 1026 1027 dsq_mod_nr(dsq, 1); 1028 p->scx.dsq = dsq; 1029 1030 /* 1031 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the 1032 * direct dispatch path, but we clear them here because the direct 1033 * dispatch verdict may be overridden on the enqueue path during e.g. 1034 * bypass. 1035 */ 1036 p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; 1037 p->scx.ddsp_enq_flags = 0; 1038 1039 /* 1040 * We're transitioning out of QUEUEING or DISPATCHING. store_release to 1041 * match waiters' load_acquire. 
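	 *
	 * The release below pairs with the load_acquire in wait_ops_state():
	 *
	 *	do {
	 *		cpu_relax();
	 *	} while (atomic_long_read_acquire(&p->scx.ops_state) == opss);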
1042 */ 1043 if (enq_flags & SCX_ENQ_CLEAR_OPSS) 1044 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1045 1046 if (is_local) { 1047 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 1048 1049 if (sched_class_above(&ext_sched_class, rq->curr->sched_class)) 1050 resched_curr(rq); 1051 } else { 1052 raw_spin_unlock(&dsq->lock); 1053 } 1054 } 1055 1056 static void dispatch_dequeue(struct rq *rq, struct task_struct *p) 1057 { 1058 struct scx_dispatch_q *dsq = p->scx.dsq; 1059 bool is_local = dsq == &rq->scx.local_dsq; 1060 1061 if (!dsq) { 1062 WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); 1063 /* 1064 * When dispatching directly from the BPF scheduler to a local 1065 * DSQ, the task isn't associated with any DSQ but 1066 * @p->scx.holding_cpu may be set under the protection of 1067 * %SCX_OPSS_DISPATCHING. 1068 */ 1069 if (p->scx.holding_cpu >= 0) 1070 p->scx.holding_cpu = -1; 1071 return; 1072 } 1073 1074 if (!is_local) 1075 raw_spin_lock(&dsq->lock); 1076 1077 /* 1078 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_node 1079 * can't change underneath us. 1080 */ 1081 if (p->scx.holding_cpu < 0) { 1082 /* @p must still be on @dsq, dequeue */ 1083 WARN_ON_ONCE(list_empty(&p->scx.dsq_node)); 1084 list_del_init(&p->scx.dsq_node); 1085 dsq_mod_nr(dsq, -1); 1086 } else { 1087 /* 1088 * We're racing against dispatch_to_local_dsq() which already 1089 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the 1090 * holding_cpu which tells dispatch_to_local_dsq() that it lost 1091 * the race. 1092 */ 1093 WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); 1094 p->scx.holding_cpu = -1; 1095 } 1096 p->scx.dsq = NULL; 1097 1098 if (!is_local) 1099 raw_spin_unlock(&dsq->lock); 1100 } 1101 1102 static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) 1103 { 1104 return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); 1105 } 1106 1107 static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) 1108 { 1109 lockdep_assert(rcu_read_lock_any_held()); 1110 1111 if (dsq_id == SCX_DSQ_GLOBAL) 1112 return &scx_dsq_global; 1113 else 1114 return find_user_dsq(dsq_id); 1115 } 1116 1117 static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, 1118 struct task_struct *p) 1119 { 1120 struct scx_dispatch_q *dsq; 1121 1122 if (dsq_id == SCX_DSQ_LOCAL) 1123 return &rq->scx.local_dsq; 1124 1125 dsq = find_non_local_dsq(dsq_id); 1126 if (unlikely(!dsq)) { 1127 scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", 1128 dsq_id, p->comm, p->pid); 1129 return &scx_dsq_global; 1130 } 1131 1132 return dsq; 1133 } 1134 1135 static void mark_direct_dispatch(struct task_struct *ddsp_task, 1136 struct task_struct *p, u64 dsq_id, 1137 u64 enq_flags) 1138 { 1139 /* 1140 * Mark that dispatch already happened from ops.select_cpu() or 1141 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value 1142 * which can never match a valid task pointer. 
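	 *
	 * From the BPF side, the direct dispatch that leads here is typically
	 * issued from ops.select_cpu(), e.g. (illustrative sketch;
	 * BPF_STRUCT_OPS comes from the example tooling headers and the exact
	 * kfunc signatures are assumptions, not defined in this file):
	 *
	 *	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
	 *			   s32 prev_cpu, u64 wake_flags)
	 *	{
	 *		bool is_idle;
	 *		s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags,
	 *						 &is_idle);
	 *
	 *		if (is_idle)
	 *			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	 *		return cpu;
	 *	}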
1143 */ 1144 __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); 1145 1146 /* @p must match the task on the enqueue path */ 1147 if (unlikely(p != ddsp_task)) { 1148 if (IS_ERR(ddsp_task)) 1149 scx_ops_error("%s[%d] already direct-dispatched", 1150 p->comm, p->pid); 1151 else 1152 scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", 1153 ddsp_task->comm, ddsp_task->pid, 1154 p->comm, p->pid); 1155 return; 1156 } 1157 1158 /* 1159 * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because 1160 * dispatching to the local DSQ of a different CPU requires unlocking 1161 * the current rq which isn't allowed in the enqueue path. Use 1162 * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL. 1163 */ 1164 if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) { 1165 scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch"); 1166 return; 1167 } 1168 1169 WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); 1170 WARN_ON_ONCE(p->scx.ddsp_enq_flags); 1171 1172 p->scx.ddsp_dsq_id = dsq_id; 1173 p->scx.ddsp_enq_flags = enq_flags; 1174 } 1175 1176 static void direct_dispatch(struct task_struct *p, u64 enq_flags) 1177 { 1178 struct scx_dispatch_q *dsq; 1179 1180 enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); 1181 dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p); 1182 dispatch_enqueue(dsq, p, enq_flags); 1183 } 1184 1185 static bool scx_rq_online(struct rq *rq) 1186 { 1187 #ifdef CONFIG_SMP 1188 return likely(rq->online); 1189 #else 1190 return true; 1191 #endif 1192 } 1193 1194 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, 1195 int sticky_cpu) 1196 { 1197 struct task_struct **ddsp_taskp; 1198 unsigned long qseq; 1199 1200 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); 1201 1202 /* rq migration */ 1203 if (sticky_cpu == cpu_of(rq)) 1204 goto local_norefill; 1205 1206 if (!scx_rq_online(rq)) 1207 goto local; 1208 1209 if (scx_ops_bypassing()) { 1210 if (enq_flags & SCX_ENQ_LAST) 1211 goto local; 1212 else 1213 goto global; 1214 } 1215 1216 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 1217 goto direct; 1218 1219 /* see %SCX_OPS_ENQ_EXITING */ 1220 if (!static_branch_unlikely(&scx_ops_enq_exiting) && 1221 unlikely(p->flags & PF_EXITING)) 1222 goto local; 1223 1224 /* see %SCX_OPS_ENQ_LAST */ 1225 if (!static_branch_unlikely(&scx_ops_enq_last) && 1226 (enq_flags & SCX_ENQ_LAST)) 1227 goto local; 1228 1229 if (!SCX_HAS_OP(enqueue)) 1230 goto global; 1231 1232 /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ 1233 qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; 1234 1235 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 1236 atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); 1237 1238 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 1239 WARN_ON_ONCE(*ddsp_taskp); 1240 *ddsp_taskp = p; 1241 1242 SCX_CALL_OP(SCX_KF_ENQUEUE, enqueue, p, enq_flags); 1243 1244 *ddsp_taskp = NULL; 1245 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 1246 goto direct; 1247 1248 /* 1249 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or 1250 * dequeue may be waiting. The store_release matches their load_acquire. 
	 */
	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
	return;

direct:
	direct_dispatch(p, enq_flags);
	return;

local:
	p->scx.slice = SCX_SLICE_DFL;
local_norefill:
	dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
	return;

global:
	p->scx.slice = SCX_SLICE_DFL;
	dispatch_enqueue(&scx_dsq_global, p, enq_flags);
}

static bool task_runnable(const struct task_struct *p)
{
	return !list_empty(&p->scx.runnable_node);
}

static void set_task_runnable(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

	if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
		p->scx.runnable_at = jiffies;
		p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
	}

	/*
	 * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
	 * appended to the runnable_list.
	 */
	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
}

static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
{
	list_del_init(&p->scx.runnable_node);
	if (reset_runnable_at)
		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
}

static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
{
	int sticky_cpu = p->scx.sticky_cpu;

	enq_flags |= rq->scx.extra_enq_flags;

	if (sticky_cpu >= 0)
		p->scx.sticky_cpu = -1;

	/*
	 * Restoring a running task will be immediately followed by
	 * set_next_task_scx() which expects the task to not be on the BPF
	 * scheduler as tasks can only start running through local DSQs. Force
	 * direct-dispatch into the local DSQ by setting the sticky_cpu.
	 */
	if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
		sticky_cpu = cpu_of(rq);

	if (p->scx.flags & SCX_TASK_QUEUED) {
		WARN_ON_ONCE(!task_runnable(p));
		return;
	}

	set_task_runnable(rq, p);
	p->scx.flags |= SCX_TASK_QUEUED;
	rq->scx.nr_running++;
	add_nr_running(rq, 1);

	do_enqueue_task(rq, p, enq_flags, sticky_cpu);
}

static void ops_dequeue(struct task_struct *p, u64 deq_flags)
{
	unsigned long opss;

	/* dequeue is always temporary, don't reset runnable_at */
	clr_task_runnable(p, false);

	/* acquire ensures that we see the preceding updates on QUEUED */
	opss = atomic_long_read_acquire(&p->scx.ops_state);

	switch (opss & SCX_OPSS_STATE_MASK) {
	case SCX_OPSS_NONE:
		break;
	case SCX_OPSS_QUEUEING:
		/*
		 * QUEUEING is started and finished while holding @p's rq lock.
		 * As we're holding the rq lock now, we shouldn't see QUEUEING.
		 */
		BUG();
	case SCX_OPSS_QUEUED:
		if (SCX_HAS_OP(dequeue))
			SCX_CALL_OP(SCX_KF_REST, dequeue, p, deq_flags);

		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
					    SCX_OPSS_NONE))
			break;
		fallthrough;
	case SCX_OPSS_DISPATCHING:
		/*
		 * If @p is being dispatched from the BPF scheduler to a DSQ,
		 * wait for the transfer to complete so that @p doesn't get
		 * added to its DSQ after dequeueing is complete.
		 *
		 * As we're waiting on DISPATCHING with the rq locked, the
		 * dispatching side shouldn't try to lock the rq while
		 * DISPATCHING is set. See dispatch_to_local_dsq().
1365 * 1366 * DISPATCHING shouldn't have qseq set and control can reach 1367 * here with NONE @opss from the above QUEUED case block. 1368 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 1369 */ 1370 wait_ops_state(p, SCX_OPSS_DISPATCHING); 1371 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 1372 break; 1373 } 1374 } 1375 1376 static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) 1377 { 1378 if (!(p->scx.flags & SCX_TASK_QUEUED)) { 1379 WARN_ON_ONCE(task_runnable(p)); 1380 return; 1381 } 1382 1383 ops_dequeue(p, deq_flags); 1384 1385 if (deq_flags & SCX_DEQ_SLEEP) 1386 p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; 1387 else 1388 p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; 1389 1390 p->scx.flags &= ~SCX_TASK_QUEUED; 1391 rq->scx.nr_running--; 1392 sub_nr_running(rq, 1); 1393 1394 dispatch_dequeue(rq, p); 1395 } 1396 1397 static void yield_task_scx(struct rq *rq) 1398 { 1399 struct task_struct *p = rq->curr; 1400 1401 if (SCX_HAS_OP(yield)) 1402 SCX_CALL_OP_RET(SCX_KF_REST, yield, p, NULL); 1403 else 1404 p->scx.slice = 0; 1405 } 1406 1407 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) 1408 { 1409 struct task_struct *from = rq->curr; 1410 1411 if (SCX_HAS_OP(yield)) 1412 return SCX_CALL_OP_RET(SCX_KF_REST, yield, from, to); 1413 else 1414 return false; 1415 } 1416 1417 #ifdef CONFIG_SMP 1418 /** 1419 * move_task_to_local_dsq - Move a task from a different rq to a local DSQ 1420 * @rq: rq to move the task into, currently locked 1421 * @p: task to move 1422 * @enq_flags: %SCX_ENQ_* 1423 * 1424 * Move @p which is currently on a different rq to @rq's local DSQ. The caller 1425 * must: 1426 * 1427 * 1. Start with exclusive access to @p either through its DSQ lock or 1428 * %SCX_OPSS_DISPATCHING flag. 1429 * 1430 * 2. Set @p->scx.holding_cpu to raw_smp_processor_id(). 1431 * 1432 * 3. Remember task_rq(@p). Release the exclusive access so that we don't 1433 * deadlock with dequeue. 1434 * 1435 * 4. Lock @rq and the task_rq from #3. 1436 * 1437 * 5. Call this function. 1438 * 1439 * Returns %true if @p was successfully moved. %false after racing dequeue and 1440 * losing. 1441 */ 1442 static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, 1443 u64 enq_flags) 1444 { 1445 struct rq *task_rq; 1446 1447 lockdep_assert_rq_held(rq); 1448 1449 /* 1450 * If dequeue got to @p while we were trying to lock both rq's, it'd 1451 * have cleared @p->scx.holding_cpu to -1. While other cpus may have 1452 * updated it to different values afterwards, as this operation can't be 1453 * preempted or recurse, @p->scx.holding_cpu can never become 1454 * raw_smp_processor_id() again before we're done. Thus, we can tell 1455 * whether we lost to dequeue by testing whether @p->scx.holding_cpu is 1456 * still raw_smp_processor_id(). 1457 * 1458 * See dispatch_dequeue() for the counterpart. 1459 */ 1460 if (unlikely(p->scx.holding_cpu != raw_smp_processor_id())) 1461 return false; 1462 1463 /* @p->rq couldn't have changed if we're still the holding cpu */ 1464 task_rq = task_rq(p); 1465 lockdep_assert_rq_held(task_rq); 1466 1467 WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr)); 1468 deactivate_task(task_rq, p, 0); 1469 set_task_cpu(p, cpu_of(rq)); 1470 p->scx.sticky_cpu = cpu_of(rq); 1471 1472 /* 1473 * We want to pass scx-specific enq_flags but activate_task() will 1474 * truncate the upper 32 bit. As we own @rq, we can pass them through 1475 * @rq->scx.extra_enq_flags instead. 
1476 */ 1477 WARN_ON_ONCE(rq->scx.extra_enq_flags); 1478 rq->scx.extra_enq_flags = enq_flags; 1479 activate_task(rq, p, 0); 1480 rq->scx.extra_enq_flags = 0; 1481 1482 return true; 1483 } 1484 1485 /** 1486 * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked 1487 * @rq: current rq which is locked 1488 * @rf: rq_flags to use when unlocking @rq 1489 * @src_rq: rq to move task from 1490 * @dst_rq: rq to move task to 1491 * 1492 * We're holding @rq lock and trying to dispatch a task from @src_rq to 1493 * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether 1494 * @rq stays locked isn't important as long as the state is restored after 1495 * dispatch_to_local_dsq_unlock(). 1496 */ 1497 static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, 1498 struct rq *src_rq, struct rq *dst_rq) 1499 { 1500 rq_unpin_lock(rq, rf); 1501 1502 if (src_rq == dst_rq) { 1503 raw_spin_rq_unlock(rq); 1504 raw_spin_rq_lock(dst_rq); 1505 } else if (rq == src_rq) { 1506 double_lock_balance(rq, dst_rq); 1507 rq_repin_lock(rq, rf); 1508 } else if (rq == dst_rq) { 1509 double_lock_balance(rq, src_rq); 1510 rq_repin_lock(rq, rf); 1511 } else { 1512 raw_spin_rq_unlock(rq); 1513 double_rq_lock(src_rq, dst_rq); 1514 } 1515 } 1516 1517 /** 1518 * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock() 1519 * @rq: current rq which is locked 1520 * @rf: rq_flags to use when unlocking @rq 1521 * @src_rq: rq to move task from 1522 * @dst_rq: rq to move task to 1523 * 1524 * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. 1525 */ 1526 static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, 1527 struct rq *src_rq, struct rq *dst_rq) 1528 { 1529 if (src_rq == dst_rq) { 1530 raw_spin_rq_unlock(dst_rq); 1531 raw_spin_rq_lock(rq); 1532 rq_repin_lock(rq, rf); 1533 } else if (rq == src_rq) { 1534 double_unlock_balance(rq, dst_rq); 1535 } else if (rq == dst_rq) { 1536 double_unlock_balance(rq, src_rq); 1537 } else { 1538 double_rq_unlock(src_rq, dst_rq); 1539 raw_spin_rq_lock(rq); 1540 rq_repin_lock(rq, rf); 1541 } 1542 } 1543 #endif /* CONFIG_SMP */ 1544 1545 static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq, 1546 struct task_struct *p) 1547 { 1548 lockdep_assert_held(&dsq->lock); /* released on return */ 1549 1550 /* @dsq is locked and @p is on this rq */ 1551 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 1552 list_move_tail(&p->scx.dsq_node, &rq->scx.local_dsq.list); 1553 dsq_mod_nr(dsq, -1); 1554 dsq_mod_nr(&rq->scx.local_dsq, 1); 1555 p->scx.dsq = &rq->scx.local_dsq; 1556 raw_spin_unlock(&dsq->lock); 1557 } 1558 1559 #ifdef CONFIG_SMP 1560 /* 1561 * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p 1562 * can be pulled to @rq. 1563 */ 1564 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) 1565 { 1566 int cpu = cpu_of(rq); 1567 1568 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 1569 return false; 1570 if (unlikely(is_migration_disabled(p))) 1571 return false; 1572 if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p))) 1573 return false; 1574 if (!scx_rq_online(rq)) 1575 return false; 1576 return true; 1577 } 1578 1579 static bool consume_remote_task(struct rq *rq, struct rq_flags *rf, 1580 struct scx_dispatch_q *dsq, 1581 struct task_struct *p, struct rq *task_rq) 1582 { 1583 bool moved = false; 1584 1585 lockdep_assert_held(&dsq->lock); /* released on return */ 1586 1587 /* 1588 * @dsq is locked and @p is on a remote rq. 
@p is currently protected by 1589 * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab 1590 * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the 1591 * rq lock or fail, do a little dancing from our side. See 1592 * move_task_to_local_dsq(). 1593 */ 1594 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 1595 list_del_init(&p->scx.dsq_node); 1596 dsq_mod_nr(dsq, -1); 1597 p->scx.holding_cpu = raw_smp_processor_id(); 1598 raw_spin_unlock(&dsq->lock); 1599 1600 rq_unpin_lock(rq, rf); 1601 double_lock_balance(rq, task_rq); 1602 rq_repin_lock(rq, rf); 1603 1604 moved = move_task_to_local_dsq(rq, p, 0); 1605 1606 double_unlock_balance(rq, task_rq); 1607 1608 return moved; 1609 } 1610 #else /* CONFIG_SMP */ 1611 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) { return false; } 1612 static bool consume_remote_task(struct rq *rq, struct rq_flags *rf, 1613 struct scx_dispatch_q *dsq, 1614 struct task_struct *p, struct rq *task_rq) { return false; } 1615 #endif /* CONFIG_SMP */ 1616 1617 static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, 1618 struct scx_dispatch_q *dsq) 1619 { 1620 struct task_struct *p; 1621 retry: 1622 if (list_empty(&dsq->list)) 1623 return false; 1624 1625 raw_spin_lock(&dsq->lock); 1626 1627 list_for_each_entry(p, &dsq->list, scx.dsq_node) { 1628 struct rq *task_rq = task_rq(p); 1629 1630 if (rq == task_rq) { 1631 consume_local_task(rq, dsq, p); 1632 return true; 1633 } 1634 1635 if (task_can_run_on_remote_rq(p, rq)) { 1636 if (likely(consume_remote_task(rq, rf, dsq, p, task_rq))) 1637 return true; 1638 goto retry; 1639 } 1640 } 1641 1642 raw_spin_unlock(&dsq->lock); 1643 return false; 1644 } 1645 1646 enum dispatch_to_local_dsq_ret { 1647 DTL_DISPATCHED, /* successfully dispatched */ 1648 DTL_LOST, /* lost race to dequeue */ 1649 DTL_NOT_LOCAL, /* destination is not a local DSQ */ 1650 DTL_INVALID, /* invalid local dsq_id */ 1651 }; 1652 1653 /** 1654 * dispatch_to_local_dsq - Dispatch a task to a local dsq 1655 * @rq: current rq which is locked 1656 * @rf: rq_flags to use when unlocking @rq 1657 * @dsq_id: destination dsq ID 1658 * @p: task to dispatch 1659 * @enq_flags: %SCX_ENQ_* 1660 * 1661 * We're holding @rq lock and want to dispatch @p to the local DSQ identified by 1662 * @dsq_id. This function performs all the synchronization dancing needed 1663 * because local DSQs are protected with rq locks. 1664 * 1665 * The caller must have exclusive ownership of @p (e.g. through 1666 * %SCX_OPSS_DISPATCHING). 1667 */ 1668 static enum dispatch_to_local_dsq_ret 1669 dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, 1670 struct task_struct *p, u64 enq_flags) 1671 { 1672 struct rq *src_rq = task_rq(p); 1673 struct rq *dst_rq; 1674 1675 /* 1676 * We're synchronized against dequeue through DISPATCHING. As @p can't 1677 * be dequeued, its task_rq and cpus_allowed are stable too. 
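	 *
	 * @dsq_id is either %SCX_DSQ_LOCAL for @rq's own local DSQ or
	 * %SCX_DSQ_LOCAL_ON or'd with the target CPU, e.g. requested from a
	 * BPF scheduler's ops.dispatch() roughly as (sketch; exact kfunc
	 * signature is an assumption):
	 *
	 *	scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target_cpu,
	 *			 SCX_SLICE_DFL, 0);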
1678 */ 1679 if (dsq_id == SCX_DSQ_LOCAL) { 1680 dst_rq = rq; 1681 } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 1682 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 1683 1684 if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) 1685 return DTL_INVALID; 1686 dst_rq = cpu_rq(cpu); 1687 } else { 1688 return DTL_NOT_LOCAL; 1689 } 1690 1691 /* if dispatching to @rq that @p is already on, no lock dancing needed */ 1692 if (rq == src_rq && rq == dst_rq) { 1693 dispatch_enqueue(&dst_rq->scx.local_dsq, p, 1694 enq_flags | SCX_ENQ_CLEAR_OPSS); 1695 return DTL_DISPATCHED; 1696 } 1697 1698 #ifdef CONFIG_SMP 1699 if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { 1700 struct rq *locked_dst_rq = dst_rq; 1701 bool dsp; 1702 1703 /* 1704 * @p is on a possibly remote @src_rq which we need to lock to 1705 * move the task. If dequeue is in progress, it'd be locking 1706 * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq 1707 * lock while holding DISPATCHING. 1708 * 1709 * As DISPATCHING guarantees that @p is wholly ours, we can 1710 * pretend that we're moving from a DSQ and use the same 1711 * mechanism - mark the task under transfer with holding_cpu, 1712 * release DISPATCHING and then follow the same protocol. 1713 */ 1714 p->scx.holding_cpu = raw_smp_processor_id(); 1715 1716 /* store_release ensures that dequeue sees the above */ 1717 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1718 1719 dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq); 1720 1721 /* 1722 * We don't require the BPF scheduler to avoid dispatching to 1723 * offline CPUs mostly for convenience but also because CPUs can 1724 * go offline between scx_bpf_dispatch() calls and here. If @p 1725 * is destined to an offline CPU, queue it on its current CPU 1726 * instead, which should always be safe. As this is an allowed 1727 * behavior, don't trigger an ops error. 1728 */ 1729 if (!scx_rq_online(dst_rq)) 1730 dst_rq = src_rq; 1731 1732 if (src_rq == dst_rq) { 1733 /* 1734 * As @p is staying on the same rq, there's no need to 1735 * go through the full deactivate/activate cycle. 1736 * Optimize by abbreviating the operations in 1737 * move_task_to_local_dsq(). 1738 */ 1739 dsp = p->scx.holding_cpu == raw_smp_processor_id(); 1740 if (likely(dsp)) { 1741 p->scx.holding_cpu = -1; 1742 dispatch_enqueue(&dst_rq->scx.local_dsq, p, 1743 enq_flags); 1744 } 1745 } else { 1746 dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); 1747 } 1748 1749 /* if the destination CPU is idle, wake it up */ 1750 if (dsp && sched_class_above(p->sched_class, 1751 dst_rq->curr->sched_class)) 1752 resched_curr(dst_rq); 1753 1754 dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq); 1755 1756 return dsp ? DTL_DISPATCHED : DTL_LOST; 1757 } 1758 #endif /* CONFIG_SMP */ 1759 1760 scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", 1761 cpu_of(dst_rq), p->comm, p->pid); 1762 return DTL_INVALID; 1763 } 1764 1765 /** 1766 * finish_dispatch - Asynchronously finish dispatching a task 1767 * @rq: current rq which is locked 1768 * @rf: rq_flags to use when unlocking @rq 1769 * @p: task to finish dispatching 1770 * @qseq_at_dispatch: qseq when @p started getting dispatched 1771 * @dsq_id: destination DSQ ID 1772 * @enq_flags: %SCX_ENQ_* 1773 * 1774 * Dispatching to local DSQs may need to wait for queueing to complete or 1775 * require rq lock dancing. As we don't wanna do either while inside 1776 * ops.dispatch() to avoid locking order inversion, we split dispatching into 1777 * two parts. 
scx_bpf_dispatch() which is called by ops.dispatch() records the 1778 * task and its qseq. Once ops.dispatch() returns, this function is called to 1779 * finish up. 1780 * 1781 * There is no guarantee that @p is still valid for dispatching or even that it 1782 * was valid in the first place. Make sure that the task is still owned by the 1783 * BPF scheduler and claim the ownership before dispatching. 1784 */ 1785 static void finish_dispatch(struct rq *rq, struct rq_flags *rf, 1786 struct task_struct *p, 1787 unsigned long qseq_at_dispatch, 1788 u64 dsq_id, u64 enq_flags) 1789 { 1790 struct scx_dispatch_q *dsq; 1791 unsigned long opss; 1792 1793 retry: 1794 /* 1795 * No need for _acquire here. @p is accessed only after a successful 1796 * try_cmpxchg to DISPATCHING. 1797 */ 1798 opss = atomic_long_read(&p->scx.ops_state); 1799 1800 switch (opss & SCX_OPSS_STATE_MASK) { 1801 case SCX_OPSS_DISPATCHING: 1802 case SCX_OPSS_NONE: 1803 /* someone else already got to it */ 1804 return; 1805 case SCX_OPSS_QUEUED: 1806 /* 1807 * If qseq doesn't match, @p has gone through at least one 1808 * dispatch/dequeue and re-enqueue cycle between 1809 * scx_bpf_dispatch() and here and we have no claim on it. 1810 */ 1811 if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) 1812 return; 1813 1814 /* 1815 * While we know @p is accessible, we don't yet have a claim on 1816 * it - the BPF scheduler is allowed to dispatch tasks 1817 * spuriously and there can be a racing dequeue attempt. Let's 1818 * claim @p by atomically transitioning it from QUEUED to 1819 * DISPATCHING. 1820 */ 1821 if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 1822 SCX_OPSS_DISPATCHING))) 1823 break; 1824 goto retry; 1825 case SCX_OPSS_QUEUEING: 1826 /* 1827 * do_enqueue_task() is in the process of transferring the task 1828 * to the BPF scheduler while holding @p's rq lock. As we aren't 1829 * holding any kernel or BPF resource that the enqueue path may 1830 * depend upon, it's safe to wait. 1831 */ 1832 wait_ops_state(p, opss); 1833 goto retry; 1834 } 1835 1836 BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); 1837 1838 switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) { 1839 case DTL_DISPATCHED: 1840 break; 1841 case DTL_LOST: 1842 break; 1843 case DTL_INVALID: 1844 dsq_id = SCX_DSQ_GLOBAL; 1845 fallthrough; 1846 case DTL_NOT_LOCAL: 1847 dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), 1848 dsq_id, p); 1849 dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); 1850 break; 1851 } 1852 } 1853 1854 static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) 1855 { 1856 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 1857 u32 u; 1858 1859 for (u = 0; u < dspc->cursor; u++) { 1860 struct scx_dsp_buf_ent *ent = &dspc->buf[u]; 1861 1862 finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id, 1863 ent->enq_flags); 1864 } 1865 1866 dspc->nr_tasks += dspc->cursor; 1867 dspc->cursor = 0; 1868 } 1869 1870 static int balance_scx(struct rq *rq, struct task_struct *prev, 1871 struct rq_flags *rf) 1872 { 1873 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 1874 bool prev_on_scx = prev->sched_class == &ext_sched_class; 1875 1876 lockdep_assert_rq_held(rq); 1877 1878 if (prev_on_scx) { 1879 WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP); 1880 update_curr_scx(rq); 1881 1882 /* 1883 * If @prev is runnable & has slice left, it has priority and 1884 * fetching more just increases latency for the fetched tasks. 1885 * Tell put_prev_task_scx() to put @prev on local_dsq. 
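 * put_prev_task_scx() clears the flag again and enqueues @prev at the
 * head of the local DSQ.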
1886 * 1887 * See scx_ops_disable_workfn() for the explanation on the 1888 * bypassing test. 1889 */ 1890 if ((prev->scx.flags & SCX_TASK_QUEUED) && 1891 prev->scx.slice && !scx_ops_bypassing()) { 1892 prev->scx.flags |= SCX_TASK_BAL_KEEP; 1893 return 1; 1894 } 1895 } 1896 1897 /* if there already are tasks to run, nothing to do */ 1898 if (rq->scx.local_dsq.nr) 1899 return 1; 1900 1901 if (consume_dispatch_q(rq, rf, &scx_dsq_global)) 1902 return 1; 1903 1904 if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq)) 1905 return 0; 1906 1907 dspc->rq = rq; 1908 dspc->rf = rf; 1909 1910 /* 1911 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, 1912 * the local DSQ might still end up empty after a successful 1913 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() 1914 * produced some tasks, retry. The BPF scheduler may depend on this 1915 * looping behavior to simplify its implementation. 1916 */ 1917 do { 1918 dspc->nr_tasks = 0; 1919 1920 SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), 1921 prev_on_scx ? prev : NULL); 1922 1923 flush_dispatch_buf(rq, rf); 1924 1925 if (rq->scx.local_dsq.nr) 1926 return 1; 1927 if (consume_dispatch_q(rq, rf, &scx_dsq_global)) 1928 return 1; 1929 } while (dspc->nr_tasks); 1930 1931 return 0; 1932 } 1933 1934 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) 1935 { 1936 if (p->scx.flags & SCX_TASK_QUEUED) { 1937 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 1938 dispatch_dequeue(rq, p); 1939 } 1940 1941 p->se.exec_start = rq_clock_task(rq); 1942 1943 clr_task_runnable(p, true); 1944 } 1945 1946 static void put_prev_task_scx(struct rq *rq, struct task_struct *p) 1947 { 1948 #ifndef CONFIG_SMP 1949 /* 1950 * UP workaround. 1951 * 1952 * Because SCX may transfer tasks across CPUs during dispatch, dispatch 1953 * is performed from its balance operation which isn't called in UP. 1954 * Let's work around by calling it from the operations which come right 1955 * after. 1956 * 1957 * 1. If the prev task is on SCX, pick_next_task() calls 1958 * .put_prev_task() right after. As .put_prev_task() is also called 1959 * from other places, we need to distinguish the calls which can be 1960 * done by looking at the previous task's state - if still queued or 1961 * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). 1962 * This case is handled here. 1963 * 1964 * 2. If the prev task is not on SCX, the first following call into SCX 1965 * will be .pick_next_task(), which is covered by calling 1966 * balance_scx() from pick_next_task_scx(). 1967 * 1968 * Note that we can't merge the first case into the second as 1969 * balance_scx() must be called before the previous SCX task goes 1970 * through put_prev_task_scx(). 1971 * 1972 * As UP doesn't transfer tasks around, balance_scx() doesn't need @rf. 1973 * Pass in %NULL. 1974 */ 1975 if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) 1976 balance_scx(rq, p, NULL); 1977 #endif 1978 1979 update_curr_scx(rq); 1980 1981 /* 1982 * If we're being called from put_prev_task_balance(), balance_scx() may 1983 * have decided that @p should keep running. 
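 * In that case %SCX_TASK_BAL_KEEP is set; clear it and put @p back at
 * the head of the local DSQ so that the upcoming pick_next_task_scx()
 * picks it up again.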
1984 */ 1985 if (p->scx.flags & SCX_TASK_BAL_KEEP) { 1986 p->scx.flags &= ~SCX_TASK_BAL_KEEP; 1987 set_task_runnable(rq, p); 1988 dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); 1989 return; 1990 } 1991 1992 if (p->scx.flags & SCX_TASK_QUEUED) { 1993 set_task_runnable(rq, p); 1994 1995 /* 1996 * If @p has slice left and balance_scx() didn't tag it for 1997 * keeping, @p is getting preempted by a higher priority 1998 * scheduler class. Leave it at the head of the local DSQ. 1999 */ 2000 if (p->scx.slice && !scx_ops_bypassing()) { 2001 dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); 2002 return; 2003 } 2004 2005 /* 2006 * If we're in the pick_next_task path, balance_scx() should 2007 * have already populated the local DSQ if there are any other 2008 * available tasks. If empty, tell ops.enqueue() that @p is the 2009 * only one available for this cpu. ops.enqueue() should put it 2010 * on the local DSQ so that the subsequent pick_next_task_scx() 2011 * can find the task unless it wants to trigger a separate 2012 * follow-up scheduling event. 2013 */ 2014 if (list_empty(&rq->scx.local_dsq.list)) 2015 do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); 2016 else 2017 do_enqueue_task(rq, p, 0, -1); 2018 } 2019 } 2020 2021 static struct task_struct *first_local_task(struct rq *rq) 2022 { 2023 return list_first_entry_or_null(&rq->scx.local_dsq.list, 2024 struct task_struct, scx.dsq_node); 2025 } 2026 2027 static struct task_struct *pick_next_task_scx(struct rq *rq) 2028 { 2029 struct task_struct *p; 2030 2031 #ifndef CONFIG_SMP 2032 /* UP workaround - see the comment at the head of put_prev_task_scx() */ 2033 if (unlikely(rq->curr->sched_class != &ext_sched_class)) 2034 balance_scx(rq, rq->curr, NULL); 2035 #endif 2036 2037 p = first_local_task(rq); 2038 if (!p) 2039 return NULL; 2040 2041 set_next_task_scx(rq, p, true); 2042 2043 if (unlikely(!p->scx.slice)) { 2044 if (!scx_ops_bypassing() && !scx_warned_zero_slice) { 2045 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", 2046 p->comm, p->pid); 2047 scx_warned_zero_slice = true; 2048 } 2049 p->scx.slice = SCX_SLICE_DFL; 2050 } 2051 2052 return p; 2053 } 2054 2055 #ifdef CONFIG_SMP 2056 2057 static bool test_and_clear_cpu_idle(int cpu) 2058 { 2059 #ifdef CONFIG_SCHED_SMT 2060 /* 2061 * SMT mask should be cleared whether we can claim @cpu or not. The SMT 2062 * cluster is not wholly idle either way. This also prevents 2063 * scx_pick_idle_cpu() from getting caught in an infinite loop. 2064 */ 2065 if (sched_smt_active()) { 2066 const struct cpumask *smt = cpu_smt_mask(cpu); 2067 2068 /* 2069 * If offline, @cpu is not its own sibling and 2070 * scx_pick_idle_cpu() can get caught in an infinite loop as 2071 * @cpu is never cleared from idle_masks.smt. Ensure that @cpu 2072 * is eventually cleared. 
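 * Below, the whole SMT mask is removed from idle_masks.smt when it
 * still intersects it; otherwise @cpu is cleared individually in case
 * it was left behind.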
2073 */ 2074 if (cpumask_intersects(smt, idle_masks.smt)) 2075 cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); 2076 else if (cpumask_test_cpu(cpu, idle_masks.smt)) 2077 __cpumask_clear_cpu(cpu, idle_masks.smt); 2078 } 2079 #endif 2080 return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); 2081 } 2082 2083 static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) 2084 { 2085 int cpu; 2086 2087 retry: 2088 if (sched_smt_active()) { 2089 cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); 2090 if (cpu < nr_cpu_ids) 2091 goto found; 2092 2093 if (flags & SCX_PICK_IDLE_CORE) 2094 return -EBUSY; 2095 } 2096 2097 cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); 2098 if (cpu >= nr_cpu_ids) 2099 return -EBUSY; 2100 2101 found: 2102 if (test_and_clear_cpu_idle(cpu)) 2103 return cpu; 2104 else 2105 goto retry; 2106 } 2107 2108 static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, 2109 u64 wake_flags, bool *found) 2110 { 2111 s32 cpu; 2112 2113 *found = false; 2114 2115 if (!static_branch_likely(&scx_builtin_idle_enabled)) { 2116 scx_ops_error("built-in idle tracking is disabled"); 2117 return prev_cpu; 2118 } 2119 2120 /* 2121 * If WAKE_SYNC, the waker's local DSQ is empty, and the system is 2122 * under utilized, wake up @p to the local DSQ of the waker. Checking 2123 * only for an empty local DSQ is insufficient as it could give the 2124 * wakee an unfair advantage when the system is oversaturated. 2125 * Checking only for the presence of idle CPUs is also insufficient as 2126 * the local DSQ of the waker could have tasks piled up on it even if 2127 * there is an idle core elsewhere on the system. 2128 */ 2129 cpu = smp_processor_id(); 2130 if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && 2131 !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && 2132 cpu_rq(cpu)->scx.local_dsq.nr == 0) { 2133 if (cpumask_test_cpu(cpu, p->cpus_ptr)) 2134 goto cpu_found; 2135 } 2136 2137 if (p->nr_cpus_allowed == 1) { 2138 if (test_and_clear_cpu_idle(prev_cpu)) { 2139 cpu = prev_cpu; 2140 goto cpu_found; 2141 } else { 2142 return prev_cpu; 2143 } 2144 } 2145 2146 /* 2147 * If CPU has SMT, any wholly idle CPU is likely a better pick than 2148 * partially idle @prev_cpu. 2149 */ 2150 if (sched_smt_active()) { 2151 if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && 2152 test_and_clear_cpu_idle(prev_cpu)) { 2153 cpu = prev_cpu; 2154 goto cpu_found; 2155 } 2156 2157 cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); 2158 if (cpu >= 0) 2159 goto cpu_found; 2160 } 2161 2162 if (test_and_clear_cpu_idle(prev_cpu)) { 2163 cpu = prev_cpu; 2164 goto cpu_found; 2165 } 2166 2167 cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); 2168 if (cpu >= 0) 2169 goto cpu_found; 2170 2171 return prev_cpu; 2172 2173 cpu_found: 2174 *found = true; 2175 return cpu; 2176 } 2177 2178 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) 2179 { 2180 /* 2181 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it 2182 * can be a good migration opportunity with low cache and memory 2183 * footprint. Returning a CPU different than @prev_cpu triggers 2184 * immediate rq migration. However, for SCX, as the current rq 2185 * association doesn't dictate where the task is going to run, this 2186 * doesn't fit well. If necessary, we can later add a dedicated method 2187 * which can decide to preempt self to force it through the regular 2188 * scheduling path. 
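 * For now, the hint is simply ignored and @p stays associated with
 * @prev_cpu.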
2189 */ 2190 if (unlikely(wake_flags & WF_EXEC)) 2191 return prev_cpu; 2192 2193 if (SCX_HAS_OP(select_cpu)) { 2194 s32 cpu; 2195 struct task_struct **ddsp_taskp; 2196 2197 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 2198 WARN_ON_ONCE(*ddsp_taskp); 2199 *ddsp_taskp = p; 2200 2201 cpu = SCX_CALL_OP_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, 2202 select_cpu, p, prev_cpu, wake_flags); 2203 *ddsp_taskp = NULL; 2204 if (ops_cpu_valid(cpu, "from ops.select_cpu()")) 2205 return cpu; 2206 else 2207 return prev_cpu; 2208 } else { 2209 bool found; 2210 s32 cpu; 2211 2212 cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); 2213 if (found) { 2214 p->scx.slice = SCX_SLICE_DFL; 2215 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; 2216 } 2217 return cpu; 2218 } 2219 } 2220 2221 static void set_cpus_allowed_scx(struct task_struct *p, 2222 struct affinity_context *ac) 2223 { 2224 set_cpus_allowed_common(p, ac); 2225 2226 /* 2227 * The effective cpumask is stored in @p->cpus_ptr which may temporarily 2228 * differ from the configured one in @p->cpus_mask. Always tell the bpf 2229 * scheduler the effective one. 2230 * 2231 * Fine-grained memory write control is enforced by BPF making the const 2232 * designation pointless. Cast it away when calling the operation. 2233 */ 2234 if (SCX_HAS_OP(set_cpumask)) 2235 SCX_CALL_OP(SCX_KF_REST, set_cpumask, p, 2236 (struct cpumask *)p->cpus_ptr); 2237 } 2238 2239 static void reset_idle_masks(void) 2240 { 2241 /* 2242 * Consider all online cpus idle. Should converge to the actual state 2243 * quickly. 2244 */ 2245 cpumask_copy(idle_masks.cpu, cpu_online_mask); 2246 cpumask_copy(idle_masks.smt, cpu_online_mask); 2247 } 2248 2249 void __scx_update_idle(struct rq *rq, bool idle) 2250 { 2251 int cpu = cpu_of(rq); 2252 2253 if (SCX_HAS_OP(update_idle)) { 2254 SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); 2255 if (!static_branch_unlikely(&scx_builtin_idle_enabled)) 2256 return; 2257 } 2258 2259 if (idle) 2260 cpumask_set_cpu(cpu, idle_masks.cpu); 2261 else 2262 cpumask_clear_cpu(cpu, idle_masks.cpu); 2263 2264 #ifdef CONFIG_SCHED_SMT 2265 if (sched_smt_active()) { 2266 const struct cpumask *smt = cpu_smt_mask(cpu); 2267 2268 if (idle) { 2269 /* 2270 * idle_masks.smt handling is racy but that's fine as 2271 * it's only for optimization and self-correcting. 
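 * A sibling set is added to idle_masks.smt below only once every CPU
 * in the SMT mask is idle according to idle_masks.cpu; going busy
 * clears the whole mask unconditionally.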
2272 */ 2273 for_each_cpu(cpu, smt) { 2274 if (!cpumask_test_cpu(cpu, idle_masks.cpu)) 2275 return; 2276 } 2277 cpumask_or(idle_masks.smt, idle_masks.smt, smt); 2278 } else { 2279 cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); 2280 } 2281 } 2282 #endif 2283 } 2284 2285 #else /* CONFIG_SMP */ 2286 2287 static bool test_and_clear_cpu_idle(int cpu) { return false; } 2288 static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } 2289 static void reset_idle_masks(void) {} 2290 2291 #endif /* CONFIG_SMP */ 2292 2293 static bool check_rq_for_timeouts(struct rq *rq) 2294 { 2295 struct task_struct *p; 2296 struct rq_flags rf; 2297 bool timed_out = false; 2298 2299 rq_lock_irqsave(rq, &rf); 2300 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { 2301 unsigned long last_runnable = p->scx.runnable_at; 2302 2303 if (unlikely(time_after(jiffies, 2304 last_runnable + scx_watchdog_timeout))) { 2305 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); 2306 2307 scx_ops_error_kind(SCX_EXIT_ERROR_STALL, 2308 "%s[%d] failed to run for %u.%03us", 2309 p->comm, p->pid, 2310 dur_ms / 1000, dur_ms % 1000); 2311 timed_out = true; 2312 break; 2313 } 2314 } 2315 rq_unlock_irqrestore(rq, &rf); 2316 2317 return timed_out; 2318 } 2319 2320 static void scx_watchdog_workfn(struct work_struct *work) 2321 { 2322 int cpu; 2323 2324 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 2325 2326 for_each_online_cpu(cpu) { 2327 if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) 2328 break; 2329 2330 cond_resched(); 2331 } 2332 queue_delayed_work(system_unbound_wq, to_delayed_work(work), 2333 scx_watchdog_timeout / 2); 2334 } 2335 2336 void scx_tick(struct rq *rq) 2337 { 2338 unsigned long last_check; 2339 2340 if (!scx_enabled()) 2341 return; 2342 2343 last_check = READ_ONCE(scx_watchdog_timestamp); 2344 if (unlikely(time_after(jiffies, 2345 last_check + READ_ONCE(scx_watchdog_timeout)))) { 2346 u32 dur_ms = jiffies_to_msecs(jiffies - last_check); 2347 2348 scx_ops_error_kind(SCX_EXIT_ERROR_STALL, 2349 "watchdog failed to check in for %u.%03us", 2350 dur_ms / 1000, dur_ms % 1000); 2351 } 2352 2353 update_other_load_avgs(rq); 2354 } 2355 2356 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) 2357 { 2358 update_curr_scx(rq); 2359 2360 /* 2361 * While bypassing, always resched as we can't trust the slice 2362 * management. 
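 * Zeroing the slice below trips the !curr->scx.slice test and forces
 * a resched on every tick, degrading to round-robin while bypassing.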
2363 */ 2364 if (scx_ops_bypassing()) 2365 curr->scx.slice = 0; 2366 else if (SCX_HAS_OP(tick)) 2367 SCX_CALL_OP(SCX_KF_REST, tick, curr); 2368 2369 if (!curr->scx.slice) 2370 resched_curr(rq); 2371 } 2372 2373 static enum scx_task_state scx_get_task_state(const struct task_struct *p) 2374 { 2375 return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; 2376 } 2377 2378 static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) 2379 { 2380 enum scx_task_state prev_state = scx_get_task_state(p); 2381 bool warn = false; 2382 2383 BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); 2384 2385 switch (state) { 2386 case SCX_TASK_NONE: 2387 break; 2388 case SCX_TASK_INIT: 2389 warn = prev_state != SCX_TASK_NONE; 2390 break; 2391 case SCX_TASK_READY: 2392 warn = prev_state == SCX_TASK_NONE; 2393 break; 2394 case SCX_TASK_ENABLED: 2395 warn = prev_state != SCX_TASK_READY; 2396 break; 2397 default: 2398 warn = true; 2399 return; 2400 } 2401 2402 WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", 2403 prev_state, state, p->comm, p->pid); 2404 2405 p->scx.flags &= ~SCX_TASK_STATE_MASK; 2406 p->scx.flags |= state << SCX_TASK_STATE_SHIFT; 2407 } 2408 2409 static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork) 2410 { 2411 int ret; 2412 2413 p->scx.disallow = false; 2414 2415 if (SCX_HAS_OP(init_task)) { 2416 struct scx_init_task_args args = { 2417 .fork = fork, 2418 }; 2419 2420 ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args); 2421 if (unlikely(ret)) { 2422 ret = ops_sanitize_err("init_task", ret); 2423 return ret; 2424 } 2425 } 2426 2427 scx_set_task_state(p, SCX_TASK_INIT); 2428 2429 if (p->scx.disallow) { 2430 struct rq *rq; 2431 struct rq_flags rf; 2432 2433 rq = task_rq_lock(p, &rf); 2434 2435 /* 2436 * We're either in fork or load path and @p->policy will be 2437 * applied right after. Reverting @p->policy here and rejecting 2438 * %SCHED_EXT transitions from scx_check_setscheduler() 2439 * guarantees that if ops.init_task() sets @p->disallow, @p can 2440 * never be in SCX. 2441 */ 2442 if (p->policy == SCHED_EXT) { 2443 p->policy = SCHED_NORMAL; 2444 atomic_long_inc(&scx_nr_rejected); 2445 } 2446 2447 task_rq_unlock(rq, p, &rf); 2448 } 2449 2450 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 2451 return 0; 2452 } 2453 2454 static void set_task_scx_weight(struct task_struct *p) 2455 { 2456 u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; 2457 2458 p->scx.weight = sched_weight_to_cgroup(weight); 2459 } 2460 2461 static void scx_ops_enable_task(struct task_struct *p) 2462 { 2463 lockdep_assert_rq_held(task_rq(p)); 2464 2465 /* 2466 * Set the weight before calling ops.enable() so that the scheduler 2467 * doesn't see a stale value if they inspect the task struct. 
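 * ops.set_weight() is still invoked afterwards so that the scheduler
 * also receives the initial weight through its usual notification
 * callback.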
2468 */ 2469 set_task_scx_weight(p); 2470 if (SCX_HAS_OP(enable)) 2471 SCX_CALL_OP(SCX_KF_REST, enable, p); 2472 scx_set_task_state(p, SCX_TASK_ENABLED); 2473 2474 if (SCX_HAS_OP(set_weight)) 2475 SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight); 2476 } 2477 2478 static void scx_ops_disable_task(struct task_struct *p) 2479 { 2480 lockdep_assert_rq_held(task_rq(p)); 2481 WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); 2482 2483 if (SCX_HAS_OP(disable)) 2484 SCX_CALL_OP(SCX_KF_REST, disable, p); 2485 scx_set_task_state(p, SCX_TASK_READY); 2486 } 2487 2488 static void scx_ops_exit_task(struct task_struct *p) 2489 { 2490 struct scx_exit_task_args args = { 2491 .cancelled = false, 2492 }; 2493 2494 lockdep_assert_rq_held(task_rq(p)); 2495 2496 switch (scx_get_task_state(p)) { 2497 case SCX_TASK_NONE: 2498 return; 2499 case SCX_TASK_INIT: 2500 args.cancelled = true; 2501 break; 2502 case SCX_TASK_READY: 2503 break; 2504 case SCX_TASK_ENABLED: 2505 scx_ops_disable_task(p); 2506 break; 2507 default: 2508 WARN_ON_ONCE(true); 2509 return; 2510 } 2511 2512 if (SCX_HAS_OP(exit_task)) 2513 SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); 2514 scx_set_task_state(p, SCX_TASK_NONE); 2515 } 2516 2517 void init_scx_entity(struct sched_ext_entity *scx) 2518 { 2519 /* 2520 * init_idle() calls this function again after fork sequence is 2521 * complete. Don't touch ->tasks_node as it's already linked. 2522 */ 2523 memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); 2524 2525 INIT_LIST_HEAD(&scx->dsq_node); 2526 scx->sticky_cpu = -1; 2527 scx->holding_cpu = -1; 2528 INIT_LIST_HEAD(&scx->runnable_node); 2529 scx->runnable_at = jiffies; 2530 scx->ddsp_dsq_id = SCX_DSQ_INVALID; 2531 scx->slice = SCX_SLICE_DFL; 2532 } 2533 2534 void scx_pre_fork(struct task_struct *p) 2535 { 2536 /* 2537 * BPF scheduler enable/disable paths want to be able to iterate and 2538 * update all tasks which can become complex when racing forks. As 2539 * enable/disable are very cold paths, let's use a percpu_rwsem to 2540 * exclude forks. 2541 */ 2542 percpu_down_read(&scx_fork_rwsem); 2543 } 2544 2545 int scx_fork(struct task_struct *p) 2546 { 2547 percpu_rwsem_assert_held(&scx_fork_rwsem); 2548 2549 if (scx_enabled()) 2550 return scx_ops_init_task(p, task_group(p), true); 2551 else 2552 return 0; 2553 } 2554 2555 void scx_post_fork(struct task_struct *p) 2556 { 2557 if (scx_enabled()) { 2558 scx_set_task_state(p, SCX_TASK_READY); 2559 2560 /* 2561 * Enable the task immediately if it's running on sched_ext. 2562 * Otherwise, it'll be enabled in switching_to_scx() if and 2563 * when it's ever configured to run with a SCHED_EXT policy. 
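 * Either way, the task was already transitioned to %SCX_TASK_READY
 * above, so a later policy switch only needs the READY -> ENABLED
 * transition performed by scx_ops_enable_task().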
2564 */ 2565 if (p->sched_class == &ext_sched_class) { 2566 struct rq_flags rf; 2567 struct rq *rq; 2568 2569 rq = task_rq_lock(p, &rf); 2570 scx_ops_enable_task(p); 2571 task_rq_unlock(rq, p, &rf); 2572 } 2573 } 2574 2575 spin_lock_irq(&scx_tasks_lock); 2576 list_add_tail(&p->scx.tasks_node, &scx_tasks); 2577 spin_unlock_irq(&scx_tasks_lock); 2578 2579 percpu_up_read(&scx_fork_rwsem); 2580 } 2581 2582 void scx_cancel_fork(struct task_struct *p) 2583 { 2584 if (scx_enabled()) { 2585 struct rq *rq; 2586 struct rq_flags rf; 2587 2588 rq = task_rq_lock(p, &rf); 2589 WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); 2590 scx_ops_exit_task(p); 2591 task_rq_unlock(rq, p, &rf); 2592 } 2593 2594 percpu_up_read(&scx_fork_rwsem); 2595 } 2596 2597 void sched_ext_free(struct task_struct *p) 2598 { 2599 unsigned long flags; 2600 2601 spin_lock_irqsave(&scx_tasks_lock, flags); 2602 list_del_init(&p->scx.tasks_node); 2603 spin_unlock_irqrestore(&scx_tasks_lock, flags); 2604 2605 /* 2606 * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> 2607 * ENABLED transitions can't race us. Disable ops for @p. 2608 */ 2609 if (scx_get_task_state(p) != SCX_TASK_NONE) { 2610 struct rq_flags rf; 2611 struct rq *rq; 2612 2613 rq = task_rq_lock(p, &rf); 2614 scx_ops_exit_task(p); 2615 task_rq_unlock(rq, p, &rf); 2616 } 2617 } 2618 2619 static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) 2620 { 2621 lockdep_assert_rq_held(task_rq(p)); 2622 2623 set_task_scx_weight(p); 2624 if (SCX_HAS_OP(set_weight)) 2625 SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight); 2626 } 2627 2628 static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) 2629 { 2630 } 2631 2632 static void switching_to_scx(struct rq *rq, struct task_struct *p) 2633 { 2634 scx_ops_enable_task(p); 2635 2636 /* 2637 * set_cpus_allowed_scx() is not called while @p is associated with a 2638 * different scheduler class. Keep the BPF scheduler up-to-date. 2639 */ 2640 if (SCX_HAS_OP(set_cpumask)) 2641 SCX_CALL_OP(SCX_KF_REST, set_cpumask, p, 2642 (struct cpumask *)p->cpus_ptr); 2643 } 2644 2645 static void switched_from_scx(struct rq *rq, struct task_struct *p) 2646 { 2647 scx_ops_disable_task(p); 2648 } 2649 2650 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} 2651 static void switched_to_scx(struct rq *rq, struct task_struct *p) {} 2652 2653 int scx_check_setscheduler(struct task_struct *p, int policy) 2654 { 2655 lockdep_assert_rq_held(task_rq(p)); 2656 2657 /* if disallow, reject transitioning into SCX */ 2658 if (scx_enabled() && READ_ONCE(p->scx.disallow) && 2659 p->policy != policy && policy == SCHED_EXT) 2660 return -EACCES; 2661 2662 return 0; 2663 } 2664 2665 /* 2666 * Omitted operations: 2667 * 2668 * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task 2669 * isn't tied to the CPU at that point. 2670 * 2671 * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. 2672 * 2673 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of 2674 * their current sched_class. Call them directly from sched core instead. 2675 * 2676 * - task_woken: Unnecessary. 
2677 */ 2678 DEFINE_SCHED_CLASS(ext) = { 2679 .enqueue_task = enqueue_task_scx, 2680 .dequeue_task = dequeue_task_scx, 2681 .yield_task = yield_task_scx, 2682 .yield_to_task = yield_to_task_scx, 2683 2684 .wakeup_preempt = wakeup_preempt_scx, 2685 2686 .pick_next_task = pick_next_task_scx, 2687 2688 .put_prev_task = put_prev_task_scx, 2689 .set_next_task = set_next_task_scx, 2690 2691 #ifdef CONFIG_SMP 2692 .balance = balance_scx, 2693 .select_task_rq = select_task_rq_scx, 2694 .set_cpus_allowed = set_cpus_allowed_scx, 2695 #endif 2696 2697 .task_tick = task_tick_scx, 2698 2699 .switching_to = switching_to_scx, 2700 .switched_from = switched_from_scx, 2701 .switched_to = switched_to_scx, 2702 .reweight_task = reweight_task_scx, 2703 .prio_changed = prio_changed_scx, 2704 2705 .update_curr = update_curr_scx, 2706 2707 #ifdef CONFIG_UCLAMP_TASK 2708 .uclamp_enabled = 0, 2709 #endif 2710 }; 2711 2712 static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) 2713 { 2714 memset(dsq, 0, sizeof(*dsq)); 2715 2716 raw_spin_lock_init(&dsq->lock); 2717 INIT_LIST_HEAD(&dsq->list); 2718 dsq->id = dsq_id; 2719 } 2720 2721 static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) 2722 { 2723 struct scx_dispatch_q *dsq; 2724 int ret; 2725 2726 if (dsq_id & SCX_DSQ_FLAG_BUILTIN) 2727 return ERR_PTR(-EINVAL); 2728 2729 dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); 2730 if (!dsq) 2731 return ERR_PTR(-ENOMEM); 2732 2733 init_dsq(dsq, dsq_id); 2734 2735 ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, 2736 dsq_hash_params); 2737 if (ret) { 2738 kfree(dsq); 2739 return ERR_PTR(ret); 2740 } 2741 return dsq; 2742 } 2743 2744 static void free_dsq_irq_workfn(struct irq_work *irq_work) 2745 { 2746 struct llist_node *to_free = llist_del_all(&dsqs_to_free); 2747 struct scx_dispatch_q *dsq, *tmp_dsq; 2748 2749 llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) 2750 kfree_rcu(dsq, rcu); 2751 } 2752 2753 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); 2754 2755 static void destroy_dsq(u64 dsq_id) 2756 { 2757 struct scx_dispatch_q *dsq; 2758 unsigned long flags; 2759 2760 rcu_read_lock(); 2761 2762 dsq = find_user_dsq(dsq_id); 2763 if (!dsq) 2764 goto out_unlock_rcu; 2765 2766 raw_spin_lock_irqsave(&dsq->lock, flags); 2767 2768 if (dsq->nr) { 2769 scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", 2770 dsq->id, dsq->nr); 2771 goto out_unlock_dsq; 2772 } 2773 2774 if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) 2775 goto out_unlock_dsq; 2776 2777 /* 2778 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from 2779 * queueing more tasks. As this function can be called from anywhere, 2780 * freeing is bounced through an irq work to avoid nesting RCU 2781 * operations inside scheduler locks. 2782 */ 2783 dsq->id = SCX_DSQ_INVALID; 2784 llist_add(&dsq->free_node, &dsqs_to_free); 2785 irq_work_queue(&free_dsq_irq_work); 2786 2787 out_unlock_dsq: 2788 raw_spin_unlock_irqrestore(&dsq->lock, flags); 2789 out_unlock_rcu: 2790 rcu_read_unlock(); 2791 } 2792 2793 2794 /******************************************************************************** 2795 * Sysfs interface and ops enable/disable. 
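 *
 * The global attributes below (state, switch_all, nr_rejected) describe
 * the overall sched_ext state while the per-scheduler kobject exposes
 * the name of the loaded ops. These are assumed to be published under
 * /sys/kernel/sched_ext/ once the kset is registered.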
2796 */ 2797 2798 #define SCX_ATTR(_name) \ 2799 static struct kobj_attribute scx_attr_##_name = { \ 2800 .attr = { .name = __stringify(_name), .mode = 0444 }, \ 2801 .show = scx_attr_##_name##_show, \ 2802 } 2803 2804 static ssize_t scx_attr_state_show(struct kobject *kobj, 2805 struct kobj_attribute *ka, char *buf) 2806 { 2807 return sysfs_emit(buf, "%s\n", 2808 scx_ops_enable_state_str[scx_ops_enable_state()]); 2809 } 2810 SCX_ATTR(state); 2811 2812 static ssize_t scx_attr_switch_all_show(struct kobject *kobj, 2813 struct kobj_attribute *ka, char *buf) 2814 { 2815 return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); 2816 } 2817 SCX_ATTR(switch_all); 2818 2819 static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, 2820 struct kobj_attribute *ka, char *buf) 2821 { 2822 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); 2823 } 2824 SCX_ATTR(nr_rejected); 2825 2826 static struct attribute *scx_global_attrs[] = { 2827 &scx_attr_state.attr, 2828 &scx_attr_switch_all.attr, 2829 &scx_attr_nr_rejected.attr, 2830 NULL, 2831 }; 2832 2833 static const struct attribute_group scx_global_attr_group = { 2834 .attrs = scx_global_attrs, 2835 }; 2836 2837 static void scx_kobj_release(struct kobject *kobj) 2838 { 2839 kfree(kobj); 2840 } 2841 2842 static ssize_t scx_attr_ops_show(struct kobject *kobj, 2843 struct kobj_attribute *ka, char *buf) 2844 { 2845 return sysfs_emit(buf, "%s\n", scx_ops.name); 2846 } 2847 SCX_ATTR(ops); 2848 2849 static struct attribute *scx_sched_attrs[] = { 2850 &scx_attr_ops.attr, 2851 NULL, 2852 }; 2853 ATTRIBUTE_GROUPS(scx_sched); 2854 2855 static const struct kobj_type scx_ktype = { 2856 .release = scx_kobj_release, 2857 .sysfs_ops = &kobj_sysfs_ops, 2858 .default_groups = scx_sched_groups, 2859 }; 2860 2861 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) 2862 { 2863 return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); 2864 } 2865 2866 static const struct kset_uevent_ops scx_uevent_ops = { 2867 .uevent = scx_uevent, 2868 }; 2869 2870 /* 2871 * Used by sched_fork() and __setscheduler_prio() to pick the matching 2872 * sched_class. dl/rt are already handled. 2873 */ 2874 bool task_should_scx(struct task_struct *p) 2875 { 2876 if (!scx_enabled() || 2877 unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) 2878 return false; 2879 if (READ_ONCE(scx_switching_all)) 2880 return true; 2881 return p->policy == SCHED_EXT; 2882 } 2883 2884 /** 2885 * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress 2886 * 2887 * Bypassing guarantees that all runnable tasks make forward progress without 2888 * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might 2889 * be held by tasks that the BPF scheduler is forgetting to run, which 2890 * unfortunately also excludes toggling the static branches. 2891 * 2892 * Let's work around by overriding a couple ops and modifying behaviors based on 2893 * the DISABLING state and then cycling the queued tasks through dequeue/enqueue 2894 * to force global FIFO scheduling. 2895 * 2896 * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. 2897 * 2898 * b. ops.dispatch() is ignored. 2899 * 2900 * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be 2901 * trusted. Whenever a tick triggers, the running task is rotated to the tail 2902 * of the queue. 2903 * 2904 * d. pick_next_task() suppresses zero slice warning. 
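 *
 * Bypass requests nest: scx_ops_bypass_depth counts them and only the
 * first bypass and the final un-bypass walk the per-rq runnable lists
 * below to cycle the queued tasks through dequeue/enqueue.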
2905 */ 2906 static void scx_ops_bypass(bool bypass) 2907 { 2908 int depth, cpu; 2909 2910 if (bypass) { 2911 depth = atomic_inc_return(&scx_ops_bypass_depth); 2912 WARN_ON_ONCE(depth <= 0); 2913 if (depth != 1) 2914 return; 2915 } else { 2916 depth = atomic_dec_return(&scx_ops_bypass_depth); 2917 WARN_ON_ONCE(depth < 0); 2918 if (depth != 0) 2919 return; 2920 } 2921 2922 /* 2923 * We need to guarantee that no tasks are on the BPF scheduler while 2924 * bypassing. Either we see enabled or the enable path sees the 2925 * increased bypass_depth before moving tasks to SCX. 2926 */ 2927 if (!scx_enabled()) 2928 return; 2929 2930 /* 2931 * No task property is changing. We just need to make sure all currently 2932 * queued tasks are re-queued according to the new scx_ops_bypassing() 2933 * state. As an optimization, walk each rq's runnable_list instead of 2934 * the scx_tasks list. 2935 * 2936 * This function can't trust the scheduler and thus can't use 2937 * cpus_read_lock(). Walk all possible CPUs instead of online. 2938 */ 2939 for_each_possible_cpu(cpu) { 2940 struct rq *rq = cpu_rq(cpu); 2941 struct rq_flags rf; 2942 struct task_struct *p, *n; 2943 2944 rq_lock_irqsave(rq, &rf); 2945 2946 /* 2947 * The use of list_for_each_entry_safe_reverse() is required 2948 * because each task is going to be removed from and added back 2949 * to the runnable_list during iteration. Because they're added 2950 * to the tail of the list, safe reverse iteration can still 2951 * visit all nodes. 2952 */ 2953 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, 2954 scx.runnable_node) { 2955 struct sched_enq_and_set_ctx ctx; 2956 2957 /* cycling deq/enq is enough, see the function comment */ 2958 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 2959 sched_enq_and_set_task(&ctx); 2960 } 2961 2962 rq_unlock_irqrestore(rq, &rf); 2963 } 2964 } 2965 2966 static void free_exit_info(struct scx_exit_info *ei) 2967 { 2968 kfree(ei->dump); 2969 kfree(ei->msg); 2970 kfree(ei->bt); 2971 kfree(ei); 2972 } 2973 2974 static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) 2975 { 2976 struct scx_exit_info *ei; 2977 2978 ei = kzalloc(sizeof(*ei), GFP_KERNEL); 2979 if (!ei) 2980 return NULL; 2981 2982 ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL); 2983 ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); 2984 ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); 2985 2986 if (!ei->bt || !ei->msg || !ei->dump) { 2987 free_exit_info(ei); 2988 return NULL; 2989 } 2990 2991 return ei; 2992 } 2993 2994 static const char *scx_exit_reason(enum scx_exit_kind kind) 2995 { 2996 switch (kind) { 2997 case SCX_EXIT_UNREG: 2998 return "Scheduler unregistered from user space"; 2999 case SCX_EXIT_UNREG_BPF: 3000 return "Scheduler unregistered from BPF"; 3001 case SCX_EXIT_UNREG_KERN: 3002 return "Scheduler unregistered from the main kernel"; 3003 case SCX_EXIT_SYSRQ: 3004 return "disabled by sysrq-S"; 3005 case SCX_EXIT_ERROR: 3006 return "runtime error"; 3007 case SCX_EXIT_ERROR_BPF: 3008 return "scx_bpf_error"; 3009 case SCX_EXIT_ERROR_STALL: 3010 return "runnable task stall"; 3011 default: 3012 return "<UNKNOWN>"; 3013 } 3014 } 3015 3016 static void scx_ops_disable_workfn(struct kthread_work *work) 3017 { 3018 struct scx_exit_info *ei = scx_exit_info; 3019 struct scx_task_iter sti; 3020 struct task_struct *p; 3021 struct rhashtable_iter rht_iter; 3022 struct scx_dispatch_q *dsq; 3023 int i, kind; 3024 3025 kind = atomic_read(&scx_exit_kind); 3026 while (true) { 3027 /* 3028 * NONE indicates that a 
new scx_ops has been registered since 3029 * disable was scheduled - don't kill the new ops. DONE 3030 * indicates that the ops has already been disabled. 3031 */ 3032 if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) 3033 return; 3034 if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) 3035 break; 3036 } 3037 ei->kind = kind; 3038 ei->reason = scx_exit_reason(ei->kind); 3039 3040 /* guarantee forward progress by bypassing scx_ops */ 3041 scx_ops_bypass(true); 3042 3043 switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { 3044 case SCX_OPS_DISABLING: 3045 WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); 3046 break; 3047 case SCX_OPS_DISABLED: 3048 pr_warn("sched_ext: ops error detected without ops (%s)\n", 3049 scx_exit_info->msg); 3050 WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != 3051 SCX_OPS_DISABLING); 3052 goto done; 3053 default: 3054 break; 3055 } 3056 3057 /* 3058 * Here, every runnable task is guaranteed to make forward progress and 3059 * we can safely use blocking synchronization constructs. Actually 3060 * disable ops. 3061 */ 3062 mutex_lock(&scx_ops_enable_mutex); 3063 3064 static_branch_disable(&__scx_switched_all); 3065 WRITE_ONCE(scx_switching_all, false); 3066 3067 /* 3068 * Avoid racing against fork. See scx_ops_enable() for explanation on 3069 * the locking order. 3070 */ 3071 percpu_down_write(&scx_fork_rwsem); 3072 cpus_read_lock(); 3073 3074 spin_lock_irq(&scx_tasks_lock); 3075 scx_task_iter_init(&sti); 3076 /* 3077 * Invoke scx_ops_exit_task() on all non-idle tasks, including 3078 * TASK_DEAD tasks. Because dead tasks may have a nonzero refcount, 3079 * we may not have invoked sched_ext_free() on them by the time a 3080 * scheduler is disabled. We must therefore exit the task here, or we'd 3081 * fail to invoke ops.exit_task(), as the scheduler will have been 3082 * unloaded by the time the task is subsequently exited on the 3083 * sched_ext_free() path. 
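 * Tasks that are not TASK_DEAD are additionally moved back to the
 * sched class matching their policy via __setscheduler_prio() before
 * ops.exit_task() is invoked on them.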
3084 */ 3085 while ((p = scx_task_iter_next_locked(&sti, true))) { 3086 const struct sched_class *old_class = p->sched_class; 3087 struct sched_enq_and_set_ctx ctx; 3088 3089 if (READ_ONCE(p->__state) != TASK_DEAD) { 3090 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, 3091 &ctx); 3092 3093 p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); 3094 __setscheduler_prio(p, p->prio); 3095 check_class_changing(task_rq(p), p, old_class); 3096 3097 sched_enq_and_set_task(&ctx); 3098 3099 check_class_changed(task_rq(p), p, old_class, p->prio); 3100 } 3101 scx_ops_exit_task(p); 3102 } 3103 scx_task_iter_exit(&sti); 3104 spin_unlock_irq(&scx_tasks_lock); 3105 3106 /* no task is on scx, turn off all the switches and flush in-progress calls */ 3107 static_branch_disable_cpuslocked(&__scx_ops_enabled); 3108 for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) 3109 static_branch_disable_cpuslocked(&scx_has_op[i]); 3110 static_branch_disable_cpuslocked(&scx_ops_enq_last); 3111 static_branch_disable_cpuslocked(&scx_ops_enq_exiting); 3112 static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); 3113 synchronize_rcu(); 3114 3115 cpus_read_unlock(); 3116 percpu_up_write(&scx_fork_rwsem); 3117 3118 if (ei->kind >= SCX_EXIT_ERROR) { 3119 printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); 3120 3121 if (ei->msg[0] == '\0') 3122 printk(KERN_ERR "sched_ext: %s\n", ei->reason); 3123 else 3124 printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); 3125 3126 stack_trace_print(ei->bt, ei->bt_len, 2); 3127 } 3128 3129 if (scx_ops.exit) 3130 SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); 3131 3132 cancel_delayed_work_sync(&scx_watchdog_work); 3133 3134 /* 3135 * Delete the kobject from the hierarchy eagerly in addition to just 3136 * dropping a reference. Otherwise, if the object is deleted 3137 * asynchronously, sysfs could observe an object of the same name still 3138 * in the hierarchy when another scheduler is loaded. 3139 */ 3140 kobject_del(scx_root_kobj); 3141 kobject_put(scx_root_kobj); 3142 scx_root_kobj = NULL; 3143 3144 memset(&scx_ops, 0, sizeof(scx_ops)); 3145 3146 rhashtable_walk_enter(&dsq_hash, &rht_iter); 3147 do { 3148 rhashtable_walk_start(&rht_iter); 3149 3150 while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) 3151 destroy_dsq(dsq->id); 3152 3153 rhashtable_walk_stop(&rht_iter); 3154 } while (dsq == ERR_PTR(-EAGAIN)); 3155 rhashtable_walk_exit(&rht_iter); 3156 3157 free_percpu(scx_dsp_ctx); 3158 scx_dsp_ctx = NULL; 3159 scx_dsp_max_batch = 0; 3160 3161 free_exit_info(scx_exit_info); 3162 scx_exit_info = NULL; 3163 3164 mutex_unlock(&scx_ops_enable_mutex); 3165 3166 WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != 3167 SCX_OPS_DISABLING); 3168 done: 3169 scx_ops_bypass(false); 3170 } 3171 3172 static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); 3173 3174 static void schedule_scx_ops_disable_work(void) 3175 { 3176 struct kthread_worker *helper = READ_ONCE(scx_ops_helper); 3177 3178 /* 3179 * We may be called spuriously before the first bpf_sched_ext_reg(). If 3180 * scx_ops_helper isn't set up yet, there's nothing to do. 
3181 */ 3182 if (helper) 3183 kthread_queue_work(helper, &scx_ops_disable_work); 3184 } 3185 3186 static void scx_ops_disable(enum scx_exit_kind kind) 3187 { 3188 int none = SCX_EXIT_NONE; 3189 3190 if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) 3191 kind = SCX_EXIT_ERROR; 3192 3193 atomic_try_cmpxchg(&scx_exit_kind, &none, kind); 3194 3195 schedule_scx_ops_disable_work(); 3196 } 3197 3198 static void dump_newline(struct seq_buf *s) 3199 { 3200 trace_sched_ext_dump(""); 3201 3202 /* @s may be zero sized and seq_buf triggers WARN if so */ 3203 if (s->size) 3204 seq_buf_putc(s, '\n'); 3205 } 3206 3207 static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) 3208 { 3209 va_list args; 3210 3211 #ifdef CONFIG_TRACEPOINTS 3212 if (trace_sched_ext_dump_enabled()) { 3213 /* protected by scx_dump_state()::dump_lock */ 3214 static char line_buf[SCX_EXIT_MSG_LEN]; 3215 3216 va_start(args, fmt); 3217 vscnprintf(line_buf, sizeof(line_buf), fmt, args); 3218 va_end(args); 3219 3220 trace_sched_ext_dump(line_buf); 3221 } 3222 #endif 3223 /* @s may be zero sized and seq_buf triggers WARN if so */ 3224 if (s->size) { 3225 va_start(args, fmt); 3226 seq_buf_vprintf(s, fmt, args); 3227 va_end(args); 3228 3229 seq_buf_putc(s, '\n'); 3230 } 3231 } 3232 3233 static void dump_stack_trace(struct seq_buf *s, const char *prefix, 3234 const unsigned long *bt, unsigned int len) 3235 { 3236 unsigned int i; 3237 3238 for (i = 0; i < len; i++) 3239 dump_line(s, "%s%pS", prefix, (void *)bt[i]); 3240 } 3241 3242 static void ops_dump_init(struct seq_buf *s, const char *prefix) 3243 { 3244 struct scx_dump_data *dd = &scx_dump_data; 3245 3246 lockdep_assert_irqs_disabled(); 3247 3248 dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ 3249 dd->first = true; 3250 dd->cursor = 0; 3251 dd->s = s; 3252 dd->prefix = prefix; 3253 } 3254 3255 static void ops_dump_flush(void) 3256 { 3257 struct scx_dump_data *dd = &scx_dump_data; 3258 char *line = dd->buf.line; 3259 3260 if (!dd->cursor) 3261 return; 3262 3263 /* 3264 * There's something to flush and this is the first line. Insert a blank 3265 * line to distinguish ops dump. 3266 */ 3267 if (dd->first) { 3268 dump_newline(dd->s); 3269 dd->first = false; 3270 } 3271 3272 /* 3273 * There may be multiple lines in $line. Scan and emit each line 3274 * separately. 3275 */ 3276 while (true) { 3277 char *end = line; 3278 char c; 3279 3280 while (*end != '\n' && *end != '\0') 3281 end++; 3282 3283 /* 3284 * If $line overflowed, it may not have newline at the end. 3285 * Always emit with a newline. 
3286 */ 3287 c = *end; 3288 *end = '\0'; 3289 dump_line(dd->s, "%s%s", dd->prefix, line); 3290 if (c == '\0') 3291 break; 3292 3293 /* move to the next line */ 3294 end++; 3295 if (*end == '\0') 3296 break; 3297 line = end; 3298 } 3299 3300 dd->cursor = 0; 3301 } 3302 3303 static void ops_dump_exit(void) 3304 { 3305 ops_dump_flush(); 3306 scx_dump_data.cpu = -1; 3307 } 3308 3309 static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, 3310 struct task_struct *p, char marker) 3311 { 3312 static unsigned long bt[SCX_EXIT_BT_LEN]; 3313 char dsq_id_buf[19] = "(n/a)"; 3314 unsigned long ops_state = atomic_long_read(&p->scx.ops_state); 3315 unsigned int bt_len; 3316 3317 if (p->scx.dsq) 3318 scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", 3319 (unsigned long long)p->scx.dsq->id); 3320 3321 dump_newline(s); 3322 dump_line(s, " %c%c %s[%d] %+ldms", 3323 marker, task_state_to_char(p), p->comm, p->pid, 3324 jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); 3325 dump_line(s, " scx_state/flags=%u/0x%x ops_state/qseq=%lu/%lu", 3326 scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, 3327 ops_state & SCX_OPSS_STATE_MASK, 3328 ops_state >> SCX_OPSS_QSEQ_SHIFT); 3329 dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", 3330 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); 3331 dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); 3332 3333 if (SCX_HAS_OP(dump_task)) { 3334 ops_dump_init(s, " "); 3335 SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); 3336 ops_dump_exit(); 3337 } 3338 3339 bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); 3340 if (bt_len) { 3341 dump_newline(s); 3342 dump_stack_trace(s, " ", bt, bt_len); 3343 } 3344 } 3345 3346 static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) 3347 { 3348 static DEFINE_SPINLOCK(dump_lock); 3349 static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; 3350 struct scx_dump_ctx dctx = { 3351 .kind = ei->kind, 3352 .exit_code = ei->exit_code, 3353 .reason = ei->reason, 3354 .at_ns = ktime_get_ns(), 3355 .at_jiffies = jiffies, 3356 }; 3357 struct seq_buf s; 3358 unsigned long flags; 3359 char *buf; 3360 int cpu; 3361 3362 spin_lock_irqsave(&dump_lock, flags); 3363 3364 seq_buf_init(&s, ei->dump, dump_len); 3365 3366 if (ei->kind == SCX_EXIT_NONE) { 3367 dump_line(&s, "Debug dump triggered by %s", ei->reason); 3368 } else { 3369 dump_line(&s, "%s[%d] triggered exit kind %d:", 3370 current->comm, current->pid, ei->kind); 3371 dump_line(&s, " %s (%s)", ei->reason, ei->msg); 3372 dump_newline(&s); 3373 dump_line(&s, "Backtrace:"); 3374 dump_stack_trace(&s, " ", ei->bt, ei->bt_len); 3375 } 3376 3377 if (SCX_HAS_OP(dump)) { 3378 ops_dump_init(&s, ""); 3379 SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); 3380 ops_dump_exit(); 3381 } 3382 3383 dump_newline(&s); 3384 dump_line(&s, "CPU states"); 3385 dump_line(&s, "----------"); 3386 3387 for_each_possible_cpu(cpu) { 3388 struct rq *rq = cpu_rq(cpu); 3389 struct rq_flags rf; 3390 struct task_struct *p; 3391 struct seq_buf ns; 3392 size_t avail, used; 3393 bool idle; 3394 3395 rq_lock(rq, &rf); 3396 3397 idle = list_empty(&rq->scx.runnable_list) && 3398 rq->curr->sched_class == &idle_sched_class; 3399 3400 if (idle && !SCX_HAS_OP(dump_cpu)) 3401 goto next; 3402 3403 /* 3404 * We don't yet know whether ops.dump_cpu() will produce output 3405 * and we may want to skip the default CPU dump if it doesn't. 3406 * Use a nested seq_buf to generate the standard dump so that we 3407 * can decide whether to commit later. 
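 * $ns borrows $s's remaining buffer via seq_buf_get_buf(); nothing
 * becomes visible in $s until seq_buf_commit() is called with the
 * number of bytes $ns actually used.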
3408 */ 3409 avail = seq_buf_get_buf(&s, &buf); 3410 seq_buf_init(&ns, buf, avail); 3411 3412 dump_newline(&ns); 3413 dump_line(&ns, "CPU %-4d: nr_run=%u ops_qseq=%lu", 3414 cpu, rq->scx.nr_running, rq->scx.ops_qseq); 3415 dump_line(&ns, " curr=%s[%d] class=%ps", 3416 rq->curr->comm, rq->curr->pid, 3417 rq->curr->sched_class); 3418 3419 used = seq_buf_used(&ns); 3420 if (SCX_HAS_OP(dump_cpu)) { 3421 ops_dump_init(&ns, " "); 3422 SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); 3423 ops_dump_exit(); 3424 } 3425 3426 /* 3427 * If idle && nothing generated by ops.dump_cpu(), there's 3428 * nothing interesting. Skip. 3429 */ 3430 if (idle && used == seq_buf_used(&ns)) 3431 goto next; 3432 3433 /* 3434 * $s may already have overflowed when $ns was created. If so, 3435 * calling commit on it will trigger BUG. 3436 */ 3437 if (avail) { 3438 seq_buf_commit(&s, seq_buf_used(&ns)); 3439 if (seq_buf_has_overflowed(&ns)) 3440 seq_buf_set_overflow(&s); 3441 } 3442 3443 if (rq->curr->sched_class == &ext_sched_class) 3444 scx_dump_task(&s, &dctx, rq->curr, '*'); 3445 3446 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) 3447 scx_dump_task(&s, &dctx, p, ' '); 3448 next: 3449 rq_unlock(rq, &rf); 3450 } 3451 3452 if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) 3453 memcpy(ei->dump + dump_len - sizeof(trunc_marker), 3454 trunc_marker, sizeof(trunc_marker)); 3455 3456 spin_unlock_irqrestore(&dump_lock, flags); 3457 } 3458 3459 static void scx_ops_error_irq_workfn(struct irq_work *irq_work) 3460 { 3461 struct scx_exit_info *ei = scx_exit_info; 3462 3463 if (ei->kind >= SCX_EXIT_ERROR) 3464 scx_dump_state(ei, scx_ops.exit_dump_len); 3465 3466 schedule_scx_ops_disable_work(); 3467 } 3468 3469 static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); 3470 3471 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, 3472 s64 exit_code, 3473 const char *fmt, ...) 3474 { 3475 struct scx_exit_info *ei = scx_exit_info; 3476 int none = SCX_EXIT_NONE; 3477 va_list args; 3478 3479 if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) 3480 return; 3481 3482 ei->exit_code = exit_code; 3483 3484 if (kind >= SCX_EXIT_ERROR) 3485 ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); 3486 3487 va_start(args, fmt); 3488 vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); 3489 va_end(args); 3490 3491 /* 3492 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again 3493 * in scx_ops_disable_workfn(). 3494 */ 3495 ei->kind = kind; 3496 ei->reason = scx_exit_reason(ei->kind); 3497 3498 irq_work_queue(&scx_ops_error_irq_work); 3499 } 3500 3501 static struct kthread_worker *scx_create_rt_helper(const char *name) 3502 { 3503 struct kthread_worker *helper; 3504 3505 helper = kthread_create_worker(0, name); 3506 if (helper) 3507 sched_set_fifo(helper->task); 3508 return helper; 3509 } 3510 3511 static int validate_ops(const struct sched_ext_ops *ops) 3512 { 3513 /* 3514 * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the 3515 * ops.enqueue() callback isn't implemented. 
3516 */ 3517 if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { 3518 scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); 3519 return -EINVAL; 3520 } 3521 3522 return 0; 3523 } 3524 3525 static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) 3526 { 3527 struct scx_task_iter sti; 3528 struct task_struct *p; 3529 unsigned long timeout; 3530 int i, ret; 3531 3532 mutex_lock(&scx_ops_enable_mutex); 3533 3534 if (!scx_ops_helper) { 3535 WRITE_ONCE(scx_ops_helper, 3536 scx_create_rt_helper("sched_ext_ops_helper")); 3537 if (!scx_ops_helper) { 3538 ret = -ENOMEM; 3539 goto err_unlock; 3540 } 3541 } 3542 3543 if (scx_ops_enable_state() != SCX_OPS_DISABLED) { 3544 ret = -EBUSY; 3545 goto err_unlock; 3546 } 3547 3548 scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); 3549 if (!scx_root_kobj) { 3550 ret = -ENOMEM; 3551 goto err_unlock; 3552 } 3553 3554 scx_root_kobj->kset = scx_kset; 3555 ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); 3556 if (ret < 0) 3557 goto err; 3558 3559 scx_exit_info = alloc_exit_info(ops->exit_dump_len); 3560 if (!scx_exit_info) { 3561 ret = -ENOMEM; 3562 goto err_del; 3563 } 3564 3565 /* 3566 * Set scx_ops, transition to PREPPING and clear exit info to arm the 3567 * disable path. Failure triggers full disabling from here on. 3568 */ 3569 scx_ops = *ops; 3570 3571 WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != 3572 SCX_OPS_DISABLED); 3573 3574 atomic_set(&scx_exit_kind, SCX_EXIT_NONE); 3575 scx_warned_zero_slice = false; 3576 3577 atomic_long_set(&scx_nr_rejected, 0); 3578 3579 /* 3580 * Keep CPUs stable during enable so that the BPF scheduler can track 3581 * online CPUs by watching ->on/offline_cpu() after ->init(). 3582 */ 3583 cpus_read_lock(); 3584 3585 if (scx_ops.init) { 3586 ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init); 3587 if (ret) { 3588 ret = ops_sanitize_err("init", ret); 3589 goto err_disable_unlock_cpus; 3590 } 3591 } 3592 3593 cpus_read_unlock(); 3594 3595 ret = validate_ops(ops); 3596 if (ret) 3597 goto err_disable; 3598 3599 WARN_ON_ONCE(scx_dsp_ctx); 3600 scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; 3601 scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf, 3602 scx_dsp_max_batch), 3603 __alignof__(struct scx_dsp_ctx)); 3604 if (!scx_dsp_ctx) { 3605 ret = -ENOMEM; 3606 goto err_disable; 3607 } 3608 3609 if (ops->timeout_ms) 3610 timeout = msecs_to_jiffies(ops->timeout_ms); 3611 else 3612 timeout = SCX_WATCHDOG_MAX_TIMEOUT; 3613 3614 WRITE_ONCE(scx_watchdog_timeout, timeout); 3615 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 3616 queue_delayed_work(system_unbound_wq, &scx_watchdog_work, 3617 scx_watchdog_timeout / 2); 3618 3619 /* 3620 * Lock out forks before opening the floodgate so that they don't wander 3621 * into the operations prematurely. 3622 * 3623 * We don't need to keep the CPUs stable but grab cpus_read_lock() to 3624 * ease future locking changes for cgroup support.
3625 * 3626 * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the 3627 * following dependency chain: 3628 * 3629 * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock 3630 */ 3631 percpu_down_write(&scx_fork_rwsem); 3632 cpus_read_lock(); 3633 3634 for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) 3635 if (((void (**)(void))ops)[i]) 3636 static_branch_enable_cpuslocked(&scx_has_op[i]); 3637 3638 if (ops->flags & SCX_OPS_ENQ_LAST) 3639 static_branch_enable_cpuslocked(&scx_ops_enq_last); 3640 3641 if (ops->flags & SCX_OPS_ENQ_EXITING) 3642 static_branch_enable_cpuslocked(&scx_ops_enq_exiting); 3643 3644 if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { 3645 reset_idle_masks(); 3646 static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); 3647 } else { 3648 static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); 3649 } 3650 3651 static_branch_enable_cpuslocked(&__scx_ops_enabled); 3652 3653 /* 3654 * Enable ops for every task. Fork is excluded by scx_fork_rwsem 3655 * preventing new tasks from being added. No need to exclude tasks 3656 * leaving as sched_ext_free() can handle both prepped and enabled 3657 * tasks. Prep all tasks first and then enable them with preemption 3658 * disabled. 3659 */ 3660 spin_lock_irq(&scx_tasks_lock); 3661 3662 scx_task_iter_init(&sti); 3663 while ((p = scx_task_iter_next_locked(&sti, false))) { 3664 get_task_struct(p); 3665 scx_task_iter_rq_unlock(&sti); 3666 spin_unlock_irq(&scx_tasks_lock); 3667 3668 ret = scx_ops_init_task(p, task_group(p), false); 3669 if (ret) { 3670 put_task_struct(p); 3671 spin_lock_irq(&scx_tasks_lock); 3672 scx_task_iter_exit(&sti); 3673 spin_unlock_irq(&scx_tasks_lock); 3674 pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", 3675 ret, p->comm, p->pid); 3676 goto err_disable_unlock_all; 3677 } 3678 3679 put_task_struct(p); 3680 spin_lock_irq(&scx_tasks_lock); 3681 } 3682 scx_task_iter_exit(&sti); 3683 3684 /* 3685 * All tasks are prepped but are still ops-disabled. Ensure that 3686 * %current can't be scheduled out and switch everyone. 3687 * preempt_disable() is necessary because we can't guarantee that 3688 * %current won't be starved if scheduled out while switching. 3689 */ 3690 preempt_disable(); 3691 3692 /* 3693 * From here on, the disable path must assume that tasks have ops 3694 * enabled and need to be recovered. 3695 * 3696 * Transition to ENABLING fails iff the BPF scheduler has already 3697 * triggered scx_bpf_error(). Returning an error code here would lose 3698 * the recorded error information. Exit indicating success so that the 3699 * error is notified through ops.exit() with all the details. 3700 */ 3701 if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { 3702 preempt_enable(); 3703 spin_unlock_irq(&scx_tasks_lock); 3704 WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); 3705 ret = 0; 3706 goto err_disable_unlock_all; 3707 } 3708 3709 /* 3710 * We're fully committed and can't fail. The PREPPED -> ENABLED 3711 * transitions here are synchronized against sched_ext_free() through 3712 * scx_tasks_lock. 
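 * The loop below cycles every task through a dequeue/enqueue pair so
 * that __setscheduler_prio() can move it onto the SCX class where
 * applicable.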
3713 */ 3714 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 3715 3716 scx_task_iter_init(&sti); 3717 while ((p = scx_task_iter_next_locked(&sti, false))) { 3718 const struct sched_class *old_class = p->sched_class; 3719 struct sched_enq_and_set_ctx ctx; 3720 3721 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 3722 3723 scx_set_task_state(p, SCX_TASK_READY); 3724 __setscheduler_prio(p, p->prio); 3725 check_class_changing(task_rq(p), p, old_class); 3726 3727 sched_enq_and_set_task(&ctx); 3728 3729 check_class_changed(task_rq(p), p, old_class, p->prio); 3730 } 3731 scx_task_iter_exit(&sti); 3732 3733 spin_unlock_irq(&scx_tasks_lock); 3734 preempt_enable(); 3735 cpus_read_unlock(); 3736 percpu_up_write(&scx_fork_rwsem); 3737 3738 /* see above ENABLING transition for the explanation on exiting with 0 */ 3739 if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { 3740 WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); 3741 ret = 0; 3742 goto err_disable; 3743 } 3744 3745 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) 3746 static_branch_enable(&__scx_switched_all); 3747 3748 kobject_uevent(scx_root_kobj, KOBJ_ADD); 3749 mutex_unlock(&scx_ops_enable_mutex); 3750 3751 return 0; 3752 3753 err_del: 3754 kobject_del(scx_root_kobj); 3755 err: 3756 kobject_put(scx_root_kobj); 3757 scx_root_kobj = NULL; 3758 if (scx_exit_info) { 3759 free_exit_info(scx_exit_info); 3760 scx_exit_info = NULL; 3761 } 3762 err_unlock: 3763 mutex_unlock(&scx_ops_enable_mutex); 3764 return ret; 3765 3766 err_disable_unlock_all: 3767 percpu_up_write(&scx_fork_rwsem); 3768 err_disable_unlock_cpus: 3769 cpus_read_unlock(); 3770 err_disable: 3771 mutex_unlock(&scx_ops_enable_mutex); 3772 /* must be fully disabled before returning */ 3773 scx_ops_disable(SCX_EXIT_ERROR); 3774 kthread_flush_work(&scx_ops_disable_work); 3775 return ret; 3776 } 3777 3778 3779 /******************************************************************************** 3780 * bpf_struct_ops plumbing. 3781 */ 3782 #include <linux/bpf_verifier.h> 3783 #include <linux/bpf.h> 3784 #include <linux/btf.h> 3785 3786 extern struct btf *btf_vmlinux; 3787 static const struct btf_type *task_struct_type; 3788 static u32 task_struct_type_id; 3789 3790 static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size, 3791 enum bpf_access_type type, 3792 const struct bpf_prog *prog, 3793 struct bpf_insn_access_aux *info) 3794 { 3795 struct btf *btf = bpf_get_btf_vmlinux(); 3796 const struct bpf_struct_ops_desc *st_ops_desc; 3797 const struct btf_member *member; 3798 const struct btf_type *t; 3799 u32 btf_id, member_idx; 3800 const char *mname; 3801 3802 /* struct_ops op args are all sequential, 64-bit numbers */ 3803 if (off != arg_n * sizeof(__u64)) 3804 return false; 3805 3806 /* btf_id should be the type id of struct sched_ext_ops */ 3807 btf_id = prog->aux->attach_btf_id; 3808 st_ops_desc = bpf_struct_ops_find(btf, btf_id); 3809 if (!st_ops_desc) 3810 return false; 3811 3812 /* BTF type of struct sched_ext_ops */ 3813 t = st_ops_desc->type; 3814 3815 member_idx = prog->expected_attach_type; 3816 if (member_idx >= btf_type_vlen(t)) 3817 return false; 3818 3819 /* 3820 * Get the member name of this struct_ops program, which corresponds to 3821 * a field in struct sched_ext_ops. For example, the member name of the 3822 * dispatch struct_ops program (callback) is "dispatch". 
3823 */ 3824 member = &btf_type_member(t)[member_idx]; 3825 mname = btf_name_by_offset(btf_vmlinux, member->name_off); 3826 3827 if (!strcmp(mname, op)) { 3828 /* 3829 * The value is a pointer to a type (struct task_struct) given 3830 * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED), 3831 * however, can be a NULL (PTR_MAYBE_NULL). The BPF program 3832 * should check the pointer to make sure it is not NULL before 3833 * using it, or the verifier will reject the program. 3834 * 3835 * Longer term, this is something that should be addressed by 3836 * BTF, and be fully contained within the verifier. 3837 */ 3838 info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED; 3839 info->btf = btf_vmlinux; 3840 info->btf_id = task_struct_type_id; 3841 3842 return true; 3843 } 3844 3845 return false; 3846 } 3847 3848 static bool bpf_scx_is_valid_access(int off, int size, 3849 enum bpf_access_type type, 3850 const struct bpf_prog *prog, 3851 struct bpf_insn_access_aux *info) 3852 { 3853 if (type != BPF_READ) 3854 return false; 3855 if (set_arg_maybe_null("dispatch", 1, off, size, type, prog, info) || 3856 set_arg_maybe_null("yield", 1, off, size, type, prog, info)) 3857 return true; 3858 if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) 3859 return false; 3860 if (off % size != 0) 3861 return false; 3862 3863 return btf_ctx_access(off, size, type, prog, info); 3864 } 3865 3866 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, 3867 const struct bpf_reg_state *reg, int off, 3868 int size) 3869 { 3870 const struct btf_type *t; 3871 3872 t = btf_type_by_id(reg->btf, reg->btf_id); 3873 if (t == task_struct_type) { 3874 if (off >= offsetof(struct task_struct, scx.slice) && 3875 off + size <= offsetofend(struct task_struct, scx.slice)) 3876 return SCALAR_VALUE; 3877 if (off >= offsetof(struct task_struct, scx.disallow) && 3878 off + size <= offsetofend(struct task_struct, scx.disallow)) 3879 return SCALAR_VALUE; 3880 } 3881 3882 return -EACCES; 3883 } 3884 3885 static const struct bpf_func_proto * 3886 bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 3887 { 3888 switch (func_id) { 3889 case BPF_FUNC_task_storage_get: 3890 return &bpf_task_storage_get_proto; 3891 case BPF_FUNC_task_storage_delete: 3892 return &bpf_task_storage_delete_proto; 3893 default: 3894 return bpf_base_func_proto(func_id, prog); 3895 } 3896 } 3897 3898 static const struct bpf_verifier_ops bpf_scx_verifier_ops = { 3899 .get_func_proto = bpf_scx_get_func_proto, 3900 .is_valid_access = bpf_scx_is_valid_access, 3901 .btf_struct_access = bpf_scx_btf_struct_access, 3902 }; 3903 3904 static int bpf_scx_init_member(const struct btf_type *t, 3905 const struct btf_member *member, 3906 void *kdata, const void *udata) 3907 { 3908 const struct sched_ext_ops *uops = udata; 3909 struct sched_ext_ops *ops = kdata; 3910 u32 moff = __btf_member_bit_offset(t, member) / 8; 3911 int ret; 3912 3913 switch (moff) { 3914 case offsetof(struct sched_ext_ops, dispatch_max_batch): 3915 if (*(u32 *)(udata + moff) > INT_MAX) 3916 return -E2BIG; 3917 ops->dispatch_max_batch = *(u32 *)(udata + moff); 3918 return 1; 3919 case offsetof(struct sched_ext_ops, flags): 3920 if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) 3921 return -EINVAL; 3922 ops->flags = *(u64 *)(udata + moff); 3923 return 1; 3924 case offsetof(struct sched_ext_ops, name): 3925 ret = bpf_obj_name_cpy(ops->name, uops->name, 3926 sizeof(ops->name)); 3927 if (ret < 0) 3928 return ret; 3929 if (ret == 0) 3930 return -EINVAL; 3931 return 1; 
3932 case offsetof(struct sched_ext_ops, timeout_ms): 3933 if (msecs_to_jiffies(*(u32 *)(udata + moff)) > 3934 SCX_WATCHDOG_MAX_TIMEOUT) 3935 return -E2BIG; 3936 ops->timeout_ms = *(u32 *)(udata + moff); 3937 return 1; 3938 case offsetof(struct sched_ext_ops, exit_dump_len): 3939 ops->exit_dump_len = 3940 *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; 3941 return 1; 3942 } 3943 3944 return 0; 3945 } 3946 3947 static int bpf_scx_check_member(const struct btf_type *t, 3948 const struct btf_member *member, 3949 const struct bpf_prog *prog) 3950 { 3951 u32 moff = __btf_member_bit_offset(t, member) / 8; 3952 3953 switch (moff) { 3954 case offsetof(struct sched_ext_ops, init_task): 3955 case offsetof(struct sched_ext_ops, init): 3956 case offsetof(struct sched_ext_ops, exit): 3957 break; 3958 default: 3959 if (prog->sleepable) 3960 return -EINVAL; 3961 } 3962 3963 return 0; 3964 } 3965 3966 static int bpf_scx_reg(void *kdata, struct bpf_link *link) 3967 { 3968 return scx_ops_enable(kdata, link); 3969 } 3970 3971 static void bpf_scx_unreg(void *kdata, struct bpf_link *link) 3972 { 3973 scx_ops_disable(SCX_EXIT_UNREG); 3974 kthread_flush_work(&scx_ops_disable_work); 3975 } 3976 3977 static int bpf_scx_init(struct btf *btf) 3978 { 3979 u32 type_id; 3980 3981 type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); 3982 if (type_id < 0) 3983 return -EINVAL; 3984 task_struct_type = btf_type_by_id(btf, type_id); 3985 task_struct_type_id = type_id; 3986 3987 return 0; 3988 } 3989 3990 static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) 3991 { 3992 /* 3993 * sched_ext does not support updating the actively-loaded BPF 3994 * scheduler, as registering a BPF scheduler can always fail if the 3995 * scheduler returns an error code for e.g. ops.init(), ops.init_task(), 3996 * etc. Similarly, we can always race with unregistration happening 3997 * elsewhere, such as with sysrq. 
3998 */ 3999 return -EOPNOTSUPP; 4000 } 4001 4002 static int bpf_scx_validate(void *kdata) 4003 { 4004 return 0; 4005 } 4006 4007 static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } 4008 static void enqueue_stub(struct task_struct *p, u64 enq_flags) {} 4009 static void dequeue_stub(struct task_struct *p, u64 enq_flags) {} 4010 static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {} 4011 static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; } 4012 static void set_weight_stub(struct task_struct *p, u32 weight) {} 4013 static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {} 4014 static void update_idle_stub(s32 cpu, bool idle) {} 4015 static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } 4016 static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} 4017 static void enable_stub(struct task_struct *p) {} 4018 static void disable_stub(struct task_struct *p) {} 4019 static s32 init_stub(void) { return -EINVAL; } 4020 static void exit_stub(struct scx_exit_info *info) {} 4021 4022 static struct sched_ext_ops __bpf_ops_sched_ext_ops = { 4023 .select_cpu = select_cpu_stub, 4024 .enqueue = enqueue_stub, 4025 .dequeue = dequeue_stub, 4026 .dispatch = dispatch_stub, 4027 .yield = yield_stub, 4028 .set_weight = set_weight_stub, 4029 .set_cpumask = set_cpumask_stub, 4030 .update_idle = update_idle_stub, 4031 .init_task = init_task_stub, 4032 .exit_task = exit_task_stub, 4033 .enable = enable_stub, 4034 .disable = disable_stub, 4035 .init = init_stub, 4036 .exit = exit_stub, 4037 }; 4038 4039 static struct bpf_struct_ops bpf_sched_ext_ops = { 4040 .verifier_ops = &bpf_scx_verifier_ops, 4041 .reg = bpf_scx_reg, 4042 .unreg = bpf_scx_unreg, 4043 .check_member = bpf_scx_check_member, 4044 .init_member = bpf_scx_init_member, 4045 .init = bpf_scx_init, 4046 .update = bpf_scx_update, 4047 .validate = bpf_scx_validate, 4048 .name = "sched_ext_ops", 4049 .owner = THIS_MODULE, 4050 .cfi_stubs = &__bpf_ops_sched_ext_ops 4051 }; 4052 4053 4054 /******************************************************************************** 4055 * System integration and init. 4056 */ 4057 4058 static void sysrq_handle_sched_ext_reset(u8 key) 4059 { 4060 if (scx_ops_helper) 4061 scx_ops_disable(SCX_EXIT_SYSRQ); 4062 else 4063 pr_info("sched_ext: BPF scheduler not yet used\n"); 4064 } 4065 4066 static const struct sysrq_key_op sysrq_sched_ext_reset_op = { 4067 .handler = sysrq_handle_sched_ext_reset, 4068 .help_msg = "reset-sched-ext(S)", 4069 .action_msg = "Disable sched_ext and revert all tasks to CFS", 4070 .enable_mask = SYSRQ_ENABLE_RTNICE, 4071 }; 4072 4073 static void sysrq_handle_sched_ext_dump(u8 key) 4074 { 4075 struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; 4076 4077 if (scx_enabled()) 4078 scx_dump_state(&ei, 0); 4079 } 4080 4081 static const struct sysrq_key_op sysrq_sched_ext_dump_op = { 4082 .handler = sysrq_handle_sched_ext_dump, 4083 .help_msg = "dump-sched-ext(D)", 4084 .action_msg = "Trigger sched_ext debug dump", 4085 .enable_mask = SYSRQ_ENABLE_RTNICE, 4086 }; 4087 4088 /** 4089 * print_scx_info - print out sched_ext scheduler state 4090 * @log_lvl: the log level to use when printing 4091 * @p: target task 4092 * 4093 * If a sched_ext scheduler is enabled, print the name and state of the 4094 * scheduler. If @p is on sched_ext, print further information about the task. 
4095 * 4096 * This function can be safely called on any task as long as the task_struct 4097 * itself is accessible. While safe, this function isn't synchronized and may 4098 * print out mixups or garbages of limited length. 4099 */ 4100 void print_scx_info(const char *log_lvl, struct task_struct *p) 4101 { 4102 enum scx_ops_enable_state state = scx_ops_enable_state(); 4103 const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; 4104 char runnable_at_buf[22] = "?"; 4105 struct sched_class *class; 4106 unsigned long runnable_at; 4107 4108 if (state == SCX_OPS_DISABLED) 4109 return; 4110 4111 /* 4112 * Carefully check if the task was running on sched_ext, and then 4113 * carefully copy the time it's been runnable, and its state. 4114 */ 4115 if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || 4116 class != &ext_sched_class) { 4117 printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, 4118 scx_ops_enable_state_str[state], all); 4119 return; 4120 } 4121 4122 if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, 4123 sizeof(runnable_at))) 4124 scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", 4125 jiffies_delta_msecs(runnable_at, jiffies)); 4126 4127 /* print everything onto one line to conserve console space */ 4128 printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", 4129 log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, 4130 runnable_at_buf); 4131 } 4132 4133 void __init init_sched_ext_class(void) 4134 { 4135 s32 cpu, v; 4136 4137 /* 4138 * The following is to prevent the compiler from optimizing out the enum 4139 * definitions so that BPF scheduler implementations can use them 4140 * through the generated vmlinux.h. 4141 */ 4142 WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP); 4143 4144 BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); 4145 init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); 4146 #ifdef CONFIG_SMP 4147 BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); 4148 BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); 4149 #endif 4150 for_each_possible_cpu(cpu) { 4151 struct rq *rq = cpu_rq(cpu); 4152 4153 init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); 4154 INIT_LIST_HEAD(&rq->scx.runnable_list); 4155 } 4156 4157 register_sysrq_key('S', &sysrq_sched_ext_reset_op); 4158 register_sysrq_key('D', &sysrq_sched_ext_dump_op); 4159 INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); 4160 } 4161 4162 4163 /******************************************************************************** 4164 * Helpers that can be called from the BPF scheduler. 4165 */ 4166 #include <linux/btf_ids.h> 4167 4168 __bpf_kfunc_start_defs(); 4169 4170 /** 4171 * scx_bpf_create_dsq - Create a custom DSQ 4172 * @dsq_id: DSQ to create 4173 * @node: NUMA node to allocate from 4174 * 4175 * Create a custom DSQ identified by @dsq_id. Can be called from ops.init() and 4176 * ops.init_task(). 
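 *
 * For example, a minimal ops.init() sketch which creates a single shared DSQ,
 * assuming the BPF_STRUCT_OPS_SLEEPABLE() convenience macro from the SCX
 * BPF-side tooling headers and a scheduler-defined SHARED_DSQ id (both are
 * illustrative, not defined in this file):
 *
 *	s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
 *	{
 *		return scx_bpf_create_dsq(SHARED_DSQ, NUMA_NO_NODE);
 *	}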
4177 */ 4178 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) 4179 { 4180 if (!scx_kf_allowed(SCX_KF_SLEEPABLE)) 4181 return -EINVAL; 4182 4183 if (unlikely(node >= (int)nr_node_ids || 4184 (node < 0 && node != NUMA_NO_NODE))) 4185 return -EINVAL; 4186 return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); 4187 } 4188 4189 __bpf_kfunc_end_defs(); 4190 4191 BTF_KFUNCS_START(scx_kfunc_ids_sleepable) 4192 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) 4193 BTF_KFUNCS_END(scx_kfunc_ids_sleepable) 4194 4195 static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { 4196 .owner = THIS_MODULE, 4197 .set = &scx_kfunc_ids_sleepable, 4198 }; 4199 4200 __bpf_kfunc_start_defs(); 4201 4202 /** 4203 * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() 4204 * @p: task_struct to select a CPU for 4205 * @prev_cpu: CPU @p was on previously 4206 * @wake_flags: %SCX_WAKE_* flags 4207 * @is_idle: out parameter indicating whether the returned CPU is idle 4208 * 4209 * Can only be called from ops.select_cpu() if the built-in CPU selection is 4210 * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. 4211 * @p, @prev_cpu and @wake_flags match ops.select_cpu(). 4212 * 4213 * Returns the picked CPU with *@is_idle indicating whether the picked CPU is 4214 * currently idle and thus a good candidate for direct dispatching. 4215 */ 4216 __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, 4217 u64 wake_flags, bool *is_idle) 4218 { 4219 if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { 4220 *is_idle = false; 4221 return prev_cpu; 4222 } 4223 #ifdef CONFIG_SMP 4224 return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); 4225 #else 4226 *is_idle = false; 4227 return prev_cpu; 4228 #endif 4229 } 4230 4231 __bpf_kfunc_end_defs(); 4232 4233 BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) 4234 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) 4235 BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) 4236 4237 static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { 4238 .owner = THIS_MODULE, 4239 .set = &scx_kfunc_ids_select_cpu, 4240 }; 4241 4242 static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) 4243 { 4244 if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) 4245 return false; 4246 4247 lockdep_assert_irqs_disabled(); 4248 4249 if (unlikely(!p)) { 4250 scx_ops_error("called with NULL task"); 4251 return false; 4252 } 4253 4254 if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { 4255 scx_ops_error("invalid enq_flags 0x%llx", enq_flags); 4256 return false; 4257 } 4258 4259 return true; 4260 } 4261 4262 static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) 4263 { 4264 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 4265 struct task_struct *ddsp_task; 4266 4267 ddsp_task = __this_cpu_read(direct_dispatch_task); 4268 if (ddsp_task) { 4269 mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); 4270 return; 4271 } 4272 4273 if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { 4274 scx_ops_error("dispatch buffer overflow"); 4275 return; 4276 } 4277 4278 dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ 4279 .task = p, 4280 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, 4281 .dsq_id = dsq_id, 4282 .enq_flags = enq_flags, 4283 }; 4284 } 4285 4286 __bpf_kfunc_start_defs(); 4287 4288 /** 4289 * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ 4290 * @p: task_struct to dispatch 4291 * @dsq_id: DSQ to dispatch to 4292 * @slice: duration @p can run for in nsecs 4293 * 
@enq_flags: SCX_ENQ_* 4294 * 4295 * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe 4296 * to call this function spuriously. Can be called from ops.enqueue(), 4297 * ops.select_cpu(), and ops.dispatch(). 4298 * 4299 * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch 4300 * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be 4301 * used to target the local DSQ of a CPU other than the enqueueing one. Use 4302 * ops.select_cpu() to be on the target CPU in the first place. 4303 * 4304 * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p 4305 * will be directly dispatched to the corresponding dispatch queue after 4306 * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be 4307 * dispatched to the local DSQ of the CPU returned by ops.select_cpu(). 4308 * @enq_flags are OR'd with the enqueue flags on the enqueue path before the 4309 * task is dispatched. 4310 * 4311 * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id 4312 * and this function can be called up to ops.dispatch_max_batch times to dispatch 4313 * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of 4314 * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. 4315 * 4316 * This function doesn't have any locking restrictions and may be called under 4317 * BPF locks (in the future when BPF introduces more flexible locking). 4318 * 4319 * @p is allowed to run for @slice. The scheduling path is triggered on slice 4320 * exhaustion. If zero, the current residual slice is maintained. 4321 */ 4322 __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, 4323 u64 enq_flags) 4324 { 4325 if (!scx_dispatch_preamble(p, enq_flags)) 4326 return; 4327 4328 if (slice) 4329 p->scx.slice = slice; 4330 else 4331 p->scx.slice = p->scx.slice ?: 1; 4332 4333 scx_dispatch_commit(p, dsq_id, enq_flags); 4334 } 4335 4336 __bpf_kfunc_end_defs(); 4337 4338 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) 4339 BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) 4340 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) 4341 4342 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { 4343 .owner = THIS_MODULE, 4344 .set = &scx_kfunc_ids_enqueue_dispatch, 4345 }; 4346 4347 __bpf_kfunc_start_defs(); 4348 4349 /** 4350 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots 4351 * 4352 * Can only be called from ops.dispatch(). 4353 */ 4354 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) 4355 { 4356 if (!scx_kf_allowed(SCX_KF_DISPATCH)) 4357 return 0; 4358 4359 return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); 4360 } 4361 4362 /** 4363 * scx_bpf_dispatch_cancel - Cancel the latest dispatch 4364 * 4365 * Cancel the latest dispatch. Can be called multiple times to cancel further 4366 * dispatches. Can only be called from ops.dispatch(). 4367 */ 4368 __bpf_kfunc void scx_bpf_dispatch_cancel(void) 4369 { 4370 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 4371 4372 if (!scx_kf_allowed(SCX_KF_DISPATCH)) 4373 return; 4374 4375 if (dspc->cursor > 0) 4376 dspc->cursor--; 4377 else 4378 scx_ops_error("dispatch buffer underflow"); 4379 } 4380 4381 /** 4382 * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ 4383 * @dsq_id: DSQ to consume 4384 * 4385 * Consume a task from the non-local DSQ identified by @dsq_id and transfer it 4386 * to the current CPU's local DSQ for execution.
Can only be called from 4387 * ops.dispatch(). 4388 * 4389 * This function flushes the in-flight dispatches from scx_bpf_dispatch() before 4390 * trying to consume the specified DSQ. It may also grab rq locks and thus can't 4391 * be called under any BPF locks. 4392 * 4393 * Returns %true if a task has been consumed, %false if there isn't any task to 4394 * consume. 4395 */ 4396 __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) 4397 { 4398 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 4399 struct scx_dispatch_q *dsq; 4400 4401 if (!scx_kf_allowed(SCX_KF_DISPATCH)) 4402 return false; 4403 4404 flush_dispatch_buf(dspc->rq, dspc->rf); 4405 4406 dsq = find_non_local_dsq(dsq_id); 4407 if (unlikely(!dsq)) { 4408 scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); 4409 return false; 4410 } 4411 4412 if (consume_dispatch_q(dspc->rq, dspc->rf, dsq)) { 4413 /* 4414 * A successfully consumed task can be dequeued before it starts 4415 * running while the CPU is trying to migrate other dispatched 4416 * tasks. Bump nr_tasks to tell balance_scx() to retry on empty 4417 * local DSQ. 4418 */ 4419 dspc->nr_tasks++; 4420 return true; 4421 } else { 4422 return false; 4423 } 4424 } 4425 4426 __bpf_kfunc_end_defs(); 4427 4428 BTF_KFUNCS_START(scx_kfunc_ids_dispatch) 4429 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) 4430 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) 4431 BTF_ID_FLAGS(func, scx_bpf_consume) 4432 BTF_KFUNCS_END(scx_kfunc_ids_dispatch) 4433 4434 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { 4435 .owner = THIS_MODULE, 4436 .set = &scx_kfunc_ids_dispatch, 4437 }; 4438 4439 __bpf_kfunc_start_defs(); 4440 4441 /** 4442 * scx_bpf_dsq_nr_queued - Return the number of queued tasks 4443 * @dsq_id: id of the DSQ 4444 * 4445 * Return the number of tasks in the DSQ matching @dsq_id. If not found, 4446 * -%ENOENT is returned. 4447 */ 4448 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) 4449 { 4450 struct scx_dispatch_q *dsq; 4451 s32 ret; 4452 4453 preempt_disable(); 4454 4455 if (dsq_id == SCX_DSQ_LOCAL) { 4456 ret = READ_ONCE(this_rq()->scx.local_dsq.nr); 4457 goto out; 4458 } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 4459 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 4460 4461 if (ops_cpu_valid(cpu, NULL)) { 4462 ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); 4463 goto out; 4464 } 4465 } else { 4466 dsq = find_non_local_dsq(dsq_id); 4467 if (dsq) { 4468 ret = READ_ONCE(dsq->nr); 4469 goto out; 4470 } 4471 } 4472 ret = -ENOENT; 4473 out: 4474 preempt_enable(); 4475 return ret; 4476 } 4477 4478 /** 4479 * scx_bpf_destroy_dsq - Destroy a custom DSQ 4480 * @dsq_id: DSQ to destroy 4481 * 4482 * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with 4483 * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is 4484 * empty and no further tasks are dispatched to it. Ignored if called on a DSQ 4485 * which doesn't exist. Can be called from any online scx_ops operations. 
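 *
 * For example, an ops.exit() sketch which tears down a DSQ created in
 * ops.init(), assuming the BPF_STRUCT_OPS() convenience macro from the SCX
 * BPF-side tooling headers and an illustrative SHARED_DSQ id:
 *
 *	void BPF_STRUCT_OPS(example_exit, struct scx_exit_info *ei)
 *	{
 *		scx_bpf_destroy_dsq(SHARED_DSQ);
 *	}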
4486 */ 4487 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) 4488 { 4489 destroy_dsq(dsq_id); 4490 } 4491 4492 __bpf_kfunc_end_defs(); 4493 4494 static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, 4495 char *fmt, unsigned long long *data, u32 data__sz) 4496 { 4497 struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; 4498 s32 ret; 4499 4500 if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || 4501 (data__sz && !data)) { 4502 scx_ops_error("invalid data=%p and data__sz=%u", 4503 (void *)data, data__sz); 4504 return -EINVAL; 4505 } 4506 4507 ret = copy_from_kernel_nofault(data_buf, data, data__sz); 4508 if (ret < 0) { 4509 scx_ops_error("failed to read data fields (%d)", ret); 4510 return ret; 4511 } 4512 4513 ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, 4514 &bprintf_data); 4515 if (ret < 0) { 4516 scx_ops_error("format preparation failed (%d)", ret); 4517 return ret; 4518 } 4519 4520 ret = bstr_printf(line_buf, line_size, fmt, 4521 bprintf_data.bin_args); 4522 bpf_bprintf_cleanup(&bprintf_data); 4523 if (ret < 0) { 4524 scx_ops_error("(\"%s\", %p, %u) failed to format", 4525 fmt, data, data__sz); 4526 return ret; 4527 } 4528 4529 return ret; 4530 } 4531 4532 static s32 bstr_format(struct scx_bstr_buf *buf, 4533 char *fmt, unsigned long long *data, u32 data__sz) 4534 { 4535 return __bstr_format(buf->data, buf->line, sizeof(buf->line), 4536 fmt, data, data__sz); 4537 } 4538 4539 __bpf_kfunc_start_defs(); 4540 4541 /** 4542 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. 4543 * @exit_code: Exit value to pass to user space via struct scx_exit_info. 4544 * @fmt: error message format string 4545 * @data: format string parameters packaged using ___bpf_fill() macro 4546 * @data__sz: @data len, must end in '__sz' for the verifier 4547 * 4548 * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops 4549 * disabling. 4550 */ 4551 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, 4552 unsigned long long *data, u32 data__sz) 4553 { 4554 unsigned long flags; 4555 4556 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 4557 if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 4558 scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s", 4559 scx_exit_bstr_buf.line); 4560 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 4561 } 4562 4563 /** 4564 * scx_bpf_error_bstr - Indicate fatal error 4565 * @fmt: error message format string 4566 * @data: format string parameters packaged using ___bpf_fill() macro 4567 * @data__sz: @data len, must end in '__sz' for the verifier 4568 * 4569 * Indicate that the BPF scheduler encountered a fatal error and initiate ops 4570 * disabling. 
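 *
 * BPF schedulers normally reach this kfunc through a printf-like wrapper such
 * as the scx_bpf_error() macro provided by the SCX BPF-side tooling headers,
 * which packages the arguments into @data and @data__sz for the verifier. An
 * illustrative sketch from an ops callback (MAX_ENQUEUED and nr_enqueued are
 * made-up names):
 *
 *	if (nr_enqueued >= MAX_ENQUEUED) {
 *		scx_bpf_error("enqueue overflow (%u tasks)", nr_enqueued);
 *		return;
 *	}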
4571 */ 4572 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, 4573 u32 data__sz) 4574 { 4575 unsigned long flags; 4576 4577 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 4578 if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 4579 scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s", 4580 scx_exit_bstr_buf.line); 4581 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 4582 } 4583 4584 /** 4585 * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler 4586 * @fmt: format string 4587 * @data: format string parameters packaged using ___bpf_fill() macro 4588 * @data__sz: @data len, must end in '__sz' for the verifier 4589 * 4590 * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and 4591 * dump_task() to generate extra debug dump specific to the BPF scheduler. 4592 * 4593 * The extra dump may be multiple lines. A single line may be split over 4594 * multiple calls. The last line is automatically terminated. 4595 */ 4596 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, 4597 u32 data__sz) 4598 { 4599 struct scx_dump_data *dd = &scx_dump_data; 4600 struct scx_bstr_buf *buf = &dd->buf; 4601 s32 ret; 4602 4603 if (raw_smp_processor_id() != dd->cpu) { 4604 scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends"); 4605 return; 4606 } 4607 4608 /* append the formatted string to the line buf */ 4609 ret = __bstr_format(buf->data, buf->line + dd->cursor, 4610 sizeof(buf->line) - dd->cursor, fmt, data, data__sz); 4611 if (ret < 0) { 4612 dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", 4613 dd->prefix, fmt, data, data__sz, ret); 4614 return; 4615 } 4616 4617 dd->cursor += ret; 4618 dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); 4619 4620 if (!dd->cursor) 4621 return; 4622 4623 /* 4624 * If the line buf overflowed or ends in a newline, flush it into the 4625 * dump. This is to allow the caller to generate a single line over 4626 * multiple calls. As ops_dump_flush() can also handle multiple lines in 4627 * the line buf, the only case which can lead to an unexpected 4628 * truncation is when the caller keeps generating newlines in the middle 4629 * instead of the end consecutively. Don't do that. 4630 */ 4631 if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') 4632 ops_dump_flush(); 4633 } 4634 4635 /** 4636 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs 4637 * 4638 * All valid CPU IDs in the system are smaller than the returned value. 4639 */ 4640 __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) 4641 { 4642 return nr_cpu_ids; 4643 } 4644 4645 /** 4646 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask 4647 */ 4648 __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) 4649 { 4650 return cpu_possible_mask; 4651 } 4652 4653 /** 4654 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask 4655 */ 4656 __bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void) 4657 { 4658 return cpu_online_mask; 4659 } 4660 4661 /** 4662 * scx_bpf_put_cpumask - Release a possible/online cpumask 4663 * @cpumask: cpumask to release 4664 */ 4665 __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) 4666 { 4667 /* 4668 * Empty function body because we aren't actually acquiring or releasing 4669 * a reference to a global cpumask, which is read-only in the caller and 4670 * is never released. 
The acquire / release semantics here are just used 4671 * to make the cpumask a trusted pointer in the caller. 4672 */ 4673 } 4674 4675 /** 4676 * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking 4677 * per-CPU cpumask. 4678 * 4679 * Returns an empty cpumask if idle tracking is not enabled, or when running on a UP kernel. 4680 */ 4681 __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) 4682 { 4683 if (!static_branch_likely(&scx_builtin_idle_enabled)) { 4684 scx_ops_error("built-in idle tracking is disabled"); 4685 return cpu_none_mask; 4686 } 4687 4688 #ifdef CONFIG_SMP 4689 return idle_masks.cpu; 4690 #else 4691 return cpu_none_mask; 4692 #endif 4693 } 4694 4695 /** 4696 * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, 4697 * per-physical-core cpumask. Can be used to determine if an entire physical 4698 * core is free. 4699 * 4700 * Returns an empty cpumask if idle tracking is not enabled, or when running on a UP kernel. 4701 */ 4702 __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) 4703 { 4704 if (!static_branch_likely(&scx_builtin_idle_enabled)) { 4705 scx_ops_error("built-in idle tracking is disabled"); 4706 return cpu_none_mask; 4707 } 4708 4709 #ifdef CONFIG_SMP 4710 if (sched_smt_active()) 4711 return idle_masks.smt; 4712 else 4713 return idle_masks.cpu; 4714 #else 4715 return cpu_none_mask; 4716 #endif 4717 } 4718 4719 /** 4720 * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to 4721 * either the percpu, or SMT idle-tracking cpumask. 4722 */ 4723 __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) 4724 { 4725 /* 4726 * Empty function body because we aren't actually acquiring or releasing 4727 * a reference to a global idle cpumask, which is read-only in the 4728 * caller and is never released. The acquire / release semantics here 4729 * are just used to make the cpumask a trusted pointer in the caller. 4730 */ 4731 } 4732 4733 /** 4734 * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state 4735 * @cpu: cpu to test and clear idle for 4736 * 4737 * Returns %true if @cpu was idle and its idle state was successfully cleared. 4738 * %false otherwise. 4739 * 4740 * Unavailable if ops.update_idle() is implemented and 4741 * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 4742 */ 4743 __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) 4744 { 4745 if (!static_branch_likely(&scx_builtin_idle_enabled)) { 4746 scx_ops_error("built-in idle tracking is disabled"); 4747 return false; 4748 } 4749 4750 if (ops_cpu_valid(cpu, NULL)) 4751 return test_and_clear_cpu_idle(cpu); 4752 else 4753 return false; 4754 } 4755 4756 /** 4757 * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu 4758 * @cpus_allowed: Allowed cpumask 4759 * @flags: %SCX_PICK_IDLE_CPU_* flags 4760 * 4761 * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu 4762 * number on success. -%EBUSY if no matching cpu was found. 4763 * 4764 * Idle CPU tracking may race against CPU scheduling state transitions. For 4765 * example, this function may return -%EBUSY as CPUs are transitioning into the 4766 * idle state. If the caller then assumes that there will be dispatch events on 4767 * the CPUs as they were all busy, the scheduler may end up stalling with CPUs 4768 * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and 4769 * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch 4770 * event in the near future.
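 *
 * For example, an ops.select_cpu() sketch which prefers an idle CPU and
 * otherwise keeps the task on @prev_cpu, assuming the BPF_STRUCT_OPS()
 * convenience macro from the SCX BPF-side tooling headers (0 for @flags means
 * no %SCX_PICK_IDLE_CPU_* flags):
 *
 *	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
 *			   s32 prev_cpu, u64 wake_flags)
 *	{
 *		s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
 *
 *		if (cpu >= 0) {
 *			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 *			return cpu;
 *		}
 *		return prev_cpu;
 *	}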
4771 * 4772 * Unavailable if ops.update_idle() is implemented and 4773 * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 4774 */ 4775 __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, 4776 u64 flags) 4777 { 4778 if (!static_branch_likely(&scx_builtin_idle_enabled)) { 4779 scx_ops_error("built-in idle tracking is disabled"); 4780 return -EBUSY; 4781 } 4782 4783 return scx_pick_idle_cpu(cpus_allowed, flags); 4784 } 4785 4786 /** 4787 * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU 4788 * @cpus_allowed: Allowed cpumask 4789 * @flags: %SCX_PICK_IDLE_CPU_* flags 4790 * 4791 * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any 4792 * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu 4793 * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is 4794 * empty. 4795 * 4796 * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not 4797 * set, this function can't tell which CPUs are idle and will always pick any 4798 * CPU. 4799 */ 4800 __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, 4801 u64 flags) 4802 { 4803 s32 cpu; 4804 4805 if (static_branch_likely(&scx_builtin_idle_enabled)) { 4806 cpu = scx_pick_idle_cpu(cpus_allowed, flags); 4807 if (cpu >= 0) 4808 return cpu; 4809 } 4810 4811 cpu = cpumask_any_distribute(cpus_allowed); 4812 if (cpu < nr_cpu_ids) 4813 return cpu; 4814 else 4815 return -EBUSY; 4816 } 4817 4818 /** 4819 * scx_bpf_task_running - Is task currently running? 4820 * @p: task of interest 4821 */ 4822 __bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p) 4823 { 4824 return task_rq(p)->curr == p; 4825 } 4826 4827 /** 4828 * scx_bpf_task_cpu - CPU a task is currently associated with 4829 * @p: task of interest 4830 */ 4831 __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) 4832 { 4833 return task_cpu(p); 4834 } 4835 4836 __bpf_kfunc_end_defs(); 4837 4838 BTF_KFUNCS_START(scx_kfunc_ids_any) 4839 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) 4840 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) 4841 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) 4842 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) 4843 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) 4844 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) 4845 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) 4846 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) 4847 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) 4848 BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) 4849 BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) 4850 BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) 4851 BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) 4852 BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) 4853 BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) 4854 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) 4855 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) 4856 BTF_KFUNCS_END(scx_kfunc_ids_any) 4857 4858 static const struct btf_kfunc_id_set scx_kfunc_set_any = { 4859 .owner = THIS_MODULE, 4860 .set = &scx_kfunc_ids_any, 4861 }; 4862 4863 static int __init scx_init(void) 4864 { 4865 int ret; 4866 4867 /* 4868 * kfunc registration can't be done from init_sched_ext_class() as 4869 * register_btf_kfunc_id_set() needs most of the system to be up. 4870 * 4871 * Some kfuncs are context-sensitive and can only be called from 4872 * specific SCX ops. They are grouped into BTF sets accordingly. 
4873 * Unfortunately, BPF currently doesn't have a way of enforcing such 4874 * restrictions. Eventually, the verifier should be able to enforce 4875 * them. For now, register them the same and make each kfunc explicitly 4876 * check using scx_kf_allowed(). 4877 */ 4878 if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 4879 &scx_kfunc_set_sleepable)) || 4880 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 4881 &scx_kfunc_set_select_cpu)) || 4882 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 4883 &scx_kfunc_set_enqueue_dispatch)) || 4884 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 4885 &scx_kfunc_set_dispatch)) || 4886 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 4887 &scx_kfunc_set_any)) || 4888 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, 4889 &scx_kfunc_set_any)) || 4890 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 4891 &scx_kfunc_set_any))) { 4892 pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); 4893 return ret; 4894 } 4895 4896 ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); 4897 if (ret) { 4898 pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); 4899 return ret; 4900 } 4901 4902 scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); 4903 if (!scx_kset) { 4904 pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); 4905 return -ENOMEM; 4906 } 4907 4908 ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); 4909 if (ret < 0) { 4910 pr_err("sched_ext: Failed to add global attributes\n"); 4911 return ret; 4912 } 4913 4914 return 0; 4915 } 4916 __initcall(scx_init); 4917
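/*
 * Putting the pieces together, a minimal global-FIFO BPF scheduler built on
 * the kfuncs above looks roughly as follows. This is an illustrative sketch in
 * BPF C which assumes the SCX BPF-side tooling headers providing the
 * BPF_STRUCT_OPS() macros and kfunc declarations; complete, buildable
 * schedulers live in the kernel tree (e.g. under tools/sched_ext/).
 *
 *	#define SHARED_DSQ 0
 *
 *	s32 BPF_STRUCT_OPS_SLEEPABLE(fifo_init)
 *	{
 *		return scx_bpf_create_dsq(SHARED_DSQ, NUMA_NO_NODE);
 *	}
 *
 *	void BPF_STRUCT_OPS(fifo_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
 *	}
 *
 *	void BPF_STRUCT_OPS(fifo_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		scx_bpf_consume(SHARED_DSQ);
 *	}
 *
 *	SEC(".struct_ops.link")
 *	struct sched_ext_ops fifo_ops = {
 *		.enqueue	= (void *)fifo_enqueue,
 *		.dispatch	= (void *)fifo_dispatch,
 *		.init		= (void *)fifo_init,
 *		.name		= "fifo_example",
 *	};
 */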