/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#define SCX_OP_IDX(op)	(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))

enum scx_consts {
	SCX_DSP_DFL_MAX_BATCH	= 32,

	SCX_EXIT_BT_LEN		= 64,
	SCX_EXIT_MSG_LEN	= 1024,
};

enum scx_exit_kind {
	SCX_EXIT_NONE,
	SCX_EXIT_DONE,

	SCX_EXIT_UNREG = 64,	/* user-space initiated unregistration */
	SCX_EXIT_UNREG_BPF,	/* BPF-initiated unregistration */
	SCX_EXIT_UNREG_KERN,	/* kernel-initiated unregistration */

	SCX_EXIT_ERROR = 1024,	/* runtime error, error msg contains details */
	SCX_EXIT_ERROR_BPF,	/* ERROR but triggered through scx_bpf_error() */
};

/*
 * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
 * being disabled.
 */
struct scx_exit_info {
	/* %SCX_EXIT_* - broad category of the exit reason */
	enum scx_exit_kind	kind;

	/* exit code if gracefully exiting */
	s64			exit_code;

	/* textual representation of the above */
	const char		*reason;

	/* backtrace if exiting due to an error */
	unsigned long		*bt;
	u32			bt_len;

	/* informational message */
	char			*msg;
};

/* sched_ext_ops.flags */
enum scx_ops_flags {
	/*
	 * Keep built-in idle tracking even if ops.update_idle() is implemented.
	 */
	SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,

	/*
	 * By default, if there are no other tasks to run on the CPU, ext core
	 * keeps running the current task even after its slice expires. If this
	 * flag is specified, such tasks are passed to ops.enqueue() with
	 * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
	 */
	SCX_OPS_ENQ_LAST	= 1LLU << 1,

	/*
	 * An exiting task may schedule after PF_EXITING is set. In such cases,
	 * bpf_task_from_pid() may not be able to find the task and if the BPF
	 * scheduler depends on pid lookup for dispatching, the task will be
	 * lost leading to various issues including RCU grace period stalls.
	 *
	 * To mask this problem, by default, unhashed tasks are automatically
	 * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
	 * depend on pid lookups and wants to handle these tasks directly, the
	 * following flag can be used.
	 */
	SCX_OPS_ENQ_EXITING	= 1LLU << 2,

	/*
	 * If set, only tasks with policy set to SCHED_EXT are attached to
	 * sched_ext. If clear, SCHED_NORMAL tasks are also included.
	 */
	SCX_OPS_SWITCH_PARTIAL	= 1LLU << 3,

	SCX_OPS_ALL_FLAGS	= SCX_OPS_KEEP_BUILTIN_IDLE |
				  SCX_OPS_ENQ_LAST |
				  SCX_OPS_ENQ_EXITING |
				  SCX_OPS_SWITCH_PARTIAL,
};

/* argument container for ops.init_task() */
struct scx_init_task_args {
	/*
	 * Set if ops.init_task() is being invoked on the fork path, as opposed
	 * to the scheduler transition path.
	 */
	bool			fork;
};

/* argument container for ops.exit_task() */
struct scx_exit_task_args {
	/* Whether the task exited before running on sched_ext. */
	bool cancelled;
};

/**
 * struct sched_ext_ops - Operation table for BPF scheduler implementation
 *
 * Userland can implement an arbitrary scheduling policy by implementing and
 * loading operations in this table.
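 *
 * As an illustrative sketch only (not part of this file), a minimal
 * global-FIFO scheduler on the BPF side could look roughly as follows,
 * assuming the usual BPF_STRUCT_OPS() helpers and scx_bpf_*() kfuncs used by
 * the example schedulers; SHARED_DSQ is an arbitrary DSQ id picked by the
 * sketch:
 *
 *	s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_init)
 *	{
 *		return scx_bpf_create_dsq(SHARED_DSQ, -1);
 *	}
 *
 *	void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
 *	}
 *
 *	void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		scx_bpf_consume(SHARED_DSQ);
 *	}
 *
 *	SEC(".struct_ops.link")
 *	struct sched_ext_ops sketch_ops = {
 *		.init		= (void *)sketch_init,
 *		.enqueue	= (void *)sketch_enqueue,
 *		.dispatch	= (void *)sketch_dispatch,
 *		.name		= "sketch",
 *	};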
110 */ 111 struct sched_ext_ops { 112 /** 113 * select_cpu - Pick the target CPU for a task which is being woken up 114 * @p: task being woken up 115 * @prev_cpu: the cpu @p was on before sleeping 116 * @wake_flags: SCX_WAKE_* 117 * 118 * Decision made here isn't final. @p may be moved to any CPU while it 119 * is getting dispatched for execution later. However, as @p is not on 120 * the rq at this point, getting the eventual execution CPU right here 121 * saves a small bit of overhead down the line. 122 * 123 * If an idle CPU is returned, the CPU is kicked and will try to 124 * dispatch. While an explicit custom mechanism can be added, 125 * select_cpu() serves as the default way to wake up idle CPUs. 126 * 127 * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p 128 * is dispatched, the ops.enqueue() callback will be skipped. Finally, 129 * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the 130 * local DSQ of whatever CPU is returned by this callback. 131 */ 132 s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); 133 134 /** 135 * enqueue - Enqueue a task on the BPF scheduler 136 * @p: task being enqueued 137 * @enq_flags: %SCX_ENQ_* 138 * 139 * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch() 140 * or enqueue on the BPF scheduler. If not directly dispatched, the bpf 141 * scheduler owns @p and if it fails to dispatch @p, the task will 142 * stall. 143 * 144 * If @p was dispatched from ops.select_cpu(), this callback is 145 * skipped. 146 */ 147 void (*enqueue)(struct task_struct *p, u64 enq_flags); 148 149 /** 150 * dequeue - Remove a task from the BPF scheduler 151 * @p: task being dequeued 152 * @deq_flags: %SCX_DEQ_* 153 * 154 * Remove @p from the BPF scheduler. This is usually called to isolate 155 * the task while updating its scheduling properties (e.g. priority). 156 * 157 * The ext core keeps track of whether the BPF side owns a given task or 158 * not and can gracefully ignore spurious dispatches from BPF side, 159 * which makes it safe to not implement this method. However, depending 160 * on the scheduling logic, this can lead to confusing behaviors - e.g. 161 * scheduling position not being updated across a priority change. 162 */ 163 void (*dequeue)(struct task_struct *p, u64 deq_flags); 164 165 /** 166 * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs 167 * @cpu: CPU to dispatch tasks for 168 * @prev: previous task being switched out 169 * 170 * Called when a CPU's local dsq is empty. The operation should dispatch 171 * one or more tasks from the BPF scheduler into the DSQs using 172 * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using 173 * scx_bpf_consume(). 174 * 175 * The maximum number of times scx_bpf_dispatch() can be called without 176 * an intervening scx_bpf_consume() is specified by 177 * ops.dispatch_max_batch. See the comments on top of the two functions 178 * for more details. 179 * 180 * When not %NULL, @prev is an SCX task with its slice depleted. If 181 * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in 182 * @prev->scx.flags, it is not enqueued yet and will be enqueued after 183 * ops.dispatch() returns. To keep executing @prev, return without 184 * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. 
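	 *
	 * Extending the minimal sketch in the struct comment above, a dispatch
	 * implementation which pulls from a single shared DSQ and otherwise
	 * keeps a still-runnable @prev going could look roughly like this
	 * (SHARED_DSQ is again an assumed DSQ id, not something defined here):
	 *
	 *	void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
	 *	{
	 *		if (scx_bpf_consume(SHARED_DSQ))
	 *			return;
	 *		if (prev && (prev->scx.flags & SCX_TASK_QUEUED))
	 *			prev->scx.slice = SCX_SLICE_DFL;
	 *	}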
	 */
	void (*dispatch)(s32 cpu, struct task_struct *prev);

	/**
	 * tick - Periodic tick
	 * @p: task running currently
	 *
	 * This operation is called every 1/HZ seconds on CPUs which are
	 * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
	 * immediate dispatch cycle on the CPU.
	 */
	void (*tick)(struct task_struct *p);

	/**
	 * yield - Yield CPU
	 * @from: yielding task
	 * @to: optional yield target task
	 *
	 * If @to is NULL, @from is yielding the CPU to other runnable tasks.
	 * The BPF scheduler should ensure that other available tasks are
	 * dispatched before the yielding task. Return value is ignored in this
	 * case.
	 *
	 * If @to is not NULL, @from wants to yield the CPU to @to. If the BPF
	 * scheduler can implement the request, return %true; otherwise, %false.
	 */
	bool (*yield)(struct task_struct *from, struct task_struct *to);

	/**
	 * set_weight - Set task weight
	 * @p: task to set weight for
	 * @weight: new weight [1..10000]
	 *
	 * Update @p's weight to @weight.
	 */
	void (*set_weight)(struct task_struct *p, u32 weight);

	/**
	 * set_cpumask - Set CPU affinity
	 * @p: task to set CPU affinity for
	 * @cpumask: cpumask of cpus that @p can run on
	 *
	 * Update @p's CPU affinity to @cpumask.
	 */
	void (*set_cpumask)(struct task_struct *p,
			    const struct cpumask *cpumask);

	/**
	 * update_idle - Update the idle state of a CPU
	 * @cpu: CPU to update the idle state for
	 * @idle: whether entering or exiting the idle state
	 *
	 * This operation is called when @cpu enters or leaves the idle
	 * state. By default, implementing this operation disables the built-in
	 * idle CPU tracking and the following helpers become unavailable:
	 *
	 * - scx_bpf_select_cpu_dfl()
	 * - scx_bpf_test_and_clear_cpu_idle()
	 * - scx_bpf_pick_idle_cpu()
	 *
	 * The user also must implement ops.select_cpu() as the default
	 * implementation relies on scx_bpf_select_cpu_dfl().
	 *
	 * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
	 * tracking.
	 */
	void (*update_idle)(s32 cpu, bool idle);

	/**
	 * init_task - Initialize a task to run in a BPF scheduler
	 * @p: task to initialize for BPF scheduling
	 * @args: init arguments, see the struct definition
	 *
	 * Either we're loading a BPF scheduler or a new task is being forked.
	 * Initialize @p for BPF scheduling. This operation may block and can
	 * be used for allocations, and is called exactly once for a task.
	 *
	 * Return 0 for success, -errno for failure. An error return while
	 * loading will abort loading of the BPF scheduler. During a fork, it
	 * will abort that specific fork.
	 */
	s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);

	/**
	 * exit_task - Exit a previously-running task from the system
	 * @p: task to exit
	 *
	 * @p is exiting or the BPF scheduler is being unloaded. Perform any
	 * necessary cleanup for @p.
	 */
	void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);

	/**
	 * enable - Enable BPF scheduling for a task
	 * @p: task to enable BPF scheduling for
	 *
	 * Enable @p for BPF scheduling. enable() is called on @p any time it
	 * enters SCX, and is always paired with a matching disable().
283 */ 284 void (*enable)(struct task_struct *p); 285 286 /** 287 * disable - Disable BPF scheduling for a task 288 * @p: task to disable BPF scheduling for 289 * 290 * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. 291 * Disable BPF scheduling for @p. A disable() call is always matched 292 * with a prior enable() call. 293 */ 294 void (*disable)(struct task_struct *p); 295 296 /* 297 * All online ops must come before ops.init(). 298 */ 299 300 /** 301 * init - Initialize the BPF scheduler 302 */ 303 s32 (*init)(void); 304 305 /** 306 * exit - Clean up after the BPF scheduler 307 * @info: Exit info 308 */ 309 void (*exit)(struct scx_exit_info *info); 310 311 /** 312 * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch 313 */ 314 u32 dispatch_max_batch; 315 316 /** 317 * flags - %SCX_OPS_* flags 318 */ 319 u64 flags; 320 321 /** 322 * name - BPF scheduler's name 323 * 324 * Must be a non-zero valid BPF object name including only isalnum(), 325 * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the 326 * BPF scheduler is enabled. 327 */ 328 char name[SCX_OPS_NAME_LEN]; 329 }; 330 331 enum scx_opi { 332 SCX_OPI_BEGIN = 0, 333 SCX_OPI_NORMAL_BEGIN = 0, 334 SCX_OPI_NORMAL_END = SCX_OP_IDX(init), 335 SCX_OPI_END = SCX_OP_IDX(init), 336 }; 337 338 enum scx_wake_flags { 339 /* expose select WF_* flags as enums */ 340 SCX_WAKE_FORK = WF_FORK, 341 SCX_WAKE_TTWU = WF_TTWU, 342 SCX_WAKE_SYNC = WF_SYNC, 343 }; 344 345 enum scx_enq_flags { 346 /* expose select ENQUEUE_* flags as enums */ 347 SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, 348 SCX_ENQ_HEAD = ENQUEUE_HEAD, 349 350 /* high 32bits are SCX specific */ 351 352 /* 353 * The task being enqueued is the only task available for the cpu. By 354 * default, ext core keeps executing such tasks but when 355 * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the 356 * %SCX_ENQ_LAST flag set. 357 * 358 * If the BPF scheduler wants to continue executing the task, 359 * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. 360 * If the task gets queued on a different dsq or the BPF side, the BPF 361 * scheduler is responsible for triggering a follow-up scheduling event. 362 * Otherwise, Execution may stall. 363 */ 364 SCX_ENQ_LAST = 1LLU << 41, 365 366 /* high 8 bits are internal */ 367 __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, 368 369 SCX_ENQ_CLEAR_OPSS = 1LLU << 56, 370 }; 371 372 enum scx_deq_flags { 373 /* expose select DEQUEUE_* flags as enums */ 374 SCX_DEQ_SLEEP = DEQUEUE_SLEEP, 375 }; 376 377 enum scx_pick_idle_cpu_flags { 378 SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ 379 }; 380 381 enum scx_ops_enable_state { 382 SCX_OPS_PREPPING, 383 SCX_OPS_ENABLING, 384 SCX_OPS_ENABLED, 385 SCX_OPS_DISABLING, 386 SCX_OPS_DISABLED, 387 }; 388 389 static const char *scx_ops_enable_state_str[] = { 390 [SCX_OPS_PREPPING] = "prepping", 391 [SCX_OPS_ENABLING] = "enabling", 392 [SCX_OPS_ENABLED] = "enabled", 393 [SCX_OPS_DISABLING] = "disabling", 394 [SCX_OPS_DISABLED] = "disabled", 395 }; 396 397 /* 398 * sched_ext_entity->ops_state 399 * 400 * Used to track the task ownership between the SCX core and the BPF scheduler. 401 * State transitions look as follows: 402 * 403 * NONE -> QUEUEING -> QUEUED -> DISPATCHING 404 * ^ | | 405 * | v v 406 * \-------------------------------/ 407 * 408 * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call 409 * sites for explanations on the conditions being waited upon and why they are 410 * safe. 
Transitions out of them into NONE or QUEUED must store_release and the 411 * waiters should load_acquire. 412 * 413 * Tracking scx_ops_state enables sched_ext core to reliably determine whether 414 * any given task can be dispatched by the BPF scheduler at all times and thus 415 * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler 416 * to try to dispatch any task anytime regardless of its state as the SCX core 417 * can safely reject invalid dispatches. 418 */ 419 enum scx_ops_state { 420 SCX_OPSS_NONE, /* owned by the SCX core */ 421 SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ 422 SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ 423 SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ 424 425 /* 426 * QSEQ brands each QUEUED instance so that, when dispatch races 427 * dequeue/requeue, the dispatcher can tell whether it still has a claim 428 * on the task being dispatched. 429 * 430 * As some 32bit archs can't do 64bit store_release/load_acquire, 431 * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on 432 * 32bit machines. The dispatch race window QSEQ protects is very narrow 433 * and runs with IRQ disabled. 30 bits should be sufficient. 434 */ 435 SCX_OPSS_QSEQ_SHIFT = 2, 436 }; 437 438 /* Use macros to ensure that the type is unsigned long for the masks */ 439 #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) 440 #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) 441 442 /* 443 * During exit, a task may schedule after losing its PIDs. When disabling the 444 * BPF scheduler, we need to be able to iterate tasks in every state to 445 * guarantee system safety. Maintain a dedicated task list which contains every 446 * task between its fork and eventual free. 447 */ 448 static DEFINE_SPINLOCK(scx_tasks_lock); 449 static LIST_HEAD(scx_tasks); 450 451 /* ops enable/disable */ 452 static struct kthread_worker *scx_ops_helper; 453 static DEFINE_MUTEX(scx_ops_enable_mutex); 454 DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); 455 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 456 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); 457 static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); 458 static bool scx_switching_all; 459 DEFINE_STATIC_KEY_FALSE(__scx_switched_all); 460 461 static struct sched_ext_ops scx_ops; 462 static bool scx_warned_zero_slice; 463 464 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); 465 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); 466 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); 467 468 struct static_key_false scx_has_op[SCX_OPI_END] = 469 { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; 470 471 static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); 472 static struct scx_exit_info *scx_exit_info; 473 474 /* idle tracking */ 475 #ifdef CONFIG_SMP 476 #ifdef CONFIG_CPUMASK_OFFSTACK 477 #define CL_ALIGNED_IF_ONSTACK 478 #else 479 #define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp 480 #endif 481 482 static struct { 483 cpumask_var_t cpu; 484 cpumask_var_t smt; 485 } idle_masks CL_ALIGNED_IF_ONSTACK; 486 487 #endif /* CONFIG_SMP */ 488 489 /* 490 * Direct dispatch marker. 491 * 492 * Non-NULL values are used for direct dispatch from enqueue path. A valid 493 * pointer points to the task currently being enqueued. An ERR_PTR value is used 494 * to indicate that direct dispatch has already happened. 
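 *
 * As a sketch of the protocol implemented by do_enqueue_task() and
 * mark_direct_dispatch() below, a call to scx_bpf_dispatch() from
 * ops.enqueue() marks the direct dispatch roughly as follows:
 *
 *	*ddsp_taskp = p;
 *	ops.enqueue(p, enq_flags);
 *		-> scx_bpf_dispatch() -> mark_direct_dispatch():
 *			*ddsp_taskp = ERR_PTR(-ESRCH);
 *	*ddsp_taskp = NULL;
 *	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
 *		direct_dispatch(p, enq_flags);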
495 */ 496 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); 497 498 /* dispatch queues */ 499 static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; 500 501 static const struct rhashtable_params dsq_hash_params = { 502 .key_len = 8, 503 .key_offset = offsetof(struct scx_dispatch_q, id), 504 .head_offset = offsetof(struct scx_dispatch_q, hash_node), 505 }; 506 507 static struct rhashtable dsq_hash; 508 static LLIST_HEAD(dsqs_to_free); 509 510 /* dispatch buf */ 511 struct scx_dsp_buf_ent { 512 struct task_struct *task; 513 unsigned long qseq; 514 u64 dsq_id; 515 u64 enq_flags; 516 }; 517 518 static u32 scx_dsp_max_batch; 519 520 struct scx_dsp_ctx { 521 struct rq *rq; 522 struct rq_flags *rf; 523 u32 cursor; 524 u32 nr_tasks; 525 struct scx_dsp_buf_ent buf[]; 526 }; 527 528 static struct scx_dsp_ctx __percpu *scx_dsp_ctx; 529 530 /* string formatting from BPF */ 531 struct scx_bstr_buf { 532 u64 data[MAX_BPRINTF_VARARGS]; 533 char line[SCX_EXIT_MSG_LEN]; 534 }; 535 536 static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); 537 static struct scx_bstr_buf scx_exit_bstr_buf; 538 539 /* /sys/kernel/sched_ext interface */ 540 static struct kset *scx_kset; 541 static struct kobject *scx_root_kobj; 542 543 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, 544 s64 exit_code, 545 const char *fmt, ...); 546 547 #define scx_ops_error_kind(err, fmt, args...) \ 548 scx_ops_exit_kind((err), 0, fmt, ##args) 549 550 #define scx_ops_exit(code, fmt, args...) \ 551 scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args) 552 553 #define scx_ops_error(fmt, args...) \ 554 scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) 555 556 #define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) 557 558 /* if the highest set bit is N, return a mask with bits [N+1, 31] set */ 559 static u32 higher_bits(u32 flags) 560 { 561 return ~((1 << fls(flags)) - 1); 562 } 563 564 /* return the mask with only the highest bit set */ 565 static u32 highest_bit(u32 flags) 566 { 567 int bit = fls(flags); 568 return ((u64)1 << bit) >> 1; 569 } 570 571 /* 572 * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX 573 * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate 574 * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check 575 * whether it's running from an allowed context. 576 * 577 * @mask is constant, always inline to cull the mask calculations. 578 */ 579 static __always_inline void scx_kf_allow(u32 mask) 580 { 581 /* nesting is allowed only in increasing scx_kf_mask order */ 582 WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, 583 "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", 584 current->scx.kf_mask, mask); 585 current->scx.kf_mask |= mask; 586 barrier(); 587 } 588 589 static void scx_kf_disallow(u32 mask) 590 { 591 barrier(); 592 current->scx.kf_mask &= ~mask; 593 } 594 595 #define SCX_CALL_OP(mask, op, args...) \ 596 do { \ 597 if (mask) { \ 598 scx_kf_allow(mask); \ 599 scx_ops.op(args); \ 600 scx_kf_disallow(mask); \ 601 } else { \ 602 scx_ops.op(args); \ 603 } \ 604 } while (0) 605 606 #define SCX_CALL_OP_RET(mask, op, args...) 
\ 607 ({ \ 608 __typeof__(scx_ops.op(args)) __ret; \ 609 if (mask) { \ 610 scx_kf_allow(mask); \ 611 __ret = scx_ops.op(args); \ 612 scx_kf_disallow(mask); \ 613 } else { \ 614 __ret = scx_ops.op(args); \ 615 } \ 616 __ret; \ 617 }) 618 619 /* @mask is constant, always inline to cull unnecessary branches */ 620 static __always_inline bool scx_kf_allowed(u32 mask) 621 { 622 if (unlikely(!(current->scx.kf_mask & mask))) { 623 scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", 624 mask, current->scx.kf_mask); 625 return false; 626 } 627 628 if (unlikely((mask & SCX_KF_SLEEPABLE) && in_interrupt())) { 629 scx_ops_error("sleepable kfunc called from non-sleepable context"); 630 return false; 631 } 632 633 /* 634 * Enforce nesting boundaries. e.g. A kfunc which can be called from 635 * DISPATCH must not be called if we're running DEQUEUE which is nested 636 * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE 637 * boundary thanks to the above in_interrupt() check. 638 */ 639 if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && 640 (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { 641 scx_ops_error("dispatch kfunc called from a nested operation"); 642 return false; 643 } 644 645 return true; 646 } 647 648 649 /* 650 * SCX task iterator. 651 */ 652 struct scx_task_iter { 653 struct sched_ext_entity cursor; 654 struct task_struct *locked; 655 struct rq *rq; 656 struct rq_flags rf; 657 }; 658 659 /** 660 * scx_task_iter_init - Initialize a task iterator 661 * @iter: iterator to init 662 * 663 * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, 664 * @iter must eventually be exited with scx_task_iter_exit(). 665 * 666 * scx_tasks_lock may be released between this and the first next() call or 667 * between any two next() calls. If scx_tasks_lock is released between two 668 * next() calls, the caller is responsible for ensuring that the task being 669 * iterated remains accessible either through RCU read lock or obtaining a 670 * reference count. 671 * 672 * All tasks which existed when the iteration started are guaranteed to be 673 * visited as long as they still exist. 674 */ 675 static void scx_task_iter_init(struct scx_task_iter *iter) 676 { 677 lockdep_assert_held(&scx_tasks_lock); 678 679 iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; 680 list_add(&iter->cursor.tasks_node, &scx_tasks); 681 iter->locked = NULL; 682 } 683 684 /** 685 * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator 686 * @iter: iterator to unlock rq for 687 * 688 * If @iter is in the middle of a locked iteration, it may be locking the rq of 689 * the task currently being visited. Unlock the rq if so. This function can be 690 * safely called anytime during an iteration. 691 * 692 * Returns %true if the rq @iter was locking is unlocked. %false if @iter was 693 * not locking an rq. 694 */ 695 static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) 696 { 697 if (iter->locked) { 698 task_rq_unlock(iter->rq, iter->locked, &iter->rf); 699 iter->locked = NULL; 700 return true; 701 } else { 702 return false; 703 } 704 } 705 706 /** 707 * scx_task_iter_exit - Exit a task iterator 708 * @iter: iterator to exit 709 * 710 * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. 711 * If the iterator holds a task's rq lock, that rq lock is released. See 712 * scx_task_iter_init() for details. 
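 *
 * Taken together with scx_task_iter_init() and scx_task_iter_next_locked(),
 * a typical iteration follows this sketch:
 *
 *	struct scx_task_iter sti;
 *	struct task_struct *p;
 *
 *	spin_lock_irq(&scx_tasks_lock);
 *	scx_task_iter_init(&sti);
 *	while ((p = scx_task_iter_next_locked(&sti, false))) {
 *		... operate on @p with its rq locked ...
 *	}
 *	scx_task_iter_exit(&sti);
 *	spin_unlock_irq(&scx_tasks_lock);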
 */
static void scx_task_iter_exit(struct scx_task_iter *iter)
{
	lockdep_assert_held(&scx_tasks_lock);

	scx_task_iter_rq_unlock(iter);
	list_del_init(&iter->cursor.tasks_node);
}

/**
 * scx_task_iter_next - Next task
 * @iter: iterator to walk
 *
 * Visit the next task. See scx_task_iter_init() for details.
 */
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
	struct list_head *cursor = &iter->cursor.tasks_node;
	struct sched_ext_entity *pos;

	lockdep_assert_held(&scx_tasks_lock);

	list_for_each_entry(pos, cursor, tasks_node) {
		if (&pos->tasks_node == &scx_tasks)
			return NULL;
		if (!(pos->flags & SCX_TASK_CURSOR)) {
			list_move(cursor, &pos->tasks_node);
			return container_of(pos, struct task_struct, scx);
		}
	}

	/* can't happen, should always terminate at scx_tasks above */
	BUG();
}

/**
 * scx_task_iter_next_locked - Next non-idle task with its rq locked
 * @iter: iterator to walk
 * @include_dead: Whether we should include dead tasks in the iteration
 *
 * Visit the next non-idle task with its rq lock held. Allows callers to
 * specify whether they would like to filter out dead tasks. See
 * scx_task_iter_init() for details.
 */
static struct task_struct *
scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead)
{
	struct task_struct *p;
retry:
	scx_task_iter_rq_unlock(iter);

	while ((p = scx_task_iter_next(iter))) {
		/*
		 * is_idle_task() tests %PF_IDLE which may not be set for CPUs
		 * which haven't yet been onlined. Test sched_class directly.
		 */
		if (p->sched_class != &idle_sched_class)
			break;
	}
	if (!p)
		return NULL;

	iter->rq = task_rq_lock(p, &iter->rf);
	iter->locked = p;

	/*
	 * If we see %TASK_DEAD, @p already disabled preemption, is about to do
	 * the final __schedule(), won't ever need to be scheduled again and can
	 * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter
	 * the final __schedule() while we're locking its rq and thus will stay
	 * alive until the rq is unlocked.
	 */
	if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD)
		goto retry;

	return p;
}

static enum scx_ops_enable_state scx_ops_enable_state(void)
{
	return atomic_read(&scx_ops_enable_state_var);
}

static enum scx_ops_enable_state
scx_ops_set_enable_state(enum scx_ops_enable_state to)
{
	return atomic_xchg(&scx_ops_enable_state_var, to);
}

static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
					enum scx_ops_enable_state from)
{
	int from_v = from;

	return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
}

static bool scx_ops_bypassing(void)
{
	return unlikely(atomic_read(&scx_ops_bypass_depth));
}

/**
 * wait_ops_state - Busy-wait the specified ops state to end
 * @p: target task
 * @opss: state to wait the end of
 *
 * Busy-wait for @p to transition out of @opss. This can only be used when the
 * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also
 * has load_acquire semantics to ensure that the caller can see the updates made
 * in the enqueueing and dispatching paths.
824 */ 825 static void wait_ops_state(struct task_struct *p, unsigned long opss) 826 { 827 do { 828 cpu_relax(); 829 } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); 830 } 831 832 /** 833 * ops_cpu_valid - Verify a cpu number 834 * @cpu: cpu number which came from a BPF ops 835 * @where: extra information reported on error 836 * 837 * @cpu is a cpu number which came from the BPF scheduler and can be any value. 838 * Verify that it is in range and one of the possible cpus. If invalid, trigger 839 * an ops error. 840 */ 841 static bool ops_cpu_valid(s32 cpu, const char *where) 842 { 843 if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) { 844 return true; 845 } else { 846 scx_ops_error("invalid CPU %d%s%s", cpu, 847 where ? " " : "", where ?: ""); 848 return false; 849 } 850 } 851 852 /** 853 * ops_sanitize_err - Sanitize a -errno value 854 * @ops_name: operation to blame on failure 855 * @err: -errno value to sanitize 856 * 857 * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return 858 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can 859 * cause misbehaviors. For an example, a large negative return from 860 * ops.init_task() triggers an oops when passed up the call chain because the 861 * value fails IS_ERR() test after being encoded with ERR_PTR() and then is 862 * handled as a pointer. 863 */ 864 static int ops_sanitize_err(const char *ops_name, s32 err) 865 { 866 if (err < 0 && err >= -MAX_ERRNO) 867 return err; 868 869 scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); 870 return -EPROTO; 871 } 872 873 static void update_curr_scx(struct rq *rq) 874 { 875 struct task_struct *curr = rq->curr; 876 u64 now = rq_clock_task(rq); 877 u64 delta_exec; 878 879 if (time_before_eq64(now, curr->se.exec_start)) 880 return; 881 882 delta_exec = now - curr->se.exec_start; 883 curr->se.exec_start = now; 884 curr->se.sum_exec_runtime += delta_exec; 885 account_group_exec_runtime(curr, delta_exec); 886 cgroup_account_cputime(curr, delta_exec); 887 888 curr->scx.slice -= min(curr->scx.slice, delta_exec); 889 } 890 891 static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) 892 { 893 /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ 894 WRITE_ONCE(dsq->nr, dsq->nr + delta); 895 } 896 897 static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, 898 u64 enq_flags) 899 { 900 bool is_local = dsq->id == SCX_DSQ_LOCAL; 901 902 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node)); 903 904 if (!is_local) { 905 raw_spin_lock(&dsq->lock); 906 if (unlikely(dsq->id == SCX_DSQ_INVALID)) { 907 scx_ops_error("attempting to dispatch to a destroyed dsq"); 908 /* fall back to the global dsq */ 909 raw_spin_unlock(&dsq->lock); 910 dsq = &scx_dsq_global; 911 raw_spin_lock(&dsq->lock); 912 } 913 } 914 915 if (enq_flags & SCX_ENQ_HEAD) 916 list_add(&p->scx.dsq_node, &dsq->list); 917 else 918 list_add_tail(&p->scx.dsq_node, &dsq->list); 919 920 dsq_mod_nr(dsq, 1); 921 p->scx.dsq = dsq; 922 923 /* 924 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the 925 * direct dispatch path, but we clear them here because the direct 926 * dispatch verdict may be overridden on the enqueue path during e.g. 927 * bypass. 928 */ 929 p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; 930 p->scx.ddsp_enq_flags = 0; 931 932 /* 933 * We're transitioning out of QUEUEING or DISPATCHING. store_release to 934 * match waiters' load_acquire. 
935 */ 936 if (enq_flags & SCX_ENQ_CLEAR_OPSS) 937 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 938 939 if (is_local) { 940 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 941 942 if (sched_class_above(&ext_sched_class, rq->curr->sched_class)) 943 resched_curr(rq); 944 } else { 945 raw_spin_unlock(&dsq->lock); 946 } 947 } 948 949 static void dispatch_dequeue(struct rq *rq, struct task_struct *p) 950 { 951 struct scx_dispatch_q *dsq = p->scx.dsq; 952 bool is_local = dsq == &rq->scx.local_dsq; 953 954 if (!dsq) { 955 WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); 956 /* 957 * When dispatching directly from the BPF scheduler to a local 958 * DSQ, the task isn't associated with any DSQ but 959 * @p->scx.holding_cpu may be set under the protection of 960 * %SCX_OPSS_DISPATCHING. 961 */ 962 if (p->scx.holding_cpu >= 0) 963 p->scx.holding_cpu = -1; 964 return; 965 } 966 967 if (!is_local) 968 raw_spin_lock(&dsq->lock); 969 970 /* 971 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_node 972 * can't change underneath us. 973 */ 974 if (p->scx.holding_cpu < 0) { 975 /* @p must still be on @dsq, dequeue */ 976 WARN_ON_ONCE(list_empty(&p->scx.dsq_node)); 977 list_del_init(&p->scx.dsq_node); 978 dsq_mod_nr(dsq, -1); 979 } else { 980 /* 981 * We're racing against dispatch_to_local_dsq() which already 982 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the 983 * holding_cpu which tells dispatch_to_local_dsq() that it lost 984 * the race. 985 */ 986 WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); 987 p->scx.holding_cpu = -1; 988 } 989 p->scx.dsq = NULL; 990 991 if (!is_local) 992 raw_spin_unlock(&dsq->lock); 993 } 994 995 static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) 996 { 997 return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); 998 } 999 1000 static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) 1001 { 1002 lockdep_assert(rcu_read_lock_any_held()); 1003 1004 if (dsq_id == SCX_DSQ_GLOBAL) 1005 return &scx_dsq_global; 1006 else 1007 return find_user_dsq(dsq_id); 1008 } 1009 1010 static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, 1011 struct task_struct *p) 1012 { 1013 struct scx_dispatch_q *dsq; 1014 1015 if (dsq_id == SCX_DSQ_LOCAL) 1016 return &rq->scx.local_dsq; 1017 1018 dsq = find_non_local_dsq(dsq_id); 1019 if (unlikely(!dsq)) { 1020 scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", 1021 dsq_id, p->comm, p->pid); 1022 return &scx_dsq_global; 1023 } 1024 1025 return dsq; 1026 } 1027 1028 static void mark_direct_dispatch(struct task_struct *ddsp_task, 1029 struct task_struct *p, u64 dsq_id, 1030 u64 enq_flags) 1031 { 1032 /* 1033 * Mark that dispatch already happened from ops.select_cpu() or 1034 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value 1035 * which can never match a valid task pointer. 1036 */ 1037 __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); 1038 1039 /* @p must match the task on the enqueue path */ 1040 if (unlikely(p != ddsp_task)) { 1041 if (IS_ERR(ddsp_task)) 1042 scx_ops_error("%s[%d] already direct-dispatched", 1043 p->comm, p->pid); 1044 else 1045 scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", 1046 ddsp_task->comm, ddsp_task->pid, 1047 p->comm, p->pid); 1048 return; 1049 } 1050 1051 /* 1052 * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because 1053 * dispatching to the local DSQ of a different CPU requires unlocking 1054 * the current rq which isn't allowed in the enqueue path. 
	 * Use ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL.
	 */
	if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) {
		scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch");
		return;
	}

	WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
	WARN_ON_ONCE(p->scx.ddsp_enq_flags);

	p->scx.ddsp_dsq_id = dsq_id;
	p->scx.ddsp_enq_flags = enq_flags;
}

static void direct_dispatch(struct task_struct *p, u64 enq_flags)
{
	struct scx_dispatch_q *dsq;

	enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
	dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p);
	dispatch_enqueue(dsq, p, enq_flags);
}

static bool scx_rq_online(struct rq *rq)
{
#ifdef CONFIG_SMP
	return likely(rq->online);
#else
	return true;
#endif
}

static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
			    int sticky_cpu)
{
	struct task_struct **ddsp_taskp;
	unsigned long qseq;

	WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));

	/* rq migration */
	if (sticky_cpu == cpu_of(rq))
		goto local_norefill;

	if (!scx_rq_online(rq))
		goto local;

	if (scx_ops_bypassing()) {
		if (enq_flags & SCX_ENQ_LAST)
			goto local;
		else
			goto global;
	}

	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
		goto direct;

	/* see %SCX_OPS_ENQ_EXITING */
	if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
	    unlikely(p->flags & PF_EXITING))
		goto local;

	/* see %SCX_OPS_ENQ_LAST */
	if (!static_branch_unlikely(&scx_ops_enq_last) &&
	    (enq_flags & SCX_ENQ_LAST))
		goto local;

	if (!SCX_HAS_OP(enqueue))
		goto global;

	/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
	qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;

	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);

	ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
	WARN_ON_ONCE(*ddsp_taskp);
	*ddsp_taskp = p;

	SCX_CALL_OP(SCX_KF_ENQUEUE, enqueue, p, enq_flags);

	*ddsp_taskp = NULL;
	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
		goto direct;

	/*
	 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or
	 * dequeue may be waiting. The store_release matches their load_acquire.
	 */
	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
	return;

direct:
	direct_dispatch(p, enq_flags);
	return;

local:
	p->scx.slice = SCX_SLICE_DFL;
local_norefill:
	dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
	return;

global:
	p->scx.slice = SCX_SLICE_DFL;
	dispatch_enqueue(&scx_dsq_global, p, enq_flags);
}

static bool task_runnable(const struct task_struct *p)
{
	return !list_empty(&p->scx.runnable_node);
}

static void set_task_runnable(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

	/*
	 * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
	 * appended to the runnable_list.
1175 */ 1176 list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); 1177 } 1178 1179 static void clr_task_runnable(struct task_struct *p) 1180 { 1181 list_del_init(&p->scx.runnable_node); 1182 } 1183 1184 static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) 1185 { 1186 int sticky_cpu = p->scx.sticky_cpu; 1187 1188 enq_flags |= rq->scx.extra_enq_flags; 1189 1190 if (sticky_cpu >= 0) 1191 p->scx.sticky_cpu = -1; 1192 1193 /* 1194 * Restoring a running task will be immediately followed by 1195 * set_next_task_scx() which expects the task to not be on the BPF 1196 * scheduler as tasks can only start running through local DSQs. Force 1197 * direct-dispatch into the local DSQ by setting the sticky_cpu. 1198 */ 1199 if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) 1200 sticky_cpu = cpu_of(rq); 1201 1202 if (p->scx.flags & SCX_TASK_QUEUED) { 1203 WARN_ON_ONCE(!task_runnable(p)); 1204 return; 1205 } 1206 1207 set_task_runnable(rq, p); 1208 p->scx.flags |= SCX_TASK_QUEUED; 1209 rq->scx.nr_running++; 1210 add_nr_running(rq, 1); 1211 1212 do_enqueue_task(rq, p, enq_flags, sticky_cpu); 1213 } 1214 1215 static void ops_dequeue(struct task_struct *p, u64 deq_flags) 1216 { 1217 unsigned long opss; 1218 1219 clr_task_runnable(p); 1220 1221 /* acquire ensures that we see the preceding updates on QUEUED */ 1222 opss = atomic_long_read_acquire(&p->scx.ops_state); 1223 1224 switch (opss & SCX_OPSS_STATE_MASK) { 1225 case SCX_OPSS_NONE: 1226 break; 1227 case SCX_OPSS_QUEUEING: 1228 /* 1229 * QUEUEING is started and finished while holding @p's rq lock. 1230 * As we're holding the rq lock now, we shouldn't see QUEUEING. 1231 */ 1232 BUG(); 1233 case SCX_OPSS_QUEUED: 1234 if (SCX_HAS_OP(dequeue)) 1235 SCX_CALL_OP(SCX_KF_REST, dequeue, p, deq_flags); 1236 1237 if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 1238 SCX_OPSS_NONE)) 1239 break; 1240 fallthrough; 1241 case SCX_OPSS_DISPATCHING: 1242 /* 1243 * If @p is being dispatched from the BPF scheduler to a DSQ, 1244 * wait for the transfer to complete so that @p doesn't get 1245 * added to its DSQ after dequeueing is complete. 1246 * 1247 * As we're waiting on DISPATCHING with the rq locked, the 1248 * dispatching side shouldn't try to lock the rq while 1249 * DISPATCHING is set. See dispatch_to_local_dsq(). 1250 * 1251 * DISPATCHING shouldn't have qseq set and control can reach 1252 * here with NONE @opss from the above QUEUED case block. 1253 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 
1254 */ 1255 wait_ops_state(p, SCX_OPSS_DISPATCHING); 1256 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 1257 break; 1258 } 1259 } 1260 1261 static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) 1262 { 1263 if (!(p->scx.flags & SCX_TASK_QUEUED)) { 1264 WARN_ON_ONCE(task_runnable(p)); 1265 return; 1266 } 1267 1268 ops_dequeue(p, deq_flags); 1269 1270 if (deq_flags & SCX_DEQ_SLEEP) 1271 p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; 1272 else 1273 p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; 1274 1275 p->scx.flags &= ~SCX_TASK_QUEUED; 1276 rq->scx.nr_running--; 1277 sub_nr_running(rq, 1); 1278 1279 dispatch_dequeue(rq, p); 1280 } 1281 1282 static void yield_task_scx(struct rq *rq) 1283 { 1284 struct task_struct *p = rq->curr; 1285 1286 if (SCX_HAS_OP(yield)) 1287 SCX_CALL_OP_RET(SCX_KF_REST, yield, p, NULL); 1288 else 1289 p->scx.slice = 0; 1290 } 1291 1292 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) 1293 { 1294 struct task_struct *from = rq->curr; 1295 1296 if (SCX_HAS_OP(yield)) 1297 return SCX_CALL_OP_RET(SCX_KF_REST, yield, from, to); 1298 else 1299 return false; 1300 } 1301 1302 #ifdef CONFIG_SMP 1303 /** 1304 * move_task_to_local_dsq - Move a task from a different rq to a local DSQ 1305 * @rq: rq to move the task into, currently locked 1306 * @p: task to move 1307 * @enq_flags: %SCX_ENQ_* 1308 * 1309 * Move @p which is currently on a different rq to @rq's local DSQ. The caller 1310 * must: 1311 * 1312 * 1. Start with exclusive access to @p either through its DSQ lock or 1313 * %SCX_OPSS_DISPATCHING flag. 1314 * 1315 * 2. Set @p->scx.holding_cpu to raw_smp_processor_id(). 1316 * 1317 * 3. Remember task_rq(@p). Release the exclusive access so that we don't 1318 * deadlock with dequeue. 1319 * 1320 * 4. Lock @rq and the task_rq from #3. 1321 * 1322 * 5. Call this function. 1323 * 1324 * Returns %true if @p was successfully moved. %false after racing dequeue and 1325 * losing. 1326 */ 1327 static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, 1328 u64 enq_flags) 1329 { 1330 struct rq *task_rq; 1331 1332 lockdep_assert_rq_held(rq); 1333 1334 /* 1335 * If dequeue got to @p while we were trying to lock both rq's, it'd 1336 * have cleared @p->scx.holding_cpu to -1. While other cpus may have 1337 * updated it to different values afterwards, as this operation can't be 1338 * preempted or recurse, @p->scx.holding_cpu can never become 1339 * raw_smp_processor_id() again before we're done. Thus, we can tell 1340 * whether we lost to dequeue by testing whether @p->scx.holding_cpu is 1341 * still raw_smp_processor_id(). 1342 * 1343 * See dispatch_dequeue() for the counterpart. 1344 */ 1345 if (unlikely(p->scx.holding_cpu != raw_smp_processor_id())) 1346 return false; 1347 1348 /* @p->rq couldn't have changed if we're still the holding cpu */ 1349 task_rq = task_rq(p); 1350 lockdep_assert_rq_held(task_rq); 1351 1352 WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr)); 1353 deactivate_task(task_rq, p, 0); 1354 set_task_cpu(p, cpu_of(rq)); 1355 p->scx.sticky_cpu = cpu_of(rq); 1356 1357 /* 1358 * We want to pass scx-specific enq_flags but activate_task() will 1359 * truncate the upper 32 bit. As we own @rq, we can pass them through 1360 * @rq->scx.extra_enq_flags instead. 
1361 */ 1362 WARN_ON_ONCE(rq->scx.extra_enq_flags); 1363 rq->scx.extra_enq_flags = enq_flags; 1364 activate_task(rq, p, 0); 1365 rq->scx.extra_enq_flags = 0; 1366 1367 return true; 1368 } 1369 1370 /** 1371 * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked 1372 * @rq: current rq which is locked 1373 * @rf: rq_flags to use when unlocking @rq 1374 * @src_rq: rq to move task from 1375 * @dst_rq: rq to move task to 1376 * 1377 * We're holding @rq lock and trying to dispatch a task from @src_rq to 1378 * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether 1379 * @rq stays locked isn't important as long as the state is restored after 1380 * dispatch_to_local_dsq_unlock(). 1381 */ 1382 static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, 1383 struct rq *src_rq, struct rq *dst_rq) 1384 { 1385 rq_unpin_lock(rq, rf); 1386 1387 if (src_rq == dst_rq) { 1388 raw_spin_rq_unlock(rq); 1389 raw_spin_rq_lock(dst_rq); 1390 } else if (rq == src_rq) { 1391 double_lock_balance(rq, dst_rq); 1392 rq_repin_lock(rq, rf); 1393 } else if (rq == dst_rq) { 1394 double_lock_balance(rq, src_rq); 1395 rq_repin_lock(rq, rf); 1396 } else { 1397 raw_spin_rq_unlock(rq); 1398 double_rq_lock(src_rq, dst_rq); 1399 } 1400 } 1401 1402 /** 1403 * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock() 1404 * @rq: current rq which is locked 1405 * @rf: rq_flags to use when unlocking @rq 1406 * @src_rq: rq to move task from 1407 * @dst_rq: rq to move task to 1408 * 1409 * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. 1410 */ 1411 static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, 1412 struct rq *src_rq, struct rq *dst_rq) 1413 { 1414 if (src_rq == dst_rq) { 1415 raw_spin_rq_unlock(dst_rq); 1416 raw_spin_rq_lock(rq); 1417 rq_repin_lock(rq, rf); 1418 } else if (rq == src_rq) { 1419 double_unlock_balance(rq, dst_rq); 1420 } else if (rq == dst_rq) { 1421 double_unlock_balance(rq, src_rq); 1422 } else { 1423 double_rq_unlock(src_rq, dst_rq); 1424 raw_spin_rq_lock(rq); 1425 rq_repin_lock(rq, rf); 1426 } 1427 } 1428 #endif /* CONFIG_SMP */ 1429 1430 static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq, 1431 struct task_struct *p) 1432 { 1433 lockdep_assert_held(&dsq->lock); /* released on return */ 1434 1435 /* @dsq is locked and @p is on this rq */ 1436 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 1437 list_move_tail(&p->scx.dsq_node, &rq->scx.local_dsq.list); 1438 dsq_mod_nr(dsq, -1); 1439 dsq_mod_nr(&rq->scx.local_dsq, 1); 1440 p->scx.dsq = &rq->scx.local_dsq; 1441 raw_spin_unlock(&dsq->lock); 1442 } 1443 1444 #ifdef CONFIG_SMP 1445 /* 1446 * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p 1447 * can be pulled to @rq. 1448 */ 1449 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) 1450 { 1451 int cpu = cpu_of(rq); 1452 1453 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 1454 return false; 1455 if (unlikely(is_migration_disabled(p))) 1456 return false; 1457 if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p))) 1458 return false; 1459 if (!scx_rq_online(rq)) 1460 return false; 1461 return true; 1462 } 1463 1464 static bool consume_remote_task(struct rq *rq, struct rq_flags *rf, 1465 struct scx_dispatch_q *dsq, 1466 struct task_struct *p, struct rq *task_rq) 1467 { 1468 bool moved = false; 1469 1470 lockdep_assert_held(&dsq->lock); /* released on return */ 1471 1472 /* 1473 * @dsq is locked and @p is on a remote rq. 
@p is currently protected by 1474 * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab 1475 * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the 1476 * rq lock or fail, do a little dancing from our side. See 1477 * move_task_to_local_dsq(). 1478 */ 1479 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 1480 list_del_init(&p->scx.dsq_node); 1481 dsq_mod_nr(dsq, -1); 1482 p->scx.holding_cpu = raw_smp_processor_id(); 1483 raw_spin_unlock(&dsq->lock); 1484 1485 rq_unpin_lock(rq, rf); 1486 double_lock_balance(rq, task_rq); 1487 rq_repin_lock(rq, rf); 1488 1489 moved = move_task_to_local_dsq(rq, p, 0); 1490 1491 double_unlock_balance(rq, task_rq); 1492 1493 return moved; 1494 } 1495 #else /* CONFIG_SMP */ 1496 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) { return false; } 1497 static bool consume_remote_task(struct rq *rq, struct rq_flags *rf, 1498 struct scx_dispatch_q *dsq, 1499 struct task_struct *p, struct rq *task_rq) { return false; } 1500 #endif /* CONFIG_SMP */ 1501 1502 static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, 1503 struct scx_dispatch_q *dsq) 1504 { 1505 struct task_struct *p; 1506 retry: 1507 if (list_empty(&dsq->list)) 1508 return false; 1509 1510 raw_spin_lock(&dsq->lock); 1511 1512 list_for_each_entry(p, &dsq->list, scx.dsq_node) { 1513 struct rq *task_rq = task_rq(p); 1514 1515 if (rq == task_rq) { 1516 consume_local_task(rq, dsq, p); 1517 return true; 1518 } 1519 1520 if (task_can_run_on_remote_rq(p, rq)) { 1521 if (likely(consume_remote_task(rq, rf, dsq, p, task_rq))) 1522 return true; 1523 goto retry; 1524 } 1525 } 1526 1527 raw_spin_unlock(&dsq->lock); 1528 return false; 1529 } 1530 1531 enum dispatch_to_local_dsq_ret { 1532 DTL_DISPATCHED, /* successfully dispatched */ 1533 DTL_LOST, /* lost race to dequeue */ 1534 DTL_NOT_LOCAL, /* destination is not a local DSQ */ 1535 DTL_INVALID, /* invalid local dsq_id */ 1536 }; 1537 1538 /** 1539 * dispatch_to_local_dsq - Dispatch a task to a local dsq 1540 * @rq: current rq which is locked 1541 * @rf: rq_flags to use when unlocking @rq 1542 * @dsq_id: destination dsq ID 1543 * @p: task to dispatch 1544 * @enq_flags: %SCX_ENQ_* 1545 * 1546 * We're holding @rq lock and want to dispatch @p to the local DSQ identified by 1547 * @dsq_id. This function performs all the synchronization dancing needed 1548 * because local DSQs are protected with rq locks. 1549 * 1550 * The caller must have exclusive ownership of @p (e.g. through 1551 * %SCX_OPSS_DISPATCHING). 1552 */ 1553 static enum dispatch_to_local_dsq_ret 1554 dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, 1555 struct task_struct *p, u64 enq_flags) 1556 { 1557 struct rq *src_rq = task_rq(p); 1558 struct rq *dst_rq; 1559 1560 /* 1561 * We're synchronized against dequeue through DISPATCHING. As @p can't 1562 * be dequeued, its task_rq and cpus_allowed are stable too. 
1563 */ 1564 if (dsq_id == SCX_DSQ_LOCAL) { 1565 dst_rq = rq; 1566 } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 1567 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 1568 1569 if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) 1570 return DTL_INVALID; 1571 dst_rq = cpu_rq(cpu); 1572 } else { 1573 return DTL_NOT_LOCAL; 1574 } 1575 1576 /* if dispatching to @rq that @p is already on, no lock dancing needed */ 1577 if (rq == src_rq && rq == dst_rq) { 1578 dispatch_enqueue(&dst_rq->scx.local_dsq, p, 1579 enq_flags | SCX_ENQ_CLEAR_OPSS); 1580 return DTL_DISPATCHED; 1581 } 1582 1583 #ifdef CONFIG_SMP 1584 if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { 1585 struct rq *locked_dst_rq = dst_rq; 1586 bool dsp; 1587 1588 /* 1589 * @p is on a possibly remote @src_rq which we need to lock to 1590 * move the task. If dequeue is in progress, it'd be locking 1591 * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq 1592 * lock while holding DISPATCHING. 1593 * 1594 * As DISPATCHING guarantees that @p is wholly ours, we can 1595 * pretend that we're moving from a DSQ and use the same 1596 * mechanism - mark the task under transfer with holding_cpu, 1597 * release DISPATCHING and then follow the same protocol. 1598 */ 1599 p->scx.holding_cpu = raw_smp_processor_id(); 1600 1601 /* store_release ensures that dequeue sees the above */ 1602 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1603 1604 dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq); 1605 1606 /* 1607 * We don't require the BPF scheduler to avoid dispatching to 1608 * offline CPUs mostly for convenience but also because CPUs can 1609 * go offline between scx_bpf_dispatch() calls and here. If @p 1610 * is destined to an offline CPU, queue it on its current CPU 1611 * instead, which should always be safe. As this is an allowed 1612 * behavior, don't trigger an ops error. 1613 */ 1614 if (!scx_rq_online(dst_rq)) 1615 dst_rq = src_rq; 1616 1617 if (src_rq == dst_rq) { 1618 /* 1619 * As @p is staying on the same rq, there's no need to 1620 * go through the full deactivate/activate cycle. 1621 * Optimize by abbreviating the operations in 1622 * move_task_to_local_dsq(). 1623 */ 1624 dsp = p->scx.holding_cpu == raw_smp_processor_id(); 1625 if (likely(dsp)) { 1626 p->scx.holding_cpu = -1; 1627 dispatch_enqueue(&dst_rq->scx.local_dsq, p, 1628 enq_flags); 1629 } 1630 } else { 1631 dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); 1632 } 1633 1634 /* if the destination CPU is idle, wake it up */ 1635 if (dsp && sched_class_above(p->sched_class, 1636 dst_rq->curr->sched_class)) 1637 resched_curr(dst_rq); 1638 1639 dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq); 1640 1641 return dsp ? DTL_DISPATCHED : DTL_LOST; 1642 } 1643 #endif /* CONFIG_SMP */ 1644 1645 scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", 1646 cpu_of(dst_rq), p->comm, p->pid); 1647 return DTL_INVALID; 1648 } 1649 1650 /** 1651 * finish_dispatch - Asynchronously finish dispatching a task 1652 * @rq: current rq which is locked 1653 * @rf: rq_flags to use when unlocking @rq 1654 * @p: task to finish dispatching 1655 * @qseq_at_dispatch: qseq when @p started getting dispatched 1656 * @dsq_id: destination DSQ ID 1657 * @enq_flags: %SCX_ENQ_* 1658 * 1659 * Dispatching to local DSQs may need to wait for queueing to complete or 1660 * require rq lock dancing. As we don't wanna do either while inside 1661 * ops.dispatch() to avoid locking order inversion, we split dispatching into 1662 * two parts. 
scx_bpf_dispatch() which is called by ops.dispatch() records the 1663 * task and its qseq. Once ops.dispatch() returns, this function is called to 1664 * finish up. 1665 * 1666 * There is no guarantee that @p is still valid for dispatching or even that it 1667 * was valid in the first place. Make sure that the task is still owned by the 1668 * BPF scheduler and claim the ownership before dispatching. 1669 */ 1670 static void finish_dispatch(struct rq *rq, struct rq_flags *rf, 1671 struct task_struct *p, 1672 unsigned long qseq_at_dispatch, 1673 u64 dsq_id, u64 enq_flags) 1674 { 1675 struct scx_dispatch_q *dsq; 1676 unsigned long opss; 1677 1678 retry: 1679 /* 1680 * No need for _acquire here. @p is accessed only after a successful 1681 * try_cmpxchg to DISPATCHING. 1682 */ 1683 opss = atomic_long_read(&p->scx.ops_state); 1684 1685 switch (opss & SCX_OPSS_STATE_MASK) { 1686 case SCX_OPSS_DISPATCHING: 1687 case SCX_OPSS_NONE: 1688 /* someone else already got to it */ 1689 return; 1690 case SCX_OPSS_QUEUED: 1691 /* 1692 * If qseq doesn't match, @p has gone through at least one 1693 * dispatch/dequeue and re-enqueue cycle between 1694 * scx_bpf_dispatch() and here and we have no claim on it. 1695 */ 1696 if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) 1697 return; 1698 1699 /* 1700 * While we know @p is accessible, we don't yet have a claim on 1701 * it - the BPF scheduler is allowed to dispatch tasks 1702 * spuriously and there can be a racing dequeue attempt. Let's 1703 * claim @p by atomically transitioning it from QUEUED to 1704 * DISPATCHING. 1705 */ 1706 if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 1707 SCX_OPSS_DISPATCHING))) 1708 break; 1709 goto retry; 1710 case SCX_OPSS_QUEUEING: 1711 /* 1712 * do_enqueue_task() is in the process of transferring the task 1713 * to the BPF scheduler while holding @p's rq lock. As we aren't 1714 * holding any kernel or BPF resource that the enqueue path may 1715 * depend upon, it's safe to wait. 1716 */ 1717 wait_ops_state(p, opss); 1718 goto retry; 1719 } 1720 1721 BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); 1722 1723 switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) { 1724 case DTL_DISPATCHED: 1725 break; 1726 case DTL_LOST: 1727 break; 1728 case DTL_INVALID: 1729 dsq_id = SCX_DSQ_GLOBAL; 1730 fallthrough; 1731 case DTL_NOT_LOCAL: 1732 dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), 1733 dsq_id, p); 1734 dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); 1735 break; 1736 } 1737 } 1738 1739 static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) 1740 { 1741 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 1742 u32 u; 1743 1744 for (u = 0; u < dspc->cursor; u++) { 1745 struct scx_dsp_buf_ent *ent = &dspc->buf[u]; 1746 1747 finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id, 1748 ent->enq_flags); 1749 } 1750 1751 dspc->nr_tasks += dspc->cursor; 1752 dspc->cursor = 0; 1753 } 1754 1755 static int balance_scx(struct rq *rq, struct task_struct *prev, 1756 struct rq_flags *rf) 1757 { 1758 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 1759 bool prev_on_scx = prev->sched_class == &ext_sched_class; 1760 1761 lockdep_assert_rq_held(rq); 1762 1763 if (prev_on_scx) { 1764 WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP); 1765 update_curr_scx(rq); 1766 1767 /* 1768 * If @prev is runnable & has slice left, it has priority and 1769 * fetching more just increases latency for the fetched tasks. 1770 * Tell put_prev_task_scx() to put @prev on local_dsq. 
1771 * 1772 * See scx_ops_disable_workfn() for the explanation on the 1773 * bypassing test. 1774 */ 1775 if ((prev->scx.flags & SCX_TASK_QUEUED) && 1776 prev->scx.slice && !scx_ops_bypassing()) { 1777 prev->scx.flags |= SCX_TASK_BAL_KEEP; 1778 return 1; 1779 } 1780 } 1781 1782 /* if there already are tasks to run, nothing to do */ 1783 if (rq->scx.local_dsq.nr) 1784 return 1; 1785 1786 if (consume_dispatch_q(rq, rf, &scx_dsq_global)) 1787 return 1; 1788 1789 if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq)) 1790 return 0; 1791 1792 dspc->rq = rq; 1793 dspc->rf = rf; 1794 1795 /* 1796 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, 1797 * the local DSQ might still end up empty after a successful 1798 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() 1799 * produced some tasks, retry. The BPF scheduler may depend on this 1800 * looping behavior to simplify its implementation. 1801 */ 1802 do { 1803 dspc->nr_tasks = 0; 1804 1805 SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), 1806 prev_on_scx ? prev : NULL); 1807 1808 flush_dispatch_buf(rq, rf); 1809 1810 if (rq->scx.local_dsq.nr) 1811 return 1; 1812 if (consume_dispatch_q(rq, rf, &scx_dsq_global)) 1813 return 1; 1814 } while (dspc->nr_tasks); 1815 1816 return 0; 1817 } 1818 1819 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) 1820 { 1821 if (p->scx.flags & SCX_TASK_QUEUED) { 1822 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 1823 dispatch_dequeue(rq, p); 1824 } 1825 1826 p->se.exec_start = rq_clock_task(rq); 1827 1828 clr_task_runnable(p); 1829 } 1830 1831 static void put_prev_task_scx(struct rq *rq, struct task_struct *p) 1832 { 1833 #ifndef CONFIG_SMP 1834 /* 1835 * UP workaround. 1836 * 1837 * Because SCX may transfer tasks across CPUs during dispatch, dispatch 1838 * is performed from its balance operation which isn't called in UP. 1839 * Let's work around by calling it from the operations which come right 1840 * after. 1841 * 1842 * 1. If the prev task is on SCX, pick_next_task() calls 1843 * .put_prev_task() right after. As .put_prev_task() is also called 1844 * from other places, we need to distinguish the calls which can be 1845 * done by looking at the previous task's state - if still queued or 1846 * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). 1847 * This case is handled here. 1848 * 1849 * 2. If the prev task is not on SCX, the first following call into SCX 1850 * will be .pick_next_task(), which is covered by calling 1851 * balance_scx() from pick_next_task_scx(). 1852 * 1853 * Note that we can't merge the first case into the second as 1854 * balance_scx() must be called before the previous SCX task goes 1855 * through put_prev_task_scx(). 1856 * 1857 * As UP doesn't transfer tasks around, balance_scx() doesn't need @rf. 1858 * Pass in %NULL. 1859 */ 1860 if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) 1861 balance_scx(rq, p, NULL); 1862 #endif 1863 1864 update_curr_scx(rq); 1865 1866 /* 1867 * If we're being called from put_prev_task_balance(), balance_scx() may 1868 * have decided that @p should keep running. 
1869 */ 1870 if (p->scx.flags & SCX_TASK_BAL_KEEP) { 1871 p->scx.flags &= ~SCX_TASK_BAL_KEEP; 1872 set_task_runnable(rq, p); 1873 dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); 1874 return; 1875 } 1876 1877 if (p->scx.flags & SCX_TASK_QUEUED) { 1878 set_task_runnable(rq, p); 1879 1880 /* 1881 * If @p has slice left and balance_scx() didn't tag it for 1882 * keeping, @p is getting preempted by a higher priority 1883 * scheduler class. Leave it at the head of the local DSQ. 1884 */ 1885 if (p->scx.slice && !scx_ops_bypassing()) { 1886 dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); 1887 return; 1888 } 1889 1890 /* 1891 * If we're in the pick_next_task path, balance_scx() should 1892 * have already populated the local DSQ if there are any other 1893 * available tasks. If empty, tell ops.enqueue() that @p is the 1894 * only one available for this cpu. ops.enqueue() should put it 1895 * on the local DSQ so that the subsequent pick_next_task_scx() 1896 * can find the task unless it wants to trigger a separate 1897 * follow-up scheduling event. 1898 */ 1899 if (list_empty(&rq->scx.local_dsq.list)) 1900 do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); 1901 else 1902 do_enqueue_task(rq, p, 0, -1); 1903 } 1904 } 1905 1906 static struct task_struct *first_local_task(struct rq *rq) 1907 { 1908 return list_first_entry_or_null(&rq->scx.local_dsq.list, 1909 struct task_struct, scx.dsq_node); 1910 } 1911 1912 static struct task_struct *pick_next_task_scx(struct rq *rq) 1913 { 1914 struct task_struct *p; 1915 1916 #ifndef CONFIG_SMP 1917 /* UP workaround - see the comment at the head of put_prev_task_scx() */ 1918 if (unlikely(rq->curr->sched_class != &ext_sched_class)) 1919 balance_scx(rq, rq->curr, NULL); 1920 #endif 1921 1922 p = first_local_task(rq); 1923 if (!p) 1924 return NULL; 1925 1926 set_next_task_scx(rq, p, true); 1927 1928 if (unlikely(!p->scx.slice)) { 1929 if (!scx_ops_bypassing() && !scx_warned_zero_slice) { 1930 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", 1931 p->comm, p->pid); 1932 scx_warned_zero_slice = true; 1933 } 1934 p->scx.slice = SCX_SLICE_DFL; 1935 } 1936 1937 return p; 1938 } 1939 1940 #ifdef CONFIG_SMP 1941 1942 static bool test_and_clear_cpu_idle(int cpu) 1943 { 1944 #ifdef CONFIG_SCHED_SMT 1945 /* 1946 * SMT mask should be cleared whether we can claim @cpu or not. The SMT 1947 * cluster is not wholly idle either way. This also prevents 1948 * scx_pick_idle_cpu() from getting caught in an infinite loop. 1949 */ 1950 if (sched_smt_active()) { 1951 const struct cpumask *smt = cpu_smt_mask(cpu); 1952 1953 /* 1954 * If offline, @cpu is not its own sibling and 1955 * scx_pick_idle_cpu() can get caught in an infinite loop as 1956 * @cpu is never cleared from idle_masks.smt. Ensure that @cpu 1957 * is eventually cleared. 
1958 */ 1959 if (cpumask_intersects(smt, idle_masks.smt)) 1960 cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); 1961 else if (cpumask_test_cpu(cpu, idle_masks.smt)) 1962 __cpumask_clear_cpu(cpu, idle_masks.smt); 1963 } 1964 #endif 1965 return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); 1966 } 1967 1968 static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) 1969 { 1970 int cpu; 1971 1972 retry: 1973 if (sched_smt_active()) { 1974 cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); 1975 if (cpu < nr_cpu_ids) 1976 goto found; 1977 1978 if (flags & SCX_PICK_IDLE_CORE) 1979 return -EBUSY; 1980 } 1981 1982 cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); 1983 if (cpu >= nr_cpu_ids) 1984 return -EBUSY; 1985 1986 found: 1987 if (test_and_clear_cpu_idle(cpu)) 1988 return cpu; 1989 else 1990 goto retry; 1991 } 1992 1993 static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, 1994 u64 wake_flags, bool *found) 1995 { 1996 s32 cpu; 1997 1998 *found = false; 1999 2000 if (!static_branch_likely(&scx_builtin_idle_enabled)) { 2001 scx_ops_error("built-in idle tracking is disabled"); 2002 return prev_cpu; 2003 } 2004 2005 /* 2006 * If WAKE_SYNC, the waker's local DSQ is empty, and the system is 2007 * under utilized, wake up @p to the local DSQ of the waker. Checking 2008 * only for an empty local DSQ is insufficient as it could give the 2009 * wakee an unfair advantage when the system is oversaturated. 2010 * Checking only for the presence of idle CPUs is also insufficient as 2011 * the local DSQ of the waker could have tasks piled up on it even if 2012 * there is an idle core elsewhere on the system. 2013 */ 2014 cpu = smp_processor_id(); 2015 if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && 2016 !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && 2017 cpu_rq(cpu)->scx.local_dsq.nr == 0) { 2018 if (cpumask_test_cpu(cpu, p->cpus_ptr)) 2019 goto cpu_found; 2020 } 2021 2022 if (p->nr_cpus_allowed == 1) { 2023 if (test_and_clear_cpu_idle(prev_cpu)) { 2024 cpu = prev_cpu; 2025 goto cpu_found; 2026 } else { 2027 return prev_cpu; 2028 } 2029 } 2030 2031 /* 2032 * If CPU has SMT, any wholly idle CPU is likely a better pick than 2033 * partially idle @prev_cpu. 2034 */ 2035 if (sched_smt_active()) { 2036 if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && 2037 test_and_clear_cpu_idle(prev_cpu)) { 2038 cpu = prev_cpu; 2039 goto cpu_found; 2040 } 2041 2042 cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); 2043 if (cpu >= 0) 2044 goto cpu_found; 2045 } 2046 2047 if (test_and_clear_cpu_idle(prev_cpu)) { 2048 cpu = prev_cpu; 2049 goto cpu_found; 2050 } 2051 2052 cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); 2053 if (cpu >= 0) 2054 goto cpu_found; 2055 2056 return prev_cpu; 2057 2058 cpu_found: 2059 *found = true; 2060 return cpu; 2061 } 2062 2063 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) 2064 { 2065 /* 2066 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it 2067 * can be a good migration opportunity with low cache and memory 2068 * footprint. Returning a CPU different than @prev_cpu triggers 2069 * immediate rq migration. However, for SCX, as the current rq 2070 * association doesn't dictate where the task is going to run, this 2071 * doesn't fit well. If necessary, we can later add a dedicated method 2072 * which can decide to preempt self to force it through the regular 2073 * scheduling path. 
2074 */ 2075 if (unlikely(wake_flags & WF_EXEC)) 2076 return prev_cpu; 2077 2078 if (SCX_HAS_OP(select_cpu)) { 2079 s32 cpu; 2080 struct task_struct **ddsp_taskp; 2081 2082 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 2083 WARN_ON_ONCE(*ddsp_taskp); 2084 *ddsp_taskp = p; 2085 2086 cpu = SCX_CALL_OP_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, 2087 select_cpu, p, prev_cpu, wake_flags); 2088 *ddsp_taskp = NULL; 2089 if (ops_cpu_valid(cpu, "from ops.select_cpu()")) 2090 return cpu; 2091 else 2092 return prev_cpu; 2093 } else { 2094 bool found; 2095 s32 cpu; 2096 2097 cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); 2098 if (found) { 2099 p->scx.slice = SCX_SLICE_DFL; 2100 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; 2101 } 2102 return cpu; 2103 } 2104 } 2105 2106 static void set_cpus_allowed_scx(struct task_struct *p, 2107 struct affinity_context *ac) 2108 { 2109 set_cpus_allowed_common(p, ac); 2110 2111 /* 2112 * The effective cpumask is stored in @p->cpus_ptr which may temporarily 2113 * differ from the configured one in @p->cpus_mask. Always tell the bpf 2114 * scheduler the effective one. 2115 * 2116 * Fine-grained memory write control is enforced by BPF making the const 2117 * designation pointless. Cast it away when calling the operation. 2118 */ 2119 if (SCX_HAS_OP(set_cpumask)) 2120 SCX_CALL_OP(SCX_KF_REST, set_cpumask, p, 2121 (struct cpumask *)p->cpus_ptr); 2122 } 2123 2124 static void reset_idle_masks(void) 2125 { 2126 /* 2127 * Consider all online cpus idle. Should converge to the actual state 2128 * quickly. 2129 */ 2130 cpumask_copy(idle_masks.cpu, cpu_online_mask); 2131 cpumask_copy(idle_masks.smt, cpu_online_mask); 2132 } 2133 2134 void __scx_update_idle(struct rq *rq, bool idle) 2135 { 2136 int cpu = cpu_of(rq); 2137 2138 if (SCX_HAS_OP(update_idle)) { 2139 SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); 2140 if (!static_branch_unlikely(&scx_builtin_idle_enabled)) 2141 return; 2142 } 2143 2144 if (idle) 2145 cpumask_set_cpu(cpu, idle_masks.cpu); 2146 else 2147 cpumask_clear_cpu(cpu, idle_masks.cpu); 2148 2149 #ifdef CONFIG_SCHED_SMT 2150 if (sched_smt_active()) { 2151 const struct cpumask *smt = cpu_smt_mask(cpu); 2152 2153 if (idle) { 2154 /* 2155 * idle_masks.smt handling is racy but that's fine as 2156 * it's only for optimization and self-correcting. 2157 */ 2158 for_each_cpu(cpu, smt) { 2159 if (!cpumask_test_cpu(cpu, idle_masks.cpu)) 2160 return; 2161 } 2162 cpumask_or(idle_masks.smt, idle_masks.smt, smt); 2163 } else { 2164 cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); 2165 } 2166 } 2167 #endif 2168 } 2169 2170 #else /* CONFIG_SMP */ 2171 2172 static bool test_and_clear_cpu_idle(int cpu) { return false; } 2173 static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } 2174 static void reset_idle_masks(void) {} 2175 2176 #endif /* CONFIG_SMP */ 2177 2178 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) 2179 { 2180 update_other_load_avgs(rq); 2181 update_curr_scx(rq); 2182 2183 /* 2184 * While bypassing, always resched as we can't trust the slice 2185 * management. 
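	 *
	 * Otherwise ops.tick() runs below and may adjust @curr->scx.slice,
	 * which determines when resched_curr() fires.  As an illustrative
	 * sketch (hypothetical callback name, struct_ops wiring omitted), a
	 * BPF scheduler that wants strict one-tick round-robin can simply
	 * end the slice on every tick:
	 *
	 *	void example_tick(struct task_struct *p)
	 *	{
	 *		p->scx.slice = 0;
	 *	}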
2186 */ 2187 if (scx_ops_bypassing()) 2188 curr->scx.slice = 0; 2189 else if (SCX_HAS_OP(tick)) 2190 SCX_CALL_OP(SCX_KF_REST, tick, curr); 2191 2192 if (!curr->scx.slice) 2193 resched_curr(rq); 2194 } 2195 2196 static enum scx_task_state scx_get_task_state(const struct task_struct *p) 2197 { 2198 return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; 2199 } 2200 2201 static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) 2202 { 2203 enum scx_task_state prev_state = scx_get_task_state(p); 2204 bool warn = false; 2205 2206 BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); 2207 2208 switch (state) { 2209 case SCX_TASK_NONE: 2210 break; 2211 case SCX_TASK_INIT: 2212 warn = prev_state != SCX_TASK_NONE; 2213 break; 2214 case SCX_TASK_READY: 2215 warn = prev_state == SCX_TASK_NONE; 2216 break; 2217 case SCX_TASK_ENABLED: 2218 warn = prev_state != SCX_TASK_READY; 2219 break; 2220 default: 2221 warn = true; 2222 return; 2223 } 2224 2225 WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", 2226 prev_state, state, p->comm, p->pid); 2227 2228 p->scx.flags &= ~SCX_TASK_STATE_MASK; 2229 p->scx.flags |= state << SCX_TASK_STATE_SHIFT; 2230 } 2231 2232 static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork) 2233 { 2234 int ret; 2235 2236 if (SCX_HAS_OP(init_task)) { 2237 struct scx_init_task_args args = { 2238 .fork = fork, 2239 }; 2240 2241 ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args); 2242 if (unlikely(ret)) { 2243 ret = ops_sanitize_err("init_task", ret); 2244 return ret; 2245 } 2246 } 2247 2248 scx_set_task_state(p, SCX_TASK_INIT); 2249 2250 return 0; 2251 } 2252 2253 static void set_task_scx_weight(struct task_struct *p) 2254 { 2255 u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; 2256 2257 p->scx.weight = sched_weight_to_cgroup(weight); 2258 } 2259 2260 static void scx_ops_enable_task(struct task_struct *p) 2261 { 2262 lockdep_assert_rq_held(task_rq(p)); 2263 2264 /* 2265 * Set the weight before calling ops.enable() so that the scheduler 2266 * doesn't see a stale value if they inspect the task struct. 2267 */ 2268 set_task_scx_weight(p); 2269 if (SCX_HAS_OP(enable)) 2270 SCX_CALL_OP(SCX_KF_REST, enable, p); 2271 scx_set_task_state(p, SCX_TASK_ENABLED); 2272 2273 if (SCX_HAS_OP(set_weight)) 2274 SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight); 2275 } 2276 2277 static void scx_ops_disable_task(struct task_struct *p) 2278 { 2279 lockdep_assert_rq_held(task_rq(p)); 2280 WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); 2281 2282 if (SCX_HAS_OP(disable)) 2283 SCX_CALL_OP(SCX_KF_REST, disable, p); 2284 scx_set_task_state(p, SCX_TASK_READY); 2285 } 2286 2287 static void scx_ops_exit_task(struct task_struct *p) 2288 { 2289 struct scx_exit_task_args args = { 2290 .cancelled = false, 2291 }; 2292 2293 lockdep_assert_rq_held(task_rq(p)); 2294 2295 switch (scx_get_task_state(p)) { 2296 case SCX_TASK_NONE: 2297 return; 2298 case SCX_TASK_INIT: 2299 args.cancelled = true; 2300 break; 2301 case SCX_TASK_READY: 2302 break; 2303 case SCX_TASK_ENABLED: 2304 scx_ops_disable_task(p); 2305 break; 2306 default: 2307 WARN_ON_ONCE(true); 2308 return; 2309 } 2310 2311 if (SCX_HAS_OP(exit_task)) 2312 SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); 2313 scx_set_task_state(p, SCX_TASK_NONE); 2314 } 2315 2316 void init_scx_entity(struct sched_ext_entity *scx) 2317 { 2318 /* 2319 * init_idle() calls this function again after fork sequence is 2320 * complete. 
Don't touch ->tasks_node as it's already linked. 2321 */ 2322 memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); 2323 2324 INIT_LIST_HEAD(&scx->dsq_node); 2325 scx->sticky_cpu = -1; 2326 scx->holding_cpu = -1; 2327 INIT_LIST_HEAD(&scx->runnable_node); 2328 scx->ddsp_dsq_id = SCX_DSQ_INVALID; 2329 scx->slice = SCX_SLICE_DFL; 2330 } 2331 2332 void scx_pre_fork(struct task_struct *p) 2333 { 2334 /* 2335 * BPF scheduler enable/disable paths want to be able to iterate and 2336 * update all tasks which can become complex when racing forks. As 2337 * enable/disable are very cold paths, let's use a percpu_rwsem to 2338 * exclude forks. 2339 */ 2340 percpu_down_read(&scx_fork_rwsem); 2341 } 2342 2343 int scx_fork(struct task_struct *p) 2344 { 2345 percpu_rwsem_assert_held(&scx_fork_rwsem); 2346 2347 if (scx_enabled()) 2348 return scx_ops_init_task(p, task_group(p), true); 2349 else 2350 return 0; 2351 } 2352 2353 void scx_post_fork(struct task_struct *p) 2354 { 2355 if (scx_enabled()) { 2356 scx_set_task_state(p, SCX_TASK_READY); 2357 2358 /* 2359 * Enable the task immediately if it's running on sched_ext. 2360 * Otherwise, it'll be enabled in switching_to_scx() if and 2361 * when it's ever configured to run with a SCHED_EXT policy. 2362 */ 2363 if (p->sched_class == &ext_sched_class) { 2364 struct rq_flags rf; 2365 struct rq *rq; 2366 2367 rq = task_rq_lock(p, &rf); 2368 scx_ops_enable_task(p); 2369 task_rq_unlock(rq, p, &rf); 2370 } 2371 } 2372 2373 spin_lock_irq(&scx_tasks_lock); 2374 list_add_tail(&p->scx.tasks_node, &scx_tasks); 2375 spin_unlock_irq(&scx_tasks_lock); 2376 2377 percpu_up_read(&scx_fork_rwsem); 2378 } 2379 2380 void scx_cancel_fork(struct task_struct *p) 2381 { 2382 if (scx_enabled()) { 2383 struct rq *rq; 2384 struct rq_flags rf; 2385 2386 rq = task_rq_lock(p, &rf); 2387 WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); 2388 scx_ops_exit_task(p); 2389 task_rq_unlock(rq, p, &rf); 2390 } 2391 2392 percpu_up_read(&scx_fork_rwsem); 2393 } 2394 2395 void sched_ext_free(struct task_struct *p) 2396 { 2397 unsigned long flags; 2398 2399 spin_lock_irqsave(&scx_tasks_lock, flags); 2400 list_del_init(&p->scx.tasks_node); 2401 spin_unlock_irqrestore(&scx_tasks_lock, flags); 2402 2403 /* 2404 * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> 2405 * ENABLED transitions can't race us. Disable ops for @p. 2406 */ 2407 if (scx_get_task_state(p) != SCX_TASK_NONE) { 2408 struct rq_flags rf; 2409 struct rq *rq; 2410 2411 rq = task_rq_lock(p, &rf); 2412 scx_ops_exit_task(p); 2413 task_rq_unlock(rq, p, &rf); 2414 } 2415 } 2416 2417 static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) 2418 { 2419 lockdep_assert_rq_held(task_rq(p)); 2420 2421 set_task_scx_weight(p); 2422 if (SCX_HAS_OP(set_weight)) 2423 SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight); 2424 } 2425 2426 static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) 2427 { 2428 } 2429 2430 static void switching_to_scx(struct rq *rq, struct task_struct *p) 2431 { 2432 scx_ops_enable_task(p); 2433 2434 /* 2435 * set_cpus_allowed_scx() is not called while @p is associated with a 2436 * different scheduler class. Keep the BPF scheduler up-to-date. 
2437 */ 2438 if (SCX_HAS_OP(set_cpumask)) 2439 SCX_CALL_OP(SCX_KF_REST, set_cpumask, p, 2440 (struct cpumask *)p->cpus_ptr); 2441 } 2442 2443 static void switched_from_scx(struct rq *rq, struct task_struct *p) 2444 { 2445 scx_ops_disable_task(p); 2446 } 2447 2448 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} 2449 static void switched_to_scx(struct rq *rq, struct task_struct *p) {} 2450 2451 /* 2452 * Omitted operations: 2453 * 2454 * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task 2455 * isn't tied to the CPU at that point. 2456 * 2457 * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. 2458 * 2459 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of 2460 * their current sched_class. Call them directly from sched core instead. 2461 * 2462 * - task_woken: Unnecessary. 2463 */ 2464 DEFINE_SCHED_CLASS(ext) = { 2465 .enqueue_task = enqueue_task_scx, 2466 .dequeue_task = dequeue_task_scx, 2467 .yield_task = yield_task_scx, 2468 .yield_to_task = yield_to_task_scx, 2469 2470 .wakeup_preempt = wakeup_preempt_scx, 2471 2472 .pick_next_task = pick_next_task_scx, 2473 2474 .put_prev_task = put_prev_task_scx, 2475 .set_next_task = set_next_task_scx, 2476 2477 #ifdef CONFIG_SMP 2478 .balance = balance_scx, 2479 .select_task_rq = select_task_rq_scx, 2480 .set_cpus_allowed = set_cpus_allowed_scx, 2481 #endif 2482 2483 .task_tick = task_tick_scx, 2484 2485 .switching_to = switching_to_scx, 2486 .switched_from = switched_from_scx, 2487 .switched_to = switched_to_scx, 2488 .reweight_task = reweight_task_scx, 2489 .prio_changed = prio_changed_scx, 2490 2491 .update_curr = update_curr_scx, 2492 2493 #ifdef CONFIG_UCLAMP_TASK 2494 .uclamp_enabled = 0, 2495 #endif 2496 }; 2497 2498 static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) 2499 { 2500 memset(dsq, 0, sizeof(*dsq)); 2501 2502 raw_spin_lock_init(&dsq->lock); 2503 INIT_LIST_HEAD(&dsq->list); 2504 dsq->id = dsq_id; 2505 } 2506 2507 static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) 2508 { 2509 struct scx_dispatch_q *dsq; 2510 int ret; 2511 2512 if (dsq_id & SCX_DSQ_FLAG_BUILTIN) 2513 return ERR_PTR(-EINVAL); 2514 2515 dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); 2516 if (!dsq) 2517 return ERR_PTR(-ENOMEM); 2518 2519 init_dsq(dsq, dsq_id); 2520 2521 ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, 2522 dsq_hash_params); 2523 if (ret) { 2524 kfree(dsq); 2525 return ERR_PTR(ret); 2526 } 2527 return dsq; 2528 } 2529 2530 static void free_dsq_irq_workfn(struct irq_work *irq_work) 2531 { 2532 struct llist_node *to_free = llist_del_all(&dsqs_to_free); 2533 struct scx_dispatch_q *dsq, *tmp_dsq; 2534 2535 llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) 2536 kfree_rcu(dsq, rcu); 2537 } 2538 2539 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); 2540 2541 static void destroy_dsq(u64 dsq_id) 2542 { 2543 struct scx_dispatch_q *dsq; 2544 unsigned long flags; 2545 2546 rcu_read_lock(); 2547 2548 dsq = find_user_dsq(dsq_id); 2549 if (!dsq) 2550 goto out_unlock_rcu; 2551 2552 raw_spin_lock_irqsave(&dsq->lock, flags); 2553 2554 if (dsq->nr) { 2555 scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", 2556 dsq->id, dsq->nr); 2557 goto out_unlock_dsq; 2558 } 2559 2560 if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) 2561 goto out_unlock_dsq; 2562 2563 /* 2564 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from 2565 * queueing more tasks. 
As this function can be called from anywhere, 2566 * freeing is bounced through an irq work to avoid nesting RCU 2567 * operations inside scheduler locks. 2568 */ 2569 dsq->id = SCX_DSQ_INVALID; 2570 llist_add(&dsq->free_node, &dsqs_to_free); 2571 irq_work_queue(&free_dsq_irq_work); 2572 2573 out_unlock_dsq: 2574 raw_spin_unlock_irqrestore(&dsq->lock, flags); 2575 out_unlock_rcu: 2576 rcu_read_unlock(); 2577 } 2578 2579 2580 /******************************************************************************** 2581 * Sysfs interface and ops enable/disable. 2582 */ 2583 2584 #define SCX_ATTR(_name) \ 2585 static struct kobj_attribute scx_attr_##_name = { \ 2586 .attr = { .name = __stringify(_name), .mode = 0444 }, \ 2587 .show = scx_attr_##_name##_show, \ 2588 } 2589 2590 static ssize_t scx_attr_state_show(struct kobject *kobj, 2591 struct kobj_attribute *ka, char *buf) 2592 { 2593 return sysfs_emit(buf, "%s\n", 2594 scx_ops_enable_state_str[scx_ops_enable_state()]); 2595 } 2596 SCX_ATTR(state); 2597 2598 static ssize_t scx_attr_switch_all_show(struct kobject *kobj, 2599 struct kobj_attribute *ka, char *buf) 2600 { 2601 return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); 2602 } 2603 SCX_ATTR(switch_all); 2604 2605 static struct attribute *scx_global_attrs[] = { 2606 &scx_attr_state.attr, 2607 &scx_attr_switch_all.attr, 2608 NULL, 2609 }; 2610 2611 static const struct attribute_group scx_global_attr_group = { 2612 .attrs = scx_global_attrs, 2613 }; 2614 2615 static void scx_kobj_release(struct kobject *kobj) 2616 { 2617 kfree(kobj); 2618 } 2619 2620 static ssize_t scx_attr_ops_show(struct kobject *kobj, 2621 struct kobj_attribute *ka, char *buf) 2622 { 2623 return sysfs_emit(buf, "%s\n", scx_ops.name); 2624 } 2625 SCX_ATTR(ops); 2626 2627 static struct attribute *scx_sched_attrs[] = { 2628 &scx_attr_ops.attr, 2629 NULL, 2630 }; 2631 ATTRIBUTE_GROUPS(scx_sched); 2632 2633 static const struct kobj_type scx_ktype = { 2634 .release = scx_kobj_release, 2635 .sysfs_ops = &kobj_sysfs_ops, 2636 .default_groups = scx_sched_groups, 2637 }; 2638 2639 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) 2640 { 2641 return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); 2642 } 2643 2644 static const struct kset_uevent_ops scx_uevent_ops = { 2645 .uevent = scx_uevent, 2646 }; 2647 2648 /* 2649 * Used by sched_fork() and __setscheduler_prio() to pick the matching 2650 * sched_class. dl/rt are already handled. 2651 */ 2652 bool task_should_scx(struct task_struct *p) 2653 { 2654 if (!scx_enabled() || 2655 unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) 2656 return false; 2657 if (READ_ONCE(scx_switching_all)) 2658 return true; 2659 return p->policy == SCHED_EXT; 2660 } 2661 2662 /** 2663 * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress 2664 * 2665 * Bypassing guarantees that all runnable tasks make forward progress without 2666 * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might 2667 * be held by tasks that the BPF scheduler is forgetting to run, which 2668 * unfortunately also excludes toggling the static branches. 2669 * 2670 * Let's work around by overriding a couple ops and modifying behaviors based on 2671 * the DISABLING state and then cycling the queued tasks through dequeue/enqueue 2672 * to force global FIFO scheduling. 2673 * 2674 * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. 2675 * 2676 * b. ops.dispatch() is ignored. 2677 * 2678 * c. 
balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be 2679 * trusted. Whenever a tick triggers, the running task is rotated to the tail 2680 * of the queue. 2681 * 2682 * d. pick_next_task() suppresses zero slice warning. 2683 */ 2684 static void scx_ops_bypass(bool bypass) 2685 { 2686 int depth, cpu; 2687 2688 if (bypass) { 2689 depth = atomic_inc_return(&scx_ops_bypass_depth); 2690 WARN_ON_ONCE(depth <= 0); 2691 if (depth != 1) 2692 return; 2693 } else { 2694 depth = atomic_dec_return(&scx_ops_bypass_depth); 2695 WARN_ON_ONCE(depth < 0); 2696 if (depth != 0) 2697 return; 2698 } 2699 2700 /* 2701 * We need to guarantee that no tasks are on the BPF scheduler while 2702 * bypassing. Either we see enabled or the enable path sees the 2703 * increased bypass_depth before moving tasks to SCX. 2704 */ 2705 if (!scx_enabled()) 2706 return; 2707 2708 /* 2709 * No task property is changing. We just need to make sure all currently 2710 * queued tasks are re-queued according to the new scx_ops_bypassing() 2711 * state. As an optimization, walk each rq's runnable_list instead of 2712 * the scx_tasks list. 2713 * 2714 * This function can't trust the scheduler and thus can't use 2715 * cpus_read_lock(). Walk all possible CPUs instead of online. 2716 */ 2717 for_each_possible_cpu(cpu) { 2718 struct rq *rq = cpu_rq(cpu); 2719 struct rq_flags rf; 2720 struct task_struct *p, *n; 2721 2722 rq_lock_irqsave(rq, &rf); 2723 2724 /* 2725 * The use of list_for_each_entry_safe_reverse() is required 2726 * because each task is going to be removed from and added back 2727 * to the runnable_list during iteration. Because they're added 2728 * to the tail of the list, safe reverse iteration can still 2729 * visit all nodes. 2730 */ 2731 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, 2732 scx.runnable_node) { 2733 struct sched_enq_and_set_ctx ctx; 2734 2735 /* cycling deq/enq is enough, see the function comment */ 2736 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 2737 sched_enq_and_set_task(&ctx); 2738 } 2739 2740 rq_unlock_irqrestore(rq, &rf); 2741 } 2742 } 2743 2744 static void free_exit_info(struct scx_exit_info *ei) 2745 { 2746 kfree(ei->msg); 2747 kfree(ei->bt); 2748 kfree(ei); 2749 } 2750 2751 static struct scx_exit_info *alloc_exit_info(void) 2752 { 2753 struct scx_exit_info *ei; 2754 2755 ei = kzalloc(sizeof(*ei), GFP_KERNEL); 2756 if (!ei) 2757 return NULL; 2758 2759 ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL); 2760 ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); 2761 2762 if (!ei->bt || !ei->msg) { 2763 free_exit_info(ei); 2764 return NULL; 2765 } 2766 2767 return ei; 2768 } 2769 2770 static const char *scx_exit_reason(enum scx_exit_kind kind) 2771 { 2772 switch (kind) { 2773 case SCX_EXIT_UNREG: 2774 return "Scheduler unregistered from user space"; 2775 case SCX_EXIT_UNREG_BPF: 2776 return "Scheduler unregistered from BPF"; 2777 case SCX_EXIT_UNREG_KERN: 2778 return "Scheduler unregistered from the main kernel"; 2779 case SCX_EXIT_ERROR: 2780 return "runtime error"; 2781 case SCX_EXIT_ERROR_BPF: 2782 return "scx_bpf_error"; 2783 default: 2784 return "<UNKNOWN>"; 2785 } 2786 } 2787 2788 static void scx_ops_disable_workfn(struct kthread_work *work) 2789 { 2790 struct scx_exit_info *ei = scx_exit_info; 2791 struct scx_task_iter sti; 2792 struct task_struct *p; 2793 struct rhashtable_iter rht_iter; 2794 struct scx_dispatch_q *dsq; 2795 int i, kind; 2796 2797 kind = atomic_read(&scx_exit_kind); 2798 while (true) { 2799 /* 2800 * NONE indicates 
that a new scx_ops has been registered since 2801 * disable was scheduled - don't kill the new ops. DONE 2802 * indicates that the ops has already been disabled. 2803 */ 2804 if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) 2805 return; 2806 if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) 2807 break; 2808 } 2809 ei->kind = kind; 2810 ei->reason = scx_exit_reason(ei->kind); 2811 2812 /* guarantee forward progress by bypassing scx_ops */ 2813 scx_ops_bypass(true); 2814 2815 switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { 2816 case SCX_OPS_DISABLING: 2817 WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); 2818 break; 2819 case SCX_OPS_DISABLED: 2820 pr_warn("sched_ext: ops error detected without ops (%s)\n", 2821 scx_exit_info->msg); 2822 WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != 2823 SCX_OPS_DISABLING); 2824 goto done; 2825 default: 2826 break; 2827 } 2828 2829 /* 2830 * Here, every runnable task is guaranteed to make forward progress and 2831 * we can safely use blocking synchronization constructs. Actually 2832 * disable ops. 2833 */ 2834 mutex_lock(&scx_ops_enable_mutex); 2835 2836 static_branch_disable(&__scx_switched_all); 2837 WRITE_ONCE(scx_switching_all, false); 2838 2839 /* 2840 * Avoid racing against fork. See scx_ops_enable() for explanation on 2841 * the locking order. 2842 */ 2843 percpu_down_write(&scx_fork_rwsem); 2844 cpus_read_lock(); 2845 2846 spin_lock_irq(&scx_tasks_lock); 2847 scx_task_iter_init(&sti); 2848 /* 2849 * Invoke scx_ops_exit_task() on all non-idle tasks, including 2850 * TASK_DEAD tasks. Because dead tasks may have a nonzero refcount, 2851 * we may not have invoked sched_ext_free() on them by the time a 2852 * scheduler is disabled. We must therefore exit the task here, or we'd 2853 * fail to invoke ops.exit_task(), as the scheduler will have been 2854 * unloaded by the time the task is subsequently exited on the 2855 * sched_ext_free() path. 
2856 */ 2857 while ((p = scx_task_iter_next_locked(&sti, true))) { 2858 const struct sched_class *old_class = p->sched_class; 2859 struct sched_enq_and_set_ctx ctx; 2860 2861 if (READ_ONCE(p->__state) != TASK_DEAD) { 2862 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, 2863 &ctx); 2864 2865 p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); 2866 __setscheduler_prio(p, p->prio); 2867 check_class_changing(task_rq(p), p, old_class); 2868 2869 sched_enq_and_set_task(&ctx); 2870 2871 check_class_changed(task_rq(p), p, old_class, p->prio); 2872 } 2873 scx_ops_exit_task(p); 2874 } 2875 scx_task_iter_exit(&sti); 2876 spin_unlock_irq(&scx_tasks_lock); 2877 2878 /* no task is on scx, turn off all the switches and flush in-progress calls */ 2879 static_branch_disable_cpuslocked(&__scx_ops_enabled); 2880 for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) 2881 static_branch_disable_cpuslocked(&scx_has_op[i]); 2882 static_branch_disable_cpuslocked(&scx_ops_enq_last); 2883 static_branch_disable_cpuslocked(&scx_ops_enq_exiting); 2884 static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); 2885 synchronize_rcu(); 2886 2887 cpus_read_unlock(); 2888 percpu_up_write(&scx_fork_rwsem); 2889 2890 if (ei->kind >= SCX_EXIT_ERROR) { 2891 printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); 2892 2893 if (ei->msg[0] == '\0') 2894 printk(KERN_ERR "sched_ext: %s\n", ei->reason); 2895 else 2896 printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); 2897 2898 stack_trace_print(ei->bt, ei->bt_len, 2); 2899 } 2900 2901 if (scx_ops.exit) 2902 SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); 2903 2904 /* 2905 * Delete the kobject from the hierarchy eagerly in addition to just 2906 * dropping a reference. Otherwise, if the object is deleted 2907 * asynchronously, sysfs could observe an object of the same name still 2908 * in the hierarchy when another scheduler is loaded. 2909 */ 2910 kobject_del(scx_root_kobj); 2911 kobject_put(scx_root_kobj); 2912 scx_root_kobj = NULL; 2913 2914 memset(&scx_ops, 0, sizeof(scx_ops)); 2915 2916 rhashtable_walk_enter(&dsq_hash, &rht_iter); 2917 do { 2918 rhashtable_walk_start(&rht_iter); 2919 2920 while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) 2921 destroy_dsq(dsq->id); 2922 2923 rhashtable_walk_stop(&rht_iter); 2924 } while (dsq == ERR_PTR(-EAGAIN)); 2925 rhashtable_walk_exit(&rht_iter); 2926 2927 free_percpu(scx_dsp_ctx); 2928 scx_dsp_ctx = NULL; 2929 scx_dsp_max_batch = 0; 2930 2931 free_exit_info(scx_exit_info); 2932 scx_exit_info = NULL; 2933 2934 mutex_unlock(&scx_ops_enable_mutex); 2935 2936 WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != 2937 SCX_OPS_DISABLING); 2938 done: 2939 scx_ops_bypass(false); 2940 } 2941 2942 static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); 2943 2944 static void schedule_scx_ops_disable_work(void) 2945 { 2946 struct kthread_worker *helper = READ_ONCE(scx_ops_helper); 2947 2948 /* 2949 * We may be called spuriously before the first bpf_sched_ext_reg(). If 2950 * scx_ops_helper isn't set up yet, there's nothing to do. 
2951 */ 2952 if (helper) 2953 kthread_queue_work(helper, &scx_ops_disable_work); 2954 } 2955 2956 static void scx_ops_disable(enum scx_exit_kind kind) 2957 { 2958 int none = SCX_EXIT_NONE; 2959 2960 if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) 2961 kind = SCX_EXIT_ERROR; 2962 2963 atomic_try_cmpxchg(&scx_exit_kind, &none, kind); 2964 2965 schedule_scx_ops_disable_work(); 2966 } 2967 2968 static void scx_ops_error_irq_workfn(struct irq_work *irq_work) 2969 { 2970 schedule_scx_ops_disable_work(); 2971 } 2972 2973 static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); 2974 2975 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, 2976 s64 exit_code, 2977 const char *fmt, ...) 2978 { 2979 struct scx_exit_info *ei = scx_exit_info; 2980 int none = SCX_EXIT_NONE; 2981 va_list args; 2982 2983 if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) 2984 return; 2985 2986 ei->exit_code = exit_code; 2987 2988 if (kind >= SCX_EXIT_ERROR) 2989 ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); 2990 2991 va_start(args, fmt); 2992 vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); 2993 va_end(args); 2994 2995 irq_work_queue(&scx_ops_error_irq_work); 2996 } 2997 2998 static struct kthread_worker *scx_create_rt_helper(const char *name) 2999 { 3000 struct kthread_worker *helper; 3001 3002 helper = kthread_create_worker(0, name); 3003 if (helper) 3004 sched_set_fifo(helper->task); 3005 return helper; 3006 } 3007 3008 static int validate_ops(const struct sched_ext_ops *ops) 3009 { 3010 /* 3011 * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the 3012 * ops.enqueue() callback isn't implemented. 3013 */ 3014 if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { 3015 scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); 3016 return -EINVAL; 3017 } 3018 3019 return 0; 3020 } 3021 3022 static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) 3023 { 3024 struct scx_task_iter sti; 3025 struct task_struct *p; 3026 int i, ret; 3027 3028 mutex_lock(&scx_ops_enable_mutex); 3029 3030 if (!scx_ops_helper) { 3031 WRITE_ONCE(scx_ops_helper, 3032 scx_create_rt_helper("sched_ext_ops_helper")); 3033 if (!scx_ops_helper) { 3034 ret = -ENOMEM; 3035 goto err_unlock; 3036 } 3037 } 3038 3039 if (scx_ops_enable_state() != SCX_OPS_DISABLED) { 3040 ret = -EBUSY; 3041 goto err_unlock; 3042 } 3043 3044 scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); 3045 if (!scx_root_kobj) { 3046 ret = -ENOMEM; 3047 goto err_unlock; 3048 } 3049 3050 scx_root_kobj->kset = scx_kset; 3051 ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); 3052 if (ret < 0) 3053 goto err; 3054 3055 scx_exit_info = alloc_exit_info(); 3056 if (!scx_exit_info) { 3057 ret = -ENOMEM; 3058 goto err_del; 3059 } 3060 3061 /* 3062 * Set scx_ops, transition to PREPPING and clear exit info to arm the 3063 * disable path. Failure triggers full disabling from here on. 3064 */ 3065 scx_ops = *ops; 3066 3067 WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != 3068 SCX_OPS_DISABLED); 3069 3070 atomic_set(&scx_exit_kind, SCX_EXIT_NONE); 3071 scx_warned_zero_slice = false; 3072 3073 /* 3074 * Keep CPUs stable during enable so that the BPF scheduler can track 3075 * online CPUs by watching ->on/offline_cpu() after ->init(). 
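	 *
	 * ops.init() is also where a scheduler typically performs its one-off
	 * setup such as creating shared DSQs.  Illustrative sketch
	 * (hypothetical callback name and DSQ id, struct_ops wiring omitted;
	 * -1 is NUMA_NO_NODE):
	 *
	 *	s32 example_init(void)
	 *	{
	 *		return scx_bpf_create_dsq(MY_SHARED_DSQ, -1);
	 *	}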
3076 */ 3077 cpus_read_lock(); 3078 3079 if (scx_ops.init) { 3080 ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init); 3081 if (ret) { 3082 ret = ops_sanitize_err("init", ret); 3083 goto err_disable_unlock_cpus; 3084 } 3085 } 3086 3087 cpus_read_unlock(); 3088 3089 ret = validate_ops(ops); 3090 if (ret) 3091 goto err_disable; 3092 3093 WARN_ON_ONCE(scx_dsp_ctx); 3094 scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; 3095 scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf, 3096 scx_dsp_max_batch), 3097 __alignof__(struct scx_dsp_ctx)); 3098 if (!scx_dsp_ctx) { 3099 ret = -ENOMEM; 3100 goto err_disable; 3101 } 3102 3103 /* 3104 * Lock out forks before opening the floodgate so that they don't wander 3105 * into the operations prematurely. 3106 * 3107 * We don't need to keep the CPUs stable but grab cpus_read_lock() to 3108 * ease future locking changes for cgroup suport. 3109 * 3110 * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the 3111 * following dependency chain: 3112 * 3113 * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock 3114 */ 3115 percpu_down_write(&scx_fork_rwsem); 3116 cpus_read_lock(); 3117 3118 for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) 3119 if (((void (**)(void))ops)[i]) 3120 static_branch_enable_cpuslocked(&scx_has_op[i]); 3121 3122 if (ops->flags & SCX_OPS_ENQ_LAST) 3123 static_branch_enable_cpuslocked(&scx_ops_enq_last); 3124 3125 if (ops->flags & SCX_OPS_ENQ_EXITING) 3126 static_branch_enable_cpuslocked(&scx_ops_enq_exiting); 3127 3128 if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { 3129 reset_idle_masks(); 3130 static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); 3131 } else { 3132 static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); 3133 } 3134 3135 static_branch_enable_cpuslocked(&__scx_ops_enabled); 3136 3137 /* 3138 * Enable ops for every task. Fork is excluded by scx_fork_rwsem 3139 * preventing new tasks from being added. No need to exclude tasks 3140 * leaving as sched_ext_free() can handle both prepped and enabled 3141 * tasks. Prep all tasks first and then enable them with preemption 3142 * disabled. 3143 */ 3144 spin_lock_irq(&scx_tasks_lock); 3145 3146 scx_task_iter_init(&sti); 3147 while ((p = scx_task_iter_next_locked(&sti, false))) { 3148 get_task_struct(p); 3149 scx_task_iter_rq_unlock(&sti); 3150 spin_unlock_irq(&scx_tasks_lock); 3151 3152 ret = scx_ops_init_task(p, task_group(p), false); 3153 if (ret) { 3154 put_task_struct(p); 3155 spin_lock_irq(&scx_tasks_lock); 3156 scx_task_iter_exit(&sti); 3157 spin_unlock_irq(&scx_tasks_lock); 3158 pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", 3159 ret, p->comm, p->pid); 3160 goto err_disable_unlock_all; 3161 } 3162 3163 put_task_struct(p); 3164 spin_lock_irq(&scx_tasks_lock); 3165 } 3166 scx_task_iter_exit(&sti); 3167 3168 /* 3169 * All tasks are prepped but are still ops-disabled. Ensure that 3170 * %current can't be scheduled out and switch everyone. 3171 * preempt_disable() is necessary because we can't guarantee that 3172 * %current won't be starved if scheduled out while switching. 3173 */ 3174 preempt_disable(); 3175 3176 /* 3177 * From here on, the disable path must assume that tasks have ops 3178 * enabled and need to be recovered. 3179 * 3180 * Transition to ENABLING fails iff the BPF scheduler has already 3181 * triggered scx_bpf_error(). Returning an error code here would lose 3182 * the recorded error information. 
Exit indicating success so that the 3183 * error is notified through ops.exit() with all the details. 3184 */ 3185 if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { 3186 preempt_enable(); 3187 spin_unlock_irq(&scx_tasks_lock); 3188 WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); 3189 ret = 0; 3190 goto err_disable_unlock_all; 3191 } 3192 3193 /* 3194 * We're fully committed and can't fail. The PREPPED -> ENABLED 3195 * transitions here are synchronized against sched_ext_free() through 3196 * scx_tasks_lock. 3197 */ 3198 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 3199 3200 scx_task_iter_init(&sti); 3201 while ((p = scx_task_iter_next_locked(&sti, false))) { 3202 const struct sched_class *old_class = p->sched_class; 3203 struct sched_enq_and_set_ctx ctx; 3204 3205 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 3206 3207 scx_set_task_state(p, SCX_TASK_READY); 3208 __setscheduler_prio(p, p->prio); 3209 check_class_changing(task_rq(p), p, old_class); 3210 3211 sched_enq_and_set_task(&ctx); 3212 3213 check_class_changed(task_rq(p), p, old_class, p->prio); 3214 } 3215 scx_task_iter_exit(&sti); 3216 3217 spin_unlock_irq(&scx_tasks_lock); 3218 preempt_enable(); 3219 cpus_read_unlock(); 3220 percpu_up_write(&scx_fork_rwsem); 3221 3222 /* see above ENABLING transition for the explanation on exiting with 0 */ 3223 if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { 3224 WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); 3225 ret = 0; 3226 goto err_disable; 3227 } 3228 3229 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) 3230 static_branch_enable(&__scx_switched_all); 3231 3232 kobject_uevent(scx_root_kobj, KOBJ_ADD); 3233 mutex_unlock(&scx_ops_enable_mutex); 3234 3235 return 0; 3236 3237 err_del: 3238 kobject_del(scx_root_kobj); 3239 err: 3240 kobject_put(scx_root_kobj); 3241 scx_root_kobj = NULL; 3242 if (scx_exit_info) { 3243 free_exit_info(scx_exit_info); 3244 scx_exit_info = NULL; 3245 } 3246 err_unlock: 3247 mutex_unlock(&scx_ops_enable_mutex); 3248 return ret; 3249 3250 err_disable_unlock_all: 3251 percpu_up_write(&scx_fork_rwsem); 3252 err_disable_unlock_cpus: 3253 cpus_read_unlock(); 3254 err_disable: 3255 mutex_unlock(&scx_ops_enable_mutex); 3256 /* must be fully disabled before returning */ 3257 scx_ops_disable(SCX_EXIT_ERROR); 3258 kthread_flush_work(&scx_ops_disable_work); 3259 return ret; 3260 } 3261 3262 3263 /******************************************************************************** 3264 * bpf_struct_ops plumbing. 
3265 */ 3266 #include <linux/bpf_verifier.h> 3267 #include <linux/bpf.h> 3268 #include <linux/btf.h> 3269 3270 extern struct btf *btf_vmlinux; 3271 static const struct btf_type *task_struct_type; 3272 static u32 task_struct_type_id; 3273 3274 static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size, 3275 enum bpf_access_type type, 3276 const struct bpf_prog *prog, 3277 struct bpf_insn_access_aux *info) 3278 { 3279 struct btf *btf = bpf_get_btf_vmlinux(); 3280 const struct bpf_struct_ops_desc *st_ops_desc; 3281 const struct btf_member *member; 3282 const struct btf_type *t; 3283 u32 btf_id, member_idx; 3284 const char *mname; 3285 3286 /* struct_ops op args are all sequential, 64-bit numbers */ 3287 if (off != arg_n * sizeof(__u64)) 3288 return false; 3289 3290 /* btf_id should be the type id of struct sched_ext_ops */ 3291 btf_id = prog->aux->attach_btf_id; 3292 st_ops_desc = bpf_struct_ops_find(btf, btf_id); 3293 if (!st_ops_desc) 3294 return false; 3295 3296 /* BTF type of struct sched_ext_ops */ 3297 t = st_ops_desc->type; 3298 3299 member_idx = prog->expected_attach_type; 3300 if (member_idx >= btf_type_vlen(t)) 3301 return false; 3302 3303 /* 3304 * Get the member name of this struct_ops program, which corresponds to 3305 * a field in struct sched_ext_ops. For example, the member name of the 3306 * dispatch struct_ops program (callback) is "dispatch". 3307 */ 3308 member = &btf_type_member(t)[member_idx]; 3309 mname = btf_name_by_offset(btf_vmlinux, member->name_off); 3310 3311 if (!strcmp(mname, op)) { 3312 /* 3313 * The value is a pointer to a type (struct task_struct) given 3314 * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED), 3315 * however, can be a NULL (PTR_MAYBE_NULL). The BPF program 3316 * should check the pointer to make sure it is not NULL before 3317 * using it, or the verifier will reject the program. 3318 * 3319 * Longer term, this is something that should be addressed by 3320 * BTF, and be fully contained within the verifier. 
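		 *
		 * Illustrative sketch of the resulting requirement on the BPF
		 * side (hypothetical callback name and SHARED_DSQ id,
		 * struct_ops wiring omitted): @prev must be NULL tested
		 * before being dereferenced:
		 *
		 *	void example_dispatch(s32 cpu, struct task_struct *prev)
		 *	{
		 *		if (!scx_bpf_consume(SHARED_DSQ) && prev)
		 *			prev->scx.slice = SCX_SLICE_DFL;
		 *	}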
3321 */ 3322 info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED; 3323 info->btf = btf_vmlinux; 3324 info->btf_id = task_struct_type_id; 3325 3326 return true; 3327 } 3328 3329 return false; 3330 } 3331 3332 static bool bpf_scx_is_valid_access(int off, int size, 3333 enum bpf_access_type type, 3334 const struct bpf_prog *prog, 3335 struct bpf_insn_access_aux *info) 3336 { 3337 if (type != BPF_READ) 3338 return false; 3339 if (set_arg_maybe_null("dispatch", 1, off, size, type, prog, info) || 3340 set_arg_maybe_null("yield", 1, off, size, type, prog, info)) 3341 return true; 3342 if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) 3343 return false; 3344 if (off % size != 0) 3345 return false; 3346 3347 return btf_ctx_access(off, size, type, prog, info); 3348 } 3349 3350 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, 3351 const struct bpf_reg_state *reg, int off, 3352 int size) 3353 { 3354 const struct btf_type *t; 3355 3356 t = btf_type_by_id(reg->btf, reg->btf_id); 3357 if (t == task_struct_type) { 3358 if (off >= offsetof(struct task_struct, scx.slice) && 3359 off + size <= offsetofend(struct task_struct, scx.slice)) 3360 return SCALAR_VALUE; 3361 } 3362 3363 return -EACCES; 3364 } 3365 3366 static const struct bpf_func_proto * 3367 bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 3368 { 3369 switch (func_id) { 3370 case BPF_FUNC_task_storage_get: 3371 return &bpf_task_storage_get_proto; 3372 case BPF_FUNC_task_storage_delete: 3373 return &bpf_task_storage_delete_proto; 3374 default: 3375 return bpf_base_func_proto(func_id, prog); 3376 } 3377 } 3378 3379 static const struct bpf_verifier_ops bpf_scx_verifier_ops = { 3380 .get_func_proto = bpf_scx_get_func_proto, 3381 .is_valid_access = bpf_scx_is_valid_access, 3382 .btf_struct_access = bpf_scx_btf_struct_access, 3383 }; 3384 3385 static int bpf_scx_init_member(const struct btf_type *t, 3386 const struct btf_member *member, 3387 void *kdata, const void *udata) 3388 { 3389 const struct sched_ext_ops *uops = udata; 3390 struct sched_ext_ops *ops = kdata; 3391 u32 moff = __btf_member_bit_offset(t, member) / 8; 3392 int ret; 3393 3394 switch (moff) { 3395 case offsetof(struct sched_ext_ops, dispatch_max_batch): 3396 if (*(u32 *)(udata + moff) > INT_MAX) 3397 return -E2BIG; 3398 ops->dispatch_max_batch = *(u32 *)(udata + moff); 3399 return 1; 3400 case offsetof(struct sched_ext_ops, flags): 3401 if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) 3402 return -EINVAL; 3403 ops->flags = *(u64 *)(udata + moff); 3404 return 1; 3405 case offsetof(struct sched_ext_ops, name): 3406 ret = bpf_obj_name_cpy(ops->name, uops->name, 3407 sizeof(ops->name)); 3408 if (ret < 0) 3409 return ret; 3410 if (ret == 0) 3411 return -EINVAL; 3412 return 1; 3413 } 3414 3415 return 0; 3416 } 3417 3418 static int bpf_scx_check_member(const struct btf_type *t, 3419 const struct btf_member *member, 3420 const struct bpf_prog *prog) 3421 { 3422 u32 moff = __btf_member_bit_offset(t, member) / 8; 3423 3424 switch (moff) { 3425 case offsetof(struct sched_ext_ops, init_task): 3426 case offsetof(struct sched_ext_ops, init): 3427 case offsetof(struct sched_ext_ops, exit): 3428 break; 3429 default: 3430 if (prog->sleepable) 3431 return -EINVAL; 3432 } 3433 3434 return 0; 3435 } 3436 3437 static int bpf_scx_reg(void *kdata, struct bpf_link *link) 3438 { 3439 return scx_ops_enable(kdata, link); 3440 } 3441 3442 static void bpf_scx_unreg(void *kdata, struct bpf_link *link) 3443 { 3444 scx_ops_disable(SCX_EXIT_UNREG); 3445 
	kthread_flush_work(&scx_ops_disable_work);
3446 }
3447
3448 static int bpf_scx_init(struct btf *btf)
3449 {
3450 	s32 type_id;
3451
3452 	type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT);
3453 	if (type_id < 0)
3454 		return -EINVAL;
3455 	task_struct_type = btf_type_by_id(btf, type_id);
3456 	task_struct_type_id = type_id;
3457
3458 	return 0;
3459 }
3460
3461 static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link)
3462 {
3463 	/*
3464 	 * sched_ext does not support updating the actively-loaded BPF
3465 	 * scheduler, as registering a BPF scheduler can always fail if the
3466 	 * scheduler returns an error code for e.g. ops.init(), ops.init_task(),
3467 	 * etc. Similarly, we can always race with unregistration happening
3468 	 * elsewhere, such as with sysrq.
3469 	 */
3470 	return -EOPNOTSUPP;
3471 }
3472
3473 static int bpf_scx_validate(void *kdata)
3474 {
3475 	return 0;
3476 }
3477
3478 static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
3479 static void enqueue_stub(struct task_struct *p, u64 enq_flags) {}
3480 static void dequeue_stub(struct task_struct *p, u64 enq_flags) {}
3481 static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {}
3482 static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; }
3483 static void set_weight_stub(struct task_struct *p, u32 weight) {}
3484 static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {}
3485 static void update_idle_stub(s32 cpu, bool idle) {}
3486 static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
3487 static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {}
3488 static void enable_stub(struct task_struct *p) {}
3489 static void disable_stub(struct task_struct *p) {}
3490 static s32 init_stub(void) { return -EINVAL; }
3491 static void exit_stub(struct scx_exit_info *info) {}
3492
3493 static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
3494 	.select_cpu = select_cpu_stub,
3495 	.enqueue = enqueue_stub,
3496 	.dequeue = dequeue_stub,
3497 	.dispatch = dispatch_stub,
3498 	.yield = yield_stub,
3499 	.set_weight = set_weight_stub,
3500 	.set_cpumask = set_cpumask_stub,
3501 	.update_idle = update_idle_stub,
3502 	.init_task = init_task_stub,
3503 	.exit_task = exit_task_stub,
3504 	.enable = enable_stub,
3505 	.disable = disable_stub,
3506 	.init = init_stub,
3507 	.exit = exit_stub,
3508 };
3509
3510 static struct bpf_struct_ops bpf_sched_ext_ops = {
3511 	.verifier_ops = &bpf_scx_verifier_ops,
3512 	.reg = bpf_scx_reg,
3513 	.unreg = bpf_scx_unreg,
3514 	.check_member = bpf_scx_check_member,
3515 	.init_member = bpf_scx_init_member,
3516 	.init = bpf_scx_init,
3517 	.update = bpf_scx_update,
3518 	.validate = bpf_scx_validate,
3519 	.name = "sched_ext_ops",
3520 	.owner = THIS_MODULE,
3521 	.cfi_stubs = &__bpf_ops_sched_ext_ops
3522 };
3523
3524
3525 /********************************************************************************
3526  * System integration and init.
3527  */
3528
3529 void __init init_sched_ext_class(void)
3530 {
3531 	s32 cpu, v;
3532
3533 	/*
3534 	 * The following is to prevent the compiler from optimizing out the enum
3535 	 * definitions so that BPF scheduler implementations can use them
3536 	 * through the generated vmlinux.h.
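	 *
	 * For example (illustrative fragment of a hypothetical ops.enqueue()
	 * body), a BPF scheduler can then test these constants directly:
	 *
	 *	if (enq_flags & SCX_ENQ_LAST)
	 *		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);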
3537 */ 3538 WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP); 3539 3540 BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); 3541 init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); 3542 #ifdef CONFIG_SMP 3543 BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); 3544 BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); 3545 #endif 3546 for_each_possible_cpu(cpu) { 3547 struct rq *rq = cpu_rq(cpu); 3548 3549 init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); 3550 INIT_LIST_HEAD(&rq->scx.runnable_list); 3551 } 3552 } 3553 3554 3555 /******************************************************************************** 3556 * Helpers that can be called from the BPF scheduler. 3557 */ 3558 #include <linux/btf_ids.h> 3559 3560 __bpf_kfunc_start_defs(); 3561 3562 /** 3563 * scx_bpf_create_dsq - Create a custom DSQ 3564 * @dsq_id: DSQ to create 3565 * @node: NUMA node to allocate from 3566 * 3567 * Create a custom DSQ identified by @dsq_id. Can be called from ops.init() and 3568 * ops.init_task(). 3569 */ 3570 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) 3571 { 3572 if (!scx_kf_allowed(SCX_KF_SLEEPABLE)) 3573 return -EINVAL; 3574 3575 if (unlikely(node >= (int)nr_node_ids || 3576 (node < 0 && node != NUMA_NO_NODE))) 3577 return -EINVAL; 3578 return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); 3579 } 3580 3581 __bpf_kfunc_end_defs(); 3582 3583 BTF_KFUNCS_START(scx_kfunc_ids_sleepable) 3584 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) 3585 BTF_KFUNCS_END(scx_kfunc_ids_sleepable) 3586 3587 static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { 3588 .owner = THIS_MODULE, 3589 .set = &scx_kfunc_ids_sleepable, 3590 }; 3591 3592 __bpf_kfunc_start_defs(); 3593 3594 /** 3595 * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() 3596 * @p: task_struct to select a CPU for 3597 * @prev_cpu: CPU @p was on previously 3598 * @wake_flags: %SCX_WAKE_* flags 3599 * @is_idle: out parameter indicating whether the returned CPU is idle 3600 * 3601 * Can only be called from ops.select_cpu() if the built-in CPU selection is 3602 * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. 3603 * @p, @prev_cpu and @wake_flags match ops.select_cpu(). 3604 * 3605 * Returns the picked CPU with *@is_idle indicating whether the picked CPU is 3606 * currently idle and thus a good candidate for direct dispatching. 
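 *
 * For example, a minimal ops.select_cpu() built on this helper might look
 * like the following sketch (hypothetical callback name, struct_ops wiring
 * omitted); when an idle CPU is found, the task is direct-dispatched to that
 * CPU's local DSQ so ops.enqueue() is skipped:
 *
 *	s32 example_select_cpu(struct task_struct *p, s32 prev_cpu,
 *			       u64 wake_flags)
 *	{
 *		bool is_idle;
 *		s32 cpu;
 *
 *		cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
 *		if (is_idle)
 *			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 *		return cpu;
 *	}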
3607  */
3608 __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
3609 				       u64 wake_flags, bool *is_idle)
3610 {
3611 	if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) {
3612 		*is_idle = false;
3613 		return prev_cpu;
3614 	}
3615 #ifdef CONFIG_SMP
3616 	return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
3617 #else
3618 	*is_idle = false;
3619 	return prev_cpu;
3620 #endif
3621 }
3622
3623 __bpf_kfunc_end_defs();
3624
3625 BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
3626 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
3627 BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
3628
3629 static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
3630 	.owner = THIS_MODULE,
3631 	.set = &scx_kfunc_ids_select_cpu,
3632 };
3633
3634 static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags)
3635 {
3636 	if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
3637 		return false;
3638
3639 	lockdep_assert_irqs_disabled();
3640
3641 	if (unlikely(!p)) {
3642 		scx_ops_error("called with NULL task");
3643 		return false;
3644 	}
3645
3646 	if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
3647 		scx_ops_error("invalid enq_flags 0x%llx", enq_flags);
3648 		return false;
3649 	}
3650
3651 	return true;
3652 }
3653
3654 static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags)
3655 {
3656 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
3657 	struct task_struct *ddsp_task;
3658
3659 	ddsp_task = __this_cpu_read(direct_dispatch_task);
3660 	if (ddsp_task) {
3661 		mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags);
3662 		return;
3663 	}
3664
3665 	if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
3666 		scx_ops_error("dispatch buffer overflow");
3667 		return;
3668 	}
3669
3670 	dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){
3671 		.task = p,
3672 		.qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK,
3673 		.dsq_id = dsq_id,
3674 		.enq_flags = enq_flags,
3675 	};
3676 }
3677
3678 __bpf_kfunc_start_defs();
3679
3680 /**
3681  * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ
3682  * @p: task_struct to dispatch
3683  * @dsq_id: DSQ to dispatch to
3684  * @slice: duration @p can run for in nsecs
3685  * @enq_flags: SCX_ENQ_*
3686  *
3687  * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe
3688  * to call this function spuriously. Can be called from ops.enqueue(),
3689  * ops.select_cpu(), and ops.dispatch().
3690  *
3691  * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
3692  * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be
3693  * used to target the local DSQ of a CPU other than the enqueueing one. Use
3694  * ops.select_cpu() to be on the target CPU in the first place.
3695  *
3696  * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p
3697  * will be directly dispatched to the corresponding dispatch queue after
3698  * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be
3699  * dispatched to the local DSQ of the CPU returned by ops.select_cpu().
3700  * @enq_flags are OR'd with the enqueue flags on the enqueue path before the
3701  * task is dispatched.
3702  *
3703  * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
3704  * and this function can be called up to ops.dispatch_max_batch times to dispatch
3705  * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of remaining
3706  * slots. scx_bpf_consume() flushes the batch and resets the counter.
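 *
 * For example, the simplest possible ops.enqueue() is a global FIFO which
 * direct-dispatches every task (illustrative sketch; hypothetical callback
 * name, struct_ops wiring omitted):
 *
 *	void example_enqueue(struct task_struct *p, u64 enq_flags)
 *	{
 *		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
 *	}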
3707 * 3708 * This function doesn't have any locking restrictions and may be called under 3709 * BPF locks (in the future when BPF introduces more flexible locking). 3710 * 3711 * @p is allowed to run for @slice. The scheduling path is triggered on slice 3712 * exhaustion. If zero, the current residual slice is maintained. 3713 */ 3714 __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, 3715 u64 enq_flags) 3716 { 3717 if (!scx_dispatch_preamble(p, enq_flags)) 3718 return; 3719 3720 if (slice) 3721 p->scx.slice = slice; 3722 else 3723 p->scx.slice = p->scx.slice ?: 1; 3724 3725 scx_dispatch_commit(p, dsq_id, enq_flags); 3726 } 3727 3728 __bpf_kfunc_end_defs(); 3729 3730 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) 3731 BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) 3732 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) 3733 3734 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { 3735 .owner = THIS_MODULE, 3736 .set = &scx_kfunc_ids_enqueue_dispatch, 3737 }; 3738 3739 __bpf_kfunc_start_defs(); 3740 3741 /** 3742 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots 3743 * 3744 * Can only be called from ops.dispatch(). 3745 */ 3746 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) 3747 { 3748 if (!scx_kf_allowed(SCX_KF_DISPATCH)) 3749 return 0; 3750 3751 return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); 3752 } 3753 3754 /** 3755 * scx_bpf_dispatch_cancel - Cancel the latest dispatch 3756 * 3757 * Cancel the latest dispatch. Can be called multiple times to cancel further 3758 * dispatches. Can only be called from ops.dispatch(). 3759 */ 3760 __bpf_kfunc void scx_bpf_dispatch_cancel(void) 3761 { 3762 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 3763 3764 if (!scx_kf_allowed(SCX_KF_DISPATCH)) 3765 return; 3766 3767 if (dspc->cursor > 0) 3768 dspc->cursor--; 3769 else 3770 scx_ops_error("dispatch buffer underflow"); 3771 } 3772 3773 /** 3774 * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ 3775 * @dsq_id: DSQ to consume 3776 * 3777 * Consume a task from the non-local DSQ identified by @dsq_id and transfer it 3778 * to the current CPU's local DSQ for execution. Can only be called from 3779 * ops.dispatch(). 3780 * 3781 * This function flushes the in-flight dispatches from scx_bpf_dispatch() before 3782 * trying to consume the specified DSQ. It may also grab rq locks and thus can't 3783 * be called under any BPF locks. 3784 * 3785 * Returns %true if a task has been consumed, %false if there isn't any task to 3786 * consume. 3787 */ 3788 __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) 3789 { 3790 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 3791 struct scx_dispatch_q *dsq; 3792 3793 if (!scx_kf_allowed(SCX_KF_DISPATCH)) 3794 return false; 3795 3796 flush_dispatch_buf(dspc->rq, dspc->rf); 3797 3798 dsq = find_non_local_dsq(dsq_id); 3799 if (unlikely(!dsq)) { 3800 scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); 3801 return false; 3802 } 3803 3804 if (consume_dispatch_q(dspc->rq, dspc->rf, dsq)) { 3805 /* 3806 * A successfully consumed task can be dequeued before it starts 3807 * running while the CPU is trying to migrate other dispatched 3808 * tasks. Bump nr_tasks to tell balance_scx() to retry on empty 3809 * local DSQ. 
__bpf_kfunc_end_defs();

BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
BTF_ID_FLAGS(func, scx_bpf_consume)
BTF_KFUNCS_END(scx_kfunc_ids_dispatch)

static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
	.owner = THIS_MODULE,
	.set = &scx_kfunc_ids_dispatch,
};

__bpf_kfunc_start_defs();

/**
 * scx_bpf_dsq_nr_queued - Return the number of queued tasks
 * @dsq_id: id of the DSQ
 *
 * Return the number of tasks in the DSQ matching @dsq_id. If not found,
 * -%ENOENT is returned.
 */
__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
{
	struct scx_dispatch_q *dsq;
	s32 ret;

	preempt_disable();

	if (dsq_id == SCX_DSQ_LOCAL) {
		ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
		goto out;
	} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;

		if (ops_cpu_valid(cpu, NULL)) {
			ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
			goto out;
		}
	} else {
		dsq = find_non_local_dsq(dsq_id);
		if (dsq) {
			ret = READ_ONCE(dsq->nr);
			goto out;
		}
	}
	ret = -ENOENT;
out:
	preempt_enable();
	return ret;
}

/**
 * scx_bpf_destroy_dsq - Destroy a custom DSQ
 * @dsq_id: DSQ to destroy
 *
 * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with
 * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ
 * is empty and no further tasks are dispatched to it. Ignored if called on a
 * DSQ which doesn't exist. Can be called from any online scx_ops operations.
 */
__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
{
	destroy_dsq(dsq_id);
}

__bpf_kfunc_end_defs();

static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
			 char *fmt, unsigned long long *data, u32 data__sz)
{
	struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
	s32 ret;

	if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
	    (data__sz && !data)) {
		scx_ops_error("invalid data=%p and data__sz=%u",
			      (void *)data, data__sz);
		return -EINVAL;
	}

	ret = copy_from_kernel_nofault(data_buf, data, data__sz);
	if (ret < 0) {
		scx_ops_error("failed to read data fields (%d)", ret);
		return ret;
	}

	ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
				  &bprintf_data);
	if (ret < 0) {
		scx_ops_error("format preparation failed (%d)", ret);
		return ret;
	}

	ret = bstr_printf(line_buf, line_size, fmt, bprintf_data.bin_args);
	bpf_bprintf_cleanup(&bprintf_data);
	if (ret < 0) {
		scx_ops_error("(\"%s\", %p, %u) failed to format",
			      fmt, data, data__sz);
		return ret;
	}

	return ret;
}

static s32 bstr_format(struct scx_bstr_buf *buf,
		       char *fmt, unsigned long long *data, u32 data__sz)
{
	return __bstr_format(buf->data, buf->line, sizeof(buf->line),
			     fmt, data, data__sz);
}
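/*
 * bstr_format() above is the backend for scx_bpf_exit_bstr() and
 * scx_bpf_error_bstr() below. On the BPF side those kfuncs are usually reached
 * through convenience wrappers - scx_bpf_exit() and scx_bpf_error() in the
 * example schedulers' common BPF header (an assumption here) - which pack the
 * varargs into the data/data__sz pair for the verifier. Illustrative sketch;
 * the conditions and variables are hypothetical:
 *
 *	if (nr_enqueued > MAX_EXPECTED)		// hypothetical invariant
 *		scx_bpf_error("too many tasks queued (%lu)", nr_enqueued);
 *
 *	if (user_requested_shutdown)		// hypothetical condition
 *		scx_bpf_exit(0, "unloading on user request");
 *
 * Both calls end up in scx_ops_exit_kind() and initiate ops disabling; the
 * formatted line is reported to user space via struct scx_exit_info.
 */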
__bpf_kfunc_start_defs();

/**
 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
 * @exit_code: Exit value to pass to user space via struct scx_exit_info.
 * @fmt: error message format string
 * @data: format string parameters packaged using ___bpf_fill() macro
 * @data__sz: @data len, must end in '__sz' for the verifier
 *
 * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
 * disabling.
 */
__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
				   unsigned long long *data, u32 data__sz)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
	if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
		scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s",
				  scx_exit_bstr_buf.line);
	raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}

/**
 * scx_bpf_error_bstr - Indicate fatal error
 * @fmt: error message format string
 * @data: format string parameters packaged using ___bpf_fill() macro
 * @data__sz: @data len, must end in '__sz' for the verifier
 *
 * Indicate that the BPF scheduler encountered a fatal error and initiate ops
 * disabling.
 */
__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
				    u32 data__sz)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
	if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
		scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s",
				  scx_exit_bstr_buf.line);
	raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}

/**
 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
 *
 * All valid CPU IDs in the system are smaller than the returned value.
 */
__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
{
	return nr_cpu_ids;
}

/**
 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
 */
__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
{
	return cpu_possible_mask;
}

/**
 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
 */
__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
{
	return cpu_online_mask;
}

/**
 * scx_bpf_put_cpumask - Release a possible/online cpumask
 * @cpumask: cpumask to release
 */
__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
{
	/*
	 * Empty function body because we aren't actually acquiring or
	 * releasing a reference to a global cpumask, which is read-only in the
	 * caller and is never released. The acquire / release semantics here
	 * are just used to make the cpumask a trusted pointer in the caller.
	 */
}
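/*
 * Usage sketch (illustrative only): the acquire/release pair above is how a
 * BPF scheduler inspects the online CPUs. bpf_cpumask_first() is the generic
 * BPF cpumask kfunc; picking the first online CPU is just a stand-in for
 * whatever the scheduler actually wants to compute from the mask.
 *
 *	const struct cpumask *online;
 *	s32 cpu;
 *
 *	online = scx_bpf_get_online_cpumask();
 *	cpu = bpf_cpumask_first(online);	// e.g. some per-mask computation
 *	scx_bpf_put_cpumask(online);		// must be released before returning
 *
 * scx_bpf_nr_cpu_ids() is the matching bound for loops and arrays indexed by
 * CPU ID, since every valid ID is smaller than the returned value.
 */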
/**
 * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
 * per-CPU cpumask.
 *
 * Returns an empty cpumask if idle tracking is not enabled, or if running on
 * a UP kernel.
 */
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
		scx_ops_error("built-in idle tracking is disabled");
		return cpu_none_mask;
	}

#ifdef CONFIG_SMP
	return idle_masks.cpu;
#else
	return cpu_none_mask;
#endif
}

/**
 * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
 * per-physical-core cpumask. Can be used to determine if an entire physical
 * core is free.
 *
 * Returns an empty cpumask if idle tracking is not enabled, or if running on
 * a UP kernel.
 */
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
		scx_ops_error("built-in idle tracking is disabled");
		return cpu_none_mask;
	}

#ifdef CONFIG_SMP
	if (sched_smt_active())
		return idle_masks.smt;
	else
		return idle_masks.cpu;
#else
	return cpu_none_mask;
#endif
}

/**
 * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
 * either the percpu, or SMT idle-tracking cpumask.
 */
__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
{
	/*
	 * Empty function body because we aren't actually acquiring or
	 * releasing a reference to a global idle cpumask, which is read-only
	 * in the caller and is never released. The acquire / release semantics
	 * here are just used to make the cpumask a trusted pointer in the
	 * caller.
	 */
}

/**
 * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
 * @cpu: cpu to test and clear idle for
 *
 * Returns %true if @cpu was idle and its idle state was successfully cleared,
 * %false otherwise.
 *
 * Unavailable if ops.update_idle() is implemented and
 * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
 */
__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
{
	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
		scx_ops_error("built-in idle tracking is disabled");
		return false;
	}

	if (ops_cpu_valid(cpu, NULL))
		return test_and_clear_cpu_idle(cpu);
	else
		return false;
}

/**
 * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
 * @cpus_allowed: Allowed cpumask
 * @flags: %SCX_PICK_IDLE_CPU_* flags
 *
 * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
 * number on success. -%EBUSY if no matching cpu was found.
 *
 * Idle CPU tracking may race against CPU scheduling state transitions. For
 * example, this function may return -%EBUSY as CPUs are transitioning into the
 * idle state. If the caller then assumes that there will be dispatch events on
 * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
 * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
 * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
 * event in the near future.
 *
 * Unavailable if ops.update_idle() is implemented and
 * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
 */
__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
				      u64 flags)
{
	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
		scx_ops_error("built-in idle tracking is disabled");
		return -EBUSY;
	}

	return scx_pick_idle_cpu(cpus_allowed, flags);
}
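/*
 * Usage sketch (illustrative only): a custom ops.select_cpu() built on the
 * kfunc above. %SCX_PICK_IDLE_CORE restricts the pick to CPUs whose whole
 * physical core is idle; the fallback of simply keeping the task on
 * @prev_cpu is an assumption, not a recommendation.
 *
 *	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
 *			   s32 prev_cpu, u64 wake_flags)
 *	{
 *		s32 cpu;
 *
 *		// prefer a CPU whose entire physical core is idle
 *		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
 *		if (cpu >= 0)
 *			return cpu;
 *
 *		// no idle core claimed, keep the task where it was
 *		return prev_cpu;
 *	}
 *
 * As the kernel-doc above notes, a scheduler that must guarantee forward
 * progress on -%EBUSY should fall back to scx_bpf_pick_any_cpu() and
 * scx_bpf_kick_cpu() rather than assuming the CPUs will stay busy.
 */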
/**
 * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
 * @cpus_allowed: Allowed cpumask
 * @flags: %SCX_PICK_IDLE_CPU_* flags
 *
 * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
 * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked CPU
 * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed
 * is empty.
 *
 * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
 * set, this function can't tell which CPUs are idle and will always pick any
 * CPU.
 */
__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
				     u64 flags)
{
	s32 cpu;

	if (static_branch_likely(&scx_builtin_idle_enabled)) {
		cpu = scx_pick_idle_cpu(cpus_allowed, flags);
		if (cpu >= 0)
			return cpu;
	}

	cpu = cpumask_any_distribute(cpus_allowed);
	if (cpu < nr_cpu_ids)
		return cpu;
	else
		return -EBUSY;
}

/**
 * scx_bpf_task_running - Is task currently running?
 * @p: task of interest
 */
__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
{
	return task_rq(p)->curr == p;
}

/**
 * scx_bpf_task_cpu - CPU a task is currently associated with
 * @p: task of interest
 */
__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
{
	return task_cpu(p);
}

__bpf_kfunc_end_defs();

BTF_KFUNCS_START(scx_kfunc_ids_any)
BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_any)

static const struct btf_kfunc_id_set scx_kfunc_set_any = {
	.owner = THIS_MODULE,
	.set = &scx_kfunc_ids_any,
};
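/*
 * Usage sketch (illustrative only) for the DSQ kfuncs registered in the set
 * above: a scheduler with two custom DSQs (hypothetical IDs DSQ_A and DSQ_B,
 * created via scx_bpf_create_dsq()) might steer each enqueue toward the
 * shorter queue. Error returns from scx_bpf_dsq_nr_queued() are ignored for
 * brevity.
 *
 *	void BPF_STRUCT_OPS(twoq_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		u64 dsq = scx_bpf_dsq_nr_queued(DSQ_A) <=
 *			  scx_bpf_dsq_nr_queued(DSQ_B) ? DSQ_A : DSQ_B;
 *
 *		scx_bpf_dispatch(p, dsq, SCX_SLICE_DFL, enq_flags);
 *	}
 *
 * scx_bpf_destroy_dsq() undoes scx_bpf_create_dsq() once a DSQ is drained and
 * no longer targeted by dispatches.
 */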
static int __init scx_init(void)
{
	int ret;

	/*
	 * kfunc registration can't be done from init_sched_ext_class() as
	 * register_btf_kfunc_id_set() needs most of the system to be up.
	 *
	 * Some kfuncs are context-sensitive and can only be called from
	 * specific SCX ops. They are grouped into BTF sets accordingly.
	 * Unfortunately, BPF currently doesn't have a way of enforcing such
	 * restrictions. Eventually, the verifier should be able to enforce
	 * them. For now, register them the same and make each kfunc explicitly
	 * check using scx_kf_allowed().
	 */
	if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
					     &scx_kfunc_set_sleepable)) ||
	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
					     &scx_kfunc_set_select_cpu)) ||
	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
					     &scx_kfunc_set_enqueue_dispatch)) ||
	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
					     &scx_kfunc_set_dispatch)) ||
	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
					     &scx_kfunc_set_any)) ||
	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					     &scx_kfunc_set_any)) ||
	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
					     &scx_kfunc_set_any))) {
		pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret);
		return ret;
	}

	ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
	if (ret) {
		pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
		return ret;
	}

	scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj);
	if (!scx_kset) {
		pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n");
		return -ENOMEM;
	}

	ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group);
	if (ret < 0) {
		pr_err("sched_ext: Failed to add global attributes\n");
		return ret;
	}

	return 0;
}
__initcall(scx_init);
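/*
 * From user space, a scheduler written against the sched_ext_ops struct_ops
 * registered above is loaded with libbpf. A minimal sketch, assuming a
 * bpftool-generated skeleton named example_bpf whose ops table lives in a map
 * named example_ops (both names are assumptions for illustration):
 *
 *	struct example_bpf *skel = example_bpf__open_and_load();
 *	struct bpf_link *link;
 *
 *	link = bpf_map__attach_struct_ops(skel->maps.example_ops);
 *	if (!link) {
 *		// the kernel rejected or failed to enable the scheduler;
 *		// details are reported through ops.exit() / struct scx_exit_info
 *	}
 *
 * Destroying the link (or the loading process exiting) triggers
 * user-space-initiated unregistration, i.e. an %SCX_EXIT_UNREG exit.
 */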