/* SPDX-License-Identifier: GPL-2.0 */
/*
 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <linux/btf_ids.h>
#include "ext_idle.h"

static DEFINE_RAW_SPINLOCK(scx_sched_lock);

/*
 * NOTE: sched_ext is in the process of growing multiple scheduler support and
 * scx_root usage is in a transitional state. Naked dereferences are safe if the
 * caller is one of the tasks attached to SCX and explicit RCU dereference is
 * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but
 * are used as temporary markers to indicate that the dereferences need to be
 * updated to point to the associated scheduler instances rather than scx_root.
 */
struct scx_sched __rcu *scx_root;

/*
 * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock.
 * Readers can hold either or rcu_read_lock().
 */
static LIST_HEAD(scx_sched_all);

#ifdef CONFIG_EXT_SUB_SCHED
static const struct rhashtable_params scx_sched_hash_params = {
	.key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id),
	.key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id),
	.head_offset = offsetof(struct scx_sched, hash_node),
	.insecure_elasticity = true,	/* inserted under scx_sched_lock */
};

static struct rhashtable scx_sched_hash;
#endif

/*
 * During exit, a task may schedule after losing its PIDs. When disabling the
 * BPF scheduler, we need to be able to iterate tasks in every state to
 * guarantee system safety. Maintain a dedicated task list which contains every
 * task between its fork and eventual free.
 */
static DEFINE_RAW_SPINLOCK(scx_tasks_lock);
static LIST_HEAD(scx_tasks);

/* ops enable/disable */
static DEFINE_MUTEX(scx_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
static DEFINE_RAW_SPINLOCK(scx_bypass_lock);
static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);

static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);

#ifdef CONFIG_EXT_SUB_SCHED
/*
 * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit
 * tasks for the sub-sched being enabled. Use a global variable instead of a
 * per-task field as all enables are serialized.
 */
static struct scx_sched *scx_enabling_sub_sched;
#else
#define scx_enabling_sub_sched (struct scx_sched *)NULL
#endif	/* CONFIG_EXT_SUB_SCHED */

/*
 * A monotonically increasing sequence number that is incremented every time a
 * scheduler is enabled. This can be used to check if any custom sched_ext
 * scheduler has ever been used in the system.
 */
static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);

/*
 * Watchdog interval. All scx_sched's share a single watchdog timer and the
 * interval is half of the shortest sch->watchdog_timeout.
 */
static unsigned long scx_watchdog_interval;

/*
 * The last time the delayed work was run. This delayed work relies on
 * ksoftirqd being able to run to service timer interrupts, so it's possible
 * that this work itself could get wedged. To account for this, we check that
 * it's not stalled in the timer tick, and trigger an error if it is.
 */
static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;

static struct delayed_work scx_watchdog_work;

/*
 * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence
 * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
 * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
 * lazily when enabling and freed when disabling to avoid waste when sched_ext
 * isn't active.
 */
struct scx_kick_syncs {
	struct rcu_head		rcu;
	unsigned long		syncs[];
};

static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);

/*
 * Direct dispatch marker.
 *
 * Non-NULL values are used for direct dispatch from enqueue path. A valid
 * pointer points to the task currently being enqueued. An ERR_PTR value is used
 * to indicate that direct dispatch has already happened.
 */
static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);

static const struct rhashtable_params dsq_hash_params = {
	.key_len = sizeof_field(struct scx_dispatch_q, id),
	.key_offset = offsetof(struct scx_dispatch_q, id),
	.head_offset = offsetof(struct scx_dispatch_q, hash_node),
};

static LLIST_HEAD(dsqs_to_free);

/* string formatting from BPF */
struct scx_bstr_buf {
	u64	data[MAX_BPRINTF_VARARGS];
	char	line[SCX_EXIT_MSG_LEN];
};

static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock);
static struct scx_bstr_buf scx_exit_bstr_buf;

/* ops debug dump */
static DEFINE_RAW_SPINLOCK(scx_dump_lock);

struct scx_dump_data {
	s32			cpu;
	bool			first;
	s32			cursor;
	struct seq_buf		*s;
	const char		*prefix;
	struct scx_bstr_buf	buf;
};

static struct scx_dump_data scx_dump_data = {
	.cpu = -1,
};

/* /sys/kernel/sched_ext interface */
static struct kset *scx_kset;

/*
 * Parameters that can be adjusted through /sys/module/sched_ext/parameters.
 * There usually is no reason to modify these as normal scheduler operation
 * shouldn't be affected by them. The knobs are primarily for debugging.
 */
static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US;

static int set_slice_us(const char *val, const struct kernel_param *kp)
{
	return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC);
}

static const struct kernel_param_ops slice_us_param_ops = {
	.set = set_slice_us,
	.get = param_get_uint,
};

static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp)
{
	return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC);
}

static const struct kernel_param_ops bypass_lb_intv_us_param_ops = {
	.set = set_bypass_lb_intv_us,
	.get = param_get_uint,
};

#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "sched_ext."

module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600);
MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)");
module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600);
MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)");

#undef MODULE_PARAM_PREFIX

#define CREATE_TRACE_POINTS
#include <trace/events/sched_ext.h>

static void run_deferred(struct rq *rq);
static bool task_dead_and_done(struct task_struct *p);
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind);
static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
		      s64 exit_code, const char *fmt, va_list args);

static __printf(4, 5) bool scx_exit(struct scx_sched *sch,
				    enum scx_exit_kind kind, s64 exit_code,
				    const char *fmt, ...)
{
	va_list args;
	bool ret;

	va_start(args, fmt);
	ret = scx_vexit(sch, kind, exit_code, fmt, args);
	va_end(args);

	return ret;
}

#define scx_error(sch, fmt, args...)	scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
#define scx_verror(sch, fmt, args)	scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args)

#define SCX_HAS_OP(sch, op)	test_bit(SCX_OP_IDX(op), (sch)->has_op)

static long jiffies_delta_msecs(unsigned long at, unsigned long now)
{
	if (time_after(at, now))
		return jiffies_to_msecs(at - now);
	else
		return -(long)jiffies_to_msecs(now - at);
}

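/*
 * Wraparound-safe "a comes before b" test for 32-bit sequence numbers such as
 * dsq_seq. The subtraction is evaluated modulo 2^32 and interpreted as signed,
 * so e.g. u32_before(U32_MAX, 0) is true even though U32_MAX > 0 numerically.
 */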
static bool u32_before(u32 a, u32 b)
{
	return (s32)(a - b) < 0;
}

#ifdef CONFIG_EXT_SUB_SCHED
/**
 * scx_parent - Find the parent sched
 * @sch: sched to find the parent of
 *
 * Returns the parent scheduler or %NULL if @sch is root.
 */
static struct scx_sched *scx_parent(struct scx_sched *sch)
{
	if (sch->level)
		return sch->ancestors[sch->level - 1];
	else
		return NULL;
}

/**
 * scx_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: sched whose descendants to walk
 *
 * To be used by scx_for_each_descendant_pre(). Find the next descendant to
 * visit for pre-order traversal of @root's descendants. @root is included in
 * the iteration and the first node to be visited.
 */
static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos,
						 struct scx_sched *root)
{
	struct scx_sched *next;

	lockdep_assert(lockdep_is_held(&scx_enable_mutex) ||
		       lockdep_is_held(&scx_sched_lock));

	/* if first iteration, visit @root */
	if (!pos)
		return root;

	/* visit the first child if exists */
	next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling);
	if (next)
		return next;

	/* no child, visit my or the closest ancestor's next sibling */
	while (pos != root) {
		if (!list_is_last(&pos->sibling, &scx_parent(pos)->children))
			return list_next_entry(pos, sibling);
		pos = scx_parent(pos);
	}

	return NULL;
}

static struct scx_sched *scx_find_sub_sched(u64 cgroup_id)
{
	return rhashtable_lookup(&scx_sched_hash, &cgroup_id,
				 scx_sched_hash_params);
}

static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch)
{
	rcu_assign_pointer(p->scx.sched, sch);
}
#else	/* CONFIG_EXT_SUB_SCHED */
static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; }
static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; }
static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {}
#endif	/* CONFIG_EXT_SUB_SCHED */

/**
 * scx_is_descendant - Test whether sched is a descendant
 * @sch: sched to test
 * @ancestor: ancestor sched to test against
 *
 * Test whether @sch is a descendant of @ancestor.
 */
static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor)
{
	if (sch->level < ancestor->level)
		return false;
	return sch->ancestors[ancestor->level] == ancestor;
}

/**
 * scx_for_each_descendant_pre - pre-order walk of a sched's descendants
 * @pos: iteration cursor
 * @root: sched to walk the descendants of
 *
 * Walk @root's descendants. @root is included in the iteration and the first
 * node to be visited. Must be called with either scx_enable_mutex or
 * scx_sched_lock held.
 */
#define scx_for_each_descendant_pre(pos, root)				\
	for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos);	\
	     (pos) = scx_next_descendant_pre((pos), (root)))
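
/*
 * Example (illustrative only): visiting every scheduler in @root's subtree in
 * pre-order, e.g. to apply a change top-down before any descendant sees it:
 *
 *	struct scx_sched *pos;
 *
 *	scx_for_each_descendant_pre(pos, root) {
 *		... @pos is visited before any of its descendants ...
 *	}
 *
 * The caller must hold scx_enable_mutex or scx_sched_lock for the duration of
 * the walk.
 */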

static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, s32 cpu)
{
	return &sch->pnode[cpu_to_node(cpu)]->global_dsq;
}

static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
{
	return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
}

static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
{
	if (p->sched_class == &stop_sched_class)
		return &stop_sched_class;

	return __setscheduler_class(p->policy, p->prio);
}

static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu)
{
	return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
}

static struct scx_dispatch_q *bypass_enq_target_dsq(struct scx_sched *sch, s32 cpu)
{
#ifdef CONFIG_EXT_SUB_SCHED
	/*
	 * If @sch is a sub-sched which is bypassing, its tasks should go into
	 * the bypass DSQs of the nearest ancestor which is not bypassing. The
	 * not-bypassing ancestor is responsible for scheduling all tasks from
	 * bypassing sub-trees. If all ancestors including root are bypassing,
	 * all tasks should go to the root's bypass DSQs.
	 *
	 * Whenever a sched starts bypassing, all runnable tasks in its subtree
	 * are re-enqueued after scx_bypassing() is turned on, guaranteeing that
	 * all tasks are transferred to the right DSQs.
	 */
	while (scx_parent(sch) && scx_bypassing(sch, cpu))
		sch = scx_parent(sch);
#endif	/* CONFIG_EXT_SUB_SCHED */

	return bypass_dsq(sch, cpu);
}

/**
 * bypass_dsp_enabled - Check if bypass dispatch path is enabled
 * @sch: scheduler to check
 *
 * When a descendant scheduler enters bypass mode, bypassed tasks are scheduled
 * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors
 * are bypassing. In the former case, the ancestor is not itself bypassing but
 * its bypass DSQs will be populated with bypassed tasks from descendants. Thus,
 * the ancestor's bypass dispatch path must be active even though its own
 * bypass_depth remains zero.
 *
 * This function checks bypass_dsp_enable_depth which is managed separately from
 * bypass_depth to enable this decoupling. See enable_bypass_dsp() and
 * disable_bypass_dsp().
 */
static bool bypass_dsp_enabled(struct scx_sched *sch)
{
	return unlikely(atomic_read(&sch->bypass_dsp_enable_depth));
}

/**
 * rq_is_open - Is the rq available for immediate execution of an SCX task?
 * @rq: rq to test
 * @enq_flags: optional %SCX_ENQ_* of the task being enqueued
 *
 * Returns %true if @rq is currently open for executing an SCX task. After a
 * %false return, @rq is guaranteed to invoke the SCX dispatch path at least
 * once before going idle, so not inserting a task into @rq's local DSQ after a
 * %false return doesn't cause @rq to stall.
 */
static bool rq_is_open(struct rq *rq, u64 enq_flags)
{
	lockdep_assert_rq_held(rq);

	/*
	 * A higher-priority class task is either running or in the process of
	 * waking up on @rq.
	 */
	if (sched_class_above(rq->next_class, &ext_sched_class))
		return false;

	/*
	 * @rq is either in transition to or in idle and there is no
	 * higher-priority class task waking up on it.
	 */
	if (sched_class_above(&ext_sched_class, rq->next_class))
		return true;

	/*
	 * @rq is either picking, in transition to, or running an SCX task.
	 */

	/*
	 * If we're in the dispatch path holding rq lock, $curr may or may not
	 * be ready depending on whether the on-going dispatch decides to extend
	 * $curr's slice. We say yes here and resolve it at the end of dispatch.
	 * See balance_one().
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE)
		return true;

	/*
	 * %SCX_ENQ_PREEMPT clears $curr's slice if on SCX and kicks dispatch,
	 * so allow it to avoid spuriously triggering reenq on a combined
	 * PREEMPT|IMMED insertion.
	 */
	if (enq_flags & SCX_ENQ_PREEMPT)
		return true;

	/*
	 * @rq is either in transition to or running an SCX task and can't go
	 * idle without another SCX dispatch cycle.
	 */
	return false;
}

/*
 * Track the rq currently locked.
 *
 * This allows kfuncs to safely operate on rq from any scx ops callback,
 * knowing which rq is already locked.
 */
DEFINE_PER_CPU(struct rq *, scx_locked_rq_state);

static inline void update_locked_rq(struct rq *rq)
{
	/*
	 * Check whether @rq is actually locked. This can help expose bugs
	 * or incorrect assumptions about the context in which a kfunc or
	 * callback is executed.
	 */
	if (rq)
		lockdep_assert_rq_held(rq);
	__this_cpu_write(scx_locked_rq_state, rq);
}

/*
 * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not
 * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit.
 */
#define SCX_CALL_OP(sch, op, locked_rq, args...)			\
do {									\
	struct rq *__prev_locked_rq;					\
									\
	if (locked_rq) {						\
		__prev_locked_rq = scx_locked_rq();			\
		update_locked_rq(locked_rq);				\
	}								\
	(sch)->ops.op(args);						\
	if (locked_rq)							\
		update_locked_rq(__prev_locked_rq);			\
} while (0)

#define SCX_CALL_OP_RET(sch, op, locked_rq, args...)			\
({									\
	struct rq *__prev_locked_rq;					\
	__typeof__((sch)->ops.op(args)) __ret;				\
									\
	if (locked_rq) {						\
		__prev_locked_rq = scx_locked_rq();			\
		update_locked_rq(locked_rq);				\
	}								\
	__ret = (sch)->ops.op(args);					\
	if (locked_rq)							\
		update_locked_rq(__prev_locked_rq);			\
	__ret;								\
})

/*
 * SCX_CALL_OP_TASK*() invokes an SCX op that takes one or two task arguments
 * and records them in current->scx.kf_tasks[] for the duration of the call. A
 * kfunc invoked from inside such an op can then use scx_kf_arg_task_ok() to
 * verify that its task argument is one of those subject tasks.
 *
 * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held -
 * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's
 * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu.
 * So if kf_tasks[] is set, @p's scheduler-protected fields are stable.
 *
 * kf_tasks[] can not stack, so task-based SCX ops must not nest. The
 * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants
 * while a previous one is still in progress.
 */
#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...)		\
do {									\
	WARN_ON_ONCE(current->scx.kf_tasks[0]);				\
	current->scx.kf_tasks[0] = task;				\
	SCX_CALL_OP((sch), op, locked_rq, task, ##args);		\
	current->scx.kf_tasks[0] = NULL;				\
} while (0)

#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...)		\
({									\
	__typeof__((sch)->ops.op(task, ##args)) __ret;			\
	WARN_ON_ONCE(current->scx.kf_tasks[0]);				\
	current->scx.kf_tasks[0] = task;				\
	__ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args);	\
	current->scx.kf_tasks[0] = NULL;				\
	__ret;								\
})

#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...)	\
({										\
	__typeof__((sch)->ops.op(task0, task1, ##args)) __ret;			\
	WARN_ON_ONCE(current->scx.kf_tasks[0]);					\
	current->scx.kf_tasks[0] = task0;					\
	current->scx.kf_tasks[1] = task1;					\
	__ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args);	\
	current->scx.kf_tasks[0] = NULL;					\
	current->scx.kf_tasks[1] = NULL;					\
	__ret;									\
})

/* see SCX_CALL_OP_TASK() */
static __always_inline bool scx_kf_arg_task_ok(struct scx_sched *sch,
					       struct task_struct *p)
{
	if (unlikely((p != current->scx.kf_tasks[0] &&
		      p != current->scx.kf_tasks[1]))) {
		scx_error(sch, "called on a task not being operated on");
		return false;
	}

	return true;
}
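
/*
 * For example, ops.enqueue() is invoked from the enqueue path roughly as
 *
 *	SCX_CALL_OP_TASK(sch, enqueue, rq, p, enq_flags);
 *
 * with @p's rq locked, so a kfunc called from inside ops.enqueue() can pass @p
 * to scx_kf_arg_task_ok() and then operate on it knowing that its rq is held.
 * See do_enqueue_task() and call_task_dequeue() below.
 */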

enum scx_dsq_iter_flags {
	/* iterate in the reverse dispatch order */
	SCX_DSQ_ITER_REV		= 1U << 16,

	__SCX_DSQ_ITER_HAS_SLICE	= 1U << 30,
	__SCX_DSQ_ITER_HAS_VTIME	= 1U << 31,

	__SCX_DSQ_ITER_USER_FLAGS	= SCX_DSQ_ITER_REV,
	__SCX_DSQ_ITER_ALL_FLAGS	= __SCX_DSQ_ITER_USER_FLAGS |
					  __SCX_DSQ_ITER_HAS_SLICE |
					  __SCX_DSQ_ITER_HAS_VTIME,
};

/**
 * nldsq_next_task - Iterate to the next task in a non-local DSQ
 * @dsq: non-local dsq being iterated
 * @cur: current position, %NULL to start iteration
 * @rev: walk backwards
 *
 * Returns %NULL when iteration is finished.
 */
static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
					   struct task_struct *cur, bool rev)
{
	struct list_head *list_node;
	struct scx_dsq_list_node *dsq_lnode;

	lockdep_assert_held(&dsq->lock);

	if (cur)
		list_node = &cur->scx.dsq_list.node;
	else
		list_node = &dsq->list;

	/* find the next task, need to skip BPF iteration cursors */
	do {
		if (rev)
			list_node = list_node->prev;
		else
			list_node = list_node->next;

		if (list_node == &dsq->list)
			return NULL;

		dsq_lnode = container_of(list_node, struct scx_dsq_list_node,
					 node);
	} while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR);

	return container_of(dsq_lnode, struct task_struct, scx.dsq_list);
}

#define nldsq_for_each_task(p, dsq)					\
	for ((p) = nldsq_next_task((dsq), NULL, false); (p);		\
	     (p) = nldsq_next_task((dsq), (p), false))

/**
 * nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ
 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR()
 * @dsq: non-local dsq being iterated
 *
 * Find the next task in a cursor based iteration. The caller must have
 * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock
 * between the iteration steps.
 *
 * Only tasks which were queued before @cursor was initialized are visible. This
 * bounds the iteration and guarantees that vtime never jumps in the other
 * direction while iterating.
 */
static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor,
						  struct scx_dispatch_q *dsq)
{
	bool rev = cursor->flags & SCX_DSQ_ITER_REV;
	struct task_struct *p;

	lockdep_assert_held(&dsq->lock);
	BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR));

	if (list_empty(&cursor->node))
		p = NULL;
	else
		p = container_of(cursor, struct task_struct, scx.dsq_list);

	/* skip cursors and tasks that were queued after @cursor init */
	do {
		p = nldsq_next_task(dsq, p, rev);
	} while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq)));

	if (p) {
		if (rev)
			list_move_tail(&cursor->node, &p->scx.dsq_list.node);
		else
			list_move(&cursor->node, &p->scx.dsq_list.node);
	} else {
		list_del_init(&cursor->node);
	}

	return p;
}

/**
 * nldsq_cursor_lost_task - Test whether someone else took the task since iteration
 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR()
 * @rq: rq @p was on
 * @dsq: dsq @p was on
 * @p: target task
 *
 * @p is a task returned by nldsq_cursor_next_task(). The locks may have been
 * dropped and re-acquired in between. Verify that no one else took or is in the
 * process of taking @p from @dsq.
 *
 * On %false return, the caller can assume full ownership of @p.
 */
static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor,
				   struct rq *rq, struct scx_dispatch_q *dsq,
				   struct task_struct *p)
{
	lockdep_assert_rq_held(rq);
	lockdep_assert_held(&dsq->lock);

	/*
	 * @p could have already left $src_dsq, got re-enqueued, or be in the
	 * process of being consumed by someone else.
	 */
	if (unlikely(p->scx.dsq != dsq ||
		     u32_before(cursor->priv, p->scx.dsq_seq) ||
		     p->scx.holding_cpu >= 0))
		return true;

	/* if @p has stayed on @dsq, its rq couldn't have changed */
	if (WARN_ON_ONCE(rq != task_rq(p)))
		return true;

	return false;
}
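
/*
 * Illustrative cursor-based iteration (a sketch; the argument details of
 * INIT_DSQ_LIST_CURSOR() are omitted): the DSQ lock may be dropped between
 * steps and each returned task must be re-validated before it is acted upon.
 *
 *	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(...);
 *	struct task_struct *p;
 *
 *	raw_spin_lock(&dsq->lock);
 *	while ((p = nldsq_cursor_next_task(&cursor, dsq))) {
 *		... locks may be dropped and re-acquired here ...
 *		if (nldsq_cursor_lost_task(&cursor, task_rq(p), dsq, p))
 *			continue;
 *		... the caller now owns @p ...
 *	}
 *	raw_spin_unlock(&dsq->lock);
 */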

/*
 * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
 * dispatch order. BPF-visible iterator is opaque and larger to allow future
 * changes without breaking backward compatibility. Can be used with
 * bpf_for_each(). See bpf_iter_scx_dsq_*().
 */
struct bpf_iter_scx_dsq_kern {
	struct scx_dsq_list_node	cursor;
	struct scx_dispatch_q		*dsq;
	u64				slice;
	u64				vtime;
} __attribute__((aligned(8)));

struct bpf_iter_scx_dsq {
	u64				__opaque[6];
} __attribute__((aligned(8)));

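/*
 * Task state transitions enforced by scx_set_task_state() below:
 *
 *   NONE -> INIT_BEGIN -> INIT -> READY <-> ENABLED
 *
 * Any state other than DEAD may drop back to NONE, and NONE or INIT_BEGIN may
 * transition to DEAD. Everything else triggers a warning.
 */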
static u32 scx_get_task_state(const struct task_struct *p)
{
	return p->scx.flags & SCX_TASK_STATE_MASK;
}

static void scx_set_task_state(struct task_struct *p, u32 state)
{
	u32 prev_state = scx_get_task_state(p);
	bool warn = false;

	switch (state) {
	case SCX_TASK_NONE:
		warn = prev_state == SCX_TASK_DEAD;
		break;
	case SCX_TASK_INIT_BEGIN:
		warn = prev_state != SCX_TASK_NONE;
		break;
	case SCX_TASK_INIT:
		warn = prev_state != SCX_TASK_INIT_BEGIN;
		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
		break;
	case SCX_TASK_READY:
		warn = !(prev_state == SCX_TASK_INIT ||
			 prev_state == SCX_TASK_ENABLED);
		break;
	case SCX_TASK_ENABLED:
		warn = prev_state != SCX_TASK_READY;
		break;
	case SCX_TASK_DEAD:
		warn = !(prev_state == SCX_TASK_NONE ||
			 prev_state == SCX_TASK_INIT_BEGIN);
		break;
	default:
		WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]",
			  prev_state, state, p->comm, p->pid);
		return;
	}

	WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]",
		  prev_state, state, p->comm, p->pid);

	p->scx.flags &= ~SCX_TASK_STATE_MASK;
	p->scx.flags |= state;
}

/*
 * SCX task iterator.
 */
struct scx_task_iter {
	struct sched_ext_entity		cursor;
	struct task_struct		*locked_task;
	struct rq			*rq;
	struct rq_flags			rf;
	u32				cnt;
	bool				list_locked;
#ifdef CONFIG_EXT_SUB_SCHED
	struct cgroup			*cgrp;
	struct cgroup_subsys_state	*css_pos;
	struct css_task_iter		css_iter;
#endif
};

/**
 * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
 * @iter: iterator to init
 * @cgrp: Optional root of cgroup subhierarchy to iterate
 *
 * Initialize @iter. Once initialized, @iter must eventually be stopped with
 * scx_task_iter_stop().
 *
 * If @cgrp is %NULL, scx_tasks is used for iteration and this function returns
 * with scx_tasks_lock held and @iter->cursor inserted into scx_tasks.
 *
 * If @cgrp is not %NULL, @cgrp and its descendants' tasks are walked using
 * @iter->css_iter. The caller must be holding cgroup_lock() to prevent cgroup
 * task migrations.
 *
 * The two modes of iterations are largely independent and it's likely that
 * scx_tasks can be removed in favor of always using cgroup iteration if
 * CONFIG_SCHED_CLASS_EXT depends on CONFIG_CGROUPS.
 *
 * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
 * between this and the first next() call or between any two next() calls. If
 * the locks are released between two next() calls, the caller is responsible
 * for ensuring that the task being iterated remains accessible either through
 * RCU read lock or obtaining a reference count.
 *
 * All tasks which existed when the iteration started are guaranteed to be
 * visited as long as they are not dead.
 */
static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp)
{
	memset(iter, 0, sizeof(*iter));

#ifdef CONFIG_EXT_SUB_SCHED
	if (cgrp) {
		lockdep_assert_held(&cgroup_mutex);
		iter->cgrp = cgrp;
		iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self);
		css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
				    &iter->css_iter);
		return;
	}
#endif
	raw_spin_lock_irq(&scx_tasks_lock);

	iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
	list_add(&iter->cursor.tasks_node, &scx_tasks);
	iter->list_locked = true;
}

static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
	if (iter->locked_task) {
		__balance_callbacks(iter->rq, &iter->rf);
		task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
		iter->locked_task = NULL;
	}
}

/**
 * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
 * @iter: iterator to unlock
 *
 * If @iter is in the middle of a locked iteration, it may be locking the rq of
 * the task currently being visited in addition to scx_tasks_lock. Unlock both.
 * This function can be safely called anytime during an iteration. The next
 * iterator operation will automatically restore the necessary locking.
 */
static void scx_task_iter_unlock(struct scx_task_iter *iter)
{
	__scx_task_iter_rq_unlock(iter);
	if (iter->list_locked) {
		iter->list_locked = false;
		raw_spin_unlock_irq(&scx_tasks_lock);
	}
}

static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
{
	if (!iter->list_locked) {
		raw_spin_lock_irq(&scx_tasks_lock);
		iter->list_locked = true;
	}
}

/**
 * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
 * @iter: iterator to exit
 *
 * Exit a previously initialized @iter. Must be called with scx_tasks_lock held
 * which is released on return. If the iterator holds a task's rq lock, that rq
 * lock is also released. See scx_task_iter_start() for details.
 */
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
#ifdef CONFIG_EXT_SUB_SCHED
	if (iter->cgrp) {
		if (iter->css_pos)
			css_task_iter_end(&iter->css_iter);
		__scx_task_iter_rq_unlock(iter);
		return;
	}
#endif
	__scx_task_iter_maybe_relock(iter);
	list_del_init(&iter->cursor.tasks_node);
	scx_task_iter_unlock(iter);
}

/**
 * scx_task_iter_next - Next task
 * @iter: iterator to walk
 *
 * Visit the next task. See scx_task_iter_start() for details. Locks are dropped
 * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls
 * by holding scx_tasks_lock for too long.
 */
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
	struct list_head *cursor = &iter->cursor.tasks_node;
	struct sched_ext_entity *pos;

	if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
		scx_task_iter_unlock(iter);
		cond_resched();
	}

#ifdef CONFIG_EXT_SUB_SCHED
	if (iter->cgrp) {
		while (iter->css_pos) {
			struct task_struct *p;

			p = css_task_iter_next(&iter->css_iter);
			if (p)
				return p;

			css_task_iter_end(&iter->css_iter);
			iter->css_pos = css_next_descendant_pre(iter->css_pos,
								&iter->cgrp->self);
			if (iter->css_pos)
				css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
						    &iter->css_iter);
		}
		return NULL;
	}
#endif
	__scx_task_iter_maybe_relock(iter);

	list_for_each_entry(pos, cursor, tasks_node) {
		if (&pos->tasks_node == &scx_tasks)
			return NULL;
		if (!(pos->flags & SCX_TASK_CURSOR)) {
			list_move(cursor, &pos->tasks_node);
			return container_of(pos, struct task_struct, scx);
		}
	}

	/* can't happen, should always terminate at scx_tasks above */
	BUG();
}

/**
 * scx_task_iter_next_locked - Next non-idle task with its rq locked
 * @iter: iterator to walk
 *
 * Visit the next non-idle task with its rq lock held. Tasks which are already
 * dead from SCX's point of view are skipped. See scx_task_iter_start() for
 * details.
 */
static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
{
	struct task_struct *p;

	__scx_task_iter_rq_unlock(iter);

	while ((p = scx_task_iter_next(iter))) {
		/*
		 * scx_task_iter is used to prepare and move tasks into SCX
		 * while loading the BPF scheduler and vice-versa while
		 * unloading. The init_tasks ("swappers") should be excluded
		 * from the iteration because:
		 *
		 * - It's unsafe to use __setscheduler_prio() on an init_task to
		 *   determine the sched_class to use as it won't preserve its
		 *   idle_sched_class.
		 *
		 * - ops.init/exit_task() can easily be confused if called with
		 *   init_tasks as they, e.g., share PID 0.
		 *
		 * As init_tasks are never scheduled through SCX, they can be
		 * skipped safely. Note that is_idle_task() which tests %PF_IDLE
		 * doesn't work here:
		 *
		 * - %PF_IDLE may not be set for an init_task whose CPU hasn't
		 *   yet been onlined.
		 *
		 * - %PF_IDLE can be set on tasks that are not init_tasks. See
		 *   play_idle_precise() used by CONFIG_IDLE_INJECT.
		 *
		 * Test for idle_sched_class as only init_tasks are on it.
		 */
		if (p->sched_class == &idle_sched_class)
			continue;

		iter->rq = task_rq_lock(p, &iter->rf);
		iter->locked_task = p;

		/*
		 * cgroup_task_dead() removes the dead tasks from cset->tasks
		 * after sched_ext_dead() and cgroup iteration may see tasks
		 * which already finished sched_ext_dead(). %SCX_TASK_DEAD is
		 * set by sched_ext_dead() under @p's rq lock. Test it to
		 * avoid visiting tasks which are already dead from SCX POV.
		 */
		if (scx_get_task_state(p) == SCX_TASK_DEAD) {
			__scx_task_iter_rq_unlock(iter);
			continue;
		}

		return p;
	}
	return NULL;
}
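
/*
 * Typical usage (illustrative sketch): walk every SCX-relevant task with its
 * rq locked, e.g. when switching tasks into or out of the BPF scheduler.
 *
 *	struct scx_task_iter sti;
 *	struct task_struct *p;
 *
 *	scx_task_iter_start(&sti, NULL);
 *	while ((p = scx_task_iter_next_locked(&sti))) {
 *		... @p's rq is locked here ...
 *	}
 *	scx_task_iter_stop(&sti);
 */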

/**
 * scx_add_event - Increase an event counter for 'name' by 'cnt'
 * @sch: scx_sched to account events for
 * @name: an event name defined in struct scx_event_stats
 * @cnt: the number of times the event occurred
 *
 * This can be used when preemption is not disabled.
 */
#define scx_add_event(sch, name, cnt) do {				\
	this_cpu_add((sch)->pcpu->event_stats.name, (cnt));		\
	trace_sched_ext_event(#name, (cnt));				\
} while (0)

/**
 * __scx_add_event - Increase an event counter for 'name' by 'cnt'
 * @sch: scx_sched to account events for
 * @name: an event name defined in struct scx_event_stats
 * @cnt: the number of times the event occurred
 *
 * This should be used only when preemption is disabled.
 */
#define __scx_add_event(sch, name, cnt) do {				\
	__this_cpu_add((sch)->pcpu->event_stats.name, (cnt));		\
	trace_sched_ext_event(#name, (cnt));				\
} while (0)

/**
 * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
 * @dst_e: destination event stats
 * @src_e: source event stats
 * @kind: a kind of event to be aggregated
 */
#define scx_agg_event(dst_e, src_e, kind) do {				\
	(dst_e)->kind += READ_ONCE((src_e)->kind);			\
} while (0)

/**
 * scx_dump_event - Dump an event 'kind' in 'events' to 's'
 * @s: output seq_buf
 * @events: event stats
 * @kind: a kind of event to dump
 */
#define scx_dump_event(s, events, kind) do {				\
	dump_line(&(s), "%40s: %16lld", #kind, (events)->kind);	\
} while (0)
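
/*
 * For example, the enqueue path below counts tasks that skip ops.enqueue()
 * because they are exiting with
 *
 *	__scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
 *
 * using the __-prefixed variant as preemption is disabled there.
 */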

static void scx_read_events(struct scx_sched *sch,
			    struct scx_event_stats *events);

static enum scx_enable_state scx_enable_state(void)
{
	return atomic_read(&scx_enable_state_var);
}

static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to)
{
	return atomic_xchg(&scx_enable_state_var, to);
}

static bool scx_tryset_enable_state(enum scx_enable_state to,
				    enum scx_enable_state from)
{
	int from_v = from;

	return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to);
}

/**
 * wait_ops_state - Busy-wait the specified ops state to end
 * @p: target task
 * @opss: state to wait the end of
 *
 * Busy-wait for @p to transition out of @opss. This can only be used when the
 * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also
 * has load_acquire semantics to ensure that the caller can see the updates made
 * in the enqueueing and dispatching paths.
 */
static void wait_ops_state(struct task_struct *p, unsigned long opss)
{
	do {
		cpu_relax();
	} while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
}

static inline bool __cpu_valid(s32 cpu)
{
	return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu));
}

/**
 * ops_cpu_valid - Verify a cpu number, to be used on ops input args
 * @sch: scx_sched to abort on error
 * @cpu: cpu number which came from a BPF ops
 * @where: extra information reported on error
 *
 * @cpu is a cpu number which came from the BPF scheduler and can be any value.
 * Verify that it is in range and one of the possible cpus. If invalid, trigger
 * an ops error.
 */
static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
{
	if (__cpu_valid(cpu)) {
		return true;
	} else {
		scx_error(sch, "invalid CPU %d%s%s", cpu,
			  where ? " " : "", where ?: "");
		return false;
	}
}

/**
 * ops_sanitize_err - Sanitize a -errno value
 * @sch: scx_sched to error out on error
 * @ops_name: operation to blame on failure
 * @err: -errno value to sanitize
 *
 * Verify @err is a valid -errno. If not, trigger scx_error() and return
 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
 * cause misbehaviors. For example, a large negative return from
 * ops.init_task() triggers an oops when passed up the call chain because the
 * value fails IS_ERR() test after being encoded with ERR_PTR() and then is
 * handled as a pointer.
 */
static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err)
{
	if (err < 0 && err >= -MAX_ERRNO)
		return err;

	scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err);
	return -EPROTO;
}

static void deferred_bal_cb_workfn(struct rq *rq)
{
	run_deferred(rq);
}

static void deferred_irq_workfn(struct irq_work *irq_work)
{
	struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work);

	raw_spin_rq_lock(rq);
	run_deferred(rq);
	raw_spin_rq_unlock(rq);
}

/**
 * schedule_deferred - Schedule execution of deferred actions on an rq
 * @rq: target rq
 *
 * Schedule execution of deferred actions on @rq. Deferred actions are executed
 * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks
 * to other rqs.
 */
static void schedule_deferred(struct rq *rq)
{
	/*
	 * This is the fallback when schedule_deferred_locked() can't use the
	 * cheaper balance callback or wakeup hook paths (the target CPU is not
	 * in balance or wakeup). Currently, this is primarily hit by reenqueue
	 * operations targeting a remote CPU.
	 *
	 * Queue on the target CPU. The deferred work can run from any CPU
	 * correctly - the _locked() path already processes remote rqs from the
	 * calling CPU - but targeting the owning CPU allows IPI delivery
	 * without waiting for the calling CPU to re-enable IRQs and is cheaper
	 * as the reenqueue runs locally.
	 */
	irq_work_queue_on(&rq->scx.deferred_irq_work, cpu_of(rq));
}

/**
 * schedule_deferred_locked - Schedule execution of deferred actions on an rq
 * @rq: target rq
 *
 * Schedule execution of deferred actions on @rq. Equivalent to
 * schedule_deferred() but requires @rq to be locked and can be more efficient.
 */
static void schedule_deferred_locked(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	/*
	 * If in the middle of waking up a task, task_woken_scx() will be called
	 * afterwards which will then run the deferred actions, no need to
	 * schedule anything.
	 */
	if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
		return;

	/* Don't do anything if there already is a deferred operation. */
	if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
		return;

	/*
	 * If in balance, the balance callbacks will be called before rq lock is
	 * released. Schedule one.
	 *
	 * We can't directly insert the callback into the rq's list: The call
	 * can drop its lock and make the pending balance callback visible to
	 * unrelated code paths that call rq_pin_lock().
	 *
	 * Just let balance_one() know that it must do it itself.
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
		rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
		return;
	}

	/*
	 * No scheduler hooks available. Use the generic irq_work path. The
	 * above WAKEUP and BALANCE paths should cover most of the cases and the
	 * time to IRQ re-enable shouldn't be long.
	 */
	schedule_deferred(rq);
}

static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq,
			       u64 reenq_flags, struct rq *locked_rq)
{
	struct rq *rq;

	/*
	 * Allowing reenqueues doesn't make sense while bypassing. This also
	 * blocks new reenqueues from being scheduled on dead scheds.
	 */
	if (unlikely(READ_ONCE(sch->bypass_depth)))
		return;

	if (dsq->id == SCX_DSQ_LOCAL) {
		rq = container_of(dsq, struct rq, scx.local_dsq);

		struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq));
		struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local;

		/*
		 * Pairs with smp_mb() in process_deferred_reenq_locals() and
		 * guarantees that there is a reenq_local() afterwards.
		 */
		smp_mb();

		if (list_empty(&drl->node) ||
		    (READ_ONCE(drl->flags) & reenq_flags) != reenq_flags) {

			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);

			if (list_empty(&drl->node))
				list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals);
			WRITE_ONCE(drl->flags, drl->flags | reenq_flags);
		}
	} else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) {
		rq = this_rq();

		struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq));
		struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user;

		/*
		 * Pairs with smp_mb() in process_deferred_reenq_users() and
		 * guarantees that there is a reenq_user() afterwards.
		 */
		smp_mb();

		if (list_empty(&dru->node) ||
		    (READ_ONCE(dru->flags) & reenq_flags) != reenq_flags) {

			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);

			if (list_empty(&dru->node))
				list_move_tail(&dru->node, &rq->scx.deferred_reenq_users);
			WRITE_ONCE(dru->flags, dru->flags | reenq_flags);
		}
	} else {
		scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
		return;
	}

	if (rq == locked_rq)
		schedule_deferred_locked(rq);
	else
		schedule_deferred(rq);
}

static void schedule_reenq_local(struct rq *rq, u64 reenq_flags)
{
	struct scx_sched *root = rcu_dereference_sched(scx_root);

	if (WARN_ON_ONCE(!root))
		return;

	schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags, rq);
}

/**
 * touch_core_sched - Update timestamp used for core-sched task ordering
 * @rq: rq to read clock from, must be locked
 * @p: task to update the timestamp for
 *
 * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
 * implement global or local-DSQ FIFO ordering for core-sched. Should be called
 * when a task becomes runnable and its turn on the CPU ends (e.g. slice
 * exhaustion).
 */
static void touch_core_sched(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

#ifdef CONFIG_SCHED_CORE
	/*
	 * It's okay to update the timestamp spuriously. Use
	 * sched_core_disabled() which is cheaper than enabled().
	 *
	 * As this is used to determine ordering between tasks of sibling CPUs,
	 * it may be better to use per-core dispatch sequence instead.
	 */
	if (!sched_core_disabled())
		p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq));
#endif
}

/**
 * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
 * @rq: rq to read clock from, must be locked
 * @p: task being dispatched
 *
 * If the BPF scheduler implements custom core-sched ordering via
 * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
 * ordering within each local DSQ. This function is called from dispatch paths
 * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
 */
static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

#ifdef CONFIG_SCHED_CORE
	if (unlikely(SCX_HAS_OP(scx_root, core_sched_before)))
		touch_core_sched(rq, p);
#endif
}

static void update_curr_scx(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	s64 delta_exec;

	delta_exec = update_curr_common(rq);
	if (unlikely(delta_exec <= 0))
		return;

	if (curr->scx.slice != SCX_SLICE_INF) {
		curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec);
		if (!curr->scx.slice)
			touch_core_sched(rq, curr);
	}

	dl_server_update(&rq->ext_server, delta_exec);
}

static bool scx_dsq_priq_less(struct rb_node *node_a,
			      const struct rb_node *node_b)
{
	const struct task_struct *a =
		container_of(node_a, struct task_struct, scx.dsq_priq);
	const struct task_struct *b =
		container_of(node_b, struct task_struct, scx.dsq_priq);

	return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime);
}

static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags)
{
	/* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
	WRITE_ONCE(dsq->nr, dsq->nr + 1);

	/*
	 * Once @p reaches a local DSQ, it can only leave it by being dispatched
	 * to the CPU or dequeued. In both cases, the only way @p can go back to
	 * the BPF sched is through enqueueing. If being inserted into a local
	 * DSQ with IMMED, persist the state until the next enqueueing event in
	 * do_enqueue_task() so that we can maintain IMMED protection through
	 * e.g. SAVE/RESTORE cycles and slice extensions.
	 */
	if (enq_flags & SCX_ENQ_IMMED) {
		if (unlikely(dsq->id != SCX_DSQ_LOCAL)) {
			WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK));
			return;
		}
		p->scx.flags |= SCX_TASK_IMMED;
	}

	if (p->scx.flags & SCX_TASK_IMMED) {
		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);

		if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
			return;

		rq->scx.nr_immed++;

		/*
		 * If @rq already had other tasks or the current task is not
		 * done yet, @p can't go on the CPU immediately. Re-enqueue.
		 */
		if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags)))
			schedule_reenq_local(rq, 0);
	}
}

static void dsq_dec_nr(struct scx_dispatch_q *dsq, struct task_struct *p)
{
	/* see dsq_inc_nr() */
	WRITE_ONCE(dsq->nr, dsq->nr - 1);

	if (p->scx.flags & SCX_TASK_IMMED) {
		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);

		if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) ||
		    WARN_ON_ONCE(rq->scx.nr_immed <= 0))
			return;

		rq->scx.nr_immed--;
	}
}

static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
{
	p->scx.slice = READ_ONCE(sch->slice_dfl);
	__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}

/*
 * Return true if @p is moving due to an internal SCX migration, false
 * otherwise.
 */
static inline bool task_scx_migrating(struct task_struct *p)
{
	/*
	 * We only need to check sticky_cpu: it is set to the destination
	 * CPU in move_remote_task_to_local_dsq() before deactivate_task()
	 * and cleared when the task is enqueued on the destination, so it
	 * is only non-negative during an internal SCX migration.
	 */
	return p->scx.sticky_cpu >= 0;
}

/*
 * Call ops.dequeue() if the task is in BPF custody and not migrating.
 * Clears %SCX_TASK_IN_CUSTODY when the callback is invoked.
 */
static void call_task_dequeue(struct scx_sched *sch, struct rq *rq,
			      struct task_struct *p, u64 deq_flags)
{
	if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p))
		return;

	if (SCX_HAS_OP(sch, dequeue))
		SCX_CALL_OP_TASK(sch, dequeue, rq, p, deq_flags);

	p->scx.flags &= ~SCX_TASK_IN_CUSTODY;
}

static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq,
			       struct task_struct *p, u64 enq_flags)
{
	struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);

	call_task_dequeue(sch, rq, p, 0);

	/*
	 * Note that @rq's lock may be dropped between this enqueue and @p
	 * actually getting on CPU. This gives higher-class tasks (e.g. RT)
	 * an opportunity to wake up on @rq and prevent @p from running.
	 * Here are some concrete examples:
	 *
	 * Example 1:
	 *
	 * We dispatch two tasks from a single ops.dispatch():
	 * - First, a local task to this CPU's local DSQ;
	 * - Second, a local/remote task to a remote CPU's local DSQ.
	 * We must drop the local rq lock in order to finish the second
	 * dispatch. In that time, an RT task can wake up on the local rq.
	 *
	 * Example 2:
	 *
	 * We dispatch a local/remote task to a remote CPU's local DSQ.
	 * We must drop the remote rq lock before the dispatched task can run,
	 * which gives an RT task an opportunity to wake up on the remote rq.
	 *
	 * Both examples work the same if we replace dispatching with moving
	 * the tasks from a user-created DSQ.
	 *
	 * We must detect these wakeups so that we can re-enqueue IMMED tasks
	 * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this
	 * purpose, but for it to be invoked, we must ensure that we bump
	 * @rq->next_class to &ext_sched_class if it's currently idle.
	 *
	 * wakeup_preempt() does the bumping, and since we only invoke it if
	 * @rq->next_class is below &ext_sched_class, it will also
	 * resched_curr(rq).
	 */
	if (sched_class_above(p->sched_class, rq->next_class))
		wakeup_preempt(rq, p, 0);

	/*
	 * If @rq is in balance, the CPU is already vacant and looking for the
	 * next task to run. No need to preempt or trigger resched after moving
	 * @p into its local DSQ.
	 *
	 * Note that the wakeup_preempt() above may have already triggered a
	 * resched if @rq->next_class was idle. It's harmless, since
	 * need_resched is cleared immediately after task pick.
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE)
		return;

	if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
	    rq->curr->sched_class == &ext_sched_class) {
		rq->curr->scx.slice = 0;
		resched_curr(rq);
	}
}

static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
			     struct scx_dispatch_q *dsq, struct task_struct *p,
			     u64 enq_flags)
{
	bool is_local = dsq->id == SCX_DSQ_LOCAL;

	WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
	WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) ||
		     !RB_EMPTY_NODE(&p->scx.dsq_priq));

	if (!is_local) {
		raw_spin_lock_nested(&dsq->lock,
				     (enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0);

		if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
			scx_error(sch, "attempting to dispatch to a destroyed dsq");
			/* fall back to the global dsq */
			raw_spin_unlock(&dsq->lock);
			dsq = find_global_dsq(sch, task_cpu(p));
			raw_spin_lock(&dsq->lock);
		}
	}

	if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
		     (enq_flags & SCX_ENQ_DSQ_PRIQ))) {
		/*
		 * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from
		 * their FIFO queues. To avoid confusion and accidentally
		 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we
		 * disallow any internal DSQ from doing vtime ordering of
		 * tasks.
		 */
		scx_error(sch, "cannot use vtime ordering for built-in DSQs");
		enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
	}

	if (enq_flags & SCX_ENQ_DSQ_PRIQ) {
		struct rb_node *rbp;

		/*
		 * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are
		 * linked to both the rbtree and list on PRIQs, this can only be
		 * tested easily when adding the first task.
		 */
		if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
			     nldsq_next_task(dsq, NULL, false)))
			scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks",
				  dsq->id);

		p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
		rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);

		/*
		 * Find the previous task and insert after it on the list so
		 * that @dsq->list is vtime ordered.
		 */
		rbp = rb_prev(&p->scx.dsq_priq);
		if (rbp) {
			struct task_struct *prev =
				container_of(rbp, struct task_struct,
					     scx.dsq_priq);
			list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
			/* first task unchanged - no update needed */
		} else {
			list_add(&p->scx.dsq_list.node, &dsq->list);
			/* not builtin and new task is at head - use fastpath */
			rcu_assign_pointer(dsq->first_task, p);
		}
	} else {
		/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
		if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
			scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
				  dsq->id);

		if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) {
			list_add(&p->scx.dsq_list.node, &dsq->list);
			/* new task inserted at head - use fastpath */
			if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
				rcu_assign_pointer(dsq->first_task, p);
		} else {
			/*
			 * dsq->list can contain parked BPF iterator cursors, so
			 * list_empty() here isn't a reliable proxy for "no real
			 * task in the DSQ". Test dsq->first_task directly.
			 */
			list_add_tail(&p->scx.dsq_list.node, &dsq->list);
			if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
				rcu_assign_pointer(dsq->first_task, p);
		}
	}

	/* seq records the order tasks are queued, used by BPF DSQ iterator */
	WRITE_ONCE(dsq->seq, dsq->seq + 1);
	p->scx.dsq_seq = dsq->seq;

	dsq_inc_nr(dsq, p, enq_flags);
	p->scx.dsq = dsq;

	/*
	 * Update custody and call ops.dequeue() before clearing ops_state:
	 * once ops_state is cleared, waiters in ops_dequeue() can proceed
	 * and dequeue_task_scx() will RMW p->scx.flags. If we clear
	 * ops_state first, both sides would modify p->scx.flags
	 * concurrently in a non-atomic way.
	 */
	if (is_local) {
		local_dsq_post_enq(sch, dsq, p, enq_flags);
	} else {
		/*
		 * Task on global/bypass DSQ: leave custody, task on
		 * non-terminal DSQ: enter custody.
		 */
		if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS)
			call_task_dequeue(sch, rq, p, 0);
		else
			p->scx.flags |= SCX_TASK_IN_CUSTODY;

		raw_spin_unlock(&dsq->lock);
	}

	/*
	 * We're transitioning out of QUEUEING or DISPATCHING. store_release to
	 * match waiters' load_acquire.
	 */
	if (enq_flags & SCX_ENQ_CLEAR_OPSS)
		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
}

static void task_unlink_from_dsq(struct task_struct *p,
				 struct scx_dispatch_q *dsq)
{
	WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));

	if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) {
		rb_erase(&p->scx.dsq_priq, &dsq->priq);
		RB_CLEAR_NODE(&p->scx.dsq_priq);
		p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ;
	}

	list_del_init(&p->scx.dsq_list.node);
	dsq_dec_nr(dsq, p);

	if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
		struct task_struct *first_task;

		first_task = nldsq_next_task(dsq, NULL, false);
		rcu_assign_pointer(dsq->first_task, first_task);
	}
}

static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
{
	struct scx_dispatch_q *dsq = p->scx.dsq;
	bool is_local = dsq == &rq->scx.local_dsq;

	lockdep_assert_rq_held(rq);

	if (!dsq) {
		/*
		 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
		 * Unlinking is all that's needed to cancel.
		 */
		if (unlikely(!list_empty(&p->scx.dsq_list.node)))
			list_del_init(&p->scx.dsq_list.node);

		/*
		 * When dispatching directly from the BPF scheduler to a local
		 * DSQ, the task isn't associated with any DSQ but
		 * @p->scx.holding_cpu may be set under the protection of
		 * %SCX_OPSS_DISPATCHING.
		 */
		if (p->scx.holding_cpu >= 0)
			p->scx.holding_cpu = -1;

		return;
	}

	if (!is_local)
		raw_spin_lock(&dsq->lock);

	/*
	 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't
	 * change underneath us.
	 */
	if (p->scx.holding_cpu < 0) {
		/* @p must still be on @dsq, dequeue */
		task_unlink_from_dsq(p, dsq);
	} else {
		/*
		 * We're racing against dispatch_to_local_dsq() which already
		 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
		 * holding_cpu which tells dispatch_to_local_dsq() that it lost
		 * the race.
		 */
		WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
		p->scx.holding_cpu = -1;
	}
	p->scx.dsq = NULL;

	if (!is_local)
		raw_spin_unlock(&dsq->lock);
}

/*
 * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq
 * and dsq are locked.
 */
static void dispatch_dequeue_locked(struct task_struct *p,
				    struct scx_dispatch_q *dsq)
{
	lockdep_assert_rq_held(task_rq(p));
	lockdep_assert_held(&dsq->lock);

	task_unlink_from_dsq(p, dsq);
	p->scx.dsq = NULL;
}

static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
						    struct rq *rq, u64 dsq_id,
						    s32 tcpu)
{
	struct scx_dispatch_q *dsq;

	if (dsq_id == SCX_DSQ_LOCAL)
		return &rq->scx.local_dsq;

	if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;

		if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
			return find_global_dsq(sch, tcpu);

		return &cpu_rq(cpu)->scx.local_dsq;
	}

	if (dsq_id == SCX_DSQ_GLOBAL)
		dsq = find_global_dsq(sch, tcpu);
	else
		dsq = find_user_dsq(sch, dsq_id);

	if (unlikely(!dsq)) {
		scx_error(sch, "non-existent DSQ 0x%llx", dsq_id);
		return find_global_dsq(sch, tcpu);
	}

	return dsq;
}
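
/*
 * For example (illustrative), a dispatch verdict of
 *
 *	SCX_DSQ_LOCAL_ON | 3
 *
 * resolves to CPU 3's local DSQ, while an unknown user DSQ id falls back to
 * the global DSQ of @tcpu's node after raising an ops error.
 */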
1799 * 1800 * Direct dispatch state must be cleared in the following cases: 1801 * - direct_dispatch(): cleared on the synchronous enqueue path, deferred 1802 * dispatch keeps the state until consumed 1803 * - process_ddsp_deferred_locals(): cleared after consuming deferred state, 1804 * - do_enqueue_task(): cleared on enqueue fallbacks where the dispatch 1805 * verdict is ignored (local/global/bypass) 1806 * - dequeue_task_scx(): cleared after dispatch_dequeue(), covering deferred 1807 * cancellation and holding_cpu races 1808 * - scx_disable_task(): cleared for queued wakeup tasks, which are excluded by 1809 * the scx_bypass() loop, so that stale state is not reused by a subsequent 1810 * scheduler instance 1811 */ 1812 static inline void clear_direct_dispatch(struct task_struct *p) 1813 { 1814 p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; 1815 p->scx.ddsp_enq_flags = 0; 1816 } 1817 1818 static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, 1819 u64 enq_flags) 1820 { 1821 struct rq *rq = task_rq(p); 1822 struct scx_dispatch_q *dsq = 1823 find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p)); 1824 u64 ddsp_enq_flags; 1825 1826 touch_core_sched_dispatch(rq, p); 1827 1828 p->scx.ddsp_enq_flags |= enq_flags; 1829 1830 /* 1831 * We are in the enqueue path with @rq locked and pinned, and thus can't 1832 * double lock a remote rq and enqueue to its local DSQ. For 1833 * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer 1834 * the enqueue so that it's executed when @rq can be unlocked. 1835 */ 1836 if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { 1837 unsigned long opss; 1838 1839 opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; 1840 1841 switch (opss & SCX_OPSS_STATE_MASK) { 1842 case SCX_OPSS_NONE: 1843 break; 1844 case SCX_OPSS_QUEUEING: 1845 /* 1846 * As @p was never passed to the BPF side, _release is 1847 * not strictly necessary. Still do it for consistency. 1848 */ 1849 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1850 break; 1851 default: 1852 WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()", 1853 p->comm, p->pid, opss); 1854 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1855 break; 1856 } 1857 1858 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); 1859 list_add_tail(&p->scx.dsq_list.node, 1860 &rq->scx.ddsp_deferred_locals); 1861 schedule_deferred_locked(rq); 1862 return; 1863 } 1864 1865 ddsp_enq_flags = p->scx.ddsp_enq_flags; 1866 clear_direct_dispatch(p); 1867 1868 dispatch_enqueue(sch, rq, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); 1869 } 1870 1871 static bool scx_rq_online(struct rq *rq) 1872 { 1873 /* 1874 * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates 1875 * the online state as seen from the BPF scheduler. cpu_active() test 1876 * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will 1877 * stay set until the current scheduling operation is complete even if 1878 * we aren't locking @rq. 
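 *
 * %SCX_RQ_ONLINE itself is flipped by rq_online_scx()/rq_offline_scx(),
 * while the BPF-visible ops.cpu_online()/ops.cpu_offline() callbacks are
 * invoked from handle_hotplug().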
1879 */ 1880 return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq))); 1881 } 1882 1883 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, 1884 int sticky_cpu) 1885 { 1886 struct scx_sched *sch = scx_task_sched(p); 1887 struct task_struct **ddsp_taskp; 1888 struct scx_dispatch_q *dsq; 1889 unsigned long qseq; 1890 1891 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); 1892 1893 /* internal movements - rq migration / RESTORE */ 1894 if (sticky_cpu == cpu_of(rq)) 1895 goto local_norefill; 1896 1897 /* 1898 * Clear persistent TASK_IMMED for fresh enqueues, see dsq_inc_nr(). 1899 * Note that exiting and migration-disabled tasks that skip 1900 * ops.enqueue() below will lose IMMED protection unless 1901 * %SCX_OPS_ENQ_EXITING / %SCX_OPS_ENQ_MIGRATION_DISABLED are set. 1902 */ 1903 p->scx.flags &= ~SCX_TASK_IMMED; 1904 1905 /* 1906 * If !scx_rq_online(), we already told the BPF scheduler that the CPU 1907 * is offline and are just running the hotplug path. Don't bother the 1908 * BPF scheduler. 1909 */ 1910 if (!scx_rq_online(rq)) 1911 goto local; 1912 1913 if (scx_bypassing(sch, cpu_of(rq))) { 1914 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 1915 goto bypass; 1916 } 1917 1918 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 1919 goto direct; 1920 1921 /* see %SCX_OPS_ENQ_EXITING */ 1922 if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) && 1923 unlikely(p->flags & PF_EXITING)) { 1924 __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1); 1925 goto local; 1926 } 1927 1928 /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ 1929 if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) && 1930 is_migration_disabled(p)) { 1931 __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); 1932 goto local; 1933 } 1934 1935 if (unlikely(!SCX_HAS_OP(sch, enqueue))) 1936 goto global; 1937 1938 /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ 1939 qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; 1940 1941 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 1942 atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); 1943 1944 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 1945 WARN_ON_ONCE(*ddsp_taskp); 1946 *ddsp_taskp = p; 1947 1948 SCX_CALL_OP_TASK(sch, enqueue, rq, p, enq_flags); 1949 1950 *ddsp_taskp = NULL; 1951 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 1952 goto direct; 1953 1954 /* 1955 * Task is now in BPF scheduler's custody. Set %SCX_TASK_IN_CUSTODY 1956 * so ops.dequeue() is called when it leaves custody. 1957 */ 1958 p->scx.flags |= SCX_TASK_IN_CUSTODY; 1959 1960 /* 1961 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or 1962 * dequeue may be waiting. The store_release matches their load_acquire. 1963 */ 1964 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); 1965 return; 1966 1967 direct: 1968 direct_dispatch(sch, p, enq_flags); 1969 return; 1970 local_norefill: 1971 dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags); 1972 return; 1973 local: 1974 dsq = &rq->scx.local_dsq; 1975 goto enqueue; 1976 global: 1977 dsq = find_global_dsq(sch, task_cpu(p)); 1978 goto enqueue; 1979 bypass: 1980 dsq = bypass_enq_target_dsq(sch, task_cpu(p)); 1981 goto enqueue; 1982 1983 enqueue: 1984 /* 1985 * For task-ordering, slice refill must be treated as implying the end 1986 * of the current slice. Otherwise, the longer @p stays on the CPU, the 1987 * higher priority it becomes from scx_prio_less()'s POV. 
1988 */ 1989 touch_core_sched(rq, p); 1990 refill_task_slice_dfl(sch, p); 1991 clear_direct_dispatch(p); 1992 dispatch_enqueue(sch, rq, dsq, p, enq_flags); 1993 } 1994 1995 static bool task_runnable(const struct task_struct *p) 1996 { 1997 return !list_empty(&p->scx.runnable_node); 1998 } 1999 2000 static void set_task_runnable(struct rq *rq, struct task_struct *p) 2001 { 2002 lockdep_assert_rq_held(rq); 2003 2004 if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { 2005 p->scx.runnable_at = jiffies; 2006 p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; 2007 } 2008 2009 /* 2010 * list_add_tail() must be used. scx_bypass() depends on tasks being 2011 * appended to the runnable_list. 2012 */ 2013 list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); 2014 } 2015 2016 static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) 2017 { 2018 list_del_init(&p->scx.runnable_node); 2019 if (reset_runnable_at) 2020 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 2021 } 2022 2023 static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags) 2024 { 2025 struct scx_sched *sch = scx_task_sched(p); 2026 int sticky_cpu = p->scx.sticky_cpu; 2027 u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags; 2028 2029 if (enq_flags & ENQUEUE_WAKEUP) 2030 rq->scx.flags |= SCX_RQ_IN_WAKEUP; 2031 2032 /* 2033 * Restoring a running task will be immediately followed by 2034 * set_next_task_scx() which expects the task to not be on the BPF 2035 * scheduler as tasks can only start running through local DSQs. Force 2036 * direct-dispatch into the local DSQ by setting the sticky_cpu. 2037 */ 2038 if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) 2039 sticky_cpu = cpu_of(rq); 2040 2041 if (p->scx.flags & SCX_TASK_QUEUED) { 2042 WARN_ON_ONCE(!task_runnable(p)); 2043 goto out; 2044 } 2045 2046 set_task_runnable(rq, p); 2047 p->scx.flags |= SCX_TASK_QUEUED; 2048 rq->scx.nr_running++; 2049 add_nr_running(rq, 1); 2050 2051 if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p)) 2052 SCX_CALL_OP_TASK(sch, runnable, rq, p, enq_flags); 2053 2054 if (enq_flags & SCX_ENQ_WAKEUP) 2055 touch_core_sched(rq, p); 2056 2057 /* Start dl_server if this is the first task being enqueued */ 2058 if (rq->scx.nr_running == 1) 2059 dl_server_start(&rq->ext_server); 2060 2061 do_enqueue_task(rq, p, enq_flags, sticky_cpu); 2062 2063 if (sticky_cpu >= 0) 2064 p->scx.sticky_cpu = -1; 2065 out: 2066 rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; 2067 2068 if ((enq_flags & SCX_ENQ_CPU_SELECTED) && 2069 unlikely(cpu_of(rq) != p->scx.selected_cpu)) 2070 __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1); 2071 } 2072 2073 static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) 2074 { 2075 struct scx_sched *sch = scx_task_sched(p); 2076 unsigned long opss; 2077 2078 /* dequeue is always temporary, don't reset runnable_at */ 2079 clr_task_runnable(p, false); 2080 2081 /* acquire ensures that we see the preceding updates on QUEUED */ 2082 opss = atomic_long_read_acquire(&p->scx.ops_state); 2083 2084 switch (opss & SCX_OPSS_STATE_MASK) { 2085 case SCX_OPSS_NONE: 2086 break; 2087 case SCX_OPSS_QUEUEING: 2088 /* 2089 * QUEUEING is started and finished while holding @p's rq lock. 2090 * As we're holding the rq lock now, we shouldn't see QUEUEING. 
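 * Seeing it here would indicate that the ops_state machine was
 * corrupted, hence the BUG() below.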
2091 */ 2092 BUG(); 2093 case SCX_OPSS_QUEUED: 2094 /* A queued task must always be in BPF scheduler's custody */ 2095 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_IN_CUSTODY)); 2096 if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 2097 SCX_OPSS_NONE)) 2098 break; 2099 fallthrough; 2100 case SCX_OPSS_DISPATCHING: 2101 /* 2102 * If @p is being dispatched from the BPF scheduler to a DSQ, 2103 * wait for the transfer to complete so that @p doesn't get 2104 * added to its DSQ after dequeueing is complete. 2105 * 2106 * As we're waiting on DISPATCHING with the rq locked, the 2107 * dispatching side shouldn't try to lock the rq while 2108 * DISPATCHING is set. See dispatch_to_local_dsq(). 2109 * 2110 * DISPATCHING shouldn't have qseq set and control can reach 2111 * here with NONE @opss from the above QUEUED case block. 2112 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 2113 */ 2114 wait_ops_state(p, SCX_OPSS_DISPATCHING); 2115 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 2116 break; 2117 } 2118 2119 /* 2120 * Call ops.dequeue() if the task is still in BPF custody. 2121 * 2122 * The code that clears ops_state to %SCX_OPSS_NONE does not always 2123 * clear %SCX_TASK_IN_CUSTODY: in dispatch_to_local_dsq(), when 2124 * we're moving a task that was in %SCX_OPSS_DISPATCHING to a 2125 * remote CPU's local DSQ, we only set ops_state to %SCX_OPSS_NONE 2126 * so that a concurrent dequeue can proceed, but we clear 2127 * %SCX_TASK_IN_CUSTODY only when we later enqueue or move the 2128 * task. So we can see NONE + IN_CUSTODY here and we must handle 2129 * it. Similarly, after waiting on %SCX_OPSS_DISPATCHING we see 2130 * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until 2131 * it is enqueued on the destination. 2132 */ 2133 call_task_dequeue(sch, rq, p, deq_flags); 2134 } 2135 2136 static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int core_deq_flags) 2137 { 2138 struct scx_sched *sch = scx_task_sched(p); 2139 u64 deq_flags = core_deq_flags; 2140 2141 /* 2142 * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property 2143 * change (not sleep or core-sched pick). 2144 */ 2145 if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC))) 2146 deq_flags |= SCX_DEQ_SCHED_CHANGE; 2147 2148 if (!(p->scx.flags & SCX_TASK_QUEUED)) { 2149 WARN_ON_ONCE(task_runnable(p)); 2150 return true; 2151 } 2152 2153 ops_dequeue(rq, p, deq_flags); 2154 2155 /* 2156 * A currently running task which is going off @rq first gets dequeued 2157 * and then stops running. As we want running <-> stopping transitions 2158 * to be contained within runnable <-> quiescent transitions, trigger 2159 * ->stopping() early here instead of in put_prev_task_scx(). 2160 * 2161 * @p may go through multiple stopping <-> running transitions between 2162 * here and put_prev_task_scx() if task attribute changes occur while 2163 * balance_one() leaves @rq unlocked. However, they don't contain any 2164 * information meaningful to the BPF scheduler and can be suppressed by 2165 * skipping the callbacks if the task is !QUEUED. 
2166 */
2167 if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) {
2168 update_curr_scx(rq);
2169 SCX_CALL_OP_TASK(sch, stopping, rq, p, false);
2170 }
2171
2172 if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
2173 SCX_CALL_OP_TASK(sch, quiescent, rq, p, deq_flags);
2174
2175 if (deq_flags & SCX_DEQ_SLEEP)
2176 p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
2177 else
2178 p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
2179
2180 p->scx.flags &= ~SCX_TASK_QUEUED;
2181 rq->scx.nr_running--;
2182 sub_nr_running(rq, 1);
2183
2184 dispatch_dequeue(rq, p);
2185 clear_direct_dispatch(p);
2186 return true;
2187 }
2188
2189 static void yield_task_scx(struct rq *rq)
2190 {
2191 struct task_struct *p = rq->donor;
2192 struct scx_sched *sch = scx_task_sched(p);
2193
2194 if (SCX_HAS_OP(sch, yield))
2195 SCX_CALL_OP_2TASKS_RET(sch, yield, rq, p, NULL);
2196 else
2197 p->scx.slice = 0;
2198 }
2199
2200 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
2201 {
2202 struct task_struct *from = rq->donor;
2203 struct scx_sched *sch = scx_task_sched(from);
2204
2205 if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to))
2206 return SCX_CALL_OP_2TASKS_RET(sch, yield, rq, from, to);
2207 else
2208 return false;
2209 }
2210
2211 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags)
2212 {
2213 /*
2214 * Preemption between SCX tasks is implemented by resetting the victim
2215 * task's slice to 0 and triggering reschedule on the target CPU.
2216 * Nothing to do.
2217 */
2218 if (p->sched_class == &ext_sched_class)
2219 return;
2220
2221 /*
2222 * Getting preempted by a higher-priority class. Reenqueue IMMED tasks.
2223 * This captures all preemption cases including:
2224 *
2225 * - A SCX task is currently running.
2226 *
2227 * - @rq is waking from idle due to a SCX task waking to it.
2228 *
2229 * - A higher-priority task wakes up while SCX dispatch is in progress.
2230 */
2231 if (rq->scx.nr_immed)
2232 schedule_reenq_local(rq, 0);
2233 }
2234
2235 static void move_local_task_to_local_dsq(struct scx_sched *sch,
2236 struct task_struct *p, u64 enq_flags,
2237 struct scx_dispatch_q *src_dsq,
2238 struct rq *dst_rq)
2239 {
2240 struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq;
2241
2242 /* @src_dsq is locked and @p is on @dst_rq */
2243 lockdep_assert_held(&src_dsq->lock);
2244 lockdep_assert_rq_held(dst_rq);
2245
2246 WARN_ON_ONCE(p->scx.holding_cpu >= 0);
2247
2248 if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
2249 list_add(&p->scx.dsq_list.node, &dst_dsq->list);
2250 else
2251 list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list);
2252
2253 dsq_inc_nr(dst_dsq, p, enq_flags);
2254 p->scx.dsq = dst_dsq;
2255
2256 local_dsq_post_enq(sch, dst_dsq, p, enq_flags);
2257 }
2258
2259 /**
2260 * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
2261 * @p: task to move
2262 * @enq_flags: %SCX_ENQ_*
2263 * @src_rq: rq to move the task from, locked on entry, released on return
2264 * @dst_rq: rq to move the task into, locked on return
2265 *
2266 * Move @p which is currently on @src_rq to @dst_rq's local DSQ.
2267 */
2268 static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
2269 struct rq *src_rq, struct rq *dst_rq)
2270 {
2271 lockdep_assert_rq_held(src_rq);
2272
2273 /*
2274 * Set sticky_cpu before deactivate_task() to properly mark the
2275 * beginning of an SCX-internal migration.
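 *
 * The matching end of the migration is in enqueue_task_scx() /
 * do_enqueue_task(): a sticky_cpu that matches the target CPU forces the
 * task into the local DSQ and is then reset to -1.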
2276 */ 2277 p->scx.sticky_cpu = cpu_of(dst_rq); 2278 deactivate_task(src_rq, p, 0); 2279 set_task_cpu(p, cpu_of(dst_rq)); 2280 2281 raw_spin_rq_unlock(src_rq); 2282 raw_spin_rq_lock(dst_rq); 2283 2284 /* 2285 * We want to pass scx-specific enq_flags but activate_task() will 2286 * truncate the upper 32 bit. As we own @rq, we can pass them through 2287 * @rq->scx.extra_enq_flags instead. 2288 */ 2289 WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); 2290 WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); 2291 dst_rq->scx.extra_enq_flags = enq_flags; 2292 activate_task(dst_rq, p, 0); 2293 dst_rq->scx.extra_enq_flags = 0; 2294 } 2295 2296 /* 2297 * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two 2298 * differences: 2299 * 2300 * - is_cpu_allowed() asks "Can this task run on this CPU?" while 2301 * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to 2302 * this CPU?". 2303 * 2304 * While migration is disabled, is_cpu_allowed() has to say "yes" as the task 2305 * must be allowed to finish on the CPU that it's currently on regardless of 2306 * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the 2307 * BPF scheduler shouldn't attempt to migrate a task which has migration 2308 * disabled. 2309 * 2310 * - The BPF scheduler is bypassed while the rq is offline and we can always say 2311 * no to the BPF scheduler initiated migrations while offline. 2312 * 2313 * The caller must ensure that @p and @rq are on different CPUs. 2314 */ 2315 static bool task_can_run_on_remote_rq(struct scx_sched *sch, 2316 struct task_struct *p, struct rq *rq, 2317 bool enforce) 2318 { 2319 s32 cpu = cpu_of(rq); 2320 2321 WARN_ON_ONCE(task_cpu(p) == cpu); 2322 2323 /* 2324 * If @p has migration disabled, @p->cpus_ptr is updated to contain only 2325 * the pinned CPU in migrate_disable_switch() while @p is being switched 2326 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is 2327 * updated and thus another CPU may see @p on a DSQ inbetween leading to 2328 * @p passing the below task_allowed_on_cpu() check while migration is 2329 * disabled. 2330 * 2331 * Test the migration disabled state first as the race window is narrow 2332 * and the BPF scheduler failing to check migration disabled state can 2333 * easily be masked if task_allowed_on_cpu() is done first. 2334 */ 2335 if (unlikely(is_migration_disabled(p))) { 2336 if (enforce) 2337 scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", 2338 p->comm, p->pid, task_cpu(p), cpu); 2339 return false; 2340 } 2341 2342 /* 2343 * We don't require the BPF scheduler to avoid dispatching to offline 2344 * CPUs mostly for convenience but also because CPUs can go offline 2345 * between scx_bpf_dsq_insert() calls and here. Trigger error iff the 2346 * picked CPU is outside the allowed mask. 2347 */ 2348 if (!task_allowed_on_cpu(p, cpu)) { 2349 if (enforce) 2350 scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", 2351 cpu, p->comm, p->pid); 2352 return false; 2353 } 2354 2355 if (!scx_rq_online(rq)) { 2356 if (enforce) 2357 __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); 2358 return false; 2359 } 2360 2361 return true; 2362 } 2363 2364 /** 2365 * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq 2366 * @p: target task 2367 * @dsq: locked DSQ @p is currently on 2368 * @src_rq: rq @p is currently on, stable with @dsq locked 2369 * 2370 * Called with @dsq locked but no rq's locked. 
We want to move @p to a different 2371 * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is 2372 * required when transferring into a local DSQ. Even when transferring into a 2373 * non-local DSQ, it's better to use the same mechanism to protect against 2374 * dequeues and maintain the invariant that @p->scx.dsq can only change while 2375 * @src_rq is locked, which e.g. scx_dump_task() depends on. 2376 * 2377 * We want to grab @src_rq but that can deadlock if we try while locking @dsq, 2378 * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As 2379 * this may race with dequeue, which can't drop the rq lock or fail, do a little 2380 * dancing from our side. 2381 * 2382 * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets 2383 * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu 2384 * would be cleared to -1. While other cpus may have updated it to different 2385 * values afterwards, as this operation can't be preempted or recurse, the 2386 * holding_cpu can never become this CPU again before we're done. Thus, we can 2387 * tell whether we lost to dequeue by testing whether the holding_cpu still 2388 * points to this CPU. See dispatch_dequeue() for the counterpart. 2389 * 2390 * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is 2391 * still valid. %false if lost to dequeue. 2392 */ 2393 static bool unlink_dsq_and_lock_src_rq(struct task_struct *p, 2394 struct scx_dispatch_q *dsq, 2395 struct rq *src_rq) 2396 { 2397 s32 cpu = raw_smp_processor_id(); 2398 2399 lockdep_assert_held(&dsq->lock); 2400 2401 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 2402 task_unlink_from_dsq(p, dsq); 2403 p->scx.holding_cpu = cpu; 2404 2405 raw_spin_unlock(&dsq->lock); 2406 raw_spin_rq_lock(src_rq); 2407 2408 /* task_rq couldn't have changed if we're still the holding cpu */ 2409 return likely(p->scx.holding_cpu == cpu) && 2410 !WARN_ON_ONCE(src_rq != task_rq(p)); 2411 } 2412 2413 static bool consume_remote_task(struct rq *this_rq, 2414 struct task_struct *p, u64 enq_flags, 2415 struct scx_dispatch_q *dsq, struct rq *src_rq) 2416 { 2417 raw_spin_rq_unlock(this_rq); 2418 2419 if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) { 2420 move_remote_task_to_local_dsq(p, enq_flags, src_rq, this_rq); 2421 return true; 2422 } else { 2423 raw_spin_rq_unlock(src_rq); 2424 raw_spin_rq_lock(this_rq); 2425 return false; 2426 } 2427 } 2428 2429 /** 2430 * move_task_between_dsqs() - Move a task from one DSQ to another 2431 * @sch: scx_sched being operated on 2432 * @p: target task 2433 * @enq_flags: %SCX_ENQ_* 2434 * @src_dsq: DSQ @p is currently on, must not be a local DSQ 2435 * @dst_dsq: DSQ @p is being moved to, can be any DSQ 2436 * 2437 * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local 2438 * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq 2439 * will change. As @p's task_rq is locked, this function doesn't need to use the 2440 * holding_cpu mechanism. 2441 * 2442 * On return, @src_dsq is unlocked and only @p's new task_rq, which is the 2443 * return value, is locked. 
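 *
 * Note that if @dst_dsq is the local DSQ of a remote CPU that @p cannot be
 * moved to (see task_can_run_on_remote_rq()), the move falls back to a
 * global DSQ with %SCX_ENQ_GDSQ_FALLBACK set.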
2444 */ 2445 static struct rq *move_task_between_dsqs(struct scx_sched *sch, 2446 struct task_struct *p, u64 enq_flags, 2447 struct scx_dispatch_q *src_dsq, 2448 struct scx_dispatch_q *dst_dsq) 2449 { 2450 struct rq *src_rq = task_rq(p), *dst_rq; 2451 2452 BUG_ON(src_dsq->id == SCX_DSQ_LOCAL); 2453 lockdep_assert_held(&src_dsq->lock); 2454 lockdep_assert_rq_held(src_rq); 2455 2456 if (dst_dsq->id == SCX_DSQ_LOCAL) { 2457 dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); 2458 if (src_rq != dst_rq && 2459 unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { 2460 dst_dsq = find_global_dsq(sch, task_cpu(p)); 2461 dst_rq = src_rq; 2462 enq_flags |= SCX_ENQ_GDSQ_FALLBACK; 2463 } 2464 } else { 2465 /* no need to migrate if destination is a non-local DSQ */ 2466 dst_rq = src_rq; 2467 } 2468 2469 /* 2470 * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different 2471 * CPU, @p will be migrated. 2472 */ 2473 if (dst_dsq->id == SCX_DSQ_LOCAL) { 2474 /* @p is going from a non-local DSQ to a local DSQ */ 2475 if (src_rq == dst_rq) { 2476 task_unlink_from_dsq(p, src_dsq); 2477 move_local_task_to_local_dsq(sch, p, enq_flags, 2478 src_dsq, dst_rq); 2479 raw_spin_unlock(&src_dsq->lock); 2480 } else { 2481 raw_spin_unlock(&src_dsq->lock); 2482 move_remote_task_to_local_dsq(p, enq_flags, 2483 src_rq, dst_rq); 2484 } 2485 } else { 2486 /* 2487 * @p is going from a non-local DSQ to a non-local DSQ. As 2488 * $src_dsq is already locked, do an abbreviated dequeue. 2489 */ 2490 dispatch_dequeue_locked(p, src_dsq); 2491 raw_spin_unlock(&src_dsq->lock); 2492 2493 dispatch_enqueue(sch, dst_rq, dst_dsq, p, enq_flags); 2494 } 2495 2496 return dst_rq; 2497 } 2498 2499 static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, 2500 struct scx_dispatch_q *dsq, u64 enq_flags) 2501 { 2502 struct task_struct *p; 2503 retry: 2504 /* 2505 * The caller can't expect to successfully consume a task if the task's 2506 * addition to @dsq isn't guaranteed to be visible somehow. Test 2507 * @dsq->list without locking and skip if it seems empty. 2508 */ 2509 if (list_empty(&dsq->list)) 2510 return false; 2511 2512 raw_spin_lock(&dsq->lock); 2513 2514 nldsq_for_each_task(p, dsq) { 2515 struct rq *task_rq = task_rq(p); 2516 2517 /* 2518 * This loop can lead to multiple lockup scenarios, e.g. the BPF 2519 * scheduler can put an enormous number of affinitized tasks into 2520 * a contended DSQ, or the outer retry loop can repeatedly race 2521 * against scx_bypass() dequeueing tasks from @dsq trying to put 2522 * the system into the bypass mode. This can easily live-lock the 2523 * machine. If aborting, exit from all non-bypass DSQs. 
2524 */ 2525 if (unlikely(READ_ONCE(sch->aborting)) && dsq->id != SCX_DSQ_BYPASS) 2526 break; 2527 2528 if (rq == task_rq) { 2529 task_unlink_from_dsq(p, dsq); 2530 move_local_task_to_local_dsq(sch, p, enq_flags, dsq, rq); 2531 raw_spin_unlock(&dsq->lock); 2532 return true; 2533 } 2534 2535 if (task_can_run_on_remote_rq(sch, p, rq, false)) { 2536 if (likely(consume_remote_task(rq, p, enq_flags, dsq, task_rq))) 2537 return true; 2538 goto retry; 2539 } 2540 } 2541 2542 raw_spin_unlock(&dsq->lock); 2543 return false; 2544 } 2545 2546 static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq) 2547 { 2548 int node = cpu_to_node(cpu_of(rq)); 2549 2550 return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq, 0); 2551 } 2552 2553 /** 2554 * dispatch_to_local_dsq - Dispatch a task to a local dsq 2555 * @sch: scx_sched being operated on 2556 * @rq: current rq which is locked 2557 * @dst_dsq: destination DSQ 2558 * @p: task to dispatch 2559 * @enq_flags: %SCX_ENQ_* 2560 * 2561 * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local 2562 * DSQ. This function performs all the synchronization dancing needed because 2563 * local DSQs are protected with rq locks. 2564 * 2565 * The caller must have exclusive ownership of @p (e.g. through 2566 * %SCX_OPSS_DISPATCHING). 2567 */ 2568 static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, 2569 struct scx_dispatch_q *dst_dsq, 2570 struct task_struct *p, u64 enq_flags) 2571 { 2572 struct rq *src_rq = task_rq(p); 2573 struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); 2574 struct rq *locked_rq = rq; 2575 2576 /* 2577 * We're synchronized against dequeue through DISPATCHING. As @p can't 2578 * be dequeued, its task_rq and cpus_allowed are stable too. 2579 * 2580 * If dispatching to @rq that @p is already on, no lock dancing needed. 2581 */ 2582 if (rq == src_rq && rq == dst_rq) { 2583 dispatch_enqueue(sch, rq, dst_dsq, p, 2584 enq_flags | SCX_ENQ_CLEAR_OPSS); 2585 return; 2586 } 2587 2588 if (src_rq != dst_rq && 2589 unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { 2590 dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p, 2591 enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK); 2592 return; 2593 } 2594 2595 /* 2596 * @p is on a possibly remote @src_rq which we need to lock to move the 2597 * task. If dequeue is in progress, it'd be locking @src_rq and waiting 2598 * on DISPATCHING, so we can't grab @src_rq lock while holding 2599 * DISPATCHING. 2600 * 2601 * As DISPATCHING guarantees that @p is wholly ours, we can pretend that 2602 * we're moving from a DSQ and use the same mechanism - mark the task 2603 * under transfer with holding_cpu, release DISPATCHING and then follow 2604 * the same protocol. See unlink_dsq_and_lock_src_rq(). 2605 */ 2606 p->scx.holding_cpu = raw_smp_processor_id(); 2607 2608 /* store_release ensures that dequeue sees the above */ 2609 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 2610 2611 /* switch to @src_rq lock */ 2612 if (locked_rq != src_rq) { 2613 raw_spin_rq_unlock(locked_rq); 2614 locked_rq = src_rq; 2615 raw_spin_rq_lock(src_rq); 2616 } 2617 2618 /* task_rq couldn't have changed if we're still the holding cpu */ 2619 if (likely(p->scx.holding_cpu == raw_smp_processor_id()) && 2620 !WARN_ON_ONCE(src_rq != task_rq(p))) { 2621 /* 2622 * If @p is staying on the same rq, there's no need to go 2623 * through the full deactivate/activate cycle. Optimize by 2624 * abbreviating move_remote_task_to_local_dsq(). 
2625 */ 2626 if (src_rq == dst_rq) { 2627 p->scx.holding_cpu = -1; 2628 dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p, 2629 enq_flags); 2630 } else { 2631 move_remote_task_to_local_dsq(p, enq_flags, 2632 src_rq, dst_rq); 2633 /* task has been moved to dst_rq, which is now locked */ 2634 locked_rq = dst_rq; 2635 } 2636 2637 /* if the destination CPU is idle, wake it up */ 2638 if (sched_class_above(p->sched_class, dst_rq->curr->sched_class)) 2639 resched_curr(dst_rq); 2640 } 2641 2642 /* switch back to @rq lock */ 2643 if (locked_rq != rq) { 2644 raw_spin_rq_unlock(locked_rq); 2645 raw_spin_rq_lock(rq); 2646 } 2647 } 2648 2649 /** 2650 * finish_dispatch - Asynchronously finish dispatching a task 2651 * @rq: current rq which is locked 2652 * @p: task to finish dispatching 2653 * @qseq_at_dispatch: qseq when @p started getting dispatched 2654 * @dsq_id: destination DSQ ID 2655 * @enq_flags: %SCX_ENQ_* 2656 * 2657 * Dispatching to local DSQs may need to wait for queueing to complete or 2658 * require rq lock dancing. As we don't wanna do either while inside 2659 * ops.dispatch() to avoid locking order inversion, we split dispatching into 2660 * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the 2661 * task and its qseq. Once ops.dispatch() returns, this function is called to 2662 * finish up. 2663 * 2664 * There is no guarantee that @p is still valid for dispatching or even that it 2665 * was valid in the first place. Make sure that the task is still owned by the 2666 * BPF scheduler and claim the ownership before dispatching. 2667 */ 2668 static void finish_dispatch(struct scx_sched *sch, struct rq *rq, 2669 struct task_struct *p, 2670 unsigned long qseq_at_dispatch, 2671 u64 dsq_id, u64 enq_flags) 2672 { 2673 struct scx_dispatch_q *dsq; 2674 unsigned long opss; 2675 2676 touch_core_sched_dispatch(rq, p); 2677 retry: 2678 /* 2679 * No need for _acquire here. @p is accessed only after a successful 2680 * try_cmpxchg to DISPATCHING. 2681 */ 2682 opss = atomic_long_read(&p->scx.ops_state); 2683 2684 switch (opss & SCX_OPSS_STATE_MASK) { 2685 case SCX_OPSS_DISPATCHING: 2686 case SCX_OPSS_NONE: 2687 /* someone else already got to it */ 2688 return; 2689 case SCX_OPSS_QUEUED: 2690 /* 2691 * If qseq doesn't match, @p has gone through at least one 2692 * dispatch/dequeue and re-enqueue cycle between 2693 * scx_bpf_dsq_insert() and here and we have no claim on it. 2694 */ 2695 if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) 2696 return; 2697 2698 /* see SCX_EV_INSERT_NOT_OWNED definition */ 2699 if (unlikely(!scx_task_on_sched(sch, p))) { 2700 __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); 2701 return; 2702 } 2703 2704 /* 2705 * While we know @p is accessible, we don't yet have a claim on 2706 * it - the BPF scheduler is allowed to dispatch tasks 2707 * spuriously and there can be a racing dequeue attempt. Let's 2708 * claim @p by atomically transitioning it from QUEUED to 2709 * DISPATCHING. 2710 */ 2711 if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 2712 SCX_OPSS_DISPATCHING))) 2713 break; 2714 goto retry; 2715 case SCX_OPSS_QUEUEING: 2716 /* 2717 * do_enqueue_task() is in the process of transferring the task 2718 * to the BPF scheduler while holding @p's rq lock. As we aren't 2719 * holding any kernel or BPF resource that the enqueue path may 2720 * depend upon, it's safe to wait. 
2721 */
2722 wait_ops_state(p, opss);
2723 goto retry;
2724 }
2725
2726 BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
2727
2728 dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, task_cpu(p));
2729
2730 if (dsq->id == SCX_DSQ_LOCAL)
2731 dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
2732 else
2733 dispatch_enqueue(sch, rq, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
2734 }
2735
2736 static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
2737 {
2738 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
2739 u32 u;
2740
2741 for (u = 0; u < dspc->cursor; u++) {
2742 struct scx_dsp_buf_ent *ent = &dspc->buf[u];
2743
2744 finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
2745 ent->enq_flags);
2746 }
2747
2748 dspc->nr_tasks += dspc->cursor;
2749 dspc->cursor = 0;
2750 }
2751
2752 static inline void maybe_queue_balance_callback(struct rq *rq)
2753 {
2754 lockdep_assert_rq_held(rq);
2755
2756 if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
2757 return;
2758
2759 queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
2760 deferred_bal_cb_workfn);
2761
2762 rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
2763 }
2764
2765 /*
2766 * One user of this function is scx_bpf_dispatch() which can be called
2767 * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
2768 * from the call frame.
2769 */
2770 static __always_inline bool
2771 scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
2772 struct task_struct *prev, bool nested)
2773 {
2774 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
2775 int nr_loops = SCX_DSP_MAX_LOOPS;
2776 s32 cpu = cpu_of(rq);
2777 bool prev_on_sch = (prev->sched_class == &ext_sched_class) &&
2778 scx_task_on_sched(sch, prev);
2779
2780 if (consume_global_dsq(sch, rq))
2781 return true;
2782
2783 if (bypass_dsp_enabled(sch)) {
2784 /* if @sch is bypassing, only the bypass DSQs are active */
2785 if (scx_bypassing(sch, cpu))
2786 return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0);
2787
2788 #ifdef CONFIG_EXT_SUB_SCHED
2789 /*
2790 * If @sch isn't bypassing but its children are, @sch is
2791 * responsible for making forward progress for both its own
2792 * tasks that aren't bypassing and the bypassing descendants'
2793 * tasks. The following implements a simple built-in behavior -
2794 * let each CPU try to run the bypass DSQ every Nth time.
2795 *
2796 * Later, if necessary, we can add an ops flag to suppress the
2797 * auto-consumption and a kfunc to consume the bypass DSQ, so
2798 * that the BPF scheduler can fully control scheduling of
2799 * bypassed tasks.
2800 */
2801 struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
2802
2803 if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
2804 consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0)) {
2805 __scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1);
2806 return true;
2807 }
2808 #endif /* CONFIG_EXT_SUB_SCHED */
2809 }
2810
2811 if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
2812 return false;
2813
2814 dspc->rq = rq;
2815
2816 /*
2817 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
2818 * the local DSQ might still end up empty after a successful
2819 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
2820 * produced some tasks, retry. The BPF scheduler may depend on this
2821 * looping behavior to simplify its implementation.
2822 */
2823 do {
2824 dspc->nr_tasks = 0;
2825
2826 if (nested) {
2827 SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ?
prev : NULL); 2828 } else { 2829 /* stash @prev so that nested invocations can access it */ 2830 rq->scx.sub_dispatch_prev = prev; 2831 SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL); 2832 rq->scx.sub_dispatch_prev = NULL; 2833 } 2834 2835 flush_dispatch_buf(sch, rq); 2836 2837 if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) { 2838 rq->scx.flags |= SCX_RQ_BAL_KEEP; 2839 return true; 2840 } 2841 if (rq->scx.local_dsq.nr) 2842 return true; 2843 if (consume_global_dsq(sch, rq)) 2844 return true; 2845 2846 /* 2847 * ops.dispatch() can trap us in this loop by repeatedly 2848 * dispatching ineligible tasks. Break out once in a while to 2849 * allow the watchdog to run. As IRQ can't be enabled in 2850 * balance(), we want to complete this scheduling cycle and then 2851 * start a new one. IOW, we want to call resched_curr() on the 2852 * next, most likely idle, task, not the current one. Use 2853 * __scx_bpf_kick_cpu() for deferred kicking. 2854 */ 2855 if (unlikely(!--nr_loops)) { 2856 scx_kick_cpu(sch, cpu, 0); 2857 break; 2858 } 2859 } while (dspc->nr_tasks); 2860 2861 /* 2862 * Prevent the CPU from going idle while bypassed descendants have tasks 2863 * queued. Without this fallback, bypassed tasks could stall if the host 2864 * scheduler's ops.dispatch() doesn't yield any tasks. 2865 */ 2866 if (bypass_dsp_enabled(sch)) 2867 return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0); 2868 2869 return false; 2870 } 2871 2872 static int balance_one(struct rq *rq, struct task_struct *prev) 2873 { 2874 struct scx_sched *sch = scx_root; 2875 s32 cpu = cpu_of(rq); 2876 2877 lockdep_assert_rq_held(rq); 2878 rq->scx.flags |= SCX_RQ_IN_BALANCE; 2879 rq->scx.flags &= ~SCX_RQ_BAL_KEEP; 2880 2881 if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && 2882 unlikely(rq->scx.cpu_released)) { 2883 /* 2884 * If the previous sched_class for the current CPU was not SCX, 2885 * notify the BPF scheduler that it again has control of the 2886 * core. This callback complements ->cpu_release(), which is 2887 * emitted in switch_class(). 2888 */ 2889 if (SCX_HAS_OP(sch, cpu_acquire)) 2890 SCX_CALL_OP(sch, cpu_acquire, rq, cpu, NULL); 2891 rq->scx.cpu_released = false; 2892 } 2893 2894 if (prev->sched_class == &ext_sched_class) { 2895 update_curr_scx(rq); 2896 2897 /* 2898 * If @prev is runnable & has slice left, it has priority and 2899 * fetching more just increases latency for the fetched tasks. 2900 * Tell pick_task_scx() to keep running @prev. If the BPF 2901 * scheduler wants to handle this explicitly, it should 2902 * implement ->cpu_release(). 2903 * 2904 * See scx_disable_workfn() for the explanation on the bypassing 2905 * test. 2906 */ 2907 if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice && 2908 !scx_bypassing(sch, cpu)) { 2909 rq->scx.flags |= SCX_RQ_BAL_KEEP; 2910 goto has_tasks; 2911 } 2912 } 2913 2914 /* if there already are tasks to run, nothing to do */ 2915 if (rq->scx.local_dsq.nr) 2916 goto has_tasks; 2917 2918 if (scx_dispatch_sched(sch, rq, prev, false)) 2919 goto has_tasks; 2920 2921 /* 2922 * Didn't find another task to run. Keep running @prev unless 2923 * %SCX_OPS_ENQ_LAST is in effect. 
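 * When %SCX_OPS_ENQ_LAST is set, @prev is instead handed back to
 * ops.enqueue() with %SCX_ENQ_LAST from put_prev_task_scx().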
2924 */ 2925 if ((prev->scx.flags & SCX_TASK_QUEUED) && 2926 (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) { 2927 rq->scx.flags |= SCX_RQ_BAL_KEEP; 2928 __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1); 2929 goto has_tasks; 2930 } 2931 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; 2932 return false; 2933 2934 has_tasks: 2935 /* 2936 * @rq may have extra IMMED tasks without reenq scheduled: 2937 * 2938 * - rq_is_open() can't reliably tell when and how slice is going to be 2939 * modified for $curr and allows IMMED tasks to be queued while 2940 * dispatch is in progress. 2941 * 2942 * - A non-IMMED HEAD task can get queued in front of an IMMED task 2943 * between the IMMED queueing and the subsequent scheduling event. 2944 */ 2945 if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed)) 2946 schedule_reenq_local(rq, 0); 2947 2948 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; 2949 return true; 2950 } 2951 2952 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) 2953 { 2954 struct scx_sched *sch = scx_task_sched(p); 2955 2956 if (p->scx.flags & SCX_TASK_QUEUED) { 2957 /* 2958 * Core-sched might decide to execute @p before it is 2959 * dispatched. Call ops_dequeue() to notify the BPF scheduler. 2960 */ 2961 ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC); 2962 dispatch_dequeue(rq, p); 2963 } 2964 2965 p->se.exec_start = rq_clock_task(rq); 2966 2967 /* see dequeue_task_scx() on why we skip when !QUEUED */ 2968 if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED)) 2969 SCX_CALL_OP_TASK(sch, running, rq, p); 2970 2971 clr_task_runnable(p, true); 2972 2973 /* 2974 * @p is getting newly scheduled or got kicked after someone updated its 2975 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). 2976 */ 2977 if ((p->scx.slice == SCX_SLICE_INF) != 2978 (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { 2979 if (p->scx.slice == SCX_SLICE_INF) 2980 rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; 2981 else 2982 rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; 2983 2984 sched_update_tick_dependency(rq); 2985 2986 /* 2987 * For now, let's refresh the load_avgs just when transitioning 2988 * in and out of nohz. In the future, we might want to add a 2989 * mechanism which calls the following periodically on 2990 * tick-stopped CPUs. 2991 */ 2992 update_other_load_avgs(rq); 2993 } 2994 } 2995 2996 static enum scx_cpu_preempt_reason 2997 preempt_reason_from_class(const struct sched_class *class) 2998 { 2999 if (class == &stop_sched_class) 3000 return SCX_CPU_PREEMPT_STOP; 3001 if (class == &dl_sched_class) 3002 return SCX_CPU_PREEMPT_DL; 3003 if (class == &rt_sched_class) 3004 return SCX_CPU_PREEMPT_RT; 3005 return SCX_CPU_PREEMPT_UNKNOWN; 3006 } 3007 3008 static void switch_class(struct rq *rq, struct task_struct *next) 3009 { 3010 struct scx_sched *sch = scx_root; 3011 const struct sched_class *next_class = next->sched_class; 3012 3013 if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) 3014 return; 3015 3016 /* 3017 * The callback is conceptually meant to convey that the CPU is no 3018 * longer under the control of SCX. Therefore, don't invoke the callback 3019 * if the next class is below SCX (in which case the BPF scheduler has 3020 * actively decided not to schedule any tasks on the CPU). 3021 */ 3022 if (sched_class_above(&ext_sched_class, next_class)) 3023 return; 3024 3025 /* 3026 * At this point we know that SCX was preempted by a higher priority 3027 * sched_class, so invoke the ->cpu_release() callback if we have not 3028 * done so already. 
We only send the callback once between SCX being 3029 * preempted, and it regaining control of the CPU. 3030 * 3031 * ->cpu_release() complements ->cpu_acquire(), which is emitted the 3032 * next time that balance_one() is invoked. 3033 */ 3034 if (!rq->scx.cpu_released) { 3035 if (SCX_HAS_OP(sch, cpu_release)) { 3036 struct scx_cpu_release_args args = { 3037 .reason = preempt_reason_from_class(next_class), 3038 .task = next, 3039 }; 3040 3041 SCX_CALL_OP(sch, cpu_release, rq, cpu_of(rq), &args); 3042 } 3043 rq->scx.cpu_released = true; 3044 } 3045 } 3046 3047 static void put_prev_task_scx(struct rq *rq, struct task_struct *p, 3048 struct task_struct *next) 3049 { 3050 struct scx_sched *sch = scx_task_sched(p); 3051 3052 /* see kick_sync_wait_bal_cb() */ 3053 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3054 3055 update_curr_scx(rq); 3056 3057 /* see dequeue_task_scx() on why we skip when !QUEUED */ 3058 if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED)) 3059 SCX_CALL_OP_TASK(sch, stopping, rq, p, true); 3060 3061 if (p->scx.flags & SCX_TASK_QUEUED) { 3062 set_task_runnable(rq, p); 3063 3064 /* 3065 * If @p has slice left and is being put, @p is getting 3066 * preempted by a higher priority scheduler class or core-sched 3067 * forcing a different task. Leave it at the head of the local 3068 * DSQ unless it was an IMMED task. IMMED tasks should not 3069 * linger on a busy CPU, reenqueue them to the BPF scheduler. 3070 */ 3071 if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) { 3072 if (p->scx.flags & SCX_TASK_IMMED) { 3073 p->scx.flags |= SCX_TASK_REENQ_PREEMPTED; 3074 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); 3075 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 3076 } else { 3077 dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD); 3078 } 3079 goto switch_class; 3080 } 3081 3082 /* 3083 * If @p is runnable but we're about to enter a lower 3084 * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell 3085 * ops.enqueue() that @p is the only one available for this cpu, 3086 * which should trigger an explicit follow-up scheduling event. 3087 */ 3088 if (next && sched_class_above(&ext_sched_class, next->sched_class)) { 3089 WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); 3090 do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); 3091 } else { 3092 do_enqueue_task(rq, p, 0, -1); 3093 } 3094 } 3095 3096 switch_class: 3097 if (next && next->sched_class != &ext_sched_class) 3098 switch_class(rq, next); 3099 } 3100 3101 static void kick_sync_wait_bal_cb(struct rq *rq) 3102 { 3103 struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); 3104 unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; 3105 bool waited; 3106 s32 cpu; 3107 3108 /* 3109 * Drop rq lock and enable IRQs while waiting. IRQs must be enabled 3110 * — a target CPU may be waiting for us to process an IPI (e.g. TLB 3111 * flush) while we wait for its kick_sync to advance. 3112 * 3113 * Also, keep advancing our own kick_sync so that new kick_sync waits 3114 * targeting us, which can start after we drop the lock, cannot form 3115 * cyclic dependencies. 3116 */ 3117 retry: 3118 waited = false; 3119 for_each_cpu(cpu, rq->scx.cpus_to_sync) { 3120 /* 3121 * smp_load_acquire() pairs with smp_store_release() on 3122 * kick_sync updates on the target CPUs. 
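 * The targets advance their kick_sync in put_prev_task_scx() and
 * do_pick_task_scx(), so observing a value that differs from the snapshot
 * in ksyncs[] means the target has gone through a scheduling pass since
 * the snapshot was taken.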
3123 */ 3124 if (cpu == cpu_of(rq) || 3125 smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { 3126 cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); 3127 continue; 3128 } 3129 3130 raw_spin_rq_unlock_irq(rq); 3131 while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { 3132 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3133 cpu_relax(); 3134 } 3135 raw_spin_rq_lock_irq(rq); 3136 waited = true; 3137 } 3138 3139 if (waited) 3140 goto retry; 3141 } 3142 3143 static struct task_struct *first_local_task(struct rq *rq) 3144 { 3145 return list_first_entry_or_null(&rq->scx.local_dsq.list, 3146 struct task_struct, scx.dsq_list.node); 3147 } 3148 3149 static struct task_struct * 3150 do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) 3151 { 3152 struct task_struct *prev = rq->curr; 3153 bool keep_prev; 3154 struct task_struct *p; 3155 3156 /* see kick_sync_wait_bal_cb() */ 3157 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3158 3159 rq_modified_begin(rq, &ext_sched_class); 3160 3161 rq_unpin_lock(rq, rf); 3162 balance_one(rq, prev); 3163 rq_repin_lock(rq, rf); 3164 maybe_queue_balance_callback(rq); 3165 3166 /* 3167 * Defer to a balance callback which can drop rq lock and enable 3168 * IRQs. Waiting directly in the pick path would deadlock against 3169 * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. 3170 */ 3171 if (unlikely(rq->scx.kick_sync_pending)) { 3172 rq->scx.kick_sync_pending = false; 3173 queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, 3174 kick_sync_wait_bal_cb); 3175 } 3176 3177 /* 3178 * If any higher-priority sched class enqueued a runnable task on 3179 * this rq during balance_one(), abort and return RETRY_TASK, so 3180 * that the scheduler loop can restart. 3181 * 3182 * If @force_scx is true, always try to pick a SCHED_EXT task, 3183 * regardless of any higher-priority sched classes activity. 3184 */ 3185 if (!force_scx && rq_modified_above(rq, &ext_sched_class)) 3186 return RETRY_TASK; 3187 3188 keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; 3189 if (unlikely(keep_prev && 3190 prev->sched_class != &ext_sched_class)) { 3191 WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); 3192 keep_prev = false; 3193 } 3194 3195 /* 3196 * If balance_one() is telling us to keep running @prev, replenish slice 3197 * if necessary and keep running @prev. Otherwise, pop the first one 3198 * from the local DSQ. 3199 */ 3200 if (keep_prev) { 3201 p = prev; 3202 if (!p->scx.slice) 3203 refill_task_slice_dfl(scx_task_sched(p), p); 3204 } else { 3205 p = first_local_task(rq); 3206 if (!p) 3207 return NULL; 3208 3209 if (unlikely(!p->scx.slice)) { 3210 struct scx_sched *sch = scx_task_sched(p); 3211 3212 if (!scx_bypassing(sch, cpu_of(rq)) && 3213 !sch->warned_zero_slice) { 3214 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", 3215 p->comm, p->pid, __func__); 3216 sch->warned_zero_slice = true; 3217 } 3218 refill_task_slice_dfl(sch, p); 3219 } 3220 } 3221 3222 return p; 3223 } 3224 3225 static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) 3226 { 3227 return do_pick_task_scx(rq, rf, false); 3228 } 3229 3230 /* 3231 * Select the next task to run from the ext scheduling class. 3232 * 3233 * Use do_pick_task_scx() directly with @force_scx enabled, since the 3234 * dl_server must always select a sched_ext task. 
3235 */ 3236 static struct task_struct * 3237 ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) 3238 { 3239 if (!scx_enabled()) 3240 return NULL; 3241 3242 return do_pick_task_scx(dl_se->rq, rf, true); 3243 } 3244 3245 /* 3246 * Initialize the ext server deadline entity. 3247 */ 3248 void ext_server_init(struct rq *rq) 3249 { 3250 struct sched_dl_entity *dl_se = &rq->ext_server; 3251 3252 init_dl_entity(dl_se); 3253 3254 dl_server_init(dl_se, rq, ext_server_pick_task); 3255 } 3256 3257 #ifdef CONFIG_SCHED_CORE 3258 /** 3259 * scx_prio_less - Task ordering for core-sched 3260 * @a: task A 3261 * @b: task B 3262 * @in_fi: in forced idle state 3263 * 3264 * Core-sched is implemented as an additional scheduling layer on top of the 3265 * usual sched_class'es and needs to find out the expected task ordering. For 3266 * SCX, core-sched calls this function to interrogate the task ordering. 3267 * 3268 * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used 3269 * to implement the default task ordering. The older the timestamp, the higher 3270 * priority the task - the global FIFO ordering matching the default scheduling 3271 * behavior. 3272 * 3273 * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to 3274 * implement FIFO ordering within each local DSQ. See pick_task_scx(). 3275 */ 3276 bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, 3277 bool in_fi) 3278 { 3279 struct scx_sched *sch_a = scx_task_sched(a); 3280 struct scx_sched *sch_b = scx_task_sched(b); 3281 3282 /* 3283 * The const qualifiers are dropped from task_struct pointers when 3284 * calling ops.core_sched_before(). Accesses are controlled by the 3285 * verifier. 3286 */ 3287 if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) && 3288 !scx_bypassing(sch_a, task_cpu(a))) 3289 return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before, 3290 task_rq(a), 3291 (struct task_struct *)a, 3292 (struct task_struct *)b); 3293 else 3294 return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); 3295 } 3296 #endif /* CONFIG_SCHED_CORE */ 3297 3298 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) 3299 { 3300 struct scx_sched *sch = scx_task_sched(p); 3301 bool bypassing; 3302 3303 /* 3304 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it 3305 * can be a good migration opportunity with low cache and memory 3306 * footprint. Returning a CPU different than @prev_cpu triggers 3307 * immediate rq migration. However, for SCX, as the current rq 3308 * association doesn't dictate where the task is going to run, this 3309 * doesn't fit well. If necessary, we can later add a dedicated method 3310 * which can decide to preempt self to force it through the regular 3311 * scheduling path. 
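 *
 * For reference, a minimal BPF-side counterpart is sketched below. It is
 * illustrative only (the op name is made up) and simply pairs
 * ops.select_cpu() with a direct dispatch so that a task woken onto an
 * idle CPU skips ops.enqueue(), which is roughly what the
 * !ops.select_cpu() fallback below does with scx_select_cpu_dfl() and
 * %SCX_DSQ_LOCAL:
 *
 *	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
 *			   s32 prev_cpu, u64 wake_flags)
 *	{
 *		bool is_idle = false;
 *		s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags,
 *						 &is_idle);
 *
 *		if (is_idle)
 *			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 *		return cpu;
 *	}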
3312 */ 3313 if (unlikely(wake_flags & WF_EXEC)) 3314 return prev_cpu; 3315 3316 bypassing = scx_bypassing(sch, task_cpu(p)); 3317 if (likely(SCX_HAS_OP(sch, select_cpu)) && !bypassing) { 3318 s32 cpu; 3319 struct task_struct **ddsp_taskp; 3320 3321 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 3322 WARN_ON_ONCE(*ddsp_taskp); 3323 *ddsp_taskp = p; 3324 3325 this_rq()->scx.in_select_cpu = true; 3326 cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, prev_cpu, wake_flags); 3327 this_rq()->scx.in_select_cpu = false; 3328 p->scx.selected_cpu = cpu; 3329 *ddsp_taskp = NULL; 3330 if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()")) 3331 return cpu; 3332 else 3333 return prev_cpu; 3334 } else { 3335 s32 cpu; 3336 3337 cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); 3338 if (cpu >= 0) { 3339 refill_task_slice_dfl(sch, p); 3340 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; 3341 } else { 3342 cpu = prev_cpu; 3343 } 3344 p->scx.selected_cpu = cpu; 3345 3346 if (bypassing) 3347 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 3348 return cpu; 3349 } 3350 } 3351 3352 static void task_woken_scx(struct rq *rq, struct task_struct *p) 3353 { 3354 run_deferred(rq); 3355 } 3356 3357 static void set_cpus_allowed_scx(struct task_struct *p, 3358 struct affinity_context *ac) 3359 { 3360 struct scx_sched *sch = scx_task_sched(p); 3361 3362 set_cpus_allowed_common(p, ac); 3363 3364 if (task_dead_and_done(p)) 3365 return; 3366 3367 /* 3368 * The effective cpumask is stored in @p->cpus_ptr which may temporarily 3369 * differ from the configured one in @p->cpus_mask. Always tell the bpf 3370 * scheduler the effective one. 3371 * 3372 * Fine-grained memory write control is enforced by BPF making the const 3373 * designation pointless. Cast it away when calling the operation. 3374 */ 3375 if (SCX_HAS_OP(sch, set_cpumask)) 3376 SCX_CALL_OP_TASK(sch, set_cpumask, task_rq(p), p, (struct cpumask *)p->cpus_ptr); 3377 } 3378 3379 static void handle_hotplug(struct rq *rq, bool online) 3380 { 3381 struct scx_sched *sch = scx_root; 3382 s32 cpu = cpu_of(rq); 3383 3384 atomic_long_inc(&scx_hotplug_seq); 3385 3386 /* 3387 * scx_root updates are protected by cpus_read_lock() and will stay 3388 * stable here. Note that we can't depend on scx_enabled() test as the 3389 * hotplug ops need to be enabled before __scx_enabled is set. 3390 */ 3391 if (unlikely(!sch)) 3392 return; 3393 3394 if (scx_enabled()) 3395 scx_idle_update_selcpu_topology(&sch->ops); 3396 3397 if (online && SCX_HAS_OP(sch, cpu_online)) 3398 SCX_CALL_OP(sch, cpu_online, NULL, cpu); 3399 else if (!online && SCX_HAS_OP(sch, cpu_offline)) 3400 SCX_CALL_OP(sch, cpu_offline, NULL, cpu); 3401 else 3402 scx_exit(sch, SCX_EXIT_UNREG_KERN, 3403 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 3404 "cpu %d going %s, exiting scheduler", cpu, 3405 online ? 
"online" : "offline"); 3406 } 3407 3408 void scx_rq_activate(struct rq *rq) 3409 { 3410 handle_hotplug(rq, true); 3411 } 3412 3413 void scx_rq_deactivate(struct rq *rq) 3414 { 3415 handle_hotplug(rq, false); 3416 } 3417 3418 static void rq_online_scx(struct rq *rq) 3419 { 3420 rq->scx.flags |= SCX_RQ_ONLINE; 3421 } 3422 3423 static void rq_offline_scx(struct rq *rq) 3424 { 3425 rq->scx.flags &= ~SCX_RQ_ONLINE; 3426 } 3427 3428 static bool check_rq_for_timeouts(struct rq *rq) 3429 { 3430 struct scx_sched *sch; 3431 struct task_struct *p; 3432 struct rq_flags rf; 3433 bool timed_out = false; 3434 3435 rq_lock_irqsave(rq, &rf); 3436 sch = rcu_dereference_bh(scx_root); 3437 if (unlikely(!sch)) 3438 goto out_unlock; 3439 3440 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { 3441 struct scx_sched *sch = scx_task_sched(p); 3442 unsigned long last_runnable = p->scx.runnable_at; 3443 3444 if (unlikely(time_after(jiffies, 3445 last_runnable + READ_ONCE(sch->watchdog_timeout)))) { 3446 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); 3447 3448 scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, 3449 "%s[%d] failed to run for %u.%03us", 3450 p->comm, p->pid, dur_ms / 1000, dur_ms % 1000); 3451 timed_out = true; 3452 break; 3453 } 3454 } 3455 out_unlock: 3456 rq_unlock_irqrestore(rq, &rf); 3457 return timed_out; 3458 } 3459 3460 static void scx_watchdog_workfn(struct work_struct *work) 3461 { 3462 unsigned long intv; 3463 int cpu; 3464 3465 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 3466 3467 for_each_online_cpu(cpu) { 3468 if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) 3469 break; 3470 3471 cond_resched(); 3472 } 3473 3474 intv = READ_ONCE(scx_watchdog_interval); 3475 if (intv < ULONG_MAX) 3476 queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv); 3477 } 3478 3479 void scx_tick(struct rq *rq) 3480 { 3481 struct scx_sched *root; 3482 unsigned long last_check; 3483 3484 if (!scx_enabled()) 3485 return; 3486 3487 root = rcu_dereference_bh(scx_root); 3488 if (unlikely(!root)) 3489 return; 3490 3491 last_check = READ_ONCE(scx_watchdog_timestamp); 3492 if (unlikely(time_after(jiffies, 3493 last_check + READ_ONCE(root->watchdog_timeout)))) { 3494 u32 dur_ms = jiffies_to_msecs(jiffies - last_check); 3495 3496 scx_exit(root, SCX_EXIT_ERROR_STALL, 0, 3497 "watchdog failed to check in for %u.%03us", 3498 dur_ms / 1000, dur_ms % 1000); 3499 } 3500 3501 update_other_load_avgs(rq); 3502 } 3503 3504 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) 3505 { 3506 struct scx_sched *sch = scx_task_sched(curr); 3507 3508 update_curr_scx(rq); 3509 3510 /* 3511 * While disabling, always resched and refresh core-sched timestamp as 3512 * we can't trust the slice management or ops.core_sched_before(). 3513 */ 3514 if (scx_bypassing(sch, cpu_of(rq))) { 3515 curr->scx.slice = 0; 3516 touch_core_sched(rq, curr); 3517 } else if (SCX_HAS_OP(sch, tick)) { 3518 SCX_CALL_OP_TASK(sch, tick, rq, curr); 3519 } 3520 3521 if (!curr->scx.slice) 3522 resched_curr(rq); 3523 } 3524 3525 #ifdef CONFIG_EXT_GROUP_SCHED 3526 static struct cgroup *tg_cgrp(struct task_group *tg) 3527 { 3528 /* 3529 * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, 3530 * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the 3531 * root cgroup. 
3532 */ 3533 if (tg && tg->css.cgroup) 3534 return tg->css.cgroup; 3535 else 3536 return &cgrp_dfl_root.cgrp; 3537 } 3538 3539 #define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), 3540 3541 #else /* CONFIG_EXT_GROUP_SCHED */ 3542 3543 #define SCX_INIT_TASK_ARGS_CGROUP(tg) 3544 3545 #endif /* CONFIG_EXT_GROUP_SCHED */ 3546 3547 static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) 3548 { 3549 int ret; 3550 3551 p->scx.disallow = false; 3552 3553 if (SCX_HAS_OP(sch, init_task)) { 3554 struct scx_init_task_args args = { 3555 SCX_INIT_TASK_ARGS_CGROUP(task_group(p)) 3556 .fork = fork, 3557 }; 3558 3559 ret = SCX_CALL_OP_RET(sch, init_task, NULL, p, &args); 3560 if (unlikely(ret)) { 3561 ret = ops_sanitize_err(sch, "init_task", ret); 3562 return ret; 3563 } 3564 } 3565 3566 if (p->scx.disallow) { 3567 if (unlikely(scx_parent(sch))) { 3568 scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]", 3569 p->comm, p->pid); 3570 } else if (unlikely(fork)) { 3571 scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", 3572 p->comm, p->pid); 3573 } else { 3574 struct rq *rq; 3575 struct rq_flags rf; 3576 3577 rq = task_rq_lock(p, &rf); 3578 3579 /* 3580 * We're in the load path and @p->policy will be applied 3581 * right after. Reverting @p->policy here and rejecting 3582 * %SCHED_EXT transitions from scx_check_setscheduler() 3583 * guarantees that if ops.init_task() sets @p->disallow, 3584 * @p can never be in SCX. 3585 */ 3586 if (p->policy == SCHED_EXT) { 3587 p->policy = SCHED_NORMAL; 3588 atomic_long_inc(&scx_nr_rejected); 3589 } 3590 3591 task_rq_unlock(rq, p, &rf); 3592 } 3593 } 3594 3595 return 0; 3596 } 3597 3598 static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p) 3599 { 3600 struct rq *rq = task_rq(p); 3601 u32 weight; 3602 3603 lockdep_assert_rq_held(rq); 3604 3605 /* 3606 * Verify the task is not in BPF scheduler's custody. If flag 3607 * transitions are consistent, the flag should always be clear 3608 * here. 3609 */ 3610 WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); 3611 3612 /* 3613 * Set the weight before calling ops.enable() so that the scheduler 3614 * doesn't see a stale value if they inspect the task struct. 3615 */ 3616 if (task_has_idle_policy(p)) 3617 weight = WEIGHT_IDLEPRIO; 3618 else 3619 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; 3620 3621 p->scx.weight = sched_weight_to_cgroup(weight); 3622 3623 if (SCX_HAS_OP(sch, enable)) 3624 SCX_CALL_OP_TASK(sch, enable, rq, p); 3625 3626 if (SCX_HAS_OP(sch, set_weight)) 3627 SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); 3628 } 3629 3630 static void scx_enable_task(struct scx_sched *sch, struct task_struct *p) 3631 { 3632 __scx_enable_task(sch, p); 3633 scx_set_task_state(p, SCX_TASK_ENABLED); 3634 } 3635 3636 static void scx_disable_task(struct scx_sched *sch, struct task_struct *p) 3637 { 3638 struct rq *rq = task_rq(p); 3639 3640 lockdep_assert_rq_held(rq); 3641 WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); 3642 3643 clear_direct_dispatch(p); 3644 3645 if (SCX_HAS_OP(sch, disable)) 3646 SCX_CALL_OP_TASK(sch, disable, rq, p); 3647 scx_set_task_state(p, SCX_TASK_READY); 3648 3649 /* 3650 * Verify the task is not in BPF scheduler's custody. If flag 3651 * transitions are consistent, the flag should always be clear 3652 * here. 
3653 */ 3654 WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); 3655 } 3656 3657 static void __scx_disable_and_exit_task(struct scx_sched *sch, 3658 struct task_struct *p) 3659 { 3660 struct scx_exit_task_args args = { 3661 .cancelled = false, 3662 }; 3663 3664 lockdep_assert_held(&p->pi_lock); 3665 lockdep_assert_rq_held(task_rq(p)); 3666 3667 switch (scx_get_task_state(p)) { 3668 case SCX_TASK_NONE: 3669 return; 3670 case SCX_TASK_INIT: 3671 args.cancelled = true; 3672 break; 3673 case SCX_TASK_READY: 3674 break; 3675 case SCX_TASK_ENABLED: 3676 scx_disable_task(sch, p); 3677 break; 3678 default: 3679 WARN_ON_ONCE(true); 3680 return; 3681 } 3682 3683 if (SCX_HAS_OP(sch, exit_task)) 3684 SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); 3685 } 3686 3687 /* 3688 * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never 3689 * ran. The task state has not been transitioned, so this mirrors the 3690 * SCX_TASK_INIT branch in __scx_disable_and_exit_task(). 3691 */ 3692 static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p) 3693 { 3694 struct scx_exit_task_args args = { .cancelled = true }; 3695 3696 lockdep_assert_held(&p->pi_lock); 3697 lockdep_assert_rq_held(task_rq(p)); 3698 3699 if (SCX_HAS_OP(sch, exit_task)) 3700 SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); 3701 } 3702 3703 static void scx_disable_and_exit_task(struct scx_sched *sch, 3704 struct task_struct *p) 3705 { 3706 __scx_disable_and_exit_task(sch, p); 3707 3708 /* 3709 * If set, @p exited between __scx_init_task() and scx_enable_task() in 3710 * scx_sub_enable() and is initialized for both the associated sched and 3711 * its parent. Exit for the child too - scx_enable_task() never ran for 3712 * it, so undo only init_task. The flag is only set on the sub-enable 3713 * path, so it's always clear when @p arrives here in %SCX_TASK_NONE. 3714 */ 3715 if (p->scx.flags & SCX_TASK_SUB_INIT) { 3716 if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) 3717 scx_sub_init_cancel_task(scx_enabling_sub_sched, p); 3718 p->scx.flags &= ~SCX_TASK_SUB_INIT; 3719 } 3720 3721 scx_set_task_sched(p, NULL); 3722 scx_set_task_state(p, SCX_TASK_NONE); 3723 } 3724 3725 void init_scx_entity(struct sched_ext_entity *scx) 3726 { 3727 memset(scx, 0, sizeof(*scx)); 3728 INIT_LIST_HEAD(&scx->dsq_list.node); 3729 RB_CLEAR_NODE(&scx->dsq_priq); 3730 scx->sticky_cpu = -1; 3731 scx->holding_cpu = -1; 3732 INIT_LIST_HEAD(&scx->runnable_node); 3733 scx->runnable_at = jiffies; 3734 scx->ddsp_dsq_id = SCX_DSQ_INVALID; 3735 scx->slice = SCX_SLICE_DFL; 3736 } 3737 3738 void scx_pre_fork(struct task_struct *p) 3739 { 3740 /* 3741 * BPF scheduler enable/disable paths want to be able to iterate and 3742 * update all tasks which can become complex when racing forks. As 3743 * enable/disable are very cold paths, let's use a percpu_rwsem to 3744 * exclude forks. 
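	 * The matching percpu_up_read() happens in scx_post_fork() once the new
	 * task is linked, or in scx_cancel_fork() if the fork is aborted.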
3745 */ 3746 percpu_down_read(&scx_fork_rwsem); 3747 } 3748 3749 int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) 3750 { 3751 s32 ret; 3752 3753 percpu_rwsem_assert_held(&scx_fork_rwsem); 3754 3755 if (scx_init_task_enabled) { 3756 #ifdef CONFIG_EXT_SUB_SCHED 3757 struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched; 3758 #else 3759 struct scx_sched *sch = scx_root; 3760 #endif 3761 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 3762 ret = __scx_init_task(sch, p, true); 3763 if (unlikely(ret)) { 3764 scx_set_task_state(p, SCX_TASK_NONE); 3765 return ret; 3766 } 3767 scx_set_task_state(p, SCX_TASK_INIT); 3768 scx_set_task_sched(p, sch); 3769 } 3770 3771 return 0; 3772 } 3773 3774 void scx_post_fork(struct task_struct *p) 3775 { 3776 if (scx_init_task_enabled) { 3777 scx_set_task_state(p, SCX_TASK_READY); 3778 3779 /* 3780 * Enable the task immediately if it's running on sched_ext. 3781 * Otherwise, it'll be enabled in switching_to_scx() if and 3782 * when it's ever configured to run with a SCHED_EXT policy. 3783 */ 3784 if (p->sched_class == &ext_sched_class) { 3785 struct rq_flags rf; 3786 struct rq *rq; 3787 3788 rq = task_rq_lock(p, &rf); 3789 scx_enable_task(scx_task_sched(p), p); 3790 task_rq_unlock(rq, p, &rf); 3791 } 3792 } 3793 3794 raw_spin_lock_irq(&scx_tasks_lock); 3795 list_add_tail(&p->scx.tasks_node, &scx_tasks); 3796 raw_spin_unlock_irq(&scx_tasks_lock); 3797 3798 percpu_up_read(&scx_fork_rwsem); 3799 } 3800 3801 void scx_cancel_fork(struct task_struct *p) 3802 { 3803 if (scx_enabled()) { 3804 struct rq *rq; 3805 struct rq_flags rf; 3806 3807 rq = task_rq_lock(p, &rf); 3808 WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); 3809 scx_disable_and_exit_task(scx_task_sched(p), p); 3810 task_rq_unlock(rq, p, &rf); 3811 } 3812 3813 percpu_up_read(&scx_fork_rwsem); 3814 } 3815 3816 /** 3817 * task_dead_and_done - Is a task dead and done running? 3818 * @p: target task 3819 * 3820 * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the 3821 * task no longer exists from SCX's POV. However, certain sched_class ops may be 3822 * invoked on these dead tasks leading to failures - e.g. sched_setscheduler() 3823 * may try to switch a task which finished sched_ext_dead() back into SCX 3824 * triggering invalid SCX task state transitions and worse. 3825 * 3826 * Once a task has finished the final switch, sched_ext_dead() is the only thing 3827 * that needs to happen on the task. Use this test to short-circuit sched_class 3828 * operations which may be called on dead tasks. 3829 */ 3830 static bool task_dead_and_done(struct task_struct *p) 3831 { 3832 struct rq *rq = task_rq(p); 3833 3834 lockdep_assert_rq_held(rq); 3835 3836 /* 3837 * In do_task_dead(), a dying task sets %TASK_DEAD with preemption 3838 * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p 3839 * won't ever run again. 3840 */ 3841 return unlikely(READ_ONCE(p->__state) == TASK_DEAD) && 3842 !task_on_cpu(rq, p); 3843 } 3844 3845 void sched_ext_dead(struct task_struct *p) 3846 { 3847 unsigned long flags; 3848 3849 /* 3850 * By the time control reaches here, @p has %TASK_DEAD set, switched out 3851 * for the last time and then dropped the rq lock - task_dead_and_done() 3852 * should be returning %true nullifying the straggling sched_class ops. 3853 * Remove from scx_tasks and exit @p. 
3854 */ 3855 raw_spin_lock_irqsave(&scx_tasks_lock, flags); 3856 list_del_init(&p->scx.tasks_node); 3857 raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); 3858 3859 /* 3860 * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> 3861 * ENABLED transitions can't race us. Disable ops for @p. 3862 * 3863 * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see 3864 * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup 3865 * iteration is only used from sub-sched paths, which require root 3866 * enabled. Root enable transitions every live task to at least READY. 3867 * 3868 * %INIT_BEGIN means ops.init_task() is running for @p. Don't call 3869 * into ops; transition to %DEAD so the post-init recheck unwinds 3870 * via scx_sub_init_cancel_task(). 3871 */ 3872 if (scx_get_task_state(p) != SCX_TASK_NONE) { 3873 struct rq_flags rf; 3874 struct rq *rq; 3875 3876 rq = task_rq_lock(p, &rf); 3877 if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN) 3878 scx_disable_and_exit_task(scx_task_sched(p), p); 3879 scx_set_task_state(p, SCX_TASK_DEAD); 3880 task_rq_unlock(rq, p, &rf); 3881 } 3882 } 3883 3884 static void reweight_task_scx(struct rq *rq, struct task_struct *p, 3885 const struct load_weight *lw) 3886 { 3887 struct scx_sched *sch = scx_task_sched(p); 3888 3889 lockdep_assert_rq_held(task_rq(p)); 3890 3891 if (task_dead_and_done(p)) 3892 return; 3893 3894 p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); 3895 if (SCX_HAS_OP(sch, set_weight)) 3896 SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); 3897 } 3898 3899 static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) 3900 { 3901 } 3902 3903 static void switching_to_scx(struct rq *rq, struct task_struct *p) 3904 { 3905 struct scx_sched *sch = scx_task_sched(p); 3906 3907 if (task_dead_and_done(p)) 3908 return; 3909 3910 scx_enable_task(sch, p); 3911 3912 /* 3913 * set_cpus_allowed_scx() is not called while @p is associated with a 3914 * different scheduler class. Keep the BPF scheduler up-to-date. 3915 */ 3916 if (SCX_HAS_OP(sch, set_cpumask)) 3917 SCX_CALL_OP_TASK(sch, set_cpumask, rq, p, (struct cpumask *)p->cpus_ptr); 3918 } 3919 3920 static void switched_from_scx(struct rq *rq, struct task_struct *p) 3921 { 3922 if (task_dead_and_done(p)) 3923 return; 3924 3925 /* 3926 * %NONE means SCX is no longer tracking @p at the task level (e.g. 3927 * scx_fail_parent() handed @p back to the parent at NONE pending the 3928 * parent's own teardown). There is nothing to disable; calling 3929 * scx_disable_task() would WARN on the non-%ENABLED state and trigger a 3930 * NONE -> READY validation failure. 3931 */ 3932 if (scx_get_task_state(p) == SCX_TASK_NONE) 3933 return; 3934 3935 scx_disable_task(scx_task_sched(p), p); 3936 } 3937 3938 static void switched_to_scx(struct rq *rq, struct task_struct *p) {} 3939 3940 int scx_check_setscheduler(struct task_struct *p, int policy) 3941 { 3942 lockdep_assert_rq_held(task_rq(p)); 3943 3944 /* if disallow, reject transitioning into SCX */ 3945 if (scx_enabled() && READ_ONCE(p->scx.disallow) && 3946 p->policy != policy && policy == SCHED_EXT) 3947 return -EACCES; 3948 3949 return 0; 3950 } 3951 3952 static void process_ddsp_deferred_locals(struct rq *rq) 3953 { 3954 struct task_struct *p; 3955 3956 lockdep_assert_rq_held(rq); 3957 3958 /* 3959 * Now that @rq can be unlocked, execute the deferred enqueueing of 3960 * tasks directly dispatched to the local DSQs of other CPUs. See 3961 * direct_dispatch(). 
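	 * They were deferred because a remote CPU's rq lock can't be acquired
	 * while @rq was still held on the enqueue path.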
Keep popping from the head instead of using
3962 * list_for_each_entry_safe() as dispatch_to_local_dsq() may unlock @rq
3963 * temporarily.
3964 */
3965 while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
3966 struct task_struct, scx.dsq_list.node))) {
3967 struct scx_sched *sch = scx_task_sched(p);
3968 struct scx_dispatch_q *dsq;
3969 u64 dsq_id = p->scx.ddsp_dsq_id;
3970 u64 enq_flags = p->scx.ddsp_enq_flags;
3971
3972 list_del_init(&p->scx.dsq_list.node);
3973 clear_direct_dispatch(p);
3974
3975 dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p));
3976 if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
3977 dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
3978 }
3979 }
3980
3981 /*
3982 * Determine whether @p should be reenqueued from a local DSQ.
3983 *
3984 * @reenq_flags is mutable and accumulates state across the DSQ walk:
3985 *
3986 * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First"
3987 * tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at
3988 * the head consumes the first slot.
3989 *
3990 * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if
3991 * rq_is_open() is true.
3992 *
3993 * An IMMED task is kept (returns %false) only if it's the first task in the DSQ
3994 * AND the current task is done, i.e. it will execute immediately. All other
3995 * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head,
3996 * every IMMED task behind it gets reenqueued.
3997 *
3998 * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ and with
3999 * %SCX_TASK_REENQ_IMMED set on the task. If the BPF scheduler dispatches back to the same local
4000 * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers
4001 * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT
4002 * in process_deferred_reenq_locals().
4003 */
4004 static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason)
4005 {
4006 bool first;
4007
4008 first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST);
4009 *reenq_flags |= SCX_REENQ_TSR_NOT_FIRST;
4010
4011 *reason = SCX_TASK_REENQ_KFUNC;
4012
4013 if ((p->scx.flags & SCX_TASK_IMMED) &&
4014 (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) {
4015 __scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1);
4016 *reason = SCX_TASK_REENQ_IMMED;
4017 return true;
4018 }
4019
4020 return *reenq_flags & SCX_REENQ_ANY;
4021 }
4022
4023 static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
4024 {
4025 LIST_HEAD(tasks);
4026 u32 nr_enqueued = 0;
4027 struct task_struct *p, *n;
4028
4029 lockdep_assert_rq_held(rq);
4030
4031 if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK))
4032 reenq_flags &= ~__SCX_REENQ_TSR_MASK;
4033 if (rq_is_open(rq, 0))
4034 reenq_flags |= SCX_REENQ_TSR_RQ_OPEN;
4035
4036 /*
4037 * The BPF scheduler may choose to dispatch tasks back to
4038 * @rq->scx.local_dsq. Move all candidate tasks off to a private list
4039 * first to avoid processing the same tasks repeatedly.
4040 */
4041 list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
4042 scx.dsq_list.node) {
4043 struct scx_sched *task_sch = scx_task_sched(p);
4044 u32 reason;
4045
4046 /*
4047 * If @p is being migrated, @p's current CPU may not agree with
4048 * its allowed CPUs and the migration_cpu_stop is about to
4049 * deactivate and re-activate @p anyway. Skip re-enqueueing.
4050 * 4051 * While racing sched property changes may also dequeue and 4052 * re-enqueue a migrating task while its current CPU and allowed 4053 * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to 4054 * the current local DSQ for running tasks and thus are not 4055 * visible to the BPF scheduler. 4056 */ 4057 if (p->migration_pending) 4058 continue; 4059 4060 if (!scx_is_descendant(task_sch, sch)) 4061 continue; 4062 4063 if (!local_task_should_reenq(p, &reenq_flags, &reason)) 4064 continue; 4065 4066 dispatch_dequeue(rq, p); 4067 4068 if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) 4069 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4070 p->scx.flags |= reason; 4071 4072 list_add_tail(&p->scx.dsq_list.node, &tasks); 4073 } 4074 4075 list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { 4076 list_del_init(&p->scx.dsq_list.node); 4077 4078 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); 4079 4080 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4081 nr_enqueued++; 4082 } 4083 4084 return nr_enqueued; 4085 } 4086 4087 static void process_deferred_reenq_locals(struct rq *rq) 4088 { 4089 u64 seq = ++rq->scx.deferred_reenq_locals_seq; 4090 4091 lockdep_assert_rq_held(rq); 4092 4093 while (true) { 4094 struct scx_sched *sch; 4095 u64 reenq_flags; 4096 bool skip = false; 4097 4098 scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { 4099 struct scx_deferred_reenq_local *drl = 4100 list_first_entry_or_null(&rq->scx.deferred_reenq_locals, 4101 struct scx_deferred_reenq_local, 4102 node); 4103 struct scx_sched_pcpu *sch_pcpu; 4104 4105 if (!drl) 4106 return; 4107 4108 sch_pcpu = container_of(drl, struct scx_sched_pcpu, 4109 deferred_reenq_local); 4110 sch = sch_pcpu->sch; 4111 4112 reenq_flags = drl->flags; 4113 WRITE_ONCE(drl->flags, 0); 4114 list_del_init(&drl->node); 4115 4116 if (likely(drl->seq != seq)) { 4117 drl->seq = seq; 4118 drl->cnt = 0; 4119 } else { 4120 if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) { 4121 scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times", 4122 drl->cnt); 4123 skip = true; 4124 } 4125 4126 __scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1); 4127 } 4128 } 4129 4130 if (!skip) { 4131 /* see schedule_dsq_reenq() */ 4132 smp_mb(); 4133 4134 reenq_local(sch, rq, reenq_flags); 4135 } 4136 } 4137 } 4138 4139 static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason) 4140 { 4141 *reason = SCX_TASK_REENQ_KFUNC; 4142 return reenq_flags & SCX_REENQ_ANY; 4143 } 4144 4145 static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags) 4146 { 4147 struct rq *locked_rq = rq; 4148 struct scx_sched *sch = dsq->sched; 4149 struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0); 4150 struct task_struct *p; 4151 s32 nr_enqueued = 0; 4152 4153 lockdep_assert_rq_held(rq); 4154 4155 raw_spin_lock(&dsq->lock); 4156 4157 while (likely(!READ_ONCE(sch->bypass_depth))) { 4158 struct rq *task_rq; 4159 u32 reason; 4160 4161 p = nldsq_cursor_next_task(&cursor, dsq); 4162 if (!p) 4163 break; 4164 4165 if (!user_task_should_reenq(p, reenq_flags, &reason)) 4166 continue; 4167 4168 task_rq = task_rq(p); 4169 4170 if (locked_rq != task_rq) { 4171 if (locked_rq) 4172 raw_spin_rq_unlock(locked_rq); 4173 if (unlikely(!raw_spin_rq_trylock(task_rq))) { 4174 raw_spin_unlock(&dsq->lock); 4175 raw_spin_rq_lock(task_rq); 4176 raw_spin_lock(&dsq->lock); 4177 } 4178 locked_rq = task_rq; 4179 4180 /* did we lose @p while switching locks? 
*/ 4181 if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p)) 4182 continue; 4183 } 4184 4185 /* @p is on @dsq, its rq and @dsq are locked */ 4186 dispatch_dequeue_locked(p, dsq); 4187 raw_spin_unlock(&dsq->lock); 4188 4189 if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) 4190 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4191 p->scx.flags |= reason; 4192 4193 do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1); 4194 4195 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4196 4197 if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) { 4198 raw_spin_rq_unlock(locked_rq); 4199 locked_rq = NULL; 4200 cpu_relax(); 4201 } 4202 4203 raw_spin_lock(&dsq->lock); 4204 } 4205 4206 list_del_init(&cursor.node); 4207 raw_spin_unlock(&dsq->lock); 4208 4209 if (locked_rq != rq) { 4210 if (locked_rq) 4211 raw_spin_rq_unlock(locked_rq); 4212 raw_spin_rq_lock(rq); 4213 } 4214 } 4215 4216 static void process_deferred_reenq_users(struct rq *rq) 4217 { 4218 lockdep_assert_rq_held(rq); 4219 4220 while (true) { 4221 struct scx_dispatch_q *dsq; 4222 u64 reenq_flags; 4223 4224 scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { 4225 struct scx_deferred_reenq_user *dru = 4226 list_first_entry_or_null(&rq->scx.deferred_reenq_users, 4227 struct scx_deferred_reenq_user, 4228 node); 4229 struct scx_dsq_pcpu *dsq_pcpu; 4230 4231 if (!dru) 4232 return; 4233 4234 dsq_pcpu = container_of(dru, struct scx_dsq_pcpu, 4235 deferred_reenq_user); 4236 dsq = dsq_pcpu->dsq; 4237 reenq_flags = dru->flags; 4238 WRITE_ONCE(dru->flags, 0); 4239 list_del_init(&dru->node); 4240 } 4241 4242 /* see schedule_dsq_reenq() */ 4243 smp_mb(); 4244 4245 BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN); 4246 reenq_user(rq, dsq, reenq_flags); 4247 } 4248 } 4249 4250 static void run_deferred(struct rq *rq) 4251 { 4252 process_ddsp_deferred_locals(rq); 4253 4254 if (!list_empty(&rq->scx.deferred_reenq_locals)) 4255 process_deferred_reenq_locals(rq); 4256 4257 if (!list_empty(&rq->scx.deferred_reenq_users)) 4258 process_deferred_reenq_users(rq); 4259 } 4260 4261 #ifdef CONFIG_NO_HZ_FULL 4262 bool scx_can_stop_tick(struct rq *rq) 4263 { 4264 struct task_struct *p = rq->curr; 4265 struct scx_sched *sch = scx_task_sched(p); 4266 4267 if (p->sched_class != &ext_sched_class) 4268 return true; 4269 4270 if (scx_bypassing(sch, cpu_of(rq))) 4271 return false; 4272 4273 /* 4274 * @rq can dispatch from different DSQs, so we can't tell whether it 4275 * needs the tick or not by looking at nr_running. Allow stopping ticks 4276 * iff the BPF scheduler indicated so. See set_next_task_scx(). 
4277 */ 4278 return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; 4279 } 4280 #endif 4281 4282 #ifdef CONFIG_EXT_GROUP_SCHED 4283 4284 DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem); 4285 static bool scx_cgroup_enabled; 4286 4287 void scx_tg_init(struct task_group *tg) 4288 { 4289 tg->scx.weight = CGROUP_WEIGHT_DFL; 4290 tg->scx.bw_period_us = default_bw_period_us(); 4291 tg->scx.bw_quota_us = RUNTIME_INF; 4292 tg->scx.idle = false; 4293 } 4294 4295 int scx_tg_online(struct task_group *tg) 4296 { 4297 struct scx_sched *sch = scx_root; 4298 int ret = 0; 4299 4300 WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); 4301 4302 if (scx_cgroup_enabled) { 4303 if (SCX_HAS_OP(sch, cgroup_init)) { 4304 struct scx_cgroup_init_args args = 4305 { .weight = tg->scx.weight, 4306 .bw_period_us = tg->scx.bw_period_us, 4307 .bw_quota_us = tg->scx.bw_quota_us, 4308 .bw_burst_us = tg->scx.bw_burst_us }; 4309 4310 ret = SCX_CALL_OP_RET(sch, cgroup_init, 4311 NULL, tg->css.cgroup, &args); 4312 if (ret) 4313 ret = ops_sanitize_err(sch, "cgroup_init", ret); 4314 } 4315 if (ret == 0) 4316 tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED; 4317 } else { 4318 tg->scx.flags |= SCX_TG_ONLINE; 4319 } 4320 4321 return ret; 4322 } 4323 4324 void scx_tg_offline(struct task_group *tg) 4325 { 4326 struct scx_sched *sch = scx_root; 4327 4328 WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); 4329 4330 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && 4331 (tg->scx.flags & SCX_TG_INITED)) 4332 SCX_CALL_OP(sch, cgroup_exit, NULL, tg->css.cgroup); 4333 tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); 4334 } 4335 4336 int scx_cgroup_can_attach(struct cgroup_taskset *tset) 4337 { 4338 struct scx_sched *sch = scx_root; 4339 struct cgroup_subsys_state *css; 4340 struct task_struct *p; 4341 int ret; 4342 4343 if (!scx_cgroup_enabled) 4344 return 0; 4345 4346 cgroup_taskset_for_each(p, css, tset) { 4347 struct cgroup *from = tg_cgrp(task_group(p)); 4348 struct cgroup *to = tg_cgrp(css_tg(css)); 4349 4350 WARN_ON_ONCE(p->scx.cgrp_moving_from); 4351 4352 /* 4353 * sched_move_task() omits identity migrations. Let's match the 4354 * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() 4355 * always match one-to-one. 4356 */ 4357 if (from == to) 4358 continue; 4359 4360 if (SCX_HAS_OP(sch, cgroup_prep_move)) { 4361 ret = SCX_CALL_OP_RET(sch, cgroup_prep_move, NULL, 4362 p, from, css->cgroup); 4363 if (ret) 4364 goto err; 4365 } 4366 4367 p->scx.cgrp_moving_from = from; 4368 } 4369 4370 return 0; 4371 4372 err: 4373 cgroup_taskset_for_each(p, css, tset) { 4374 if (SCX_HAS_OP(sch, cgroup_cancel_move) && 4375 p->scx.cgrp_moving_from) 4376 SCX_CALL_OP(sch, cgroup_cancel_move, NULL, 4377 p, p->scx.cgrp_moving_from, css->cgroup); 4378 p->scx.cgrp_moving_from = NULL; 4379 } 4380 4381 return ops_sanitize_err(sch, "cgroup_prep_move", ret); 4382 } 4383 4384 void scx_cgroup_move_task(struct task_struct *p) 4385 { 4386 struct scx_sched *sch = scx_root; 4387 4388 if (!scx_cgroup_enabled) 4389 return; 4390 4391 /* 4392 * @p must have ops.cgroup_prep_move() called on it and thus 4393 * cgrp_moving_from set. 
4394 */ 4395 if (SCX_HAS_OP(sch, cgroup_move) && 4396 !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) 4397 SCX_CALL_OP_TASK(sch, cgroup_move, task_rq(p), 4398 p, p->scx.cgrp_moving_from, 4399 tg_cgrp(task_group(p))); 4400 p->scx.cgrp_moving_from = NULL; 4401 } 4402 4403 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) 4404 { 4405 struct scx_sched *sch = scx_root; 4406 struct cgroup_subsys_state *css; 4407 struct task_struct *p; 4408 4409 if (!scx_cgroup_enabled) 4410 return; 4411 4412 cgroup_taskset_for_each(p, css, tset) { 4413 if (SCX_HAS_OP(sch, cgroup_cancel_move) && 4414 p->scx.cgrp_moving_from) 4415 SCX_CALL_OP(sch, cgroup_cancel_move, NULL, 4416 p, p->scx.cgrp_moving_from, css->cgroup); 4417 p->scx.cgrp_moving_from = NULL; 4418 } 4419 } 4420 4421 void scx_group_set_weight(struct task_group *tg, unsigned long weight) 4422 { 4423 struct scx_sched *sch; 4424 4425 percpu_down_read(&scx_cgroup_ops_rwsem); 4426 sch = scx_root; 4427 4428 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && 4429 tg->scx.weight != weight) 4430 SCX_CALL_OP(sch, cgroup_set_weight, NULL, tg_cgrp(tg), weight); 4431 4432 tg->scx.weight = weight; 4433 4434 percpu_up_read(&scx_cgroup_ops_rwsem); 4435 } 4436 4437 void scx_group_set_idle(struct task_group *tg, bool idle) 4438 { 4439 struct scx_sched *sch; 4440 4441 percpu_down_read(&scx_cgroup_ops_rwsem); 4442 sch = scx_root; 4443 4444 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) 4445 SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle); 4446 4447 /* Update the task group's idle state */ 4448 tg->scx.idle = idle; 4449 4450 percpu_up_read(&scx_cgroup_ops_rwsem); 4451 } 4452 4453 void scx_group_set_bandwidth(struct task_group *tg, 4454 u64 period_us, u64 quota_us, u64 burst_us) 4455 { 4456 struct scx_sched *sch; 4457 4458 percpu_down_read(&scx_cgroup_ops_rwsem); 4459 sch = scx_root; 4460 4461 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && 4462 (tg->scx.bw_period_us != period_us || 4463 tg->scx.bw_quota_us != quota_us || 4464 tg->scx.bw_burst_us != burst_us)) 4465 SCX_CALL_OP(sch, cgroup_set_bandwidth, NULL, 4466 tg_cgrp(tg), period_us, quota_us, burst_us); 4467 4468 tg->scx.bw_period_us = period_us; 4469 tg->scx.bw_quota_us = quota_us; 4470 tg->scx.bw_burst_us = burst_us; 4471 4472 percpu_up_read(&scx_cgroup_ops_rwsem); 4473 } 4474 #endif /* CONFIG_EXT_GROUP_SCHED */ 4475 4476 #if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) 4477 static struct cgroup *root_cgroup(void) 4478 { 4479 return &cgrp_dfl_root.cgrp; 4480 } 4481 4482 static void scx_cgroup_lock(void) 4483 { 4484 #ifdef CONFIG_EXT_GROUP_SCHED 4485 percpu_down_write(&scx_cgroup_ops_rwsem); 4486 #endif 4487 cgroup_lock(); 4488 } 4489 4490 static void scx_cgroup_unlock(void) 4491 { 4492 cgroup_unlock(); 4493 #ifdef CONFIG_EXT_GROUP_SCHED 4494 percpu_up_write(&scx_cgroup_ops_rwsem); 4495 #endif 4496 } 4497 #else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ 4498 static struct cgroup *root_cgroup(void) { return NULL; } 4499 static void scx_cgroup_lock(void) {} 4500 static void scx_cgroup_unlock(void) {} 4501 #endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ 4502 4503 #ifdef CONFIG_EXT_SUB_SCHED 4504 static struct cgroup *sch_cgroup(struct scx_sched *sch) 4505 { 4506 return sch->cgrp; 4507 } 4508 4509 /* for each descendant of @cgrp including self, set ->scx_sched to @sch */ 4510 static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) 4511 { 4512 struct cgroup *pos; 4513 struct cgroup_subsys_state *css; 4514 
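	/*
	 * Pre-order walk updates a parent before any of its descendants.
	 * rcu_assign_pointer() pairs with the RCU-protected readers of
	 * ->scx_sched.
	 */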
4515 cgroup_for_each_live_descendant_pre(pos, css, cgrp) 4516 rcu_assign_pointer(pos->scx_sched, sch); 4517 } 4518 #else /* CONFIG_EXT_SUB_SCHED */ 4519 static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } 4520 static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} 4521 #endif /* CONFIG_EXT_SUB_SCHED */ 4522 4523 /* 4524 * Omitted operations: 4525 * 4526 * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. 4527 * 4528 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of 4529 * their current sched_class. Call them directly from sched core instead. 4530 */ 4531 DEFINE_SCHED_CLASS(ext) = { 4532 .enqueue_task = enqueue_task_scx, 4533 .dequeue_task = dequeue_task_scx, 4534 .yield_task = yield_task_scx, 4535 .yield_to_task = yield_to_task_scx, 4536 4537 .wakeup_preempt = wakeup_preempt_scx, 4538 4539 .pick_task = pick_task_scx, 4540 4541 .put_prev_task = put_prev_task_scx, 4542 .set_next_task = set_next_task_scx, 4543 4544 .select_task_rq = select_task_rq_scx, 4545 .task_woken = task_woken_scx, 4546 .set_cpus_allowed = set_cpus_allowed_scx, 4547 4548 .rq_online = rq_online_scx, 4549 .rq_offline = rq_offline_scx, 4550 4551 .task_tick = task_tick_scx, 4552 4553 .switching_to = switching_to_scx, 4554 .switched_from = switched_from_scx, 4555 .switched_to = switched_to_scx, 4556 .reweight_task = reweight_task_scx, 4557 .prio_changed = prio_changed_scx, 4558 4559 .update_curr = update_curr_scx, 4560 4561 #ifdef CONFIG_UCLAMP_TASK 4562 .uclamp_enabled = 1, 4563 #endif 4564 }; 4565 4566 static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, 4567 struct scx_sched *sch) 4568 { 4569 s32 cpu; 4570 4571 memset(dsq, 0, sizeof(*dsq)); 4572 4573 raw_spin_lock_init(&dsq->lock); 4574 INIT_LIST_HEAD(&dsq->list); 4575 dsq->id = dsq_id; 4576 dsq->sched = sch; 4577 4578 dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu); 4579 if (!dsq->pcpu) 4580 return -ENOMEM; 4581 4582 for_each_possible_cpu(cpu) { 4583 struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); 4584 4585 pcpu->dsq = dsq; 4586 INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node); 4587 } 4588 4589 return 0; 4590 } 4591 4592 static void exit_dsq(struct scx_dispatch_q *dsq) 4593 { 4594 s32 cpu; 4595 4596 for_each_possible_cpu(cpu) { 4597 struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); 4598 struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user; 4599 struct rq *rq = cpu_rq(cpu); 4600 4601 /* 4602 * There must have been a RCU grace period since the last 4603 * insertion and @dsq should be off the deferred list by now. 
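		 * If one is somehow still linked, warn and unlink it under the
		 * rq's deferred_reenq_lock before the percpu area is freed.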
4604 */ 4605 if (WARN_ON_ONCE(!list_empty(&dru->node))) { 4606 guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); 4607 list_del_init(&dru->node); 4608 } 4609 } 4610 4611 free_percpu(dsq->pcpu); 4612 } 4613 4614 static void free_dsq_rcufn(struct rcu_head *rcu) 4615 { 4616 struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu); 4617 4618 exit_dsq(dsq); 4619 kfree(dsq); 4620 } 4621 4622 static void free_dsq_irq_workfn(struct irq_work *irq_work) 4623 { 4624 struct llist_node *to_free = llist_del_all(&dsqs_to_free); 4625 struct scx_dispatch_q *dsq, *tmp_dsq; 4626 4627 llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) 4628 call_rcu(&dsq->rcu, free_dsq_rcufn); 4629 } 4630 4631 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); 4632 4633 static void destroy_dsq(struct scx_sched *sch, u64 dsq_id) 4634 { 4635 struct scx_dispatch_q *dsq; 4636 unsigned long flags; 4637 4638 rcu_read_lock(); 4639 4640 dsq = find_user_dsq(sch, dsq_id); 4641 if (!dsq) 4642 goto out_unlock_rcu; 4643 4644 raw_spin_lock_irqsave(&dsq->lock, flags); 4645 4646 if (dsq->nr) { 4647 scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)", 4648 dsq->id, dsq->nr); 4649 goto out_unlock_dsq; 4650 } 4651 4652 if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node, 4653 dsq_hash_params)) 4654 goto out_unlock_dsq; 4655 4656 /* 4657 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from 4658 * queueing more tasks. As this function can be called from anywhere, 4659 * freeing is bounced through an irq work to avoid nesting RCU 4660 * operations inside scheduler locks. 4661 */ 4662 dsq->id = SCX_DSQ_INVALID; 4663 if (llist_add(&dsq->free_node, &dsqs_to_free)) 4664 irq_work_queue(&free_dsq_irq_work); 4665 4666 out_unlock_dsq: 4667 raw_spin_unlock_irqrestore(&dsq->lock, flags); 4668 out_unlock_rcu: 4669 rcu_read_unlock(); 4670 } 4671 4672 #ifdef CONFIG_EXT_GROUP_SCHED 4673 static void scx_cgroup_exit(struct scx_sched *sch) 4674 { 4675 struct cgroup_subsys_state *css; 4676 4677 scx_cgroup_enabled = false; 4678 4679 /* 4680 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk 4681 * cgroups and exit all the inited ones, all online cgroups are exited. 4682 */ 4683 css_for_each_descendant_post(css, &root_task_group.css) { 4684 struct task_group *tg = css_tg(css); 4685 4686 if (!(tg->scx.flags & SCX_TG_INITED)) 4687 continue; 4688 tg->scx.flags &= ~SCX_TG_INITED; 4689 4690 if (!sch->ops.cgroup_exit) 4691 continue; 4692 4693 SCX_CALL_OP(sch, cgroup_exit, NULL, css->cgroup); 4694 } 4695 } 4696 4697 static int scx_cgroup_init(struct scx_sched *sch) 4698 { 4699 struct cgroup_subsys_state *css; 4700 int ret; 4701 4702 /* 4703 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk 4704 * cgroups and init, all online cgroups are initialized. 
4705 */ 4706 css_for_each_descendant_pre(css, &root_task_group.css) { 4707 struct task_group *tg = css_tg(css); 4708 struct scx_cgroup_init_args args = { 4709 .weight = tg->scx.weight, 4710 .bw_period_us = tg->scx.bw_period_us, 4711 .bw_quota_us = tg->scx.bw_quota_us, 4712 .bw_burst_us = tg->scx.bw_burst_us, 4713 }; 4714 4715 if ((tg->scx.flags & 4716 (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) 4717 continue; 4718 4719 if (!sch->ops.cgroup_init) { 4720 tg->scx.flags |= SCX_TG_INITED; 4721 continue; 4722 } 4723 4724 ret = SCX_CALL_OP_RET(sch, cgroup_init, NULL, 4725 css->cgroup, &args); 4726 if (ret) { 4727 scx_error(sch, "ops.cgroup_init() failed (%d)", ret); 4728 return ret; 4729 } 4730 tg->scx.flags |= SCX_TG_INITED; 4731 } 4732 4733 WARN_ON_ONCE(scx_cgroup_enabled); 4734 scx_cgroup_enabled = true; 4735 4736 return 0; 4737 } 4738 4739 #else 4740 static void scx_cgroup_exit(struct scx_sched *sch) {} 4741 static int scx_cgroup_init(struct scx_sched *sch) { return 0; } 4742 #endif 4743 4744 4745 /******************************************************************************** 4746 * Sysfs interface and ops enable/disable. 4747 */ 4748 4749 #define SCX_ATTR(_name) \ 4750 static struct kobj_attribute scx_attr_##_name = { \ 4751 .attr = { .name = __stringify(_name), .mode = 0444 }, \ 4752 .show = scx_attr_##_name##_show, \ 4753 } 4754 4755 static ssize_t scx_attr_state_show(struct kobject *kobj, 4756 struct kobj_attribute *ka, char *buf) 4757 { 4758 return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]); 4759 } 4760 SCX_ATTR(state); 4761 4762 static ssize_t scx_attr_switch_all_show(struct kobject *kobj, 4763 struct kobj_attribute *ka, char *buf) 4764 { 4765 return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); 4766 } 4767 SCX_ATTR(switch_all); 4768 4769 static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, 4770 struct kobj_attribute *ka, char *buf) 4771 { 4772 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); 4773 } 4774 SCX_ATTR(nr_rejected); 4775 4776 static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, 4777 struct kobj_attribute *ka, char *buf) 4778 { 4779 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); 4780 } 4781 SCX_ATTR(hotplug_seq); 4782 4783 static ssize_t scx_attr_enable_seq_show(struct kobject *kobj, 4784 struct kobj_attribute *ka, char *buf) 4785 { 4786 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq)); 4787 } 4788 SCX_ATTR(enable_seq); 4789 4790 static struct attribute *scx_global_attrs[] = { 4791 &scx_attr_state.attr, 4792 &scx_attr_switch_all.attr, 4793 &scx_attr_nr_rejected.attr, 4794 &scx_attr_hotplug_seq.attr, 4795 &scx_attr_enable_seq.attr, 4796 NULL, 4797 }; 4798 4799 static const struct attribute_group scx_global_attr_group = { 4800 .attrs = scx_global_attrs, 4801 }; 4802 4803 static void free_pnode(struct scx_sched_pnode *pnode); 4804 static void free_exit_info(struct scx_exit_info *ei); 4805 4806 static void scx_sched_free_rcu_work(struct work_struct *work) 4807 { 4808 struct rcu_work *rcu_work = to_rcu_work(work); 4809 struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work); 4810 struct rhashtable_iter rht_iter; 4811 struct scx_dispatch_q *dsq; 4812 int cpu, node; 4813 4814 irq_work_sync(&sch->disable_irq_work); 4815 kthread_destroy_worker(sch->helper); 4816 timer_shutdown_sync(&sch->bypass_lb_timer); 4817 free_cpumask_var(sch->bypass_lb_donee_cpumask); 4818 free_cpumask_var(sch->bypass_lb_resched_cpumask); 4819 4820 #ifdef CONFIG_EXT_SUB_SCHED 
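	/* sub-sched only: drop the cgroup reference, cached path and sub kset */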
4821 kfree(sch->cgrp_path); 4822 if (sch_cgroup(sch)) 4823 cgroup_put(sch_cgroup(sch)); 4824 if (sch->sub_kset) 4825 kobject_put(&sch->sub_kset->kobj); 4826 #endif /* CONFIG_EXT_SUB_SCHED */ 4827 4828 for_each_possible_cpu(cpu) { 4829 struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); 4830 4831 /* 4832 * $sch would have entered bypass mode before the RCU grace 4833 * period. As that blocks new deferrals, all 4834 * deferred_reenq_local_node's must be off-list by now. 4835 */ 4836 WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node)); 4837 4838 exit_dsq(bypass_dsq(sch, cpu)); 4839 } 4840 4841 free_percpu(sch->pcpu); 4842 4843 for_each_node_state(node, N_POSSIBLE) 4844 free_pnode(sch->pnode[node]); 4845 kfree(sch->pnode); 4846 4847 rhashtable_walk_enter(&sch->dsq_hash, &rht_iter); 4848 do { 4849 rhashtable_walk_start(&rht_iter); 4850 4851 while (!IS_ERR_OR_NULL((dsq = rhashtable_walk_next(&rht_iter)))) 4852 destroy_dsq(sch, dsq->id); 4853 4854 rhashtable_walk_stop(&rht_iter); 4855 } while (dsq == ERR_PTR(-EAGAIN)); 4856 rhashtable_walk_exit(&rht_iter); 4857 4858 rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); 4859 free_exit_info(sch->exit_info); 4860 kfree(sch); 4861 } 4862 4863 static void scx_kobj_release(struct kobject *kobj) 4864 { 4865 struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 4866 4867 INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work); 4868 queue_rcu_work(system_dfl_wq, &sch->rcu_work); 4869 } 4870 4871 static ssize_t scx_attr_ops_show(struct kobject *kobj, 4872 struct kobj_attribute *ka, char *buf) 4873 { 4874 struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 4875 4876 return sysfs_emit(buf, "%s\n", sch->ops.name); 4877 } 4878 SCX_ATTR(ops); 4879 4880 #define scx_attr_event_show(buf, at, events, kind) ({ \ 4881 sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \ 4882 }) 4883 4884 static ssize_t scx_attr_events_show(struct kobject *kobj, 4885 struct kobj_attribute *ka, char *buf) 4886 { 4887 struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 4888 struct scx_event_stats events; 4889 int at = 0; 4890 4891 scx_read_events(sch, &events); 4892 at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); 4893 at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 4894 at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); 4895 at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); 4896 at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 4897 at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_IMMED); 4898 at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_LOCAL_REPEAT); 4899 at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL); 4900 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); 4901 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); 4902 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); 4903 at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED); 4904 at += scx_attr_event_show(buf, at, &events, SCX_EV_SUB_BYPASS_DISPATCH); 4905 return at; 4906 } 4907 SCX_ATTR(events); 4908 4909 static struct attribute *scx_sched_attrs[] = { 4910 &scx_attr_ops.attr, 4911 &scx_attr_events.attr, 4912 NULL, 4913 }; 4914 ATTRIBUTE_GROUPS(scx_sched); 4915 4916 static const struct kobj_type scx_ktype = { 4917 .release = scx_kobj_release, 4918 .sysfs_ops = &kobj_sysfs_ops, 4919 .default_groups = scx_sched_groups, 4920 }; 
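/*
 * Illustrative sysfs layout - a sketch for orientation, not guaranteed by this
 * file alone: the global attributes (state, switch_all, nr_rejected,
 * hotplug_seq, enable_seq) are exposed via the sched_ext kset, e.g.
 * /sys/kernel/sched_ext/state, while each scx_sched instance exposes its own
 * "ops" and "events" files from the embedded kobject released by
 * scx_kobj_release() above, e.g.
 *
 *   $ cat /sys/kernel/sched_ext/<sched>/ops
 *   simple
 *
 * where <sched> is whatever name the enable path registers the kobject under.
 */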
4921
4922 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
4923 {
4924 const struct scx_sched *sch;
4925
4926 /*
4927 * scx_uevent() can be reached by both scx_sched kobjects (scx_ktype)
4928 * and sub-scheduler kset kobjects (kset_ktype) through the parent
4929 * chain walk. Filter out the latter to avoid invalid casts.
4930 */
4931 if (kobj->ktype != &scx_ktype)
4932 return 0;
4933
4934 sch = container_of(kobj, struct scx_sched, kobj);
4935
4936 return add_uevent_var(env, "SCXOPS=%s", sch->ops.name);
4937 }
4938
4939 static const struct kset_uevent_ops scx_uevent_ops = {
4940 .uevent = scx_uevent,
4941 };
4942
4943 /*
4944 * Used by sched_fork() and __setscheduler_prio() to pick the matching
4945 * sched_class. dl/rt are already handled.
4946 */
4947 bool task_should_scx(int policy)
4948 {
4949 if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING))
4950 return false;
4951 if (READ_ONCE(scx_switching_all))
4952 return true;
4953 return policy == SCHED_EXT;
4954 }
4955
4956 bool scx_allow_ttwu_queue(const struct task_struct *p)
4957 {
4958 struct scx_sched *sch;
4959
4960 if (!scx_enabled())
4961 return true;
4962
4963 sch = scx_task_sched(p);
4964 if (unlikely(!sch))
4965 return true;
4966
4967 if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
4968 return true;
4969
4970 if (unlikely(p->sched_class != &ext_sched_class))
4971 return true;
4972
4973 return false;
4974 }
4975
4976 /**
4977 * handle_lockup - sched_ext common lockup handler
4978 * @fmt: format string
4979 *
4980 * Called on a system stall or lockup condition and initiates an abort of
4981 * sched_ext if enabled, which may resolve the reported lockup.
4982 *
4983 * Returns %true if sched_ext is enabled and an abort was initiated, %false if
4984 * sched_ext is not enabled or the abort was already initiated by someone
4985 * else.
4986 */
4987 static __printf(1, 2) bool handle_lockup(const char *fmt, ...)
4988 {
4989 struct scx_sched *sch;
4990 va_list args;
4991 bool ret;
4992
4993 guard(rcu)();
4994
4995 sch = rcu_dereference(scx_root);
4996 if (unlikely(!sch))
4997 return false;
4998
4999 switch (scx_enable_state()) {
5000 case SCX_ENABLING:
5001 case SCX_ENABLED:
5002 va_start(args, fmt);
5003 ret = scx_verror(sch, fmt, args);
5004 va_end(args);
5005 return ret;
5006 default:
5007 return false;
5008 }
5009 }
5010
5011 /**
5012 * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
5013 *
5014 * While there are various reasons why RCU CPU stalls can occur on a system
5015 * that may not be caused by the current BPF scheduler, try kicking out the
5016 * current scheduler in an attempt to recover the system to a good state before
5017 * issuing panics.
5018 *
5019 * Returns %true if sched_ext is enabled and abort was initiated, which may
5020 * resolve the reported RCU stall. %false if sched_ext is not enabled or someone
5021 * else already initiated abort.
5022 */
5023 bool scx_rcu_cpu_stall(void)
5024 {
5025 return handle_lockup("RCU CPU stall detected!");
5026 }
5027
5028 /**
5029 * scx_softlockup - sched_ext softlockup handler
5030 * @dur_s: number of seconds of CPU stuck due to soft lockup
5031 *
5032 * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
5033 * live-lock the system by making many CPUs target the same DSQ to the point
5034 * where soft-lockup detection triggers. This function is called from the
5035 * soft-lockup watchdog when the triggering point is close and tries to unjam
5036 * the system by aborting the BPF scheduler.
5037 */ 5038 void scx_softlockup(u32 dur_s) 5039 { 5040 if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s)) 5041 return; 5042 5043 printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n", 5044 smp_processor_id(), dur_s); 5045 } 5046 5047 /* 5048 * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(), 5049 * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing 5050 * it from NMI context can lead to deadlocks. Defer via irq_work; the 5051 * disable path runs off irq_work anyway. 5052 */ 5053 static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1); 5054 5055 static void scx_hardlockup_irq_workfn(struct irq_work *work) 5056 { 5057 int cpu = atomic_xchg(&scx_hardlockup_cpu, -1); 5058 5059 if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu)) 5060 printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", 5061 cpu); 5062 } 5063 5064 static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn); 5065 5066 /** 5067 * scx_hardlockup - sched_ext hardlockup handler 5068 * 5069 * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting 5070 * numerous affinitized tasks in a single queue and directing all CPUs at it. 5071 * Try kicking out the current scheduler in an attempt to recover the system to 5072 * a good state before taking more drastic actions. 5073 * 5074 * Queues an irq_work; the handle_lockup() call happens in IRQ context (see 5075 * scx_hardlockup_irq_workfn). 5076 * 5077 * Returns %true if sched_ext is enabled and the work was queued, %false 5078 * otherwise. 5079 */ 5080 bool scx_hardlockup(int cpu) 5081 { 5082 if (!rcu_access_pointer(scx_root)) 5083 return false; 5084 5085 atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu); 5086 irq_work_queue(&scx_hardlockup_irq_work); 5087 return true; 5088 } 5089 5090 static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor, 5091 struct cpumask *donee_mask, struct cpumask *resched_mask, 5092 u32 nr_donor_target, u32 nr_donee_target) 5093 { 5094 struct rq *donor_rq = cpu_rq(donor); 5095 struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor); 5096 struct task_struct *p, *n; 5097 struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0); 5098 s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; 5099 u32 nr_balanced = 0, min_delta_us; 5100 5101 /* 5102 * All we want to guarantee is reasonable forward progress. No reason to 5103 * fine tune. Assuming every task on @donor_dsq runs their full slice, 5104 * consider offloading iff the total queued duration is over the 5105 * threshold. 5106 */ 5107 min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV; 5108 if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us))) 5109 return 0; 5110 5111 raw_spin_rq_lock_irq(donor_rq); 5112 raw_spin_lock(&donor_dsq->lock); 5113 list_add(&cursor.node, &donor_dsq->list); 5114 resume: 5115 n = container_of(&cursor, struct task_struct, scx.dsq_list); 5116 n = nldsq_next_task(donor_dsq, n, false); 5117 5118 while ((p = n)) { 5119 struct scx_dispatch_q *donee_dsq; 5120 int donee; 5121 5122 n = nldsq_next_task(donor_dsq, n, false); 5123 5124 if (donor_dsq->nr <= nr_donor_target) 5125 break; 5126 5127 if (cpumask_empty(donee_mask)) 5128 break; 5129 5130 /* 5131 * If an earlier pass placed @p on @donor_dsq from a different 5132 * CPU and the donee hasn't consumed it yet, @p is still on the 5133 * previous CPU and task_rq(@p) != @donor_rq. 
@p can't be moved
5134 * without its rq locked. Skip.
5135 */
5136 if (task_rq(p) != donor_rq)
5137 continue;
5138
5139 donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
5140 if (donee >= nr_cpu_ids)
5141 continue;
5142
5143 donee_dsq = bypass_dsq(sch, donee);
5144
5145 /*
5146 * $p's rq is not locked but $p's DSQ lock protects its
5147 * scheduling properties making this test safe.
5148 */
5149 if (!task_can_run_on_remote_rq(sch, p, cpu_rq(donee), false))
5150 continue;
5151
5152 /*
5153 * Moving $p from one non-local DSQ to another. The source rq
5154 * and DSQ are already locked. Do an abbreviated dequeue and
5155 * then perform enqueue without unlocking $donor_dsq.
5156 *
5157 * We don't want to drop and reacquire the lock on each
5158 * iteration as @donor_dsq can be very long and potentially
5159 * highly contended. Donee DSQs are less likely to be contended.
5160 * The nested locking is safe as only this LB moves tasks
5161 * between bypass DSQs.
5162 */
5163 dispatch_dequeue_locked(p, donor_dsq);
5164 dispatch_enqueue(sch, cpu_rq(donee), donee_dsq, p, SCX_ENQ_NESTED);
5165
5166 /*
5167 * $donee might have been idle and need to be woken up. No need
5168 * to be clever. Kick every CPU that receives tasks.
5169 */
5170 cpumask_set_cpu(donee, resched_mask);
5171
5172 if (READ_ONCE(donee_dsq->nr) >= nr_donee_target)
5173 cpumask_clear_cpu(donee, donee_mask);
5174
5175 nr_balanced++;
5176 if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) {
5177 list_move_tail(&cursor.node, &n->scx.dsq_list.node);
5178 raw_spin_unlock(&donor_dsq->lock);
5179 raw_spin_rq_unlock_irq(donor_rq);
5180 cpu_relax();
5181 raw_spin_rq_lock_irq(donor_rq);
5182 raw_spin_lock(&donor_dsq->lock);
5183 goto resume;
5184 }
5185 }
5186
5187 list_del_init(&cursor.node);
5188 raw_spin_unlock(&donor_dsq->lock);
5189 raw_spin_rq_unlock_irq(donor_rq);
5190
5191 return nr_balanced;
5192 }
5193
5194 static void bypass_lb_node(struct scx_sched *sch, int node)
5195 {
5196 const struct cpumask *node_mask = cpumask_of_node(node);
5197 struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask;
5198 struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask;
5199 u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0;
5200 u32 nr_target, nr_donor_target;
5201 u32 before_min = U32_MAX, before_max = 0;
5202 u32 after_min = U32_MAX, after_max = 0;
5203 int cpu;
5204
5205 /* count the target tasks and CPUs */
5206 for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
5207 u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr);
5208
5209 nr_tasks += nr;
5210 nr_cpus++;
5211
5212 before_min = min(nr, before_min);
5213 before_max = max(nr, before_max);
5214 }
5215
5216 if (!nr_cpus)
5217 return;
5218
5219 /*
5220 * We don't want CPUs to have more than $nr_donor_target tasks and want
5221 * balancing to fill donee CPUs up to $nr_target. Once targets are
5222 * calculated, find the donee CPUs.
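	 *
	 * Worked example with illustrative numbers: 25 queued tasks across 10
	 * online CPUs in the node give nr_target = DIV_ROUND_UP(25, 10) = 3.
	 * CPUs whose bypass DSQ holds fewer than 3 tasks become donees while
	 * CPUs above nr_donor_target (nr_target scaled by
	 * SCX_BYPASS_LB_DONOR_PCT percent) get offloaded toward them.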
5223 */ 5224 nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); 5225 nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); 5226 5227 cpumask_clear(donee_mask); 5228 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5229 if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target) 5230 cpumask_set_cpu(cpu, donee_mask); 5231 } 5232 5233 /* iterate !donee CPUs and see if they should be offloaded */ 5234 cpumask_clear(resched_mask); 5235 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5236 if (cpumask_empty(donee_mask)) 5237 break; 5238 if (cpumask_test_cpu(cpu, donee_mask)) 5239 continue; 5240 if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target) 5241 continue; 5242 5243 nr_balanced += bypass_lb_cpu(sch, cpu, donee_mask, resched_mask, 5244 nr_donor_target, nr_target); 5245 } 5246 5247 for_each_cpu(cpu, resched_mask) 5248 resched_cpu(cpu); 5249 5250 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5251 u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr); 5252 5253 after_min = min(nr, after_min); 5254 after_max = max(nr, after_max); 5255 5256 } 5257 5258 trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced, 5259 before_min, before_max, after_min, after_max); 5260 } 5261 5262 /* 5263 * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine 5264 * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some 5265 * bypass DSQs can be overloaded. If there are enough tasks to saturate other 5266 * lightly loaded CPUs, such imbalance can lead to very high execution latency 5267 * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such 5268 * outcomes, a simple load balancing mechanism is implemented by the following 5269 * timer which runs periodically while bypass mode is in effect. 5270 */ 5271 static void scx_bypass_lb_timerfn(struct timer_list *timer) 5272 { 5273 struct scx_sched *sch = container_of(timer, struct scx_sched, bypass_lb_timer); 5274 int node; 5275 u32 intv_us; 5276 5277 if (!bypass_dsp_enabled(sch)) 5278 return; 5279 5280 for_each_node_with_cpus(node) 5281 bypass_lb_node(sch, node); 5282 5283 intv_us = READ_ONCE(scx_bypass_lb_intv_us); 5284 if (intv_us) 5285 mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); 5286 } 5287 5288 static bool inc_bypass_depth(struct scx_sched *sch) 5289 { 5290 lockdep_assert_held(&scx_bypass_lock); 5291 5292 WARN_ON_ONCE(sch->bypass_depth < 0); 5293 WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1); 5294 if (sch->bypass_depth != 1) 5295 return false; 5296 5297 WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC); 5298 sch->bypass_timestamp = ktime_get_ns(); 5299 scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); 5300 return true; 5301 } 5302 5303 static bool dec_bypass_depth(struct scx_sched *sch) 5304 { 5305 lockdep_assert_held(&scx_bypass_lock); 5306 5307 WARN_ON_ONCE(sch->bypass_depth < 1); 5308 WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1); 5309 if (sch->bypass_depth != 0) 5310 return false; 5311 5312 WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL); 5313 scx_add_event(sch, SCX_EV_BYPASS_DURATION, 5314 ktime_get_ns() - sch->bypass_timestamp); 5315 return true; 5316 } 5317 5318 static void enable_bypass_dsp(struct scx_sched *sch) 5319 { 5320 struct scx_sched *host = scx_parent(sch) ?: sch; 5321 u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us); 5322 s32 ret; 5323 5324 /* 5325 * @sch->bypass_depth transitioning from 0 to 1 triggers enabling. 5326 * Shouldn't stagger. 
5327 */ 5328 if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim))) 5329 return; 5330 5331 /* 5332 * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of 5333 * the nearest non-bypassing ancestor or root. As enable_bypass_dsp() is 5334 * called iff @sch is not already bypassed due to an ancestor bypassing, 5335 * we can assume that the parent is not bypassing and thus will be the 5336 * host of the bypass DSQs. 5337 * 5338 * While the situation may change in the future, the following 5339 * guarantees that the nearest non-bypassing ancestor or root has bypass 5340 * dispatch enabled while a descendant is bypassing, which is all that's 5341 * required. 5342 * 5343 * bypass_dsp_enabled() test is used to determine whether to enter the 5344 * bypass dispatch handling path from both bypassing and hosting scheds. 5345 * Bump enable depth on both @sch and bypass dispatch host. 5346 */ 5347 ret = atomic_inc_return(&sch->bypass_dsp_enable_depth); 5348 WARN_ON_ONCE(ret <= 0); 5349 5350 if (host != sch) { 5351 ret = atomic_inc_return(&host->bypass_dsp_enable_depth); 5352 WARN_ON_ONCE(ret <= 0); 5353 } 5354 5355 /* 5356 * The LB timer will stop running if bypass dispatch is disabled. Start 5357 * after enabling bypass dispatch. 5358 */ 5359 if (intv_us && !timer_pending(&host->bypass_lb_timer)) 5360 mod_timer(&host->bypass_lb_timer, 5361 jiffies + usecs_to_jiffies(intv_us)); 5362 } 5363 5364 /* may be called without holding scx_bypass_lock */ 5365 static void disable_bypass_dsp(struct scx_sched *sch) 5366 { 5367 s32 ret; 5368 5369 if (!test_and_clear_bit(0, &sch->bypass_dsp_claim)) 5370 return; 5371 5372 ret = atomic_dec_return(&sch->bypass_dsp_enable_depth); 5373 WARN_ON_ONCE(ret < 0); 5374 5375 if (scx_parent(sch)) { 5376 ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth); 5377 WARN_ON_ONCE(ret < 0); 5378 } 5379 } 5380 5381 /** 5382 * scx_bypass - [Un]bypass scx_ops and guarantee forward progress 5383 * @sch: sched to bypass 5384 * @bypass: true for bypass, false for unbypass 5385 * 5386 * Bypassing guarantees that all runnable tasks make forward progress without 5387 * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might 5388 * be held by tasks that the BPF scheduler is forgetting to run, which 5389 * unfortunately also excludes toggling the static branches. 5390 * 5391 * Let's work around by overriding a couple ops and modifying behaviors based on 5392 * the DISABLING state and then cycling the queued tasks through dequeue/enqueue 5393 * to force global FIFO scheduling. 5394 * 5395 * - ops.select_cpu() is ignored and the default select_cpu() is used. 5396 * 5397 * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. 5398 * %SCX_OPS_ENQ_LAST is also ignored. 5399 * 5400 * - ops.dispatch() is ignored. 5401 * 5402 * - balance_one() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice 5403 * can't be trusted. Whenever a tick triggers, the running task is rotated to 5404 * the tail of the queue with core_sched_at touched. 5405 * 5406 * - pick_next_task() suppresses zero slice warning. 5407 * 5408 * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM 5409 * operations. 5410 * 5411 * - scx_prio_less() reverts to the default core_sched_at order. 
5412 */ 5413 static void scx_bypass(struct scx_sched *sch, bool bypass) 5414 { 5415 struct scx_sched *pos; 5416 unsigned long flags; 5417 int cpu; 5418 5419 raw_spin_lock_irqsave(&scx_bypass_lock, flags); 5420 5421 if (bypass) { 5422 if (!inc_bypass_depth(sch)) 5423 goto unlock; 5424 5425 enable_bypass_dsp(sch); 5426 } else { 5427 if (!dec_bypass_depth(sch)) 5428 goto unlock; 5429 } 5430 5431 /* 5432 * Bypass state is propagated to all descendants - an scx_sched bypasses 5433 * if itself or any of its ancestors are in bypass mode. 5434 */ 5435 raw_spin_lock(&scx_sched_lock); 5436 scx_for_each_descendant_pre(pos, sch) { 5437 if (pos == sch) 5438 continue; 5439 if (bypass) 5440 inc_bypass_depth(pos); 5441 else 5442 dec_bypass_depth(pos); 5443 } 5444 raw_spin_unlock(&scx_sched_lock); 5445 5446 /* 5447 * No task property is changing. We just need to make sure all currently 5448 * queued tasks are re-queued according to the new scx_bypassing() 5449 * state. As an optimization, walk each rq's runnable_list instead of 5450 * the scx_tasks list. 5451 * 5452 * This function can't trust the scheduler and thus can't use 5453 * cpus_read_lock(). Walk all possible CPUs instead of online. 5454 */ 5455 for_each_possible_cpu(cpu) { 5456 struct rq *rq = cpu_rq(cpu); 5457 struct task_struct *p, *n; 5458 5459 raw_spin_rq_lock(rq); 5460 raw_spin_lock(&scx_sched_lock); 5461 5462 scx_for_each_descendant_pre(pos, sch) { 5463 struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu); 5464 5465 if (pos->bypass_depth) 5466 pcpu->flags |= SCX_SCHED_PCPU_BYPASSING; 5467 else 5468 pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING; 5469 } 5470 5471 raw_spin_unlock(&scx_sched_lock); 5472 5473 /* 5474 * We need to guarantee that no tasks are on the BPF scheduler 5475 * while bypassing. Either we see enabled or the enable path 5476 * sees scx_bypassing() before moving tasks to SCX. 5477 */ 5478 if (!scx_enabled()) { 5479 raw_spin_rq_unlock(rq); 5480 continue; 5481 } 5482 5483 /* 5484 * The use of list_for_each_entry_safe_reverse() is required 5485 * because each task is going to be removed from and added back 5486 * to the runnable_list during iteration. Because they're added 5487 * to the tail of the list, safe reverse iteration can still 5488 * visit all nodes. 
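 * With forward iteration, the tasks just re-queued at the tail would be
 * visited and re-queued again, and the walk might never terminate.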
5489 */ 5490 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, 5491 scx.runnable_node) { 5492 if (!scx_is_descendant(scx_task_sched(p), sch)) 5493 continue; 5494 5495 /* cycling deq/enq is enough, see the function comment */ 5496 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 5497 /* nothing */ ; 5498 } 5499 } 5500 5501 /* resched to restore ticks and idle state */ 5502 if (cpu_online(cpu) || cpu == smp_processor_id()) 5503 resched_curr(rq); 5504 5505 raw_spin_rq_unlock(rq); 5506 } 5507 5508 /* disarming must come after moving all tasks out of the bypass DSQs */ 5509 if (!bypass) 5510 disable_bypass_dsp(sch); 5511 unlock: 5512 raw_spin_unlock_irqrestore(&scx_bypass_lock, flags); 5513 } 5514 5515 static void free_exit_info(struct scx_exit_info *ei) 5516 { 5517 kvfree(ei->dump); 5518 kfree(ei->msg); 5519 kfree(ei->bt); 5520 kfree(ei); 5521 } 5522 5523 static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) 5524 { 5525 struct scx_exit_info *ei; 5526 5527 ei = kzalloc_obj(*ei); 5528 if (!ei) 5529 return NULL; 5530 5531 ei->bt = kzalloc_objs(ei->bt[0], SCX_EXIT_BT_LEN); 5532 ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); 5533 ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); 5534 5535 if (!ei->bt || !ei->msg || !ei->dump) { 5536 free_exit_info(ei); 5537 return NULL; 5538 } 5539 5540 return ei; 5541 } 5542 5543 static const char *scx_exit_reason(enum scx_exit_kind kind) 5544 { 5545 switch (kind) { 5546 case SCX_EXIT_UNREG: 5547 return "unregistered from user space"; 5548 case SCX_EXIT_UNREG_BPF: 5549 return "unregistered from BPF"; 5550 case SCX_EXIT_UNREG_KERN: 5551 return "unregistered from the main kernel"; 5552 case SCX_EXIT_SYSRQ: 5553 return "disabled by sysrq-S"; 5554 case SCX_EXIT_PARENT: 5555 return "parent exiting"; 5556 case SCX_EXIT_ERROR: 5557 return "runtime error"; 5558 case SCX_EXIT_ERROR_BPF: 5559 return "scx_bpf_error"; 5560 case SCX_EXIT_ERROR_STALL: 5561 return "runnable task stall"; 5562 default: 5563 return "<UNKNOWN>"; 5564 } 5565 } 5566 5567 static void free_kick_syncs(void) 5568 { 5569 int cpu; 5570 5571 for_each_possible_cpu(cpu) { 5572 struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); 5573 struct scx_kick_syncs *to_free; 5574 5575 to_free = rcu_replace_pointer(*ksyncs, NULL, true); 5576 if (to_free) 5577 kvfree_rcu(to_free, rcu); 5578 } 5579 } 5580 5581 static void refresh_watchdog(void) 5582 { 5583 struct scx_sched *sch; 5584 unsigned long intv = ULONG_MAX; 5585 5586 /* take the shortest timeout and use its half for watchdog interval */ 5587 rcu_read_lock(); 5588 list_for_each_entry_rcu(sch, &scx_sched_all, all) 5589 intv = max(min(intv, sch->watchdog_timeout / 2), 1); 5590 rcu_read_unlock(); 5591 5592 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 5593 WRITE_ONCE(scx_watchdog_interval, intv); 5594 5595 if (intv < ULONG_MAX) 5596 mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv); 5597 else 5598 cancel_delayed_work_sync(&scx_watchdog_work); 5599 } 5600 5601 static s32 scx_link_sched(struct scx_sched *sch) 5602 { 5603 const char *err_msg = ""; 5604 s32 ret = 0; 5605 5606 scoped_guard(raw_spinlock_irq, &scx_sched_lock) { 5607 #ifdef CONFIG_EXT_SUB_SCHED 5608 struct scx_sched *parent = scx_parent(sch); 5609 5610 if (parent) { 5611 /* 5612 * scx_claim_exit() propagates exit_kind transition to 5613 * its sub-scheds while holding scx_sched_lock - either 5614 * we can see the parent's non-NONE exit_kind or the 5615 * parent can shoot us down. 
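 * See the comment in scx_claim_exit() for the propagation side of this
 * interlock.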
5616 */ 5617 if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) { 5618 err_msg = "parent disabled"; 5619 ret = -ENOENT; 5620 break; 5621 } 5622 5623 ret = rhashtable_lookup_insert_fast(&scx_sched_hash, 5624 &sch->hash_node, scx_sched_hash_params); 5625 if (ret) { 5626 err_msg = "failed to insert into scx_sched_hash"; 5627 break; 5628 } 5629 5630 list_add_tail(&sch->sibling, &parent->children); 5631 } 5632 #endif /* CONFIG_EXT_SUB_SCHED */ 5633 5634 list_add_tail_rcu(&sch->all, &scx_sched_all); 5635 } 5636 5637 /* 5638 * scx_error() takes scx_sched_lock via scx_claim_exit(), so it must run after 5639 * the guard above is released. 5640 */ 5641 if (ret) { 5642 scx_error(sch, "%s (%d)", err_msg, ret); 5643 return ret; 5644 } 5645 5646 refresh_watchdog(); 5647 return 0; 5648 } 5649 5650 static void scx_unlink_sched(struct scx_sched *sch) 5651 { 5652 scoped_guard(raw_spinlock_irq, &scx_sched_lock) { 5653 #ifdef CONFIG_EXT_SUB_SCHED 5654 if (scx_parent(sch)) { 5655 rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node, 5656 scx_sched_hash_params); 5657 list_del_init(&sch->sibling); 5658 } 5659 #endif /* CONFIG_EXT_SUB_SCHED */ 5660 list_del_rcu(&sch->all); 5661 } 5662 5663 refresh_watchdog(); 5664 } 5665 5666 /* 5667 * Called to disable future dumps and wait for in-progress one while disabling 5668 * @sch. Once @sch becomes empty during disable, there's no point in dumping it. 5669 * This prevents calling dump ops on a dead sch. 5670 */ 5671 static void scx_disable_dump(struct scx_sched *sch) 5672 { 5673 guard(raw_spinlock_irqsave)(&scx_dump_lock); 5674 sch->dump_disabled = true; 5675 } 5676 5677 #ifdef CONFIG_EXT_SUB_SCHED 5678 static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); 5679 5680 static void drain_descendants(struct scx_sched *sch) 5681 { 5682 /* 5683 * Child scheds that finished the critical part of disabling will take 5684 * themselves off @sch->children. Wait for it to drain. As propagation 5685 * is recursive, empty @sch->children means that all proper descendant 5686 * scheds reached unlinking stage. 5687 */ 5688 wait_event(scx_unlink_waitq, list_empty(&sch->children)); 5689 } 5690 5691 static void scx_fail_parent(struct scx_sched *sch, 5692 struct task_struct *failed, s32 fail_code) 5693 { 5694 struct scx_sched *parent = scx_parent(sch); 5695 struct scx_task_iter sti; 5696 struct task_struct *p; 5697 5698 scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler", 5699 fail_code, failed->comm, failed->pid); 5700 5701 /* 5702 * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into 5703 * it. This may cause downstream failures on the BPF side but $parent is 5704 * dying anyway. 5705 */ 5706 scx_bypass(parent, true); 5707 5708 scx_task_iter_start(&sti, sch->cgrp); 5709 while ((p = scx_task_iter_next_locked(&sti))) { 5710 if (scx_task_on_sched(parent, p)) 5711 continue; 5712 5713 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 5714 scx_disable_and_exit_task(sch, p); 5715 scx_set_task_sched(p, parent); 5716 } 5717 } 5718 scx_task_iter_stop(&sti); 5719 } 5720 5721 static void scx_sub_disable(struct scx_sched *sch) 5722 { 5723 struct scx_sched *parent = scx_parent(sch); 5724 struct scx_task_iter sti; 5725 struct task_struct *p; 5726 int ret; 5727 5728 /* 5729 * Guarantee forward progress and wait for descendants to be disabled. 5730 * To limit disruptions, $parent is not bypassed. Tasks are fully 5731 * prepped and then inserted back into $parent. 
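 * Unlike scx_root_disable(), which switches tasks back to their non-SCX
 * sched classes, tasks here stay on SCX and are handed over to $parent.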
5732 */ 5733 scx_bypass(sch, true); 5734 drain_descendants(sch); 5735 5736 /* 5737 * Here, every runnable task is guaranteed to make forward progress and 5738 * we can safely use blocking synchronization constructs. Actually 5739 * disable ops. 5740 */ 5741 mutex_lock(&scx_enable_mutex); 5742 percpu_down_write(&scx_fork_rwsem); 5743 scx_cgroup_lock(); 5744 5745 set_cgroup_sched(sch_cgroup(sch), parent); 5746 5747 scx_task_iter_start(&sti, sch->cgrp); 5748 while ((p = scx_task_iter_next_locked(&sti))) { 5749 struct rq *rq; 5750 struct rq_flags rf; 5751 5752 /* filter out duplicate visits */ 5753 if (scx_task_on_sched(parent, p)) 5754 continue; 5755 5756 /* 5757 * By the time control reaches here, all descendant schedulers 5758 * should already have been disabled. 5759 */ 5760 WARN_ON_ONCE(!scx_task_on_sched(sch, p)); 5761 5762 /* 5763 * If $p is about to be freed, nothing prevents $sch from 5764 * unloading before $p reaches sched_ext_free(). Disable and 5765 * exit $p right away. 5766 */ 5767 if (!tryget_task_struct(p)) { 5768 scx_disable_and_exit_task(sch, p); 5769 continue; 5770 } 5771 5772 scx_task_iter_unlock(&sti); 5773 5774 /* 5775 * $p is READY or ENABLED on @sch. Initialize for $parent, 5776 * disable and exit from @sch, and then switch over to $parent. 5777 * 5778 * If a task fails to initialize for $parent, the only available 5779 * action is disabling $parent too. While this allows disabling 5780 * of a child sched to cause the parent scheduler to fail, the 5781 * failure can only originate from ops.init_task() of the 5782 * parent. A child can't directly affect the parent through its 5783 * own failures. 5784 */ 5785 ret = __scx_init_task(parent, p, false); 5786 if (ret) { 5787 scx_fail_parent(sch, p, ret); 5788 put_task_struct(p); 5789 break; 5790 } 5791 5792 rq = task_rq_lock(p, &rf); 5793 5794 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 5795 /* 5796 * sched_ext_dead() raced us between __scx_init_task() 5797 * and this rq lock and ran exit_task() on @sch (the 5798 * sched @p was on at that point), not on $parent. 5799 * $parent's just-completed init is owed an exit_task() 5800 * and we issue it here. 5801 */ 5802 scx_sub_init_cancel_task(parent, p); 5803 task_rq_unlock(rq, p, &rf); 5804 put_task_struct(p); 5805 continue; 5806 } 5807 5808 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 5809 /* 5810 * $p is initialized for $parent and still attached to 5811 * @sch. Disable and exit for @sch, switch over to 5812 * $parent, override the state to READY to account for 5813 * $p having already been initialized, and then enable. 5814 */ 5815 scx_disable_and_exit_task(sch, p); 5816 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 5817 scx_set_task_state(p, SCX_TASK_INIT); 5818 scx_set_task_sched(p, parent); 5819 scx_set_task_state(p, SCX_TASK_READY); 5820 scx_enable_task(parent, p); 5821 } 5822 5823 task_rq_unlock(rq, p, &rf); 5824 put_task_struct(p); 5825 } 5826 scx_task_iter_stop(&sti); 5827 5828 scx_disable_dump(sch); 5829 5830 scx_cgroup_unlock(); 5831 percpu_up_write(&scx_fork_rwsem); 5832 5833 /* 5834 * All tasks are moved off of @sch but there may still be on-going 5835 * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use 5836 * the expedited version as ancestors may be waiting in bypass mode. 5837 * Also, tell the parent that there is no need to keep running bypass 5838 * DSQs for us. 
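 * (disable_bypass_dsp() drops the bypass dispatch enable depth on both @sch
 * and its host; see enable_bypass_dsp().)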
5839 */ 5840 synchronize_rcu_expedited(); 5841 disable_bypass_dsp(sch); 5842 5843 scx_unlink_sched(sch); 5844 5845 mutex_unlock(&scx_enable_mutex); 5846 5847 /* 5848 * @sch is now unlinked from the parent's children list. Notify and call 5849 * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called 5850 * after unlinking and releasing all locks. See scx_claim_exit(). 5851 */ 5852 wake_up_all(&scx_unlink_waitq); 5853 5854 if (parent->ops.sub_detach && sch->sub_attached) { 5855 struct scx_sub_detach_args sub_detach_args = { 5856 .ops = &sch->ops, 5857 .cgroup_path = sch->cgrp_path, 5858 }; 5859 SCX_CALL_OP(parent, sub_detach, NULL, 5860 &sub_detach_args); 5861 } 5862 5863 if (sch->ops.exit) 5864 SCX_CALL_OP(sch, exit, NULL, sch->exit_info); 5865 if (sch->sub_kset) 5866 kobject_del(&sch->sub_kset->kobj); 5867 kobject_del(&sch->kobj); 5868 } 5869 #else /* CONFIG_EXT_SUB_SCHED */ 5870 static void drain_descendants(struct scx_sched *sch) { } 5871 static void scx_sub_disable(struct scx_sched *sch) { } 5872 #endif /* CONFIG_EXT_SUB_SCHED */ 5873 5874 static void scx_root_disable(struct scx_sched *sch) 5875 { 5876 struct scx_exit_info *ei = sch->exit_info; 5877 struct scx_task_iter sti; 5878 struct task_struct *p; 5879 int cpu; 5880 5881 /* guarantee forward progress and wait for descendants to be disabled */ 5882 scx_bypass(sch, true); 5883 drain_descendants(sch); 5884 5885 switch (scx_set_enable_state(SCX_DISABLING)) { 5886 case SCX_DISABLING: 5887 WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); 5888 break; 5889 case SCX_DISABLED: 5890 pr_warn("sched_ext: ops error detected without ops (%s)\n", 5891 sch->exit_info->msg); 5892 WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); 5893 goto done; 5894 default: 5895 break; 5896 } 5897 5898 /* 5899 * Here, every runnable task is guaranteed to make forward progress and 5900 * we can safely use blocking synchronization constructs. Actually 5901 * disable ops. 5902 */ 5903 mutex_lock(&scx_enable_mutex); 5904 5905 static_branch_disable(&__scx_switched_all); 5906 WRITE_ONCE(scx_switching_all, false); 5907 5908 /* 5909 * Shut down cgroup support before tasks so that the cgroup attach path 5910 * doesn't race against scx_disable_and_exit_task(). 5911 */ 5912 scx_cgroup_lock(); 5913 scx_cgroup_exit(sch); 5914 scx_cgroup_unlock(); 5915 5916 /* 5917 * The BPF scheduler is going away. All tasks including %TASK_DEAD ones 5918 * must be switched out and exited synchronously. 5919 */ 5920 percpu_down_write(&scx_fork_rwsem); 5921 5922 scx_init_task_enabled = false; 5923 5924 scx_task_iter_start(&sti, NULL); 5925 while ((p = scx_task_iter_next_locked(&sti))) { 5926 unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 5927 const struct sched_class *old_class = p->sched_class; 5928 const struct sched_class *new_class = scx_setscheduler_class(p); 5929 5930 update_rq_clock(task_rq(p)); 5931 5932 if (old_class != new_class) 5933 queue_flags |= DEQUEUE_CLASS; 5934 5935 scoped_guard (sched_change, p, queue_flags) { 5936 p->sched_class = new_class; 5937 } 5938 5939 scx_disable_and_exit_task(scx_task_sched(p), p); 5940 } 5941 scx_task_iter_stop(&sti); 5942 5943 scx_disable_dump(sch); 5944 5945 scx_cgroup_lock(); 5946 set_cgroup_sched(sch_cgroup(sch), NULL); 5947 scx_cgroup_unlock(); 5948 5949 percpu_up_write(&scx_fork_rwsem); 5950 5951 /* 5952 * Invalidate all the rq clocks to prevent getting outdated 5953 * rq clocks from a previous scx scheduler. 
5954 */ 5955 for_each_possible_cpu(cpu) { 5956 struct rq *rq = cpu_rq(cpu); 5957 scx_rq_clock_invalidate(rq); 5958 } 5959 5960 /* no task is on scx, turn off all the switches and flush in-progress calls */ 5961 static_branch_disable(&__scx_enabled); 5962 bitmap_zero(sch->has_op, SCX_OPI_END); 5963 scx_idle_disable(); 5964 synchronize_rcu(); 5965 5966 if (ei->kind >= SCX_EXIT_ERROR) { 5967 pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", 5968 sch->ops.name, ei->reason); 5969 5970 if (ei->msg[0] != '\0') 5971 pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg); 5972 #ifdef CONFIG_STACKTRACE 5973 stack_trace_print(ei->bt, ei->bt_len, 2); 5974 #endif 5975 } else { 5976 pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", 5977 sch->ops.name, ei->reason); 5978 } 5979 5980 if (sch->ops.exit) 5981 SCX_CALL_OP(sch, exit, NULL, ei); 5982 5983 scx_unlink_sched(sch); 5984 5985 /* 5986 * scx_root clearing must be inside cpus_read_lock(). See 5987 * handle_hotplug(). 5988 */ 5989 cpus_read_lock(); 5990 RCU_INIT_POINTER(scx_root, NULL); 5991 cpus_read_unlock(); 5992 5993 /* 5994 * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs 5995 * could observe an object of the same name still in the hierarchy when 5996 * the next scheduler is loaded. 5997 */ 5998 #ifdef CONFIG_EXT_SUB_SCHED 5999 if (sch->sub_kset) 6000 kobject_del(&sch->sub_kset->kobj); 6001 #endif 6002 kobject_del(&sch->kobj); 6003 6004 free_kick_syncs(); 6005 6006 mutex_unlock(&scx_enable_mutex); 6007 6008 WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); 6009 done: 6010 scx_bypass(sch, false); 6011 } 6012 6013 /* 6014 * Claim the exit on @sch. The caller must ensure that the helper kthread work 6015 * is kicked before the current task can be preempted. Once exit_kind is 6016 * claimed, scx_error() can no longer trigger, so if the current task gets 6017 * preempted and the BPF scheduler fails to schedule it back, the helper work 6018 * will never be kicked and the whole system can wedge. 6019 */ 6020 static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) 6021 { 6022 int none = SCX_EXIT_NONE; 6023 6024 lockdep_assert_preemption_disabled(); 6025 6026 if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) 6027 kind = SCX_EXIT_ERROR; 6028 6029 if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) 6030 return false; 6031 6032 /* 6033 * Some CPUs may be trapped in the dispatch paths. Set the aborting 6034 * flag to break potential live-lock scenarios, ensuring we can 6035 * successfully reach scx_bypass(). 6036 */ 6037 WRITE_ONCE(sch->aborting, true); 6038 6039 /* 6040 * Propagate exits to descendants immediately. Each has a dedicated 6041 * helper kthread and can run in parallel. While most of disabling is 6042 * serialized, running them in separate threads allows parallelizing 6043 * ops.exit(), which can take arbitrarily long prolonging bypass mode. 6044 * 6045 * To guarantee forward progress, this propagation must be in-line so 6046 * that ->aborting is synchronously asserted for all sub-scheds. The 6047 * propagation is also the interlocking point against sub-sched 6048 * attachment. See scx_link_sched(). 6049 * 6050 * This doesn't cause recursions as propagation only takes place for 6051 * non-propagation exits. 
6052 */ 6053 if (kind != SCX_EXIT_PARENT) { 6054 scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) { 6055 struct scx_sched *pos; 6056 scx_for_each_descendant_pre(pos, sch) 6057 scx_disable(pos, SCX_EXIT_PARENT); 6058 } 6059 } 6060 6061 return true; 6062 } 6063 6064 static void scx_disable_workfn(struct kthread_work *work) 6065 { 6066 struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); 6067 struct scx_exit_info *ei = sch->exit_info; 6068 int kind; 6069 6070 kind = atomic_read(&sch->exit_kind); 6071 while (true) { 6072 if (kind == SCX_EXIT_DONE) /* already disabled? */ 6073 return; 6074 WARN_ON_ONCE(kind == SCX_EXIT_NONE); 6075 if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) 6076 break; 6077 } 6078 ei->kind = kind; 6079 ei->reason = scx_exit_reason(ei->kind); 6080 6081 if (scx_parent(sch)) 6082 scx_sub_disable(sch); 6083 else 6084 scx_root_disable(sch); 6085 } 6086 6087 static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) 6088 { 6089 guard(preempt)(); 6090 if (scx_claim_exit(sch, kind)) 6091 irq_work_queue(&sch->disable_irq_work); 6092 } 6093 6094 /** 6095 * scx_flush_disable_work - flush the disable work and wait for it to finish 6096 * @sch: the scheduler 6097 * 6098 * sch->disable_work might not be queued yet, which would make 6099 * kthread_flush_work() a noop. Sync the irq_work first to guarantee that the 6100 * kthread work has been queued before waiting for it. 6101 */ 6102 static void scx_flush_disable_work(struct scx_sched *sch) 6103 { 6104 int kind; 6105 6106 do { 6107 irq_work_sync(&sch->disable_irq_work); 6108 kthread_flush_work(&sch->disable_work); 6109 kind = atomic_read(&sch->exit_kind); 6110 } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE); 6111 } 6112 6113 static void dump_newline(struct seq_buf *s) 6114 { 6115 trace_sched_ext_dump(""); 6116 6117 /* @s may be zero sized and seq_buf triggers WARN if so */ 6118 if (s->size) 6119 seq_buf_putc(s, '\n'); 6120 } 6121 6122 static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) 6123 { 6124 va_list args; 6125 6126 #ifdef CONFIG_TRACEPOINTS 6127 if (trace_sched_ext_dump_enabled()) { 6128 /* protected by scx_dump_lock */ 6129 static char line_buf[SCX_EXIT_MSG_LEN]; 6130 6131 va_start(args, fmt); 6132 vscnprintf(line_buf, sizeof(line_buf), fmt, args); 6133 va_end(args); 6134 6135 trace_sched_ext_dump(line_buf); 6136 } 6137 #endif 6138 /* @s may be zero sized and seq_buf triggers WARN if so */ 6139 if (s->size) { 6140 va_start(args, fmt); 6141 seq_buf_vprintf(s, fmt, args); 6142 va_end(args); 6143 6144 seq_buf_putc(s, '\n'); 6145 } 6146 } 6147 6148 static void dump_stack_trace(struct seq_buf *s, const char *prefix, 6149 const unsigned long *bt, unsigned int len) 6150 { 6151 unsigned int i; 6152 6153 for (i = 0; i < len; i++) 6154 dump_line(s, "%s%pS", prefix, (void *)bt[i]); 6155 } 6156 6157 static void ops_dump_init(struct seq_buf *s, const char *prefix) 6158 { 6159 struct scx_dump_data *dd = &scx_dump_data; 6160 6161 lockdep_assert_irqs_disabled(); 6162 6163 dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ 6164 dd->first = true; 6165 dd->cursor = 0; 6166 dd->s = s; 6167 dd->prefix = prefix; 6168 } 6169 6170 static void ops_dump_flush(void) 6171 { 6172 struct scx_dump_data *dd = &scx_dump_data; 6173 char *line = dd->buf.line; 6174 6175 if (!dd->cursor) 6176 return; 6177 6178 /* 6179 * There's something to flush and this is the first line. Insert a blank 6180 * line to distinguish ops dump.
6181 */ 6182 if (dd->first) { 6183 dump_newline(dd->s); 6184 dd->first = false; 6185 } 6186 6187 /* 6188 * There may be multiple lines in $line. Scan and emit each line 6189 * separately. 6190 */ 6191 while (true) { 6192 char *end = line; 6193 char c; 6194 6195 while (*end != '\n' && *end != '\0') 6196 end++; 6197 6198 /* 6199 * If $line overflowed, it may not have newline at the end. 6200 * Always emit with a newline. 6201 */ 6202 c = *end; 6203 *end = '\0'; 6204 dump_line(dd->s, "%s%s", dd->prefix, line); 6205 if (c == '\0') 6206 break; 6207 6208 /* move to the next line */ 6209 end++; 6210 if (*end == '\0') 6211 break; 6212 line = end; 6213 } 6214 6215 dd->cursor = 0; 6216 } 6217 6218 static void ops_dump_exit(void) 6219 { 6220 ops_dump_flush(); 6221 scx_dump_data.cpu = -1; 6222 } 6223 6224 static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx, 6225 struct rq *rq, struct task_struct *p, char marker) 6226 { 6227 static unsigned long bt[SCX_EXIT_BT_LEN]; 6228 struct scx_sched *task_sch = scx_task_sched(p); 6229 const char *own_marker; 6230 char sch_id_buf[32]; 6231 char dsq_id_buf[19] = "(n/a)"; 6232 unsigned long ops_state = atomic_long_read(&p->scx.ops_state); 6233 unsigned int bt_len = 0; 6234 6235 own_marker = task_sch == sch ? "*" : ""; 6236 6237 if (task_sch->level == 0) 6238 scnprintf(sch_id_buf, sizeof(sch_id_buf), "root"); 6239 else 6240 scnprintf(sch_id_buf, sizeof(sch_id_buf), "sub%d-%llu", 6241 task_sch->level, task_sch->ops.sub_cgroup_id); 6242 6243 if (p->scx.dsq) 6244 scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", 6245 (unsigned long long)p->scx.dsq->id); 6246 6247 dump_newline(s); 6248 dump_line(s, " %c%c %s[%d] %s%s %+ldms", 6249 marker, task_state_to_char(p), p->comm, p->pid, 6250 own_marker, sch_id_buf, 6251 jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); 6252 dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", 6253 scx_get_task_state(p) >> SCX_TASK_STATE_SHIFT, 6254 p->scx.flags & ~SCX_TASK_STATE_MASK, 6255 p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, 6256 ops_state >> SCX_OPSS_QSEQ_SHIFT); 6257 dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", 6258 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); 6259 dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", 6260 p->scx.dsq_vtime, p->scx.slice, p->scx.weight); 6261 dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr), 6262 p->migration_disabled); 6263 6264 if (SCX_HAS_OP(sch, dump_task)) { 6265 ops_dump_init(s, " "); 6266 SCX_CALL_OP(sch, dump_task, rq, dctx, p); 6267 ops_dump_exit(); 6268 } 6269 6270 #ifdef CONFIG_STACKTRACE 6271 bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); 6272 #endif 6273 if (bt_len) { 6274 dump_newline(s); 6275 dump_stack_trace(s, " ", bt, bt_len); 6276 } 6277 } 6278 6279 /* 6280 * Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless 6281 * of which scheduler they belong to. If false, only dump tasks owned by @sch. 6282 * For SysRq-D dumps, @dump_all_tasks=false since all schedulers are dumped 6283 * separately. For error dumps, @dump_all_tasks=true since only the failing 6284 * scheduler is dumped. 
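 * In the latter case, the single dump covers tasks from every scheduler, not
 * just @sch's, so it still gives a complete picture.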
6285 */ 6286 static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, 6287 size_t dump_len, bool dump_all_tasks) 6288 { 6289 static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; 6290 struct scx_dump_ctx dctx = { 6291 .kind = ei->kind, 6292 .exit_code = ei->exit_code, 6293 .reason = ei->reason, 6294 .at_ns = ktime_get_ns(), 6295 .at_jiffies = jiffies, 6296 }; 6297 struct seq_buf s; 6298 struct scx_event_stats events; 6299 char *buf; 6300 int cpu; 6301 6302 guard(raw_spinlock_irqsave)(&scx_dump_lock); 6303 6304 if (sch->dump_disabled) 6305 return; 6306 6307 seq_buf_init(&s, ei->dump, dump_len); 6308 6309 #ifdef CONFIG_EXT_SUB_SCHED 6310 if (sch->level == 0) 6311 dump_line(&s, "%s: root", sch->ops.name); 6312 else 6313 dump_line(&s, "%s: sub%d-%llu %s", 6314 sch->ops.name, sch->level, sch->ops.sub_cgroup_id, 6315 sch->cgrp_path); 6316 #endif 6317 if (ei->kind == SCX_EXIT_NONE) { 6318 dump_line(&s, "Debug dump triggered by %s", ei->reason); 6319 } else { 6320 dump_line(&s, "%s[%d] triggered exit kind %d:", 6321 current->comm, current->pid, ei->kind); 6322 dump_line(&s, " %s (%s)", ei->reason, ei->msg); 6323 dump_newline(&s); 6324 dump_line(&s, "Backtrace:"); 6325 dump_stack_trace(&s, " ", ei->bt, ei->bt_len); 6326 } 6327 6328 if (SCX_HAS_OP(sch, dump)) { 6329 ops_dump_init(&s, ""); 6330 SCX_CALL_OP(sch, dump, NULL, &dctx); 6331 ops_dump_exit(); 6332 } 6333 6334 dump_newline(&s); 6335 dump_line(&s, "CPU states"); 6336 dump_line(&s, "----------"); 6337 6338 for_each_possible_cpu(cpu) { 6339 struct rq *rq = cpu_rq(cpu); 6340 struct rq_flags rf; 6341 struct task_struct *p; 6342 struct seq_buf ns; 6343 size_t avail, used; 6344 bool idle; 6345 6346 rq_lock_irqsave(rq, &rf); 6347 6348 idle = list_empty(&rq->scx.runnable_list) && 6349 rq->curr->sched_class == &idle_sched_class; 6350 6351 if (idle && !SCX_HAS_OP(sch, dump_cpu)) 6352 goto next; 6353 6354 /* 6355 * We don't yet know whether ops.dump_cpu() will produce output 6356 * and we may want to skip the default CPU dump if it doesn't. 6357 * Use a nested seq_buf to generate the standard dump so that we 6358 * can decide whether to commit later. 
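 * If @s has already overflowed, seq_buf_get_buf() hands back no space and the
 * nested buffer is never committed - see the @avail check below.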
6359 */ 6360 avail = seq_buf_get_buf(&s, &buf); 6361 seq_buf_init(&ns, buf, avail); 6362 6363 dump_newline(&ns); 6364 dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu", 6365 cpu, rq->scx.nr_running, rq->scx.flags, 6366 rq->scx.cpu_released, rq->scx.ops_qseq, 6367 rq->scx.kick_sync); 6368 dump_line(&ns, " curr=%s[%d] class=%ps", 6369 rq->curr->comm, rq->curr->pid, 6370 rq->curr->sched_class); 6371 if (!cpumask_empty(rq->scx.cpus_to_kick)) 6372 dump_line(&ns, " cpus_to_kick : %*pb", 6373 cpumask_pr_args(rq->scx.cpus_to_kick)); 6374 if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) 6375 dump_line(&ns, " idle_to_kick : %*pb", 6376 cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); 6377 if (!cpumask_empty(rq->scx.cpus_to_preempt)) 6378 dump_line(&ns, " cpus_to_preempt: %*pb", 6379 cpumask_pr_args(rq->scx.cpus_to_preempt)); 6380 if (!cpumask_empty(rq->scx.cpus_to_wait)) 6381 dump_line(&ns, " cpus_to_wait : %*pb", 6382 cpumask_pr_args(rq->scx.cpus_to_wait)); 6383 if (!cpumask_empty(rq->scx.cpus_to_sync)) 6384 dump_line(&ns, " cpus_to_sync : %*pb", 6385 cpumask_pr_args(rq->scx.cpus_to_sync)); 6386 6387 used = seq_buf_used(&ns); 6388 if (SCX_HAS_OP(sch, dump_cpu)) { 6389 ops_dump_init(&ns, " "); 6390 SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle); 6391 ops_dump_exit(); 6392 } 6393 6394 /* 6395 * If idle && nothing generated by ops.dump_cpu(), there's 6396 * nothing interesting. Skip. 6397 */ 6398 if (idle && used == seq_buf_used(&ns)) 6399 goto next; 6400 6401 /* 6402 * $s may already have overflowed when $ns was created. If so, 6403 * calling commit on it will trigger BUG. 6404 */ 6405 if (avail) { 6406 seq_buf_commit(&s, seq_buf_used(&ns)); 6407 if (seq_buf_has_overflowed(&ns)) 6408 seq_buf_set_overflow(&s); 6409 } 6410 6411 if (rq->curr->sched_class == &ext_sched_class && 6412 (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) 6413 scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*'); 6414 6415 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) 6416 if (dump_all_tasks || scx_task_on_sched(sch, p)) 6417 scx_dump_task(sch, &s, &dctx, rq, p, ' '); 6418 next: 6419 rq_unlock_irqrestore(rq, &rf); 6420 } 6421 6422 dump_newline(&s); 6423 dump_line(&s, "Event counters"); 6424 dump_line(&s, "--------------"); 6425 6426 scx_read_events(sch, &events); 6427 scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); 6428 scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 6429 scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); 6430 scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); 6431 scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 6432 scx_dump_event(s, &events, SCX_EV_REENQ_IMMED); 6433 scx_dump_event(s, &events, SCX_EV_REENQ_LOCAL_REPEAT); 6434 scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL); 6435 scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); 6436 scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); 6437 scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); 6438 scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED); 6439 scx_dump_event(s, &events, SCX_EV_SUB_BYPASS_DISPATCH); 6440 6441 if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) 6442 memcpy(ei->dump + dump_len - sizeof(trunc_marker), 6443 trunc_marker, sizeof(trunc_marker)); 6444 } 6445 6446 static void scx_disable_irq_workfn(struct irq_work *irq_work) 6447 { 6448 struct scx_sched *sch = container_of(irq_work, struct scx_sched, disable_irq_work); 6449 struct scx_exit_info *ei = sch->exit_info; 6450 6451 if (ei->kind >= SCX_EXIT_ERROR) 6452 
scx_dump_state(sch, ei, sch->ops.exit_dump_len, true); 6453 6454 kthread_queue_work(sch->helper, &sch->disable_work); 6455 } 6456 6457 static bool scx_vexit(struct scx_sched *sch, 6458 enum scx_exit_kind kind, s64 exit_code, 6459 const char *fmt, va_list args) 6460 { 6461 struct scx_exit_info *ei = sch->exit_info; 6462 6463 guard(preempt)(); 6464 6465 if (!scx_claim_exit(sch, kind)) 6466 return false; 6467 6468 ei->exit_code = exit_code; 6469 #ifdef CONFIG_STACKTRACE 6470 if (kind >= SCX_EXIT_ERROR) 6471 ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); 6472 #endif 6473 vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); 6474 6475 /* 6476 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again 6477 * in scx_disable_workfn(). 6478 */ 6479 ei->kind = kind; 6480 ei->reason = scx_exit_reason(ei->kind); 6481 6482 irq_work_queue(&sch->disable_irq_work); 6483 return true; 6484 } 6485 6486 static int alloc_kick_syncs(void) 6487 { 6488 int cpu; 6489 6490 /* 6491 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size 6492 * can exceed percpu allocator limits on large machines. 6493 */ 6494 for_each_possible_cpu(cpu) { 6495 struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); 6496 struct scx_kick_syncs *new_ksyncs; 6497 6498 WARN_ON_ONCE(rcu_access_pointer(*ksyncs)); 6499 6500 new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids), 6501 GFP_KERNEL, cpu_to_node(cpu)); 6502 if (!new_ksyncs) { 6503 free_kick_syncs(); 6504 return -ENOMEM; 6505 } 6506 6507 rcu_assign_pointer(*ksyncs, new_ksyncs); 6508 } 6509 6510 return 0; 6511 } 6512 6513 static void free_pnode(struct scx_sched_pnode *pnode) 6514 { 6515 if (!pnode) 6516 return; 6517 exit_dsq(&pnode->global_dsq); 6518 kfree(pnode); 6519 } 6520 6521 static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node) 6522 { 6523 struct scx_sched_pnode *pnode; 6524 6525 pnode = kzalloc_node(sizeof(*pnode), GFP_KERNEL, node); 6526 if (!pnode) 6527 return NULL; 6528 6529 if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) { 6530 kfree(pnode); 6531 return NULL; 6532 } 6533 6534 return pnode; 6535 } 6536 6537 /* 6538 * Allocate and initialize a new scx_sched. @cgrp's reference is always 6539 * consumed whether the function succeeds or fails. 6540 */ 6541 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, 6542 struct cgroup *cgrp, 6543 struct scx_sched *parent) 6544 { 6545 struct scx_sched *sch; 6546 s32 level = parent ? 
parent->level + 1 : 0; 6547 s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids; 6548 6549 sch = kzalloc_flex(*sch, ancestors, level + 1); 6550 if (!sch) { 6551 ret = -ENOMEM; 6552 goto err_put_cgrp; 6553 } 6554 6555 sch->exit_info = alloc_exit_info(ops->exit_dump_len); 6556 if (!sch->exit_info) { 6557 ret = -ENOMEM; 6558 goto err_free_sch; 6559 } 6560 6561 ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params); 6562 if (ret < 0) 6563 goto err_free_ei; 6564 6565 sch->pnode = kzalloc_objs(sch->pnode[0], nr_node_ids); 6566 if (!sch->pnode) { 6567 ret = -ENOMEM; 6568 goto err_free_hash; 6569 } 6570 6571 for_each_node_state(node, N_POSSIBLE) { 6572 sch->pnode[node] = alloc_pnode(sch, node); 6573 if (!sch->pnode[node]) { 6574 ret = -ENOMEM; 6575 goto err_free_pnode; 6576 } 6577 } 6578 6579 sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; 6580 sch->pcpu = __alloc_percpu(struct_size_t(struct scx_sched_pcpu, 6581 dsp_ctx.buf, sch->dsp_max_batch), 6582 __alignof__(struct scx_sched_pcpu)); 6583 if (!sch->pcpu) { 6584 ret = -ENOMEM; 6585 goto err_free_pnode; 6586 } 6587 6588 for_each_possible_cpu(cpu) { 6589 ret = init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch); 6590 if (ret) { 6591 bypass_fail_cpu = cpu; 6592 goto err_free_pcpu; 6593 } 6594 } 6595 6596 for_each_possible_cpu(cpu) { 6597 struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); 6598 6599 pcpu->sch = sch; 6600 INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node); 6601 } 6602 6603 sch->helper = kthread_run_worker(0, "sched_ext_helper"); 6604 if (IS_ERR(sch->helper)) { 6605 ret = PTR_ERR(sch->helper); 6606 goto err_free_pcpu; 6607 } 6608 6609 sched_set_fifo(sch->helper->task); 6610 6611 if (parent) 6612 memcpy(sch->ancestors, parent->ancestors, 6613 level * sizeof(parent->ancestors[0])); 6614 sch->ancestors[level] = sch; 6615 sch->level = level; 6616 6617 if (ops->timeout_ms) 6618 sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms); 6619 else 6620 sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT; 6621 6622 sch->slice_dfl = SCX_SLICE_DFL; 6623 atomic_set(&sch->exit_kind, SCX_EXIT_NONE); 6624 sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn); 6625 kthread_init_work(&sch->disable_work, scx_disable_workfn); 6626 timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); 6627 6628 if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) { 6629 ret = -ENOMEM; 6630 goto err_stop_helper; 6631 } 6632 if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) { 6633 ret = -ENOMEM; 6634 goto err_free_lb_cpumask; 6635 } 6636 sch->ops = *ops; 6637 rcu_assign_pointer(ops->priv, sch); 6638 6639 sch->kobj.kset = scx_kset; 6640 INIT_LIST_HEAD(&sch->all); 6641 6642 #ifdef CONFIG_EXT_SUB_SCHED 6643 char *buf = kzalloc(PATH_MAX, GFP_KERNEL); 6644 if (!buf) { 6645 ret = -ENOMEM; 6646 goto err_free_lb_resched; 6647 } 6648 cgroup_path(cgrp, buf, PATH_MAX); 6649 sch->cgrp_path = kstrdup(buf, GFP_KERNEL); 6650 kfree(buf); 6651 if (!sch->cgrp_path) { 6652 ret = -ENOMEM; 6653 goto err_free_lb_resched; 6654 } 6655 6656 sch->cgrp = cgrp; 6657 INIT_LIST_HEAD(&sch->children); 6658 INIT_LIST_HEAD(&sch->sibling); 6659 6660 if (parent) 6661 ret = kobject_init_and_add(&sch->kobj, &scx_ktype, 6662 &parent->sub_kset->kobj, 6663 "sub-%llu", cgroup_id(cgrp)); 6664 else 6665 ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); 6666 6667 if (ret < 0) { 6668 RCU_INIT_POINTER(ops->priv, NULL); 6669 kobject_put(&sch->kobj); 6670 return ERR_PTR(ret); 6671 } 6672 6673 if (ops->sub_attach) { 6674 
sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); 6675 if (!sch->sub_kset) { 6676 RCU_INIT_POINTER(ops->priv, NULL); 6677 kobject_put(&sch->kobj); 6678 return ERR_PTR(-ENOMEM); 6679 } 6680 } 6681 #else /* CONFIG_EXT_SUB_SCHED */ 6682 ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); 6683 if (ret < 0) { 6684 RCU_INIT_POINTER(ops->priv, NULL); 6685 kobject_put(&sch->kobj); 6686 return ERR_PTR(ret); 6687 } 6688 #endif /* CONFIG_EXT_SUB_SCHED */ 6689 return sch; 6690 6691 #ifdef CONFIG_EXT_SUB_SCHED 6692 err_free_lb_resched: 6693 RCU_INIT_POINTER(ops->priv, NULL); 6694 free_cpumask_var(sch->bypass_lb_resched_cpumask); 6695 #endif 6696 err_free_lb_cpumask: 6697 free_cpumask_var(sch->bypass_lb_donee_cpumask); 6698 err_stop_helper: 6699 kthread_destroy_worker(sch->helper); 6700 err_free_pcpu: 6701 for_each_possible_cpu(cpu) { 6702 if (cpu == bypass_fail_cpu) 6703 break; 6704 exit_dsq(bypass_dsq(sch, cpu)); 6705 } 6706 free_percpu(sch->pcpu); 6707 err_free_pnode: 6708 for_each_node_state(node, N_POSSIBLE) 6709 free_pnode(sch->pnode[node]); 6710 kfree(sch->pnode); 6711 err_free_hash: 6712 rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); 6713 err_free_ei: 6714 free_exit_info(sch->exit_info); 6715 err_free_sch: 6716 kfree(sch); 6717 err_put_cgrp: 6718 #ifdef CONFIG_EXT_SUB_SCHED 6719 cgroup_put(cgrp); 6720 #endif 6721 return ERR_PTR(ret); 6722 } 6723 6724 static int check_hotplug_seq(struct scx_sched *sch, 6725 const struct sched_ext_ops *ops) 6726 { 6727 unsigned long long global_hotplug_seq; 6728 6729 /* 6730 * If a hotplug event has occurred between when a scheduler was 6731 * initialized, and when we were able to attach, exit and notify user 6732 * space about it. 6733 */ 6734 if (ops->hotplug_seq) { 6735 global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); 6736 if (ops->hotplug_seq != global_hotplug_seq) { 6737 scx_exit(sch, SCX_EXIT_UNREG_KERN, 6738 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 6739 "expected hotplug seq %llu did not match actual %llu", 6740 ops->hotplug_seq, global_hotplug_seq); 6741 return -EBUSY; 6742 } 6743 } 6744 6745 return 0; 6746 } 6747 6748 static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) 6749 { 6750 /* 6751 * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the 6752 * ops.enqueue() callback isn't implemented. 6753 */ 6754 if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { 6755 scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); 6756 return -EINVAL; 6757 } 6758 6759 /* 6760 * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle 6761 * selection policy to be enabled. 6762 */ 6763 if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && 6764 (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { 6765 scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); 6766 return -EINVAL; 6767 } 6768 6769 if (ops->cpu_acquire || ops->cpu_release) 6770 pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); 6771 6772 return 0; 6773 } 6774 6775 /* 6776 * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid 6777 * starvation. During the READY -> ENABLED task switching loop, the calling 6778 * thread's sched_class gets switched from fair to ext. As fair has higher 6779 * priority than ext, the calling thread can be indefinitely starved under 6780 * fair-class saturation, leading to a system hang. 
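 * The worker is created on first use in scx_enable() and set to SCHED_FIFO
 * via sched_set_fifo(), so the enable work keeps running even when the fair
 * class is saturated.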
6781 */ 6782 struct scx_enable_cmd { 6783 struct kthread_work work; 6784 struct sched_ext_ops *ops; 6785 int ret; 6786 }; 6787 6788 static void scx_root_enable_workfn(struct kthread_work *work) 6789 { 6790 struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); 6791 struct sched_ext_ops *ops = cmd->ops; 6792 struct cgroup *cgrp = root_cgroup(); 6793 struct scx_sched *sch; 6794 struct scx_task_iter sti; 6795 struct task_struct *p; 6796 int i, cpu, ret; 6797 6798 mutex_lock(&scx_enable_mutex); 6799 6800 if (scx_enable_state() != SCX_DISABLED) { 6801 ret = -EBUSY; 6802 goto err_unlock; 6803 } 6804 6805 /* 6806 * @ops->priv binds @ops to its scx_sched instance. It is set here by 6807 * scx_alloc_and_add_sched() and cleared at the tail of bpf_scx_unreg(), 6808 * which runs after scx_root_disable() has dropped scx_enable_mutex. If 6809 * it's still non-NULL here, a previous attachment on @ops has not 6810 * finished tearing down; proceeding would let the in-flight unreg's 6811 * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign. 6812 */ 6813 if (rcu_access_pointer(ops->priv)) { 6814 ret = -EBUSY; 6815 goto err_unlock; 6816 } 6817 6818 ret = alloc_kick_syncs(); 6819 if (ret) 6820 goto err_unlock; 6821 6822 #ifdef CONFIG_EXT_SUB_SCHED 6823 cgroup_get(cgrp); 6824 #endif 6825 sch = scx_alloc_and_add_sched(ops, cgrp, NULL); 6826 if (IS_ERR(sch)) { 6827 ret = PTR_ERR(sch); 6828 goto err_free_ksyncs; 6829 } 6830 6831 /* 6832 * Transition to ENABLING and clear exit info to arm the disable path. 6833 * Failure triggers full disabling from here on. 6834 */ 6835 WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); 6836 WARN_ON_ONCE(scx_root); 6837 6838 atomic_long_set(&scx_nr_rejected, 0); 6839 6840 for_each_possible_cpu(cpu) { 6841 struct rq *rq = cpu_rq(cpu); 6842 6843 rq->scx.local_dsq.sched = sch; 6844 rq->scx.cpuperf_target = SCX_CPUPERF_ONE; 6845 } 6846 6847 /* 6848 * Keep CPUs stable during enable so that the BPF scheduler can track 6849 * online CPUs by watching ->on/offline_cpu() after ->init(). 6850 */ 6851 cpus_read_lock(); 6852 6853 /* 6854 * Make the scheduler instance visible. Must be inside cpus_read_lock(). 6855 * See handle_hotplug(). 6856 */ 6857 rcu_assign_pointer(scx_root, sch); 6858 6859 ret = scx_link_sched(sch); 6860 if (ret) { 6861 cpus_read_unlock(); 6862 goto err_disable; 6863 } 6864 6865 scx_idle_enable(ops); 6866 6867 if (sch->ops.init) { 6868 ret = SCX_CALL_OP_RET(sch, init, NULL); 6869 if (ret) { 6870 ret = ops_sanitize_err(sch, "init", ret); 6871 cpus_read_unlock(); 6872 scx_error(sch, "ops.init() failed (%d)", ret); 6873 goto err_disable; 6874 } 6875 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 6876 } 6877 6878 for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) 6879 if (((void (**)(void))ops)[i]) 6880 set_bit(i, sch->has_op); 6881 6882 ret = check_hotplug_seq(sch, ops); 6883 if (ret) { 6884 cpus_read_unlock(); 6885 goto err_disable; 6886 } 6887 scx_idle_update_selcpu_topology(ops); 6888 6889 cpus_read_unlock(); 6890 6891 ret = validate_ops(sch, ops); 6892 if (ret) 6893 goto err_disable; 6894 6895 /* 6896 * Once __scx_enabled is set, %current can be switched to SCX anytime. 6897 * This can lead to stalls as some BPF schedulers (e.g. userspace 6898 * scheduling) may not function correctly before all tasks are switched. 6899 * Init in bypass mode to guarantee forward progress. 
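 * The bypass taken here is dropped near the end of enabling, after all
 * eligible tasks have been switched to SCX; error paths deliberately keep it
 * on as the disable path runs next.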
6900 */ 6901 scx_bypass(sch, true); 6902 6903 for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) 6904 if (((void (**)(void))ops)[i]) 6905 set_bit(i, sch->has_op); 6906 6907 if (sch->ops.cpu_acquire || sch->ops.cpu_release) 6908 sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT; 6909 6910 /* 6911 * Lock out forks, cgroup on/offlining and moves before opening the 6912 * floodgate so that they don't wander into the operations prematurely. 6913 */ 6914 percpu_down_write(&scx_fork_rwsem); 6915 6916 WARN_ON_ONCE(scx_init_task_enabled); 6917 scx_init_task_enabled = true; 6918 6919 /* 6920 * Enable ops for every task. Fork is excluded by scx_fork_rwsem 6921 * preventing new tasks from being added. No need to exclude tasks 6922 * leaving as sched_ext_free() can handle both prepped and enabled 6923 * tasks. Prep all tasks first and then enable them with preemption 6924 * disabled. 6925 * 6926 * All cgroups should be initialized before scx_init_task() so that the 6927 * BPF scheduler can reliably track each task's cgroup membership from 6928 * scx_init_task(). Lock out cgroup on/offlining and task migrations 6929 * while tasks are being initialized so that scx_cgroup_can_attach() 6930 * never sees uninitialized tasks. 6931 */ 6932 scx_cgroup_lock(); 6933 set_cgroup_sched(sch_cgroup(sch), sch); 6934 ret = scx_cgroup_init(sch); 6935 if (ret) 6936 goto err_disable_unlock_all; 6937 6938 scx_task_iter_start(&sti, NULL); 6939 while ((p = scx_task_iter_next_locked(&sti))) { 6940 struct rq_flags rf; 6941 struct rq *rq; 6942 6943 /* 6944 * @p may already be dead, have lost all its usages counts and 6945 * be waiting for RCU grace period before being freed. @p can't 6946 * be initialized for SCX in such cases and should be ignored. 6947 */ 6948 if (!tryget_task_struct(p)) 6949 continue; 6950 6951 /* 6952 * Set %INIT_BEGIN under the iter's rq lock so that a concurrent 6953 * sched_ext_dead() does not call ops.exit_task() on @p while 6954 * ops.init_task() is running. If sched_ext_dead() runs before 6955 * this store, it has already removed @p from scx_tasks and the 6956 * iter won't visit @p; if it runs after, it observes 6957 * %INIT_BEGIN and transitions to %DEAD without calling ops, 6958 * leaving the post-init recheck below to unwind. 6959 */ 6960 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 6961 scx_task_iter_unlock(&sti); 6962 6963 ret = __scx_init_task(sch, p, false); 6964 6965 rq = task_rq_lock(p, &rf); 6966 6967 if (unlikely(ret)) { 6968 if (scx_get_task_state(p) != SCX_TASK_DEAD) 6969 scx_set_task_state(p, SCX_TASK_NONE); 6970 task_rq_unlock(rq, p, &rf); 6971 scx_task_iter_stop(&sti); 6972 scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", 6973 ret, p->comm, p->pid); 6974 put_task_struct(p); 6975 goto err_disable_unlock_all; 6976 } 6977 6978 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 6979 /* 6980 * sched_ext_dead() observed %INIT_BEGIN and set %DEAD. 6981 * ops.exit_task() is owed to the sched __scx_init_task() 6982 * ran against; call it now. 6983 */ 6984 scx_sub_init_cancel_task(sch, p); 6985 } else { 6986 scx_set_task_state(p, SCX_TASK_INIT); 6987 scx_set_task_sched(p, sch); 6988 scx_set_task_state(p, SCX_TASK_READY); 6989 } 6990 6991 task_rq_unlock(rq, p, &rf); 6992 put_task_struct(p); 6993 } 6994 scx_task_iter_stop(&sti); 6995 scx_cgroup_unlock(); 6996 percpu_up_write(&scx_fork_rwsem); 6997 6998 /* 6999 * All tasks are READY. It's safe to turn on scx_enabled() and switch 7000 * all eligible tasks. 
7001 */ 7002 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 7003 static_branch_enable(&__scx_enabled); 7004 7005 /* 7006 * We're fully committed and can't fail. The task READY -> ENABLED 7007 * transitions here are synchronized against sched_ext_free() through 7008 * scx_tasks_lock. 7009 */ 7010 percpu_down_write(&scx_fork_rwsem); 7011 scx_task_iter_start(&sti, NULL); 7012 while ((p = scx_task_iter_next_locked(&sti))) { 7013 unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 7014 const struct sched_class *old_class = p->sched_class; 7015 const struct sched_class *new_class = scx_setscheduler_class(p); 7016 7017 if (scx_get_task_state(p) != SCX_TASK_READY) 7018 continue; 7019 7020 if (old_class != new_class) 7021 queue_flags |= DEQUEUE_CLASS; 7022 7023 scoped_guard (sched_change, p, queue_flags) { 7024 p->scx.slice = READ_ONCE(sch->slice_dfl); 7025 p->sched_class = new_class; 7026 } 7027 } 7028 scx_task_iter_stop(&sti); 7029 percpu_up_write(&scx_fork_rwsem); 7030 7031 scx_bypass(sch, false); 7032 7033 if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { 7034 WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); 7035 goto err_disable; 7036 } 7037 7038 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) 7039 static_branch_enable(&__scx_switched_all); 7040 7041 pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", 7042 sch->ops.name, scx_switched_all() ? "" : " (partial)"); 7043 kobject_uevent(&sch->kobj, KOBJ_ADD); 7044 mutex_unlock(&scx_enable_mutex); 7045 7046 atomic_long_inc(&scx_enable_seq); 7047 7048 cmd->ret = 0; 7049 return; 7050 7051 err_free_ksyncs: 7052 free_kick_syncs(); 7053 err_unlock: 7054 mutex_unlock(&scx_enable_mutex); 7055 cmd->ret = ret; 7056 return; 7057 7058 err_disable_unlock_all: 7059 scx_cgroup_unlock(); 7060 percpu_up_write(&scx_fork_rwsem); 7061 /* we'll soon enter disable path, keep bypass on */ 7062 err_disable: 7063 mutex_unlock(&scx_enable_mutex); 7064 /* 7065 * Returning an error code here would not pass all the error information 7066 * to userspace. Record errno using scx_error() for cases scx_error() 7067 * wasn't already invoked and exit indicating success so that the error 7068 * is notified through ops.exit() with all the details. 7069 * 7070 * Flush scx_disable_work to ensure that error is reported before init 7071 * completion. sch's base reference will be put by bpf_scx_unreg(). 7072 */ 7073 scx_error(sch, "scx_root_enable() failed (%d)", ret); 7074 scx_flush_disable_work(sch); 7075 cmd->ret = 0; 7076 } 7077 7078 #ifdef CONFIG_EXT_SUB_SCHED 7079 /* verify that a scheduler can be attached to @cgrp and return the parent */ 7080 static struct scx_sched *find_parent_sched(struct cgroup *cgrp) 7081 { 7082 struct scx_sched *parent = cgrp->scx_sched; 7083 struct scx_sched *pos; 7084 7085 lockdep_assert_held(&scx_sched_lock); 7086 7087 /* can't attach twice to the same cgroup */ 7088 if (parent->cgrp == cgrp) 7089 return ERR_PTR(-EBUSY); 7090 7091 /* does $parent allow sub-scheds? 
*/ 7092 if (!parent->ops.sub_attach) 7093 return ERR_PTR(-EOPNOTSUPP); 7094 7095 /* can't insert between $parent and its exiting children */ 7096 list_for_each_entry(pos, &parent->children, sibling) 7097 if (cgroup_is_descendant(pos->cgrp, cgrp)) 7098 return ERR_PTR(-EBUSY); 7099 7100 return parent; 7101 } 7102 7103 static bool assert_task_ready_or_enabled(struct task_struct *p) 7104 { 7105 u32 state = scx_get_task_state(p); 7106 7107 switch (state) { 7108 case SCX_TASK_READY: 7109 case SCX_TASK_ENABLED: 7110 return true; 7111 default: 7112 WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched", 7113 state, p->comm, p->pid); 7114 return false; 7115 } 7116 } 7117 7118 static void scx_sub_enable_workfn(struct kthread_work *work) 7119 { 7120 struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); 7121 struct sched_ext_ops *ops = cmd->ops; 7122 struct cgroup *cgrp; 7123 struct scx_sched *parent, *sch; 7124 struct scx_task_iter sti; 7125 struct task_struct *p; 7126 s32 i, ret; 7127 7128 mutex_lock(&scx_enable_mutex); 7129 7130 if (!scx_enabled()) { 7131 ret = -ENODEV; 7132 goto out_unlock; 7133 } 7134 7135 /* See scx_root_enable_workfn() for the @ops->priv check. */ 7136 if (rcu_access_pointer(ops->priv)) { 7137 ret = -EBUSY; 7138 goto out_unlock; 7139 } 7140 7141 cgrp = cgroup_get_from_id(ops->sub_cgroup_id); 7142 if (IS_ERR(cgrp)) { 7143 ret = PTR_ERR(cgrp); 7144 goto out_unlock; 7145 } 7146 7147 raw_spin_lock_irq(&scx_sched_lock); 7148 parent = find_parent_sched(cgrp); 7149 if (IS_ERR(parent)) { 7150 raw_spin_unlock_irq(&scx_sched_lock); 7151 ret = PTR_ERR(parent); 7152 goto out_put_cgrp; 7153 } 7154 kobject_get(&parent->kobj); 7155 raw_spin_unlock_irq(&scx_sched_lock); 7156 7157 /* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */ 7158 sch = scx_alloc_and_add_sched(ops, cgrp, parent); 7159 kobject_put(&parent->kobj); 7160 if (IS_ERR(sch)) { 7161 ret = PTR_ERR(sch); 7162 goto out_unlock; 7163 } 7164 7165 ret = scx_link_sched(sch); 7166 if (ret) 7167 goto err_disable; 7168 7169 if (sch->level >= SCX_SUB_MAX_DEPTH) { 7170 scx_error(sch, "max nesting depth %d violated", 7171 SCX_SUB_MAX_DEPTH); 7172 goto err_disable; 7173 } 7174 7175 if (sch->ops.init) { 7176 ret = SCX_CALL_OP_RET(sch, init, NULL); 7177 if (ret) { 7178 ret = ops_sanitize_err(sch, "init", ret); 7179 scx_error(sch, "ops.init() failed (%d)", ret); 7180 goto err_disable; 7181 } 7182 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 7183 } 7184 7185 if (validate_ops(sch, ops)) 7186 goto err_disable; 7187 7188 struct scx_sub_attach_args sub_attach_args = { 7189 .ops = &sch->ops, 7190 .cgroup_path = sch->cgrp_path, 7191 }; 7192 7193 ret = SCX_CALL_OP_RET(parent, sub_attach, NULL, 7194 &sub_attach_args); 7195 if (ret) { 7196 ret = ops_sanitize_err(sch, "sub_attach", ret); 7197 scx_error(sch, "parent rejected (%d)", ret); 7198 goto err_disable; 7199 } 7200 sch->sub_attached = true; 7201 7202 scx_bypass(sch, true); 7203 7204 for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) 7205 if (((void (**)(void))ops)[i]) 7206 set_bit(i, sch->has_op); 7207 7208 percpu_down_write(&scx_fork_rwsem); 7209 scx_cgroup_lock(); 7210 7211 /* 7212 * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see 7213 * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down. 
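 * (The offline side is handled by scx_cgroup_lifetime_notify() below, which
 * exits a sched whose cgroup is going offline.)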
7214 */ 7215 set_cgroup_sched(sch_cgroup(sch), sch); 7216 if (!(cgrp->self.flags & CSS_ONLINE)) { 7217 scx_error(sch, "cgroup is not online"); 7218 goto err_unlock_and_disable; 7219 } 7220 7221 /* 7222 * Initialize tasks for the new child $sch without exiting them for 7223 * $parent so that the tasks can always be reverted back to $parent 7224 * sched on child init failure. 7225 */ 7226 WARN_ON_ONCE(scx_enabling_sub_sched); 7227 scx_enabling_sub_sched = sch; 7228 7229 scx_task_iter_start(&sti, sch->cgrp); 7230 while ((p = scx_task_iter_next_locked(&sti))) { 7231 struct rq *rq; 7232 struct rq_flags rf; 7233 7234 /* 7235 * Task iteration may visit the same task twice when racing 7236 * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which 7237 * finished __scx_init_task() and skip if set. 7238 * 7239 * A task may exit and get freed between __scx_init_task() 7240 * completion and scx_enable_task(). In such cases, 7241 * scx_disable_and_exit_task() must exit the task for both the 7242 * parent and child scheds. 7243 */ 7244 if (p->scx.flags & SCX_TASK_SUB_INIT) 7245 continue; 7246 7247 /* see scx_root_enable() */ 7248 if (!tryget_task_struct(p)) 7249 continue; 7250 7251 if (!assert_task_ready_or_enabled(p)) { 7252 ret = -EINVAL; 7253 goto abort; 7254 } 7255 7256 scx_task_iter_unlock(&sti); 7257 7258 /* 7259 * As $p is still on $parent, it can't be transitioned to INIT. 7260 * Let's worry about task state later. Use __scx_init_task(). 7261 */ 7262 ret = __scx_init_task(sch, p, false); 7263 if (ret) 7264 goto abort; 7265 7266 rq = task_rq_lock(p, &rf); 7267 7268 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 7269 /* 7270 * sched_ext_dead() raced us between __scx_init_task() 7271 * and this rq lock and ran exit_task() on $parent (the 7272 * sched @p was on at that point), not on @sch. @sch's 7273 * just-completed init is owed an exit_task() and we 7274 * issue it here. 7275 */ 7276 scx_sub_init_cancel_task(sch, p); 7277 task_rq_unlock(rq, p, &rf); 7278 put_task_struct(p); 7279 continue; 7280 } 7281 7282 p->scx.flags |= SCX_TASK_SUB_INIT; 7283 task_rq_unlock(rq, p, &rf); 7284 7285 put_task_struct(p); 7286 } 7287 scx_task_iter_stop(&sti); 7288 7289 /* 7290 * All tasks are prepped. Disable/exit tasks for $parent and enable for 7291 * the new @sch. 7292 */ 7293 scx_task_iter_start(&sti, sch->cgrp); 7294 while ((p = scx_task_iter_next_locked(&sti))) { 7295 /* 7296 * Use clearing of %SCX_TASK_SUB_INIT to detect and skip 7297 * duplicate iterations. 7298 */ 7299 if (!(p->scx.flags & SCX_TASK_SUB_INIT)) 7300 continue; 7301 7302 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 7303 /* 7304 * $p must be either READY or ENABLED. If ENABLED, 7305 * __scx_disabled_and_exit_task() first disables and 7306 * makes it READY. However, after exiting $p, it will 7307 * leave $p as READY. 7308 */ 7309 assert_task_ready_or_enabled(p); 7310 __scx_disable_and_exit_task(parent, p); 7311 7312 /* 7313 * $p is now only initialized for @sch and READY, which 7314 * is what we want. Assign it to @sch and enable. 
7315 */ 7316 scx_set_task_sched(p, sch); 7317 scx_enable_task(sch, p); 7318 7319 p->scx.flags &= ~SCX_TASK_SUB_INIT; 7320 } 7321 } 7322 scx_task_iter_stop(&sti); 7323 7324 scx_enabling_sub_sched = NULL; 7325 7326 scx_cgroup_unlock(); 7327 percpu_up_write(&scx_fork_rwsem); 7328 7329 scx_bypass(sch, false); 7330 7331 pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); 7332 kobject_uevent(&sch->kobj, KOBJ_ADD); 7333 ret = 0; 7334 goto out_unlock; 7335 7336 out_put_cgrp: 7337 cgroup_put(cgrp); 7338 out_unlock: 7339 mutex_unlock(&scx_enable_mutex); 7340 cmd->ret = ret; 7341 return; 7342 7343 abort: 7344 put_task_struct(p); 7345 scx_task_iter_stop(&sti); 7346 7347 /* 7348 * Undo __scx_init_task() for tasks we marked. scx_enable_task() never 7349 * ran for @sch on them, so calling scx_disable_task() here would invoke 7350 * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched 7351 * must stay set until SUB_INIT is cleared from every marked task - 7352 * scx_disable_and_exit_task() reads it when a task exits concurrently. 7353 */ 7354 scx_task_iter_start(&sti, sch->cgrp); 7355 while ((p = scx_task_iter_next_locked(&sti))) { 7356 if (p->scx.flags & SCX_TASK_SUB_INIT) { 7357 scx_sub_init_cancel_task(sch, p); 7358 p->scx.flags &= ~SCX_TASK_SUB_INIT; 7359 } 7360 } 7361 scx_task_iter_stop(&sti); 7362 scx_enabling_sub_sched = NULL; 7363 err_unlock_and_disable: 7364 /* we'll soon enter disable path, keep bypass on */ 7365 scx_cgroup_unlock(); 7366 percpu_up_write(&scx_fork_rwsem); 7367 err_disable: 7368 mutex_unlock(&scx_enable_mutex); 7369 scx_flush_disable_work(sch); 7370 cmd->ret = 0; 7371 } 7372 7373 static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb, 7374 unsigned long action, void *data) 7375 { 7376 struct cgroup *cgrp = data; 7377 struct cgroup *parent = cgroup_parent(cgrp); 7378 7379 if (!cgroup_on_dfl(cgrp)) 7380 return NOTIFY_OK; 7381 7382 switch (action) { 7383 case CGROUP_LIFETIME_ONLINE: 7384 /* inherit ->scx_sched from $parent */ 7385 if (parent) 7386 rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched); 7387 break; 7388 case CGROUP_LIFETIME_OFFLINE: 7389 /* if there is a sched attached, shoot it down */ 7390 if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp) 7391 scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN, 7392 SCX_ECODE_RSN_CGROUP_OFFLINE, 7393 "cgroup %llu going offline", cgroup_id(cgrp)); 7394 break; 7395 } 7396 7397 return NOTIFY_OK; 7398 } 7399 7400 static struct notifier_block scx_cgroup_lifetime_nb = { 7401 .notifier_call = scx_cgroup_lifetime_notify, 7402 }; 7403 7404 static s32 __init scx_cgroup_lifetime_notifier_init(void) 7405 { 7406 return blocking_notifier_chain_register(&cgroup_lifetime_notifier, 7407 &scx_cgroup_lifetime_nb); 7408 } 7409 core_initcall(scx_cgroup_lifetime_notifier_init); 7410 #endif /* CONFIG_EXT_SUB_SCHED */ 7411 7412 static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) 7413 { 7414 static struct kthread_worker *helper; 7415 static DEFINE_MUTEX(helper_mutex); 7416 struct scx_enable_cmd cmd; 7417 7418 if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) { 7419 pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); 7420 return -EINVAL; 7421 } 7422 7423 if (!READ_ONCE(helper)) { 7424 mutex_lock(&helper_mutex); 7425 if (!helper) { 7426 struct kthread_worker *w = 7427 kthread_run_worker(0, "scx_enable_helper"); 7428 if (IS_ERR_OR_NULL(w)) { 7429 mutex_unlock(&helper_mutex); 7430 return -ENOMEM; 7431 } 7432 sched_set_fifo(w->task); 7433 WRITE_ONCE(helper, w); 7434 } 7435 
mutex_unlock(&helper_mutex); 7436 } 7437 7438 #ifdef CONFIG_EXT_SUB_SCHED 7439 if (ops->sub_cgroup_id > 1) 7440 kthread_init_work(&cmd.work, scx_sub_enable_workfn); 7441 else 7442 #endif /* CONFIG_EXT_SUB_SCHED */ 7443 kthread_init_work(&cmd.work, scx_root_enable_workfn); 7444 cmd.ops = ops; 7445 7446 kthread_queue_work(READ_ONCE(helper), &cmd.work); 7447 kthread_flush_work(&cmd.work); 7448 return cmd.ret; 7449 } 7450 7451 7452 /******************************************************************************** 7453 * bpf_struct_ops plumbing. 7454 */ 7455 #include <linux/bpf_verifier.h> 7456 #include <linux/bpf.h> 7457 #include <linux/btf.h> 7458 7459 static const struct btf_type *task_struct_type; 7460 7461 static bool bpf_scx_is_valid_access(int off, int size, 7462 enum bpf_access_type type, 7463 const struct bpf_prog *prog, 7464 struct bpf_insn_access_aux *info) 7465 { 7466 if (type != BPF_READ) 7467 return false; 7468 if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) 7469 return false; 7470 if (off % size != 0) 7471 return false; 7472 7473 return btf_ctx_access(off, size, type, prog, info); 7474 } 7475 7476 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, 7477 const struct bpf_reg_state *reg, int off, 7478 int size) 7479 { 7480 const struct btf_type *t; 7481 7482 t = btf_type_by_id(reg->btf, reg->btf_id); 7483 if (t == task_struct_type) { 7484 /* 7485 * COMPAT: Will be removed in v6.23. 7486 */ 7487 if ((off >= offsetof(struct task_struct, scx.slice) && 7488 off + size <= offsetofend(struct task_struct, scx.slice)) || 7489 (off >= offsetof(struct task_struct, scx.dsq_vtime) && 7490 off + size <= offsetofend(struct task_struct, scx.dsq_vtime))) { 7491 pr_warn("sched_ext: Writing directly to p->scx.slice/dsq_vtime is deprecated, use scx_bpf_task_set_slice/dsq_vtime()"); 7492 return SCALAR_VALUE; 7493 } 7494 7495 if (off >= offsetof(struct task_struct, scx.disallow) && 7496 off + size <= offsetofend(struct task_struct, scx.disallow)) 7497 return SCALAR_VALUE; 7498 } 7499 7500 return -EACCES; 7501 } 7502 7503 static const struct bpf_verifier_ops bpf_scx_verifier_ops = { 7504 .get_func_proto = bpf_base_func_proto, 7505 .is_valid_access = bpf_scx_is_valid_access, 7506 .btf_struct_access = bpf_scx_btf_struct_access, 7507 }; 7508 7509 static int bpf_scx_init_member(const struct btf_type *t, 7510 const struct btf_member *member, 7511 void *kdata, const void *udata) 7512 { 7513 const struct sched_ext_ops *uops = udata; 7514 struct sched_ext_ops *ops = kdata; 7515 u32 moff = __btf_member_bit_offset(t, member) / 8; 7516 int ret; 7517 7518 switch (moff) { 7519 case offsetof(struct sched_ext_ops, dispatch_max_batch): 7520 if (*(u32 *)(udata + moff) > INT_MAX) 7521 return -E2BIG; 7522 ops->dispatch_max_batch = *(u32 *)(udata + moff); 7523 return 1; 7524 case offsetof(struct sched_ext_ops, flags): 7525 if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) 7526 return -EINVAL; 7527 ops->flags = *(u64 *)(udata + moff); 7528 return 1; 7529 case offsetof(struct sched_ext_ops, name): 7530 ret = bpf_obj_name_cpy(ops->name, uops->name, 7531 sizeof(ops->name)); 7532 if (ret < 0) 7533 return ret; 7534 if (ret == 0) 7535 return -EINVAL; 7536 return 1; 7537 case offsetof(struct sched_ext_ops, timeout_ms): 7538 if (msecs_to_jiffies(*(u32 *)(udata + moff)) > 7539 SCX_WATCHDOG_MAX_TIMEOUT) 7540 return -E2BIG; 7541 ops->timeout_ms = *(u32 *)(udata + moff); 7542 return 1; 7543 case offsetof(struct sched_ext_ops, exit_dump_len): 7544 ops->exit_dump_len = 7545 *(u32 *)(udata + moff) ?: 
SCX_EXIT_DUMP_DFL_LEN; 7546 return 1; 7547 case offsetof(struct sched_ext_ops, hotplug_seq): 7548 ops->hotplug_seq = *(u64 *)(udata + moff); 7549 return 1; 7550 #ifdef CONFIG_EXT_SUB_SCHED 7551 case offsetof(struct sched_ext_ops, sub_cgroup_id): 7552 ops->sub_cgroup_id = *(u64 *)(udata + moff); 7553 return 1; 7554 #endif /* CONFIG_EXT_SUB_SCHED */ 7555 } 7556 7557 return 0; 7558 } 7559 7560 #ifdef CONFIG_EXT_SUB_SCHED 7561 static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog) 7562 { 7563 struct scx_sched *sch; 7564 7565 guard(rcu)(); 7566 sch = scx_prog_sched(prog->aux); 7567 if (unlikely(!sch)) 7568 return; 7569 7570 scx_error(sch, "dispatch recursion detected"); 7571 } 7572 #endif /* CONFIG_EXT_SUB_SCHED */ 7573 7574 static int bpf_scx_check_member(const struct btf_type *t, 7575 const struct btf_member *member, 7576 const struct bpf_prog *prog) 7577 { 7578 u32 moff = __btf_member_bit_offset(t, member) / 8; 7579 7580 switch (moff) { 7581 case offsetof(struct sched_ext_ops, init_task): 7582 #ifdef CONFIG_EXT_GROUP_SCHED 7583 case offsetof(struct sched_ext_ops, cgroup_init): 7584 case offsetof(struct sched_ext_ops, cgroup_exit): 7585 case offsetof(struct sched_ext_ops, cgroup_prep_move): 7586 #endif 7587 case offsetof(struct sched_ext_ops, cpu_online): 7588 case offsetof(struct sched_ext_ops, cpu_offline): 7589 case offsetof(struct sched_ext_ops, init): 7590 case offsetof(struct sched_ext_ops, exit): 7591 case offsetof(struct sched_ext_ops, sub_attach): 7592 case offsetof(struct sched_ext_ops, sub_detach): 7593 break; 7594 default: 7595 if (prog->sleepable) 7596 return -EINVAL; 7597 } 7598 7599 #ifdef CONFIG_EXT_SUB_SCHED 7600 /* 7601 * Enable private stack for operations that can nest along the 7602 * hierarchy. 7603 * 7604 * XXX - Ideally, we should only do this for scheds that allow 7605 * sub-scheds and sub-scheds themselves but I don't know how to access 7606 * struct_ops from here. 7607 */ 7608 switch (moff) { 7609 case offsetof(struct sched_ext_ops, dispatch): 7610 prog->aux->priv_stack_requested = true; 7611 prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch; 7612 } 7613 #endif /* CONFIG_EXT_SUB_SCHED */ 7614 7615 return 0; 7616 } 7617 7618 static int bpf_scx_reg(void *kdata, struct bpf_link *link) 7619 { 7620 return scx_enable(kdata, link); 7621 } 7622 7623 static void bpf_scx_unreg(void *kdata, struct bpf_link *link) 7624 { 7625 struct sched_ext_ops *ops = kdata; 7626 struct scx_sched *sch = rcu_dereference_protected(ops->priv, true); 7627 7628 scx_disable(sch, SCX_EXIT_UNREG); 7629 scx_flush_disable_work(sch); 7630 RCU_INIT_POINTER(ops->priv, NULL); 7631 kobject_put(&sch->kobj); 7632 } 7633 7634 static int bpf_scx_init(struct btf *btf) 7635 { 7636 task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]); 7637 7638 return 0; 7639 } 7640 7641 static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) 7642 { 7643 /* 7644 * sched_ext does not support updating the actively-loaded BPF 7645 * scheduler, as registering a BPF scheduler can always fail if the 7646 * scheduler returns an error code for e.g. ops.init(), ops.init_task(), 7647 * etc. Similarly, we can always race with unregistration happening 7648 * elsewhere, such as with sysrq. 
7649 */ 7650 return -EOPNOTSUPP; 7651 } 7652 7653 static int bpf_scx_validate(void *kdata) 7654 { 7655 return 0; 7656 } 7657 7658 static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } 7659 static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {} 7660 static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {} 7661 static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {} 7662 static void sched_ext_ops__tick(struct task_struct *p) {} 7663 static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {} 7664 static void sched_ext_ops__running(struct task_struct *p) {} 7665 static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {} 7666 static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {} 7667 static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; } 7668 static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; } 7669 static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {} 7670 static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {} 7671 static void sched_ext_ops__update_idle(s32 cpu, bool idle) {} 7672 static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {} 7673 static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {} 7674 static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } 7675 static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {} 7676 static void sched_ext_ops__enable(struct task_struct *p) {} 7677 static void sched_ext_ops__disable(struct task_struct *p) {} 7678 #ifdef CONFIG_EXT_GROUP_SCHED 7679 static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } 7680 static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {} 7681 static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } 7682 static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 7683 static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 7684 static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} 7685 static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} 7686 static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} 7687 #endif /* CONFIG_EXT_GROUP_SCHED */ 7688 static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; } 7689 static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {} 7690 static void sched_ext_ops__cpu_online(s32 cpu) {} 7691 static void sched_ext_ops__cpu_offline(s32 cpu) {} 7692 static s32 sched_ext_ops__init(void) { return -EINVAL; } 7693 static void sched_ext_ops__exit(struct scx_exit_info *info) {} 7694 static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {} 7695 static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} 7696 static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {} 7697 7698 static struct sched_ext_ops __bpf_ops_sched_ext_ops = { 7699 .select_cpu = 
sched_ext_ops__select_cpu, 7700 .enqueue = sched_ext_ops__enqueue, 7701 .dequeue = sched_ext_ops__dequeue, 7702 .dispatch = sched_ext_ops__dispatch, 7703 .tick = sched_ext_ops__tick, 7704 .runnable = sched_ext_ops__runnable, 7705 .running = sched_ext_ops__running, 7706 .stopping = sched_ext_ops__stopping, 7707 .quiescent = sched_ext_ops__quiescent, 7708 .yield = sched_ext_ops__yield, 7709 .core_sched_before = sched_ext_ops__core_sched_before, 7710 .set_weight = sched_ext_ops__set_weight, 7711 .set_cpumask = sched_ext_ops__set_cpumask, 7712 .update_idle = sched_ext_ops__update_idle, 7713 .cpu_acquire = sched_ext_ops__cpu_acquire, 7714 .cpu_release = sched_ext_ops__cpu_release, 7715 .init_task = sched_ext_ops__init_task, 7716 .exit_task = sched_ext_ops__exit_task, 7717 .enable = sched_ext_ops__enable, 7718 .disable = sched_ext_ops__disable, 7719 #ifdef CONFIG_EXT_GROUP_SCHED 7720 .cgroup_init = sched_ext_ops__cgroup_init, 7721 .cgroup_exit = sched_ext_ops__cgroup_exit, 7722 .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, 7723 .cgroup_move = sched_ext_ops__cgroup_move, 7724 .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, 7725 .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, 7726 .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, 7727 .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, 7728 #endif 7729 .sub_attach = sched_ext_ops__sub_attach, 7730 .sub_detach = sched_ext_ops__sub_detach, 7731 .cpu_online = sched_ext_ops__cpu_online, 7732 .cpu_offline = sched_ext_ops__cpu_offline, 7733 .init = sched_ext_ops__init, 7734 .exit = sched_ext_ops__exit, 7735 .dump = sched_ext_ops__dump, 7736 .dump_cpu = sched_ext_ops__dump_cpu, 7737 .dump_task = sched_ext_ops__dump_task, 7738 }; 7739 7740 static struct bpf_struct_ops bpf_sched_ext_ops = { 7741 .verifier_ops = &bpf_scx_verifier_ops, 7742 .reg = bpf_scx_reg, 7743 .unreg = bpf_scx_unreg, 7744 .check_member = bpf_scx_check_member, 7745 .init_member = bpf_scx_init_member, 7746 .init = bpf_scx_init, 7747 .update = bpf_scx_update, 7748 .validate = bpf_scx_validate, 7749 .name = "sched_ext_ops", 7750 .owner = THIS_MODULE, 7751 .cfi_stubs = &__bpf_ops_sched_ext_ops 7752 }; 7753 7754 7755 /******************************************************************************** 7756 * System integration and init. 
7757 */ 7758 7759 static void sysrq_handle_sched_ext_reset(u8 key) 7760 { 7761 struct scx_sched *sch; 7762 7763 rcu_read_lock(); 7764 sch = rcu_dereference(scx_root); 7765 if (likely(sch)) 7766 scx_disable(sch, SCX_EXIT_SYSRQ); 7767 else 7768 pr_info("sched_ext: BPF schedulers not loaded\n"); 7769 rcu_read_unlock(); 7770 } 7771 7772 static const struct sysrq_key_op sysrq_sched_ext_reset_op = { 7773 .handler = sysrq_handle_sched_ext_reset, 7774 .help_msg = "reset-sched-ext(S)", 7775 .action_msg = "Disable sched_ext and revert all tasks to CFS", 7776 .enable_mask = SYSRQ_ENABLE_RTNICE, 7777 }; 7778 7779 static void sysrq_handle_sched_ext_dump(u8 key) 7780 { 7781 struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; 7782 struct scx_sched *sch; 7783 7784 list_for_each_entry_rcu(sch, &scx_sched_all, all) 7785 scx_dump_state(sch, &ei, 0, false); 7786 } 7787 7788 static const struct sysrq_key_op sysrq_sched_ext_dump_op = { 7789 .handler = sysrq_handle_sched_ext_dump, 7790 .help_msg = "dump-sched-ext(D)", 7791 .action_msg = "Trigger sched_ext debug dump", 7792 .enable_mask = SYSRQ_ENABLE_RTNICE, 7793 }; 7794 7795 static bool can_skip_idle_kick(struct rq *rq) 7796 { 7797 lockdep_assert_rq_held(rq); 7798 7799 /* 7800 * We can skip idle kicking if @rq is going to go through at least one 7801 * full SCX scheduling cycle before going idle. Just checking whether 7802 * curr is not idle is insufficient because we could be racing 7803 * balance_one() trying to pull the next task from a remote rq, which 7804 * may fail, and @rq may become idle afterwards. 7805 * 7806 * The race window is small and we don't and can't guarantee that @rq is 7807 * only kicked while idle anyway. Skip only when sure. 7808 */ 7809 return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); 7810 } 7811 7812 static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) 7813 { 7814 struct rq *rq = cpu_rq(cpu); 7815 struct scx_rq *this_scx = &this_rq->scx; 7816 const struct sched_class *cur_class; 7817 bool should_wait = false; 7818 unsigned long flags; 7819 7820 raw_spin_rq_lock_irqsave(rq, flags); 7821 cur_class = rq->curr->sched_class; 7822 7823 /* 7824 * During CPU hotplug, a CPU may depend on kicking itself to make 7825 * forward progress. Allow kicking self regardless of online state. If 7826 * @cpu is running a higher class task, we have no control over @cpu. 7827 * Skip kicking. 
7828 */ 7829 if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) && 7830 !sched_class_above(cur_class, &ext_sched_class)) { 7831 if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { 7832 if (cur_class == &ext_sched_class) 7833 rq->curr->scx.slice = 0; 7834 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 7835 } 7836 7837 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { 7838 if (cur_class == &ext_sched_class) { 7839 cpumask_set_cpu(cpu, this_scx->cpus_to_sync); 7840 ksyncs[cpu] = rq->scx.kick_sync; 7841 should_wait = true; 7842 } 7843 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 7844 } 7845 7846 resched_curr(rq); 7847 } else { 7848 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 7849 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 7850 } 7851 7852 raw_spin_rq_unlock_irqrestore(rq, flags); 7853 7854 return should_wait; 7855 } 7856 7857 static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) 7858 { 7859 struct rq *rq = cpu_rq(cpu); 7860 unsigned long flags; 7861 7862 raw_spin_rq_lock_irqsave(rq, flags); 7863 7864 if (!can_skip_idle_kick(rq) && 7865 (cpu_online(cpu) || cpu == cpu_of(this_rq))) 7866 resched_curr(rq); 7867 7868 raw_spin_rq_unlock_irqrestore(rq, flags); 7869 } 7870 7871 static void kick_cpus_irq_workfn(struct irq_work *irq_work) 7872 { 7873 struct rq *this_rq = this_rq(); 7874 struct scx_rq *this_scx = &this_rq->scx; 7875 struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs); 7876 bool should_wait = false; 7877 unsigned long *ksyncs; 7878 s32 cpu; 7879 7880 /* can race with free_kick_syncs() during scheduler disable */ 7881 if (unlikely(!ksyncs_pcpu)) 7882 return; 7883 7884 ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs; 7885 7886 for_each_cpu(cpu, this_scx->cpus_to_kick) { 7887 should_wait |= kick_one_cpu(cpu, this_rq, ksyncs); 7888 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); 7889 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 7890 } 7891 7892 for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { 7893 kick_one_cpu_if_idle(cpu, this_rq); 7894 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 7895 } 7896 7897 /* 7898 * Can't wait in hardirq — kick_sync can't advance, deadlocking if 7899 * CPUs wait for each other. Defer to kick_sync_wait_bal_cb(). 7900 */ 7901 if (should_wait) { 7902 raw_spin_rq_lock(this_rq); 7903 this_scx->kick_sync_pending = true; 7904 resched_curr(this_rq); 7905 raw_spin_rq_unlock(this_rq); 7906 } 7907 } 7908 7909 /** 7910 * print_scx_info - print out sched_ext scheduler state 7911 * @log_lvl: the log level to use when printing 7912 * @p: target task 7913 * 7914 * If a sched_ext scheduler is enabled, print the name and state of the 7915 * scheduler. If @p is on sched_ext, print further information about the task. 7916 * 7917 * This function can be safely called on any task as long as the task_struct 7918 * itself is accessible. While safe, this function isn't synchronized and may 7919 * print out mixups or garbages of limited length. 7920 */ 7921 void print_scx_info(const char *log_lvl, struct task_struct *p) 7922 { 7923 struct scx_sched *sch; 7924 enum scx_enable_state state = scx_enable_state(); 7925 const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; 7926 char runnable_at_buf[22] = "?"; 7927 struct sched_class *class; 7928 unsigned long runnable_at; 7929 7930 guard(rcu)(); 7931 7932 sch = scx_task_sched_rcu(p); 7933 7934 if (!sch) 7935 return; 7936 7937 /* 7938 * Carefully check if the task was running on sched_ext, and then 7939 * carefully copy the time it's been runnable, and its state. 
*/ 7941 if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || 7942 class != &ext_sched_class) { 7943 printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name, 7944 scx_enable_state_str[state], all); 7945 return; 7946 } 7947 7948 if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, 7949 sizeof(runnable_at))) 7950 scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", 7951 jiffies_delta_msecs(runnable_at, jiffies)); 7952 7953 /* print everything onto one line to conserve console space */ 7954 printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", 7955 log_lvl, sch->ops.name, scx_enable_state_str[state], all, 7956 runnable_at_buf); 7957 } 7958 7959 static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) 7960 { 7961 struct scx_sched *sch; 7962 7963 guard(rcu)(); 7964 7965 sch = rcu_dereference(scx_root); 7966 if (!sch) 7967 return NOTIFY_OK; 7968 7969 /* 7970 * SCX schedulers often have userspace components which are sometimes 7971 * involved in critical scheduling paths. PM operations involve freezing 7972 * userspace which can lead to scheduling misbehaviors including stalls. 7973 * Let's bypass while PM operations are in progress. 7974 */ 7975 switch (event) { 7976 case PM_HIBERNATION_PREPARE: 7977 case PM_SUSPEND_PREPARE: 7978 case PM_RESTORE_PREPARE: 7979 scx_bypass(sch, true); 7980 break; 7981 case PM_POST_HIBERNATION: 7982 case PM_POST_SUSPEND: 7983 case PM_POST_RESTORE: 7984 scx_bypass(sch, false); 7985 break; 7986 } 7987 7988 return NOTIFY_OK; 7989 } 7990 7991 static struct notifier_block scx_pm_notifier = { 7992 .notifier_call = scx_pm_handler, 7993 }; 7994 7995 void __init init_sched_ext_class(void) 7996 { 7997 s32 cpu, v; 7998 7999 /* 8000 * The following is to prevent the compiler from optimizing out the enum 8001 * definitions so that BPF scheduler implementations can use them 8002 * through the generated vmlinux.h.
8003 */ 8004 WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | 8005 SCX_TG_ONLINE); 8006 8007 scx_idle_init_masks(); 8008 8009 for_each_possible_cpu(cpu) { 8010 struct rq *rq = cpu_rq(cpu); 8011 int n = cpu_to_node(cpu); 8012 8013 /* local_dsq's sch will be set during scx_root_enable() */ 8014 BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL)); 8015 8016 INIT_LIST_HEAD(&rq->scx.runnable_list); 8017 INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); 8018 8019 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); 8020 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); 8021 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); 8022 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); 8023 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n)); 8024 raw_spin_lock_init(&rq->scx.deferred_reenq_lock); 8025 INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals); 8026 INIT_LIST_HEAD(&rq->scx.deferred_reenq_users); 8027 rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); 8028 rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); 8029 8030 if (cpu_online(cpu)) 8031 cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; 8032 } 8033 8034 register_sysrq_key('S', &sysrq_sched_ext_reset_op); 8035 register_sysrq_key('D', &sysrq_sched_ext_dump_op); 8036 INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); 8037 8038 #ifdef CONFIG_EXT_SUB_SCHED 8039 BUG_ON(rhashtable_init(&scx_sched_hash, &scx_sched_hash_params)); 8040 #endif /* CONFIG_EXT_SUB_SCHED */ 8041 } 8042 8043 8044 /******************************************************************************** 8045 * Helpers that can be called from the BPF scheduler. 8046 */ 8047 static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 *enq_flags) 8048 { 8049 bool is_local = dsq_id == SCX_DSQ_LOCAL || 8050 (dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON; 8051 8052 if (*enq_flags & SCX_ENQ_IMMED) { 8053 if (unlikely(!is_local)) { 8054 scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id); 8055 return false; 8056 } 8057 } else if ((sch->ops.flags & SCX_OPS_ALWAYS_ENQ_IMMED) && is_local) { 8058 *enq_flags |= SCX_ENQ_IMMED; 8059 } 8060 8061 return true; 8062 } 8063 8064 static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, 8065 u64 dsq_id, u64 *enq_flags) 8066 { 8067 lockdep_assert_irqs_disabled(); 8068 8069 if (unlikely(!p)) { 8070 scx_error(sch, "called with NULL task"); 8071 return false; 8072 } 8073 8074 if (unlikely(*enq_flags & __SCX_ENQ_INTERNAL_MASK)) { 8075 scx_error(sch, "invalid enq_flags 0x%llx", *enq_flags); 8076 return false; 8077 } 8078 8079 /* see SCX_EV_INSERT_NOT_OWNED definition */ 8080 if (unlikely(!scx_task_on_sched(sch, p))) { 8081 __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); 8082 return false; 8083 } 8084 8085 if (!scx_vet_enq_flags(sch, dsq_id, enq_flags)) 8086 return false; 8087 8088 return true; 8089 } 8090 8091 static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, 8092 u64 dsq_id, u64 enq_flags) 8093 { 8094 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8095 struct task_struct *ddsp_task; 8096 8097 ddsp_task = __this_cpu_read(direct_dispatch_task); 8098 if (ddsp_task) { 8099 mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); 8100 return; 8101 } 8102 8103 if (unlikely(dspc->cursor >= sch->dsp_max_batch)) { 8104 scx_error(sch, "dispatch buffer overflow"); 8105 return; 8106 } 8107 8108 
dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ 8109 .task = p, 8110 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, 8111 .dsq_id = dsq_id, 8112 .enq_flags = enq_flags, 8113 }; 8114 } 8115 8116 __bpf_kfunc_start_defs(); 8117 8118 /** 8119 * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ 8120 * @p: task_struct to insert 8121 * @dsq_id: DSQ to insert into 8122 * @slice: duration @p can run for in nsecs, 0 to keep the current value 8123 * @enq_flags: SCX_ENQ_* 8124 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8125 * 8126 * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to 8127 * call this function spuriously. Can be called from ops.enqueue(), 8128 * ops.select_cpu(), and ops.dispatch(). 8129 * 8130 * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch 8131 * and @p must match the task being enqueued. 8132 * 8133 * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p 8134 * will be directly inserted into the corresponding dispatch queue after 8135 * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be 8136 * inserted into the local DSQ of the CPU returned by ops.select_cpu(). 8137 * @enq_flags are OR'd with the enqueue flags on the enqueue path before the 8138 * task is inserted. 8139 * 8140 * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id 8141 * and this function can be called up to ops.dispatch_max_batch times to insert 8142 * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the 8143 * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the 8144 * counter. 8145 * 8146 * This function doesn't have any locking restrictions and may be called under 8147 * BPF locks (in the future when BPF introduces more flexible locking). 8148 * 8149 * @p is allowed to run for @slice. The scheduling path is triggered on slice 8150 * exhaustion. If zero, the current residual slice is maintained. If 8151 * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with 8152 * scx_bpf_kick_cpu() to trigger scheduling. 8153 * 8154 * Returns %true on successful insertion, %false on failure. On the root 8155 * scheduler, %false return triggers scheduler abort and the caller doesn't need 8156 * to check the return value. 8157 */ 8158 __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, 8159 u64 slice, u64 enq_flags, 8160 const struct bpf_prog_aux *aux) 8161 { 8162 struct scx_sched *sch; 8163 8164 guard(rcu)(); 8165 sch = scx_prog_sched(aux); 8166 if (unlikely(!sch)) 8167 return false; 8168 8169 if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) 8170 return false; 8171 8172 if (slice) 8173 p->scx.slice = slice; 8174 else 8175 p->scx.slice = p->scx.slice ?: 1; 8176 8177 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); 8178 8179 return true; 8180 } 8181 8182 /* 8183 * COMPAT: Will be removed in v6.23 along with the ___v2 suffix.
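 *
 * From the BPF side, insertion is typically written against the
 * scx_bpf_dsq_insert() name via the compat helpers in the scx BPF headers.
 * An illustrative ops.enqueue() doing a plain FIFO insertion (sketch only;
 * MY_DSQ is a hypothetical user DSQ created with scx_bpf_create_dsq() and
 * BPF_STRUCT_OPS() comes from the scx BPF headers):
 *
 *	void BPF_STRUCT_OPS(my_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		scx_bpf_dsq_insert(p, MY_DSQ, SCX_SLICE_DFL, enq_flags);
 *	}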
8184 */ 8185 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, 8186 u64 slice, u64 enq_flags, 8187 const struct bpf_prog_aux *aux) 8188 { 8189 scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags, aux); 8190 } 8191 8192 static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, 8193 u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) 8194 { 8195 if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) 8196 return false; 8197 8198 if (slice) 8199 p->scx.slice = slice; 8200 else 8201 p->scx.slice = p->scx.slice ?: 1; 8202 8203 p->scx.dsq_vtime = vtime; 8204 8205 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 8206 8207 return true; 8208 } 8209 8210 struct scx_bpf_dsq_insert_vtime_args { 8211 /* @p can't be packed together as KF_RCU is not transitive */ 8212 u64 dsq_id; 8213 u64 slice; 8214 u64 vtime; 8215 u64 enq_flags; 8216 }; 8217 8218 /** 8219 * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion 8220 * @p: task_struct to insert 8221 * @args: struct containing the rest of the arguments 8222 * @args->dsq_id: DSQ to insert into 8223 * @args->slice: duration @p can run for in nsecs, 0 to keep the current value 8224 * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ 8225 * @args->enq_flags: SCX_ENQ_* 8226 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8227 * 8228 * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument 8229 * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided 8230 * as an inline wrapper in common.bpf.h. 8231 * 8232 * Insert @p into the vtime priority queue of the DSQ identified by 8233 * @args->dsq_id. Tasks queued into the priority queue are ordered by 8234 * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert(). 8235 * 8236 * @args->vtime ordering is according to time_before64() which considers 8237 * wrapping. A numerically larger vtime may indicate an earlier position in the 8238 * ordering and vice-versa. 8239 * 8240 * A DSQ can only be used as a FIFO or priority queue at any given time and this 8241 * function must not be called on a DSQ which already has one or more FIFO tasks 8242 * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and 8243 * SCX_DSQ_GLOBAL) cannot be used as priority queues. 8244 * 8245 * Returns %true on successful insertion, %false on failure. On the root 8246 * scheduler, %false return triggers scheduler abort and the caller doesn't need 8247 * to check the return value. 8248 */ 8249 __bpf_kfunc bool 8250 __scx_bpf_dsq_insert_vtime(struct task_struct *p, 8251 struct scx_bpf_dsq_insert_vtime_args *args, 8252 const struct bpf_prog_aux *aux) 8253 { 8254 struct scx_sched *sch; 8255 8256 guard(rcu)(); 8257 8258 sch = scx_prog_sched(aux); 8259 if (unlikely(!sch)) 8260 return false; 8261 8262 return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice, 8263 args->vtime, args->enq_flags); 8264 } 8265 8266 /* 8267 * COMPAT: Will be removed in v6.23. 8268 */ 8269 __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, 8270 u64 slice, u64 vtime, u64 enq_flags) 8271 { 8272 struct scx_sched *sch; 8273 8274 guard(rcu)(); 8275 8276 sch = rcu_dereference(scx_root); 8277 if (unlikely(!sch)) 8278 return; 8279 8280 #ifdef CONFIG_EXT_SUB_SCHED 8281 /* 8282 * Disallow if any sub-scheds are attached. There is no way to tell 8283 * which scheduler called us, just error out @p's scheduler. 
8284 */ 8285 if (unlikely(!list_empty(&sch->children))) { 8286 scx_error(scx_task_sched(p), "__scx_bpf_dsq_insert_vtime() must be used"); 8287 return; 8288 } 8289 #endif 8290 8291 scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags); 8292 } 8293 8294 __bpf_kfunc_end_defs(); 8295 8296 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) 8297 BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_IMPLICIT_ARGS | KF_RCU) 8298 BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_IMPLICIT_ARGS | KF_RCU) 8299 BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_IMPLICIT_ARGS | KF_RCU) 8300 BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) 8301 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) 8302 8303 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { 8304 .owner = THIS_MODULE, 8305 .set = &scx_kfunc_ids_enqueue_dispatch, 8306 .filter = scx_kfunc_context_filter, 8307 }; 8308 8309 static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, 8310 struct task_struct *p, u64 dsq_id, u64 enq_flags) 8311 { 8312 struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; 8313 struct scx_sched *sch; 8314 struct rq *this_rq, *src_rq, *locked_rq; 8315 bool dispatched = false; 8316 bool in_balance; 8317 unsigned long flags; 8318 8319 /* 8320 * The verifier considers an iterator slot initialized on any 8321 * KF_ITER_NEW return, so a BPF program may legally reach here after 8322 * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL. 8323 */ 8324 if (unlikely(!src_dsq)) 8325 return false; 8326 8327 sch = src_dsq->sched; 8328 8329 if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags)) 8330 return false; 8331 8332 /* 8333 * If the BPF scheduler keeps calling this function repeatedly, it can 8334 * cause similar live-lock conditions as consume_dispatch_q(). 8335 */ 8336 if (unlikely(READ_ONCE(sch->aborting))) 8337 return false; 8338 8339 if (unlikely(!scx_task_on_sched(sch, p))) { 8340 scx_error(sch, "scx_bpf_dsq_move[_vtime]() on %s[%d] but the task belongs to a different scheduler", 8341 p->comm, p->pid); 8342 return false; 8343 } 8344 8345 /* 8346 * Can be called from either ops.dispatch() locking this_rq() or any 8347 * context where no rq lock is held. If latter, lock @p's task_rq which 8348 * we'll likely need anyway. 8349 */ 8350 src_rq = task_rq(p); 8351 8352 local_irq_save(flags); 8353 this_rq = this_rq(); 8354 in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; 8355 8356 if (in_balance) { 8357 if (this_rq != src_rq) { 8358 raw_spin_rq_unlock(this_rq); 8359 raw_spin_rq_lock(src_rq); 8360 } 8361 } else { 8362 raw_spin_rq_lock(src_rq); 8363 } 8364 8365 locked_rq = src_rq; 8366 raw_spin_lock(&src_dsq->lock); 8367 8368 /* did someone else get to it while we dropped the locks? */ 8369 if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) { 8370 raw_spin_unlock(&src_dsq->lock); 8371 goto out; 8372 } 8373 8374 /* @p is still on $src_dsq and stable, determine the destination */ 8375 dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, task_cpu(p)); 8376 8377 /* 8378 * Apply vtime and slice updates before moving so that the new time is 8379 * visible before inserting into $dst_dsq. @p is still on $src_dsq but 8380 * this is safe as we're locking it. 
8381 */ 8382 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) 8383 p->scx.dsq_vtime = kit->vtime; 8384 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) 8385 p->scx.slice = kit->slice; 8386 8387 /* execute move */ 8388 locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq); 8389 dispatched = true; 8390 out: 8391 if (in_balance) { 8392 if (this_rq != locked_rq) { 8393 raw_spin_rq_unlock(locked_rq); 8394 raw_spin_rq_lock(this_rq); 8395 } 8396 } else { 8397 raw_spin_rq_unlock_irqrestore(locked_rq, flags); 8398 } 8399 8400 kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | 8401 __SCX_DSQ_ITER_HAS_VTIME); 8402 return dispatched; 8403 } 8404 8405 __bpf_kfunc_start_defs(); 8406 8407 /** 8408 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots 8409 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8410 * 8411 * Can only be called from ops.dispatch(). 8412 */ 8413 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux) 8414 { 8415 struct scx_sched *sch; 8416 8417 guard(rcu)(); 8418 8419 sch = scx_prog_sched(aux); 8420 if (unlikely(!sch)) 8421 return 0; 8422 8423 return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor); 8424 } 8425 8426 /** 8427 * scx_bpf_dispatch_cancel - Cancel the latest dispatch 8428 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8429 * 8430 * Cancel the latest dispatch. Can be called multiple times to cancel further 8431 * dispatches. Can only be called from ops.dispatch(). 8432 */ 8433 __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux) 8434 { 8435 struct scx_sched *sch; 8436 struct scx_dsp_ctx *dspc; 8437 8438 guard(rcu)(); 8439 8440 sch = scx_prog_sched(aux); 8441 if (unlikely(!sch)) 8442 return; 8443 8444 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8445 8446 if (dspc->cursor > 0) 8447 dspc->cursor--; 8448 else 8449 scx_error(sch, "dispatch buffer underflow"); 8450 } 8451 8452 /** 8453 * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ 8454 * @dsq_id: DSQ to move task from. Must be a user-created DSQ 8455 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8456 * @enq_flags: %SCX_ENQ_* 8457 * 8458 * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's 8459 * local DSQ for execution with @enq_flags applied. Can only be called from 8460 * ops.dispatch(). 8461 * 8462 * Built-in DSQs (%SCX_DSQ_GLOBAL and %SCX_DSQ_LOCAL*) are not supported as 8463 * sources. Local DSQs support reenqueueing (a task can be picked up for 8464 * execution, dequeued for property changes, or reenqueued), but the BPF 8465 * scheduler cannot directly iterate or move tasks from them. %SCX_DSQ_GLOBAL 8466 * is similar but also doesn't support reenqueueing, as it maps to multiple 8467 * per-node DSQs making the scope difficult to define; this may change in the 8468 * future. 8469 * 8470 * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() 8471 * before trying to move from the specified DSQ. It may also grab rq locks and 8472 * thus can't be called under any BPF locks. 8473 * 8474 * Returns %true if a task has been moved, %false if there isn't any task to 8475 * move. 
8476 */ 8477 __bpf_kfunc bool scx_bpf_dsq_move_to_local___v2(u64 dsq_id, u64 enq_flags, 8478 const struct bpf_prog_aux *aux) 8479 { 8480 struct scx_dispatch_q *dsq; 8481 struct scx_sched *sch; 8482 struct scx_dsp_ctx *dspc; 8483 8484 guard(rcu)(); 8485 8486 sch = scx_prog_sched(aux); 8487 if (unlikely(!sch)) 8488 return false; 8489 8490 if (!scx_vet_enq_flags(sch, SCX_DSQ_LOCAL, &enq_flags)) 8491 return false; 8492 8493 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8494 8495 flush_dispatch_buf(sch, dspc->rq); 8496 8497 dsq = find_user_dsq(sch, dsq_id); 8498 if (unlikely(!dsq)) { 8499 scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id); 8500 return false; 8501 } 8502 8503 if (consume_dispatch_q(sch, dspc->rq, dsq, enq_flags)) { 8504 /* 8505 * A successfully consumed task can be dequeued before it starts 8506 * running while the CPU is trying to migrate other dispatched 8507 * tasks. Bump nr_tasks to tell balance_one() to retry on empty 8508 * local DSQ. 8509 */ 8510 dspc->nr_tasks++; 8511 return true; 8512 } else { 8513 return false; 8514 } 8515 } 8516 8517 /* 8518 * COMPAT: ___v2 was introduced in v7.1. Remove this and ___v2 tag in the future. 8519 */ 8520 __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux) 8521 { 8522 return scx_bpf_dsq_move_to_local___v2(dsq_id, 0, aux); 8523 } 8524 8525 /** 8526 * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs 8527 * @it__iter: DSQ iterator in progress 8528 * @slice: duration the moved task can run for in nsecs 8529 * 8530 * Override the slice of the next task that will be moved from @it__iter using 8531 * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous 8532 * slice duration is kept. 8533 */ 8534 __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, 8535 u64 slice) 8536 { 8537 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 8538 8539 kit->slice = slice; 8540 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; 8541 } 8542 8543 /** 8544 * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs 8545 * @it__iter: DSQ iterator in progress 8546 * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ 8547 * 8548 * Override the vtime of the next task that will be moved from @it__iter using 8549 * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice 8550 * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the 8551 * override is ignored and cleared. 8552 */ 8553 __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, 8554 u64 vtime) 8555 { 8556 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 8557 8558 kit->vtime = vtime; 8559 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; 8560 } 8561 8562 /** 8563 * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ 8564 * @it__iter: DSQ iterator in progress 8565 * @p: task to transfer 8566 * @dsq_id: DSQ to move @p to 8567 * @enq_flags: SCX_ENQ_* 8568 * 8569 * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ 8570 * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can 8571 * be the destination. 8572 * 8573 * For the transfer to be successful, @p must still be on the DSQ and have been 8574 * queued before the DSQ iteration started. This function doesn't care whether 8575 * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have 8576 * been queued before the iteration started. 8577 * 8578 * @p's slice is kept by default. 
Use scx_bpf_dsq_move_set_slice() to update. 8579 * 8580 * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq 8581 * lock (e.g. BPF timers or SYSCALL programs). 8582 * 8583 * Returns %true if @p has been consumed, %false if @p had already been 8584 * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local 8585 * DSQ. 8586 */ 8587 __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, 8588 struct task_struct *p, u64 dsq_id, 8589 u64 enq_flags) 8590 { 8591 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 8592 p, dsq_id, enq_flags); 8593 } 8594 8595 /** 8596 * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ 8597 * @it__iter: DSQ iterator in progress 8598 * @p: task to transfer 8599 * @dsq_id: DSQ to move @p to 8600 * @enq_flags: SCX_ENQ_* 8601 * 8602 * Transfer @p which is on the DSQ currently iterated by @it__iter to the 8603 * priority queue of the DSQ specified by @dsq_id. The destination must be a 8604 * user DSQ as only user DSQs support priority queue. 8605 * 8606 * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice() 8607 * and scx_bpf_dsq_move_set_vtime() to update. 8608 * 8609 * All other aspects are identical to scx_bpf_dsq_move(). See 8610 * scx_bpf_dsq_insert_vtime() for more information on @vtime. 8611 */ 8612 __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, 8613 struct task_struct *p, u64 dsq_id, 8614 u64 enq_flags) 8615 { 8616 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 8617 p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 8618 } 8619 8620 #ifdef CONFIG_EXT_SUB_SCHED 8621 /** 8622 * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler 8623 * @cgroup_id: cgroup ID of the child scheduler to dispatch 8624 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8625 * 8626 * Allows a parent scheduler to trigger dispatching on one of its direct 8627 * child schedulers. The child scheduler runs its dispatch operation to 8628 * move tasks from dispatch queues to the local runqueue. 8629 * 8630 * Returns: true on success, false if cgroup_id is invalid, not a direct 8631 * child, or caller lacks dispatch permission. 
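 *
 * Illustrative sketch of a parent delegating from its own ops.dispatch()
 * (child_cgid is assumed to be the cgroup ID of a directly attached
 * sub-scheduler; BPF_STRUCT_OPS() comes from the scx BPF headers):
 *
 *	void BPF_STRUCT_OPS(parent_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		if (scx_bpf_sub_dispatch(child_cgid))
 *			return;
 *
 *		... fall back to the parent's own dispatching ...
 *	}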
8632 */ 8633 __bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux) 8634 { 8635 struct rq *this_rq = this_rq(); 8636 struct scx_sched *parent, *child; 8637 8638 guard(rcu)(); 8639 parent = scx_prog_sched(aux); 8640 if (unlikely(!parent)) 8641 return false; 8642 8643 child = scx_find_sub_sched(cgroup_id); 8644 8645 if (unlikely(!child)) 8646 return false; 8647 8648 if (unlikely(scx_parent(child) != parent)) { 8649 scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu", 8650 cgroup_id); 8651 return false; 8652 } 8653 8654 return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev, 8655 true); 8656 } 8657 #endif /* CONFIG_EXT_SUB_SCHED */ 8658 8659 __bpf_kfunc_end_defs(); 8660 8661 BTF_KFUNCS_START(scx_kfunc_ids_dispatch) 8662 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots, KF_IMPLICIT_ARGS) 8663 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel, KF_IMPLICIT_ARGS) 8664 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local, KF_IMPLICIT_ARGS) 8665 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local___v2, KF_IMPLICIT_ARGS) 8666 /* scx_bpf_dsq_move*() also in scx_kfunc_ids_unlocked: callable from unlocked contexts */ 8667 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 8668 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 8669 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 8670 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 8671 #ifdef CONFIG_EXT_SUB_SCHED 8672 BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS) 8673 #endif 8674 BTF_KFUNCS_END(scx_kfunc_ids_dispatch) 8675 8676 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { 8677 .owner = THIS_MODULE, 8678 .set = &scx_kfunc_ids_dispatch, 8679 .filter = scx_kfunc_context_filter, 8680 }; 8681 8682 __bpf_kfunc_start_defs(); 8683 8684 /** 8685 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 8686 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8687 * 8688 * Iterate over all of the tasks currently enqueued on the local DSQ of the 8689 * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of 8690 * processed tasks. Can only be called from ops.cpu_release(). 8691 */ 8692 __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux) 8693 { 8694 struct scx_sched *sch; 8695 struct rq *rq; 8696 8697 guard(rcu)(); 8698 sch = scx_prog_sched(aux); 8699 if (unlikely(!sch)) 8700 return 0; 8701 8702 rq = cpu_rq(smp_processor_id()); 8703 lockdep_assert_rq_held(rq); 8704 8705 return reenq_local(sch, rq, SCX_REENQ_ANY); 8706 } 8707 8708 __bpf_kfunc_end_defs(); 8709 8710 BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) 8711 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local, KF_IMPLICIT_ARGS) 8712 BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) 8713 8714 static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { 8715 .owner = THIS_MODULE, 8716 .set = &scx_kfunc_ids_cpu_release, 8717 .filter = scx_kfunc_context_filter, 8718 }; 8719 8720 __bpf_kfunc_start_defs(); 8721 8722 /** 8723 * scx_bpf_create_dsq - Create a custom DSQ 8724 * @dsq_id: DSQ to create 8725 * @node: NUMA node to allocate from 8726 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8727 * 8728 * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable 8729 * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. 
8730 */ 8731 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_aux *aux) 8732 { 8733 struct scx_dispatch_q *dsq; 8734 struct scx_sched *sch; 8735 s32 ret; 8736 8737 if (unlikely(node >= (int)nr_node_ids || 8738 (node < 0 && node != NUMA_NO_NODE))) 8739 return -EINVAL; 8740 8741 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) 8742 return -EINVAL; 8743 8744 dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); 8745 if (!dsq) 8746 return -ENOMEM; 8747 8748 /* 8749 * init_dsq() must be called in GFP_KERNEL context. Init it with NULL 8750 * @sch and update afterwards. 8751 */ 8752 ret = init_dsq(dsq, dsq_id, NULL); 8753 if (ret) { 8754 kfree(dsq); 8755 return ret; 8756 } 8757 8758 rcu_read_lock(); 8759 8760 sch = scx_prog_sched(aux); 8761 if (sch) { 8762 dsq->sched = sch; 8763 ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, 8764 dsq_hash_params); 8765 } else { 8766 ret = -ENODEV; 8767 } 8768 8769 rcu_read_unlock(); 8770 if (ret) { 8771 exit_dsq(dsq); 8772 kfree(dsq); 8773 } 8774 return ret; 8775 } 8776 8777 __bpf_kfunc_end_defs(); 8778 8779 BTF_KFUNCS_START(scx_kfunc_ids_unlocked) 8780 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_IMPLICIT_ARGS | KF_SLEEPABLE) 8781 /* also in scx_kfunc_ids_dispatch: also callable from ops.dispatch() */ 8782 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 8783 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 8784 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 8785 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 8786 /* also in scx_kfunc_ids_select_cpu: also callable from ops.select_cpu()/ops.enqueue() */ 8787 BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) 8788 BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) 8789 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) 8790 BTF_KFUNCS_END(scx_kfunc_ids_unlocked) 8791 8792 static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { 8793 .owner = THIS_MODULE, 8794 .set = &scx_kfunc_ids_unlocked, 8795 .filter = scx_kfunc_context_filter, 8796 }; 8797 8798 __bpf_kfunc_start_defs(); 8799 8800 /** 8801 * scx_bpf_task_set_slice - Set task's time slice 8802 * @p: task of interest 8803 * @slice: time slice to set in nsecs 8804 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8805 * 8806 * Set @p's time slice to @slice. Returns %true on success, %false if the 8807 * calling scheduler doesn't have authority over @p. 8808 */ 8809 __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, 8810 const struct bpf_prog_aux *aux) 8811 { 8812 struct scx_sched *sch; 8813 8814 guard(rcu)(); 8815 sch = scx_prog_sched(aux); 8816 if (unlikely(!sch || !scx_task_on_sched(sch, p))) 8817 return false; 8818 8819 p->scx.slice = slice; 8820 return true; 8821 } 8822 8823 /** 8824 * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering 8825 * @p: task of interest 8826 * @vtime: virtual time to set 8827 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8828 * 8829 * Set @p's virtual time to @vtime. Returns %true on success, %false if the 8830 * calling scheduler doesn't have authority over @p. 
8831 */ 8832 __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, 8833 const struct bpf_prog_aux *aux) 8834 { 8835 struct scx_sched *sch; 8836 8837 guard(rcu)(); 8838 sch = scx_prog_sched(aux); 8839 if (unlikely(!sch || !scx_task_on_sched(sch, p))) 8840 return false; 8841 8842 p->scx.dsq_vtime = vtime; 8843 return true; 8844 } 8845 8846 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) 8847 { 8848 struct rq *this_rq; 8849 unsigned long irq_flags; 8850 8851 if (!ops_cpu_valid(sch, cpu, NULL)) 8852 return; 8853 8854 local_irq_save(irq_flags); 8855 8856 this_rq = this_rq(); 8857 8858 /* 8859 * While bypassing for PM ops, IRQ handling may not be online which can 8860 * lead to irq_work_queue() malfunction such as infinite busy wait for 8861 * IRQ status update. Suppress kicking. 8862 */ 8863 if (scx_bypassing(sch, cpu_of(this_rq))) 8864 goto out; 8865 8866 /* 8867 * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting 8868 * rq locks. We can probably be smarter and avoid bouncing if called 8869 * from ops which don't hold a rq lock. 8870 */ 8871 if (flags & SCX_KICK_IDLE) { 8872 struct rq *target_rq = cpu_rq(cpu); 8873 8874 if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) 8875 scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); 8876 8877 if (raw_spin_rq_trylock(target_rq)) { 8878 if (can_skip_idle_kick(target_rq)) { 8879 raw_spin_rq_unlock(target_rq); 8880 goto out; 8881 } 8882 raw_spin_rq_unlock(target_rq); 8883 } 8884 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); 8885 } else { 8886 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); 8887 8888 if (flags & SCX_KICK_PREEMPT) 8889 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); 8890 if (flags & SCX_KICK_WAIT) 8891 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); 8892 } 8893 8894 irq_work_queue(&this_rq->scx.kick_cpus_irq_work); 8895 out: 8896 local_irq_restore(irq_flags); 8897 } 8898 8899 /** 8900 * scx_bpf_kick_cpu - Trigger reschedule on a CPU 8901 * @cpu: cpu to kick 8902 * @flags: %SCX_KICK_* flags 8903 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8904 * 8905 * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or 8906 * trigger rescheduling on a busy CPU. This can be called from any online 8907 * scx_ops operation and the actual kicking is performed asynchronously through 8908 * an irq work. 8909 */ 8910 __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux *aux) 8911 { 8912 struct scx_sched *sch; 8913 8914 guard(rcu)(); 8915 sch = scx_prog_sched(aux); 8916 if (likely(sch)) 8917 scx_kick_cpu(sch, cpu, flags); 8918 } 8919 8920 /** 8921 * scx_bpf_dsq_nr_queued - Return the number of queued tasks 8922 * @dsq_id: id of the DSQ 8923 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8924 * 8925 * Return the number of tasks in the DSQ matching @dsq_id. If not found, 8926 * -%ENOENT is returned. 
8927 */ 8928 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux) 8929 { 8930 struct scx_sched *sch; 8931 struct scx_dispatch_q *dsq; 8932 s32 ret; 8933 8934 preempt_disable(); 8935 8936 sch = scx_prog_sched(aux); 8937 if (unlikely(!sch)) { 8938 ret = -ENODEV; 8939 goto out; 8940 } 8941 8942 if (dsq_id == SCX_DSQ_LOCAL) { 8943 ret = READ_ONCE(this_rq()->scx.local_dsq.nr); 8944 goto out; 8945 } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 8946 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 8947 8948 if (ops_cpu_valid(sch, cpu, NULL)) { 8949 ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); 8950 goto out; 8951 } 8952 } else { 8953 dsq = find_user_dsq(sch, dsq_id); 8954 if (dsq) { 8955 ret = READ_ONCE(dsq->nr); 8956 goto out; 8957 } 8958 } 8959 ret = -ENOENT; 8960 out: 8961 preempt_enable(); 8962 return ret; 8963 } 8964 8965 /** 8966 * scx_bpf_destroy_dsq - Destroy a custom DSQ 8967 * @dsq_id: DSQ to destroy 8968 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8969 * 8970 * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with 8971 * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is 8972 * empty and no further tasks are dispatched to it. Ignored if called on a DSQ 8973 * which doesn't exist. Can be called from any online scx_ops operations. 8974 */ 8975 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux) 8976 { 8977 struct scx_sched *sch; 8978 8979 guard(rcu)(); 8980 sch = scx_prog_sched(aux); 8981 if (sch) 8982 destroy_dsq(sch, dsq_id); 8983 } 8984 8985 /** 8986 * bpf_iter_scx_dsq_new - Create a DSQ iterator 8987 * @it: iterator to initialize 8988 * @dsq_id: DSQ to iterate 8989 * @flags: %SCX_DSQ_ITER_* 8990 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8991 * 8992 * Initialize BPF iterator @it which can be used with bpf_for_each() to walk 8993 * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes 8994 * tasks which are already queued when this function is invoked. 8995 */ 8996 __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, 8997 u64 flags, const struct bpf_prog_aux *aux) 8998 { 8999 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9000 struct scx_sched *sch; 9001 9002 BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > 9003 sizeof(struct bpf_iter_scx_dsq)); 9004 BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != 9005 __alignof__(struct bpf_iter_scx_dsq)); 9006 BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & 9007 ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); 9008 9009 /* 9010 * next() and destroy() will be called regardless of the return value. 9011 * Always clear $kit->dsq. 9012 */ 9013 kit->dsq = NULL; 9014 9015 sch = scx_prog_sched(aux); 9016 if (unlikely(!sch)) 9017 return -ENODEV; 9018 9019 if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) 9020 return -EINVAL; 9021 9022 kit->dsq = find_user_dsq(sch, dsq_id); 9023 if (!kit->dsq) 9024 return -ENOENT; 9025 9026 kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags); 9027 9028 return 0; 9029 } 9030 9031 /** 9032 * bpf_iter_scx_dsq_next - Progress a DSQ iterator 9033 * @it: iterator to progress 9034 * 9035 * Return the next task. See bpf_iter_scx_dsq_new(). 
9036 */
9037 __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
9038 {
9039 struct bpf_iter_scx_dsq_kern *kit = (void *)it;
9040 
9041 if (!kit->dsq)
9042 return NULL;
9043 
9044 guard(raw_spinlock_irqsave)(&kit->dsq->lock);
9045 
9046 return nldsq_cursor_next_task(&kit->cursor, kit->dsq);
9047 }
9048 
9049 /**
9050 * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
9051 * @it: iterator to destroy
9052 *
9053 * Undo bpf_iter_scx_dsq_new().
9054 */
9055 __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
9056 {
9057 struct bpf_iter_scx_dsq_kern *kit = (void *)it;
9058 
9059 if (!kit->dsq)
9060 return;
9061 
9062 if (!list_empty(&kit->cursor.node)) {
9063 unsigned long flags;
9064 
9065 raw_spin_lock_irqsave(&kit->dsq->lock, flags);
9066 list_del_init(&kit->cursor.node);
9067 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
9068 }
9069 kit->dsq = NULL;
9070 }
9071 
9072 /**
9073 * scx_bpf_dsq_peek - Lockless peek at the first element.
9074 * @dsq_id: DSQ to examine.
9075 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
9076 *
9077 * Read the first element in the DSQ. This is semantically equivalent to using
9078 * the DSQ iterator, but is lock-free. Of course, like any lockless operation,
9079 * this provides only a point-in-time snapshot, and the contents may change
9080 * by the time any subsequent locking operation reads the queue.
9081 *
9082 * Returns the first task, or NULL if the DSQ is empty or an internal error occurred.
9083 */
9084 __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id,
9085 const struct bpf_prog_aux *aux)
9086 {
9087 struct scx_sched *sch;
9088 struct scx_dispatch_q *dsq;
9089 
9090 sch = scx_prog_sched(aux);
9091 if (unlikely(!sch))
9092 return NULL;
9093 
9094 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) {
9095 scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id);
9096 return NULL;
9097 }
9098 
9099 dsq = find_user_dsq(sch, dsq_id);
9100 if (unlikely(!dsq)) {
9101 scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id);
9102 return NULL;
9103 }
9104 
9105 return rcu_dereference(dsq->first_task);
9106 }
9107 
9108 /**
9109 * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ
9110 * @dsq_id: DSQ to re-enqueue
9111 * @reenq_flags: %SCX_REENQ_*
9112 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
9113 *
9114 * Iterate over all of the tasks currently enqueued on the DSQ identified by
9115 * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are
9116 * supported:
9117 *
9118 * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu)
9119 * - User DSQs
9120 *
9121 * Re-enqueues are performed asynchronously. Can be called from anywhere.
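 *
 * A sketch only, assuming the scx tooling declarations (e.g. the
 * BPF_STRUCT_OPS() macro from scx/common.bpf.h) with illustrative mysched_*
 * naming; a typical use is handing the releasing CPU's locally queued tasks
 * back to the BPF scheduler from ops.cpu_release(), where passing 0 for
 * @reenq_flags behaves as %SCX_REENQ_ANY:
 *
 *	void BPF_STRUCT_OPS(mysched_cpu_release, s32 cpu,
 *			    struct scx_cpu_release_args *args)
 *	{
 *		// A higher priority sched class took over @cpu. Give the
 *		// tasks sitting on its local DSQ back to ops.enqueue().
 *		scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0);
 *	}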
9122 */ 9123 __bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags, 9124 const struct bpf_prog_aux *aux) 9125 { 9126 struct scx_sched *sch; 9127 struct scx_dispatch_q *dsq; 9128 9129 guard(preempt)(); 9130 9131 sch = scx_prog_sched(aux); 9132 if (unlikely(!sch)) 9133 return; 9134 9135 if (unlikely(reenq_flags & ~__SCX_REENQ_USER_MASK)) { 9136 scx_error(sch, "invalid SCX_REENQ flags 0x%llx", reenq_flags); 9137 return; 9138 } 9139 9140 /* not specifying any filter bits is the same as %SCX_REENQ_ANY */ 9141 if (!(reenq_flags & __SCX_REENQ_FILTER_MASK)) 9142 reenq_flags |= SCX_REENQ_ANY; 9143 9144 dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id()); 9145 schedule_dsq_reenq(sch, dsq, reenq_flags, scx_locked_rq()); 9146 } 9147 9148 /** 9149 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 9150 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9151 * 9152 * Iterate over all of the tasks currently enqueued on the local DSQ of the 9153 * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from 9154 * anywhere. 9155 * 9156 * This is now a special case of scx_bpf_dsq_reenq() and may be removed in the 9157 * future. 9158 */ 9159 __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux) 9160 { 9161 scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0, aux); 9162 } 9163 9164 __bpf_kfunc_end_defs(); 9165 9166 static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, 9167 size_t line_size, char *fmt, unsigned long long *data, 9168 u32 data__sz) 9169 { 9170 struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; 9171 s32 ret; 9172 9173 if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || 9174 (data__sz && !data)) { 9175 scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); 9176 return -EINVAL; 9177 } 9178 9179 ret = copy_from_kernel_nofault(data_buf, data, data__sz); 9180 if (ret < 0) { 9181 scx_error(sch, "failed to read data fields (%d)", ret); 9182 return ret; 9183 } 9184 9185 ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, 9186 &bprintf_data); 9187 if (ret < 0) { 9188 scx_error(sch, "format preparation failed (%d)", ret); 9189 return ret; 9190 } 9191 9192 ret = bstr_printf(line_buf, line_size, fmt, 9193 bprintf_data.bin_args); 9194 bpf_bprintf_cleanup(&bprintf_data); 9195 if (ret < 0) { 9196 scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); 9197 return ret; 9198 } 9199 9200 return ret; 9201 } 9202 9203 static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, 9204 char *fmt, unsigned long long *data, u32 data__sz) 9205 { 9206 return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), 9207 fmt, data, data__sz); 9208 } 9209 9210 __bpf_kfunc_start_defs(); 9211 9212 /** 9213 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. 9214 * @exit_code: Exit value to pass to user space via struct scx_exit_info. 9215 * @fmt: error message format string 9216 * @data: format string parameters packaged using ___bpf_fill() macro 9217 * @data__sz: @data len, must end in '__sz' for the verifier 9218 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9219 * 9220 * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops 9221 * disabling. 
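 *
 * BPF schedulers normally reach this kfunc through a printf-like wrapper
 * rather than calling it directly; the scx tooling headers (e.g.
 * scx/common.bpf.h) provide scx_bpf_exit(), which packages the varargs into
 * @data/@data__sz via ___bpf_fill(). A sketch with a purely illustrative
 * exit code, message and counter:
 *
 *	scx_bpf_exit(1, "shutting down: %u consecutive dispatch failures",
 *		     nr_failures);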
9222 */ 9223 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, 9224 unsigned long long *data, u32 data__sz, 9225 const struct bpf_prog_aux *aux) 9226 { 9227 struct scx_sched *sch; 9228 unsigned long flags; 9229 9230 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 9231 sch = scx_prog_sched(aux); 9232 if (likely(sch) && 9233 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 9234 scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); 9235 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 9236 } 9237 9238 /** 9239 * scx_bpf_error_bstr - Indicate fatal error 9240 * @fmt: error message format string 9241 * @data: format string parameters packaged using ___bpf_fill() macro 9242 * @data__sz: @data len, must end in '__sz' for the verifier 9243 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9244 * 9245 * Indicate that the BPF scheduler encountered a fatal error and initiate ops 9246 * disabling. 9247 */ 9248 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, 9249 u32 data__sz, const struct bpf_prog_aux *aux) 9250 { 9251 struct scx_sched *sch; 9252 unsigned long flags; 9253 9254 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 9255 sch = scx_prog_sched(aux); 9256 if (likely(sch) && 9257 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 9258 scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); 9259 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 9260 } 9261 9262 /** 9263 * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler 9264 * @fmt: format string 9265 * @data: format string parameters packaged using ___bpf_fill() macro 9266 * @data__sz: @data len, must end in '__sz' for the verifier 9267 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9268 * 9269 * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and 9270 * dump_task() to generate extra debug dump specific to the BPF scheduler. 9271 * 9272 * The extra dump may be multiple lines. A single line may be split over 9273 * multiple calls. The last line is automatically terminated. 9274 */ 9275 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, 9276 u32 data__sz, const struct bpf_prog_aux *aux) 9277 { 9278 struct scx_sched *sch; 9279 struct scx_dump_data *dd = &scx_dump_data; 9280 struct scx_bstr_buf *buf = &dd->buf; 9281 s32 ret; 9282 9283 guard(rcu)(); 9284 9285 sch = scx_prog_sched(aux); 9286 if (unlikely(!sch)) 9287 return; 9288 9289 if (raw_smp_processor_id() != dd->cpu) { 9290 scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends"); 9291 return; 9292 } 9293 9294 /* append the formatted string to the line buf */ 9295 ret = __bstr_format(sch, buf->data, buf->line + dd->cursor, 9296 sizeof(buf->line) - dd->cursor, fmt, data, data__sz); 9297 if (ret < 0) { 9298 dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", 9299 dd->prefix, fmt, data, data__sz, ret); 9300 return; 9301 } 9302 9303 dd->cursor += ret; 9304 dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); 9305 9306 if (!dd->cursor) 9307 return; 9308 9309 /* 9310 * If the line buf overflowed or ends in a newline, flush it into the 9311 * dump. This is to allow the caller to generate a single line over 9312 * multiple calls. 
As ops_dump_flush() can also handle multiple lines in 9313 * the line buf, the only case which can lead to an unexpected 9314 * truncation is when the caller keeps generating newlines in the middle 9315 * instead of the end consecutively. Don't do that. 9316 */ 9317 if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') 9318 ops_dump_flush(); 9319 } 9320 9321 /** 9322 * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU 9323 * @cpu: CPU of interest 9324 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9325 * 9326 * Return the maximum relative capacity of @cpu in relation to the most 9327 * performant CPU in the system. The return value is in the range [1, 9328 * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). 9329 */ 9330 __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux) 9331 { 9332 struct scx_sched *sch; 9333 9334 guard(rcu)(); 9335 9336 sch = scx_prog_sched(aux); 9337 if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) 9338 return arch_scale_cpu_capacity(cpu); 9339 else 9340 return SCX_CPUPERF_ONE; 9341 } 9342 9343 /** 9344 * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU 9345 * @cpu: CPU of interest 9346 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9347 * 9348 * Return the current relative performance of @cpu in relation to its maximum. 9349 * The return value is in the range [1, %SCX_CPUPERF_ONE]. 9350 * 9351 * The current performance level of a CPU in relation to the maximum performance 9352 * available in the system can be calculated as follows: 9353 * 9354 * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE 9355 * 9356 * The result is in the range [1, %SCX_CPUPERF_ONE]. 9357 */ 9358 __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux) 9359 { 9360 struct scx_sched *sch; 9361 9362 guard(rcu)(); 9363 9364 sch = scx_prog_sched(aux); 9365 if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) 9366 return arch_scale_freq_capacity(cpu); 9367 else 9368 return SCX_CPUPERF_ONE; 9369 } 9370 9371 /** 9372 * scx_bpf_cpuperf_set - Set the relative performance target of a CPU 9373 * @cpu: CPU of interest 9374 * @perf: target performance level [0, %SCX_CPUPERF_ONE] 9375 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9376 * 9377 * Set the target performance level of @cpu to @perf. @perf is in linear 9378 * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the 9379 * schedutil cpufreq governor chooses the target frequency. 9380 * 9381 * The actual performance level chosen, CPU grouping, and the overhead and 9382 * latency of the operations are dependent on the hardware and cpufreq driver in 9383 * use. Consult hardware and cpufreq documentation for more information. The 9384 * current performance level can be monitored using scx_bpf_cpuperf_cur(). 9385 */ 9386 __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_aux *aux) 9387 { 9388 struct scx_sched *sch; 9389 9390 guard(rcu)(); 9391 9392 sch = scx_prog_sched(aux); 9393 if (unlikely(!sch)) 9394 return; 9395 9396 if (unlikely(perf > SCX_CPUPERF_ONE)) { 9397 scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu); 9398 return; 9399 } 9400 9401 if (ops_cpu_valid(sch, cpu, NULL)) { 9402 struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); 9403 struct rq_flags rf; 9404 9405 /* 9406 * When called with an rq lock held, restrict the operation 9407 * to the corresponding CPU to prevent ABBA deadlocks. 
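 * Without this restriction, an op holding CPU0's rq lock and targeting
 * CPU1 could race an op holding CPU1's rq lock and targeting CPU0, with
 * each side blocking on the rq lock the other already holds.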
9408 */ 9409 if (locked_rq && rq != locked_rq) { 9410 scx_error(sch, "Invalid target CPU %d", cpu); 9411 return; 9412 } 9413 9414 /* 9415 * If no rq lock is held, allow to operate on any CPU by 9416 * acquiring the corresponding rq lock. 9417 */ 9418 if (!locked_rq) { 9419 rq_lock_irqsave(rq, &rf); 9420 update_rq_clock(rq); 9421 } 9422 9423 rq->scx.cpuperf_target = perf; 9424 cpufreq_update_util(rq, 0); 9425 9426 if (!locked_rq) 9427 rq_unlock_irqrestore(rq, &rf); 9428 } 9429 } 9430 9431 /** 9432 * scx_bpf_nr_node_ids - Return the number of possible node IDs 9433 * 9434 * All valid node IDs in the system are smaller than the returned value. 9435 */ 9436 __bpf_kfunc u32 scx_bpf_nr_node_ids(void) 9437 { 9438 return nr_node_ids; 9439 } 9440 9441 /** 9442 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs 9443 * 9444 * All valid CPU IDs in the system are smaller than the returned value. 9445 */ 9446 __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) 9447 { 9448 return nr_cpu_ids; 9449 } 9450 9451 /** 9452 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask 9453 */ 9454 __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) 9455 { 9456 return cpu_possible_mask; 9457 } 9458 9459 /** 9460 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask 9461 */ 9462 __bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void) 9463 { 9464 return cpu_online_mask; 9465 } 9466 9467 /** 9468 * scx_bpf_put_cpumask - Release a possible/online cpumask 9469 * @cpumask: cpumask to release 9470 */ 9471 __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) 9472 { 9473 /* 9474 * Empty function body because we aren't actually acquiring or releasing 9475 * a reference to a global cpumask, which is read-only in the caller and 9476 * is never released. The acquire / release semantics here are just used 9477 * to make the cpumask is a trusted pointer in the caller. 9478 */ 9479 } 9480 9481 /** 9482 * scx_bpf_task_running - Is task currently running? 9483 * @p: task of interest 9484 */ 9485 __bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p) 9486 { 9487 return task_rq(p)->curr == p; 9488 } 9489 9490 /** 9491 * scx_bpf_task_cpu - CPU a task is currently associated with 9492 * @p: task of interest 9493 */ 9494 __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) 9495 { 9496 return task_cpu(p); 9497 } 9498 9499 /** 9500 * scx_bpf_cpu_rq - Fetch the rq of a CPU 9501 * @cpu: CPU of the rq 9502 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9503 */ 9504 __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux) 9505 { 9506 struct scx_sched *sch; 9507 9508 guard(rcu)(); 9509 9510 sch = scx_prog_sched(aux); 9511 if (unlikely(!sch)) 9512 return NULL; 9513 9514 if (!ops_cpu_valid(sch, cpu, NULL)) 9515 return NULL; 9516 9517 if (!sch->warned_deprecated_rq) { 9518 printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; " 9519 "use scx_bpf_locked_rq() when holding rq lock " 9520 "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__); 9521 sch->warned_deprecated_rq = true; 9522 } 9523 9524 return cpu_rq(cpu); 9525 } 9526 9527 /** 9528 * scx_bpf_locked_rq - Return the rq currently locked by SCX 9529 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9530 * 9531 * Returns the rq if a rq lock is currently held by SCX. 9532 * Otherwise emits an error and returns NULL. 
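 *
 * A sketch of the replacement pattern for the deprecated scx_bpf_cpu_rq()
 * above, assuming the scx tooling declarations: ops invoked with an rq lock
 * held, e.g. ops.tick(), can inspect the locked rq directly, while a remote
 * CPU's current task should instead be read with scx_bpf_cpu_curr() under
 * RCU:
 *
 *	struct rq *rq = scx_bpf_locked_rq();
 *
 *	if (rq)
 *		bpf_printk("local nr_running=%u", rq->nr_running);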
9533 */ 9534 __bpf_kfunc struct rq *scx_bpf_locked_rq(const struct bpf_prog_aux *aux) 9535 { 9536 struct scx_sched *sch; 9537 struct rq *rq; 9538 9539 guard(preempt)(); 9540 9541 sch = scx_prog_sched(aux); 9542 if (unlikely(!sch)) 9543 return NULL; 9544 9545 rq = scx_locked_rq(); 9546 if (!rq) { 9547 scx_error(sch, "accessing rq without holding rq lock"); 9548 return NULL; 9549 } 9550 9551 return rq; 9552 } 9553 9554 /** 9555 * scx_bpf_cpu_curr - Return remote CPU's curr task 9556 * @cpu: CPU of interest 9557 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9558 * 9559 * Callers must hold RCU read lock (KF_RCU). 9560 */ 9561 __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_aux *aux) 9562 { 9563 struct scx_sched *sch; 9564 9565 guard(rcu)(); 9566 9567 sch = scx_prog_sched(aux); 9568 if (unlikely(!sch)) 9569 return NULL; 9570 9571 if (!ops_cpu_valid(sch, cpu, NULL)) 9572 return NULL; 9573 9574 return rcu_dereference(cpu_rq(cpu)->curr); 9575 } 9576 9577 /** 9578 * scx_bpf_now - Returns a high-performance monotonically non-decreasing 9579 * clock for the current CPU. The clock returned is in nanoseconds. 9580 * 9581 * It provides the following properties: 9582 * 9583 * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently 9584 * to account for execution time and track tasks' runtime properties. 9585 * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which 9586 * eventually reads a hardware timestamp counter -- is neither performant nor 9587 * scalable. scx_bpf_now() aims to provide a high-performance clock by 9588 * using the rq clock in the scheduler core whenever possible. 9589 * 9590 * 2) High enough resolution for the BPF scheduler use cases: In most BPF 9591 * scheduler use cases, the required clock resolution is lower than the most 9592 * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically 9593 * uses the rq clock in the scheduler core whenever it is valid. It considers 9594 * that the rq clock is valid from the time the rq clock is updated 9595 * (update_rq_clock) until the rq is unlocked (rq_unpin_lock). 9596 * 9597 * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now() 9598 * guarantees the clock never goes backward when comparing them in the same 9599 * CPU. On the other hand, when comparing clocks in different CPUs, there 9600 * is no such guarantee -- the clock can go backward. It provides a 9601 * monotonically *non-decreasing* clock so that it would provide the same 9602 * clock values in two different scx_bpf_now() calls in the same CPU 9603 * during the same period of when the rq clock is valid. 9604 */ 9605 __bpf_kfunc u64 scx_bpf_now(void) 9606 { 9607 struct rq *rq; 9608 u64 clock; 9609 9610 preempt_disable(); 9611 9612 rq = this_rq(); 9613 if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) { 9614 /* 9615 * If the rq clock is valid, use the cached rq clock. 9616 * 9617 * Note that scx_bpf_now() is re-entrant between a process 9618 * context and an interrupt context (e.g., timer interrupt). 9619 * However, we don't need to consider the race between them 9620 * because such race is not observable from a caller. 9621 */ 9622 clock = READ_ONCE(rq->scx.clock); 9623 } else { 9624 /* 9625 * Otherwise, return a fresh rq clock. 9626 * 9627 * The rq clock is updated outside of the rq lock. 9628 * In this case, keep the updated rq clock invalid so the next 9629 * kfunc call outside the rq lock gets a fresh rq clock. 
9630 */
9631 clock = sched_clock_cpu(cpu_of(rq));
9632 }
9633 
9634 preempt_enable();
9635 
9636 return clock;
9637 }
9638 
9639 static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
9640 {
9641 struct scx_event_stats *e_cpu;
9642 int cpu;
9643 
9644 /* Aggregate per-CPU event counters into @events. */
9645 memset(events, 0, sizeof(*events));
9646 for_each_possible_cpu(cpu) {
9647 e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;
9648 scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
9649 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
9650 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
9651 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
9652 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
9653 scx_agg_event(events, e_cpu, SCX_EV_REENQ_IMMED);
9654 scx_agg_event(events, e_cpu, SCX_EV_REENQ_LOCAL_REPEAT);
9655 scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL);
9656 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
9657 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
9658 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
9659 scx_agg_event(events, e_cpu, SCX_EV_INSERT_NOT_OWNED);
9660 scx_agg_event(events, e_cpu, SCX_EV_SUB_BYPASS_DISPATCH);
9661 }
9662 }
9663 
9664 /**
9665 * scx_bpf_events - Copy the system-wide event counters to @events
9666 * @events: output buffer from a BPF program
9667 * @events__sz: @events len, must end in '__sz' for the verifier
9668 */
9669 __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
9670 size_t events__sz)
9671 {
9672 struct scx_sched *sch;
9673 struct scx_event_stats e_sys;
9674 
9675 rcu_read_lock();
9676 sch = rcu_dereference(scx_root);
9677 if (sch)
9678 scx_read_events(sch, &e_sys);
9679 else
9680 memset(&e_sys, 0, sizeof(e_sys));
9681 rcu_read_unlock();
9682 
9683 /*
9684 * We cannot entirely trust a BPF-provided size since a BPF program
9685 * might be compiled against a different vmlinux.h, in which
9686 * scx_event_stats may be larger (a newer vmlinux.h) or smaller (an
9687 * older vmlinux.h). Hence, use the smaller of the two sizes to avoid
9688 * memory corruption.
9689 */
9690 events__sz = min(events__sz, sizeof(*events));
9691 memcpy(events, &e_sys, events__sz);
9692 }
9693 
9694 #ifdef CONFIG_CGROUP_SCHED
9695 /**
9696 * scx_bpf_task_cgroup - Return the sched cgroup of a task
9697 * @p: task of interest
9698 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
9699 *
9700 * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
9701 * from the scheduler's POV. SCX operations should use this function to
9702 * determine @p's current cgroup as, unlike following @p->cgroups,
9703 * @p->sched_task_group is stable for the duration of the SCX op. See
9704 * SCX_CALL_OP_TASK() for details.
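 *
 * The returned cgroup is acquired (%KF_ACQUIRE) and must be released with
 * bpf_cgroup_release(). A sketch, assuming the scx tooling declarations,
 * that keys per-cgroup state off the cgroup ID from an op which received
 * @p, e.g. ops.enqueue():
 *
 *	struct cgroup *cgrp = scx_bpf_task_cgroup(p);
 *	u64 cgid = cgrp->kn->id;
 *
 *	bpf_cgroup_release(cgrp);
 *	// ... look up this scheduler's per-cgroup state keyed by cgid ...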
9705 */ 9706 __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p, 9707 const struct bpf_prog_aux *aux) 9708 { 9709 struct task_group *tg = p->sched_task_group; 9710 struct cgroup *cgrp = &cgrp_dfl_root.cgrp; 9711 struct scx_sched *sch; 9712 9713 guard(rcu)(); 9714 9715 sch = scx_prog_sched(aux); 9716 if (unlikely(!sch)) 9717 goto out; 9718 9719 if (!scx_kf_arg_task_ok(sch, p)) 9720 goto out; 9721 9722 cgrp = tg_cgrp(tg); 9723 9724 out: 9725 cgroup_get(cgrp); 9726 return cgrp; 9727 } 9728 #endif /* CONFIG_CGROUP_SCHED */ 9729 9730 __bpf_kfunc_end_defs(); 9731 9732 BTF_KFUNCS_START(scx_kfunc_ids_any) 9733 BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); 9734 BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); 9735 BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) 9736 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS) 9737 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS) 9738 BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) 9739 BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS) 9740 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS) 9741 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_IMPLICIT_ARGS | KF_ITER_NEW | KF_RCU_PROTECTED) 9742 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) 9743 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) 9744 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS) 9745 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS) 9746 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS) 9747 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) 9748 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) 9749 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) 9750 BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) 9751 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) 9752 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) 9753 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) 9754 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) 9755 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) 9756 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) 9757 BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) 9758 BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL) 9759 BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) 9760 BTF_ID_FLAGS(func, scx_bpf_now) 9761 BTF_ID_FLAGS(func, scx_bpf_events) 9762 #ifdef CONFIG_CGROUP_SCHED 9763 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_IMPLICIT_ARGS | KF_RCU | KF_ACQUIRE) 9764 #endif 9765 BTF_KFUNCS_END(scx_kfunc_ids_any) 9766 9767 static const struct btf_kfunc_id_set scx_kfunc_set_any = { 9768 .owner = THIS_MODULE, 9769 .set = &scx_kfunc_ids_any, 9770 .filter = scx_kfunc_context_filter, 9771 }; 9772 9773 /* 9774 * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc 9775 * group; an op may permit zero or more groups, with the union expressed in 9776 * scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter()) 9777 * consults this table to decide whether a context-sensitive kfunc is callable 9778 * from a given SCX op. 9779 */ 9780 enum scx_kf_allow_flags { 9781 SCX_KF_ALLOW_UNLOCKED = 1 << 0, 9782 SCX_KF_ALLOW_CPU_RELEASE = 1 << 1, 9783 SCX_KF_ALLOW_DISPATCH = 1 << 2, 9784 SCX_KF_ALLOW_ENQUEUE = 1 << 3, 9785 SCX_KF_ALLOW_SELECT_CPU = 1 << 4, 9786 }; 9787 9788 /* 9789 * Map each SCX op to the union of kfunc groups it permits, indexed by 9790 * SCX_OP_IDX(op). 
Ops not listed only permit kfuncs that are not 9791 * context-sensitive. 9792 */ 9793 static const u32 scx_kf_allow_flags[] = { 9794 [SCX_OP_IDX(select_cpu)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, 9795 [SCX_OP_IDX(enqueue)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, 9796 [SCX_OP_IDX(dispatch)] = SCX_KF_ALLOW_ENQUEUE | SCX_KF_ALLOW_DISPATCH, 9797 [SCX_OP_IDX(cpu_release)] = SCX_KF_ALLOW_CPU_RELEASE, 9798 [SCX_OP_IDX(init_task)] = SCX_KF_ALLOW_UNLOCKED, 9799 [SCX_OP_IDX(dump)] = SCX_KF_ALLOW_UNLOCKED, 9800 #ifdef CONFIG_EXT_GROUP_SCHED 9801 [SCX_OP_IDX(cgroup_init)] = SCX_KF_ALLOW_UNLOCKED, 9802 [SCX_OP_IDX(cgroup_exit)] = SCX_KF_ALLOW_UNLOCKED, 9803 [SCX_OP_IDX(cgroup_prep_move)] = SCX_KF_ALLOW_UNLOCKED, 9804 [SCX_OP_IDX(cgroup_cancel_move)] = SCX_KF_ALLOW_UNLOCKED, 9805 [SCX_OP_IDX(cgroup_set_weight)] = SCX_KF_ALLOW_UNLOCKED, 9806 [SCX_OP_IDX(cgroup_set_bandwidth)] = SCX_KF_ALLOW_UNLOCKED, 9807 [SCX_OP_IDX(cgroup_set_idle)] = SCX_KF_ALLOW_UNLOCKED, 9808 #endif /* CONFIG_EXT_GROUP_SCHED */ 9809 [SCX_OP_IDX(sub_attach)] = SCX_KF_ALLOW_UNLOCKED, 9810 [SCX_OP_IDX(sub_detach)] = SCX_KF_ALLOW_UNLOCKED, 9811 [SCX_OP_IDX(cpu_online)] = SCX_KF_ALLOW_UNLOCKED, 9812 [SCX_OP_IDX(cpu_offline)] = SCX_KF_ALLOW_UNLOCKED, 9813 [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED, 9814 [SCX_OP_IDX(exit)] = SCX_KF_ALLOW_UNLOCKED, 9815 }; 9816 9817 /* 9818 * Verifier-time filter for SCX kfuncs. Registered via the .filter field on 9819 * each per-group btf_kfunc_id_set. The BPF core invokes this for every kfunc 9820 * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or 9821 * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the 9822 * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by 9823 * falling through to "allow" when none of the SCX sets contain the kfunc. 9824 */ 9825 int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) 9826 { 9827 bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id); 9828 bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id); 9829 bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); 9830 bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); 9831 bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); 9832 bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id); 9833 bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id); 9834 u32 moff, flags; 9835 9836 /* Not an SCX kfunc - allow. */ 9837 if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || 9838 in_cpu_release || in_idle || in_any)) 9839 return 0; 9840 9841 /* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */ 9842 if (prog->type == BPF_PROG_TYPE_SYSCALL) 9843 return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES; 9844 9845 if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) 9846 return (in_any || in_idle) ? 0 : -EACCES; 9847 9848 /* 9849 * add_subprog_and_kfunc() collects all kfunc calls, including dead code 9850 * guarded by bpf_ksym_exists(), before check_attach_btf_id() sets 9851 * prog->aux->st_ops. Allow all kfuncs when st_ops is not yet set; 9852 * do_check_main() re-runs the filter with st_ops set and enforces the 9853 * actual restrictions. 9854 */ 9855 if (!prog->aux->st_ops) 9856 return 0; 9857 9858 /* 9859 * Non-SCX struct_ops: SCX kfuncs are not permitted. 
9860 */ 9861 if (prog->aux->st_ops != &bpf_sched_ext_ops) 9862 return -EACCES; 9863 9864 /* SCX struct_ops: check the per-op allow list. */ 9865 if (in_any || in_idle) 9866 return 0; 9867 9868 moff = prog->aux->attach_st_ops_member_off; 9869 flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)]; 9870 9871 if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked) 9872 return 0; 9873 if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release) 9874 return 0; 9875 if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch) 9876 return 0; 9877 if ((flags & SCX_KF_ALLOW_ENQUEUE) && in_enqueue) 9878 return 0; 9879 if ((flags & SCX_KF_ALLOW_SELECT_CPU) && in_select_cpu) 9880 return 0; 9881 9882 return -EACCES; 9883 } 9884 9885 static int __init scx_init(void) 9886 { 9887 int ret; 9888 9889 /* 9890 * kfunc registration can't be done from init_sched_ext_class() as 9891 * register_btf_kfunc_id_set() needs most of the system to be up. 9892 * 9893 * Some kfuncs are context-sensitive and can only be called from 9894 * specific SCX ops. They are grouped into per-context BTF sets, each 9895 * registered with scx_kfunc_context_filter as its .filter callback. The 9896 * BPF core dedups identical filter pointers per hook 9897 * (btf_populate_kfunc_set()), so the filter is invoked exactly once per 9898 * kfunc lookup; it consults scx_kf_allow_flags[] to enforce per-op 9899 * restrictions at verify time. 9900 */ 9901 if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9902 &scx_kfunc_set_enqueue_dispatch)) || 9903 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9904 &scx_kfunc_set_dispatch)) || 9905 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9906 &scx_kfunc_set_cpu_release)) || 9907 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9908 &scx_kfunc_set_unlocked)) || 9909 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 9910 &scx_kfunc_set_unlocked)) || 9911 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9912 &scx_kfunc_set_any)) || 9913 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, 9914 &scx_kfunc_set_any)) || 9915 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 9916 &scx_kfunc_set_any))) { 9917 pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); 9918 return ret; 9919 } 9920 9921 ret = scx_idle_init(); 9922 if (ret) { 9923 pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); 9924 return ret; 9925 } 9926 9927 ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); 9928 if (ret) { 9929 pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); 9930 return ret; 9931 } 9932 9933 ret = register_pm_notifier(&scx_pm_notifier); 9934 if (ret) { 9935 pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); 9936 return ret; 9937 } 9938 9939 scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); 9940 if (!scx_kset) { 9941 pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); 9942 return -ENOMEM; 9943 } 9944 9945 ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); 9946 if (ret < 0) { 9947 pr_err("sched_ext: Failed to add global attributes\n"); 9948 return ret; 9949 } 9950 9951 return 0; 9952 } 9953 __initcall(scx_init); 9954