1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst 4 * 5 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 6 * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 7 * Copyright (c) 2022 David Vernet <dvernet@meta.com> 8 */ 9 #include <linux/btf_ids.h> 10 #include "ext_idle.h" 11 12 static DEFINE_RAW_SPINLOCK(scx_sched_lock); 13 14 /* 15 * NOTE: sched_ext is in the process of growing multiple scheduler support and 16 * scx_root usage is in a transitional state. Naked dereferences are safe if the 17 * caller is one of the tasks attached to SCX and explicit RCU dereference is 18 * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but 19 * are used as temporary markers to indicate that the dereferences need to be 20 * updated to point to the associated scheduler instances rather than scx_root. 21 */ 22 struct scx_sched __rcu *scx_root; 23 24 /* 25 * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock. 26 * Readers can hold either or rcu_read_lock(). 27 */ 28 static LIST_HEAD(scx_sched_all); 29 30 #ifdef CONFIG_EXT_SUB_SCHED 31 static const struct rhashtable_params scx_sched_hash_params = { 32 .key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id), 33 .key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id), 34 .head_offset = offsetof(struct scx_sched, hash_node), 35 .insecure_elasticity = true, /* inserted under scx_sched_lock */ 36 }; 37 38 static struct rhashtable scx_sched_hash; 39 #endif 40 41 /* 42 * During exit, a task may schedule after losing its PIDs. When disabling the 43 * BPF scheduler, we need to be able to iterate tasks in every state to 44 * guarantee system safety. Maintain a dedicated task list which contains every 45 * task between its fork and eventual free. 
46 */ 47 static DEFINE_RAW_SPINLOCK(scx_tasks_lock); 48 static LIST_HEAD(scx_tasks); 49 50 /* ops enable/disable */ 51 static DEFINE_MUTEX(scx_enable_mutex); 52 DEFINE_STATIC_KEY_FALSE(__scx_enabled); 53 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 54 static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); 55 static DEFINE_RAW_SPINLOCK(scx_bypass_lock); 56 static bool scx_init_task_enabled; 57 static bool scx_switching_all; 58 DEFINE_STATIC_KEY_FALSE(__scx_switched_all); 59 60 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); 61 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); 62 63 #ifdef CONFIG_EXT_SUB_SCHED 64 /* 65 * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit 66 * tasks for the sub-sched being enabled. Use a global variable instead of a 67 * per-task field as all enables are serialized. 68 */ 69 static struct scx_sched *scx_enabling_sub_sched; 70 #else 71 #define scx_enabling_sub_sched (struct scx_sched *)NULL 72 #endif /* CONFIG_EXT_SUB_SCHED */ 73 74 /* 75 * A monotonically increasing sequence number that is incremented every time a 76 * scheduler is enabled. This can be used to check if any custom sched_ext 77 * scheduler has ever been used in the system. 78 */ 79 static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); 80 81 /* 82 * Watchdog interval. All scx_sched's share a single watchdog timer and the 83 * interval is half of the shortest sch->watchdog_timeout. 84 */ 85 static unsigned long scx_watchdog_interval; 86 87 /* 88 * The last time the delayed work was run. This delayed work relies on 89 * ksoftirqd being able to run to service timer interrupts, so it's possible 90 * that this work itself could get wedged. To account for this, we check that 91 * it's not stalled in the timer tick, and trigger an error if it is. 
92 */ 93 static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; 94 95 static struct delayed_work scx_watchdog_work; 96 97 /* 98 * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence 99 * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu 100 * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated 101 * lazily when enabling and freed when disabling to avoid waste when sched_ext 102 * isn't active. 103 */ 104 struct scx_kick_syncs { 105 struct rcu_head rcu; 106 unsigned long syncs[]; 107 }; 108 109 static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs); 110 111 /* 112 * Direct dispatch marker. 113 * 114 * Non-NULL values are used for direct dispatch from enqueue path. A valid 115 * pointer points to the task currently being enqueued. An ERR_PTR value is used 116 * to indicate that direct dispatch has already happened. 117 */ 118 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); 119 120 static const struct rhashtable_params dsq_hash_params = { 121 .key_len = sizeof_field(struct scx_dispatch_q, id), 122 .key_offset = offsetof(struct scx_dispatch_q, id), 123 .head_offset = offsetof(struct scx_dispatch_q, hash_node), 124 }; 125 126 static LLIST_HEAD(dsqs_to_free); 127 128 /* string formatting from BPF */ 129 struct scx_bstr_buf { 130 u64 data[MAX_BPRINTF_VARARGS]; 131 char line[SCX_EXIT_MSG_LEN]; 132 }; 133 134 static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); 135 static struct scx_bstr_buf scx_exit_bstr_buf; 136 137 /* ops debug dump */ 138 static DEFINE_RAW_SPINLOCK(scx_dump_lock); 139 140 struct scx_dump_data { 141 s32 cpu; 142 bool first; 143 s32 cursor; 144 struct seq_buf *s; 145 const char *prefix; 146 struct scx_bstr_buf buf; 147 }; 148 149 static struct scx_dump_data scx_dump_data = { 150 .cpu = -1, 151 }; 152 153 /* /sys/kernel/sched_ext interface */ 154 static struct kset *scx_kset; 155 156 /* 157 * Parameters that can be adjusted through 
/sys/module/sched_ext/parameters. 158 * There usually is no reason to modify these as normal scheduler operation 159 * shouldn't be affected by them. The knobs are primarily for debugging. 160 */ 161 static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; 162 static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; 163 164 static int set_slice_us(const char *val, const struct kernel_param *kp) 165 { 166 return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC); 167 } 168 169 static const struct kernel_param_ops slice_us_param_ops = { 170 .set = set_slice_us, 171 .get = param_get_uint, 172 }; 173 174 static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp) 175 { 176 return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC); 177 } 178 179 static const struct kernel_param_ops bypass_lb_intv_us_param_ops = { 180 .set = set_bypass_lb_intv_us, 181 .get = param_get_uint, 182 }; 183 184 #undef MODULE_PARAM_PREFIX 185 #define MODULE_PARAM_PREFIX "sched_ext." 
186 187 module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); 188 MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); 189 module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600); 190 MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)"); 191 192 #undef MODULE_PARAM_PREFIX 193 194 #define CREATE_TRACE_POINTS 195 #include <trace/events/sched_ext.h> 196 197 static void run_deferred(struct rq *rq); 198 static bool task_dead_and_done(struct task_struct *p); 199 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); 200 static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind); 201 static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, 202 s64 exit_code, const char *fmt, va_list args); 203 204 static __printf(4, 5) bool scx_exit(struct scx_sched *sch, 205 enum scx_exit_kind kind, s64 exit_code, 206 const char *fmt, ...) 207 { 208 va_list args; 209 bool ret; 210 211 va_start(args, fmt); 212 ret = scx_vexit(sch, kind, exit_code, fmt, args); 213 va_end(args); 214 215 return ret; 216 } 217 218 #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) 219 #define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args) 220 221 #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) 222 223 static long jiffies_delta_msecs(unsigned long at, unsigned long now) 224 { 225 if (time_after(at, now)) 226 return jiffies_to_msecs(at - now); 227 else 228 return -(long)jiffies_to_msecs(now - at); 229 } 230 231 static bool u32_before(u32 a, u32 b) 232 { 233 return (s32)(a - b) < 0; 234 } 235 236 #ifdef CONFIG_EXT_SUB_SCHED 237 /** 238 * scx_parent - Find the parent sched 239 * @sch: sched to find the parent of 240 * 241 * Returns the parent scheduler or %NULL if @sch is root. 
242 */ 243 static struct scx_sched *scx_parent(struct scx_sched *sch) 244 { 245 if (sch->level) 246 return sch->ancestors[sch->level - 1]; 247 else 248 return NULL; 249 } 250 251 /** 252 * scx_next_descendant_pre - find the next descendant for pre-order walk 253 * @pos: the current position (%NULL to initiate traversal) 254 * @root: sched whose descendants to walk 255 * 256 * To be used by scx_for_each_descendant_pre(). Find the next descendant to 257 * visit for pre-order traversal of @root's descendants. @root is included in 258 * the iteration and the first node to be visited. 259 */ 260 static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, 261 struct scx_sched *root) 262 { 263 struct scx_sched *next; 264 265 lockdep_assert(lockdep_is_held(&scx_enable_mutex) || 266 lockdep_is_held(&scx_sched_lock)); 267 268 /* if first iteration, visit @root */ 269 if (!pos) 270 return root; 271 272 /* visit the first child if exists */ 273 next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling); 274 if (next) 275 return next; 276 277 /* no child, visit my or the closest ancestor's next sibling */ 278 while (pos != root) { 279 if (!list_is_last(&pos->sibling, &scx_parent(pos)->children)) 280 return list_next_entry(pos, sibling); 281 pos = scx_parent(pos); 282 } 283 284 return NULL; 285 } 286 287 static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) 288 { 289 return rhashtable_lookup(&scx_sched_hash, &cgroup_id, 290 scx_sched_hash_params); 291 } 292 293 static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) 294 { 295 rcu_assign_pointer(p->scx.sched, sch); 296 } 297 #else /* CONFIG_EXT_SUB_SCHED */ 298 static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } 299 static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? 
NULL : root; } 300 static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} 301 #endif /* CONFIG_EXT_SUB_SCHED */ 302 303 /** 304 * scx_is_descendant - Test whether sched is a descendant 305 * @sch: sched to test 306 * @ancestor: ancestor sched to test against 307 * 308 * Test whether @sch is a descendant of @ancestor. 309 */ 310 static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor) 311 { 312 if (sch->level < ancestor->level) 313 return false; 314 return sch->ancestors[ancestor->level] == ancestor; 315 } 316 317 /** 318 * scx_for_each_descendant_pre - pre-order walk of a sched's descendants 319 * @pos: iteration cursor 320 * @root: sched to walk the descendants of 321 * 322 * Walk @root's descendants. @root is included in the iteration and the first 323 * node to be visited. Must be called with either scx_enable_mutex or 324 * scx_sched_lock held. 325 */ 326 #define scx_for_each_descendant_pre(pos, root) \ 327 for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos); \ 328 (pos) = scx_next_descendant_pre((pos), (root))) 329 330 static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, s32 cpu) 331 { 332 return &sch->pnode[cpu_to_node(cpu)]->global_dsq; 333 } 334 335 static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) 336 { 337 return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params); 338 } 339 340 static const struct sched_class *scx_setscheduler_class(struct task_struct *p) 341 { 342 if (p->sched_class == &stop_sched_class) 343 return &stop_sched_class; 344 345 return __setscheduler_class(p->policy, p->prio); 346 } 347 348 static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu) 349 { 350 return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq; 351 } 352 353 static struct scx_dispatch_q *bypass_enq_target_dsq(struct scx_sched *sch, s32 cpu) 354 { 355 #ifdef CONFIG_EXT_SUB_SCHED 356 /* 357 * If @sch is a sub-sched which is bypassing, its tasks should go into 
358 * the bypass DSQs of the nearest ancestor which is not bypassing. The 359 * not-bypassing ancestor is responsible for scheduling all tasks from 360 * bypassing sub-trees. If all ancestors including root are bypassing, 361 * all tasks should go to the root's bypass DSQs. 362 * 363 * Whenever a sched starts bypassing, all runnable tasks in its subtree 364 * are re-enqueued after scx_bypassing() is turned on, guaranteeing that 365 * all tasks are transferred to the right DSQs. 366 */ 367 while (scx_parent(sch) && scx_bypassing(sch, cpu)) 368 sch = scx_parent(sch); 369 #endif /* CONFIG_EXT_SUB_SCHED */ 370 371 return bypass_dsq(sch, cpu); 372 } 373 374 /** 375 * bypass_dsp_enabled - Check if bypass dispatch path is enabled 376 * @sch: scheduler to check 377 * 378 * When a descendant scheduler enters bypass mode, bypassed tasks are scheduled 379 * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors 380 * are bypassing. In the former case, the ancestor is not itself bypassing but 381 * its bypass DSQs will be populated with bypassed tasks from descendants. Thus, 382 * the ancestor's bypass dispatch path must be active even though its own 383 * bypass_depth remains zero. 384 * 385 * This function checks bypass_dsp_enable_depth which is managed separately from 386 * bypass_depth to enable this decoupling. See enable_bypass_dsp() and 387 * disable_bypass_dsp(). 388 */ 389 static bool bypass_dsp_enabled(struct scx_sched *sch) 390 { 391 return unlikely(atomic_read(&sch->bypass_dsp_enable_depth)); 392 } 393 394 /** 395 * rq_is_open - Is the rq available for immediate execution of an SCX task? 396 * @rq: rq to test 397 * @enq_flags: optional %SCX_ENQ_* of the task being enqueued 398 * 399 * Returns %true if @rq is currently open for executing an SCX task. 
After a 400 * %false return, @rq is guaranteed to invoke SCX dispatch path at least once 401 * before going to idle and not inserting a task into @rq's local DSQ after a 402 * %false return doesn't cause @rq to stall. 403 */ 404 static bool rq_is_open(struct rq *rq, u64 enq_flags) 405 { 406 lockdep_assert_rq_held(rq); 407 408 /* 409 * A higher-priority class task is either running or in the process of 410 * waking up on @rq. 411 */ 412 if (sched_class_above(rq->next_class, &ext_sched_class)) 413 return false; 414 415 /* 416 * @rq is either in transition to or in idle and there is no 417 * higher-priority class task waking up on it. 418 */ 419 if (sched_class_above(&ext_sched_class, rq->next_class)) 420 return true; 421 422 /* 423 * @rq is either picking, in transition to, or running an SCX task. 424 */ 425 426 /* 427 * If we're in the dispatch path holding rq lock, $curr may or may not 428 * be ready depending on whether the on-going dispatch decides to extend 429 * $curr's slice. We say yes here and resolve it at the end of dispatch. 430 * See balance_one(). 431 */ 432 if (rq->scx.flags & SCX_RQ_IN_BALANCE) 433 return true; 434 435 /* 436 * %SCX_ENQ_PREEMPT clears $curr's slice if on SCX and kicks dispatch, 437 * so allow it to avoid spuriously triggering reenq on a combined 438 * PREEMPT|IMMED insertion. 439 */ 440 if (enq_flags & SCX_ENQ_PREEMPT) 441 return true; 442 443 /* 444 * @rq is either in transition to or running an SCX task and can't go 445 * idle without another SCX dispatch cycle. 446 */ 447 return false; 448 } 449 450 /* 451 * Track the rq currently locked. 452 * 453 * This allows kfuncs to safely operate on rq from any scx ops callback, 454 * knowing which rq is already locked. 455 */ 456 DEFINE_PER_CPU(struct rq *, scx_locked_rq_state); 457 458 static inline void update_locked_rq(struct rq *rq) 459 { 460 /* 461 * Check whether @rq is actually locked. 
This can help expose bugs 462 * or incorrect assumptions about the context in which a kfunc or 463 * callback is executed. 464 */ 465 if (rq) 466 lockdep_assert_rq_held(rq); 467 __this_cpu_write(scx_locked_rq_state, rq); 468 } 469 470 /* 471 * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not 472 * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit. 473 */ 474 #define SCX_CALL_OP(sch, op, locked_rq, args...) \ 475 do { \ 476 struct rq *__prev_locked_rq; \ 477 \ 478 if (locked_rq) { \ 479 __prev_locked_rq = scx_locked_rq(); \ 480 update_locked_rq(locked_rq); \ 481 } \ 482 (sch)->ops.op(args); \ 483 if (locked_rq) \ 484 update_locked_rq(__prev_locked_rq); \ 485 } while (0) 486 487 #define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \ 488 ({ \ 489 struct rq *__prev_locked_rq; \ 490 __typeof__((sch)->ops.op(args)) __ret; \ 491 \ 492 if (locked_rq) { \ 493 __prev_locked_rq = scx_locked_rq(); \ 494 update_locked_rq(locked_rq); \ 495 } \ 496 __ret = (sch)->ops.op(args); \ 497 if (locked_rq) \ 498 update_locked_rq(__prev_locked_rq); \ 499 __ret; \ 500 }) 501 502 /* 503 * SCX_CALL_OP_TASK*() invokes an SCX op that takes one or two task arguments 504 * and records them in current->scx.kf_tasks[] for the duration of the call. A 505 * kfunc invoked from inside such an op can then use 506 * scx_kf_arg_task_ok() to verify that its task argument is one of 507 * those subject tasks. 508 * 509 * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held - 510 * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's 511 * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. 512 * So if kf_tasks[] is set, @p's scheduler-protected fields are stable. 513 * 514 * kf_tasks[] can not stack, so task-based SCX ops must not nest. The 515 * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants 516 * while a previous one is still in progress. 
517 */ 518 #define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...) \ 519 do { \ 520 WARN_ON_ONCE(current->scx.kf_tasks[0]); \ 521 current->scx.kf_tasks[0] = task; \ 522 SCX_CALL_OP((sch), op, locked_rq, task, ##args); \ 523 current->scx.kf_tasks[0] = NULL; \ 524 } while (0) 525 526 #define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \ 527 ({ \ 528 __typeof__((sch)->ops.op(task, ##args)) __ret; \ 529 WARN_ON_ONCE(current->scx.kf_tasks[0]); \ 530 current->scx.kf_tasks[0] = task; \ 531 __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \ 532 current->scx.kf_tasks[0] = NULL; \ 533 __ret; \ 534 }) 535 536 #define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) \ 537 ({ \ 538 __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ 539 WARN_ON_ONCE(current->scx.kf_tasks[0]); \ 540 current->scx.kf_tasks[0] = task0; \ 541 current->scx.kf_tasks[1] = task1; \ 542 __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \ 543 current->scx.kf_tasks[0] = NULL; \ 544 current->scx.kf_tasks[1] = NULL; \ 545 __ret; \ 546 }) 547 548 /* see SCX_CALL_OP_TASK() */ 549 static __always_inline bool scx_kf_arg_task_ok(struct scx_sched *sch, 550 struct task_struct *p) 551 { 552 if (unlikely((p != current->scx.kf_tasks[0] && 553 p != current->scx.kf_tasks[1]))) { 554 scx_error(sch, "called on a task not being operated on"); 555 return false; 556 } 557 558 return true; 559 } 560 561 enum scx_dsq_iter_flags { 562 /* iterate in the reverse dispatch order */ 563 SCX_DSQ_ITER_REV = 1U << 16, 564 565 __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, 566 __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, 567 568 __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, 569 __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | 570 __SCX_DSQ_ITER_HAS_SLICE | 571 __SCX_DSQ_ITER_HAS_VTIME, 572 }; 573 574 /** 575 * nldsq_next_task - Iterate to the next task in a non-local DSQ 576 * @dsq: non-local dsq being iterated 577 * @cur: current position, %NULL to start iteration 578 * @rev: 
walk backwards 579 * 580 * Returns %NULL when iteration is finished. 581 */ 582 static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, 583 struct task_struct *cur, bool rev) 584 { 585 struct list_head *list_node; 586 struct scx_dsq_list_node *dsq_lnode; 587 588 lockdep_assert_held(&dsq->lock); 589 590 if (cur) 591 list_node = &cur->scx.dsq_list.node; 592 else 593 list_node = &dsq->list; 594 595 /* find the next task, need to skip BPF iteration cursors */ 596 do { 597 if (rev) 598 list_node = list_node->prev; 599 else 600 list_node = list_node->next; 601 602 if (list_node == &dsq->list) 603 return NULL; 604 605 dsq_lnode = container_of(list_node, struct scx_dsq_list_node, 606 node); 607 } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); 608 609 return container_of(dsq_lnode, struct task_struct, scx.dsq_list); 610 } 611 612 #define nldsq_for_each_task(p, dsq) \ 613 for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ 614 (p) = nldsq_next_task((dsq), (p), false)) 615 616 /** 617 * nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ 618 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() 619 * @dsq: non-local dsq being iterated 620 * 621 * Find the next task in a cursor based iteration. The caller must have 622 * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock 623 * between the iteration steps. 624 * 625 * Only tasks which were queued before @cursor was initialized are visible. This 626 * bounds the iteration and guarantees that vtime never jumps in the other 627 * direction while iterating. 
628 */ 629 static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor, 630 struct scx_dispatch_q *dsq) 631 { 632 bool rev = cursor->flags & SCX_DSQ_ITER_REV; 633 struct task_struct *p; 634 635 lockdep_assert_held(&dsq->lock); 636 BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR)); 637 638 if (list_empty(&cursor->node)) 639 p = NULL; 640 else 641 p = container_of(cursor, struct task_struct, scx.dsq_list); 642 643 /* skip cursors and tasks that were queued after @cursor init */ 644 do { 645 p = nldsq_next_task(dsq, p, rev); 646 } while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq))); 647 648 if (p) { 649 if (rev) 650 list_move_tail(&cursor->node, &p->scx.dsq_list.node); 651 else 652 list_move(&cursor->node, &p->scx.dsq_list.node); 653 } else { 654 list_del_init(&cursor->node); 655 } 656 657 return p; 658 } 659 660 /** 661 * nldsq_cursor_lost_task - Test whether someone else took the task since iteration 662 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() 663 * @rq: rq @p was on 664 * @dsq: dsq @p was on 665 * @p: target task 666 * 667 * @p is a task returned by nldsq_cursor_next_task(). The locks may have been 668 * dropped and re-acquired inbetween. Verify that no one else took or is in the 669 * process of taking @p from @dsq. 670 * 671 * On %false return, the caller can assume full ownership of @p. 672 */ 673 static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor, 674 struct rq *rq, struct scx_dispatch_q *dsq, 675 struct task_struct *p) 676 { 677 lockdep_assert_rq_held(rq); 678 lockdep_assert_held(&dsq->lock); 679 680 /* 681 * @p could have already left $src_dsq, got re-enqueud, or be in the 682 * process of being consumed by someone else. 
683 */ 684 if (unlikely(p->scx.dsq != dsq || 685 u32_before(cursor->priv, p->scx.dsq_seq) || 686 p->scx.holding_cpu >= 0)) 687 return true; 688 689 /* if @p has stayed on @dsq, its rq couldn't have changed */ 690 if (WARN_ON_ONCE(rq != task_rq(p))) 691 return true; 692 693 return false; 694 } 695 696 /* 697 * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] 698 * dispatch order. BPF-visible iterator is opaque and larger to allow future 699 * changes without breaking backward compatibility. Can be used with 700 * bpf_for_each(). See bpf_iter_scx_dsq_*(). 701 */ 702 struct bpf_iter_scx_dsq_kern { 703 struct scx_dsq_list_node cursor; 704 struct scx_dispatch_q *dsq; 705 u64 slice; 706 u64 vtime; 707 } __attribute__((aligned(8))); 708 709 struct bpf_iter_scx_dsq { 710 u64 __opaque[6]; 711 } __attribute__((aligned(8))); 712 713 714 static u32 scx_get_task_state(const struct task_struct *p) 715 { 716 return p->scx.flags & SCX_TASK_STATE_MASK; 717 } 718 719 static void scx_set_task_state(struct task_struct *p, u32 state) 720 { 721 u32 prev_state = scx_get_task_state(p); 722 bool warn = false; 723 724 switch (state) { 725 case SCX_TASK_NONE: 726 warn = prev_state == SCX_TASK_DEAD; 727 break; 728 case SCX_TASK_INIT_BEGIN: 729 warn = prev_state != SCX_TASK_NONE; 730 break; 731 case SCX_TASK_INIT: 732 warn = prev_state != SCX_TASK_INIT_BEGIN; 733 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 734 break; 735 case SCX_TASK_READY: 736 warn = !(prev_state == SCX_TASK_INIT || 737 prev_state == SCX_TASK_ENABLED); 738 break; 739 case SCX_TASK_ENABLED: 740 warn = prev_state != SCX_TASK_READY; 741 break; 742 case SCX_TASK_DEAD: 743 warn = !(prev_state == SCX_TASK_NONE || 744 prev_state == SCX_TASK_INIT_BEGIN); 745 break; 746 default: 747 WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", 748 prev_state, state, p->comm, p->pid); 749 return; 750 } 751 752 WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", 753 prev_state, 
state, p->comm, p->pid); 754 755 p->scx.flags &= ~SCX_TASK_STATE_MASK; 756 p->scx.flags |= state; 757 } 758 759 /* 760 * SCX task iterator. 761 */ 762 struct scx_task_iter { 763 struct sched_ext_entity cursor; 764 struct task_struct *locked_task; 765 struct rq *rq; 766 struct rq_flags rf; 767 u32 cnt; 768 bool list_locked; 769 #ifdef CONFIG_EXT_SUB_SCHED 770 struct cgroup *cgrp; 771 struct cgroup_subsys_state *css_pos; 772 struct css_task_iter css_iter; 773 #endif 774 }; 775 776 /** 777 * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration 778 * @iter: iterator to init 779 * @cgrp: Optional root of cgroup subhierarchy to iterate 780 * 781 * Initialize @iter. Once initialized, @iter must eventually be stopped with 782 * scx_task_iter_stop(). 783 * 784 * If @cgrp is %NULL, scx_tasks is used for iteration and this function returns 785 * with scx_tasks_lock held and @iter->cursor inserted into scx_tasks. 786 * 787 * If @cgrp is not %NULL, @cgrp and its descendants' tasks are walked using 788 * @iter->css_iter. The caller must be holding cgroup_lock() to prevent cgroup 789 * task migrations. 790 * 791 * The two modes of iterations are largely independent and it's likely that 792 * scx_tasks can be removed in favor of always using cgroup iteration if 793 * CONFIG_SCHED_CLASS_EXT depends on CONFIG_CGROUPS. 794 * 795 * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() 796 * between this and the first next() call or between any two next() calls. If 797 * the locks are released between two next() calls, the caller is responsible 798 * for ensuring that the task being iterated remains accessible either through 799 * RCU read lock or obtaining a reference count. 800 * 801 * All tasks which existed when the iteration started are guaranteed to be 802 * visited as long as they are not dead. 
803 */ 804 static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) 805 { 806 memset(iter, 0, sizeof(*iter)); 807 808 #ifdef CONFIG_EXT_SUB_SCHED 809 if (cgrp) { 810 lockdep_assert_held(&cgroup_mutex); 811 iter->cgrp = cgrp; 812 iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self); 813 css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, 814 &iter->css_iter); 815 return; 816 } 817 #endif 818 raw_spin_lock_irq(&scx_tasks_lock); 819 820 iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; 821 list_add(&iter->cursor.tasks_node, &scx_tasks); 822 iter->list_locked = true; 823 } 824 825 static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) 826 { 827 if (iter->locked_task) { 828 __balance_callbacks(iter->rq, &iter->rf); 829 task_rq_unlock(iter->rq, iter->locked_task, &iter->rf); 830 iter->locked_task = NULL; 831 } 832 } 833 834 /** 835 * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator 836 * @iter: iterator to unlock 837 * 838 * If @iter is in the middle of a locked iteration, it may be locking the rq of 839 * the task currently being visited in addition to scx_tasks_lock. Unlock both. 840 * This function can be safely called anytime during an iteration. The next 841 * iterator operation will automatically restore the necessary locking. 842 */ 843 static void scx_task_iter_unlock(struct scx_task_iter *iter) 844 { 845 __scx_task_iter_rq_unlock(iter); 846 if (iter->list_locked) { 847 iter->list_locked = false; 848 raw_spin_unlock_irq(&scx_tasks_lock); 849 } 850 } 851 852 static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) 853 { 854 if (!iter->list_locked) { 855 raw_spin_lock_irq(&scx_tasks_lock); 856 iter->list_locked = true; 857 } 858 } 859 860 /** 861 * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock 862 * @iter: iterator to exit 863 * 864 * Exit a previously initialized @iter. 
Must be called with scx_tasks_lock held 865 * which is released on return. If the iterator holds a task's rq lock, that rq 866 * lock is also released. See scx_task_iter_start() for details. 867 */ 868 static void scx_task_iter_stop(struct scx_task_iter *iter) 869 { 870 #ifdef CONFIG_EXT_SUB_SCHED 871 if (iter->cgrp) { 872 if (iter->css_pos) 873 css_task_iter_end(&iter->css_iter); 874 __scx_task_iter_rq_unlock(iter); 875 return; 876 } 877 #endif 878 __scx_task_iter_maybe_relock(iter); 879 list_del_init(&iter->cursor.tasks_node); 880 scx_task_iter_unlock(iter); 881 } 882 883 /** 884 * scx_task_iter_next - Next task 885 * @iter: iterator to walk 886 * 887 * Visit the next task. See scx_task_iter_start() for details. Locks are dropped 888 * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls 889 * by holding scx_tasks_lock for too long. 890 */ 891 static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) 892 { 893 struct list_head *cursor = &iter->cursor.tasks_node; 894 struct sched_ext_entity *pos; 895 896 if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { 897 scx_task_iter_unlock(iter); 898 cond_resched(); 899 } 900 901 #ifdef CONFIG_EXT_SUB_SCHED 902 if (iter->cgrp) { 903 while (iter->css_pos) { 904 struct task_struct *p; 905 906 p = css_task_iter_next(&iter->css_iter); 907 if (p) 908 return p; 909 910 css_task_iter_end(&iter->css_iter); 911 iter->css_pos = css_next_descendant_pre(iter->css_pos, 912 &iter->cgrp->self); 913 if (iter->css_pos) 914 css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, 915 &iter->css_iter); 916 } 917 return NULL; 918 } 919 #endif 920 __scx_task_iter_maybe_relock(iter); 921 922 list_for_each_entry(pos, cursor, tasks_node) { 923 if (&pos->tasks_node == &scx_tasks) 924 return NULL; 925 if (!(pos->flags & SCX_TASK_CURSOR)) { 926 list_move(cursor, &pos->tasks_node); 927 return container_of(pos, struct task_struct, scx); 928 } 929 } 930 931 /* can't happen, should always terminate at scx_tasks above 
*/ 932 BUG(); 933 } 934 935 /** 936 * scx_task_iter_next_locked - Next non-idle task with its rq locked 937 * @iter: iterator to walk 938 * 939 * Visit the non-idle task with its rq lock held. Allows callers to specify 940 * whether they would like to filter out dead tasks. See scx_task_iter_start() 941 * for details. 942 */ 943 static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) 944 { 945 struct task_struct *p; 946 947 __scx_task_iter_rq_unlock(iter); 948 949 while ((p = scx_task_iter_next(iter))) { 950 /* 951 * scx_task_iter is used to prepare and move tasks into SCX 952 * while loading the BPF scheduler and vice-versa while 953 * unloading. The init_tasks ("swappers") should be excluded 954 * from the iteration because: 955 * 956 * - It's unsafe to use __setschduler_prio() on an init_task to 957 * determine the sched_class to use as it won't preserve its 958 * idle_sched_class. 959 * 960 * - ops.init/exit_task() can easily be confused if called with 961 * init_tasks as they, e.g., share PID 0. 962 * 963 * As init_tasks are never scheduled through SCX, they can be 964 * skipped safely. Note that is_idle_task() which tests %PF_IDLE 965 * doesn't work here: 966 * 967 * - %PF_IDLE may not be set for an init_task whose CPU hasn't 968 * yet been onlined. 969 * 970 * - %PF_IDLE can be set on tasks that are not init_tasks. See 971 * play_idle_precise() used by CONFIG_IDLE_INJECT. 972 * 973 * Test for idle_sched_class as only init_tasks are on it. 974 */ 975 if (p->sched_class == &idle_sched_class) 976 continue; 977 978 iter->rq = task_rq_lock(p, &iter->rf); 979 iter->locked_task = p; 980 981 /* 982 * cgroup_task_dead() removes the dead tasks from cset->tasks 983 * after sched_ext_dead() and cgroup iteration may see tasks 984 * which already finished sched_ext_dead(). %SCX_TASK_DEAD is 985 * set by sched_ext_dead() under @p's rq lock. Test it to 986 * avoid visiting tasks which are already dead from SCX POV. 
987 */ 988 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 989 __scx_task_iter_rq_unlock(iter); 990 continue; 991 } 992 993 return p; 994 } 995 return NULL; 996 } 997 998 /** 999 * scx_add_event - Increase an event counter for 'name' by 'cnt' 1000 * @sch: scx_sched to account events for 1001 * @name: an event name defined in struct scx_event_stats 1002 * @cnt: the number of the event occurred 1003 * 1004 * This can be used when preemption is not disabled. 1005 */ 1006 #define scx_add_event(sch, name, cnt) do { \ 1007 this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ 1008 trace_sched_ext_event(#name, (cnt)); \ 1009 } while(0) 1010 1011 /** 1012 * __scx_add_event - Increase an event counter for 'name' by 'cnt' 1013 * @sch: scx_sched to account events for 1014 * @name: an event name defined in struct scx_event_stats 1015 * @cnt: the number of the event occurred 1016 * 1017 * This should be used only when preemption is disabled. 1018 */ 1019 #define __scx_add_event(sch, name, cnt) do { \ 1020 __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ 1021 trace_sched_ext_event(#name, cnt); \ 1022 } while(0) 1023 1024 /** 1025 * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e' 1026 * @dst_e: destination event stats 1027 * @src_e: source event stats 1028 * @kind: a kind of event to be aggregated 1029 */ 1030 #define scx_agg_event(dst_e, src_e, kind) do { \ 1031 (dst_e)->kind += READ_ONCE((src_e)->kind); \ 1032 } while(0) 1033 1034 /** 1035 * scx_dump_event - Dump an event 'kind' in 'events' to 's' 1036 * @s: output seq_buf 1037 * @events: event stats 1038 * @kind: a kind of event to dump 1039 */ 1040 #define scx_dump_event(s, events, kind) do { \ 1041 dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \ 1042 } while (0) 1043 1044 1045 static void scx_read_events(struct scx_sched *sch, 1046 struct scx_event_stats *events); 1047 1048 static enum scx_enable_state scx_enable_state(void) 1049 { 1050 return atomic_read(&scx_enable_state_var); 1051 
} 1052 1053 static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to) 1054 { 1055 return atomic_xchg(&scx_enable_state_var, to); 1056 } 1057 1058 static bool scx_tryset_enable_state(enum scx_enable_state to, 1059 enum scx_enable_state from) 1060 { 1061 int from_v = from; 1062 1063 return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to); 1064 } 1065 1066 /** 1067 * wait_ops_state - Busy-wait the specified ops state to end 1068 * @p: target task 1069 * @opss: state to wait the end of 1070 * 1071 * Busy-wait for @p to transition out of @opss. This can only be used when the 1072 * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also 1073 * has load_acquire semantics to ensure that the caller can see the updates made 1074 * in the enqueueing and dispatching paths. 1075 */ 1076 static void wait_ops_state(struct task_struct *p, unsigned long opss) 1077 { 1078 do { 1079 cpu_relax(); 1080 } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); 1081 } 1082 1083 static inline bool __cpu_valid(s32 cpu) 1084 { 1085 return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); 1086 } 1087 1088 /** 1089 * ops_cpu_valid - Verify a cpu number, to be used on ops input args 1090 * @sch: scx_sched to abort on error 1091 * @cpu: cpu number which came from a BPF ops 1092 * @where: extra information reported on error 1093 * 1094 * @cpu is a cpu number which came from the BPF scheduler and can be any value. 1095 * Verify that it is in range and one of the possible cpus. If invalid, trigger 1096 * an ops error. 1097 */ 1098 static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) 1099 { 1100 if (__cpu_valid(cpu)) { 1101 return true; 1102 } else { 1103 scx_error(sch, "invalid CPU %d%s%s", cpu, where ? 
" " : "", where ?: ""); 1104 return false; 1105 } 1106 } 1107 1108 /** 1109 * ops_sanitize_err - Sanitize a -errno value 1110 * @sch: scx_sched to error out on error 1111 * @ops_name: operation to blame on failure 1112 * @err: -errno value to sanitize 1113 * 1114 * Verify @err is a valid -errno. If not, trigger scx_error() and return 1115 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can 1116 * cause misbehaviors. For an example, a large negative return from 1117 * ops.init_task() triggers an oops when passed up the call chain because the 1118 * value fails IS_ERR() test after being encoded with ERR_PTR() and then is 1119 * handled as a pointer. 1120 */ 1121 static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err) 1122 { 1123 if (err < 0 && err >= -MAX_ERRNO) 1124 return err; 1125 1126 scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err); 1127 return -EPROTO; 1128 } 1129 1130 static void deferred_bal_cb_workfn(struct rq *rq) 1131 { 1132 run_deferred(rq); 1133 } 1134 1135 static void deferred_irq_workfn(struct irq_work *irq_work) 1136 { 1137 struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work); 1138 1139 raw_spin_rq_lock(rq); 1140 run_deferred(rq); 1141 raw_spin_rq_unlock(rq); 1142 } 1143 1144 /** 1145 * schedule_deferred - Schedule execution of deferred actions on an rq 1146 * @rq: target rq 1147 * 1148 * Schedule execution of deferred actions on @rq. Deferred actions are executed 1149 * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks 1150 * to other rqs. 1151 */ 1152 static void schedule_deferred(struct rq *rq) 1153 { 1154 /* 1155 * This is the fallback when schedule_deferred_locked() can't use 1156 * the cheaper balance callback or wakeup hook paths (the target 1157 * CPU is not in balance or wakeup). Currently, this is primarily 1158 * hit by reenqueue operations targeting a remote CPU. 1159 * 1160 * Queue on the target CPU. 
The deferred work can run from any CPU 1161 * correctly - the _locked() path already processes remote rqs from 1162 * the calling CPU - but targeting the owning CPU allows IPI delivery 1163 * without waiting for the calling CPU to re-enable IRQs and is 1164 * cheaper as the reenqueue runs locally. 1165 */ 1166 irq_work_queue_on(&rq->scx.deferred_irq_work, cpu_of(rq)); 1167 } 1168 1169 /** 1170 * schedule_deferred_locked - Schedule execution of deferred actions on an rq 1171 * @rq: target rq 1172 * 1173 * Schedule execution of deferred actions on @rq. Equivalent to 1174 * schedule_deferred() but requires @rq to be locked and can be more efficient. 1175 */ 1176 static void schedule_deferred_locked(struct rq *rq) 1177 { 1178 lockdep_assert_rq_held(rq); 1179 1180 /* 1181 * If in the middle of waking up a task, task_woken_scx() will be called 1182 * afterwards which will then run the deferred actions, no need to 1183 * schedule anything. 1184 */ 1185 if (rq->scx.flags & SCX_RQ_IN_WAKEUP) 1186 return; 1187 1188 /* Don't do anything if there already is a deferred operation. */ 1189 if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING) 1190 return; 1191 1192 /* 1193 * If in balance, the balance callbacks will be called before rq lock is 1194 * released. Schedule one. 1195 * 1196 * 1197 * We can't directly insert the callback into the 1198 * rq's list: The call can drop its lock and make the pending balance 1199 * callback visible to unrelated code paths that call rq_pin_lock(). 1200 * 1201 * Just let balance_one() know that it must do it itself. 1202 */ 1203 if (rq->scx.flags & SCX_RQ_IN_BALANCE) { 1204 rq->scx.flags |= SCX_RQ_BAL_CB_PENDING; 1205 return; 1206 } 1207 1208 /* 1209 * No scheduler hooks available. Use the generic irq_work path. The 1210 * above WAKEUP and BALANCE paths should cover most of the cases and the 1211 * time to IRQ re-enable shouldn't be long. 
1212 */ 1213 schedule_deferred(rq); 1214 } 1215 1216 static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq, 1217 u64 reenq_flags, struct rq *locked_rq) 1218 { 1219 struct rq *rq; 1220 1221 /* 1222 * Allowing reenqueues doesn't make sense while bypassing. This also 1223 * blocks from new reenqueues to be scheduled on dead scheds. 1224 */ 1225 if (unlikely(READ_ONCE(sch->bypass_depth))) 1226 return; 1227 1228 if (dsq->id == SCX_DSQ_LOCAL) { 1229 rq = container_of(dsq, struct rq, scx.local_dsq); 1230 1231 struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq)); 1232 struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local; 1233 1234 /* 1235 * Pairs with smp_mb() in process_deferred_reenq_locals() and 1236 * guarantees that there is a reenq_local() afterwards. 1237 */ 1238 smp_mb(); 1239 1240 if (list_empty(&drl->node) || 1241 (READ_ONCE(drl->flags) & reenq_flags) != reenq_flags) { 1242 1243 guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); 1244 1245 if (list_empty(&drl->node)) 1246 list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals); 1247 WRITE_ONCE(drl->flags, drl->flags | reenq_flags); 1248 } 1249 } else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) { 1250 rq = this_rq(); 1251 1252 struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq)); 1253 struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user; 1254 1255 /* 1256 * Pairs with smp_mb() in process_deferred_reenq_users() and 1257 * guarantees that there is a reenq_user() afterwards. 
1258 */ 1259 smp_mb(); 1260 1261 if (list_empty(&dru->node) || 1262 (READ_ONCE(dru->flags) & reenq_flags) != reenq_flags) { 1263 1264 guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); 1265 1266 if (list_empty(&dru->node)) 1267 list_move_tail(&dru->node, &rq->scx.deferred_reenq_users); 1268 WRITE_ONCE(dru->flags, dru->flags | reenq_flags); 1269 } 1270 } else { 1271 scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id); 1272 return; 1273 } 1274 1275 if (rq == locked_rq) 1276 schedule_deferred_locked(rq); 1277 else 1278 schedule_deferred(rq); 1279 } 1280 1281 static void schedule_reenq_local(struct rq *rq, u64 reenq_flags) 1282 { 1283 struct scx_sched *root = rcu_dereference_sched(scx_root); 1284 1285 if (WARN_ON_ONCE(!root)) 1286 return; 1287 1288 schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags, rq); 1289 } 1290 1291 /** 1292 * touch_core_sched - Update timestamp used for core-sched task ordering 1293 * @rq: rq to read clock from, must be locked 1294 * @p: task to update the timestamp for 1295 * 1296 * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to 1297 * implement global or local-DSQ FIFO ordering for core-sched. Should be called 1298 * when a task becomes runnable and its turn on the CPU ends (e.g. slice 1299 * exhaustion). 1300 */ 1301 static void touch_core_sched(struct rq *rq, struct task_struct *p) 1302 { 1303 lockdep_assert_rq_held(rq); 1304 1305 #ifdef CONFIG_SCHED_CORE 1306 /* 1307 * It's okay to update the timestamp spuriously. Use 1308 * sched_core_disabled() which is cheaper than enabled(). 1309 * 1310 * As this is used to determine ordering between tasks of sibling CPUs, 1311 * it may be better to use per-core dispatch sequence instead. 
1312 */ 1313 if (!sched_core_disabled()) 1314 p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq)); 1315 #endif 1316 } 1317 1318 /** 1319 * touch_core_sched_dispatch - Update core-sched timestamp on dispatch 1320 * @rq: rq to read clock from, must be locked 1321 * @p: task being dispatched 1322 * 1323 * If the BPF scheduler implements custom core-sched ordering via 1324 * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO 1325 * ordering within each local DSQ. This function is called from dispatch paths 1326 * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. 1327 */ 1328 static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) 1329 { 1330 lockdep_assert_rq_held(rq); 1331 1332 #ifdef CONFIG_SCHED_CORE 1333 if (unlikely(SCX_HAS_OP(scx_root, core_sched_before))) 1334 touch_core_sched(rq, p); 1335 #endif 1336 } 1337 1338 static void update_curr_scx(struct rq *rq) 1339 { 1340 struct task_struct *curr = rq->curr; 1341 s64 delta_exec; 1342 1343 delta_exec = update_curr_common(rq); 1344 if (unlikely(delta_exec <= 0)) 1345 return; 1346 1347 if (curr->scx.slice != SCX_SLICE_INF) { 1348 curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec); 1349 if (!curr->scx.slice) 1350 touch_core_sched(rq, curr); 1351 } 1352 1353 dl_server_update(&rq->ext_server, delta_exec); 1354 } 1355 1356 static bool scx_dsq_priq_less(struct rb_node *node_a, 1357 const struct rb_node *node_b) 1358 { 1359 const struct task_struct *a = 1360 container_of(node_a, struct task_struct, scx.dsq_priq); 1361 const struct task_struct *b = 1362 container_of(node_b, struct task_struct, scx.dsq_priq); 1363 1364 return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); 1365 } 1366 1367 static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) 1368 { 1369 /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ 1370 WRITE_ONCE(dsq->nr, dsq->nr + 1); 1371 1372 /* 1373 * Once @p reaches a local DSQ, 
it can only leave it by being dispatched 1374 * to the CPU or dequeued. In both cases, the only way @p can go back to 1375 * the BPF sched is through enqueueing. If being inserted into a local 1376 * DSQ with IMMED, persist the state until the next enqueueing event in 1377 * do_enqueue_task() so that we can maintain IMMED protection through 1378 * e.g. SAVE/RESTORE cycles and slice extensions. 1379 */ 1380 if (enq_flags & SCX_ENQ_IMMED) { 1381 if (unlikely(dsq->id != SCX_DSQ_LOCAL)) { 1382 WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK)); 1383 return; 1384 } 1385 p->scx.flags |= SCX_TASK_IMMED; 1386 } 1387 1388 if (p->scx.flags & SCX_TASK_IMMED) { 1389 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 1390 1391 if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) 1392 return; 1393 1394 rq->scx.nr_immed++; 1395 1396 /* 1397 * If @rq already had other tasks or the current task is not 1398 * done yet, @p can't go on the CPU immediately. Re-enqueue. 1399 */ 1400 if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags))) 1401 schedule_reenq_local(rq, 0); 1402 } 1403 } 1404 1405 static void dsq_dec_nr(struct scx_dispatch_q *dsq, struct task_struct *p) 1406 { 1407 /* see dsq_inc_nr() */ 1408 WRITE_ONCE(dsq->nr, dsq->nr - 1); 1409 1410 if (p->scx.flags & SCX_TASK_IMMED) { 1411 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 1412 1413 if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) || 1414 WARN_ON_ONCE(rq->scx.nr_immed <= 0)) 1415 return; 1416 1417 rq->scx.nr_immed--; 1418 } 1419 } 1420 1421 static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) 1422 { 1423 p->scx.slice = READ_ONCE(sch->slice_dfl); 1424 __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); 1425 } 1426 1427 /* 1428 * Return true if @p is moving due to an internal SCX migration, false 1429 * otherwise. 
1430 */ 1431 static inline bool task_scx_migrating(struct task_struct *p) 1432 { 1433 /* 1434 * We only need to check sticky_cpu: it is set to the destination 1435 * CPU in move_remote_task_to_local_dsq() before deactivate_task() 1436 * and cleared when the task is enqueued on the destination, so it 1437 * is only non-negative during an internal SCX migration. 1438 */ 1439 return p->scx.sticky_cpu >= 0; 1440 } 1441 1442 /* 1443 * Call ops.dequeue() if the task is in BPF custody and not migrating. 1444 * Clears %SCX_TASK_IN_CUSTODY when the callback is invoked. 1445 */ 1446 static void call_task_dequeue(struct scx_sched *sch, struct rq *rq, 1447 struct task_struct *p, u64 deq_flags) 1448 { 1449 if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p)) 1450 return; 1451 1452 if (SCX_HAS_OP(sch, dequeue)) 1453 SCX_CALL_OP_TASK(sch, dequeue, rq, p, deq_flags); 1454 1455 p->scx.flags &= ~SCX_TASK_IN_CUSTODY; 1456 } 1457 1458 static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq, 1459 struct task_struct *p, u64 enq_flags) 1460 { 1461 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 1462 1463 call_task_dequeue(sch, rq, p, 0); 1464 1465 /* 1466 * Note that @rq's lock may be dropped between this enqueue and @p 1467 * actually getting on CPU. This gives higher-class tasks (e.g. RT) 1468 * an opportunity to wake up on @rq and prevent @p from running. 1469 * Here are some concrete examples: 1470 * 1471 * Example 1: 1472 * 1473 * We dispatch two tasks from a single ops.dispatch(): 1474 * - First, a local task to this CPU's local DSQ; 1475 * - Second, a local/remote task to a remote CPU's local DSQ. 1476 * We must drop the local rq lock in order to finish the second 1477 * dispatch. In that time, an RT task can wake up on the local rq. 1478 * 1479 * Example 2: 1480 * 1481 * We dispatch a local/remote task to a remote CPU's local DSQ. 
1482 * We must drop the remote rq lock before the dispatched task can run, 1483 * which gives an RT task an opportunity to wake up on the remote rq. 1484 * 1485 * Both examples work the same if we replace dispatching with moving 1486 * the tasks from a user-created DSQ. 1487 * 1488 * We must detect these wakeups so that we can re-enqueue IMMED tasks 1489 * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this 1490 * purpose, but for it to be invoked, we must ensure that we bump 1491 * @rq->next_class to &ext_sched_class if it's currently idle. 1492 * 1493 * wakeup_preempt() does the bumping, and since we only invoke it if 1494 * @rq->next_class is below &ext_sched_class, it will also 1495 * resched_curr(rq). 1496 */ 1497 if (sched_class_above(p->sched_class, rq->next_class)) 1498 wakeup_preempt(rq, p, 0); 1499 1500 /* 1501 * If @rq is in balance, the CPU is already vacant and looking for the 1502 * next task to run. No need to preempt or trigger resched after moving 1503 * @p into its local DSQ. 1504 * Note that the wakeup_preempt() above may have already triggered 1505 * a resched if @rq->next_class was idle. It's harmless, since 1506 * need_resched is cleared immediately after task pick. 1507 */ 1508 if (rq->scx.flags & SCX_RQ_IN_BALANCE) 1509 return; 1510 1511 if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && 1512 rq->curr->sched_class == &ext_sched_class) { 1513 rq->curr->scx.slice = 0; 1514 resched_curr(rq); 1515 } 1516 } 1517 1518 static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, 1519 struct scx_dispatch_q *dsq, struct task_struct *p, 1520 u64 enq_flags) 1521 { 1522 bool is_local = dsq->id == SCX_DSQ_LOCAL; 1523 1524 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); 1525 WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || 1526 !RB_EMPTY_NODE(&p->scx.dsq_priq)); 1527 1528 if (!is_local) { 1529 raw_spin_lock_nested(&dsq->lock, 1530 (enq_flags & SCX_ENQ_NESTED) ? 
SINGLE_DEPTH_NESTING : 0); 1531 1532 if (unlikely(dsq->id == SCX_DSQ_INVALID)) { 1533 scx_error(sch, "attempting to dispatch to a destroyed dsq"); 1534 /* fall back to the global dsq */ 1535 raw_spin_unlock(&dsq->lock); 1536 dsq = find_global_dsq(sch, task_cpu(p)); 1537 raw_spin_lock(&dsq->lock); 1538 } 1539 } 1540 1541 if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && 1542 (enq_flags & SCX_ENQ_DSQ_PRIQ))) { 1543 /* 1544 * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from 1545 * their FIFO queues. To avoid confusion and accidentally 1546 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we 1547 * disallow any internal DSQ from doing vtime ordering of 1548 * tasks. 1549 */ 1550 scx_error(sch, "cannot use vtime ordering for built-in DSQs"); 1551 enq_flags &= ~SCX_ENQ_DSQ_PRIQ; 1552 } 1553 1554 if (enq_flags & SCX_ENQ_DSQ_PRIQ) { 1555 struct rb_node *rbp; 1556 1557 /* 1558 * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are 1559 * linked to both the rbtree and list on PRIQs, this can only be 1560 * tested easily when adding the first task. 1561 */ 1562 if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && 1563 nldsq_next_task(dsq, NULL, false))) 1564 scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks", 1565 dsq->id); 1566 1567 p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; 1568 rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less); 1569 1570 /* 1571 * Find the previous task and insert after it on the list so 1572 * that @dsq->list is vtime ordered. 
1573 */ 1574 rbp = rb_prev(&p->scx.dsq_priq); 1575 if (rbp) { 1576 struct task_struct *prev = 1577 container_of(rbp, struct task_struct, 1578 scx.dsq_priq); 1579 list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); 1580 /* first task unchanged - no update needed */ 1581 } else { 1582 list_add(&p->scx.dsq_list.node, &dsq->list); 1583 /* not builtin and new task is at head - use fastpath */ 1584 rcu_assign_pointer(dsq->first_task, p); 1585 } 1586 } else { 1587 /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ 1588 if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) 1589 scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", 1590 dsq->id); 1591 1592 if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) { 1593 list_add(&p->scx.dsq_list.node, &dsq->list); 1594 /* new task inserted at head - use fastpath */ 1595 if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) 1596 rcu_assign_pointer(dsq->first_task, p); 1597 } else { 1598 /* 1599 * dsq->list can contain parked BPF iterator cursors, so 1600 * list_empty() here isn't a reliable proxy for "no real 1601 * task in the DSQ". Test dsq->first_task directly. 1602 */ 1603 list_add_tail(&p->scx.dsq_list.node, &dsq->list); 1604 if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) 1605 rcu_assign_pointer(dsq->first_task, p); 1606 } 1607 } 1608 1609 /* seq records the order tasks are queued, used by BPF DSQ iterator */ 1610 WRITE_ONCE(dsq->seq, dsq->seq + 1); 1611 p->scx.dsq_seq = dsq->seq; 1612 1613 dsq_inc_nr(dsq, p, enq_flags); 1614 p->scx.dsq = dsq; 1615 1616 /* 1617 * Update custody and call ops.dequeue() before clearing ops_state: 1618 * once ops_state is cleared, waiters in ops_dequeue() can proceed 1619 * and dequeue_task_scx() will RMW p->scx.flags. If we clear 1620 * ops_state first, both sides would modify p->scx.flags 1621 * concurrently in a non-atomic way. 
1622 */ 1623 if (is_local) { 1624 local_dsq_post_enq(sch, dsq, p, enq_flags); 1625 } else { 1626 /* 1627 * Task on global/bypass DSQ: leave custody, task on 1628 * non-terminal DSQ: enter custody. 1629 */ 1630 if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS) 1631 call_task_dequeue(sch, rq, p, 0); 1632 else 1633 p->scx.flags |= SCX_TASK_IN_CUSTODY; 1634 1635 raw_spin_unlock(&dsq->lock); 1636 } 1637 1638 /* 1639 * We're transitioning out of QUEUEING or DISPATCHING. store_release to 1640 * match waiters' load_acquire. 1641 */ 1642 if (enq_flags & SCX_ENQ_CLEAR_OPSS) 1643 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1644 } 1645 1646 static void task_unlink_from_dsq(struct task_struct *p, 1647 struct scx_dispatch_q *dsq) 1648 { 1649 WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node)); 1650 1651 if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { 1652 rb_erase(&p->scx.dsq_priq, &dsq->priq); 1653 RB_CLEAR_NODE(&p->scx.dsq_priq); 1654 p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; 1655 } 1656 1657 list_del_init(&p->scx.dsq_list.node); 1658 dsq_dec_nr(dsq, p); 1659 1660 if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { 1661 struct task_struct *first_task; 1662 1663 first_task = nldsq_next_task(dsq, NULL, false); 1664 rcu_assign_pointer(dsq->first_task, first_task); 1665 } 1666 } 1667 1668 static void dispatch_dequeue(struct rq *rq, struct task_struct *p) 1669 { 1670 struct scx_dispatch_q *dsq = p->scx.dsq; 1671 bool is_local = dsq == &rq->scx.local_dsq; 1672 1673 lockdep_assert_rq_held(rq); 1674 1675 if (!dsq) { 1676 /* 1677 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. 1678 * Unlinking is all that's needed to cancel. 
1679 */ 1680 if (unlikely(!list_empty(&p->scx.dsq_list.node))) 1681 list_del_init(&p->scx.dsq_list.node); 1682 1683 /* 1684 * When dispatching directly from the BPF scheduler to a local 1685 * DSQ, the task isn't associated with any DSQ but 1686 * @p->scx.holding_cpu may be set under the protection of 1687 * %SCX_OPSS_DISPATCHING. 1688 */ 1689 if (p->scx.holding_cpu >= 0) 1690 p->scx.holding_cpu = -1; 1691 1692 return; 1693 } 1694 1695 if (!is_local) 1696 raw_spin_lock(&dsq->lock); 1697 1698 /* 1699 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't 1700 * change underneath us. 1701 */ 1702 if (p->scx.holding_cpu < 0) { 1703 /* @p must still be on @dsq, dequeue */ 1704 task_unlink_from_dsq(p, dsq); 1705 } else { 1706 /* 1707 * We're racing against dispatch_to_local_dsq() which already 1708 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the 1709 * holding_cpu which tells dispatch_to_local_dsq() that it lost 1710 * the race. 1711 */ 1712 WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node)); 1713 p->scx.holding_cpu = -1; 1714 } 1715 p->scx.dsq = NULL; 1716 1717 if (!is_local) 1718 raw_spin_unlock(&dsq->lock); 1719 } 1720 1721 /* 1722 * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq 1723 * and dsq are locked. 
1724 */ 1725 static void dispatch_dequeue_locked(struct task_struct *p, 1726 struct scx_dispatch_q *dsq) 1727 { 1728 lockdep_assert_rq_held(task_rq(p)); 1729 lockdep_assert_held(&dsq->lock); 1730 1731 task_unlink_from_dsq(p, dsq); 1732 p->scx.dsq = NULL; 1733 } 1734 1735 static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, 1736 struct rq *rq, u64 dsq_id, 1737 s32 tcpu) 1738 { 1739 struct scx_dispatch_q *dsq; 1740 1741 if (dsq_id == SCX_DSQ_LOCAL) 1742 return &rq->scx.local_dsq; 1743 1744 if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 1745 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 1746 1747 if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) 1748 return find_global_dsq(sch, tcpu); 1749 1750 return &cpu_rq(cpu)->scx.local_dsq; 1751 } 1752 1753 if (dsq_id == SCX_DSQ_GLOBAL) 1754 dsq = find_global_dsq(sch, tcpu); 1755 else 1756 dsq = find_user_dsq(sch, dsq_id); 1757 1758 if (unlikely(!dsq)) { 1759 scx_error(sch, "non-existent DSQ 0x%llx", dsq_id); 1760 return find_global_dsq(sch, tcpu); 1761 } 1762 1763 return dsq; 1764 } 1765 1766 static void mark_direct_dispatch(struct scx_sched *sch, 1767 struct task_struct *ddsp_task, 1768 struct task_struct *p, u64 dsq_id, 1769 u64 enq_flags) 1770 { 1771 /* 1772 * Mark that dispatch already happened from ops.select_cpu() or 1773 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value 1774 * which can never match a valid task pointer. 
1775 */ 1776 __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); 1777 1778 /* @p must match the task on the enqueue path */ 1779 if (unlikely(p != ddsp_task)) { 1780 if (IS_ERR(ddsp_task)) 1781 scx_error(sch, "%s[%d] already direct-dispatched", 1782 p->comm, p->pid); 1783 else 1784 scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]", 1785 ddsp_task->comm, ddsp_task->pid, 1786 p->comm, p->pid); 1787 return; 1788 } 1789 1790 WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); 1791 WARN_ON_ONCE(p->scx.ddsp_enq_flags); 1792 1793 p->scx.ddsp_dsq_id = dsq_id; 1794 p->scx.ddsp_enq_flags = enq_flags; 1795 } 1796 1797 /* 1798 * Clear @p direct dispatch state when leaving the scheduler. 1799 * 1800 * Direct dispatch state must be cleared in the following cases: 1801 * - direct_dispatch(): cleared on the synchronous enqueue path, deferred 1802 * dispatch keeps the state until consumed 1803 * - process_ddsp_deferred_locals(): cleared after consuming deferred state, 1804 * - do_enqueue_task(): cleared on enqueue fallbacks where the dispatch 1805 * verdict is ignored (local/global/bypass) 1806 * - dequeue_task_scx(): cleared after dispatch_dequeue(), covering deferred 1807 * cancellation and holding_cpu races 1808 * - scx_disable_task(): cleared for queued wakeup tasks, which are excluded by 1809 * the scx_bypass() loop, so that stale state is not reused by a subsequent 1810 * scheduler instance 1811 */ 1812 static inline void clear_direct_dispatch(struct task_struct *p) 1813 { 1814 p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; 1815 p->scx.ddsp_enq_flags = 0; 1816 } 1817 1818 static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, 1819 u64 enq_flags) 1820 { 1821 struct rq *rq = task_rq(p); 1822 struct scx_dispatch_q *dsq = 1823 find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p)); 1824 u64 ddsp_enq_flags; 1825 1826 touch_core_sched_dispatch(rq, p); 1827 1828 p->scx.ddsp_enq_flags |= enq_flags; 1829 1830 /* 1831 * We are in the 
enqueue path with @rq locked and pinned, and thus can't 1832 * double lock a remote rq and enqueue to its local DSQ. For 1833 * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer 1834 * the enqueue so that it's executed when @rq can be unlocked. 1835 */ 1836 if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { 1837 unsigned long opss; 1838 1839 opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; 1840 1841 switch (opss & SCX_OPSS_STATE_MASK) { 1842 case SCX_OPSS_NONE: 1843 break; 1844 case SCX_OPSS_QUEUEING: 1845 /* 1846 * As @p was never passed to the BPF side, _release is 1847 * not strictly necessary. Still do it for consistency. 1848 */ 1849 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1850 break; 1851 default: 1852 WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()", 1853 p->comm, p->pid, opss); 1854 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1855 break; 1856 } 1857 1858 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); 1859 list_add_tail(&p->scx.dsq_list.node, 1860 &rq->scx.ddsp_deferred_locals); 1861 schedule_deferred_locked(rq); 1862 return; 1863 } 1864 1865 ddsp_enq_flags = p->scx.ddsp_enq_flags; 1866 clear_direct_dispatch(p); 1867 1868 dispatch_enqueue(sch, rq, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); 1869 } 1870 1871 static bool scx_rq_online(struct rq *rq) 1872 { 1873 /* 1874 * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates 1875 * the online state as seen from the BPF scheduler. cpu_active() test 1876 * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will 1877 * stay set until the current scheduling operation is complete even if 1878 * we aren't locking @rq. 
1879 */ 1880 return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq))); 1881 } 1882 1883 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, 1884 int sticky_cpu) 1885 { 1886 struct scx_sched *sch = scx_task_sched(p); 1887 struct task_struct **ddsp_taskp; 1888 struct scx_dispatch_q *dsq; 1889 unsigned long qseq; 1890 1891 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); 1892 1893 /* internal movements - rq migration / RESTORE */ 1894 if (sticky_cpu == cpu_of(rq)) 1895 goto local_norefill; 1896 1897 /* 1898 * Clear persistent TASK_IMMED for fresh enqueues, see dsq_inc_nr(). 1899 * Note that exiting and migration-disabled tasks that skip 1900 * ops.enqueue() below will lose IMMED protection unless 1901 * %SCX_OPS_ENQ_EXITING / %SCX_OPS_ENQ_MIGRATION_DISABLED are set. 1902 */ 1903 p->scx.flags &= ~SCX_TASK_IMMED; 1904 1905 /* 1906 * If !scx_rq_online(), we already told the BPF scheduler that the CPU 1907 * is offline and are just running the hotplug path. Don't bother the 1908 * BPF scheduler. 
1909 */ 1910 if (!scx_rq_online(rq)) 1911 goto local; 1912 1913 if (scx_bypassing(sch, cpu_of(rq))) { 1914 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 1915 goto bypass; 1916 } 1917 1918 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 1919 goto direct; 1920 1921 /* see %SCX_OPS_ENQ_EXITING */ 1922 if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) && 1923 unlikely(p->flags & PF_EXITING)) { 1924 __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1); 1925 goto local; 1926 } 1927 1928 /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ 1929 if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) && 1930 is_migration_disabled(p)) { 1931 __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); 1932 goto local; 1933 } 1934 1935 if (unlikely(!SCX_HAS_OP(sch, enqueue))) 1936 goto global; 1937 1938 /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ 1939 qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; 1940 1941 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 1942 atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); 1943 1944 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 1945 WARN_ON_ONCE(*ddsp_taskp); 1946 *ddsp_taskp = p; 1947 1948 SCX_CALL_OP_TASK(sch, enqueue, rq, p, enq_flags); 1949 1950 *ddsp_taskp = NULL; 1951 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 1952 goto direct; 1953 1954 /* 1955 * Task is now in BPF scheduler's custody. Set %SCX_TASK_IN_CUSTODY 1956 * so ops.dequeue() is called when it leaves custody. 1957 */ 1958 p->scx.flags |= SCX_TASK_IN_CUSTODY; 1959 1960 /* 1961 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or 1962 * dequeue may be waiting. The store_release matches their load_acquire. 
1963 */ 1964 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); 1965 return; 1966 1967 direct: 1968 direct_dispatch(sch, p, enq_flags); 1969 return; 1970 local_norefill: 1971 dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags); 1972 return; 1973 local: 1974 dsq = &rq->scx.local_dsq; 1975 goto enqueue; 1976 global: 1977 dsq = find_global_dsq(sch, task_cpu(p)); 1978 goto enqueue; 1979 bypass: 1980 dsq = bypass_enq_target_dsq(sch, task_cpu(p)); 1981 goto enqueue; 1982 1983 enqueue: 1984 /* 1985 * For task-ordering, slice refill must be treated as implying the end 1986 * of the current slice. Otherwise, the longer @p stays on the CPU, the 1987 * higher priority it becomes from scx_prio_less()'s POV. 1988 */ 1989 touch_core_sched(rq, p); 1990 refill_task_slice_dfl(sch, p); 1991 clear_direct_dispatch(p); 1992 dispatch_enqueue(sch, rq, dsq, p, enq_flags); 1993 } 1994 1995 static bool task_runnable(const struct task_struct *p) 1996 { 1997 return !list_empty(&p->scx.runnable_node); 1998 } 1999 2000 static void set_task_runnable(struct rq *rq, struct task_struct *p) 2001 { 2002 lockdep_assert_rq_held(rq); 2003 2004 if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { 2005 p->scx.runnable_at = jiffies; 2006 p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; 2007 } 2008 2009 /* 2010 * list_add_tail() must be used. scx_bypass() depends on tasks being 2011 * appended to the runnable_list. 
2012 */ 2013 list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); 2014 } 2015 2016 static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) 2017 { 2018 list_del_init(&p->scx.runnable_node); 2019 if (reset_runnable_at) 2020 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 2021 } 2022 2023 static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags) 2024 { 2025 struct scx_sched *sch = scx_task_sched(p); 2026 int sticky_cpu = p->scx.sticky_cpu; 2027 u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags; 2028 2029 if (enq_flags & ENQUEUE_WAKEUP) 2030 rq->scx.flags |= SCX_RQ_IN_WAKEUP; 2031 2032 /* 2033 * Restoring a running task will be immediately followed by 2034 * set_next_task_scx() which expects the task to not be on the BPF 2035 * scheduler as tasks can only start running through local DSQs. Force 2036 * direct-dispatch into the local DSQ by setting the sticky_cpu. 2037 */ 2038 if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) 2039 sticky_cpu = cpu_of(rq); 2040 2041 if (p->scx.flags & SCX_TASK_QUEUED) { 2042 WARN_ON_ONCE(!task_runnable(p)); 2043 goto out; 2044 } 2045 2046 set_task_runnable(rq, p); 2047 p->scx.flags |= SCX_TASK_QUEUED; 2048 rq->scx.nr_running++; 2049 add_nr_running(rq, 1); 2050 2051 if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p)) 2052 SCX_CALL_OP_TASK(sch, runnable, rq, p, enq_flags); 2053 2054 if (enq_flags & SCX_ENQ_WAKEUP) 2055 touch_core_sched(rq, p); 2056 2057 /* Start dl_server if this is the first task being enqueued */ 2058 if (rq->scx.nr_running == 1) 2059 dl_server_start(&rq->ext_server); 2060 2061 do_enqueue_task(rq, p, enq_flags, sticky_cpu); 2062 2063 if (sticky_cpu >= 0) 2064 p->scx.sticky_cpu = -1; 2065 out: 2066 rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; 2067 2068 if ((enq_flags & SCX_ENQ_CPU_SELECTED) && 2069 unlikely(cpu_of(rq) != p->scx.selected_cpu)) 2070 __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1); 2071 } 2072 2073 static void ops_dequeue(struct 
rq *rq, struct task_struct *p, u64 deq_flags) 2074 { 2075 struct scx_sched *sch = scx_task_sched(p); 2076 unsigned long opss; 2077 2078 /* dequeue is always temporary, don't reset runnable_at */ 2079 clr_task_runnable(p, false); 2080 2081 retry: 2082 /* acquire ensures that we see the preceding updates on QUEUED */ 2083 opss = atomic_long_read_acquire(&p->scx.ops_state); 2084 2085 switch (opss & SCX_OPSS_STATE_MASK) { 2086 case SCX_OPSS_NONE: 2087 break; 2088 case SCX_OPSS_QUEUEING: 2089 /* 2090 * QUEUEING is started and finished while holding @p's rq lock. 2091 * As we're holding the rq lock now, we shouldn't see QUEUEING. 2092 */ 2093 BUG(); 2094 case SCX_OPSS_QUEUED: 2095 /* 2096 * A queued task must always be in BPF scheduler's custody. If 2097 * SCX_TASK_IN_CUSTODY is clear, finish_dispatch() on another 2098 * CPU has already passed call_task_dequeue() (which clears the 2099 * flag), but has not yet written SCX_OPSS_NONE. That final 2100 * store does not require this rq's lock, so retrying with 2101 * cpu_relax() is bounded: we will observe NONE (or DISPATCHING, 2102 * handled by the fallthrough) on a subsequent iteration. 2103 */ 2104 if (unlikely(!(READ_ONCE(p->scx.flags) & SCX_TASK_IN_CUSTODY))) { 2105 cpu_relax(); 2106 goto retry; 2107 } 2108 2109 if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 2110 SCX_OPSS_NONE)) 2111 break; 2112 fallthrough; 2113 case SCX_OPSS_DISPATCHING: 2114 /* 2115 * If @p is being dispatched from the BPF scheduler to a DSQ, 2116 * wait for the transfer to complete so that @p doesn't get 2117 * added to its DSQ after dequeueing is complete. 2118 * 2119 * As we're waiting on DISPATCHING with the rq locked, the 2120 * dispatching side shouldn't try to lock the rq while 2121 * DISPATCHING is set. See dispatch_to_local_dsq(). 2122 * 2123 * DISPATCHING shouldn't have qseq set and control can reach 2124 * here with NONE @opss from the above QUEUED case block. 2125 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 
2126 */ 2127 wait_ops_state(p, SCX_OPSS_DISPATCHING); 2128 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 2129 break; 2130 } 2131 2132 /* 2133 * Call ops.dequeue() if the task is still in BPF custody. 2134 * 2135 * The code that clears ops_state to %SCX_OPSS_NONE does not always 2136 * clear %SCX_TASK_IN_CUSTODY: in dispatch_to_local_dsq(), when 2137 * we're moving a task that was in %SCX_OPSS_DISPATCHING to a 2138 * remote CPU's local DSQ, we only set ops_state to %SCX_OPSS_NONE 2139 * so that a concurrent dequeue can proceed, but we clear 2140 * %SCX_TASK_IN_CUSTODY only when we later enqueue or move the 2141 * task. So we can see NONE + IN_CUSTODY here and we must handle 2142 * it. Similarly, after waiting on %SCX_OPSS_DISPATCHING we see 2143 * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until 2144 * it is enqueued on the destination. 2145 */ 2146 call_task_dequeue(sch, rq, p, deq_flags); 2147 } 2148 2149 static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int core_deq_flags) 2150 { 2151 struct scx_sched *sch = scx_task_sched(p); 2152 u64 deq_flags = core_deq_flags; 2153 2154 /* 2155 * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property 2156 * change (not sleep or core-sched pick). 2157 */ 2158 if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC))) 2159 deq_flags |= SCX_DEQ_SCHED_CHANGE; 2160 2161 if (!(p->scx.flags & SCX_TASK_QUEUED)) { 2162 WARN_ON_ONCE(task_runnable(p)); 2163 return true; 2164 } 2165 2166 ops_dequeue(rq, p, deq_flags); 2167 2168 /* 2169 * A currently running task which is going off @rq first gets dequeued 2170 * and then stops running. As we want running <-> stopping transitions 2171 * to be contained within runnable <-> quiescent transitions, trigger 2172 * ->stopping() early here instead of in put_prev_task_scx(). 
2173 * 2174 * @p may go through multiple stopping <-> running transitions between 2175 * here and put_prev_task_scx() if task attribute changes occur while 2176 * balance_one() leaves @rq unlocked. However, they don't contain any 2177 * information meaningful to the BPF scheduler and can be suppressed by 2178 * skipping the callbacks if the task is !QUEUED. 2179 */ 2180 if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) { 2181 update_curr_scx(rq); 2182 SCX_CALL_OP_TASK(sch, stopping, rq, p, false); 2183 } 2184 2185 if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p)) 2186 SCX_CALL_OP_TASK(sch, quiescent, rq, p, deq_flags); 2187 2188 if (deq_flags & SCX_DEQ_SLEEP) 2189 p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; 2190 else 2191 p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; 2192 2193 p->scx.flags &= ~SCX_TASK_QUEUED; 2194 rq->scx.nr_running--; 2195 sub_nr_running(rq, 1); 2196 2197 dispatch_dequeue(rq, p); 2198 clear_direct_dispatch(p); 2199 return true; 2200 } 2201 2202 static void yield_task_scx(struct rq *rq) 2203 { 2204 struct task_struct *p = rq->donor; 2205 struct scx_sched *sch = scx_task_sched(p); 2206 2207 if (SCX_HAS_OP(sch, yield)) 2208 SCX_CALL_OP_2TASKS_RET(sch, yield, rq, p, NULL); 2209 else 2210 p->scx.slice = 0; 2211 } 2212 2213 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) 2214 { 2215 struct task_struct *from = rq->donor; 2216 struct scx_sched *sch = scx_task_sched(from); 2217 2218 if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to)) 2219 return SCX_CALL_OP_2TASKS_RET(sch, yield, rq, from, to); 2220 else 2221 return false; 2222 } 2223 2224 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) 2225 { 2226 /* 2227 * Preemption between SCX tasks is implemented by resetting the victim 2228 * task's slice to 0 and triggering reschedule on the target CPU. 2229 * Nothing to do. 
2230 */ 2231 if (p->sched_class == &ext_sched_class) 2232 return; 2233 2234 /* 2235 * Getting preempted by a higher-priority class. Reenqueue IMMED tasks. 2236 * This captures all preemption cases including: 2237 * 2238 * - A SCX task is currently running. 2239 * 2240 * - @rq is waking from idle due to a SCX task waking to it. 2241 * 2242 * - A higher-priority wakes up while SCX dispatch is in progress. 2243 */ 2244 if (rq->scx.nr_immed) 2245 schedule_reenq_local(rq, 0); 2246 } 2247 2248 static void move_local_task_to_local_dsq(struct scx_sched *sch, 2249 struct task_struct *p, u64 enq_flags, 2250 struct scx_dispatch_q *src_dsq, 2251 struct rq *dst_rq) 2252 { 2253 struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq; 2254 2255 /* @dsq is locked and @p is on @dst_rq */ 2256 lockdep_assert_held(&src_dsq->lock); 2257 lockdep_assert_rq_held(dst_rq); 2258 2259 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 2260 2261 if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) 2262 list_add(&p->scx.dsq_list.node, &dst_dsq->list); 2263 else 2264 list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); 2265 2266 dsq_inc_nr(dst_dsq, p, enq_flags); 2267 p->scx.dsq = dst_dsq; 2268 2269 local_dsq_post_enq(sch, dst_dsq, p, enq_flags); 2270 } 2271 2272 /** 2273 * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ 2274 * @p: task to move 2275 * @enq_flags: %SCX_ENQ_* 2276 * @src_rq: rq to move the task from, locked on entry, released on return 2277 * @dst_rq: rq to move the task into, locked on return 2278 * 2279 * Move @p which is currently on @src_rq to @dst_rq's local DSQ. 2280 */ 2281 static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, 2282 struct rq *src_rq, struct rq *dst_rq) 2283 { 2284 lockdep_assert_rq_held(src_rq); 2285 2286 /* 2287 * Set sticky_cpu before deactivate_task() to properly mark the 2288 * beginning of an SCX-internal migration. 
2289 */ 2290 p->scx.sticky_cpu = cpu_of(dst_rq); 2291 deactivate_task(src_rq, p, 0); 2292 set_task_cpu(p, cpu_of(dst_rq)); 2293 2294 raw_spin_rq_unlock(src_rq); 2295 raw_spin_rq_lock(dst_rq); 2296 2297 /* 2298 * We want to pass scx-specific enq_flags but activate_task() will 2299 * truncate the upper 32 bit. As we own @rq, we can pass them through 2300 * @rq->scx.extra_enq_flags instead. 2301 */ 2302 WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); 2303 WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); 2304 dst_rq->scx.extra_enq_flags = enq_flags; 2305 activate_task(dst_rq, p, 0); 2306 dst_rq->scx.extra_enq_flags = 0; 2307 } 2308 2309 /* 2310 * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two 2311 * differences: 2312 * 2313 * - is_cpu_allowed() asks "Can this task run on this CPU?" while 2314 * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to 2315 * this CPU?". 2316 * 2317 * While migration is disabled, is_cpu_allowed() has to say "yes" as the task 2318 * must be allowed to finish on the CPU that it's currently on regardless of 2319 * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the 2320 * BPF scheduler shouldn't attempt to migrate a task which has migration 2321 * disabled. 2322 * 2323 * - The BPF scheduler is bypassed while the rq is offline and we can always say 2324 * no to the BPF scheduler initiated migrations while offline. 2325 * 2326 * The caller must ensure that @p and @rq are on different CPUs. 2327 */ 2328 static bool task_can_run_on_remote_rq(struct scx_sched *sch, 2329 struct task_struct *p, struct rq *rq, 2330 bool enforce) 2331 { 2332 s32 cpu = cpu_of(rq); 2333 2334 WARN_ON_ONCE(task_cpu(p) == cpu); 2335 2336 /* 2337 * If @p has migration disabled, @p->cpus_ptr is updated to contain only 2338 * the pinned CPU in migrate_disable_switch() while @p is being switched 2339 * out. 
However, put_prev_task_scx() is called before @p->cpus_ptr is 2340 * updated and thus another CPU may see @p on a DSQ inbetween leading to 2341 * @p passing the below task_allowed_on_cpu() check while migration is 2342 * disabled. 2343 * 2344 * Test the migration disabled state first as the race window is narrow 2345 * and the BPF scheduler failing to check migration disabled state can 2346 * easily be masked if task_allowed_on_cpu() is done first. 2347 */ 2348 if (unlikely(is_migration_disabled(p))) { 2349 if (enforce) 2350 scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", 2351 p->comm, p->pid, task_cpu(p), cpu); 2352 return false; 2353 } 2354 2355 /* 2356 * We don't require the BPF scheduler to avoid dispatching to offline 2357 * CPUs mostly for convenience but also because CPUs can go offline 2358 * between scx_bpf_dsq_insert() calls and here. Trigger error iff the 2359 * picked CPU is outside the allowed mask. 2360 */ 2361 if (!task_allowed_on_cpu(p, cpu)) { 2362 if (enforce) 2363 scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", 2364 cpu, p->comm, p->pid); 2365 return false; 2366 } 2367 2368 if (!scx_rq_online(rq)) { 2369 if (enforce) 2370 __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); 2371 return false; 2372 } 2373 2374 return true; 2375 } 2376 2377 /** 2378 * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq 2379 * @p: target task 2380 * @dsq: locked DSQ @p is currently on 2381 * @src_rq: rq @p is currently on, stable with @dsq locked 2382 * 2383 * Called with @dsq locked but no rq's locked. We want to move @p to a different 2384 * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is 2385 * required when transferring into a local DSQ. 
Even when transferring into a 2386 * non-local DSQ, it's better to use the same mechanism to protect against 2387 * dequeues and maintain the invariant that @p->scx.dsq can only change while 2388 * @src_rq is locked, which e.g. scx_dump_task() depends on. 2389 * 2390 * We want to grab @src_rq but that can deadlock if we try while locking @dsq, 2391 * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As 2392 * this may race with dequeue, which can't drop the rq lock or fail, do a little 2393 * dancing from our side. 2394 * 2395 * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets 2396 * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu 2397 * would be cleared to -1. While other cpus may have updated it to different 2398 * values afterwards, as this operation can't be preempted or recurse, the 2399 * holding_cpu can never become this CPU again before we're done. Thus, we can 2400 * tell whether we lost to dequeue by testing whether the holding_cpu still 2401 * points to this CPU. See dispatch_dequeue() for the counterpart. 2402 * 2403 * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is 2404 * still valid. %false if lost to dequeue. 
2405 */ 2406 static bool unlink_dsq_and_lock_src_rq(struct task_struct *p, 2407 struct scx_dispatch_q *dsq, 2408 struct rq *src_rq) 2409 { 2410 s32 cpu = raw_smp_processor_id(); 2411 2412 lockdep_assert_held(&dsq->lock); 2413 2414 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 2415 task_unlink_from_dsq(p, dsq); 2416 p->scx.holding_cpu = cpu; 2417 2418 raw_spin_unlock(&dsq->lock); 2419 raw_spin_rq_lock(src_rq); 2420 2421 /* task_rq couldn't have changed if we're still the holding cpu */ 2422 return likely(p->scx.holding_cpu == cpu) && 2423 !WARN_ON_ONCE(src_rq != task_rq(p)); 2424 } 2425 2426 static bool consume_remote_task(struct rq *this_rq, 2427 struct task_struct *p, u64 enq_flags, 2428 struct scx_dispatch_q *dsq, struct rq *src_rq) 2429 { 2430 raw_spin_rq_unlock(this_rq); 2431 2432 if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) { 2433 move_remote_task_to_local_dsq(p, enq_flags, src_rq, this_rq); 2434 return true; 2435 } else { 2436 raw_spin_rq_unlock(src_rq); 2437 raw_spin_rq_lock(this_rq); 2438 return false; 2439 } 2440 } 2441 2442 /** 2443 * move_task_between_dsqs() - Move a task from one DSQ to another 2444 * @sch: scx_sched being operated on 2445 * @p: target task 2446 * @enq_flags: %SCX_ENQ_* 2447 * @src_dsq: DSQ @p is currently on, must not be a local DSQ 2448 * @dst_dsq: DSQ @p is being moved to, can be any DSQ 2449 * 2450 * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local 2451 * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq 2452 * will change. As @p's task_rq is locked, this function doesn't need to use the 2453 * holding_cpu mechanism. 2454 * 2455 * On return, @src_dsq is unlocked and only @p's new task_rq, which is the 2456 * return value, is locked. 
2457 */ 2458 static struct rq *move_task_between_dsqs(struct scx_sched *sch, 2459 struct task_struct *p, u64 enq_flags, 2460 struct scx_dispatch_q *src_dsq, 2461 struct scx_dispatch_q *dst_dsq) 2462 { 2463 struct rq *src_rq = task_rq(p), *dst_rq; 2464 2465 BUG_ON(src_dsq->id == SCX_DSQ_LOCAL); 2466 lockdep_assert_held(&src_dsq->lock); 2467 lockdep_assert_rq_held(src_rq); 2468 2469 if (dst_dsq->id == SCX_DSQ_LOCAL) { 2470 dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); 2471 if (src_rq != dst_rq && 2472 unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { 2473 dst_dsq = find_global_dsq(sch, task_cpu(p)); 2474 dst_rq = src_rq; 2475 enq_flags |= SCX_ENQ_GDSQ_FALLBACK; 2476 } 2477 } else { 2478 /* no need to migrate if destination is a non-local DSQ */ 2479 dst_rq = src_rq; 2480 } 2481 2482 /* 2483 * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different 2484 * CPU, @p will be migrated. 2485 */ 2486 if (dst_dsq->id == SCX_DSQ_LOCAL) { 2487 /* @p is going from a non-local DSQ to a local DSQ */ 2488 if (src_rq == dst_rq) { 2489 task_unlink_from_dsq(p, src_dsq); 2490 move_local_task_to_local_dsq(sch, p, enq_flags, 2491 src_dsq, dst_rq); 2492 raw_spin_unlock(&src_dsq->lock); 2493 } else { 2494 raw_spin_unlock(&src_dsq->lock); 2495 move_remote_task_to_local_dsq(p, enq_flags, 2496 src_rq, dst_rq); 2497 } 2498 } else { 2499 /* 2500 * @p is going from a non-local DSQ to a non-local DSQ. As 2501 * $src_dsq is already locked, do an abbreviated dequeue. 
2502 */ 2503 dispatch_dequeue_locked(p, src_dsq); 2504 raw_spin_unlock(&src_dsq->lock); 2505 2506 dispatch_enqueue(sch, dst_rq, dst_dsq, p, enq_flags); 2507 } 2508 2509 return dst_rq; 2510 } 2511 2512 static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, 2513 struct scx_dispatch_q *dsq, u64 enq_flags) 2514 { 2515 struct task_struct *p; 2516 retry: 2517 /* 2518 * The caller can't expect to successfully consume a task if the task's 2519 * addition to @dsq isn't guaranteed to be visible somehow. Test 2520 * @dsq->list without locking and skip if it seems empty. 2521 */ 2522 if (list_empty(&dsq->list)) 2523 return false; 2524 2525 raw_spin_lock(&dsq->lock); 2526 2527 nldsq_for_each_task(p, dsq) { 2528 struct rq *task_rq = task_rq(p); 2529 2530 /* 2531 * This loop can lead to multiple lockup scenarios, e.g. the BPF 2532 * scheduler can put an enormous number of affinitized tasks into 2533 * a contended DSQ, or the outer retry loop can repeatedly race 2534 * against scx_bypass() dequeueing tasks from @dsq trying to put 2535 * the system into the bypass mode. This can easily live-lock the 2536 * machine. If aborting, exit from all non-bypass DSQs. 
/*
 * Try to consume a task from the global DSQ of the NUMA node that @rq's CPU
 * belongs to. Returns the result of consume_dispatch_q().
 */
static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
{
	int node = cpu_to_node(cpu_of(rq));

	return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq, 0);
}

/**
 * dispatch_to_local_dsq - Dispatch a task to a local dsq
 * @sch: scx_sched being operated on
 * @rq: current rq which is locked
 * @dst_dsq: destination DSQ
 * @p: task to dispatch
 * @enq_flags: %SCX_ENQ_*
 *
 * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
 * DSQ. This function performs all the synchronization dancing needed because
 * local DSQs are protected with rq locks.
 *
 * The caller must have exclusive ownership of @p (e.g. through
 * %SCX_OPSS_DISPATCHING).
 *
 * @rq's lock may be dropped and re-acquired along the way; it is held again
 * on return. If @p can't be moved to @dst_dsq's rq, it is queued on a global
 * DSQ with %SCX_ENQ_GDSQ_FALLBACK instead. If @p is lost to a racing dequeue
 * after DISPATCHING is released, nothing is enqueued.
 */
static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
				  struct scx_dispatch_q *dst_dsq,
				  struct task_struct *p, u64 enq_flags)
{
	struct rq *src_rq = task_rq(p);
	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
	/* tracks which rq lock this CPU currently holds */
	struct rq *locked_rq = rq;

	/*
	 * We're synchronized against dequeue through DISPATCHING. As @p can't
	 * be dequeued, its task_rq and cpus_allowed are stable too.
	 *
	 * If dispatching to @rq that @p is already on, no lock dancing needed.
	 */
	if (rq == src_rq && rq == dst_rq) {
		dispatch_enqueue(sch, rq, dst_dsq, p,
				 enq_flags | SCX_ENQ_CLEAR_OPSS);
		return;
	}

	/*
	 * A cross-CPU move which isn't permitted falls back to a global DSQ.
	 * With enforce == true, task_can_run_on_remote_rq() reports the
	 * failure itself.
	 */
	if (src_rq != dst_rq &&
	    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
		dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p,
				 enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK);
		return;
	}

	/*
	 * @p is on a possibly remote @src_rq which we need to lock to move the
	 * task. If dequeue is in progress, it'd be locking @src_rq and waiting
	 * on DISPATCHING, so we can't grab @src_rq lock while holding
	 * DISPATCHING.
	 *
	 * As DISPATCHING guarantees that @p is wholly ours, we can pretend that
	 * we're moving from a DSQ and use the same mechanism - mark the task
	 * under transfer with holding_cpu, release DISPATCHING and then follow
	 * the same protocol. See unlink_dsq_and_lock_src_rq().
	 */
	p->scx.holding_cpu = raw_smp_processor_id();

	/* store_release ensures that dequeue sees the above */
	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);

	/* switch to @src_rq lock */
	if (locked_rq != src_rq) {
		raw_spin_rq_unlock(locked_rq);
		locked_rq = src_rq;
		raw_spin_rq_lock(src_rq);
	}

	/* task_rq couldn't have changed if we're still the holding cpu */
	if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
	    !WARN_ON_ONCE(src_rq != task_rq(p))) {
		/*
		 * If @p is staying on the same rq, there's no need to go
		 * through the full deactivate/activate cycle. Optimize by
		 * abbreviating move_remote_task_to_local_dsq().
		 */
		if (src_rq == dst_rq) {
			p->scx.holding_cpu = -1;
			dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p,
					 enq_flags);
		} else {
			move_remote_task_to_local_dsq(p, enq_flags,
						      src_rq, dst_rq);
			/* task has been moved to dst_rq, which is now locked */
			locked_rq = dst_rq;
		}

		/*
		 * If @p's class ranks above whatever @dst_rq is currently
		 * running (e.g. the destination CPU is idle), trigger a
		 * reschedule there.
		 */
		if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
			resched_curr(dst_rq);
	}

	/* switch back to @rq lock */
	if (locked_rq != rq) {
		raw_spin_rq_unlock(locked_rq);
		raw_spin_rq_lock(rq);
	}
}
2694 */ 2695 opss = atomic_long_read(&p->scx.ops_state); 2696 2697 switch (opss & SCX_OPSS_STATE_MASK) { 2698 case SCX_OPSS_DISPATCHING: 2699 case SCX_OPSS_NONE: 2700 /* someone else already got to it */ 2701 return; 2702 case SCX_OPSS_QUEUED: 2703 /* 2704 * If qseq doesn't match, @p has gone through at least one 2705 * dispatch/dequeue and re-enqueue cycle between 2706 * scx_bpf_dsq_insert() and here and we have no claim on it. 2707 */ 2708 if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) 2709 return; 2710 2711 /* see SCX_EV_INSERT_NOT_OWNED definition */ 2712 if (unlikely(!scx_task_on_sched(sch, p))) { 2713 __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); 2714 return; 2715 } 2716 2717 /* 2718 * While we know @p is accessible, we don't yet have a claim on 2719 * it - the BPF scheduler is allowed to dispatch tasks 2720 * spuriously and there can be a racing dequeue attempt. Let's 2721 * claim @p by atomically transitioning it from QUEUED to 2722 * DISPATCHING. 2723 */ 2724 if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 2725 SCX_OPSS_DISPATCHING))) 2726 break; 2727 goto retry; 2728 case SCX_OPSS_QUEUEING: 2729 /* 2730 * do_enqueue_task() is in the process of transferring the task 2731 * to the BPF scheduler while holding @p's rq lock. As we aren't 2732 * holding any kernel or BPF resource that the enqueue path may 2733 * depend upon, it's safe to wait. 
2734 */ 2735 wait_ops_state(p, opss); 2736 goto retry; 2737 } 2738 2739 BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); 2740 2741 dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, task_cpu(p)); 2742 2743 if (dsq->id == SCX_DSQ_LOCAL) 2744 dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); 2745 else 2746 dispatch_enqueue(sch, rq, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); 2747 } 2748 2749 static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq) 2750 { 2751 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 2752 u32 u; 2753 2754 for (u = 0; u < dspc->cursor; u++) { 2755 struct scx_dsp_buf_ent *ent = &dspc->buf[u]; 2756 2757 finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id, 2758 ent->enq_flags); 2759 } 2760 2761 dspc->nr_tasks += dspc->cursor; 2762 dspc->cursor = 0; 2763 } 2764 2765 static inline void maybe_queue_balance_callback(struct rq *rq) 2766 { 2767 lockdep_assert_rq_held(rq); 2768 2769 if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING)) 2770 return; 2771 2772 queue_balance_callback(rq, &rq->scx.deferred_bal_cb, 2773 deferred_bal_cb_workfn); 2774 2775 rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING; 2776 } 2777 2778 /* 2779 * One user of this function is scx_bpf_dispatch() which can be called 2780 * recursively as sub-sched dispatches nest. Always inline to reduce stack usage 2781 * from the call frame. 
2782 */ 2783 static __always_inline bool 2784 scx_dispatch_sched(struct scx_sched *sch, struct rq *rq, 2785 struct task_struct *prev, bool nested) 2786 { 2787 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 2788 int nr_loops = SCX_DSP_MAX_LOOPS; 2789 s32 cpu = cpu_of(rq); 2790 bool prev_on_sch = (prev->sched_class == &ext_sched_class) && 2791 scx_task_on_sched(sch, prev); 2792 2793 if (consume_global_dsq(sch, rq)) 2794 return true; 2795 2796 if (bypass_dsp_enabled(sch)) { 2797 /* if @sch is bypassing, only the bypass DSQs are active */ 2798 if (scx_bypassing(sch, cpu)) 2799 return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0); 2800 2801 #ifdef CONFIG_EXT_SUB_SCHED 2802 /* 2803 * If @sch isn't bypassing but its children are, @sch is 2804 * responsible for making forward progress for both its own 2805 * tasks that aren't bypassing and the bypassing descendants' 2806 * tasks. The following implements a simple built-in behavior - 2807 * let each CPU try to run the bypass DSQ every Nth time. 2808 * 2809 * Later, if necessary, we can add an ops flag to suppress the 2810 * auto-consumption and a kfunc to consume the bypass DSQ and, 2811 * so that the BPF scheduler can fully control scheduling of 2812 * bypassed tasks. 2813 */ 2814 struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); 2815 2816 if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) && 2817 consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0)) { 2818 __scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1); 2819 return true; 2820 } 2821 #endif /* CONFIG_EXT_SUB_SCHED */ 2822 } 2823 2824 if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq)) 2825 return false; 2826 2827 dspc->rq = rq; 2828 2829 /* 2830 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, 2831 * the local DSQ might still end up empty after a successful 2832 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() 2833 * produced some tasks, retry. 
The BPF scheduler may depend on this 2834 * looping behavior to simplify its implementation. 2835 */ 2836 do { 2837 dspc->nr_tasks = 0; 2838 2839 if (nested) { 2840 SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL); 2841 } else { 2842 /* stash @prev so that nested invocations can access it */ 2843 rq->scx.sub_dispatch_prev = prev; 2844 SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL); 2845 rq->scx.sub_dispatch_prev = NULL; 2846 } 2847 2848 flush_dispatch_buf(sch, rq); 2849 2850 if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) { 2851 rq->scx.flags |= SCX_RQ_BAL_KEEP; 2852 return true; 2853 } 2854 if (rq->scx.local_dsq.nr) 2855 return true; 2856 if (consume_global_dsq(sch, rq)) 2857 return true; 2858 2859 /* 2860 * ops.dispatch() can trap us in this loop by repeatedly 2861 * dispatching ineligible tasks. Break out once in a while to 2862 * allow the watchdog to run. As IRQ can't be enabled in 2863 * balance(), we want to complete this scheduling cycle and then 2864 * start a new one. IOW, we want to call resched_curr() on the 2865 * next, most likely idle, task, not the current one. Use 2866 * __scx_bpf_kick_cpu() for deferred kicking. 2867 */ 2868 if (unlikely(!--nr_loops)) { 2869 scx_kick_cpu(sch, cpu, 0); 2870 break; 2871 } 2872 } while (dspc->nr_tasks); 2873 2874 /* 2875 * Prevent the CPU from going idle while bypassed descendants have tasks 2876 * queued. Without this fallback, bypassed tasks could stall if the host 2877 * scheduler's ops.dispatch() doesn't yield any tasks. 
2878 */ 2879 if (bypass_dsp_enabled(sch)) 2880 return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0); 2881 2882 return false; 2883 } 2884 2885 static int balance_one(struct rq *rq, struct task_struct *prev) 2886 { 2887 struct scx_sched *sch = scx_root; 2888 s32 cpu = cpu_of(rq); 2889 2890 lockdep_assert_rq_held(rq); 2891 rq->scx.flags |= SCX_RQ_IN_BALANCE; 2892 rq->scx.flags &= ~SCX_RQ_BAL_KEEP; 2893 2894 if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && 2895 unlikely(rq->scx.cpu_released)) { 2896 /* 2897 * If the previous sched_class for the current CPU was not SCX, 2898 * notify the BPF scheduler that it again has control of the 2899 * core. This callback complements ->cpu_release(), which is 2900 * emitted in switch_class(). 2901 */ 2902 if (SCX_HAS_OP(sch, cpu_acquire)) 2903 SCX_CALL_OP(sch, cpu_acquire, rq, cpu, NULL); 2904 rq->scx.cpu_released = false; 2905 } 2906 2907 if (prev->sched_class == &ext_sched_class) { 2908 update_curr_scx(rq); 2909 2910 /* 2911 * If @prev is runnable & has slice left, it has priority and 2912 * fetching more just increases latency for the fetched tasks. 2913 * Tell pick_task_scx() to keep running @prev. If the BPF 2914 * scheduler wants to handle this explicitly, it should 2915 * implement ->cpu_release(). 2916 * 2917 * See scx_disable_workfn() for the explanation on the bypassing 2918 * test. 2919 */ 2920 if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice && 2921 !scx_bypassing(sch, cpu)) { 2922 rq->scx.flags |= SCX_RQ_BAL_KEEP; 2923 goto has_tasks; 2924 } 2925 } 2926 2927 /* if there already are tasks to run, nothing to do */ 2928 if (rq->scx.local_dsq.nr) 2929 goto has_tasks; 2930 2931 if (scx_dispatch_sched(sch, rq, prev, false)) 2932 goto has_tasks; 2933 2934 /* 2935 * Didn't find another task to run. Keep running @prev unless 2936 * %SCX_OPS_ENQ_LAST is in effect. 
2937 */ 2938 if ((prev->scx.flags & SCX_TASK_QUEUED) && 2939 (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) { 2940 rq->scx.flags |= SCX_RQ_BAL_KEEP; 2941 __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1); 2942 goto has_tasks; 2943 } 2944 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; 2945 return false; 2946 2947 has_tasks: 2948 /* 2949 * @rq may have extra IMMED tasks without reenq scheduled: 2950 * 2951 * - rq_is_open() can't reliably tell when and how slice is going to be 2952 * modified for $curr and allows IMMED tasks to be queued while 2953 * dispatch is in progress. 2954 * 2955 * - A non-IMMED HEAD task can get queued in front of an IMMED task 2956 * between the IMMED queueing and the subsequent scheduling event. 2957 */ 2958 if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed)) 2959 schedule_reenq_local(rq, 0); 2960 2961 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; 2962 return true; 2963 } 2964 2965 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) 2966 { 2967 struct scx_sched *sch = scx_task_sched(p); 2968 2969 if (p->scx.flags & SCX_TASK_QUEUED) { 2970 /* 2971 * Core-sched might decide to execute @p before it is 2972 * dispatched. Call ops_dequeue() to notify the BPF scheduler. 2973 */ 2974 ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC); 2975 dispatch_dequeue(rq, p); 2976 } 2977 2978 p->se.exec_start = rq_clock_task(rq); 2979 2980 /* see dequeue_task_scx() on why we skip when !QUEUED */ 2981 if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED)) 2982 SCX_CALL_OP_TASK(sch, running, rq, p); 2983 2984 clr_task_runnable(p, true); 2985 2986 /* 2987 * @p is getting newly scheduled or got kicked after someone updated its 2988 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). 
2989 */ 2990 if ((p->scx.slice == SCX_SLICE_INF) != 2991 (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { 2992 if (p->scx.slice == SCX_SLICE_INF) 2993 rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; 2994 else 2995 rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; 2996 2997 sched_update_tick_dependency(rq); 2998 2999 /* 3000 * For now, let's refresh the load_avgs just when transitioning 3001 * in and out of nohz. In the future, we might want to add a 3002 * mechanism which calls the following periodically on 3003 * tick-stopped CPUs. 3004 */ 3005 update_other_load_avgs(rq); 3006 } 3007 } 3008 3009 static enum scx_cpu_preempt_reason 3010 preempt_reason_from_class(const struct sched_class *class) 3011 { 3012 if (class == &stop_sched_class) 3013 return SCX_CPU_PREEMPT_STOP; 3014 if (class == &dl_sched_class) 3015 return SCX_CPU_PREEMPT_DL; 3016 if (class == &rt_sched_class) 3017 return SCX_CPU_PREEMPT_RT; 3018 return SCX_CPU_PREEMPT_UNKNOWN; 3019 } 3020 3021 static void switch_class(struct rq *rq, struct task_struct *next) 3022 { 3023 struct scx_sched *sch = scx_root; 3024 const struct sched_class *next_class = next->sched_class; 3025 3026 if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) 3027 return; 3028 3029 /* 3030 * The callback is conceptually meant to convey that the CPU is no 3031 * longer under the control of SCX. Therefore, don't invoke the callback 3032 * if the next class is below SCX (in which case the BPF scheduler has 3033 * actively decided not to schedule any tasks on the CPU). 3034 */ 3035 if (sched_class_above(&ext_sched_class, next_class)) 3036 return; 3037 3038 /* 3039 * At this point we know that SCX was preempted by a higher priority 3040 * sched_class, so invoke the ->cpu_release() callback if we have not 3041 * done so already. We only send the callback once between SCX being 3042 * preempted, and it regaining control of the CPU. 3043 * 3044 * ->cpu_release() complements ->cpu_acquire(), which is emitted the 3045 * next time that balance_one() is invoked. 
3046 */ 3047 if (!rq->scx.cpu_released) { 3048 if (SCX_HAS_OP(sch, cpu_release)) { 3049 struct scx_cpu_release_args args = { 3050 .reason = preempt_reason_from_class(next_class), 3051 .task = next, 3052 }; 3053 3054 SCX_CALL_OP(sch, cpu_release, rq, cpu_of(rq), &args); 3055 } 3056 rq->scx.cpu_released = true; 3057 } 3058 } 3059 3060 static void put_prev_task_scx(struct rq *rq, struct task_struct *p, 3061 struct task_struct *next) 3062 { 3063 struct scx_sched *sch = scx_task_sched(p); 3064 3065 /* see kick_sync_wait_bal_cb() */ 3066 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3067 3068 update_curr_scx(rq); 3069 3070 /* see dequeue_task_scx() on why we skip when !QUEUED */ 3071 if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED)) 3072 SCX_CALL_OP_TASK(sch, stopping, rq, p, true); 3073 3074 if (p->scx.flags & SCX_TASK_QUEUED) { 3075 set_task_runnable(rq, p); 3076 3077 /* 3078 * If @p has slice left and is being put, @p is getting 3079 * preempted by a higher priority scheduler class or core-sched 3080 * forcing a different task. Leave it at the head of the local 3081 * DSQ unless it was an IMMED task. IMMED tasks should not 3082 * linger on a busy CPU, reenqueue them to the BPF scheduler. 3083 */ 3084 if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) { 3085 if (p->scx.flags & SCX_TASK_IMMED) { 3086 p->scx.flags |= SCX_TASK_REENQ_PREEMPTED; 3087 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); 3088 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 3089 } else { 3090 dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD); 3091 } 3092 goto switch_class; 3093 } 3094 3095 /* 3096 * If @p is runnable but we're about to enter a lower 3097 * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell 3098 * ops.enqueue() that @p is the only one available for this cpu, 3099 * which should trigger an explicit follow-up scheduling event. 
3100 */ 3101 if (next && sched_class_above(&ext_sched_class, next->sched_class)) { 3102 WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); 3103 do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); 3104 } else { 3105 do_enqueue_task(rq, p, 0, -1); 3106 } 3107 } 3108 3109 switch_class: 3110 if (next && next->sched_class != &ext_sched_class) 3111 switch_class(rq, next); 3112 } 3113 3114 static void kick_sync_wait_bal_cb(struct rq *rq) 3115 { 3116 struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); 3117 unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; 3118 bool waited; 3119 s32 cpu; 3120 3121 /* 3122 * Drop rq lock and enable IRQs while waiting. IRQs must be enabled 3123 * — a target CPU may be waiting for us to process an IPI (e.g. TLB 3124 * flush) while we wait for its kick_sync to advance. 3125 * 3126 * Also, keep advancing our own kick_sync so that new kick_sync waits 3127 * targeting us, which can start after we drop the lock, cannot form 3128 * cyclic dependencies. 3129 */ 3130 retry: 3131 waited = false; 3132 for_each_cpu(cpu, rq->scx.cpus_to_sync) { 3133 /* 3134 * smp_load_acquire() pairs with smp_store_release() on 3135 * kick_sync updates on the target CPUs. 
3136 */ 3137 if (cpu == cpu_of(rq) || 3138 smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { 3139 cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); 3140 continue; 3141 } 3142 3143 raw_spin_rq_unlock_irq(rq); 3144 while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { 3145 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3146 cpu_relax(); 3147 } 3148 raw_spin_rq_lock_irq(rq); 3149 waited = true; 3150 } 3151 3152 if (waited) 3153 goto retry; 3154 } 3155 3156 static struct task_struct *first_local_task(struct rq *rq) 3157 { 3158 return list_first_entry_or_null(&rq->scx.local_dsq.list, 3159 struct task_struct, scx.dsq_list.node); 3160 } 3161 3162 static struct task_struct * 3163 do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) 3164 { 3165 struct task_struct *prev = rq->curr; 3166 bool keep_prev; 3167 struct task_struct *p; 3168 3169 /* see kick_sync_wait_bal_cb() */ 3170 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3171 3172 rq_modified_begin(rq, &ext_sched_class); 3173 3174 rq_unpin_lock(rq, rf); 3175 balance_one(rq, prev); 3176 rq_repin_lock(rq, rf); 3177 maybe_queue_balance_callback(rq); 3178 3179 /* 3180 * Defer to a balance callback which can drop rq lock and enable 3181 * IRQs. Waiting directly in the pick path would deadlock against 3182 * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. 3183 */ 3184 if (unlikely(rq->scx.kick_sync_pending)) { 3185 rq->scx.kick_sync_pending = false; 3186 queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, 3187 kick_sync_wait_bal_cb); 3188 } 3189 3190 /* 3191 * If any higher-priority sched class enqueued a runnable task on 3192 * this rq during balance_one(), abort and return RETRY_TASK, so 3193 * that the scheduler loop can restart. 3194 * 3195 * If @force_scx is true, always try to pick a SCHED_EXT task, 3196 * regardless of any higher-priority sched classes activity. 
3197 */ 3198 if (!force_scx && rq_modified_above(rq, &ext_sched_class)) 3199 return RETRY_TASK; 3200 3201 keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; 3202 if (unlikely(keep_prev && 3203 prev->sched_class != &ext_sched_class)) { 3204 WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); 3205 keep_prev = false; 3206 } 3207 3208 /* 3209 * If balance_one() is telling us to keep running @prev, replenish slice 3210 * if necessary and keep running @prev. Otherwise, pop the first one 3211 * from the local DSQ. 3212 */ 3213 if (keep_prev) { 3214 p = prev; 3215 if (!p->scx.slice) 3216 refill_task_slice_dfl(scx_task_sched(p), p); 3217 } else { 3218 p = first_local_task(rq); 3219 if (!p) 3220 return NULL; 3221 3222 if (unlikely(!p->scx.slice)) { 3223 struct scx_sched *sch = scx_task_sched(p); 3224 3225 if (!scx_bypassing(sch, cpu_of(rq)) && 3226 !sch->warned_zero_slice) { 3227 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", 3228 p->comm, p->pid, __func__); 3229 sch->warned_zero_slice = true; 3230 } 3231 refill_task_slice_dfl(sch, p); 3232 } 3233 } 3234 3235 return p; 3236 } 3237 3238 static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) 3239 { 3240 return do_pick_task_scx(rq, rf, false); 3241 } 3242 3243 /* 3244 * Select the next task to run from the ext scheduling class. 3245 * 3246 * Use do_pick_task_scx() directly with @force_scx enabled, since the 3247 * dl_server must always select a sched_ext task. 3248 */ 3249 static struct task_struct * 3250 ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) 3251 { 3252 if (!scx_enabled()) 3253 return NULL; 3254 3255 return do_pick_task_scx(dl_se->rq, rf, true); 3256 } 3257 3258 /* 3259 * Initialize the ext server deadline entity. 
3260 */ 3261 void ext_server_init(struct rq *rq) 3262 { 3263 struct sched_dl_entity *dl_se = &rq->ext_server; 3264 3265 init_dl_entity(dl_se); 3266 3267 dl_server_init(dl_se, rq, ext_server_pick_task); 3268 } 3269 3270 #ifdef CONFIG_SCHED_CORE 3271 /** 3272 * scx_prio_less - Task ordering for core-sched 3273 * @a: task A 3274 * @b: task B 3275 * @in_fi: in forced idle state 3276 * 3277 * Core-sched is implemented as an additional scheduling layer on top of the 3278 * usual sched_class'es and needs to find out the expected task ordering. For 3279 * SCX, core-sched calls this function to interrogate the task ordering. 3280 * 3281 * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used 3282 * to implement the default task ordering. The older the timestamp, the higher 3283 * priority the task - the global FIFO ordering matching the default scheduling 3284 * behavior. 3285 * 3286 * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to 3287 * implement FIFO ordering within each local DSQ. See pick_task_scx(). 3288 */ 3289 bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, 3290 bool in_fi) 3291 { 3292 struct scx_sched *sch_a = scx_task_sched(a); 3293 struct scx_sched *sch_b = scx_task_sched(b); 3294 3295 /* 3296 * The const qualifiers are dropped from task_struct pointers when 3297 * calling ops.core_sched_before(). Accesses are controlled by the 3298 * verifier. 
3299 */ 3300 if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) && 3301 !scx_bypassing(sch_a, task_cpu(a))) 3302 return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before, 3303 task_rq(a), 3304 (struct task_struct *)a, 3305 (struct task_struct *)b); 3306 else 3307 return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); 3308 } 3309 #endif /* CONFIG_SCHED_CORE */ 3310 3311 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) 3312 { 3313 struct scx_sched *sch = scx_task_sched(p); 3314 bool bypassing; 3315 3316 /* 3317 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it 3318 * can be a good migration opportunity with low cache and memory 3319 * footprint. Returning a CPU different than @prev_cpu triggers 3320 * immediate rq migration. However, for SCX, as the current rq 3321 * association doesn't dictate where the task is going to run, this 3322 * doesn't fit well. If necessary, we can later add a dedicated method 3323 * which can decide to preempt self to force it through the regular 3324 * scheduling path. 
3325 */ 3326 if (unlikely(wake_flags & WF_EXEC)) 3327 return prev_cpu; 3328 3329 bypassing = scx_bypassing(sch, task_cpu(p)); 3330 if (likely(SCX_HAS_OP(sch, select_cpu)) && !bypassing) { 3331 s32 cpu; 3332 struct task_struct **ddsp_taskp; 3333 3334 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 3335 WARN_ON_ONCE(*ddsp_taskp); 3336 *ddsp_taskp = p; 3337 3338 this_rq()->scx.in_select_cpu = true; 3339 cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, prev_cpu, wake_flags); 3340 this_rq()->scx.in_select_cpu = false; 3341 p->scx.selected_cpu = cpu; 3342 *ddsp_taskp = NULL; 3343 if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()")) 3344 return cpu; 3345 else 3346 return prev_cpu; 3347 } else { 3348 s32 cpu; 3349 3350 cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); 3351 if (cpu >= 0) { 3352 refill_task_slice_dfl(sch, p); 3353 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; 3354 } else { 3355 cpu = prev_cpu; 3356 } 3357 p->scx.selected_cpu = cpu; 3358 3359 if (bypassing) 3360 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 3361 return cpu; 3362 } 3363 } 3364 3365 static void task_woken_scx(struct rq *rq, struct task_struct *p) 3366 { 3367 run_deferred(rq); 3368 } 3369 3370 static void set_cpus_allowed_scx(struct task_struct *p, 3371 struct affinity_context *ac) 3372 { 3373 struct scx_sched *sch = scx_task_sched(p); 3374 3375 set_cpus_allowed_common(p, ac); 3376 3377 if (task_dead_and_done(p)) 3378 return; 3379 3380 /* 3381 * The effective cpumask is stored in @p->cpus_ptr which may temporarily 3382 * differ from the configured one in @p->cpus_mask. Always tell the bpf 3383 * scheduler the effective one. 3384 * 3385 * Fine-grained memory write control is enforced by BPF making the const 3386 * designation pointless. Cast it away when calling the operation. 
3387 */ 3388 if (SCX_HAS_OP(sch, set_cpumask)) 3389 SCX_CALL_OP_TASK(sch, set_cpumask, task_rq(p), p, (struct cpumask *)p->cpus_ptr); 3390 } 3391 3392 static void handle_hotplug(struct rq *rq, bool online) 3393 { 3394 struct scx_sched *sch = scx_root; 3395 s32 cpu = cpu_of(rq); 3396 3397 atomic_long_inc(&scx_hotplug_seq); 3398 3399 /* 3400 * scx_root updates are protected by cpus_read_lock() and will stay 3401 * stable here. Note that we can't depend on scx_enabled() test as the 3402 * hotplug ops need to be enabled before __scx_enabled is set. 3403 */ 3404 if (unlikely(!sch)) 3405 return; 3406 3407 if (scx_enabled()) 3408 scx_idle_update_selcpu_topology(&sch->ops); 3409 3410 if (online && SCX_HAS_OP(sch, cpu_online)) 3411 SCX_CALL_OP(sch, cpu_online, NULL, cpu); 3412 else if (!online && SCX_HAS_OP(sch, cpu_offline)) 3413 SCX_CALL_OP(sch, cpu_offline, NULL, cpu); 3414 else 3415 scx_exit(sch, SCX_EXIT_UNREG_KERN, 3416 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 3417 "cpu %d going %s, exiting scheduler", cpu, 3418 online ? 
"online" : "offline"); 3419 } 3420 3421 void scx_rq_activate(struct rq *rq) 3422 { 3423 handle_hotplug(rq, true); 3424 } 3425 3426 void scx_rq_deactivate(struct rq *rq) 3427 { 3428 handle_hotplug(rq, false); 3429 } 3430 3431 static void rq_online_scx(struct rq *rq) 3432 { 3433 rq->scx.flags |= SCX_RQ_ONLINE; 3434 } 3435 3436 static void rq_offline_scx(struct rq *rq) 3437 { 3438 rq->scx.flags &= ~SCX_RQ_ONLINE; 3439 } 3440 3441 static bool check_rq_for_timeouts(struct rq *rq) 3442 { 3443 struct scx_sched *sch; 3444 struct task_struct *p; 3445 struct rq_flags rf; 3446 bool timed_out = false; 3447 3448 rq_lock_irqsave(rq, &rf); 3449 sch = rcu_dereference_bh(scx_root); 3450 if (unlikely(!sch)) 3451 goto out_unlock; 3452 3453 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { 3454 struct scx_sched *sch = scx_task_sched(p); 3455 unsigned long last_runnable = p->scx.runnable_at; 3456 3457 if (unlikely(time_after(jiffies, 3458 last_runnable + READ_ONCE(sch->watchdog_timeout)))) { 3459 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); 3460 3461 scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, 3462 "%s[%d] failed to run for %u.%03us", 3463 p->comm, p->pid, dur_ms / 1000, dur_ms % 1000); 3464 timed_out = true; 3465 break; 3466 } 3467 } 3468 out_unlock: 3469 rq_unlock_irqrestore(rq, &rf); 3470 return timed_out; 3471 } 3472 3473 static void scx_watchdog_workfn(struct work_struct *work) 3474 { 3475 unsigned long intv; 3476 int cpu; 3477 3478 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 3479 3480 for_each_online_cpu(cpu) { 3481 if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) 3482 break; 3483 3484 cond_resched(); 3485 } 3486 3487 intv = READ_ONCE(scx_watchdog_interval); 3488 if (intv < ULONG_MAX) 3489 queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv); 3490 } 3491 3492 void scx_tick(struct rq *rq) 3493 { 3494 struct scx_sched *root; 3495 unsigned long last_check; 3496 3497 if (!scx_enabled()) 3498 return; 3499 3500 root = rcu_dereference_bh(scx_root); 
3501 if (unlikely(!root)) 3502 return; 3503 3504 last_check = READ_ONCE(scx_watchdog_timestamp); 3505 if (unlikely(time_after(jiffies, 3506 last_check + READ_ONCE(root->watchdog_timeout)))) { 3507 u32 dur_ms = jiffies_to_msecs(jiffies - last_check); 3508 3509 scx_exit(root, SCX_EXIT_ERROR_STALL, 0, 3510 "watchdog failed to check in for %u.%03us", 3511 dur_ms / 1000, dur_ms % 1000); 3512 } 3513 3514 update_other_load_avgs(rq); 3515 } 3516 3517 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) 3518 { 3519 struct scx_sched *sch = scx_task_sched(curr); 3520 3521 update_curr_scx(rq); 3522 3523 /* 3524 * While disabling, always resched and refresh core-sched timestamp as 3525 * we can't trust the slice management or ops.core_sched_before(). 3526 */ 3527 if (scx_bypassing(sch, cpu_of(rq))) { 3528 curr->scx.slice = 0; 3529 touch_core_sched(rq, curr); 3530 } else if (SCX_HAS_OP(sch, tick)) { 3531 SCX_CALL_OP_TASK(sch, tick, rq, curr); 3532 } 3533 3534 if (!curr->scx.slice) 3535 resched_curr(rq); 3536 } 3537 3538 #ifdef CONFIG_EXT_GROUP_SCHED 3539 static struct cgroup *tg_cgrp(struct task_group *tg) 3540 { 3541 /* 3542 * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, 3543 * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the 3544 * root cgroup. 
3545 */ 3546 if (tg && tg->css.cgroup) 3547 return tg->css.cgroup; 3548 else 3549 return &cgrp_dfl_root.cgrp; 3550 } 3551 3552 #define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), 3553 3554 #else /* CONFIG_EXT_GROUP_SCHED */ 3555 3556 #define SCX_INIT_TASK_ARGS_CGROUP(tg) 3557 3558 #endif /* CONFIG_EXT_GROUP_SCHED */ 3559 3560 static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) 3561 { 3562 int ret; 3563 3564 p->scx.disallow = false; 3565 3566 if (SCX_HAS_OP(sch, init_task)) { 3567 struct scx_init_task_args args = { 3568 SCX_INIT_TASK_ARGS_CGROUP(task_group(p)) 3569 .fork = fork, 3570 }; 3571 3572 ret = SCX_CALL_OP_RET(sch, init_task, NULL, p, &args); 3573 if (unlikely(ret)) { 3574 ret = ops_sanitize_err(sch, "init_task", ret); 3575 return ret; 3576 } 3577 } 3578 3579 if (p->scx.disallow) { 3580 if (unlikely(scx_parent(sch))) { 3581 scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]", 3582 p->comm, p->pid); 3583 } else if (unlikely(fork)) { 3584 scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", 3585 p->comm, p->pid); 3586 } else { 3587 struct rq *rq; 3588 struct rq_flags rf; 3589 3590 rq = task_rq_lock(p, &rf); 3591 3592 /* 3593 * We're in the load path and @p->policy will be applied 3594 * right after. Reverting @p->policy here and rejecting 3595 * %SCHED_EXT transitions from scx_check_setscheduler() 3596 * guarantees that if ops.init_task() sets @p->disallow, 3597 * @p can never be in SCX. 3598 */ 3599 if (p->policy == SCHED_EXT) { 3600 p->policy = SCHED_NORMAL; 3601 atomic_long_inc(&scx_nr_rejected); 3602 } 3603 3604 task_rq_unlock(rq, p, &rf); 3605 } 3606 } 3607 3608 return 0; 3609 } 3610 3611 static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p) 3612 { 3613 struct rq *rq = task_rq(p); 3614 u32 weight; 3615 3616 lockdep_assert_rq_held(rq); 3617 3618 /* 3619 * Verify the task is not in BPF scheduler's custody. 
If flag 3620 * transitions are consistent, the flag should always be clear 3621 * here. 3622 */ 3623 WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); 3624 3625 /* 3626 * Set the weight before calling ops.enable() so that the scheduler 3627 * doesn't see a stale value if they inspect the task struct. 3628 */ 3629 if (task_has_idle_policy(p)) 3630 weight = WEIGHT_IDLEPRIO; 3631 else 3632 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; 3633 3634 p->scx.weight = sched_weight_to_cgroup(weight); 3635 3636 if (SCX_HAS_OP(sch, enable)) 3637 SCX_CALL_OP_TASK(sch, enable, rq, p); 3638 3639 if (SCX_HAS_OP(sch, set_weight)) 3640 SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); 3641 } 3642 3643 static void scx_enable_task(struct scx_sched *sch, struct task_struct *p) 3644 { 3645 __scx_enable_task(sch, p); 3646 scx_set_task_state(p, SCX_TASK_ENABLED); 3647 } 3648 3649 static void scx_disable_task(struct scx_sched *sch, struct task_struct *p) 3650 { 3651 struct rq *rq = task_rq(p); 3652 3653 lockdep_assert_rq_held(rq); 3654 WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); 3655 3656 clear_direct_dispatch(p); 3657 3658 if (SCX_HAS_OP(sch, disable)) 3659 SCX_CALL_OP_TASK(sch, disable, rq, p); 3660 scx_set_task_state(p, SCX_TASK_READY); 3661 3662 /* 3663 * Verify the task is not in BPF scheduler's custody. If flag 3664 * transitions are consistent, the flag should always be clear 3665 * here. 
3666 */ 3667 WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); 3668 } 3669 3670 static void __scx_disable_and_exit_task(struct scx_sched *sch, 3671 struct task_struct *p) 3672 { 3673 struct scx_exit_task_args args = { 3674 .cancelled = false, 3675 }; 3676 3677 lockdep_assert_held(&p->pi_lock); 3678 lockdep_assert_rq_held(task_rq(p)); 3679 3680 switch (scx_get_task_state(p)) { 3681 case SCX_TASK_NONE: 3682 return; 3683 case SCX_TASK_INIT: 3684 args.cancelled = true; 3685 break; 3686 case SCX_TASK_READY: 3687 break; 3688 case SCX_TASK_ENABLED: 3689 scx_disable_task(sch, p); 3690 break; 3691 default: 3692 WARN_ON_ONCE(true); 3693 return; 3694 } 3695 3696 if (SCX_HAS_OP(sch, exit_task)) 3697 SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); 3698 } 3699 3700 /* 3701 * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never 3702 * ran. The task state has not been transitioned, so this mirrors the 3703 * SCX_TASK_INIT branch in __scx_disable_and_exit_task(). 3704 */ 3705 static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p) 3706 { 3707 struct scx_exit_task_args args = { .cancelled = true }; 3708 3709 lockdep_assert_held(&p->pi_lock); 3710 lockdep_assert_rq_held(task_rq(p)); 3711 3712 if (SCX_HAS_OP(sch, exit_task)) 3713 SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); 3714 } 3715 3716 static void scx_disable_and_exit_task(struct scx_sched *sch, 3717 struct task_struct *p) 3718 { 3719 __scx_disable_and_exit_task(sch, p); 3720 3721 /* 3722 * If set, @p exited between __scx_init_task() and scx_enable_task() in 3723 * scx_sub_enable() and is initialized for both the associated sched and 3724 * its parent. Exit for the child too - scx_enable_task() never ran for 3725 * it, so undo only init_task. The flag is only set on the sub-enable 3726 * path, so it's always clear when @p arrives here in %SCX_TASK_NONE. 
3727 */ 3728 if (p->scx.flags & SCX_TASK_SUB_INIT) { 3729 if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) 3730 scx_sub_init_cancel_task(scx_enabling_sub_sched, p); 3731 p->scx.flags &= ~SCX_TASK_SUB_INIT; 3732 } 3733 3734 scx_set_task_sched(p, NULL); 3735 scx_set_task_state(p, SCX_TASK_NONE); 3736 } 3737 3738 void init_scx_entity(struct sched_ext_entity *scx) 3739 { 3740 memset(scx, 0, sizeof(*scx)); 3741 INIT_LIST_HEAD(&scx->dsq_list.node); 3742 RB_CLEAR_NODE(&scx->dsq_priq); 3743 scx->sticky_cpu = -1; 3744 scx->holding_cpu = -1; 3745 INIT_LIST_HEAD(&scx->runnable_node); 3746 scx->runnable_at = jiffies; 3747 scx->ddsp_dsq_id = SCX_DSQ_INVALID; 3748 scx->slice = SCX_SLICE_DFL; 3749 } 3750 3751 void scx_pre_fork(struct task_struct *p) 3752 { 3753 /* 3754 * BPF scheduler enable/disable paths want to be able to iterate and 3755 * update all tasks which can become complex when racing forks. As 3756 * enable/disable are very cold paths, let's use a percpu_rwsem to 3757 * exclude forks. 3758 */ 3759 percpu_down_read(&scx_fork_rwsem); 3760 } 3761 3762 int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) 3763 { 3764 s32 ret; 3765 3766 percpu_rwsem_assert_held(&scx_fork_rwsem); 3767 3768 if (scx_init_task_enabled) { 3769 #ifdef CONFIG_EXT_SUB_SCHED 3770 struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched; 3771 #else 3772 struct scx_sched *sch = scx_root; 3773 #endif 3774 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 3775 ret = __scx_init_task(sch, p, true); 3776 if (unlikely(ret)) { 3777 scx_set_task_state(p, SCX_TASK_NONE); 3778 return ret; 3779 } 3780 scx_set_task_state(p, SCX_TASK_INIT); 3781 scx_set_task_sched(p, sch); 3782 } 3783 3784 return 0; 3785 } 3786 3787 void scx_post_fork(struct task_struct *p) 3788 { 3789 if (scx_init_task_enabled) { 3790 scx_set_task_state(p, SCX_TASK_READY); 3791 3792 /* 3793 * Enable the task immediately if it's running on sched_ext. 
3794 * Otherwise, it'll be enabled in switching_to_scx() if and 3795 * when it's ever configured to run with a SCHED_EXT policy. 3796 */ 3797 if (p->sched_class == &ext_sched_class) { 3798 struct rq_flags rf; 3799 struct rq *rq; 3800 3801 rq = task_rq_lock(p, &rf); 3802 scx_enable_task(scx_task_sched(p), p); 3803 task_rq_unlock(rq, p, &rf); 3804 } 3805 } 3806 3807 raw_spin_lock_irq(&scx_tasks_lock); 3808 list_add_tail(&p->scx.tasks_node, &scx_tasks); 3809 raw_spin_unlock_irq(&scx_tasks_lock); 3810 3811 percpu_up_read(&scx_fork_rwsem); 3812 } 3813 3814 void scx_cancel_fork(struct task_struct *p) 3815 { 3816 if (scx_enabled()) { 3817 struct rq *rq; 3818 struct rq_flags rf; 3819 3820 rq = task_rq_lock(p, &rf); 3821 WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); 3822 scx_disable_and_exit_task(scx_task_sched(p), p); 3823 task_rq_unlock(rq, p, &rf); 3824 } 3825 3826 percpu_up_read(&scx_fork_rwsem); 3827 } 3828 3829 /** 3830 * task_dead_and_done - Is a task dead and done running? 3831 * @p: target task 3832 * 3833 * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the 3834 * task no longer exists from SCX's POV. However, certain sched_class ops may be 3835 * invoked on these dead tasks leading to failures - e.g. sched_setscheduler() 3836 * may try to switch a task which finished sched_ext_dead() back into SCX 3837 * triggering invalid SCX task state transitions and worse. 3838 * 3839 * Once a task has finished the final switch, sched_ext_dead() is the only thing 3840 * that needs to happen on the task. Use this test to short-circuit sched_class 3841 * operations which may be called on dead tasks. 3842 */ 3843 static bool task_dead_and_done(struct task_struct *p) 3844 { 3845 struct rq *rq = task_rq(p); 3846 3847 lockdep_assert_rq_held(rq); 3848 3849 /* 3850 * In do_task_dead(), a dying task sets %TASK_DEAD with preemption 3851 * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p 3852 * won't ever run again. 
*/
	return unlikely(READ_ONCE(p->__state) == TASK_DEAD) &&
		!task_on_cpu(rq, p);
}

/**
 * sched_ext_dead - Detach a dead task from sched_ext bookkeeping
 * @p: task that is going away
 *
 * Unlinks @p from scx_tasks under scx_tasks_lock. If @p is in any state other
 * than %SCX_TASK_NONE, takes @p's rq lock, calls scx_disable_and_exit_task()
 * unless @p is in %SCX_TASK_INIT_BEGIN, and marks @p %SCX_TASK_DEAD.
 */
void sched_ext_dead(struct task_struct *p)
{
	unsigned long flags;

	/*
	 * By the time control reaches here, @p has %TASK_DEAD set, switched out
	 * for the last time and then dropped the rq lock - task_dead_and_done()
	 * should be returning %true nullifying the straggling sched_class ops.
	 * Remove from scx_tasks and exit @p.
	 */
	raw_spin_lock_irqsave(&scx_tasks_lock, flags);
	list_del_init(&p->scx.tasks_node);
	raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);

	/*
	 * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
	 * ENABLED transitions can't race us. Disable ops for @p.
	 *
	 * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see
	 * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup
	 * iteration is only used from sub-sched paths, which require root
	 * enabled. Root enable transitions every live task to at least READY.
	 *
	 * %INIT_BEGIN means ops.init_task() is running for @p. Don't call
	 * into ops; transition to %DEAD so the post-init recheck unwinds
	 * via scx_sub_init_cancel_task().
	 */
	if (scx_get_task_state(p) != SCX_TASK_NONE) {
		struct rq_flags rf;
		struct rq *rq;

		rq = task_rq_lock(p, &rf);
		if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN)
			scx_disable_and_exit_task(scx_task_sched(p), p);
		scx_set_task_state(p, SCX_TASK_DEAD);
		task_rq_unlock(rq, p, &rf);
	}
}

/*
 * ->reweight_task handler of the ext sched_class. Recomputes @p->scx.weight
 * from @lw and, if the scheduler implements ops.set_weight(), reports the new
 * weight to it. Does nothing when task_dead_and_done(@p) is true. The rq lock
 * of @p must be held.
 */
static void reweight_task_scx(struct rq *rq, struct task_struct *p,
			      const struct load_weight *lw)
{
	struct scx_sched *sch = scx_task_sched(p);

	lockdep_assert_rq_held(task_rq(p));

	if (task_dead_and_done(p))
		return;

	p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
	if (SCX_HAS_OP(sch, set_weight))
		SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight);
}

/* ->prio_changed handler of the ext sched_class; the body is empty. */
static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
{
}

/*
 * ->switching_to handler of the ext sched_class. Enables @p on its scheduler
 * through scx_enable_task() and then pushes @p's current cpumask to
 * ops.set_cpumask() when implemented. Does nothing when
 * task_dead_and_done(@p) is true.
 */
static void switching_to_scx(struct rq *rq, struct task_struct *p)
{
	struct scx_sched *sch = scx_task_sched(p);

	if (task_dead_and_done(p))
		return;

	scx_enable_task(sch, p);

	/*
	 * set_cpus_allowed_scx() is not called while @p is associated with a
	 * different scheduler class. Keep the BPF scheduler up-to-date.
	 */
	if (SCX_HAS_OP(sch, set_cpumask))
		SCX_CALL_OP_TASK(sch, set_cpumask, rq, p, (struct cpumask *)p->cpus_ptr);
}

/*
 * ->switched_from handler of the ext sched_class. Calls scx_disable_task() on
 * @p unless task_dead_and_done(@p) is true or @p is in %SCX_TASK_NONE.
 */
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
	if (task_dead_and_done(p))
		return;

	/*
	 * %NONE means SCX is no longer tracking @p at the task level (e.g.
	 * scx_fail_parent() handed @p back to the parent at NONE pending the
	 * parent's own teardown). There is nothing to disable; calling
	 * scx_disable_task() would WARN on the non-%ENABLED state and trigger a
	 * NONE -> READY validation failure.
	 */
	if (scx_get_task_state(p) == SCX_TASK_NONE)
		return;

	scx_disable_task(scx_task_sched(p), p);
}

/* ->switched_to handler of the ext sched_class; the body is empty. */
static void switched_to_scx(struct rq *rq, struct task_struct *p) {}

/**
 * scx_check_setscheduler - Check whether @p may change to @policy
 * @p: task whose policy is being changed
 * @policy: requested scheduling policy
 *
 * Must be called with the rq lock of @p held.
 *
 * Returns -%EACCES if SCX is enabled, @p->scx.disallow is set and the request
 * changes @p's policy to %SCHED_EXT. Returns 0 otherwise.
 */
int scx_check_setscheduler(struct task_struct *p, int policy)
{
	lockdep_assert_rq_held(task_rq(p));

	/* if disallow, reject transitioning into SCX */
	if (scx_enabled() && READ_ONCE(p->scx.disallow) &&
	    p->policy != policy && policy == SCHED_EXT)
		return -EACCES;

	return 0;
}

/*
 * Drain @rq->scx.ddsp_deferred_locals. Each task is unlinked, its direct
 * dispatch state is cleared, and it is dispatched to the DSQ returned by
 * find_dsq_for_dispatch(), which is expected to be a local DSQ (warns
 * otherwise). Must be called with @rq locked.
 */
static void process_ddsp_deferred_locals(struct rq *rq)
{
	struct task_struct *p;

	lockdep_assert_rq_held(rq);

	/*
	 * Now that @rq can be unlocked, execute the deferred enqueueing of
	 * tasks directly dispatched to the local DSQs of other CPUs. See
	 * direct_dispatch(). Keep popping from the head instead of using
	 * list_for_each_entry_safe() as dispatch_to_local_dsq() may unlock @rq
	 * temporarily.
	 */
	while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
				struct task_struct, scx.dsq_list.node))) {
		struct scx_sched *sch = scx_task_sched(p);
		struct scx_dispatch_q *dsq;
		/* snapshot before clear_direct_dispatch() below */
		u64 dsq_id = p->scx.ddsp_dsq_id;
		u64 enq_flags = p->scx.ddsp_enq_flags;

		list_del_init(&p->scx.dsq_list.node);
		clear_direct_dispatch(p);

		dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p));
		if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
			dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
	}
}

/*
 * Determine whether @p should be reenqueued from a local DSQ.
 *
 * @reenq_flags is mutable and accumulates state across the DSQ walk:
 *
 * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First"
 *   tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at
 *   the head consumes the first slot.
 *
 * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if
 *   rq_is_open() is true.
 *
 * An IMMED task is kept (returns %false) only if it's the first task in the DSQ
 * AND the current task is done — i.e. it will execute immediately. All other
 * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head,
 * every IMMED task behind it gets reenqueued.
 *
 * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ |
 * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local
 * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers
 * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT
 * in process_deferred_reenq_locals().
 *
 * @reason is always written: %SCX_TASK_REENQ_IMMED when the IMMED rule above
 * forces the reenqueue, %SCX_TASK_REENQ_KFUNC otherwise. In the latter case the
 * return value is whether %SCX_REENQ_ANY is set in @reenq_flags.
 */
static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason)
{
	bool first;

	first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST);
	*reenq_flags |= SCX_REENQ_TSR_NOT_FIRST;

	*reason = SCX_TASK_REENQ_KFUNC;

	if ((p->scx.flags & SCX_TASK_IMMED) &&
	    (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) {
		__scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1);
		*reason = SCX_TASK_REENQ_IMMED;
		return true;
	}

	return *reenq_flags & SCX_REENQ_ANY;
}

/*
 * Reenqueue tasks sitting on @rq's local DSQ. A task is picked when it has no
 * migration pending, scx_is_descendant() holds for its scheduler against @sch,
 * and local_task_should_reenq() returns %true. Picked tasks are dequeued onto
 * a private list and then fed back through do_enqueue_task() with
 * %SCX_ENQ_REENQ. Must be called with @rq locked.
 *
 * Returns the number of tasks reenqueued.
 */
static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
{
	LIST_HEAD(tasks);
	u32 nr_enqueued = 0;
	struct task_struct *p, *n;

	lockdep_assert_rq_held(rq);

	/* the TSR bits are internal walk state; callers must not pass them in */
	if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK))
		reenq_flags &= ~__SCX_REENQ_TSR_MASK;
	if (rq_is_open(rq, 0))
		reenq_flags |= SCX_REENQ_TSR_RQ_OPEN;

	/*
	 * The BPF scheduler may choose to dispatch tasks back to
	 * @rq->scx.local_dsq. Move all candidate tasks off to a private list
	 * first to avoid processing the same tasks repeatedly.
	 */
	list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
				 scx.dsq_list.node) {
		struct scx_sched *task_sch = scx_task_sched(p);
		u32 reason;

		/*
		 * If @p is being migrated, @p's current CPU may not agree with
		 * its allowed CPUs and the migration_cpu_stop is about to
		 * deactivate and re-activate @p anyway. Skip re-enqueueing.
		 *
		 * While racing sched property changes may also dequeue and
		 * re-enqueue a migrating task while its current CPU and allowed
		 * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
		 * the current local DSQ for running tasks and thus are not
		 * visible to the BPF scheduler.
		 */
		if (p->migration_pending)
			continue;

		if (!scx_is_descendant(task_sch, sch))
			continue;

		if (!local_task_should_reenq(p, &reenq_flags, &reason))
			continue;

		dispatch_dequeue(rq, p);

		/* tag @p with the reenq reason for the duration of the enqueue */
		if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
			p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
		p->scx.flags |= reason;

		list_add_tail(&p->scx.dsq_list.node, &tasks);
	}

	list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
		list_del_init(&p->scx.dsq_list.node);

		do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);

		p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
		nr_enqueued++;
	}

	return nr_enqueued;
}

/*
 * Pop entries off @rq->scx.deferred_reenq_locals one at a time and run
 * reenq_local() for the owning scheduler with the flags recorded in the entry.
 * Returns once the list is empty. Must be called with @rq locked.
 *
 * Each invocation bumps @rq->scx.deferred_reenq_locals_seq. An entry that is
 * popped again within the same invocation has its ->cnt incremented; once it
 * exceeds %SCX_REENQ_LOCAL_MAX_REPEAT, scx_error() is raised and that reenq is
 * skipped.
 */
static void process_deferred_reenq_locals(struct rq *rq)
{
	u64 seq = ++rq->scx.deferred_reenq_locals_seq;

	lockdep_assert_rq_held(rq);

	while (true) {
		struct scx_sched *sch;
		u64 reenq_flags;
		bool skip = false;

		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
			struct scx_deferred_reenq_local *drl =
				list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
							 struct scx_deferred_reenq_local,
							 node);
			struct scx_sched_pcpu *sch_pcpu;

			if (!drl)
				return;

			sch_pcpu = container_of(drl, struct scx_sched_pcpu,
						deferred_reenq_local);
			sch = sch_pcpu->sch;

			/* consume the request: take the flags and unlink */
			reenq_flags = drl->flags;
			WRITE_ONCE(drl->flags, 0);
			list_del_init(&drl->node);

			if (likely(drl->seq != seq)) {
				/* first time this entry is seen in this invocation */
				drl->seq = seq;
				drl->cnt = 0;
			} else {
				if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) {
					scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times",
						  drl->cnt);
					skip = true;
				}

				__scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1);
			}
		}

		if (!skip) {
			/* see schedule_dsq_reenq() */
			smp_mb();

			reenq_local(sch, rq, reenq_flags);
		}
	}
}

/*
 * Decide whether @p should be reenqueued from a user DSQ. @reason is always
 * set to %SCX_TASK_REENQ_KFUNC and the result is whether %SCX_REENQ_ANY is set
 * in @reenq_flags. @p itself is not inspected.
 */
static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason)
{
	*reason = SCX_TASK_REENQ_KFUNC;
	return reenq_flags & SCX_REENQ_ANY;
}

/*
 * Walk @dsq with a cursor and reenqueue every task selected by
 * user_task_should_reenq() through do_enqueue_task() with %SCX_ENQ_REENQ.
 *
 * Called and returns with @rq locked. While walking, the held rq lock is
 * switched to the rq of the task being processed, and @rq is re-locked before
 * returning if a different (or no) rq ended up locked. The walk stops when the
 * DSQ is exhausted or when @dsq->sched->bypass_depth reads non-zero.
 */
static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags)
{
	struct rq *locked_rq = rq;
	struct scx_sched *sch = dsq->sched;
	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
	struct task_struct *p;
	s32 nr_enqueued = 0;

	lockdep_assert_rq_held(rq);

	raw_spin_lock(&dsq->lock);

	while (likely(!READ_ONCE(sch->bypass_depth))) {
		struct rq *task_rq;
		u32 reason;

		p = nldsq_cursor_next_task(&cursor, dsq);
		if (!p)
			break;

		if (!user_task_should_reenq(p, reenq_flags, &reason))
			continue;

		task_rq = task_rq(p);

		if (locked_rq != task_rq) {
			if (locked_rq)
				raw_spin_rq_unlock(locked_rq);
			/*
			 * If the trylock fails, drop @dsq->lock before blocking
			 * on the rq lock and retake it afterwards.
			 */
			if (unlikely(!raw_spin_rq_trylock(task_rq))) {
				raw_spin_unlock(&dsq->lock);
				raw_spin_rq_lock(task_rq);
				raw_spin_lock(&dsq->lock);
			}
			locked_rq = task_rq;

			/* did we lose @p while switching locks? */
			if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
				continue;
		}

		/* @p is on @dsq, its rq and @dsq are locked */
		dispatch_dequeue_locked(p, dsq);
		raw_spin_unlock(&dsq->lock);

		/* tag @p with the reenq reason for the duration of the enqueue */
		if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
			p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
		p->scx.flags |= reason;

		do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);

		p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;

		/* every %SCX_TASK_ITER_BATCH enqueues, release the rq lock briefly */
		if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) {
			raw_spin_rq_unlock(locked_rq);
			locked_rq = NULL;
			cpu_relax();
		}

		raw_spin_lock(&dsq->lock);
	}

	list_del_init(&cursor.node);
	raw_spin_unlock(&dsq->lock);

	/* restore the caller's locking state */
	if (locked_rq != rq) {
		if (locked_rq)
			raw_spin_rq_unlock(locked_rq);
		raw_spin_rq_lock(rq);
	}
}

/*
 * Pop entries off @rq->scx.deferred_reenq_users one at a time and run
 * reenq_user() on the DSQ each entry belongs to, using the flags recorded in
 * the entry. Returns once the list is empty. Must be called with @rq locked.
 */
static void process_deferred_reenq_users(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	while (true) {
		struct scx_dispatch_q *dsq;
		u64 reenq_flags;

		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
			struct scx_deferred_reenq_user *dru =
				list_first_entry_or_null(&rq->scx.deferred_reenq_users,
							 struct scx_deferred_reenq_user,
							 node);
			struct scx_dsq_pcpu *dsq_pcpu;

			if (!dru)
				return;

			dsq_pcpu = container_of(dru, struct scx_dsq_pcpu,
						deferred_reenq_user);
			dsq = dsq_pcpu->dsq;
			/* consume the request: take the flags and unlink */
			reenq_flags = dru->flags;
			WRITE_ONCE(dru->flags, 0);
			list_del_init(&dru->node);
		}

		/* see schedule_dsq_reenq() */
		smp_mb();

		/* only non-builtin DSQs may be queued on this list */
		BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
		reenq_user(rq, dsq, reenq_flags);
	}
}

/*
 * Run the deferred work queued on @rq: deferred direct dispatches to local
 * DSQs first, then deferred local-DSQ reenqueues and deferred user-DSQ
 * reenqueues if their lists are non-empty.
 */
static void run_deferred(struct rq *rq)
{
	process_ddsp_deferred_locals(rq);

	if (!list_empty(&rq->scx.deferred_reenq_locals))
		process_deferred_reenq_locals(rq);

	if (!list_empty(&rq->scx.deferred_reenq_users))
		process_deferred_reenq_users(rq);
}

#ifdef CONFIG_NO_HZ_FULL
/**
 * scx_can_stop_tick - Whether the tick may be stopped on @rq
 * @rq: rq of interest
 *
 * Returns %true if @rq->curr is not on the ext sched_class, %false if its
 * scheduler is bypassing on @rq's CPU, and otherwise whether
 * %SCX_RQ_CAN_STOP_TICK is set in @rq->scx.flags.
 */
bool scx_can_stop_tick(struct rq *rq)
{
	struct task_struct *p = rq->curr;
	struct scx_sched *sch = scx_task_sched(p);

	if (p->sched_class != &ext_sched_class)
		return true;

	if (scx_bypassing(sch, cpu_of(rq)))
		return false;

	/*
	 * @rq can dispatch from different DSQs, so we can't tell whether it
	 * needs the tick or not by looking at nr_running. Allow stopping ticks
	 * iff the BPF scheduler indicated so. See set_next_task_scx().
	 */
	return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
}
#endif

#ifdef CONFIG_EXT_GROUP_SCHED

DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem);
static bool scx_cgroup_enabled;

/**
 * scx_tg_init - Initialize the SCX fields of a task group
 * @tg: task group being set up
 *
 * Sets the default weight and bandwidth period, an infinite bandwidth quota
 * and a non-idle state.
 */
void scx_tg_init(struct task_group *tg)
{
	tg->scx.weight = CGROUP_WEIGHT_DFL;
	tg->scx.bw_period_us = default_bw_period_us();
	tg->scx.bw_quota_us = RUNTIME_INF;
	tg->scx.idle = false;
}

/**
 * scx_tg_online - Bring a task group online for SCX
 * @tg: task group coming online
 *
 * When cgroup support is enabled, calls ops.cgroup_init() if implemented and
 * marks @tg %SCX_TG_ONLINE | %SCX_TG_INITED on success. When cgroup support is
 * not enabled, only %SCX_TG_ONLINE is set.
 *
 * Returns 0 on success, or the value produced by ops_sanitize_err() for a
 * failed ops.cgroup_init().
 */
int scx_tg_online(struct task_group *tg)
{
	struct scx_sched *sch = scx_root;
	int ret = 0;

	WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));

	if (scx_cgroup_enabled) {
		if (SCX_HAS_OP(sch, cgroup_init)) {
			struct scx_cgroup_init_args args =
				{ .weight = tg->scx.weight,
				  .bw_period_us = tg->scx.bw_period_us,
				  .bw_quota_us = tg->scx.bw_quota_us,
				  .bw_burst_us = tg->scx.bw_burst_us };

			ret = SCX_CALL_OP_RET(sch, cgroup_init,
					      NULL, tg->css.cgroup, &args);
			if (ret)
				ret = ops_sanitize_err(sch, "cgroup_init", ret);
		}
		if (ret == 0)
			tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED;
	} else {
		tg->scx.flags |= SCX_TG_ONLINE;
	}

	return ret;
}

/**
 * scx_tg_offline - Take a task group offline for SCX
 * @tg: task group going offline
 *
 * Calls ops.cgroup_exit() if cgroup support is enabled, the op is implemented
 * and @tg was inited, then clears %SCX_TG_ONLINE and %SCX_TG_INITED.
 */
void scx_tg_offline(struct task_group *tg)
{
	struct scx_sched *sch = scx_root;

	WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));

	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
	    (tg->scx.flags & SCX_TG_INITED))
		SCX_CALL_OP(sch, cgroup_exit, NULL, tg->css.cgroup);
	tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
}

/**
 * scx_cgroup_can_attach - Prepare the tasks in @tset for a cgroup move
 * @tset: taskset being migrated
 *
 * For every task whose source and destination cgroups differ, calls
 * ops.cgroup_prep_move() if implemented and records the source cgroup in
 * @p->scx.cgrp_moving_from. If a prep call fails, ops.cgroup_cancel_move() is
 * called for every task that already has cgrp_moving_from set and the field is
 * cleared on all tasks.
 *
 * Returns 0 on success or when cgroup support is not enabled, otherwise the
 * value produced by ops_sanitize_err() for the failed prep call.
 */
int scx_cgroup_can_attach(struct cgroup_taskset *tset)
{
	struct scx_sched *sch = scx_root;
	struct cgroup_subsys_state *css;
	struct task_struct *p;
	int ret;

	if (!scx_cgroup_enabled)
		return 0;

	cgroup_taskset_for_each(p, css, tset) {
		struct cgroup *from = tg_cgrp(task_group(p));
		struct cgroup *to = tg_cgrp(css_tg(css));

		WARN_ON_ONCE(p->scx.cgrp_moving_from);

		/*
		 * sched_move_task() omits identity migrations. Let's match the
		 * behavior so that ops.cgroup_prep_move() and ops.cgroup_move()
		 * always match one-to-one.
		 */
		if (from == to)
			continue;

		if (SCX_HAS_OP(sch, cgroup_prep_move)) {
			ret = SCX_CALL_OP_RET(sch, cgroup_prep_move, NULL,
					      p, from, css->cgroup);
			if (ret)
				goto err;
		}

		p->scx.cgrp_moving_from = from;
	}

	return 0;

err:
	/* undo the preparations made so far */
	cgroup_taskset_for_each(p, css, tset) {
		if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
		    p->scx.cgrp_moving_from)
			SCX_CALL_OP(sch, cgroup_cancel_move, NULL,
				    p, p->scx.cgrp_moving_from, css->cgroup);
		p->scx.cgrp_moving_from = NULL;
	}

	return ops_sanitize_err(sch, "cgroup_prep_move", ret);
}

/**
 * scx_cgroup_move_task - Commit the cgroup move of @p
 * @p: task being moved
 *
 * Calls ops.cgroup_move() if implemented, passing the recorded source cgroup
 * and @p's current cgroup, then clears @p->scx.cgrp_moving_from. No-op when
 * cgroup support is not enabled.
 */
void scx_cgroup_move_task(struct task_struct *p)
{
	struct scx_sched *sch = scx_root;

	if (!scx_cgroup_enabled)
		return;

	/*
	 * @p must have ops.cgroup_prep_move() called on it and thus
	 * cgrp_moving_from set.
	 */
	if (SCX_HAS_OP(sch, cgroup_move) &&
	    !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
		SCX_CALL_OP_TASK(sch, cgroup_move, task_rq(p),
				 p, p->scx.cgrp_moving_from,
				 tg_cgrp(task_group(p)));
	p->scx.cgrp_moving_from = NULL;
}

/**
 * scx_cgroup_cancel_attach - Cancel a prepared cgroup move
 * @tset: taskset whose migration is being cancelled
 *
 * Calls ops.cgroup_cancel_move() if implemented for every task in @tset that
 * has cgrp_moving_from set and clears the field on all tasks. No-op when
 * cgroup support is not enabled.
 */
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
	struct scx_sched *sch = scx_root;
	struct cgroup_subsys_state *css;
	struct task_struct *p;

	if (!scx_cgroup_enabled)
		return;

	cgroup_taskset_for_each(p, css, tset) {
		if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
		    p->scx.cgrp_moving_from)
			SCX_CALL_OP(sch, cgroup_cancel_move, NULL,
				    p, p->scx.cgrp_moving_from, css->cgroup);
		p->scx.cgrp_moving_from = NULL;
	}
}

/**
 * scx_group_set_weight - Update the weight of a task group
 * @tg: task group to update
 * @weight: new weight
 *
 * Under the read side of scx_cgroup_ops_rwsem, calls ops.cgroup_set_weight()
 * if cgroup support is enabled, the op is implemented and the weight changes,
 * then stores @weight in @tg.
 */
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
	struct scx_sched *sch;

	percpu_down_read(&scx_cgroup_ops_rwsem);
	sch = scx_root;

	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
	    tg->scx.weight != weight)
		SCX_CALL_OP(sch, cgroup_set_weight, NULL, tg_cgrp(tg), weight);

	tg->scx.weight = weight;

	percpu_up_read(&scx_cgroup_ops_rwsem);
}

/**
 * scx_group_set_idle - Update the idle state of a task group
 * @tg: task group to update
 * @idle: new idle state
 *
 * Under the read side of scx_cgroup_ops_rwsem, calls ops.cgroup_set_idle() if
 * cgroup support is enabled and the op is implemented, then stores @idle in
 * @tg. Unlike the weight and bandwidth setters, the op is called even when the
 * value does not change.
 */
void scx_group_set_idle(struct task_group *tg, bool idle)
{
	struct scx_sched *sch;

	percpu_down_read(&scx_cgroup_ops_rwsem);
	sch = scx_root;

	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
		SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle);

	/* Update the task group's idle state */
	tg->scx.idle = idle;

	percpu_up_read(&scx_cgroup_ops_rwsem);
}

/**
 * scx_group_set_bandwidth - Update the bandwidth settings of a task group
 * @tg: task group to update
 * @period_us: new period in microseconds
 * @quota_us: new quota in microseconds
 * @burst_us: new burst in microseconds
 *
 * Under the read side of scx_cgroup_ops_rwsem, calls
 * ops.cgroup_set_bandwidth() if cgroup support is enabled, the op is
 * implemented and at least one of the three values changes, then stores the
 * new values in @tg.
 */
void scx_group_set_bandwidth(struct task_group *tg,
			     u64 period_us, u64 quota_us, u64 burst_us)
{
	struct scx_sched *sch;

	percpu_down_read(&scx_cgroup_ops_rwsem);
	sch = scx_root;

	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
	    (tg->scx.bw_period_us != period_us ||
	     tg->scx.bw_quota_us != quota_us ||
	     tg->scx.bw_burst_us != burst_us))
		SCX_CALL_OP(sch, cgroup_set_bandwidth, NULL,
			    tg_cgrp(tg), period_us, quota_us, burst_us);

	tg->scx.bw_period_us = period_us;
	tg->scx.bw_quota_us = quota_us;
	tg->scx.bw_burst_us = burst_us;

	percpu_up_read(&scx_cgroup_ops_rwsem);
}
#endif	/* CONFIG_EXT_GROUP_SCHED */

#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
/* Return the root cgroup of the default hierarchy. */
static struct cgroup *root_cgroup(void)
{
	return &cgrp_dfl_root.cgrp;
}

/*
 * Take scx_cgroup_ops_rwsem for writing (only with CONFIG_EXT_GROUP_SCHED) and
 * then cgroup_lock(). Paired with scx_cgroup_unlock().
 */
static void scx_cgroup_lock(void)
{
#ifdef CONFIG_EXT_GROUP_SCHED
	percpu_down_write(&scx_cgroup_ops_rwsem);
#endif
	cgroup_lock();
}

/* Release the locks taken by scx_cgroup_lock() in reverse order. */
static void scx_cgroup_unlock(void)
{
	cgroup_unlock();
#ifdef CONFIG_EXT_GROUP_SCHED
	percpu_up_write(&scx_cgroup_ops_rwsem);
#endif
}
#else	/* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
/* stubs for builds with neither cgroup-related option enabled */
static struct cgroup *root_cgroup(void) { return NULL; }
static void scx_cgroup_lock(void) {}
static void scx_cgroup_unlock(void) {}
#endif	/* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */

#ifdef CONFIG_EXT_SUB_SCHED
/* Return the cgroup recorded in @sch->cgrp. */
static struct cgroup *sch_cgroup(struct scx_sched *sch)
{
	return sch->cgrp;
}

/* for each descendant of @cgrp including self, set ->scx_sched to @sch */
static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
{
	struct cgroup *pos;
	struct cgroup_subsys_state *css;

	cgroup_for_each_live_descendant_pre(pos, css, cgrp)
		rcu_assign_pointer(pos->scx_sched, sch);
}
#else	/* CONFIG_EXT_SUB_SCHED */
/* stubs for builds without sub-scheduler support */
static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
#endif	/* CONFIG_EXT_SUB_SCHED */

/*
 * Omitted operations:
 *
 * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
 *
 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
 *   their current sched_class. Call them directly from sched core instead.
 */
DEFINE_SCHED_CLASS(ext) = {
	.enqueue_task		= enqueue_task_scx,
	.dequeue_task		= dequeue_task_scx,
	.yield_task		= yield_task_scx,
	.yield_to_task		= yield_to_task_scx,

	.wakeup_preempt		= wakeup_preempt_scx,

	.pick_task		= pick_task_scx,

	.put_prev_task		= put_prev_task_scx,
	.set_next_task		= set_next_task_scx,

	.select_task_rq		= select_task_rq_scx,
	.task_woken		= task_woken_scx,
	.set_cpus_allowed	= set_cpus_allowed_scx,

	.rq_online		= rq_online_scx,
	.rq_offline		= rq_offline_scx,

	.task_tick		= task_tick_scx,

	.switching_to		= switching_to_scx,
	.switched_from		= switched_from_scx,
	.switched_to		= switched_to_scx,
	.reweight_task		= reweight_task_scx,
	.prio_changed		= prio_changed_scx,

	.update_curr		= update_curr_scx,

#ifdef CONFIG_UCLAMP_TASK
	.uclamp_enabled		= 1,
#endif
};

/*
 * Initialize @dsq from scratch with ID @dsq_id and owner @sch. The structure
 * is zeroed, its lock and list are set up, and a per-CPU scx_dsq_pcpu area is
 * allocated with each entry pointing back at @dsq.
 *
 * Returns 0 on success, -%ENOMEM if the per-CPU allocation fails.
 */
static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
		    struct scx_sched *sch)
{
	s32 cpu;

	memset(dsq, 0, sizeof(*dsq));

	raw_spin_lock_init(&dsq->lock);
	INIT_LIST_HEAD(&dsq->list);
	dsq->id = dsq_id;
	dsq->sched = sch;

	dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu);
	if (!dsq->pcpu)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);

		pcpu->dsq = dsq;
		INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node);
	}

	return 0;
}

/*
 * Release the per-CPU area of @dsq. Any deferred_reenq_user node that is
 * unexpectedly still linked triggers a warning and is unlinked under the
 * owning rq's deferred_reenq_lock first. @dsq itself is not freed here.
 */
static void exit_dsq(struct scx_dispatch_q *dsq)
{
	s32 cpu;

	for_each_possible_cpu(cpu) {
		struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
		struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user;
		struct rq *rq = cpu_rq(cpu);

		/*
		 * There must have been an RCU grace period since the last
		 * insertion and @dsq should be off the deferred list by now.
		 */
		if (WARN_ON_ONCE(!list_empty(&dru->node))) {
			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
			list_del_init(&dru->node);
		}
	}

	free_percpu(dsq->pcpu);
}

/* RCU callback which tears down and frees the DSQ embedding @rcu. */
static void free_dsq_rcufn(struct rcu_head *rcu)
{
	struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu);

	exit_dsq(dsq);
	kfree(dsq);
}

/*
 * irq_work handler which takes everything queued on dsqs_to_free and schedules
 * free_dsq_rcufn() for each DSQ through call_rcu().
 */
static void free_dsq_irq_workfn(struct irq_work *irq_work)
{
	struct llist_node *to_free = llist_del_all(&dsqs_to_free);
	struct scx_dispatch_q *dsq, *tmp_dsq;

	llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
		call_rcu(&dsq->rcu, free_dsq_rcufn);
}

static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);

/*
 * Destroy the user DSQ with ID @dsq_id belonging to @sch. Nothing happens if
 * the DSQ can't be found or can't be removed from @sch->dsq_hash. If the DSQ
 * still holds tasks (->nr != 0), scx_error() is raised and the DSQ is left in
 * place. Otherwise the DSQ is unhashed, marked dead and queued for freeing.
 */
static void destroy_dsq(struct scx_sched *sch, u64 dsq_id)
{
	struct scx_dispatch_q *dsq;
	unsigned long flags;

	rcu_read_lock();

	dsq = find_user_dsq(sch, dsq_id);
	if (!dsq)
		goto out_unlock_rcu;

	raw_spin_lock_irqsave(&dsq->lock, flags);

	if (dsq->nr) {
		scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)",
			  dsq->id, dsq->nr);
		goto out_unlock_dsq;
	}

	if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node,
				   dsq_hash_params))
		goto out_unlock_dsq;

	/*
	 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from
	 * queueing more tasks. As this function can be called from anywhere,
	 * freeing is bounced through an irq work to avoid nesting RCU
	 * operations inside scheduler locks.
	 */
	dsq->id = SCX_DSQ_INVALID;
	/* only kick the irq work when the free list was empty before the add */
	if (llist_add(&dsq->free_node, &dsqs_to_free))
		irq_work_queue(&free_dsq_irq_work);

out_unlock_dsq:
	raw_spin_unlock_irqrestore(&dsq->lock, flags);
out_unlock_rcu:
	rcu_read_unlock();
}

#ifdef CONFIG_EXT_GROUP_SCHED
/*
 * Disable cgroup support for @sch: clear scx_cgroup_enabled, then walk all
 * task groups in post-order, clearing %SCX_TG_INITED on the inited ones and
 * calling ops.cgroup_exit() for each if the op is implemented.
 */
static void scx_cgroup_exit(struct scx_sched *sch)
{
	struct cgroup_subsys_state *css;

	scx_cgroup_enabled = false;

	/*
	 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
	 * cgroups and exit all the inited ones, all online cgroups are exited.
	 */
	css_for_each_descendant_post(css, &root_task_group.css) {
		struct task_group *tg = css_tg(css);

		if (!(tg->scx.flags & SCX_TG_INITED))
			continue;
		tg->scx.flags &= ~SCX_TG_INITED;

		if (!sch->ops.cgroup_exit)
			continue;

		SCX_CALL_OP(sch, cgroup_exit, NULL, css->cgroup);
	}
}

/*
 * Enable cgroup support for @sch: walk all task groups in pre-order and, for
 * each one that is online but not yet inited, call ops.cgroup_init() if
 * implemented and mark it %SCX_TG_INITED. On success scx_cgroup_enabled is set.
 *
 * Returns 0 on success. If ops.cgroup_init() fails, scx_error() is raised and
 * its return value is passed back with scx_cgroup_enabled left unset.
 */
static int scx_cgroup_init(struct scx_sched *sch)
{
	struct cgroup_subsys_state *css;
	int ret;

	/*
	 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
	 * cgroups and init, all online cgroups are initialized.
	 */
	css_for_each_descendant_pre(css, &root_task_group.css) {
		struct task_group *tg = css_tg(css);
		struct scx_cgroup_init_args args = {
			.weight = tg->scx.weight,
			.bw_period_us = tg->scx.bw_period_us,
			.bw_quota_us = tg->scx.bw_quota_us,
			.bw_burst_us = tg->scx.bw_burst_us,
		};

		/* only handle task groups which are online and not yet inited */
		if ((tg->scx.flags &
		     (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
			continue;

		if (!sch->ops.cgroup_init) {
			tg->scx.flags |= SCX_TG_INITED;
			continue;
		}

		ret = SCX_CALL_OP_RET(sch, cgroup_init, NULL,
				      css->cgroup, &args);
		if (ret) {
			scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
			return ret;
		}
		tg->scx.flags |= SCX_TG_INITED;
	}

	WARN_ON_ONCE(scx_cgroup_enabled);
	scx_cgroup_enabled = true;

	return 0;
}

#else
/* stubs for builds without CONFIG_EXT_GROUP_SCHED */
static void scx_cgroup_exit(struct scx_sched *sch) {}
static int scx_cgroup_init(struct scx_sched *sch) { return 0; }
#endif


/********************************************************************************
 * Sysfs interface and ops enable/disable.
*/

/*
 * Define a read-only (0444) kobj_attribute named scx_attr_<_name> whose ->show
 * is scx_attr_<_name>_show(), which must already be declared.
 */
#define SCX_ATTR(_name)								\
	static struct kobj_attribute scx_attr_##_name = {			\
		.attr = { .name = __stringify(_name), .mode = 0444 },		\
		.show = scx_attr_##_name##_show,				\
	}

/* Show the current enable state as its scx_enable_state_str[] name. */
static ssize_t scx_attr_state_show(struct kobject *kobj,
				   struct kobj_attribute *ka, char *buf)
{
	return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]);
}
SCX_ATTR(state);

/* Show the current value of scx_switching_all as an integer. */
static ssize_t scx_attr_switch_all_show(struct kobject *kobj,
					struct kobj_attribute *ka, char *buf)
{
	return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all));
}
SCX_ATTR(switch_all);

/* Show the current value of the scx_nr_rejected counter. */
static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj,
					 struct kobj_attribute *ka, char *buf)
{
	return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected));
}
SCX_ATTR(nr_rejected);

/* Show the current value of the scx_hotplug_seq counter. */
static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
					 struct kobj_attribute *ka, char *buf)
{
	return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq));
}
SCX_ATTR(hotplug_seq);

/* Show the current value of the scx_enable_seq counter. */
static ssize_t scx_attr_enable_seq_show(struct kobject *kobj,
					struct kobj_attribute *ka, char *buf)
{
	return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq));
}
SCX_ATTR(enable_seq);

/* attributes which are not tied to a particular scx_sched instance */
static struct attribute *scx_global_attrs[] = {
	&scx_attr_state.attr,
	&scx_attr_switch_all.attr,
	&scx_attr_nr_rejected.attr,
	&scx_attr_hotplug_seq.attr,
	&scx_attr_enable_seq.attr,
	NULL,
};

static const struct attribute_group scx_global_attr_group = {
	.attrs = scx_global_attrs,
};

static void free_pnode(struct scx_sched_pnode *pnode);
static void free_exit_info(struct scx_exit_info *ei);

/*
 * RCU work function which frees the scx_sched embedding @work. It is queued by
 * scx_kobj_release(). Quiesces the disable irq_work, the helper kthread worker
 * and the bypass load-balance timer, then releases the cpumasks, the
 * sub-scheduler cgroup resources (with CONFIG_EXT_SUB_SCHED), the per-CPU and
 * per-node data, all remaining user DSQs, the DSQ hash table, the exit info
 * and finally the scx_sched itself.
 */
static void scx_sched_free_rcu_work(struct work_struct *work)
{
	struct rcu_work *rcu_work = to_rcu_work(work);
	struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
	struct rhashtable_iter rht_iter;
	struct scx_dispatch_q *dsq;
	int cpu, node;

	irq_work_sync(&sch->disable_irq_work);
	kthread_destroy_worker(sch->helper);
	timer_shutdown_sync(&sch->bypass_lb_timer);
	free_cpumask_var(sch->bypass_lb_donee_cpumask);
	free_cpumask_var(sch->bypass_lb_resched_cpumask);

#ifdef CONFIG_EXT_SUB_SCHED
	kfree(sch->cgrp_path);
	if (sch_cgroup(sch))
		cgroup_put(sch_cgroup(sch));
	if (sch->sub_kset)
		kobject_put(&sch->sub_kset->kobj);
#endif	/* CONFIG_EXT_SUB_SCHED */

	for_each_possible_cpu(cpu) {
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

		/*
		 * @sch would have entered bypass mode before the RCU grace
		 * period. As that blocks new deferrals, all
		 * deferred_reenq_local.node's must be off-list by now.
		 */
		WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node));

		exit_dsq(bypass_dsq(sch, cpu));
	}

	free_percpu(sch->pcpu);

	for_each_node_state(node, N_POSSIBLE)
		free_pnode(sch->pnode[node]);
	kfree(sch->pnode);

	/*
	 * Destroy every DSQ still in the hash table. The walk is restarted
	 * for as long as rhashtable_walk_next() reports -EAGAIN.
	 */
	rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
	do {
		rhashtable_walk_start(&rht_iter);

		while (!IS_ERR_OR_NULL((dsq = rhashtable_walk_next(&rht_iter))))
			destroy_dsq(sch, dsq->id);

		rhashtable_walk_stop(&rht_iter);
	} while (dsq == ERR_PTR(-EAGAIN));
	rhashtable_walk_exit(&rht_iter);

	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
	free_exit_info(sch->exit_info);
	kfree(sch);
}

/*
 * kobject ->release of scx_ktype. Freeing is deferred to
 * scx_sched_free_rcu_work(), which is queued as RCU work on system_dfl_wq.
 */
static void scx_kobj_release(struct kobject *kobj)
{
	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);

	INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
	queue_rcu_work(system_dfl_wq, &sch->rcu_work);
}

/* Show the ops name of the scx_sched that @kobj is embedded in. */
static ssize_t scx_attr_ops_show(struct kobject *kobj,
				 struct kobj_attribute *ka, char *buf)
{
	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);

	return sysfs_emit(buf, "%s\n", sch->ops.name);
}
SCX_ATTR(ops);

/*
 * Emit "<kind> <value>\n" into @buf at offset @at, where <kind> is the literal
 * field name and <value> is @events->kind. Evaluates to the sysfs_emit_at()
 * return value.
 */
#define scx_attr_event_show(buf, at, events, kind) ({				\
	sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind);		\
})

/*
 * Show the event counters of the scx_sched that @kobj is embedded in, one
 * "<name> <value>" pair per line, from a snapshot taken with
 * scx_read_events(). Returns the accumulated output length.
 */
static ssize_t scx_attr_events_show(struct kobject *kobj,
				    struct kobj_attribute *ka, char *buf)
{
	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
	struct scx_event_stats events;
	int at = 0;

	scx_read_events(sch, &events);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_IMMED);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_LOCAL_REPEAT);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_SUB_BYPASS_DISPATCH);
	return at;
}
SCX_ATTR(events);

/* per-scx_sched attributes, installed through scx_ktype's default_groups */
static struct attribute *scx_sched_attrs[] = {
	&scx_attr_ops.attr,
	&scx_attr_events.attr,
	NULL,
};
ATTRIBUTE_GROUPS(scx_sched);

static const struct kobj_type scx_ktype = {
	.release = scx_kobj_release,
	.sysfs_ops = &kobj_sysfs_ops,
	.default_groups = scx_sched_groups,
};

static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) 4936 { 4937 const struct scx_sched *sch; 4938 4939 /* 4940 * scx_uevent() can be reached by both scx_sched kobjects (scx_ktype) 4941 * and sub-scheduler kset kobjects (kset_ktype) through the parent 4942 * chain walk. Filter out the latter to avoid invalid casts. 4943 */ 4944 if (kobj->ktype != &scx_ktype) 4945 return 0; 4946 4947 sch = container_of(kobj, struct scx_sched, kobj); 4948 4949 return add_uevent_var(env, "SCXOPS=%s", sch->ops.name); 4950 } 4951 4952 static const struct kset_uevent_ops scx_uevent_ops = { 4953 .uevent = scx_uevent, 4954 }; 4955 4956 /* 4957 * Used by sched_fork() and __setscheduler_prio() to pick the matching 4958 * sched_class. dl/rt are already handled. 4959 */ 4960 bool task_should_scx(int policy) 4961 { 4962 /* if disabled, nothing should be on it */ 4963 if (!scx_enabled()) 4964 return false; 4965 4966 /* scx is taking over all SCHED_OTHER and SCHED_EXT tasks */ 4967 if (READ_ONCE(scx_switching_all)) 4968 return true; 4969 4970 /* 4971 * scx is tearing down - keep new SCHED_EXT tasks out. 4972 * 4973 * Must come after scx_switching_all test, which serves as a proxy 4974 * for __scx_switched_all. While __scx_switched_all is set, we must 4975 * return true via the branch above: a fork routed to fair would 4976 * stall because next_active_class() skips fair. 4977 * 4978 * This can develop into a deadlock - scx holds scx_enable_mutex across 4979 * kthread_create() in scx_alloc_and_add_sched(); if the new kthread is 4980 * the stalled task, the disable path can never grab the mutex to clear 4981 * scx_switching_all. 
4982 */ 4983 if (unlikely(scx_enable_state() == SCX_DISABLING)) 4984 return false; 4985 4986 return policy == SCHED_EXT; 4987 } 4988 4989 bool scx_allow_ttwu_queue(const struct task_struct *p) 4990 { 4991 struct scx_sched *sch; 4992 4993 if (!scx_enabled()) 4994 return true; 4995 4996 sch = scx_task_sched(p); 4997 if (unlikely(!sch)) 4998 return true; 4999 5000 if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) 5001 return true; 5002 5003 if (unlikely(p->sched_class != &ext_sched_class)) 5004 return true; 5005 5006 return false; 5007 } 5008 5009 /** 5010 * handle_lockup - sched_ext common lockup handler 5011 * @fmt: format string 5012 * 5013 * Called on system stall or lockup condition and initiates abort of sched_ext 5014 * if enabled, which may resolve the reported lockup. 5015 * 5016 * Returns %true if sched_ext is enabled and abort was initiated, which may 5017 * resolve the lockup. %false if sched_ext is not enabled or abort was already 5018 * initiated by someone else. 5019 */ 5020 static __printf(1, 2) bool handle_lockup(const char *fmt, ...) 5021 { 5022 struct scx_sched *sch; 5023 va_list args; 5024 bool ret; 5025 5026 guard(rcu)(); 5027 5028 sch = rcu_dereference(scx_root); 5029 if (unlikely(!sch)) 5030 return false; 5031 5032 switch (scx_enable_state()) { 5033 case SCX_ENABLING: 5034 case SCX_ENABLED: 5035 va_start(args, fmt); 5036 ret = scx_verror(sch, fmt, args); 5037 va_end(args); 5038 return ret; 5039 default: 5040 return false; 5041 } 5042 } 5043 5044 /** 5045 * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler 5046 * 5047 * While there are various reasons why RCU CPU stalls can occur on a system 5048 * that may not be caused by the current BPF scheduler, try kicking out the 5049 * current scheduler in an attempt to recover the system to a good state before 5050 * issuing panics. 5051 * 5052 * Returns %true if sched_ext is enabled and abort was initiated, which may 5053 * resolve the reported RCU stall. 
%false if sched_ext is not enabled or someone 5054 * else already initiated abort. 5055 */ 5056 bool scx_rcu_cpu_stall(void) 5057 { 5058 return handle_lockup("RCU CPU stall detected!"); 5059 } 5060 5061 /** 5062 * scx_softlockup - sched_ext softlockup handler 5063 * @dur_s: number of seconds of CPU stuck due to soft lockup 5064 * 5065 * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can 5066 * live-lock the system by making many CPUs target the same DSQ to the point 5067 * where soft-lockup detection triggers. This function is called from 5068 * soft-lockup watchdog when the triggering point is close and tries to unjam 5069 * the system by aborting the BPF scheduler. 5070 */ 5071 void scx_softlockup(u32 dur_s) 5072 { /* nothing more to report unless handle_lockup() actually initiated the abort */ 5073 if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s)) 5074 return; 5075 5076 printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n", 5077 smp_processor_id(), dur_s); 5078 } 5079 5080 /* 5081 * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(), 5082 * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing 5083 * it from NMI context can lead to deadlocks. Defer via irq_work; the 5084 * disable path runs off irq_work anyway. 5085 */ 5086 static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1); 5087 5088 static void scx_hardlockup_irq_workfn(struct irq_work *work) 5089 { /* consume the CPU recorded by scx_hardlockup() and reset the slot; -1 means nothing is pending */ 5090 int cpu = atomic_xchg(&scx_hardlockup_cpu, -1); 5091 5092 if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu)) 5093 printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", 5094 cpu); 5095 } 5096 5097 static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn); 5098 5099 /** 5100 * scx_hardlockup - sched_ext hardlockup handler * @cpu: CPU to report in the hard lockup error message 5101 * 5102 * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting 5103 * numerous affinitized tasks in a single queue and directing all CPUs at it. 
5104 * Try kicking out the current scheduler in an attempt to recover the system to 5105 * a good state before taking more drastic actions. 5106 * 5107 * Queues an irq_work; the handle_lockup() call happens in IRQ context (see 5108 * scx_hardlockup_irq_workfn). 5109 * 5110 * Returns %true if sched_ext is enabled and the work was queued, %false 5111 * otherwise. 5112 */ 5113 bool scx_hardlockup(int cpu) 5114 { 5115 if (!rcu_access_pointer(scx_root)) 5116 return false; 5117 /* record @cpu only if no earlier report is still pending (slot holds -1) */ 5118 atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu); 5119 irq_work_queue(&scx_hardlockup_irq_work); 5120 return true; 5121 } 5122 /* * Offload tasks from @donor's bypass DSQ onto the bypass DSQs of the CPUs in * @donee_mask. Stops once @donor's DSQ is down to @nr_donor_target, no donee * is left, or the DSQ has been walked. Every CPU that received a task is set * in @resched_mask and a donee that reached @nr_donee_target is cleared from * @donee_mask. Returns the number of tasks moved, 0 if the excess over * @nr_donor_target is negative or below the minimum-delta threshold. */ 5123 static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor, 5124 struct cpumask *donee_mask, struct cpumask *resched_mask, 5125 u32 nr_donor_target, u32 nr_donee_target) 5126 { 5127 struct rq *donor_rq = cpu_rq(donor); 5128 struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor); 5129 struct task_struct *p, *n; 5130 struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0); 5131 s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; 5132 u32 nr_balanced = 0, min_delta_us; 5133 5134 /* 5135 * All we want to guarantee is reasonable forward progress. No reason to 5136 * fine tune. Assuming every task on @donor_dsq runs their full slice, 5137 * consider offloading iff the total queued duration is over the 5138 * threshold. 
5139 */ 5140 min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV; /* * @delta is signed but the threshold expression is unsigned, so the * comparison below is performed in unsigned arithmetic. The queue can * shrink below @nr_donor_target after the caller's unlocked check; test * for a negative @delta explicitly so that it isn't converted into a * huge value which would slip past the threshold. */ 5141 if (delta < 0 || delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us))) 5142 return 0; 5143 5144 raw_spin_rq_lock_irq(donor_rq); 5145 raw_spin_lock(&donor_dsq->lock); 5146 list_add(&cursor.node, &donor_dsq->list); 5147 resume: 5148 n = container_of(&cursor, struct task_struct, scx.dsq_list); 5149 n = nldsq_next_task(donor_dsq, n, false); 5150 5151 while ((p = n)) { 5152 struct scx_dispatch_q *donee_dsq; 5153 int donee; 5154 /* fetch the successor first as @p may be moved off @donor_dsq below */ 5155 n = nldsq_next_task(donor_dsq, n, false); 5156 5157 if (donor_dsq->nr <= nr_donor_target) 5158 break; 5159 5160 if (cpumask_empty(donee_mask)) 5161 break; 5162 5163 /* 5164 * If an earlier pass placed @p on @donor_dsq from a different 5165 * CPU and the donee hasn't consumed it yet, @p is still on the 5166 * previous CPU and task_rq(@p) != @donor_rq. @p can't be moved 5167 * without its rq locked. Skip. 5168 */ 5169 if (task_rq(p) != donor_rq) 5170 continue; 5171 5172 donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); 5173 if (donee >= nr_cpu_ids) 5174 continue; 5175 5176 donee_dsq = bypass_dsq(sch, donee); 5177 5178 /* 5179 * $p's rq is not locked but $p's DSQ lock protects its 5180 * scheduling properties making this test safe. 5181 */ 5182 if (!task_can_run_on_remote_rq(sch, p, cpu_rq(donee), false)) 5183 continue; 5184 5185 /* 5186 * Moving $p from one non-local DSQ to another. The source rq 5187 * and DSQ are already locked. Do an abbreviated dequeue and 5188 * then perform enqueue without unlocking $donor_dsq. 5189 * 5190 * We don't want to drop and reacquire the lock on each 5191 * iteration as @donor_dsq can be very long and potentially 5192 * highly contended. Donee DSQs are less likely to be contended. 5193 * The nested locking is safe as only this LB moves tasks 5194 * between bypass DSQs. 
5195 */ 5196 dispatch_dequeue_locked(p, donor_dsq); 5197 dispatch_enqueue(sch, cpu_rq(donee), donee_dsq, p, SCX_ENQ_NESTED); 5198 5199 /* 5200 * $donee might have been idle and need to be woken up. No need 5201 * to be clever. Kick every CPU that receives tasks. 5202 */ 5203 cpumask_set_cpu(donee, resched_mask); 5204 5205 if (READ_ONCE(donee_dsq->nr) >= nr_donee_target) 5206 cpumask_clear_cpu(donee, donee_mask); 5207 5208 nr_balanced++; /* every SCX_BYPASS_LB_BATCH moves, park the cursor before @n and briefly drop both locks */ 5209 if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { 5210 list_move_tail(&cursor.node, &n->scx.dsq_list.node); 5211 raw_spin_unlock(&donor_dsq->lock); 5212 raw_spin_rq_unlock_irq(donor_rq); 5213 cpu_relax(); 5214 raw_spin_rq_lock_irq(donor_rq); 5215 raw_spin_lock(&donor_dsq->lock); 5216 goto resume; 5217 } 5218 } 5219 5220 list_del_init(&cursor.node); 5221 raw_spin_unlock(&donor_dsq->lock); 5222 raw_spin_rq_unlock_irq(donor_rq); 5223 5224 return nr_balanced; 5225 } 5226 /* * Balance bypass DSQ lengths across the online CPUs of @node: count queued * tasks, derive the per-CPU targets, mark CPUs below the target as donees, * offload from the CPUs above the donor target through bypass_lb_cpu(), * resched every CPU that received tasks and emit the before/after min/max * queue lengths through the sched_ext_bypass_lb tracepoint. */ 5227 static void bypass_lb_node(struct scx_sched *sch, int node) 5228 { 5229 const struct cpumask *node_mask = cpumask_of_node(node); 5230 struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask; 5231 struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask; 5232 u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; 5233 u32 nr_target, nr_donor_target; 5234 u32 before_min = U32_MAX, before_max = 0; 5235 u32 after_min = U32_MAX, after_max = 0; 5236 int cpu; 5237 5238 /* count the target tasks and CPUs */ 5239 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5240 u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr); 5241 5242 nr_tasks += nr; 5243 nr_cpus++; 5244 5245 before_min = min(nr, before_min); 5246 before_max = max(nr, before_max); 5247 } 5248 5249 if (!nr_cpus) 5250 return; 5251 5252 /* 5253 * We don't want CPUs to have more than $nr_donor_target tasks and 5254 * balancing to fill donee CPUs up to $nr_target. Once targets are 5255 * calculated, find the donee CPUs. 
5256 */ 5257 nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); 5258 nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); 5259 5260 cpumask_clear(donee_mask); 5261 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5262 if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target) 5263 cpumask_set_cpu(cpu, donee_mask); 5264 } 5265 5266 /* iterate !donee CPUs and see if they should be offloaded */ 5267 cpumask_clear(resched_mask); 5268 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5269 if (cpumask_empty(donee_mask)) 5270 break; 5271 if (cpumask_test_cpu(cpu, donee_mask)) 5272 continue; 5273 if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target) 5274 continue; 5275 5276 nr_balanced += bypass_lb_cpu(sch, cpu, donee_mask, resched_mask, 5277 nr_donor_target, nr_target); 5278 } 5279 5280 for_each_cpu(cpu, resched_mask) 5281 resched_cpu(cpu); 5282 /* sample the queue lengths again for the tracepoint */ 5283 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5284 u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr); 5285 5286 after_min = min(nr, after_min); 5287 after_max = max(nr, after_max); 5288 5289 } 5290 5291 trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced, 5292 before_min, before_max, after_min, after_max); 5293 } 5294 5295 /* 5296 * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine 5297 * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some 5298 * bypass DSQs can be overloaded. If there are enough tasks to saturate other 5299 * lightly loaded CPUs, such imbalance can lead to very high execution latency 5300 * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such 5301 * outcomes, a simple load balancing mechanism is implemented by the following 5302 * timer which runs periodically while bypass mode is in effect. 
5303 */ 5304 static void scx_bypass_lb_timerfn(struct timer_list *timer) 5305 { 5306 struct scx_sched *sch = container_of(timer, struct scx_sched, bypass_lb_timer); 5307 int node; 5308 u32 intv_us; 5309 /* bypass dispatch is off: return without re-arming the timer */ 5310 if (!bypass_dsp_enabled(sch)) 5311 return; 5312 5313 for_each_node_with_cpus(node) 5314 bypass_lb_node(sch, node); 5315 /* re-arm for the next round; a zero interval leaves the timer unarmed */ 5316 intv_us = READ_ONCE(scx_bypass_lb_intv_us); 5317 if (intv_us) 5318 mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); 5319 } 5320 /* * Increment @sch's bypass depth with scx_bypass_lock held. Returns %true only * when the depth went from 0 to 1, in which case the default slice is switched * to the bypass slice, the bypass start time is stamped and * SCX_EV_BYPASS_ACTIVATE is counted. Returns %false for nested increments. */ 5321 static bool inc_bypass_depth(struct scx_sched *sch) 5322 { 5323 lockdep_assert_held(&scx_bypass_lock); 5324 5325 WARN_ON_ONCE(sch->bypass_depth < 0); 5326 WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1); 5327 if (sch->bypass_depth != 1) 5328 return false; 5329 5330 WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC); 5331 sch->bypass_timestamp = ktime_get_ns(); 5332 scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); 5333 return true; 5334 } 5335 /* * Decrement @sch's bypass depth with scx_bypass_lock held. Returns %true only * when the depth dropped to 0, in which case the default slice is restored to * SCX_SLICE_DFL and the time elapsed since the bypass start stamp is added to * SCX_EV_BYPASS_DURATION. Returns %false while still nested. */ 5336 static bool dec_bypass_depth(struct scx_sched *sch) 5337 { 5338 lockdep_assert_held(&scx_bypass_lock); 5339 5340 WARN_ON_ONCE(sch->bypass_depth < 1); 5341 WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1); 5342 if (sch->bypass_depth != 0) 5343 return false; 5344 5345 WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL); 5346 scx_add_event(sch, SCX_EV_BYPASS_DURATION, 5347 ktime_get_ns() - sch->bypass_timestamp); 5348 return true; 5349 } 5350 /* * Turn on bypass dispatch for @sch: take the bypass_dsp_claim bit, bump * bypass_dsp_enable_depth on @sch and, when @sch has a parent, on the parent * too, then start the host's load-balancing timer if an interval is configured * and the timer isn't already pending. */ 5351 static void enable_bypass_dsp(struct scx_sched *sch) 5352 { 5353 struct scx_sched *host = scx_parent(sch) ?: sch; 5354 u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us); 5355 s32 ret; 5356 5357 /* 5358 * @sch->bypass_depth transitioning from 0 to 1 triggers enabling. 5359 * Shouldn't stagger. 5360 */ 5361 if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim))) 5362 return; 5363 5364 /* 5365 * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of 5366 * the nearest non-bypassing ancestor or root. 
As enable_bypass_dsp() is 5367 * called iff @sch is not already bypassed due to an ancestor bypassing, 5368 * we can assume that the parent is not bypassing and thus will be the 5369 * host of the bypass DSQs. 5370 * 5371 * While the situation may change in the future, the following 5372 * guarantees that the nearest non-bypassing ancestor or root has bypass 5373 * dispatch enabled while a descendant is bypassing, which is all that's 5374 * required. 5375 * 5376 * bypass_dsp_enabled() test is used to determine whether to enter the 5377 * bypass dispatch handling path from both bypassing and hosting scheds. 5378 * Bump enable depth on both @sch and bypass dispatch host. 5379 */ 5380 ret = atomic_inc_return(&sch->bypass_dsp_enable_depth); 5381 WARN_ON_ONCE(ret <= 0); 5382 5383 if (host != sch) { 5384 ret = atomic_inc_return(&host->bypass_dsp_enable_depth); 5385 WARN_ON_ONCE(ret <= 0); 5386 } 5387 5388 /* 5389 * The LB timer will stop running if bypass dispatch is disabled. Start 5390 * after enabling bypass dispatch. 5391 */ 5392 if (intv_us && !timer_pending(&host->bypass_lb_timer)) 5393 mod_timer(&host->bypass_lb_timer, 5394 jiffies + usecs_to_jiffies(intv_us)); 5395 } 5396 5397 /* may be called without holding scx_bypass_lock */ 5398 static void disable_bypass_dsp(struct scx_sched *sch) 5399 { 5400 s32 ret; 5401 /* * Only undo the depth bumps if the claim bit taken by * enable_bypass_dsp() is still set; otherwise this call is a no-op. */ 5402 if (!test_and_clear_bit(0, &sch->bypass_dsp_claim)) 5403 return; 5404 5405 ret = atomic_dec_return(&sch->bypass_dsp_enable_depth); 5406 WARN_ON_ONCE(ret < 0); 5407 /* mirror the parent-side increment done when bypass dispatch was enabled */ 5408 if (scx_parent(sch)) { 5409 ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth); 5410 WARN_ON_ONCE(ret < 0); 5411 } 5412 } 5413 5414 /** 5415 * scx_bypass - [Un]bypass scx_ops and guarantee forward progress 5416 * @sch: sched to bypass 5417 * @bypass: true for bypass, false for unbypass 5418 * 5419 * Bypassing guarantees that all runnable tasks make forward progress without 5420 * trusting the BPF scheduler. 
We can't grab any mutexes or rwsems as they might 5421 * be held by tasks that the BPF scheduler is forgetting to run, which 5422 * unfortunately also excludes toggling the static branches. 5423 * 5424 * Let's work around by overriding a couple ops and modifying behaviors based on 5425 * the DISABLING state and then cycling the queued tasks through dequeue/enqueue 5426 * to force global FIFO scheduling. 5427 * 5428 * - ops.select_cpu() is ignored and the default select_cpu() is used. 5429 * 5430 * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. 5431 * %SCX_OPS_ENQ_LAST is also ignored. 5432 * 5433 * - ops.dispatch() is ignored. 5434 * 5435 * - balance_one() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice 5436 * can't be trusted. Whenever a tick triggers, the running task is rotated to 5437 * the tail of the queue with core_sched_at touched. 5438 * 5439 * - pick_next_task() suppresses zero slice warning. 5440 * 5441 * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM 5442 * operations. 5443 * 5444 * - scx_prio_less() reverts to the default core_sched_at order. 5445 */ 5446 static void scx_bypass(struct scx_sched *sch, bool bypass) 5447 { 5448 struct scx_sched *pos; 5449 unsigned long flags; 5450 int cpu; 5451 5452 raw_spin_lock_irqsave(&scx_bypass_lock, flags); 5453 5454 if (bypass) { 5455 if (!inc_bypass_depth(sch)) 5456 goto unlock; 5457 5458 enable_bypass_dsp(sch); 5459 } else { 5460 if (!dec_bypass_depth(sch)) 5461 goto unlock; 5462 } 5463 5464 /* 5465 * Bypass state is propagated to all descendants - an scx_sched bypasses 5466 * if itself or any of its ancestors are in bypass mode. 5467 */ 5468 raw_spin_lock(&scx_sched_lock); 5469 scx_for_each_descendant_pre(pos, sch) { 5470 if (pos == sch) 5471 continue; 5472 if (bypass) 5473 inc_bypass_depth(pos); 5474 else 5475 dec_bypass_depth(pos); 5476 } 5477 raw_spin_unlock(&scx_sched_lock); 5478 5479 /* 5480 * No task property is changing. 
We just need to make sure all currently 5481 * queued tasks are re-queued according to the new scx_bypassing() 5482 * state. As an optimization, walk each rq's runnable_list instead of 5483 * the scx_tasks list. 5484 * 5485 * This function can't trust the scheduler and thus can't use 5486 * cpus_read_lock(). Walk all possible CPUs instead of online. 5487 */ 5488 for_each_possible_cpu(cpu) { 5489 struct rq *rq = cpu_rq(cpu); 5490 struct task_struct *p, *n; 5491 5492 raw_spin_rq_lock(rq); 5493 raw_spin_lock(&scx_sched_lock); 5494 5495 scx_for_each_descendant_pre(pos, sch) { 5496 struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu); 5497 5498 if (pos->bypass_depth) 5499 pcpu->flags |= SCX_SCHED_PCPU_BYPASSING; 5500 else 5501 pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING; 5502 } 5503 5504 raw_spin_unlock(&scx_sched_lock); 5505 5506 /* 5507 * We need to guarantee that no tasks are on the BPF scheduler 5508 * while bypassing. Either we see enabled or the enable path 5509 * sees scx_bypassing() before moving tasks to SCX. 5510 */ 5511 if (!scx_enabled()) { 5512 raw_spin_rq_unlock(rq); 5513 continue; 5514 } 5515 5516 /* 5517 * The use of list_for_each_entry_safe_reverse() is required 5518 * because each task is going to be removed from and added back 5519 * to the runnable_list during iteration. Because they're added 5520 * to the tail of the list, safe reverse iteration can still 5521 * visit all nodes. 
5522 */ 5523 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, 5524 scx.runnable_node) { 5525 if (!scx_is_descendant(scx_task_sched(p), sch)) 5526 continue; 5527 5528 /* cycling deq/enq is enough, see the function comment */ 5529 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 5530 /* nothing */ ; 5531 } 5532 } 5533 5534 /* resched to restore ticks and idle state */ 5535 if (cpu_online(cpu) || cpu == smp_processor_id()) 5536 resched_curr(rq); 5537 5538 raw_spin_rq_unlock(rq); 5539 } 5540 5541 /* disarming must come after moving all tasks out of the bypass DSQs */ 5542 if (!bypass) 5543 disable_bypass_dsp(sch); 5544 unlock: 5545 raw_spin_unlock_irqrestore(&scx_bypass_lock, flags); 5546 } 5547 5548 static void free_exit_info(struct scx_exit_info *ei) 5549 { 5550 kvfree(ei->dump); 5551 kfree(ei->msg); 5552 kfree(ei->bt); 5553 kfree(ei); 5554 } 5555 5556 static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) 5557 { 5558 struct scx_exit_info *ei; 5559 5560 ei = kzalloc_obj(*ei); 5561 if (!ei) 5562 return NULL; 5563 5564 ei->bt = kzalloc_objs(ei->bt[0], SCX_EXIT_BT_LEN); 5565 ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); 5566 ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); 5567 5568 if (!ei->bt || !ei->msg || !ei->dump) { 5569 free_exit_info(ei); 5570 return NULL; 5571 } 5572 5573 return ei; 5574 } 5575 5576 static const char *scx_exit_reason(enum scx_exit_kind kind) 5577 { 5578 switch (kind) { 5579 case SCX_EXIT_UNREG: 5580 return "unregistered from user space"; 5581 case SCX_EXIT_UNREG_BPF: 5582 return "unregistered from BPF"; 5583 case SCX_EXIT_UNREG_KERN: 5584 return "unregistered from the main kernel"; 5585 case SCX_EXIT_SYSRQ: 5586 return "disabled by sysrq-S"; 5587 case SCX_EXIT_PARENT: 5588 return "parent exiting"; 5589 case SCX_EXIT_ERROR: 5590 return "runtime error"; 5591 case SCX_EXIT_ERROR_BPF: 5592 return "scx_bpf_error"; 5593 case SCX_EXIT_ERROR_STALL: 5594 return "runnable task stall"; 5595 default: 5596 return 
"<UNKNOWN>"; 5597 } 5598 } 5599 5600 static void free_kick_syncs(void) 5601 { 5602 int cpu; 5603 5604 for_each_possible_cpu(cpu) { 5605 struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); 5606 struct scx_kick_syncs *to_free; 5607 5608 to_free = rcu_replace_pointer(*ksyncs, NULL, true); 5609 if (to_free) 5610 kvfree_rcu(to_free, rcu); 5611 } 5612 } 5613 5614 static void refresh_watchdog(void) 5615 { 5616 struct scx_sched *sch; 5617 unsigned long intv = ULONG_MAX; 5618 5619 /* take the shortest timeout and use its half for watchdog interval */ 5620 rcu_read_lock(); 5621 list_for_each_entry_rcu(sch, &scx_sched_all, all) 5622 intv = max(min(intv, sch->watchdog_timeout / 2), 1); 5623 rcu_read_unlock(); 5624 5625 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 5626 WRITE_ONCE(scx_watchdog_interval, intv); 5627 5628 if (intv < ULONG_MAX) 5629 mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv); 5630 else 5631 cancel_delayed_work_sync(&scx_watchdog_work); 5632 } 5633 5634 static s32 scx_link_sched(struct scx_sched *sch) 5635 { 5636 const char *err_msg = ""; 5637 s32 ret = 0; 5638 5639 scoped_guard(raw_spinlock_irq, &scx_sched_lock) { 5640 #ifdef CONFIG_EXT_SUB_SCHED 5641 struct scx_sched *parent = scx_parent(sch); 5642 5643 if (parent) { 5644 /* 5645 * scx_claim_exit() propagates exit_kind transition to 5646 * its sub-scheds while holding scx_sched_lock - either 5647 * we can see the parent's non-NONE exit_kind or the 5648 * parent can shoot us down. 
5649 */ 5650 if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) { 5651 err_msg = "parent disabled"; 5652 ret = -ENOENT; 5653 break; 5654 } 5655 5656 ret = rhashtable_lookup_insert_fast(&scx_sched_hash, 5657 &sch->hash_node, scx_sched_hash_params); 5658 if (ret) { 5659 err_msg = "failed to insert into scx_sched_hash"; 5660 break; 5661 } 5662 5663 list_add_tail(&sch->sibling, &parent->children); 5664 } 5665 #endif /* CONFIG_EXT_SUB_SCHED */ 5666 5667 list_add_tail_rcu(&sch->all, &scx_sched_all); 5668 } 5669 5670 /* 5671 * scx_error() takes scx_sched_lock via scx_claim_exit(), so it must run after 5672 * the guard above is released. 5673 */ 5674 if (ret) { 5675 scx_error(sch, "%s (%d)", err_msg, ret); 5676 return ret; 5677 } 5678 5679 refresh_watchdog(); 5680 return 0; 5681 } 5682 5683 static void scx_unlink_sched(struct scx_sched *sch) 5684 { 5685 scoped_guard(raw_spinlock_irq, &scx_sched_lock) { 5686 #ifdef CONFIG_EXT_SUB_SCHED 5687 if (scx_parent(sch)) { 5688 rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node, 5689 scx_sched_hash_params); 5690 list_del_init(&sch->sibling); 5691 } 5692 #endif /* CONFIG_EXT_SUB_SCHED */ 5693 list_del_rcu(&sch->all); 5694 } 5695 5696 refresh_watchdog(); 5697 } 5698 5699 /* 5700 * Called to disable future dumps and wait for in-progress one while disabling 5701 * @sch. Once @sch becomes empty during disable, there's no point in dumping it. 5702 * This prevents calling dump ops on a dead sch. 5703 */ 5704 static void scx_disable_dump(struct scx_sched *sch) 5705 { 5706 guard(raw_spinlock_irqsave)(&scx_dump_lock); 5707 sch->dump_disabled = true; 5708 } 5709 5710 #ifdef CONFIG_EXT_SUB_SCHED 5711 static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); 5712 5713 static void drain_descendants(struct scx_sched *sch) 5714 { 5715 /* 5716 * Child scheds that finished the critical part of disabling will take 5717 * themselves off @sch->children. Wait for it to drain. 
As propagation 5718 * is recursive, empty @sch->children means that all proper descendant 5719 * scheds reached unlinking stage. 5720 */ 5721 wait_event(scx_unlink_waitq, list_empty(&sch->children)); 5722 } 5723 5724 static void scx_fail_parent(struct scx_sched *sch, 5725 struct task_struct *failed, s32 fail_code) 5726 { 5727 struct scx_sched *parent = scx_parent(sch); 5728 struct scx_task_iter sti; 5729 struct task_struct *p; 5730 5731 scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler", 5732 fail_code, failed->comm, failed->pid); 5733 5734 /* 5735 * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into 5736 * it. This may cause downstream failures on the BPF side but $parent is 5737 * dying anyway. 5738 */ 5739 scx_bypass(parent, true); 5740 5741 scx_task_iter_start(&sti, sch->cgrp); 5742 while ((p = scx_task_iter_next_locked(&sti))) { 5743 if (scx_task_on_sched(parent, p)) 5744 continue; 5745 5746 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 5747 scx_disable_and_exit_task(sch, p); 5748 scx_set_task_sched(p, parent); 5749 } 5750 } 5751 scx_task_iter_stop(&sti); 5752 } 5753 5754 static void scx_sub_disable(struct scx_sched *sch) 5755 { 5756 struct scx_sched *parent = scx_parent(sch); 5757 struct scx_task_iter sti; 5758 struct task_struct *p; 5759 int ret; 5760 5761 /* 5762 * Guarantee forward progress and wait for descendants to be disabled. 5763 * To limit disruptions, $parent is not bypassed. Tasks are fully 5764 * prepped and then inserted back into $parent. 5765 */ 5766 scx_bypass(sch, true); 5767 drain_descendants(sch); 5768 5769 /* 5770 * Here, every runnable task is guaranteed to make forward progress and 5771 * we can safely use blocking synchronization constructs. Actually 5772 * disable ops. 
5773 */ 5774 mutex_lock(&scx_enable_mutex); 5775 percpu_down_write(&scx_fork_rwsem); 5776 scx_cgroup_lock(); 5777 5778 set_cgroup_sched(sch_cgroup(sch), parent); 5779 5780 scx_task_iter_start(&sti, sch->cgrp); 5781 while ((p = scx_task_iter_next_locked(&sti))) { 5782 struct rq *rq; 5783 struct rq_flags rf; 5784 5785 /* filter out duplicate visits */ 5786 if (scx_task_on_sched(parent, p)) 5787 continue; 5788 5789 /* 5790 * By the time control reaches here, all descendant schedulers 5791 * should already have been disabled. 5792 */ 5793 WARN_ON_ONCE(!scx_task_on_sched(sch, p)); 5794 5795 /* 5796 * If $p is about to be freed, nothing prevents $sch from 5797 * unloading before $p reaches sched_ext_free(). Disable and 5798 * exit $p right away. 5799 */ 5800 if (!tryget_task_struct(p)) { 5801 scx_disable_and_exit_task(sch, p); 5802 continue; 5803 } 5804 5805 scx_task_iter_unlock(&sti); 5806 5807 /* 5808 * $p is READY or ENABLED on @sch. Initialize for $parent, 5809 * disable and exit from @sch, and then switch over to $parent. 5810 * 5811 * If a task fails to initialize for $parent, the only available 5812 * action is disabling $parent too. While this allows disabling 5813 * of a child sched to cause the parent scheduler to fail, the 5814 * failure can only originate from ops.init_task() of the 5815 * parent. A child can't directly affect the parent through its 5816 * own failures. 5817 */ 5818 ret = __scx_init_task(parent, p, false); 5819 if (ret) { 5820 scx_fail_parent(sch, p, ret); 5821 put_task_struct(p); 5822 break; 5823 } 5824 5825 rq = task_rq_lock(p, &rf); 5826 5827 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 5828 /* 5829 * sched_ext_dead() raced us between __scx_init_task() 5830 * and this rq lock and ran exit_task() on @sch (the 5831 * sched @p was on at that point), not on $parent. 5832 * $parent's just-completed init is owed an exit_task() 5833 * and we issue it here. 
5834 */ 5835 scx_sub_init_cancel_task(parent, p); 5836 task_rq_unlock(rq, p, &rf); 5837 put_task_struct(p); 5838 continue; 5839 } 5840 5841 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 5842 /* 5843 * $p is initialized for $parent and still attached to 5844 * @sch. Disable and exit for @sch, switch over to 5845 * $parent, override the state to READY to account for 5846 * $p having already been initialized, and then enable. 5847 */ 5848 scx_disable_and_exit_task(sch, p); 5849 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 5850 scx_set_task_state(p, SCX_TASK_INIT); 5851 scx_set_task_sched(p, parent); 5852 scx_set_task_state(p, SCX_TASK_READY); 5853 scx_enable_task(parent, p); 5854 } 5855 5856 task_rq_unlock(rq, p, &rf); 5857 put_task_struct(p); 5858 } 5859 scx_task_iter_stop(&sti); 5860 5861 scx_disable_dump(sch); 5862 5863 scx_cgroup_unlock(); 5864 percpu_up_write(&scx_fork_rwsem); 5865 5866 /* 5867 * All tasks are moved off of @sch but there may still be on-going 5868 * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use 5869 * the expedited version as ancestors may be waiting in bypass mode. 5870 * Also, tell the parent that there is no need to keep running bypass 5871 * DSQs for us. 5872 */ 5873 synchronize_rcu_expedited(); 5874 disable_bypass_dsp(sch); 5875 5876 scx_unlink_sched(sch); 5877 5878 mutex_unlock(&scx_enable_mutex); 5879 5880 /* 5881 * @sch is now unlinked from the parent's children list. Notify and call 5882 * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called 5883 * after unlinking and releasing all locks. See scx_claim_exit(). 
	 */
	wake_up_all(&scx_unlink_waitq);

	if (parent->ops.sub_detach && sch->sub_attached) {
		struct scx_sub_detach_args sub_detach_args = {
			.ops = &sch->ops,
			.cgroup_path = sch->cgrp_path,
		};
		SCX_CALL_OP(parent, sub_detach, NULL,
			    &sub_detach_args);
	}

	if (sch->ops.exit)
		SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
	if (sch->sub_kset)
		kobject_del(&sch->sub_kset->kobj);
	kobject_del(&sch->kobj);
}
#else	/* CONFIG_EXT_SUB_SCHED */
/* Sub-scheduler support is compiled out: both helpers are empty stubs. */
static void drain_descendants(struct scx_sched *sch) { }
static void scx_sub_disable(struct scx_sched *sch) { }
#endif	/* CONFIG_EXT_SUB_SCHED */

/*
 * Disable the root scheduler @sch.
 *
 * Enters bypass mode and waits for descendants, flips the enable state to
 * %SCX_DISABLING, shuts down cgroup support, switches every task to the class
 * returned by scx_setscheduler_class() and exits it, turns off the static
 * keys, reports the exit through the log and ops.exit(), unlinks @sch, clears
 * scx_root and deletes the kobjects. Bypass mode is turned off again on the
 * way out, including on the early "already disabled" path.
 */
static void scx_root_disable(struct scx_sched *sch)
{
	struct scx_exit_info *ei = sch->exit_info;
	struct scx_task_iter sti;
	struct task_struct *p;
	int cpu;

	/* guarantee forward progress and wait for descendants to be disabled */
	scx_bypass(sch, true);
	drain_descendants(sch);

	/*
	 * The switch acts on the value returned by scx_set_enable_state(); the
	 * WARN_ON_ONCE()s in this function treat that value as the state that
	 * was in effect before the call.
	 */
	switch (scx_set_enable_state(SCX_DISABLING)) {
	case SCX_DISABLING:
		WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
		break;
	case SCX_DISABLED:
		/* nothing was enabled: restore DISABLED and only undo bypass */
		pr_warn("sched_ext: ops error detected without ops (%s)\n",
			sch->exit_info->msg);
		WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
		goto done;
	default:
		break;
	}

	/*
	 * Here, every runnable task is guaranteed to make forward progress and
	 * we can safely use blocking synchronization constructs. Actually
	 * disable ops.
	 */
	mutex_lock(&scx_enable_mutex);

	static_branch_disable(&__scx_switched_all);
	WRITE_ONCE(scx_switching_all, false);

	/*
	 * Shut down cgroup support before tasks so that the cgroup attach path
	 * doesn't race against scx_disable_and_exit_task().
	 */
	scx_cgroup_lock();
	scx_cgroup_exit(sch);
	scx_cgroup_unlock();

	/*
	 * The BPF scheduler is going away. All tasks including %TASK_DEAD ones
	 * must be switched out and exited synchronously.
	 */
	percpu_down_write(&scx_fork_rwsem);

	scx_init_task_enabled = false;

	scx_task_iter_start(&sti, NULL);
	while ((p = scx_task_iter_next_locked(&sti))) {
		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
		const struct sched_class *old_class = p->sched_class;
		const struct sched_class *new_class = scx_setscheduler_class(p);

		/* %DEQUEUE_NOCLOCK is passed below, so update the rq clock here */
		update_rq_clock(task_rq(p));

		if (old_class != new_class)
			queue_flags |= DEQUEUE_CLASS;

		scoped_guard (sched_change, p, queue_flags) {
			p->sched_class = new_class;
		}

		scx_disable_and_exit_task(scx_task_sched(p), p);
	}
	scx_task_iter_stop(&sti);

	scx_disable_dump(sch);

	scx_cgroup_lock();
	set_cgroup_sched(sch_cgroup(sch), NULL);
	scx_cgroup_unlock();

	percpu_up_write(&scx_fork_rwsem);

	/*
	 * Invalidate all the rq clocks to prevent getting outdated
	 * rq clocks from a previous scx scheduler.
	 */
	for_each_possible_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		scx_rq_clock_invalidate(rq);
	}

	/* no task is on scx, turn off all the switches and flush in-progress calls */
	static_branch_disable(&__scx_enabled);
	bitmap_zero(sch->has_op, SCX_OPI_END);
	scx_idle_disable();
	synchronize_rcu();

	/* error exits are logged at err level with message and backtrace */
	if (ei->kind >= SCX_EXIT_ERROR) {
		pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
		       sch->ops.name, ei->reason);

		if (ei->msg[0] != '\0')
			pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
#ifdef CONFIG_STACKTRACE
		stack_trace_print(ei->bt, ei->bt_len, 2);
#endif
	} else {
		pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
			sch->ops.name, ei->reason);
	}

	if (sch->ops.exit)
		SCX_CALL_OP(sch, exit, NULL, ei);

	scx_unlink_sched(sch);

	/*
	 * scx_root clearing must be inside cpus_read_lock(). See
	 * handle_hotplug().
	 */
	cpus_read_lock();
	RCU_INIT_POINTER(scx_root, NULL);
	cpus_read_unlock();

	/*
	 * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
	 * could observe an object of the same name still in the hierarchy when
	 * the next scheduler is loaded.
	 */
#ifdef CONFIG_EXT_SUB_SCHED
	if (sch->sub_kset)
		kobject_del(&sch->sub_kset->kobj);
#endif
	kobject_del(&sch->kobj);

	free_kick_syncs();

	mutex_unlock(&scx_enable_mutex);

	WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
done:
	scx_bypass(sch, false);
}

/*
 * Claim the exit on @sch. The caller must ensure that the helper kthread work
 * is kicked before the current task can be preempted.
 * Once exit_kind is claimed, scx_error() can no longer trigger, so if the
 * current task gets preempted and the BPF scheduler fails to schedule it back,
 * the helper work will never be kicked and the whole system can wedge.
 *
 * Returns %true if this call claimed the exit, %false if sch->exit_kind was
 * no longer %SCX_EXIT_NONE.
 */
static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
{
	int none = SCX_EXIT_NONE;

	lockdep_assert_preemption_disabled();

	/* NONE and DONE are not valid exit reasons; treat them as an error */
	if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
		kind = SCX_EXIT_ERROR;

	/* only the transition NONE -> @kind claims the exit */
	if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
		return false;

	/*
	 * Some CPUs may be trapped in the dispatch paths. Set the aborting
	 * flag to break potential live-lock scenarios, ensuring we can
	 * successfully reach scx_bypass().
	 */
	WRITE_ONCE(sch->aborting, true);

	/*
	 * Propagate exits to descendants immediately. Each has a dedicated
	 * helper kthread and can run in parallel. While most of disabling is
	 * serialized, running them in separate threads allows parallelizing
	 * ops.exit(), which can take arbitrarily long prolonging bypass mode.
	 *
	 * To guarantee forward progress, this propagation must be in-line so
	 * that ->aborting is synchronously asserted for all sub-scheds. The
	 * propagation is also the interlocking point against sub-sched
	 * attachment. See scx_link_sched().
	 *
	 * This doesn't cause recursions as propagation only takes place for
	 * non-propagation exits.
	 */
	if (kind != SCX_EXIT_PARENT) {
		scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) {
			struct scx_sched *pos;
			scx_for_each_descendant_pre(pos, sch)
				scx_disable(pos, SCX_EXIT_PARENT);
		}
	}

	return true;
}

/*
 * kthread work function which performs the actual disabling of the scx_sched
 * embedding @work. Moves sch->exit_kind to %SCX_EXIT_DONE, records the
 * claimed kind and its reason string in the exit info, and then runs the
 * sub-sched or root disable path depending on whether the sched has a parent.
 */
static void scx_disable_workfn(struct kthread_work *work)
{
	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
	struct scx_exit_info *ei = sch->exit_info;
	int kind;

	kind = atomic_read(&sch->exit_kind);
	/*
	 * On failure, atomic_try_cmpxchg() stores the current value into @kind
	 * so each iteration re-checks the latest exit_kind.
	 */
	while (true) {
		if (kind == SCX_EXIT_DONE)	/* already disabled? */
			return;
		WARN_ON_ONCE(kind == SCX_EXIT_NONE);
		if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
			break;
	}
	ei->kind = kind;
	ei->reason = scx_exit_reason(ei->kind);

	if (scx_parent(sch))
		scx_sub_disable(sch);
	else
		scx_root_disable(sch);
}

/*
 * Claim the exit on @sch with @kind and, if the claim succeeded, queue the
 * irq_work which kicks off disabling. Preemption is disabled across both
 * steps as required by scx_claim_exit().
 */
static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind)
{
	guard(preempt)();
	if (scx_claim_exit(sch, kind))
		irq_work_queue(&sch->disable_irq_work);
}

/**
 * scx_flush_disable_work - flush the disable work and wait for it to finish
 * @sch: the scheduler
 *
 * sch->disable_work might not have been queued yet, in which case
 * kthread_flush_work() would be a noop. Syncing the irq_work first is
 * required to guarantee the kthread work has been queued before waiting for
 * it.
6134 */ 6135 static void scx_flush_disable_work(struct scx_sched *sch) 6136 { 6137 int kind; 6138 6139 do { 6140 irq_work_sync(&sch->disable_irq_work); 6141 kthread_flush_work(&sch->disable_work); 6142 kind = atomic_read(&sch->exit_kind); 6143 } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE); 6144 } 6145 6146 static void dump_newline(struct seq_buf *s) 6147 { 6148 trace_sched_ext_dump(""); 6149 6150 /* @s may be zero sized and seq_buf triggers WARN if so */ 6151 if (s->size) 6152 seq_buf_putc(s, '\n'); 6153 } 6154 6155 static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) 6156 { 6157 va_list args; 6158 6159 #ifdef CONFIG_TRACEPOINTS 6160 if (trace_sched_ext_dump_enabled()) { 6161 /* protected by scx_dump_lock */ 6162 static char line_buf[SCX_EXIT_MSG_LEN]; 6163 6164 va_start(args, fmt); 6165 vscnprintf(line_buf, sizeof(line_buf), fmt, args); 6166 va_end(args); 6167 6168 trace_sched_ext_dump(line_buf); 6169 } 6170 #endif 6171 /* @s may be zero sized and seq_buf triggers WARN if so */ 6172 if (s->size) { 6173 va_start(args, fmt); 6174 seq_buf_vprintf(s, fmt, args); 6175 va_end(args); 6176 6177 seq_buf_putc(s, '\n'); 6178 } 6179 } 6180 6181 static void dump_stack_trace(struct seq_buf *s, const char *prefix, 6182 const unsigned long *bt, unsigned int len) 6183 { 6184 unsigned int i; 6185 6186 for (i = 0; i < len; i++) 6187 dump_line(s, "%s%pS", prefix, (void *)bt[i]); 6188 } 6189 6190 static void ops_dump_init(struct seq_buf *s, const char *prefix) 6191 { 6192 struct scx_dump_data *dd = &scx_dump_data; 6193 6194 lockdep_assert_irqs_disabled(); 6195 6196 dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ 6197 dd->first = true; 6198 dd->cursor = 0; 6199 dd->s = s; 6200 dd->prefix = prefix; 6201 } 6202 6203 static void ops_dump_flush(void) 6204 { 6205 struct scx_dump_data *dd = &scx_dump_data; 6206 char *line = dd->buf.line; 6207 6208 if (!dd->cursor) 6209 return; 6210 6211 /* 6212 * There's something to flush and this is the first 
	 * line. Insert a blank line to distinguish ops dump.
	 */
	if (dd->first) {
		dump_newline(dd->s);
		dd->first = false;
	}

	/*
	 * There may be multiple lines in $line. Scan and emit each line
	 * separately.
	 */
	while (true) {
		char *end = line;
		char c;

		while (*end != '\n' && *end != '\0')
			end++;

		/*
		 * If $line overflowed, it may not have newline at the end.
		 * Always emit with a newline.
		 */
		c = *end;
		/* terminate the current line in place so that it can be printed */
		*end = '\0';
		dump_line(dd->s, "%s%s", dd->prefix, line);
		if (c == '\0')
			break;

		/* move to the next line */
		end++;
		if (*end == '\0')
			break;
		line = end;
	}

	dd->cursor = 0;
}

/*
 * Finish an ops dump section started by ops_dump_init(): flush what's
 * buffered and reset scx_dump_data.cpu to -1.
 */
static void ops_dump_exit(void)
{
	ops_dump_flush();
	scx_dump_data.cpu = -1;
}

/*
 * Dump the state of task @p into @s on behalf of @sch.
 *
 * Emits a summary line starting with @marker and the task state character,
 * followed by SCX state, DSQ, vtime/slice/weight and cpumask lines. If @sch
 * implements ops.dump_task(), it's invoked with @rq, @dctx and @p, and the
 * task's stack trace is appended when one can be saved. "*" is printed in
 * front of the scheduler ID when @p belongs to @sch.
 */
static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx,
			  struct rq *rq, struct task_struct *p, char marker)
{
	/*
	 * NOTE(review): static buffer, so callers must be serialized.
	 * scx_dump_state() calls this while holding scx_dump_lock.
	 */
	static unsigned long bt[SCX_EXIT_BT_LEN];
	struct scx_sched *task_sch = scx_task_sched(p);
	const char *own_marker;
	char sch_id_buf[32];
	/* fits "0x" + 16 hex digits + NUL */
	char dsq_id_buf[19] = "(n/a)";
	unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
	unsigned int bt_len = 0;

	own_marker = task_sch == sch ? "*" : "";

	if (task_sch->level == 0)
		scnprintf(sch_id_buf, sizeof(sch_id_buf), "root");
	else
		scnprintf(sch_id_buf, sizeof(sch_id_buf), "sub%d-%llu",
			  task_sch->level, task_sch->ops.sub_cgroup_id);

	if (p->scx.dsq)
		scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
			  (unsigned long long)p->scx.dsq->id);

	dump_newline(s);
	dump_line(s, " %c%c %s[%d] %s%s %+ldms",
		  marker, task_state_to_char(p), p->comm, p->pid,
		  own_marker, sch_id_buf,
		  jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
	dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu",
		  scx_get_task_state(p) >> SCX_TASK_STATE_SHIFT,
		  p->scx.flags & ~SCX_TASK_STATE_MASK,
		  p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
		  ops_state >> SCX_OPSS_QSEQ_SHIFT);
	dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
		  p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
	dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
		  p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
	dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr),
		  p->migration_disabled);

	/* let the BPF scheduler append its own per-task information */
	if (SCX_HAS_OP(sch, dump_task)) {
		ops_dump_init(s, " ");
		SCX_CALL_OP(sch, dump_task, rq, dctx, p);
		ops_dump_exit();
	}

	/* without CONFIG_STACKTRACE, bt_len stays 0 and no trace is printed */
#ifdef CONFIG_STACKTRACE
	bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
#endif
	if (bt_len) {
		dump_newline(s);
		dump_stack_trace(s, " ", bt, bt_len);
	}
}

/*
 * Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless
 * of which scheduler they belong to. If false, only dump tasks owned by @sch.
 * For SysRq-D dumps, @dump_all_tasks=false since all schedulers are dumped
 * separately. For error dumps, @dump_all_tasks=true since only the failing
 * scheduler is dumped.
 *
 * The dump is written into @ei->dump, which is treated as @dump_len bytes
 * long. Nothing is generated if @sch->dump_disabled is set. The whole function
 * runs under scx_dump_lock.
 */
static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
			   size_t dump_len, bool dump_all_tasks)
{
	static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
	/* context handed to the ops.dump*() callbacks */
	struct scx_dump_ctx dctx = {
		.kind = ei->kind,
		.exit_code = ei->exit_code,
		.reason = ei->reason,
		.at_ns = ktime_get_ns(),
		.at_jiffies = jiffies,
	};
	struct seq_buf s;
	struct scx_event_stats events;
	char *buf;
	int cpu;

	guard(raw_spinlock_irqsave)(&scx_dump_lock);

	if (sch->dump_disabled)
		return;

	seq_buf_init(&s, ei->dump, dump_len);

#ifdef CONFIG_EXT_SUB_SCHED
	if (sch->level == 0)
		dump_line(&s, "%s: root", sch->ops.name);
	else
		dump_line(&s, "%s: sub%d-%llu %s",
			  sch->ops.name, sch->level, sch->ops.sub_cgroup_id,
			  sch->cgrp_path);
#endif
	/* %SCX_EXIT_NONE means a debug dump rather than an actual exit */
	if (ei->kind == SCX_EXIT_NONE) {
		dump_line(&s, "Debug dump triggered by %s", ei->reason);
	} else {
		dump_line(&s, "%s[%d] triggered exit kind %d:",
			  current->comm, current->pid, ei->kind);
		dump_line(&s, " %s (%s)", ei->reason, ei->msg);
		dump_newline(&s);
		dump_line(&s, "Backtrace:");
		dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
	}

	if (SCX_HAS_OP(sch, dump)) {
		ops_dump_init(&s, "");
		SCX_CALL_OP(sch, dump, NULL, &dctx);
		ops_dump_exit();
	}

	dump_newline(&s);
	dump_line(&s, "CPU states");
	dump_line(&s, "----------");

	for_each_possible_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct rq_flags rf;
		struct task_struct *p;
		struct seq_buf ns;
		size_t avail, used;
		bool idle;

		rq_lock_irqsave(rq, &rf);

		/* idle: nothing on the runnable list and running the idle class */
		idle = list_empty(&rq->scx.runnable_list) &&
			rq->curr->sched_class == &idle_sched_class;

		if (idle && !SCX_HAS_OP(sch, dump_cpu))
			goto next;

		/*
		 * We don't yet know whether ops.dump_cpu() will produce output
		 * and we may want to skip the default CPU dump if it doesn't.
		 * Use a nested seq_buf to generate the standard dump so that we
		 * can decide whether to commit later.
		 */
		avail = seq_buf_get_buf(&s, &buf);
		seq_buf_init(&ns, buf, avail);

		dump_newline(&ns);
		dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu",
			  cpu, rq->scx.nr_running, rq->scx.flags,
			  rq->scx.cpu_released, rq->scx.ops_qseq,
			  rq->scx.kick_sync);
		dump_line(&ns, " curr=%s[%d] class=%ps",
			  rq->curr->comm, rq->curr->pid,
			  rq->curr->sched_class);
		if (!cpumask_empty(rq->scx.cpus_to_kick))
			dump_line(&ns, " cpus_to_kick : %*pb",
				  cpumask_pr_args(rq->scx.cpus_to_kick));
		if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
			dump_line(&ns, " idle_to_kick : %*pb",
				  cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
		if (!cpumask_empty(rq->scx.cpus_to_preempt))
			dump_line(&ns, " cpus_to_preempt: %*pb",
				  cpumask_pr_args(rq->scx.cpus_to_preempt));
		if (!cpumask_empty(rq->scx.cpus_to_wait))
			dump_line(&ns, " cpus_to_wait : %*pb",
				  cpumask_pr_args(rq->scx.cpus_to_wait));
		if (!cpumask_empty(rq->scx.cpus_to_sync))
			dump_line(&ns, " cpus_to_sync : %*pb",
				  cpumask_pr_args(rq->scx.cpus_to_sync));

		/* remember how much the default dump used to detect ops output */
		used = seq_buf_used(&ns);
		if (SCX_HAS_OP(sch, dump_cpu)) {
			ops_dump_init(&ns, " ");
			SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle);
			ops_dump_exit();
		}

		/*
		 * If idle && nothing generated by ops.dump_cpu(), there's
		 * nothing interesting. Skip.
		 */
		if (idle && used == seq_buf_used(&ns))
			goto next;

		/*
		 * $s may already have overflowed when $ns was created. If so,
		 * calling commit on it will trigger BUG.
		 */
		if (avail) {
			seq_buf_commit(&s, seq_buf_used(&ns));
			if (seq_buf_has_overflowed(&ns))
				seq_buf_set_overflow(&s);
		}

		/* the currently running task is marked with '*' */
		if (rq->curr->sched_class == &ext_sched_class &&
		    (dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
			scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*');

		list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
			if (dump_all_tasks || scx_task_on_sched(sch, p))
				scx_dump_task(sch, &s, &dctx, rq, p, ' ');
	next:
		rq_unlock_irqrestore(rq, &rf);
	}

	dump_newline(&s);
	dump_line(&s, "Event counters");
	dump_line(&s, "--------------");

	scx_read_events(sch, &events);
	scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
	scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
	scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
	scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
	scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
	scx_dump_event(s, &events, SCX_EV_REENQ_IMMED);
	scx_dump_event(s, &events, SCX_EV_REENQ_LOCAL_REPEAT);
	scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL);
	scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
	scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
	scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
	scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED);
	scx_dump_event(s, &events, SCX_EV_SUB_BYPASS_DISPATCH);

	/*
	 * On overflow, overwrite the tail of the dump buffer with the
	 * truncation marker, terminating NUL included, provided that the
	 * buffer is large enough to hold it.
	 */
	if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
		memcpy(ei->dump + dump_len - sizeof(trunc_marker),
		       trunc_marker, sizeof(trunc_marker));
}

/*
 * irq_work callback queued by scx_disable() and scx_vexit(). Generates the
 * debug dump for error exits and then hands the rest of disabling over to the
 * helper kthread.
 */
static void scx_disable_irq_workfn(struct irq_work *irq_work)
{
	struct scx_sched *sch = container_of(irq_work, struct scx_sched, disable_irq_work);
	struct scx_exit_info *ei = sch->exit_info;

	if (ei->kind >= SCX_EXIT_ERROR)
		scx_dump_state(sch, ei, sch->ops.exit_dump_len, true);

	kthread_queue_work(sch->helper, &sch->disable_work);
}

/*
 * Claim the exit on @sch with @kind, record @exit_code, the message formatted
 * from @fmt/@args and, for error kinds with CONFIG_STACKTRACE, the current
 * backtrace in the exit info, and queue the disable irq_work.
 *
 * Returns %false without touching the exit info if the exit has already been
 * claimed, %true otherwise.
 */
static bool scx_vexit(struct scx_sched *sch,
		      enum scx_exit_kind kind, s64 exit_code,
		      const char *fmt, va_list args)
{
	struct scx_exit_info *ei = sch->exit_info;

	/* scx_claim_exit() asserts that preemption is disabled */
	guard(preempt)();

	if (!scx_claim_exit(sch, kind))
		return false;

	ei->exit_code = exit_code;
#ifdef CONFIG_STACKTRACE
	if (kind >= SCX_EXIT_ERROR)
		ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
#endif
	vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);

	/*
	 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
	 * in scx_disable_workfn().
	 */
	ei->kind = kind;
	ei->reason = scx_exit_reason(ei->kind);

	irq_work_queue(&sch->disable_irq_work);
	return true;
}

/* Allocate the scx_kick_syncs array of every possible CPU. */
static int alloc_kick_syncs(void)
{
	int cpu;

	/*
	 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
	 * can exceed percpu allocator limits on large machines.
	 */
	for_each_possible_cpu(cpu) {
		struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu);
		struct scx_kick_syncs *new_ksyncs;

		/* a previous array is expected to have been freed already */
		WARN_ON_ONCE(rcu_access_pointer(*ksyncs));

		new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids),
					   GFP_KERNEL, cpu_to_node(cpu));
		if (!new_ksyncs) {
			/* release whatever was allocated for the earlier CPUs */
			free_kick_syncs();
			return -ENOMEM;
		}

		rcu_assign_pointer(*ksyncs, new_ksyncs);
	}

	return 0;
}

/* Tear down @pnode's global DSQ and free @pnode. %NULL is a noop. */
static void free_pnode(struct scx_sched_pnode *pnode)
{
	if (!pnode)
		return;
	exit_dsq(&pnode->global_dsq);
	kfree(pnode);
}

/*
 * Allocate a zeroed scx_sched_pnode on NUMA node @node and initialize its
 * global DSQ for @sch. Returns the new pnode or %NULL on failure.
 */
static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node)
{
	struct scx_sched_pnode *pnode;

	pnode = kzalloc_node(sizeof(*pnode), GFP_KERNEL, node);
	if (!pnode)
		return NULL;

	if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) {
		kfree(pnode);
		return NULL;
	}

	return pnode;
}

/*
 * Allocate and initialize a new scx_sched. @cgrp's reference is always
 * consumed whether the function succeeds or fails.
 *
 * @parent is %NULL for the root scheduler, in which case the new sched is at
 * level 0. Otherwise, the new sched sits one level below @parent and inherits
 * its ancestors array. Returns the new sched or an ERR_PTR() on failure.
 */
static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
						 struct cgroup *cgrp,
						 struct scx_sched *parent)
{
	struct scx_sched *sch;
	s32 level = parent ? parent->level + 1 : 0;
	/*
	 * bypass_fail_cpu stays at nr_cpu_ids unless initializing a bypass DSQ
	 * fails, in which case it records the CPU at which the unwinding loop
	 * should stop.
	 */
	s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids;

	/* ancestors[] holds one entry per level from the root down to itself */
	sch = kzalloc_flex(*sch, ancestors, level + 1);
	if (!sch) {
		ret = -ENOMEM;
		goto err_put_cgrp;
	}

	sch->exit_info = alloc_exit_info(ops->exit_dump_len);
	if (!sch->exit_info) {
		ret = -ENOMEM;
		goto err_free_sch;
	}

	ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params);
	if (ret < 0)
		goto err_free_ei;

	sch->pnode = kzalloc_objs(sch->pnode[0], nr_node_ids);
	if (!sch->pnode) {
		ret = -ENOMEM;
		goto err_free_hash;
	}

	for_each_node_state(node, N_POSSIBLE) {
		sch->pnode[node] = alloc_pnode(sch, node);
		if (!sch->pnode[node]) {
			ret = -ENOMEM;
			goto err_free_pnode;
		}
	}

	/* the per-CPU area is sized to hold a dispatch buffer of dsp_max_batch */
	sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
	sch->pcpu = __alloc_percpu(struct_size_t(struct scx_sched_pcpu,
						 dsp_ctx.buf, sch->dsp_max_batch),
				   __alignof__(struct scx_sched_pcpu));
	if (!sch->pcpu) {
		ret = -ENOMEM;
		goto err_free_pnode;
	}

	for_each_possible_cpu(cpu) {
		ret = init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
		if (ret) {
			bypass_fail_cpu = cpu;
			goto err_free_pcpu;
		}
	}

	for_each_possible_cpu(cpu) {
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

		pcpu->sch = sch;
		INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node);
	}

	sch->helper = kthread_run_worker(0, "sched_ext_helper");
	if (IS_ERR(sch->helper)) {
		ret = PTR_ERR(sch->helper);
		goto err_free_pcpu;
	}

	sched_set_fifo(sch->helper->task);

	if (parent)
		memcpy(sch->ancestors, parent->ancestors,
		       level * sizeof(parent->ancestors[0]));
	sch->ancestors[level] = sch;
	sch->level = level;

	if (ops->timeout_ms)
		sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
	else
		sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;

	sch->slice_dfl = SCX_SLICE_DFL;
	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
	sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn);
	kthread_init_work(&sch->disable_work, scx_disable_workfn);
	timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);

	if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) {
		ret = -ENOMEM;
		goto err_stop_helper;
	}
	if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) {
		ret = -ENOMEM;
		goto err_free_lb_cpumask;
	}
	/* from here on @ops->priv points to @sch; failure paths clear it again */
	sch->ops = *ops;
	rcu_assign_pointer(ops->priv, sch);

	sch->kobj.kset = scx_kset;
	INIT_LIST_HEAD(&sch->all);

#ifdef CONFIG_EXT_SUB_SCHED
	/* temporary buffer for rendering the cgroup path */
	char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
	if (!buf) {
		ret = -ENOMEM;
		goto err_free_lb_resched;
	}
	cgroup_path(cgrp, buf, PATH_MAX);
	sch->cgrp_path = kstrdup(buf, GFP_KERNEL);
	kfree(buf);
	if (!sch->cgrp_path) {
		ret = -ENOMEM;
		goto err_free_lb_resched;
	}

	sch->cgrp = cgrp;
	INIT_LIST_HEAD(&sch->children);
	INIT_LIST_HEAD(&sch->sibling);

	if (parent)
		ret = kobject_init_and_add(&sch->kobj, &scx_ktype,
					   &parent->sub_kset->kobj,
					   "sub-%llu", cgroup_id(cgrp));
	else
		ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");

	/*
	 * NOTE(review): once kobject_init_and_add() has been called, failures
	 * only drop the kobject reference instead of going through the error
	 * labels below. Presumably scx_ktype's release callback frees @sch and
	 * everything attached to it, including the @cgrp reference - confirm.
	 */
	if (ret < 0) {
		RCU_INIT_POINTER(ops->priv, NULL);
		kobject_put(&sch->kobj);
		return ERR_PTR(ret);
	}

	/* only scheds which can take sub-scheds get a "sub" kset */
	if (ops->sub_attach) {
		sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj);
		if (!sch->sub_kset) {
			RCU_INIT_POINTER(ops->priv, NULL);
			kobject_put(&sch->kobj);
			return ERR_PTR(-ENOMEM);
		}
	}
#else	/* CONFIG_EXT_SUB_SCHED */
	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
	if (ret < 0) {
		RCU_INIT_POINTER(ops->priv, NULL);
		kobject_put(&sch->kobj);
		return ERR_PTR(ret);
	}
#endif	/* CONFIG_EXT_SUB_SCHED */
	return sch;

	/* error unwinding, in the reverse order of the setup above */
#ifdef CONFIG_EXT_SUB_SCHED
err_free_lb_resched:
	RCU_INIT_POINTER(ops->priv, NULL);
	free_cpumask_var(sch->bypass_lb_resched_cpumask);
#endif
err_free_lb_cpumask:
	free_cpumask_var(sch->bypass_lb_donee_cpumask);
err_stop_helper:
	kthread_destroy_worker(sch->helper);
err_free_pcpu:
	/* only the bypass DSQs initialized before the failing CPU are exited */
	for_each_possible_cpu(cpu) {
		if (cpu == bypass_fail_cpu)
			break;
		exit_dsq(bypass_dsq(sch, cpu));
	}
	free_percpu(sch->pcpu);
err_free_pnode:
	/* free_pnode() ignores the entries which were never allocated */
	for_each_node_state(node, N_POSSIBLE)
		free_pnode(sch->pnode[node]);
	kfree(sch->pnode);
err_free_hash:
	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
err_free_ei:
	free_exit_info(sch->exit_info);
err_free_sch:
	kfree(sch);
err_put_cgrp:
#ifdef CONFIG_EXT_SUB_SCHED
	cgroup_put(cgrp);
#endif
	return ERR_PTR(ret);
}

/*
 * Verify that no CPU hotplug event took place since the scheduler was
 * initialized. A zero @ops->hotplug_seq skips the check. On mismatch, the
 * scheduler is exited with %SCX_EXIT_UNREG_KERN and -EBUSY is returned.
 * Returns 0 otherwise.
 */
static int check_hotplug_seq(struct scx_sched *sch,
			     const struct sched_ext_ops *ops)
{
	unsigned long long global_hotplug_seq;

	/*
	 * If a hotplug event has occurred between when a scheduler was
	 * initialized, and when we were able to attach, exit and notify user
	 * space about it.
	 */
	if (ops->hotplug_seq) {
		global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
		if (ops->hotplug_seq != global_hotplug_seq) {
			scx_exit(sch, SCX_EXIT_UNREG_KERN,
				 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
				 "expected hotplug seq %llu did not match actual %llu",
				 ops->hotplug_seq, global_hotplug_seq);
			return -EBUSY;
		}
	}

	return 0;
}

/*
 * Sanity check the flag and callback combination in @ops. Returns 0 if
 * acceptable. Otherwise, an error is raised on @sch and -EINVAL is returned.
 */
static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
{
	/*
	 * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the
	 * ops.enqueue() callback isn't implemented.
	 */
	if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
		scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
		return -EINVAL;
	}

	/*
	 * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
	 * selection policy to be enabled.
	 */
	if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
	    (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
		scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
		return -EINVAL;
	}

	/* deprecated callbacks only trigger a warning, not a failure */
	if (ops->cpu_acquire || ops->cpu_release)
		pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");

	return 0;
}

/*
 * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
 * starvation. During the READY -> ENABLED task switching loop, the calling
 * thread's sched_class gets switched from fair to ext. As fair has higher
 * priority than ext, the calling thread can be indefinitely starved under
 * fair-class saturation, leading to a system hang.
 */
struct scx_enable_cmd {
	struct kthread_work	work;	/* work item run on the kthread */
	struct sched_ext_ops	*ops;	/* ops to enable */
	int			ret;	/* result written by the work function */
};

static void scx_root_enable_workfn(struct kthread_work *work)
{
	struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
	struct sched_ext_ops *ops = cmd->ops;
	struct cgroup *cgrp = root_cgroup();
	struct scx_sched *sch;
	struct scx_task_iter sti;
	struct task_struct *p;
	int i, cpu, ret;

	mutex_lock(&scx_enable_mutex);

	if (scx_enable_state() != SCX_DISABLED) {
		ret = -EBUSY;
		goto err_unlock;
	}

	/*
	 * @ops->priv binds @ops to its scx_sched instance. It is set here by
	 * scx_alloc_and_add_sched() and cleared at the tail of bpf_scx_unreg(),
	 * which runs after scx_root_disable() has dropped scx_enable_mutex.
If 6842 * it's still non-NULL here, a previous attachment on @ops has not 6843 * finished tearing down; proceeding would let the in-flight unreg's 6844 * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign. 6845 */ 6846 if (rcu_access_pointer(ops->priv)) { 6847 ret = -EBUSY; 6848 goto err_unlock; 6849 } 6850 6851 ret = alloc_kick_syncs(); 6852 if (ret) 6853 goto err_unlock; 6854 6855 #ifdef CONFIG_EXT_SUB_SCHED 6856 cgroup_get(cgrp); 6857 #endif 6858 sch = scx_alloc_and_add_sched(ops, cgrp, NULL); 6859 if (IS_ERR(sch)) { 6860 ret = PTR_ERR(sch); 6861 goto err_free_ksyncs; 6862 } 6863 6864 /* 6865 * Transition to ENABLING and clear exit info to arm the disable path. 6866 * Failure triggers full disabling from here on. 6867 */ 6868 WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); 6869 WARN_ON_ONCE(scx_root); 6870 6871 atomic_long_set(&scx_nr_rejected, 0); 6872 6873 for_each_possible_cpu(cpu) { 6874 struct rq *rq = cpu_rq(cpu); 6875 6876 rq->scx.local_dsq.sched = sch; 6877 rq->scx.cpuperf_target = SCX_CPUPERF_ONE; 6878 } 6879 6880 /* 6881 * Keep CPUs stable during enable so that the BPF scheduler can track 6882 * online CPUs by watching ->on/offline_cpu() after ->init(). 6883 */ 6884 cpus_read_lock(); 6885 6886 /* 6887 * Make the scheduler instance visible. Must be inside cpus_read_lock(). 6888 * See handle_hotplug(). 
6889 */ 6890 rcu_assign_pointer(scx_root, sch); 6891 6892 ret = scx_link_sched(sch); 6893 if (ret) { 6894 cpus_read_unlock(); 6895 goto err_disable; 6896 } 6897 6898 scx_idle_enable(ops); 6899 6900 if (sch->ops.init) { 6901 ret = SCX_CALL_OP_RET(sch, init, NULL); 6902 if (ret) { 6903 ret = ops_sanitize_err(sch, "init", ret); 6904 cpus_read_unlock(); 6905 scx_error(sch, "ops.init() failed (%d)", ret); 6906 goto err_disable; 6907 } 6908 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 6909 } 6910 6911 for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) 6912 if (((void (**)(void))ops)[i]) 6913 set_bit(i, sch->has_op); 6914 6915 ret = check_hotplug_seq(sch, ops); 6916 if (ret) { 6917 cpus_read_unlock(); 6918 goto err_disable; 6919 } 6920 scx_idle_update_selcpu_topology(ops); 6921 6922 cpus_read_unlock(); 6923 6924 ret = validate_ops(sch, ops); 6925 if (ret) 6926 goto err_disable; 6927 6928 /* 6929 * Once __scx_enabled is set, %current can be switched to SCX anytime. 6930 * This can lead to stalls as some BPF schedulers (e.g. userspace 6931 * scheduling) may not function correctly before all tasks are switched. 6932 * Init in bypass mode to guarantee forward progress. 6933 */ 6934 scx_bypass(sch, true); 6935 6936 for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) 6937 if (((void (**)(void))ops)[i]) 6938 set_bit(i, sch->has_op); 6939 6940 if (sch->ops.cpu_acquire || sch->ops.cpu_release) 6941 sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT; 6942 6943 /* 6944 * Lock out forks, cgroup on/offlining and moves before opening the 6945 * floodgate so that they don't wander into the operations prematurely. 6946 */ 6947 percpu_down_write(&scx_fork_rwsem); 6948 6949 WARN_ON_ONCE(scx_init_task_enabled); 6950 scx_init_task_enabled = true; 6951 6952 /* 6953 * Enable ops for every task. Fork is excluded by scx_fork_rwsem 6954 * preventing new tasks from being added. 
No need to exclude tasks 6955 * leaving as sched_ext_free() can handle both prepped and enabled 6956 * tasks. Prep all tasks first and then enable them with preemption 6957 * disabled. 6958 * 6959 * All cgroups should be initialized before scx_init_task() so that the 6960 * BPF scheduler can reliably track each task's cgroup membership from 6961 * scx_init_task(). Lock out cgroup on/offlining and task migrations 6962 * while tasks are being initialized so that scx_cgroup_can_attach() 6963 * never sees uninitialized tasks. 6964 */ 6965 scx_cgroup_lock(); 6966 set_cgroup_sched(sch_cgroup(sch), sch); 6967 ret = scx_cgroup_init(sch); 6968 if (ret) 6969 goto err_disable_unlock_all; 6970 6971 scx_task_iter_start(&sti, NULL); 6972 while ((p = scx_task_iter_next_locked(&sti))) { 6973 struct rq_flags rf; 6974 struct rq *rq; 6975 6976 /* 6977 * @p may already be dead, have lost all its usages counts and 6978 * be waiting for RCU grace period before being freed. @p can't 6979 * be initialized for SCX in such cases and should be ignored. 6980 */ 6981 if (!tryget_task_struct(p)) 6982 continue; 6983 6984 /* 6985 * Set %INIT_BEGIN under the iter's rq lock so that a concurrent 6986 * sched_ext_dead() does not call ops.exit_task() on @p while 6987 * ops.init_task() is running. If sched_ext_dead() runs before 6988 * this store, it has already removed @p from scx_tasks and the 6989 * iter won't visit @p; if it runs after, it observes 6990 * %INIT_BEGIN and transitions to %DEAD without calling ops, 6991 * leaving the post-init recheck below to unwind. 
6992 */ 6993 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 6994 scx_task_iter_unlock(&sti); 6995 6996 ret = __scx_init_task(sch, p, false); 6997 6998 rq = task_rq_lock(p, &rf); 6999 7000 if (unlikely(ret)) { 7001 if (scx_get_task_state(p) != SCX_TASK_DEAD) 7002 scx_set_task_state(p, SCX_TASK_NONE); 7003 task_rq_unlock(rq, p, &rf); 7004 scx_task_iter_stop(&sti); 7005 scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", 7006 ret, p->comm, p->pid); 7007 put_task_struct(p); 7008 goto err_disable_unlock_all; 7009 } 7010 7011 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 7012 /* 7013 * sched_ext_dead() observed %INIT_BEGIN and set %DEAD. 7014 * ops.exit_task() is owed to the sched __scx_init_task() 7015 * ran against; call it now. 7016 */ 7017 scx_sub_init_cancel_task(sch, p); 7018 } else { 7019 scx_set_task_state(p, SCX_TASK_INIT); 7020 scx_set_task_sched(p, sch); 7021 scx_set_task_state(p, SCX_TASK_READY); 7022 } 7023 7024 task_rq_unlock(rq, p, &rf); 7025 put_task_struct(p); 7026 } 7027 scx_task_iter_stop(&sti); 7028 scx_cgroup_unlock(); 7029 percpu_up_write(&scx_fork_rwsem); 7030 7031 /* 7032 * All tasks are READY. It's safe to turn on scx_enabled() and switch 7033 * all eligible tasks. 7034 */ 7035 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 7036 static_branch_enable(&__scx_enabled); 7037 7038 /* 7039 * We're fully committed and can't fail. The task READY -> ENABLED 7040 * transitions here are synchronized against sched_ext_free() through 7041 * scx_tasks_lock. 
7042 */ 7043 percpu_down_write(&scx_fork_rwsem); 7044 scx_task_iter_start(&sti, NULL); 7045 while ((p = scx_task_iter_next_locked(&sti))) { 7046 unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 7047 const struct sched_class *old_class = p->sched_class; 7048 const struct sched_class *new_class = scx_setscheduler_class(p); 7049 7050 if (scx_get_task_state(p) != SCX_TASK_READY) 7051 continue; 7052 7053 if (old_class != new_class) 7054 queue_flags |= DEQUEUE_CLASS; 7055 7056 scoped_guard (sched_change, p, queue_flags) { 7057 p->scx.slice = READ_ONCE(sch->slice_dfl); 7058 p->sched_class = new_class; 7059 } 7060 } 7061 scx_task_iter_stop(&sti); 7062 percpu_up_write(&scx_fork_rwsem); 7063 7064 scx_bypass(sch, false); 7065 7066 if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { 7067 WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); 7068 goto err_disable; 7069 } 7070 7071 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) 7072 static_branch_enable(&__scx_switched_all); 7073 7074 pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", 7075 sch->ops.name, scx_switched_all() ? "" : " (partial)"); 7076 kobject_uevent(&sch->kobj, KOBJ_ADD); 7077 mutex_unlock(&scx_enable_mutex); 7078 7079 atomic_long_inc(&scx_enable_seq); 7080 7081 cmd->ret = 0; 7082 return; 7083 7084 err_free_ksyncs: 7085 free_kick_syncs(); 7086 err_unlock: 7087 mutex_unlock(&scx_enable_mutex); 7088 cmd->ret = ret; 7089 return; 7090 7091 err_disable_unlock_all: 7092 scx_cgroup_unlock(); 7093 percpu_up_write(&scx_fork_rwsem); 7094 /* we'll soon enter disable path, keep bypass on */ 7095 err_disable: 7096 mutex_unlock(&scx_enable_mutex); 7097 /* 7098 * Returning an error code here would not pass all the error information 7099 * to userspace. Record errno using scx_error() for cases scx_error() 7100 * wasn't already invoked and exit indicating success so that the error 7101 * is notified through ops.exit() with all the details. 
7102 * 7103 * Flush scx_disable_work to ensure that error is reported before init 7104 * completion. sch's base reference will be put by bpf_scx_unreg(). 7105 */ 7106 scx_error(sch, "scx_root_enable() failed (%d)", ret); 7107 scx_flush_disable_work(sch); 7108 cmd->ret = 0; 7109 } 7110 7111 #ifdef CONFIG_EXT_SUB_SCHED 7112 /* verify that a scheduler can be attached to @cgrp and return the parent */ 7113 static struct scx_sched *find_parent_sched(struct cgroup *cgrp) 7114 { 7115 struct scx_sched *parent = cgrp->scx_sched; 7116 struct scx_sched *pos; 7117 7118 lockdep_assert_held(&scx_sched_lock); 7119 7120 /* can't attach twice to the same cgroup */ 7121 if (parent->cgrp == cgrp) 7122 return ERR_PTR(-EBUSY); 7123 7124 /* does $parent allow sub-scheds? */ 7125 if (!parent->ops.sub_attach) 7126 return ERR_PTR(-EOPNOTSUPP); 7127 7128 /* can't insert between $parent and its exiting children */ 7129 list_for_each_entry(pos, &parent->children, sibling) 7130 if (cgroup_is_descendant(pos->cgrp, cgrp)) 7131 return ERR_PTR(-EBUSY); 7132 7133 return parent; 7134 } 7135 7136 static bool assert_task_ready_or_enabled(struct task_struct *p) 7137 { 7138 u32 state = scx_get_task_state(p); 7139 7140 switch (state) { 7141 case SCX_TASK_READY: 7142 case SCX_TASK_ENABLED: 7143 return true; 7144 default: 7145 WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched", 7146 state, p->comm, p->pid); 7147 return false; 7148 } 7149 } 7150 7151 static void scx_sub_enable_workfn(struct kthread_work *work) 7152 { 7153 struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); 7154 struct sched_ext_ops *ops = cmd->ops; 7155 struct cgroup *cgrp; 7156 struct scx_sched *parent, *sch; 7157 struct scx_task_iter sti; 7158 struct task_struct *p; 7159 s32 i, ret; 7160 7161 mutex_lock(&scx_enable_mutex); 7162 7163 if (!scx_enabled()) { 7164 ret = -ENODEV; 7165 goto out_unlock; 7166 } 7167 7168 /* See scx_root_enable_workfn() for the @ops->priv check. 
*/ 7169 if (rcu_access_pointer(ops->priv)) { 7170 ret = -EBUSY; 7171 goto out_unlock; 7172 } 7173 7174 cgrp = cgroup_get_from_id(ops->sub_cgroup_id); 7175 if (IS_ERR(cgrp)) { 7176 ret = PTR_ERR(cgrp); 7177 goto out_unlock; 7178 } 7179 7180 raw_spin_lock_irq(&scx_sched_lock); 7181 parent = find_parent_sched(cgrp); 7182 if (IS_ERR(parent)) { 7183 raw_spin_unlock_irq(&scx_sched_lock); 7184 ret = PTR_ERR(parent); 7185 goto out_put_cgrp; 7186 } 7187 kobject_get(&parent->kobj); 7188 raw_spin_unlock_irq(&scx_sched_lock); 7189 7190 /* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */ 7191 sch = scx_alloc_and_add_sched(ops, cgrp, parent); 7192 kobject_put(&parent->kobj); 7193 if (IS_ERR(sch)) { 7194 ret = PTR_ERR(sch); 7195 goto out_unlock; 7196 } 7197 7198 ret = scx_link_sched(sch); 7199 if (ret) 7200 goto err_disable; 7201 7202 if (sch->level >= SCX_SUB_MAX_DEPTH) { 7203 scx_error(sch, "max nesting depth %d violated", 7204 SCX_SUB_MAX_DEPTH); 7205 goto err_disable; 7206 } 7207 7208 if (sch->ops.init) { 7209 ret = SCX_CALL_OP_RET(sch, init, NULL); 7210 if (ret) { 7211 ret = ops_sanitize_err(sch, "init", ret); 7212 scx_error(sch, "ops.init() failed (%d)", ret); 7213 goto err_disable; 7214 } 7215 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 7216 } 7217 7218 if (validate_ops(sch, ops)) 7219 goto err_disable; 7220 7221 struct scx_sub_attach_args sub_attach_args = { 7222 .ops = &sch->ops, 7223 .cgroup_path = sch->cgrp_path, 7224 }; 7225 7226 ret = SCX_CALL_OP_RET(parent, sub_attach, NULL, 7227 &sub_attach_args); 7228 if (ret) { 7229 ret = ops_sanitize_err(sch, "sub_attach", ret); 7230 scx_error(sch, "parent rejected (%d)", ret); 7231 goto err_disable; 7232 } 7233 sch->sub_attached = true; 7234 7235 scx_bypass(sch, true); 7236 7237 for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) 7238 if (((void (**)(void))ops)[i]) 7239 set_bit(i, sch->has_op); 7240 7241 percpu_down_write(&scx_fork_rwsem); 7242 scx_cgroup_lock(); 7243 7244 /* 7245 * Set cgroup->scx_sched's 
and check CSS_ONLINE. Either we see 7246 * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down. 7247 */ 7248 set_cgroup_sched(sch_cgroup(sch), sch); 7249 if (!(cgrp->self.flags & CSS_ONLINE)) { 7250 scx_error(sch, "cgroup is not online"); 7251 goto err_unlock_and_disable; 7252 } 7253 7254 /* 7255 * Initialize tasks for the new child $sch without exiting them for 7256 * $parent so that the tasks can always be reverted back to $parent 7257 * sched on child init failure. 7258 */ 7259 WARN_ON_ONCE(scx_enabling_sub_sched); 7260 scx_enabling_sub_sched = sch; 7261 7262 scx_task_iter_start(&sti, sch->cgrp); 7263 while ((p = scx_task_iter_next_locked(&sti))) { 7264 struct rq *rq; 7265 struct rq_flags rf; 7266 7267 /* 7268 * Task iteration may visit the same task twice when racing 7269 * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which 7270 * finished __scx_init_task() and skip if set. 7271 * 7272 * A task may exit and get freed between __scx_init_task() 7273 * completion and scx_enable_task(). In such cases, 7274 * scx_disable_and_exit_task() must exit the task for both the 7275 * parent and child scheds. 7276 */ 7277 if (p->scx.flags & SCX_TASK_SUB_INIT) 7278 continue; 7279 7280 /* see scx_root_enable() */ 7281 if (!tryget_task_struct(p)) 7282 continue; 7283 7284 if (!assert_task_ready_or_enabled(p)) { 7285 ret = -EINVAL; 7286 goto abort; 7287 } 7288 7289 scx_task_iter_unlock(&sti); 7290 7291 /* 7292 * As $p is still on $parent, it can't be transitioned to INIT. 7293 * Let's worry about task state later. Use __scx_init_task(). 7294 */ 7295 ret = __scx_init_task(sch, p, false); 7296 if (ret) 7297 goto abort; 7298 7299 rq = task_rq_lock(p, &rf); 7300 7301 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 7302 /* 7303 * sched_ext_dead() raced us between __scx_init_task() 7304 * and this rq lock and ran exit_task() on $parent (the 7305 * sched @p was on at that point), not on @sch. 
@sch's 7306 * just-completed init is owed an exit_task() and we 7307 * issue it here. 7308 */ 7309 scx_sub_init_cancel_task(sch, p); 7310 task_rq_unlock(rq, p, &rf); 7311 put_task_struct(p); 7312 continue; 7313 } 7314 7315 p->scx.flags |= SCX_TASK_SUB_INIT; 7316 task_rq_unlock(rq, p, &rf); 7317 7318 put_task_struct(p); 7319 } 7320 scx_task_iter_stop(&sti); 7321 7322 /* 7323 * All tasks are prepped. Disable/exit tasks for $parent and enable for 7324 * the new @sch. 7325 */ 7326 scx_task_iter_start(&sti, sch->cgrp); 7327 while ((p = scx_task_iter_next_locked(&sti))) { 7328 /* 7329 * Use clearing of %SCX_TASK_SUB_INIT to detect and skip 7330 * duplicate iterations. 7331 */ 7332 if (!(p->scx.flags & SCX_TASK_SUB_INIT)) 7333 continue; 7334 7335 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 7336 /* 7337 * $p must be either READY or ENABLED. If ENABLED, 7338 * __scx_disabled_and_exit_task() first disables and 7339 * makes it READY. However, after exiting $p, it will 7340 * leave $p as READY. 7341 */ 7342 assert_task_ready_or_enabled(p); 7343 __scx_disable_and_exit_task(parent, p); 7344 7345 /* 7346 * $p is now only initialized for @sch and READY, which 7347 * is what we want. Assign it to @sch and enable. 7348 */ 7349 scx_set_task_sched(p, sch); 7350 scx_enable_task(sch, p); 7351 7352 p->scx.flags &= ~SCX_TASK_SUB_INIT; 7353 } 7354 } 7355 scx_task_iter_stop(&sti); 7356 7357 scx_enabling_sub_sched = NULL; 7358 7359 scx_cgroup_unlock(); 7360 percpu_up_write(&scx_fork_rwsem); 7361 7362 scx_bypass(sch, false); 7363 7364 pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); 7365 kobject_uevent(&sch->kobj, KOBJ_ADD); 7366 ret = 0; 7367 goto out_unlock; 7368 7369 out_put_cgrp: 7370 cgroup_put(cgrp); 7371 out_unlock: 7372 mutex_unlock(&scx_enable_mutex); 7373 cmd->ret = ret; 7374 return; 7375 7376 abort: 7377 put_task_struct(p); 7378 scx_task_iter_stop(&sti); 7379 7380 /* 7381 * Undo __scx_init_task() for tasks we marked. 
scx_enable_task() never 7382 * ran for @sch on them, so calling scx_disable_task() here would invoke 7383 * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched 7384 * must stay set until SUB_INIT is cleared from every marked task - 7385 * scx_disable_and_exit_task() reads it when a task exits concurrently. 7386 */ 7387 scx_task_iter_start(&sti, sch->cgrp); 7388 while ((p = scx_task_iter_next_locked(&sti))) { 7389 if (p->scx.flags & SCX_TASK_SUB_INIT) { 7390 scx_sub_init_cancel_task(sch, p); 7391 p->scx.flags &= ~SCX_TASK_SUB_INIT; 7392 } 7393 } 7394 scx_task_iter_stop(&sti); 7395 scx_enabling_sub_sched = NULL; 7396 err_unlock_and_disable: 7397 /* we'll soon enter disable path, keep bypass on */ 7398 scx_cgroup_unlock(); 7399 percpu_up_write(&scx_fork_rwsem); 7400 err_disable: 7401 mutex_unlock(&scx_enable_mutex); 7402 scx_flush_disable_work(sch); 7403 cmd->ret = 0; 7404 } 7405 7406 static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb, 7407 unsigned long action, void *data) 7408 { 7409 struct cgroup *cgrp = data; 7410 struct cgroup *parent = cgroup_parent(cgrp); 7411 7412 if (!cgroup_on_dfl(cgrp)) 7413 return NOTIFY_OK; 7414 7415 switch (action) { 7416 case CGROUP_LIFETIME_ONLINE: 7417 /* inherit ->scx_sched from $parent */ 7418 if (parent) 7419 rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched); 7420 break; 7421 case CGROUP_LIFETIME_OFFLINE: 7422 /* if there is a sched attached, shoot it down */ 7423 if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp) 7424 scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN, 7425 SCX_ECODE_RSN_CGROUP_OFFLINE, 7426 "cgroup %llu going offline", cgroup_id(cgrp)); 7427 break; 7428 } 7429 7430 return NOTIFY_OK; 7431 } 7432 7433 static struct notifier_block scx_cgroup_lifetime_nb = { 7434 .notifier_call = scx_cgroup_lifetime_notify, 7435 }; 7436 7437 static s32 __init scx_cgroup_lifetime_notifier_init(void) 7438 { 7439 return blocking_notifier_chain_register(&cgroup_lifetime_notifier, 7440 
&scx_cgroup_lifetime_nb); 7441 } 7442 core_initcall(scx_cgroup_lifetime_notifier_init); 7443 #endif /* CONFIG_EXT_SUB_SCHED */ 7444 7445 static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) 7446 { 7447 static struct kthread_worker *helper; 7448 static DEFINE_MUTEX(helper_mutex); 7449 struct scx_enable_cmd cmd; 7450 7451 if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) { 7452 pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); 7453 return -EINVAL; 7454 } 7455 7456 if (!READ_ONCE(helper)) { 7457 mutex_lock(&helper_mutex); 7458 if (!helper) { 7459 struct kthread_worker *w = 7460 kthread_run_worker(0, "scx_enable_helper"); 7461 if (IS_ERR_OR_NULL(w)) { 7462 mutex_unlock(&helper_mutex); 7463 return -ENOMEM; 7464 } 7465 sched_set_fifo(w->task); 7466 WRITE_ONCE(helper, w); 7467 } 7468 mutex_unlock(&helper_mutex); 7469 } 7470 7471 #ifdef CONFIG_EXT_SUB_SCHED 7472 if (ops->sub_cgroup_id > 1) 7473 kthread_init_work(&cmd.work, scx_sub_enable_workfn); 7474 else 7475 #endif /* CONFIG_EXT_SUB_SCHED */ 7476 kthread_init_work(&cmd.work, scx_root_enable_workfn); 7477 cmd.ops = ops; 7478 7479 kthread_queue_work(READ_ONCE(helper), &cmd.work); 7480 kthread_flush_work(&cmd.work); 7481 return cmd.ret; 7482 } 7483 7484 7485 /******************************************************************************** 7486 * bpf_struct_ops plumbing. 
7487 */ 7488 #include <linux/bpf_verifier.h> 7489 #include <linux/bpf.h> 7490 #include <linux/btf.h> 7491 7492 static const struct btf_type *task_struct_type; 7493 7494 static bool bpf_scx_is_valid_access(int off, int size, 7495 enum bpf_access_type type, 7496 const struct bpf_prog *prog, 7497 struct bpf_insn_access_aux *info) 7498 { 7499 if (type != BPF_READ) 7500 return false; 7501 if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) 7502 return false; 7503 if (off % size != 0) 7504 return false; 7505 7506 return btf_ctx_access(off, size, type, prog, info); 7507 } 7508 7509 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, 7510 const struct bpf_reg_state *reg, int off, 7511 int size) 7512 { 7513 const struct btf_type *t; 7514 7515 t = btf_type_by_id(reg->btf, reg->btf_id); 7516 if (t == task_struct_type) { 7517 /* 7518 * COMPAT: Will be removed in v6.23. 7519 */ 7520 if ((off >= offsetof(struct task_struct, scx.slice) && 7521 off + size <= offsetofend(struct task_struct, scx.slice)) || 7522 (off >= offsetof(struct task_struct, scx.dsq_vtime) && 7523 off + size <= offsetofend(struct task_struct, scx.dsq_vtime))) { 7524 pr_warn("sched_ext: Writing directly to p->scx.slice/dsq_vtime is deprecated, use scx_bpf_task_set_slice/dsq_vtime()"); 7525 return SCALAR_VALUE; 7526 } 7527 7528 if (off >= offsetof(struct task_struct, scx.disallow) && 7529 off + size <= offsetofend(struct task_struct, scx.disallow)) 7530 return SCALAR_VALUE; 7531 } 7532 7533 return -EACCES; 7534 } 7535 7536 static const struct bpf_verifier_ops bpf_scx_verifier_ops = { 7537 .get_func_proto = bpf_base_func_proto, 7538 .is_valid_access = bpf_scx_is_valid_access, 7539 .btf_struct_access = bpf_scx_btf_struct_access, 7540 }; 7541 7542 static int bpf_scx_init_member(const struct btf_type *t, 7543 const struct btf_member *member, 7544 void *kdata, const void *udata) 7545 { 7546 const struct sched_ext_ops *uops = udata; 7547 struct sched_ext_ops *ops = kdata; 7548 u32 moff = 
__btf_member_bit_offset(t, member) / 8; 7549 int ret; 7550 7551 switch (moff) { 7552 case offsetof(struct sched_ext_ops, dispatch_max_batch): 7553 if (*(u32 *)(udata + moff) > INT_MAX) 7554 return -E2BIG; 7555 ops->dispatch_max_batch = *(u32 *)(udata + moff); 7556 return 1; 7557 case offsetof(struct sched_ext_ops, flags): 7558 if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) 7559 return -EINVAL; 7560 ops->flags = *(u64 *)(udata + moff); 7561 return 1; 7562 case offsetof(struct sched_ext_ops, name): 7563 ret = bpf_obj_name_cpy(ops->name, uops->name, 7564 sizeof(ops->name)); 7565 if (ret < 0) 7566 return ret; 7567 if (ret == 0) 7568 return -EINVAL; 7569 return 1; 7570 case offsetof(struct sched_ext_ops, timeout_ms): 7571 if (msecs_to_jiffies(*(u32 *)(udata + moff)) > 7572 SCX_WATCHDOG_MAX_TIMEOUT) 7573 return -E2BIG; 7574 ops->timeout_ms = *(u32 *)(udata + moff); 7575 return 1; 7576 case offsetof(struct sched_ext_ops, exit_dump_len): 7577 ops->exit_dump_len = 7578 *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; 7579 return 1; 7580 case offsetof(struct sched_ext_ops, hotplug_seq): 7581 ops->hotplug_seq = *(u64 *)(udata + moff); 7582 return 1; 7583 #ifdef CONFIG_EXT_SUB_SCHED 7584 case offsetof(struct sched_ext_ops, sub_cgroup_id): 7585 ops->sub_cgroup_id = *(u64 *)(udata + moff); 7586 return 1; 7587 #endif /* CONFIG_EXT_SUB_SCHED */ 7588 } 7589 7590 return 0; 7591 } 7592 7593 #ifdef CONFIG_EXT_SUB_SCHED 7594 static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog) 7595 { 7596 struct scx_sched *sch; 7597 7598 guard(rcu)(); 7599 sch = scx_prog_sched(prog->aux); 7600 if (unlikely(!sch)) 7601 return; 7602 7603 scx_error(sch, "dispatch recursion detected"); 7604 } 7605 #endif /* CONFIG_EXT_SUB_SCHED */ 7606 7607 static int bpf_scx_check_member(const struct btf_type *t, 7608 const struct btf_member *member, 7609 const struct bpf_prog *prog) 7610 { 7611 u32 moff = __btf_member_bit_offset(t, member) / 8; 7612 7613 switch (moff) { 7614 case offsetof(struct 
sched_ext_ops, init_task): 7615 #ifdef CONFIG_EXT_GROUP_SCHED 7616 case offsetof(struct sched_ext_ops, cgroup_init): 7617 case offsetof(struct sched_ext_ops, cgroup_exit): 7618 case offsetof(struct sched_ext_ops, cgroup_prep_move): 7619 #endif 7620 case offsetof(struct sched_ext_ops, cpu_online): 7621 case offsetof(struct sched_ext_ops, cpu_offline): 7622 case offsetof(struct sched_ext_ops, init): 7623 case offsetof(struct sched_ext_ops, exit): 7624 case offsetof(struct sched_ext_ops, sub_attach): 7625 case offsetof(struct sched_ext_ops, sub_detach): 7626 break; 7627 default: 7628 if (prog->sleepable) 7629 return -EINVAL; 7630 } 7631 7632 #ifdef CONFIG_EXT_SUB_SCHED 7633 /* 7634 * Enable private stack for operations that can nest along the 7635 * hierarchy. 7636 * 7637 * XXX - Ideally, we should only do this for scheds that allow 7638 * sub-scheds and sub-scheds themselves but I don't know how to access 7639 * struct_ops from here. 7640 */ 7641 switch (moff) { 7642 case offsetof(struct sched_ext_ops, dispatch): 7643 prog->aux->priv_stack_requested = true; 7644 prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch; 7645 } 7646 #endif /* CONFIG_EXT_SUB_SCHED */ 7647 7648 return 0; 7649 } 7650 7651 static int bpf_scx_reg(void *kdata, struct bpf_link *link) 7652 { 7653 return scx_enable(kdata, link); 7654 } 7655 7656 static void bpf_scx_unreg(void *kdata, struct bpf_link *link) 7657 { 7658 struct sched_ext_ops *ops = kdata; 7659 struct scx_sched *sch = rcu_dereference_protected(ops->priv, true); 7660 7661 scx_disable(sch, SCX_EXIT_UNREG); 7662 scx_flush_disable_work(sch); 7663 RCU_INIT_POINTER(ops->priv, NULL); 7664 kobject_put(&sch->kobj); 7665 } 7666 7667 static int bpf_scx_init(struct btf *btf) 7668 { 7669 task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]); 7670 7671 return 0; 7672 } 7673 7674 static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) 7675 { 7676 /* 7677 * sched_ext does not support updating 
	 * the actively-loaded BPF
	 * scheduler, as registering a BPF scheduler can always fail if the
	 * scheduler returns an error code for e.g. ops.init(), ops.init_task(),
	 * etc. Similarly, we can always race with unregistration happening
	 * elsewhere, such as with sysrq.
	 */
	return -EOPNOTSUPP;
}

/* struct_ops ->validate(): nothing is checked here, always returns 0 */
static int bpf_scx_validate(void *kdata)
{
	return 0;
}

/*
 * Stub implementations, one per sched_ext_ops callback. They are collected in
 * __bpf_ops_sched_ext_ops below, which is registered as the struct_ops
 * ->cfi_stubs table. The bodies are empty or return a fixed value.
 */
static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {}
static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {}
static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {}
static void sched_ext_ops__tick(struct task_struct *p) {}
static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {}
static void sched_ext_ops__running(struct task_struct *p) {}
static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {}
static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {}
static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; }
static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; }
static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {}
static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {}
static void sched_ext_ops__update_idle(s32 cpu, bool idle) {}
static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {}
static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {}
static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {}
static void sched_ext_ops__enable(struct task_struct *p) {}
static void sched_ext_ops__disable(struct task_struct *p) {}
#ifdef CONFIG_EXT_GROUP_SCHED
static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {}
static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {}
#endif	/* CONFIG_EXT_GROUP_SCHED */
static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; }
static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {}
static void sched_ext_ops__cpu_online(s32 cpu) {}
static void sched_ext_ops__cpu_offline(s32 cpu) {}
static s32 sched_ext_ops__init(void) { return -EINVAL; }
static void sched_ext_ops__exit(struct scx_exit_info *info) {}
static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {}
static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {}

/* table of the stubs above, referenced by bpf_sched_ext_ops.cfi_stubs */
static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
	.select_cpu		= sched_ext_ops__select_cpu,
	.enqueue		= sched_ext_ops__enqueue,
	.dequeue		= sched_ext_ops__dequeue,
	.dispatch		= sched_ext_ops__dispatch,
	.tick			= sched_ext_ops__tick,
	.runnable		= sched_ext_ops__runnable,
	.running		= sched_ext_ops__running,
	.stopping		= sched_ext_ops__stopping,
	.quiescent		= sched_ext_ops__quiescent,
	.yield			= sched_ext_ops__yield,
	.core_sched_before	= sched_ext_ops__core_sched_before,
	.set_weight		= sched_ext_ops__set_weight,
	.set_cpumask		= sched_ext_ops__set_cpumask,
	.update_idle		= sched_ext_ops__update_idle,
	.cpu_acquire		= sched_ext_ops__cpu_acquire,
	.cpu_release		= sched_ext_ops__cpu_release,
	.init_task		= sched_ext_ops__init_task,
	.exit_task		= sched_ext_ops__exit_task,
	.enable			= sched_ext_ops__enable,
	.disable		= sched_ext_ops__disable,
#ifdef CONFIG_EXT_GROUP_SCHED
	.cgroup_init		= sched_ext_ops__cgroup_init,
	.cgroup_exit		= sched_ext_ops__cgroup_exit,
	.cgroup_prep_move	= sched_ext_ops__cgroup_prep_move,
	.cgroup_move		= sched_ext_ops__cgroup_move,
	.cgroup_cancel_move	= sched_ext_ops__cgroup_cancel_move,
	.cgroup_set_weight	= sched_ext_ops__cgroup_set_weight,
	.cgroup_set_bandwidth	= sched_ext_ops__cgroup_set_bandwidth,
	.cgroup_set_idle	= sched_ext_ops__cgroup_set_idle,
#endif
	.sub_attach		= sched_ext_ops__sub_attach,
	.sub_detach		= sched_ext_ops__sub_detach,
	.cpu_online		= sched_ext_ops__cpu_online,
	.cpu_offline		= sched_ext_ops__cpu_offline,
	.init			= sched_ext_ops__init,
	.exit			= sched_ext_ops__exit,
	.dump			= sched_ext_ops__dump,
	.dump_cpu		= sched_ext_ops__dump_cpu,
	.dump_task		= sched_ext_ops__dump_task,
};

/* struct_ops descriptor wiring the bpf_scx_*() callbacks for "sched_ext_ops" */
static struct bpf_struct_ops bpf_sched_ext_ops = {
	.verifier_ops = &bpf_scx_verifier_ops,
	.reg = bpf_scx_reg,
	.unreg = bpf_scx_unreg,
	.check_member = bpf_scx_check_member,
	.init_member = bpf_scx_init_member,
	.init = bpf_scx_init,
	.update = bpf_scx_update,
	.validate = bpf_scx_validate,
	.name = "sched_ext_ops",
	.owner = THIS_MODULE,
	.cfi_stubs = &__bpf_ops_sched_ext_ops
};


/********************************************************************************
 * System integration and init.
 */

/*
 * SysRq handler: disable the root scheduler with %SCX_EXIT_SYSRQ if one is
 * loaded, otherwise log that no BPF scheduler is loaded.
 */
static void sysrq_handle_sched_ext_reset(u8 key)
{
	struct scx_sched *sch;

	rcu_read_lock();
	sch = rcu_dereference(scx_root);
	if (likely(sch))
		scx_disable(sch, SCX_EXIT_SYSRQ);
	else
		pr_info("sched_ext: BPF schedulers not loaded\n");
	rcu_read_unlock();
}

static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
	.handler	= sysrq_handle_sched_ext_reset,
	.help_msg	= "reset-sched-ext(S)",
	.action_msg	= "Disable sched_ext and revert all tasks to CFS",
	.enable_mask	= SYSRQ_ENABLE_RTNICE,
};

/*
 * SysRq handler: dump the state of every sched on scx_sched_all.
 *
 * NOTE(review): unlike the reset handler above, no rcu_read_lock() is taken
 * around the RCU list walk here; presumably the sysrq caller already provides
 * an RCU read-side section - confirm.
 */
static void sysrq_handle_sched_ext_dump(u8 key)
{
	struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
	struct scx_sched *sch;

	list_for_each_entry_rcu(sch, &scx_sched_all, all)
		scx_dump_state(sch, &ei, 0, false);
}

static const struct sysrq_key_op sysrq_sched_ext_dump_op = {
	.handler	= sysrq_handle_sched_ext_dump,
	.help_msg	= "dump-sched-ext(D)",
	.action_msg	= "Trigger sched_ext debug dump",
	.enable_mask	= SYSRQ_ENABLE_RTNICE,
};

static bool can_skip_idle_kick(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	/*
	 * We can skip idle kicking if @rq is going to go through at least one
	 * full SCX scheduling cycle before going idle. Just checking whether
	 * curr is not idle is insufficient because we could be racing
	 * balance_one() trying to pull the next task from a remote rq, which
	 * may fail, and @rq may become idle afterwards.
	 *
	 * The race window is small and we don't and can't guarantee that @rq is
	 * only kicked while idle anyway. Skip only when sure.
	 */
	return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE);
}

/*
 * Process a pending kick of @cpu requested from @this_rq, under @cpu's rq
 * lock.
 *
 * If @cpu is online (or is @this_rq's own CPU) and is not running a task of a
 * class above ext_sched_class, the pending preempt and wait requests for @cpu
 * are consumed and @cpu is rescheduled. A preempt request zeroes the current
 * task's slice and a wait request records @cpu's kick_sync in @ksyncs[@cpu]
 * and marks it in cpus_to_sync; both only take effect when the current task
 * is an SCX task. Otherwise the pending requests are simply dropped.
 *
 * Returns %true if a wait was recorded for @cpu.
 */
static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
{
	struct rq *rq = cpu_rq(cpu);
	struct scx_rq *this_scx = &this_rq->scx;
	const struct sched_class *cur_class;
	bool should_wait = false;
	unsigned long flags;

	raw_spin_rq_lock_irqsave(rq, flags);
	cur_class = rq->curr->sched_class;

	/*
	 * During CPU hotplug, a CPU may depend on kicking itself to make
	 * forward progress. Allow kicking self regardless of online state. If
	 * @cpu is running a higher class task, we have no control over @cpu.
	 * Skip kicking.
	 */
	if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) &&
	    !sched_class_above(cur_class, &ext_sched_class)) {
		if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
			if (cur_class == &ext_sched_class)
				rq->curr->scx.slice = 0;
			cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
		}

		if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
			if (cur_class == &ext_sched_class) {
				cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
				ksyncs[cpu] = rq->scx.kick_sync;
				should_wait = true;
			}
			cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
		}

		resched_curr(rq);
	} else {
		cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
	}

	raw_spin_rq_unlock_irqrestore(rq, flags);

	return should_wait;
}

/*
 * Reschedule @cpu unless can_skip_idle_kick() says the kick is unnecessary.
 * As in kick_one_cpu(), an offline @cpu is only kicked when it is @this_rq's
 * own CPU.
 */
static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_rq_lock_irqsave(rq, flags);

	if (!can_skip_idle_kick(rq) &&
	    (cpu_online(cpu) || cpu == cpu_of(this_rq)))
		resched_curr(rq);

	raw_spin_rq_unlock_irqrestore(rq, flags);
}

/*
 * irq_work handler which processes the kick requests accumulated on this
 * CPU's rq. CPUs in cpus_to_kick are kicked unconditionally and dropped from
 * cpus_to_kick_if_idle; the remaining cpus_to_kick_if_idle CPUs are kicked
 * only if needed. If any kick asked to wait, the wait itself is deferred by
 * setting kick_sync_pending and rescheduling this CPU.
 */
static void kick_cpus_irq_workfn(struct irq_work *irq_work)
{
	struct rq *this_rq = this_rq();
	struct scx_rq *this_scx = &this_rq->scx;
	struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs);
	bool should_wait = false;
	unsigned long *ksyncs;
	s32 cpu;

	/* can race with free_kick_syncs() during scheduler disable */
	if (unlikely(!ksyncs_pcpu))
		return;

	ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;

	for_each_cpu(cpu, this_scx->cpus_to_kick) {
		should_wait |= kick_one_cpu(cpu, this_rq, ksyncs);
		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
	}

	for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) {
		kick_one_cpu_if_idle(cpu, this_rq);
		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
	}

	/*
	 * Can't wait in hardirq — kick_sync can't advance, deadlocking if
	 * CPUs wait for each other. Defer to kick_sync_wait_bal_cb().
	 */
	if (should_wait) {
		raw_spin_rq_lock(this_rq);
		this_scx->kick_sync_pending = true;
		resched_curr(this_rq);
		raw_spin_rq_unlock(this_rq);
	}
}

/**
 * print_scx_info - print out sched_ext scheduler state
 * @log_lvl: the log level to use when printing
 * @p: target task
 *
 * If a sched_ext scheduler is enabled, print the name and state of the
 * scheduler. If @p is on sched_ext, print further information about the task.
 *
 * This function can be safely called on any task as long as the task_struct
 * itself is accessible. While safe, this function isn't synchronized and may
 * print out mixed-up or garbage output of limited length.
 */
void print_scx_info(const char *log_lvl, struct task_struct *p)
{
	struct scx_sched *sch;
	enum scx_enable_state state = scx_enable_state();
	const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
	char runnable_at_buf[22] = "?";
	struct sched_class *class;
	unsigned long runnable_at;

	guard(rcu)();

	sch = scx_task_sched_rcu(p);

	/* nothing to print if @p has no associated sched */
	if (!sch)
		return;

	/*
	 * Carefully check if the task was running on sched_ext, and then
	 * carefully copy the time it's been runnable, and its state.
	 */
	if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
	    class != &ext_sched_class) {
		printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name,
		       scx_enable_state_str[state], all);
		return;
	}

	/* runnable_at_buf keeps its "?" default if the copy faults */
	if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at,
				      sizeof(runnable_at)))
		scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms",
			  jiffies_delta_msecs(runnable_at, jiffies));

	/* print everything onto one line to conserve console space */
	printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
	       log_lvl, sch->ops.name, scx_enable_state_str[state], all,
	       runnable_at_buf);
}

static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr)
{
	struct scx_sched *sch;

	guard(rcu)();

	sch = rcu_dereference(scx_root);
	if (!sch)
		return NOTIFY_OK;

	/*
	 * SCX schedulers often have userspace components which are sometimes
	 * involved in critical scheduling paths. PM operations involve freezing
	 * userspace which can lead to scheduling misbehaviors including stalls.
	 * Let's bypass while PM operations are in progress.
8007 */ 8008 switch (event) { 8009 case PM_HIBERNATION_PREPARE: 8010 case PM_SUSPEND_PREPARE: 8011 case PM_RESTORE_PREPARE: 8012 scx_bypass(sch, true); 8013 break; 8014 case PM_POST_HIBERNATION: 8015 case PM_POST_SUSPEND: 8016 case PM_POST_RESTORE: 8017 scx_bypass(sch, false); 8018 break; 8019 } 8020 8021 return NOTIFY_OK; 8022 } 8023 8024 static struct notifier_block scx_pm_notifier = { 8025 .notifier_call = scx_pm_handler, 8026 }; 8027 8028 void __init init_sched_ext_class(void) 8029 { 8030 s32 cpu, v; 8031 8032 /* 8033 * The following is to prevent the compiler from optimizing out the enum 8034 * definitions so that BPF scheduler implementations can use them 8035 * through the generated vmlinux.h. 8036 */ 8037 WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | 8038 SCX_TG_ONLINE); 8039 8040 scx_idle_init_masks(); 8041 8042 for_each_possible_cpu(cpu) { 8043 struct rq *rq = cpu_rq(cpu); 8044 int n = cpu_to_node(cpu); 8045 8046 /* local_dsq's sch will be set during scx_root_enable() */ 8047 BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL)); 8048 8049 INIT_LIST_HEAD(&rq->scx.runnable_list); 8050 INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); 8051 8052 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); 8053 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); 8054 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); 8055 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); 8056 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n)); 8057 raw_spin_lock_init(&rq->scx.deferred_reenq_lock); 8058 INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals); 8059 INIT_LIST_HEAD(&rq->scx.deferred_reenq_users); 8060 rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); 8061 rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); 8062 8063 if (cpu_online(cpu)) 8064 cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; 8065 } 8066 8067 
register_sysrq_key('S', &sysrq_sched_ext_reset_op); 8068 register_sysrq_key('D', &sysrq_sched_ext_dump_op); 8069 INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); 8070 8071 #ifdef CONFIG_EXT_SUB_SCHED 8072 BUG_ON(rhashtable_init(&scx_sched_hash, &scx_sched_hash_params)); 8073 #endif /* CONFIG_EXT_SUB_SCHED */ 8074 } 8075 8076 8077 /******************************************************************************** 8078 * Helpers that can be called from the BPF scheduler. 8079 */ 8080 static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 *enq_flags) 8081 { 8082 bool is_local = dsq_id == SCX_DSQ_LOCAL || 8083 (dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON; 8084 8085 if (*enq_flags & SCX_ENQ_IMMED) { 8086 if (unlikely(!is_local)) { 8087 scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id); 8088 return false; 8089 } 8090 } else if ((sch->ops.flags & SCX_OPS_ALWAYS_ENQ_IMMED) && is_local) { 8091 *enq_flags |= SCX_ENQ_IMMED; 8092 } 8093 8094 return true; 8095 } 8096 8097 static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, 8098 u64 dsq_id, u64 *enq_flags) 8099 { 8100 lockdep_assert_irqs_disabled(); 8101 8102 if (unlikely(!p)) { 8103 scx_error(sch, "called with NULL task"); 8104 return false; 8105 } 8106 8107 if (unlikely(*enq_flags & __SCX_ENQ_INTERNAL_MASK)) { 8108 scx_error(sch, "invalid enq_flags 0x%llx", *enq_flags); 8109 return false; 8110 } 8111 8112 /* see SCX_EV_INSERT_NOT_OWNED definition */ 8113 if (unlikely(!scx_task_on_sched(sch, p))) { 8114 __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); 8115 return false; 8116 } 8117 8118 if (!scx_vet_enq_flags(sch, dsq_id, enq_flags)) 8119 return false; 8120 8121 return true; 8122 } 8123 8124 static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, 8125 u64 dsq_id, u64 enq_flags) 8126 { 8127 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8128 struct task_struct *ddsp_task; 8129 8130 ddsp_task = 
__this_cpu_read(direct_dispatch_task); 8131 if (ddsp_task) { 8132 mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); 8133 return; 8134 } 8135 8136 if (unlikely(dspc->cursor >= sch->dsp_max_batch)) { 8137 scx_error(sch, "dispatch buffer overflow"); 8138 return; 8139 } 8140 8141 dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ 8142 .task = p, 8143 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, 8144 .dsq_id = dsq_id, 8145 .enq_flags = enq_flags, 8146 }; 8147 } 8148 8149 __bpf_kfunc_start_defs(); 8150 8151 /** 8152 * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ 8153 * @p: task_struct to insert 8154 * @dsq_id: DSQ to insert into 8155 * @slice: duration @p can run for in nsecs, 0 to keep the current value 8156 * @enq_flags: SCX_ENQ_* 8157 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8158 * 8159 * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to 8160 * call this function spuriously. Can be called from ops.enqueue(), 8161 * ops.select_cpu(), and ops.dispatch(). 8162 * 8163 * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch 8164 * and @p must match the task being enqueued. 8165 * 8166 * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p 8167 * will be directly inserted into the corresponding dispatch queue after 8168 * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be 8169 * inserted into the local DSQ of the CPU returned by ops.select_cpu(). 8170 * @enq_flags are OR'd with the enqueue flags on the enqueue path before the 8171 * task is inserted. 8172 * 8173 * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id 8174 * and this function can be called upto ops.dispatch_max_batch times to insert 8175 * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the 8176 * remaining slots. 
scx_bpf_dsq_move_to_local() flushes the batch and resets the 8177 * counter. 8178 * 8179 * This function doesn't have any locking restrictions and may be called under 8180 * BPF locks (in the future when BPF introduces more flexible locking). 8181 * 8182 * @p is allowed to run for @slice. The scheduling path is triggered on slice 8183 * exhaustion. If zero, the current residual slice is maintained. If 8184 * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with 8185 * scx_bpf_kick_cpu() to trigger scheduling. 8186 * 8187 * Returns %true on successful insertion, %false on failure. On the root 8188 * scheduler, %false return triggers scheduler abort and the caller doesn't need 8189 * to check the return value. 8190 */ 8191 __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, 8192 u64 slice, u64 enq_flags, 8193 const struct bpf_prog_aux *aux) 8194 { 8195 struct scx_sched *sch; 8196 8197 guard(rcu)(); 8198 sch = scx_prog_sched(aux); 8199 if (unlikely(!sch)) 8200 return false; 8201 8202 if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) 8203 return false; 8204 8205 if (slice) 8206 p->scx.slice = slice; 8207 else 8208 p->scx.slice = p->scx.slice ?: 1; 8209 8210 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); 8211 8212 return true; 8213 } 8214 8215 /* 8216 * COMPAT: Will be removed in v6.23 along with the ___v2 suffix. 
8217 */ 8218 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, 8219 u64 slice, u64 enq_flags, 8220 const struct bpf_prog_aux *aux) 8221 { 8222 scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags, aux); 8223 } 8224 8225 static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, 8226 u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) 8227 { 8228 if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) 8229 return false; 8230 8231 if (slice) 8232 p->scx.slice = slice; 8233 else 8234 p->scx.slice = p->scx.slice ?: 1; 8235 8236 p->scx.dsq_vtime = vtime; 8237 8238 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 8239 8240 return true; 8241 } 8242 8243 struct scx_bpf_dsq_insert_vtime_args { 8244 /* @p can't be packed together as KF_RCU is not transitive */ 8245 u64 dsq_id; 8246 u64 slice; 8247 u64 vtime; 8248 u64 enq_flags; 8249 }; 8250 8251 /** 8252 * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion 8253 * @p: task_struct to insert 8254 * @args: struct containing the rest of the arguments 8255 * @args->dsq_id: DSQ to insert into 8256 * @args->slice: duration @p can run for in nsecs, 0 to keep the current value 8257 * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ 8258 * @args->enq_flags: SCX_ENQ_* 8259 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8260 * 8261 * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument 8262 * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided 8263 * as an inline wrapper in common.bpf.h. 8264 * 8265 * Insert @p into the vtime priority queue of the DSQ identified by 8266 * @args->dsq_id. Tasks queued into the priority queue are ordered by 8267 * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert(). 8268 * 8269 * @args->vtime ordering is according to time_before64() which considers 8270 * wrapping. 
A numerically larger vtime may indicate an earlier position in the 8271 * ordering and vice-versa. 8272 * 8273 * A DSQ can only be used as a FIFO or priority queue at any given time and this 8274 * function must not be called on a DSQ which already has one or more FIFO tasks 8275 * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and 8276 * SCX_DSQ_GLOBAL) cannot be used as priority queues. 8277 * 8278 * Returns %true on successful insertion, %false on failure. On the root 8279 * scheduler, %false return triggers scheduler abort and the caller doesn't need 8280 * to check the return value. 8281 */ 8282 __bpf_kfunc bool 8283 __scx_bpf_dsq_insert_vtime(struct task_struct *p, 8284 struct scx_bpf_dsq_insert_vtime_args *args, 8285 const struct bpf_prog_aux *aux) 8286 { 8287 struct scx_sched *sch; 8288 8289 guard(rcu)(); 8290 8291 sch = scx_prog_sched(aux); 8292 if (unlikely(!sch)) 8293 return false; 8294 8295 return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice, 8296 args->vtime, args->enq_flags); 8297 } 8298 8299 /* 8300 * COMPAT: Will be removed in v6.23. 8301 */ 8302 __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, 8303 u64 slice, u64 vtime, u64 enq_flags) 8304 { 8305 struct scx_sched *sch; 8306 8307 guard(rcu)(); 8308 8309 sch = rcu_dereference(scx_root); 8310 if (unlikely(!sch)) 8311 return; 8312 8313 #ifdef CONFIG_EXT_SUB_SCHED 8314 /* 8315 * Disallow if any sub-scheds are attached. There is no way to tell 8316 * which scheduler called us, just error out @p's scheduler. 
8317 */ 8318 if (unlikely(!list_empty(&sch->children))) { 8319 scx_error(scx_task_sched(p), "__scx_bpf_dsq_insert_vtime() must be used"); 8320 return; 8321 } 8322 #endif 8323 8324 scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags); 8325 } 8326 8327 __bpf_kfunc_end_defs(); 8328 8329 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) 8330 BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_IMPLICIT_ARGS | KF_RCU) 8331 BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_IMPLICIT_ARGS | KF_RCU) 8332 BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_IMPLICIT_ARGS | KF_RCU) 8333 BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) 8334 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) 8335 8336 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { 8337 .owner = THIS_MODULE, 8338 .set = &scx_kfunc_ids_enqueue_dispatch, 8339 .filter = scx_kfunc_context_filter, 8340 }; 8341 8342 static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, 8343 struct task_struct *p, u64 dsq_id, u64 enq_flags) 8344 { 8345 struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; 8346 struct scx_sched *sch; 8347 struct rq *this_rq, *src_rq, *locked_rq; 8348 bool dispatched = false; 8349 bool in_balance; 8350 unsigned long flags; 8351 8352 /* 8353 * The verifier considers an iterator slot initialized on any 8354 * KF_ITER_NEW return, so a BPF program may legally reach here after 8355 * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL. 8356 */ 8357 if (unlikely(!src_dsq)) 8358 return false; 8359 8360 sch = src_dsq->sched; 8361 8362 if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags)) 8363 return false; 8364 8365 /* 8366 * If the BPF scheduler keeps calling this function repeatedly, it can 8367 * cause similar live-lock conditions as consume_dispatch_q(). 
8368 */ 8369 if (unlikely(READ_ONCE(sch->aborting))) 8370 return false; 8371 8372 if (unlikely(!scx_task_on_sched(sch, p))) { 8373 scx_error(sch, "scx_bpf_dsq_move[_vtime]() on %s[%d] but the task belongs to a different scheduler", 8374 p->comm, p->pid); 8375 return false; 8376 } 8377 8378 /* 8379 * Can be called from either ops.dispatch() locking this_rq() or any 8380 * context where no rq lock is held. If latter, lock @p's task_rq which 8381 * we'll likely need anyway. 8382 */ 8383 src_rq = task_rq(p); 8384 8385 local_irq_save(flags); 8386 this_rq = this_rq(); 8387 in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; 8388 8389 if (in_balance) { 8390 if (this_rq != src_rq) { 8391 raw_spin_rq_unlock(this_rq); 8392 raw_spin_rq_lock(src_rq); 8393 } 8394 } else { 8395 raw_spin_rq_lock(src_rq); 8396 } 8397 8398 locked_rq = src_rq; 8399 raw_spin_lock(&src_dsq->lock); 8400 8401 /* did someone else get to it while we dropped the locks? */ 8402 if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) { 8403 raw_spin_unlock(&src_dsq->lock); 8404 goto out; 8405 } 8406 8407 /* @p is still on $src_dsq and stable, determine the destination */ 8408 dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, task_cpu(p)); 8409 8410 /* 8411 * Apply vtime and slice updates before moving so that the new time is 8412 * visible before inserting into $dst_dsq. @p is still on $src_dsq but 8413 * this is safe as we're locking it. 
8414 */ 8415 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) 8416 p->scx.dsq_vtime = kit->vtime; 8417 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) 8418 p->scx.slice = kit->slice; 8419 8420 /* execute move */ 8421 locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq); 8422 dispatched = true; 8423 out: 8424 if (in_balance) { 8425 if (this_rq != locked_rq) { 8426 raw_spin_rq_unlock(locked_rq); 8427 raw_spin_rq_lock(this_rq); 8428 } 8429 } else { 8430 raw_spin_rq_unlock_irqrestore(locked_rq, flags); 8431 } 8432 8433 kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | 8434 __SCX_DSQ_ITER_HAS_VTIME); 8435 return dispatched; 8436 } 8437 8438 __bpf_kfunc_start_defs(); 8439 8440 /** 8441 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots 8442 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8443 * 8444 * Can only be called from ops.dispatch(). 8445 */ 8446 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux) 8447 { 8448 struct scx_sched *sch; 8449 8450 guard(rcu)(); 8451 8452 sch = scx_prog_sched(aux); 8453 if (unlikely(!sch)) 8454 return 0; 8455 8456 return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor); 8457 } 8458 8459 /** 8460 * scx_bpf_dispatch_cancel - Cancel the latest dispatch 8461 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8462 * 8463 * Cancel the latest dispatch. Can be called multiple times to cancel further 8464 * dispatches. Can only be called from ops.dispatch(). 
8465 */ 8466 __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux) 8467 { 8468 struct scx_sched *sch; 8469 struct scx_dsp_ctx *dspc; 8470 8471 guard(rcu)(); 8472 8473 sch = scx_prog_sched(aux); 8474 if (unlikely(!sch)) 8475 return; 8476 8477 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8478 8479 if (dspc->cursor > 0) 8480 dspc->cursor--; 8481 else 8482 scx_error(sch, "dispatch buffer underflow"); 8483 } 8484 8485 /** 8486 * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ 8487 * @dsq_id: DSQ to move task from. Must be a user-created DSQ 8488 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8489 * @enq_flags: %SCX_ENQ_* 8490 * 8491 * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's 8492 * local DSQ for execution with @enq_flags applied. Can only be called from 8493 * ops.dispatch(). 8494 * 8495 * Built-in DSQs (%SCX_DSQ_GLOBAL and %SCX_DSQ_LOCAL*) are not supported as 8496 * sources. Local DSQs support reenqueueing (a task can be picked up for 8497 * execution, dequeued for property changes, or reenqueued), but the BPF 8498 * scheduler cannot directly iterate or move tasks from them. %SCX_DSQ_GLOBAL 8499 * is similar but also doesn't support reenqueueing, as it maps to multiple 8500 * per-node DSQs making the scope difficult to define; this may change in the 8501 * future. 8502 * 8503 * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() 8504 * before trying to move from the specified DSQ. It may also grab rq locks and 8505 * thus can't be called under any BPF locks. 8506 * 8507 * Returns %true if a task has been moved, %false if there isn't any task to 8508 * move. 
8509 */ 8510 __bpf_kfunc bool scx_bpf_dsq_move_to_local___v2(u64 dsq_id, u64 enq_flags, 8511 const struct bpf_prog_aux *aux) 8512 { 8513 struct scx_dispatch_q *dsq; 8514 struct scx_sched *sch; 8515 struct scx_dsp_ctx *dspc; 8516 8517 guard(rcu)(); 8518 8519 sch = scx_prog_sched(aux); 8520 if (unlikely(!sch)) 8521 return false; 8522 8523 if (!scx_vet_enq_flags(sch, SCX_DSQ_LOCAL, &enq_flags)) 8524 return false; 8525 8526 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8527 8528 flush_dispatch_buf(sch, dspc->rq); 8529 8530 dsq = find_user_dsq(sch, dsq_id); 8531 if (unlikely(!dsq)) { 8532 scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id); 8533 return false; 8534 } 8535 8536 if (consume_dispatch_q(sch, dspc->rq, dsq, enq_flags)) { 8537 /* 8538 * A successfully consumed task can be dequeued before it starts 8539 * running while the CPU is trying to migrate other dispatched 8540 * tasks. Bump nr_tasks to tell balance_one() to retry on empty 8541 * local DSQ. 8542 */ 8543 dspc->nr_tasks++; 8544 return true; 8545 } else { 8546 return false; 8547 } 8548 } 8549 8550 /* 8551 * COMPAT: ___v2 was introduced in v7.1. Remove this and ___v2 tag in the future. 8552 */ 8553 __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux) 8554 { 8555 return scx_bpf_dsq_move_to_local___v2(dsq_id, 0, aux); 8556 } 8557 8558 /** 8559 * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs 8560 * @it__iter: DSQ iterator in progress 8561 * @slice: duration the moved task can run for in nsecs 8562 * 8563 * Override the slice of the next task that will be moved from @it__iter using 8564 * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous 8565 * slice duration is kept. 
8566 */ 8567 __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, 8568 u64 slice) 8569 { 8570 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 8571 8572 kit->slice = slice; 8573 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; 8574 } 8575 8576 /** 8577 * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs 8578 * @it__iter: DSQ iterator in progress 8579 * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ 8580 * 8581 * Override the vtime of the next task that will be moved from @it__iter using 8582 * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice 8583 * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the 8584 * override is ignored and cleared. 8585 */ 8586 __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, 8587 u64 vtime) 8588 { 8589 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 8590 8591 kit->vtime = vtime; 8592 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; 8593 } 8594 8595 /** 8596 * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ 8597 * @it__iter: DSQ iterator in progress 8598 * @p: task to transfer 8599 * @dsq_id: DSQ to move @p to 8600 * @enq_flags: SCX_ENQ_* 8601 * 8602 * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ 8603 * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can 8604 * be the destination. 8605 * 8606 * For the transfer to be successful, @p must still be on the DSQ and have been 8607 * queued before the DSQ iteration started. This function doesn't care whether 8608 * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have 8609 * been queued before the iteration started. 8610 * 8611 * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update. 8612 * 8613 * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq 8614 * lock (e.g. BPF timers or SYSCALL programs). 
8615 * 8616 * Returns %true if @p has been consumed, %false if @p had already been 8617 * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local 8618 * DSQ. 8619 */ 8620 __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, 8621 struct task_struct *p, u64 dsq_id, 8622 u64 enq_flags) 8623 { 8624 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 8625 p, dsq_id, enq_flags); 8626 } 8627 8628 /** 8629 * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ 8630 * @it__iter: DSQ iterator in progress 8631 * @p: task to transfer 8632 * @dsq_id: DSQ to move @p to 8633 * @enq_flags: SCX_ENQ_* 8634 * 8635 * Transfer @p which is on the DSQ currently iterated by @it__iter to the 8636 * priority queue of the DSQ specified by @dsq_id. The destination must be a 8637 * user DSQ as only user DSQs support priority queue. 8638 * 8639 * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice() 8640 * and scx_bpf_dsq_move_set_vtime() to update. 8641 * 8642 * All other aspects are identical to scx_bpf_dsq_move(). See 8643 * scx_bpf_dsq_insert_vtime() for more information on @vtime. 8644 */ 8645 __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, 8646 struct task_struct *p, u64 dsq_id, 8647 u64 enq_flags) 8648 { 8649 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 8650 p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 8651 } 8652 8653 #ifdef CONFIG_EXT_SUB_SCHED 8654 /** 8655 * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler 8656 * @cgroup_id: cgroup ID of the child scheduler to dispatch 8657 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8658 * 8659 * Allows a parent scheduler to trigger dispatching on one of its direct 8660 * child schedulers. The child scheduler runs its dispatch operation to 8661 * move tasks from dispatch queues to the local runqueue. 
8662 * 8663 * Returns: true on success, false if cgroup_id is invalid, not a direct 8664 * child, or caller lacks dispatch permission. 8665 */ 8666 __bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux) 8667 { 8668 struct rq *this_rq = this_rq(); 8669 struct scx_sched *parent, *child; 8670 8671 guard(rcu)(); 8672 parent = scx_prog_sched(aux); 8673 if (unlikely(!parent)) 8674 return false; 8675 8676 child = scx_find_sub_sched(cgroup_id); 8677 8678 if (unlikely(!child)) 8679 return false; 8680 8681 if (unlikely(scx_parent(child) != parent)) { 8682 scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu", 8683 cgroup_id); 8684 return false; 8685 } 8686 8687 return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev, 8688 true); 8689 } 8690 #endif /* CONFIG_EXT_SUB_SCHED */ 8691 8692 __bpf_kfunc_end_defs(); 8693 8694 BTF_KFUNCS_START(scx_kfunc_ids_dispatch) 8695 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots, KF_IMPLICIT_ARGS) 8696 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel, KF_IMPLICIT_ARGS) 8697 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local, KF_IMPLICIT_ARGS) 8698 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local___v2, KF_IMPLICIT_ARGS) 8699 /* scx_bpf_dsq_move*() also in scx_kfunc_ids_unlocked: callable from unlocked contexts */ 8700 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 8701 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 8702 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 8703 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 8704 #ifdef CONFIG_EXT_SUB_SCHED 8705 BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS) 8706 #endif 8707 BTF_KFUNCS_END(scx_kfunc_ids_dispatch) 8708 8709 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { 8710 .owner = THIS_MODULE, 8711 .set = &scx_kfunc_ids_dispatch, 8712 .filter = scx_kfunc_context_filter, 8713 }; 8714 8715 __bpf_kfunc_start_defs(); 8716 8717 /** 8718 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 8719 * @aux: 
implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8720 * 8721 * Iterate over all of the tasks currently enqueued on the local DSQ of the 8722 * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of 8723 * processed tasks. Can only be called from ops.cpu_release(). 8724 */ 8725 __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux) 8726 { 8727 struct scx_sched *sch; 8728 struct rq *rq; 8729 8730 guard(rcu)(); 8731 sch = scx_prog_sched(aux); 8732 if (unlikely(!sch)) 8733 return 0; 8734 8735 rq = cpu_rq(smp_processor_id()); 8736 lockdep_assert_rq_held(rq); 8737 8738 return reenq_local(sch, rq, SCX_REENQ_ANY); 8739 } 8740 8741 __bpf_kfunc_end_defs(); 8742 8743 BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) 8744 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local, KF_IMPLICIT_ARGS) 8745 BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) 8746 8747 static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { 8748 .owner = THIS_MODULE, 8749 .set = &scx_kfunc_ids_cpu_release, 8750 .filter = scx_kfunc_context_filter, 8751 }; 8752 8753 __bpf_kfunc_start_defs(); 8754 8755 /** 8756 * scx_bpf_create_dsq - Create a custom DSQ 8757 * @dsq_id: DSQ to create 8758 * @node: NUMA node to allocate from 8759 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8760 * 8761 * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable 8762 * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. 
8763 */ 8764 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_aux *aux) 8765 { 8766 struct scx_dispatch_q *dsq; 8767 struct scx_sched *sch; 8768 s32 ret; 8769 8770 if (unlikely(node >= (int)nr_node_ids || 8771 (node < 0 && node != NUMA_NO_NODE))) 8772 return -EINVAL; 8773 8774 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) 8775 return -EINVAL; 8776 8777 dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); 8778 if (!dsq) 8779 return -ENOMEM; 8780 8781 /* 8782 * init_dsq() must be called in GFP_KERNEL context. Init it with NULL 8783 * @sch and update afterwards. 8784 */ 8785 ret = init_dsq(dsq, dsq_id, NULL); 8786 if (ret) { 8787 kfree(dsq); 8788 return ret; 8789 } 8790 8791 rcu_read_lock(); 8792 8793 sch = scx_prog_sched(aux); 8794 if (sch) { 8795 dsq->sched = sch; 8796 ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, 8797 dsq_hash_params); 8798 } else { 8799 ret = -ENODEV; 8800 } 8801 8802 rcu_read_unlock(); 8803 if (ret) { 8804 exit_dsq(dsq); 8805 kfree(dsq); 8806 } 8807 return ret; 8808 } 8809 8810 __bpf_kfunc_end_defs(); 8811 8812 BTF_KFUNCS_START(scx_kfunc_ids_unlocked) 8813 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_IMPLICIT_ARGS | KF_SLEEPABLE) 8814 /* also in scx_kfunc_ids_dispatch: also callable from ops.dispatch() */ 8815 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 8816 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 8817 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 8818 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 8819 /* also in scx_kfunc_ids_select_cpu: also callable from ops.select_cpu()/ops.enqueue() */ 8820 BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) 8821 BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) 8822 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) 8823 BTF_KFUNCS_END(scx_kfunc_ids_unlocked) 8824 8825 static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { 8826 .owner = THIS_MODULE, 8827 .set = 
&scx_kfunc_ids_unlocked, 8828 .filter = scx_kfunc_context_filter, 8829 }; 8830 8831 __bpf_kfunc_start_defs(); 8832 8833 /** 8834 * scx_bpf_task_set_slice - Set task's time slice 8835 * @p: task of interest 8836 * @slice: time slice to set in nsecs 8837 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8838 * 8839 * Set @p's time slice to @slice. Returns %true on success, %false if the 8840 * calling scheduler doesn't have authority over @p. 8841 */ 8842 __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, 8843 const struct bpf_prog_aux *aux) 8844 { 8845 struct scx_sched *sch; 8846 8847 guard(rcu)(); 8848 sch = scx_prog_sched(aux); 8849 if (unlikely(!sch || !scx_task_on_sched(sch, p))) 8850 return false; 8851 8852 p->scx.slice = slice; 8853 return true; 8854 } 8855 8856 /** 8857 * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering 8858 * @p: task of interest 8859 * @vtime: virtual time to set 8860 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8861 * 8862 * Set @p's virtual time to @vtime. Returns %true on success, %false if the 8863 * calling scheduler doesn't have authority over @p. 
8864 */ 8865 __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, 8866 const struct bpf_prog_aux *aux) 8867 { 8868 struct scx_sched *sch; 8869 8870 guard(rcu)(); 8871 sch = scx_prog_sched(aux); 8872 if (unlikely(!sch || !scx_task_on_sched(sch, p))) 8873 return false; 8874 8875 p->scx.dsq_vtime = vtime; 8876 return true; 8877 } 8878 8879 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) 8880 { 8881 struct rq *this_rq; 8882 unsigned long irq_flags; 8883 8884 if (!ops_cpu_valid(sch, cpu, NULL)) 8885 return; 8886 8887 local_irq_save(irq_flags); 8888 8889 this_rq = this_rq(); 8890 8891 /* 8892 * While bypassing for PM ops, IRQ handling may not be online which can 8893 * lead to irq_work_queue() malfunction such as infinite busy wait for 8894 * IRQ status update. Suppress kicking. 8895 */ 8896 if (scx_bypassing(sch, cpu_of(this_rq))) 8897 goto out; 8898 8899 /* 8900 * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting 8901 * rq locks. We can probably be smarter and avoid bouncing if called 8902 * from ops which don't hold a rq lock. 
8903 */ 8904 if (flags & SCX_KICK_IDLE) { 8905 struct rq *target_rq = cpu_rq(cpu); 8906 8907 if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) 8908 scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); 8909 8910 if (raw_spin_rq_trylock(target_rq)) { 8911 if (can_skip_idle_kick(target_rq)) { 8912 raw_spin_rq_unlock(target_rq); 8913 goto out; 8914 } 8915 raw_spin_rq_unlock(target_rq); 8916 } 8917 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); 8918 } else { 8919 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); 8920 8921 if (flags & SCX_KICK_PREEMPT) 8922 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); 8923 if (flags & SCX_KICK_WAIT) 8924 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); 8925 } 8926 8927 irq_work_queue(&this_rq->scx.kick_cpus_irq_work); 8928 out: 8929 local_irq_restore(irq_flags); 8930 } 8931 8932 /** 8933 * scx_bpf_kick_cpu - Trigger reschedule on a CPU 8934 * @cpu: cpu to kick 8935 * @flags: %SCX_KICK_* flags 8936 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8937 * 8938 * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or 8939 * trigger rescheduling on a busy CPU. This can be called from any online 8940 * scx_ops operation and the actual kicking is performed asynchronously through 8941 * an irq work. 8942 */ 8943 __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux *aux) 8944 { 8945 struct scx_sched *sch; 8946 8947 guard(rcu)(); 8948 sch = scx_prog_sched(aux); 8949 if (likely(sch)) 8950 scx_kick_cpu(sch, cpu, flags); 8951 } 8952 8953 /** 8954 * scx_bpf_dsq_nr_queued - Return the number of queued tasks 8955 * @dsq_id: id of the DSQ 8956 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8957 * 8958 * Return the number of tasks in the DSQ matching @dsq_id. If not found, 8959 * -%ENOENT is returned. 
8960 */ 8961 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux) 8962 { 8963 struct scx_sched *sch; 8964 struct scx_dispatch_q *dsq; 8965 s32 ret; 8966 8967 preempt_disable(); 8968 8969 sch = scx_prog_sched(aux); 8970 if (unlikely(!sch)) { 8971 ret = -ENODEV; 8972 goto out; 8973 } 8974 8975 if (dsq_id == SCX_DSQ_LOCAL) { 8976 ret = READ_ONCE(this_rq()->scx.local_dsq.nr); 8977 goto out; 8978 } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 8979 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 8980 8981 if (ops_cpu_valid(sch, cpu, NULL)) { 8982 ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); 8983 goto out; 8984 } 8985 } else { 8986 dsq = find_user_dsq(sch, dsq_id); 8987 if (dsq) { 8988 ret = READ_ONCE(dsq->nr); 8989 goto out; 8990 } 8991 } 8992 ret = -ENOENT; 8993 out: 8994 preempt_enable(); 8995 return ret; 8996 } 8997 8998 /** 8999 * scx_bpf_destroy_dsq - Destroy a custom DSQ 9000 * @dsq_id: DSQ to destroy 9001 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9002 * 9003 * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with 9004 * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is 9005 * empty and no further tasks are dispatched to it. Ignored if called on a DSQ 9006 * which doesn't exist. Can be called from any online scx_ops operations. 9007 */ 9008 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux) 9009 { 9010 struct scx_sched *sch; 9011 9012 guard(rcu)(); 9013 sch = scx_prog_sched(aux); 9014 if (sch) 9015 destroy_dsq(sch, dsq_id); 9016 } 9017 9018 /** 9019 * bpf_iter_scx_dsq_new - Create a DSQ iterator 9020 * @it: iterator to initialize 9021 * @dsq_id: DSQ to iterate 9022 * @flags: %SCX_DSQ_ITER_* 9023 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9024 * 9025 * Initialize BPF iterator @it which can be used with bpf_for_each() to walk 9026 * tasks in the DSQ specified by @dsq_id. 
Iteration using @it only includes 9027 * tasks which are already queued when this function is invoked. 9028 */ 9029 __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, 9030 u64 flags, const struct bpf_prog_aux *aux) 9031 { 9032 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9033 struct scx_sched *sch; 9034 9035 BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > 9036 sizeof(struct bpf_iter_scx_dsq)); 9037 BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != 9038 __alignof__(struct bpf_iter_scx_dsq)); 9039 BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & 9040 ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); 9041 9042 /* 9043 * next() and destroy() will be called regardless of the return value. 9044 * Always clear $kit->dsq. 9045 */ 9046 kit->dsq = NULL; 9047 9048 sch = scx_prog_sched(aux); 9049 if (unlikely(!sch)) 9050 return -ENODEV; 9051 9052 if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) 9053 return -EINVAL; 9054 9055 kit->dsq = find_user_dsq(sch, dsq_id); 9056 if (!kit->dsq) 9057 return -ENOENT; 9058 9059 kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags); 9060 9061 return 0; 9062 } 9063 9064 /** 9065 * bpf_iter_scx_dsq_next - Progress a DSQ iterator 9066 * @it: iterator to progress 9067 * 9068 * Return the next task. See bpf_iter_scx_dsq_new(). 9069 */ 9070 __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) 9071 { 9072 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9073 9074 if (!kit->dsq) 9075 return NULL; 9076 9077 guard(raw_spinlock_irqsave)(&kit->dsq->lock); 9078 9079 return nldsq_cursor_next_task(&kit->cursor, kit->dsq); 9080 } 9081 9082 /** 9083 * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator 9084 * @it: iterator to destroy 9085 * 9086 * Undo scx_iter_scx_dsq_new(). 
9087 */ 9088 __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) 9089 { 9090 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9091 9092 if (!kit->dsq) 9093 return; 9094 9095 if (!list_empty(&kit->cursor.node)) { 9096 unsigned long flags; 9097 9098 raw_spin_lock_irqsave(&kit->dsq->lock, flags); 9099 list_del_init(&kit->cursor.node); 9100 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); 9101 } 9102 kit->dsq = NULL; 9103 } 9104 9105 /** 9106 * scx_bpf_dsq_peek - Lockless peek at the first element. 9107 * @dsq_id: DSQ to examine. 9108 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9109 * 9110 * Read the first element in the DSQ. This is semantically equivalent to using 9111 * the DSQ iterator, but is lockfree. Of course, like any lockless operation, 9112 * this provides only a point-in-time snapshot, and the contents may change 9113 * by the time any subsequent locking operation reads the queue. 9114 * 9115 * Returns the pointer, or NULL indicates an empty queue OR internal error. 
9116 */ 9117 __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id, 9118 const struct bpf_prog_aux *aux) 9119 { 9120 struct scx_sched *sch; 9121 struct scx_dispatch_q *dsq; 9122 9123 sch = scx_prog_sched(aux); 9124 if (unlikely(!sch)) 9125 return NULL; 9126 9127 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { 9128 scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); 9129 return NULL; 9130 } 9131 9132 dsq = find_user_dsq(sch, dsq_id); 9133 if (unlikely(!dsq)) { 9134 scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); 9135 return NULL; 9136 } 9137 9138 return rcu_dereference(dsq->first_task); 9139 } 9140 9141 /** 9142 * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ 9143 * @dsq_id: DSQ to re-enqueue 9144 * @reenq_flags: %SCX_RENQ_* 9145 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9146 * 9147 * Iterate over all of the tasks currently enqueued on the DSQ identified by 9148 * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are 9149 * supported: 9150 * 9151 * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu) 9152 * - User DSQs 9153 * 9154 * Re-enqueues are performed asynchronously. Can be called from anywhere. 
9155 */ 9156 __bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags, 9157 const struct bpf_prog_aux *aux) 9158 { 9159 struct scx_sched *sch; 9160 struct scx_dispatch_q *dsq; 9161 9162 guard(preempt)(); 9163 9164 sch = scx_prog_sched(aux); 9165 if (unlikely(!sch)) 9166 return; 9167 9168 if (unlikely(reenq_flags & ~__SCX_REENQ_USER_MASK)) { 9169 scx_error(sch, "invalid SCX_REENQ flags 0x%llx", reenq_flags); 9170 return; 9171 } 9172 9173 /* not specifying any filter bits is the same as %SCX_REENQ_ANY */ 9174 if (!(reenq_flags & __SCX_REENQ_FILTER_MASK)) 9175 reenq_flags |= SCX_REENQ_ANY; 9176 9177 dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id()); 9178 schedule_dsq_reenq(sch, dsq, reenq_flags, scx_locked_rq()); 9179 } 9180 9181 /** 9182 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 9183 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9184 * 9185 * Iterate over all of the tasks currently enqueued on the local DSQ of the 9186 * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from 9187 * anywhere. 9188 * 9189 * This is now a special case of scx_bpf_dsq_reenq() and may be removed in the 9190 * future. 
9191 */ 9192 __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux) 9193 { 9194 scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0, aux); 9195 } 9196 9197 __bpf_kfunc_end_defs(); 9198 9199 static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, 9200 size_t line_size, char *fmt, unsigned long long *data, 9201 u32 data__sz) 9202 { 9203 struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; 9204 s32 ret; 9205 9206 if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || 9207 (data__sz && !data)) { 9208 scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); 9209 return -EINVAL; 9210 } 9211 9212 ret = copy_from_kernel_nofault(data_buf, data, data__sz); 9213 if (ret < 0) { 9214 scx_error(sch, "failed to read data fields (%d)", ret); 9215 return ret; 9216 } 9217 9218 ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, 9219 &bprintf_data); 9220 if (ret < 0) { 9221 scx_error(sch, "format preparation failed (%d)", ret); 9222 return ret; 9223 } 9224 9225 ret = bstr_printf(line_buf, line_size, fmt, 9226 bprintf_data.bin_args); 9227 bpf_bprintf_cleanup(&bprintf_data); 9228 if (ret < 0) { 9229 scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); 9230 return ret; 9231 } 9232 9233 return ret; 9234 } 9235 9236 static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, 9237 char *fmt, unsigned long long *data, u32 data__sz) 9238 { 9239 return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), 9240 fmt, data, data__sz); 9241 } 9242 9243 __bpf_kfunc_start_defs(); 9244 9245 /** 9246 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. 9247 * @exit_code: Exit value to pass to user space via struct scx_exit_info. 
9248 * @fmt: error message format string 9249 * @data: format string parameters packaged using ___bpf_fill() macro 9250 * @data__sz: @data len, must end in '__sz' for the verifier 9251 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9252 * 9253 * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops 9254 * disabling. 9255 */ 9256 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, 9257 unsigned long long *data, u32 data__sz, 9258 const struct bpf_prog_aux *aux) 9259 { 9260 struct scx_sched *sch; 9261 unsigned long flags; 9262 9263 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 9264 sch = scx_prog_sched(aux); 9265 if (likely(sch) && 9266 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 9267 scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); 9268 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 9269 } 9270 9271 /** 9272 * scx_bpf_error_bstr - Indicate fatal error 9273 * @fmt: error message format string 9274 * @data: format string parameters packaged using ___bpf_fill() macro 9275 * @data__sz: @data len, must end in '__sz' for the verifier 9276 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9277 * 9278 * Indicate that the BPF scheduler encountered a fatal error and initiate ops 9279 * disabling. 
9280 */ 9281 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, 9282 u32 data__sz, const struct bpf_prog_aux *aux) 9283 { 9284 struct scx_sched *sch; 9285 unsigned long flags; 9286 9287 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 9288 sch = scx_prog_sched(aux); 9289 if (likely(sch) && 9290 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 9291 scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); 9292 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 9293 } 9294 9295 /** 9296 * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler 9297 * @fmt: format string 9298 * @data: format string parameters packaged using ___bpf_fill() macro 9299 * @data__sz: @data len, must end in '__sz' for the verifier 9300 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9301 * 9302 * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and 9303 * dump_task() to generate extra debug dump specific to the BPF scheduler. 9304 * 9305 * The extra dump may be multiple lines. A single line may be split over 9306 * multiple calls. The last line is automatically terminated. 9307 */ 9308 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, 9309 u32 data__sz, const struct bpf_prog_aux *aux) 9310 { 9311 struct scx_sched *sch; 9312 struct scx_dump_data *dd = &scx_dump_data; 9313 struct scx_bstr_buf *buf = &dd->buf; 9314 s32 ret; 9315 9316 guard(rcu)(); 9317 9318 sch = scx_prog_sched(aux); 9319 if (unlikely(!sch)) 9320 return; 9321 9322 if (raw_smp_processor_id() != dd->cpu) { 9323 scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends"); 9324 return; 9325 } 9326 9327 /* append the formatted string to the line buf */ 9328 ret = __bstr_format(sch, buf->data, buf->line + dd->cursor, 9329 sizeof(buf->line) - dd->cursor, fmt, data, data__sz); 9330 if (ret < 0) { 9331 dump_line(dd->s, "%s[!] 
(\"%s\", %p, %u) failed to format (%d)", 9332 dd->prefix, fmt, data, data__sz, ret); 9333 return; 9334 } 9335 9336 dd->cursor += ret; 9337 dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); 9338 9339 if (!dd->cursor) 9340 return; 9341 9342 /* 9343 * If the line buf overflowed or ends in a newline, flush it into the 9344 * dump. This is to allow the caller to generate a single line over 9345 * multiple calls. As ops_dump_flush() can also handle multiple lines in 9346 * the line buf, the only case which can lead to an unexpected 9347 * truncation is when the caller keeps generating newlines in the middle 9348 * instead of the end consecutively. Don't do that. 9349 */ 9350 if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') 9351 ops_dump_flush(); 9352 } 9353 9354 /** 9355 * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU 9356 * @cpu: CPU of interest 9357 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9358 * 9359 * Return the maximum relative capacity of @cpu in relation to the most 9360 * performant CPU in the system. The return value is in the range [1, 9361 * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). 9362 */ 9363 __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux) 9364 { 9365 struct scx_sched *sch; 9366 9367 guard(rcu)(); 9368 9369 sch = scx_prog_sched(aux); 9370 if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) 9371 return arch_scale_cpu_capacity(cpu); 9372 else 9373 return SCX_CPUPERF_ONE; 9374 } 9375 9376 /** 9377 * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU 9378 * @cpu: CPU of interest 9379 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9380 * 9381 * Return the current relative performance of @cpu in relation to its maximum. 9382 * The return value is in the range [1, %SCX_CPUPERF_ONE]. 
9383 * 9384 * The current performance level of a CPU in relation to the maximum performance 9385 * available in the system can be calculated as follows: 9386 * 9387 * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE 9388 * 9389 * The result is in the range [1, %SCX_CPUPERF_ONE]. 9390 */ 9391 __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux) 9392 { 9393 struct scx_sched *sch; 9394 9395 guard(rcu)(); 9396 9397 sch = scx_prog_sched(aux); 9398 if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) 9399 return arch_scale_freq_capacity(cpu); 9400 else 9401 return SCX_CPUPERF_ONE; 9402 } 9403 9404 /** 9405 * scx_bpf_cpuperf_set - Set the relative performance target of a CPU 9406 * @cpu: CPU of interest 9407 * @perf: target performance level [0, %SCX_CPUPERF_ONE] 9408 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9409 * 9410 * Set the target performance level of @cpu to @perf. @perf is in linear 9411 * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the 9412 * schedutil cpufreq governor chooses the target frequency. 9413 * 9414 * The actual performance level chosen, CPU grouping, and the overhead and 9415 * latency of the operations are dependent on the hardware and cpufreq driver in 9416 * use. Consult hardware and cpufreq documentation for more information. The 9417 * current performance level can be monitored using scx_bpf_cpuperf_cur(). 
9418 */ 9419 __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_aux *aux) 9420 { 9421 struct scx_sched *sch; 9422 9423 guard(rcu)(); 9424 9425 sch = scx_prog_sched(aux); 9426 if (unlikely(!sch)) 9427 return; 9428 9429 if (unlikely(perf > SCX_CPUPERF_ONE)) { 9430 scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu); 9431 return; 9432 } 9433 9434 if (ops_cpu_valid(sch, cpu, NULL)) { 9435 struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); 9436 struct rq_flags rf; 9437 9438 /* 9439 * When called with an rq lock held, restrict the operation 9440 * to the corresponding CPU to prevent ABBA deadlocks. 9441 */ 9442 if (locked_rq && rq != locked_rq) { 9443 scx_error(sch, "Invalid target CPU %d", cpu); 9444 return; 9445 } 9446 9447 /* 9448 * If no rq lock is held, allow to operate on any CPU by 9449 * acquiring the corresponding rq lock. 9450 */ 9451 if (!locked_rq) { 9452 rq_lock_irqsave(rq, &rf); 9453 update_rq_clock(rq); 9454 } 9455 9456 rq->scx.cpuperf_target = perf; 9457 cpufreq_update_util(rq, 0); 9458 9459 if (!locked_rq) 9460 rq_unlock_irqrestore(rq, &rf); 9461 } 9462 } 9463 9464 /** 9465 * scx_bpf_nr_node_ids - Return the number of possible node IDs 9466 * 9467 * All valid node IDs in the system are smaller than the returned value. 9468 */ 9469 __bpf_kfunc u32 scx_bpf_nr_node_ids(void) 9470 { 9471 return nr_node_ids; 9472 } 9473 9474 /** 9475 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs 9476 * 9477 * All valid CPU IDs in the system are smaller than the returned value. 
9478 */ 9479 __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) 9480 { 9481 return nr_cpu_ids; 9482 } 9483 9484 /** 9485 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask 9486 */ 9487 __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) 9488 { 9489 return cpu_possible_mask; 9490 } 9491 9492 /** 9493 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask 9494 */ 9495 __bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void) 9496 { 9497 return cpu_online_mask; 9498 } 9499 9500 /** 9501 * scx_bpf_put_cpumask - Release a possible/online cpumask 9502 * @cpumask: cpumask to release 9503 */ 9504 __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) 9505 { 9506 /* 9507 * Empty function body because we aren't actually acquiring or releasing 9508 * a reference to a global cpumask, which is read-only in the caller and 9509 * is never released. The acquire / release semantics here are just used 9510 * to make the cpumask is a trusted pointer in the caller. 9511 */ 9512 } 9513 9514 /** 9515 * scx_bpf_task_running - Is task currently running? 
9516 * @p: task of interest 9517 */ 9518 __bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p) 9519 { 9520 return task_rq(p)->curr == p; 9521 } 9522 9523 /** 9524 * scx_bpf_task_cpu - CPU a task is currently associated with 9525 * @p: task of interest 9526 */ 9527 __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) 9528 { 9529 return task_cpu(p); 9530 } 9531 9532 /** 9533 * scx_bpf_cpu_rq - Fetch the rq of a CPU 9534 * @cpu: CPU of the rq 9535 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9536 */ 9537 __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux) 9538 { 9539 struct scx_sched *sch; 9540 9541 guard(rcu)(); 9542 9543 sch = scx_prog_sched(aux); 9544 if (unlikely(!sch)) 9545 return NULL; 9546 9547 if (!ops_cpu_valid(sch, cpu, NULL)) 9548 return NULL; 9549 9550 if (!sch->warned_deprecated_rq) { 9551 printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; " 9552 "use scx_bpf_locked_rq() when holding rq lock " 9553 "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__); 9554 sch->warned_deprecated_rq = true; 9555 } 9556 9557 return cpu_rq(cpu); 9558 } 9559 9560 /** 9561 * scx_bpf_locked_rq - Return the rq currently locked by SCX 9562 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9563 * 9564 * Returns the rq if a rq lock is currently held by SCX. 9565 * Otherwise emits an error and returns NULL. 
9566 */ 9567 __bpf_kfunc struct rq *scx_bpf_locked_rq(const struct bpf_prog_aux *aux) 9568 { 9569 struct scx_sched *sch; 9570 struct rq *rq; 9571 9572 guard(preempt)(); 9573 9574 sch = scx_prog_sched(aux); 9575 if (unlikely(!sch)) 9576 return NULL; 9577 9578 rq = scx_locked_rq(); 9579 if (!rq) { 9580 scx_error(sch, "accessing rq without holding rq lock"); 9581 return NULL; 9582 } 9583 9584 return rq; 9585 } 9586 9587 /** 9588 * scx_bpf_cpu_curr - Return remote CPU's curr task 9589 * @cpu: CPU of interest 9590 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9591 * 9592 * Callers must hold RCU read lock (KF_RCU). 9593 */ 9594 __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_aux *aux) 9595 { 9596 struct scx_sched *sch; 9597 9598 guard(rcu)(); 9599 9600 sch = scx_prog_sched(aux); 9601 if (unlikely(!sch)) 9602 return NULL; 9603 9604 if (!ops_cpu_valid(sch, cpu, NULL)) 9605 return NULL; 9606 9607 return rcu_dereference(cpu_rq(cpu)->curr); 9608 } 9609 9610 /** 9611 * scx_bpf_now - Returns a high-performance monotonically non-decreasing 9612 * clock for the current CPU. The clock returned is in nanoseconds. 9613 * 9614 * It provides the following properties: 9615 * 9616 * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently 9617 * to account for execution time and track tasks' runtime properties. 9618 * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which 9619 * eventually reads a hardware timestamp counter -- is neither performant nor 9620 * scalable. scx_bpf_now() aims to provide a high-performance clock by 9621 * using the rq clock in the scheduler core whenever possible. 9622 * 9623 * 2) High enough resolution for the BPF scheduler use cases: In most BPF 9624 * scheduler use cases, the required clock resolution is lower than the most 9625 * accurate hardware clock (e.g., rdtsc in x86). 
scx_bpf_now() basically 9626 * uses the rq clock in the scheduler core whenever it is valid. It considers 9627 * that the rq clock is valid from the time the rq clock is updated 9628 * (update_rq_clock) until the rq is unlocked (rq_unpin_lock). 9629 * 9630 * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now() 9631 * guarantees the clock never goes backward when comparing them in the same 9632 * CPU. On the other hand, when comparing clocks in different CPUs, there 9633 * is no such guarantee -- the clock can go backward. It provides a 9634 * monotonically *non-decreasing* clock so that it would provide the same 9635 * clock values in two different scx_bpf_now() calls in the same CPU 9636 * during the same period of when the rq clock is valid. 9637 */ 9638 __bpf_kfunc u64 scx_bpf_now(void) 9639 { 9640 struct rq *rq; 9641 u64 clock; 9642 9643 preempt_disable(); 9644 9645 rq = this_rq(); 9646 if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) { 9647 /* 9648 * If the rq clock is valid, use the cached rq clock. 9649 * 9650 * Note that scx_bpf_now() is re-entrant between a process 9651 * context and an interrupt context (e.g., timer interrupt). 9652 * However, we don't need to consider the race between them 9653 * because such race is not observable from a caller. 9654 */ 9655 clock = READ_ONCE(rq->scx.clock); 9656 } else { 9657 /* 9658 * Otherwise, return a fresh rq clock. 9659 * 9660 * The rq clock is updated outside of the rq lock. 9661 * In this case, keep the updated rq clock invalid so the next 9662 * kfunc call outside the rq lock gets a fresh rq clock. 9663 */ 9664 clock = sched_clock_cpu(cpu_of(rq)); 9665 } 9666 9667 preempt_enable(); 9668 9669 return clock; 9670 } 9671 9672 static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events) 9673 { 9674 struct scx_event_stats *e_cpu; 9675 int cpu; 9676 9677 /* Aggregate per-CPU event counters into @events. 
*/ 9678 memset(events, 0, sizeof(*events)); 9679 for_each_possible_cpu(cpu) { 9680 e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; 9681 scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); 9682 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 9683 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); 9684 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING); 9685 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 9686 scx_agg_event(events, e_cpu, SCX_EV_REENQ_IMMED); 9687 scx_agg_event(events, e_cpu, SCX_EV_REENQ_LOCAL_REPEAT); 9688 scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL); 9689 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION); 9690 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH); 9691 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE); 9692 scx_agg_event(events, e_cpu, SCX_EV_INSERT_NOT_OWNED); 9693 scx_agg_event(events, e_cpu, SCX_EV_SUB_BYPASS_DISPATCH); 9694 } 9695 } 9696 9697 /* 9698 * scx_bpf_events - Get a system-wide event counter to 9699 * @events: output buffer from a BPF program 9700 * @events__sz: @events len, must end in '__sz'' for the verifier 9701 */ 9702 __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, 9703 size_t events__sz) 9704 { 9705 struct scx_sched *sch; 9706 struct scx_event_stats e_sys; 9707 9708 rcu_read_lock(); 9709 sch = rcu_dereference(scx_root); 9710 if (sch) 9711 scx_read_events(sch, &e_sys); 9712 else 9713 memset(&e_sys, 0, sizeof(e_sys)); 9714 rcu_read_unlock(); 9715 9716 /* 9717 * We cannot entirely trust a BPF-provided size since a BPF program 9718 * might be compiled against a different vmlinux.h, of which 9719 * scx_event_stats would be larger (a newer vmlinux.h) or smaller 9720 * (an older vmlinux.h). Hence, we use the smaller size to avoid 9721 * memory corruption. 
9722 */ 9723 events__sz = min(events__sz, sizeof(*events)); 9724 memcpy(events, &e_sys, events__sz); 9725 } 9726 9727 #ifdef CONFIG_CGROUP_SCHED 9728 /** 9729 * scx_bpf_task_cgroup - Return the sched cgroup of a task 9730 * @p: task of interest 9731 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9732 * 9733 * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with 9734 * from the scheduler's POV. SCX operations should use this function to 9735 * determine @p's current cgroup as, unlike following @p->cgroups, 9736 * @p->sched_task_group is stable for the duration of the SCX op. See 9737 * SCX_CALL_OP_TASK() for details. 9738 */ 9739 __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p, 9740 const struct bpf_prog_aux *aux) 9741 { 9742 struct task_group *tg = p->sched_task_group; 9743 struct cgroup *cgrp = &cgrp_dfl_root.cgrp; 9744 struct scx_sched *sch; 9745 9746 guard(rcu)(); 9747 9748 sch = scx_prog_sched(aux); 9749 if (unlikely(!sch)) 9750 goto out; 9751 9752 if (!scx_kf_arg_task_ok(sch, p)) 9753 goto out; 9754 9755 cgrp = tg_cgrp(tg); 9756 9757 out: 9758 cgroup_get(cgrp); 9759 return cgrp; 9760 } 9761 #endif /* CONFIG_CGROUP_SCHED */ 9762 9763 __bpf_kfunc_end_defs(); 9764 9765 BTF_KFUNCS_START(scx_kfunc_ids_any) 9766 BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); 9767 BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); 9768 BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) 9769 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS) 9770 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS) 9771 BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) 9772 BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS) 9773 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS) 9774 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_IMPLICIT_ARGS | KF_ITER_NEW | KF_RCU_PROTECTED) 9775 BTF_ID_FLAGS(func, 
bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) 9776 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) 9777 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS) 9778 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS) 9779 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS) 9780 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) 9781 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) 9782 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) 9783 BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) 9784 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) 9785 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) 9786 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) 9787 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) 9788 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) 9789 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) 9790 BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) 9791 BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL) 9792 BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) 9793 BTF_ID_FLAGS(func, scx_bpf_now) 9794 BTF_ID_FLAGS(func, scx_bpf_events) 9795 #ifdef CONFIG_CGROUP_SCHED 9796 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_IMPLICIT_ARGS | KF_RCU | KF_ACQUIRE) 9797 #endif 9798 BTF_KFUNCS_END(scx_kfunc_ids_any) 9799 9800 static const struct btf_kfunc_id_set scx_kfunc_set_any = { 9801 .owner = THIS_MODULE, 9802 .set = &scx_kfunc_ids_any, 9803 .filter = scx_kfunc_context_filter, 9804 }; 9805 9806 /* 9807 * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc 9808 * group; an op may permit zero or more groups, with the union expressed in 9809 * scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter()) 9810 * consults this table to decide whether a context-sensitive kfunc is callable 9811 * from a given SCX op. 
9812 */ 9813 enum scx_kf_allow_flags { 9814 SCX_KF_ALLOW_UNLOCKED = 1 << 0, 9815 SCX_KF_ALLOW_CPU_RELEASE = 1 << 1, 9816 SCX_KF_ALLOW_DISPATCH = 1 << 2, 9817 SCX_KF_ALLOW_ENQUEUE = 1 << 3, 9818 SCX_KF_ALLOW_SELECT_CPU = 1 << 4, 9819 }; 9820 9821 /* 9822 * Map each SCX op to the union of kfunc groups it permits, indexed by 9823 * SCX_OP_IDX(op). Ops not listed only permit kfuncs that are not 9824 * context-sensitive. 9825 */ 9826 static const u32 scx_kf_allow_flags[] = { 9827 [SCX_OP_IDX(select_cpu)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, 9828 [SCX_OP_IDX(enqueue)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, 9829 [SCX_OP_IDX(dispatch)] = SCX_KF_ALLOW_ENQUEUE | SCX_KF_ALLOW_DISPATCH, 9830 [SCX_OP_IDX(cpu_release)] = SCX_KF_ALLOW_CPU_RELEASE, 9831 [SCX_OP_IDX(init_task)] = SCX_KF_ALLOW_UNLOCKED, 9832 [SCX_OP_IDX(dump)] = SCX_KF_ALLOW_UNLOCKED, 9833 #ifdef CONFIG_EXT_GROUP_SCHED 9834 [SCX_OP_IDX(cgroup_init)] = SCX_KF_ALLOW_UNLOCKED, 9835 [SCX_OP_IDX(cgroup_exit)] = SCX_KF_ALLOW_UNLOCKED, 9836 [SCX_OP_IDX(cgroup_prep_move)] = SCX_KF_ALLOW_UNLOCKED, 9837 [SCX_OP_IDX(cgroup_cancel_move)] = SCX_KF_ALLOW_UNLOCKED, 9838 [SCX_OP_IDX(cgroup_set_weight)] = SCX_KF_ALLOW_UNLOCKED, 9839 [SCX_OP_IDX(cgroup_set_bandwidth)] = SCX_KF_ALLOW_UNLOCKED, 9840 [SCX_OP_IDX(cgroup_set_idle)] = SCX_KF_ALLOW_UNLOCKED, 9841 #endif /* CONFIG_EXT_GROUP_SCHED */ 9842 [SCX_OP_IDX(sub_attach)] = SCX_KF_ALLOW_UNLOCKED, 9843 [SCX_OP_IDX(sub_detach)] = SCX_KF_ALLOW_UNLOCKED, 9844 [SCX_OP_IDX(cpu_online)] = SCX_KF_ALLOW_UNLOCKED, 9845 [SCX_OP_IDX(cpu_offline)] = SCX_KF_ALLOW_UNLOCKED, 9846 [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED, 9847 [SCX_OP_IDX(exit)] = SCX_KF_ALLOW_UNLOCKED, 9848 }; 9849 9850 /* 9851 * Verifier-time filter for SCX kfuncs. Registered via the .filter field on 9852 * each per-group btf_kfunc_id_set. 
The BPF core invokes this for every kfunc 9853 * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or 9854 * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the 9855 * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by 9856 * falling through to "allow" when none of the SCX sets contain the kfunc. 9857 */ 9858 int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) 9859 { 9860 bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id); 9861 bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id); 9862 bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); 9863 bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); 9864 bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); 9865 bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id); 9866 bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id); 9867 u32 moff, flags; 9868 9869 /* Not an SCX kfunc - allow. */ 9870 if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || 9871 in_cpu_release || in_idle || in_any)) 9872 return 0; 9873 9874 /* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */ 9875 if (prog->type == BPF_PROG_TYPE_SYSCALL) 9876 return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES; 9877 9878 if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) 9879 return (in_any || in_idle) ? 0 : -EACCES; 9880 9881 /* 9882 * add_subprog_and_kfunc() collects all kfunc calls, including dead code 9883 * guarded by bpf_ksym_exists(), before check_attach_btf_id() sets 9884 * prog->aux->st_ops. Allow all kfuncs when st_ops is not yet set; 9885 * do_check_main() re-runs the filter with st_ops set and enforces the 9886 * actual restrictions. 9887 */ 9888 if (!prog->aux->st_ops) 9889 return 0; 9890 9891 /* 9892 * Non-SCX struct_ops: SCX kfuncs are not permitted. 
9893 */ 9894 if (prog->aux->st_ops != &bpf_sched_ext_ops) 9895 return -EACCES; 9896 9897 /* SCX struct_ops: check the per-op allow list. */ 9898 if (in_any || in_idle) 9899 return 0; 9900 9901 moff = prog->aux->attach_st_ops_member_off; 9902 flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)]; 9903 9904 if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked) 9905 return 0; 9906 if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release) 9907 return 0; 9908 if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch) 9909 return 0; 9910 if ((flags & SCX_KF_ALLOW_ENQUEUE) && in_enqueue) 9911 return 0; 9912 if ((flags & SCX_KF_ALLOW_SELECT_CPU) && in_select_cpu) 9913 return 0; 9914 9915 return -EACCES; 9916 } 9917 9918 static int __init scx_init(void) 9919 { 9920 int ret; 9921 9922 /* 9923 * kfunc registration can't be done from init_sched_ext_class() as 9924 * register_btf_kfunc_id_set() needs most of the system to be up. 9925 * 9926 * Some kfuncs are context-sensitive and can only be called from 9927 * specific SCX ops. They are grouped into per-context BTF sets, each 9928 * registered with scx_kfunc_context_filter as its .filter callback. The 9929 * BPF core dedups identical filter pointers per hook 9930 * (btf_populate_kfunc_set()), so the filter is invoked exactly once per 9931 * kfunc lookup; it consults scx_kf_allow_flags[] to enforce per-op 9932 * restrictions at verify time. 
9933 */ 9934 if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9935 &scx_kfunc_set_enqueue_dispatch)) || 9936 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9937 &scx_kfunc_set_dispatch)) || 9938 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9939 &scx_kfunc_set_cpu_release)) || 9940 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9941 &scx_kfunc_set_unlocked)) || 9942 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 9943 &scx_kfunc_set_unlocked)) || 9944 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9945 &scx_kfunc_set_any)) || 9946 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, 9947 &scx_kfunc_set_any)) || 9948 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 9949 &scx_kfunc_set_any))) { 9950 pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); 9951 return ret; 9952 } 9953 9954 ret = scx_idle_init(); 9955 if (ret) { 9956 pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); 9957 return ret; 9958 } 9959 9960 ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); 9961 if (ret) { 9962 pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); 9963 return ret; 9964 } 9965 9966 ret = register_pm_notifier(&scx_pm_notifier); 9967 if (ret) { 9968 pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); 9969 return ret; 9970 } 9971 9972 scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); 9973 if (!scx_kset) { 9974 pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); 9975 return -ENOMEM; 9976 } 9977 9978 ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); 9979 if (ret < 0) { 9980 pr_err("sched_ext: Failed to add global attributes\n"); 9981 return ret; 9982 } 9983 9984 return 0; 9985 } 9986 __initcall(scx_init); 9987