1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst 4 * 5 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 6 * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 7 * Copyright (c) 2022 David Vernet <dvernet@meta.com> 8 */ 9 #include <linux/btf_ids.h> 10 #include "ext_idle.h" 11 12 static DEFINE_RAW_SPINLOCK(scx_sched_lock); 13 14 /* 15 * NOTE: sched_ext is in the process of growing multiple scheduler support and 16 * scx_root usage is in a transitional state. Naked dereferences are safe if the 17 * caller is one of the tasks attached to SCX and explicit RCU dereference is 18 * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but 19 * are used as temporary markers to indicate that the dereferences need to be 20 * updated to point to the associated scheduler instances rather than scx_root. 21 */ 22 struct scx_sched __rcu *scx_root; 23 24 /* 25 * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock. 26 * Readers can hold either or rcu_read_lock(). 27 */ 28 static LIST_HEAD(scx_sched_all); 29 30 #ifdef CONFIG_EXT_SUB_SCHED 31 static const struct rhashtable_params scx_sched_hash_params = { 32 .key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id), 33 .key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id), 34 .head_offset = offsetof(struct scx_sched, hash_node), 35 .insecure_elasticity = true, /* inserted under scx_sched_lock */ 36 }; 37 38 static struct rhashtable scx_sched_hash; 39 #endif 40 41 /* 42 * During exit, a task may schedule after losing its PIDs. When disabling the 43 * BPF scheduler, we need to be able to iterate tasks in every state to 44 * guarantee system safety. Maintain a dedicated task list which contains every 45 * task between its fork and eventual free. 
46 */ 47 static DEFINE_RAW_SPINLOCK(scx_tasks_lock); 48 static LIST_HEAD(scx_tasks); 49 50 /* ops enable/disable */ 51 static DEFINE_MUTEX(scx_enable_mutex); 52 DEFINE_STATIC_KEY_FALSE(__scx_enabled); 53 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 54 static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); 55 static DEFINE_RAW_SPINLOCK(scx_bypass_lock); 56 static bool scx_init_task_enabled; 57 static bool scx_switching_all; 58 DEFINE_STATIC_KEY_FALSE(__scx_switched_all); 59 60 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); 61 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); 62 63 #ifdef CONFIG_EXT_SUB_SCHED 64 /* 65 * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit 66 * tasks for the sub-sched being enabled. Use a global variable instead of a 67 * per-task field as all enables are serialized. 68 */ 69 static struct scx_sched *scx_enabling_sub_sched; 70 #else 71 #define scx_enabling_sub_sched (struct scx_sched *)NULL 72 #endif /* CONFIG_EXT_SUB_SCHED */ 73 74 /* 75 * A monotonically increasing sequence number that is incremented every time a 76 * scheduler is enabled. This can be used to check if any custom sched_ext 77 * scheduler has ever been used in the system. 78 */ 79 static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); 80 81 /* 82 * Watchdog interval. All scx_sched's share a single watchdog timer and the 83 * interval is half of the shortest sch->watchdog_timeout. 84 */ 85 static unsigned long scx_watchdog_interval; 86 87 /* 88 * The last time the delayed work was run. This delayed work relies on 89 * ksoftirqd being able to run to service timer interrupts, so it's possible 90 * that this work itself could get wedged. To account for this, we check that 91 * it's not stalled in the timer tick, and trigger an error if it is. 
92 */ 93 static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; 94 95 static struct delayed_work scx_watchdog_work; 96 97 /* 98 * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence 99 * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu 100 * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated 101 * lazily when enabling and freed when disabling to avoid waste when sched_ext 102 * isn't active. 103 */ 104 struct scx_kick_syncs { 105 struct rcu_head rcu; 106 unsigned long syncs[]; 107 }; 108 109 static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs); 110 111 /* 112 * Direct dispatch marker. 113 * 114 * Non-NULL values are used for direct dispatch from enqueue path. A valid 115 * pointer points to the task currently being enqueued. An ERR_PTR value is used 116 * to indicate that direct dispatch has already happened. 117 */ 118 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); 119 120 static const struct rhashtable_params dsq_hash_params = { 121 .key_len = sizeof_field(struct scx_dispatch_q, id), 122 .key_offset = offsetof(struct scx_dispatch_q, id), 123 .head_offset = offsetof(struct scx_dispatch_q, hash_node), 124 }; 125 126 static LLIST_HEAD(dsqs_to_free); 127 128 /* string formatting from BPF */ 129 struct scx_bstr_buf { 130 u64 data[MAX_BPRINTF_VARARGS]; 131 char line[SCX_EXIT_MSG_LEN]; 132 }; 133 134 static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); 135 static struct scx_bstr_buf scx_exit_bstr_buf; 136 137 /* ops debug dump */ 138 static DEFINE_RAW_SPINLOCK(scx_dump_lock); 139 140 struct scx_dump_data { 141 s32 cpu; 142 bool first; 143 s32 cursor; 144 struct seq_buf *s; 145 const char *prefix; 146 struct scx_bstr_buf buf; 147 }; 148 149 static struct scx_dump_data scx_dump_data = { 150 .cpu = -1, 151 }; 152 153 /* /sys/kernel/sched_ext interface */ 154 static struct kset *scx_kset; 155 156 /* 157 * Parameters that can be adjusted through 
/sys/module/sched_ext/parameters. 158 * There usually is no reason to modify these as normal scheduler operation 159 * shouldn't be affected by them. The knobs are primarily for debugging. 160 */ 161 static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; 162 static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; 163 164 static int set_slice_us(const char *val, const struct kernel_param *kp) 165 { 166 return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC); 167 } 168 169 static const struct kernel_param_ops slice_us_param_ops = { 170 .set = set_slice_us, 171 .get = param_get_uint, 172 }; 173 174 static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp) 175 { 176 return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC); 177 } 178 179 static const struct kernel_param_ops bypass_lb_intv_us_param_ops = { 180 .set = set_bypass_lb_intv_us, 181 .get = param_get_uint, 182 }; 183 184 #undef MODULE_PARAM_PREFIX 185 #define MODULE_PARAM_PREFIX "sched_ext." 
186 187 module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); 188 MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); 189 module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600); 190 MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)"); 191 192 #undef MODULE_PARAM_PREFIX 193 194 #define CREATE_TRACE_POINTS 195 #include <trace/events/sched_ext.h> 196 197 static void run_deferred(struct rq *rq); 198 static bool task_dead_and_done(struct task_struct *p); 199 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); 200 static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind); 201 static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, 202 s64 exit_code, const char *fmt, va_list args); 203 204 static __printf(4, 5) bool scx_exit(struct scx_sched *sch, 205 enum scx_exit_kind kind, s64 exit_code, 206 const char *fmt, ...) 207 { 208 va_list args; 209 bool ret; 210 211 va_start(args, fmt); 212 ret = scx_vexit(sch, kind, exit_code, fmt, args); 213 va_end(args); 214 215 return ret; 216 } 217 218 #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) 219 #define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args) 220 221 #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) 222 223 static long jiffies_delta_msecs(unsigned long at, unsigned long now) 224 { 225 if (time_after(at, now)) 226 return jiffies_to_msecs(at - now); 227 else 228 return -(long)jiffies_to_msecs(now - at); 229 } 230 231 static bool u32_before(u32 a, u32 b) 232 { 233 return (s32)(a - b) < 0; 234 } 235 236 #ifdef CONFIG_EXT_SUB_SCHED 237 /** 238 * scx_parent - Find the parent sched 239 * @sch: sched to find the parent of 240 * 241 * Returns the parent scheduler or %NULL if @sch is root. 
242 */ 243 static struct scx_sched *scx_parent(struct scx_sched *sch) 244 { 245 if (sch->level) 246 return sch->ancestors[sch->level - 1]; 247 else 248 return NULL; 249 } 250 251 /** 252 * scx_next_descendant_pre - find the next descendant for pre-order walk 253 * @pos: the current position (%NULL to initiate traversal) 254 * @root: sched whose descendants to walk 255 * 256 * To be used by scx_for_each_descendant_pre(). Find the next descendant to 257 * visit for pre-order traversal of @root's descendants. @root is included in 258 * the iteration and the first node to be visited. 259 */ 260 static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, 261 struct scx_sched *root) 262 { 263 struct scx_sched *next; 264 265 lockdep_assert(lockdep_is_held(&scx_enable_mutex) || 266 lockdep_is_held(&scx_sched_lock)); 267 268 /* if first iteration, visit @root */ 269 if (!pos) 270 return root; 271 272 /* visit the first child if exists */ 273 next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling); 274 if (next) 275 return next; 276 277 /* no child, visit my or the closest ancestor's next sibling */ 278 while (pos != root) { 279 if (!list_is_last(&pos->sibling, &scx_parent(pos)->children)) 280 return list_next_entry(pos, sibling); 281 pos = scx_parent(pos); 282 } 283 284 return NULL; 285 } 286 287 static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) 288 { 289 return rhashtable_lookup(&scx_sched_hash, &cgroup_id, 290 scx_sched_hash_params); 291 } 292 293 static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) 294 { 295 rcu_assign_pointer(p->scx.sched, sch); 296 } 297 #else /* CONFIG_EXT_SUB_SCHED */ 298 static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } 299 static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? 
NULL : root; } 300 static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} 301 #endif /* CONFIG_EXT_SUB_SCHED */ 302 303 /** 304 * scx_is_descendant - Test whether sched is a descendant 305 * @sch: sched to test 306 * @ancestor: ancestor sched to test against 307 * 308 * Test whether @sch is a descendant of @ancestor. 309 */ 310 static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor) 311 { 312 if (sch->level < ancestor->level) 313 return false; 314 return sch->ancestors[ancestor->level] == ancestor; 315 } 316 317 /** 318 * scx_for_each_descendant_pre - pre-order walk of a sched's descendants 319 * @pos: iteration cursor 320 * @root: sched to walk the descendants of 321 * 322 * Walk @root's descendants. @root is included in the iteration and the first 323 * node to be visited. Must be called with either scx_enable_mutex or 324 * scx_sched_lock held. 325 */ 326 #define scx_for_each_descendant_pre(pos, root) \ 327 for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos); \ 328 (pos) = scx_next_descendant_pre((pos), (root))) 329 330 static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, s32 cpu) 331 { 332 return &sch->pnode[cpu_to_node(cpu)]->global_dsq; 333 } 334 335 static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) 336 { 337 return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params); 338 } 339 340 static const struct sched_class *scx_setscheduler_class(struct task_struct *p) 341 { 342 if (p->sched_class == &stop_sched_class) 343 return &stop_sched_class; 344 345 return __setscheduler_class(p->policy, p->prio); 346 } 347 348 static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu) 349 { 350 return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq; 351 } 352 353 static struct scx_dispatch_q *bypass_enq_target_dsq(struct scx_sched *sch, s32 cpu) 354 { 355 #ifdef CONFIG_EXT_SUB_SCHED 356 /* 357 * If @sch is a sub-sched which is bypassing, its tasks should go into 
358 * the bypass DSQs of the nearest ancestor which is not bypassing. The 359 * not-bypassing ancestor is responsible for scheduling all tasks from 360 * bypassing sub-trees. If all ancestors including root are bypassing, 361 * all tasks should go to the root's bypass DSQs. 362 * 363 * Whenever a sched starts bypassing, all runnable tasks in its subtree 364 * are re-enqueued after scx_bypassing() is turned on, guaranteeing that 365 * all tasks are transferred to the right DSQs. 366 */ 367 while (scx_parent(sch) && scx_bypassing(sch, cpu)) 368 sch = scx_parent(sch); 369 #endif /* CONFIG_EXT_SUB_SCHED */ 370 371 return bypass_dsq(sch, cpu); 372 } 373 374 /** 375 * bypass_dsp_enabled - Check if bypass dispatch path is enabled 376 * @sch: scheduler to check 377 * 378 * When a descendant scheduler enters bypass mode, bypassed tasks are scheduled 379 * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors 380 * are bypassing. In the former case, the ancestor is not itself bypassing but 381 * its bypass DSQs will be populated with bypassed tasks from descendants. Thus, 382 * the ancestor's bypass dispatch path must be active even though its own 383 * bypass_depth remains zero. 384 * 385 * This function checks bypass_dsp_enable_depth which is managed separately from 386 * bypass_depth to enable this decoupling. See enable_bypass_dsp() and 387 * disable_bypass_dsp(). 388 */ 389 static bool bypass_dsp_enabled(struct scx_sched *sch) 390 { 391 return unlikely(atomic_read(&sch->bypass_dsp_enable_depth)); 392 } 393 394 /** 395 * rq_is_open - Is the rq available for immediate execution of an SCX task? 396 * @rq: rq to test 397 * @enq_flags: optional %SCX_ENQ_* of the task being enqueued 398 * 399 * Returns %true if @rq is currently open for executing an SCX task. 
After a 400 * %false return, @rq is guaranteed to invoke SCX dispatch path at least once 401 * before going to idle and not inserting a task into @rq's local DSQ after a 402 * %false return doesn't cause @rq to stall. 403 */ 404 static bool rq_is_open(struct rq *rq, u64 enq_flags) 405 { 406 lockdep_assert_rq_held(rq); 407 408 /* 409 * A higher-priority class task is either running or in the process of 410 * waking up on @rq. 411 */ 412 if (sched_class_above(rq->next_class, &ext_sched_class)) 413 return false; 414 415 /* 416 * @rq is either in transition to or in idle and there is no 417 * higher-priority class task waking up on it. 418 */ 419 if (sched_class_above(&ext_sched_class, rq->next_class)) 420 return true; 421 422 /* 423 * @rq is either picking, in transition to, or running an SCX task. 424 */ 425 426 /* 427 * If we're in the dispatch path holding rq lock, $curr may or may not 428 * be ready depending on whether the on-going dispatch decides to extend 429 * $curr's slice. We say yes here and resolve it at the end of dispatch. 430 * See balance_one(). 431 */ 432 if (rq->scx.flags & SCX_RQ_IN_BALANCE) 433 return true; 434 435 /* 436 * %SCX_ENQ_PREEMPT clears $curr's slice if on SCX and kicks dispatch, 437 * so allow it to avoid spuriously triggering reenq on a combined 438 * PREEMPT|IMMED insertion. 439 */ 440 if (enq_flags & SCX_ENQ_PREEMPT) 441 return true; 442 443 /* 444 * @rq is either in transition to or running an SCX task and can't go 445 * idle without another SCX dispatch cycle. 446 */ 447 return false; 448 } 449 450 /* 451 * Track the rq currently locked. 452 * 453 * This allows kfuncs to safely operate on rq from any scx ops callback, 454 * knowing which rq is already locked. 455 */ 456 DEFINE_PER_CPU(struct rq *, scx_locked_rq_state); 457 458 static inline void update_locked_rq(struct rq *rq) 459 { 460 /* 461 * Check whether @rq is actually locked. 
This can help expose bugs 462 * or incorrect assumptions about the context in which a kfunc or 463 * callback is executed. 464 */ 465 if (rq) 466 lockdep_assert_rq_held(rq); 467 __this_cpu_write(scx_locked_rq_state, rq); 468 } 469 470 /* 471 * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not 472 * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit. 473 */ 474 #define SCX_CALL_OP(sch, op, locked_rq, args...) \ 475 do { \ 476 struct rq *__prev_locked_rq; \ 477 \ 478 if (locked_rq) { \ 479 __prev_locked_rq = scx_locked_rq(); \ 480 update_locked_rq(locked_rq); \ 481 } \ 482 (sch)->ops.op(args); \ 483 if (locked_rq) \ 484 update_locked_rq(__prev_locked_rq); \ 485 } while (0) 486 487 #define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \ 488 ({ \ 489 struct rq *__prev_locked_rq; \ 490 __typeof__((sch)->ops.op(args)) __ret; \ 491 \ 492 if (locked_rq) { \ 493 __prev_locked_rq = scx_locked_rq(); \ 494 update_locked_rq(locked_rq); \ 495 } \ 496 __ret = (sch)->ops.op(args); \ 497 if (locked_rq) \ 498 update_locked_rq(__prev_locked_rq); \ 499 __ret; \ 500 }) 501 502 /* 503 * SCX_CALL_OP_TASK*() invokes an SCX op that takes one or two task arguments 504 * and records them in current->scx.kf_tasks[] for the duration of the call. A 505 * kfunc invoked from inside such an op can then use 506 * scx_kf_arg_task_ok() to verify that its task argument is one of 507 * those subject tasks. 508 * 509 * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held - 510 * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's 511 * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. 512 * So if kf_tasks[] is set, @p's scheduler-protected fields are stable. 513 * 514 * kf_tasks[] can not stack, so task-based SCX ops must not nest. The 515 * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants 516 * while a previous one is still in progress. 
517 */ 518 #define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...) \ 519 do { \ 520 WARN_ON_ONCE(current->scx.kf_tasks[0]); \ 521 current->scx.kf_tasks[0] = task; \ 522 SCX_CALL_OP((sch), op, locked_rq, task, ##args); \ 523 current->scx.kf_tasks[0] = NULL; \ 524 } while (0) 525 526 #define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \ 527 ({ \ 528 __typeof__((sch)->ops.op(task, ##args)) __ret; \ 529 WARN_ON_ONCE(current->scx.kf_tasks[0]); \ 530 current->scx.kf_tasks[0] = task; \ 531 __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \ 532 current->scx.kf_tasks[0] = NULL; \ 533 __ret; \ 534 }) 535 536 #define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) \ 537 ({ \ 538 __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ 539 WARN_ON_ONCE(current->scx.kf_tasks[0]); \ 540 current->scx.kf_tasks[0] = task0; \ 541 current->scx.kf_tasks[1] = task1; \ 542 __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \ 543 current->scx.kf_tasks[0] = NULL; \ 544 current->scx.kf_tasks[1] = NULL; \ 545 __ret; \ 546 }) 547 548 /* see SCX_CALL_OP_TASK() */ 549 static __always_inline bool scx_kf_arg_task_ok(struct scx_sched *sch, 550 struct task_struct *p) 551 { 552 if (unlikely((p != current->scx.kf_tasks[0] && 553 p != current->scx.kf_tasks[1]))) { 554 scx_error(sch, "called on a task not being operated on"); 555 return false; 556 } 557 558 return true; 559 } 560 561 enum scx_dsq_iter_flags { 562 /* iterate in the reverse dispatch order */ 563 SCX_DSQ_ITER_REV = 1U << 16, 564 565 __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, 566 __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, 567 568 __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, 569 __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | 570 __SCX_DSQ_ITER_HAS_SLICE | 571 __SCX_DSQ_ITER_HAS_VTIME, 572 }; 573 574 /** 575 * nldsq_next_task - Iterate to the next task in a non-local DSQ 576 * @dsq: non-local dsq being iterated 577 * @cur: current position, %NULL to start iteration 578 * @rev: 
walk backwards 579 * 580 * Returns %NULL when iteration is finished. 581 */ 582 static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, 583 struct task_struct *cur, bool rev) 584 { 585 struct list_head *list_node; 586 struct scx_dsq_list_node *dsq_lnode; 587 588 lockdep_assert_held(&dsq->lock); 589 590 if (cur) 591 list_node = &cur->scx.dsq_list.node; 592 else 593 list_node = &dsq->list; 594 595 /* find the next task, need to skip BPF iteration cursors */ 596 do { 597 if (rev) 598 list_node = list_node->prev; 599 else 600 list_node = list_node->next; 601 602 if (list_node == &dsq->list) 603 return NULL; 604 605 dsq_lnode = container_of(list_node, struct scx_dsq_list_node, 606 node); 607 } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); 608 609 return container_of(dsq_lnode, struct task_struct, scx.dsq_list); 610 } 611 612 #define nldsq_for_each_task(p, dsq) \ 613 for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ 614 (p) = nldsq_next_task((dsq), (p), false)) 615 616 /** 617 * nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ 618 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() 619 * @dsq: non-local dsq being iterated 620 * 621 * Find the next task in a cursor based iteration. The caller must have 622 * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock 623 * between the iteration steps. 624 * 625 * Only tasks which were queued before @cursor was initialized are visible. This 626 * bounds the iteration and guarantees that vtime never jumps in the other 627 * direction while iterating. 
628 */ 629 static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor, 630 struct scx_dispatch_q *dsq) 631 { 632 bool rev = cursor->flags & SCX_DSQ_ITER_REV; 633 struct task_struct *p; 634 635 lockdep_assert_held(&dsq->lock); 636 BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR)); 637 638 if (list_empty(&cursor->node)) 639 p = NULL; 640 else 641 p = container_of(cursor, struct task_struct, scx.dsq_list); 642 643 /* skip cursors and tasks that were queued after @cursor init */ 644 do { 645 p = nldsq_next_task(dsq, p, rev); 646 } while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq))); 647 648 if (p) { 649 if (rev) 650 list_move_tail(&cursor->node, &p->scx.dsq_list.node); 651 else 652 list_move(&cursor->node, &p->scx.dsq_list.node); 653 } else { 654 list_del_init(&cursor->node); 655 } 656 657 return p; 658 } 659 660 /** 661 * nldsq_cursor_lost_task - Test whether someone else took the task since iteration 662 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() 663 * @rq: rq @p was on 664 * @dsq: dsq @p was on 665 * @p: target task 666 * 667 * @p is a task returned by nldsq_cursor_next_task(). The locks may have been 668 * dropped and re-acquired inbetween. Verify that no one else took or is in the 669 * process of taking @p from @dsq. 670 * 671 * On %false return, the caller can assume full ownership of @p. 672 */ 673 static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor, 674 struct rq *rq, struct scx_dispatch_q *dsq, 675 struct task_struct *p) 676 { 677 lockdep_assert_rq_held(rq); 678 lockdep_assert_held(&dsq->lock); 679 680 /* 681 * @p could have already left $src_dsq, got re-enqueud, or be in the 682 * process of being consumed by someone else. 
683 */ 684 if (unlikely(p->scx.dsq != dsq || 685 u32_before(cursor->priv, p->scx.dsq_seq) || 686 p->scx.holding_cpu >= 0)) 687 return true; 688 689 /* if @p has stayed on @dsq, its rq couldn't have changed */ 690 if (WARN_ON_ONCE(rq != task_rq(p))) 691 return true; 692 693 return false; 694 } 695 696 /* 697 * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] 698 * dispatch order. BPF-visible iterator is opaque and larger to allow future 699 * changes without breaking backward compatibility. Can be used with 700 * bpf_for_each(). See bpf_iter_scx_dsq_*(). 701 */ 702 struct bpf_iter_scx_dsq_kern { 703 struct scx_dsq_list_node cursor; 704 struct scx_dispatch_q *dsq; 705 u64 slice; 706 u64 vtime; 707 } __attribute__((aligned(8))); 708 709 struct bpf_iter_scx_dsq { 710 u64 __opaque[6]; 711 } __attribute__((aligned(8))); 712 713 714 static u32 scx_get_task_state(const struct task_struct *p) 715 { 716 return p->scx.flags & SCX_TASK_STATE_MASK; 717 } 718 719 static void scx_set_task_state(struct task_struct *p, u32 state) 720 { 721 u32 prev_state = scx_get_task_state(p); 722 bool warn = false; 723 724 switch (state) { 725 case SCX_TASK_NONE: 726 warn = prev_state == SCX_TASK_DEAD; 727 break; 728 case SCX_TASK_INIT_BEGIN: 729 warn = prev_state != SCX_TASK_NONE; 730 break; 731 case SCX_TASK_INIT: 732 warn = prev_state != SCX_TASK_INIT_BEGIN; 733 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 734 break; 735 case SCX_TASK_READY: 736 warn = !(prev_state == SCX_TASK_INIT || 737 prev_state == SCX_TASK_ENABLED); 738 break; 739 case SCX_TASK_ENABLED: 740 warn = prev_state != SCX_TASK_READY; 741 break; 742 case SCX_TASK_DEAD: 743 warn = !(prev_state == SCX_TASK_NONE || 744 prev_state == SCX_TASK_INIT_BEGIN); 745 break; 746 default: 747 WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", 748 prev_state, state, p->comm, p->pid); 749 return; 750 } 751 752 WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", 753 prev_state, 
state, p->comm, p->pid); 754 755 p->scx.flags &= ~SCX_TASK_STATE_MASK; 756 p->scx.flags |= state; 757 } 758 759 /* 760 * SCX task iterator. 761 */ 762 struct scx_task_iter { 763 struct sched_ext_entity cursor; 764 struct task_struct *locked_task; 765 struct rq *rq; 766 struct rq_flags rf; 767 u32 cnt; 768 bool list_locked; 769 #ifdef CONFIG_EXT_SUB_SCHED 770 struct cgroup *cgrp; 771 struct cgroup_subsys_state *css_pos; 772 struct css_task_iter css_iter; 773 #endif 774 }; 775 776 /** 777 * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration 778 * @iter: iterator to init 779 * @cgrp: Optional root of cgroup subhierarchy to iterate 780 * 781 * Initialize @iter. Once initialized, @iter must eventually be stopped with 782 * scx_task_iter_stop(). 783 * 784 * If @cgrp is %NULL, scx_tasks is used for iteration and this function returns 785 * with scx_tasks_lock held and @iter->cursor inserted into scx_tasks. 786 * 787 * If @cgrp is not %NULL, @cgrp and its descendants' tasks are walked using 788 * @iter->css_iter. The caller must be holding cgroup_lock() to prevent cgroup 789 * task migrations. 790 * 791 * The two modes of iterations are largely independent and it's likely that 792 * scx_tasks can be removed in favor of always using cgroup iteration if 793 * CONFIG_SCHED_CLASS_EXT depends on CONFIG_CGROUPS. 794 * 795 * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() 796 * between this and the first next() call or between any two next() calls. If 797 * the locks are released between two next() calls, the caller is responsible 798 * for ensuring that the task being iterated remains accessible either through 799 * RCU read lock or obtaining a reference count. 800 * 801 * All tasks which existed when the iteration started are guaranteed to be 802 * visited as long as they are not dead. 
803 */ 804 static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) 805 { 806 memset(iter, 0, sizeof(*iter)); 807 808 #ifdef CONFIG_EXT_SUB_SCHED 809 if (cgrp) { 810 lockdep_assert_held(&cgroup_mutex); 811 iter->cgrp = cgrp; 812 iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self); 813 css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, 814 &iter->css_iter); 815 return; 816 } 817 #endif 818 raw_spin_lock_irq(&scx_tasks_lock); 819 820 iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; 821 list_add(&iter->cursor.tasks_node, &scx_tasks); 822 iter->list_locked = true; 823 } 824 825 static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) 826 { 827 if (iter->locked_task) { 828 __balance_callbacks(iter->rq, &iter->rf); 829 task_rq_unlock(iter->rq, iter->locked_task, &iter->rf); 830 iter->locked_task = NULL; 831 } 832 } 833 834 /** 835 * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator 836 * @iter: iterator to unlock 837 * 838 * If @iter is in the middle of a locked iteration, it may be locking the rq of 839 * the task currently being visited in addition to scx_tasks_lock. Unlock both. 840 * This function can be safely called anytime during an iteration. The next 841 * iterator operation will automatically restore the necessary locking. 842 */ 843 static void scx_task_iter_unlock(struct scx_task_iter *iter) 844 { 845 __scx_task_iter_rq_unlock(iter); 846 if (iter->list_locked) { 847 iter->list_locked = false; 848 raw_spin_unlock_irq(&scx_tasks_lock); 849 } 850 } 851 852 static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) 853 { 854 if (!iter->list_locked) { 855 raw_spin_lock_irq(&scx_tasks_lock); 856 iter->list_locked = true; 857 } 858 } 859 860 /** 861 * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock 862 * @iter: iterator to exit 863 * 864 * Exit a previously initialized @iter. 
Must be called with scx_tasks_lock held
 * which is released on return. If the iterator holds a task's rq lock, that rq
 * lock is also released. See scx_task_iter_start() for details.
 */
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
#ifdef CONFIG_EXT_SUB_SCHED
	/* cgroup-scoped walk: close an open css iteration and drop a held rq lock */
	if (iter->cgrp) {
		if (iter->css_pos)
			css_task_iter_end(&iter->css_iter);
		__scx_task_iter_rq_unlock(iter);
		return;
	}
#endif
	/* global walk: take the cursor off the list before unlocking */
	__scx_task_iter_maybe_relock(iter);
	list_del_init(&iter->cursor.tasks_node);
	scx_task_iter_unlock(iter);
}

/**
 * scx_task_iter_next - Next task
 * @iter: iterator to walk
 *
 * Visit the next task. See scx_task_iter_start() for details. Locks are dropped
 * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls
 * by holding scx_tasks_lock for too long.
 *
 * Return: the next task, or %NULL once the walk is complete.
 */
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
	struct list_head *cursor = &iter->cursor.tasks_node;
	struct sched_ext_entity *pos;

	/* periodically unlock and give the scheduler a chance to run */
	if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
		scx_task_iter_unlock(iter);
		cond_resched();
	}

#ifdef CONFIG_EXT_SUB_SCHED
	if (iter->cgrp) {
		/*
		 * Drain the current css's tasks and then advance to the next
		 * descendant of @iter->cgrp in pre-order until none is left.
		 */
		while (iter->css_pos) {
			struct task_struct *p;

			p = css_task_iter_next(&iter->css_iter);
			if (p)
				return p;

			css_task_iter_end(&iter->css_iter);
			iter->css_pos = css_next_descendant_pre(iter->css_pos,
								&iter->cgrp->self);
			if (iter->css_pos)
				css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
						    &iter->css_iter);
		}
		return NULL;
	}
#endif
	__scx_task_iter_maybe_relock(iter);

	/*
	 * Walk forward from our cursor. Reaching the scx_tasks list head means
	 * the walk is over. Entries flagged %SCX_TASK_CURSOR are skipped; for
	 * any other entry, move our cursor right behind it and return the task
	 * embedding it.
	 */
	list_for_each_entry(pos, cursor, tasks_node) {
		if (&pos->tasks_node == &scx_tasks)
			return NULL;
		if (!(pos->flags & SCX_TASK_CURSOR)) {
			list_move(cursor, &pos->tasks_node);
			return container_of(pos, struct task_struct, scx);
		}
	}

	/* can't happen, should always terminate at scx_tasks above */
	BUG();
}

/**
 * scx_task_iter_next_locked - Next non-idle task with its rq locked
 * @iter: iterator to walk
 *
 * Visit the next non-idle task with its rq lock held. Tasks which are already
 * in %SCX_TASK_DEAD state are skipped. The rq lock taken for the previously
 * returned task, if any, is released first. See scx_task_iter_start() for
 * details.
 *
 * Return: the next task with its rq locked, or %NULL once the walk is complete.
 */
static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
{
	struct task_struct *p;

	__scx_task_iter_rq_unlock(iter);

	while ((p = scx_task_iter_next(iter))) {
		/*
		 * scx_task_iter is used to prepare and move tasks into SCX
		 * while loading the BPF scheduler and vice-versa while
		 * unloading. The init_tasks ("swappers") should be excluded
		 * from the iteration because:
		 *
		 * - It's unsafe to use __setscheduler_prio() on an init_task to
		 *   determine the sched_class to use as it won't preserve its
		 *   idle_sched_class.
		 *
		 * - ops.init/exit_task() can easily be confused if called with
		 *   init_tasks as they, e.g., share PID 0.
		 *
		 * As init_tasks are never scheduled through SCX, they can be
		 * skipped safely. Note that is_idle_task() which tests %PF_IDLE
		 * doesn't work here:
		 *
		 * - %PF_IDLE may not be set for an init_task whose CPU hasn't
		 *   yet been onlined.
		 *
		 * - %PF_IDLE can be set on tasks that are not init_tasks. See
		 *   play_idle_precise() used by CONFIG_IDLE_INJECT.
		 *
		 * Test for idle_sched_class as only init_tasks are on it.
		 */
		if (p->sched_class == &idle_sched_class)
			continue;

		iter->rq = task_rq_lock(p, &iter->rf);
		iter->locked_task = p;

		/*
		 * cgroup_task_dead() removes the dead tasks from cset->tasks
		 * after sched_ext_dead() and cgroup iteration may see tasks
		 * which already finished sched_ext_dead(). %SCX_TASK_DEAD is
		 * set by sched_ext_dead() under @p's rq lock. Test it to
		 * avoid visiting tasks which are already dead from SCX POV.
		 */
		if (scx_get_task_state(p) == SCX_TASK_DEAD) {
			__scx_task_iter_rq_unlock(iter);
			continue;
		}

		return p;
	}
	return NULL;
}

/**
 * scx_add_event - Increase an event counter for 'name' by 'cnt'
 * @sch: scx_sched to account events for
 * @name: an event name defined in struct scx_event_stats
 * @cnt: the number of occurrences to add
 *
 * This can be used when preemption is not disabled.
 */
#define scx_add_event(sch, name, cnt) do {					\
	this_cpu_add((sch)->pcpu->event_stats.name, (cnt));			\
	trace_sched_ext_event(#name, (cnt));					\
} while(0)

/**
 * __scx_add_event - Increase an event counter for 'name' by 'cnt'
 * @sch: scx_sched to account events for
 * @name: an event name defined in struct scx_event_stats
 * @cnt: the number of occurrences to add
 *
 * This should be used only when preemption is disabled.
 */
#define __scx_add_event(sch, name, cnt) do {					\
	__this_cpu_add((sch)->pcpu->event_stats.name, (cnt));			\
	trace_sched_ext_event(#name, cnt);					\
} while(0)

/**
 * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
 * @dst_e: destination event stats
 * @src_e: source event stats
 * @kind: a kind of event to be aggregated
 */
#define scx_agg_event(dst_e, src_e, kind) do {					\
	(dst_e)->kind += READ_ONCE((src_e)->kind);				\
} while(0)

/**
 * scx_dump_event - Dump an event 'kind' in 'events' to 's'
 * @s: output seq_buf
 * @events: event stats
 * @kind: a kind of event to dump
 */
#define scx_dump_event(s, events, kind) do {					\
	dump_line(&(s), "%40s: %16lld", #kind, (events)->kind);			\
} while (0)


static void scx_read_events(struct scx_sched *sch,
			    struct scx_event_stats *events);

/* Read the current value of the global enable state. */
static enum scx_enable_state scx_enable_state(void)
{
	return atomic_read(&scx_enable_state_var);
}

/* Unconditionally switch the enable state to @to and return the old state. */
static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to)
{
	return atomic_xchg(&scx_enable_state_var, to);
}

/*
 * Switch the enable state to @to only if it currently is @from. Returns %true
 * if the transition took place.
 */
static bool scx_tryset_enable_state(enum scx_enable_state to,
				    enum scx_enable_state from)
{
	int from_v = from;

	return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to);
}

/**
 * wait_ops_state - Busy-wait the specified ops state to end
 * @p: target task
 * @opss: state to wait the end of
 *
 * Busy-wait for @p to transition out of @opss. This can only be used when the
 * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also
 * has load_acquire semantics to ensure that the caller can see the updates made
 * in the enqueueing and dispatching paths.
 */
static void wait_ops_state(struct task_struct *p, unsigned long opss)
{
	do {
		cpu_relax();
	} while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
}

/* %true if @cpu is within [0, nr_cpu_ids) and is a possible CPU */
static inline bool __cpu_valid(s32 cpu)
{
	return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu));
}

/**
 * ops_cpu_valid - Verify a cpu number, to be used on ops input args
 * @sch: scx_sched to abort on error
 * @cpu: cpu number which came from a BPF ops
 * @where: extra information reported on error
 *
 * @cpu is a cpu number which came from the BPF scheduler and can be any value.
 * Verify that it is in range and one of the possible cpus. If invalid, trigger
 * an ops error.
 *
 * Return: %true if @cpu is valid, %false otherwise.
 */
static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
{
	if (__cpu_valid(cpu)) {
		return true;
	} else {
		/* @where may be NULL, in which case nothing is appended */
		scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
		return false;
	}
}

/**
 * ops_sanitize_err - Sanitize a -errno value
 * @sch: scx_sched to error out on error
 * @ops_name: operation to blame on failure
 * @err: -errno value to sanitize
 *
 * Verify @err is a valid -errno. If not, trigger scx_error() and return
 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
 * cause misbehaviors. For example, a large negative return from
 * ops.init_task() triggers an oops when passed up the call chain because the
 * value fails IS_ERR() test after being encoded with ERR_PTR() and then is
 * handled as a pointer.
 *
 * Return: @err if it is within [-%MAX_ERRNO, -1], -%EPROTO otherwise.
 */
static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err)
{
	if (err < 0 && err >= -MAX_ERRNO)
		return err;

	scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err);
	return -EPROTO;
}

/*
 * Run the deferred actions of @rq. Unlike deferred_irq_workfn(), this doesn't
 * take the rq lock itself.
 * NOTE(review): presumably used as a balance callback where @rq is already
 * locked - confirm against the caller.
 */
static void deferred_bal_cb_workfn(struct rq *rq)
{
	run_deferred(rq);
}

/*
 * irq_work handler for rq->scx.deferred_irq_work. Looks up the rq embedding
 * @irq_work and runs its deferred actions with the rq locked.
 */
static void deferred_irq_workfn(struct irq_work *irq_work)
{
	struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work);

	raw_spin_rq_lock(rq);
	run_deferred(rq);
	raw_spin_rq_unlock(rq);
}

/**
 * schedule_deferred - Schedule execution of deferred actions on an rq
 * @rq: target rq
 *
 * Schedule execution of deferred actions on @rq. Deferred actions are executed
 * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks
 * to other rqs.
 */
static void schedule_deferred(struct rq *rq)
{
	/*
	 * This is the fallback when schedule_deferred_locked() can't use
	 * the cheaper balance callback or wakeup hook paths (the target
	 * CPU is not in balance or wakeup). Currently, this is primarily
	 * hit by reenqueue operations targeting a remote CPU.
	 *
	 * Queue on the target CPU. The deferred work can run from any CPU
	 * correctly - the _locked() path already processes remote rqs from
	 * the calling CPU - but targeting the owning CPU allows IPI delivery
	 * without waiting for the calling CPU to re-enable IRQs and is
	 * cheaper as the reenqueue runs locally.
	 */
	irq_work_queue_on(&rq->scx.deferred_irq_work, cpu_of(rq));
}

/**
 * schedule_deferred_locked - Schedule execution of deferred actions on an rq
 * @rq: target rq
 *
 * Schedule execution of deferred actions on @rq. Equivalent to
 * schedule_deferred() but requires @rq to be locked and can be more efficient.
 */
static void schedule_deferred_locked(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	/*
	 * If in the middle of waking up a task, task_woken_scx() will be called
	 * afterwards which will then run the deferred actions, no need to
	 * schedule anything.
	 */
	if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
		return;

	/* Don't do anything if there already is a deferred operation. */
	if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
		return;

	/*
	 * If in balance, the balance callbacks will be called before rq lock is
	 * released. Schedule one.
	 *
	 * We can't directly insert the callback into the rq's list: The call
	 * can drop its lock and make the pending balance callback visible to
	 * unrelated code paths that call rq_pin_lock().
	 *
	 * Just let balance_one() know that it must do it itself.
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
		rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
		return;
	}

	/*
	 * No scheduler hooks available. Use the generic irq_work path. The
	 * above WAKEUP and BALANCE paths should cover most of the cases and the
	 * time to IRQ re-enable shouldn't be long.
	 */
	schedule_deferred(rq);
}

/**
 * schedule_dsq_reenq - Schedule a deferred reenqueue of a DSQ
 * @sch: scx_sched requesting the reenqueue
 * @dsq: DSQ to reenqueue, either a local DSQ or a non-built-in DSQ
 * @reenq_flags: flags to accumulate on the pending reenqueue request
 * @locked_rq: rq whose lock the caller holds; only compared against the target
 *	       rq to pick between the locked and unlocked scheduling paths
 *
 * Queue the per-cpu reenqueue request node for @dsq on the target rq, merging
 * @reenq_flags into the node's flags, and then schedule the deferred actions on
 * that rq. For a local DSQ, the target is the rq which embeds @dsq. For a
 * non-built-in DSQ, the target is this_rq(). Any other built-in DSQ triggers
 * scx_error(). Nothing is done while @sch is bypassing.
 */
static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq,
			       u64 reenq_flags, struct rq *locked_rq)
{
	struct rq *rq;

	/*
	 * Allowing reenqueues doesn't make sense while bypassing. This also
	 * blocks new reenqueues from being scheduled on dead scheds.
	 */
	if (unlikely(READ_ONCE(sch->bypass_depth)))
		return;

	if (dsq->id == SCX_DSQ_LOCAL) {
		rq = container_of(dsq, struct rq, scx.local_dsq);

		struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq));
		struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local;

		/*
		 * Pairs with smp_mb() in process_deferred_reenq_locals() and
		 * guarantees that there is a reenq_local() afterwards.
		 */
		smp_mb();

		/*
		 * Lockless check first. The lock is taken only if the node
		 * isn't queued yet or some of @reenq_flags are missing.
		 * list_empty() is tested again under the lock.
		 */
		if (list_empty(&drl->node) ||
		    (READ_ONCE(drl->flags) & reenq_flags) != reenq_flags) {

			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);

			if (list_empty(&drl->node))
				list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals);
			WRITE_ONCE(drl->flags, drl->flags | reenq_flags);
		}
	} else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) {
		rq = this_rq();

		struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq));
		struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user;

		/*
		 * Pairs with smp_mb() in process_deferred_reenq_users() and
		 * guarantees that there is a reenq_user() afterwards.
		 */
		smp_mb();

		/* same lockless-check-then-lock sequence as the local DSQ case */
		if (list_empty(&dru->node) ||
		    (READ_ONCE(dru->flags) & reenq_flags) != reenq_flags) {

			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);

			if (list_empty(&dru->node))
				list_move_tail(&dru->node, &rq->scx.deferred_reenq_users);
			WRITE_ONCE(dru->flags, dru->flags | reenq_flags);
		}
	} else {
		scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
		return;
	}

	if (rq == locked_rq)
		schedule_deferred_locked(rq);
	else
		schedule_deferred(rq);
}

/*
 * Schedule a reenqueue of @rq's local DSQ on behalf of the root scheduler.
 * @rq is passed as the locked rq. Warns once and does nothing if there is no
 * root scheduler.
 */
static void schedule_reenq_local(struct rq *rq, u64 reenq_flags)
{
	struct scx_sched *root = rcu_dereference_sched(scx_root);

	if (WARN_ON_ONCE(!root))
		return;

	schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags, rq);
}

/**
 * touch_core_sched - Update timestamp used for core-sched task ordering
 * @rq: rq to read clock from, must be locked
 * @p: task to update the timestamp for
 *
 * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
 * implement global or local-DSQ FIFO ordering for core-sched. Should be called
 * when a task becomes runnable and its turn on the CPU ends (e.g. slice
 * exhaustion).
 */
static void touch_core_sched(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

#ifdef CONFIG_SCHED_CORE
	/*
	 * It's okay to update the timestamp spuriously. Use
	 * sched_core_disabled() which is cheaper than enabled().
	 *
	 * As this is used to determine ordering between tasks of sibling CPUs,
	 * it may be better to use per-core dispatch sequence instead.
	 */
	if (!sched_core_disabled())
		p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq));
#endif
}

/**
 * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
 * @rq: rq to read clock from, must be locked
 * @p: task being dispatched
 *
 * If the BPF scheduler implements custom core-sched ordering via
 * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
 * ordering within each local DSQ. This function is called from dispatch paths
 * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
 */
static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

#ifdef CONFIG_SCHED_CORE
	if (unlikely(SCX_HAS_OP(scx_root, core_sched_before)))
		touch_core_sched(rq, p);
#endif
}

/*
 * Account the execution time of @rq's current task. The consumed time is
 * subtracted from the task's slice, clamped so that the slice stops at zero,
 * unless the slice is %SCX_SLICE_INF. When the slice runs out, the core-sched
 * timestamp is refreshed. The consumed time is also fed to @rq's ext_server.
 */
static void update_curr_scx(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	s64 delta_exec;

	delta_exec = update_curr_common(rq);
	if (unlikely(delta_exec <= 0))
		return;

	if (curr->scx.slice != SCX_SLICE_INF) {
		curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec);
		if (!curr->scx.slice)
			touch_core_sched(rq, curr);
	}

	dl_server_update(&rq->ext_server, delta_exec);
}

/*
 * rbtree ordering callback for a DSQ's priq, see rb_add() in
 * dispatch_enqueue(). Returns %true if the task embedding @node_a has an
 * earlier dsq_vtime than the one embedding @node_b.
 */
static bool scx_dsq_priq_less(struct rb_node *node_a,
			      const struct rb_node *node_b)
{
	const struct task_struct *a =
		container_of(node_a, struct task_struct, scx.dsq_priq);
	const struct task_struct *b =
		container_of(node_b, struct task_struct, scx.dsq_priq);

	return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime);
}

/*
 * Account @p being added to @dsq. Bumps @dsq->nr and maintains the IMMED
 * bookkeeping: %SCX_TASK_IMMED on @p, @rq->scx.nr_immed on the rq owning the
 * local DSQ, and a local reenqueue request if @p can't get on the CPU
 * immediately.
 */
static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags)
{
	/* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
	WRITE_ONCE(dsq->nr, dsq->nr + 1);

	/*
	 * Once @p reaches a local DSQ, it can only leave it by being dispatched
	 * to the CPU or dequeued. In both cases, the only way @p can go back to
	 * the BPF sched is through enqueueing. If being inserted into a local
	 * DSQ with IMMED, persist the state until the next enqueueing event in
	 * do_enqueue_task() so that we can maintain IMMED protection through
	 * e.g. SAVE/RESTORE cycles and slice extensions.
	 */
	if (enq_flags & SCX_ENQ_IMMED) {
		/* IMMED on a non-local DSQ is only expected with GDSQ_FALLBACK */
		if (unlikely(dsq->id != SCX_DSQ_LOCAL)) {
			WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK));
			return;
		}
		p->scx.flags |= SCX_TASK_IMMED;
	}

	if (p->scx.flags & SCX_TASK_IMMED) {
		/* only dereferenced after @dsq is verified to be a local DSQ */
		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);

		if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
			return;

		rq->scx.nr_immed++;

		/*
		 * If @rq already had other tasks or the current task is not
		 * done yet, @p can't go on the CPU immediately. Re-enqueue.
		 */
		if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags)))
			schedule_reenq_local(rq, 0);
	}
}

/*
 * Account @p being removed from @dsq. Counterpart of dsq_inc_nr(): drops
 * @dsq->nr and, for %SCX_TASK_IMMED tasks, the owning rq's nr_immed.
 */
static void dsq_dec_nr(struct scx_dispatch_q *dsq, struct task_struct *p)
{
	/* see dsq_inc_nr() */
	WRITE_ONCE(dsq->nr, dsq->nr - 1);

	if (p->scx.flags & SCX_TASK_IMMED) {
		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);

		if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) ||
		    WARN_ON_ONCE(rq->scx.nr_immed <= 0))
			return;

		rq->scx.nr_immed--;
	}
}

/*
 * Reset @p's slice to @sch's default slice and count the refill. Uses
 * __scx_add_event() and thus must be called with preemption disabled.
 */
static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
{
	p->scx.slice = READ_ONCE(sch->slice_dfl);
	__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}

/*
 * Return true if @p is moving due to an internal SCX migration, false
 * otherwise.
*/
static inline bool task_scx_migrating(struct task_struct *p)
{
	/*
	 * We only need to check sticky_cpu: it is set to the destination
	 * CPU in move_remote_task_to_local_dsq() before deactivate_task()
	 * and cleared when the task is enqueued on the destination, so it
	 * is only non-negative during an internal SCX migration.
	 */
	return p->scx.sticky_cpu >= 0;
}

/*
 * Call ops.dequeue() if the task is in BPF custody and not migrating.
 * Clears %SCX_TASK_IN_CUSTODY when the callback is invoked.
 *
 * Note that %SCX_TASK_IN_CUSTODY is cleared even when @sch doesn't implement
 * ops.dequeue(). Only the callback invocation depends on the op being present.
 */
static void call_task_dequeue(struct scx_sched *sch, struct rq *rq,
			      struct task_struct *p, u64 deq_flags)
{
	if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p))
		return;

	if (SCX_HAS_OP(sch, dequeue))
		SCX_CALL_OP_TASK(sch, dequeue, rq, p, deq_flags);

	p->scx.flags &= ~SCX_TASK_IN_CUSTODY;
}

/*
 * Post-processing after @p has been put on @dsq, which must be a local DSQ as
 * the owning rq is derived from it. Takes @p out of BPF custody and triggers
 * the wakeup-preempt and %SCX_ENQ_PREEMPT handling described below.
 */
static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq,
			       struct task_struct *p, u64 enq_flags)
{
	struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);

	call_task_dequeue(sch, rq, p, 0);

	/*
	 * Note that @rq's lock may be dropped between this enqueue and @p
	 * actually getting on CPU. This gives higher-class tasks (e.g. RT)
	 * an opportunity to wake up on @rq and prevent @p from running.
	 * Here are some concrete examples:
	 *
	 * Example 1:
	 *
	 * We dispatch two tasks from a single ops.dispatch():
	 * - First, a local task to this CPU's local DSQ;
	 * - Second, a local/remote task to a remote CPU's local DSQ.
	 * We must drop the local rq lock in order to finish the second
	 * dispatch. In that time, an RT task can wake up on the local rq.
	 *
	 * Example 2:
	 *
	 * We dispatch a local/remote task to a remote CPU's local DSQ.
	 * We must drop the remote rq lock before the dispatched task can run,
	 * which gives an RT task an opportunity to wake up on the remote rq.
	 *
	 * Both examples work the same if we replace dispatching with moving
	 * the tasks from a user-created DSQ.
	 *
	 * We must detect these wakeups so that we can re-enqueue IMMED tasks
	 * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this
	 * purpose, but for it to be invoked, we must ensure that we bump
	 * @rq->next_class to &ext_sched_class if it's currently idle.
	 *
	 * wakeup_preempt() does the bumping, and since we only invoke it if
	 * @rq->next_class is below &ext_sched_class, it will also
	 * resched_curr(rq).
	 */
	if (sched_class_above(p->sched_class, rq->next_class))
		wakeup_preempt(rq, p, 0);

	/*
	 * If @rq is in balance, the CPU is already vacant and looking for the
	 * next task to run. No need to preempt or trigger resched after moving
	 * @p into its local DSQ.
	 * Note that the wakeup_preempt() above may have already triggered
	 * a resched if @rq->next_class was idle. It's harmless, since
	 * need_resched is cleared immediately after task pick.
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE)
		return;

	/* PREEMPT: expire the running SCX task's slice and force a resched */
	if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
	    rq->curr->sched_class == &ext_sched_class) {
		rq->curr->scx.slice = 0;
		resched_curr(rq);
	}
}

/**
 * dispatch_enqueue - Insert a task into a DSQ
 * @sch: scx_sched the dispatch is performed for
 * @rq: rq passed to ops.dequeue() when @p leaves BPF custody
 * @dsq: destination DSQ
 * @p: task to insert, must not be on any DSQ
 * @enq_flags: %SCX_ENQ_* flags
 *
 * Link @p into @dsq, either vtime-ordered (%SCX_ENQ_DSQ_PRIQ) or FIFO, at the
 * head for %SCX_ENQ_HEAD / %SCX_ENQ_PREEMPT and at the tail otherwise. For a
 * non-local @dsq, @dsq->lock is taken and released here; a local DSQ is
 * modified without taking @dsq->lock. If @dsq turns out to be destroyed, the
 * global DSQ for @p's CPU is used instead. With %SCX_ENQ_CLEAR_OPSS, @p's
 * ops_state is released to %SCX_OPSS_NONE at the end.
 */
static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
			     struct scx_dispatch_q *dsq, struct task_struct *p,
			     u64 enq_flags)
{
	bool is_local = dsq->id == SCX_DSQ_LOCAL;

	WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
	WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) ||
		     !RB_EMPTY_NODE(&p->scx.dsq_priq));

	if (!is_local) {
		raw_spin_lock_nested(&dsq->lock,
			(enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0);

		if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
			scx_error(sch, "attempting to dispatch to a destroyed dsq");
			/* fall back to the global dsq */
			raw_spin_unlock(&dsq->lock);
			dsq = find_global_dsq(sch, task_cpu(p));
			raw_spin_lock(&dsq->lock);
		}
	}

	if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
		     (enq_flags & SCX_ENQ_DSQ_PRIQ))) {
		/*
		 * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from
		 * their FIFO queues. To avoid confusion and accidentally
		 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we
		 * disallow any internal DSQ from doing vtime ordering of
		 * tasks.
		 */
		scx_error(sch, "cannot use vtime ordering for built-in DSQs");
		enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
	}

	if (enq_flags & SCX_ENQ_DSQ_PRIQ) {
		struct rb_node *rbp;

		/*
		 * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are
		 * linked to both the rbtree and list on PRIQs, this can only be
		 * tested easily when adding the first task.
		 */
		if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
			     nldsq_next_task(dsq, NULL, false)))
			scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks",
				  dsq->id);

		p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
		rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);

		/*
		 * Find the previous task and insert after it on the list so
		 * that @dsq->list is vtime ordered.
		 */
		rbp = rb_prev(&p->scx.dsq_priq);
		if (rbp) {
			struct task_struct *prev =
				container_of(rbp, struct task_struct,
					     scx.dsq_priq);
			list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
			/* first task unchanged - no update needed */
		} else {
			list_add(&p->scx.dsq_list.node, &dsq->list);
			/* not builtin and new task is at head - use fastpath */
			rcu_assign_pointer(dsq->first_task, p);
		}
	} else {
		/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
		if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
			scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
				  dsq->id);

		if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) {
			list_add(&p->scx.dsq_list.node, &dsq->list);
			/* new task inserted at head - use fastpath */
			if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
				rcu_assign_pointer(dsq->first_task, p);
		} else {
			/*
			 * dsq->list can contain parked BPF iterator cursors, so
			 * list_empty() here isn't a reliable proxy for "no real
			 * task in the DSQ". Test dsq->first_task directly.
			 */
			list_add_tail(&p->scx.dsq_list.node, &dsq->list);
			if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
				rcu_assign_pointer(dsq->first_task, p);
		}
	}

	/* seq records the order tasks are queued, used by BPF DSQ iterator */
	WRITE_ONCE(dsq->seq, dsq->seq + 1);
	p->scx.dsq_seq = dsq->seq;

	dsq_inc_nr(dsq, p, enq_flags);
	p->scx.dsq = dsq;

	/*
	 * Update custody and call ops.dequeue() before clearing ops_state:
	 * once ops_state is cleared, waiters in ops_dequeue() can proceed
	 * and dequeue_task_scx() will RMW p->scx.flags. If we clear
	 * ops_state first, both sides would modify p->scx.flags
	 * concurrently in a non-atomic way.
	 */
	if (is_local) {
		local_dsq_post_enq(sch, dsq, p, enq_flags);
	} else {
		/*
		 * Task on global/bypass DSQ: leave custody, task on
		 * non-terminal DSQ: enter custody.
		 */
		if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS)
			call_task_dequeue(sch, rq, p, 0);
		else
			p->scx.flags |= SCX_TASK_IN_CUSTODY;

		raw_spin_unlock(&dsq->lock);
	}

	/*
	 * We're transitioning out of QUEUEING or DISPATCHING. store_release to
	 * match waiters' load_acquire.
	 */
	if (enq_flags & SCX_ENQ_CLEAR_OPSS)
		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
}

/*
 * Unlink @p from @dsq's list and, if it's on the priq, the rbtree. Updates the
 * DSQ accounting and, for a non-built-in DSQ whose first_task was @p,
 * republishes first_task. Doesn't clear @p->scx.dsq, which is left to the
 * caller.
 */
static void task_unlink_from_dsq(struct task_struct *p,
				 struct scx_dispatch_q *dsq)
{
	WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));

	if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) {
		rb_erase(&p->scx.dsq_priq, &dsq->priq);
		RB_CLEAR_NODE(&p->scx.dsq_priq);
		p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ;
	}

	list_del_init(&p->scx.dsq_list.node);
	dsq_dec_nr(dsq, p);

	if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
		struct task_struct *first_task;

		first_task = nldsq_next_task(dsq, NULL, false);
		rcu_assign_pointer(dsq->first_task, first_task);
	}
}

/*
 * Remove @p from whichever DSQ it's on, if any. @rq must be locked. If the DSQ
 * is not @rq's local DSQ, its lock is taken for the duration of the removal.
 * Also cancels a pending deferred direct dispatch and resolves the holding_cpu
 * race with dispatch_to_local_dsq(), see the comments below.
 */
static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
{
	struct scx_dispatch_q *dsq = p->scx.dsq;
	bool is_local = dsq == &rq->scx.local_dsq;

	lockdep_assert_rq_held(rq);

	if (!dsq) {
		/*
		 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
		 * Unlinking is all that's needed to cancel.
		 */
		if (unlikely(!list_empty(&p->scx.dsq_list.node)))
			list_del_init(&p->scx.dsq_list.node);

		/*
		 * When dispatching directly from the BPF scheduler to a local
		 * DSQ, the task isn't associated with any DSQ but
		 * @p->scx.holding_cpu may be set under the protection of
		 * %SCX_OPSS_DISPATCHING.
		 */
		if (p->scx.holding_cpu >= 0)
			p->scx.holding_cpu = -1;

		return;
	}

	if (!is_local)
		raw_spin_lock(&dsq->lock);

	/*
	 * Now that we hold @dsq->lock, @p->scx.holding_cpu and @p->scx.dsq_*
	 * can't change underneath us.
	 */
	if (p->scx.holding_cpu < 0) {
		/* @p must still be on @dsq, dequeue */
		task_unlink_from_dsq(p, dsq);
	} else {
		/*
		 * We're racing against dispatch_to_local_dsq() which already
		 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
		 * holding_cpu which tells dispatch_to_local_dsq() that it lost
		 * the race.
		 */
		WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
		p->scx.holding_cpu = -1;
	}
	p->scx.dsq = NULL;

	if (!is_local)
		raw_spin_unlock(&dsq->lock);
}

/*
 * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq
 * and dsq are locked.
1724 */ 1725 static void dispatch_dequeue_locked(struct task_struct *p, 1726 struct scx_dispatch_q *dsq) 1727 { 1728 lockdep_assert_rq_held(task_rq(p)); 1729 lockdep_assert_held(&dsq->lock); 1730 1731 task_unlink_from_dsq(p, dsq); 1732 p->scx.dsq = NULL; 1733 } 1734 1735 static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, 1736 struct rq *rq, u64 dsq_id, 1737 s32 tcpu) 1738 { 1739 struct scx_dispatch_q *dsq; 1740 1741 if (dsq_id == SCX_DSQ_LOCAL) 1742 return &rq->scx.local_dsq; 1743 1744 if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 1745 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 1746 1747 if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) 1748 return find_global_dsq(sch, tcpu); 1749 1750 return &cpu_rq(cpu)->scx.local_dsq; 1751 } 1752 1753 if (dsq_id == SCX_DSQ_GLOBAL) 1754 dsq = find_global_dsq(sch, tcpu); 1755 else 1756 dsq = find_user_dsq(sch, dsq_id); 1757 1758 if (unlikely(!dsq)) { 1759 scx_error(sch, "non-existent DSQ 0x%llx", dsq_id); 1760 return find_global_dsq(sch, tcpu); 1761 } 1762 1763 return dsq; 1764 } 1765 1766 static void mark_direct_dispatch(struct scx_sched *sch, 1767 struct task_struct *ddsp_task, 1768 struct task_struct *p, u64 dsq_id, 1769 u64 enq_flags) 1770 { 1771 /* 1772 * Mark that dispatch already happened from ops.select_cpu() or 1773 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value 1774 * which can never match a valid task pointer. 
1775 */ 1776 __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); 1777 1778 /* @p must match the task on the enqueue path */ 1779 if (unlikely(p != ddsp_task)) { 1780 if (IS_ERR(ddsp_task)) 1781 scx_error(sch, "%s[%d] already direct-dispatched", 1782 p->comm, p->pid); 1783 else 1784 scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]", 1785 ddsp_task->comm, ddsp_task->pid, 1786 p->comm, p->pid); 1787 return; 1788 } 1789 1790 WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); 1791 WARN_ON_ONCE(p->scx.ddsp_enq_flags); 1792 1793 p->scx.ddsp_dsq_id = dsq_id; 1794 p->scx.ddsp_enq_flags = enq_flags; 1795 } 1796 1797 /* 1798 * Clear @p direct dispatch state when leaving the scheduler. 1799 * 1800 * Direct dispatch state must be cleared in the following cases: 1801 * - direct_dispatch(): cleared on the synchronous enqueue path, deferred 1802 * dispatch keeps the state until consumed 1803 * - process_ddsp_deferred_locals(): cleared after consuming deferred state, 1804 * - do_enqueue_task(): cleared on enqueue fallbacks where the dispatch 1805 * verdict is ignored (local/global/bypass) 1806 * - dequeue_task_scx(): cleared after dispatch_dequeue(), covering deferred 1807 * cancellation and holding_cpu races 1808 * - scx_disable_task(): cleared for queued wakeup tasks, which are excluded by 1809 * the scx_bypass() loop, so that stale state is not reused by a subsequent 1810 * scheduler instance 1811 */ 1812 static inline void clear_direct_dispatch(struct task_struct *p) 1813 { 1814 p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; 1815 p->scx.ddsp_enq_flags = 0; 1816 } 1817 1818 static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, 1819 u64 enq_flags) 1820 { 1821 struct rq *rq = task_rq(p); 1822 struct scx_dispatch_q *dsq = 1823 find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p)); 1824 u64 ddsp_enq_flags; 1825 1826 touch_core_sched_dispatch(rq, p); 1827 1828 p->scx.ddsp_enq_flags |= enq_flags; 1829 1830 /* 1831 * We are in the 
enqueue path with @rq locked and pinned, and thus can't 1832 * double lock a remote rq and enqueue to its local DSQ. For 1833 * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer 1834 * the enqueue so that it's executed when @rq can be unlocked. 1835 */ 1836 if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { 1837 unsigned long opss; 1838 1839 opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; 1840 1841 switch (opss & SCX_OPSS_STATE_MASK) { 1842 case SCX_OPSS_NONE: 1843 break; 1844 case SCX_OPSS_QUEUEING: 1845 /* 1846 * As @p was never passed to the BPF side, _release is 1847 * not strictly necessary. Still do it for consistency. 1848 */ 1849 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1850 break; 1851 default: 1852 WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()", 1853 p->comm, p->pid, opss); 1854 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1855 break; 1856 } 1857 1858 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); 1859 list_add_tail(&p->scx.dsq_list.node, 1860 &rq->scx.ddsp_deferred_locals); 1861 schedule_deferred_locked(rq); 1862 return; 1863 } 1864 1865 ddsp_enq_flags = p->scx.ddsp_enq_flags; 1866 clear_direct_dispatch(p); 1867 1868 dispatch_enqueue(sch, rq, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); 1869 } 1870 1871 static bool scx_rq_online(struct rq *rq) 1872 { 1873 /* 1874 * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates 1875 * the online state as seen from the BPF scheduler. cpu_active() test 1876 * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will 1877 * stay set until the current scheduling operation is complete even if 1878 * we aren't locking @rq. 
/*
 * do_enqueue_task - pick a destination for a newly queued task and put it there
 * @rq: rq @p is being enqueued on
 * @p: task to enqueue; %SCX_TASK_QUEUED is expected to be set already
 * @enq_flags: %SCX_ENQ_* flags, passed on to ops.enqueue() and the DSQ helpers
 * @sticky_cpu: if equal to @rq's CPU, @p is an internal movement and goes
 *		straight to @rq's local DSQ
 *
 * The destination is decided in the following order:
 *
 * - @sticky_cpu matches @rq: @rq's local DSQ, without refilling the slice or
 *   clearing the direct dispatch state (local_norefill).
 * - @rq is not online: @rq's local DSQ.
 * - The scheduler is bypassing on @rq's CPU: the DSQ returned by
 *   bypass_enq_target_dsq().
 * - @p->scx.ddsp_dsq_id is already set: direct_dispatch().
 * - @p is exiting or migration-disabled and the matching %SCX_OPS_ENQ_* flag
 *   is not set: @rq's local DSQ.
 * - ops.enqueue() is not implemented: the DSQ returned by find_global_dsq().
 * - Otherwise ops.enqueue() is called with ops_state set to QUEUEING. If it
 *   direct-dispatched @p, direct_dispatch() finishes the job. If not, @p is
 *   marked %SCX_TASK_IN_CUSTODY and ops_state is published as QUEUED.
 *
 * The local, global and bypass paths all share the "enqueue" tail below.
 */
static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
			    int sticky_cpu)
{
	struct scx_sched *sch = scx_task_sched(p);
	struct task_struct **ddsp_taskp;
	struct scx_dispatch_q *dsq;
	unsigned long qseq;

	WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));

	/* internal movements - rq migration / RESTORE */
	if (sticky_cpu == cpu_of(rq))
		goto local_norefill;

	/*
	 * Clear persistent TASK_IMMED for fresh enqueues, see dsq_inc_nr().
	 * Note that exiting and migration-disabled tasks that skip
	 * ops.enqueue() below will lose IMMED protection unless
	 * %SCX_OPS_ENQ_EXITING / %SCX_OPS_ENQ_MIGRATION_DISABLED are set.
	 */
	p->scx.flags &= ~SCX_TASK_IMMED;

	/*
	 * If !scx_rq_online(), we already told the BPF scheduler that the CPU
	 * is offline and are just running the hotplug path. Don't bother the
	 * BPF scheduler.
	 */
	if (!scx_rq_online(rq))
		goto local;

	if (scx_bypassing(sch, cpu_of(rq))) {
		__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
		goto bypass;
	}

	/* a direct dispatch target was already recorded for @p */
	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
		goto direct;

	/* see %SCX_OPS_ENQ_EXITING */
	if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) &&
	    unlikely(p->flags & PF_EXITING)) {
		__scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
		goto local;
	}

	/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
	if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
	    is_migration_disabled(p)) {
		__scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
		goto local;
	}

	if (unlikely(!SCX_HAS_OP(sch, enqueue)))
		goto global;

	/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
	qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;

	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);

	/* publish @p as this CPU's direct dispatch task while ops.enqueue() runs */
	ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
	WARN_ON_ONCE(*ddsp_taskp);
	*ddsp_taskp = p;

	SCX_CALL_OP_TASK(sch, enqueue, rq, p, enq_flags);

	*ddsp_taskp = NULL;
	/* ops.enqueue() recorded a direct dispatch target for @p */
	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
		goto direct;

	/*
	 * Task is now in BPF scheduler's custody. Set %SCX_TASK_IN_CUSTODY
	 * so ops.dequeue() is called when it leaves custody.
	 */
	p->scx.flags |= SCX_TASK_IN_CUSTODY;

	/*
	 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or
	 * dequeue may be waiting. The store_release matches their load_acquire.
	 */
	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
	return;

direct:
	direct_dispatch(sch, p, enq_flags);
	return;
local_norefill:
	/* internal movement: no slice refill and no direct dispatch reset */
	dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags);
	return;
local:
	dsq = &rq->scx.local_dsq;
	goto enqueue;
global:
	dsq = find_global_dsq(sch, task_cpu(p));
	goto enqueue;
bypass:
	dsq = bypass_enq_target_dsq(sch, task_cpu(p));
	goto enqueue;

enqueue:
	/*
	 * For task-ordering, slice refill must be treated as implying the end
	 * of the current slice. Otherwise, the longer @p stays on the CPU, the
	 * higher priority it becomes from scx_prio_less()'s POV.
	 */
	touch_core_sched(rq, p);
	refill_task_slice_dfl(sch, p);
	clear_direct_dispatch(p);
	dispatch_enqueue(sch, rq, dsq, p, enq_flags);
}

/*
 * Return %true if @p is currently linked through its scx.runnable_node, i.e.
 * the node is not an empty list head.
 */
static bool task_runnable(const struct task_struct *p)
{
	return !list_empty(&p->scx.runnable_node);
}
/*
 * Append @p to @rq's runnable_list. @rq must be locked.
 *
 * If %SCX_TASK_RESET_RUNNABLE_AT is pending on @p, runnable_at is stamped with
 * the current jiffies and the flag is cleared before linking.
 */
static void set_task_runnable(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

	if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
		p->scx.runnable_at = jiffies;
		p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
	}

	/*
	 * list_add_tail() must be used. scx_bypass() depends on tasks being
	 * appended to the runnable_list.
	 */
	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
}

/*
 * Unlink @p from the runnable list it is on and reinitialize its node. If
 * @reset_runnable_at is set, flag @p so that the next set_task_runnable()
 * refreshes runnable_at.
 */
static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
{
	list_del_init(&p->scx.runnable_node);
	if (reset_runnable_at)
		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
}

/*
 * Enqueue @p on @rq.
 *
 * @core_enq_flags are the flags handed in by the caller. They are combined
 * with @rq->scx.extra_enq_flags into a 64bit enq_flags which is what the rest
 * of the function and the BPF scheduler see.
 *
 * If @p is already %SCX_TASK_QUEUED, only the common tail at "out" runs.
 * Otherwise @p is made runnable and accounted, ops.runnable() is called unless
 * @p is migrating between rqs, the dl_server is started for the first running
 * task and do_enqueue_task() picks the destination.
 */
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags)
{
	struct scx_sched *sch = scx_task_sched(p);
	int sticky_cpu = p->scx.sticky_cpu;
	u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags;

	/* cleared again at "out" once the enqueue is done */
	if (enq_flags & ENQUEUE_WAKEUP)
		rq->scx.flags |= SCX_RQ_IN_WAKEUP;

	/*
	 * Restoring a running task will be immediately followed by
	 * set_next_task_scx() which expects the task to not be on the BPF
	 * scheduler as tasks can only start running through local DSQs. Force
	 * direct-dispatch into the local DSQ by setting the sticky_cpu.
	 */
	if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
		sticky_cpu = cpu_of(rq);

	/* already queued, nothing to enqueue */
	if (p->scx.flags & SCX_TASK_QUEUED) {
		WARN_ON_ONCE(!task_runnable(p));
		goto out;
	}

	set_task_runnable(rq, p);
	p->scx.flags |= SCX_TASK_QUEUED;
	rq->scx.nr_running++;
	add_nr_running(rq, 1);

	if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
		SCX_CALL_OP_TASK(sch, runnable, rq, p, enq_flags);

	if (enq_flags & SCX_ENQ_WAKEUP)
		touch_core_sched(rq, p);

	/* Start dl_server if this is the first task being enqueued */
	if (rq->scx.nr_running == 1)
		dl_server_start(&rq->ext_server);

	do_enqueue_task(rq, p, enq_flags, sticky_cpu);

	/* the sticky CPU has been consumed by do_enqueue_task(), reset it */
	if (sticky_cpu >= 0)
		p->scx.sticky_cpu = -1;
out:
	rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;

	/* a CPU was selected for @p but it is being enqueued on a different one */
	if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
	    unlikely(cpu_of(rq) != p->scx.selected_cpu))
		__scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1);
}
rq *rq, struct task_struct *p, u64 deq_flags) 2074 { 2075 struct scx_sched *sch = scx_task_sched(p); 2076 unsigned long opss; 2077 2078 /* dequeue is always temporary, don't reset runnable_at */ 2079 clr_task_runnable(p, false); 2080 2081 retry: 2082 /* acquire ensures that we see the preceding updates on QUEUED */ 2083 opss = atomic_long_read_acquire(&p->scx.ops_state); 2084 2085 switch (opss & SCX_OPSS_STATE_MASK) { 2086 case SCX_OPSS_NONE: 2087 break; 2088 case SCX_OPSS_QUEUEING: 2089 /* 2090 * QUEUEING is started and finished while holding @p's rq lock. 2091 * As we're holding the rq lock now, we shouldn't see QUEUEING. 2092 */ 2093 BUG(); 2094 case SCX_OPSS_QUEUED: 2095 /* 2096 * A queued task must always be in BPF scheduler's custody. If 2097 * SCX_TASK_IN_CUSTODY is clear, finish_dispatch() on another 2098 * CPU has already passed call_task_dequeue() (which clears the 2099 * flag), but has not yet written SCX_OPSS_NONE. That final 2100 * store does not require this rq's lock, so retrying with 2101 * cpu_relax() is bounded: we will observe NONE (or DISPATCHING, 2102 * handled by the fallthrough) on a subsequent iteration. 2103 */ 2104 if (unlikely(!(READ_ONCE(p->scx.flags) & SCX_TASK_IN_CUSTODY))) { 2105 cpu_relax(); 2106 goto retry; 2107 } 2108 2109 if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 2110 SCX_OPSS_NONE)) 2111 break; 2112 fallthrough; 2113 case SCX_OPSS_DISPATCHING: 2114 /* 2115 * If @p is being dispatched from the BPF scheduler to a DSQ, 2116 * wait for the transfer to complete so that @p doesn't get 2117 * added to its DSQ after dequeueing is complete. 2118 * 2119 * As we're waiting on DISPATCHING with the rq locked, the 2120 * dispatching side shouldn't try to lock the rq while 2121 * DISPATCHING is set. See dispatch_to_local_dsq(). 2122 * 2123 * DISPATCHING shouldn't have qseq set and control can reach 2124 * here with NONE @opss from the above QUEUED case block. 2125 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 
2126 */ 2127 wait_ops_state(p, SCX_OPSS_DISPATCHING); 2128 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 2129 break; 2130 } 2131 2132 /* 2133 * Call ops.dequeue() if the task is still in BPF custody. 2134 * 2135 * The code that clears ops_state to %SCX_OPSS_NONE does not always 2136 * clear %SCX_TASK_IN_CUSTODY: in dispatch_to_local_dsq(), when 2137 * we're moving a task that was in %SCX_OPSS_DISPATCHING to a 2138 * remote CPU's local DSQ, we only set ops_state to %SCX_OPSS_NONE 2139 * so that a concurrent dequeue can proceed, but we clear 2140 * %SCX_TASK_IN_CUSTODY only when we later enqueue or move the 2141 * task. So we can see NONE + IN_CUSTODY here and we must handle 2142 * it. Similarly, after waiting on %SCX_OPSS_DISPATCHING we see 2143 * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until 2144 * it is enqueued on the destination. 2145 */ 2146 call_task_dequeue(sch, rq, p, deq_flags); 2147 } 2148 2149 static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int core_deq_flags) 2150 { 2151 struct scx_sched *sch = scx_task_sched(p); 2152 u64 deq_flags = core_deq_flags; 2153 2154 /* 2155 * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property 2156 * change (not sleep or core-sched pick). 2157 */ 2158 if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC))) 2159 deq_flags |= SCX_DEQ_SCHED_CHANGE; 2160 2161 if (!(p->scx.flags & SCX_TASK_QUEUED)) { 2162 WARN_ON_ONCE(task_runnable(p)); 2163 return true; 2164 } 2165 2166 ops_dequeue(rq, p, deq_flags); 2167 2168 /* 2169 * A currently running task which is going off @rq first gets dequeued 2170 * and then stops running. As we want running <-> stopping transitions 2171 * to be contained within runnable <-> quiescent transitions, trigger 2172 * ->stopping() early here instead of in put_prev_task_scx(). 
/*
 * Dequeue @p from @rq.
 *
 * @core_deq_flags are widened into a 64bit deq_flags, with
 * %SCX_DEQ_SCHED_CHANGE added when the dequeue is neither a sleep nor a
 * core-sched pick. If @p isn't %SCX_TASK_QUEUED, there is nothing to undo.
 * Otherwise @p is taken back from the BPF scheduler, ops.stopping() and
 * ops.quiescent() are delivered as applicable, the accounting done at enqueue
 * is reversed and @p is removed from whichever DSQ it is on.
 *
 * Always returns %true.
 */
static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int core_deq_flags)
{
	struct scx_sched *sch = scx_task_sched(p);
	u64 deq_flags = core_deq_flags;

	/*
	 * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property
	 * change (not sleep or core-sched pick).
	 */
	if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC)))
		deq_flags |= SCX_DEQ_SCHED_CHANGE;

	if (!(p->scx.flags & SCX_TASK_QUEUED)) {
		WARN_ON_ONCE(task_runnable(p));
		return true;
	}

	ops_dequeue(rq, p, deq_flags);

	/*
	 * A currently running task which is going off @rq first gets dequeued
	 * and then stops running. As we want running <-> stopping transitions
	 * to be contained within runnable <-> quiescent transitions, trigger
	 * ->stopping() early here instead of in put_prev_task_scx().
	 *
	 * @p may go through multiple stopping <-> running transitions between
	 * here and put_prev_task_scx() if task attribute changes occur while
	 * balance_one() leaves @rq unlocked. However, they don't contain any
	 * information meaningful to the BPF scheduler and can be suppressed by
	 * skipping the callbacks if the task is !QUEUED.
	 */
	if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) {
		update_curr_scx(rq);
		SCX_CALL_OP_TASK(sch, stopping, rq, p, false);
	}

	if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
		SCX_CALL_OP_TASK(sch, quiescent, rq, p, deq_flags);

	/* remember whether this dequeue was for a sleep */
	if (deq_flags & SCX_DEQ_SLEEP)
		p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
	else
		p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;

	p->scx.flags &= ~SCX_TASK_QUEUED;
	rq->scx.nr_running--;
	sub_nr_running(rq, 1);

	dispatch_dequeue(rq, p);
	clear_direct_dispatch(p);
	return true;
}

/*
 * Yield on behalf of @rq->donor. If the task's scheduler implements
 * ops.yield(), it is called with a %NULL target and its return value is
 * ignored. Otherwise, the task's slice is zeroed.
 */
static void yield_task_scx(struct rq *rq)
{
	struct task_struct *p = rq->donor;
	struct scx_sched *sch = scx_task_sched(p);

	if (SCX_HAS_OP(sch, yield))
		SCX_CALL_OP_2TASKS_RET(sch, yield, rq, p, NULL);
	else
		p->scx.slice = 0;
}

/*
 * Yield from @rq->donor to @to. ops.yield() is only consulted when it is
 * implemented and both tasks belong to the same scheduler instance, in which
 * case its return value is returned. Returns %false otherwise.
 */
static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
{
	struct task_struct *from = rq->donor;
	struct scx_sched *sch = scx_task_sched(from);

	if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to))
		return SCX_CALL_OP_2TASKS_RET(sch, yield, rq, from, to);
	else
		return false;
}
2230 */ 2231 if (p->sched_class == &ext_sched_class) 2232 return; 2233 2234 /* 2235 * Getting preempted by a higher-priority class. Reenqueue IMMED tasks. 2236 * This captures all preemption cases including: 2237 * 2238 * - A SCX task is currently running. 2239 * 2240 * - @rq is waking from idle due to a SCX task waking to it. 2241 * 2242 * - A higher-priority wakes up while SCX dispatch is in progress. 2243 */ 2244 if (rq->scx.nr_immed) 2245 schedule_reenq_local(rq, 0); 2246 } 2247 2248 static void move_local_task_to_local_dsq(struct scx_sched *sch, 2249 struct task_struct *p, u64 enq_flags, 2250 struct scx_dispatch_q *src_dsq, 2251 struct rq *dst_rq) 2252 { 2253 struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq; 2254 2255 /* @dsq is locked and @p is on @dst_rq */ 2256 lockdep_assert_held(&src_dsq->lock); 2257 lockdep_assert_rq_held(dst_rq); 2258 2259 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 2260 2261 if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) 2262 list_add(&p->scx.dsq_list.node, &dst_dsq->list); 2263 else 2264 list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); 2265 2266 dsq_inc_nr(dst_dsq, p, enq_flags); 2267 p->scx.dsq = dst_dsq; 2268 2269 local_dsq_post_enq(sch, dst_dsq, p, enq_flags); 2270 } 2271 2272 /** 2273 * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ 2274 * @p: task to move 2275 * @enq_flags: %SCX_ENQ_* 2276 * @src_rq: rq to move the task from, locked on entry, released on return 2277 * @dst_rq: rq to move the task into, locked on return 2278 * 2279 * Move @p which is currently on @src_rq to @dst_rq's local DSQ. 2280 */ 2281 static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, 2282 struct rq *src_rq, struct rq *dst_rq) 2283 { 2284 lockdep_assert_rq_held(src_rq); 2285 2286 /* 2287 * Set sticky_cpu before deactivate_task() to properly mark the 2288 * beginning of an SCX-internal migration. 
/**
 * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
 * @p: task to move
 * @enq_flags: %SCX_ENQ_*
 * @src_rq: rq to move the task from, locked on entry, released on return
 * @dst_rq: rq to move the task into, locked on return
 *
 * Move @p which is currently on @src_rq to @dst_rq's local DSQ. @p is
 * deactivated on @src_rq, retargeted to @dst_rq's CPU and activated again on
 * @dst_rq with the rq lock handed over in between.
 */
static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
					  struct rq *src_rq, struct rq *dst_rq)
{
	lockdep_assert_rq_held(src_rq);

	/*
	 * Set sticky_cpu before deactivate_task() to properly mark the
	 * beginning of an SCX-internal migration.
	 */
	p->scx.sticky_cpu = cpu_of(dst_rq);
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, cpu_of(dst_rq));

	/* hand over from @src_rq's lock to @dst_rq's */
	raw_spin_rq_unlock(src_rq);
	raw_spin_rq_lock(dst_rq);

	/*
	 * We want to pass scx-specific enq_flags but activate_task() will
	 * truncate the upper 32 bit. As we own @dst_rq, we can pass them
	 * through @dst_rq->scx.extra_enq_flags instead.
	 */
	WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr));
	WARN_ON_ONCE(dst_rq->scx.extra_enq_flags);
	dst_rq->scx.extra_enq_flags = enq_flags;
	activate_task(dst_rq, p, 0);
	dst_rq->scx.extra_enq_flags = 0;
}
However, put_prev_task_scx() is called before @p->cpus_ptr is 2340 * updated and thus another CPU may see @p on a DSQ inbetween leading to 2341 * @p passing the below task_allowed_on_cpu() check while migration is 2342 * disabled. 2343 * 2344 * Test the migration disabled state first as the race window is narrow 2345 * and the BPF scheduler failing to check migration disabled state can 2346 * easily be masked if task_allowed_on_cpu() is done first. 2347 */ 2348 if (unlikely(is_migration_disabled(p))) { 2349 if (enforce) 2350 scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", 2351 p->comm, p->pid, task_cpu(p), cpu); 2352 return false; 2353 } 2354 2355 /* 2356 * We don't require the BPF scheduler to avoid dispatching to offline 2357 * CPUs mostly for convenience but also because CPUs can go offline 2358 * between scx_bpf_dsq_insert() calls and here. Trigger error iff the 2359 * picked CPU is outside the allowed mask. 2360 */ 2361 if (!task_allowed_on_cpu(p, cpu)) { 2362 if (enforce) 2363 scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", 2364 cpu, p->comm, p->pid); 2365 return false; 2366 } 2367 2368 if (!scx_rq_online(rq)) { 2369 if (enforce) 2370 __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); 2371 return false; 2372 } 2373 2374 return true; 2375 } 2376 2377 /** 2378 * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq 2379 * @p: target task 2380 * @dsq: locked DSQ @p is currently on 2381 * @src_rq: rq @p is currently on, stable with @dsq locked 2382 * 2383 * Called with @dsq locked but no rq's locked. We want to move @p to a different 2384 * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is 2385 * required when transferring into a local DSQ. 
/**
 * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
 * @p: target task
 * @dsq: locked DSQ @p is currently on
 * @src_rq: rq @p is currently on, stable with @dsq locked
 *
 * Called with @dsq locked but no rq's locked. We want to move @p to a different
 * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is
 * required when transferring into a local DSQ. Even when transferring into a
 * non-local DSQ, it's better to use the same mechanism to protect against
 * dequeues and maintain the invariant that @p->scx.dsq can only change while
 * @src_rq is locked, which e.g. scx_dump_task() depends on.
 *
 * We want to grab @src_rq but that can deadlock if we try while locking @dsq,
 * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As
 * this may race with dequeue, which can't drop the rq lock or fail, do a little
 * dancing from our side.
 *
 * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets
 * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu
 * would be cleared to -1. While other cpus may have updated it to different
 * values afterwards, as this operation can't be preempted or recurse, the
 * holding_cpu can never become this CPU again before we're done. Thus, we can
 * tell whether we lost to dequeue by testing whether the holding_cpu still
 * points to this CPU. See dispatch_dequeue() for the counterpart.
 *
 * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is
 * still valid. %false if lost to dequeue.
 */
static bool unlink_dsq_and_lock_src_rq(struct task_struct *p,
				       struct scx_dispatch_q *dsq,
				       struct rq *src_rq)
{
	s32 cpu = raw_smp_processor_id();

	lockdep_assert_held(&dsq->lock);

	/* nobody else may be in the middle of transferring @p */
	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
	task_unlink_from_dsq(p, dsq);
	p->scx.holding_cpu = cpu;

	raw_spin_unlock(&dsq->lock);
	raw_spin_rq_lock(src_rq);

	/* task_rq couldn't have changed if we're still the holding cpu */
	return likely(p->scx.holding_cpu == cpu) &&
		!WARN_ON_ONCE(src_rq != task_rq(p));
}

/*
 * Try to pull @p, which sits on @dsq and belongs to @src_rq, into @this_rq's
 * local DSQ.
 *
 * Called with @this_rq and @dsq locked. @this_rq is unlocked first so that
 * @src_rq can be locked through unlink_dsq_and_lock_src_rq(), which also
 * releases @dsq. On success @p is migrated with
 * move_remote_task_to_local_dsq(). If the race against dequeue was lost,
 * @src_rq is released and @this_rq is locked again.
 *
 * Returns %true if @p was moved, %false otherwise. Either way, @dsq is
 * unlocked and @this_rq is locked on return.
 */
static bool consume_remote_task(struct rq *this_rq,
				struct task_struct *p, u64 enq_flags,
				struct scx_dispatch_q *dsq, struct rq *src_rq)
{
	raw_spin_rq_unlock(this_rq);

	if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) {
		move_remote_task_to_local_dsq(p, enq_flags, src_rq, this_rq);
		return true;
	} else {
		raw_spin_rq_unlock(src_rq);
		raw_spin_rq_lock(this_rq);
		return false;
	}
}
/**
 * move_task_between_dsqs() - Move a task from one DSQ to another
 * @sch: scx_sched being operated on
 * @p: target task
 * @enq_flags: %SCX_ENQ_*
 * @src_dsq: DSQ @p is currently on, must not be a local DSQ
 * @dst_dsq: DSQ @p is being moved to, can be any DSQ
 *
 * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local
 * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq
 * will change. As @p's task_rq is locked, this function doesn't need to use the
 * holding_cpu mechanism.
 *
 * If @dst_dsq is the local DSQ of a CPU that @p can't be moved to, @p goes to
 * the global DSQ returned by find_global_dsq() instead and
 * %SCX_ENQ_GDSQ_FALLBACK is added to @enq_flags.
 *
 * On return, @src_dsq is unlocked and only @p's new task_rq, which is the
 * return value, is locked.
 */
static struct rq *move_task_between_dsqs(struct scx_sched *sch,
					 struct task_struct *p, u64 enq_flags,
					 struct scx_dispatch_q *src_dsq,
					 struct scx_dispatch_q *dst_dsq)
{
	struct rq *src_rq = task_rq(p), *dst_rq;

	BUG_ON(src_dsq->id == SCX_DSQ_LOCAL);
	lockdep_assert_held(&src_dsq->lock);
	lockdep_assert_rq_held(src_rq);

	if (dst_dsq->id == SCX_DSQ_LOCAL) {
		dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
		/* @p can't go to the remote rq, fall back to a global DSQ */
		if (src_rq != dst_rq &&
		    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
			dst_dsq = find_global_dsq(sch, task_cpu(p));
			dst_rq = src_rq;
			enq_flags |= SCX_ENQ_GDSQ_FALLBACK;
		}
	} else {
		/* no need to migrate if destination is a non-local DSQ */
		dst_rq = src_rq;
	}

	/*
	 * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
	 * CPU, @p will be migrated.
	 */
	if (dst_dsq->id == SCX_DSQ_LOCAL) {
		/* @p is going from a non-local DSQ to a local DSQ */
		if (src_rq == dst_rq) {
			task_unlink_from_dsq(p, src_dsq);
			move_local_task_to_local_dsq(sch, p, enq_flags,
						     src_dsq, dst_rq);
			raw_spin_unlock(&src_dsq->lock);
		} else {
			/* $src_dsq's lock is dropped before the migration */
			raw_spin_unlock(&src_dsq->lock);
			move_remote_task_to_local_dsq(p, enq_flags,
						      src_rq, dst_rq);
		}
	} else {
		/*
		 * @p is going from a non-local DSQ to a non-local DSQ. As
		 * $src_dsq is already locked, do an abbreviated dequeue.
		 */
		dispatch_dequeue_locked(p, src_dsq);
		raw_spin_unlock(&src_dsq->lock);

		dispatch_enqueue(sch, dst_rq, dst_dsq, p, enq_flags);
	}

	return dst_rq;
}
/*
 * Try to move one task from @dsq into @rq's local DSQ.
 *
 * @dsq is walked under its lock. A task which is already on @rq is moved
 * directly. A task on another rq is pulled over with consume_remote_task() if
 * task_can_run_on_remote_rq() allows it. consume_remote_task() returns with
 * @dsq unlocked, so both its success and the retry after a lost race leave the
 * loop without unlocking.
 *
 * Returns %true if a task was moved, %false if @dsq looked empty, no task was
 * eligible, or the walk was cut short because @sch is aborting.
 */
static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
			       struct scx_dispatch_q *dsq, u64 enq_flags)
{
	struct task_struct *p;
retry:
	/*
	 * The caller can't expect to successfully consume a task if the task's
	 * addition to @dsq isn't guaranteed to be visible somehow. Test
	 * @dsq->list without locking and skip if it seems empty.
	 */
	if (list_empty(&dsq->list))
		return false;

	raw_spin_lock(&dsq->lock);

	nldsq_for_each_task(p, dsq) {
		struct rq *task_rq = task_rq(p);

		/*
		 * This loop can lead to multiple lockup scenarios, e.g. the BPF
		 * scheduler can put an enormous number of affinitized tasks into
		 * a contended DSQ, or the outer retry loop can repeatedly race
		 * against scx_bypass() dequeueing tasks from @dsq trying to put
		 * the system into the bypass mode. This can easily live-lock the
		 * machine. If aborting, exit from all non-bypass DSQs.
		 */
		if (unlikely(READ_ONCE(sch->aborting)) && dsq->id != SCX_DSQ_BYPASS)
			break;

		if (rq == task_rq) {
			task_unlink_from_dsq(p, dsq);
			move_local_task_to_local_dsq(sch, p, enq_flags, dsq, rq);
			raw_spin_unlock(&dsq->lock);
			return true;
		}

		if (task_can_run_on_remote_rq(sch, p, rq, false)) {
			if (likely(consume_remote_task(rq, p, enq_flags, dsq, task_rq)))
				return true;
			/* lost to dequeue, start over */
			goto retry;
		}
	}

	raw_spin_unlock(&dsq->lock);
	return false;
}

/*
 * Try to consume a task from the global DSQ of the node which @rq's CPU
 * belongs to. Returns whether a task was moved to @rq's local DSQ.
 */
static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
{
	int node = cpu_to_node(cpu_of(rq));

	return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq, 0);
}
/**
 * dispatch_to_local_dsq - Dispatch a task to a local dsq
 * @sch: scx_sched being operated on
 * @rq: current rq which is locked
 * @dst_dsq: destination DSQ
 * @p: task to dispatch
 * @enq_flags: %SCX_ENQ_*
 *
 * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
 * DSQ. This function performs all the synchronization dancing needed because
 * local DSQs are protected with rq locks.
 *
 * The caller must have exclusive ownership of @p (e.g. through
 * %SCX_OPSS_DISPATCHING).
 *
 * @rq may be unlocked and relocked in the process. It is locked again on
 * return.
 */
static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
				  struct scx_dispatch_q *dst_dsq,
				  struct task_struct *p, u64 enq_flags)
{
	struct rq *src_rq = task_rq(p);
	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
	struct rq *locked_rq = rq;

	/*
	 * We're synchronized against dequeue through DISPATCHING. As @p can't
	 * be dequeued, its task_rq and cpus_allowed are stable too.
	 *
	 * If dispatching to @rq that @p is already on, no lock dancing needed.
	 */
	if (rq == src_rq && rq == dst_rq) {
		dispatch_enqueue(sch, rq, dst_dsq, p,
				 enq_flags | SCX_ENQ_CLEAR_OPSS);
		return;
	}

	/* @p can't be moved to @dst_rq, fall back to a global DSQ */
	if (src_rq != dst_rq &&
	    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
		dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p,
				 enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK);
		return;
	}

	/*
	 * @p is on a possibly remote @src_rq which we need to lock to move the
	 * task. If dequeue is in progress, it'd be locking @src_rq and waiting
	 * on DISPATCHING, so we can't grab @src_rq lock while holding
	 * DISPATCHING.
	 *
	 * As DISPATCHING guarantees that @p is wholly ours, we can pretend that
	 * we're moving from a DSQ and use the same mechanism - mark the task
	 * under transfer with holding_cpu, release DISPATCHING and then follow
	 * the same protocol. See unlink_dsq_and_lock_src_rq().
	 */
	p->scx.holding_cpu = raw_smp_processor_id();

	/* store_release ensures that dequeue sees the above */
	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);

	/* switch to @src_rq lock */
	if (locked_rq != src_rq) {
		raw_spin_rq_unlock(locked_rq);
		locked_rq = src_rq;
		raw_spin_rq_lock(src_rq);
	}

	/* task_rq couldn't have changed if we're still the holding cpu */
	if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
	    !WARN_ON_ONCE(src_rq != task_rq(p))) {
		/*
		 * If @p is staying on the same rq, there's no need to go
		 * through the full deactivate/activate cycle. Optimize by
		 * abbreviating move_remote_task_to_local_dsq().
		 */
		if (src_rq == dst_rq) {
			p->scx.holding_cpu = -1;
			dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p,
					 enq_flags);
		} else {
			move_remote_task_to_local_dsq(p, enq_flags,
						      src_rq, dst_rq);
			/* task has been moved to dst_rq, which is now locked */
			locked_rq = dst_rq;
		}

		/*
		 * if the destination CPU is idle, wake it up - more precisely,
		 * reschedule whenever @p's class ranks above the class of the
		 * task currently running on @dst_rq
		 */
		if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
			resched_curr(dst_rq);
	}

	/* switch back to @rq lock */
	if (locked_rq != rq) {
		raw_spin_rq_unlock(locked_rq);
		raw_spin_rq_lock(rq);
	}
}
/**
 * finish_dispatch - Asynchronously finish dispatching a task
 * @sch: scx_sched being operated on
 * @rq: current rq which is locked
 * @p: task to finish dispatching
 * @qseq_at_dispatch: qseq when @p started getting dispatched
 * @dsq_id: destination DSQ ID
 * @enq_flags: %SCX_ENQ_*
 *
 * Dispatching to local DSQs may need to wait for queueing to complete or
 * require rq lock dancing. As we don't wanna do either while inside
 * ops.dispatch() to avoid locking order inversion, we split dispatching into
 * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the
 * task and its qseq. Once ops.dispatch() returns, this function is called to
 * finish up.
 *
 * There is no guarantee that @p is still valid for dispatching or even that it
 * was valid in the first place. Make sure that the task is still owned by the
 * BPF scheduler and claim the ownership before dispatching.
 */
static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
			    struct task_struct *p,
			    unsigned long qseq_at_dispatch,
			    u64 dsq_id, u64 enq_flags)
{
	struct scx_dispatch_q *dsq;
	unsigned long opss;

	touch_core_sched_dispatch(rq, p);
retry:
	/*
	 * No need for _acquire here. @p is accessed only after a successful
	 * try_cmpxchg to DISPATCHING.
	 */
	opss = atomic_long_read(&p->scx.ops_state);

	switch (opss & SCX_OPSS_STATE_MASK) {
	case SCX_OPSS_DISPATCHING:
	case SCX_OPSS_NONE:
		/* someone else already got to it */
		return;
	case SCX_OPSS_QUEUED:
		/*
		 * If qseq doesn't match, @p has gone through at least one
		 * dispatch/dequeue and re-enqueue cycle between
		 * scx_bpf_dsq_insert() and here and we have no claim on it.
		 */
		if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
			return;

		/* see SCX_EV_INSERT_NOT_OWNED definition */
		if (unlikely(!scx_task_on_sched(sch, p))) {
			__scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1);
			return;
		}

		/*
		 * While we know @p is accessible, we don't yet have a claim on
		 * it - the BPF scheduler is allowed to dispatch tasks
		 * spuriously and there can be a racing dequeue attempt. Let's
		 * claim @p by atomically transitioning it from QUEUED to
		 * DISPATCHING.
		 */
		if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
						   SCX_OPSS_DISPATCHING)))
			break;
		goto retry;
	case SCX_OPSS_QUEUEING:
		/*
		 * do_enqueue_task() is in the process of transferring the task
		 * to the BPF scheduler while holding @p's rq lock. As we aren't
		 * holding any kernel or BPF resource that the enqueue path may
		 * depend upon, it's safe to wait.
		 */
		wait_ops_state(p, opss);
		goto retry;
	}

	/* @p is now claimed through DISPATCHING */
	BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));

	dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, task_cpu(p));

	if (dsq->id == SCX_DSQ_LOCAL)
		dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
	else
		dispatch_enqueue(sch, rq, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
}

/*
 * Finish every dispatch buffered in this CPU's dispatch context for @sch.
 *
 * Each of the first @dspc->cursor buffer entries is handed to
 * finish_dispatch(). The cursor is then added to @dspc->nr_tasks and reset to
 * zero.
 */
static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
{
	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
	u32 u;

	for (u = 0; u < dspc->cursor; u++) {
		struct scx_dsp_buf_ent *ent = &dspc->buf[u];

		finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
				ent->enq_flags);
	}

	dspc->nr_tasks += dspc->cursor;
	dspc->cursor = 0;
}

/*
 * If %SCX_RQ_BAL_CB_PENDING is set on @rq, queue @rq's deferred balance
 * callback with deferred_bal_cb_workfn() and clear the flag. @rq must be
 * locked.
 */
static inline void maybe_queue_balance_callback(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
		return;

	queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
				deferred_bal_cb_workfn);

	rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
}
/*
 * Find a task for @rq to run from @sch.
 *
 * The sources are tried in this order: the global DSQ, the bypass DSQ when
 * bypass dispatching is enabled, and then ops.dispatch(), which is called in
 * a loop bounded by %SCX_DSP_MAX_LOOPS. @prev is passed to ops.dispatch() only
 * if it is an SCX task on @sch; %NULL is passed otherwise. When @nested is
 * %false, @prev is also stashed in @rq->scx.sub_dispatch_prev for the duration
 * of the ops.dispatch() call.
 *
 * Returns %true if @rq has something to run - a task was consumed, the local
 * DSQ is non-empty or @prev should keep running (%SCX_RQ_BAL_KEEP is set) -
 * and %false otherwise.
 *
 * One user of this function is scx_bpf_dispatch() which can be called
 * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
 * from the call frame.
 */
static __always_inline bool
scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
		   struct task_struct *prev, bool nested)
{
	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
	int nr_loops = SCX_DSP_MAX_LOOPS;
	s32 cpu = cpu_of(rq);
	bool prev_on_sch = (prev->sched_class == &ext_sched_class) &&
		scx_task_on_sched(sch, prev);

	if (consume_global_dsq(sch, rq))
		return true;

	if (bypass_dsp_enabled(sch)) {
		/* if @sch is bypassing, only the bypass DSQs are active */
		if (scx_bypassing(sch, cpu))
			return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0);

#ifdef CONFIG_EXT_SUB_SCHED
		/*
		 * If @sch isn't bypassing but its children are, @sch is
		 * responsible for making forward progress for both its own
		 * tasks that aren't bypassing and the bypassing descendants'
		 * tasks. The following implements a simple built-in behavior -
		 * let each CPU try to run the bypass DSQ every Nth time.
		 *
		 * Later, if necessary, we can add an ops flag to suppress the
		 * auto-consumption and a kfunc to consume the bypass DSQ so
		 * that the BPF scheduler can fully control scheduling of
		 * bypassed tasks.
		 */
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

		if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
		    consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0)) {
			__scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1);
			return true;
		}
#endif	/* CONFIG_EXT_SUB_SCHED */
	}

	if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
		return false;

	dspc->rq = rq;

	/*
	 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
	 * the local DSQ might still end up empty after a successful
	 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
	 * produced some tasks, retry. The BPF scheduler may depend on this
	 * looping behavior to simplify its implementation.
	 */
	do {
		dspc->nr_tasks = 0;

		if (nested) {
			SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL);
		} else {
			/* stash @prev so that nested invocations can access it */
			rq->scx.sub_dispatch_prev = prev;
			SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL);
			rq->scx.sub_dispatch_prev = NULL;
		}

		flush_dispatch_buf(sch, rq);

		/* @prev is still queued and has slice left, keep running it */
		if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) {
			rq->scx.flags |= SCX_RQ_BAL_KEEP;
			return true;
		}
		if (rq->scx.local_dsq.nr)
			return true;
		if (consume_global_dsq(sch, rq))
			return true;

		/*
		 * ops.dispatch() can trap us in this loop by repeatedly
		 * dispatching ineligible tasks. Break out once in a while to
		 * allow the watchdog to run. As IRQ can't be enabled in
		 * balance(), we want to complete this scheduling cycle and then
		 * start a new one. IOW, we want to call resched_curr() on the
		 * next, most likely idle, task, not the current one. Use
		 * scx_kick_cpu() for deferred kicking.
		 */
		if (unlikely(!--nr_loops)) {
			scx_kick_cpu(sch, cpu, 0);
			break;
		}
	} while (dspc->nr_tasks);

	/*
	 * Prevent the CPU from going idle while bypassed descendants have tasks
	 * queued. Without this fallback, bypassed tasks could stall if the host
	 * scheduler's ops.dispatch() doesn't yield any tasks.
	 */
	if (bypass_dsp_enabled(sch))
		return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0);

	return false;
}
2937 */ 2938 if ((prev->scx.flags & SCX_TASK_QUEUED) && 2939 (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) { 2940 rq->scx.flags |= SCX_RQ_BAL_KEEP; 2941 __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1); 2942 goto has_tasks; 2943 } 2944 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; 2945 return false; 2946 2947 has_tasks: 2948 /* 2949 * @rq may have extra IMMED tasks without reenq scheduled: 2950 * 2951 * - rq_is_open() can't reliably tell when and how slice is going to be 2952 * modified for $curr and allows IMMED tasks to be queued while 2953 * dispatch is in progress. 2954 * 2955 * - A non-IMMED HEAD task can get queued in front of an IMMED task 2956 * between the IMMED queueing and the subsequent scheduling event. 2957 */ 2958 if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed)) 2959 schedule_reenq_local(rq, 0); 2960 2961 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; 2962 return true; 2963 } 2964 2965 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) 2966 { 2967 struct scx_sched *sch = scx_task_sched(p); 2968 2969 if (p->scx.flags & SCX_TASK_QUEUED) { 2970 /* 2971 * Core-sched might decide to execute @p before it is 2972 * dispatched. Call ops_dequeue() to notify the BPF scheduler. 2973 */ 2974 ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC); 2975 dispatch_dequeue(rq, p); 2976 } 2977 2978 p->se.exec_start = rq_clock_task(rq); 2979 2980 /* see dequeue_task_scx() on why we skip when !QUEUED */ 2981 if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED)) 2982 SCX_CALL_OP_TASK(sch, running, rq, p); 2983 2984 clr_task_runnable(p, true); 2985 2986 /* 2987 * @p is getting newly scheduled or got kicked after someone updated its 2988 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). 
2989 */ 2990 if ((p->scx.slice == SCX_SLICE_INF) != 2991 (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { 2992 if (p->scx.slice == SCX_SLICE_INF) 2993 rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; 2994 else 2995 rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; 2996 2997 sched_update_tick_dependency(rq); 2998 2999 /* 3000 * For now, let's refresh the load_avgs just when transitioning 3001 * in and out of nohz. In the future, we might want to add a 3002 * mechanism which calls the following periodically on 3003 * tick-stopped CPUs. 3004 */ 3005 update_other_load_avgs(rq); 3006 } 3007 } 3008 3009 static enum scx_cpu_preempt_reason 3010 preempt_reason_from_class(const struct sched_class *class) 3011 { 3012 if (class == &stop_sched_class) 3013 return SCX_CPU_PREEMPT_STOP; 3014 if (class == &dl_sched_class) 3015 return SCX_CPU_PREEMPT_DL; 3016 if (class == &rt_sched_class) 3017 return SCX_CPU_PREEMPT_RT; 3018 return SCX_CPU_PREEMPT_UNKNOWN; 3019 } 3020 3021 static void switch_class(struct rq *rq, struct task_struct *next) 3022 { 3023 struct scx_sched *sch = scx_root; 3024 const struct sched_class *next_class = next->sched_class; 3025 3026 if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) 3027 return; 3028 3029 /* 3030 * The callback is conceptually meant to convey that the CPU is no 3031 * longer under the control of SCX. Therefore, don't invoke the callback 3032 * if the next class is below SCX (in which case the BPF scheduler has 3033 * actively decided not to schedule any tasks on the CPU). 3034 */ 3035 if (sched_class_above(&ext_sched_class, next_class)) 3036 return; 3037 3038 /* 3039 * At this point we know that SCX was preempted by a higher priority 3040 * sched_class, so invoke the ->cpu_release() callback if we have not 3041 * done so already. We only send the callback once between SCX being 3042 * preempted, and it regaining control of the CPU. 3043 * 3044 * ->cpu_release() complements ->cpu_acquire(), which is emitted the 3045 * next time that balance_one() is invoked. 
3046 */ 3047 if (!rq->scx.cpu_released) { 3048 if (SCX_HAS_OP(sch, cpu_release)) { 3049 struct scx_cpu_release_args args = { 3050 .reason = preempt_reason_from_class(next_class), 3051 .task = next, 3052 }; 3053 3054 SCX_CALL_OP(sch, cpu_release, rq, cpu_of(rq), &args); 3055 } 3056 rq->scx.cpu_released = true; 3057 } 3058 } 3059 3060 static void put_prev_task_scx(struct rq *rq, struct task_struct *p, 3061 struct task_struct *next) 3062 { 3063 struct scx_sched *sch = scx_task_sched(p); 3064 3065 /* see kick_sync_wait_bal_cb() */ 3066 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3067 3068 update_curr_scx(rq); 3069 3070 /* see dequeue_task_scx() on why we skip when !QUEUED */ 3071 if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED)) 3072 SCX_CALL_OP_TASK(sch, stopping, rq, p, true); 3073 3074 if (p->scx.flags & SCX_TASK_QUEUED) { 3075 set_task_runnable(rq, p); 3076 3077 /* 3078 * If @p has slice left and is being put, @p is getting 3079 * preempted by a higher priority scheduler class or core-sched 3080 * forcing a different task. Leave it at the head of the local 3081 * DSQ unless it was an IMMED task. IMMED tasks should not 3082 * linger on a busy CPU, reenqueue them to the BPF scheduler. 3083 */ 3084 if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) { 3085 if (p->scx.flags & SCX_TASK_IMMED) { 3086 p->scx.flags |= SCX_TASK_REENQ_PREEMPTED; 3087 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); 3088 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 3089 } else { 3090 dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD); 3091 } 3092 goto switch_class; 3093 } 3094 3095 /* 3096 * If @p is runnable but we're about to enter a lower 3097 * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell 3098 * ops.enqueue() that @p is the only one available for this cpu, 3099 * which should trigger an explicit follow-up scheduling event. 
3100 */ 3101 if (next && sched_class_above(&ext_sched_class, next->sched_class)) { 3102 WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); 3103 do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); 3104 } else { 3105 do_enqueue_task(rq, p, 0, -1); 3106 } 3107 } 3108 3109 switch_class: 3110 if (next && next->sched_class != &ext_sched_class) 3111 switch_class(rq, next); 3112 } 3113 3114 static void kick_sync_wait_bal_cb(struct rq *rq) 3115 { 3116 struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); 3117 unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; 3118 bool waited; 3119 s32 cpu; 3120 3121 /* 3122 * Drop rq lock and enable IRQs while waiting. IRQs must be enabled 3123 * — a target CPU may be waiting for us to process an IPI (e.g. TLB 3124 * flush) while we wait for its kick_sync to advance. 3125 * 3126 * Also, keep advancing our own kick_sync so that new kick_sync waits 3127 * targeting us, which can start after we drop the lock, cannot form 3128 * cyclic dependencies. 3129 */ 3130 retry: 3131 waited = false; 3132 for_each_cpu(cpu, rq->scx.cpus_to_sync) { 3133 /* 3134 * smp_load_acquire() pairs with smp_store_release() on 3135 * kick_sync updates on the target CPUs. 
3136 */ 3137 if (cpu == cpu_of(rq) || 3138 smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { 3139 cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); 3140 continue; 3141 } 3142 3143 raw_spin_rq_unlock_irq(rq); 3144 while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { 3145 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3146 cpu_relax(); 3147 } 3148 raw_spin_rq_lock_irq(rq); 3149 waited = true; 3150 } 3151 3152 if (waited) 3153 goto retry; 3154 } 3155 3156 static struct task_struct *first_local_task(struct rq *rq) 3157 { 3158 return list_first_entry_or_null(&rq->scx.local_dsq.list, 3159 struct task_struct, scx.dsq_list.node); 3160 } 3161 3162 static struct task_struct * 3163 do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) 3164 { 3165 struct task_struct *prev = rq->curr; 3166 bool keep_prev; 3167 struct task_struct *p; 3168 3169 /* see kick_sync_wait_bal_cb() */ 3170 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3171 3172 rq_modified_begin(rq, &ext_sched_class); 3173 3174 rq_unpin_lock(rq, rf); 3175 balance_one(rq, prev); 3176 rq_repin_lock(rq, rf); 3177 maybe_queue_balance_callback(rq); 3178 3179 /* 3180 * Defer to a balance callback which can drop rq lock and enable 3181 * IRQs. Waiting directly in the pick path would deadlock against 3182 * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. 3183 */ 3184 if (unlikely(rq->scx.kick_sync_pending)) { 3185 rq->scx.kick_sync_pending = false; 3186 queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, 3187 kick_sync_wait_bal_cb); 3188 } 3189 3190 /* 3191 * If any higher-priority sched class enqueued a runnable task on 3192 * this rq during balance_one(), abort and return RETRY_TASK, so 3193 * that the scheduler loop can restart. 3194 * 3195 * If @force_scx is true, always try to pick a SCHED_EXT task, 3196 * regardless of any higher-priority sched classes activity. 
3197 */ 3198 if (!force_scx && rq_modified_above(rq, &ext_sched_class)) 3199 return RETRY_TASK; 3200 3201 keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; 3202 if (unlikely(keep_prev && 3203 prev->sched_class != &ext_sched_class)) { 3204 WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); 3205 keep_prev = false; 3206 } 3207 3208 /* 3209 * If balance_one() is telling us to keep running @prev, replenish slice 3210 * if necessary and keep running @prev. Otherwise, pop the first one 3211 * from the local DSQ. 3212 */ 3213 if (keep_prev) { 3214 p = prev; 3215 if (!p->scx.slice) 3216 refill_task_slice_dfl(scx_task_sched(p), p); 3217 } else { 3218 p = first_local_task(rq); 3219 if (!p) 3220 return NULL; 3221 3222 if (unlikely(!p->scx.slice)) { 3223 struct scx_sched *sch = scx_task_sched(p); 3224 3225 if (!scx_bypassing(sch, cpu_of(rq)) && 3226 !sch->warned_zero_slice) { 3227 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", 3228 p->comm, p->pid, __func__); 3229 sch->warned_zero_slice = true; 3230 } 3231 refill_task_slice_dfl(sch, p); 3232 } 3233 } 3234 3235 return p; 3236 } 3237 3238 static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) 3239 { 3240 return do_pick_task_scx(rq, rf, false); 3241 } 3242 3243 /* 3244 * Select the next task to run from the ext scheduling class. 3245 * 3246 * Use do_pick_task_scx() directly with @force_scx enabled, since the 3247 * dl_server must always select a sched_ext task. 3248 */ 3249 static struct task_struct * 3250 ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) 3251 { 3252 if (!scx_enabled()) 3253 return NULL; 3254 3255 return do_pick_task_scx(dl_se->rq, rf, true); 3256 } 3257 3258 /* 3259 * Initialize the ext server deadline entity. 
3260 */ 3261 void ext_server_init(struct rq *rq) 3262 { 3263 struct sched_dl_entity *dl_se = &rq->ext_server; 3264 3265 init_dl_entity(dl_se); 3266 3267 dl_server_init(dl_se, rq, ext_server_pick_task); 3268 } 3269 3270 #ifdef CONFIG_SCHED_CORE 3271 /** 3272 * scx_prio_less - Task ordering for core-sched 3273 * @a: task A 3274 * @b: task B 3275 * @in_fi: in forced idle state 3276 * 3277 * Core-sched is implemented as an additional scheduling layer on top of the 3278 * usual sched_class'es and needs to find out the expected task ordering. For 3279 * SCX, core-sched calls this function to interrogate the task ordering. 3280 * 3281 * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used 3282 * to implement the default task ordering. The older the timestamp, the higher 3283 * priority the task - the global FIFO ordering matching the default scheduling 3284 * behavior. 3285 * 3286 * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to 3287 * implement FIFO ordering within each local DSQ. See pick_task_scx(). 3288 */ 3289 bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, 3290 bool in_fi) 3291 { 3292 struct scx_sched *sch_a = scx_task_sched(a); 3293 struct scx_sched *sch_b = scx_task_sched(b); 3294 3295 /* 3296 * The const qualifiers are dropped from task_struct pointers when 3297 * calling ops.core_sched_before(). Accesses are controlled by the 3298 * verifier. 
3299 */ 3300 if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) && 3301 !scx_bypassing(sch_a, task_cpu(a))) 3302 return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before, 3303 task_rq(a), 3304 (struct task_struct *)a, 3305 (struct task_struct *)b); 3306 else 3307 return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); 3308 } 3309 #endif /* CONFIG_SCHED_CORE */ 3310 3311 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) 3312 { 3313 struct scx_sched *sch = scx_task_sched(p); 3314 bool bypassing; 3315 3316 /* 3317 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it 3318 * can be a good migration opportunity with low cache and memory 3319 * footprint. Returning a CPU different than @prev_cpu triggers 3320 * immediate rq migration. However, for SCX, as the current rq 3321 * association doesn't dictate where the task is going to run, this 3322 * doesn't fit well. If necessary, we can later add a dedicated method 3323 * which can decide to preempt self to force it through the regular 3324 * scheduling path. 
3325 */ 3326 if (unlikely(wake_flags & WF_EXEC)) 3327 return prev_cpu; 3328 3329 bypassing = scx_bypassing(sch, task_cpu(p)); 3330 if (likely(SCX_HAS_OP(sch, select_cpu)) && !bypassing) { 3331 s32 cpu; 3332 struct task_struct **ddsp_taskp; 3333 3334 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 3335 WARN_ON_ONCE(*ddsp_taskp); 3336 *ddsp_taskp = p; 3337 3338 this_rq()->scx.in_select_cpu = true; 3339 cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, prev_cpu, wake_flags); 3340 this_rq()->scx.in_select_cpu = false; 3341 p->scx.selected_cpu = cpu; 3342 *ddsp_taskp = NULL; 3343 if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()")) 3344 return cpu; 3345 else 3346 return prev_cpu; 3347 } else { 3348 s32 cpu; 3349 3350 cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); 3351 if (cpu >= 0) { 3352 refill_task_slice_dfl(sch, p); 3353 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; 3354 } else { 3355 cpu = prev_cpu; 3356 } 3357 p->scx.selected_cpu = cpu; 3358 3359 if (bypassing) 3360 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 3361 return cpu; 3362 } 3363 } 3364 3365 static void task_woken_scx(struct rq *rq, struct task_struct *p) 3366 { 3367 run_deferred(rq); 3368 } 3369 3370 static void set_cpus_allowed_scx(struct task_struct *p, 3371 struct affinity_context *ac) 3372 { 3373 struct scx_sched *sch = scx_task_sched(p); 3374 3375 set_cpus_allowed_common(p, ac); 3376 3377 if (task_dead_and_done(p)) 3378 return; 3379 3380 /* 3381 * The effective cpumask is stored in @p->cpus_ptr which may temporarily 3382 * differ from the configured one in @p->cpus_mask. Always tell the bpf 3383 * scheduler the effective one. 3384 * 3385 * Fine-grained memory write control is enforced by BPF making the const 3386 * designation pointless. Cast it away when calling the operation. 
3387 */ 3388 if (SCX_HAS_OP(sch, set_cpumask)) 3389 SCX_CALL_OP_TASK(sch, set_cpumask, task_rq(p), p, (struct cpumask *)p->cpus_ptr); 3390 } 3391 3392 static void handle_hotplug(struct rq *rq, bool online) 3393 { 3394 struct scx_sched *sch = scx_root; 3395 s32 cpu = cpu_of(rq); 3396 3397 atomic_long_inc(&scx_hotplug_seq); 3398 3399 /* 3400 * scx_root updates are protected by cpus_read_lock() and will stay 3401 * stable here. Note that we can't depend on scx_enabled() test as the 3402 * hotplug ops need to be enabled before __scx_enabled is set. 3403 */ 3404 if (unlikely(!sch)) 3405 return; 3406 3407 if (scx_enabled()) 3408 scx_idle_update_selcpu_topology(&sch->ops); 3409 3410 if (online && SCX_HAS_OP(sch, cpu_online)) 3411 SCX_CALL_OP(sch, cpu_online, NULL, cpu); 3412 else if (!online && SCX_HAS_OP(sch, cpu_offline)) 3413 SCX_CALL_OP(sch, cpu_offline, NULL, cpu); 3414 else 3415 scx_exit(sch, SCX_EXIT_UNREG_KERN, 3416 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 3417 "cpu %d going %s, exiting scheduler", cpu, 3418 online ? 
"online" : "offline"); 3419 } 3420 3421 void scx_rq_activate(struct rq *rq) 3422 { 3423 handle_hotplug(rq, true); 3424 } 3425 3426 void scx_rq_deactivate(struct rq *rq) 3427 { 3428 handle_hotplug(rq, false); 3429 } 3430 3431 static void rq_online_scx(struct rq *rq) 3432 { 3433 rq->scx.flags |= SCX_RQ_ONLINE; 3434 } 3435 3436 static void rq_offline_scx(struct rq *rq) 3437 { 3438 rq->scx.flags &= ~SCX_RQ_ONLINE; 3439 } 3440 3441 static bool check_rq_for_timeouts(struct rq *rq) 3442 { 3443 struct scx_sched *sch; 3444 struct task_struct *p; 3445 struct rq_flags rf; 3446 bool timed_out = false; 3447 3448 rq_lock_irqsave(rq, &rf); 3449 sch = rcu_dereference_bh(scx_root); 3450 if (unlikely(!sch)) 3451 goto out_unlock; 3452 3453 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { 3454 struct scx_sched *sch = scx_task_sched(p); 3455 unsigned long last_runnable = p->scx.runnable_at; 3456 3457 if (unlikely(time_after(jiffies, 3458 last_runnable + READ_ONCE(sch->watchdog_timeout)))) { 3459 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); 3460 3461 scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, 3462 "%s[%d] failed to run for %u.%03us", 3463 p->comm, p->pid, dur_ms / 1000, dur_ms % 1000); 3464 timed_out = true; 3465 break; 3466 } 3467 } 3468 out_unlock: 3469 rq_unlock_irqrestore(rq, &rf); 3470 return timed_out; 3471 } 3472 3473 static void scx_watchdog_workfn(struct work_struct *work) 3474 { 3475 unsigned long intv; 3476 int cpu; 3477 3478 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 3479 3480 for_each_online_cpu(cpu) { 3481 if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) 3482 break; 3483 3484 cond_resched(); 3485 } 3486 3487 intv = READ_ONCE(scx_watchdog_interval); 3488 if (intv < ULONG_MAX) 3489 queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv); 3490 } 3491 3492 void scx_tick(struct rq *rq) 3493 { 3494 struct scx_sched *root; 3495 unsigned long last_check; 3496 3497 if (!scx_enabled()) 3498 return; 3499 3500 root = rcu_dereference_bh(scx_root); 
3501 if (unlikely(!root)) 3502 return; 3503 3504 last_check = READ_ONCE(scx_watchdog_timestamp); 3505 if (unlikely(time_after(jiffies, 3506 last_check + READ_ONCE(root->watchdog_timeout)))) { 3507 u32 dur_ms = jiffies_to_msecs(jiffies - last_check); 3508 3509 scx_exit(root, SCX_EXIT_ERROR_STALL, 0, 3510 "watchdog failed to check in for %u.%03us", 3511 dur_ms / 1000, dur_ms % 1000); 3512 } 3513 3514 update_other_load_avgs(rq); 3515 } 3516 3517 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) 3518 { 3519 struct scx_sched *sch = scx_task_sched(curr); 3520 3521 update_curr_scx(rq); 3522 3523 /* 3524 * While disabling, always resched and refresh core-sched timestamp as 3525 * we can't trust the slice management or ops.core_sched_before(). 3526 */ 3527 if (scx_bypassing(sch, cpu_of(rq))) { 3528 curr->scx.slice = 0; 3529 touch_core_sched(rq, curr); 3530 } else if (SCX_HAS_OP(sch, tick)) { 3531 SCX_CALL_OP_TASK(sch, tick, rq, curr); 3532 } 3533 3534 if (!curr->scx.slice) 3535 resched_curr(rq); 3536 } 3537 3538 #ifdef CONFIG_EXT_GROUP_SCHED 3539 static struct cgroup *tg_cgrp(struct task_group *tg) 3540 { 3541 /* 3542 * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, 3543 * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the 3544 * root cgroup. 
3545 */ 3546 if (tg && tg->css.cgroup) 3547 return tg->css.cgroup; 3548 else 3549 return &cgrp_dfl_root.cgrp; 3550 } 3551 3552 #define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), 3553 3554 #else /* CONFIG_EXT_GROUP_SCHED */ 3555 3556 #define SCX_INIT_TASK_ARGS_CGROUP(tg) 3557 3558 #endif /* CONFIG_EXT_GROUP_SCHED */ 3559 3560 static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) 3561 { 3562 int ret; 3563 3564 p->scx.disallow = false; 3565 3566 if (SCX_HAS_OP(sch, init_task)) { 3567 struct scx_init_task_args args = { 3568 SCX_INIT_TASK_ARGS_CGROUP(task_group(p)) 3569 .fork = fork, 3570 }; 3571 3572 ret = SCX_CALL_OP_RET(sch, init_task, NULL, p, &args); 3573 if (unlikely(ret)) { 3574 ret = ops_sanitize_err(sch, "init_task", ret); 3575 return ret; 3576 } 3577 } 3578 3579 if (p->scx.disallow) { 3580 if (unlikely(scx_parent(sch))) { 3581 scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]", 3582 p->comm, p->pid); 3583 } else if (unlikely(fork)) { 3584 scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", 3585 p->comm, p->pid); 3586 } else { 3587 struct rq *rq; 3588 struct rq_flags rf; 3589 3590 rq = task_rq_lock(p, &rf); 3591 3592 /* 3593 * We're in the load path and @p->policy will be applied 3594 * right after. Reverting @p->policy here and rejecting 3595 * %SCHED_EXT transitions from scx_check_setscheduler() 3596 * guarantees that if ops.init_task() sets @p->disallow, 3597 * @p can never be in SCX. 3598 */ 3599 if (p->policy == SCHED_EXT) { 3600 p->policy = SCHED_NORMAL; 3601 atomic_long_inc(&scx_nr_rejected); 3602 } 3603 3604 task_rq_unlock(rq, p, &rf); 3605 } 3606 } 3607 3608 return 0; 3609 } 3610 3611 static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p) 3612 { 3613 struct rq *rq = task_rq(p); 3614 u32 weight; 3615 3616 lockdep_assert_rq_held(rq); 3617 3618 /* 3619 * Verify the task is not in BPF scheduler's custody. 
If flag 3620 * transitions are consistent, the flag should always be clear 3621 * here. 3622 */ 3623 WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); 3624 3625 /* 3626 * Set the weight before calling ops.enable() so that the scheduler 3627 * doesn't see a stale value if they inspect the task struct. 3628 */ 3629 if (task_has_idle_policy(p)) 3630 weight = WEIGHT_IDLEPRIO; 3631 else 3632 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; 3633 3634 p->scx.weight = sched_weight_to_cgroup(weight); 3635 3636 if (SCX_HAS_OP(sch, enable)) 3637 SCX_CALL_OP_TASK(sch, enable, rq, p); 3638 3639 if (SCX_HAS_OP(sch, set_weight)) 3640 SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); 3641 } 3642 3643 static void scx_enable_task(struct scx_sched *sch, struct task_struct *p) 3644 { 3645 __scx_enable_task(sch, p); 3646 scx_set_task_state(p, SCX_TASK_ENABLED); 3647 } 3648 3649 static void scx_disable_task(struct scx_sched *sch, struct task_struct *p) 3650 { 3651 struct rq *rq = task_rq(p); 3652 3653 lockdep_assert_rq_held(rq); 3654 WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); 3655 3656 clear_direct_dispatch(p); 3657 3658 if (SCX_HAS_OP(sch, disable)) 3659 SCX_CALL_OP_TASK(sch, disable, rq, p); 3660 scx_set_task_state(p, SCX_TASK_READY); 3661 3662 /* 3663 * Verify the task is not in BPF scheduler's custody. If flag 3664 * transitions are consistent, the flag should always be clear 3665 * here. 
3666 */ 3667 WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); 3668 } 3669 3670 static void __scx_disable_and_exit_task(struct scx_sched *sch, 3671 struct task_struct *p) 3672 { 3673 struct scx_exit_task_args args = { 3674 .cancelled = false, 3675 }; 3676 3677 lockdep_assert_held(&p->pi_lock); 3678 lockdep_assert_rq_held(task_rq(p)); 3679 3680 switch (scx_get_task_state(p)) { 3681 case SCX_TASK_NONE: 3682 return; 3683 case SCX_TASK_INIT: 3684 args.cancelled = true; 3685 break; 3686 case SCX_TASK_READY: 3687 break; 3688 case SCX_TASK_ENABLED: 3689 scx_disable_task(sch, p); 3690 break; 3691 default: 3692 WARN_ON_ONCE(true); 3693 return; 3694 } 3695 3696 if (SCX_HAS_OP(sch, exit_task)) 3697 SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); 3698 } 3699 3700 /* 3701 * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never 3702 * ran. The task state has not been transitioned, so this mirrors the 3703 * SCX_TASK_INIT branch in __scx_disable_and_exit_task(). 3704 */ 3705 static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p) 3706 { 3707 struct scx_exit_task_args args = { .cancelled = true }; 3708 3709 lockdep_assert_held(&p->pi_lock); 3710 lockdep_assert_rq_held(task_rq(p)); 3711 3712 if (SCX_HAS_OP(sch, exit_task)) 3713 SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); 3714 } 3715 3716 static void scx_disable_and_exit_task(struct scx_sched *sch, 3717 struct task_struct *p) 3718 { 3719 __scx_disable_and_exit_task(sch, p); 3720 3721 /* 3722 * If set, @p exited between __scx_init_task() and scx_enable_task() in 3723 * scx_sub_enable() and is initialized for both the associated sched and 3724 * its parent. Exit for the child too - scx_enable_task() never ran for 3725 * it, so undo only init_task. The flag is only set on the sub-enable 3726 * path, so it's always clear when @p arrives here in %SCX_TASK_NONE. 
3727 */ 3728 if (p->scx.flags & SCX_TASK_SUB_INIT) { 3729 if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) 3730 scx_sub_init_cancel_task(scx_enabling_sub_sched, p); 3731 p->scx.flags &= ~SCX_TASK_SUB_INIT; 3732 } 3733 3734 scx_set_task_sched(p, NULL); 3735 scx_set_task_state(p, SCX_TASK_NONE); 3736 } 3737 3738 void init_scx_entity(struct sched_ext_entity *scx) 3739 { 3740 memset(scx, 0, sizeof(*scx)); 3741 INIT_LIST_HEAD(&scx->dsq_list.node); 3742 RB_CLEAR_NODE(&scx->dsq_priq); 3743 scx->sticky_cpu = -1; 3744 scx->holding_cpu = -1; 3745 INIT_LIST_HEAD(&scx->runnable_node); 3746 scx->runnable_at = jiffies; 3747 scx->ddsp_dsq_id = SCX_DSQ_INVALID; 3748 scx->slice = SCX_SLICE_DFL; 3749 } 3750 3751 void scx_pre_fork(struct task_struct *p) 3752 { 3753 /* 3754 * BPF scheduler enable/disable paths want to be able to iterate and 3755 * update all tasks which can become complex when racing forks. As 3756 * enable/disable are very cold paths, let's use a percpu_rwsem to 3757 * exclude forks. 3758 */ 3759 percpu_down_read(&scx_fork_rwsem); 3760 } 3761 3762 int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) 3763 { 3764 s32 ret; 3765 3766 percpu_rwsem_assert_held(&scx_fork_rwsem); 3767 3768 if (scx_init_task_enabled) { 3769 #ifdef CONFIG_EXT_SUB_SCHED 3770 struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched; 3771 #else 3772 struct scx_sched *sch = scx_root; 3773 #endif 3774 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 3775 ret = __scx_init_task(sch, p, true); 3776 if (unlikely(ret)) { 3777 scx_set_task_state(p, SCX_TASK_NONE); 3778 return ret; 3779 } 3780 scx_set_task_state(p, SCX_TASK_INIT); 3781 scx_set_task_sched(p, sch); 3782 } 3783 3784 return 0; 3785 } 3786 3787 void scx_post_fork(struct task_struct *p) 3788 { 3789 if (scx_init_task_enabled) { 3790 scx_set_task_state(p, SCX_TASK_READY); 3791 3792 /* 3793 * Enable the task immediately if it's running on sched_ext. 
3794 * Otherwise, it'll be enabled in switching_to_scx() if and 3795 * when it's ever configured to run with a SCHED_EXT policy. 3796 */ 3797 if (p->sched_class == &ext_sched_class) { 3798 struct rq_flags rf; 3799 struct rq *rq; 3800 3801 rq = task_rq_lock(p, &rf); 3802 scx_enable_task(scx_task_sched(p), p); 3803 task_rq_unlock(rq, p, &rf); 3804 } 3805 } 3806 3807 raw_spin_lock_irq(&scx_tasks_lock); 3808 list_add_tail(&p->scx.tasks_node, &scx_tasks); 3809 raw_spin_unlock_irq(&scx_tasks_lock); 3810 3811 percpu_up_read(&scx_fork_rwsem); 3812 } 3813 3814 void scx_cancel_fork(struct task_struct *p) 3815 { 3816 if (scx_enabled()) { 3817 struct rq *rq; 3818 struct rq_flags rf; 3819 3820 rq = task_rq_lock(p, &rf); 3821 WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); 3822 scx_disable_and_exit_task(scx_task_sched(p), p); 3823 task_rq_unlock(rq, p, &rf); 3824 } 3825 3826 percpu_up_read(&scx_fork_rwsem); 3827 } 3828 3829 /** 3830 * task_dead_and_done - Is a task dead and done running? 3831 * @p: target task 3832 * 3833 * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the 3834 * task no longer exists from SCX's POV. However, certain sched_class ops may be 3835 * invoked on these dead tasks leading to failures - e.g. sched_setscheduler() 3836 * may try to switch a task which finished sched_ext_dead() back into SCX 3837 * triggering invalid SCX task state transitions and worse. 3838 * 3839 * Once a task has finished the final switch, sched_ext_dead() is the only thing 3840 * that needs to happen on the task. Use this test to short-circuit sched_class 3841 * operations which may be called on dead tasks. 3842 */ 3843 static bool task_dead_and_done(struct task_struct *p) 3844 { 3845 struct rq *rq = task_rq(p); 3846 3847 lockdep_assert_rq_held(rq); 3848 3849 /* 3850 * In do_task_dead(), a dying task sets %TASK_DEAD with preemption 3851 * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p 3852 * won't ever run again. 
3853 */ 3854 return unlikely(READ_ONCE(p->__state) == TASK_DEAD) && 3855 !task_on_cpu(rq, p); 3856 } 3857 3858 void sched_ext_dead(struct task_struct *p) 3859 { 3860 unsigned long flags; 3861 3862 /* 3863 * By the time control reaches here, @p has %TASK_DEAD set, switched out 3864 * for the last time and then dropped the rq lock - task_dead_and_done() 3865 * should be returning %true nullifying the straggling sched_class ops. 3866 * Remove from scx_tasks and exit @p. 3867 */ 3868 raw_spin_lock_irqsave(&scx_tasks_lock, flags); 3869 list_del_init(&p->scx.tasks_node); 3870 raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); 3871 3872 /* 3873 * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> 3874 * ENABLED transitions can't race us. Disable ops for @p. 3875 * 3876 * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see 3877 * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup 3878 * iteration is only used from sub-sched paths, which require root 3879 * enabled. Root enable transitions every live task to at least READY. 3880 * 3881 * %INIT_BEGIN means ops.init_task() is running for @p. Don't call 3882 * into ops; transition to %DEAD so the post-init recheck unwinds 3883 * via scx_sub_init_cancel_task(). 
3884 */ 3885 if (scx_get_task_state(p) != SCX_TASK_NONE) { 3886 struct rq_flags rf; 3887 struct rq *rq; 3888 3889 rq = task_rq_lock(p, &rf); 3890 if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN) 3891 scx_disable_and_exit_task(scx_task_sched(p), p); 3892 scx_set_task_state(p, SCX_TASK_DEAD); 3893 task_rq_unlock(rq, p, &rf); 3894 } 3895 } 3896 3897 static void reweight_task_scx(struct rq *rq, struct task_struct *p, 3898 const struct load_weight *lw) 3899 { 3900 struct scx_sched *sch = scx_task_sched(p); 3901 3902 lockdep_assert_rq_held(task_rq(p)); 3903 3904 if (task_dead_and_done(p)) 3905 return; 3906 3907 p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); 3908 if (SCX_HAS_OP(sch, set_weight)) 3909 SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); 3910 } 3911 3912 static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) 3913 { 3914 } 3915 3916 static void switching_to_scx(struct rq *rq, struct task_struct *p) 3917 { 3918 struct scx_sched *sch = scx_task_sched(p); 3919 3920 if (task_dead_and_done(p)) 3921 return; 3922 3923 scx_enable_task(sch, p); 3924 3925 /* 3926 * set_cpus_allowed_scx() is not called while @p is associated with a 3927 * different scheduler class. Keep the BPF scheduler up-to-date. 3928 */ 3929 if (SCX_HAS_OP(sch, set_cpumask)) 3930 SCX_CALL_OP_TASK(sch, set_cpumask, rq, p, (struct cpumask *)p->cpus_ptr); 3931 } 3932 3933 static void switched_from_scx(struct rq *rq, struct task_struct *p) 3934 { 3935 if (task_dead_and_done(p)) 3936 return; 3937 3938 /* 3939 * %NONE means SCX is no longer tracking @p at the task level (e.g. 3940 * scx_fail_parent() handed @p back to the parent at NONE pending the 3941 * parent's own teardown). There is nothing to disable; calling 3942 * scx_disable_task() would WARN on the non-%ENABLED state and trigger a 3943 * NONE -> READY validation failure. 
3944 */ 3945 if (scx_get_task_state(p) == SCX_TASK_NONE) 3946 return; 3947 3948 scx_disable_task(scx_task_sched(p), p); 3949 } 3950 3951 static void switched_to_scx(struct rq *rq, struct task_struct *p) {} 3952 3953 int scx_check_setscheduler(struct task_struct *p, int policy) 3954 { 3955 lockdep_assert_rq_held(task_rq(p)); 3956 3957 /* if disallow, reject transitioning into SCX */ 3958 if (scx_enabled() && READ_ONCE(p->scx.disallow) && 3959 p->policy != policy && policy == SCHED_EXT) 3960 return -EACCES; 3961 3962 return 0; 3963 } 3964 3965 static void process_ddsp_deferred_locals(struct rq *rq) 3966 { 3967 struct task_struct *p; 3968 3969 lockdep_assert_rq_held(rq); 3970 3971 /* 3972 * Now that @rq can be unlocked, execute the deferred enqueueing of 3973 * tasks directly dispatched to the local DSQs of other CPUs. See 3974 * direct_dispatch(). Keep popping from the head instead of using 3975 * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq 3976 * temporarily. 3977 */ 3978 while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, 3979 struct task_struct, scx.dsq_list.node))) { 3980 struct scx_sched *sch = scx_task_sched(p); 3981 struct scx_dispatch_q *dsq; 3982 u64 dsq_id = p->scx.ddsp_dsq_id; 3983 u64 enq_flags = p->scx.ddsp_enq_flags; 3984 3985 list_del_init(&p->scx.dsq_list.node); 3986 clear_direct_dispatch(p); 3987 3988 dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p)); 3989 if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) 3990 dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); 3991 } 3992 } 3993 3994 /* 3995 * Determine whether @p should be reenqueued from a local DSQ. 3996 * 3997 * @reenq_flags is mutable and accumulates state across the DSQ walk: 3998 * 3999 * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First" 4000 * tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at 4001 * the head consumes the first slot. 
*
 * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if
 *   rq_is_open() is true.
 *
 * An IMMED task is kept (returns %false) only if it's the first task in the DSQ
 * AND the current task is done — i.e. it will execute immediately. All other
 * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head,
 * every IMMED task behind it gets reenqueued.
 *
 * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ |
 * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local
 * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers
 * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT
 * in process_deferred_reenq_locals().
 *
 * On return, *@reason is %SCX_TASK_REENQ_IMMED when @p is reenqueued because of
 * the IMMED rule above and %SCX_TASK_REENQ_KFUNC otherwise. In the latter case,
 * @p is reenqueued iff %SCX_REENQ_ANY is set in *@reenq_flags.
 */
static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason)
{
	bool first;

	/* sample "first" before marking the slot as consumed */
	first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST);
	*reenq_flags |= SCX_REENQ_TSR_NOT_FIRST;

	/* default reason, overridden below for the IMMED case */
	*reason = SCX_TASK_REENQ_KFUNC;

	if ((p->scx.flags & SCX_TASK_IMMED) &&
	    (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) {
		__scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1);
		*reason = SCX_TASK_REENQ_IMMED;
		return true;
	}

	return *reenq_flags & SCX_REENQ_ANY;
}

/*
 * Reenqueue the tasks on @rq's local DSQ which belong to @sch or one of its
 * descendants and are selected by local_task_should_reenq(). Each selected task
 * is dequeued, tagged with the reenq reason in @p->scx.flags and pushed through
 * do_enqueue_task() with %SCX_ENQ_REENQ.
 *
 * Returns the number of tasks which were reenqueued.
 */
static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
{
	LIST_HEAD(tasks);
	u32 nr_enqueued = 0;
	struct task_struct *p, *n;

	lockdep_assert_rq_held(rq);

	/* the TSR bits are walk-internal state and must not be passed in */
	if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK))
		reenq_flags &= ~__SCX_REENQ_TSR_MASK;
	if (rq_is_open(rq, 0))
		reenq_flags |= SCX_REENQ_TSR_RQ_OPEN;

	/*
	 * The BPF scheduler may choose to dispatch tasks back to
	 * @rq->scx.local_dsq. Move all candidate tasks off to a private list
	 * first to avoid processing the same tasks repeatedly.
	 */
	list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
				 scx.dsq_list.node) {
		struct scx_sched *task_sch = scx_task_sched(p);
		u32 reason;

		/*
		 * If @p is being migrated, @p's current CPU may not agree with
		 * its allowed CPUs and the migration_cpu_stop is about to
		 * deactivate and re-activate @p anyway. Skip re-enqueueing.
		 *
		 * While racing sched property changes may also dequeue and
		 * re-enqueue a migrating task while its current CPU and allowed
		 * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
		 * the current local DSQ for running tasks and thus are not
		 * visible to the BPF scheduler.
		 */
		if (p->migration_pending)
			continue;

		/* only tasks under @sch's subtree are of interest */
		if (!scx_is_descendant(task_sch, sch))
			continue;

		if (!local_task_should_reenq(p, &reenq_flags, &reason))
			continue;

		dispatch_dequeue(rq, p);

		/* a stale reason is unexpected; clear it before tagging */
		if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
			p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
		p->scx.flags |= reason;

		list_add_tail(&p->scx.dsq_list.node, &tasks);
	}

	list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
		list_del_init(&p->scx.dsq_list.node);

		do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);

		/* the reason tag is only valid for the duration of the enqueue */
		p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
		nr_enqueued++;
	}

	return nr_enqueued;
}

/*
 * Drain @rq->scx.deferred_reenq_locals, running reenq_local() for each queued
 * scheduler. Returns once the list is found empty.
 *
 * Each invocation uses a fresh sequence number. An entry which is seen again
 * with the current sequence number was re-queued while this invocation was
 * running; such repeats are counted and, once they exceed
 * %SCX_REENQ_LOCAL_MAX_REPEAT, reported through scx_error() and skipped.
 */
static void process_deferred_reenq_locals(struct rq *rq)
{
	u64 seq = ++rq->scx.deferred_reenq_locals_seq;

	lockdep_assert_rq_held(rq);

	while (true) {
		struct scx_sched *sch;
		u64 reenq_flags;
		bool skip = false;

		/* pop one entry and claim its flags under the list lock */
		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
			struct scx_deferred_reenq_local *drl =
				list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
							 struct scx_deferred_reenq_local,
							 node);
			struct scx_sched_pcpu *sch_pcpu;

			if (!drl)
				return;

			sch_pcpu = container_of(drl, struct scx_sched_pcpu,
						deferred_reenq_local);
			sch = sch_pcpu->sch;

			reenq_flags = drl->flags;
			WRITE_ONCE(drl->flags, 0);
			list_del_init(&drl->node);

			if (likely(drl->seq != seq)) {
				/* first time this entry is seen in this invocation */
				drl->seq = seq;
				drl->cnt = 0;
			} else {
				if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) {
					scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times",
						  drl->cnt);
					skip = true;
				}

				__scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1);
			}
		}

		if (!skip) {
			/* see schedule_dsq_reenq() */
			smp_mb();

			reenq_local(sch, rq, reenq_flags);
		}
	}
}

/*
 * Determine whether @p should be reenqueued from a user DSQ. The reason is
 * always %SCX_TASK_REENQ_KFUNC and @p is selected iff %SCX_REENQ_ANY is set in
 * @reenq_flags. @p itself is not examined.
 */
static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason)
{
	*reason = SCX_TASK_REENQ_KFUNC;
	return reenq_flags & SCX_REENQ_ANY;
}

/*
 * Reenqueue the tasks on @dsq which are selected by user_task_should_reenq().
 *
 * Called with @rq locked and returns with @rq locked. In between, the rq lock
 * may be switched to the rq of the task being processed or dropped altogether,
 * and @dsq->lock is released around each do_enqueue_task() call. The walk uses
 * a cursor node on @dsq and stops early once @dsq->sched's bypass_depth reads
 * non-zero.
 */
static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags)
{
	struct rq *locked_rq = rq;
	struct scx_sched *sch = dsq->sched;
	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
	struct task_struct *p;
	s32 nr_enqueued = 0;

	lockdep_assert_rq_held(rq);

	raw_spin_lock(&dsq->lock);

	while (likely(!READ_ONCE(sch->bypass_depth))) {
		struct rq *task_rq;
		u32 reason;

		p = nldsq_cursor_next_task(&cursor, dsq);
		if (!p)
			break;

		if (!user_task_should_reenq(p, reenq_flags, &reason))
			continue;

		task_rq = task_rq(p);

		if (locked_rq != task_rq) {
			/* locked_rq is NULL if the lock was dropped after a batch */
			if (locked_rq)
				raw_spin_rq_unlock(locked_rq);
			/*
			 * Try to take @task_rq while holding @dsq->lock. On
			 * failure, drop @dsq->lock and acquire the two locks
			 * in rq -> dsq order.
			 */
			if (unlikely(!raw_spin_rq_trylock(task_rq))) {
				raw_spin_unlock(&dsq->lock);
				raw_spin_rq_lock(task_rq);
				raw_spin_lock(&dsq->lock);
			}
			locked_rq = task_rq;

			/* did we lose @p while switching locks? */
			if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
				continue;
		}

		/* @p is on @dsq, its rq and @dsq are locked */
		dispatch_dequeue_locked(p, dsq);
		raw_spin_unlock(&dsq->lock);

		/* a stale reason is unexpected; clear it before tagging */
		if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
			p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
		p->scx.flags |= reason;

		do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);

		p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;

		/* release the rq lock once every %SCX_TASK_ITER_BATCH enqueues */
		if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) {
			raw_spin_rq_unlock(locked_rq);
			locked_rq = NULL;
			cpu_relax();
		}

		raw_spin_lock(&dsq->lock);
	}

	/* take the cursor off @dsq before releasing its lock */
	list_del_init(&cursor.node);
	raw_spin_unlock(&dsq->lock);

	/* restore the locking state the caller expects */
	if (locked_rq != rq) {
		if (locked_rq)
			raw_spin_rq_unlock(locked_rq);
		raw_spin_rq_lock(rq);
	}
}

/*
 * Drain @rq->scx.deferred_reenq_users, running reenq_user() for each queued
 * DSQ. Returns once the list is found empty.
 */
static void process_deferred_reenq_users(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	while (true) {
		struct scx_dispatch_q *dsq;
		u64 reenq_flags;

		/* pop one entry and claim its flags under the list lock */
		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
			struct scx_deferred_reenq_user *dru =
				list_first_entry_or_null(&rq->scx.deferred_reenq_users,
							 struct scx_deferred_reenq_user,
							 node);
			struct scx_dsq_pcpu *dsq_pcpu;

			if (!dru)
				return;

			dsq_pcpu = container_of(dru, struct scx_dsq_pcpu,
						deferred_reenq_user);
			dsq = dsq_pcpu->dsq;
			reenq_flags = dru->flags;
			WRITE_ONCE(dru->flags, 0);
			list_del_init(&dru->node);
		}

		/* see schedule_dsq_reenq() */
		smp_mb();

		/* built-in DSQs must never be queued on this list */
		BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
		reenq_user(rq, dsq, reenq_flags);
	}
}

/*
 * Run the work deferred on @rq: the deferred direct dispatches first, then the
 * deferred local and user DSQ reenqueues if their lists are non-empty.
 */
static void run_deferred(struct rq *rq)
{
	process_ddsp_deferred_locals(rq);

	if (!list_empty(&rq->scx.deferred_reenq_locals))
		process_deferred_reenq_locals(rq);

	if (!list_empty(&rq->scx.deferred_reenq_users))
		process_deferred_reenq_users(rq);
}

#ifdef CONFIG_NO_HZ_FULL
/*
 * Tell whether the tick can be stopped on @rq as far as sched_ext is concerned.
 * Always %true if @rq->curr is not an SCX task and always %false while the
 * task's scheduler is bypassing on @rq's CPU.
 */
bool scx_can_stop_tick(struct rq *rq)
{
	struct task_struct *p = rq->curr;
	struct scx_sched *sch = scx_task_sched(p);

	if (p->sched_class != &ext_sched_class)
		return true;

	if (scx_bypassing(sch, cpu_of(rq)))
		return false;

	/*
	 * @rq can dispatch from different DSQs, so we can't tell whether it
	 * needs the tick or not by looking at nr_running. Allow stopping ticks
	 * iff the BPF scheduler indicated so. See set_next_task_scx().
	 */
	return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
}
#endif

#ifdef CONFIG_EXT_GROUP_SCHED

DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem);
static bool scx_cgroup_enabled;

/*
 * Initialize @tg's sched_ext parameters: default weight, default bandwidth
 * period, %RUNTIME_INF quota and not idle.
 */
void scx_tg_init(struct task_group *tg)
{
	tg->scx.weight = CGROUP_WEIGHT_DFL;
	tg->scx.bw_period_us = default_bw_period_us();
	tg->scx.bw_quota_us = RUNTIME_INF;
	tg->scx.idle = false;
}

/*
 * Mark @tg online. While cgroup ops are enabled, ops.cgroup_init() is invoked
 * first if implemented and @tg is marked online and inited only if it
 * succeeds. While they are disabled, @tg is only marked online.
 *
 * Returns 0 on success or the error from ops.cgroup_init() after it has been
 * passed through ops_sanitize_err().
 */
int scx_tg_online(struct task_group *tg)
{
	struct scx_sched *sch = scx_root;
	int ret = 0;

	WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));

	if (scx_cgroup_enabled) {
		if (SCX_HAS_OP(sch, cgroup_init)) {
			struct scx_cgroup_init_args args =
				{ .weight = tg->scx.weight,
				  .bw_period_us = tg->scx.bw_period_us,
				  .bw_quota_us = tg->scx.bw_quota_us,
				  .bw_burst_us = tg->scx.bw_burst_us };

			ret = SCX_CALL_OP_RET(sch, cgroup_init,
					      NULL, tg->css.cgroup, &args);
			if (ret)
				ret = ops_sanitize_err(sch, "cgroup_init", ret);
		}
		if (ret == 0)
			tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED;
	} else {
		tg->scx.flags |= SCX_TG_ONLINE;
	}

	return ret;
}

/*
 * Mark @tg offline. ops.cgroup_exit() is invoked only if cgroup ops are
 * enabled, the op is implemented and @tg was inited. The online and inited
 * flags are cleared unconditionally.
 */
void scx_tg_offline(struct task_group *tg)
{
	struct scx_sched *sch = scx_root;

	WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));

	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
	    (tg->scx.flags & SCX_TG_INITED))
		SCX_CALL_OP(sch, cgroup_exit, NULL, tg->css.cgroup);
	tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
}

/*
 * Prepare the tasks in @tset for moving to their destination cgroups. For each
 * task whose cgroup actually changes, ops.cgroup_prep_move() is invoked if
 * implemented and the source cgroup is recorded in @p->scx.cgrp_moving_from.
 *
 * If ops.cgroup_prep_move() fails, the moves recorded so far are cancelled
 * through ops.cgroup_cancel_move() and the error is returned after being passed
 * through ops_sanitize_err(). Returns 0 on success or when cgroup ops are
 * disabled.
 */
int scx_cgroup_can_attach(struct cgroup_taskset *tset)
{
	struct scx_sched *sch = scx_root;
	struct cgroup_subsys_state *css;
	struct task_struct *p;
	int ret;

	if (!scx_cgroup_enabled)
		return 0;

	cgroup_taskset_for_each(p, css, tset) {
		struct cgroup *from = tg_cgrp(task_group(p));
		struct cgroup *to = tg_cgrp(css_tg(css));

		WARN_ON_ONCE(p->scx.cgrp_moving_from);

		/*
		 * sched_move_task() omits identity migrations. Let's match the
		 * behavior so that ops.cgroup_prep_move() and ops.cgroup_move()
		 * always match one-to-one.
		 */
		if (from == to)
			continue;

		if (SCX_HAS_OP(sch, cgroup_prep_move)) {
			ret = SCX_CALL_OP_RET(sch, cgroup_prep_move, NULL,
					      p, from, css->cgroup);
			if (ret)
				goto err;
		}

		p->scx.cgrp_moving_from = from;
	}

	return 0;

err:
	/* undo the tasks which have cgrp_moving_from recorded */
	cgroup_taskset_for_each(p, css, tset) {
		if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
		    p->scx.cgrp_moving_from)
			SCX_CALL_OP(sch, cgroup_cancel_move, NULL,
				    p, p->scx.cgrp_moving_from, css->cgroup);
		p->scx.cgrp_moving_from = NULL;
	}

	return ops_sanitize_err(sch, "cgroup_prep_move", ret);
}

/*
 * Report the completed cgroup move of @p through ops.cgroup_move() and clear
 * @p->scx.cgrp_moving_from. No-op while cgroup ops are disabled.
 */
void scx_cgroup_move_task(struct task_struct *p)
{
	struct scx_sched *sch = scx_root;

	if (!scx_cgroup_enabled)
		return;

	/*
	 * scx_cgroup_can_attach() sets cgrp_moving_from only when the task's
	 * cgroup changes. Migration keys off css rather than cgroup identity,
	 * so it can hand an unchanged-cgroup task here with cgrp_moving_from
	 * NULL. Nothing to report to the BPF scheduler then, so skip it and
	 * keep prep_move and move paired.
	 */
	if (SCX_HAS_OP(sch, cgroup_move) && p->scx.cgrp_moving_from)
		SCX_CALL_OP_TASK(sch, cgroup_move, task_rq(p),
				 p, p->scx.cgrp_moving_from,
				 tg_cgrp(task_group(p)));
	p->scx.cgrp_moving_from = NULL;
}

/*
 * Cancel the moves prepared by scx_cgroup_can_attach() for @tset. For each task
 * with cgrp_moving_from recorded, ops.cgroup_cancel_move() is invoked if
 * implemented. cgrp_moving_from is cleared for every task in @tset.
 */
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
	struct scx_sched *sch = scx_root;
	struct cgroup_subsys_state *css;
	struct task_struct *p;

	if (!scx_cgroup_enabled)
		return;

	cgroup_taskset_for_each(p, css, tset) {
		if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
		    p->scx.cgrp_moving_from)
			SCX_CALL_OP(sch, cgroup_cancel_move, NULL,
				    p, p->scx.cgrp_moving_from, css->cgroup);
		p->scx.cgrp_moving_from = NULL;
	}
}

/*
 * Set @tg's weight to @weight. ops.cgroup_set_weight() is invoked only if
 * cgroup ops are enabled, the op is implemented and the weight changes. Runs
 * under the read side of scx_cgroup_ops_rwsem.
 */
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
	struct scx_sched *sch;

	percpu_down_read(&scx_cgroup_ops_rwsem);
	sch = scx_root;

	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
	    tg->scx.weight != weight)
		SCX_CALL_OP(sch, cgroup_set_weight, NULL, tg_cgrp(tg), weight);

	tg->scx.weight = weight;

	percpu_up_read(&scx_cgroup_ops_rwsem);
}

/*
 * Set @tg's idle state to @idle. Unlike the weight and bandwidth setters,
 * ops.cgroup_set_idle() is invoked without comparing @idle against the current
 * value. Runs under the read side of scx_cgroup_ops_rwsem.
 */
void scx_group_set_idle(struct task_group *tg, bool idle)
{
	struct scx_sched *sch;

	percpu_down_read(&scx_cgroup_ops_rwsem);
	sch = scx_root;

	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
		SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle);

	/* Update the task group's idle state */
	tg->scx.idle = idle;

	percpu_up_read(&scx_cgroup_ops_rwsem);
}

/*
 * Set @tg's bandwidth parameters. ops.cgroup_set_bandwidth() is invoked only if
 * cgroup ops are enabled, the op is implemented and at least one of the three
 * values changes. Runs under the read side of scx_cgroup_ops_rwsem.
 */
void scx_group_set_bandwidth(struct task_group *tg,
			     u64 period_us, u64 quota_us, u64 burst_us)
{
	struct scx_sched *sch;

	percpu_down_read(&scx_cgroup_ops_rwsem);
	sch = scx_root;

	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
	    (tg->scx.bw_period_us != period_us ||
	     tg->scx.bw_quota_us != quota_us ||
	     tg->scx.bw_burst_us != burst_us))
		SCX_CALL_OP(sch, cgroup_set_bandwidth, NULL,
			    tg_cgrp(tg), period_us, quota_us, burst_us);

	tg->scx.bw_period_us = period_us;
	tg->scx.bw_quota_us = quota_us;
	tg->scx.bw_burst_us = burst_us;

	percpu_up_read(&scx_cgroup_ops_rwsem);
}
#endif	/* CONFIG_EXT_GROUP_SCHED */

#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
/* the root cgroup of the default hierarchy */
static struct cgroup *root_cgroup(void)
{
	return &cgrp_dfl_root.cgrp;
}

/*
 * Acquire cgroup_lock(). With CONFIG_EXT_GROUP_SCHED, the write side of
 * scx_cgroup_ops_rwsem is taken first.
 */
static void scx_cgroup_lock(void)
{
#ifdef CONFIG_EXT_GROUP_SCHED
	percpu_down_write(&scx_cgroup_ops_rwsem);
#endif
	cgroup_lock();
}

/* release the locks taken by scx_cgroup_lock() in the reverse order */
static void scx_cgroup_unlock(void)
{
	cgroup_unlock();
#ifdef CONFIG_EXT_GROUP_SCHED
	percpu_up_write(&scx_cgroup_ops_rwsem);
#endif
}
#else	/* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
/* stubs for configurations without sched_ext cgroup support */
static struct cgroup *root_cgroup(void) { return NULL; }
static void scx_cgroup_lock(void) {}
static void scx_cgroup_unlock(void) {}
#endif	/* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */

#ifdef CONFIG_EXT_SUB_SCHED
/* the cgroup recorded in @sch->cgrp */
static struct cgroup *sch_cgroup(struct scx_sched *sch)
{
	return sch->cgrp;
}

/* for each descendant of @cgrp including self, set ->scx_sched to @sch */
static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
{
	struct cgroup *pos;
	struct cgroup_subsys_state *css;

	cgroup_for_each_live_descendant_pre(pos, css, cgrp)
		rcu_assign_pointer(pos->scx_sched, sch);
}
#else	/* CONFIG_EXT_SUB_SCHED */
/* stubs for configurations without sub-scheduler support */
static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
#endif	/* CONFIG_EXT_SUB_SCHED */

/*
 * Omitted operations:
 *
 * - migrate_task_rq: Unnecessary as task to cpu mapping is
transient. 4542 * 4543 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of 4544 * their current sched_class. Call them directly from sched core instead. 4545 */ 4546 DEFINE_SCHED_CLASS(ext) = { 4547 .enqueue_task = enqueue_task_scx, 4548 .dequeue_task = dequeue_task_scx, 4549 .yield_task = yield_task_scx, 4550 .yield_to_task = yield_to_task_scx, 4551 4552 .wakeup_preempt = wakeup_preempt_scx, 4553 4554 .pick_task = pick_task_scx, 4555 4556 .put_prev_task = put_prev_task_scx, 4557 .set_next_task = set_next_task_scx, 4558 4559 .select_task_rq = select_task_rq_scx, 4560 .task_woken = task_woken_scx, 4561 .set_cpus_allowed = set_cpus_allowed_scx, 4562 4563 .rq_online = rq_online_scx, 4564 .rq_offline = rq_offline_scx, 4565 4566 .task_tick = task_tick_scx, 4567 4568 .switching_to = switching_to_scx, 4569 .switched_from = switched_from_scx, 4570 .switched_to = switched_to_scx, 4571 .reweight_task = reweight_task_scx, 4572 .prio_changed = prio_changed_scx, 4573 4574 .update_curr = update_curr_scx, 4575 4576 #ifdef CONFIG_UCLAMP_TASK 4577 .uclamp_enabled = 1, 4578 #endif 4579 }; 4580 4581 static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, 4582 struct scx_sched *sch) 4583 { 4584 s32 cpu; 4585 4586 memset(dsq, 0, sizeof(*dsq)); 4587 4588 raw_spin_lock_init(&dsq->lock); 4589 INIT_LIST_HEAD(&dsq->list); 4590 dsq->id = dsq_id; 4591 dsq->sched = sch; 4592 4593 dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu); 4594 if (!dsq->pcpu) 4595 return -ENOMEM; 4596 4597 for_each_possible_cpu(cpu) { 4598 struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); 4599 4600 pcpu->dsq = dsq; 4601 INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node); 4602 } 4603 4604 return 0; 4605 } 4606 4607 static void exit_dsq(struct scx_dispatch_q *dsq) 4608 { 4609 s32 cpu; 4610 4611 for_each_possible_cpu(cpu) { 4612 struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); 4613 struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user; 4614 struct rq *rq = cpu_rq(cpu); 
4615 4616 /* 4617 * There must have been a RCU grace period since the last 4618 * insertion and @dsq should be off the deferred list by now. 4619 */ 4620 if (WARN_ON_ONCE(!list_empty(&dru->node))) { 4621 guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); 4622 list_del_init(&dru->node); 4623 } 4624 } 4625 4626 free_percpu(dsq->pcpu); 4627 } 4628 4629 static void free_dsq_rcufn(struct rcu_head *rcu) 4630 { 4631 struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu); 4632 4633 exit_dsq(dsq); 4634 kfree(dsq); 4635 } 4636 4637 static void free_dsq_irq_workfn(struct irq_work *irq_work) 4638 { 4639 struct llist_node *to_free = llist_del_all(&dsqs_to_free); 4640 struct scx_dispatch_q *dsq, *tmp_dsq; 4641 4642 llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) 4643 call_rcu(&dsq->rcu, free_dsq_rcufn); 4644 } 4645 4646 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); 4647 4648 static void destroy_dsq(struct scx_sched *sch, u64 dsq_id) 4649 { 4650 struct scx_dispatch_q *dsq; 4651 unsigned long flags; 4652 4653 rcu_read_lock(); 4654 4655 dsq = find_user_dsq(sch, dsq_id); 4656 if (!dsq) 4657 goto out_unlock_rcu; 4658 4659 raw_spin_lock_irqsave(&dsq->lock, flags); 4660 4661 if (dsq->nr) { 4662 scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)", 4663 dsq->id, dsq->nr); 4664 goto out_unlock_dsq; 4665 } 4666 4667 if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node, 4668 dsq_hash_params)) 4669 goto out_unlock_dsq; 4670 4671 /* 4672 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from 4673 * queueing more tasks. As this function can be called from anywhere, 4674 * freeing is bounced through an irq work to avoid nesting RCU 4675 * operations inside scheduler locks. 
4676 */ 4677 dsq->id = SCX_DSQ_INVALID; 4678 if (llist_add(&dsq->free_node, &dsqs_to_free)) 4679 irq_work_queue(&free_dsq_irq_work); 4680 4681 out_unlock_dsq: 4682 raw_spin_unlock_irqrestore(&dsq->lock, flags); 4683 out_unlock_rcu: 4684 rcu_read_unlock(); 4685 } 4686 4687 #ifdef CONFIG_EXT_GROUP_SCHED 4688 static void scx_cgroup_exit(struct scx_sched *sch) 4689 { 4690 struct cgroup_subsys_state *css; 4691 4692 scx_cgroup_enabled = false; 4693 4694 /* 4695 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk 4696 * cgroups and exit all the inited ones, all online cgroups are exited. 4697 */ 4698 css_for_each_descendant_post(css, &root_task_group.css) { 4699 struct task_group *tg = css_tg(css); 4700 4701 if (!(tg->scx.flags & SCX_TG_INITED)) 4702 continue; 4703 tg->scx.flags &= ~SCX_TG_INITED; 4704 4705 if (!sch->ops.cgroup_exit) 4706 continue; 4707 4708 SCX_CALL_OP(sch, cgroup_exit, NULL, css->cgroup); 4709 } 4710 } 4711 4712 static int scx_cgroup_init(struct scx_sched *sch) 4713 { 4714 struct cgroup_subsys_state *css; 4715 int ret; 4716 4717 /* 4718 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk 4719 * cgroups and init, all online cgroups are initialized. 
4720 */ 4721 css_for_each_descendant_pre(css, &root_task_group.css) { 4722 struct task_group *tg = css_tg(css); 4723 struct scx_cgroup_init_args args = { 4724 .weight = tg->scx.weight, 4725 .bw_period_us = tg->scx.bw_period_us, 4726 .bw_quota_us = tg->scx.bw_quota_us, 4727 .bw_burst_us = tg->scx.bw_burst_us, 4728 }; 4729 4730 if ((tg->scx.flags & 4731 (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) 4732 continue; 4733 4734 if (!sch->ops.cgroup_init) { 4735 tg->scx.flags |= SCX_TG_INITED; 4736 continue; 4737 } 4738 4739 ret = SCX_CALL_OP_RET(sch, cgroup_init, NULL, 4740 css->cgroup, &args); 4741 if (ret) { 4742 scx_error(sch, "ops.cgroup_init() failed (%d)", ret); 4743 return ret; 4744 } 4745 tg->scx.flags |= SCX_TG_INITED; 4746 } 4747 4748 WARN_ON_ONCE(scx_cgroup_enabled); 4749 scx_cgroup_enabled = true; 4750 4751 return 0; 4752 } 4753 4754 #else 4755 static void scx_cgroup_exit(struct scx_sched *sch) {} 4756 static int scx_cgroup_init(struct scx_sched *sch) { return 0; } 4757 #endif 4758 4759 4760 /******************************************************************************** 4761 * Sysfs interface and ops enable/disable. 
4762 */ 4763 4764 #define SCX_ATTR(_name) \ 4765 static struct kobj_attribute scx_attr_##_name = { \ 4766 .attr = { .name = __stringify(_name), .mode = 0444 }, \ 4767 .show = scx_attr_##_name##_show, \ 4768 } 4769 4770 static ssize_t scx_attr_state_show(struct kobject *kobj, 4771 struct kobj_attribute *ka, char *buf) 4772 { 4773 return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]); 4774 } 4775 SCX_ATTR(state); 4776 4777 static ssize_t scx_attr_switch_all_show(struct kobject *kobj, 4778 struct kobj_attribute *ka, char *buf) 4779 { 4780 return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); 4781 } 4782 SCX_ATTR(switch_all); 4783 4784 static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, 4785 struct kobj_attribute *ka, char *buf) 4786 { 4787 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); 4788 } 4789 SCX_ATTR(nr_rejected); 4790 4791 static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, 4792 struct kobj_attribute *ka, char *buf) 4793 { 4794 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); 4795 } 4796 SCX_ATTR(hotplug_seq); 4797 4798 static ssize_t scx_attr_enable_seq_show(struct kobject *kobj, 4799 struct kobj_attribute *ka, char *buf) 4800 { 4801 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq)); 4802 } 4803 SCX_ATTR(enable_seq); 4804 4805 static struct attribute *scx_global_attrs[] = { 4806 &scx_attr_state.attr, 4807 &scx_attr_switch_all.attr, 4808 &scx_attr_nr_rejected.attr, 4809 &scx_attr_hotplug_seq.attr, 4810 &scx_attr_enable_seq.attr, 4811 NULL, 4812 }; 4813 4814 static const struct attribute_group scx_global_attr_group = { 4815 .attrs = scx_global_attrs, 4816 }; 4817 4818 static void free_pnode(struct scx_sched_pnode *pnode); 4819 static void free_exit_info(struct scx_exit_info *ei); 4820 4821 static void scx_sched_free_rcu_work(struct work_struct *work) 4822 { 4823 struct rcu_work *rcu_work = to_rcu_work(work); 4824 struct scx_sched *sch = 
container_of(rcu_work, struct scx_sched, rcu_work); 4825 struct rhashtable_iter rht_iter; 4826 struct scx_dispatch_q *dsq; 4827 int cpu, node; 4828 4829 irq_work_sync(&sch->disable_irq_work); 4830 kthread_destroy_worker(sch->helper); 4831 timer_shutdown_sync(&sch->bypass_lb_timer); 4832 free_cpumask_var(sch->bypass_lb_donee_cpumask); 4833 free_cpumask_var(sch->bypass_lb_resched_cpumask); 4834 4835 #ifdef CONFIG_EXT_SUB_SCHED 4836 kfree(sch->cgrp_path); 4837 if (sch_cgroup(sch)) 4838 cgroup_put(sch_cgroup(sch)); 4839 if (sch->sub_kset) 4840 kobject_put(&sch->sub_kset->kobj); 4841 #endif /* CONFIG_EXT_SUB_SCHED */ 4842 4843 for_each_possible_cpu(cpu) { 4844 struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); 4845 4846 /* 4847 * $sch would have entered bypass mode before the RCU grace 4848 * period. As that blocks new deferrals, all 4849 * deferred_reenq_local_node's must be off-list by now. 4850 */ 4851 WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node)); 4852 4853 exit_dsq(bypass_dsq(sch, cpu)); 4854 } 4855 4856 free_percpu(sch->pcpu); 4857 4858 for_each_node_state(node, N_POSSIBLE) 4859 free_pnode(sch->pnode[node]); 4860 kfree(sch->pnode); 4861 4862 rhashtable_walk_enter(&sch->dsq_hash, &rht_iter); 4863 do { 4864 rhashtable_walk_start(&rht_iter); 4865 4866 while (!IS_ERR_OR_NULL((dsq = rhashtable_walk_next(&rht_iter)))) 4867 destroy_dsq(sch, dsq->id); 4868 4869 rhashtable_walk_stop(&rht_iter); 4870 } while (dsq == ERR_PTR(-EAGAIN)); 4871 rhashtable_walk_exit(&rht_iter); 4872 4873 rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); 4874 free_exit_info(sch->exit_info); 4875 kfree(sch); 4876 } 4877 4878 static void scx_kobj_release(struct kobject *kobj) 4879 { 4880 struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 4881 4882 INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work); 4883 queue_rcu_work(system_dfl_wq, &sch->rcu_work); 4884 } 4885 4886 static ssize_t scx_attr_ops_show(struct kobject *kobj, 4887 struct kobj_attribute 
*ka, char *buf) 4888 { 4889 struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 4890 4891 return sysfs_emit(buf, "%s\n", sch->ops.name); 4892 } 4893 SCX_ATTR(ops); 4894 4895 #define scx_attr_event_show(buf, at, events, kind) ({ \ 4896 sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \ 4897 }) 4898 4899 static ssize_t scx_attr_events_show(struct kobject *kobj, 4900 struct kobj_attribute *ka, char *buf) 4901 { 4902 struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 4903 struct scx_event_stats events; 4904 int at = 0; 4905 4906 scx_read_events(sch, &events); 4907 at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); 4908 at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 4909 at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); 4910 at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); 4911 at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 4912 at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_IMMED); 4913 at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_LOCAL_REPEAT); 4914 at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL); 4915 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); 4916 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); 4917 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); 4918 at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED); 4919 at += scx_attr_event_show(buf, at, &events, SCX_EV_SUB_BYPASS_DISPATCH); 4920 return at; 4921 } 4922 SCX_ATTR(events); 4923 4924 static struct attribute *scx_sched_attrs[] = { 4925 &scx_attr_ops.attr, 4926 &scx_attr_events.attr, 4927 NULL, 4928 }; 4929 ATTRIBUTE_GROUPS(scx_sched); 4930 4931 static const struct kobj_type scx_ktype = { 4932 .release = scx_kobj_release, 4933 .sysfs_ops = &kobj_sysfs_ops, 4934 .default_groups = scx_sched_groups, 4935 }; 4936 4937 
static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) 4938 { 4939 const struct scx_sched *sch; 4940 4941 /* 4942 * scx_uevent() can be reached by both scx_sched kobjects (scx_ktype) 4943 * and sub-scheduler kset kobjects (kset_ktype) through the parent 4944 * chain walk. Filter out the latter to avoid invalid casts. 4945 */ 4946 if (kobj->ktype != &scx_ktype) 4947 return 0; 4948 4949 sch = container_of(kobj, struct scx_sched, kobj); 4950 4951 return add_uevent_var(env, "SCXOPS=%s", sch->ops.name); 4952 } 4953 4954 static const struct kset_uevent_ops scx_uevent_ops = { 4955 .uevent = scx_uevent, 4956 }; 4957 4958 /* 4959 * Used by sched_fork() and __setscheduler_prio() to pick the matching 4960 * sched_class. dl/rt are already handled. 4961 */ 4962 bool task_should_scx(int policy) 4963 { 4964 /* if disabled, nothing should be on it */ 4965 if (!scx_enabled()) 4966 return false; 4967 4968 /* scx is taking over all SCHED_OTHER and SCHED_EXT tasks */ 4969 if (READ_ONCE(scx_switching_all)) 4970 return true; 4971 4972 /* 4973 * scx is tearing down - keep new SCHED_EXT tasks out. 4974 * 4975 * Must come after scx_switching_all test, which serves as a proxy 4976 * for __scx_switched_all. While __scx_switched_all is set, we must 4977 * return true via the branch above: a fork routed to fair would 4978 * stall because next_active_class() skips fair. 4979 * 4980 * This can develop into a deadlock - scx holds scx_enable_mutex across 4981 * kthread_create() in scx_alloc_and_add_sched(); if the new kthread is 4982 * the stalled task, the disable path can never grab the mutex to clear 4983 * scx_switching_all. 
4984 */ 4985 if (unlikely(scx_enable_state() == SCX_DISABLING)) 4986 return false; 4987 4988 return policy == SCHED_EXT; 4989 } 4990 4991 bool scx_allow_ttwu_queue(const struct task_struct *p) 4992 { 4993 struct scx_sched *sch; 4994 4995 if (!scx_enabled()) 4996 return true; 4997 4998 sch = scx_task_sched(p); 4999 if (unlikely(!sch)) 5000 return true; 5001 5002 if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) 5003 return true; 5004 5005 if (unlikely(p->sched_class != &ext_sched_class)) 5006 return true; 5007 5008 return false; 5009 } 5010 5011 /** 5012 * handle_lockup - sched_ext common lockup handler 5013 * @fmt: format string 5014 * 5015 * Called on system stall or lockup condition and initiates abort of sched_ext 5016 * if enabled, which may resolve the reported lockup. 5017 * 5018 * Returns %true if sched_ext is enabled and abort was initiated, which may 5019 * resolve the lockup. %false if sched_ext is not enabled or abort was already 5020 * initiated by someone else. 5021 */ 5022 static __printf(1, 2) bool handle_lockup(const char *fmt, ...) 5023 { 5024 struct scx_sched *sch; 5025 va_list args; 5026 bool ret; 5027 5028 guard(rcu)(); 5029 5030 sch = rcu_dereference(scx_root); 5031 if (unlikely(!sch)) 5032 return false; 5033 5034 switch (scx_enable_state()) { 5035 case SCX_ENABLING: 5036 case SCX_ENABLED: 5037 va_start(args, fmt); 5038 ret = scx_verror(sch, fmt, args); 5039 va_end(args); 5040 return ret; 5041 default: 5042 return false; 5043 } 5044 } 5045 5046 /** 5047 * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler 5048 * 5049 * While there are various reasons why RCU CPU stalls can occur on a system 5050 * that may not be caused by the current BPF scheduler, try kicking out the 5051 * current scheduler in an attempt to recover the system to a good state before 5052 * issuing panics. 5053 * 5054 * Returns %true if sched_ext is enabled and abort was initiated, which may 5055 * resolve the reported RCU stall. 
%false if sched_ext is not enabled or someone 5056 * else already initiated abort. 5057 */ 5058 bool scx_rcu_cpu_stall(void) 5059 { 5060 return handle_lockup("RCU CPU stall detected!"); 5061 } 5062 5063 /** 5064 * scx_softlockup - sched_ext softlockup handler 5065 * @dur_s: number of seconds of CPU stuck due to soft lockup 5066 * 5067 * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can 5068 * live-lock the system by making many CPUs target the same DSQ to the point 5069 * where soft-lockup detection triggers. This function is called from 5070 * soft-lockup watchdog when the triggering point is close and tries to unjam 5071 * the system and aborting the BPF scheduler. 5072 */ 5073 void scx_softlockup(u32 dur_s) 5074 { 5075 if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s)) 5076 return; 5077 5078 printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n", 5079 smp_processor_id(), dur_s); 5080 } 5081 5082 /* 5083 * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(), 5084 * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing 5085 * it from NMI context can lead to deadlocks. Defer via irq_work; the 5086 * disable path runs off irq_work anyway. 5087 */ 5088 static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1); 5089 5090 static void scx_hardlockup_irq_workfn(struct irq_work *work) 5091 { 5092 int cpu = atomic_xchg(&scx_hardlockup_cpu, -1); 5093 5094 if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu)) 5095 printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", 5096 cpu); 5097 } 5098 5099 static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn); 5100 5101 /** 5102 * scx_hardlockup - sched_ext hardlockup handler 5103 * 5104 * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting 5105 * numerous affinitized tasks in a single queue and directing all CPUs at it. 
/*
 * Offload tasks from @donor's bypass DSQ onto the bypass DSQs of the CPUs in
 * @donee_mask.
 *
 * @sch: scheduler whose bypass DSQs are being balanced
 * @donor: CPU whose bypass DSQ is over target
 * @donee_mask: candidate receiving CPUs; a CPU is cleared from the mask once
 *	its bypass DSQ reaches @nr_donee_target
 * @resched_mask: every CPU that received at least one task is set here so the
 *	caller can kick it
 * @nr_donor_target: stop moving tasks once @donor's DSQ is at or below this
 * @nr_donee_target: per-donee fill level
 *
 * Tasks are moved one by one while holding @donor's rq lock and its bypass DSQ
 * lock. After every %SCX_BYPASS_LB_BATCH moved tasks both locks are dropped
 * briefly; an on-stack cursor node kept in the DSQ list marks where to resume.
 *
 * Returns the number of tasks moved.
 */
static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor,
			 struct cpumask *donee_mask, struct cpumask *resched_mask,
			 u32 nr_donor_target, u32 nr_donee_target)
{
	struct rq *donor_rq = cpu_rq(donor);
	struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor);
	struct task_struct *p, *n;
	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0);
	s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
	u32 nr_balanced = 0, min_delta_us;

	/*
	 * All we want to guarantee is reasonable forward progress. No reason to
	 * fine tune. Assuming every task on @donor_dsq runs their full slice,
	 * consider offloading iff the total queued duration is over the
	 * threshold.
	 *
	 * NOTE(review): @delta is s32 while @min_delta_us is u32. If the
	 * DIV_ROUND_UP() result is unsigned, a negative @delta (possible as
	 * ->nr is read locklessly) is converted to a large unsigned value and
	 * does not take the early return; the loop below would then stop on
	 * its first ->nr check. Depends on the type of scx_slice_bypass_us,
	 * which isn't visible here - confirm.
	 */
	min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV;
	if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us)))
		return 0;

	raw_spin_rq_lock_irq(donor_rq);
	raw_spin_lock(&donor_dsq->lock);
	/* park the cursor at the head of the list; iteration starts after it */
	list_add(&cursor.node, &donor_dsq->list);
resume:
	/*
	 * Pretend that the cursor is embedded in a task_struct so that
	 * nldsq_next_task() can be asked for the first task following it. The
	 * resulting pointer is only used as an iteration position.
	 */
	n = container_of(&cursor, struct task_struct, scx.dsq_list);
	n = nldsq_next_task(donor_dsq, n, false);

	while ((p = n)) {
		struct scx_dispatch_q *donee_dsq;
		int donee;

		/* fetch the successor first as @p may leave @donor_dsq below */
		n = nldsq_next_task(donor_dsq, n, false);

		if (donor_dsq->nr <= nr_donor_target)
			break;

		if (cpumask_empty(donee_mask))
			break;

		/*
		 * If an earlier pass placed @p on @donor_dsq from a different
		 * CPU and the donee hasn't consumed it yet, @p is still on the
		 * previous CPU and task_rq(@p) != @donor_rq. @p can't be moved
		 * without its rq locked. Skip.
		 */
		if (task_rq(p) != donor_rq)
			continue;

		/* no donee left that @p is allowed to run on */
		donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
		if (donee >= nr_cpu_ids)
			continue;

		donee_dsq = bypass_dsq(sch, donee);

		/*
		 * $p's rq is not locked but $p's DSQ lock protects its
		 * scheduling properties making this test safe.
		 */
		if (!task_can_run_on_remote_rq(sch, p, cpu_rq(donee), false))
			continue;

		/*
		 * Moving $p from one non-local DSQ to another. The source rq
		 * and DSQ are already locked. Do an abbreviated dequeue and
		 * then perform enqueue without unlocking $donor_dsq.
		 *
		 * We don't want to drop and reacquire the lock on each
		 * iteration as @donor_dsq can be very long and potentially
		 * highly contended. Donee DSQs are less likely to be contended.
		 * The nested locking is safe as only this LB moves tasks
		 * between bypass DSQs.
		 */
		dispatch_dequeue_locked(p, donor_dsq);
		dispatch_enqueue(sch, cpu_rq(donee), donee_dsq, p, SCX_ENQ_NESTED);

		/*
		 * $donee might have been idle and need to be woken up. No need
		 * to be clever. Kick every CPU that receives tasks.
		 */
		cpumask_set_cpu(donee, resched_mask);

		/* $donee is full, stop considering it */
		if (READ_ONCE(donee_dsq->nr) >= nr_donee_target)
			cpumask_clear_cpu(donee, donee_mask);

		nr_balanced++;
		/*
		 * End of a batch with more tasks to go: move the cursor right
		 * in front of the next task, release both locks for a moment
		 * and then resume from the cursor.
		 */
		if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) {
			list_move_tail(&cursor.node, &n->scx.dsq_list.node);
			raw_spin_unlock(&donor_dsq->lock);
			raw_spin_rq_unlock_irq(donor_rq);
			cpu_relax();
			raw_spin_rq_lock_irq(donor_rq);
			raw_spin_lock(&donor_dsq->lock);
			goto resume;
		}
	}

	/* the cursor lives on this stack frame and must leave the list */
	list_del_init(&cursor.node);
	raw_spin_unlock(&donor_dsq->lock);
	raw_spin_rq_unlock_irq(donor_rq);

	return nr_balanced;
}
5258 */ 5259 nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); 5260 nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); 5261 5262 cpumask_clear(donee_mask); 5263 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5264 if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target) 5265 cpumask_set_cpu(cpu, donee_mask); 5266 } 5267 5268 /* iterate !donee CPUs and see if they should be offloaded */ 5269 cpumask_clear(resched_mask); 5270 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5271 if (cpumask_empty(donee_mask)) 5272 break; 5273 if (cpumask_test_cpu(cpu, donee_mask)) 5274 continue; 5275 if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target) 5276 continue; 5277 5278 nr_balanced += bypass_lb_cpu(sch, cpu, donee_mask, resched_mask, 5279 nr_donor_target, nr_target); 5280 } 5281 5282 for_each_cpu(cpu, resched_mask) 5283 resched_cpu(cpu); 5284 5285 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5286 u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr); 5287 5288 after_min = min(nr, after_min); 5289 after_max = max(nr, after_max); 5290 5291 } 5292 5293 trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced, 5294 before_min, before_max, after_min, after_max); 5295 } 5296 5297 /* 5298 * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine 5299 * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some 5300 * bypass DSQs can be overloaded. If there are enough tasks to saturate other 5301 * lightly loaded CPUs, such imbalance can lead to very high execution latency 5302 * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such 5303 * outcomes, a simple load balancing mechanism is implemented by the following 5304 * timer which runs periodically while bypass mode is in effect. 
5305 */ 5306 static void scx_bypass_lb_timerfn(struct timer_list *timer) 5307 { 5308 struct scx_sched *sch = container_of(timer, struct scx_sched, bypass_lb_timer); 5309 int node; 5310 u32 intv_us; 5311 5312 if (!bypass_dsp_enabled(sch)) 5313 return; 5314 5315 for_each_node_with_cpus(node) 5316 bypass_lb_node(sch, node); 5317 5318 intv_us = READ_ONCE(scx_bypass_lb_intv_us); 5319 if (intv_us) 5320 mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); 5321 } 5322 5323 static bool inc_bypass_depth(struct scx_sched *sch) 5324 { 5325 lockdep_assert_held(&scx_bypass_lock); 5326 5327 WARN_ON_ONCE(sch->bypass_depth < 0); 5328 WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1); 5329 if (sch->bypass_depth != 1) 5330 return false; 5331 5332 WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC); 5333 sch->bypass_timestamp = ktime_get_ns(); 5334 scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); 5335 return true; 5336 } 5337 5338 static bool dec_bypass_depth(struct scx_sched *sch) 5339 { 5340 lockdep_assert_held(&scx_bypass_lock); 5341 5342 WARN_ON_ONCE(sch->bypass_depth < 1); 5343 WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1); 5344 if (sch->bypass_depth != 0) 5345 return false; 5346 5347 WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL); 5348 scx_add_event(sch, SCX_EV_BYPASS_DURATION, 5349 ktime_get_ns() - sch->bypass_timestamp); 5350 return true; 5351 } 5352 5353 static void enable_bypass_dsp(struct scx_sched *sch) 5354 { 5355 struct scx_sched *host = scx_parent(sch) ?: sch; 5356 u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us); 5357 s32 ret; 5358 5359 /* 5360 * @sch->bypass_depth transitioning from 0 to 1 triggers enabling. 5361 * Shouldn't stagger. 5362 */ 5363 if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim))) 5364 return; 5365 5366 /* 5367 * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of 5368 * the nearest non-bypassing ancestor or root. 
As enable_bypass_dsp() is 5369 * called iff @sch is not already bypassed due to an ancestor bypassing, 5370 * we can assume that the parent is not bypassing and thus will be the 5371 * host of the bypass DSQs. 5372 * 5373 * While the situation may change in the future, the following 5374 * guarantees that the nearest non-bypassing ancestor or root has bypass 5375 * dispatch enabled while a descendant is bypassing, which is all that's 5376 * required. 5377 * 5378 * bypass_dsp_enabled() test is used to determine whether to enter the 5379 * bypass dispatch handling path from both bypassing and hosting scheds. 5380 * Bump enable depth on both @sch and bypass dispatch host. 5381 */ 5382 ret = atomic_inc_return(&sch->bypass_dsp_enable_depth); 5383 WARN_ON_ONCE(ret <= 0); 5384 5385 if (host != sch) { 5386 ret = atomic_inc_return(&host->bypass_dsp_enable_depth); 5387 WARN_ON_ONCE(ret <= 0); 5388 } 5389 5390 /* 5391 * The LB timer will stop running if bypass dispatch is disabled. Start 5392 * after enabling bypass dispatch. 5393 */ 5394 if (intv_us && !timer_pending(&host->bypass_lb_timer)) 5395 mod_timer(&host->bypass_lb_timer, 5396 jiffies + usecs_to_jiffies(intv_us)); 5397 } 5398 5399 /* may be called without holding scx_bypass_lock */ 5400 static void disable_bypass_dsp(struct scx_sched *sch) 5401 { 5402 s32 ret; 5403 5404 if (!test_and_clear_bit(0, &sch->bypass_dsp_claim)) 5405 return; 5406 5407 ret = atomic_dec_return(&sch->bypass_dsp_enable_depth); 5408 WARN_ON_ONCE(ret < 0); 5409 5410 if (scx_parent(sch)) { 5411 ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth); 5412 WARN_ON_ONCE(ret < 0); 5413 } 5414 } 5415 5416 /** 5417 * scx_bypass - [Un]bypass scx_ops and guarantee forward progress 5418 * @sch: sched to bypass 5419 * @bypass: true for bypass, false for unbypass 5420 * 5421 * Bypassing guarantees that all runnable tasks make forward progress without 5422 * trusting the BPF scheduler. 
We can't grab any mutexes or rwsems as they might 5423 * be held by tasks that the BPF scheduler is forgetting to run, which 5424 * unfortunately also excludes toggling the static branches. 5425 * 5426 * Let's work around by overriding a couple ops and modifying behaviors based on 5427 * the DISABLING state and then cycling the queued tasks through dequeue/enqueue 5428 * to force global FIFO scheduling. 5429 * 5430 * - ops.select_cpu() is ignored and the default select_cpu() is used. 5431 * 5432 * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. 5433 * %SCX_OPS_ENQ_LAST is also ignored. 5434 * 5435 * - ops.dispatch() is ignored. 5436 * 5437 * - balance_one() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice 5438 * can't be trusted. Whenever a tick triggers, the running task is rotated to 5439 * the tail of the queue with core_sched_at touched. 5440 * 5441 * - pick_next_task() suppresses zero slice warning. 5442 * 5443 * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM 5444 * operations. 5445 * 5446 * - scx_prio_less() reverts to the default core_sched_at order. 5447 */ 5448 static void scx_bypass(struct scx_sched *sch, bool bypass) 5449 { 5450 struct scx_sched *pos; 5451 unsigned long flags; 5452 int cpu; 5453 5454 raw_spin_lock_irqsave(&scx_bypass_lock, flags); 5455 5456 if (bypass) { 5457 if (!inc_bypass_depth(sch)) 5458 goto unlock; 5459 5460 enable_bypass_dsp(sch); 5461 } else { 5462 if (!dec_bypass_depth(sch)) 5463 goto unlock; 5464 } 5465 5466 /* 5467 * Bypass state is propagated to all descendants - an scx_sched bypasses 5468 * if itself or any of its ancestors are in bypass mode. 5469 */ 5470 raw_spin_lock(&scx_sched_lock); 5471 scx_for_each_descendant_pre(pos, sch) { 5472 if (pos == sch) 5473 continue; 5474 if (bypass) 5475 inc_bypass_depth(pos); 5476 else 5477 dec_bypass_depth(pos); 5478 } 5479 raw_spin_unlock(&scx_sched_lock); 5480 5481 /* 5482 * No task property is changing. 
We just need to make sure all currently 5483 * queued tasks are re-queued according to the new scx_bypassing() 5484 * state. As an optimization, walk each rq's runnable_list instead of 5485 * the scx_tasks list. 5486 * 5487 * This function can't trust the scheduler and thus can't use 5488 * cpus_read_lock(). Walk all possible CPUs instead of online. 5489 */ 5490 for_each_possible_cpu(cpu) { 5491 struct rq *rq = cpu_rq(cpu); 5492 struct task_struct *p, *n; 5493 5494 raw_spin_rq_lock(rq); 5495 raw_spin_lock(&scx_sched_lock); 5496 5497 scx_for_each_descendant_pre(pos, sch) { 5498 struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu); 5499 5500 if (pos->bypass_depth) 5501 pcpu->flags |= SCX_SCHED_PCPU_BYPASSING; 5502 else 5503 pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING; 5504 } 5505 5506 raw_spin_unlock(&scx_sched_lock); 5507 5508 /* 5509 * We need to guarantee that no tasks are on the BPF scheduler 5510 * while bypassing. Either we see enabled or the enable path 5511 * sees scx_bypassing() before moving tasks to SCX. 5512 */ 5513 if (!scx_enabled()) { 5514 raw_spin_rq_unlock(rq); 5515 continue; 5516 } 5517 5518 /* 5519 * The use of list_for_each_entry_safe_reverse() is required 5520 * because each task is going to be removed from and added back 5521 * to the runnable_list during iteration. Because they're added 5522 * to the tail of the list, safe reverse iteration can still 5523 * visit all nodes. 
5524 */ 5525 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, 5526 scx.runnable_node) { 5527 if (!scx_is_descendant(scx_task_sched(p), sch)) 5528 continue; 5529 5530 /* cycling deq/enq is enough, see the function comment */ 5531 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 5532 /* nothing */ ; 5533 } 5534 } 5535 5536 /* resched to restore ticks and idle state */ 5537 if (cpu_online(cpu) || cpu == smp_processor_id()) 5538 resched_curr(rq); 5539 5540 raw_spin_rq_unlock(rq); 5541 } 5542 5543 /* disarming must come after moving all tasks out of the bypass DSQs */ 5544 if (!bypass) 5545 disable_bypass_dsp(sch); 5546 unlock: 5547 raw_spin_unlock_irqrestore(&scx_bypass_lock, flags); 5548 } 5549 5550 static void free_exit_info(struct scx_exit_info *ei) 5551 { 5552 kvfree(ei->dump); 5553 kfree(ei->msg); 5554 kfree(ei->bt); 5555 kfree(ei); 5556 } 5557 5558 static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) 5559 { 5560 struct scx_exit_info *ei; 5561 5562 ei = kzalloc_obj(*ei); 5563 if (!ei) 5564 return NULL; 5565 5566 ei->bt = kzalloc_objs(ei->bt[0], SCX_EXIT_BT_LEN); 5567 ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); 5568 ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); 5569 5570 if (!ei->bt || !ei->msg || !ei->dump) { 5571 free_exit_info(ei); 5572 return NULL; 5573 } 5574 5575 return ei; 5576 } 5577 5578 static const char *scx_exit_reason(enum scx_exit_kind kind) 5579 { 5580 switch (kind) { 5581 case SCX_EXIT_UNREG: 5582 return "unregistered from user space"; 5583 case SCX_EXIT_UNREG_BPF: 5584 return "unregistered from BPF"; 5585 case SCX_EXIT_UNREG_KERN: 5586 return "unregistered from the main kernel"; 5587 case SCX_EXIT_SYSRQ: 5588 return "disabled by sysrq-S"; 5589 case SCX_EXIT_PARENT: 5590 return "parent exiting"; 5591 case SCX_EXIT_ERROR: 5592 return "runtime error"; 5593 case SCX_EXIT_ERROR_BPF: 5594 return "scx_bpf_error"; 5595 case SCX_EXIT_ERROR_STALL: 5596 return "runnable task stall"; 5597 default: 5598 return 
"<UNKNOWN>"; 5599 } 5600 } 5601 5602 static void free_kick_syncs(void) 5603 { 5604 int cpu; 5605 5606 for_each_possible_cpu(cpu) { 5607 struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); 5608 struct scx_kick_syncs *to_free; 5609 5610 to_free = rcu_replace_pointer(*ksyncs, NULL, true); 5611 if (to_free) 5612 kvfree_rcu(to_free, rcu); 5613 } 5614 } 5615 5616 static void refresh_watchdog(void) 5617 { 5618 struct scx_sched *sch; 5619 unsigned long intv = ULONG_MAX; 5620 5621 /* take the shortest timeout and use its half for watchdog interval */ 5622 rcu_read_lock(); 5623 list_for_each_entry_rcu(sch, &scx_sched_all, all) 5624 intv = max(min(intv, sch->watchdog_timeout / 2), 1); 5625 rcu_read_unlock(); 5626 5627 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 5628 WRITE_ONCE(scx_watchdog_interval, intv); 5629 5630 if (intv < ULONG_MAX) 5631 mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv); 5632 else 5633 cancel_delayed_work_sync(&scx_watchdog_work); 5634 } 5635 5636 static s32 scx_link_sched(struct scx_sched *sch) 5637 { 5638 const char *err_msg = ""; 5639 s32 ret = 0; 5640 5641 scoped_guard(raw_spinlock_irq, &scx_sched_lock) { 5642 #ifdef CONFIG_EXT_SUB_SCHED 5643 struct scx_sched *parent = scx_parent(sch); 5644 5645 if (parent) { 5646 /* 5647 * scx_claim_exit() propagates exit_kind transition to 5648 * its sub-scheds while holding scx_sched_lock - either 5649 * we can see the parent's non-NONE exit_kind or the 5650 * parent can shoot us down. 
5651 */ 5652 if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) { 5653 err_msg = "parent disabled"; 5654 ret = -ENOENT; 5655 break; 5656 } 5657 5658 ret = rhashtable_lookup_insert_fast(&scx_sched_hash, 5659 &sch->hash_node, scx_sched_hash_params); 5660 if (ret) { 5661 err_msg = "failed to insert into scx_sched_hash"; 5662 break; 5663 } 5664 5665 list_add_tail(&sch->sibling, &parent->children); 5666 } 5667 #endif /* CONFIG_EXT_SUB_SCHED */ 5668 5669 list_add_tail_rcu(&sch->all, &scx_sched_all); 5670 } 5671 5672 /* 5673 * scx_error() takes scx_sched_lock via scx_claim_exit(), so it must run after 5674 * the guard above is released. 5675 */ 5676 if (ret) { 5677 scx_error(sch, "%s (%d)", err_msg, ret); 5678 return ret; 5679 } 5680 5681 refresh_watchdog(); 5682 return 0; 5683 } 5684 5685 static void scx_unlink_sched(struct scx_sched *sch) 5686 { 5687 scoped_guard(raw_spinlock_irq, &scx_sched_lock) { 5688 #ifdef CONFIG_EXT_SUB_SCHED 5689 if (scx_parent(sch)) { 5690 rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node, 5691 scx_sched_hash_params); 5692 list_del_init(&sch->sibling); 5693 } 5694 #endif /* CONFIG_EXT_SUB_SCHED */ 5695 list_del_rcu(&sch->all); 5696 } 5697 5698 refresh_watchdog(); 5699 } 5700 5701 /* 5702 * Called to disable future dumps and wait for in-progress one while disabling 5703 * @sch. Once @sch becomes empty during disable, there's no point in dumping it. 5704 * This prevents calling dump ops on a dead sch. 5705 */ 5706 static void scx_disable_dump(struct scx_sched *sch) 5707 { 5708 guard(raw_spinlock_irqsave)(&scx_dump_lock); 5709 sch->dump_disabled = true; 5710 } 5711 5712 #ifdef CONFIG_EXT_SUB_SCHED 5713 static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); 5714 5715 static void drain_descendants(struct scx_sched *sch) 5716 { 5717 /* 5718 * Child scheds that finished the critical part of disabling will take 5719 * themselves off @sch->children. Wait for it to drain. 
/*
 * Fallback used while disabling sub-scheduler @sch when a task could not be
 * initialized for the parent scheduler.
 *
 * @sch: sub-scheduler being disabled
 * @failed: task for which the parent's init failed (only used for the message)
 * @fail_code: error code of the failed init
 *
 * Raises an error on the parent, puts the parent into bypass mode and then
 * moves every task in @sch's cgroup that is not yet on the parent over to it
 * without initializing the task for the parent.
 *
 * The bypass taken on the parent is not dropped in this function.
 * NOTE(review): presumably it is released by the parent's own disable path
 * that the scx_error() above triggers - confirm.
 */
static void scx_fail_parent(struct scx_sched *sch,
			    struct task_struct *failed, s32 fail_code)
{
	struct scx_sched *parent = scx_parent(sch);
	struct scx_task_iter sti;
	struct task_struct *p;

	scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
		  fail_code, failed->comm, failed->pid);

	/*
	 * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
	 * it. This may cause downstream failures on the BPF side but $parent is
	 * dying anyway.
	 */
	scx_bypass(parent, true);

	scx_task_iter_start(&sti, sch->cgrp);
	while ((p = scx_task_iter_next_locked(&sti))) {
		/* already on $parent, nothing to move */
		if (scx_task_on_sched(parent, p))
			continue;

		/* exit $p from @sch and re-home it while it is dequeued */
		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
			scx_disable_and_exit_task(sch, p);
			scx_set_task_sched(p, parent);
		}
	}
	scx_task_iter_stop(&sti);
}
5775 */ 5776 mutex_lock(&scx_enable_mutex); 5777 percpu_down_write(&scx_fork_rwsem); 5778 scx_cgroup_lock(); 5779 5780 set_cgroup_sched(sch_cgroup(sch), parent); 5781 5782 scx_task_iter_start(&sti, sch->cgrp); 5783 while ((p = scx_task_iter_next_locked(&sti))) { 5784 struct rq *rq; 5785 struct rq_flags rf; 5786 5787 /* filter out duplicate visits */ 5788 if (scx_task_on_sched(parent, p)) 5789 continue; 5790 5791 /* 5792 * By the time control reaches here, all descendant schedulers 5793 * should already have been disabled. 5794 */ 5795 WARN_ON_ONCE(!scx_task_on_sched(sch, p)); 5796 5797 /* 5798 * If $p is about to be freed, nothing prevents $sch from 5799 * unloading before $p reaches sched_ext_free(). Disable and 5800 * exit $p right away. 5801 */ 5802 if (!tryget_task_struct(p)) { 5803 scx_disable_and_exit_task(sch, p); 5804 continue; 5805 } 5806 5807 scx_task_iter_unlock(&sti); 5808 5809 /* 5810 * $p is READY or ENABLED on @sch. Initialize for $parent, 5811 * disable and exit from @sch, and then switch over to $parent. 5812 * 5813 * If a task fails to initialize for $parent, the only available 5814 * action is disabling $parent too. While this allows disabling 5815 * of a child sched to cause the parent scheduler to fail, the 5816 * failure can only originate from ops.init_task() of the 5817 * parent. A child can't directly affect the parent through its 5818 * own failures. 5819 */ 5820 ret = __scx_init_task(parent, p, false); 5821 if (ret) { 5822 scx_fail_parent(sch, p, ret); 5823 put_task_struct(p); 5824 break; 5825 } 5826 5827 rq = task_rq_lock(p, &rf); 5828 5829 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 5830 /* 5831 * sched_ext_dead() raced us between __scx_init_task() 5832 * and this rq lock and ran exit_task() on @sch (the 5833 * sched @p was on at that point), not on $parent. 5834 * $parent's just-completed init is owed an exit_task() 5835 * and we issue it here. 
5836 */ 5837 scx_sub_init_cancel_task(parent, p); 5838 task_rq_unlock(rq, p, &rf); 5839 put_task_struct(p); 5840 continue; 5841 } 5842 5843 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 5844 /* 5845 * $p is initialized for $parent and still attached to 5846 * @sch. Disable and exit for @sch, switch over to 5847 * $parent, override the state to READY to account for 5848 * $p having already been initialized, and then enable. 5849 */ 5850 scx_disable_and_exit_task(sch, p); 5851 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 5852 scx_set_task_state(p, SCX_TASK_INIT); 5853 scx_set_task_sched(p, parent); 5854 scx_set_task_state(p, SCX_TASK_READY); 5855 scx_enable_task(parent, p); 5856 } 5857 5858 task_rq_unlock(rq, p, &rf); 5859 put_task_struct(p); 5860 } 5861 scx_task_iter_stop(&sti); 5862 5863 scx_disable_dump(sch); 5864 5865 scx_cgroup_unlock(); 5866 percpu_up_write(&scx_fork_rwsem); 5867 5868 /* 5869 * All tasks are moved off of @sch but there may still be on-going 5870 * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use 5871 * the expedited version as ancestors may be waiting in bypass mode. 5872 * Also, tell the parent that there is no need to keep running bypass 5873 * DSQs for us. 5874 */ 5875 synchronize_rcu_expedited(); 5876 disable_bypass_dsp(sch); 5877 5878 scx_unlink_sched(sch); 5879 5880 mutex_unlock(&scx_enable_mutex); 5881 5882 /* 5883 * @sch is now unlinked from the parent's children list. Notify and call 5884 * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called 5885 * after unlinking and releasing all locks. See scx_claim_exit(). 
*/
	wake_up_all(&scx_unlink_waitq);

	if (parent->ops.sub_detach && sch->sub_attached) {
		struct scx_sub_detach_args sub_detach_args = {
			.ops = &sch->ops,
			.cgroup_path = sch->cgrp_path,
		};
		SCX_CALL_OP(parent, sub_detach, NULL,
			    &sub_detach_args);
	}

	if (sch->ops.exit)
		SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
	if (sch->sub_kset)
		kobject_del(&sch->sub_kset->kobj);
	kobject_del(&sch->kobj);
}
#else	/* CONFIG_EXT_SUB_SCHED */
/* Without sub-scheduler support there are no descendants to drain or disable. */
static void drain_descendants(struct scx_sched *sch) { }
static void scx_sub_disable(struct scx_sched *sch) { }
#endif	/* CONFIG_EXT_SUB_SCHED */

/*
 * Disable the root scheduler @sch.
 *
 * Enters bypass mode, waits for descendants, and moves the global enable state
 * to SCX_DISABLING. Then, under scx_enable_mutex, every task on the scx task
 * list is switched to the class returned by scx_setscheduler_class() and
 * exited through scx_disable_and_exit_task(). After that the static keys are
 * turned off, the exit is reported, ops.exit() is invoked, @sch is unlinked
 * and scx_root is cleared.
 *
 * If the previous enable state was already SCX_DISABLED, only a warning is
 * printed, the state is restored to SCX_DISABLED and bypass mode is dropped.
 */
static void scx_root_disable(struct scx_sched *sch)
{
	struct scx_exit_info *ei = sch->exit_info;
	struct scx_task_iter sti;
	struct task_struct *p;
	int cpu;

	/* guarantee forward progress and wait for descendants to be disabled */
	scx_bypass(sch, true);
	drain_descendants(sch);

	/* scx_set_enable_state() returns the previous state; act on that */
	switch (scx_set_enable_state(SCX_DISABLING)) {
	case SCX_DISABLING:
		WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
		break;
	case SCX_DISABLED:
		pr_warn("sched_ext: ops error detected without ops (%s)\n",
			sch->exit_info->msg);
		WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
		goto done;
	default:
		break;
	}

	/*
	 * Here, every runnable task is guaranteed to make forward progress and
	 * we can safely use blocking synchronization constructs. Actually
	 * disable ops.
	 */
	mutex_lock(&scx_enable_mutex);

	static_branch_disable(&__scx_switched_all);
	WRITE_ONCE(scx_switching_all, false);

	/*
	 * Shut down cgroup support before tasks so that the cgroup attach path
	 * doesn't race against scx_disable_and_exit_task().
	 */
	scx_cgroup_lock();
	scx_cgroup_exit(sch);
	scx_cgroup_unlock();

	/*
	 * The BPF scheduler is going away. All tasks including %TASK_DEAD ones
	 * must be switched out and exited synchronously.
	 */
	percpu_down_write(&scx_fork_rwsem);

	scx_init_task_enabled = false;

	scx_task_iter_start(&sti, NULL);
	while ((p = scx_task_iter_next_locked(&sti))) {
		/* the rq clock is updated explicitly below, hence NOCLOCK */
		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
		const struct sched_class *old_class = p->sched_class;
		const struct sched_class *new_class = scx_setscheduler_class(p);

		update_rq_clock(task_rq(p));

		if (old_class != new_class)
			queue_flags |= DEQUEUE_CLASS;

		scoped_guard (sched_change, p, queue_flags) {
			p->sched_class = new_class;
		}

		scx_disable_and_exit_task(scx_task_sched(p), p);
	}
	scx_task_iter_stop(&sti);

	scx_disable_dump(sch);

	scx_cgroup_lock();
	set_cgroup_sched(sch_cgroup(sch), NULL);
	scx_cgroup_unlock();

	percpu_up_write(&scx_fork_rwsem);

	/*
	 * Invalidate all the rq clocks to prevent getting outdated
	 * rq clocks from a previous scx scheduler.
	 */
	for_each_possible_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		scx_rq_clock_invalidate(rq);
	}

	/* no task is on scx, turn off all the switches and flush in-progress calls */
	static_branch_disable(&__scx_enabled);
	bitmap_zero(sch->has_op, SCX_OPI_END);
	scx_idle_disable();
	synchronize_rcu();

	/* error exits are logged at pr_err level together with message and backtrace */
	if (ei->kind >= SCX_EXIT_ERROR) {
		pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
		       sch->ops.name, ei->reason);

		if (ei->msg[0] != '\0')
			pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
#ifdef CONFIG_STACKTRACE
		stack_trace_print(ei->bt, ei->bt_len, 2);
#endif
	} else {
		pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
			sch->ops.name, ei->reason);
	}

	if (sch->ops.exit)
		SCX_CALL_OP(sch, exit, NULL, ei);

	scx_unlink_sched(sch);

	/*
	 * scx_root clearing must be inside cpus_read_lock(). See
	 * handle_hotplug().
	 */
	cpus_read_lock();
	RCU_INIT_POINTER(scx_root, NULL);
	cpus_read_unlock();

	/*
	 * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
	 * could observe an object of the same name still in the hierarchy when
	 * the next scheduler is loaded.
	 */
#ifdef CONFIG_EXT_SUB_SCHED
	if (sch->sub_kset)
		kobject_del(&sch->sub_kset->kobj);
#endif
	kobject_del(&sch->kobj);

	free_kick_syncs();

	mutex_unlock(&scx_enable_mutex);

	WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
done:
	/* pairs with the scx_bypass(sch, true) at the top */
	scx_bypass(sch, false);
}

/*
 * Claim the exit on @sch. The caller must ensure that the helper kthread work
 * is kicked before the current task can be preempted.
Once exit_kind is 6051 * claimed, scx_error() can no longer trigger, so if the current task gets 6052 * preempted and the BPF scheduler fails to schedule it back, the helper work 6053 * will never be kicked and the whole system can wedge. 6054 */ 6055 static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) 6056 { 6057 int none = SCX_EXIT_NONE; 6058 6059 lockdep_assert_preemption_disabled(); 6060 6061 if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) 6062 kind = SCX_EXIT_ERROR; 6063 6064 if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) 6065 return false; 6066 6067 /* 6068 * Some CPUs may be trapped in the dispatch paths. Set the aborting 6069 * flag to break potential live-lock scenarios, ensuring we can 6070 * successfully reach scx_bypass(). 6071 */ 6072 WRITE_ONCE(sch->aborting, true); 6073 6074 /* 6075 * Propagate exits to descendants immediately. Each has a dedicated 6076 * helper kthread and can run in parallel. While most of disabling is 6077 * serialized, running them in separate threads allows parallelizing 6078 * ops.exit(), which can take arbitrarily long prolonging bypass mode. 6079 * 6080 * To guarantee forward progress, this propagation must be in-line so 6081 * that ->aborting is synchronously asserted for all sub-scheds. The 6082 * propagation is also the interlocking point against sub-sched 6083 * attachment. See scx_link_sched(). 6084 * 6085 * This doesn't cause recursions as propagation only takes place for 6086 * non-propagation exits. 
6087 */ 6088 if (kind != SCX_EXIT_PARENT) { 6089 scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) { 6090 struct scx_sched *pos; 6091 scx_for_each_descendant_pre(pos, sch) 6092 scx_disable(pos, SCX_EXIT_PARENT); 6093 } 6094 } 6095 6096 return true; 6097 } 6098 6099 static void scx_disable_workfn(struct kthread_work *work) 6100 { 6101 struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); 6102 struct scx_exit_info *ei = sch->exit_info; 6103 int kind; 6104 6105 kind = atomic_read(&sch->exit_kind); 6106 while (true) { 6107 if (kind == SCX_EXIT_DONE) /* already disabled? */ 6108 return; 6109 WARN_ON_ONCE(kind == SCX_EXIT_NONE); 6110 if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) 6111 break; 6112 } 6113 ei->kind = kind; 6114 ei->reason = scx_exit_reason(ei->kind); 6115 6116 if (scx_parent(sch)) 6117 scx_sub_disable(sch); 6118 else 6119 scx_root_disable(sch); 6120 } 6121 6122 static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) 6123 { 6124 guard(preempt)(); 6125 if (scx_claim_exit(sch, kind)) 6126 irq_work_queue(&sch->disable_irq_work); 6127 } 6128 6129 /** 6130 * scx_flush_disable_work - flush the disable work and wait for it to finish 6131 * @sch: the scheduler 6132 * 6133 * sch->disable_work might still not queued, causing kthread_flush_work() 6134 * as a noop. Syncing the irq_work first is required to guarantee the 6135 * kthread work has been queued before waiting for it. 
6136 */ 6137 static void scx_flush_disable_work(struct scx_sched *sch) 6138 { 6139 int kind; 6140 6141 do { 6142 irq_work_sync(&sch->disable_irq_work); 6143 kthread_flush_work(&sch->disable_work); 6144 kind = atomic_read(&sch->exit_kind); 6145 } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE); 6146 } 6147 6148 static void dump_newline(struct seq_buf *s) 6149 { 6150 trace_sched_ext_dump(""); 6151 6152 /* @s may be zero sized and seq_buf triggers WARN if so */ 6153 if (s->size) 6154 seq_buf_putc(s, '\n'); 6155 } 6156 6157 static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) 6158 { 6159 va_list args; 6160 6161 #ifdef CONFIG_TRACEPOINTS 6162 if (trace_sched_ext_dump_enabled()) { 6163 /* protected by scx_dump_lock */ 6164 static char line_buf[SCX_EXIT_MSG_LEN]; 6165 6166 va_start(args, fmt); 6167 vscnprintf(line_buf, sizeof(line_buf), fmt, args); 6168 va_end(args); 6169 6170 trace_call__sched_ext_dump(line_buf); 6171 } 6172 #endif 6173 /* @s may be zero sized and seq_buf triggers WARN if so */ 6174 if (s->size) { 6175 va_start(args, fmt); 6176 seq_buf_vprintf(s, fmt, args); 6177 va_end(args); 6178 6179 seq_buf_putc(s, '\n'); 6180 } 6181 } 6182 6183 static void dump_stack_trace(struct seq_buf *s, const char *prefix, 6184 const unsigned long *bt, unsigned int len) 6185 { 6186 unsigned int i; 6187 6188 for (i = 0; i < len; i++) 6189 dump_line(s, "%s%pS", prefix, (void *)bt[i]); 6190 } 6191 6192 static void ops_dump_init(struct seq_buf *s, const char *prefix) 6193 { 6194 struct scx_dump_data *dd = &scx_dump_data; 6195 6196 lockdep_assert_irqs_disabled(); 6197 6198 dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ 6199 dd->first = true; 6200 dd->cursor = 0; 6201 dd->s = s; 6202 dd->prefix = prefix; 6203 } 6204 6205 static void ops_dump_flush(void) 6206 { 6207 struct scx_dump_data *dd = &scx_dump_data; 6208 char *line = dd->buf.line; 6209 6210 if (!dd->cursor) 6211 return; 6212 6213 /* 6214 * There's something to flush and this is the 
first line. Insert a blank 6215 * line to distinguish ops dump. 6216 */ 6217 if (dd->first) { 6218 dump_newline(dd->s); 6219 dd->first = false; 6220 } 6221 6222 /* 6223 * There may be multiple lines in $line. Scan and emit each line 6224 * separately. 6225 */ 6226 while (true) { 6227 char *end = line; 6228 char c; 6229 6230 while (*end != '\n' && *end != '\0') 6231 end++; 6232 6233 /* 6234 * If $line overflowed, it may not have newline at the end. 6235 * Always emit with a newline. 6236 */ 6237 c = *end; 6238 *end = '\0'; 6239 dump_line(dd->s, "%s%s", dd->prefix, line); 6240 if (c == '\0') 6241 break; 6242 6243 /* move to the next line */ 6244 end++; 6245 if (*end == '\0') 6246 break; 6247 line = end; 6248 } 6249 6250 dd->cursor = 0; 6251 } 6252 6253 static void ops_dump_exit(void) 6254 { 6255 ops_dump_flush(); 6256 scx_dump_data.cpu = -1; 6257 } 6258 6259 static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx, 6260 struct rq *rq, struct task_struct *p, char marker) 6261 { 6262 static unsigned long bt[SCX_EXIT_BT_LEN]; 6263 struct scx_sched *task_sch = scx_task_sched(p); 6264 const char *own_marker; 6265 char sch_id_buf[32]; 6266 char dsq_id_buf[19] = "(n/a)"; 6267 unsigned long ops_state = atomic_long_read(&p->scx.ops_state); 6268 unsigned int bt_len = 0; 6269 6270 own_marker = task_sch == sch ? 
"*" : ""; 6271 6272 if (task_sch->level == 0) 6273 scnprintf(sch_id_buf, sizeof(sch_id_buf), "root"); 6274 else 6275 scnprintf(sch_id_buf, sizeof(sch_id_buf), "sub%d-%llu", 6276 task_sch->level, task_sch->ops.sub_cgroup_id); 6277 6278 if (p->scx.dsq) 6279 scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", 6280 (unsigned long long)p->scx.dsq->id); 6281 6282 dump_newline(s); 6283 dump_line(s, " %c%c %s[%d] %s%s %+ldms", 6284 marker, task_state_to_char(p), p->comm, p->pid, 6285 own_marker, sch_id_buf, 6286 jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); 6287 dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", 6288 scx_get_task_state(p) >> SCX_TASK_STATE_SHIFT, 6289 p->scx.flags & ~SCX_TASK_STATE_MASK, 6290 p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, 6291 ops_state >> SCX_OPSS_QSEQ_SHIFT); 6292 dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", 6293 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); 6294 dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", 6295 p->scx.dsq_vtime, p->scx.slice, p->scx.weight); 6296 dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr), 6297 p->migration_disabled); 6298 6299 if (SCX_HAS_OP(sch, dump_task)) { 6300 ops_dump_init(s, " "); 6301 SCX_CALL_OP(sch, dump_task, rq, dctx, p); 6302 ops_dump_exit(); 6303 } 6304 6305 #ifdef CONFIG_STACKTRACE 6306 bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); 6307 #endif 6308 if (bt_len) { 6309 dump_newline(s); 6310 dump_stack_trace(s, " ", bt, bt_len); 6311 } 6312 } 6313 6314 /* 6315 * Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless 6316 * of which scheduler they belong to. If false, only dump tasks owned by @sch. 6317 * For SysRq-D dumps, @dump_all_tasks=false since all schedulers are dumped 6318 * separately. For error dumps, @dump_all_tasks=true since only the failing 6319 * scheduler is dumped. 
*/
static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
			   size_t dump_len, bool dump_all_tasks)
{
	static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
	struct scx_dump_ctx dctx = {
		.kind = ei->kind,
		.exit_code = ei->exit_code,
		.reason = ei->reason,
		.at_ns = ktime_get_ns(),
		.at_jiffies = jiffies,
	};
	struct seq_buf s;
	struct scx_event_stats events;
	char *buf;
	int cpu;

	/* held for the whole dump; released automatically on every return */
	guard(raw_spinlock_irqsave)(&scx_dump_lock);

	if (sch->dump_disabled)
		return;

	seq_buf_init(&s, ei->dump, dump_len);

#ifdef CONFIG_EXT_SUB_SCHED
	/* identify which scheduler instance this dump belongs to */
	if (sch->level == 0)
		dump_line(&s, "%s: root", sch->ops.name);
	else
		dump_line(&s, "%s: sub%d-%llu %s",
			  sch->ops.name, sch->level, sch->ops.sub_cgroup_id,
			  sch->cgrp_path);
#endif
	if (ei->kind == SCX_EXIT_NONE) {
		dump_line(&s, "Debug dump triggered by %s", ei->reason);
	} else {
		dump_line(&s, "%s[%d] triggered exit kind %d:",
			  current->comm, current->pid, ei->kind);
		dump_line(&s, " %s (%s)", ei->reason, ei->msg);
		dump_newline(&s);
		dump_line(&s, "Backtrace:");
		dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
	}

	if (SCX_HAS_OP(sch, dump)) {
		ops_dump_init(&s, "");
		SCX_CALL_OP(sch, dump, NULL, &dctx);
		ops_dump_exit();
	}

	dump_newline(&s);
	dump_line(&s, "CPU states");
	dump_line(&s, "----------");

	for_each_possible_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct rq_flags rf;
		struct task_struct *p;
		struct seq_buf ns;
		size_t avail, used;
		bool idle;

		rq_lock_irqsave(rq, &rf);

		/* idle: nothing runnable on scx and the idle class is running */
		idle = list_empty(&rq->scx.runnable_list) &&
			rq->curr->sched_class == &idle_sched_class;

		if (idle && !SCX_HAS_OP(sch, dump_cpu))
			goto next;

		/*
		 * We don't yet know whether ops.dump_cpu() will produce output
		 * and we may want to skip the default CPU dump if it doesn't.
		 * Use a nested seq_buf to generate the standard dump so that we
		 * can decide whether to commit later.
		 */
		avail = seq_buf_get_buf(&s, &buf);
		seq_buf_init(&ns, buf, avail);

		dump_newline(&ns);
		dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu",
			  cpu, rq->scx.nr_running, rq->scx.flags,
			  rq->scx.cpu_released, rq->scx.ops_qseq,
			  rq->scx.kick_sync);
		dump_line(&ns, " curr=%s[%d] class=%ps",
			  rq->curr->comm, rq->curr->pid,
			  rq->curr->sched_class);
		if (!cpumask_empty(rq->scx.cpus_to_kick))
			dump_line(&ns, " cpus_to_kick : %*pb",
				  cpumask_pr_args(rq->scx.cpus_to_kick));
		if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
			dump_line(&ns, " idle_to_kick : %*pb",
				  cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
		if (!cpumask_empty(rq->scx.cpus_to_preempt))
			dump_line(&ns, " cpus_to_preempt: %*pb",
				  cpumask_pr_args(rq->scx.cpus_to_preempt));
		if (!cpumask_empty(rq->scx.cpus_to_wait))
			dump_line(&ns, " cpus_to_wait : %*pb",
				  cpumask_pr_args(rq->scx.cpus_to_wait));
		if (!cpumask_empty(rq->scx.cpus_to_sync))
			dump_line(&ns, " cpus_to_sync : %*pb",
				  cpumask_pr_args(rq->scx.cpus_to_sync));

		/* remember the size of the standard dump to detect ops output */
		used = seq_buf_used(&ns);
		if (SCX_HAS_OP(sch, dump_cpu)) {
			ops_dump_init(&ns, " ");
			SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle);
			ops_dump_exit();
		}

		/*
		 * If idle && nothing generated by ops.dump_cpu(), there's
		 * nothing interesting. Skip.
		 */
		if (idle && used == seq_buf_used(&ns))
			goto next;

		/*
		 * $s may already have overflowed when $ns was created. If so,
		 * calling commit on it will trigger BUG.
		 */
		if (avail) {
			seq_buf_commit(&s, seq_buf_used(&ns));
			if (seq_buf_has_overflowed(&ns))
				seq_buf_set_overflow(&s);
		}

		/* the running task is marked with '*', queued ones with ' ' */
		if (rq->curr->sched_class == &ext_sched_class &&
		    (dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
			scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*');

		list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
			if (dump_all_tasks || scx_task_on_sched(sch, p))
				scx_dump_task(sch, &s, &dctx, rq, p, ' ');
	next:
		rq_unlock_irqrestore(rq, &rf);
	}

	dump_newline(&s);
	dump_line(&s, "Event counters");
	dump_line(&s, "--------------");

	scx_read_events(sch, &events);
	scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
	scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
	scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
	scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
	scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
	scx_dump_event(s, &events, SCX_EV_REENQ_IMMED);
	scx_dump_event(s, &events, SCX_EV_REENQ_LOCAL_REPEAT);
	scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL);
	scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
	scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
	scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
	scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED);
	scx_dump_event(s, &events, SCX_EV_SUB_BYPASS_DISPATCH);

	/* on overflow, overwrite the tail of the buffer with the marker (incl. NUL) */
	if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
		memcpy(ei->dump + dump_len - sizeof(trunc_marker),
		       trunc_marker, sizeof(trunc_marker));
}

/*
 * irq_work callback queued by scx_disable() and scx_vexit(). For error exits,
 * generate the debug dump first, then hand the actual disabling over to the
 * scheduler's helper kthread.
 */
static void scx_disable_irq_workfn(struct irq_work *irq_work)
{
	struct scx_sched *sch = container_of(irq_work, struct scx_sched, disable_irq_work);
	struct scx_exit_info *ei = sch->exit_info;

	if (ei->kind >= SCX_EXIT_ERROR)
		scx_dump_state(sch, ei, sch->ops.exit_dump_len, true);

	kthread_queue_work(sch->helper, &sch->disable_work);
}

/*
 * Claim the exit on @sch with @kind and record the exit details: @exit_code,
 * the message formatted from @fmt/@args and, for error kinds with
 * CONFIG_STACKTRACE, a backtrace of the caller. The disable irq_work is then
 * queued.
 *
 * Returns %true if the exit was claimed by this call, %false if an exit had
 * already been claimed, in which case the exit info is left untouched.
 */
static bool scx_vexit(struct scx_sched *sch,
		      enum scx_exit_kind kind, s64 exit_code,
		      const char *fmt, va_list args)
{
	struct scx_exit_info *ei = sch->exit_info;

	/* scx_claim_exit() requires preemption to be off until the work is kicked */
	guard(preempt)();

	if (!scx_claim_exit(sch, kind))
		return false;

	ei->exit_code = exit_code;
#ifdef CONFIG_STACKTRACE
	if (kind >= SCX_EXIT_ERROR)
		ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
#endif
	vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);

	/*
	 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
	 * in scx_disable_workfn().
	 */
	ei->kind = kind;
	ei->reason = scx_exit_reason(ei->kind);

	irq_work_queue(&sch->disable_irq_work);
	return true;
}

/*
 * Allocate the kick sync array for every possible CPU and publish it through
 * the per-CPU scx_kick_syncs pointer.
 *
 * Returns 0 on success. On allocation failure, free_kick_syncs() is called
 * and -ENOMEM is returned.
 */
static int alloc_kick_syncs(void)
{
	int cpu;

	/*
	 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
	 * can exceed percpu allocator limits on large machines.
	 */
	for_each_possible_cpu(cpu) {
		struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu);
		struct scx_kick_syncs *new_ksyncs;

		/* a non-NULL pointer here means a previous array was not freed */
		WARN_ON_ONCE(rcu_access_pointer(*ksyncs));

		new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids),
					   GFP_KERNEL, cpu_to_node(cpu));
		if (!new_ksyncs) {
			free_kick_syncs();
			return -ENOMEM;
		}

		rcu_assign_pointer(*ksyncs, new_ksyncs);
	}

	return 0;
}

/* Tear down @pnode's global DSQ and free @pnode. NULL is a no-op. */
static void free_pnode(struct scx_sched_pnode *pnode)
{
	if (!pnode)
		return;
	exit_dsq(&pnode->global_dsq);
	kfree(pnode);
}

/*
 * Allocate a per-node structure for @sch on @node and initialize its global
 * DSQ. Returns the new structure, or NULL if the allocation or the DSQ
 * initialization fails.
 */
static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node)
{
	struct scx_sched_pnode *pnode;

	pnode = kzalloc_node(sizeof(*pnode), GFP_KERNEL, node);
	if (!pnode)
		return NULL;

	if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) {
		kfree(pnode);
		return NULL;
	}

	return pnode;
}

/*
 * Allocate and initialize a new scx_sched. @cgrp's reference is always
 * consumed whether the function succeeds or fails.
 *
 * @ops is copied into the new instance and @ops->priv is pointed at it.
 * @parent is NULL for the root scheduler. Returns the new scx_sched or an
 * ERR_PTR() on failure.
 */
static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
						 struct cgroup *cgrp,
						 struct scx_sched *parent)
{
	struct scx_sched *sch;
	s32 level = parent ? parent->level + 1 : 0;
	/* bypass_fail_cpu == nr_cpu_ids means every bypass DSQ was initialized */
	s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids;

	/* ->ancestors[] holds one entry per level up to and including @sch */
	sch = kzalloc_flex(*sch, ancestors, level + 1);
	if (!sch) {
		ret = -ENOMEM;
		goto err_put_cgrp;
	}

	sch->exit_info = alloc_exit_info(ops->exit_dump_len);
	if (!sch->exit_info) {
		ret = -ENOMEM;
		goto err_free_sch;
	}

	ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params);
	if (ret < 0)
		goto err_free_ei;

	sch->pnode = kzalloc_objs(sch->pnode[0], nr_node_ids);
	if (!sch->pnode) {
		ret = -ENOMEM;
		goto err_free_hash;
	}

	for_each_node_state(node, N_POSSIBLE) {
		sch->pnode[node] = alloc_pnode(sch, node);
		if (!sch->pnode[node]) {
			ret = -ENOMEM;
			goto err_free_pnode;
		}
	}

	sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
	sch->pcpu = __alloc_percpu(struct_size_t(struct scx_sched_pcpu,
						 dsp_ctx.buf, sch->dsp_max_batch),
				   __alignof__(struct scx_sched_pcpu));
	if (!sch->pcpu) {
		ret = -ENOMEM;
		goto err_free_pnode;
	}

	for_each_possible_cpu(cpu) {
		ret = init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
		if (ret) {
			/* tells the unwind loop where to stop calling exit_dsq() */
			bypass_fail_cpu = cpu;
			goto err_free_pcpu;
		}
	}

	for_each_possible_cpu(cpu) {
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

		pcpu->sch = sch;
		INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node);
	}

	sch->helper = kthread_run_worker(0, "sched_ext_helper");
	if (IS_ERR(sch->helper)) {
		ret = PTR_ERR(sch->helper);
		goto err_free_pcpu;
	}

	sched_set_fifo(sch->helper->task);

	/* inherit the parent's ancestor chain and append @sch itself */
	if (parent)
		memcpy(sch->ancestors, parent->ancestors,
		       level * sizeof(parent->ancestors[0]));
	sch->ancestors[level] = sch;
	sch->level = level;

	if (ops->timeout_ms)
		sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
	else
		sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;

	sch->slice_dfl = SCX_SLICE_DFL;
	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
	sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn);
	kthread_init_work(&sch->disable_work, scx_disable_workfn);
	timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);

	if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) {
		ret = -ENOMEM;
		goto err_stop_helper;
	}
	if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) {
		ret = -ENOMEM;
		goto err_free_lb_cpumask;
	}
	sch->ops = *ops;
	/* from here on, every failure path must reset @ops->priv to NULL */
	rcu_assign_pointer(ops->priv, sch);

	sch->kobj.kset = scx_kset;
	INIT_LIST_HEAD(&sch->all);

#ifdef CONFIG_EXT_SUB_SCHED
	/* temporary PATH_MAX buffer; only the exact-length copy is kept */
	char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
	if (!buf) {
		ret = -ENOMEM;
		goto err_free_lb_resched;
	}
	cgroup_path(cgrp, buf, PATH_MAX);
	sch->cgrp_path = kstrdup(buf, GFP_KERNEL);
	kfree(buf);
	if (!sch->cgrp_path) {
		ret = -ENOMEM;
		goto err_free_lb_resched;
	}

	sch->cgrp = cgrp;
	INIT_LIST_HEAD(&sch->children);
	INIT_LIST_HEAD(&sch->sibling);

	if (parent)
		ret = kobject_init_and_add(&sch->kobj, &scx_ktype,
					   &parent->sub_kset->kobj,
					   "sub-%llu", cgroup_id(cgrp));
	else
		ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");

	/*
	 * Once the kobject has been initialized, failures drop the kobject
	 * reference instead of walking the unwind labels below.
	 * NOTE(review): presumably the scx_ktype release callback frees
	 * everything set up above, including the @cgrp reference - confirm.
	 */
	if (ret < 0) {
		RCU_INIT_POINTER(ops->priv, NULL);
		kobject_put(&sch->kobj);
		return ERR_PTR(ret);
	}

	/* a "sub" kset is only created for schedulers implementing ops.sub_attach */
	if (ops->sub_attach) {
		sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj);
		if (!sch->sub_kset) {
			RCU_INIT_POINTER(ops->priv, NULL);
			kobject_put(&sch->kobj);
			return ERR_PTR(-ENOMEM);
		}
	}
#else	/* CONFIG_EXT_SUB_SCHED */
	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
	if (ret < 0) {
		RCU_INIT_POINTER(ops->priv, NULL);
		kobject_put(&sch->kobj);
		return ERR_PTR(ret);
	}
#endif	/* CONFIG_EXT_SUB_SCHED */
	return sch;

	/* unwind in reverse order of setup; each label undoes one step */
#ifdef CONFIG_EXT_SUB_SCHED
err_free_lb_resched:
	RCU_INIT_POINTER(ops->priv, NULL);
	free_cpumask_var(sch->bypass_lb_resched_cpumask);
#endif
err_free_lb_cpumask:
	free_cpumask_var(sch->bypass_lb_donee_cpumask);
err_stop_helper:
	kthread_destroy_worker(sch->helper);
err_free_pcpu:
	for_each_possible_cpu(cpu) {
		/* DSQs from the failing CPU onwards were never initialized */
		if (cpu == bypass_fail_cpu)
			break;
		exit_dsq(bypass_dsq(sch, cpu));
	}
	free_percpu(sch->pcpu);
err_free_pnode:
	/* entries not yet allocated are NULL, which free_pnode() ignores */
	for_each_node_state(node, N_POSSIBLE)
		free_pnode(sch->pnode[node]);
	kfree(sch->pnode);
err_free_hash:
	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
err_free_ei:
	free_exit_info(sch->exit_info);
err_free_sch:
	kfree(sch);
err_put_cgrp:
#ifdef CONFIG_EXT_SUB_SCHED
	cgroup_put(cgrp);
#endif
	return ERR_PTR(ret);
}

/*
 * Verify that no CPU hotplug event happened since the scheduler recorded
 * @ops->hotplug_seq. A zero @ops->hotplug_seq disables the check.
 *
 * Returns 0 if the check passes or is disabled. On mismatch, triggers a
 * restart-requesting exit on @sch and returns -EBUSY.
 */
static int check_hotplug_seq(struct scx_sched *sch,
			      const struct sched_ext_ops *ops)
{
	unsigned long long global_hotplug_seq;

	/*
	 * If a hotplug event has occurred between when a scheduler was
	 * initialized, and when we were able to attach, exit and notify user
	 * space about it.
	 */
	if (ops->hotplug_seq) {
		global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
		if (ops->hotplug_seq != global_hotplug_seq) {
			scx_exit(sch, SCX_EXIT_UNREG_KERN,
				 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
				 "expected hotplug seq %llu did not match actual %llu",
				 ops->hotplug_seq, global_hotplug_seq);
			return -EBUSY;
		}
	}

	return 0;
}

/*
 * Sanity check the flag and callback combination in @ops. Returns 0 if @ops
 * is acceptable. Otherwise raises an error on @sch and returns -EINVAL.
 * Deprecated callbacks only produce a warning.
 */
static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
{
	/*
	 * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the
	 * ops.enqueue() callback isn't implemented.
	 */
	if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
		scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
		return -EINVAL;
	}

	/*
	 * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
	 * selection policy to be enabled.
	 */
	if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
	    (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
		scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
		return -EINVAL;
	}

	if (ops->cpu_acquire || ops->cpu_release)
		pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");

	return 0;
}

/*
 * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
 * starvation. During the READY -> ENABLED task switching loop, the calling
 * thread's sched_class gets switched from fair to ext. As fair has higher
 * priority than ext, the calling thread can be indefinitely starved under
 * fair-class saturation, leading to a system hang.
 */
struct scx_enable_cmd {
	struct kthread_work	work;	/* embedded work item, see container_of() users */
	struct sched_ext_ops	*ops;	/* ops to enable */
	int			ret;	/* result written by the work function */
};

static void scx_root_enable_workfn(struct kthread_work *work)
{
	struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
	struct sched_ext_ops *ops = cmd->ops;
	struct cgroup *cgrp = root_cgroup();
	struct scx_sched *sch;
	struct scx_task_iter sti;
	struct task_struct *p;
	int i, cpu, ret;

	mutex_lock(&scx_enable_mutex);

	if (scx_enable_state() != SCX_DISABLED) {
		ret = -EBUSY;
		goto err_unlock;
	}

	/*
	 * @ops->priv binds @ops to its scx_sched instance. It is set here by
	 * scx_alloc_and_add_sched() and cleared at the tail of bpf_scx_unreg(),
	 * which runs after scx_root_disable() has dropped scx_enable_mutex.
If 6844 * it's still non-NULL here, a previous attachment on @ops has not 6845 * finished tearing down; proceeding would let the in-flight unreg's 6846 * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign. 6847 */ 6848 if (rcu_access_pointer(ops->priv)) { 6849 ret = -EBUSY; 6850 goto err_unlock; 6851 } 6852 6853 ret = alloc_kick_syncs(); 6854 if (ret) 6855 goto err_unlock; 6856 6857 #ifdef CONFIG_EXT_SUB_SCHED 6858 cgroup_get(cgrp); 6859 #endif 6860 sch = scx_alloc_and_add_sched(ops, cgrp, NULL); 6861 if (IS_ERR(sch)) { 6862 ret = PTR_ERR(sch); 6863 goto err_free_ksyncs; 6864 } 6865 6866 /* 6867 * Transition to ENABLING and clear exit info to arm the disable path. 6868 * Failure triggers full disabling from here on. 6869 */ 6870 WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); 6871 WARN_ON_ONCE(scx_root); 6872 6873 atomic_long_set(&scx_nr_rejected, 0); 6874 6875 for_each_possible_cpu(cpu) { 6876 struct rq *rq = cpu_rq(cpu); 6877 6878 rq->scx.local_dsq.sched = sch; 6879 rq->scx.cpuperf_target = SCX_CPUPERF_ONE; 6880 } 6881 6882 /* 6883 * Keep CPUs stable during enable so that the BPF scheduler can track 6884 * online CPUs by watching ->on/offline_cpu() after ->init(). 6885 */ 6886 cpus_read_lock(); 6887 6888 /* 6889 * Make the scheduler instance visible. Must be inside cpus_read_lock(). 6890 * See handle_hotplug(). 
6891 */ 6892 rcu_assign_pointer(scx_root, sch); 6893 6894 ret = scx_link_sched(sch); 6895 if (ret) { 6896 cpus_read_unlock(); 6897 goto err_disable; 6898 } 6899 6900 scx_idle_enable(ops); 6901 6902 if (sch->ops.init) { 6903 ret = SCX_CALL_OP_RET(sch, init, NULL); 6904 if (ret) { 6905 ret = ops_sanitize_err(sch, "init", ret); 6906 cpus_read_unlock(); 6907 scx_error(sch, "ops.init() failed (%d)", ret); 6908 goto err_disable; 6909 } 6910 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 6911 } 6912 6913 for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) 6914 if (((void (**)(void))ops)[i]) 6915 set_bit(i, sch->has_op); 6916 6917 ret = check_hotplug_seq(sch, ops); 6918 if (ret) { 6919 cpus_read_unlock(); 6920 goto err_disable; 6921 } 6922 scx_idle_update_selcpu_topology(ops); 6923 6924 cpus_read_unlock(); 6925 6926 ret = validate_ops(sch, ops); 6927 if (ret) 6928 goto err_disable; 6929 6930 /* 6931 * Once __scx_enabled is set, %current can be switched to SCX anytime. 6932 * This can lead to stalls as some BPF schedulers (e.g. userspace 6933 * scheduling) may not function correctly before all tasks are switched. 6934 * Init in bypass mode to guarantee forward progress. 6935 */ 6936 scx_bypass(sch, true); 6937 6938 for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) 6939 if (((void (**)(void))ops)[i]) 6940 set_bit(i, sch->has_op); 6941 6942 if (sch->ops.cpu_acquire || sch->ops.cpu_release) 6943 sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT; 6944 6945 /* 6946 * Lock out forks, cgroup on/offlining and moves before opening the 6947 * floodgate so that they don't wander into the operations prematurely. 6948 */ 6949 percpu_down_write(&scx_fork_rwsem); 6950 6951 WARN_ON_ONCE(scx_init_task_enabled); 6952 scx_init_task_enabled = true; 6953 6954 /* 6955 * Enable ops for every task. Fork is excluded by scx_fork_rwsem 6956 * preventing new tasks from being added. 
No need to exclude tasks 6957 * leaving as sched_ext_free() can handle both prepped and enabled 6958 * tasks. Prep all tasks first and then enable them with preemption 6959 * disabled. 6960 * 6961 * All cgroups should be initialized before scx_init_task() so that the 6962 * BPF scheduler can reliably track each task's cgroup membership from 6963 * scx_init_task(). Lock out cgroup on/offlining and task migrations 6964 * while tasks are being initialized so that scx_cgroup_can_attach() 6965 * never sees uninitialized tasks. 6966 */ 6967 scx_cgroup_lock(); 6968 set_cgroup_sched(sch_cgroup(sch), sch); 6969 ret = scx_cgroup_init(sch); 6970 if (ret) 6971 goto err_disable_unlock_all; 6972 6973 scx_task_iter_start(&sti, NULL); 6974 while ((p = scx_task_iter_next_locked(&sti))) { 6975 struct rq_flags rf; 6976 struct rq *rq; 6977 6978 /* 6979 * @p may already be dead, have lost all its usages counts and 6980 * be waiting for RCU grace period before being freed. @p can't 6981 * be initialized for SCX in such cases and should be ignored. 6982 */ 6983 if (!tryget_task_struct(p)) 6984 continue; 6985 6986 /* 6987 * Set %INIT_BEGIN under the iter's rq lock so that a concurrent 6988 * sched_ext_dead() does not call ops.exit_task() on @p while 6989 * ops.init_task() is running. If sched_ext_dead() runs before 6990 * this store, it has already removed @p from scx_tasks and the 6991 * iter won't visit @p; if it runs after, it observes 6992 * %INIT_BEGIN and transitions to %DEAD without calling ops, 6993 * leaving the post-init recheck below to unwind. 
6994 */ 6995 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 6996 scx_task_iter_unlock(&sti); 6997 6998 ret = __scx_init_task(sch, p, false); 6999 7000 rq = task_rq_lock(p, &rf); 7001 7002 if (unlikely(ret)) { 7003 if (scx_get_task_state(p) != SCX_TASK_DEAD) 7004 scx_set_task_state(p, SCX_TASK_NONE); 7005 task_rq_unlock(rq, p, &rf); 7006 scx_task_iter_stop(&sti); 7007 scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", 7008 ret, p->comm, p->pid); 7009 put_task_struct(p); 7010 goto err_disable_unlock_all; 7011 } 7012 7013 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 7014 /* 7015 * sched_ext_dead() observed %INIT_BEGIN and set %DEAD. 7016 * ops.exit_task() is owed to the sched __scx_init_task() 7017 * ran against; call it now. 7018 */ 7019 scx_sub_init_cancel_task(sch, p); 7020 } else { 7021 scx_set_task_state(p, SCX_TASK_INIT); 7022 scx_set_task_sched(p, sch); 7023 scx_set_task_state(p, SCX_TASK_READY); 7024 } 7025 7026 task_rq_unlock(rq, p, &rf); 7027 put_task_struct(p); 7028 } 7029 scx_task_iter_stop(&sti); 7030 scx_cgroup_unlock(); 7031 percpu_up_write(&scx_fork_rwsem); 7032 7033 /* 7034 * All tasks are READY. It's safe to turn on scx_enabled() and switch 7035 * all eligible tasks. 7036 */ 7037 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 7038 static_branch_enable(&__scx_enabled); 7039 7040 /* 7041 * We're fully committed and can't fail. The task READY -> ENABLED 7042 * transitions here are synchronized against sched_ext_free() through 7043 * scx_tasks_lock. 
7044 */ 7045 percpu_down_write(&scx_fork_rwsem); 7046 scx_task_iter_start(&sti, NULL); 7047 while ((p = scx_task_iter_next_locked(&sti))) { 7048 unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 7049 const struct sched_class *old_class = p->sched_class; 7050 const struct sched_class *new_class = scx_setscheduler_class(p); 7051 7052 if (scx_get_task_state(p) != SCX_TASK_READY) 7053 continue; 7054 7055 if (old_class != new_class) 7056 queue_flags |= DEQUEUE_CLASS; 7057 7058 scoped_guard (sched_change, p, queue_flags) { 7059 p->scx.slice = READ_ONCE(sch->slice_dfl); 7060 p->sched_class = new_class; 7061 } 7062 } 7063 scx_task_iter_stop(&sti); 7064 percpu_up_write(&scx_fork_rwsem); 7065 7066 scx_bypass(sch, false); 7067 7068 if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { 7069 WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); 7070 goto err_disable; 7071 } 7072 7073 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) 7074 static_branch_enable(&__scx_switched_all); 7075 7076 pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", 7077 sch->ops.name, scx_switched_all() ? "" : " (partial)"); 7078 kobject_uevent(&sch->kobj, KOBJ_ADD); 7079 mutex_unlock(&scx_enable_mutex); 7080 7081 atomic_long_inc(&scx_enable_seq); 7082 7083 cmd->ret = 0; 7084 return; 7085 7086 err_free_ksyncs: 7087 free_kick_syncs(); 7088 err_unlock: 7089 mutex_unlock(&scx_enable_mutex); 7090 cmd->ret = ret; 7091 return; 7092 7093 err_disable_unlock_all: 7094 scx_cgroup_unlock(); 7095 percpu_up_write(&scx_fork_rwsem); 7096 /* we'll soon enter disable path, keep bypass on */ 7097 err_disable: 7098 mutex_unlock(&scx_enable_mutex); 7099 /* 7100 * Returning an error code here would not pass all the error information 7101 * to userspace. Record errno using scx_error() for cases scx_error() 7102 * wasn't already invoked and exit indicating success so that the error 7103 * is notified through ops.exit() with all the details. 
7104 * 7105 * Flush scx_disable_work to ensure that error is reported before init 7106 * completion. sch's base reference will be put by bpf_scx_unreg(). 7107 */ 7108 scx_error(sch, "scx_root_enable() failed (%d)", ret); 7109 scx_flush_disable_work(sch); 7110 cmd->ret = 0; 7111 } 7112 7113 #ifdef CONFIG_EXT_SUB_SCHED 7114 /* verify that a scheduler can be attached to @cgrp and return the parent */ 7115 static struct scx_sched *find_parent_sched(struct cgroup *cgrp) 7116 { 7117 struct scx_sched *parent = cgrp->scx_sched; 7118 struct scx_sched *pos; 7119 7120 lockdep_assert_held(&scx_sched_lock); 7121 7122 /* can't attach twice to the same cgroup */ 7123 if (parent->cgrp == cgrp) 7124 return ERR_PTR(-EBUSY); 7125 7126 /* does $parent allow sub-scheds? */ 7127 if (!parent->ops.sub_attach) 7128 return ERR_PTR(-EOPNOTSUPP); 7129 7130 /* can't insert between $parent and its exiting children */ 7131 list_for_each_entry(pos, &parent->children, sibling) 7132 if (cgroup_is_descendant(pos->cgrp, cgrp)) 7133 return ERR_PTR(-EBUSY); 7134 7135 return parent; 7136 } 7137 7138 static bool assert_task_ready_or_enabled(struct task_struct *p) 7139 { 7140 u32 state = scx_get_task_state(p); 7141 7142 switch (state) { 7143 case SCX_TASK_READY: 7144 case SCX_TASK_ENABLED: 7145 return true; 7146 default: 7147 WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched", 7148 state, p->comm, p->pid); 7149 return false; 7150 } 7151 } 7152 7153 static void scx_sub_enable_workfn(struct kthread_work *work) 7154 { 7155 struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); 7156 struct sched_ext_ops *ops = cmd->ops; 7157 struct cgroup *cgrp; 7158 struct scx_sched *parent, *sch; 7159 struct scx_task_iter sti; 7160 struct task_struct *p; 7161 s32 i, ret; 7162 7163 mutex_lock(&scx_enable_mutex); 7164 7165 if (!scx_enabled()) { 7166 ret = -ENODEV; 7167 goto out_unlock; 7168 } 7169 7170 /* See scx_root_enable_workfn() for the @ops->priv check. 
*/ 7171 if (rcu_access_pointer(ops->priv)) { 7172 ret = -EBUSY; 7173 goto out_unlock; 7174 } 7175 7176 cgrp = cgroup_get_from_id(ops->sub_cgroup_id); 7177 if (IS_ERR(cgrp)) { 7178 ret = PTR_ERR(cgrp); 7179 goto out_unlock; 7180 } 7181 7182 raw_spin_lock_irq(&scx_sched_lock); 7183 parent = find_parent_sched(cgrp); 7184 if (IS_ERR(parent)) { 7185 raw_spin_unlock_irq(&scx_sched_lock); 7186 ret = PTR_ERR(parent); 7187 goto out_put_cgrp; 7188 } 7189 kobject_get(&parent->kobj); 7190 raw_spin_unlock_irq(&scx_sched_lock); 7191 7192 /* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */ 7193 sch = scx_alloc_and_add_sched(ops, cgrp, parent); 7194 kobject_put(&parent->kobj); 7195 if (IS_ERR(sch)) { 7196 ret = PTR_ERR(sch); 7197 goto out_unlock; 7198 } 7199 7200 ret = scx_link_sched(sch); 7201 if (ret) 7202 goto err_disable; 7203 7204 if (sch->level >= SCX_SUB_MAX_DEPTH) { 7205 scx_error(sch, "max nesting depth %d violated", 7206 SCX_SUB_MAX_DEPTH); 7207 goto err_disable; 7208 } 7209 7210 if (sch->ops.init) { 7211 ret = SCX_CALL_OP_RET(sch, init, NULL); 7212 if (ret) { 7213 ret = ops_sanitize_err(sch, "init", ret); 7214 scx_error(sch, "ops.init() failed (%d)", ret); 7215 goto err_disable; 7216 } 7217 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 7218 } 7219 7220 if (validate_ops(sch, ops)) 7221 goto err_disable; 7222 7223 struct scx_sub_attach_args sub_attach_args = { 7224 .ops = &sch->ops, 7225 .cgroup_path = sch->cgrp_path, 7226 }; 7227 7228 ret = SCX_CALL_OP_RET(parent, sub_attach, NULL, 7229 &sub_attach_args); 7230 if (ret) { 7231 ret = ops_sanitize_err(sch, "sub_attach", ret); 7232 scx_error(sch, "parent rejected (%d)", ret); 7233 goto err_disable; 7234 } 7235 sch->sub_attached = true; 7236 7237 scx_bypass(sch, true); 7238 7239 for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) 7240 if (((void (**)(void))ops)[i]) 7241 set_bit(i, sch->has_op); 7242 7243 percpu_down_write(&scx_fork_rwsem); 7244 scx_cgroup_lock(); 7245 7246 /* 7247 * Set cgroup->scx_sched's 
and check CSS_ONLINE. Either we see 7248 * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down. 7249 */ 7250 set_cgroup_sched(sch_cgroup(sch), sch); 7251 if (!(cgrp->self.flags & CSS_ONLINE)) { 7252 scx_error(sch, "cgroup is not online"); 7253 goto err_unlock_and_disable; 7254 } 7255 7256 /* 7257 * Initialize tasks for the new child $sch without exiting them for 7258 * $parent so that the tasks can always be reverted back to $parent 7259 * sched on child init failure. 7260 */ 7261 WARN_ON_ONCE(scx_enabling_sub_sched); 7262 scx_enabling_sub_sched = sch; 7263 7264 scx_task_iter_start(&sti, sch->cgrp); 7265 while ((p = scx_task_iter_next_locked(&sti))) { 7266 struct rq *rq; 7267 struct rq_flags rf; 7268 7269 /* 7270 * Task iteration may visit the same task twice when racing 7271 * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which 7272 * finished __scx_init_task() and skip if set. 7273 * 7274 * A task may exit and get freed between __scx_init_task() 7275 * completion and scx_enable_task(). In such cases, 7276 * scx_disable_and_exit_task() must exit the task for both the 7277 * parent and child scheds. 7278 */ 7279 if (p->scx.flags & SCX_TASK_SUB_INIT) 7280 continue; 7281 7282 /* see scx_root_enable() */ 7283 if (!tryget_task_struct(p)) 7284 continue; 7285 7286 if (!assert_task_ready_or_enabled(p)) { 7287 ret = -EINVAL; 7288 goto abort; 7289 } 7290 7291 scx_task_iter_unlock(&sti); 7292 7293 /* 7294 * As $p is still on $parent, it can't be transitioned to INIT. 7295 * Let's worry about task state later. Use __scx_init_task(). 7296 */ 7297 ret = __scx_init_task(sch, p, false); 7298 if (ret) 7299 goto abort; 7300 7301 rq = task_rq_lock(p, &rf); 7302 7303 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 7304 /* 7305 * sched_ext_dead() raced us between __scx_init_task() 7306 * and this rq lock and ran exit_task() on $parent (the 7307 * sched @p was on at that point), not on @sch. 
@sch's 7308 * just-completed init is owed an exit_task() and we 7309 * issue it here. 7310 */ 7311 scx_sub_init_cancel_task(sch, p); 7312 task_rq_unlock(rq, p, &rf); 7313 put_task_struct(p); 7314 continue; 7315 } 7316 7317 p->scx.flags |= SCX_TASK_SUB_INIT; 7318 task_rq_unlock(rq, p, &rf); 7319 7320 put_task_struct(p); 7321 } 7322 scx_task_iter_stop(&sti); 7323 7324 /* 7325 * All tasks are prepped. Disable/exit tasks for $parent and enable for 7326 * the new @sch. 7327 */ 7328 scx_task_iter_start(&sti, sch->cgrp); 7329 while ((p = scx_task_iter_next_locked(&sti))) { 7330 /* 7331 * Use clearing of %SCX_TASK_SUB_INIT to detect and skip 7332 * duplicate iterations. 7333 */ 7334 if (!(p->scx.flags & SCX_TASK_SUB_INIT)) 7335 continue; 7336 7337 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 7338 /* 7339 * $p must be either READY or ENABLED. If ENABLED, 7340 * __scx_disabled_and_exit_task() first disables and 7341 * makes it READY. However, after exiting $p, it will 7342 * leave $p as READY. 7343 */ 7344 assert_task_ready_or_enabled(p); 7345 __scx_disable_and_exit_task(parent, p); 7346 7347 /* 7348 * $p is now only initialized for @sch and READY, which 7349 * is what we want. Assign it to @sch and enable. 7350 */ 7351 scx_set_task_sched(p, sch); 7352 scx_enable_task(sch, p); 7353 7354 p->scx.flags &= ~SCX_TASK_SUB_INIT; 7355 } 7356 } 7357 scx_task_iter_stop(&sti); 7358 7359 scx_enabling_sub_sched = NULL; 7360 7361 scx_cgroup_unlock(); 7362 percpu_up_write(&scx_fork_rwsem); 7363 7364 scx_bypass(sch, false); 7365 7366 pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); 7367 kobject_uevent(&sch->kobj, KOBJ_ADD); 7368 ret = 0; 7369 goto out_unlock; 7370 7371 out_put_cgrp: 7372 cgroup_put(cgrp); 7373 out_unlock: 7374 mutex_unlock(&scx_enable_mutex); 7375 cmd->ret = ret; 7376 return; 7377 7378 abort: 7379 put_task_struct(p); 7380 scx_task_iter_stop(&sti); 7381 7382 /* 7383 * Undo __scx_init_task() for tasks we marked. 
scx_enable_task() never 7384 * ran for @sch on them, so calling scx_disable_task() here would invoke 7385 * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched 7386 * must stay set until SUB_INIT is cleared from every marked task - 7387 * scx_disable_and_exit_task() reads it when a task exits concurrently. 7388 */ 7389 scx_task_iter_start(&sti, sch->cgrp); 7390 while ((p = scx_task_iter_next_locked(&sti))) { 7391 if (p->scx.flags & SCX_TASK_SUB_INIT) { 7392 scx_sub_init_cancel_task(sch, p); 7393 p->scx.flags &= ~SCX_TASK_SUB_INIT; 7394 } 7395 } 7396 scx_task_iter_stop(&sti); 7397 scx_enabling_sub_sched = NULL; 7398 err_unlock_and_disable: 7399 /* we'll soon enter disable path, keep bypass on */ 7400 scx_cgroup_unlock(); 7401 percpu_up_write(&scx_fork_rwsem); 7402 err_disable: 7403 mutex_unlock(&scx_enable_mutex); 7404 scx_flush_disable_work(sch); 7405 cmd->ret = 0; 7406 } 7407 7408 static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb, 7409 unsigned long action, void *data) 7410 { 7411 struct cgroup *cgrp = data; 7412 struct cgroup *parent = cgroup_parent(cgrp); 7413 7414 if (!cgroup_on_dfl(cgrp)) 7415 return NOTIFY_OK; 7416 7417 switch (action) { 7418 case CGROUP_LIFETIME_ONLINE: 7419 /* inherit ->scx_sched from $parent */ 7420 if (parent) 7421 rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched); 7422 break; 7423 case CGROUP_LIFETIME_OFFLINE: 7424 /* if there is a sched attached, shoot it down */ 7425 if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp) 7426 scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN, 7427 SCX_ECODE_RSN_CGROUP_OFFLINE, 7428 "cgroup %llu going offline", cgroup_id(cgrp)); 7429 break; 7430 } 7431 7432 return NOTIFY_OK; 7433 } 7434 7435 static struct notifier_block scx_cgroup_lifetime_nb = { 7436 .notifier_call = scx_cgroup_lifetime_notify, 7437 }; 7438 7439 static s32 __init scx_cgroup_lifetime_notifier_init(void) 7440 { 7441 return blocking_notifier_chain_register(&cgroup_lifetime_notifier, 7442 
&scx_cgroup_lifetime_nb); 7443 } 7444 core_initcall(scx_cgroup_lifetime_notifier_init); 7445 #endif /* CONFIG_EXT_SUB_SCHED */ 7446 7447 static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) 7448 { 7449 static struct kthread_worker *helper; 7450 static DEFINE_MUTEX(helper_mutex); 7451 struct scx_enable_cmd cmd; 7452 7453 if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) { 7454 pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); 7455 return -EINVAL; 7456 } 7457 7458 if (!READ_ONCE(helper)) { 7459 mutex_lock(&helper_mutex); 7460 if (!helper) { 7461 struct kthread_worker *w = 7462 kthread_run_worker(0, "scx_enable_helper"); 7463 if (IS_ERR_OR_NULL(w)) { 7464 mutex_unlock(&helper_mutex); 7465 return -ENOMEM; 7466 } 7467 sched_set_fifo(w->task); 7468 WRITE_ONCE(helper, w); 7469 } 7470 mutex_unlock(&helper_mutex); 7471 } 7472 7473 #ifdef CONFIG_EXT_SUB_SCHED 7474 if (ops->sub_cgroup_id > 1) 7475 kthread_init_work(&cmd.work, scx_sub_enable_workfn); 7476 else 7477 #endif /* CONFIG_EXT_SUB_SCHED */ 7478 kthread_init_work(&cmd.work, scx_root_enable_workfn); 7479 cmd.ops = ops; 7480 7481 kthread_queue_work(READ_ONCE(helper), &cmd.work); 7482 kthread_flush_work(&cmd.work); 7483 return cmd.ret; 7484 } 7485 7486 7487 /******************************************************************************** 7488 * bpf_struct_ops plumbing. 
7489 */ 7490 #include <linux/bpf_verifier.h> 7491 #include <linux/bpf.h> 7492 #include <linux/btf.h> 7493 7494 static const struct btf_type *task_struct_type; 7495 7496 static bool bpf_scx_is_valid_access(int off, int size, 7497 enum bpf_access_type type, 7498 const struct bpf_prog *prog, 7499 struct bpf_insn_access_aux *info) 7500 { 7501 if (type != BPF_READ) 7502 return false; 7503 if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) 7504 return false; 7505 if (off % size != 0) 7506 return false; 7507 7508 return btf_ctx_access(off, size, type, prog, info); 7509 } 7510 7511 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, 7512 const struct bpf_reg_state *reg, int off, 7513 int size) 7514 { 7515 const struct btf_type *t; 7516 7517 t = btf_type_by_id(reg->btf, reg->btf_id); 7518 if (t == task_struct_type) { 7519 /* 7520 * COMPAT: Will be removed in v6.23. 7521 */ 7522 if ((off >= offsetof(struct task_struct, scx.slice) && 7523 off + size <= offsetofend(struct task_struct, scx.slice)) || 7524 (off >= offsetof(struct task_struct, scx.dsq_vtime) && 7525 off + size <= offsetofend(struct task_struct, scx.dsq_vtime))) { 7526 pr_warn("sched_ext: Writing directly to p->scx.slice/dsq_vtime is deprecated, use scx_bpf_task_set_slice/dsq_vtime()"); 7527 return SCALAR_VALUE; 7528 } 7529 7530 if (off >= offsetof(struct task_struct, scx.disallow) && 7531 off + size <= offsetofend(struct task_struct, scx.disallow)) 7532 return SCALAR_VALUE; 7533 } 7534 7535 return -EACCES; 7536 } 7537 7538 static const struct bpf_verifier_ops bpf_scx_verifier_ops = { 7539 .get_func_proto = bpf_base_func_proto, 7540 .is_valid_access = bpf_scx_is_valid_access, 7541 .btf_struct_access = bpf_scx_btf_struct_access, 7542 }; 7543 7544 static int bpf_scx_init_member(const struct btf_type *t, 7545 const struct btf_member *member, 7546 void *kdata, const void *udata) 7547 { 7548 const struct sched_ext_ops *uops = udata; 7549 struct sched_ext_ops *ops = kdata; 7550 u32 moff = 
__btf_member_bit_offset(t, member) / 8; 7551 int ret; 7552 7553 switch (moff) { 7554 case offsetof(struct sched_ext_ops, dispatch_max_batch): 7555 if (*(u32 *)(udata + moff) > INT_MAX) 7556 return -E2BIG; 7557 ops->dispatch_max_batch = *(u32 *)(udata + moff); 7558 return 1; 7559 case offsetof(struct sched_ext_ops, flags): 7560 if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) 7561 return -EINVAL; 7562 ops->flags = *(u64 *)(udata + moff); 7563 return 1; 7564 case offsetof(struct sched_ext_ops, name): 7565 ret = bpf_obj_name_cpy(ops->name, uops->name, 7566 sizeof(ops->name)); 7567 if (ret < 0) 7568 return ret; 7569 if (ret == 0) 7570 return -EINVAL; 7571 return 1; 7572 case offsetof(struct sched_ext_ops, timeout_ms): 7573 if (msecs_to_jiffies(*(u32 *)(udata + moff)) > 7574 SCX_WATCHDOG_MAX_TIMEOUT) 7575 return -E2BIG; 7576 ops->timeout_ms = *(u32 *)(udata + moff); 7577 return 1; 7578 case offsetof(struct sched_ext_ops, exit_dump_len): 7579 ops->exit_dump_len = 7580 *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; 7581 return 1; 7582 case offsetof(struct sched_ext_ops, hotplug_seq): 7583 ops->hotplug_seq = *(u64 *)(udata + moff); 7584 return 1; 7585 #ifdef CONFIG_EXT_SUB_SCHED 7586 case offsetof(struct sched_ext_ops, sub_cgroup_id): 7587 ops->sub_cgroup_id = *(u64 *)(udata + moff); 7588 return 1; 7589 #endif /* CONFIG_EXT_SUB_SCHED */ 7590 } 7591 7592 return 0; 7593 } 7594 7595 #ifdef CONFIG_EXT_SUB_SCHED 7596 static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog) 7597 { 7598 struct scx_sched *sch; 7599 7600 guard(rcu)(); 7601 sch = scx_prog_sched(prog->aux); 7602 if (unlikely(!sch)) 7603 return; 7604 7605 scx_error(sch, "dispatch recursion detected"); 7606 } 7607 #endif /* CONFIG_EXT_SUB_SCHED */ 7608 7609 static int bpf_scx_check_member(const struct btf_type *t, 7610 const struct btf_member *member, 7611 const struct bpf_prog *prog) 7612 { 7613 u32 moff = __btf_member_bit_offset(t, member) / 8; 7614 7615 switch (moff) { 7616 case offsetof(struct 
sched_ext_ops, init_task): 7617 #ifdef CONFIG_EXT_GROUP_SCHED 7618 case offsetof(struct sched_ext_ops, cgroup_init): 7619 case offsetof(struct sched_ext_ops, cgroup_exit): 7620 case offsetof(struct sched_ext_ops, cgroup_prep_move): 7621 #endif 7622 case offsetof(struct sched_ext_ops, cpu_online): 7623 case offsetof(struct sched_ext_ops, cpu_offline): 7624 case offsetof(struct sched_ext_ops, init): 7625 case offsetof(struct sched_ext_ops, exit): 7626 case offsetof(struct sched_ext_ops, sub_attach): 7627 case offsetof(struct sched_ext_ops, sub_detach): 7628 break; 7629 default: 7630 if (prog->sleepable) 7631 return -EINVAL; 7632 } 7633 7634 #ifdef CONFIG_EXT_SUB_SCHED 7635 /* 7636 * Enable private stack for operations that can nest along the 7637 * hierarchy. 7638 * 7639 * XXX - Ideally, we should only do this for scheds that allow 7640 * sub-scheds and sub-scheds themselves but I don't know how to access 7641 * struct_ops from here. 7642 */ 7643 switch (moff) { 7644 case offsetof(struct sched_ext_ops, dispatch): 7645 prog->aux->priv_stack_requested = true; 7646 prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch; 7647 } 7648 #endif /* CONFIG_EXT_SUB_SCHED */ 7649 7650 return 0; 7651 } 7652 7653 static int bpf_scx_reg(void *kdata, struct bpf_link *link) 7654 { 7655 return scx_enable(kdata, link); 7656 } 7657 7658 static void bpf_scx_unreg(void *kdata, struct bpf_link *link) 7659 { 7660 struct sched_ext_ops *ops = kdata; 7661 struct scx_sched *sch = rcu_dereference_protected(ops->priv, true); 7662 7663 scx_disable(sch, SCX_EXIT_UNREG); 7664 scx_flush_disable_work(sch); 7665 RCU_INIT_POINTER(ops->priv, NULL); 7666 kobject_put(&sch->kobj); 7667 } 7668 7669 static int bpf_scx_init(struct btf *btf) 7670 { 7671 task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]); 7672 7673 return 0; 7674 } 7675 7676 static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) 7677 { 7678 /* 7679 * sched_ext does not support updating 
the actively-loaded BPF 7680 * scheduler, as registering a BPF scheduler can always fail if the 7681 * scheduler returns an error code for e.g. ops.init(), ops.init_task(), 7682 * etc. Similarly, we can always race with unregistration happening 7683 * elsewhere, such as with sysrq. 7684 */ 7685 return -EOPNOTSUPP; 7686 } 7687 7688 static int bpf_scx_validate(void *kdata) 7689 { 7690 return 0; 7691 } 7692 7693 static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } 7694 static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {} 7695 static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {} 7696 static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {} 7697 static void sched_ext_ops__tick(struct task_struct *p) {} 7698 static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {} 7699 static void sched_ext_ops__running(struct task_struct *p) {} 7700 static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {} 7701 static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {} 7702 static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; } 7703 static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; } 7704 static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {} 7705 static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {} 7706 static void sched_ext_ops__update_idle(s32 cpu, bool idle) {} 7707 static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {} 7708 static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {} 7709 static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } 7710 static void sched_ext_ops__exit_task(struct task_struct *p, 
struct scx_exit_task_args *args) {} 7711 static void sched_ext_ops__enable(struct task_struct *p) {} 7712 static void sched_ext_ops__disable(struct task_struct *p) {} 7713 #ifdef CONFIG_EXT_GROUP_SCHED 7714 static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } 7715 static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {} 7716 static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } 7717 static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 7718 static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 7719 static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} 7720 static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} 7721 static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} 7722 #endif /* CONFIG_EXT_GROUP_SCHED */ 7723 static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; } 7724 static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {} 7725 static void sched_ext_ops__cpu_online(s32 cpu) {} 7726 static void sched_ext_ops__cpu_offline(s32 cpu) {} 7727 static s32 sched_ext_ops__init(void) { return -EINVAL; } 7728 static void sched_ext_ops__exit(struct scx_exit_info *info) {} 7729 static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {} 7730 static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} 7731 static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {} 7732 7733 static struct sched_ext_ops __bpf_ops_sched_ext_ops = { 7734 .select_cpu = sched_ext_ops__select_cpu, 7735 .enqueue = sched_ext_ops__enqueue, 7736 .dequeue = sched_ext_ops__dequeue, 7737 .dispatch = sched_ext_ops__dispatch, 7738 .tick = 
sched_ext_ops__tick, 7739 .runnable = sched_ext_ops__runnable, 7740 .running = sched_ext_ops__running, 7741 .stopping = sched_ext_ops__stopping, 7742 .quiescent = sched_ext_ops__quiescent, 7743 .yield = sched_ext_ops__yield, 7744 .core_sched_before = sched_ext_ops__core_sched_before, 7745 .set_weight = sched_ext_ops__set_weight, 7746 .set_cpumask = sched_ext_ops__set_cpumask, 7747 .update_idle = sched_ext_ops__update_idle, 7748 .cpu_acquire = sched_ext_ops__cpu_acquire, 7749 .cpu_release = sched_ext_ops__cpu_release, 7750 .init_task = sched_ext_ops__init_task, 7751 .exit_task = sched_ext_ops__exit_task, 7752 .enable = sched_ext_ops__enable, 7753 .disable = sched_ext_ops__disable, 7754 #ifdef CONFIG_EXT_GROUP_SCHED 7755 .cgroup_init = sched_ext_ops__cgroup_init, 7756 .cgroup_exit = sched_ext_ops__cgroup_exit, 7757 .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, 7758 .cgroup_move = sched_ext_ops__cgroup_move, 7759 .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, 7760 .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, 7761 .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, 7762 .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, 7763 #endif 7764 .sub_attach = sched_ext_ops__sub_attach, 7765 .sub_detach = sched_ext_ops__sub_detach, 7766 .cpu_online = sched_ext_ops__cpu_online, 7767 .cpu_offline = sched_ext_ops__cpu_offline, 7768 .init = sched_ext_ops__init, 7769 .exit = sched_ext_ops__exit, 7770 .dump = sched_ext_ops__dump, 7771 .dump_cpu = sched_ext_ops__dump_cpu, 7772 .dump_task = sched_ext_ops__dump_task, 7773 }; 7774 7775 static struct bpf_struct_ops bpf_sched_ext_ops = { 7776 .verifier_ops = &bpf_scx_verifier_ops, 7777 .reg = bpf_scx_reg, 7778 .unreg = bpf_scx_unreg, 7779 .check_member = bpf_scx_check_member, 7780 .init_member = bpf_scx_init_member, 7781 .init = bpf_scx_init, 7782 .update = bpf_scx_update, 7783 .validate = bpf_scx_validate, 7784 .name = "sched_ext_ops", 7785 .owner = THIS_MODULE, 7786 .cfi_stubs = &__bpf_ops_sched_ext_ops 
7787 }; 7788 7789 7790 /******************************************************************************** 7791 * System integration and init. 7792 */ 7793 7794 static void sysrq_handle_sched_ext_reset(u8 key) 7795 { 7796 struct scx_sched *sch; 7797 7798 rcu_read_lock(); 7799 sch = rcu_dereference(scx_root); 7800 if (likely(sch)) 7801 scx_disable(sch, SCX_EXIT_SYSRQ); 7802 else 7803 pr_info("sched_ext: BPF schedulers not loaded\n"); 7804 rcu_read_unlock(); 7805 } 7806 7807 static const struct sysrq_key_op sysrq_sched_ext_reset_op = { 7808 .handler = sysrq_handle_sched_ext_reset, 7809 .help_msg = "reset-sched-ext(S)", 7810 .action_msg = "Disable sched_ext and revert all tasks to CFS", 7811 .enable_mask = SYSRQ_ENABLE_RTNICE, 7812 }; 7813 7814 static void sysrq_handle_sched_ext_dump(u8 key) 7815 { 7816 struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; 7817 struct scx_sched *sch; 7818 7819 list_for_each_entry_rcu(sch, &scx_sched_all, all) 7820 scx_dump_state(sch, &ei, 0, false); 7821 } 7822 7823 static const struct sysrq_key_op sysrq_sched_ext_dump_op = { 7824 .handler = sysrq_handle_sched_ext_dump, 7825 .help_msg = "dump-sched-ext(D)", 7826 .action_msg = "Trigger sched_ext debug dump", 7827 .enable_mask = SYSRQ_ENABLE_RTNICE, 7828 }; 7829 7830 static bool can_skip_idle_kick(struct rq *rq) 7831 { 7832 lockdep_assert_rq_held(rq); 7833 7834 /* 7835 * We can skip idle kicking if @rq is going to go through at least one 7836 * full SCX scheduling cycle before going idle. Just checking whether 7837 * curr is not idle is insufficient because we could be racing 7838 * balance_one() trying to pull the next task from a remote rq, which 7839 * may fail, and @rq may become idle afterwards. 7840 * 7841 * The race window is small and we don't and can't guarantee that @rq is 7842 * only kicked while idle anyway. Skip only when sure. 
7843 */ 7844 return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); 7845 } 7846 7847 static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) 7848 { 7849 struct rq *rq = cpu_rq(cpu); 7850 struct scx_rq *this_scx = &this_rq->scx; 7851 const struct sched_class *cur_class; 7852 bool should_wait = false; 7853 unsigned long flags; 7854 7855 raw_spin_rq_lock_irqsave(rq, flags); 7856 cur_class = rq->curr->sched_class; 7857 7858 /* 7859 * During CPU hotplug, a CPU may depend on kicking itself to make 7860 * forward progress. Allow kicking self regardless of online state. If 7861 * @cpu is running a higher class task, we have no control over @cpu. 7862 * Skip kicking. 7863 */ 7864 if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) && 7865 !sched_class_above(cur_class, &ext_sched_class)) { 7866 if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { 7867 if (cur_class == &ext_sched_class) 7868 rq->curr->scx.slice = 0; 7869 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 7870 } 7871 7872 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { 7873 if (cur_class == &ext_sched_class) { 7874 cpumask_set_cpu(cpu, this_scx->cpus_to_sync); 7875 ksyncs[cpu] = rq->scx.kick_sync; 7876 should_wait = true; 7877 } 7878 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 7879 } 7880 7881 resched_curr(rq); 7882 } else { 7883 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 7884 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 7885 } 7886 7887 raw_spin_rq_unlock_irqrestore(rq, flags); 7888 7889 return should_wait; 7890 } 7891 7892 static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) 7893 { 7894 struct rq *rq = cpu_rq(cpu); 7895 unsigned long flags; 7896 7897 raw_spin_rq_lock_irqsave(rq, flags); 7898 7899 if (!can_skip_idle_kick(rq) && 7900 (cpu_online(cpu) || cpu == cpu_of(this_rq))) 7901 resched_curr(rq); 7902 7903 raw_spin_rq_unlock_irqrestore(rq, flags); 7904 } 7905 7906 static void kick_cpus_irq_workfn(struct irq_work *irq_work) 7907 { 7908 struct 
rq *this_rq = this_rq(); 7909 struct scx_rq *this_scx = &this_rq->scx; 7910 struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs); 7911 bool should_wait = false; 7912 unsigned long *ksyncs; 7913 s32 cpu; 7914 7915 /* can race with free_kick_syncs() during scheduler disable */ 7916 if (unlikely(!ksyncs_pcpu)) 7917 return; 7918 7919 ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs; 7920 7921 for_each_cpu(cpu, this_scx->cpus_to_kick) { 7922 should_wait |= kick_one_cpu(cpu, this_rq, ksyncs); 7923 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); 7924 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 7925 } 7926 7927 for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { 7928 kick_one_cpu_if_idle(cpu, this_rq); 7929 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 7930 } 7931 7932 /* 7933 * Can't wait in hardirq — kick_sync can't advance, deadlocking if 7934 * CPUs wait for each other. Defer to kick_sync_wait_bal_cb(). 7935 */ 7936 if (should_wait) { 7937 raw_spin_rq_lock(this_rq); 7938 this_scx->kick_sync_pending = true; 7939 resched_curr(this_rq); 7940 raw_spin_rq_unlock(this_rq); 7941 } 7942 } 7943 7944 /** 7945 * print_scx_info - print out sched_ext scheduler state 7946 * @log_lvl: the log level to use when printing 7947 * @p: target task 7948 * 7949 * If a sched_ext scheduler is enabled, print the name and state of the 7950 * scheduler. If @p is on sched_ext, print further information about the task. 7951 * 7952 * This function can be safely called on any task as long as the task_struct 7953 * itself is accessible. While safe, this function isn't synchronized and may 7954 * print out mixups or garbages of limited length. 7955 */ 7956 void print_scx_info(const char *log_lvl, struct task_struct *p) 7957 { 7958 struct scx_sched *sch; 7959 enum scx_enable_state state = scx_enable_state(); 7960 const char *all = READ_ONCE(scx_switching_all) ? 
"+all" : ""; 7961 char runnable_at_buf[22] = "?"; 7962 struct sched_class *class; 7963 unsigned long runnable_at; 7964 7965 guard(rcu)(); 7966 7967 sch = scx_task_sched_rcu(p); 7968 7969 if (!sch) 7970 return; 7971 7972 /* 7973 * Carefully check if the task was running on sched_ext, and then 7974 * carefully copy the time it's been runnable, and its state. 7975 */ 7976 if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || 7977 class != &ext_sched_class) { 7978 printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name, 7979 scx_enable_state_str[state], all); 7980 return; 7981 } 7982 7983 if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, 7984 sizeof(runnable_at))) 7985 scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", 7986 jiffies_delta_msecs(runnable_at, jiffies)); 7987 7988 /* print everything onto one line to conserve console space */ 7989 printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", 7990 log_lvl, sch->ops.name, scx_enable_state_str[state], all, 7991 runnable_at_buf); 7992 } 7993 7994 static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) 7995 { 7996 struct scx_sched *sch; 7997 7998 guard(rcu)(); 7999 8000 sch = rcu_dereference(scx_root); 8001 if (!sch) 8002 return NOTIFY_OK; 8003 8004 /* 8005 * SCX schedulers often have userspace components which are sometimes 8006 * involved in critial scheduling paths. PM operations involve freezing 8007 * userspace which can lead to scheduling misbehaviors including stalls. 8008 * Let's bypass while PM operations are in progress. 
8009 */ 8010 switch (event) { 8011 case PM_HIBERNATION_PREPARE: 8012 case PM_SUSPEND_PREPARE: 8013 case PM_RESTORE_PREPARE: 8014 scx_bypass(sch, true); 8015 break; 8016 case PM_POST_HIBERNATION: 8017 case PM_POST_SUSPEND: 8018 case PM_POST_RESTORE: 8019 scx_bypass(sch, false); 8020 break; 8021 } 8022 8023 return NOTIFY_OK; 8024 } 8025 8026 static struct notifier_block scx_pm_notifier = { 8027 .notifier_call = scx_pm_handler, 8028 }; 8029 8030 void __init init_sched_ext_class(void) 8031 { 8032 s32 cpu, v; 8033 8034 /* 8035 * The following is to prevent the compiler from optimizing out the enum 8036 * definitions so that BPF scheduler implementations can use them 8037 * through the generated vmlinux.h. 8038 */ 8039 WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | 8040 SCX_TG_ONLINE); 8041 8042 scx_idle_init_masks(); 8043 8044 for_each_possible_cpu(cpu) { 8045 struct rq *rq = cpu_rq(cpu); 8046 int n = cpu_to_node(cpu); 8047 8048 /* local_dsq's sch will be set during scx_root_enable() */ 8049 BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL)); 8050 8051 INIT_LIST_HEAD(&rq->scx.runnable_list); 8052 INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); 8053 8054 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); 8055 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); 8056 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); 8057 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); 8058 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n)); 8059 raw_spin_lock_init(&rq->scx.deferred_reenq_lock); 8060 INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals); 8061 INIT_LIST_HEAD(&rq->scx.deferred_reenq_users); 8062 rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); 8063 rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); 8064 8065 if (cpu_online(cpu)) 8066 cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; 8067 } 8068 8069 
register_sysrq_key('S', &sysrq_sched_ext_reset_op); 8070 register_sysrq_key('D', &sysrq_sched_ext_dump_op); 8071 INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); 8072 8073 #ifdef CONFIG_EXT_SUB_SCHED 8074 BUG_ON(rhashtable_init(&scx_sched_hash, &scx_sched_hash_params)); 8075 #endif /* CONFIG_EXT_SUB_SCHED */ 8076 } 8077 8078 8079 /******************************************************************************** 8080 * Helpers that can be called from the BPF scheduler. 8081 */ 8082 static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 *enq_flags) 8083 { 8084 bool is_local = dsq_id == SCX_DSQ_LOCAL || 8085 (dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON; 8086 8087 if (*enq_flags & SCX_ENQ_IMMED) { 8088 if (unlikely(!is_local)) { 8089 scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id); 8090 return false; 8091 } 8092 } else if ((sch->ops.flags & SCX_OPS_ALWAYS_ENQ_IMMED) && is_local) { 8093 *enq_flags |= SCX_ENQ_IMMED; 8094 } 8095 8096 return true; 8097 } 8098 8099 static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, 8100 u64 dsq_id, u64 *enq_flags) 8101 { 8102 lockdep_assert_irqs_disabled(); 8103 8104 if (unlikely(!p)) { 8105 scx_error(sch, "called with NULL task"); 8106 return false; 8107 } 8108 8109 if (unlikely(*enq_flags & __SCX_ENQ_INTERNAL_MASK)) { 8110 scx_error(sch, "invalid enq_flags 0x%llx", *enq_flags); 8111 return false; 8112 } 8113 8114 /* see SCX_EV_INSERT_NOT_OWNED definition */ 8115 if (unlikely(!scx_task_on_sched(sch, p))) { 8116 __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); 8117 return false; 8118 } 8119 8120 if (!scx_vet_enq_flags(sch, dsq_id, enq_flags)) 8121 return false; 8122 8123 return true; 8124 } 8125 8126 static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, 8127 u64 dsq_id, u64 enq_flags) 8128 { 8129 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8130 struct task_struct *ddsp_task; 8131 8132 ddsp_task = 
__this_cpu_read(direct_dispatch_task); 8133 if (ddsp_task) { 8134 mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); 8135 return; 8136 } 8137 8138 if (unlikely(dspc->cursor >= sch->dsp_max_batch)) { 8139 scx_error(sch, "dispatch buffer overflow"); 8140 return; 8141 } 8142 8143 dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ 8144 .task = p, 8145 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, 8146 .dsq_id = dsq_id, 8147 .enq_flags = enq_flags, 8148 }; 8149 } 8150 8151 __bpf_kfunc_start_defs(); 8152 8153 /** 8154 * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ 8155 * @p: task_struct to insert 8156 * @dsq_id: DSQ to insert into 8157 * @slice: duration @p can run for in nsecs, 0 to keep the current value 8158 * @enq_flags: SCX_ENQ_* 8159 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8160 * 8161 * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to 8162 * call this function spuriously. Can be called from ops.enqueue(), 8163 * ops.select_cpu(), and ops.dispatch(). 8164 * 8165 * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch 8166 * and @p must match the task being enqueued. 8167 * 8168 * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p 8169 * will be directly inserted into the corresponding dispatch queue after 8170 * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be 8171 * inserted into the local DSQ of the CPU returned by ops.select_cpu(). 8172 * @enq_flags are OR'd with the enqueue flags on the enqueue path before the 8173 * task is inserted. 8174 * 8175 * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id 8176 * and this function can be called upto ops.dispatch_max_batch times to insert 8177 * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the 8178 * remaining slots. 
scx_bpf_dsq_move_to_local() flushes the batch and resets the 8179 * counter. 8180 * 8181 * This function doesn't have any locking restrictions and may be called under 8182 * BPF locks (in the future when BPF introduces more flexible locking). 8183 * 8184 * @p is allowed to run for @slice. The scheduling path is triggered on slice 8185 * exhaustion. If zero, the current residual slice is maintained. If 8186 * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with 8187 * scx_bpf_kick_cpu() to trigger scheduling. 8188 * 8189 * Returns %true on successful insertion, %false on failure. On the root 8190 * scheduler, %false return triggers scheduler abort and the caller doesn't need 8191 * to check the return value. 8192 */ 8193 __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, 8194 u64 slice, u64 enq_flags, 8195 const struct bpf_prog_aux *aux) 8196 { 8197 struct scx_sched *sch; 8198 8199 guard(rcu)(); 8200 sch = scx_prog_sched(aux); 8201 if (unlikely(!sch)) 8202 return false; 8203 8204 if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) 8205 return false; 8206 8207 if (slice) 8208 p->scx.slice = slice; 8209 else 8210 p->scx.slice = p->scx.slice ?: 1; 8211 8212 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); 8213 8214 return true; 8215 } 8216 8217 /* 8218 * COMPAT: Will be removed in v6.23 along with the ___v2 suffix. 
8219 */ 8220 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, 8221 u64 slice, u64 enq_flags, 8222 const struct bpf_prog_aux *aux) 8223 { 8224 scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags, aux); 8225 } 8226 8227 static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, 8228 u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) 8229 { 8230 if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) 8231 return false; 8232 8233 if (slice) 8234 p->scx.slice = slice; 8235 else 8236 p->scx.slice = p->scx.slice ?: 1; 8237 8238 p->scx.dsq_vtime = vtime; 8239 8240 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 8241 8242 return true; 8243 } 8244 8245 struct scx_bpf_dsq_insert_vtime_args { 8246 /* @p can't be packed together as KF_RCU is not transitive */ 8247 u64 dsq_id; 8248 u64 slice; 8249 u64 vtime; 8250 u64 enq_flags; 8251 }; 8252 8253 /** 8254 * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion 8255 * @p: task_struct to insert 8256 * @args: struct containing the rest of the arguments 8257 * @args->dsq_id: DSQ to insert into 8258 * @args->slice: duration @p can run for in nsecs, 0 to keep the current value 8259 * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ 8260 * @args->enq_flags: SCX_ENQ_* 8261 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8262 * 8263 * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument 8264 * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided 8265 * as an inline wrapper in common.bpf.h. 8266 * 8267 * Insert @p into the vtime priority queue of the DSQ identified by 8268 * @args->dsq_id. Tasks queued into the priority queue are ordered by 8269 * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert(). 8270 * 8271 * @args->vtime ordering is according to time_before64() which considers 8272 * wrapping. 
A numerically larger vtime may indicate an earlier position in the 8273 * ordering and vice-versa. 8274 * 8275 * A DSQ can only be used as a FIFO or priority queue at any given time and this 8276 * function must not be called on a DSQ which already has one or more FIFO tasks 8277 * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and 8278 * SCX_DSQ_GLOBAL) cannot be used as priority queues. 8279 * 8280 * Returns %true on successful insertion, %false on failure. On the root 8281 * scheduler, %false return triggers scheduler abort and the caller doesn't need 8282 * to check the return value. 8283 */ 8284 __bpf_kfunc bool 8285 __scx_bpf_dsq_insert_vtime(struct task_struct *p, 8286 struct scx_bpf_dsq_insert_vtime_args *args, 8287 const struct bpf_prog_aux *aux) 8288 { 8289 struct scx_sched *sch; 8290 8291 guard(rcu)(); 8292 8293 sch = scx_prog_sched(aux); 8294 if (unlikely(!sch)) 8295 return false; 8296 8297 return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice, 8298 args->vtime, args->enq_flags); 8299 } 8300 8301 /* 8302 * COMPAT: Will be removed in v6.23. 8303 */ 8304 __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, 8305 u64 slice, u64 vtime, u64 enq_flags) 8306 { 8307 struct scx_sched *sch; 8308 8309 guard(rcu)(); 8310 8311 sch = rcu_dereference(scx_root); 8312 if (unlikely(!sch)) 8313 return; 8314 8315 #ifdef CONFIG_EXT_SUB_SCHED 8316 /* 8317 * Disallow if any sub-scheds are attached. There is no way to tell 8318 * which scheduler called us, just error out @p's scheduler. 
8319 */ 8320 if (unlikely(!list_empty(&sch->children))) { 8321 scx_error(scx_task_sched(p), "__scx_bpf_dsq_insert_vtime() must be used"); 8322 return; 8323 } 8324 #endif 8325 8326 scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags); 8327 } 8328 8329 __bpf_kfunc_end_defs(); 8330 8331 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) 8332 BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_IMPLICIT_ARGS | KF_RCU) 8333 BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_IMPLICIT_ARGS | KF_RCU) 8334 BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_IMPLICIT_ARGS | KF_RCU) 8335 BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) 8336 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) 8337 8338 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { 8339 .owner = THIS_MODULE, 8340 .set = &scx_kfunc_ids_enqueue_dispatch, 8341 .filter = scx_kfunc_context_filter, 8342 }; 8343 8344 static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, 8345 struct task_struct *p, u64 dsq_id, u64 enq_flags) 8346 { 8347 struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; 8348 struct scx_sched *sch; 8349 struct rq *this_rq, *src_rq, *locked_rq; 8350 bool dispatched = false; 8351 bool in_balance; 8352 unsigned long flags; 8353 8354 /* 8355 * The verifier considers an iterator slot initialized on any 8356 * KF_ITER_NEW return, so a BPF program may legally reach here after 8357 * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL. 8358 */ 8359 if (unlikely(!src_dsq)) 8360 return false; 8361 8362 sch = src_dsq->sched; 8363 8364 if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags)) 8365 return false; 8366 8367 /* 8368 * If the BPF scheduler keeps calling this function repeatedly, it can 8369 * cause similar live-lock conditions as consume_dispatch_q(). 
8370 */ 8371 if (unlikely(READ_ONCE(sch->aborting))) 8372 return false; 8373 8374 if (unlikely(!scx_task_on_sched(sch, p))) { 8375 scx_error(sch, "scx_bpf_dsq_move[_vtime]() on %s[%d] but the task belongs to a different scheduler", 8376 p->comm, p->pid); 8377 return false; 8378 } 8379 8380 /* 8381 * Can be called from either ops.dispatch() locking this_rq() or any 8382 * context where no rq lock is held. If latter, lock @p's task_rq which 8383 * we'll likely need anyway. 8384 */ 8385 src_rq = task_rq(p); 8386 8387 local_irq_save(flags); 8388 this_rq = this_rq(); 8389 in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; 8390 8391 if (in_balance) { 8392 if (this_rq != src_rq) { 8393 raw_spin_rq_unlock(this_rq); 8394 raw_spin_rq_lock(src_rq); 8395 } 8396 } else { 8397 raw_spin_rq_lock(src_rq); 8398 } 8399 8400 locked_rq = src_rq; 8401 raw_spin_lock(&src_dsq->lock); 8402 8403 /* did someone else get to it while we dropped the locks? */ 8404 if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) { 8405 raw_spin_unlock(&src_dsq->lock); 8406 goto out; 8407 } 8408 8409 /* @p is still on $src_dsq and stable, determine the destination */ 8410 dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, task_cpu(p)); 8411 8412 /* 8413 * Apply vtime and slice updates before moving so that the new time is 8414 * visible before inserting into $dst_dsq. @p is still on $src_dsq but 8415 * this is safe as we're locking it. 
8416 */ 8417 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) 8418 p->scx.dsq_vtime = kit->vtime; 8419 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) 8420 p->scx.slice = kit->slice; 8421 8422 /* execute move */ 8423 locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq); 8424 dispatched = true; 8425 out: 8426 if (in_balance) { 8427 if (this_rq != locked_rq) { 8428 raw_spin_rq_unlock(locked_rq); 8429 raw_spin_rq_lock(this_rq); 8430 } 8431 } else { 8432 raw_spin_rq_unlock_irqrestore(locked_rq, flags); 8433 } 8434 8435 kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | 8436 __SCX_DSQ_ITER_HAS_VTIME); 8437 return dispatched; 8438 } 8439 8440 __bpf_kfunc_start_defs(); 8441 8442 /** 8443 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots 8444 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8445 * 8446 * Can only be called from ops.dispatch(). 8447 */ 8448 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux) 8449 { 8450 struct scx_sched *sch; 8451 8452 guard(rcu)(); 8453 8454 sch = scx_prog_sched(aux); 8455 if (unlikely(!sch)) 8456 return 0; 8457 8458 return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor); 8459 } 8460 8461 /** 8462 * scx_bpf_dispatch_cancel - Cancel the latest dispatch 8463 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8464 * 8465 * Cancel the latest dispatch. Can be called multiple times to cancel further 8466 * dispatches. Can only be called from ops.dispatch(). 
8467 */ 8468 __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux) 8469 { 8470 struct scx_sched *sch; 8471 struct scx_dsp_ctx *dspc; 8472 8473 guard(rcu)(); 8474 8475 sch = scx_prog_sched(aux); 8476 if (unlikely(!sch)) 8477 return; 8478 8479 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8480 8481 if (dspc->cursor > 0) 8482 dspc->cursor--; 8483 else 8484 scx_error(sch, "dispatch buffer underflow"); 8485 } 8486 8487 /** 8488 * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ 8489 * @dsq_id: DSQ to move task from. Must be a user-created DSQ 8490 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8491 * @enq_flags: %SCX_ENQ_* 8492 * 8493 * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's 8494 * local DSQ for execution with @enq_flags applied. Can only be called from 8495 * ops.dispatch(). 8496 * 8497 * Built-in DSQs (%SCX_DSQ_GLOBAL and %SCX_DSQ_LOCAL*) are not supported as 8498 * sources. Local DSQs support reenqueueing (a task can be picked up for 8499 * execution, dequeued for property changes, or reenqueued), but the BPF 8500 * scheduler cannot directly iterate or move tasks from them. %SCX_DSQ_GLOBAL 8501 * is similar but also doesn't support reenqueueing, as it maps to multiple 8502 * per-node DSQs making the scope difficult to define; this may change in the 8503 * future. 8504 * 8505 * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() 8506 * before trying to move from the specified DSQ. It may also grab rq locks and 8507 * thus can't be called under any BPF locks. 8508 * 8509 * Returns %true if a task has been moved, %false if there isn't any task to 8510 * move. 
8511 */ 8512 __bpf_kfunc bool scx_bpf_dsq_move_to_local___v2(u64 dsq_id, u64 enq_flags, 8513 const struct bpf_prog_aux *aux) 8514 { 8515 struct scx_dispatch_q *dsq; 8516 struct scx_sched *sch; 8517 struct scx_dsp_ctx *dspc; 8518 8519 guard(rcu)(); 8520 8521 sch = scx_prog_sched(aux); 8522 if (unlikely(!sch)) 8523 return false; 8524 8525 if (!scx_vet_enq_flags(sch, SCX_DSQ_LOCAL, &enq_flags)) 8526 return false; 8527 8528 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8529 8530 flush_dispatch_buf(sch, dspc->rq); 8531 8532 dsq = find_user_dsq(sch, dsq_id); 8533 if (unlikely(!dsq)) { 8534 scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id); 8535 return false; 8536 } 8537 8538 if (consume_dispatch_q(sch, dspc->rq, dsq, enq_flags)) { 8539 /* 8540 * A successfully consumed task can be dequeued before it starts 8541 * running while the CPU is trying to migrate other dispatched 8542 * tasks. Bump nr_tasks to tell balance_one() to retry on empty 8543 * local DSQ. 8544 */ 8545 dspc->nr_tasks++; 8546 return true; 8547 } else { 8548 return false; 8549 } 8550 } 8551 8552 /* 8553 * COMPAT: ___v2 was introduced in v7.1. Remove this and ___v2 tag in the future. 8554 */ 8555 __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux) 8556 { 8557 return scx_bpf_dsq_move_to_local___v2(dsq_id, 0, aux); 8558 } 8559 8560 /** 8561 * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs 8562 * @it__iter: DSQ iterator in progress 8563 * @slice: duration the moved task can run for in nsecs 8564 * 8565 * Override the slice of the next task that will be moved from @it__iter using 8566 * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous 8567 * slice duration is kept. 
8568 */ 8569 __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, 8570 u64 slice) 8571 { 8572 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 8573 8574 kit->slice = slice; 8575 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; 8576 } 8577 8578 /** 8579 * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs 8580 * @it__iter: DSQ iterator in progress 8581 * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ 8582 * 8583 * Override the vtime of the next task that will be moved from @it__iter using 8584 * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice 8585 * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the 8586 * override is ignored and cleared. 8587 */ 8588 __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, 8589 u64 vtime) 8590 { 8591 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 8592 8593 kit->vtime = vtime; 8594 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; 8595 } 8596 8597 /** 8598 * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ 8599 * @it__iter: DSQ iterator in progress 8600 * @p: task to transfer 8601 * @dsq_id: DSQ to move @p to 8602 * @enq_flags: SCX_ENQ_* 8603 * 8604 * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ 8605 * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can 8606 * be the destination. 8607 * 8608 * For the transfer to be successful, @p must still be on the DSQ and have been 8609 * queued before the DSQ iteration started. This function doesn't care whether 8610 * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have 8611 * been queued before the iteration started. 8612 * 8613 * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update. 8614 * 8615 * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq 8616 * lock (e.g. BPF timers or SYSCALL programs). 
8617 * 8618 * Returns %true if @p has been consumed, %false if @p had already been 8619 * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local 8620 * DSQ. 8621 */ 8622 __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, 8623 struct task_struct *p, u64 dsq_id, 8624 u64 enq_flags) 8625 { 8626 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 8627 p, dsq_id, enq_flags); 8628 } 8629 8630 /** 8631 * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ 8632 * @it__iter: DSQ iterator in progress 8633 * @p: task to transfer 8634 * @dsq_id: DSQ to move @p to 8635 * @enq_flags: SCX_ENQ_* 8636 * 8637 * Transfer @p which is on the DSQ currently iterated by @it__iter to the 8638 * priority queue of the DSQ specified by @dsq_id. The destination must be a 8639 * user DSQ as only user DSQs support priority queue. 8640 * 8641 * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice() 8642 * and scx_bpf_dsq_move_set_vtime() to update. 8643 * 8644 * All other aspects are identical to scx_bpf_dsq_move(). See 8645 * scx_bpf_dsq_insert_vtime() for more information on @vtime. 8646 */ 8647 __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, 8648 struct task_struct *p, u64 dsq_id, 8649 u64 enq_flags) 8650 { 8651 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 8652 p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 8653 } 8654 8655 #ifdef CONFIG_EXT_SUB_SCHED 8656 /** 8657 * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler 8658 * @cgroup_id: cgroup ID of the child scheduler to dispatch 8659 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8660 * 8661 * Allows a parent scheduler to trigger dispatching on one of its direct 8662 * child schedulers. The child scheduler runs its dispatch operation to 8663 * move tasks from dispatch queues to the local runqueue. 
8664 * 8665 * Returns: true on success, false if cgroup_id is invalid, not a direct 8666 * child, or caller lacks dispatch permission. 8667 */ 8668 __bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux) 8669 { 8670 struct rq *this_rq = this_rq(); 8671 struct scx_sched *parent, *child; 8672 8673 guard(rcu)(); 8674 parent = scx_prog_sched(aux); 8675 if (unlikely(!parent)) 8676 return false; 8677 8678 child = scx_find_sub_sched(cgroup_id); 8679 8680 if (unlikely(!child)) 8681 return false; 8682 8683 if (unlikely(scx_parent(child) != parent)) { 8684 scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu", 8685 cgroup_id); 8686 return false; 8687 } 8688 8689 return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev, 8690 true); 8691 } 8692 #endif /* CONFIG_EXT_SUB_SCHED */ 8693 8694 __bpf_kfunc_end_defs(); 8695 8696 BTF_KFUNCS_START(scx_kfunc_ids_dispatch) 8697 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots, KF_IMPLICIT_ARGS) 8698 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel, KF_IMPLICIT_ARGS) 8699 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local, KF_IMPLICIT_ARGS) 8700 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local___v2, KF_IMPLICIT_ARGS) 8701 /* scx_bpf_dsq_move*() also in scx_kfunc_ids_unlocked: callable from unlocked contexts */ 8702 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 8703 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 8704 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 8705 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 8706 #ifdef CONFIG_EXT_SUB_SCHED 8707 BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS) 8708 #endif 8709 BTF_KFUNCS_END(scx_kfunc_ids_dispatch) 8710 8711 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { 8712 .owner = THIS_MODULE, 8713 .set = &scx_kfunc_ids_dispatch, 8714 .filter = scx_kfunc_context_filter, 8715 }; 8716 8717 __bpf_kfunc_start_defs(); 8718 8719 /** 8720 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 8721 * @aux: 
implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8722 * 8723 * Iterate over all of the tasks currently enqueued on the local DSQ of the 8724 * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of 8725 * processed tasks. Can only be called from ops.cpu_release(). 8726 */ 8727 __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux) 8728 { 8729 struct scx_sched *sch; 8730 struct rq *rq; 8731 8732 guard(rcu)(); 8733 sch = scx_prog_sched(aux); 8734 if (unlikely(!sch)) 8735 return 0; 8736 8737 rq = cpu_rq(smp_processor_id()); 8738 lockdep_assert_rq_held(rq); 8739 8740 return reenq_local(sch, rq, SCX_REENQ_ANY); 8741 } 8742 8743 __bpf_kfunc_end_defs(); 8744 8745 BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) 8746 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local, KF_IMPLICIT_ARGS) 8747 BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) 8748 8749 static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { 8750 .owner = THIS_MODULE, 8751 .set = &scx_kfunc_ids_cpu_release, 8752 .filter = scx_kfunc_context_filter, 8753 }; 8754 8755 __bpf_kfunc_start_defs(); 8756 8757 /** 8758 * scx_bpf_create_dsq - Create a custom DSQ 8759 * @dsq_id: DSQ to create 8760 * @node: NUMA node to allocate from 8761 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8762 * 8763 * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable 8764 * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. 
8765 */ 8766 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_aux *aux) 8767 { 8768 struct scx_dispatch_q *dsq; 8769 struct scx_sched *sch; 8770 s32 ret; 8771 8772 if (unlikely(node >= (int)nr_node_ids || 8773 (node < 0 && node != NUMA_NO_NODE))) 8774 return -EINVAL; 8775 8776 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) 8777 return -EINVAL; 8778 8779 dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); 8780 if (!dsq) 8781 return -ENOMEM; 8782 8783 /* 8784 * init_dsq() must be called in GFP_KERNEL context. Init it with NULL 8785 * @sch and update afterwards. 8786 */ 8787 ret = init_dsq(dsq, dsq_id, NULL); 8788 if (ret) { 8789 kfree(dsq); 8790 return ret; 8791 } 8792 8793 rcu_read_lock(); 8794 8795 sch = scx_prog_sched(aux); 8796 if (sch) { 8797 dsq->sched = sch; 8798 ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, 8799 dsq_hash_params); 8800 } else { 8801 ret = -ENODEV; 8802 } 8803 8804 rcu_read_unlock(); 8805 if (ret) { 8806 exit_dsq(dsq); 8807 kfree(dsq); 8808 } 8809 return ret; 8810 } 8811 8812 __bpf_kfunc_end_defs(); 8813 8814 BTF_KFUNCS_START(scx_kfunc_ids_unlocked) 8815 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_IMPLICIT_ARGS | KF_SLEEPABLE) 8816 /* also in scx_kfunc_ids_dispatch: also callable from ops.dispatch() */ 8817 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 8818 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 8819 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 8820 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 8821 /* also in scx_kfunc_ids_select_cpu: also callable from ops.select_cpu()/ops.enqueue() */ 8822 BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) 8823 BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) 8824 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) 8825 BTF_KFUNCS_END(scx_kfunc_ids_unlocked) 8826 8827 static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { 8828 .owner = THIS_MODULE, 8829 .set = 
&scx_kfunc_ids_unlocked, 8830 .filter = scx_kfunc_context_filter, 8831 }; 8832 8833 __bpf_kfunc_start_defs(); 8834 8835 /** 8836 * scx_bpf_task_set_slice - Set task's time slice 8837 * @p: task of interest 8838 * @slice: time slice to set in nsecs 8839 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8840 * 8841 * Set @p's time slice to @slice. Returns %true on success, %false if the 8842 * calling scheduler doesn't have authority over @p. 8843 */ 8844 __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, 8845 const struct bpf_prog_aux *aux) 8846 { 8847 struct scx_sched *sch; 8848 8849 guard(rcu)(); 8850 sch = scx_prog_sched(aux); 8851 if (unlikely(!sch || !scx_task_on_sched(sch, p))) 8852 return false; 8853 8854 p->scx.slice = slice; 8855 return true; 8856 } 8857 8858 /** 8859 * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering 8860 * @p: task of interest 8861 * @vtime: virtual time to set 8862 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8863 * 8864 * Set @p's virtual time to @vtime. Returns %true on success, %false if the 8865 * calling scheduler doesn't have authority over @p. 
8866 */ 8867 __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, 8868 const struct bpf_prog_aux *aux) 8869 { 8870 struct scx_sched *sch; 8871 8872 guard(rcu)(); 8873 sch = scx_prog_sched(aux); 8874 if (unlikely(!sch || !scx_task_on_sched(sch, p))) 8875 return false; 8876 8877 p->scx.dsq_vtime = vtime; 8878 return true; 8879 } 8880 8881 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) 8882 { 8883 struct rq *this_rq; 8884 unsigned long irq_flags; 8885 8886 if (!ops_cpu_valid(sch, cpu, NULL)) 8887 return; 8888 8889 local_irq_save(irq_flags); 8890 8891 this_rq = this_rq(); 8892 8893 /* 8894 * While bypassing for PM ops, IRQ handling may not be online which can 8895 * lead to irq_work_queue() malfunction such as infinite busy wait for 8896 * IRQ status update. Suppress kicking. 8897 */ 8898 if (scx_bypassing(sch, cpu_of(this_rq))) 8899 goto out; 8900 8901 /* 8902 * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting 8903 * rq locks. We can probably be smarter and avoid bouncing if called 8904 * from ops which don't hold a rq lock. 
8905 */ 8906 if (flags & SCX_KICK_IDLE) { 8907 struct rq *target_rq = cpu_rq(cpu); 8908 8909 if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) 8910 scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); 8911 8912 if (raw_spin_rq_trylock(target_rq)) { 8913 if (can_skip_idle_kick(target_rq)) { 8914 raw_spin_rq_unlock(target_rq); 8915 goto out; 8916 } 8917 raw_spin_rq_unlock(target_rq); 8918 } 8919 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); 8920 } else { 8921 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); 8922 8923 if (flags & SCX_KICK_PREEMPT) 8924 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); 8925 if (flags & SCX_KICK_WAIT) 8926 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); 8927 } 8928 8929 irq_work_queue(&this_rq->scx.kick_cpus_irq_work); 8930 out: 8931 local_irq_restore(irq_flags); 8932 } 8933 8934 /** 8935 * scx_bpf_kick_cpu - Trigger reschedule on a CPU 8936 * @cpu: cpu to kick 8937 * @flags: %SCX_KICK_* flags 8938 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8939 * 8940 * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or 8941 * trigger rescheduling on a busy CPU. This can be called from any online 8942 * scx_ops operation and the actual kicking is performed asynchronously through 8943 * an irq work. 8944 */ 8945 __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux *aux) 8946 { 8947 struct scx_sched *sch; 8948 8949 guard(rcu)(); 8950 sch = scx_prog_sched(aux); 8951 if (likely(sch)) 8952 scx_kick_cpu(sch, cpu, flags); 8953 } 8954 8955 /** 8956 * scx_bpf_dsq_nr_queued - Return the number of queued tasks 8957 * @dsq_id: id of the DSQ 8958 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8959 * 8960 * Return the number of tasks in the DSQ matching @dsq_id. If not found, 8961 * -%ENOENT is returned. 
8962 */ 8963 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux) 8964 { 8965 struct scx_sched *sch; 8966 struct scx_dispatch_q *dsq; 8967 s32 ret; 8968 8969 preempt_disable(); 8970 8971 sch = scx_prog_sched(aux); 8972 if (unlikely(!sch)) { 8973 ret = -ENODEV; 8974 goto out; 8975 } 8976 8977 if (dsq_id == SCX_DSQ_LOCAL) { 8978 ret = READ_ONCE(this_rq()->scx.local_dsq.nr); 8979 goto out; 8980 } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 8981 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 8982 8983 if (ops_cpu_valid(sch, cpu, NULL)) { 8984 ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); 8985 goto out; 8986 } 8987 } else { 8988 dsq = find_user_dsq(sch, dsq_id); 8989 if (dsq) { 8990 ret = READ_ONCE(dsq->nr); 8991 goto out; 8992 } 8993 } 8994 ret = -ENOENT; 8995 out: 8996 preempt_enable(); 8997 return ret; 8998 } 8999 9000 /** 9001 * scx_bpf_destroy_dsq - Destroy a custom DSQ 9002 * @dsq_id: DSQ to destroy 9003 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9004 * 9005 * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with 9006 * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is 9007 * empty and no further tasks are dispatched to it. Ignored if called on a DSQ 9008 * which doesn't exist. Can be called from any online scx_ops operations. 9009 */ 9010 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux) 9011 { 9012 struct scx_sched *sch; 9013 9014 guard(rcu)(); 9015 sch = scx_prog_sched(aux); 9016 if (sch) 9017 destroy_dsq(sch, dsq_id); 9018 } 9019 9020 /** 9021 * bpf_iter_scx_dsq_new - Create a DSQ iterator 9022 * @it: iterator to initialize 9023 * @dsq_id: DSQ to iterate 9024 * @flags: %SCX_DSQ_ITER_* 9025 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9026 * 9027 * Initialize BPF iterator @it which can be used with bpf_for_each() to walk 9028 * tasks in the DSQ specified by @dsq_id. 
Iteration using @it only includes 9029 * tasks which are already queued when this function is invoked. 9030 */ 9031 __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, 9032 u64 flags, const struct bpf_prog_aux *aux) 9033 { 9034 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9035 struct scx_sched *sch; 9036 9037 BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > 9038 sizeof(struct bpf_iter_scx_dsq)); 9039 BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != 9040 __alignof__(struct bpf_iter_scx_dsq)); 9041 BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & 9042 ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); 9043 9044 /* 9045 * next() and destroy() will be called regardless of the return value. 9046 * Always clear $kit->dsq. 9047 */ 9048 kit->dsq = NULL; 9049 9050 sch = scx_prog_sched(aux); 9051 if (unlikely(!sch)) 9052 return -ENODEV; 9053 9054 if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) 9055 return -EINVAL; 9056 9057 kit->dsq = find_user_dsq(sch, dsq_id); 9058 if (!kit->dsq) 9059 return -ENOENT; 9060 9061 kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags); 9062 9063 return 0; 9064 } 9065 9066 /** 9067 * bpf_iter_scx_dsq_next - Progress a DSQ iterator 9068 * @it: iterator to progress 9069 * 9070 * Return the next task. See bpf_iter_scx_dsq_new(). 9071 */ 9072 __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) 9073 { 9074 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9075 9076 if (!kit->dsq) 9077 return NULL; 9078 9079 guard(raw_spinlock_irqsave)(&kit->dsq->lock); 9080 9081 return nldsq_cursor_next_task(&kit->cursor, kit->dsq); 9082 } 9083 9084 /** 9085 * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator 9086 * @it: iterator to destroy 9087 * 9088 * Undo scx_iter_scx_dsq_new(). 
9089 */ 9090 __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) 9091 { 9092 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9093 9094 if (!kit->dsq) 9095 return; 9096 9097 if (!list_empty(&kit->cursor.node)) { 9098 unsigned long flags; 9099 9100 raw_spin_lock_irqsave(&kit->dsq->lock, flags); 9101 list_del_init(&kit->cursor.node); 9102 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); 9103 } 9104 kit->dsq = NULL; 9105 } 9106 9107 /** 9108 * scx_bpf_dsq_peek - Lockless peek at the first element. 9109 * @dsq_id: DSQ to examine. 9110 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9111 * 9112 * Read the first element in the DSQ. This is semantically equivalent to using 9113 * the DSQ iterator, but is lockfree. Of course, like any lockless operation, 9114 * this provides only a point-in-time snapshot, and the contents may change 9115 * by the time any subsequent locking operation reads the queue. 9116 * 9117 * Returns the pointer, or NULL indicates an empty queue OR internal error. 
9118 */ 9119 __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id, 9120 const struct bpf_prog_aux *aux) 9121 { 9122 struct scx_sched *sch; 9123 struct scx_dispatch_q *dsq; 9124 9125 sch = scx_prog_sched(aux); 9126 if (unlikely(!sch)) 9127 return NULL; 9128 9129 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { 9130 scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); 9131 return NULL; 9132 } 9133 9134 dsq = find_user_dsq(sch, dsq_id); 9135 if (unlikely(!dsq)) { 9136 scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); 9137 return NULL; 9138 } 9139 9140 return rcu_dereference(dsq->first_task); 9141 } 9142 9143 /** 9144 * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ 9145 * @dsq_id: DSQ to re-enqueue 9146 * @reenq_flags: %SCX_RENQ_* 9147 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9148 * 9149 * Iterate over all of the tasks currently enqueued on the DSQ identified by 9150 * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are 9151 * supported: 9152 * 9153 * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu) 9154 * - User DSQs 9155 * 9156 * Re-enqueues are performed asynchronously. Can be called from anywhere. 
9157 */ 9158 __bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags, 9159 const struct bpf_prog_aux *aux) 9160 { 9161 struct scx_sched *sch; 9162 struct scx_dispatch_q *dsq; 9163 9164 guard(preempt)(); 9165 9166 sch = scx_prog_sched(aux); 9167 if (unlikely(!sch)) 9168 return; 9169 9170 if (unlikely(reenq_flags & ~__SCX_REENQ_USER_MASK)) { 9171 scx_error(sch, "invalid SCX_REENQ flags 0x%llx", reenq_flags); 9172 return; 9173 } 9174 9175 /* not specifying any filter bits is the same as %SCX_REENQ_ANY */ 9176 if (!(reenq_flags & __SCX_REENQ_FILTER_MASK)) 9177 reenq_flags |= SCX_REENQ_ANY; 9178 9179 dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id()); 9180 schedule_dsq_reenq(sch, dsq, reenq_flags, scx_locked_rq()); 9181 } 9182 9183 /** 9184 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 9185 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9186 * 9187 * Iterate over all of the tasks currently enqueued on the local DSQ of the 9188 * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from 9189 * anywhere. 9190 * 9191 * This is now a special case of scx_bpf_dsq_reenq() and may be removed in the 9192 * future. 
9193 */ 9194 __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux) 9195 { 9196 scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0, aux); 9197 } 9198 9199 __bpf_kfunc_end_defs(); 9200 9201 static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, 9202 size_t line_size, char *fmt, unsigned long long *data, 9203 u32 data__sz) 9204 { 9205 struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; 9206 s32 ret; 9207 9208 if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || 9209 (data__sz && !data)) { 9210 scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); 9211 return -EINVAL; 9212 } 9213 9214 ret = copy_from_kernel_nofault(data_buf, data, data__sz); 9215 if (ret < 0) { 9216 scx_error(sch, "failed to read data fields (%d)", ret); 9217 return ret; 9218 } 9219 9220 ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, 9221 &bprintf_data); 9222 if (ret < 0) { 9223 scx_error(sch, "format preparation failed (%d)", ret); 9224 return ret; 9225 } 9226 9227 ret = bstr_printf(line_buf, line_size, fmt, 9228 bprintf_data.bin_args); 9229 bpf_bprintf_cleanup(&bprintf_data); 9230 if (ret < 0) { 9231 scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); 9232 return ret; 9233 } 9234 9235 return ret; 9236 } 9237 9238 static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, 9239 char *fmt, unsigned long long *data, u32 data__sz) 9240 { 9241 return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), 9242 fmt, data, data__sz); 9243 } 9244 9245 __bpf_kfunc_start_defs(); 9246 9247 /** 9248 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. 9249 * @exit_code: Exit value to pass to user space via struct scx_exit_info. 
9250 * @fmt: error message format string 9251 * @data: format string parameters packaged using ___bpf_fill() macro 9252 * @data__sz: @data len, must end in '__sz' for the verifier 9253 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9254 * 9255 * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops 9256 * disabling. 9257 */ 9258 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, 9259 unsigned long long *data, u32 data__sz, 9260 const struct bpf_prog_aux *aux) 9261 { 9262 struct scx_sched *sch; 9263 unsigned long flags; 9264 9265 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 9266 sch = scx_prog_sched(aux); 9267 if (likely(sch) && 9268 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 9269 scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); 9270 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 9271 } 9272 9273 /** 9274 * scx_bpf_error_bstr - Indicate fatal error 9275 * @fmt: error message format string 9276 * @data: format string parameters packaged using ___bpf_fill() macro 9277 * @data__sz: @data len, must end in '__sz' for the verifier 9278 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9279 * 9280 * Indicate that the BPF scheduler encountered a fatal error and initiate ops 9281 * disabling. 
9282 */ 9283 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, 9284 u32 data__sz, const struct bpf_prog_aux *aux) 9285 { 9286 struct scx_sched *sch; 9287 unsigned long flags; 9288 9289 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 9290 sch = scx_prog_sched(aux); 9291 if (likely(sch) && 9292 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 9293 scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); 9294 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 9295 } 9296 9297 /** 9298 * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler 9299 * @fmt: format string 9300 * @data: format string parameters packaged using ___bpf_fill() macro 9301 * @data__sz: @data len, must end in '__sz' for the verifier 9302 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9303 * 9304 * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and 9305 * dump_task() to generate extra debug dump specific to the BPF scheduler. 9306 * 9307 * The extra dump may be multiple lines. A single line may be split over 9308 * multiple calls. The last line is automatically terminated. 9309 */ 9310 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, 9311 u32 data__sz, const struct bpf_prog_aux *aux) 9312 { 9313 struct scx_sched *sch; 9314 struct scx_dump_data *dd = &scx_dump_data; 9315 struct scx_bstr_buf *buf = &dd->buf; 9316 s32 ret; 9317 9318 guard(rcu)(); 9319 9320 sch = scx_prog_sched(aux); 9321 if (unlikely(!sch)) 9322 return; 9323 9324 if (raw_smp_processor_id() != dd->cpu) { 9325 scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends"); 9326 return; 9327 } 9328 9329 /* append the formatted string to the line buf */ 9330 ret = __bstr_format(sch, buf->data, buf->line + dd->cursor, 9331 sizeof(buf->line) - dd->cursor, fmt, data, data__sz); 9332 if (ret < 0) { 9333 dump_line(dd->s, "%s[!] 
(\"%s\", %p, %u) failed to format (%d)", 9334 dd->prefix, fmt, data, data__sz, ret); 9335 return; 9336 } 9337 9338 dd->cursor += ret; 9339 dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); 9340 9341 if (!dd->cursor) 9342 return; 9343 9344 /* 9345 * If the line buf overflowed or ends in a newline, flush it into the 9346 * dump. This is to allow the caller to generate a single line over 9347 * multiple calls. As ops_dump_flush() can also handle multiple lines in 9348 * the line buf, the only case which can lead to an unexpected 9349 * truncation is when the caller keeps generating newlines in the middle 9350 * instead of the end consecutively. Don't do that. 9351 */ 9352 if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') 9353 ops_dump_flush(); 9354 } 9355 9356 /** 9357 * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU 9358 * @cpu: CPU of interest 9359 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9360 * 9361 * Return the maximum relative capacity of @cpu in relation to the most 9362 * performant CPU in the system. The return value is in the range [1, 9363 * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). 9364 */ 9365 __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux) 9366 { 9367 struct scx_sched *sch; 9368 9369 guard(rcu)(); 9370 9371 sch = scx_prog_sched(aux); 9372 if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) 9373 return arch_scale_cpu_capacity(cpu); 9374 else 9375 return SCX_CPUPERF_ONE; 9376 } 9377 9378 /** 9379 * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU 9380 * @cpu: CPU of interest 9381 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9382 * 9383 * Return the current relative performance of @cpu in relation to its maximum. 9384 * The return value is in the range [1, %SCX_CPUPERF_ONE]. 
9385 * 9386 * The current performance level of a CPU in relation to the maximum performance 9387 * available in the system can be calculated as follows: 9388 * 9389 * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE 9390 * 9391 * The result is in the range [1, %SCX_CPUPERF_ONE]. 9392 */ 9393 __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux) 9394 { 9395 struct scx_sched *sch; 9396 9397 guard(rcu)(); 9398 9399 sch = scx_prog_sched(aux); 9400 if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) 9401 return arch_scale_freq_capacity(cpu); 9402 else 9403 return SCX_CPUPERF_ONE; 9404 } 9405 9406 /** 9407 * scx_bpf_cpuperf_set - Set the relative performance target of a CPU 9408 * @cpu: CPU of interest 9409 * @perf: target performance level [0, %SCX_CPUPERF_ONE] 9410 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9411 * 9412 * Set the target performance level of @cpu to @perf. @perf is in linear 9413 * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the 9414 * schedutil cpufreq governor chooses the target frequency. 9415 * 9416 * The actual performance level chosen, CPU grouping, and the overhead and 9417 * latency of the operations are dependent on the hardware and cpufreq driver in 9418 * use. Consult hardware and cpufreq documentation for more information. The 9419 * current performance level can be monitored using scx_bpf_cpuperf_cur(). 
9420 */ 9421 __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_aux *aux) 9422 { 9423 struct scx_sched *sch; 9424 9425 guard(rcu)(); 9426 9427 sch = scx_prog_sched(aux); 9428 if (unlikely(!sch)) 9429 return; 9430 9431 if (unlikely(perf > SCX_CPUPERF_ONE)) { 9432 scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu); 9433 return; 9434 } 9435 9436 if (ops_cpu_valid(sch, cpu, NULL)) { 9437 struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); 9438 struct rq_flags rf; 9439 9440 /* 9441 * When called with an rq lock held, restrict the operation 9442 * to the corresponding CPU to prevent ABBA deadlocks. 9443 */ 9444 if (locked_rq && rq != locked_rq) { 9445 scx_error(sch, "Invalid target CPU %d", cpu); 9446 return; 9447 } 9448 9449 /* 9450 * If no rq lock is held, allow to operate on any CPU by 9451 * acquiring the corresponding rq lock. 9452 */ 9453 if (!locked_rq) { 9454 rq_lock_irqsave(rq, &rf); 9455 update_rq_clock(rq); 9456 } 9457 9458 rq->scx.cpuperf_target = perf; 9459 cpufreq_update_util(rq, 0); 9460 9461 if (!locked_rq) 9462 rq_unlock_irqrestore(rq, &rf); 9463 } 9464 } 9465 9466 /** 9467 * scx_bpf_nr_node_ids - Return the number of possible node IDs 9468 * 9469 * All valid node IDs in the system are smaller than the returned value. 9470 */ 9471 __bpf_kfunc u32 scx_bpf_nr_node_ids(void) 9472 { 9473 return nr_node_ids; 9474 } 9475 9476 /** 9477 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs 9478 * 9479 * All valid CPU IDs in the system are smaller than the returned value. 
9480 */ 9481 __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) 9482 { 9483 return nr_cpu_ids; 9484 } 9485 9486 /** 9487 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask 9488 */ 9489 __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) 9490 { 9491 return cpu_possible_mask; 9492 } 9493 9494 /** 9495 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask 9496 */ 9497 __bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void) 9498 { 9499 return cpu_online_mask; 9500 } 9501 9502 /** 9503 * scx_bpf_put_cpumask - Release a possible/online cpumask 9504 * @cpumask: cpumask to release 9505 */ 9506 __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) 9507 { 9508 /* 9509 * Empty function body because we aren't actually acquiring or releasing 9510 * a reference to a global cpumask, which is read-only in the caller and 9511 * is never released. The acquire / release semantics here are just used 9512 * to make the cpumask is a trusted pointer in the caller. 9513 */ 9514 } 9515 9516 /** 9517 * scx_bpf_task_running - Is task currently running? 
9518 * @p: task of interest 9519 */ 9520 __bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p) 9521 { 9522 return task_rq(p)->curr == p; 9523 } 9524 9525 /** 9526 * scx_bpf_task_cpu - CPU a task is currently associated with 9527 * @p: task of interest 9528 */ 9529 __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) 9530 { 9531 return task_cpu(p); 9532 } 9533 9534 /** 9535 * scx_bpf_cpu_rq - Fetch the rq of a CPU 9536 * @cpu: CPU of the rq 9537 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9538 */ 9539 __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux) 9540 { 9541 struct scx_sched *sch; 9542 9543 guard(rcu)(); 9544 9545 sch = scx_prog_sched(aux); 9546 if (unlikely(!sch)) 9547 return NULL; 9548 9549 if (!ops_cpu_valid(sch, cpu, NULL)) 9550 return NULL; 9551 9552 if (!sch->warned_deprecated_rq) { 9553 printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; " 9554 "use scx_bpf_locked_rq() when holding rq lock " 9555 "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__); 9556 sch->warned_deprecated_rq = true; 9557 } 9558 9559 return cpu_rq(cpu); 9560 } 9561 9562 /** 9563 * scx_bpf_locked_rq - Return the rq currently locked by SCX 9564 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9565 * 9566 * Returns the rq if a rq lock is currently held by SCX. 9567 * Otherwise emits an error and returns NULL. 
9568 */ 9569 __bpf_kfunc struct rq *scx_bpf_locked_rq(const struct bpf_prog_aux *aux) 9570 { 9571 struct scx_sched *sch; 9572 struct rq *rq; 9573 9574 guard(preempt)(); 9575 9576 sch = scx_prog_sched(aux); 9577 if (unlikely(!sch)) 9578 return NULL; 9579 9580 rq = scx_locked_rq(); 9581 if (!rq) { 9582 scx_error(sch, "accessing rq without holding rq lock"); 9583 return NULL; 9584 } 9585 9586 return rq; 9587 } 9588 9589 /** 9590 * scx_bpf_cpu_curr - Return remote CPU's curr task 9591 * @cpu: CPU of interest 9592 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9593 * 9594 * Callers must hold RCU read lock (KF_RCU). 9595 */ 9596 __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_aux *aux) 9597 { 9598 struct scx_sched *sch; 9599 9600 guard(rcu)(); 9601 9602 sch = scx_prog_sched(aux); 9603 if (unlikely(!sch)) 9604 return NULL; 9605 9606 if (!ops_cpu_valid(sch, cpu, NULL)) 9607 return NULL; 9608 9609 return rcu_dereference(cpu_rq(cpu)->curr); 9610 } 9611 9612 /** 9613 * scx_bpf_now - Returns a high-performance monotonically non-decreasing 9614 * clock for the current CPU. The clock returned is in nanoseconds. 9615 * 9616 * It provides the following properties: 9617 * 9618 * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently 9619 * to account for execution time and track tasks' runtime properties. 9620 * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which 9621 * eventually reads a hardware timestamp counter -- is neither performant nor 9622 * scalable. scx_bpf_now() aims to provide a high-performance clock by 9623 * using the rq clock in the scheduler core whenever possible. 9624 * 9625 * 2) High enough resolution for the BPF scheduler use cases: In most BPF 9626 * scheduler use cases, the required clock resolution is lower than the most 9627 * accurate hardware clock (e.g., rdtsc in x86). 
scx_bpf_now() basically 9628 * uses the rq clock in the scheduler core whenever it is valid. It considers 9629 * that the rq clock is valid from the time the rq clock is updated 9630 * (update_rq_clock) until the rq is unlocked (rq_unpin_lock). 9631 * 9632 * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now() 9633 * guarantees the clock never goes backward when comparing them in the same 9634 * CPU. On the other hand, when comparing clocks in different CPUs, there 9635 * is no such guarantee -- the clock can go backward. It provides a 9636 * monotonically *non-decreasing* clock so that it would provide the same 9637 * clock values in two different scx_bpf_now() calls in the same CPU 9638 * during the same period of when the rq clock is valid. 9639 */ 9640 __bpf_kfunc u64 scx_bpf_now(void) 9641 { 9642 struct rq *rq; 9643 u64 clock; 9644 9645 preempt_disable(); 9646 9647 rq = this_rq(); 9648 if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) { 9649 /* 9650 * If the rq clock is valid, use the cached rq clock. 9651 * 9652 * Note that scx_bpf_now() is re-entrant between a process 9653 * context and an interrupt context (e.g., timer interrupt). 9654 * However, we don't need to consider the race between them 9655 * because such race is not observable from a caller. 9656 */ 9657 clock = READ_ONCE(rq->scx.clock); 9658 } else { 9659 /* 9660 * Otherwise, return a fresh rq clock. 9661 * 9662 * The rq clock is updated outside of the rq lock. 9663 * In this case, keep the updated rq clock invalid so the next 9664 * kfunc call outside the rq lock gets a fresh rq clock. 9665 */ 9666 clock = sched_clock_cpu(cpu_of(rq)); 9667 } 9668 9669 preempt_enable(); 9670 9671 return clock; 9672 } 9673 9674 static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events) 9675 { 9676 struct scx_event_stats *e_cpu; 9677 int cpu; 9678 9679 /* Aggregate per-CPU event counters into @events. 
*/ 9680 memset(events, 0, sizeof(*events)); 9681 for_each_possible_cpu(cpu) { 9682 e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; 9683 scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); 9684 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 9685 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); 9686 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING); 9687 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 9688 scx_agg_event(events, e_cpu, SCX_EV_REENQ_IMMED); 9689 scx_agg_event(events, e_cpu, SCX_EV_REENQ_LOCAL_REPEAT); 9690 scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL); 9691 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION); 9692 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH); 9693 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE); 9694 scx_agg_event(events, e_cpu, SCX_EV_INSERT_NOT_OWNED); 9695 scx_agg_event(events, e_cpu, SCX_EV_SUB_BYPASS_DISPATCH); 9696 } 9697 } 9698 9699 /* 9700 * scx_bpf_events - Get a system-wide event counter to 9701 * @events: output buffer from a BPF program 9702 * @events__sz: @events len, must end in '__sz'' for the verifier 9703 */ 9704 __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, 9705 size_t events__sz) 9706 { 9707 struct scx_sched *sch; 9708 struct scx_event_stats e_sys; 9709 9710 rcu_read_lock(); 9711 sch = rcu_dereference(scx_root); 9712 if (sch) 9713 scx_read_events(sch, &e_sys); 9714 else 9715 memset(&e_sys, 0, sizeof(e_sys)); 9716 rcu_read_unlock(); 9717 9718 /* 9719 * We cannot entirely trust a BPF-provided size since a BPF program 9720 * might be compiled against a different vmlinux.h, of which 9721 * scx_event_stats would be larger (a newer vmlinux.h) or smaller 9722 * (an older vmlinux.h). Hence, we use the smaller size to avoid 9723 * memory corruption. 
9724 */ 9725 events__sz = min(events__sz, sizeof(*events)); 9726 memcpy(events, &e_sys, events__sz); 9727 } 9728 9729 #ifdef CONFIG_CGROUP_SCHED 9730 /** 9731 * scx_bpf_task_cgroup - Return the sched cgroup of a task 9732 * @p: task of interest 9733 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9734 * 9735 * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with 9736 * from the scheduler's POV. SCX operations should use this function to 9737 * determine @p's current cgroup as, unlike following @p->cgroups, 9738 * @p->sched_task_group is stable for the duration of the SCX op. See 9739 * SCX_CALL_OP_TASK() for details. 9740 */ 9741 __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p, 9742 const struct bpf_prog_aux *aux) 9743 { 9744 struct task_group *tg = p->sched_task_group; 9745 struct cgroup *cgrp = &cgrp_dfl_root.cgrp; 9746 struct scx_sched *sch; 9747 9748 guard(rcu)(); 9749 9750 sch = scx_prog_sched(aux); 9751 if (unlikely(!sch)) 9752 goto out; 9753 9754 if (!scx_kf_arg_task_ok(sch, p)) 9755 goto out; 9756 9757 cgrp = tg_cgrp(tg); 9758 9759 out: 9760 cgroup_get(cgrp); 9761 return cgrp; 9762 } 9763 #endif /* CONFIG_CGROUP_SCHED */ 9764 9765 __bpf_kfunc_end_defs(); 9766 9767 BTF_KFUNCS_START(scx_kfunc_ids_any) 9768 BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); 9769 BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); 9770 BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) 9771 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS) 9772 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS) 9773 BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) 9774 BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS) 9775 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS) 9776 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_IMPLICIT_ARGS | KF_ITER_NEW | KF_RCU_PROTECTED) 9777 BTF_ID_FLAGS(func, 
bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) 9778 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) 9779 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS) 9780 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS) 9781 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS) 9782 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) 9783 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) 9784 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) 9785 BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) 9786 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) 9787 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) 9788 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) 9789 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) 9790 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) 9791 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) 9792 BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) 9793 BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL) 9794 BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) 9795 BTF_ID_FLAGS(func, scx_bpf_now) 9796 BTF_ID_FLAGS(func, scx_bpf_events) 9797 #ifdef CONFIG_CGROUP_SCHED 9798 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_IMPLICIT_ARGS | KF_RCU | KF_ACQUIRE) 9799 #endif 9800 BTF_KFUNCS_END(scx_kfunc_ids_any) 9801 9802 static const struct btf_kfunc_id_set scx_kfunc_set_any = { 9803 .owner = THIS_MODULE, 9804 .set = &scx_kfunc_ids_any, 9805 .filter = scx_kfunc_context_filter, 9806 }; 9807 9808 /* 9809 * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc 9810 * group; an op may permit zero or more groups, with the union expressed in 9811 * scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter()) 9812 * consults this table to decide whether a context-sensitive kfunc is callable 9813 * from a given SCX op. 
9814 */ 9815 enum scx_kf_allow_flags { 9816 SCX_KF_ALLOW_UNLOCKED = 1 << 0, 9817 SCX_KF_ALLOW_CPU_RELEASE = 1 << 1, 9818 SCX_KF_ALLOW_DISPATCH = 1 << 2, 9819 SCX_KF_ALLOW_ENQUEUE = 1 << 3, 9820 SCX_KF_ALLOW_SELECT_CPU = 1 << 4, 9821 }; 9822 9823 /* 9824 * Map each SCX op to the union of kfunc groups it permits, indexed by 9825 * SCX_OP_IDX(op). Ops not listed only permit kfuncs that are not 9826 * context-sensitive. 9827 */ 9828 static const u32 scx_kf_allow_flags[] = { 9829 [SCX_OP_IDX(select_cpu)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, 9830 [SCX_OP_IDX(enqueue)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, 9831 [SCX_OP_IDX(dispatch)] = SCX_KF_ALLOW_ENQUEUE | SCX_KF_ALLOW_DISPATCH, 9832 [SCX_OP_IDX(cpu_release)] = SCX_KF_ALLOW_CPU_RELEASE, 9833 [SCX_OP_IDX(init_task)] = SCX_KF_ALLOW_UNLOCKED, 9834 [SCX_OP_IDX(dump)] = SCX_KF_ALLOW_UNLOCKED, 9835 #ifdef CONFIG_EXT_GROUP_SCHED 9836 [SCX_OP_IDX(cgroup_init)] = SCX_KF_ALLOW_UNLOCKED, 9837 [SCX_OP_IDX(cgroup_exit)] = SCX_KF_ALLOW_UNLOCKED, 9838 [SCX_OP_IDX(cgroup_prep_move)] = SCX_KF_ALLOW_UNLOCKED, 9839 [SCX_OP_IDX(cgroup_cancel_move)] = SCX_KF_ALLOW_UNLOCKED, 9840 [SCX_OP_IDX(cgroup_set_weight)] = SCX_KF_ALLOW_UNLOCKED, 9841 [SCX_OP_IDX(cgroup_set_bandwidth)] = SCX_KF_ALLOW_UNLOCKED, 9842 [SCX_OP_IDX(cgroup_set_idle)] = SCX_KF_ALLOW_UNLOCKED, 9843 #endif /* CONFIG_EXT_GROUP_SCHED */ 9844 [SCX_OP_IDX(sub_attach)] = SCX_KF_ALLOW_UNLOCKED, 9845 [SCX_OP_IDX(sub_detach)] = SCX_KF_ALLOW_UNLOCKED, 9846 [SCX_OP_IDX(cpu_online)] = SCX_KF_ALLOW_UNLOCKED, 9847 [SCX_OP_IDX(cpu_offline)] = SCX_KF_ALLOW_UNLOCKED, 9848 [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED, 9849 [SCX_OP_IDX(exit)] = SCX_KF_ALLOW_UNLOCKED, 9850 }; 9851 9852 /* 9853 * Verifier-time filter for SCX kfuncs. Registered via the .filter field on 9854 * each per-group btf_kfunc_id_set. 
The BPF core invokes this for every kfunc 9855 * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or 9856 * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the 9857 * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by 9858 * falling through to "allow" when none of the SCX sets contain the kfunc. 9859 */ 9860 int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) 9861 { 9862 bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id); 9863 bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id); 9864 bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); 9865 bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); 9866 bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); 9867 bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id); 9868 bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id); 9869 u32 moff, flags; 9870 9871 /* Not an SCX kfunc - allow. */ 9872 if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || 9873 in_cpu_release || in_idle || in_any)) 9874 return 0; 9875 9876 /* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */ 9877 if (prog->type == BPF_PROG_TYPE_SYSCALL) 9878 return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES; 9879 9880 if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) 9881 return (in_any || in_idle) ? 0 : -EACCES; 9882 9883 /* 9884 * add_subprog_and_kfunc() collects all kfunc calls, including dead code 9885 * guarded by bpf_ksym_exists(), before check_attach_btf_id() sets 9886 * prog->aux->st_ops. Allow all kfuncs when st_ops is not yet set; 9887 * do_check_main() re-runs the filter with st_ops set and enforces the 9888 * actual restrictions. 9889 */ 9890 if (!prog->aux->st_ops) 9891 return 0; 9892 9893 /* 9894 * Non-SCX struct_ops: SCX kfuncs are not permitted. 
9895 */ 9896 if (prog->aux->st_ops != &bpf_sched_ext_ops) 9897 return -EACCES; 9898 9899 /* SCX struct_ops: check the per-op allow list. */ 9900 if (in_any || in_idle) 9901 return 0; 9902 9903 moff = prog->aux->attach_st_ops_member_off; 9904 flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)]; 9905 9906 if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked) 9907 return 0; 9908 if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release) 9909 return 0; 9910 if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch) 9911 return 0; 9912 if ((flags & SCX_KF_ALLOW_ENQUEUE) && in_enqueue) 9913 return 0; 9914 if ((flags & SCX_KF_ALLOW_SELECT_CPU) && in_select_cpu) 9915 return 0; 9916 9917 return -EACCES; 9918 } 9919 9920 static int __init scx_init(void) 9921 { 9922 int ret; 9923 9924 /* 9925 * kfunc registration can't be done from init_sched_ext_class() as 9926 * register_btf_kfunc_id_set() needs most of the system to be up. 9927 * 9928 * Some kfuncs are context-sensitive and can only be called from 9929 * specific SCX ops. They are grouped into per-context BTF sets, each 9930 * registered with scx_kfunc_context_filter as its .filter callback. The 9931 * BPF core dedups identical filter pointers per hook 9932 * (btf_populate_kfunc_set()), so the filter is invoked exactly once per 9933 * kfunc lookup; it consults scx_kf_allow_flags[] to enforce per-op 9934 * restrictions at verify time. 
9935 */ 9936 if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9937 &scx_kfunc_set_enqueue_dispatch)) || 9938 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9939 &scx_kfunc_set_dispatch)) || 9940 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9941 &scx_kfunc_set_cpu_release)) || 9942 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9943 &scx_kfunc_set_unlocked)) || 9944 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 9945 &scx_kfunc_set_unlocked)) || 9946 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9947 &scx_kfunc_set_any)) || 9948 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, 9949 &scx_kfunc_set_any)) || 9950 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 9951 &scx_kfunc_set_any))) { 9952 pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); 9953 return ret; 9954 } 9955 9956 ret = scx_idle_init(); 9957 if (ret) { 9958 pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); 9959 return ret; 9960 } 9961 9962 ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); 9963 if (ret) { 9964 pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); 9965 return ret; 9966 } 9967 9968 ret = register_pm_notifier(&scx_pm_notifier); 9969 if (ret) { 9970 pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); 9971 return ret; 9972 } 9973 9974 scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); 9975 if (!scx_kset) { 9976 pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); 9977 return -ENOMEM; 9978 } 9979 9980 ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); 9981 if (ret < 0) { 9982 pr_err("sched_ext: Failed to add global attributes\n"); 9983 return ret; 9984 } 9985 9986 return 0; 9987 } 9988 __initcall(scx_init); 9989