/* SPDX-License-Identifier: GPL-2.0 */
/*
 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <linux/btf_ids.h>
#include "ext_idle.h"

/* one of the two locks covering scx_sched_all, see the rules further below */
static DEFINE_RAW_SPINLOCK(scx_sched_lock);

/*
 * NOTE: sched_ext is in the process of growing multiple scheduler support and
 * scx_root usage is in a transitional state. Naked dereferences are safe if the
 * caller is one of the tasks attached to SCX and explicit RCU dereference is
 * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but
 * are used as temporary markers to indicate that the dereferences need to be
 * updated to point to the associated scheduler instances rather than scx_root.
 */
struct scx_sched __rcu *scx_root;

/*
 * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock.
 * Readers can hold either or rcu_read_lock().
 */
static LIST_HEAD(scx_sched_all);

#ifdef CONFIG_EXT_SUB_SCHED
/* scheds are hashed by ops.sub_cgroup_id, see scx_find_sub_sched() */
static const struct rhashtable_params scx_sched_hash_params = {
	.key_len		= sizeof_field(struct scx_sched, ops.sub_cgroup_id),
	.key_offset		= offsetof(struct scx_sched, ops.sub_cgroup_id),
	.head_offset		= offsetof(struct scx_sched, hash_node),
};

static struct rhashtable scx_sched_hash;
#endif

/*
 * During exit, a task may schedule after losing its PIDs. When disabling the
 * BPF scheduler, we need to be able to iterate tasks in every state to
 * guarantee system safety. Maintain a dedicated task list which contains every
 * task between its fork and eventual free.
45 */ 46 static DEFINE_RAW_SPINLOCK(scx_tasks_lock); 47 static LIST_HEAD(scx_tasks); 48 49 /* ops enable/disable */ 50 static DEFINE_MUTEX(scx_enable_mutex); 51 DEFINE_STATIC_KEY_FALSE(__scx_enabled); 52 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 53 static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); 54 static DEFINE_RAW_SPINLOCK(scx_bypass_lock); 55 static cpumask_var_t scx_bypass_lb_donee_cpumask; 56 static cpumask_var_t scx_bypass_lb_resched_cpumask; 57 static bool scx_init_task_enabled; 58 static bool scx_switching_all; 59 DEFINE_STATIC_KEY_FALSE(__scx_switched_all); 60 61 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); 62 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); 63 64 #ifdef CONFIG_EXT_SUB_SCHED 65 /* 66 * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit 67 * tasks for the sub-sched being enabled. Use a global variable instead of a 68 * per-task field as all enables are serialized. 69 */ 70 static struct scx_sched *scx_enabling_sub_sched; 71 #else 72 #define scx_enabling_sub_sched (struct scx_sched *)NULL 73 #endif /* CONFIG_EXT_SUB_SCHED */ 74 75 /* 76 * A monotically increasing sequence number that is incremented every time a 77 * scheduler is enabled. This can be used by to check if any custom sched_ext 78 * scheduler has ever been used in the system. 79 */ 80 static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); 81 82 /* 83 * Watchdog interval. All scx_sched's share a single watchdog timer and the 84 * interval is half of the shortest sch->watchdog_timeout. 85 */ 86 static unsigned long scx_watchdog_interval; 87 88 /* 89 * The last time the delayed work was run. This delayed work relies on 90 * ksoftirqd being able to run to service timer interrupts, so it's possible 91 * that this work itself could get wedged. To account for this, we check that 92 * it's not stalled in the timer tick, and trigger an error if it is. 
 */
static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;

static struct delayed_work scx_watchdog_work;

/*
 * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence
 * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
 * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
 * lazily when enabling and freed when disabling to avoid waste when sched_ext
 * isn't active.
 */
struct scx_kick_syncs {
	/* NOTE(review): presumably for RCU-deferred freeing - confirm at the free site */
	struct rcu_head		rcu;
	/* the sequence numbers; flexible array sized at allocation time */
	unsigned long		syncs[];
};

static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);

/*
 * Direct dispatch marker.
 *
 * Non-NULL values are used for direct dispatch from enqueue path. A valid
 * pointer points to the task currently being enqueued. An ERR_PTR value is used
 * to indicate that direct dispatch has already happened.
 */
static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);

/* DSQs in sch->dsq_hash are keyed by scx_dispatch_q.id, see find_user_dsq() */
static const struct rhashtable_params dsq_hash_params = {
	.key_len		= sizeof_field(struct scx_dispatch_q, id),
	.key_offset		= offsetof(struct scx_dispatch_q, id),
	.head_offset		= offsetof(struct scx_dispatch_q, hash_node),
};

/* NOTE(review): presumably DSQs queued for deferred freeing - users are outside this chunk */
static LLIST_HEAD(dsqs_to_free);

/* string formatting from BPF */
struct scx_bstr_buf {
	/* %MAX_BPRINTF_VARARGS 64-bit slots */
	u64			data[MAX_BPRINTF_VARARGS];
	/* text buffer of %SCX_EXIT_MSG_LEN bytes */
	char			line[SCX_EXIT_MSG_LEN];
};

static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock);
static struct scx_bstr_buf scx_exit_bstr_buf;

/* ops debug dump */
static DEFINE_RAW_SPINLOCK(scx_dump_lock);

struct scx_dump_data {
	s32			cpu;
	bool			first;
	s32			cursor;
	struct seq_buf		*s;
	const char		*prefix;
	struct scx_bstr_buf	buf;
};

/* statically initialized with .cpu == -1; all other fields start zeroed */
static struct scx_dump_data scx_dump_data = {
	.cpu			= -1,
};

/* /sys/kernel/sched_ext interface */
static struct kset *scx_kset;

/*
 * Parameters that can be adjusted through
/sys/module/sched_ext/parameters. 159 * There usually is no reason to modify these as normal scheduler operation 160 * shouldn't be affected by them. The knobs are primarily for debugging. 161 */ 162 static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; 163 static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; 164 165 static int set_slice_us(const char *val, const struct kernel_param *kp) 166 { 167 return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC); 168 } 169 170 static const struct kernel_param_ops slice_us_param_ops = { 171 .set = set_slice_us, 172 .get = param_get_uint, 173 }; 174 175 static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp) 176 { 177 return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC); 178 } 179 180 static const struct kernel_param_ops bypass_lb_intv_us_param_ops = { 181 .set = set_bypass_lb_intv_us, 182 .get = param_get_uint, 183 }; 184 185 #undef MODULE_PARAM_PREFIX 186 #define MODULE_PARAM_PREFIX "sched_ext." 
187 188 module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); 189 MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); 190 module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600); 191 MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)"); 192 193 #undef MODULE_PARAM_PREFIX 194 195 #define CREATE_TRACE_POINTS 196 #include <trace/events/sched_ext.h> 197 198 static void run_deferred(struct rq *rq); 199 static bool task_dead_and_done(struct task_struct *p); 200 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); 201 static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind); 202 static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, 203 s64 exit_code, const char *fmt, va_list args); 204 205 static __printf(4, 5) bool scx_exit(struct scx_sched *sch, 206 enum scx_exit_kind kind, s64 exit_code, 207 const char *fmt, ...) 208 { 209 va_list args; 210 bool ret; 211 212 va_start(args, fmt); 213 ret = scx_vexit(sch, kind, exit_code, fmt, args); 214 va_end(args); 215 216 return ret; 217 } 218 219 #define scx_error(sch, fmt, args...) 
scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) 220 #define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args) 221 222 #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) 223 224 static long jiffies_delta_msecs(unsigned long at, unsigned long now) 225 { 226 if (time_after(at, now)) 227 return jiffies_to_msecs(at - now); 228 else 229 return -(long)jiffies_to_msecs(now - at); 230 } 231 232 /* if the highest set bit is N, return a mask with bits [N+1, 31] set */ 233 static u32 higher_bits(u32 flags) 234 { 235 return ~((1 << fls(flags)) - 1); 236 } 237 238 /* return the mask with only the highest bit set */ 239 static u32 highest_bit(u32 flags) 240 { 241 int bit = fls(flags); 242 return ((u64)1 << bit) >> 1; 243 } 244 245 static bool u32_before(u32 a, u32 b) 246 { 247 return (s32)(a - b) < 0; 248 } 249 250 #ifdef CONFIG_EXT_SUB_SCHED 251 /** 252 * scx_parent - Find the parent sched 253 * @sch: sched to find the parent of 254 * 255 * Returns the parent scheduler or %NULL if @sch is root. 256 */ 257 static struct scx_sched *scx_parent(struct scx_sched *sch) 258 { 259 if (sch->level) 260 return sch->ancestors[sch->level - 1]; 261 else 262 return NULL; 263 } 264 265 /** 266 * scx_next_descendant_pre - find the next descendant for pre-order walk 267 * @pos: the current position (%NULL to initiate traversal) 268 * @root: sched whose descendants to walk 269 * 270 * To be used by scx_for_each_descendant_pre(). Find the next descendant to 271 * visit for pre-order traversal of @root's descendants. @root is included in 272 * the iteration and the first node to be visited. 
273 */ 274 static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, 275 struct scx_sched *root) 276 { 277 struct scx_sched *next; 278 279 lockdep_assert(lockdep_is_held(&scx_enable_mutex) || 280 lockdep_is_held(&scx_sched_lock)); 281 282 /* if first iteration, visit @root */ 283 if (!pos) 284 return root; 285 286 /* visit the first child if exists */ 287 next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling); 288 if (next) 289 return next; 290 291 /* no child, visit my or the closest ancestor's next sibling */ 292 while (pos != root) { 293 if (!list_is_last(&pos->sibling, &scx_parent(pos)->children)) 294 return list_next_entry(pos, sibling); 295 pos = scx_parent(pos); 296 } 297 298 return NULL; 299 } 300 301 static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) 302 { 303 return rhashtable_lookup(&scx_sched_hash, &cgroup_id, 304 scx_sched_hash_params); 305 } 306 307 static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) 308 { 309 rcu_assign_pointer(p->scx.sched, sch); 310 } 311 #else /* CONFIG_EXT_SUB_SCHED */ 312 static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } 313 static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } 314 static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { return NULL; } 315 static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} 316 #endif /* CONFIG_EXT_SUB_SCHED */ 317 318 /** 319 * scx_is_descendant - Test whether sched is a descendant 320 * @sch: sched to test 321 * @ancestor: ancestor sched to test against 322 * 323 * Test whether @sch is a descendant of @ancestor. 
 *
 * A sched that sits at a shallower level than @ancestor can't be below it.
 * Otherwise, @sch is a descendant iff the entry of its ancestors[] array at
 * @ancestor's level is @ancestor.
 *
 * NOTE(review): whether @sch == @ancestor tests true depends on whether
 * ancestors[] holds the sched itself at index ->level - confirm where the
 * array is populated.
 */
static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor)
{
	if (sch->level < ancestor->level)
		return false;
	return sch->ancestors[ancestor->level] == ancestor;
}

/**
 * scx_for_each_descendant_pre - pre-order walk of a sched's descendants
 * @pos: iteration cursor
 * @root: sched to walk the descendants of
 *
 * Walk @root's descendants. @root is included in the iteration and the first
 * node to be visited. Must be called with either scx_enable_mutex or
 * scx_sched_lock held.
 */
#define scx_for_each_descendant_pre(pos, root)					\
	for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos);		\
	     (pos) = scx_next_descendant_pre((pos), (root)))

/* return @sch's global DSQ for the NUMA node that @cpu belongs to */
static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, s32 cpu)
{
	return &sch->pnode[cpu_to_node(cpu)]->global_dsq;
}

/* look up @dsq_id in @sch->dsq_hash, %NULL if the lookup finds nothing */
static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
{
	return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
}

/*
 * Determine the sched_class for @p. A task which is on stop_sched_class stays
 * there. For everything else, defer to __setscheduler_class() with @p's
 * current policy and prio.
 */
static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
{
	if (p->sched_class == &stop_sched_class)
		return &stop_sched_class;

	return __setscheduler_class(p->policy, p->prio);
}

/* return @sch's own bypass DSQ for @cpu */
static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu)
{
	return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
}

/*
 * Return the bypass DSQ that tasks of @sch should be enqueued on for @cpu.
 * Without %CONFIG_EXT_SUB_SCHED, this is always @sch's own bypass DSQ.
 */
static struct scx_dispatch_q *bypass_enq_target_dsq(struct scx_sched *sch, s32 cpu)
{
#ifdef CONFIG_EXT_SUB_SCHED
	/*
	 * If @sch is a sub-sched which is bypassing, its tasks should go into
	 * the bypass DSQs of the nearest ancestor which is not bypassing. The
	 * not-bypassing ancestor is responsible for scheduling all tasks from
	 * bypassing sub-trees. If all ancestors including root are bypassing,
	 * all tasks should go to the root's bypass DSQs.
	 *
	 * Whenever a sched starts bypassing, all runnable tasks in its subtree
	 * are re-enqueued after scx_bypassing() is turned on, guaranteeing that
	 * all tasks are transferred to the right DSQs.
	 */
	while (scx_parent(sch) && scx_bypassing(sch, cpu))
		sch = scx_parent(sch);
#endif	/* CONFIG_EXT_SUB_SCHED */

	return bypass_dsq(sch, cpu);
}

/**
 * bypass_dsp_enabled - Check if bypass dispatch path is enabled
 * @sch: scheduler to check
 *
 * When a descendant scheduler enters bypass mode, bypassed tasks are scheduled
 * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors
 * are bypassing. In the former case, the ancestor is not itself bypassing but
 * its bypass DSQs will be populated with bypassed tasks from descendants. Thus,
 * the ancestor's bypass dispatch path must be active even though its own
 * bypass_depth remains zero.
 *
 * This function checks bypass_dsp_enable_depth which is managed separately from
 * bypass_depth to enable this decoupling. See enable_bypass_dsp() and
 * disable_bypass_dsp().
 */
static bool bypass_dsp_enabled(struct scx_sched *sch)
{
	return unlikely(atomic_read(&sch->bypass_dsp_enable_depth));
}

/**
 * rq_is_open - Is the rq available for immediate execution of an SCX task?
 * @rq: rq to test
 * @enq_flags: optional %SCX_ENQ_* of the task being enqueued
 *
 * Returns %true if @rq is currently open for executing an SCX task. After a
 * %false return, @rq is guaranteed to invoke SCX dispatch path at least once
 * before going to idle and not inserting a task into @rq's local DSQ after a
 * %false return doesn't cause @rq to stall.
 */
static bool rq_is_open(struct rq *rq, u64 enq_flags)
{
	lockdep_assert_rq_held(rq);

	/*
	 * A higher-priority class task is either running or in the process of
	 * waking up on @rq.
	 */
	if (sched_class_above(rq->next_class, &ext_sched_class))
		return false;

	/*
	 * @rq is either in transition to or in idle and there is no
	 * higher-priority class task waking up on it.
	 */
	if (sched_class_above(&ext_sched_class, rq->next_class))
		return true;

	/*
	 * @rq is either picking, in transition to, or running an SCX task.
	 */

	/*
	 * If we're in the dispatch path holding rq lock, $curr may or may not
	 * be ready depending on whether the on-going dispatch decides to extend
	 * $curr's slice. We say yes here and resolve it at the end of dispatch.
	 * See balance_one().
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE)
		return true;

	/*
	 * %SCX_ENQ_PREEMPT clears $curr's slice if on SCX and kicks dispatch,
	 * so allow it to avoid spuriously triggering reenq on a combined
	 * PREEMPT|IMMED insertion.
	 */
	if (enq_flags & SCX_ENQ_PREEMPT)
		return true;

	/*
	 * @rq is either in transition to or running an SCX task and can't go
	 * idle without another SCX dispatch cycle.
	 */
	return false;
}

/*
 * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
 * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
 * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check
 * whether it's running from an allowed context.
 *
 * @mask is constant, always inline to cull the mask calculations.
 */
static __always_inline void scx_kf_allow(u32 mask)
{
	/* nesting is allowed only in increasing scx_kf_mask order */
	WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
		  "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
		  current->scx.kf_mask, mask);
	current->scx.kf_mask |= mask;
	/* compiler barrier: keep the mask update ahead of what follows */
	barrier();
}

/* undo scx_kf_allow(): clear @mask from current's kf_mask */
static void scx_kf_disallow(u32 mask)
{
	/* compiler barrier: keep what precedes ahead of the mask update */
	barrier();
	current->scx.kf_mask &= ~mask;
}

/*
 * Track the rq currently locked.
 *
 * This allows kfuncs to safely operate on rq from any scx ops callback,
 * knowing which rq is already locked.
 */
DEFINE_PER_CPU(struct rq *, scx_locked_rq_state);

/* record @rq (or %NULL) as the rq locked on this CPU */
static inline void update_locked_rq(struct rq *rq)
{
	/*
	 * Check whether @rq is actually locked. This can help expose bugs
	 * or incorrect assumptions about the context in which a kfunc or
	 * callback is executed.
	 */
	if (rq)
		lockdep_assert_rq_held(rq);
	__this_cpu_write(scx_locked_rq_state, rq);
}

/*
 * Invoke @sch->ops.@op(@args). If @rq is non-NULL, it is recorded through
 * update_locked_rq() for the duration of the call and cleared afterwards. If
 * @mask is non-zero, the call is bracketed by scx_kf_allow(@mask) and
 * scx_kf_disallow(@mask). Note that @rq and @mask are evaluated more than
 * once.
 */
#define SCX_CALL_OP(sch, mask, op, rq, args...)					\
do {										\
	if (rq)									\
		update_locked_rq(rq);						\
	if (mask) {								\
		scx_kf_allow(mask);						\
		(sch)->ops.op(args);						\
		scx_kf_disallow(mask);						\
	} else {								\
		(sch)->ops.op(args);						\
	}									\
	if (rq)									\
		update_locked_rq(NULL);						\
} while (0)

/* same as SCX_CALL_OP() but evaluates to the return value of the op */
#define SCX_CALL_OP_RET(sch, mask, op, rq, args...)				\
({										\
	__typeof__((sch)->ops.op(args)) __ret;					\
										\
	if (rq)									\
		update_locked_rq(rq);						\
	if (mask) {								\
		scx_kf_allow(mask);						\
		__ret = (sch)->ops.op(args);					\
		scx_kf_disallow(mask);						\
	} else {								\
		__ret = (sch)->ops.op(args);					\
	}									\
	if (rq)									\
		update_locked_rq(NULL);						\
	__ret;									\
})

/*
 * Some kfuncs are allowed only on the tasks that are subjects of the
 * in-progress scx_ops operation for, e.g., locking guarantees.
 * To enforce such restrictions, the following SCX_CALL_OP_*() variants should
 * be used when invoking scx_ops operations that take task arguments. These can
 * only be used for non-nesting operations due to the way the tasks are
 * tracked.
 *
 * kfuncs which can only operate on such tasks can in turn use
 * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
 * the specific task.
 *
 * The subject tasks are recorded in current->scx.kf_tasks[] around the call
 * and @mask must be within %__SCX_KF_TERMINAL, which is checked at build time.
 */
#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...)			\
do {										\
	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
	current->scx.kf_tasks[0] = task;					\
	SCX_CALL_OP((sch), mask, op, rq, task, ##args);				\
	current->scx.kf_tasks[0] = NULL;					\
} while (0)

/* SCX_CALL_OP_TASK() which evaluates to the return value of the op */
#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...)			\
({										\
	__typeof__((sch)->ops.op(task, ##args)) __ret;				\
	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
	current->scx.kf_tasks[0] = task;					\
	__ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args);		\
	current->scx.kf_tasks[0] = NULL;					\
	__ret;									\
})

/* SCX_CALL_OP_TASK_RET() for ops which take two subject tasks */
#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...)	\
({										\
	__typeof__((sch)->ops.op(task0, task1, ##args)) __ret;			\
	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
	current->scx.kf_tasks[0] = task0;					\
	current->scx.kf_tasks[1] = task1;					\
	__ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args);	\
	current->scx.kf_tasks[0] = NULL;					\
	current->scx.kf_tasks[1] = NULL;					\
	__ret;									\
})

/*
 * Test whether a kfunc which requires @mask may run in the current context.
 * On failure, an error is raised on @sch through scx_error() and %false is
 * returned.
 *
 * @mask is constant, always inline to cull unnecessary branches.
 */
static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask)
{
	/* none of the contexts the kfunc accepts is currently allowed */
	if (unlikely(!(current->scx.kf_mask & mask))) {
		scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x",
			  mask, current->scx.kf_mask);
		return false;
	}

	/*
	 * Enforce nesting boundaries. e.g. A kfunc which can be called from
	 * DISPATCH must not be called if we're running DEQUEUE which is nested
	 * inside ops.dispatch(). We don't need to check boundaries for any
	 * blocking kfuncs as the verifier ensures they're only called from
	 * sleepable progs.
	 */
	if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
		     (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
		scx_error(sch, "cpu_release kfunc called from a nested operation");
		return false;
	}

	if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
		     (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
		scx_error(sch, "dispatch kfunc called from a nested operation");
		return false;
	}

	return true;
}

/*
 * scx_kf_allowed() plus a check that @p is one of the tasks recorded in
 * current->scx.kf_tasks[]. See SCX_CALL_OP_TASK().
 */
static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
							u32 mask,
							struct task_struct *p)
{
	if (!scx_kf_allowed(sch, mask))
		return false;

	if (unlikely((p != current->scx.kf_tasks[0] &&
		      p != current->scx.kf_tasks[1]))) {
		scx_error(sch, "called on a task not being operated on");
		return false;
	}

	return true;
}

enum scx_dsq_iter_flags {
	/* iterate in the reverse dispatch order */
	SCX_DSQ_ITER_REV		= 1U << 16,

	/* internal flags, not part of __SCX_DSQ_ITER_USER_FLAGS */
	__SCX_DSQ_ITER_HAS_SLICE	= 1U << 30,
	__SCX_DSQ_ITER_HAS_VTIME	= 1U << 31,

	__SCX_DSQ_ITER_USER_FLAGS	= SCX_DSQ_ITER_REV,
	__SCX_DSQ_ITER_ALL_FLAGS	= __SCX_DSQ_ITER_USER_FLAGS |
					  __SCX_DSQ_ITER_HAS_SLICE |
					  __SCX_DSQ_ITER_HAS_VTIME,
};

/**
 * nldsq_next_task - Iterate to the next task in a non-local DSQ
 * @dsq: non-local dsq being iterated
 * @cur: current position, %NULL to start iteration
 * @rev: walk backwards
 *
 * Returns %NULL when iteration is finished.
651 */ 652 static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, 653 struct task_struct *cur, bool rev) 654 { 655 struct list_head *list_node; 656 struct scx_dsq_list_node *dsq_lnode; 657 658 lockdep_assert_held(&dsq->lock); 659 660 if (cur) 661 list_node = &cur->scx.dsq_list.node; 662 else 663 list_node = &dsq->list; 664 665 /* find the next task, need to skip BPF iteration cursors */ 666 do { 667 if (rev) 668 list_node = list_node->prev; 669 else 670 list_node = list_node->next; 671 672 if (list_node == &dsq->list) 673 return NULL; 674 675 dsq_lnode = container_of(list_node, struct scx_dsq_list_node, 676 node); 677 } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); 678 679 return container_of(dsq_lnode, struct task_struct, scx.dsq_list); 680 } 681 682 #define nldsq_for_each_task(p, dsq) \ 683 for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ 684 (p) = nldsq_next_task((dsq), (p), false)) 685 686 /** 687 * nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ 688 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() 689 * @dsq: non-local dsq being iterated 690 * 691 * Find the next task in a cursor based iteration. The caller must have 692 * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock 693 * between the iteration steps. 694 * 695 * Only tasks which were queued before @cursor was initialized are visible. This 696 * bounds the iteration and guarantees that vtime never jumps in the other 697 * direction while iterating. 
 *
 * Returns the next visible task with @cursor repositioned next to it, or
 * %NULL with @cursor unlinked when the iteration is finished.
 */
static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor,
						  struct scx_dispatch_q *dsq)
{
	bool rev = cursor->flags & SCX_DSQ_ITER_REV;
	struct task_struct *p;

	lockdep_assert_held(&dsq->lock);
	BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR));

	/*
	 * An unlinked cursor means start from the list head. Otherwise, pose
	 * @cursor as the dsq_list node of a task so that nldsq_next_task()
	 * resumes from the cursor's position. nldsq_next_task() only uses the
	 * resulting pointer to get back to the list node.
	 */
	if (list_empty(&cursor->node))
		p = NULL;
	else
		p = container_of(cursor, struct task_struct, scx.dsq_list);

	/* skip cursors and tasks that were queued after @cursor init */
	do {
		p = nldsq_next_task(dsq, p, rev);
	} while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq)));

	if (p) {
		/* park @cursor right past @p in the walking direction */
		if (rev)
			list_move_tail(&cursor->node, &p->scx.dsq_list.node);
		else
			list_move(&cursor->node, &p->scx.dsq_list.node);
	} else {
		/* done, take @cursor off @dsq */
		list_del_init(&cursor->node);
	}

	return p;
}

/**
 * nldsq_cursor_lost_task - Test whether someone else took the task since iteration
 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR()
 * @rq: rq @p was on
 * @dsq: dsq @p was on
 * @p: target task
 *
 * @p is a task returned by nldsq_cursor_next_task(). The locks may have been
 * dropped and re-acquired in between. Verify that no one else took or is in
 * the process of taking @p from @dsq.
 *
 * On %false return, the caller can assume full ownership of @p.
 */
static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor,
				   struct rq *rq, struct scx_dispatch_q *dsq,
				   struct task_struct *p)
{
	lockdep_assert_rq_held(rq);
	lockdep_assert_held(&dsq->lock);

	/*
	 * @p could have already left @dsq, got re-enqueued, or be in the
	 * process of being consumed by someone else.
	 */
	if (unlikely(p->scx.dsq != dsq ||
		     u32_before(cursor->priv, p->scx.dsq_seq) ||
		     p->scx.holding_cpu >= 0))
		return true;

	/* if @p has stayed on @dsq, its rq couldn't have changed */
	if (WARN_ON_ONCE(rq != task_rq(p)))
		return true;

	return false;
}

/*
 * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
 * dispatch order. BPF-visible iterator is opaque and larger to allow future
 * changes without breaking backward compatibility. Can be used with
 * bpf_for_each(). See bpf_iter_scx_dsq_*().
 */
struct bpf_iter_scx_dsq_kern {
	struct scx_dsq_list_node	cursor;
	struct scx_dispatch_q		*dsq;
	u64				slice;
	u64				vtime;
} __attribute__((aligned(8)));

/* the opaque BPF-visible counterpart, six u64's */
struct bpf_iter_scx_dsq {
	u64	__opaque[6];
} __attribute__((aligned(8)));


/*
 * SCX task iterator.
 */
struct scx_task_iter {
	/* scx_tasks mode: list cursor flagged with %SCX_TASK_CURSOR */
	struct sched_ext_entity		cursor;
	/* task whose rq scx_task_iter_next_locked() locked, %NULL if none */
	struct task_struct		*locked_task;
	/* the rq and rq_flags that go with @locked_task */
	struct rq			*rq;
	struct rq_flags			rf;
	/* number of scx_task_iter_next() calls, used to batch lock breaks */
	u32				cnt;
	/* scx_tasks mode: whether scx_tasks_lock is currently held */
	bool				list_locked;
#ifdef CONFIG_EXT_SUB_SCHED
	/* cgroup mode: root of the walk, current css and its task iterator */
	struct cgroup			*cgrp;
	struct cgroup_subsys_state	*css_pos;
	struct css_task_iter		css_iter;
#endif
};

/**
 * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
 * @iter: iterator to init
 * @cgrp: Optional root of cgroup subhierarchy to iterate
 *
 * Initialize @iter. Once initialized, @iter must eventually be stopped with
 * scx_task_iter_stop().
 *
 * If @cgrp is %NULL, scx_tasks is used for iteration and this function returns
 * with scx_tasks_lock held and @iter->cursor inserted into scx_tasks.
 *
 * If @cgrp is not %NULL, @cgrp and its descendants' tasks are walked using
 * @iter->css_iter. The caller must be holding cgroup_lock() to prevent cgroup
 * task migrations.
 *
 * The two modes of iterations are largely independent and it's likely that
 * scx_tasks can be removed in favor of always using cgroup iteration if
 * CONFIG_SCHED_CLASS_EXT depends on CONFIG_CGROUPS.
 *
 * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
 * between this and the first next() call or between any two next() calls. If
 * the locks are released between two next() calls, the caller is responsible
 * for ensuring that the task being iterated remains accessible either through
 * RCU read lock or obtaining a reference count.
 *
 * All tasks which existed when the iteration started are guaranteed to be
 * visited as long as they are not dead.
 *
 * Note that @cgrp is only honored with %CONFIG_EXT_SUB_SCHED. Without it,
 * scx_tasks is always used.
 */
static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp)
{
	memset(iter, 0, sizeof(*iter));

#ifdef CONFIG_EXT_SUB_SCHED
	if (cgrp) {
		lockdep_assert_held(&cgroup_mutex);
		iter->cgrp = cgrp;
		/* the pre-order walk begins at @cgrp's own css */
		iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self);
		css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
		return;
	}
#endif
	raw_spin_lock_irq(&scx_tasks_lock);

	/* put the cursor at the head so that the walk covers the whole list */
	iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
	list_add(&iter->cursor.tasks_node, &scx_tasks);
	iter->list_locked = true;
}

/*
 * If @iter is holding the rq lock of a task, run the pending balance callbacks
 * of that rq and release the lock. No-op otherwise.
 */
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
	if (iter->locked_task) {
		__balance_callbacks(iter->rq, &iter->rf);
		task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
		iter->locked_task = NULL;
	}
}

/**
 * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
 * @iter: iterator to unlock
 *
 * If @iter is in the middle of a locked iteration, it may be locking the rq of
 * the task currently being visited in addition to scx_tasks_lock. Unlock both.
 * This function can be safely called anytime during an iteration.
 * The next iterator operation will automatically restore the necessary
 * locking.
 */
static void scx_task_iter_unlock(struct scx_task_iter *iter)
{
	__scx_task_iter_rq_unlock(iter);
	if (iter->list_locked) {
		iter->list_locked = false;
		raw_spin_unlock_irq(&scx_tasks_lock);
	}
}

/* re-acquire scx_tasks_lock if scx_task_iter_unlock() dropped it */
static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
{
	if (!iter->list_locked) {
		raw_spin_lock_irq(&scx_tasks_lock);
		iter->list_locked = true;
	}
}

/**
 * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
 * @iter: iterator to exit
 *
 * Exit a previously initialized @iter. When iterating scx_tasks,
 * scx_tasks_lock is re-acquired if it was dropped with scx_task_iter_unlock(),
 * the cursor is removed from scx_tasks and the lock is released on return.
 * When iterating a cgroup subhierarchy, the css task iterator is ended if it's
 * still active. In both cases, if the iterator holds a task's rq lock, that rq
 * lock is also released. See scx_task_iter_start() for details.
 */
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
#ifdef CONFIG_EXT_SUB_SCHED
	if (iter->cgrp) {
		/* a %NULL css_pos means next() already ended the css iter */
		if (iter->css_pos)
			css_task_iter_end(&iter->css_iter);
		__scx_task_iter_rq_unlock(iter);
		return;
	}
#endif
	__scx_task_iter_maybe_relock(iter);
	list_del_init(&iter->cursor.tasks_node);
	scx_task_iter_unlock(iter);
}

/**
 * scx_task_iter_next - Next task
 * @iter: iterator to walk
 *
 * Visit the next task. See scx_task_iter_start() for details. Locks are dropped
 * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls
 * by holding scx_tasks_lock for too long.
 *
 * Returns the next task or %NULL when the iteration is finished.
 */
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
	struct list_head *cursor = &iter->cursor.tasks_node;
	struct sched_ext_entity *pos;

	/* periodic lock break, the locks are restored below as needed */
	if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
		scx_task_iter_unlock(iter);
		cond_resched();
	}

#ifdef CONFIG_EXT_SUB_SCHED
	if (iter->cgrp) {
		while (iter->css_pos) {
			struct task_struct *p;

			p = css_task_iter_next(&iter->css_iter);
			if (p)
				return p;

			/* current css exhausted, move on to the next one */
			css_task_iter_end(&iter->css_iter);
			iter->css_pos = css_next_descendant_pre(iter->css_pos,
								&iter->cgrp->self);
			if (iter->css_pos)
				css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
		}
		return NULL;
	}
#endif
	__scx_task_iter_maybe_relock(iter);

	/* walk from the cursor, skipping cursors of other iterators */
	list_for_each_entry(pos, cursor, tasks_node) {
		if (&pos->tasks_node == &scx_tasks)
			return NULL;
		if (!(pos->flags & SCX_TASK_CURSOR)) {
			/* park the cursor right after the task being returned */
			list_move(cursor, &pos->tasks_node);
			return container_of(pos, struct task_struct, scx);
		}
	}

	/* can't happen, should always terminate at scx_tasks above */
	BUG();
}

/**
 * scx_task_iter_next_locked - Next non-idle task with its rq locked
 * @iter: iterator to walk
 *
 * Visit the next task which isn't on idle_sched_class and return it with its
 * rq lock held. The rq lock taken for the previously returned task, if any, is
 * released first. See scx_task_iter_start() for details.
 */
static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
{
	struct task_struct *p;

	__scx_task_iter_rq_unlock(iter);

	while ((p = scx_task_iter_next(iter))) {
		/*
		 * scx_task_iter is used to prepare and move tasks into SCX
		 * while loading the BPF scheduler and vice-versa while
		 * unloading. The init_tasks ("swappers") should be excluded
		 * from the iteration because:
		 *
		 * - It's unsafe to use __setscheduler_class() on an init_task
		 *   to determine the sched_class to use as it won't preserve
		 *   its idle_sched_class.
		 *
		 * - ops.init/exit_task() can easily be confused if called with
		 *   init_tasks as they, e.g., share PID 0.
		 *
		 * As init_tasks are never scheduled through SCX, they can be
		 * skipped safely. Note that is_idle_task() which tests %PF_IDLE
		 * doesn't work here:
		 *
		 * - %PF_IDLE may not be set for an init_task whose CPU hasn't
		 *   yet been onlined.
		 *
		 * - %PF_IDLE can be set on tasks that are not init_tasks. See
		 *   play_idle_precise() used by CONFIG_IDLE_INJECT.
		 *
		 * Test for idle_sched_class as only init_tasks are on it.
		 */
		if (p->sched_class != &idle_sched_class)
			break;
	}
	if (!p)
		return NULL;

	/* remembered so that the next call or unlock can release it */
	iter->rq = task_rq_lock(p, &iter->rf);
	iter->locked_task = p;

	return p;
}

/**
 * scx_add_event - Increase an event counter for 'name' by 'cnt'
 * @sch: scx_sched to account events for
 * @name: an event name defined in struct scx_event_stats
 * @cnt: the number of the event occurred
 *
 * This can be used when preemption is not disabled.
 */
#define scx_add_event(sch, name, cnt) do {					\
	this_cpu_add((sch)->pcpu->event_stats.name, (cnt));			\
	trace_sched_ext_event(#name, (cnt));					\
} while(0)

/**
 * __scx_add_event - Increase an event counter for 'name' by 'cnt'
 * @sch: scx_sched to account events for
 * @name: an event name defined in struct scx_event_stats
 * @cnt: the number of the event occurred
 *
 * This should be used only when preemption is disabled.
1030 */ 1031 #define __scx_add_event(sch, name, cnt) do { \ 1032 __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ 1033 trace_sched_ext_event(#name, cnt); \ 1034 } while(0) 1035 1036 /** 1037 * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e' 1038 * @dst_e: destination event stats 1039 * @src_e: source event stats 1040 * @kind: a kind of event to be aggregated 1041 */ 1042 #define scx_agg_event(dst_e, src_e, kind) do { \ 1043 (dst_e)->kind += READ_ONCE((src_e)->kind); \ 1044 } while(0) 1045 1046 /** 1047 * scx_dump_event - Dump an event 'kind' in 'events' to 's' 1048 * @s: output seq_buf 1049 * @events: event stats 1050 * @kind: a kind of event to dump 1051 */ 1052 #define scx_dump_event(s, events, kind) do { \ 1053 dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \ 1054 } while (0) 1055 1056 1057 static void scx_read_events(struct scx_sched *sch, 1058 struct scx_event_stats *events); 1059 1060 static enum scx_enable_state scx_enable_state(void) 1061 { 1062 return atomic_read(&scx_enable_state_var); 1063 } 1064 1065 static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to) 1066 { 1067 return atomic_xchg(&scx_enable_state_var, to); 1068 } 1069 1070 static bool scx_tryset_enable_state(enum scx_enable_state to, 1071 enum scx_enable_state from) 1072 { 1073 int from_v = from; 1074 1075 return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to); 1076 } 1077 1078 /** 1079 * wait_ops_state - Busy-wait the specified ops state to end 1080 * @p: target task 1081 * @opss: state to wait the end of 1082 * 1083 * Busy-wait for @p to transition out of @opss. This can only be used when the 1084 * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also 1085 * has load_acquire semantics to ensure that the caller can see the updates made 1086 * in the enqueueing and dispatching paths. 
1087 */ 1088 static void wait_ops_state(struct task_struct *p, unsigned long opss) 1089 { 1090 do { 1091 cpu_relax(); 1092 } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); 1093 } 1094 1095 static inline bool __cpu_valid(s32 cpu) 1096 { 1097 return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); 1098 } 1099 1100 /** 1101 * ops_cpu_valid - Verify a cpu number, to be used on ops input args 1102 * @sch: scx_sched to abort on error 1103 * @cpu: cpu number which came from a BPF ops 1104 * @where: extra information reported on error 1105 * 1106 * @cpu is a cpu number which came from the BPF scheduler and can be any value. 1107 * Verify that it is in range and one of the possible cpus. If invalid, trigger 1108 * an ops error. 1109 */ 1110 static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) 1111 { 1112 if (__cpu_valid(cpu)) { 1113 return true; 1114 } else { 1115 scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); 1116 return false; 1117 } 1118 } 1119 1120 /** 1121 * ops_sanitize_err - Sanitize a -errno value 1122 * @sch: scx_sched to error out on error 1123 * @ops_name: operation to blame on failure 1124 * @err: -errno value to sanitize 1125 * 1126 * Verify @err is a valid -errno. If not, trigger scx_error() and return 1127 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can 1128 * cause misbehaviors. For an example, a large negative return from 1129 * ops.init_task() triggers an oops when passed up the call chain because the 1130 * value fails IS_ERR() test after being encoded with ERR_PTR() and then is 1131 * handled as a pointer. 
*/
static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err)
{
	/* only [-MAX_ERRNO, -1] passes through; note that 0 is rejected too */
	if (err < 0 && err >= -MAX_ERRNO)
		return err;

	scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err);
	return -EPROTO;
}

/* Balance callback flavor of the deferred work: just runs run_deferred(). */
static void deferred_bal_cb_workfn(struct rq *rq)
{
	run_deferred(rq);
}

/*
 * irq_work flavor of the deferred work. Unlike deferred_bal_cb_workfn(), the
 * rq lock is taken here around run_deferred().
 */
static void deferred_irq_workfn(struct irq_work *irq_work)
{
	struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work);

	raw_spin_rq_lock(rq);
	run_deferred(rq);
	raw_spin_rq_unlock(rq);
}

/**
 * schedule_deferred - Schedule execution of deferred actions on an rq
 * @rq: target rq
 *
 * Schedule execution of deferred actions on @rq. Deferred actions are executed
 * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks
 * to other rqs.
 */
static void schedule_deferred(struct rq *rq)
{
	/*
	 * Queue an irq work. They are executed on IRQ re-enable which may take
	 * a bit longer than the scheduler hook in schedule_deferred_locked().
	 */
	irq_work_queue(&rq->scx.deferred_irq_work);
}

/**
 * schedule_deferred_locked - Schedule execution of deferred actions on an rq
 * @rq: target rq
 *
 * Schedule execution of deferred actions on @rq. Equivalent to
 * schedule_deferred() but requires @rq to be locked and can be more efficient.
 */
static void schedule_deferred_locked(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	/*
	 * If in the middle of waking up a task, task_woken_scx() will be called
	 * afterwards which will then run the deferred actions, no need to
	 * schedule anything.
	 */
	if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
		return;

	/* Don't do anything if there already is a deferred operation. */
	if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
		return;

	/*
	 * If in balance, the balance callbacks will be called before rq lock is
	 * released. Schedule one.
	 *
	 * We can't directly insert the callback into the rq's list: The call
	 * can drop its lock and make the pending balance callback visible to
	 * unrelated code paths that call rq_pin_lock().
	 *
	 * Just let balance_one() know that it must do it itself.
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
		rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
		return;
	}

	/*
	 * No scheduler hooks available. Use the generic irq_work path. The
	 * above WAKEUP and BALANCE paths should cover most of the cases and the
	 * time to IRQ re-enable shouldn't be long.
	 */
	schedule_deferred(rq);
}

/*
 * Request a deferred re-enqueue of the tasks on @dsq on behalf of @sch.
 *
 * @reenq_flags are OR'd into the flags of the per-cpu deferred request, which
 * is linked onto the owning rq's pending list if not already there, and the
 * rq's deferred work is then kicked through schedule_deferred(). Only local
 * DSQs and non-builtin (user) DSQs are accepted; any other DSQ triggers
 * scx_error(). Nothing is done while @sch->bypass_depth is non-zero.
 */
static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq,
			       u64 reenq_flags)
{
	/*
	 * Allowing reenqueues doesn't make sense while bypassing. This also
	 * prevents new reenqueues from being scheduled on dead scheds.
	 */
	if (unlikely(READ_ONCE(sch->bypass_depth)))
		return;

	if (dsq->id == SCX_DSQ_LOCAL) {
		/* a local DSQ is embedded in its rq; the request lives on that rq */
		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
		struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq));
		struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local;

		/*
		 * Pairs with smp_mb() in process_deferred_reenq_locals() and
		 * guarantees that there is a reenq_local() afterwards.
		 */
		smp_mb();

		/*
		 * Lockless fast path: skip the lock if the request is already
		 * pending with all of @reenq_flags set.
		 */
		if (list_empty(&drl->node) ||
		    (READ_ONCE(drl->flags) & reenq_flags) != reenq_flags) {

			/* held until the end of this block */
			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);

			/* re-test under the lock before linking */
			if (list_empty(&drl->node))
				list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals);
			WRITE_ONCE(drl->flags, drl->flags | reenq_flags);
		}

		schedule_deferred(rq);
	} else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) {
		/* user DSQ: the request is queued on the current CPU's rq */
		struct rq *rq = this_rq();
		struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq));
		struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user;

		/*
		 * Pairs with smp_mb() in process_deferred_reenq_users() and
		 * guarantees that there is a reenq_user() afterwards.
		 */
		smp_mb();

		/* same lockless fast path as the local DSQ case above */
		if (list_empty(&dru->node) ||
		    (READ_ONCE(dru->flags) & reenq_flags) != reenq_flags) {

			/* held until the end of this block */
			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);

			if (list_empty(&dru->node))
				list_move_tail(&dru->node, &rq->scx.deferred_reenq_users);
			WRITE_ONCE(dru->flags, dru->flags | reenq_flags);
		}

		schedule_deferred(rq);
	} else {
		scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
	}
}

/*
 * Request a deferred re-enqueue of @rq's local DSQ on behalf of the root
 * scheduler. Warns once and does nothing if scx_root is %NULL.
 */
static void schedule_reenq_local(struct rq *rq, u64 reenq_flags)
{
	struct scx_sched *root = rcu_dereference_sched(scx_root);

	if (WARN_ON_ONCE(!root))
		return;

	schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags);
}

/**
 * touch_core_sched - Update timestamp used for core-sched task ordering
 * @rq: rq to read clock from, must be locked
 * @p: task to update the timestamp for
 *
 * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
 * implement global or local-DSQ FIFO ordering for core-sched. Should be called
 * when a task becomes runnable and its turn on the CPU ends (e.g. slice
 * exhaustion).
1298 */ 1299 static void touch_core_sched(struct rq *rq, struct task_struct *p) 1300 { 1301 lockdep_assert_rq_held(rq); 1302 1303 #ifdef CONFIG_SCHED_CORE 1304 /* 1305 * It's okay to update the timestamp spuriously. Use 1306 * sched_core_disabled() which is cheaper than enabled(). 1307 * 1308 * As this is used to determine ordering between tasks of sibling CPUs, 1309 * it may be better to use per-core dispatch sequence instead. 1310 */ 1311 if (!sched_core_disabled()) 1312 p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq)); 1313 #endif 1314 } 1315 1316 /** 1317 * touch_core_sched_dispatch - Update core-sched timestamp on dispatch 1318 * @rq: rq to read clock from, must be locked 1319 * @p: task being dispatched 1320 * 1321 * If the BPF scheduler implements custom core-sched ordering via 1322 * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO 1323 * ordering within each local DSQ. This function is called from dispatch paths 1324 * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. 
1325 */ 1326 static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) 1327 { 1328 lockdep_assert_rq_held(rq); 1329 1330 #ifdef CONFIG_SCHED_CORE 1331 if (unlikely(SCX_HAS_OP(scx_root, core_sched_before))) 1332 touch_core_sched(rq, p); 1333 #endif 1334 } 1335 1336 static void update_curr_scx(struct rq *rq) 1337 { 1338 struct task_struct *curr = rq->curr; 1339 s64 delta_exec; 1340 1341 delta_exec = update_curr_common(rq); 1342 if (unlikely(delta_exec <= 0)) 1343 return; 1344 1345 if (curr->scx.slice != SCX_SLICE_INF) { 1346 curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec); 1347 if (!curr->scx.slice) 1348 touch_core_sched(rq, curr); 1349 } 1350 1351 dl_server_update(&rq->ext_server, delta_exec); 1352 } 1353 1354 static bool scx_dsq_priq_less(struct rb_node *node_a, 1355 const struct rb_node *node_b) 1356 { 1357 const struct task_struct *a = 1358 container_of(node_a, struct task_struct, scx.dsq_priq); 1359 const struct task_struct *b = 1360 container_of(node_b, struct task_struct, scx.dsq_priq); 1361 1362 return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); 1363 } 1364 1365 static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) 1366 { 1367 /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ 1368 WRITE_ONCE(dsq->nr, dsq->nr + 1); 1369 1370 /* 1371 * Once @p reaches a local DSQ, it can only leave it by being dispatched 1372 * to the CPU or dequeued. In both cases, the only way @p can go back to 1373 * the BPF sched is through enqueueing. If being inserted into a local 1374 * DSQ with IMMED, persist the state until the next enqueueing event in 1375 * do_enqueue_task() so that we can maintain IMMED protection through 1376 * e.g. SAVE/RESTORE cycles and slice extensions. 
1377 */ 1378 if (enq_flags & SCX_ENQ_IMMED) { 1379 if (unlikely(dsq->id != SCX_DSQ_LOCAL)) { 1380 WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK)); 1381 return; 1382 } 1383 p->scx.flags |= SCX_TASK_IMMED; 1384 } 1385 1386 if (p->scx.flags & SCX_TASK_IMMED) { 1387 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 1388 1389 if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) 1390 return; 1391 1392 rq->scx.nr_immed++; 1393 1394 /* 1395 * If @rq already had other tasks or the current task is not 1396 * done yet, @p can't go on the CPU immediately. Re-enqueue. 1397 */ 1398 if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags))) 1399 schedule_reenq_local(rq, 0); 1400 } 1401 } 1402 1403 static void dsq_dec_nr(struct scx_dispatch_q *dsq, struct task_struct *p) 1404 { 1405 /* see dsq_inc_nr() */ 1406 WRITE_ONCE(dsq->nr, dsq->nr - 1); 1407 1408 if (p->scx.flags & SCX_TASK_IMMED) { 1409 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 1410 1411 if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) || 1412 WARN_ON_ONCE(rq->scx.nr_immed <= 0)) 1413 return; 1414 1415 rq->scx.nr_immed--; 1416 } 1417 } 1418 1419 static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) 1420 { 1421 p->scx.slice = READ_ONCE(sch->slice_dfl); 1422 __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); 1423 } 1424 1425 /* 1426 * Return true if @p is moving due to an internal SCX migration, false 1427 * otherwise. 1428 */ 1429 static inline bool task_scx_migrating(struct task_struct *p) 1430 { 1431 /* 1432 * We only need to check sticky_cpu: it is set to the destination 1433 * CPU in move_remote_task_to_local_dsq() before deactivate_task() 1434 * and cleared when the task is enqueued on the destination, so it 1435 * is only non-negative during an internal SCX migration. 1436 */ 1437 return p->scx.sticky_cpu >= 0; 1438 } 1439 1440 /* 1441 * Call ops.dequeue() if the task is in BPF custody and not migrating. 1442 * Clears %SCX_TASK_IN_CUSTODY when the callback is invoked. 
*/
static void call_task_dequeue(struct scx_sched *sch, struct rq *rq,
			      struct task_struct *p, u64 deq_flags)
{
	/* nothing to do if @p isn't in custody or is mid internal migration */
	if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p))
		return;

	if (SCX_HAS_OP(sch, dequeue))
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, p, deq_flags);

	/* custody ends here whether or not @sch implements ops.dequeue() */
	p->scx.flags &= ~SCX_TASK_IN_CUSTODY;
}

/*
 * Finish up after @p was linked onto the local DSQ @dsq: end BPF custody and,
 * unless the rq is in balance, preempt and/or resched the rq's current task
 * as needed.
 */
static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p,
			       u64 enq_flags)
{
	struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
	bool preempt = false;

	/* note that this is issued against scx_root */
	call_task_dequeue(scx_root, rq, p, 0);

	/*
	 * If @rq is in balance, the CPU is already vacant and looking for the
	 * next task to run. No need to preempt or trigger resched after moving
	 * @p into its local DSQ.
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE)
		return;

	/*
	 * %SCX_ENQ_PREEMPT only acts on a current task which is another SCX
	 * task: zero its slice so that it gives up the CPU.
	 */
	if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
	    rq->curr->sched_class == &ext_sched_class) {
		rq->curr->scx.slice = 0;
		preempt = true;
	}

	/* also resched if the current task's class ranks below ext */
	if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class))
		resched_curr(rq);
}

/*
 * Insert @p into @dsq.
 *
 * For a non-local @dsq, @dsq->lock is taken for the duration; a local DSQ is
 * manipulated without taking a DSQ lock. %SCX_ENQ_DSQ_PRIQ selects vtime
 * ordered insertion, otherwise @p goes to the head (%SCX_ENQ_HEAD or
 * %SCX_ENQ_PREEMPT) or the tail of the FIFO. If @dsq turns out to have been
 * destroyed, @p is redirected to the global DSQ. With %SCX_ENQ_CLEAR_OPSS,
 * @p's ops_state is reset to %SCX_OPSS_NONE at the very end.
 */
static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
			     struct scx_dispatch_q *dsq, struct task_struct *p,
			     u64 enq_flags)
{
	bool is_local = dsq->id == SCX_DSQ_LOCAL;

	/* @p must not be linked on any DSQ at this point */
	WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
	WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) ||
		     !RB_EMPTY_NODE(&p->scx.dsq_priq));

	if (!is_local) {
		raw_spin_lock_nested(&dsq->lock,
			(enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0);

		if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
			scx_error(sch, "attempting to dispatch to a destroyed dsq");
			/* fall back to the global dsq */
			raw_spin_unlock(&dsq->lock);
			dsq = find_global_dsq(sch, task_cpu(p));
			raw_spin_lock(&dsq->lock);
		}
	}

	if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
		     (enq_flags & SCX_ENQ_DSQ_PRIQ))) {
		/*
		 * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from
		 * their FIFO queues. To avoid confusion and accidentally
		 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we
		 * disallow any internal DSQ from doing vtime ordering of
		 * tasks.
		 */
		scx_error(sch, "cannot use vtime ordering for built-in DSQs");
		/* carry on as a FIFO enqueue */
		enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
	}

	if (enq_flags & SCX_ENQ_DSQ_PRIQ) {
		struct rb_node *rbp;

		/*
		 * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are
		 * linked to both the rbtree and list on PRIQs, this can only be
		 * tested easily when adding the first task.
		 */
		if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
			     nldsq_next_task(dsq, NULL, false)))
			scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks",
				  dsq->id);

		p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
		rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);

		/*
		 * Find the previous task and insert after it on the list so
		 * that @dsq->list is vtime ordered.
		 */
		rbp = rb_prev(&p->scx.dsq_priq);
		if (rbp) {
			struct task_struct *prev =
				container_of(rbp, struct task_struct,
					     scx.dsq_priq);
			list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
			/* first task unchanged - no update needed */
		} else {
			list_add(&p->scx.dsq_list.node, &dsq->list);
			/* not builtin and new task is at head - use fastpath */
			rcu_assign_pointer(dsq->first_task, p);
		}
	} else {
		/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
		if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
			scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
				  dsq->id);

		if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) {
			list_add(&p->scx.dsq_list.node, &dsq->list);
			/* new task inserted at head - use fastpath */
			if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
				rcu_assign_pointer(dsq->first_task, p);
		} else {
			bool was_empty;

			/* a tail insert only changes first_task on an empty list */
			was_empty = list_empty(&dsq->list);
			list_add_tail(&p->scx.dsq_list.node, &dsq->list);
			if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
				rcu_assign_pointer(dsq->first_task, p);
		}
	}

	/* seq records the order tasks are queued, used by BPF DSQ iterator */
	WRITE_ONCE(dsq->seq, dsq->seq + 1);
	p->scx.dsq_seq = dsq->seq;

	dsq_inc_nr(dsq, p, enq_flags);
	p->scx.dsq = dsq;

	/*
	 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
	 * direct dispatch path, but we clear them here because the direct
	 * dispatch verdict may be overridden on the enqueue path during e.g.
	 * bypass.
	 */
	p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
	p->scx.ddsp_enq_flags = 0;

	/*
	 * Update custody and call ops.dequeue() before clearing ops_state:
	 * once ops_state is cleared, waiters in ops_dequeue() can proceed
	 * and dequeue_task_scx() will RMW p->scx.flags. If we clear
	 * ops_state first, both sides would modify p->scx.flags
	 * concurrently in a non-atomic way.
	 */
	if (is_local) {
		local_dsq_post_enq(dsq, p, enq_flags);
	} else {
		/*
		 * Task on global/bypass DSQ: leave custody, task on
		 * non-terminal DSQ: enter custody.
		 */
		if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS)
			call_task_dequeue(sch, rq, p, 0);
		else
			p->scx.flags |= SCX_TASK_IN_CUSTODY;

		raw_spin_unlock(&dsq->lock);
	}

	/*
	 * We're transitioning out of QUEUEING or DISPATCHING. store_release to
	 * match waiters' load_acquire.
	 */
	if (enq_flags & SCX_ENQ_CLEAR_OPSS)
		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
}

/*
 * Unlink @p from @dsq's list (and rbtree if it was PRIQ-enqueued), update the
 * DSQ's accounting and, for non-builtin DSQs, refresh the first_task pointer
 * if @p was the first task. Callers in this file invoke this with @dsq->lock
 * held for non-local DSQs.
 */
static void task_unlink_from_dsq(struct task_struct *p,
				 struct scx_dispatch_q *dsq)
{
	WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));

	if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) {
		rb_erase(&p->scx.dsq_priq, &dsq->priq);
		RB_CLEAR_NODE(&p->scx.dsq_priq);
		p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ;
	}

	list_del_init(&p->scx.dsq_list.node);
	dsq_dec_nr(dsq, p);

	/* @p was the first task, publish its successor (may be NULL) */
	if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
		struct task_struct *first_task;

		first_task = nldsq_next_task(dsq, NULL, false);
		rcu_assign_pointer(dsq->first_task, first_task);
	}
}

/*
 * Remove @p from whichever DSQ it is on. Requires @rq to be locked. Also
 * cancels a pending deferred local dispatch and resolves the race against
 * dispatch_to_local_dsq() through @p->scx.holding_cpu.
 */
static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
{
	struct scx_dispatch_q *dsq = p->scx.dsq;
	bool is_local = dsq == &rq->scx.local_dsq;

	lockdep_assert_rq_held(rq);

	if (!dsq) {
		/*
		 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
		 * Unlinking is all that's needed to cancel.
		 */
		if (unlikely(!list_empty(&p->scx.dsq_list.node)))
			list_del_init(&p->scx.dsq_list.node);

		/*
		 * When dispatching directly from the BPF scheduler to a local
		 * DSQ, the task isn't associated with any DSQ but
		 * @p->scx.holding_cpu may be set under the protection of
		 * %SCX_OPSS_DISPATCHING.
		 */
		if (p->scx.holding_cpu >= 0)
			p->scx.holding_cpu = -1;

		return;
	}

	/* no DSQ lock is taken for @rq's own local DSQ */
	if (!is_local)
		raw_spin_lock(&dsq->lock);

	/*
	 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't
	 * change underneath us.
	 */
	if (p->scx.holding_cpu < 0) {
		/* @p must still be on @dsq, dequeue */
		task_unlink_from_dsq(p, dsq);
	} else {
		/*
		 * We're racing against dispatch_to_local_dsq() which already
		 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
		 * holding_cpu which tells dispatch_to_local_dsq() that it lost
		 * the race.
		 */
		WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
		p->scx.holding_cpu = -1;
	}
	p->scx.dsq = NULL;

	if (!is_local)
		raw_spin_unlock(&dsq->lock);
}

/*
 * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq
 * and dsq are locked.
1695 */ 1696 static void dispatch_dequeue_locked(struct task_struct *p, 1697 struct scx_dispatch_q *dsq) 1698 { 1699 lockdep_assert_rq_held(task_rq(p)); 1700 lockdep_assert_held(&dsq->lock); 1701 1702 task_unlink_from_dsq(p, dsq); 1703 p->scx.dsq = NULL; 1704 } 1705 1706 static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, 1707 struct rq *rq, u64 dsq_id, 1708 s32 tcpu) 1709 { 1710 struct scx_dispatch_q *dsq; 1711 1712 if (dsq_id == SCX_DSQ_LOCAL) 1713 return &rq->scx.local_dsq; 1714 1715 if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 1716 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 1717 1718 if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) 1719 return find_global_dsq(sch, tcpu); 1720 1721 return &cpu_rq(cpu)->scx.local_dsq; 1722 } 1723 1724 if (dsq_id == SCX_DSQ_GLOBAL) 1725 dsq = find_global_dsq(sch, tcpu); 1726 else 1727 dsq = find_user_dsq(sch, dsq_id); 1728 1729 if (unlikely(!dsq)) { 1730 scx_error(sch, "non-existent DSQ 0x%llx", dsq_id); 1731 return find_global_dsq(sch, tcpu); 1732 } 1733 1734 return dsq; 1735 } 1736 1737 static void mark_direct_dispatch(struct scx_sched *sch, 1738 struct task_struct *ddsp_task, 1739 struct task_struct *p, u64 dsq_id, 1740 u64 enq_flags) 1741 { 1742 /* 1743 * Mark that dispatch already happened from ops.select_cpu() or 1744 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value 1745 * which can never match a valid task pointer. 
1746 */ 1747 __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); 1748 1749 /* @p must match the task on the enqueue path */ 1750 if (unlikely(p != ddsp_task)) { 1751 if (IS_ERR(ddsp_task)) 1752 scx_error(sch, "%s[%d] already direct-dispatched", 1753 p->comm, p->pid); 1754 else 1755 scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]", 1756 ddsp_task->comm, ddsp_task->pid, 1757 p->comm, p->pid); 1758 return; 1759 } 1760 1761 WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); 1762 WARN_ON_ONCE(p->scx.ddsp_enq_flags); 1763 1764 p->scx.ddsp_dsq_id = dsq_id; 1765 p->scx.ddsp_enq_flags = enq_flags; 1766 } 1767 1768 static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, 1769 u64 enq_flags) 1770 { 1771 struct rq *rq = task_rq(p); 1772 struct scx_dispatch_q *dsq = 1773 find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p)); 1774 1775 touch_core_sched_dispatch(rq, p); 1776 1777 p->scx.ddsp_enq_flags |= enq_flags; 1778 1779 /* 1780 * We are in the enqueue path with @rq locked and pinned, and thus can't 1781 * double lock a remote rq and enqueue to its local DSQ. For 1782 * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer 1783 * the enqueue so that it's executed when @rq can be unlocked. 1784 */ 1785 if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { 1786 unsigned long opss; 1787 1788 opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; 1789 1790 switch (opss & SCX_OPSS_STATE_MASK) { 1791 case SCX_OPSS_NONE: 1792 break; 1793 case SCX_OPSS_QUEUEING: 1794 /* 1795 * As @p was never passed to the BPF side, _release is 1796 * not strictly necessary. Still do it for consistency. 
1797 */ 1798 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1799 break; 1800 default: 1801 WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()", 1802 p->comm, p->pid, opss); 1803 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1804 break; 1805 } 1806 1807 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); 1808 list_add_tail(&p->scx.dsq_list.node, 1809 &rq->scx.ddsp_deferred_locals); 1810 schedule_deferred_locked(rq); 1811 return; 1812 } 1813 1814 dispatch_enqueue(sch, rq, dsq, p, 1815 p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); 1816 } 1817 1818 static bool scx_rq_online(struct rq *rq) 1819 { 1820 /* 1821 * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates 1822 * the online state as seen from the BPF scheduler. cpu_active() test 1823 * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will 1824 * stay set until the current scheduling operation is complete even if 1825 * we aren't locking @rq. 1826 */ 1827 return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq))); 1828 } 1829 1830 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, 1831 int sticky_cpu) 1832 { 1833 struct scx_sched *sch = scx_task_sched(p); 1834 struct task_struct **ddsp_taskp; 1835 struct scx_dispatch_q *dsq; 1836 unsigned long qseq; 1837 1838 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); 1839 1840 /* internal movements - rq migration / RESTORE */ 1841 if (sticky_cpu == cpu_of(rq)) 1842 goto local_norefill; 1843 1844 /* 1845 * Clear persistent TASK_IMMED for fresh enqueues, see dsq_inc_nr(). 1846 * Note that exiting and migration-disabled tasks that skip 1847 * ops.enqueue() below will lose IMMED protection unless 1848 * %SCX_OPS_ENQ_EXITING / %SCX_OPS_ENQ_MIGRATION_DISABLED are set. 
1849 */ 1850 p->scx.flags &= ~SCX_TASK_IMMED; 1851 1852 /* 1853 * If !scx_rq_online(), we already told the BPF scheduler that the CPU 1854 * is offline and are just running the hotplug path. Don't bother the 1855 * BPF scheduler. 1856 */ 1857 if (!scx_rq_online(rq)) 1858 goto local; 1859 1860 if (scx_bypassing(sch, cpu_of(rq))) { 1861 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 1862 goto bypass; 1863 } 1864 1865 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 1866 goto direct; 1867 1868 /* see %SCX_OPS_ENQ_EXITING */ 1869 if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) && 1870 unlikely(p->flags & PF_EXITING)) { 1871 __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1); 1872 goto local; 1873 } 1874 1875 /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ 1876 if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) && 1877 is_migration_disabled(p)) { 1878 __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); 1879 goto local; 1880 } 1881 1882 if (unlikely(!SCX_HAS_OP(sch, enqueue))) 1883 goto global; 1884 1885 /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ 1886 qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; 1887 1888 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 1889 atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); 1890 1891 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 1892 WARN_ON_ONCE(*ddsp_taskp); 1893 *ddsp_taskp = p; 1894 1895 SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags); 1896 1897 *ddsp_taskp = NULL; 1898 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 1899 goto direct; 1900 1901 /* 1902 * Task is now in BPF scheduler's custody. Set %SCX_TASK_IN_CUSTODY 1903 * so ops.dequeue() is called when it leaves custody. 1904 */ 1905 p->scx.flags |= SCX_TASK_IN_CUSTODY; 1906 1907 /* 1908 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or 1909 * dequeue may be waiting. The store_release matches their load_acquire. 
1910 */ 1911 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); 1912 return; 1913 1914 direct: 1915 direct_dispatch(sch, p, enq_flags); 1916 return; 1917 local_norefill: 1918 dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags); 1919 return; 1920 local: 1921 dsq = &rq->scx.local_dsq; 1922 goto enqueue; 1923 global: 1924 dsq = find_global_dsq(sch, task_cpu(p)); 1925 goto enqueue; 1926 bypass: 1927 dsq = bypass_enq_target_dsq(sch, task_cpu(p)); 1928 goto enqueue; 1929 1930 enqueue: 1931 /* 1932 * For task-ordering, slice refill must be treated as implying the end 1933 * of the current slice. Otherwise, the longer @p stays on the CPU, the 1934 * higher priority it becomes from scx_prio_less()'s POV. 1935 */ 1936 touch_core_sched(rq, p); 1937 refill_task_slice_dfl(sch, p); 1938 dispatch_enqueue(sch, rq, dsq, p, enq_flags); 1939 } 1940 1941 static bool task_runnable(const struct task_struct *p) 1942 { 1943 return !list_empty(&p->scx.runnable_node); 1944 } 1945 1946 static void set_task_runnable(struct rq *rq, struct task_struct *p) 1947 { 1948 lockdep_assert_rq_held(rq); 1949 1950 if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { 1951 p->scx.runnable_at = jiffies; 1952 p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; 1953 } 1954 1955 /* 1956 * list_add_tail() must be used. scx_bypass() depends on tasks being 1957 * appended to the runnable_list. 
*/
	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
}

/*
 * Unlink @p's runnable_node. If @reset_runnable_at is set, flag @p with
 * %SCX_TASK_RESET_RUNNABLE_AT so that the next set_task_runnable() restamps
 * @p->scx.runnable_at.
 */
static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
{
	list_del_init(&p->scx.runnable_node);
	if (reset_runnable_at)
		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
}

/*
 * Enqueue @p on @rq. @core_enq_flags is combined with
 * @rq->scx.extra_enq_flags to form the 64bit enq_flags used below.
 *
 * If @p is already %SCX_TASK_QUEUED, only the common exit path at the "out"
 * label runs. Otherwise, @p is made runnable, the nr_running counters are
 * bumped, ops.runnable() is invoked if implemented and @p isn't migrating,
 * and @p is handed to do_enqueue_task().
 */
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags)
{
	struct scx_sched *sch = scx_task_sched(p);
	int sticky_cpu = p->scx.sticky_cpu;
	u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags;

	/* cleared again at the "out" label below */
	if (enq_flags & ENQUEUE_WAKEUP)
		rq->scx.flags |= SCX_RQ_IN_WAKEUP;

	/*
	 * Restoring a running task will be immediately followed by
	 * set_next_task_scx() which expects the task to not be on the BPF
	 * scheduler as tasks can only start running through local DSQs. Force
	 * direct-dispatch into the local DSQ by setting the sticky_cpu.
	 */
	if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
		sticky_cpu = cpu_of(rq);

	if (p->scx.flags & SCX_TASK_QUEUED) {
		WARN_ON_ONCE(!task_runnable(p));
		goto out;
	}

	set_task_runnable(rq, p);
	p->scx.flags |= SCX_TASK_QUEUED;
	rq->scx.nr_running++;
	add_nr_running(rq, 1);

	if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);

	if (enq_flags & SCX_ENQ_WAKEUP)
		touch_core_sched(rq, p);

	/* Start dl_server if this is the first task being enqueued */
	if (rq->scx.nr_running == 1)
		dl_server_start(&rq->ext_server);

	do_enqueue_task(rq, p, enq_flags, sticky_cpu);

	/* the sticky CPU has been consumed by the enqueue above, reset it */
	if (sticky_cpu >= 0)
		p->scx.sticky_cpu = -1;
out:
	rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;

	/* count enqueues landing on a CPU other than the recorded selected_cpu */
	if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
	    unlikely(cpu_of(rq) != p->scx.selected_cpu))
		__scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1);
}

static void
ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
{
	struct scx_sched *sch = scx_task_sched(p);
	unsigned long opss;

	/* dequeue is always temporary, don't reset runnable_at */
	clr_task_runnable(p, false);

	/* acquire ensures that we see the preceding updates on QUEUED */
	opss = atomic_long_read_acquire(&p->scx.ops_state);

	switch (opss & SCX_OPSS_STATE_MASK) {
	case SCX_OPSS_NONE:
		break;
	case SCX_OPSS_QUEUEING:
		/*
		 * QUEUEING is started and finished while holding @p's rq lock.
		 * As we're holding the rq lock now, we shouldn't see QUEUEING.
		 */
		BUG();
	case SCX_OPSS_QUEUED:
		/* A queued task must always be in BPF scheduler's custody */
		WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_IN_CUSTODY));
		/*
		 * Try to take @p back by moving QUEUED -> NONE. On failure,
		 * ops_state changed under us and @opss is updated to the
		 * observed value; fall through and wait for DISPATCHING to end.
		 */
		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
					    SCX_OPSS_NONE))
			break;
		fallthrough;
	case SCX_OPSS_DISPATCHING:
		/*
		 * If @p is being dispatched from the BPF scheduler to a DSQ,
		 * wait for the transfer to complete so that @p doesn't get
		 * added to its DSQ after dequeueing is complete.
		 *
		 * As we're waiting on DISPATCHING with the rq locked, the
		 * dispatching side shouldn't try to lock the rq while
		 * DISPATCHING is set. See dispatch_to_local_dsq().
		 *
		 * DISPATCHING shouldn't have qseq set and control can reach
		 * here with NONE @opss from the above QUEUED case block.
		 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss.
		 */
		wait_ops_state(p, SCX_OPSS_DISPATCHING);
		BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
		break;
	}

	/*
	 * Call ops.dequeue() if the task is still in BPF custody.
*
	 * The code that clears ops_state to %SCX_OPSS_NONE does not always
	 * clear %SCX_TASK_IN_CUSTODY: in dispatch_to_local_dsq(), when
	 * we're moving a task that was in %SCX_OPSS_DISPATCHING to a
	 * remote CPU's local DSQ, we only set ops_state to %SCX_OPSS_NONE
	 * so that a concurrent dequeue can proceed, but we clear
	 * %SCX_TASK_IN_CUSTODY only when we later enqueue or move the
	 * task. So we can see NONE + IN_CUSTODY here and we must handle
	 * it. Similarly, after waiting on %SCX_OPSS_DISPATCHING we see
	 * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until
	 * it is enqueued on the destination.
	 */
	call_task_dequeue(sch, rq, p, deq_flags);
}

/*
 * Dequeue @p from @rq. @core_deq_flags is widened into the 64bit deq_flags
 * passed on to the BPF scheduler. Always returns %true; if @p isn't
 * %SCX_TASK_QUEUED, it returns early without doing anything else.
 */
static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int core_deq_flags)
{
	struct scx_sched *sch = scx_task_sched(p);
	u64 deq_flags = core_deq_flags;

	/*
	 * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property
	 * change (not sleep or core-sched pick).
	 */
	if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC)))
		deq_flags |= SCX_DEQ_SCHED_CHANGE;

	if (!(p->scx.flags & SCX_TASK_QUEUED)) {
		WARN_ON_ONCE(task_runnable(p));
		return true;
	}

	ops_dequeue(rq, p, deq_flags);

	/*
	 * A currently running task which is going off @rq first gets dequeued
	 * and then stops running. As we want running <-> stopping transitions
	 * to be contained within runnable <-> quiescent transitions, trigger
	 * ->stopping() early here instead of in put_prev_task_scx().
	 *
	 * @p may go through multiple stopping <-> running transitions between
	 * here and put_prev_task_scx() if task attribute changes occur while
	 * balance_one() leaves @rq unlocked. However, they don't contain any
	 * information meaningful to the BPF scheduler and can be suppressed by
	 * skipping the callbacks if the task is !QUEUED.
*/
	if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) {
		update_curr_scx(rq);
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false);
	}

	if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);

	/* remember whether this dequeue was for sleep */
	if (deq_flags & SCX_DEQ_SLEEP)
		p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
	else
		p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;

	p->scx.flags &= ~SCX_TASK_QUEUED;
	rq->scx.nr_running--;
	sub_nr_running(rq, 1);

	dispatch_dequeue(rq, p);
	return true;
}

/*
 * Yield on behalf of @rq->donor. If the scheduler implements ops.yield(), it
 * is invoked with a NULL target task and its return value is ignored.
 * Otherwise, the task's slice is zeroed.
 */
static void yield_task_scx(struct rq *rq)
{
	struct task_struct *p = rq->donor;
	struct scx_sched *sch = scx_task_sched(p);

	if (SCX_HAS_OP(sch, yield))
		SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
	else
		p->scx.slice = 0;
}

/*
 * Yield from @rq->donor to @to. Returns the result of ops.yield() if it is
 * implemented and @to is on the same scheduler as the yielding task, %false
 * otherwise.
 */
static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
{
	struct task_struct *from = rq->donor;
	struct scx_sched *sch = scx_task_sched(from);

	if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to))
		return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
					      from, to);
	else
		return false;
}

static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags)
{
	/*
	 * Preemption between SCX tasks is implemented by resetting the victim
	 * task's slice to 0 and triggering reschedule on the target CPU.
	 * Nothing to do.
	 */
	if (p->sched_class == &ext_sched_class)
		return;

	/*
	 * Getting preempted by a higher-priority class. Reenqueue IMMED tasks.
	 * This captures all preemption cases including:
	 *
	 * - A SCX task is currently running.
	 *
	 * - @rq is waking from idle due to a SCX task waking to it.
	 *
	 * - A higher-priority task wakes up while SCX dispatch is in progress.
*/
	if (rq->scx.nr_immed)
		schedule_reenq_local(rq, 0);
}

/*
 * Link @p onto @dst_rq's local DSQ. The callers in this file unlink @p from
 * @src_dsq before calling; @src_dsq is only used for the lock assertion here.
 *
 * %SCX_ENQ_HEAD or %SCX_ENQ_PREEMPT in @enq_flags inserts @p at the head of
 * the local DSQ; otherwise, @p is appended at the tail.
 */
static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
					 struct scx_dispatch_q *src_dsq,
					 struct rq *dst_rq)
{
	struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq;

	/* @src_dsq is locked and @p is on @dst_rq */
	lockdep_assert_held(&src_dsq->lock);
	lockdep_assert_rq_held(dst_rq);

	WARN_ON_ONCE(p->scx.holding_cpu >= 0);

	if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
		list_add(&p->scx.dsq_list.node, &dst_dsq->list);
	else
		list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list);

	dsq_inc_nr(dst_dsq, p, enq_flags);
	p->scx.dsq = dst_dsq;

	local_dsq_post_enq(dst_dsq, p, enq_flags);
}

/**
 * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
 * @p: task to move
 * @enq_flags: %SCX_ENQ_*
 * @src_rq: rq to move the task from, locked on entry, released on return
 * @dst_rq: rq to move the task into, locked on return
 *
 * Move @p which is currently on @src_rq to @dst_rq's local DSQ.
 */
static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
					  struct rq *src_rq, struct rq *dst_rq)
{
	lockdep_assert_rq_held(src_rq);

	/*
	 * Set sticky_cpu before deactivate_task() to properly mark the
	 * beginning of an SCX-internal migration.
	 */
	p->scx.sticky_cpu = cpu_of(dst_rq);
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, cpu_of(dst_rq));

	raw_spin_rq_unlock(src_rq);
	raw_spin_rq_lock(dst_rq);

	/*
	 * We want to pass scx-specific enq_flags but activate_task() will
	 * truncate the upper 32 bit. As we own @dst_rq, we can pass them
	 * through @dst_rq->scx.extra_enq_flags instead.
*/
	WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr));
	WARN_ON_ONCE(dst_rq->scx.extra_enq_flags);
	dst_rq->scx.extra_enq_flags = enq_flags;
	activate_task(dst_rq, p, 0);
	dst_rq->scx.extra_enq_flags = 0;
}

/*
 * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two
 * differences:
 *
 * - is_cpu_allowed() asks "Can this task run on this CPU?" while
 *   task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to
 *   this CPU?".
 *
 *   While migration is disabled, is_cpu_allowed() has to say "yes" as the task
 *   must be allowed to finish on the CPU that it's currently on regardless of
 *   the CPU state. However, task_can_run_on_remote_rq() must say "no" as the
 *   BPF scheduler shouldn't attempt to migrate a task which has migration
 *   disabled.
 *
 * - The BPF scheduler is bypassed while the rq is offline and we can always say
 *   no to the BPF scheduler initiated migrations while offline.
 *
 * The caller must ensure that @p and @rq are on different CPUs.
 *
 * When @enforce is %true, a migration-disabled @p or a @rq outside @p's allowed
 * CPUs triggers scx_error() on @sch, and an offline @rq is counted as
 * %SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE. When %false, the function only reports
 * the verdict. Returns %true iff @p can be moved to @rq.
 */
static bool task_can_run_on_remote_rq(struct scx_sched *sch,
				      struct task_struct *p, struct rq *rq,
				      bool enforce)
{
	s32 cpu = cpu_of(rq);

	WARN_ON_ONCE(task_cpu(p) == cpu);

	/*
	 * If @p has migration disabled, @p->cpus_ptr is updated to contain only
	 * the pinned CPU in migrate_disable_switch() while @p is being switched
	 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
	 * updated and thus another CPU may see @p on a DSQ in between leading to
	 * @p passing the below task_allowed_on_cpu() check while migration is
	 * disabled.
	 *
	 * Test the migration disabled state first as the race window is narrow
	 * and the BPF scheduler failing to check migration disabled state can
	 * easily be masked if task_allowed_on_cpu() is done first.
*/
	if (unlikely(is_migration_disabled(p))) {
		if (enforce)
			scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
				  p->comm, p->pid, task_cpu(p), cpu);
		return false;
	}

	/*
	 * We don't require the BPF scheduler to avoid dispatching to offline
	 * CPUs mostly for convenience but also because CPUs can go offline
	 * between scx_bpf_dsq_insert() calls and here. Trigger error iff the
	 * picked CPU is outside the allowed mask.
	 */
	if (!task_allowed_on_cpu(p, cpu)) {
		if (enforce)
			scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
				  cpu, p->comm, p->pid);
		return false;
	}

	/* an offline target isn't an error; it's only counted when @enforce */
	if (!scx_rq_online(rq)) {
		if (enforce)
			__scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
		return false;
	}

	return true;
}

/**
 * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
 * @p: target task
 * @dsq: locked DSQ @p is currently on
 * @src_rq: rq @p is currently on, stable with @dsq locked
 *
 * Called with @dsq locked but no rq's locked. We want to move @p to a different
 * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is
 * required when transferring into a local DSQ. Even when transferring into a
 * non-local DSQ, it's better to use the same mechanism to protect against
 * dequeues and maintain the invariant that @p->scx.dsq can only change while
 * @src_rq is locked, which e.g. scx_dump_task() depends on.
 *
 * We want to grab @src_rq but that can deadlock if we try while locking @dsq,
 * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As
 * this may race with dequeue, which can't drop the rq lock or fail, do a little
 * dancing from our side.
 *
 * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked.
If @p gets
 * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu
 * would be cleared to -1. While other cpus may have updated it to different
 * values afterwards, as this operation can't be preempted or recurse, the
 * holding_cpu can never become this CPU again before we're done. Thus, we can
 * tell whether we lost to dequeue by testing whether the holding_cpu still
 * points to this CPU. See dispatch_dequeue() for the counterpart.
 *
 * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is
 * still valid, %false if lost to dequeue.
 */
static bool unlink_dsq_and_lock_src_rq(struct task_struct *p,
				       struct scx_dispatch_q *dsq,
				       struct rq *src_rq)
{
	s32 cpu = raw_smp_processor_id();

	lockdep_assert_held(&dsq->lock);

	/* claim @p for this CPU while @dsq is still locked */
	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
	task_unlink_from_dsq(p, dsq);
	p->scx.holding_cpu = cpu;

	raw_spin_unlock(&dsq->lock);
	raw_spin_rq_lock(src_rq);

	/* task_rq couldn't have changed if we're still the holding cpu */
	return likely(p->scx.holding_cpu == cpu) &&
		!WARN_ON_ONCE(src_rq != task_rq(p));
}

/*
 * Move @p, which is on @dsq and on the remote @src_rq, to @this_rq's local
 * DSQ. @this_rq is unlocked first and is locked again on return, either via
 * move_remote_task_to_local_dsq() or explicitly on failure. @dsq is unlocked
 * by unlink_dsq_and_lock_src_rq().
 *
 * Returns %true if @p was moved, %false if @p was lost to dequeue.
 */
static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
				struct scx_dispatch_q *dsq, struct rq *src_rq)
{
	raw_spin_rq_unlock(this_rq);

	if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) {
		move_remote_task_to_local_dsq(p, 0, src_rq, this_rq);
		return true;
	} else {
		raw_spin_rq_unlock(src_rq);
		raw_spin_rq_lock(this_rq);
		return false;
	}
}

/**
 * move_task_between_dsqs() - Move a task from one DSQ to another
 * @sch: scx_sched being operated on
 * @p: target task
 * @enq_flags: %SCX_ENQ_*
 * @src_dsq: DSQ @p is currently on, must not be a local DSQ
 * @dst_dsq: DSQ @p is being moved to, can be any DSQ
 *
 * Must be called with @p's task_rq and @src_dsq locked.
If @dst_dsq is a local
 * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq
 * will change. As @p's task_rq is locked, this function doesn't need to use the
 * holding_cpu mechanism.
 *
 * On return, @src_dsq is unlocked and only @p's new task_rq, which is the
 * return value, is locked.
 */
static struct rq *move_task_between_dsqs(struct scx_sched *sch,
					 struct task_struct *p, u64 enq_flags,
					 struct scx_dispatch_q *src_dsq,
					 struct scx_dispatch_q *dst_dsq)
{
	struct rq *src_rq = task_rq(p), *dst_rq;

	BUG_ON(src_dsq->id == SCX_DSQ_LOCAL);
	lockdep_assert_held(&src_dsq->lock);
	lockdep_assert_rq_held(src_rq);

	if (dst_dsq->id == SCX_DSQ_LOCAL) {
		dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
		/*
		 * If @p can't be moved to the remote @dst_rq, fall back to the
		 * global DSQ and keep @p on @src_rq.
		 */
		if (src_rq != dst_rq &&
		    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
			dst_dsq = find_global_dsq(sch, task_cpu(p));
			dst_rq = src_rq;
			enq_flags |= SCX_ENQ_GDSQ_FALLBACK;
		}
	} else {
		/* no need to migrate if destination is a non-local DSQ */
		dst_rq = src_rq;
	}

	/*
	 * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
	 * CPU, @p will be migrated.
	 */
	if (dst_dsq->id == SCX_DSQ_LOCAL) {
		/* @p is going from a non-local DSQ to a local DSQ */
		if (src_rq == dst_rq) {
			task_unlink_from_dsq(p, src_dsq);
			move_local_task_to_local_dsq(p, enq_flags,
						     src_dsq, dst_rq);
			raw_spin_unlock(&src_dsq->lock);
		} else {
			raw_spin_unlock(&src_dsq->lock);
			move_remote_task_to_local_dsq(p, enq_flags,
						      src_rq, dst_rq);
		}
	} else {
		/*
		 * @p is going from a non-local DSQ to a non-local DSQ. As
		 * $src_dsq is already locked, do an abbreviated dequeue.
*/
		dispatch_dequeue_locked(p, src_dsq);
		raw_spin_unlock(&src_dsq->lock);

		dispatch_enqueue(sch, dst_rq, dst_dsq, p, enq_flags);
	}

	return dst_rq;
}

/*
 * Try to move one task from @dsq to @rq's local DSQ. Tasks already on @rq are
 * moved directly; tasks on other rqs are moved only if
 * task_can_run_on_remote_rq() allows it. Returns %true if a task was moved,
 * %false otherwise.
 */
static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
			       struct scx_dispatch_q *dsq)
{
	struct task_struct *p;
retry:
	/*
	 * The caller can't expect to successfully consume a task if the task's
	 * addition to @dsq isn't guaranteed to be visible somehow. Test
	 * @dsq->list without locking and skip if it seems empty.
	 */
	if (list_empty(&dsq->list))
		return false;

	raw_spin_lock(&dsq->lock);

	nldsq_for_each_task(p, dsq) {
		struct rq *task_rq = task_rq(p);

		/*
		 * This loop can lead to multiple lockup scenarios, e.g. the BPF
		 * scheduler can put an enormous number of affinitized tasks into
		 * a contended DSQ, or the outer retry loop can repeatedly race
		 * against scx_bypass() dequeueing tasks from @dsq trying to put
		 * the system into the bypass mode. This can easily live-lock the
		 * machine. If aborting, exit from all non-bypass DSQs.
*/
		if (unlikely(READ_ONCE(sch->aborting)) && dsq->id != SCX_DSQ_BYPASS)
			break;

		if (rq == task_rq) {
			task_unlink_from_dsq(p, dsq);
			move_local_task_to_local_dsq(p, 0, dsq, rq);
			raw_spin_unlock(&dsq->lock);
			return true;
		}

		if (task_can_run_on_remote_rq(sch, p, rq, false)) {
			if (likely(consume_remote_task(rq, p, dsq, task_rq)))
				return true;
			/* lost to dequeue and @dsq got unlocked, start over */
			goto retry;
		}
	}

	raw_spin_unlock(&dsq->lock);
	return false;
}

/* Consume from the global DSQ of the node that @rq's CPU belongs to. */
static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
{
	int node = cpu_to_node(cpu_of(rq));

	return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq);
}

/**
 * dispatch_to_local_dsq - Dispatch a task to a local dsq
 * @sch: scx_sched being operated on
 * @rq: current rq which is locked
 * @dst_dsq: destination DSQ
 * @p: task to dispatch
 * @enq_flags: %SCX_ENQ_*
 *
 * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
 * DSQ. This function performs all the synchronization dancing needed because
 * local DSQs are protected with rq locks.
 *
 * The caller must have exclusive ownership of @p (e.g. through
 * %SCX_OPSS_DISPATCHING).
 */
static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
				  struct scx_dispatch_q *dst_dsq,
				  struct task_struct *p, u64 enq_flags)
{
	struct rq *src_rq = task_rq(p);
	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
	/* tracks which rq lock is held so that @rq can be re-locked on exit */
	struct rq *locked_rq = rq;

	/*
	 * We're synchronized against dequeue through DISPATCHING. As @p can't
	 * be dequeued, its task_rq and cpus_allowed are stable too.
	 *
	 * If dispatching to @rq that @p is already on, no lock dancing needed.
*/
	if (rq == src_rq && rq == dst_rq) {
		dispatch_enqueue(sch, rq, dst_dsq, p,
				 enq_flags | SCX_ENQ_CLEAR_OPSS);
		return;
	}

	/* @p can't be moved to @dst_rq, fall back to the global DSQ */
	if (src_rq != dst_rq &&
	    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
		dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p,
				 enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK);
		return;
	}

	/*
	 * @p is on a possibly remote @src_rq which we need to lock to move the
	 * task. If dequeue is in progress, it'd be locking @src_rq and waiting
	 * on DISPATCHING, so we can't grab @src_rq lock while holding
	 * DISPATCHING.
	 *
	 * As DISPATCHING guarantees that @p is wholly ours, we can pretend that
	 * we're moving from a DSQ and use the same mechanism - mark the task
	 * under transfer with holding_cpu, release DISPATCHING and then follow
	 * the same protocol. See unlink_dsq_and_lock_src_rq().
	 */
	p->scx.holding_cpu = raw_smp_processor_id();

	/* store_release ensures that dequeue sees the above */
	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);

	/* switch to @src_rq lock */
	if (locked_rq != src_rq) {
		raw_spin_rq_unlock(locked_rq);
		locked_rq = src_rq;
		raw_spin_rq_lock(src_rq);
	}

	/* task_rq couldn't have changed if we're still the holding cpu */
	if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
	    !WARN_ON_ONCE(src_rq != task_rq(p))) {
		/*
		 * If @p is staying on the same rq, there's no need to go
		 * through the full deactivate/activate cycle. Optimize by
		 * abbreviating move_remote_task_to_local_dsq().
*/
		if (src_rq == dst_rq) {
			p->scx.holding_cpu = -1;
			dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p,
					 enq_flags);
		} else {
			move_remote_task_to_local_dsq(p, enq_flags,
						      src_rq, dst_rq);
			/* task has been moved to dst_rq, which is now locked */
			locked_rq = dst_rq;
		}

		/* if the destination CPU is idle, wake it up */
		if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
			resched_curr(dst_rq);
	}

	/* switch back to @rq lock */
	if (locked_rq != rq) {
		raw_spin_rq_unlock(locked_rq);
		raw_spin_rq_lock(rq);
	}
}

/**
 * finish_dispatch - Asynchronously finish dispatching a task
 * @sch: scx_sched being operated on
 * @rq: current rq which is locked
 * @p: task to finish dispatching
 * @qseq_at_dispatch: qseq when @p started getting dispatched
 * @dsq_id: destination DSQ ID
 * @enq_flags: %SCX_ENQ_*
 *
 * Dispatching to local DSQs may need to wait for queueing to complete or
 * require rq lock dancing. As we don't wanna do either while inside
 * ops.dispatch() to avoid locking order inversion, we split dispatching into
 * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the
 * task and its qseq. Once ops.dispatch() returns, this function is called to
 * finish up.
 *
 * There is no guarantee that @p is still valid for dispatching or even that it
 * was valid in the first place. Make sure that the task is still owned by the
 * BPF scheduler and claim the ownership before dispatching.
 */
static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
			    struct task_struct *p,
			    unsigned long qseq_at_dispatch,
			    u64 dsq_id, u64 enq_flags)
{
	struct scx_dispatch_q *dsq;
	unsigned long opss;

	touch_core_sched_dispatch(rq, p);
retry:
	/*
	 * No need for _acquire here. @p is accessed only after a successful
	 * try_cmpxchg to DISPATCHING.
	 */
	opss = atomic_long_read(&p->scx.ops_state);

	switch (opss & SCX_OPSS_STATE_MASK) {
	case SCX_OPSS_DISPATCHING:
	case SCX_OPSS_NONE:
		/* someone else already got to it */
		return;
	case SCX_OPSS_QUEUED:
		/*
		 * If qseq doesn't match, @p has gone through at least one
		 * dispatch/dequeue and re-enqueue cycle between
		 * scx_bpf_dsq_insert() and here and we have no claim on it.
		 */
		if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
			return;

		/* see SCX_EV_INSERT_NOT_OWNED definition */
		if (unlikely(!scx_task_on_sched(sch, p))) {
			__scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1);
			return;
		}

		/*
		 * While we know @p is accessible, we don't yet have a claim on
		 * it - the BPF scheduler is allowed to dispatch tasks
		 * spuriously and there can be a racing dequeue attempt. Let's
		 * claim @p by atomically transitioning it from QUEUED to
		 * DISPATCHING.
		 */
		if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
						   SCX_OPSS_DISPATCHING)))
			break;
		/* ops_state changed under us, re-read and re-evaluate */
		goto retry;
	case SCX_OPSS_QUEUEING:
		/*
		 * do_enqueue_task() is in the process of transferring the task
		 * to the BPF scheduler while holding @p's rq lock. As we aren't
		 * holding any kernel or BPF resource that the enqueue path may
		 * depend upon, it's safe to wait.
*/
		wait_ops_state(p, opss);
		goto retry;
	}

	/* @p is now claimed through DISPATCHING */
	BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));

	dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, task_cpu(p));

	if (dsq->id == SCX_DSQ_LOCAL)
		dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
	else
		dispatch_enqueue(sch, rq, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
}

/*
 * Run finish_dispatch() on every entry buffered in this CPU's dispatch context
 * of @sch, then add the number of flushed entries to dspc->nr_tasks and reset
 * the buffer cursor.
 */
static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
{
	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
	u32 u;

	for (u = 0; u < dspc->cursor; u++) {
		struct scx_dsp_buf_ent *ent = &dspc->buf[u];

		finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
				ent->enq_flags);
	}

	dspc->nr_tasks += dspc->cursor;
	dspc->cursor = 0;
}

/*
 * If %SCX_RQ_BAL_CB_PENDING is set on @rq, queue the deferred balance callback
 * and clear the flag. @rq must be locked.
 */
static inline void maybe_queue_balance_callback(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
		return;

	queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
				deferred_bal_cb_workfn);

	rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
}

/*
 * One user of this function is scx_bpf_dispatch() which can be called
 * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
 * from the call frame.
*/
static __always_inline bool
scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
		   struct task_struct *prev, bool nested)
{
	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
	int nr_loops = SCX_DSP_MAX_LOOPS;
	s32 cpu = cpu_of(rq);
	/* ops.dispatch() is only told about @prev if it's an SCX task on @sch */
	bool prev_on_sch = (prev->sched_class == &ext_sched_class) &&
		scx_task_on_sched(sch, prev);

	if (consume_global_dsq(sch, rq))
		return true;

	if (bypass_dsp_enabled(sch)) {
		/* if @sch is bypassing, only the bypass DSQs are active */
		if (scx_bypassing(sch, cpu))
			return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));

#ifdef CONFIG_EXT_SUB_SCHED
		/*
		 * If @sch isn't bypassing but its children are, @sch is
		 * responsible for making forward progress for both its own
		 * tasks that aren't bypassing and the bypassing descendants'
		 * tasks. The following implements a simple built-in behavior -
		 * let each CPU try to run the bypass DSQ every Nth time.
		 *
		 * Later, if necessary, we can add an ops flag to suppress the
		 * auto-consumption and a kfunc to consume the bypass DSQ so
		 * that the BPF scheduler can fully control scheduling of
		 * bypassed tasks.
		 */
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

		if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
		    consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu))) {
			__scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1);
			return true;
		}
#endif /* CONFIG_EXT_SUB_SCHED */
	}

	if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
		return false;

	dspc->rq = rq;

	/*
	 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
	 * the local DSQ might still end up empty after a successful
	 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
	 * produced some tasks, retry. The BPF scheduler may depend on this
	 * looping behavior to simplify its implementation.
	 */
	do {
		dspc->nr_tasks = 0;

		if (nested) {
			/*
			 * If nested, don't update kf_mask as the originating
			 * invocation would already have set it up.
			 */
			SCX_CALL_OP(sch, 0, dispatch, rq, cpu,
				    prev_on_sch ? prev : NULL);
		} else {
			/*
			 * If not nested, stash @prev so that nested invocations
			 * can access it.
			 */
			rq->scx.sub_dispatch_prev = prev;
			SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
				    prev_on_sch ? prev : NULL);
			rq->scx.sub_dispatch_prev = NULL;
		}

		flush_dispatch_buf(sch, rq);

		/* @prev is still queued with slice left, keep running it */
		if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) {
			rq->scx.flags |= SCX_RQ_BAL_KEEP;
			return true;
		}
		if (rq->scx.local_dsq.nr)
			return true;
		if (consume_global_dsq(sch, rq))
			return true;

		/*
		 * ops.dispatch() can trap us in this loop by repeatedly
		 * dispatching ineligible tasks. Break out once in a while to
		 * allow the watchdog to run. As IRQ can't be enabled in
		 * balance(), we want to complete this scheduling cycle and then
		 * start a new one. IOW, we want to call resched_curr() on the
		 * next, most likely idle, task, not the current one. Use
		 * scx_kick_cpu() for deferred kicking.
		 */
		if (unlikely(!--nr_loops)) {
			scx_kick_cpu(sch, cpu, 0);
			break;
		}
	} while (dspc->nr_tasks);

	/*
	 * Prevent the CPU from going idle while bypassed descendants have tasks
	 * queued. Without this fallback, bypassed tasks could stall if the host
	 * scheduler's ops.dispatch() doesn't yield any tasks.
*/
	if (bypass_dsp_enabled(sch))
		return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));

	return false;
}

/*
 * Make sure @rq has something to run from SCX's point of view. @rq must be
 * locked. %SCX_RQ_IN_BALANCE is set on @rq for the duration of the call.
 *
 * Returns %true if there is a task to run - @prev is to be kept running
 * (%SCX_RQ_BAL_KEEP), the local DSQ is non-empty or scx_dispatch_sched()
 * succeeded - and %false otherwise.
 */
static int balance_one(struct rq *rq, struct task_struct *prev)
{
	struct scx_sched *sch = scx_root;
	s32 cpu = cpu_of(rq);

	lockdep_assert_rq_held(rq);
	rq->scx.flags |= SCX_RQ_IN_BALANCE;
	rq->scx.flags &= ~SCX_RQ_BAL_KEEP;

	if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
	    unlikely(rq->scx.cpu_released)) {
		/*
		 * If the previous sched_class for the current CPU was not SCX,
		 * notify the BPF scheduler that it again has control of the
		 * core. This callback complements ->cpu_release(), which is
		 * emitted in switch_class().
		 */
		if (SCX_HAS_OP(sch, cpu_acquire))
			SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq, cpu, NULL);
		rq->scx.cpu_released = false;
	}

	if (prev->sched_class == &ext_sched_class) {
		update_curr_scx(rq);

		/*
		 * If @prev is runnable & has slice left, it has priority and
		 * fetching more just increases latency for the fetched tasks.
		 * Tell pick_task_scx() to keep running @prev. If the BPF
		 * scheduler wants to handle this explicitly, it should
		 * implement ->cpu_release().
		 *
		 * See scx_disable_workfn() for the explanation on the bypassing
		 * test.
		 */
		if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice &&
		    !scx_bypassing(sch, cpu)) {
			rq->scx.flags |= SCX_RQ_BAL_KEEP;
			goto has_tasks;
		}
	}

	/* if there already are tasks to run, nothing to do */
	if (rq->scx.local_dsq.nr)
		goto has_tasks;

	if (scx_dispatch_sched(sch, rq, prev, false))
		goto has_tasks;

	/*
	 * Didn't find another task to run. Keep running @prev unless
	 * %SCX_OPS_ENQ_LAST is in effect.
	 */
	if ((prev->scx.flags & SCX_TASK_QUEUED) &&
	    (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) {
		rq->scx.flags |= SCX_RQ_BAL_KEEP;
		__scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
		goto has_tasks;
	}
	/* nothing to run: leave the balance section and report so */
	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
	return false;

has_tasks:
	/*
	 * @rq may have extra IMMED tasks without reenq scheduled:
	 *
	 * - rq_is_open() can't reliably tell when and how slice is going to be
	 *   modified for $curr and allows IMMED tasks to be queued while
	 *   dispatch is in progress.
	 *
	 * - A non-IMMED HEAD task can get queued in front of an IMMED task
	 *   between the IMMED queueing and the subsequent scheduling event.
	 */
	if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed))
		schedule_reenq_local(rq, 0);

	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
	return true;
}

/*
 * @p is being set as the next task on @rq. If @p is still queued, dequeue it
 * through ops_dequeue() and dispatch_dequeue(), stamp its exec start time,
 * invoke ops.running() and refresh %SCX_RQ_CAN_STOP_TICK to match @p's slice.
 */
static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
{
	struct scx_sched *sch = scx_task_sched(p);

	if (p->scx.flags & SCX_TASK_QUEUED) {
		/*
		 * Core-sched might decide to execute @p before it is
		 * dispatched. Call ops_dequeue() to notify the BPF scheduler.
		 */
		ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC);
		dispatch_dequeue(rq, p);
	}

	p->se.exec_start = rq_clock_task(rq);

	/* see dequeue_task_scx() on why we skip when !QUEUED */
	if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p);

	clr_task_runnable(p, true);

	/*
	 * @p is getting newly scheduled or got kicked after someone updated its
	 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick().
	 */
	/* only act when the flag disagrees with whether @p's slice is infinite */
	if ((p->scx.slice == SCX_SLICE_INF) !=
	    (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
		if (p->scx.slice == SCX_SLICE_INF)
			rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
		else
			rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;

		sched_update_tick_dependency(rq);

		/*
		 * For now, let's refresh the load_avgs just when transitioning
		 * in and out of nohz. In the future, we might want to add a
		 * mechanism which calls the following periodically on
		 * tick-stopped CPUs.
		 */
		update_other_load_avgs(rq);
	}
}

/*
 * Map the sched_class taking over the CPU to the matching SCX_CPU_PREEMPT_*
 * reason. Anything other than the stop, deadline and RT classes is reported as
 * %SCX_CPU_PREEMPT_UNKNOWN.
 */
static enum scx_cpu_preempt_reason
preempt_reason_from_class(const struct sched_class *class)
{
	if (class == &stop_sched_class)
		return SCX_CPU_PREEMPT_STOP;
	if (class == &dl_sched_class)
		return SCX_CPU_PREEMPT_DL;
	if (class == &rt_sched_class)
		return SCX_CPU_PREEMPT_RT;
	return SCX_CPU_PREEMPT_UNKNOWN;
}

/*
 * @next, a task of another sched_class, is about to run on @rq. Invoke
 * ops.cpu_release() if appropriate. No-op unless the scheduler has
 * %SCX_OPS_HAS_CPU_PREEMPT set.
 */
static void switch_class(struct rq *rq, struct task_struct *next)
{
	struct scx_sched *sch = scx_root;
	const struct sched_class *next_class = next->sched_class;

	if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
		return;

	/*
	 * The callback is conceptually meant to convey that the CPU is no
	 * longer under the control of SCX. Therefore, don't invoke the callback
	 * if the next class is below SCX (in which case the BPF scheduler has
	 * actively decided not to schedule any tasks on the CPU).
	 */
	if (sched_class_above(&ext_sched_class, next_class))
		return;

	/*
	 * At this point we know that SCX was preempted by a higher priority
	 * sched_class, so invoke the ->cpu_release() callback if we have not
	 * done so already. We only send the callback once between SCX being
	 * preempted, and it regaining control of the CPU.
	 *
	 * ->cpu_release() complements ->cpu_acquire(), which is emitted the
	 * next time that balance_one() is invoked.
	 */
	if (!rq->scx.cpu_released) {
		if (SCX_HAS_OP(sch, cpu_release)) {
			struct scx_cpu_release_args args = {
				.reason = preempt_reason_from_class(next_class),
				.task = next,
			};

			SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq,
				    cpu_of(rq), &args);
		}
		/* set even without the op so the release is recorded only once */
		rq->scx.cpu_released = true;
	}
}

/*
 * @p is being put as @rq's previous task; @next, which may be %NULL, is the
 * task taking over. Invokes ops.stopping() and, if @p is still queued, puts it
 * back either on the local DSQ or through do_enqueue_task().
 */
static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
			      struct task_struct *next)
{
	struct scx_sched *sch = scx_task_sched(p);

	/* see kick_cpus_irq_workfn() */
	smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);

	update_curr_scx(rq);

	/* see dequeue_task_scx() on why we skip when !QUEUED */
	if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true);

	if (p->scx.flags & SCX_TASK_QUEUED) {
		set_task_runnable(rq, p);

		/*
		 * If @p has slice left and is being put, @p is getting
		 * preempted by a higher priority scheduler class or core-sched
		 * forcing a different task. Leave it at the head of the local
		 * DSQ unless it was an IMMED task. IMMED tasks should not
		 * linger on a busy CPU, reenqueue them to the BPF scheduler.
		 */
		if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) {
			if (p->scx.flags & SCX_TASK_IMMED) {
				/* the reason flag is only set across the enqueue */
				p->scx.flags |= SCX_TASK_REENQ_PREEMPTED;
				do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
				p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
			} else {
				dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD);
			}
			goto switch_class;
		}

		/*
		 * If @p is runnable but we're about to enter a lower
		 * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell
		 * ops.enqueue() that @p is the only one available for this cpu,
		 * which should trigger an explicit follow-up scheduling event.
		 */
		if (next && sched_class_above(&ext_sched_class, next->sched_class)) {
			WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST));
			do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
		} else {
			do_enqueue_task(rq, p, 0, -1);
		}
	}

switch_class:
	/* leaving SCX for another class, see switch_class() */
	if (next && next->sched_class != &ext_sched_class)
		switch_class(rq, next);
}

/* Return the task at the head of @rq's local DSQ, or %NULL if it is empty. */
static struct task_struct *first_local_task(struct rq *rq)
{
	return list_first_entry_or_null(&rq->scx.local_dsq.list,
					struct task_struct, scx.dsq_list.node);
}

/*
 * Pick the next SCX task to run on @rq. balance_one() is run with the rq lock
 * unpinned. Returns the picked task, %NULL if there is none, or %RETRY_TASK if
 * a higher-priority class modified @rq in the meantime and @force_scx is not
 * set.
 */
static struct task_struct *
do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
{
	struct task_struct *prev = rq->curr;
	bool keep_prev;
	struct task_struct *p;

	/* see kick_cpus_irq_workfn() */
	smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);

	rq_modified_begin(rq, &ext_sched_class);

	rq_unpin_lock(rq, rf);
	balance_one(rq, prev);
	rq_repin_lock(rq, rf);
	maybe_queue_balance_callback(rq);

	/*
	 * If any higher-priority sched class enqueued a runnable task on
	 * this rq during balance_one(), abort and return RETRY_TASK, so
	 * that the scheduler loop can restart.
	 *
	 * If @force_scx is true, always try to pick a SCHED_EXT task,
	 * regardless of any higher-priority sched classes activity.
	 */
	if (!force_scx && rq_modified_above(rq, &ext_sched_class))
		return RETRY_TASK;

	keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
	/* BAL_KEEP for a non-SCX @prev is unexpected while enabled; ignore it */
	if (unlikely(keep_prev &&
		     prev->sched_class != &ext_sched_class)) {
		WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
		keep_prev = false;
	}

	/*
	 * If balance_one() is telling us to keep running @prev, replenish slice
	 * if necessary and keep running @prev. Otherwise, pop the first one
	 * from the local DSQ.
	 */
	if (keep_prev) {
		p = prev;
		if (!p->scx.slice)
			refill_task_slice_dfl(scx_task_sched(p), p);
	} else {
		p = first_local_task(rq);
		if (!p)
			return NULL;

		if (unlikely(!p->scx.slice)) {
			struct scx_sched *sch = scx_task_sched(p);

			/* warn at most once per scheduler, and not while bypassing */
			if (!scx_bypassing(sch, cpu_of(rq)) &&
			    !sch->warned_zero_slice) {
				printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
						p->comm, p->pid, __func__);
				sch->warned_zero_slice = true;
			}
			refill_task_slice_dfl(sch, p);
		}
	}

	return p;
}

/* do_pick_task_scx() without forcing SCX, so %RETRY_TASK may be returned. */
static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
{
	return do_pick_task_scx(rq, rf, false);
}

/*
 * Select the next task to run from the ext scheduling class.
 *
 * Use do_pick_task_scx() directly with @force_scx enabled, since the
 * dl_server must always select a sched_ext task.
 *
 * Returns %NULL when SCX is not enabled.
 */
static struct task_struct *
ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
{
	if (!scx_enabled())
		return NULL;

	return do_pick_task_scx(dl_se->rq, rf, true);
}

/*
 * Initialize the ext server deadline entity of @rq and register
 * ext_server_pick_task() as its pick function.
 */
void ext_server_init(struct rq *rq)
{
	struct sched_dl_entity *dl_se = &rq->ext_server;

	init_dl_entity(dl_se);

	dl_server_init(dl_se, rq, ext_server_pick_task);
}

#ifdef CONFIG_SCHED_CORE
/**
 * scx_prio_less - Task ordering for core-sched
 * @a: task A
 * @b: task B
 * @in_fi: in forced idle state
 *
 * Core-sched is implemented as an additional scheduling layer on top of the
 * usual sched_class'es and needs to find out the expected task ordering. For
 * SCX, core-sched calls this function to interrogate the task ordering.
 *
 * Unless overridden by ops.core_sched_before(), each task's
 * scx.core_sched_at is used to implement the default task ordering. The older
 * the timestamp, the higher priority the task - the global FIFO ordering
 * matching the default scheduling behavior.
 *
 * When ops.core_sched_before() is enabled, scx.core_sched_at is used to
 * implement FIFO ordering within each local DSQ. See pick_task_scx().
 *
 * ops.core_sched_before() is only consulted when @a and @b belong to the same
 * scheduler and that scheduler is not bypassing on @a's CPU. Otherwise the
 * timestamp ordering is used.
 */
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
		   bool in_fi)
{
	struct scx_sched *sch_a = scx_task_sched(a);
	struct scx_sched *sch_b = scx_task_sched(b);

	/*
	 * The const qualifiers are dropped from task_struct pointers when
	 * calling ops.core_sched_before(). Accesses are controlled by the
	 * verifier.
	 */
	if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) &&
	    !scx_bypassing(sch_a, task_cpu(a)))
		return SCX_CALL_OP_2TASKS_RET(sch_a, SCX_KF_REST, core_sched_before,
					      NULL,
					      (struct task_struct *)a,
					      (struct task_struct *)b);
	else
		return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
}
#endif	/* CONFIG_SCHED_CORE */

/*
 * Select the CPU @p should be queued on. Uses ops.select_cpu() when the
 * scheduler implements it and is not bypassing, and scx_select_cpu_dfl()
 * otherwise. The chosen CPU is also recorded in @p->scx.selected_cpu.
 */
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
	struct scx_sched *sch = scx_task_sched(p);
	bool bypassing;

	/*
	 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
	 * can be a good migration opportunity with low cache and memory
	 * footprint. Returning a CPU different than @prev_cpu triggers
	 * immediate rq migration. However, for SCX, as the current rq
	 * association doesn't dictate where the task is going to run, this
	 * doesn't fit well. If necessary, we can later add a dedicated method
	 * which can decide to preempt self to force it through the regular
	 * scheduling path.
	 */
	if (unlikely(wake_flags & WF_EXEC))
		return prev_cpu;

	bypassing = scx_bypassing(sch, task_cpu(p));
	if (likely(SCX_HAS_OP(sch, select_cpu)) && !bypassing) {
		s32 cpu;
		struct task_struct **ddsp_taskp;

		/*
		 * Mark @p as this CPU's direct_dispatch_task for the duration
		 * of ops.select_cpu(). The slot is expected to be empty.
		 */
		ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
		WARN_ON_ONCE(*ddsp_taskp);
		*ddsp_taskp = p;

		cpu = SCX_CALL_OP_TASK_RET(sch,
					   SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
					   select_cpu, NULL, p, prev_cpu,
					   wake_flags);
		p->scx.selected_cpu = cpu;
		*ddsp_taskp = NULL;
		/* fall back to @prev_cpu if the BPF scheduler returned a bad CPU */
		if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()"))
			return cpu;
		else
			return prev_cpu;
	} else {
		s32 cpu;

		cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
		if (cpu >= 0) {
			/* a CPU was found: direct-dispatch @p to its local DSQ */
			refill_task_slice_dfl(sch, p);
			p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
		} else {
			cpu = prev_cpu;
		}
		p->scx.selected_cpu = cpu;

		if (bypassing)
			__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
		return cpu;
	}
}

/* @p has been woken on @rq; run @rq's deferred work via run_deferred(). */
static void task_woken_scx(struct rq *rq, struct task_struct *p)
{
	run_deferred(rq);
}

/*
 * Apply the affinity change in @ac to @p and report the resulting effective
 * cpumask to the BPF scheduler through ops.set_cpumask(). Tasks which are dead
 * and done are not reported.
 */
static void set_cpus_allowed_scx(struct task_struct *p,
				 struct affinity_context *ac)
{
	struct scx_sched *sch = scx_task_sched(p);

	set_cpus_allowed_common(p, ac);

	if (task_dead_and_done(p))
		return;

	/*
	 * The effective cpumask is stored in @p->cpus_ptr which may temporarily
	 * differ from the configured one in @p->cpus_mask. Always tell the bpf
	 * scheduler the effective one.
	 *
	 * Fine-grained memory write control is enforced by BPF making the const
	 * designation pointless. Cast it away when calling the operation.
3276 */ 3277 if (SCX_HAS_OP(sch, set_cpumask)) 3278 SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL, 3279 p, (struct cpumask *)p->cpus_ptr); 3280 } 3281 3282 static void handle_hotplug(struct rq *rq, bool online) 3283 { 3284 struct scx_sched *sch = scx_root; 3285 s32 cpu = cpu_of(rq); 3286 3287 atomic_long_inc(&scx_hotplug_seq); 3288 3289 /* 3290 * scx_root updates are protected by cpus_read_lock() and will stay 3291 * stable here. Note that we can't depend on scx_enabled() test as the 3292 * hotplug ops need to be enabled before __scx_enabled is set. 3293 */ 3294 if (unlikely(!sch)) 3295 return; 3296 3297 if (scx_enabled()) 3298 scx_idle_update_selcpu_topology(&sch->ops); 3299 3300 if (online && SCX_HAS_OP(sch, cpu_online)) 3301 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu); 3302 else if (!online && SCX_HAS_OP(sch, cpu_offline)) 3303 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu); 3304 else 3305 scx_exit(sch, SCX_EXIT_UNREG_KERN, 3306 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 3307 "cpu %d going %s, exiting scheduler", cpu, 3308 online ? 
"online" : "offline"); 3309 } 3310 3311 void scx_rq_activate(struct rq *rq) 3312 { 3313 handle_hotplug(rq, true); 3314 } 3315 3316 void scx_rq_deactivate(struct rq *rq) 3317 { 3318 handle_hotplug(rq, false); 3319 } 3320 3321 static void rq_online_scx(struct rq *rq) 3322 { 3323 rq->scx.flags |= SCX_RQ_ONLINE; 3324 } 3325 3326 static void rq_offline_scx(struct rq *rq) 3327 { 3328 rq->scx.flags &= ~SCX_RQ_ONLINE; 3329 } 3330 3331 static bool check_rq_for_timeouts(struct rq *rq) 3332 { 3333 struct scx_sched *sch; 3334 struct task_struct *p; 3335 struct rq_flags rf; 3336 bool timed_out = false; 3337 3338 rq_lock_irqsave(rq, &rf); 3339 sch = rcu_dereference_bh(scx_root); 3340 if (unlikely(!sch)) 3341 goto out_unlock; 3342 3343 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { 3344 struct scx_sched *sch = scx_task_sched(p); 3345 unsigned long last_runnable = p->scx.runnable_at; 3346 3347 if (unlikely(time_after(jiffies, 3348 last_runnable + READ_ONCE(sch->watchdog_timeout)))) { 3349 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); 3350 3351 scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, 3352 "%s[%d] failed to run for %u.%03us", 3353 p->comm, p->pid, dur_ms / 1000, dur_ms % 1000); 3354 timed_out = true; 3355 break; 3356 } 3357 } 3358 out_unlock: 3359 rq_unlock_irqrestore(rq, &rf); 3360 return timed_out; 3361 } 3362 3363 static void scx_watchdog_workfn(struct work_struct *work) 3364 { 3365 unsigned long intv; 3366 int cpu; 3367 3368 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 3369 3370 for_each_online_cpu(cpu) { 3371 if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) 3372 break; 3373 3374 cond_resched(); 3375 } 3376 3377 intv = READ_ONCE(scx_watchdog_interval); 3378 if (intv < ULONG_MAX) 3379 queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv); 3380 } 3381 3382 void scx_tick(struct rq *rq) 3383 { 3384 struct scx_sched *root; 3385 unsigned long last_check; 3386 3387 if (!scx_enabled()) 3388 return; 3389 3390 root = rcu_dereference_bh(scx_root); 
	if (unlikely(!root))
		return;

	/* the watchdog work stamps scx_watchdog_timestamp each time it runs */
	last_check = READ_ONCE(scx_watchdog_timestamp);
	if (unlikely(time_after(jiffies,
				last_check + READ_ONCE(root->watchdog_timeout)))) {
		u32 dur_ms = jiffies_to_msecs(jiffies - last_check);

		scx_exit(root, SCX_EXIT_ERROR_STALL, 0,
			 "watchdog failed to check in for %u.%03us",
			 dur_ms / 1000, dur_ms % 1000);
	}

	update_other_load_avgs(rq);
}

/*
 * Tick handler for @curr running on @rq. Invokes ops.tick() unless the
 * scheduler is bypassing on this CPU, and reschedules once @curr's slice has
 * reached zero.
 */
static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
{
	struct scx_sched *sch = scx_task_sched(curr);

	update_curr_scx(rq);

	/*
	 * While disabling, always resched and refresh core-sched timestamp as
	 * we can't trust the slice management or ops.core_sched_before().
	 */
	if (scx_bypassing(sch, cpu_of(rq))) {
		curr->scx.slice = 0;
		touch_core_sched(rq, curr);
	} else if (SCX_HAS_OP(sch, tick)) {
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr);
	}

	if (!curr->scx.slice)
		resched_curr(rq);
}

#ifdef CONFIG_EXT_GROUP_SCHED
/* Return the cgroup associated with @tg, falling back to the default root. */
static struct cgroup *tg_cgrp(struct task_group *tg)
{
	/*
	 * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup,
	 * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the
	 * root cgroup.
3435 */ 3436 if (tg && tg->css.cgroup) 3437 return tg->css.cgroup; 3438 else 3439 return &cgrp_dfl_root.cgrp; 3440 } 3441 3442 #define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), 3443 3444 #else /* CONFIG_EXT_GROUP_SCHED */ 3445 3446 #define SCX_INIT_TASK_ARGS_CGROUP(tg) 3447 3448 #endif /* CONFIG_EXT_GROUP_SCHED */ 3449 3450 static u32 scx_get_task_state(const struct task_struct *p) 3451 { 3452 return p->scx.flags & SCX_TASK_STATE_MASK; 3453 } 3454 3455 static void scx_set_task_state(struct task_struct *p, u32 state) 3456 { 3457 u32 prev_state = scx_get_task_state(p); 3458 bool warn = false; 3459 3460 switch (state) { 3461 case SCX_TASK_NONE: 3462 break; 3463 case SCX_TASK_INIT: 3464 warn = prev_state != SCX_TASK_NONE; 3465 break; 3466 case SCX_TASK_READY: 3467 warn = prev_state == SCX_TASK_NONE; 3468 break; 3469 case SCX_TASK_ENABLED: 3470 warn = prev_state != SCX_TASK_READY; 3471 break; 3472 default: 3473 warn = true; 3474 return; 3475 } 3476 3477 WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", 3478 prev_state, state, p->comm, p->pid); 3479 3480 p->scx.flags &= ~SCX_TASK_STATE_MASK; 3481 p->scx.flags |= state; 3482 } 3483 3484 static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) 3485 { 3486 int ret; 3487 3488 p->scx.disallow = false; 3489 3490 if (SCX_HAS_OP(sch, init_task)) { 3491 struct scx_init_task_args args = { 3492 SCX_INIT_TASK_ARGS_CGROUP(task_group(p)) 3493 .fork = fork, 3494 }; 3495 3496 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL, 3497 p, &args); 3498 if (unlikely(ret)) { 3499 ret = ops_sanitize_err(sch, "init_task", ret); 3500 return ret; 3501 } 3502 } 3503 3504 if (p->scx.disallow) { 3505 if (unlikely(scx_parent(sch))) { 3506 scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]", 3507 p->comm, p->pid); 3508 } else if (unlikely(fork)) { 3509 scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", 3510 p->comm, 
				  p->pid);
		} else {
			struct rq *rq;
			struct rq_flags rf;

			rq = task_rq_lock(p, &rf);

			/*
			 * We're in the load path and @p->policy will be applied
			 * right after. Reverting @p->policy here and rejecting
			 * %SCHED_EXT transitions from scx_check_setscheduler()
			 * guarantees that if ops.init_task() sets
			 * @p->scx.disallow, @p can never be in SCX.
			 */
			if (p->policy == SCHED_EXT) {
				p->policy = SCHED_NORMAL;
				atomic_long_inc(&scx_nr_rejected);
			}

			task_rq_unlock(rq, p, &rf);
		}
	}

	return 0;
}

/*
 * Initialize @p for @sch through __scx_init_task() and, on success, move it to
 * %SCX_TASK_INIT. Returns 0 on success or the error from __scx_init_task().
 */
static int scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork)
{
	int ret;

	ret = __scx_init_task(sch, p, fork);
	if (!ret) {
		/*
		 * While @p's rq is not locked, @p is not visible to the rest of
		 * SCX yet and it's safe to update the flags and state.
		 */
		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
		scx_set_task_state(p, SCX_TASK_INIT);
	}
	return ret;
}

/*
 * Set @p's SCX weight from its policy / static priority and invoke
 * ops.enable() followed by ops.set_weight() on @sch. Must be called with @p's
 * rq locked. Does not change @p's SCX task state.
 */
static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p)
{
	struct rq *rq = task_rq(p);
	u32 weight;

	lockdep_assert_rq_held(rq);

	/*
	 * Verify the task is not in BPF scheduler's custody. If flag
	 * transitions are consistent, the flag should always be clear
	 * here.
	 */
	WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY);

	/*
	 * Set the weight before calling ops.enable() so that the scheduler
	 * doesn't see a stale value if they inspect the task struct.
	 */
	if (task_has_idle_policy(p))
		weight = WEIGHT_IDLEPRIO;
	else
		weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO];

	p->scx.weight = sched_weight_to_cgroup(weight);

	if (SCX_HAS_OP(sch, enable))
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p);

	if (SCX_HAS_OP(sch, set_weight))
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
				 p, p->scx.weight);
}

/* Enable @p on @sch and move it to %SCX_TASK_ENABLED. */
static void scx_enable_task(struct scx_sched *sch, struct task_struct *p)
{
	__scx_enable_task(sch, p);
	scx_set_task_state(p, SCX_TASK_ENABLED);
}

/*
 * Invoke ops.disable() for @p on @sch and move @p from %SCX_TASK_ENABLED back
 * to %SCX_TASK_READY. Must be called with @p's rq locked.
 */
static void scx_disable_task(struct scx_sched *sch, struct task_struct *p)
{
	struct rq *rq = task_rq(p);

	lockdep_assert_rq_held(rq);
	WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);

	if (SCX_HAS_OP(sch, disable))
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p);
	scx_set_task_state(p, SCX_TASK_READY);

	/*
	 * Verify the task is not in BPF scheduler's custody. If flag
	 * transitions are consistent, the flag should always be clear
	 * here.
	 */
	WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY);
}

/*
 * Disable @p on @sch if it is enabled and then invoke ops.exit_task(). Nothing
 * is done for a task in %SCX_TASK_NONE. ops.exit_task() is told that the task
 * was cancelled when @p is still in %SCX_TASK_INIT. Must be called with @p's
 * pi_lock and rq lock held. Does not reset @p's SCX task state.
 */
static void __scx_disable_and_exit_task(struct scx_sched *sch,
					struct task_struct *p)
{
	struct scx_exit_task_args args = {
		.cancelled = false,
	};

	lockdep_assert_held(&p->pi_lock);
	lockdep_assert_rq_held(task_rq(p));

	switch (scx_get_task_state(p)) {
	case SCX_TASK_NONE:
		return;
	case SCX_TASK_INIT:
		args.cancelled = true;
		break;
	case SCX_TASK_READY:
		break;
	case SCX_TASK_ENABLED:
		scx_disable_task(sch, p);
		break;
	default:
		WARN_ON_ONCE(true);
		return;
	}

	if (SCX_HAS_OP(sch, exit_task))
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
				 p, &args);
}

/*
 * Disable and exit @p from @sch, and from the sub-sched being enabled if @p
 * was also initialized for it. Afterwards @p has no associated scheduler and is
 * in %SCX_TASK_NONE.
 */
static void scx_disable_and_exit_task(struct scx_sched *sch,
				      struct task_struct *p)
{
	__scx_disable_and_exit_task(sch, p);

	/*
	 * If set, @p exited between __scx_init_task() and scx_enable_task() in
	 * scx_sub_enable() and is initialized for both the associated sched and
	 * its parent. Disable and exit for the child too.
	 */
	if ((p->scx.flags & SCX_TASK_SUB_INIT) &&
	    !WARN_ON_ONCE(!scx_enabling_sub_sched)) {
		__scx_disable_and_exit_task(scx_enabling_sub_sched, p);
		p->scx.flags &= ~SCX_TASK_SUB_INIT;
	}

	scx_set_task_sched(p, NULL);
	scx_set_task_state(p, SCX_TASK_NONE);
}

/*
 * Reset @scx to its initial state: zero everything, then set up the list and
 * rb-tree nodes and the fields whose initial value is not zero.
 */
void init_scx_entity(struct sched_ext_entity *scx)
{
	memset(scx, 0, sizeof(*scx));
	INIT_LIST_HEAD(&scx->dsq_list.node);
	RB_CLEAR_NODE(&scx->dsq_priq);
	scx->sticky_cpu = -1;
	scx->holding_cpu = -1;
	INIT_LIST_HEAD(&scx->runnable_node);
	scx->runnable_at = jiffies;
	scx->ddsp_dsq_id = SCX_DSQ_INVALID;
	scx->slice = SCX_SLICE_DFL;
}

/*
 * Called before @p is forked. Takes scx_fork_rwsem for read; it is released by
 * scx_post_fork() or scx_cancel_fork().
 */
void scx_pre_fork(struct task_struct *p)
{
	/*
	 * BPF scheduler enable/disable paths want to be able to iterate and
	 * update all tasks which can become complex when racing forks. As
	 * enable/disable are very cold paths, let's use a percpu_rwsem to
	 * exclude forks.
	 */
	percpu_down_read(&scx_fork_rwsem);
}

/*
 * Initialize the forking task @p for SCX if task initialization is enabled.
 * With sub-sched support the scheduler is taken from @kargs' default cgroup,
 * otherwise scx_root is used. Returns 0 on success or the error from
 * scx_init_task(). Must be called with scx_fork_rwsem held.
 */
int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
	s32 ret;

	percpu_rwsem_assert_held(&scx_fork_rwsem);

	if (scx_init_task_enabled) {
#ifdef CONFIG_EXT_SUB_SCHED
		struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
#else
		struct scx_sched *sch = scx_root;
#endif
		ret = scx_init_task(sch, p, true);
		/* only associate @p with @sch once it has been initialized */
		if (!ret)
			scx_set_task_sched(p, sch);
		return ret;
	}

	return 0;
}

/*
 * Finish forking @p: mark it ready, enable it if it already runs on sched_ext,
 * add it to scx_tasks and release scx_fork_rwsem.
 */
void scx_post_fork(struct task_struct *p)
{
	if (scx_init_task_enabled) {
		scx_set_task_state(p, SCX_TASK_READY);

		/*
		 * Enable the task immediately if it's running on sched_ext.
		 * Otherwise, it'll be enabled in switching_to_scx() if and
		 * when it's ever configured to run with a SCHED_EXT policy.
		 */
		if (p->sched_class == &ext_sched_class) {
			struct rq_flags rf;
			struct rq *rq;

			rq = task_rq_lock(p, &rf);
			scx_enable_task(scx_task_sched(p), p);
			task_rq_unlock(rq, p, &rf);
		}
	}

	/* see the comment above scx_tasks for why every task is tracked */
	raw_spin_lock_irq(&scx_tasks_lock);
	list_add_tail(&p->scx.tasks_node, &scx_tasks);
	raw_spin_unlock_irq(&scx_tasks_lock);

	percpu_up_read(&scx_fork_rwsem);
}

/*
 * The fork of @p failed after scx_pre_fork(). Undo whatever scx_fork() set up
 * for @p, if SCX is enabled, and release scx_fork_rwsem.
 */
void scx_cancel_fork(struct task_struct *p)
{
	if (scx_enabled()) {
		struct rq *rq;
		struct rq_flags rf;

		rq = task_rq_lock(p, &rf);
		/* scx_post_fork() never ran, so @p can't be READY or beyond */
		WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
		scx_disable_and_exit_task(scx_task_sched(p), p);
		task_rq_unlock(rq, p, &rf);
	}

	percpu_up_read(&scx_fork_rwsem);
}

/**
 * task_dead_and_done - Is a task dead and done running?
 * @p: target task
 *
 * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the
 * task no longer exists from SCX's POV. However, certain sched_class ops may be
 * invoked on these dead tasks leading to failures - e.g. sched_setscheduler()
 * may try to switch a task which finished sched_ext_dead() back into SCX
 * triggering invalid SCX task state transitions and worse.
 *
 * Once a task has finished the final switch, sched_ext_dead() is the only thing
 * that needs to happen on the task. Use this test to short-circuit sched_class
 * operations which may be called on dead tasks.
 *
 * Must be called with @p's rq locked.
 */
static bool task_dead_and_done(struct task_struct *p)
{
	struct rq *rq = task_rq(p);

	lockdep_assert_rq_held(rq);

	/*
	 * In do_task_dead(), a dying task sets %TASK_DEAD with preemption
	 * disabled and calls __schedule(). If @p has %TASK_DEAD set and is off
	 * CPU, @p won't ever run again.
	 */
	return unlikely(READ_ONCE(p->__state) == TASK_DEAD) &&
		!task_on_cpu(rq, p);
}

/*
 * Final SCX teardown for the dead task @p: take it off scx_tasks and, unless it
 * is already in %SCX_TASK_NONE, disable and exit it from its scheduler.
 */
void sched_ext_dead(struct task_struct *p)
{
	unsigned long flags;

	/*
	 * By the time control reaches here, @p has %TASK_DEAD set, switched out
	 * for the last time and then dropped the rq lock - task_dead_and_done()
	 * should be returning %true nullifying the straggling sched_class ops.
	 * Remove from scx_tasks and exit @p.
	 */
	raw_spin_lock_irqsave(&scx_tasks_lock, flags);
	list_del_init(&p->scx.tasks_node);
	raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);

	/*
	 * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
	 * ENABLED transitions can't race us. Disable ops for @p.
	 */
	if (scx_get_task_state(p) != SCX_TASK_NONE) {
		struct rq_flags rf;
		struct rq *rq;

		rq = task_rq_lock(p, &rf);
		scx_disable_and_exit_task(scx_task_sched(p), p);
		task_rq_unlock(rq, p, &rf);
	}
}

/*
 * @p's load weight is changing to @lw. Update @p->scx.weight accordingly and
 * tell the BPF scheduler through ops.set_weight(). Ignored for tasks which are
 * dead and done.
 */
static void reweight_task_scx(struct rq *rq, struct task_struct *p,
			      const struct load_weight *lw)
{
	struct scx_sched *sch = scx_task_sched(p);

	lockdep_assert_rq_held(task_rq(p));

	if (task_dead_and_done(p))
		return;

	p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
	if (SCX_HAS_OP(sch, set_weight))
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
				 p, p->scx.weight);
}

/* Intentionally empty: nothing is done here on a priority change. */
static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
{
}

/*
 * @p is about to switch into the SCX class. Enable it on its scheduler and
 * bring the BPF scheduler's view of @p's cpumask up to date. Ignored for tasks
 * which are dead and done.
 */
static void switching_to_scx(struct rq *rq, struct task_struct *p)
{
	struct scx_sched *sch = scx_task_sched(p);

	if (task_dead_and_done(p))
		return;

	scx_enable_task(sch, p);

	/*
	 * set_cpus_allowed_scx() is not called while @p is associated with a
	 * different scheduler class. Keep the BPF scheduler up-to-date.
	 */
	if (SCX_HAS_OP(sch, set_cpumask))
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq,
				 p, (struct cpumask *)p->cpus_ptr);
}

/*
 * @p has switched away from the SCX class. Disable it on its scheduler unless
 * it is dead and done.
 */
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
	if (task_dead_and_done(p))
		return;

	scx_disable_task(scx_task_sched(p), p);
}

/* Intentionally empty, see switching_to_scx() for the switch-in work. */
static void switched_to_scx(struct rq *rq, struct task_struct *p) {}

/*
 * Check whether @p may change its policy to @policy. Returns -%EACCES when SCX
 * is enabled, @p->scx.disallow is set and the change is a transition into
 * %SCHED_EXT, and 0 otherwise. Must be called with @p's rq locked.
 */
int scx_check_setscheduler(struct task_struct *p, int policy)
{
	lockdep_assert_rq_held(task_rq(p));

	/* if disallow, reject transitioning into SCX */
	if (scx_enabled() && READ_ONCE(p->scx.disallow) &&
	    p->policy != policy && policy == SCHED_EXT)
		return -EACCES;

	return 0;
}

/*
 * Drain @rq's ddsp_deferred_locals list, dispatching each task to the local
 * DSQ it was directly dispatched to. Must be called with @rq locked.
 */
static void process_ddsp_deferred_locals(struct rq *rq)
{
	struct task_struct *p;

	lockdep_assert_rq_held(rq);

	/*
	 * Now that @rq can be unlocked, execute the deferred enqueueing of
	 * tasks directly dispatched to the local DSQs of other CPUs. See
	 * direct_dispatch(). Keep popping from the head instead of using
	 * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq
	 * temporarily.
	 */
	while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
				struct task_struct, scx.dsq_list.node))) {
		struct scx_sched *sch = scx_task_sched(p);
		struct scx_dispatch_q *dsq;

		list_del_init(&p->scx.dsq_list.node);

		dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
		/* only local DSQs are expected on this list */
		if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
			dispatch_to_local_dsq(sch, rq, dsq, p,
					      p->scx.ddsp_enq_flags);
	}
}

/*
 * Determine whether @p should be reenqueued from a local DSQ.
 *
 * @reenq_flags is mutable and accumulates state across the DSQ walk:
 *
 * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First"
 *   tracks position in the DSQ list, not among IMMED tasks.
 *   A non-IMMED task at the head consumes the first slot.
 *
 * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if
 *   rq_is_open() is true.
 *
 * An IMMED task is kept (returns %false) only if it's the first task in the DSQ
 * AND the current task is done — i.e. it will execute immediately. All other
 * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head,
 * every IMMED task behind it gets reenqueued.
 *
 * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ |
 * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local
 * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers
 * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT
 * in process_deferred_reenq_locals().
 *
 * On return, *@reason holds the %SCX_TASK_REENQ_* reason to tag @p with:
 * %SCX_TASK_REENQ_IMMED for the IMMED case above and %SCX_TASK_REENQ_KFUNC
 * otherwise.
 */
static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason)
{
	bool first;

	first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST);
	*reenq_flags |= SCX_REENQ_TSR_NOT_FIRST;

	*reason = SCX_TASK_REENQ_KFUNC;

	if ((p->scx.flags & SCX_TASK_IMMED) &&
	    (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) {
		__scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1);
		*reason = SCX_TASK_REENQ_IMMED;
		return true;
	}

	/* everything else is reenqueued only if the caller asked for all tasks */
	return *reenq_flags & SCX_REENQ_ANY;
}

/*
 * Reenqueue tasks sitting on @rq's local DSQ which belong to @sch or one of its
 * descendants and for which local_task_should_reenq() says so. @reenq_flags
 * must not contain any %__SCX_REENQ_TSR_MASK bits. Returns the number of tasks
 * reenqueued. Must be called with @rq locked.
 */
static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
{
	LIST_HEAD(tasks);
	u32 nr_enqueued = 0;
	struct task_struct *p, *n;

	lockdep_assert_rq_held(rq);

	/* the TSR bits are internal walk state, strip them if passed in */
	if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK))
		reenq_flags &= ~__SCX_REENQ_TSR_MASK;
	if (rq_is_open(rq, 0))
		reenq_flags |= SCX_REENQ_TSR_RQ_OPEN;

	/*
	 * The BPF scheduler may choose to dispatch tasks back to
	 * @rq->scx.local_dsq.
Move all candidate tasks off to a private list
3950 	 * first to avoid processing the same tasks repeatedly.
3951 	 */
3952 	list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
3953 				 scx.dsq_list.node) {
3954 		struct scx_sched *task_sch = scx_task_sched(p);
3955 		u32 reason;
3956
3957 		/*
3958 		 * If @p is being migrated, @p's current CPU may not agree with
3959 		 * its allowed CPUs and the migration_cpu_stop is about to
3960 		 * deactivate and re-activate @p anyway. Skip re-enqueueing.
3961 		 *
3962 		 * While racing sched property changes may also dequeue and
3963 		 * re-enqueue a migrating task while its current CPU and allowed
3964 		 * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
3965 		 * the current local DSQ for running tasks and thus are not
3966 		 * visible to the BPF scheduler.
3967 		 */
3968 		if (p->migration_pending)
3969 			continue;
3970
		/* only tasks owned by the requesting scheduler or its descendants */
3971 		if (!scx_is_descendant(task_sch, sch))
3972 			continue;
3973
3974 		if (!local_task_should_reenq(p, &reenq_flags, &reason))
3975 			continue;
3976
3977 		dispatch_dequeue(rq, p);
3978
		/* no reason bits should be left over from an earlier reenqueue */
3979 		if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
3980 			p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
3981 		p->scx.flags |= reason;
3982
3983 		list_add_tail(&p->scx.dsq_list.node, &tasks);
3984 	}
3985
	/* second pass: feed the collected tasks back into the enqueue path */
3986 	list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
3987 		list_del_init(&p->scx.dsq_list.node);
3988
3989 		do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
3990
		/* the reason is only meaningful while do_enqueue_task() runs */
3991 		p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
3992 		nr_enqueued++;
3993 	}
3994
3995 	return nr_enqueued;
3996 }
3997
/*
 * Process the entries queued on @rq->scx.deferred_reenq_locals until the list
 * is empty. Each entry is popped and its flags are consumed under
 * @rq->scx.deferred_reenq_lock; reenq_local() is then run for the owning
 * scheduler outside of that lock.
 *
 * The per-@rq sequence number is bumped once per invocation. An entry whose
 * ->seq already equals the current sequence has been seen before in this
 * invocation; its ->cnt is incremented and, once it exceeds
 * %SCX_REENQ_LOCAL_MAX_REPEAT, scx_error() is raised and the reenqueue is
 * skipped. @rq must be locked (asserted).
 */
3998 static void process_deferred_reenq_locals(struct rq *rq)
3999 {
4000 	u64 seq = ++rq->scx.deferred_reenq_locals_seq;
4001
4002 	lockdep_assert_rq_held(rq);
4003
4004 	while (true) {
4005 		struct scx_sched *sch;
4006 		u64 reenq_flags;
4007 		bool skip = false;
4008
4009 		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
4010 			struct scx_deferred_reenq_local *drl =
4011 				list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
4012 							 struct scx_deferred_reenq_local,
4013 							 node);
4014 			struct scx_sched_pcpu *sch_pcpu;
4015
			/* list drained; this is the only exit from the loop */
4016 			if (!drl)
4017 				return;
4018
4019 			sch_pcpu = container_of(drl, struct scx_sched_pcpu,
4020 						deferred_reenq_local);
4021 			sch = sch_pcpu->sch;
4022
			/* consume the requested flags and take the entry off the list */
4023 			reenq_flags = drl->flags;
4024 			WRITE_ONCE(drl->flags, 0);
4025 			list_del_init(&drl->node);
4026
			/* first sighting in this invocation resets the repeat counter */
4027 			if (likely(drl->seq != seq)) {
4028 				drl->seq = seq;
4029 				drl->cnt = 0;
4030 			} else {
4031 				if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) {
4032 					scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times",
4033 						  drl->cnt);
4034 					skip = true;
4035 				}
4036
4037 				__scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1);
4038 			}
4039 		}
4040
4041 		if (!skip) {
4042 			/* see schedule_dsq_reenq() */
4043 			smp_mb();
4044
4045 			reenq_local(sch, rq, reenq_flags);
4046 		}
4047 	}
4048 }
4049
/*
 * Decide whether @p on a user DSQ should be reenqueued. The reason is always
 * reported as %SCX_TASK_REENQ_KFUNC and the result is true iff
 * %SCX_REENQ_ANY is set in @reenq_flags. @p itself is not inspected.
 */
4050 static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason)
4051 {
4052 	*reason = SCX_TASK_REENQ_KFUNC;
4053 	return reenq_flags & SCX_REENQ_ANY;
4054 }
4055
/*
 * Reenqueue tasks queued on the user DSQ @dsq. @dsq is walked with a cursor
 * and, for each task accepted by user_task_should_reenq(), the task's rq is
 * locked, the task is removed from @dsq and passed to do_enqueue_task() with
 * %SCX_ENQ_REENQ.
 *
 * Called with @rq locked (asserted) and returns with @rq locked, but @rq may
 * be unlocked and other rqs locked in between. The walk ends when the cursor
 * finds no more tasks or when @dsq->sched->bypass_depth is read as non-zero.
 */
4056 static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags)
4057 {
4058 	struct rq *locked_rq = rq;
4059 	struct scx_sched *sch = dsq->sched;
4060 	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
4061 	struct task_struct *p;
4062 	s32 nr_enqueued = 0;
4063
4064 	lockdep_assert_rq_held(rq);
4065
4066 	raw_spin_lock(&dsq->lock);
4067
4068 	while (likely(!READ_ONCE(sch->bypass_depth))) {
4069 		struct rq *task_rq;
4070 		u32 reason;
4071
4072 		p = nldsq_cursor_next_task(&cursor, dsq);
4073 		if (!p)
4074 			break;
4075
4076 		if (!user_task_should_reenq(p, reenq_flags, &reason))
4077 			continue;
4078
4079 		task_rq = task_rq(p);
4080
		/*
		 * Switch the held rq lock over to @p's rq. If the trylock
		 * fails, @dsq->lock is dropped so that the rq lock can be
		 * taken first and @dsq->lock after it.
		 */
4081 		if (locked_rq != task_rq) {
4082 			if (locked_rq)
4083 				raw_spin_rq_unlock(locked_rq);
4084 			if (unlikely(!raw_spin_rq_trylock(task_rq))) {
4085 				raw_spin_unlock(&dsq->lock);
4086 				raw_spin_rq_lock(task_rq);
4087 				raw_spin_lock(&dsq->lock);
4088 			}
4089 			locked_rq = task_rq;
4090
4091 			/* did we lose @p while switching locks?
*/
4092 			if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
4093 				continue;
4094 		}
4095
4096 		/* @p is on @dsq, its rq and @dsq are locked */
4097 		dispatch_dequeue_locked(p, dsq);
4098 		raw_spin_unlock(&dsq->lock);
4099
		/* no reason bits should be left over from an earlier reenqueue */
4100 		if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
4101 			p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
4102 		p->scx.flags |= reason;
4103
4104 		do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);
4105
4106 		p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
4107
		/* after every SCX_TASK_ITER_BATCH reenqueues, drop the rq lock and relax */
4108 		if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) {
4109 			raw_spin_rq_unlock(locked_rq);
4110 			locked_rq = NULL;
4111 			cpu_relax();
4112 		}
4113
		/* retake @dsq->lock for the next cursor step */
4114 		raw_spin_lock(&dsq->lock);
4115 	}
4116
	/* take the on-stack cursor off @dsq before it goes out of scope */
4117 	list_del_init(&cursor.node);
4118 	raw_spin_unlock(&dsq->lock);
4119
	/* restore the locking state the caller expects: exactly @rq held */
4120 	if (locked_rq != rq) {
4121 		if (locked_rq)
4122 			raw_spin_rq_unlock(locked_rq);
4123 		raw_spin_rq_lock(rq);
4124 	}
4125 }
4126
/*
 * Process the entries queued on @rq->scx.deferred_reenq_users until the list
 * is empty. Each entry is popped and its flags are consumed under
 * @rq->scx.deferred_reenq_lock; reenq_user() is then run on the DSQ the entry
 * belongs to. A built-in DSQ showing up here is treated as a fatal bug.
 * @rq must be locked (asserted).
 */
4127 static void process_deferred_reenq_users(struct rq *rq)
4128 {
4129 	lockdep_assert_rq_held(rq);
4130
4131 	while (true) {
4132 		struct scx_dispatch_q *dsq;
4133 		u64 reenq_flags;
4134
4135 		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
4136 			struct scx_deferred_reenq_user *dru =
4137 				list_first_entry_or_null(&rq->scx.deferred_reenq_users,
4138 							 struct scx_deferred_reenq_user,
4139 							 node);
4140 			struct scx_dsq_pcpu *dsq_pcpu;
4141
			/* list drained; this is the only exit from the loop */
4142 			if (!dru)
4143 				return;
4144
4145 			dsq_pcpu = container_of(dru, struct scx_dsq_pcpu,
4146 						deferred_reenq_user);
4147 			dsq = dsq_pcpu->dsq;
4148 			reenq_flags = dru->flags;
4149 			WRITE_ONCE(dru->flags, 0);
4150 			list_del_init(&dru->node);
4151 		}
4152
4153 		/* see schedule_dsq_reenq() */
4154 		smp_mb();
4155
4156 		BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
4157 		reenq_user(rq, dsq, reenq_flags);
4158 	}
4159 }
4160
/*
 * Run the deferred work queued on @rq: the deferred direct dispatches first,
 * then the deferred local-DSQ reenqueues and the deferred user-DSQ reenqueues
 * if their respective lists are non-empty.
 */
4161 static void run_deferred(struct rq *rq)
4162 {
4163 	process_ddsp_deferred_locals(rq);
4164
4165 	if (!list_empty(&rq->scx.deferred_reenq_locals))
4166 		process_deferred_reenq_locals(rq);
4167
4168 	if (!list_empty(&rq->scx.deferred_reenq_users))
4169
process_deferred_reenq_users(rq);
4170 }
4171
4172 #ifdef CONFIG_NO_HZ_FULL
/*
 * Tell whether the tick may be stopped on @rq. Returns true when @rq's
 * current task is not in the ext sched class, false while that task's
 * scheduler is bypassing on @rq's CPU, and otherwise whether
 * %SCX_RQ_CAN_STOP_TICK is set in @rq->scx.flags.
 */
4173 bool scx_can_stop_tick(struct rq *rq)
4174 {
4175 	struct task_struct *p = rq->curr;
4176 	struct scx_sched *sch = scx_task_sched(p);
4177
4178 	if (p->sched_class != &ext_sched_class)
4179 		return true;
4180
4181 	if (scx_bypassing(sch, cpu_of(rq)))
4182 		return false;
4183
4184 	/*
4185 	 * @rq can dispatch from different DSQs, so we can't tell whether it
4186 	 * needs the tick or not by looking at nr_running. Allow stopping ticks
4187 	 * iff the BPF scheduler indicated so. See set_next_task_scx().
4188 	 */
4189 	return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
4190 }
4191 #endif
4192
4193 #ifdef CONFIG_EXT_GROUP_SCHED
4194
4195 DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem);
4196 static bool scx_cgroup_enabled;
4197
/*
 * Initialize the sched_ext fields of @tg: weight to CGROUP_WEIGHT_DFL,
 * bandwidth period to default_bw_period_us(), bandwidth quota to RUNTIME_INF
 * and idle to false.
 */
4198 void scx_tg_init(struct task_group *tg)
4199 {
4200 	tg->scx.weight = CGROUP_WEIGHT_DFL;
4201 	tg->scx.bw_period_us = default_bw_period_us();
4202 	tg->scx.bw_quota_us = RUNTIME_INF;
4203 	tg->scx.idle = false;
4204 }
4205
/*
 * Bring @tg online. While cgroup ops are enabled, ops.cgroup_init() is called
 * first if implemented; on failure the sanitized error is returned and
 * @tg->scx.flags is left unchanged, on success both %SCX_TG_ONLINE and
 * %SCX_TG_INITED are set. While cgroup ops are not enabled, only
 * %SCX_TG_ONLINE is set. Returns 0 on success.
 */
4206 int scx_tg_online(struct task_group *tg)
4207 {
4208 	struct scx_sched *sch = scx_root;
4209 	int ret = 0;
4210
4211 	WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
4212
4213 	if (scx_cgroup_enabled) {
4214 		if (SCX_HAS_OP(sch, cgroup_init)) {
4215 			struct scx_cgroup_init_args args =
4216 				{ .weight = tg->scx.weight,
4217 				  .bw_period_us = tg->scx.bw_period_us,
4218 				  .bw_quota_us = tg->scx.bw_quota_us,
4219 				  .bw_burst_us = tg->scx.bw_burst_us };
4220
4221 			ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
4222 					      NULL, tg->css.cgroup, &args);
4223 			if (ret)
4224 				ret = ops_sanitize_err(sch, "cgroup_init", ret);
4225 		}
4226 		if (ret == 0)
4227 			tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED;
4228 	} else {
4229 		tg->scx.flags |= SCX_TG_ONLINE;
4230 	}
4231
4232 	return ret;
4233 }
4234
/*
 * Take @tg offline. ops.cgroup_exit() is called only when cgroup ops are
 * enabled, the op is implemented and @tg had been initialized
 * (%SCX_TG_INITED). Both %SCX_TG_ONLINE and %SCX_TG_INITED are cleared
 * unconditionally.
 */
4235 void scx_tg_offline(struct task_group *tg)
4236 {
4237 	struct scx_sched *sch = scx_root;
4238
4239 	WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
4240
4241 	if
(scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
4242 	    (tg->scx.flags & SCX_TG_INITED))
4243 		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
4244 			    tg->css.cgroup);
4245 	tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
4246 }
4247
/*
 * Prepare the cgroup migration of every task in @tset. For each task that
 * actually changes cgroup, ops.cgroup_prep_move() is called if implemented
 * and the source cgroup is recorded in @p->scx.cgrp_moving_from.
 *
 * If a prep call fails, ops.cgroup_cancel_move() is called for the tasks
 * which already have cgrp_moving_from set, cgrp_moving_from is cleared on
 * all tasks of @tset and the sanitized error is returned. Returns 0 on
 * success or when cgroup ops are not enabled.
 */
4248 int scx_cgroup_can_attach(struct cgroup_taskset *tset)
4249 {
4250 	struct scx_sched *sch = scx_root;
4251 	struct cgroup_subsys_state *css;
4252 	struct task_struct *p;
4253 	int ret;
4254
4255 	if (!scx_cgroup_enabled)
4256 		return 0;
4257
4258 	cgroup_taskset_for_each(p, css, tset) {
4259 		struct cgroup *from = tg_cgrp(task_group(p));
4260 		struct cgroup *to = tg_cgrp(css_tg(css));
4261
4262 		WARN_ON_ONCE(p->scx.cgrp_moving_from);
4263
4264 		/*
4265 		 * sched_move_task() omits identity migrations. Let's match the
4266 		 * behavior so that ops.cgroup_prep_move() and ops.cgroup_move()
4267 		 * always match one-to-one.
4268 		 */
4269 		if (from == to)
4270 			continue;
4271
4272 		if (SCX_HAS_OP(sch, cgroup_prep_move)) {
4273 			ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED,
4274 					      cgroup_prep_move, NULL,
4275 					      p, from, css->cgroup);
4276 			if (ret)
4277 				goto err;
4278 		}
4279
		/* marks @p as prepared; consumed by the move/cancel paths */
4280 		p->scx.cgrp_moving_from = from;
4281 	}
4282
4283 	return 0;
4284
4285 err:
	/* undo: only tasks with cgrp_moving_from set went through prep */
4286 	cgroup_taskset_for_each(p, css, tset) {
4287 		if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
4288 		    p->scx.cgrp_moving_from)
4289 			SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
4290 				    p, p->scx.cgrp_moving_from, css->cgroup);
4291 		p->scx.cgrp_moving_from = NULL;
4292 	}
4293
4294 	return ops_sanitize_err(sch, "cgroup_prep_move", ret);
4295 }
4296
/*
 * Notify the BPF scheduler that @p has moved to its current task group's
 * cgroup. ops.cgroup_move() is called, if implemented, with the source cgroup
 * recorded in @p->scx.cgrp_moving_from, which is cleared afterwards. Does
 * nothing when cgroup ops are not enabled.
 */
4297 void scx_cgroup_move_task(struct task_struct *p)
4298 {
4299 	struct scx_sched *sch = scx_root;
4300
4301 	if (!scx_cgroup_enabled)
4302 		return;
4303
4304 	/*
4305 	 * @p must have ops.cgroup_prep_move() called on it and thus
4306 	 * cgrp_moving_from set.
4307 	 */
4308 	if (SCX_HAS_OP(sch, cgroup_move) &&
4309 	    !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
4310 		SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL,
4311 				 p, p->scx.cgrp_moving_from,
4312 				 tg_cgrp(task_group(p)));
4313 	p->scx.cgrp_moving_from = NULL;
4314 }
4315
/*
 * Cancel a prepared cgroup migration. For every task in @tset which has
 * cgrp_moving_from set, ops.cgroup_cancel_move() is called if implemented;
 * cgrp_moving_from is cleared on all tasks of @tset. Does nothing when cgroup
 * ops are not enabled.
 */
4316 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
4317 {
4318 	struct scx_sched *sch = scx_root;
4319 	struct cgroup_subsys_state *css;
4320 	struct task_struct *p;
4321
4322 	if (!scx_cgroup_enabled)
4323 		return;
4324
4325 	cgroup_taskset_for_each(p, css, tset) {
4326 		if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
4327 		    p->scx.cgrp_moving_from)
4328 			SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
4329 				    p, p->scx.cgrp_moving_from, css->cgroup);
4330 		p->scx.cgrp_moving_from = NULL;
4331 	}
4332 }
4333
/*
 * Update @tg's weight. ops.cgroup_set_weight() is called only when cgroup ops
 * are enabled, the op is implemented and @weight differs from the stored
 * value. The stored weight is updated in all cases. Runs under the read side
 * of scx_cgroup_ops_rwsem.
 */
4334 void scx_group_set_weight(struct task_group *tg, unsigned long weight)
4335 {
4336 	struct scx_sched *sch = scx_root;
4337
4338 	percpu_down_read(&scx_cgroup_ops_rwsem);
4339
4340 	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
4341 	    tg->scx.weight != weight)
4342 		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
4343 			    tg_cgrp(tg), weight);
4344
4345 	tg->scx.weight = weight;
4346
4347 	percpu_up_read(&scx_cgroup_ops_rwsem);
4348 }
4349
/*
 * Update @tg's idle state. ops.cgroup_set_idle() is called when cgroup ops
 * are enabled and the op is implemented; unlike the weight and bandwidth
 * setters, the call is made even if @idle equals the stored value. Runs under
 * the read side of scx_cgroup_ops_rwsem.
 */
4350 void scx_group_set_idle(struct task_group *tg, bool idle)
4351 {
4352 	struct scx_sched *sch = scx_root;
4353
4354 	percpu_down_read(&scx_cgroup_ops_rwsem);
4355
4356 	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
4357 		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL,
4358 			    tg_cgrp(tg), idle);
4359
4360 	/* Update the task group's idle state */
4361 	tg->scx.idle = idle;
4362
4363 	percpu_up_read(&scx_cgroup_ops_rwsem);
4364 }
4365
/*
 * Update @tg's bandwidth parameters. ops.cgroup_set_bandwidth() is called
 * only when cgroup ops are enabled, the op is implemented and at least one of
 * the three values differs from what is stored. The stored values are updated
 * in all cases. Runs under the read side of scx_cgroup_ops_rwsem.
 */
4366 void scx_group_set_bandwidth(struct task_group *tg,
4367 			     u64 period_us, u64 quota_us, u64 burst_us)
4368 {
4369 	struct scx_sched *sch = scx_root;
4370
4371 	percpu_down_read(&scx_cgroup_ops_rwsem);
4372
4373 	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
4374
(tg->scx.bw_period_us != period_us ||
4375 	     tg->scx.bw_quota_us != quota_us ||
4376 	     tg->scx.bw_burst_us != burst_us))
4377 		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
4378 			    tg_cgrp(tg), period_us, quota_us, burst_us);
4379
4380 	tg->scx.bw_period_us = period_us;
4381 	tg->scx.bw_quota_us = quota_us;
4382 	tg->scx.bw_burst_us = burst_us;
4383
4384 	percpu_up_read(&scx_cgroup_ops_rwsem);
4385 }
4386 #endif	/* CONFIG_EXT_GROUP_SCHED */
4387
4388 #if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
/* Return the root cgroup of the default hierarchy. */
4389 static struct cgroup *root_cgroup(void)
4390 {
4391 	return &cgrp_dfl_root.cgrp;
4392 }
4393
/* Return the cgroup stored in @sch->cgrp. */
4394 static struct cgroup *sch_cgroup(struct scx_sched *sch)
4395 {
4396 	return sch->cgrp;
4397 }
4398
4399 /* for each descendant of @cgrp including self, set ->scx_sched to @sch */
4400 static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
4401 {
4402 	struct cgroup *pos;
4403 	struct cgroup_subsys_state *css;
4404
4405 	cgroup_for_each_live_descendant_pre(pos, css, cgrp)
4406 		rcu_assign_pointer(pos->scx_sched, sch);
4407 }
4408
/*
 * Take the locks excluding cgroup operations: the write side of
 * scx_cgroup_ops_rwsem (only with CONFIG_EXT_GROUP_SCHED) followed by
 * cgroup_lock(). scx_cgroup_unlock() releases them in reverse order.
 */
4409 static void scx_cgroup_lock(void)
4410 {
4411 #ifdef CONFIG_EXT_GROUP_SCHED
4412 	percpu_down_write(&scx_cgroup_ops_rwsem);
4413 #endif
4414 	cgroup_lock();
4415 }
4416
/* Counterpart of scx_cgroup_lock(); unlocks in the reverse order. */
4417 static void scx_cgroup_unlock(void)
4418 {
4419 	cgroup_unlock();
4420 #ifdef CONFIG_EXT_GROUP_SCHED
4421 	percpu_up_write(&scx_cgroup_ops_rwsem);
4422 #endif
4423 }
4424 #else	/* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
/* stubs used when neither config option is enabled */
4425 static struct cgroup *root_cgroup(void) { return NULL; }
4426 static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
4427 static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
4428 static void scx_cgroup_lock(void) {}
4429 static void scx_cgroup_unlock(void) {}
4430 #endif	/* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
4431
4432 /*
4433  * Omitted operations:
4434  *
4435  * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
4436  *
4437  * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
4438  *   their current sched_class. Call them directly from sched core instead.
4439  */
4440 DEFINE_SCHED_CLASS(ext) = {
4441 	.enqueue_task		= enqueue_task_scx,
4442 	.dequeue_task		= dequeue_task_scx,
4443 	.yield_task		= yield_task_scx,
4444 	.yield_to_task		= yield_to_task_scx,
4445
4446 	.wakeup_preempt		= wakeup_preempt_scx,
4447
4448 	.pick_task		= pick_task_scx,
4449
4450 	.put_prev_task		= put_prev_task_scx,
4451 	.set_next_task		= set_next_task_scx,
4452
4453 	.select_task_rq		= select_task_rq_scx,
4454 	.task_woken		= task_woken_scx,
4455 	.set_cpus_allowed	= set_cpus_allowed_scx,
4456
4457 	.rq_online		= rq_online_scx,
4458 	.rq_offline		= rq_offline_scx,
4459
4460 	.task_tick		= task_tick_scx,
4461
4462 	.switching_to		= switching_to_scx,
4463 	.switched_from		= switched_from_scx,
4464 	.switched_to		= switched_to_scx,
4465 	.reweight_task		= reweight_task_scx,
4466 	.prio_changed		= prio_changed_scx,
4467
4468 	.update_curr		= update_curr_scx,
4469
4470 #ifdef CONFIG_UCLAMP_TASK
4471 	.uclamp_enabled		= 1,
4472 #endif
4473 };
4474
/*
 * Initialize @dsq with id @dsq_id and owner @sch. @dsq is zeroed first, then
 * its lock and task list are set up and the per-CPU area is allocated, with
 * each per-CPU entry pointing back at @dsq and starting off the deferred
 * reenqueue list.
 *
 * Returns 0 on success or -ENOMEM if the per-CPU allocation fails.
 */
4475 static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
4476 		    struct scx_sched *sch)
4477 {
4478 	s32 cpu;
4479
4480 	memset(dsq, 0, sizeof(*dsq));
4481
4482 	raw_spin_lock_init(&dsq->lock);
4483 	INIT_LIST_HEAD(&dsq->list);
4484 	dsq->id = dsq_id;
4485 	dsq->sched = sch;
4486
4487 	dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu);
4488 	if (!dsq->pcpu)
4489 		return -ENOMEM;
4490
4491 	for_each_possible_cpu(cpu) {
4492 		struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
4493
4494 		pcpu->dsq = dsq;
4495 		INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node);
4496 	}
4497
4498 	return 0;
4499 }
4500
/*
 * Release what init_dsq() set up for @dsq. Any per-CPU deferred reenqueue
 * node that is unexpectedly still queued triggers a one-time warning and is
 * unlinked under the owning rq's deferred_reenq_lock before the per-CPU area
 * is freed. @dsq itself is not freed here.
 */
4501 static void exit_dsq(struct scx_dispatch_q *dsq)
4502 {
4503 	s32 cpu;
4504
4505 	for_each_possible_cpu(cpu) {
4506 		struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
4507 		struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user;
4508 		struct rq *rq = cpu_rq(cpu);
4509
4510 		/*
4511 		 * There must have been a RCU grace period since the last
4512 		 * insertion and @dsq should be off the deferred list by now.
4513 		 */
4514 		if (WARN_ON_ONCE(!list_empty(&dru->node))) {
4515 			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
4516 			list_del_init(&dru->node);
4517 		}
4518 	}
4519
4520 	free_percpu(dsq->pcpu);
4521 }
4522
/* RCU callback: tear down and free the DSQ embedding @rcu. */
4523 static void free_dsq_rcufn(struct rcu_head *rcu)
4524 {
4525 	struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu);
4526
4527 	exit_dsq(dsq);
4528 	kfree(dsq);
4529 }
4530
/*
 * irq_work handler: grab everything queued on dsqs_to_free and schedule each
 * DSQ for freeing through call_rcu().
 */
4531 static void free_dsq_irq_workfn(struct irq_work *irq_work)
4532 {
4533 	struct llist_node *to_free = llist_del_all(&dsqs_to_free);
4534 	struct scx_dispatch_q *dsq, *tmp_dsq;
4535
4536 	llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
4537 		call_rcu(&dsq->rcu, free_dsq_rcufn);
4538 }
4539
4540 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
4541
/*
 * Destroy the user DSQ of @sch identified by @dsq_id. Nothing happens if no
 * such DSQ is found. A DSQ that still holds tasks is not destroyed and
 * scx_error() is raised instead. Otherwise the DSQ is removed from
 * @sch->dsq_hash, marked dead and queued for deferred freeing; if the hash
 * removal fails the DSQ is left alone.
 */
4542 static void destroy_dsq(struct scx_sched *sch, u64 dsq_id)
4543 {
4544 	struct scx_dispatch_q *dsq;
4545 	unsigned long flags;
4546
4547 	rcu_read_lock();
4548
4549 	dsq = find_user_dsq(sch, dsq_id);
4550 	if (!dsq)
4551 		goto out_unlock_rcu;
4552
4553 	raw_spin_lock_irqsave(&dsq->lock, flags);
4554
4555 	if (dsq->nr) {
4556 		scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)",
4557 			  dsq->id, dsq->nr);
4558 		goto out_unlock_dsq;
4559 	}
4560
4561 	if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node,
4562 				   dsq_hash_params))
4563 		goto out_unlock_dsq;
4564
4565 	/*
4566 	 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from
4567 	 * queueing more tasks. As this function can be called from anywhere,
4568 	 * freeing is bounced through an irq work to avoid nesting RCU
4569 	 * operations inside scheduler locks.
4570 	 */
4571 	dsq->id = SCX_DSQ_INVALID;
	/* the irq work is queued only when llist_add() returns true */
4572 	if (llist_add(&dsq->free_node, &dsqs_to_free))
4573 		irq_work_queue(&free_dsq_irq_work);
4574
4575 out_unlock_dsq:
4576 	raw_spin_unlock_irqrestore(&dsq->lock, flags);
4577 out_unlock_rcu:
4578 	rcu_read_unlock();
4579 }
4580
4581 #ifdef CONFIG_EXT_GROUP_SCHED
/*
 * Disable cgroup ops for @sch. scx_cgroup_enabled is cleared first, then all
 * task groups are walked in post-order; every group with %SCX_TG_INITED set
 * has the flag cleared and ops.cgroup_exit() called on it if the op is
 * implemented.
 */
4582 static void scx_cgroup_exit(struct scx_sched *sch)
4583 {
4584 	struct cgroup_subsys_state *css;
4585
4586 	scx_cgroup_enabled = false;
4587
4588 	/*
4589 	 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
4590 	 * cgroups and exit all the inited ones, all online cgroups are exited.
4591 	 */
4592 	css_for_each_descendant_post(css, &root_task_group.css) {
4593 		struct task_group *tg = css_tg(css);
4594
4595 		if (!(tg->scx.flags & SCX_TG_INITED))
4596 			continue;
4597 		tg->scx.flags &= ~SCX_TG_INITED;
4598
4599 		if (!sch->ops.cgroup_exit)
4600 			continue;
4601
4602 		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
4603 			    css->cgroup);
4604 	}
4605 }
4606
/*
 * Enable cgroup ops for @sch. All task groups are walked in pre-order and
 * each group that is online but not yet initialized gets ops.cgroup_init()
 * called on it (if implemented) and %SCX_TG_INITED set. On the first failing
 * ops.cgroup_init(), scx_error() is raised and the error is returned without
 * setting scx_cgroup_enabled; groups initialized so far keep %SCX_TG_INITED.
 * Returns 0 on success.
 */
4607 static int scx_cgroup_init(struct scx_sched *sch)
4608 {
4609 	struct cgroup_subsys_state *css;
4610 	int ret;
4611
4612 	/*
4613 	 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
4614 	 * cgroups and init, all online cgroups are initialized.
4615 	 */
4616 	css_for_each_descendant_pre(css, &root_task_group.css) {
4617 		struct task_group *tg = css_tg(css);
4618 		struct scx_cgroup_init_args args = {
4619 			.weight = tg->scx.weight,
4620 			.bw_period_us = tg->scx.bw_period_us,
4621 			.bw_quota_us = tg->scx.bw_quota_us,
4622 			.bw_burst_us = tg->scx.bw_burst_us,
4623 		};
4624
		/* only groups that are online and not yet initialized */
4625 		if ((tg->scx.flags &
4626 		     (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
4627 			continue;
4628
		/* no init op: nothing to call, just mark the group initialized */
4629 		if (!sch->ops.cgroup_init) {
4630 			tg->scx.flags |= SCX_TG_INITED;
4631 			continue;
4632 		}
4633
4634 		ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
4635 				      css->cgroup, &args);
4636 		if (ret) {
4637 			scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
4638 			return ret;
4639 		}
4640 		tg->scx.flags |= SCX_TG_INITED;
4641 	}
4642
4643 	WARN_ON_ONCE(scx_cgroup_enabled);
4644 	scx_cgroup_enabled = true;
4645
4646 	return 0;
4647 }
4648
4649 #else
/* stubs used when CONFIG_EXT_GROUP_SCHED is disabled */
4650 static void scx_cgroup_exit(struct scx_sched *sch) {}
4651 static int scx_cgroup_init(struct scx_sched *sch) { return 0; }
4652 #endif
4653
4654
4655 /********************************************************************************
4656  * Sysfs interface and ops enable/disable.
4657  */
4658
/*
 * Define a read-only (0444) kobj_attribute named scx_attr_<_name> whose show
 * callback is scx_attr_<_name>_show(), which must already be declared.
 */
4659 #define SCX_ATTR(_name)								\
4660 	static struct kobj_attribute scx_attr_##_name = {			\
4661 		.attr = { .name = __stringify(_name), .mode = 0444 },		\
4662 		.show = scx_attr_##_name##_show,				\
4663 	}
4664
/* sysfs "state": string form of the current enable state */
4665 static ssize_t scx_attr_state_show(struct kobject *kobj,
4666 				   struct kobj_attribute *ka, char *buf)
4667 {
4668 	return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]);
4669 }
4670 SCX_ATTR(state);
4671
/* sysfs "switch_all": current value of scx_switching_all, printed as an integer */
4672 static ssize_t scx_attr_switch_all_show(struct kobject *kobj,
4673 					struct kobj_attribute *ka, char *buf)
4674 {
4675 	return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all));
4676 }
4677 SCX_ATTR(switch_all);
4678
/* sysfs "nr_rejected": current value of the scx_nr_rejected counter */
4679 static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj,
4680 					 struct kobj_attribute *ka, char *buf)
4681 {
4682 	return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected));
4683 }
4684 SCX_ATTR(nr_rejected);
4685
/* sysfs "hotplug_seq": current value of the scx_hotplug_seq counter */
4686 static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
4687 					 struct kobj_attribute *ka, char *buf)
4688 {
4689 	return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq));
4690 }
4691 SCX_ATTR(hotplug_seq);
4692
/* sysfs "enable_seq": current value of the scx_enable_seq counter */
4693 static ssize_t scx_attr_enable_seq_show(struct kobject *kobj,
4694 					struct kobj_attribute *ka, char *buf)
4695 {
4696 	return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq));
4697 }
4698 SCX_ATTR(enable_seq);
4699
/* NULL-terminated list of the global attributes defined above */
4700 static struct attribute *scx_global_attrs[] = {
4701 	&scx_attr_state.attr,
4702 	&scx_attr_switch_all.attr,
4703 	&scx_attr_nr_rejected.attr,
4704 	&scx_attr_hotplug_seq.attr,
4705 	&scx_attr_enable_seq.attr,
4706 	NULL,
4707 };
4708
4709 static const struct attribute_group scx_global_attr_group = {
4710 	.attrs = scx_global_attrs,
4711 };
4712
4713 static void free_pnode(struct scx_sched_pnode *pnode);
4714 static void free_exit_info(struct scx_exit_info *ei);
4715
/*
 * RCU work function which frees a scx_sched. It is queued from
 * scx_kobj_release(). It waits for the disable irq_work, destroys the helper
 * kthread worker and shuts down the bypass LB timer before releasing the
 * per-CPU state, the per-node state, the remaining user DSQs, the exit info
 * and finally the scx_sched itself.
 */
4716 static void scx_sched_free_rcu_work(struct work_struct *work)
4717 {
4718 	struct rcu_work *rcu_work = to_rcu_work(work);
4719 	struct scx_sched *sch =
container_of(rcu_work, struct scx_sched, rcu_work);
4720 	struct rhashtable_iter rht_iter;
4721 	struct scx_dispatch_q *dsq;
4722 	int cpu, node;
4723
	/* quiesce everything that may still run on behalf of @sch */
4724 	irq_work_sync(&sch->disable_irq_work);
4725 	kthread_destroy_worker(sch->helper);
4726 	timer_shutdown_sync(&sch->bypass_lb_timer);
4727
4728 #ifdef CONFIG_EXT_SUB_SCHED
4729 	kfree(sch->cgrp_path);
4730 	if (sch_cgroup(sch))
4731 		cgroup_put(sch_cgroup(sch));
4732 #endif	/* CONFIG_EXT_SUB_SCHED */
4733
4734 	for_each_possible_cpu(cpu) {
4735 		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
4736
4737 		/*
4738 		 * $sch would have entered bypass mode before the RCU grace
4739 		 * period. As that blocks new deferrals, all
4740 		 * deferred_reenq_local_node's must be off-list by now.
4741 		 */
4742 		WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node));
4743
4744 		exit_dsq(bypass_dsq(sch, cpu));
4745 	}
4746
4747 	free_percpu(sch->pcpu);
4748
4749 	for_each_node_state(node, N_POSSIBLE)
4750 		free_pnode(sch->pnode[node]);
4751 	kfree(sch->pnode);
4752
	/*
	 * Destroy every DSQ still in the hash table. The walk is restarted
	 * for as long as rhashtable_walk_next() reports -EAGAIN.
	 */
4753 	rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
4754 	do {
4755 		rhashtable_walk_start(&rht_iter);
4756
4757 		while (!IS_ERR_OR_NULL((dsq = rhashtable_walk_next(&rht_iter))))
4758 			destroy_dsq(sch, dsq->id);
4759
4760 		rhashtable_walk_stop(&rht_iter);
4761 	} while (dsq == ERR_PTR(-EAGAIN));
4762 	rhashtable_walk_exit(&rht_iter);
4763
4764 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
4765 	free_exit_info(sch->exit_info);
4766 	kfree(sch);
4767 }
4768
/*
 * kobject release callback (see scx_ktype). The actual freeing is deferred
 * to scx_sched_free_rcu_work() through an RCU work item on system_dfl_wq.
 */
4769 static void scx_kobj_release(struct kobject *kobj)
4770 {
4771 	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
4772
4773 	INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
4774 	queue_rcu_work(system_dfl_wq, &sch->rcu_work);
4775 }
4776
/* per-scheduler sysfs "ops": name from the scheduler's ops */
4777 static ssize_t scx_attr_ops_show(struct kobject *kobj,
4778 				 struct kobj_attribute *ka, char *buf)
4779 {
4780 	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
4781
4782 	return sysfs_emit(buf, "%s\n", sch->ops.name);
4783 }
4784
SCX_ATTR(ops);
4785
/*
 * Emit one "<kind> <value>" line for event counter @kind of @events into @buf
 * at offset @at. Evaluates to the return value of sysfs_emit_at().
 */
4786 #define scx_attr_event_show(buf, at, events, kind) ({				\
4787 	sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind);		\
4788 })
4789
/*
 * per-scheduler sysfs "events": a snapshot of the scheduler's event counters,
 * one "<name> <value>" line per counter. Returns the accumulated length.
 */
4790 static ssize_t scx_attr_events_show(struct kobject *kobj,
4791 				    struct kobj_attribute *ka, char *buf)
4792 {
4793 	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
4794 	struct scx_event_stats events;
4795 	int at = 0;
4796
4797 	scx_read_events(sch, &events);
4798 	at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
4799 	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
4800 	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
4801 	at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
4802 	at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
4803 	at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_IMMED);
4804 	at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_LOCAL_REPEAT);
4805 	at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL);
4806 	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
4807 	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
4808 	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
4809 	at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED);
4810 	at += scx_attr_event_show(buf, at, &events, SCX_EV_SUB_BYPASS_DISPATCH);
4811 	return at;
4812 }
4813 SCX_ATTR(events);
4814
/* NULL-terminated list of the per-scheduler attributes */
4815 static struct attribute *scx_sched_attrs[] = {
4816 	&scx_attr_ops.attr,
4817 	&scx_attr_events.attr,
4818 	NULL,
4819 };
4820 ATTRIBUTE_GROUPS(scx_sched);
4821
/* kobj_type of a scx_sched's kobject; release defers freeing to RCU work */
4822 static const struct kobj_type scx_ktype = {
4823 	.release = scx_kobj_release,
4824 	.sysfs_ops = &kobj_sysfs_ops,
4825 	.default_groups = scx_sched_groups,
4826 };
4827
/* uevent callback: add "SCXOPS=<ops name>" to the uevent environment */
4828 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
4829 {
4830 	const struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
4831
4832 	return add_uevent_var(env, "SCXOPS=%s", sch->ops.name);
4833 }
4834
4835 static const struct kset_uevent_ops scx_uevent_ops = {
4836 	.uevent = scx_uevent,
4837 };
4838
4839 /*
4840  * Used by sched_fork() and __setscheduler_prio() to pick the matching
4841  * sched_class. dl/rt are already handled.
 *
 * Returns false when sched_ext is not enabled or is in the SCX_DISABLING
 * state, true when scx_switching_all is set, and otherwise whether @policy
 * is SCHED_EXT.
4842  */
4843 bool task_should_scx(int policy)
4844 {
4845 	if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING))
4846 		return false;
4847 	if (READ_ONCE(scx_switching_all))
4848 		return true;
4849 	return policy == SCHED_EXT;
4850 }
4851
/*
 * Returns false only when sched_ext is enabled, @p has an associated
 * scheduler which did not set %SCX_OPS_ALLOW_QUEUED_WAKEUP and @p is in the
 * ext sched class. Returns true in every other case.
 */
4852 bool scx_allow_ttwu_queue(const struct task_struct *p)
4853 {
4854 	struct scx_sched *sch;
4855
4856 	if (!scx_enabled())
4857 		return true;
4858
4859 	sch = scx_task_sched(p);
4860 	if (unlikely(!sch))
4861 		return true;
4862
4863 	if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
4864 		return true;
4865
4866 	if (unlikely(p->sched_class != &ext_sched_class))
4867 		return true;
4868
4869 	return false;
4870 }
4871
4872 /**
4873  * handle_lockup - sched_ext common lockup handler
4874  * @fmt: format string
4875  *
4876  * Called on system stall or lockup condition and initiates abort of sched_ext
4877  * if enabled, which may resolve the reported lockup.
4878  *
4879  * Returns %true if sched_ext is enabled and abort was initiated, which may
4880  * resolve the lockup. %false if sched_ext is not enabled or abort was already
4881  * initiated by someone else.
4882  */
4883 static __printf(1, 2) bool handle_lockup(const char *fmt, ...)
4884 {
4885 	struct scx_sched *sch;
4886 	va_list args;
4887 	bool ret;
4888
	/* scx_root is dereferenced under RCU for the rest of the function */
4889 	guard(rcu)();
4890
4891 	sch = rcu_dereference(scx_root);
4892 	if (unlikely(!sch))
4893 		return false;
4894
	/* raise the error only while the scheduler is being enabled or is enabled */
4895 	switch (scx_enable_state()) {
4896 	case SCX_ENABLING:
4897 	case SCX_ENABLED:
4898 		va_start(args, fmt);
4899 		ret = scx_verror(sch, fmt, args);
4900 		va_end(args);
4901 		return ret;
4902 	default:
4903 		return false;
4904 	}
4905 }
4906
4907 /**
4908  * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
4909  *
4910  * While there are various reasons why RCU CPU stalls can occur on a system
4911  * that may not be caused by the current BPF scheduler, try kicking out the
4912  * current scheduler in an attempt to recover the system to a good state before
4913  * issuing panics.
4914  *
4915  * Returns %true if sched_ext is enabled and abort was initiated, which may
4916  * resolve the reported RCU stall. %false if sched_ext is not enabled or someone
4917  * else already initiated abort.
4918  */
4919 bool scx_rcu_cpu_stall(void)
4920 {
4921 	return handle_lockup("RCU CPU stall detected!");
4922 }
4923
4924 /**
4925  * scx_softlockup - sched_ext softlockup handler
4926  * @dur_s: number of seconds of CPU stuck due to soft lockup
4927  *
4928  * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
4929  * live-lock the system by making many CPUs target the same DSQ to the point
4930  * where soft-lockup detection triggers. This function is called from
4931  * soft-lockup watchdog when the triggering point is close and tries to unjam
4932  * the system and aborting the BPF scheduler.
4933  */
4934 void scx_softlockup(u32 dur_s)
4935 {
	/* log only if this call is the one that initiated the abort */
4936 	if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s))
4937 		return;
4938
4939 	printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n",
4940 			smp_processor_id(), dur_s);
4941 }
4942
4943 /**
4944  * scx_hardlockup - sched_ext hardlockup handler
 * @cpu: CPU number reported in the error and log messages
4945  *
4946  * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting
4947  * numerous affinitized tasks in a single queue and directing all CPUs at it.
4948  * Try kicking out the current scheduler in an attempt to recover the system to
4949  * a good state before taking more drastic actions.
4950  *
4951  * Returns %true if sched_ext is enabled and abort was initiated, which may
4952  * resolve the reported hard lockup. %false if sched_ext is not enabled or
4953  * someone else already initiated abort.
4954  */
4955 bool scx_hardlockup(int cpu)
4956 {
4957 	if (!handle_lockup("hard lockup - CPU %d", cpu))
4958 		return false;
4959
4960 	printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
4961 			cpu);
4962 	return true;
4963 }
4964
/*
 * Move tasks from the bypass DSQ of CPU @donor to the bypass DSQs of CPUs in
 * @donee_mask. Moving stops when the donor's queue length is down to
 * @nr_donor_target, when @donee_mask becomes empty or when the donor DSQ has
 * been fully walked. Every CPU that receives a task is set in @resched_mask
 * and a donee whose queue length reaches @nr_donee_target is cleared from
 * @donee_mask.
 *
 * Returns the number of tasks moved, or 0 without taking any lock when the
 * donor's excess over @nr_donor_target is below the minimum delta threshold.
 */
4965 static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor,
4966 			 struct cpumask *donee_mask, struct cpumask *resched_mask,
4967 			 u32 nr_donor_target, u32 nr_donee_target)
4968 {
4969 	struct rq *donor_rq = cpu_rq(donor);
4970 	struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor);
4971 	struct task_struct *p, *n;
4972 	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0);
4973 	s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
4974 	u32 nr_balanced = 0, min_delta_us;
4975
4976 	/*
4977 	 * All we want to guarantee is reasonable forward progress. No reason to
4978 	 * fine tune. Assuming every task on @donor_dsq runs their full slice,
4979 	 * consider offloading iff the total queued duration is over the
4980 	 * threshold.
4981 	 */
4982 	min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV;
4983 	if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us)))
4984 		return 0;
4985
4986 	raw_spin_rq_lock_irq(donor_rq);
4987 	raw_spin_lock(&donor_dsq->lock);
	/* the cursor starts at the head of the donor DSQ */
4988 	list_add(&cursor.node, &donor_dsq->list);
4989 resume:
	/* (re)start the walk at the first task following the cursor */
4990 	n = container_of(&cursor, struct task_struct, scx.dsq_list);
4991 	n = nldsq_next_task(donor_dsq, n, false);
4992
4993 	while ((p = n)) {
4994 		struct scx_dispatch_q *donee_dsq;
4995 		int donee;
4996
		/* fetch the successor first as @p may be moved off the DSQ below */
4997 		n = nldsq_next_task(donor_dsq, n, false);
4998
4999 		if (donor_dsq->nr <= nr_donor_target)
5000 			break;
5001
5002 		if (cpumask_empty(donee_mask))
5003 			break;
5004
		/* pick a donee that @p is allowed on; skip @p if there is none */
5005 		donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
5006 		if (donee >= nr_cpu_ids)
5007 			continue;
5008
5009 		donee_dsq = bypass_dsq(sch, donee);
5010
5011 		/*
5012 		 * $p's rq is not locked but $p's DSQ lock protects its
5013 		 * scheduling properties making this test safe.
5014 		 */
5015 		if (!task_can_run_on_remote_rq(sch, p, cpu_rq(donee), false))
5016 			continue;
5017
5018 		/*
5019 		 * Moving $p from one non-local DSQ to another. The source rq
5020 		 * and DSQ are already locked. Do an abbreviated dequeue and
5021 		 * then perform enqueue without unlocking $donor_dsq.
5022 		 *
5023 		 * We don't want to drop and reacquire the lock on each
5024 		 * iteration as @donor_dsq can be very long and potentially
5025 		 * highly contended. Donee DSQs are less likely to be contended.
5026 		 * The nested locking is safe as only this LB moves tasks
5027 		 * between bypass DSQs.
5028 		 */
5029 		dispatch_dequeue_locked(p, donor_dsq);
5030 		dispatch_enqueue(sch, cpu_rq(donee), donee_dsq, p, SCX_ENQ_NESTED);
5031
5032 		/*
5033 		 * $donee might have been idle and need to be woken up. No need
5034 		 * to be clever. Kick every CPU that receives tasks.
5035 */ 5036 cpumask_set_cpu(donee, resched_mask); 5037 5038 if (READ_ONCE(donee_dsq->nr) >= nr_donee_target) 5039 cpumask_clear_cpu(donee, donee_mask); 5040 5041 nr_balanced++; 5042 if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { 5043 list_move_tail(&cursor.node, &n->scx.dsq_list.node); 5044 raw_spin_unlock(&donor_dsq->lock); 5045 raw_spin_rq_unlock_irq(donor_rq); 5046 cpu_relax(); 5047 raw_spin_rq_lock_irq(donor_rq); 5048 raw_spin_lock(&donor_dsq->lock); 5049 goto resume; 5050 } 5051 } 5052 5053 list_del_init(&cursor.node); 5054 raw_spin_unlock(&donor_dsq->lock); 5055 raw_spin_rq_unlock_irq(donor_rq); 5056 5057 return nr_balanced; 5058 } 5059 5060 static void bypass_lb_node(struct scx_sched *sch, int node) 5061 { 5062 const struct cpumask *node_mask = cpumask_of_node(node); 5063 struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; 5064 struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; 5065 u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; 5066 u32 nr_target, nr_donor_target; 5067 u32 before_min = U32_MAX, before_max = 0; 5068 u32 after_min = U32_MAX, after_max = 0; 5069 int cpu; 5070 5071 /* count the target tasks and CPUs */ 5072 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5073 u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr); 5074 5075 nr_tasks += nr; 5076 nr_cpus++; 5077 5078 before_min = min(nr, before_min); 5079 before_max = max(nr, before_max); 5080 } 5081 5082 if (!nr_cpus) 5083 return; 5084 5085 /* 5086 * We don't want CPUs to have more than $nr_donor_target tasks and 5087 * balancing to fill donee CPUs upto $nr_target. Once targets are 5088 * calculated, find the donee CPUs. 
5089 */ 5090 nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); 5091 nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); 5092 5093 cpumask_clear(donee_mask); 5094 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5095 if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target) 5096 cpumask_set_cpu(cpu, donee_mask); 5097 } 5098 5099 /* iterate !donee CPUs and see if they should be offloaded */ 5100 cpumask_clear(resched_mask); 5101 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5102 if (cpumask_empty(donee_mask)) 5103 break; 5104 if (cpumask_test_cpu(cpu, donee_mask)) 5105 continue; 5106 if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target) 5107 continue; 5108 5109 nr_balanced += bypass_lb_cpu(sch, cpu, donee_mask, resched_mask, 5110 nr_donor_target, nr_target); 5111 } 5112 5113 for_each_cpu(cpu, resched_mask) 5114 resched_cpu(cpu); 5115 5116 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5117 u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr); 5118 5119 after_min = min(nr, after_min); 5120 after_max = max(nr, after_max); 5121 5122 } 5123 5124 trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced, 5125 before_min, before_max, after_min, after_max); 5126 } 5127 5128 /* 5129 * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine 5130 * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some 5131 * bypass DSQs can be overloaded. If there are enough tasks to saturate other 5132 * lightly loaded CPUs, such imbalance can lead to very high execution latency 5133 * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such 5134 * outcomes, a simple load balancing mechanism is implemented by the following 5135 * timer which runs periodically while bypass mode is in effect. 
5136 */ 5137 static void scx_bypass_lb_timerfn(struct timer_list *timer) 5138 { 5139 struct scx_sched *sch = container_of(timer, struct scx_sched, bypass_lb_timer); 5140 int node; 5141 u32 intv_us; 5142 5143 if (!bypass_dsp_enabled(sch)) 5144 return; 5145 5146 for_each_node_with_cpus(node) 5147 bypass_lb_node(sch, node); 5148 5149 intv_us = READ_ONCE(scx_bypass_lb_intv_us); 5150 if (intv_us) 5151 mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); 5152 } 5153 5154 static bool inc_bypass_depth(struct scx_sched *sch) 5155 { 5156 lockdep_assert_held(&scx_bypass_lock); 5157 5158 WARN_ON_ONCE(sch->bypass_depth < 0); 5159 WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1); 5160 if (sch->bypass_depth != 1) 5161 return false; 5162 5163 WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC); 5164 sch->bypass_timestamp = ktime_get_ns(); 5165 scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); 5166 return true; 5167 } 5168 5169 static bool dec_bypass_depth(struct scx_sched *sch) 5170 { 5171 lockdep_assert_held(&scx_bypass_lock); 5172 5173 WARN_ON_ONCE(sch->bypass_depth < 1); 5174 WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1); 5175 if (sch->bypass_depth != 0) 5176 return false; 5177 5178 WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL); 5179 scx_add_event(sch, SCX_EV_BYPASS_DURATION, 5180 ktime_get_ns() - sch->bypass_timestamp); 5181 return true; 5182 } 5183 5184 static void enable_bypass_dsp(struct scx_sched *sch) 5185 { 5186 struct scx_sched *host = scx_parent(sch) ?: sch; 5187 u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us); 5188 s32 ret; 5189 5190 /* 5191 * @sch->bypass_depth transitioning from 0 to 1 triggers enabling. 5192 * Shouldn't stagger. 5193 */ 5194 if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim))) 5195 return; 5196 5197 /* 5198 * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of 5199 * the nearest non-bypassing ancestor or root. 
As enable_bypass_dsp() is 5200 * called iff @sch is not already bypassed due to an ancestor bypassing, 5201 * we can assume that the parent is not bypassing and thus will be the 5202 * host of the bypass DSQs. 5203 * 5204 * While the situation may change in the future, the following 5205 * guarantees that the nearest non-bypassing ancestor or root has bypass 5206 * dispatch enabled while a descendant is bypassing, which is all that's 5207 * required. 5208 * 5209 * bypass_dsp_enabled() test is used to determine whether to enter the 5210 * bypass dispatch handling path from both bypassing and hosting scheds. 5211 * Bump enable depth on both @sch and bypass dispatch host. 5212 */ 5213 ret = atomic_inc_return(&sch->bypass_dsp_enable_depth); 5214 WARN_ON_ONCE(ret <= 0); 5215 5216 if (host != sch) { 5217 ret = atomic_inc_return(&host->bypass_dsp_enable_depth); 5218 WARN_ON_ONCE(ret <= 0); 5219 } 5220 5221 /* 5222 * The LB timer will stop running if bypass dispatch is disabled. Start 5223 * after enabling bypass dispatch. 5224 */ 5225 if (intv_us && !timer_pending(&host->bypass_lb_timer)) 5226 mod_timer(&host->bypass_lb_timer, 5227 jiffies + usecs_to_jiffies(intv_us)); 5228 } 5229 5230 /* may be called without holding scx_bypass_lock */ 5231 static void disable_bypass_dsp(struct scx_sched *sch) 5232 { 5233 s32 ret; 5234 5235 if (!test_and_clear_bit(0, &sch->bypass_dsp_claim)) 5236 return; 5237 5238 ret = atomic_dec_return(&sch->bypass_dsp_enable_depth); 5239 WARN_ON_ONCE(ret < 0); 5240 5241 if (scx_parent(sch)) { 5242 ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth); 5243 WARN_ON_ONCE(ret < 0); 5244 } 5245 } 5246 5247 /** 5248 * scx_bypass - [Un]bypass scx_ops and guarantee forward progress 5249 * @sch: sched to bypass 5250 * @bypass: true for bypass, false for unbypass 5251 * 5252 * Bypassing guarantees that all runnable tasks make forward progress without 5253 * trusting the BPF scheduler. 
We can't grab any mutexes or rwsems as they might
 * be held by tasks that the BPF scheduler is forgetting to run, which
 * unfortunately also excludes toggling the static branches.
 *
 * Let's work around by overriding a couple ops and modifying behaviors based on
 * the DISABLING state and then cycling the queued tasks through dequeue/enqueue
 * to force global FIFO scheduling.
 *
 * - ops.select_cpu() is ignored and the default select_cpu() is used.
 *
 * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
 *   %SCX_OPS_ENQ_LAST is also ignored.
 *
 * - ops.dispatch() is ignored.
 *
 * - balance_one() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
 *   can't be trusted. Whenever a tick triggers, the running task is rotated to
 *   the tail of the queue with core_sched_at touched.
 *
 * - pick_next_task() suppresses zero slice warning.
 *
 * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM
 *   operations.
 *
 * - scx_prio_less() reverts to the default core_sched_at order.
 */
static void scx_bypass(struct scx_sched *sch, bool bypass)
{
	struct scx_sched *pos;
	unsigned long flags;
	int cpu;

	raw_spin_lock_irqsave(&scx_bypass_lock, flags);

	/*
	 * Bypass requests nest. Only the 0 -> 1 and 1 -> 0 depth transitions
	 * of @sch go on to do the work below; nested calls just adjust the
	 * depth and leave.
	 */
	if (bypass) {
		if (!inc_bypass_depth(sch))
			goto unlock;

		enable_bypass_dsp(sch);
	} else {
		if (!dec_bypass_depth(sch))
			goto unlock;
	}

	/*
	 * Bypass state is propagated to all descendants - an scx_sched bypasses
	 * if itself or any of its ancestors are in bypass mode.
	 */
	raw_spin_lock(&scx_sched_lock);
	scx_for_each_descendant_pre(pos, sch) {
		/* @sch's own depth was already adjusted above */
		if (pos == sch)
			continue;
		if (bypass)
			inc_bypass_depth(pos);
		else
			dec_bypass_depth(pos);
	}
	raw_spin_unlock(&scx_sched_lock);

	/*
	 * No task property is changing. We just need to make sure all currently
	 * queued tasks are re-queued according to the new scx_bypassing()
	 * state. As an optimization, walk each rq's runnable_list instead of
	 * the scx_tasks list.
	 *
	 * This function can't trust the scheduler and thus can't use
	 * cpus_read_lock(). Walk all possible CPUs instead of online.
	 */
	for_each_possible_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p, *n;

		raw_spin_rq_lock(rq);
		raw_spin_lock(&scx_sched_lock);

		/* mirror each sched's bypass_depth into this CPU's pcpu flags */
		scx_for_each_descendant_pre(pos, sch) {
			struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu);

			if (pos->bypass_depth)
				pcpu->flags |= SCX_SCHED_PCPU_BYPASSING;
			else
				pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
		}

		raw_spin_unlock(&scx_sched_lock);

		/*
		 * We need to guarantee that no tasks are on the BPF scheduler
		 * while bypassing. Either we see enabled or the enable path
		 * sees scx_bypassing() before moving tasks to SCX.
		 */
		if (!scx_enabled()) {
			raw_spin_rq_unlock(rq);
			continue;
		}

		/*
		 * The use of list_for_each_entry_safe_reverse() is required
		 * because each task is going to be removed from and added back
		 * to the runnable_list during iteration. Because they're added
		 * to the tail of the list, safe reverse iteration can still
		 * visit all nodes.
		 */
		list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
						 scx.runnable_node) {
			/* only tasks on @sch or one of its descendants are affected */
			if (!scx_is_descendant(scx_task_sched(p), sch))
				continue;

			/* cycling deq/enq is enough, see the function comment */
			scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
				/* nothing */ ;
			}
		}

		/* resched to restore ticks and idle state */
		if (cpu_online(cpu) || cpu == smp_processor_id())
			resched_curr(rq);

		raw_spin_rq_unlock(rq);
	}

	/* disarming must come after moving all tasks out of the bypass DSQs */
	if (!bypass)
		disable_bypass_dsp(sch);
unlock:
	raw_spin_unlock_irqrestore(&scx_bypass_lock, flags);
}

/* Free @ei along with the dump, message and backtrace buffers it points to. */
static void free_exit_info(struct scx_exit_info *ei)
{
	kvfree(ei->dump);
	kfree(ei->msg);
	kfree(ei->bt);
	kfree(ei);
}

/*
 * Allocate a zeroed scx_exit_info together with its backtrace
 * (%SCX_EXIT_BT_LEN entries), message (%SCX_EXIT_MSG_LEN bytes) and dump
 * (@exit_dump_len bytes) buffers. Returns %NULL if any allocation fails.
 */
static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
{
	struct scx_exit_info *ei;

	ei = kzalloc_obj(*ei);
	if (!ei)
		return NULL;

	ei->bt = kzalloc_objs(ei->bt[0], SCX_EXIT_BT_LEN);
	ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
	ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);

	/* attempt all three and unwind whatever succeeded in one place */
	if (!ei->bt || !ei->msg || !ei->dump) {
		free_exit_info(ei);
		return NULL;
	}

	return ei;
}

/*
 * Map @kind to a human readable description. Kinds without an entry here are
 * reported as "<UNKNOWN>".
 */
static const char *scx_exit_reason(enum scx_exit_kind kind)
{
	switch (kind) {
	case SCX_EXIT_UNREG:
		return "unregistered from user space";
	case SCX_EXIT_UNREG_BPF:
		return "unregistered from BPF";
	case SCX_EXIT_UNREG_KERN:
		return "unregistered from the main kernel";
	case SCX_EXIT_SYSRQ:
		return "disabled by sysrq-S";
	case SCX_EXIT_PARENT:
		return "parent exiting";
	case SCX_EXIT_ERROR:
		return "runtime error";
	case SCX_EXIT_ERROR_BPF:
		return "scx_bpf_error";
	case SCX_EXIT_ERROR_STALL:
		return "runnable task stall";
	default:
		return "<UNKNOWN>";
	}
}

/*
 * Detach the per-CPU scx_kick_syncs buffer of every possible CPU and hand it
 * to kvfree_rcu().
 */
static void free_kick_syncs(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu);
		struct scx_kick_syncs *to_free;

		to_free = rcu_replace_pointer(*ksyncs, NULL, true);
		if (to_free)
			kvfree_rcu(to_free, rcu);
	}
}

/*
 * Recompute scx_watchdog_interval from the scheds currently on scx_sched_all
 * and reset scx_watchdog_timestamp. The watchdog work is rescheduled with the
 * new interval, or cancelled if scx_sched_all is empty.
 */
static void refresh_watchdog(void)
{
	struct scx_sched *sch;
	unsigned long intv = ULONG_MAX;

	/* take the shortest timeout and use its half for watchdog interval */
	rcu_read_lock();
	list_for_each_entry_rcu(sch, &scx_sched_all, all)
		intv = max(min(intv, sch->watchdog_timeout / 2), 1);
	rcu_read_unlock();

	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
	WRITE_ONCE(scx_watchdog_interval, intv);

	/* $intv is still ULONG_MAX iff the list walk above found no sched */
	if (intv < ULONG_MAX)
		mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv);
	else
		cancel_delayed_work_sync(&scx_watchdog_work);
}

/*
 * Link @sch into scx_sched_all and, if it has a parent, into scx_sched_hash
 * and the parent's children list, then refresh the watchdog.
 *
 * Returns 0 on success, -ENOENT if the parent's exit_kind is no longer
 * %SCX_EXIT_NONE, or the error from the scx_sched_hash insertion. An error is
 * also raised on @sch in the failure cases.
 */
static s32 scx_link_sched(struct scx_sched *sch)
{
	scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
#ifdef CONFIG_EXT_SUB_SCHED
		struct scx_sched *parent = scx_parent(sch);
		s32 ret;

		if (parent) {
			/*
			 * scx_claim_exit() propagates exit_kind transition to
			 * its sub-scheds while holding scx_sched_lock - either
			 * we can see the parent's non-NONE exit_kind or the
			 * parent can shoot us down.
			 */
			if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) {
				scx_error(sch, "parent disabled");
				return -ENOENT;
			}

			ret = rhashtable_lookup_insert_fast(&scx_sched_hash,
					&sch->hash_node, scx_sched_hash_params);
			if (ret) {
				scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret);
				return ret;
			}

			list_add_tail(&sch->sibling, &parent->children);
		}
#endif	/* CONFIG_EXT_SUB_SCHED */

		list_add_tail_rcu(&sch->all, &scx_sched_all);
	}

	/* called after scx_sched_lock has been released */
	refresh_watchdog();
	return 0;
}

/*
 * Reverse of scx_link_sched(). Take @sch off scx_sched_all and, if it has a
 * parent, off scx_sched_hash and the parent's children list, then refresh the
 * watchdog.
 */
static void scx_unlink_sched(struct scx_sched *sch)
{
	scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
#ifdef CONFIG_EXT_SUB_SCHED
		if (scx_parent(sch)) {
			rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node,
					       scx_sched_hash_params);
			list_del_init(&sch->sibling);
		}
#endif	/* CONFIG_EXT_SUB_SCHED */
		list_del_rcu(&sch->all);
	}

	/* called after scx_sched_lock has been released */
	refresh_watchdog();
}

/*
 * Called to disable future dumps and wait for in-progress one while disabling
 * @sch. Once @sch becomes empty during disable, there's no point in dumping it.
 * This prevents calling dump ops on a dead sch.
 */
static void scx_disable_dump(struct scx_sched *sch)
{
	/* acquiring scx_dump_lock is what waits out a dump holding the lock */
	guard(raw_spinlock_irqsave)(&scx_dump_lock);
	sch->dump_disabled = true;
}

#ifdef CONFIG_EXT_SUB_SCHED
/* woken by scx_sub_disable() after a sub-sched has been unlinked */
static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);

/* Block until @sch has no child sched left on its children list. */
static void drain_descendants(struct scx_sched *sch)
{
	/*
	 * Child scheds that finished the critical part of disabling will take
	 * themselves off @sch->children. Wait for it to drain. As propagation
	 * is recursive, empty @sch->children means that all proper descendant
	 * scheds reached unlinking stage.
	 */
	wait_event(scx_unlink_waitq, list_empty(&sch->children));
}

/*
 * Called from scx_sub_disable() when initializing task @failed for the parent
 * of @sch failed with @fail_code. Raise an error on the parent, put it into
 * bypass mode and switch the tasks in @sch's cgroup which aren't on the parent
 * yet over to it.
 */
static void scx_fail_parent(struct scx_sched *sch,
			    struct task_struct *failed, s32 fail_code)
{
	struct scx_sched *parent = scx_parent(sch);
	struct scx_task_iter sti;
	struct task_struct *p;

	scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
		  fail_code, failed->comm, failed->pid);

	/*
	 * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
	 * it. This may cause downstream failures on the BPF side but $parent is
	 * dying anyway.
	 */
	scx_bypass(parent, true);

	scx_task_iter_start(&sti, sch->cgrp);
	while ((p = scx_task_iter_next_locked(&sti))) {
		/* already on $parent, nothing to do */
		if (scx_task_on_sched(parent, p))
			continue;

		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
			scx_disable_and_exit_task(sch, p);
			rcu_assign_pointer(p->scx.sched, parent);
		}
	}
	scx_task_iter_stop(&sti);
}

/*
 * Disable sub-scheduler @sch. @sch is bypassed, its descendants are waited
 * out, and every task in its cgroup is initialized for and switched over to
 * the parent sched. @sch is then unlinked, waiters on scx_unlink_waitq are
 * woken up, and the parent's ops.sub_detach() and @sch's ops.exit() are
 * invoked if implemented.
 */
static void scx_sub_disable(struct scx_sched *sch)
{
	struct scx_sched *parent = scx_parent(sch);
	struct scx_task_iter sti;
	struct task_struct *p;
	int ret;

	/*
	 * Guarantee forward progress and wait for descendants to be disabled.
	 * To limit disruptions, $parent is not bypassed. Tasks are fully
	 * prepped and then inserted back into $parent.
	 */
	scx_bypass(sch, true);
	drain_descendants(sch);

	/*
	 * Here, every runnable task is guaranteed to make forward progress and
	 * we can safely use blocking synchronization constructs. Actually
	 * disable ops.
	 */
	mutex_lock(&scx_enable_mutex);
	percpu_down_write(&scx_fork_rwsem);
	scx_cgroup_lock();

	set_cgroup_sched(sch_cgroup(sch), parent);

	scx_task_iter_start(&sti, sch->cgrp);
	while ((p = scx_task_iter_next_locked(&sti))) {
		struct rq *rq;
		struct rq_flags rf;

		/* filter out duplicate visits */
		if (scx_task_on_sched(parent, p))
			continue;

		/*
		 * By the time control reaches here, all descendant schedulers
		 * should already have been disabled.
		 */
		WARN_ON_ONCE(!scx_task_on_sched(sch, p));

		/*
		 * If $p is about to be freed, nothing prevents $sch from
		 * unloading before $p reaches sched_ext_free(). Disable and
		 * exit $p right away.
		 */
		if (!tryget_task_struct(p)) {
			scx_disable_and_exit_task(sch, p);
			continue;
		}

		/* $p is pinned by the reference taken above */
		scx_task_iter_unlock(&sti);

		/*
		 * $p is READY or ENABLED on @sch. Initialize for $parent,
		 * disable and exit from @sch, and then switch over to $parent.
		 *
		 * If a task fails to initialize for $parent, the only available
		 * action is disabling $parent too. While this allows disabling
		 * of a child sched to cause the parent scheduler to fail, the
		 * failure can only originate from ops.init_task() of the
		 * parent. A child can't directly affect the parent through its
		 * own failures.
		 */
		ret = __scx_init_task(parent, p, false);
		if (ret) {
			scx_fail_parent(sch, p, ret);
			put_task_struct(p);
			break;
		}

		rq = task_rq_lock(p, &rf);
		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
			/*
			 * $p is initialized for $parent and still attached to
			 * @sch. Disable and exit for @sch, switch over to
			 * $parent, override the state to READY to account for
			 * $p having already been initialized, and then enable.
			 */
			scx_disable_and_exit_task(sch, p);
			scx_set_task_state(p, SCX_TASK_INIT);
			rcu_assign_pointer(p->scx.sched, parent);
			scx_set_task_state(p, SCX_TASK_READY);
			scx_enable_task(parent, p);
		}
		task_rq_unlock(rq, p, &rf);

		put_task_struct(p);
	}
	scx_task_iter_stop(&sti);

	scx_disable_dump(sch);

	scx_cgroup_unlock();
	percpu_up_write(&scx_fork_rwsem);

	/*
	 * All tasks are moved off of @sch but there may still be on-going
	 * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
	 * the expedited version as ancestors may be waiting in bypass mode.
	 * Also, tell the parent that there is no need to keep running bypass
	 * DSQs for us.
	 */
	synchronize_rcu_expedited();
	disable_bypass_dsp(sch);

	scx_unlink_sched(sch);

	mutex_unlock(&scx_enable_mutex);

	/*
	 * @sch is now unlinked from the parent's children list. Notify and call
	 * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
	 * after unlinking and releasing all locks. See scx_claim_exit().
	 */
	wake_up_all(&scx_unlink_waitq);

	if (parent->ops.sub_detach && sch->sub_attached) {
		struct scx_sub_detach_args sub_detach_args = {
			.ops = &sch->ops,
			.cgroup_path = sch->cgrp_path,
		};
		SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL,
			    &sub_detach_args);
	}

	if (sch->ops.exit)
		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info);
	kobject_del(&sch->kobj);
}
#else	/* CONFIG_EXT_SUB_SCHED */
/* without sub-sched support, both helpers are empty */
static void drain_descendants(struct scx_sched *sch) { }
static void scx_sub_disable(struct scx_sched *sch) { }
#endif	/* CONFIG_EXT_SUB_SCHED */

/*
 * Disable root scheduler @sch. @sch is bypassed, its descendants are waited
 * out, every task is switched to the class chosen by scx_setscheduler_class()
 * and exited from its sched, the static branches are turned off, the exit is
 * reported, ops.exit() is invoked if implemented, and @sch is unlinked and
 * cleared from scx_root. The bypass taken on entry is dropped before
 * returning.
 */
static void scx_root_disable(struct scx_sched *sch)
{
	struct scx_exit_info *ei = sch->exit_info;
	struct scx_task_iter sti;
	struct task_struct *p;
	int cpu;

	/* guarantee forward progress and wait for descendants to be disabled */
	scx_bypass(sch, true);
	drain_descendants(sch);

	/* scx_set_enable_state() hands back the state which was replaced */
	switch (scx_set_enable_state(SCX_DISABLING)) {
	case SCX_DISABLING:
		WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
		break;
	case SCX_DISABLED:
		pr_warn("sched_ext: ops error detected without ops (%s)\n",
			sch->exit_info->msg);
		WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
		goto done;
	default:
		break;
	}

	/*
	 * Here, every runnable task is guaranteed to make forward progress and
	 * we can safely use blocking synchronization constructs. Actually
	 * disable ops.
	 */
	mutex_lock(&scx_enable_mutex);

	static_branch_disable(&__scx_switched_all);
	WRITE_ONCE(scx_switching_all, false);

	/*
	 * Shut down cgroup support before tasks so that the cgroup attach path
	 * doesn't race against scx_disable_and_exit_task().
	 */
	scx_cgroup_lock();
	scx_cgroup_exit(sch);
	scx_cgroup_unlock();

	/*
	 * The BPF scheduler is going away. All tasks including %TASK_DEAD ones
	 * must be switched out and exited synchronously.
	 */
	percpu_down_write(&scx_fork_rwsem);

	scx_init_task_enabled = false;

	scx_task_iter_start(&sti, NULL);
	while ((p = scx_task_iter_next_locked(&sti))) {
		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
		const struct sched_class *old_class = p->sched_class;
		const struct sched_class *new_class = scx_setscheduler_class(p);

		/* DEQUEUE_NOCLOCK is passed below, so update the clock here */
		update_rq_clock(task_rq(p));

		if (old_class != new_class)
			queue_flags |= DEQUEUE_CLASS;

		scoped_guard (sched_change, p, queue_flags) {
			p->sched_class = new_class;
		}

		scx_disable_and_exit_task(scx_task_sched(p), p);
	}
	scx_task_iter_stop(&sti);

	scx_disable_dump(sch);

	scx_cgroup_lock();
	set_cgroup_sched(sch_cgroup(sch), NULL);
	scx_cgroup_unlock();

	percpu_up_write(&scx_fork_rwsem);

	/*
	 * Invalidate all the rq clocks to prevent getting outdated
	 * rq clocks from a previous scx scheduler.
	 */
	for_each_possible_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		scx_rq_clock_invalidate(rq);
	}

	/* no task is on scx, turn off all the switches and flush in-progress calls */
	static_branch_disable(&__scx_enabled);
	bitmap_zero(sch->has_op, SCX_OPI_END);
	scx_idle_disable();
	synchronize_rcu();

	/* exit kinds at or above SCX_EXIT_ERROR are logged at error level */
	if (ei->kind >= SCX_EXIT_ERROR) {
		pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
		       sch->ops.name, ei->reason);

		if (ei->msg[0] != '\0')
			pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
#ifdef CONFIG_STACKTRACE
		stack_trace_print(ei->bt, ei->bt_len, 2);
#endif
	} else {
		pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
			sch->ops.name, ei->reason);
	}

	if (sch->ops.exit)
		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);

	scx_unlink_sched(sch);

	/*
	 * scx_root clearing must be inside cpus_read_lock(). See
	 * handle_hotplug().
	 */
	cpus_read_lock();
	RCU_INIT_POINTER(scx_root, NULL);
	cpus_read_unlock();

	/*
	 * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
	 * could observe an object of the same name still in the hierarchy when
	 * the next scheduler is loaded.
	 */
	kobject_del(&sch->kobj);

	free_kick_syncs();

	mutex_unlock(&scx_enable_mutex);

	WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
done:
	/* pairs with the scx_bypass(sch, true) at the top */
	scx_bypass(sch, false);
}

/*
 * Claim the exit on @sch. The caller must ensure that the helper kthread work
 * is kicked before the current task can be preempted. Once exit_kind is
 * claimed, scx_error() can no longer trigger, so if the current task gets
 * preempted and the BPF scheduler fails to schedule it back, the helper work
 * will never be kicked and the whole system can wedge.
5851 */ 5852 static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) 5853 { 5854 int none = SCX_EXIT_NONE; 5855 5856 lockdep_assert_preemption_disabled(); 5857 5858 if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) 5859 kind = SCX_EXIT_ERROR; 5860 5861 if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) 5862 return false; 5863 5864 /* 5865 * Some CPUs may be trapped in the dispatch paths. Set the aborting 5866 * flag to break potential live-lock scenarios, ensuring we can 5867 * successfully reach scx_bypass(). 5868 */ 5869 WRITE_ONCE(sch->aborting, true); 5870 5871 /* 5872 * Propagate exits to descendants immediately. Each has a dedicated 5873 * helper kthread and can run in parallel. While most of disabling is 5874 * serialized, running them in separate threads allows parallelizing 5875 * ops.exit(), which can take arbitrarily long prolonging bypass mode. 5876 * 5877 * To guarantee forward progress, this propagation must be in-line so 5878 * that ->aborting is synchronously asserted for all sub-scheds. The 5879 * propagation is also the interlocking point against sub-sched 5880 * attachment. See scx_link_sched(). 5881 * 5882 * This doesn't cause recursions as propagation only takes place for 5883 * non-propagation exits. 5884 */ 5885 if (kind != SCX_EXIT_PARENT) { 5886 scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) { 5887 struct scx_sched *pos; 5888 scx_for_each_descendant_pre(pos, sch) 5889 scx_disable(pos, SCX_EXIT_PARENT); 5890 } 5891 } 5892 5893 return true; 5894 } 5895 5896 static void scx_disable_workfn(struct kthread_work *work) 5897 { 5898 struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); 5899 struct scx_exit_info *ei = sch->exit_info; 5900 int kind; 5901 5902 kind = atomic_read(&sch->exit_kind); 5903 while (true) { 5904 if (kind == SCX_EXIT_DONE) /* already disabled? 
*/ 5905 return; 5906 WARN_ON_ONCE(kind == SCX_EXIT_NONE); 5907 if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) 5908 break; 5909 } 5910 ei->kind = kind; 5911 ei->reason = scx_exit_reason(ei->kind); 5912 5913 if (scx_parent(sch)) 5914 scx_sub_disable(sch); 5915 else 5916 scx_root_disable(sch); 5917 } 5918 5919 static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) 5920 { 5921 guard(preempt)(); 5922 if (scx_claim_exit(sch, kind)) 5923 irq_work_queue(&sch->disable_irq_work); 5924 } 5925 5926 static void dump_newline(struct seq_buf *s) 5927 { 5928 trace_sched_ext_dump(""); 5929 5930 /* @s may be zero sized and seq_buf triggers WARN if so */ 5931 if (s->size) 5932 seq_buf_putc(s, '\n'); 5933 } 5934 5935 static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) 5936 { 5937 va_list args; 5938 5939 #ifdef CONFIG_TRACEPOINTS 5940 if (trace_sched_ext_dump_enabled()) { 5941 /* protected by scx_dump_lock */ 5942 static char line_buf[SCX_EXIT_MSG_LEN]; 5943 5944 va_start(args, fmt); 5945 vscnprintf(line_buf, sizeof(line_buf), fmt, args); 5946 va_end(args); 5947 5948 trace_sched_ext_dump(line_buf); 5949 } 5950 #endif 5951 /* @s may be zero sized and seq_buf triggers WARN if so */ 5952 if (s->size) { 5953 va_start(args, fmt); 5954 seq_buf_vprintf(s, fmt, args); 5955 va_end(args); 5956 5957 seq_buf_putc(s, '\n'); 5958 } 5959 } 5960 5961 static void dump_stack_trace(struct seq_buf *s, const char *prefix, 5962 const unsigned long *bt, unsigned int len) 5963 { 5964 unsigned int i; 5965 5966 for (i = 0; i < len; i++) 5967 dump_line(s, "%s%pS", prefix, (void *)bt[i]); 5968 } 5969 5970 static void ops_dump_init(struct seq_buf *s, const char *prefix) 5971 { 5972 struct scx_dump_data *dd = &scx_dump_data; 5973 5974 lockdep_assert_irqs_disabled(); 5975 5976 dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ 5977 dd->first = true; 5978 dd->cursor = 0; 5979 dd->s = s; 5980 dd->prefix = prefix; 5981 } 5982 5983 static void 
ops_dump_flush(void) 5984 { 5985 struct scx_dump_data *dd = &scx_dump_data; 5986 char *line = dd->buf.line; 5987 5988 if (!dd->cursor) 5989 return; 5990 5991 /* 5992 * There's something to flush and this is the first line. Insert a blank 5993 * line to distinguish ops dump. 5994 */ 5995 if (dd->first) { 5996 dump_newline(dd->s); 5997 dd->first = false; 5998 } 5999 6000 /* 6001 * There may be multiple lines in $line. Scan and emit each line 6002 * separately. 6003 */ 6004 while (true) { 6005 char *end = line; 6006 char c; 6007 6008 while (*end != '\n' && *end != '\0') 6009 end++; 6010 6011 /* 6012 * If $line overflowed, it may not have newline at the end. 6013 * Always emit with a newline. 6014 */ 6015 c = *end; 6016 *end = '\0'; 6017 dump_line(dd->s, "%s%s", dd->prefix, line); 6018 if (c == '\0') 6019 break; 6020 6021 /* move to the next line */ 6022 end++; 6023 if (*end == '\0') 6024 break; 6025 line = end; 6026 } 6027 6028 dd->cursor = 0; 6029 } 6030 6031 static void ops_dump_exit(void) 6032 { 6033 ops_dump_flush(); 6034 scx_dump_data.cpu = -1; 6035 } 6036 6037 static void scx_dump_task(struct scx_sched *sch, 6038 struct seq_buf *s, struct scx_dump_ctx *dctx, 6039 struct task_struct *p, char marker) 6040 { 6041 static unsigned long bt[SCX_EXIT_BT_LEN]; 6042 struct scx_sched *task_sch = scx_task_sched(p); 6043 const char *own_marker; 6044 char sch_id_buf[32]; 6045 char dsq_id_buf[19] = "(n/a)"; 6046 unsigned long ops_state = atomic_long_read(&p->scx.ops_state); 6047 unsigned int bt_len = 0; 6048 6049 own_marker = task_sch == sch ? 
"*" : ""; 6050 6051 if (task_sch->level == 0) 6052 scnprintf(sch_id_buf, sizeof(sch_id_buf), "root"); 6053 else 6054 scnprintf(sch_id_buf, sizeof(sch_id_buf), "sub%d-%llu", 6055 task_sch->level, task_sch->ops.sub_cgroup_id); 6056 6057 if (p->scx.dsq) 6058 scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", 6059 (unsigned long long)p->scx.dsq->id); 6060 6061 dump_newline(s); 6062 dump_line(s, " %c%c %s[%d] %s%s %+ldms", 6063 marker, task_state_to_char(p), p->comm, p->pid, 6064 own_marker, sch_id_buf, 6065 jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); 6066 dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", 6067 scx_get_task_state(p) >> SCX_TASK_STATE_SHIFT, 6068 p->scx.flags & ~SCX_TASK_STATE_MASK, 6069 p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, 6070 ops_state >> SCX_OPSS_QSEQ_SHIFT); 6071 dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", 6072 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); 6073 dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", 6074 p->scx.dsq_vtime, p->scx.slice, p->scx.weight); 6075 dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr), 6076 p->migration_disabled); 6077 6078 if (SCX_HAS_OP(sch, dump_task)) { 6079 ops_dump_init(s, " "); 6080 SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p); 6081 ops_dump_exit(); 6082 } 6083 6084 #ifdef CONFIG_STACKTRACE 6085 bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); 6086 #endif 6087 if (bt_len) { 6088 dump_newline(s); 6089 dump_stack_trace(s, " ", bt, bt_len); 6090 } 6091 } 6092 6093 /* 6094 * Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless 6095 * of which scheduler they belong to. If false, only dump tasks owned by @sch. 6096 * For SysRq-D dumps, @dump_all_tasks=false since all schedulers are dumped 6097 * separately. For error dumps, @dump_all_tasks=true since only the failing 6098 * scheduler is dumped. 
6099 */ 6100 static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, 6101 size_t dump_len, bool dump_all_tasks) 6102 { 6103 static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; 6104 struct scx_dump_ctx dctx = { 6105 .kind = ei->kind, 6106 .exit_code = ei->exit_code, 6107 .reason = ei->reason, 6108 .at_ns = ktime_get_ns(), 6109 .at_jiffies = jiffies, 6110 }; 6111 struct seq_buf s; 6112 struct scx_event_stats events; 6113 char *buf; 6114 int cpu; 6115 6116 guard(raw_spinlock_irqsave)(&scx_dump_lock); 6117 6118 if (sch->dump_disabled) 6119 return; 6120 6121 seq_buf_init(&s, ei->dump, dump_len); 6122 6123 #ifdef CONFIG_EXT_SUB_SCHED 6124 if (sch->level == 0) 6125 dump_line(&s, "%s: root", sch->ops.name); 6126 else 6127 dump_line(&s, "%s: sub%d-%llu %s", 6128 sch->ops.name, sch->level, sch->ops.sub_cgroup_id, 6129 sch->cgrp_path); 6130 #endif 6131 if (ei->kind == SCX_EXIT_NONE) { 6132 dump_line(&s, "Debug dump triggered by %s", ei->reason); 6133 } else { 6134 dump_line(&s, "%s[%d] triggered exit kind %d:", 6135 current->comm, current->pid, ei->kind); 6136 dump_line(&s, " %s (%s)", ei->reason, ei->msg); 6137 dump_newline(&s); 6138 dump_line(&s, "Backtrace:"); 6139 dump_stack_trace(&s, " ", ei->bt, ei->bt_len); 6140 } 6141 6142 if (SCX_HAS_OP(sch, dump)) { 6143 ops_dump_init(&s, ""); 6144 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx); 6145 ops_dump_exit(); 6146 } 6147 6148 dump_newline(&s); 6149 dump_line(&s, "CPU states"); 6150 dump_line(&s, "----------"); 6151 6152 for_each_possible_cpu(cpu) { 6153 struct rq *rq = cpu_rq(cpu); 6154 struct rq_flags rf; 6155 struct task_struct *p; 6156 struct seq_buf ns; 6157 size_t avail, used; 6158 bool idle; 6159 6160 rq_lock_irqsave(rq, &rf); 6161 6162 idle = list_empty(&rq->scx.runnable_list) && 6163 rq->curr->sched_class == &idle_sched_class; 6164 6165 if (idle && !SCX_HAS_OP(sch, dump_cpu)) 6166 goto next; 6167 6168 /* 6169 * We don't yet know whether ops.dump_cpu() will produce output 6170 * 
and we may want to skip the default CPU dump if it doesn't. 6171 * Use a nested seq_buf to generate the standard dump so that we 6172 * can decide whether to commit later. 6173 */ 6174 avail = seq_buf_get_buf(&s, &buf); 6175 seq_buf_init(&ns, buf, avail); 6176 6177 dump_newline(&ns); 6178 dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu", 6179 cpu, rq->scx.nr_running, rq->scx.flags, 6180 rq->scx.cpu_released, rq->scx.ops_qseq, 6181 rq->scx.kick_sync); 6182 dump_line(&ns, " curr=%s[%d] class=%ps", 6183 rq->curr->comm, rq->curr->pid, 6184 rq->curr->sched_class); 6185 if (!cpumask_empty(rq->scx.cpus_to_kick)) 6186 dump_line(&ns, " cpus_to_kick : %*pb", 6187 cpumask_pr_args(rq->scx.cpus_to_kick)); 6188 if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) 6189 dump_line(&ns, " idle_to_kick : %*pb", 6190 cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); 6191 if (!cpumask_empty(rq->scx.cpus_to_preempt)) 6192 dump_line(&ns, " cpus_to_preempt: %*pb", 6193 cpumask_pr_args(rq->scx.cpus_to_preempt)); 6194 if (!cpumask_empty(rq->scx.cpus_to_wait)) 6195 dump_line(&ns, " cpus_to_wait : %*pb", 6196 cpumask_pr_args(rq->scx.cpus_to_wait)); 6197 6198 used = seq_buf_used(&ns); 6199 if (SCX_HAS_OP(sch, dump_cpu)) { 6200 ops_dump_init(&ns, " "); 6201 SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL, 6202 &dctx, cpu, idle); 6203 ops_dump_exit(); 6204 } 6205 6206 /* 6207 * If idle && nothing generated by ops.dump_cpu(), there's 6208 * nothing interesting. Skip. 6209 */ 6210 if (idle && used == seq_buf_used(&ns)) 6211 goto next; 6212 6213 /* 6214 * $s may already have overflowed when $ns was created. If so, 6215 * calling commit on it will trigger BUG. 
6216 */ 6217 if (avail) { 6218 seq_buf_commit(&s, seq_buf_used(&ns)); 6219 if (seq_buf_has_overflowed(&ns)) 6220 seq_buf_set_overflow(&s); 6221 } 6222 6223 if (rq->curr->sched_class == &ext_sched_class && 6224 (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) 6225 scx_dump_task(sch, &s, &dctx, rq->curr, '*'); 6226 6227 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) 6228 if (dump_all_tasks || scx_task_on_sched(sch, p)) 6229 scx_dump_task(sch, &s, &dctx, p, ' '); 6230 next: 6231 rq_unlock_irqrestore(rq, &rf); 6232 } 6233 6234 dump_newline(&s); 6235 dump_line(&s, "Event counters"); 6236 dump_line(&s, "--------------"); 6237 6238 scx_read_events(sch, &events); 6239 scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); 6240 scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 6241 scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); 6242 scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); 6243 scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 6244 scx_dump_event(s, &events, SCX_EV_REENQ_IMMED); 6245 scx_dump_event(s, &events, SCX_EV_REENQ_LOCAL_REPEAT); 6246 scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL); 6247 scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); 6248 scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); 6249 scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); 6250 scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED); 6251 scx_dump_event(s, &events, SCX_EV_SUB_BYPASS_DISPATCH); 6252 6253 if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) 6254 memcpy(ei->dump + dump_len - sizeof(trunc_marker), 6255 trunc_marker, sizeof(trunc_marker)); 6256 } 6257 6258 static void scx_disable_irq_workfn(struct irq_work *irq_work) 6259 { 6260 struct scx_sched *sch = container_of(irq_work, struct scx_sched, disable_irq_work); 6261 struct scx_exit_info *ei = sch->exit_info; 6262 6263 if (ei->kind >= SCX_EXIT_ERROR) 6264 scx_dump_state(sch, ei, sch->ops.exit_dump_len, true); 6265 6266 
kthread_queue_work(sch->helper, &sch->disable_work); 6267 } 6268 6269 static bool scx_vexit(struct scx_sched *sch, 6270 enum scx_exit_kind kind, s64 exit_code, 6271 const char *fmt, va_list args) 6272 { 6273 struct scx_exit_info *ei = sch->exit_info; 6274 6275 guard(preempt)(); 6276 6277 if (!scx_claim_exit(sch, kind)) 6278 return false; 6279 6280 ei->exit_code = exit_code; 6281 #ifdef CONFIG_STACKTRACE 6282 if (kind >= SCX_EXIT_ERROR) 6283 ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); 6284 #endif 6285 vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); 6286 6287 /* 6288 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again 6289 * in scx_disable_workfn(). 6290 */ 6291 ei->kind = kind; 6292 ei->reason = scx_exit_reason(ei->kind); 6293 6294 irq_work_queue(&sch->disable_irq_work); 6295 return true; 6296 } 6297 6298 static int alloc_kick_syncs(void) 6299 { 6300 int cpu; 6301 6302 /* 6303 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size 6304 * can exceed percpu allocator limits on large machines. 
6305 */ 6306 for_each_possible_cpu(cpu) { 6307 struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); 6308 struct scx_kick_syncs *new_ksyncs; 6309 6310 WARN_ON_ONCE(rcu_access_pointer(*ksyncs)); 6311 6312 new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids), 6313 GFP_KERNEL, cpu_to_node(cpu)); 6314 if (!new_ksyncs) { 6315 free_kick_syncs(); 6316 return -ENOMEM; 6317 } 6318 6319 rcu_assign_pointer(*ksyncs, new_ksyncs); 6320 } 6321 6322 return 0; 6323 } 6324 6325 static void free_pnode(struct scx_sched_pnode *pnode) 6326 { 6327 if (!pnode) 6328 return; 6329 exit_dsq(&pnode->global_dsq); 6330 kfree(pnode); 6331 } 6332 6333 static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node) 6334 { 6335 struct scx_sched_pnode *pnode; 6336 6337 pnode = kzalloc_node(sizeof(*pnode), GFP_KERNEL, node); 6338 if (!pnode) 6339 return NULL; 6340 6341 if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) { 6342 kfree(pnode); 6343 return NULL; 6344 } 6345 6346 return pnode; 6347 } 6348 6349 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, 6350 struct cgroup *cgrp, 6351 struct scx_sched *parent) 6352 { 6353 struct scx_sched *sch; 6354 s32 level = parent ? 
parent->level + 1 : 0; 6355 s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids; 6356 6357 sch = kzalloc_flex(*sch, ancestors, level); 6358 if (!sch) 6359 return ERR_PTR(-ENOMEM); 6360 6361 sch->exit_info = alloc_exit_info(ops->exit_dump_len); 6362 if (!sch->exit_info) { 6363 ret = -ENOMEM; 6364 goto err_free_sch; 6365 } 6366 6367 ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params); 6368 if (ret < 0) 6369 goto err_free_ei; 6370 6371 sch->pnode = kzalloc_objs(sch->pnode[0], nr_node_ids); 6372 if (!sch->pnode) { 6373 ret = -ENOMEM; 6374 goto err_free_hash; 6375 } 6376 6377 for_each_node_state(node, N_POSSIBLE) { 6378 sch->pnode[node] = alloc_pnode(sch, node); 6379 if (!sch->pnode[node]) { 6380 ret = -ENOMEM; 6381 goto err_free_pnode; 6382 } 6383 } 6384 6385 sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; 6386 sch->pcpu = __alloc_percpu(struct_size_t(struct scx_sched_pcpu, 6387 dsp_ctx.buf, sch->dsp_max_batch), 6388 __alignof__(struct scx_sched_pcpu)); 6389 if (!sch->pcpu) { 6390 ret = -ENOMEM; 6391 goto err_free_pnode; 6392 } 6393 6394 for_each_possible_cpu(cpu) { 6395 ret = init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch); 6396 if (ret) { 6397 bypass_fail_cpu = cpu; 6398 goto err_free_pcpu; 6399 } 6400 } 6401 6402 for_each_possible_cpu(cpu) { 6403 struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); 6404 6405 pcpu->sch = sch; 6406 INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node); 6407 } 6408 6409 sch->helper = kthread_run_worker(0, "sched_ext_helper"); 6410 if (IS_ERR(sch->helper)) { 6411 ret = PTR_ERR(sch->helper); 6412 goto err_free_pcpu; 6413 } 6414 6415 sched_set_fifo(sch->helper->task); 6416 6417 if (parent) 6418 memcpy(sch->ancestors, parent->ancestors, 6419 level * sizeof(parent->ancestors[0])); 6420 sch->ancestors[level] = sch; 6421 sch->level = level; 6422 6423 if (ops->timeout_ms) 6424 sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms); 6425 else 6426 sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT; 6427 6428 
sch->slice_dfl = SCX_SLICE_DFL; 6429 atomic_set(&sch->exit_kind, SCX_EXIT_NONE); 6430 init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn); 6431 kthread_init_work(&sch->disable_work, scx_disable_workfn); 6432 timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); 6433 sch->ops = *ops; 6434 rcu_assign_pointer(ops->priv, sch); 6435 6436 sch->kobj.kset = scx_kset; 6437 6438 #ifdef CONFIG_EXT_SUB_SCHED 6439 char *buf = kzalloc(PATH_MAX, GFP_KERNEL); 6440 if (!buf) 6441 goto err_stop_helper; 6442 cgroup_path(cgrp, buf, PATH_MAX); 6443 sch->cgrp_path = kstrdup(buf, GFP_KERNEL); 6444 kfree(buf); 6445 if (!sch->cgrp_path) 6446 goto err_stop_helper; 6447 6448 sch->cgrp = cgrp; 6449 INIT_LIST_HEAD(&sch->children); 6450 INIT_LIST_HEAD(&sch->sibling); 6451 6452 if (parent) 6453 ret = kobject_init_and_add(&sch->kobj, &scx_ktype, 6454 &parent->sub_kset->kobj, 6455 "sub-%llu", cgroup_id(cgrp)); 6456 else 6457 ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); 6458 6459 if (ret < 0) { 6460 kfree(sch->cgrp_path); 6461 goto err_stop_helper; 6462 } 6463 6464 if (ops->sub_attach) { 6465 sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); 6466 if (!sch->sub_kset) { 6467 kobject_put(&sch->kobj); 6468 return ERR_PTR(-ENOMEM); 6469 } 6470 } 6471 6472 #else /* CONFIG_EXT_SUB_SCHED */ 6473 ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); 6474 if (ret < 0) 6475 goto err_stop_helper; 6476 #endif /* CONFIG_EXT_SUB_SCHED */ 6477 return sch; 6478 6479 err_stop_helper: 6480 kthread_destroy_worker(sch->helper); 6481 err_free_pcpu: 6482 for_each_possible_cpu(cpu) { 6483 if (cpu == bypass_fail_cpu) 6484 break; 6485 exit_dsq(bypass_dsq(sch, cpu)); 6486 } 6487 free_percpu(sch->pcpu); 6488 err_free_pnode: 6489 for_each_node_state(node, N_POSSIBLE) 6490 free_pnode(sch->pnode[node]); 6491 kfree(sch->pnode); 6492 err_free_hash: 6493 rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); 6494 err_free_ei: 6495 free_exit_info(sch->exit_info); 6496 
err_free_sch: 6497 kfree(sch); 6498 return ERR_PTR(ret); 6499 } 6500 6501 static int check_hotplug_seq(struct scx_sched *sch, 6502 const struct sched_ext_ops *ops) 6503 { 6504 unsigned long long global_hotplug_seq; 6505 6506 /* 6507 * If a hotplug event has occurred between when a scheduler was 6508 * initialized, and when we were able to attach, exit and notify user 6509 * space about it. 6510 */ 6511 if (ops->hotplug_seq) { 6512 global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); 6513 if (ops->hotplug_seq != global_hotplug_seq) { 6514 scx_exit(sch, SCX_EXIT_UNREG_KERN, 6515 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 6516 "expected hotplug seq %llu did not match actual %llu", 6517 ops->hotplug_seq, global_hotplug_seq); 6518 return -EBUSY; 6519 } 6520 } 6521 6522 return 0; 6523 } 6524 6525 static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) 6526 { 6527 /* 6528 * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the 6529 * ops.enqueue() callback isn't implemented. 6530 */ 6531 if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { 6532 scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); 6533 return -EINVAL; 6534 } 6535 6536 /* 6537 * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle 6538 * selection policy to be enabled. 6539 */ 6540 if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && 6541 (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { 6542 scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); 6543 return -EINVAL; 6544 } 6545 6546 if (ops->cpu_acquire || ops->cpu_release) 6547 pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); 6548 6549 return 0; 6550 } 6551 6552 /* 6553 * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid 6554 * starvation. During the READY -> ENABLED task switching loop, the calling 6555 * thread's sched_class gets switched from fair to ext. 
As fair has higher 6556 * priority than ext, the calling thread can be indefinitely starved under 6557 * fair-class saturation, leading to a system hang. 6558 */ 6559 struct scx_enable_cmd { 6560 struct kthread_work work; 6561 struct sched_ext_ops *ops; 6562 int ret; 6563 }; 6564 6565 static void scx_root_enable_workfn(struct kthread_work *work) 6566 { 6567 struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); 6568 struct sched_ext_ops *ops = cmd->ops; 6569 struct scx_sched *sch; 6570 struct scx_task_iter sti; 6571 struct task_struct *p; 6572 int i, cpu, ret; 6573 6574 mutex_lock(&scx_enable_mutex); 6575 6576 if (scx_enable_state() != SCX_DISABLED) { 6577 ret = -EBUSY; 6578 goto err_unlock; 6579 } 6580 6581 ret = alloc_kick_syncs(); 6582 if (ret) 6583 goto err_unlock; 6584 6585 sch = scx_alloc_and_add_sched(ops, root_cgroup(), NULL); 6586 if (IS_ERR(sch)) { 6587 ret = PTR_ERR(sch); 6588 goto err_free_ksyncs; 6589 } 6590 6591 /* 6592 * Transition to ENABLING and clear exit info to arm the disable path. 6593 * Failure triggers full disabling from here on. 6594 */ 6595 WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); 6596 WARN_ON_ONCE(scx_root); 6597 6598 atomic_long_set(&scx_nr_rejected, 0); 6599 6600 for_each_possible_cpu(cpu) { 6601 struct rq *rq = cpu_rq(cpu); 6602 6603 rq->scx.local_dsq.sched = sch; 6604 rq->scx.cpuperf_target = SCX_CPUPERF_ONE; 6605 } 6606 6607 /* 6608 * Keep CPUs stable during enable so that the BPF scheduler can track 6609 * online CPUs by watching ->on/offline_cpu() after ->init(). 6610 */ 6611 cpus_read_lock(); 6612 6613 /* 6614 * Make the scheduler instance visible. Must be inside cpus_read_lock(). 6615 * See handle_hotplug(). 
6616 */ 6617 rcu_assign_pointer(scx_root, sch); 6618 6619 ret = scx_link_sched(sch); 6620 if (ret) 6621 goto err_disable; 6622 6623 scx_idle_enable(ops); 6624 6625 if (sch->ops.init) { 6626 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL); 6627 if (ret) { 6628 ret = ops_sanitize_err(sch, "init", ret); 6629 cpus_read_unlock(); 6630 scx_error(sch, "ops.init() failed (%d)", ret); 6631 goto err_disable; 6632 } 6633 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 6634 } 6635 6636 for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) 6637 if (((void (**)(void))ops)[i]) 6638 set_bit(i, sch->has_op); 6639 6640 ret = check_hotplug_seq(sch, ops); 6641 if (ret) { 6642 cpus_read_unlock(); 6643 goto err_disable; 6644 } 6645 scx_idle_update_selcpu_topology(ops); 6646 6647 cpus_read_unlock(); 6648 6649 ret = validate_ops(sch, ops); 6650 if (ret) 6651 goto err_disable; 6652 6653 /* 6654 * Once __scx_enabled is set, %current can be switched to SCX anytime. 6655 * This can lead to stalls as some BPF schedulers (e.g. userspace 6656 * scheduling) may not function correctly before all tasks are switched. 6657 * Init in bypass mode to guarantee forward progress. 6658 */ 6659 scx_bypass(sch, true); 6660 6661 for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) 6662 if (((void (**)(void))ops)[i]) 6663 set_bit(i, sch->has_op); 6664 6665 if (sch->ops.cpu_acquire || sch->ops.cpu_release) 6666 sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT; 6667 6668 /* 6669 * Lock out forks, cgroup on/offlining and moves before opening the 6670 * floodgate so that they don't wander into the operations prematurely. 6671 */ 6672 percpu_down_write(&scx_fork_rwsem); 6673 6674 WARN_ON_ONCE(scx_init_task_enabled); 6675 scx_init_task_enabled = true; 6676 6677 /* 6678 * Enable ops for every task. Fork is excluded by scx_fork_rwsem 6679 * preventing new tasks from being added. No need to exclude tasks 6680 * leaving as sched_ext_free() can handle both prepped and enabled 6681 * tasks. 
Prep all tasks first and then enable them with preemption 6682 * disabled. 6683 * 6684 * All cgroups should be initialized before scx_init_task() so that the 6685 * BPF scheduler can reliably track each task's cgroup membership from 6686 * scx_init_task(). Lock out cgroup on/offlining and task migrations 6687 * while tasks are being initialized so that scx_cgroup_can_attach() 6688 * never sees uninitialized tasks. 6689 */ 6690 scx_cgroup_lock(); 6691 set_cgroup_sched(sch_cgroup(sch), sch); 6692 ret = scx_cgroup_init(sch); 6693 if (ret) 6694 goto err_disable_unlock_all; 6695 6696 scx_task_iter_start(&sti, NULL); 6697 while ((p = scx_task_iter_next_locked(&sti))) { 6698 /* 6699 * @p may already be dead, have lost all its usages counts and 6700 * be waiting for RCU grace period before being freed. @p can't 6701 * be initialized for SCX in such cases and should be ignored. 6702 */ 6703 if (!tryget_task_struct(p)) 6704 continue; 6705 6706 scx_task_iter_unlock(&sti); 6707 6708 ret = scx_init_task(sch, p, false); 6709 if (ret) { 6710 put_task_struct(p); 6711 scx_task_iter_stop(&sti); 6712 scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", 6713 ret, p->comm, p->pid); 6714 goto err_disable_unlock_all; 6715 } 6716 6717 scx_set_task_sched(p, sch); 6718 scx_set_task_state(p, SCX_TASK_READY); 6719 6720 put_task_struct(p); 6721 } 6722 scx_task_iter_stop(&sti); 6723 scx_cgroup_unlock(); 6724 percpu_up_write(&scx_fork_rwsem); 6725 6726 /* 6727 * All tasks are READY. It's safe to turn on scx_enabled() and switch 6728 * all eligible tasks. 6729 */ 6730 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 6731 static_branch_enable(&__scx_enabled); 6732 6733 /* 6734 * We're fully committed and can't fail. The task READY -> ENABLED 6735 * transitions here are synchronized against sched_ext_free() through 6736 * scx_tasks_lock. 
6737 */ 6738 percpu_down_write(&scx_fork_rwsem); 6739 scx_task_iter_start(&sti, NULL); 6740 while ((p = scx_task_iter_next_locked(&sti))) { 6741 unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 6742 const struct sched_class *old_class = p->sched_class; 6743 const struct sched_class *new_class = scx_setscheduler_class(p); 6744 6745 if (scx_get_task_state(p) != SCX_TASK_READY) 6746 continue; 6747 6748 if (old_class != new_class) 6749 queue_flags |= DEQUEUE_CLASS; 6750 6751 scoped_guard (sched_change, p, queue_flags) { 6752 p->scx.slice = READ_ONCE(sch->slice_dfl); 6753 p->sched_class = new_class; 6754 } 6755 } 6756 scx_task_iter_stop(&sti); 6757 percpu_up_write(&scx_fork_rwsem); 6758 6759 scx_bypass(sch, false); 6760 6761 if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { 6762 WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); 6763 goto err_disable; 6764 } 6765 6766 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) 6767 static_branch_enable(&__scx_switched_all); 6768 6769 pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", 6770 sch->ops.name, scx_switched_all() ? "" : " (partial)"); 6771 kobject_uevent(&sch->kobj, KOBJ_ADD); 6772 mutex_unlock(&scx_enable_mutex); 6773 6774 atomic_long_inc(&scx_enable_seq); 6775 6776 cmd->ret = 0; 6777 return; 6778 6779 err_free_ksyncs: 6780 free_kick_syncs(); 6781 err_unlock: 6782 mutex_unlock(&scx_enable_mutex); 6783 cmd->ret = ret; 6784 return; 6785 6786 err_disable_unlock_all: 6787 scx_cgroup_unlock(); 6788 percpu_up_write(&scx_fork_rwsem); 6789 /* we'll soon enter disable path, keep bypass on */ 6790 err_disable: 6791 mutex_unlock(&scx_enable_mutex); 6792 /* 6793 * Returning an error code here would not pass all the error information 6794 * to userspace. Record errno using scx_error() for cases scx_error() 6795 * wasn't already invoked and exit indicating success so that the error 6796 * is notified through ops.exit() with all the details. 
6797 * 6798 * Flush scx_disable_work to ensure that error is reported before init 6799 * completion. sch's base reference will be put by bpf_scx_unreg(). 6800 */ 6801 scx_error(sch, "scx_root_enable() failed (%d)", ret); 6802 kthread_flush_work(&sch->disable_work); 6803 cmd->ret = 0; 6804 } 6805 6806 #ifdef CONFIG_EXT_SUB_SCHED 6807 /* verify that a scheduler can be attached to @cgrp and return the parent */ 6808 static struct scx_sched *find_parent_sched(struct cgroup *cgrp) 6809 { 6810 struct scx_sched *parent = cgrp->scx_sched; 6811 struct scx_sched *pos; 6812 6813 lockdep_assert_held(&scx_sched_lock); 6814 6815 /* can't attach twice to the same cgroup */ 6816 if (parent->cgrp == cgrp) 6817 return ERR_PTR(-EBUSY); 6818 6819 /* does $parent allow sub-scheds? */ 6820 if (!parent->ops.sub_attach) 6821 return ERR_PTR(-EOPNOTSUPP); 6822 6823 /* can't insert between $parent and its exiting children */ 6824 list_for_each_entry(pos, &parent->children, sibling) 6825 if (cgroup_is_descendant(pos->cgrp, cgrp)) 6826 return ERR_PTR(-EBUSY); 6827 6828 return parent; 6829 } 6830 6831 static bool assert_task_ready_or_enabled(struct task_struct *p) 6832 { 6833 u32 state = scx_get_task_state(p); 6834 6835 switch (state) { 6836 case SCX_TASK_READY: 6837 case SCX_TASK_ENABLED: 6838 return true; 6839 default: 6840 WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched", 6841 state, p->comm, p->pid); 6842 return false; 6843 } 6844 } 6845 6846 static void scx_sub_enable_workfn(struct kthread_work *work) 6847 { 6848 struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); 6849 struct sched_ext_ops *ops = cmd->ops; 6850 struct cgroup *cgrp; 6851 struct scx_sched *parent, *sch; 6852 struct scx_task_iter sti; 6853 struct task_struct *p; 6854 s32 i, ret; 6855 6856 mutex_lock(&scx_enable_mutex); 6857 6858 if (!scx_enabled()) { 6859 ret = -ENODEV; 6860 goto out_unlock; 6861 } 6862 6863 cgrp = cgroup_get_from_id(ops->sub_cgroup_id); 6864 if 
(IS_ERR(cgrp)) { 6865 ret = PTR_ERR(cgrp); 6866 goto out_unlock; 6867 } 6868 6869 raw_spin_lock_irq(&scx_sched_lock); 6870 parent = find_parent_sched(cgrp); 6871 if (IS_ERR(parent)) { 6872 raw_spin_unlock_irq(&scx_sched_lock); 6873 ret = PTR_ERR(parent); 6874 goto out_put_cgrp; 6875 } 6876 kobject_get(&parent->kobj); 6877 raw_spin_unlock_irq(&scx_sched_lock); 6878 6879 sch = scx_alloc_and_add_sched(ops, cgrp, parent); 6880 kobject_put(&parent->kobj); 6881 if (IS_ERR(sch)) { 6882 ret = PTR_ERR(sch); 6883 goto out_put_cgrp; 6884 } 6885 6886 ret = scx_link_sched(sch); 6887 if (ret) 6888 goto err_disable; 6889 6890 if (sch->level >= SCX_SUB_MAX_DEPTH) { 6891 scx_error(sch, "max nesting depth %d violated", 6892 SCX_SUB_MAX_DEPTH); 6893 goto err_disable; 6894 } 6895 6896 if (sch->ops.init) { 6897 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL); 6898 if (ret) { 6899 ret = ops_sanitize_err(sch, "init", ret); 6900 scx_error(sch, "ops.init() failed (%d)", ret); 6901 goto err_disable; 6902 } 6903 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 6904 } 6905 6906 if (validate_ops(sch, ops)) 6907 goto err_disable; 6908 6909 struct scx_sub_attach_args sub_attach_args = { 6910 .ops = &sch->ops, 6911 .cgroup_path = sch->cgrp_path, 6912 }; 6913 6914 ret = SCX_CALL_OP_RET(parent, SCX_KF_UNLOCKED, sub_attach, NULL, 6915 &sub_attach_args); 6916 if (ret) { 6917 ret = ops_sanitize_err(sch, "sub_attach", ret); 6918 scx_error(sch, "parent rejected (%d)", ret); 6919 goto err_disable; 6920 } 6921 sch->sub_attached = true; 6922 6923 scx_bypass(sch, true); 6924 6925 for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) 6926 if (((void (**)(void))ops)[i]) 6927 set_bit(i, sch->has_op); 6928 6929 percpu_down_write(&scx_fork_rwsem); 6930 scx_cgroup_lock(); 6931 6932 /* 6933 * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see 6934 * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down. 
6935 */ 6936 set_cgroup_sched(sch_cgroup(sch), sch); 6937 if (!(cgrp->self.flags & CSS_ONLINE)) { 6938 scx_error(sch, "cgroup is not online"); 6939 goto err_unlock_and_disable; 6940 } 6941 6942 /* 6943 * Initialize tasks for the new child $sch without exiting them for 6944 * $parent so that the tasks can always be reverted back to $parent 6945 * sched on child init failure. 6946 */ 6947 WARN_ON_ONCE(scx_enabling_sub_sched); 6948 scx_enabling_sub_sched = sch; 6949 6950 scx_task_iter_start(&sti, sch->cgrp); 6951 while ((p = scx_task_iter_next_locked(&sti))) { 6952 struct rq *rq; 6953 struct rq_flags rf; 6954 6955 /* 6956 * Task iteration may visit the same task twice when racing 6957 * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which 6958 * finished __scx_init_task() and skip if set. 6959 * 6960 * A task may exit and get freed between __scx_init_task() 6961 * completion and scx_enable_task(). In such cases, 6962 * scx_disable_and_exit_task() must exit the task for both the 6963 * parent and child scheds. 6964 */ 6965 if (p->scx.flags & SCX_TASK_SUB_INIT) 6966 continue; 6967 6968 /* see scx_root_enable() */ 6969 if (!tryget_task_struct(p)) 6970 continue; 6971 6972 if (!assert_task_ready_or_enabled(p)) { 6973 ret = -EINVAL; 6974 goto abort; 6975 } 6976 6977 scx_task_iter_unlock(&sti); 6978 6979 /* 6980 * As $p is still on $parent, it can't be transitioned to INIT. 6981 * Let's worry about task state later. Use __scx_init_task(). 6982 */ 6983 ret = __scx_init_task(sch, p, false); 6984 if (ret) 6985 goto abort; 6986 6987 rq = task_rq_lock(p, &rf); 6988 p->scx.flags |= SCX_TASK_SUB_INIT; 6989 task_rq_unlock(rq, p, &rf); 6990 6991 put_task_struct(p); 6992 } 6993 scx_task_iter_stop(&sti); 6994 6995 /* 6996 * All tasks are prepped. Disable/exit tasks for $parent and enable for 6997 * the new @sch. 
6998 */ 6999 scx_task_iter_start(&sti, sch->cgrp); 7000 while ((p = scx_task_iter_next_locked(&sti))) { 7001 /* 7002 * Use clearing of %SCX_TASK_SUB_INIT to detect and skip 7003 * duplicate iterations. 7004 */ 7005 if (!(p->scx.flags & SCX_TASK_SUB_INIT)) 7006 continue; 7007 7008 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 7009 /* 7010 * $p must be either READY or ENABLED. If ENABLED, 7011 * __scx_disabled_and_exit_task() first disables and 7012 * makes it READY. However, after exiting $p, it will 7013 * leave $p as READY. 7014 */ 7015 assert_task_ready_or_enabled(p); 7016 __scx_disable_and_exit_task(parent, p); 7017 7018 /* 7019 * $p is now only initialized for @sch and READY, which 7020 * is what we want. Assign it to @sch and enable. 7021 */ 7022 rcu_assign_pointer(p->scx.sched, sch); 7023 scx_enable_task(sch, p); 7024 7025 p->scx.flags &= ~SCX_TASK_SUB_INIT; 7026 } 7027 } 7028 scx_task_iter_stop(&sti); 7029 7030 scx_enabling_sub_sched = NULL; 7031 7032 scx_cgroup_unlock(); 7033 percpu_up_write(&scx_fork_rwsem); 7034 7035 scx_bypass(sch, false); 7036 7037 pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); 7038 kobject_uevent(&sch->kobj, KOBJ_ADD); 7039 ret = 0; 7040 goto out_unlock; 7041 7042 abort: 7043 put_task_struct(p); 7044 scx_task_iter_stop(&sti); 7045 scx_enabling_sub_sched = NULL; 7046 7047 scx_task_iter_start(&sti, sch->cgrp); 7048 while ((p = scx_task_iter_next_locked(&sti))) { 7049 if (p->scx.flags & SCX_TASK_SUB_INIT) { 7050 __scx_disable_and_exit_task(sch, p); 7051 p->scx.flags &= ~SCX_TASK_SUB_INIT; 7052 } 7053 } 7054 scx_task_iter_stop(&sti); 7055 scx_cgroup_unlock(); 7056 percpu_up_write(&scx_fork_rwsem); 7057 out_put_cgrp: 7058 cgroup_put(cgrp); 7059 out_unlock: 7060 mutex_unlock(&scx_enable_mutex); 7061 cmd->ret = ret; 7062 return; 7063 7064 err_unlock_and_disable: 7065 /* we'll soon enter disable path, keep bypass on */ 7066 scx_cgroup_unlock(); 7067 percpu_up_write(&scx_fork_rwsem); 7068 err_disable: 
7069 mutex_unlock(&scx_enable_mutex); 7070 kthread_flush_work(&sch->disable_work); 7071 cmd->ret = 0; 7072 } 7073 7074 static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb, 7075 unsigned long action, void *data) 7076 { 7077 struct cgroup *cgrp = data; 7078 struct cgroup *parent = cgroup_parent(cgrp); 7079 7080 if (!cgroup_on_dfl(cgrp)) 7081 return NOTIFY_OK; 7082 7083 switch (action) { 7084 case CGROUP_LIFETIME_ONLINE: 7085 /* inherit ->scx_sched from $parent */ 7086 if (parent) 7087 rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched); 7088 break; 7089 case CGROUP_LIFETIME_OFFLINE: 7090 /* if there is a sched attached, shoot it down */ 7091 if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp) 7092 scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN, 7093 SCX_ECODE_RSN_CGROUP_OFFLINE, 7094 "cgroup %llu going offline", cgroup_id(cgrp)); 7095 break; 7096 } 7097 7098 return NOTIFY_OK; 7099 } 7100 7101 static struct notifier_block scx_cgroup_lifetime_nb = { 7102 .notifier_call = scx_cgroup_lifetime_notify, 7103 }; 7104 7105 static s32 __init scx_cgroup_lifetime_notifier_init(void) 7106 { 7107 return blocking_notifier_chain_register(&cgroup_lifetime_notifier, 7108 &scx_cgroup_lifetime_nb); 7109 } 7110 core_initcall(scx_cgroup_lifetime_notifier_init); 7111 #endif /* CONFIG_EXT_SUB_SCHED */ 7112 7113 static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) 7114 { 7115 static struct kthread_worker *helper; 7116 static DEFINE_MUTEX(helper_mutex); 7117 struct scx_enable_cmd cmd; 7118 7119 if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), 7120 cpu_possible_mask)) { 7121 pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); 7122 return -EINVAL; 7123 } 7124 7125 if (!READ_ONCE(helper)) { 7126 mutex_lock(&helper_mutex); 7127 if (!helper) { 7128 struct kthread_worker *w = 7129 kthread_run_worker(0, "scx_enable_helper"); 7130 if (IS_ERR_OR_NULL(w)) { 7131 mutex_unlock(&helper_mutex); 7132 return -ENOMEM; 7133 } 7134 
sched_set_fifo(w->task); 7135 WRITE_ONCE(helper, w); 7136 } 7137 mutex_unlock(&helper_mutex); 7138 } 7139 7140 #ifdef CONFIG_EXT_SUB_SCHED 7141 if (ops->sub_cgroup_id > 1) 7142 kthread_init_work(&cmd.work, scx_sub_enable_workfn); 7143 else 7144 #endif /* CONFIG_EXT_SUB_SCHED */ 7145 kthread_init_work(&cmd.work, scx_root_enable_workfn); 7146 cmd.ops = ops; 7147 7148 kthread_queue_work(READ_ONCE(helper), &cmd.work); 7149 kthread_flush_work(&cmd.work); 7150 return cmd.ret; 7151 } 7152 7153 7154 /******************************************************************************** 7155 * bpf_struct_ops plumbing. 7156 */ 7157 #include <linux/bpf_verifier.h> 7158 #include <linux/bpf.h> 7159 #include <linux/btf.h> 7160 7161 static const struct btf_type *task_struct_type; 7162 7163 static bool bpf_scx_is_valid_access(int off, int size, 7164 enum bpf_access_type type, 7165 const struct bpf_prog *prog, 7166 struct bpf_insn_access_aux *info) 7167 { 7168 if (type != BPF_READ) 7169 return false; 7170 if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) 7171 return false; 7172 if (off % size != 0) 7173 return false; 7174 7175 return btf_ctx_access(off, size, type, prog, info); 7176 } 7177 7178 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, 7179 const struct bpf_reg_state *reg, int off, 7180 int size) 7181 { 7182 const struct btf_type *t; 7183 7184 t = btf_type_by_id(reg->btf, reg->btf_id); 7185 if (t == task_struct_type) { 7186 /* 7187 * COMPAT: Will be removed in v6.23. 
7188 */ 7189 if ((off >= offsetof(struct task_struct, scx.slice) && 7190 off + size <= offsetofend(struct task_struct, scx.slice)) || 7191 (off >= offsetof(struct task_struct, scx.dsq_vtime) && 7192 off + size <= offsetofend(struct task_struct, scx.dsq_vtime))) { 7193 pr_warn("sched_ext: Writing directly to p->scx.slice/dsq_vtime is deprecated, use scx_bpf_task_set_slice/dsq_vtime()"); 7194 return SCALAR_VALUE; 7195 } 7196 7197 if (off >= offsetof(struct task_struct, scx.disallow) && 7198 off + size <= offsetofend(struct task_struct, scx.disallow)) 7199 return SCALAR_VALUE; 7200 } 7201 7202 return -EACCES; 7203 } 7204 7205 static const struct bpf_verifier_ops bpf_scx_verifier_ops = { 7206 .get_func_proto = bpf_base_func_proto, 7207 .is_valid_access = bpf_scx_is_valid_access, 7208 .btf_struct_access = bpf_scx_btf_struct_access, 7209 }; 7210 7211 static int bpf_scx_init_member(const struct btf_type *t, 7212 const struct btf_member *member, 7213 void *kdata, const void *udata) 7214 { 7215 const struct sched_ext_ops *uops = udata; 7216 struct sched_ext_ops *ops = kdata; 7217 u32 moff = __btf_member_bit_offset(t, member) / 8; 7218 int ret; 7219 7220 switch (moff) { 7221 case offsetof(struct sched_ext_ops, dispatch_max_batch): 7222 if (*(u32 *)(udata + moff) > INT_MAX) 7223 return -E2BIG; 7224 ops->dispatch_max_batch = *(u32 *)(udata + moff); 7225 return 1; 7226 case offsetof(struct sched_ext_ops, flags): 7227 if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) 7228 return -EINVAL; 7229 ops->flags = *(u64 *)(udata + moff); 7230 return 1; 7231 case offsetof(struct sched_ext_ops, name): 7232 ret = bpf_obj_name_cpy(ops->name, uops->name, 7233 sizeof(ops->name)); 7234 if (ret < 0) 7235 return ret; 7236 if (ret == 0) 7237 return -EINVAL; 7238 return 1; 7239 case offsetof(struct sched_ext_ops, timeout_ms): 7240 if (msecs_to_jiffies(*(u32 *)(udata + moff)) > 7241 SCX_WATCHDOG_MAX_TIMEOUT) 7242 return -E2BIG; 7243 ops->timeout_ms = *(u32 *)(udata + moff); 7244 return 1; 7245 case 
offsetof(struct sched_ext_ops, exit_dump_len): 7246 ops->exit_dump_len = 7247 *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; 7248 return 1; 7249 case offsetof(struct sched_ext_ops, hotplug_seq): 7250 ops->hotplug_seq = *(u64 *)(udata + moff); 7251 return 1; 7252 #ifdef CONFIG_EXT_SUB_SCHED 7253 case offsetof(struct sched_ext_ops, sub_cgroup_id): 7254 ops->sub_cgroup_id = *(u64 *)(udata + moff); 7255 return 1; 7256 #endif /* CONFIG_EXT_SUB_SCHED */ 7257 } 7258 7259 return 0; 7260 } 7261 7262 #ifdef CONFIG_EXT_SUB_SCHED 7263 static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog) 7264 { 7265 struct scx_sched *sch; 7266 7267 guard(rcu)(); 7268 sch = scx_prog_sched(prog->aux); 7269 if (unlikely(!sch)) 7270 return; 7271 7272 scx_error(sch, "dispatch recursion detected"); 7273 } 7274 #endif /* CONFIG_EXT_SUB_SCHED */ 7275 7276 static int bpf_scx_check_member(const struct btf_type *t, 7277 const struct btf_member *member, 7278 const struct bpf_prog *prog) 7279 { 7280 u32 moff = __btf_member_bit_offset(t, member) / 8; 7281 7282 switch (moff) { 7283 case offsetof(struct sched_ext_ops, init_task): 7284 #ifdef CONFIG_EXT_GROUP_SCHED 7285 case offsetof(struct sched_ext_ops, cgroup_init): 7286 case offsetof(struct sched_ext_ops, cgroup_exit): 7287 case offsetof(struct sched_ext_ops, cgroup_prep_move): 7288 #endif 7289 case offsetof(struct sched_ext_ops, cpu_online): 7290 case offsetof(struct sched_ext_ops, cpu_offline): 7291 case offsetof(struct sched_ext_ops, init): 7292 case offsetof(struct sched_ext_ops, exit): 7293 case offsetof(struct sched_ext_ops, sub_attach): 7294 case offsetof(struct sched_ext_ops, sub_detach): 7295 break; 7296 default: 7297 if (prog->sleepable) 7298 return -EINVAL; 7299 } 7300 7301 #ifdef CONFIG_EXT_SUB_SCHED 7302 /* 7303 * Enable private stack for operations that can nest along the 7304 * hierarchy. 
7305 * 7306 * XXX - Ideally, we should only do this for scheds that allow 7307 * sub-scheds and sub-scheds themselves but I don't know how to access 7308 * struct_ops from here. 7309 */ 7310 switch (moff) { 7311 case offsetof(struct sched_ext_ops, dispatch): 7312 prog->aux->priv_stack_requested = true; 7313 prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch; 7314 } 7315 #endif /* CONFIG_EXT_SUB_SCHED */ 7316 7317 return 0; 7318 } 7319 7320 static int bpf_scx_reg(void *kdata, struct bpf_link *link) 7321 { 7322 return scx_enable(kdata, link); 7323 } 7324 7325 static void bpf_scx_unreg(void *kdata, struct bpf_link *link) 7326 { 7327 struct sched_ext_ops *ops = kdata; 7328 struct scx_sched *sch = rcu_dereference_protected(ops->priv, true); 7329 7330 scx_disable(sch, SCX_EXIT_UNREG); 7331 kthread_flush_work(&sch->disable_work); 7332 RCU_INIT_POINTER(ops->priv, NULL); 7333 kobject_put(&sch->kobj); 7334 } 7335 7336 static int bpf_scx_init(struct btf *btf) 7337 { 7338 task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]); 7339 7340 return 0; 7341 } 7342 7343 static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) 7344 { 7345 /* 7346 * sched_ext does not support updating the actively-loaded BPF 7347 * scheduler, as registering a BPF scheduler can always fail if the 7348 * scheduler returns an error code for e.g. ops.init(), ops.init_task(), 7349 * etc. Similarly, we can always race with unregistration happening 7350 * elsewhere, such as with sysrq. 
7351 */ 7352 return -EOPNOTSUPP; 7353 } 7354 7355 static int bpf_scx_validate(void *kdata) 7356 { 7357 return 0; 7358 } 7359 7360 static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } 7361 static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {} 7362 static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {} 7363 static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {} 7364 static void sched_ext_ops__tick(struct task_struct *p) {} 7365 static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {} 7366 static void sched_ext_ops__running(struct task_struct *p) {} 7367 static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {} 7368 static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {} 7369 static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; } 7370 static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; } 7371 static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {} 7372 static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {} 7373 static void sched_ext_ops__update_idle(s32 cpu, bool idle) {} 7374 static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {} 7375 static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {} 7376 static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } 7377 static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {} 7378 static void sched_ext_ops__enable(struct task_struct *p) {} 7379 static void sched_ext_ops__disable(struct task_struct *p) {} 7380 #ifdef CONFIG_EXT_GROUP_SCHED 7381 static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct 
scx_cgroup_init_args *args) { return -EINVAL; } 7382 static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {} 7383 static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } 7384 static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 7385 static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 7386 static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} 7387 static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} 7388 static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} 7389 #endif /* CONFIG_EXT_GROUP_SCHED */ 7390 static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; } 7391 static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {} 7392 static void sched_ext_ops__cpu_online(s32 cpu) {} 7393 static void sched_ext_ops__cpu_offline(s32 cpu) {} 7394 static s32 sched_ext_ops__init(void) { return -EINVAL; } 7395 static void sched_ext_ops__exit(struct scx_exit_info *info) {} 7396 static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {} 7397 static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} 7398 static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {} 7399 7400 static struct sched_ext_ops __bpf_ops_sched_ext_ops = { 7401 .select_cpu = sched_ext_ops__select_cpu, 7402 .enqueue = sched_ext_ops__enqueue, 7403 .dequeue = sched_ext_ops__dequeue, 7404 .dispatch = sched_ext_ops__dispatch, 7405 .tick = sched_ext_ops__tick, 7406 .runnable = sched_ext_ops__runnable, 7407 .running = sched_ext_ops__running, 7408 .stopping = sched_ext_ops__stopping, 7409 .quiescent = sched_ext_ops__quiescent, 7410 .yield = sched_ext_ops__yield, 7411 .core_sched_before = 
sched_ext_ops__core_sched_before, 7412 .set_weight = sched_ext_ops__set_weight, 7413 .set_cpumask = sched_ext_ops__set_cpumask, 7414 .update_idle = sched_ext_ops__update_idle, 7415 .cpu_acquire = sched_ext_ops__cpu_acquire, 7416 .cpu_release = sched_ext_ops__cpu_release, 7417 .init_task = sched_ext_ops__init_task, 7418 .exit_task = sched_ext_ops__exit_task, 7419 .enable = sched_ext_ops__enable, 7420 .disable = sched_ext_ops__disable, 7421 #ifdef CONFIG_EXT_GROUP_SCHED 7422 .cgroup_init = sched_ext_ops__cgroup_init, 7423 .cgroup_exit = sched_ext_ops__cgroup_exit, 7424 .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, 7425 .cgroup_move = sched_ext_ops__cgroup_move, 7426 .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, 7427 .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, 7428 .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, 7429 .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, 7430 #endif 7431 .sub_attach = sched_ext_ops__sub_attach, 7432 .sub_detach = sched_ext_ops__sub_detach, 7433 .cpu_online = sched_ext_ops__cpu_online, 7434 .cpu_offline = sched_ext_ops__cpu_offline, 7435 .init = sched_ext_ops__init, 7436 .exit = sched_ext_ops__exit, 7437 .dump = sched_ext_ops__dump, 7438 .dump_cpu = sched_ext_ops__dump_cpu, 7439 .dump_task = sched_ext_ops__dump_task, 7440 }; 7441 7442 static struct bpf_struct_ops bpf_sched_ext_ops = { 7443 .verifier_ops = &bpf_scx_verifier_ops, 7444 .reg = bpf_scx_reg, 7445 .unreg = bpf_scx_unreg, 7446 .check_member = bpf_scx_check_member, 7447 .init_member = bpf_scx_init_member, 7448 .init = bpf_scx_init, 7449 .update = bpf_scx_update, 7450 .validate = bpf_scx_validate, 7451 .name = "sched_ext_ops", 7452 .owner = THIS_MODULE, 7453 .cfi_stubs = &__bpf_ops_sched_ext_ops 7454 }; 7455 7456 7457 /******************************************************************************** 7458 * System integration and init. 
7459 */ 7460 7461 static void sysrq_handle_sched_ext_reset(u8 key) 7462 { 7463 struct scx_sched *sch; 7464 7465 rcu_read_lock(); 7466 sch = rcu_dereference(scx_root); 7467 if (likely(sch)) 7468 scx_disable(sch, SCX_EXIT_SYSRQ); 7469 else 7470 pr_info("sched_ext: BPF schedulers not loaded\n"); 7471 rcu_read_unlock(); 7472 } 7473 7474 static const struct sysrq_key_op sysrq_sched_ext_reset_op = { 7475 .handler = sysrq_handle_sched_ext_reset, 7476 .help_msg = "reset-sched-ext(S)", 7477 .action_msg = "Disable sched_ext and revert all tasks to CFS", 7478 .enable_mask = SYSRQ_ENABLE_RTNICE, 7479 }; 7480 7481 static void sysrq_handle_sched_ext_dump(u8 key) 7482 { 7483 struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; 7484 struct scx_sched *sch; 7485 7486 list_for_each_entry_rcu(sch, &scx_sched_all, all) 7487 scx_dump_state(sch, &ei, 0, false); 7488 } 7489 7490 static const struct sysrq_key_op sysrq_sched_ext_dump_op = { 7491 .handler = sysrq_handle_sched_ext_dump, 7492 .help_msg = "dump-sched-ext(D)", 7493 .action_msg = "Trigger sched_ext debug dump", 7494 .enable_mask = SYSRQ_ENABLE_RTNICE, 7495 }; 7496 7497 static bool can_skip_idle_kick(struct rq *rq) 7498 { 7499 lockdep_assert_rq_held(rq); 7500 7501 /* 7502 * We can skip idle kicking if @rq is going to go through at least one 7503 * full SCX scheduling cycle before going idle. Just checking whether 7504 * curr is not idle is insufficient because we could be racing 7505 * balance_one() trying to pull the next task from a remote rq, which 7506 * may fail, and @rq may become idle afterwards. 7507 * 7508 * The race window is small and we don't and can't guarantee that @rq is 7509 * only kicked while idle anyway. Skip only when sure. 
7510 */ 7511 return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); 7512 } 7513 7514 static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) 7515 { 7516 struct rq *rq = cpu_rq(cpu); 7517 struct scx_rq *this_scx = &this_rq->scx; 7518 const struct sched_class *cur_class; 7519 bool should_wait = false; 7520 unsigned long flags; 7521 7522 raw_spin_rq_lock_irqsave(rq, flags); 7523 cur_class = rq->curr->sched_class; 7524 7525 /* 7526 * During CPU hotplug, a CPU may depend on kicking itself to make 7527 * forward progress. Allow kicking self regardless of online state. If 7528 * @cpu is running a higher class task, we have no control over @cpu. 7529 * Skip kicking. 7530 */ 7531 if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) && 7532 !sched_class_above(cur_class, &ext_sched_class)) { 7533 if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { 7534 if (cur_class == &ext_sched_class) 7535 rq->curr->scx.slice = 0; 7536 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 7537 } 7538 7539 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { 7540 if (cur_class == &ext_sched_class) { 7541 ksyncs[cpu] = rq->scx.kick_sync; 7542 should_wait = true; 7543 } else { 7544 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 7545 } 7546 } 7547 7548 resched_curr(rq); 7549 } else { 7550 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 7551 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 7552 } 7553 7554 raw_spin_rq_unlock_irqrestore(rq, flags); 7555 7556 return should_wait; 7557 } 7558 7559 static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) 7560 { 7561 struct rq *rq = cpu_rq(cpu); 7562 unsigned long flags; 7563 7564 raw_spin_rq_lock_irqsave(rq, flags); 7565 7566 if (!can_skip_idle_kick(rq) && 7567 (cpu_online(cpu) || cpu == cpu_of(this_rq))) 7568 resched_curr(rq); 7569 7570 raw_spin_rq_unlock_irqrestore(rq, flags); 7571 } 7572 7573 static void kick_cpus_irq_workfn(struct irq_work *irq_work) 7574 { 7575 struct rq *this_rq = this_rq(); 7576 struct 
scx_rq *this_scx = &this_rq->scx; 7577 struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs); 7578 bool should_wait = false; 7579 unsigned long *ksyncs; 7580 s32 cpu; 7581 7582 if (unlikely(!ksyncs_pcpu)) { 7583 pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs"); 7584 return; 7585 } 7586 7587 ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs; 7588 7589 for_each_cpu(cpu, this_scx->cpus_to_kick) { 7590 should_wait |= kick_one_cpu(cpu, this_rq, ksyncs); 7591 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); 7592 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 7593 } 7594 7595 for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { 7596 kick_one_cpu_if_idle(cpu, this_rq); 7597 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 7598 } 7599 7600 if (!should_wait) 7601 return; 7602 7603 for_each_cpu(cpu, this_scx->cpus_to_wait) { 7604 unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; 7605 7606 /* 7607 * Busy-wait until the task running at the time of kicking is no 7608 * longer running. This can be used to implement e.g. core 7609 * scheduling. 7610 * 7611 * smp_cond_load_acquire() pairs with store_releases in 7612 * pick_task_scx() and put_prev_task_scx(). The former breaks 7613 * the wait if SCX's scheduling path is entered even if the same 7614 * task is picked subsequently. The latter is necessary to break 7615 * the wait when $cpu is taken by a higher sched class. 7616 */ 7617 if (cpu != cpu_of(this_rq)) 7618 smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); 7619 7620 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 7621 } 7622 } 7623 7624 /** 7625 * print_scx_info - print out sched_ext scheduler state 7626 * @log_lvl: the log level to use when printing 7627 * @p: target task 7628 * 7629 * If a sched_ext scheduler is enabled, print the name and state of the 7630 * scheduler. If @p is on sched_ext, print further information about the task. 
7631 * 7632 * This function can be safely called on any task as long as the task_struct 7633 * itself is accessible. While safe, this function isn't synchronized and may 7634 * print out mixups or garbages of limited length. 7635 */ 7636 void print_scx_info(const char *log_lvl, struct task_struct *p) 7637 { 7638 struct scx_sched *sch = scx_root; 7639 enum scx_enable_state state = scx_enable_state(); 7640 const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; 7641 char runnable_at_buf[22] = "?"; 7642 struct sched_class *class; 7643 unsigned long runnable_at; 7644 7645 if (state == SCX_DISABLED) 7646 return; 7647 7648 /* 7649 * Carefully check if the task was running on sched_ext, and then 7650 * carefully copy the time it's been runnable, and its state. 7651 */ 7652 if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || 7653 class != &ext_sched_class) { 7654 printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name, 7655 scx_enable_state_str[state], all); 7656 return; 7657 } 7658 7659 if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, 7660 sizeof(runnable_at))) 7661 scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", 7662 jiffies_delta_msecs(runnable_at, jiffies)); 7663 7664 /* print everything onto one line to conserve console space */ 7665 printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", 7666 log_lvl, sch->ops.name, scx_enable_state_str[state], all, 7667 runnable_at_buf); 7668 } 7669 7670 static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) 7671 { 7672 struct scx_sched *sch; 7673 7674 guard(rcu)(); 7675 7676 sch = rcu_dereference(scx_root); 7677 if (!sch) 7678 return NOTIFY_OK; 7679 7680 /* 7681 * SCX schedulers often have userspace components which are sometimes 7682 * involved in critial scheduling paths. PM operations involve freezing 7683 * userspace which can lead to scheduling misbehaviors including stalls. 7684 * Let's bypass while PM operations are in progress. 
7685 */ 7686 switch (event) { 7687 case PM_HIBERNATION_PREPARE: 7688 case PM_SUSPEND_PREPARE: 7689 case PM_RESTORE_PREPARE: 7690 scx_bypass(sch, true); 7691 break; 7692 case PM_POST_HIBERNATION: 7693 case PM_POST_SUSPEND: 7694 case PM_POST_RESTORE: 7695 scx_bypass(sch, false); 7696 break; 7697 } 7698 7699 return NOTIFY_OK; 7700 } 7701 7702 static struct notifier_block scx_pm_notifier = { 7703 .notifier_call = scx_pm_handler, 7704 }; 7705 7706 void __init init_sched_ext_class(void) 7707 { 7708 s32 cpu, v; 7709 7710 /* 7711 * The following is to prevent the compiler from optimizing out the enum 7712 * definitions so that BPF scheduler implementations can use them 7713 * through the generated vmlinux.h. 7714 */ 7715 WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | 7716 SCX_TG_ONLINE); 7717 7718 scx_idle_init_masks(); 7719 7720 for_each_possible_cpu(cpu) { 7721 struct rq *rq = cpu_rq(cpu); 7722 int n = cpu_to_node(cpu); 7723 7724 /* local_dsq's sch will be set during scx_root_enable() */ 7725 BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL)); 7726 7727 INIT_LIST_HEAD(&rq->scx.runnable_list); 7728 INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); 7729 7730 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); 7731 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); 7732 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); 7733 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); 7734 raw_spin_lock_init(&rq->scx.deferred_reenq_lock); 7735 INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals); 7736 INIT_LIST_HEAD(&rq->scx.deferred_reenq_users); 7737 rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); 7738 rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); 7739 7740 if (cpu_online(cpu)) 7741 cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; 7742 } 7743 7744 register_sysrq_key('S', &sysrq_sched_ext_reset_op); 7745 register_sysrq_key('D', 
&sysrq_sched_ext_dump_op); 7746 INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); 7747 7748 #ifdef CONFIG_EXT_SUB_SCHED 7749 BUG_ON(rhashtable_init(&scx_sched_hash, &scx_sched_hash_params)); 7750 #endif /* CONFIG_EXT_SUB_SCHED */ 7751 } 7752 7753 7754 /******************************************************************************** 7755 * Helpers that can be called from the BPF scheduler. 7756 */ 7757 static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 enq_flags) 7758 { 7759 if ((enq_flags & SCX_ENQ_IMMED) && 7760 unlikely(dsq_id != SCX_DSQ_LOCAL && 7761 (dsq_id & SCX_DSQ_LOCAL_ON) != SCX_DSQ_LOCAL_ON)) { 7762 scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id); 7763 return false; 7764 } 7765 7766 return true; 7767 } 7768 7769 static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, 7770 u64 dsq_id, u64 enq_flags) 7771 { 7772 if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) 7773 return false; 7774 7775 lockdep_assert_irqs_disabled(); 7776 7777 if (unlikely(!p)) { 7778 scx_error(sch, "called with NULL task"); 7779 return false; 7780 } 7781 7782 if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { 7783 scx_error(sch, "invalid enq_flags 0x%llx", enq_flags); 7784 return false; 7785 } 7786 7787 /* see SCX_EV_INSERT_NOT_OWNED definition */ 7788 if (unlikely(!scx_task_on_sched(sch, p))) { 7789 __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); 7790 return false; 7791 } 7792 7793 if (!scx_vet_enq_flags(sch, dsq_id, enq_flags)) 7794 return false; 7795 7796 return true; 7797 } 7798 7799 static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, 7800 u64 dsq_id, u64 enq_flags) 7801 { 7802 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 7803 struct task_struct *ddsp_task; 7804 7805 ddsp_task = __this_cpu_read(direct_dispatch_task); 7806 if (ddsp_task) { 7807 mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); 7808 return; 7809 } 7810 7811 if 
(unlikely(dspc->cursor >= sch->dsp_max_batch)) { 7812 scx_error(sch, "dispatch buffer overflow"); 7813 return; 7814 } 7815 7816 dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ 7817 .task = p, 7818 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, 7819 .dsq_id = dsq_id, 7820 .enq_flags = enq_flags, 7821 }; 7822 } 7823 7824 __bpf_kfunc_start_defs(); 7825 7826 /** 7827 * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ 7828 * @p: task_struct to insert 7829 * @dsq_id: DSQ to insert into 7830 * @slice: duration @p can run for in nsecs, 0 to keep the current value 7831 * @enq_flags: SCX_ENQ_* 7832 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 7833 * 7834 * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to 7835 * call this function spuriously. Can be called from ops.enqueue(), 7836 * ops.select_cpu(), and ops.dispatch(). 7837 * 7838 * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch 7839 * and @p must match the task being enqueued. 7840 * 7841 * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p 7842 * will be directly inserted into the corresponding dispatch queue after 7843 * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be 7844 * inserted into the local DSQ of the CPU returned by ops.select_cpu(). 7845 * @enq_flags are OR'd with the enqueue flags on the enqueue path before the 7846 * task is inserted. 7847 * 7848 * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id 7849 * and this function can be called upto ops.dispatch_max_batch times to insert 7850 * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the 7851 * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the 7852 * counter. 
7853 * 7854 * This function doesn't have any locking restrictions and may be called under 7855 * BPF locks (in the future when BPF introduces more flexible locking). 7856 * 7857 * @p is allowed to run for @slice. The scheduling path is triggered on slice 7858 * exhaustion. If zero, the current residual slice is maintained. If 7859 * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with 7860 * scx_bpf_kick_cpu() to trigger scheduling. 7861 * 7862 * Returns %true on successful insertion, %false on failure. On the root 7863 * scheduler, %false return triggers scheduler abort and the caller doesn't need 7864 * to check the return value. 7865 */ 7866 __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, 7867 u64 slice, u64 enq_flags, 7868 const struct bpf_prog_aux *aux) 7869 { 7870 struct scx_sched *sch; 7871 7872 guard(rcu)(); 7873 sch = scx_prog_sched(aux); 7874 if (unlikely(!sch)) 7875 return false; 7876 7877 if (!scx_dsq_insert_preamble(sch, p, dsq_id, enq_flags)) 7878 return false; 7879 7880 if (slice) 7881 p->scx.slice = slice; 7882 else 7883 p->scx.slice = p->scx.slice ?: 1; 7884 7885 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); 7886 7887 return true; 7888 } 7889 7890 /* 7891 * COMPAT: Will be removed in v6.23 along with the ___v2 suffix. 
7892 */ 7893 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, 7894 u64 slice, u64 enq_flags, 7895 const struct bpf_prog_aux *aux) 7896 { 7897 scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags, aux); 7898 } 7899 7900 static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, 7901 u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) 7902 { 7903 if (!scx_dsq_insert_preamble(sch, p, dsq_id, enq_flags)) 7904 return false; 7905 7906 if (slice) 7907 p->scx.slice = slice; 7908 else 7909 p->scx.slice = p->scx.slice ?: 1; 7910 7911 p->scx.dsq_vtime = vtime; 7912 7913 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 7914 7915 return true; 7916 } 7917 7918 struct scx_bpf_dsq_insert_vtime_args { 7919 /* @p can't be packed together as KF_RCU is not transitive */ 7920 u64 dsq_id; 7921 u64 slice; 7922 u64 vtime; 7923 u64 enq_flags; 7924 }; 7925 7926 /** 7927 * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion 7928 * @p: task_struct to insert 7929 * @args: struct containing the rest of the arguments 7930 * @args->dsq_id: DSQ to insert into 7931 * @args->slice: duration @p can run for in nsecs, 0 to keep the current value 7932 * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ 7933 * @args->enq_flags: SCX_ENQ_* 7934 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 7935 * 7936 * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument 7937 * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided 7938 * as an inline wrapper in common.bpf.h. 7939 * 7940 * Insert @p into the vtime priority queue of the DSQ identified by 7941 * @args->dsq_id. Tasks queued into the priority queue are ordered by 7942 * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert(). 7943 * 7944 * @args->vtime ordering is according to time_before64() which considers 7945 * wrapping. 
A numerically larger vtime may indicate an earlier position in the 7946 * ordering and vice-versa. 7947 * 7948 * A DSQ can only be used as a FIFO or priority queue at any given time and this 7949 * function must not be called on a DSQ which already has one or more FIFO tasks 7950 * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and 7951 * SCX_DSQ_GLOBAL) cannot be used as priority queues. 7952 * 7953 * Returns %true on successful insertion, %false on failure. On the root 7954 * scheduler, %false return triggers scheduler abort and the caller doesn't need 7955 * to check the return value. 7956 */ 7957 __bpf_kfunc bool 7958 __scx_bpf_dsq_insert_vtime(struct task_struct *p, 7959 struct scx_bpf_dsq_insert_vtime_args *args, 7960 const struct bpf_prog_aux *aux) 7961 { 7962 struct scx_sched *sch; 7963 7964 guard(rcu)(); 7965 7966 sch = scx_prog_sched(aux); 7967 if (unlikely(!sch)) 7968 return false; 7969 7970 return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice, 7971 args->vtime, args->enq_flags); 7972 } 7973 7974 /* 7975 * COMPAT: Will be removed in v6.23. 7976 */ 7977 __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, 7978 u64 slice, u64 vtime, u64 enq_flags) 7979 { 7980 struct scx_sched *sch; 7981 7982 guard(rcu)(); 7983 7984 sch = rcu_dereference(scx_root); 7985 if (unlikely(!sch)) 7986 return; 7987 7988 #ifdef CONFIG_EXT_SUB_SCHED 7989 /* 7990 * Disallow if any sub-scheds are attached. There is no way to tell 7991 * which scheduler called us, just error out @p's scheduler. 
7992 */ 7993 if (unlikely(!list_empty(&sch->children))) { 7994 scx_error(scx_task_sched(p), "__scx_bpf_dsq_insert_vtime() must be used"); 7995 return; 7996 } 7997 #endif 7998 7999 scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags); 8000 } 8001 8002 __bpf_kfunc_end_defs(); 8003 8004 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) 8005 BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_IMPLICIT_ARGS | KF_RCU) 8006 BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_IMPLICIT_ARGS | KF_RCU) 8007 BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_IMPLICIT_ARGS | KF_RCU) 8008 BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) 8009 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) 8010 8011 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { 8012 .owner = THIS_MODULE, 8013 .set = &scx_kfunc_ids_enqueue_dispatch, 8014 }; 8015 8016 static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, 8017 struct task_struct *p, u64 dsq_id, u64 enq_flags) 8018 { 8019 struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; 8020 struct scx_sched *sch = src_dsq->sched; 8021 struct rq *this_rq, *src_rq, *locked_rq; 8022 bool dispatched = false; 8023 bool in_balance; 8024 unsigned long flags; 8025 8026 if (!scx_kf_allowed_if_unlocked() && 8027 !scx_kf_allowed(sch, SCX_KF_DISPATCH)) 8028 return false; 8029 8030 if (!scx_vet_enq_flags(sch, dsq_id, enq_flags)) 8031 return false; 8032 8033 /* 8034 * If the BPF scheduler keeps calling this function repeatedly, it can 8035 * cause similar live-lock conditions as consume_dispatch_q(). 8036 */ 8037 if (unlikely(READ_ONCE(sch->aborting))) 8038 return false; 8039 8040 if (unlikely(!scx_task_on_sched(sch, p))) { 8041 scx_error(sch, "scx_bpf_dsq_move[_vtime]() on %s[%d] but the task belongs to a different scheduler", 8042 p->comm, p->pid); 8043 } 8044 8045 /* 8046 * Can be called from either ops.dispatch() locking this_rq() or any 8047 * context where no rq lock is held. If latter, lock @p's task_rq which 8048 * we'll likely need anyway. 
8049 */ 8050 src_rq = task_rq(p); 8051 8052 local_irq_save(flags); 8053 this_rq = this_rq(); 8054 in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; 8055 8056 if (in_balance) { 8057 if (this_rq != src_rq) { 8058 raw_spin_rq_unlock(this_rq); 8059 raw_spin_rq_lock(src_rq); 8060 } 8061 } else { 8062 raw_spin_rq_lock(src_rq); 8063 } 8064 8065 locked_rq = src_rq; 8066 raw_spin_lock(&src_dsq->lock); 8067 8068 /* did someone else get to it while we dropped the locks? */ 8069 if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) { 8070 raw_spin_unlock(&src_dsq->lock); 8071 goto out; 8072 } 8073 8074 /* @p is still on $src_dsq and stable, determine the destination */ 8075 dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, task_cpu(p)); 8076 8077 /* 8078 * Apply vtime and slice updates before moving so that the new time is 8079 * visible before inserting into $dst_dsq. @p is still on $src_dsq but 8080 * this is safe as we're locking it. 8081 */ 8082 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) 8083 p->scx.dsq_vtime = kit->vtime; 8084 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) 8085 p->scx.slice = kit->slice; 8086 8087 /* execute move */ 8088 locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq); 8089 dispatched = true; 8090 out: 8091 if (in_balance) { 8092 if (this_rq != locked_rq) { 8093 raw_spin_rq_unlock(locked_rq); 8094 raw_spin_rq_lock(this_rq); 8095 } 8096 } else { 8097 raw_spin_rq_unlock_irqrestore(locked_rq, flags); 8098 } 8099 8100 kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | 8101 __SCX_DSQ_ITER_HAS_VTIME); 8102 return dispatched; 8103 } 8104 8105 __bpf_kfunc_start_defs(); 8106 8107 /** 8108 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots 8109 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8110 * 8111 * Can only be called from ops.dispatch(). 
8112 */ 8113 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux) 8114 { 8115 struct scx_sched *sch; 8116 8117 guard(rcu)(); 8118 8119 sch = scx_prog_sched(aux); 8120 if (unlikely(!sch)) 8121 return 0; 8122 8123 if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) 8124 return 0; 8125 8126 return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor); 8127 } 8128 8129 /** 8130 * scx_bpf_dispatch_cancel - Cancel the latest dispatch 8131 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8132 * 8133 * Cancel the latest dispatch. Can be called multiple times to cancel further 8134 * dispatches. Can only be called from ops.dispatch(). 8135 */ 8136 __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux) 8137 { 8138 struct scx_sched *sch; 8139 struct scx_dsp_ctx *dspc; 8140 8141 guard(rcu)(); 8142 8143 sch = scx_prog_sched(aux); 8144 if (unlikely(!sch)) 8145 return; 8146 8147 if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) 8148 return; 8149 8150 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8151 8152 if (dspc->cursor > 0) 8153 dspc->cursor--; 8154 else 8155 scx_error(sch, "dispatch buffer underflow"); 8156 } 8157 8158 /** 8159 * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ 8160 * @dsq_id: DSQ to move task from 8161 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8162 * 8163 * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's 8164 * local DSQ for execution. Can only be called from ops.dispatch(). 8165 * 8166 * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() 8167 * before trying to move from the specified DSQ. It may also grab rq locks and 8168 * thus can't be called under any BPF locks. 8169 * 8170 * Returns %true if a task has been moved, %false if there isn't any task to 8171 * move. 
8172 */ 8173 __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux) 8174 { 8175 struct scx_dispatch_q *dsq; 8176 struct scx_sched *sch; 8177 struct scx_dsp_ctx *dspc; 8178 8179 guard(rcu)(); 8180 8181 sch = scx_prog_sched(aux); 8182 if (unlikely(!sch)) 8183 return false; 8184 8185 if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) 8186 return false; 8187 8188 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8189 8190 flush_dispatch_buf(sch, dspc->rq); 8191 8192 dsq = find_user_dsq(sch, dsq_id); 8193 if (unlikely(!dsq)) { 8194 scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id); 8195 return false; 8196 } 8197 8198 if (consume_dispatch_q(sch, dspc->rq, dsq)) { 8199 /* 8200 * A successfully consumed task can be dequeued before it starts 8201 * running while the CPU is trying to migrate other dispatched 8202 * tasks. Bump nr_tasks to tell balance_one() to retry on empty 8203 * local DSQ. 8204 */ 8205 dspc->nr_tasks++; 8206 return true; 8207 } else { 8208 return false; 8209 } 8210 } 8211 8212 /** 8213 * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs 8214 * @it__iter: DSQ iterator in progress 8215 * @slice: duration the moved task can run for in nsecs 8216 * 8217 * Override the slice of the next task that will be moved from @it__iter using 8218 * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous 8219 * slice duration is kept. 
8220 */ 8221 __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, 8222 u64 slice) 8223 { 8224 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 8225 8226 kit->slice = slice; 8227 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; 8228 } 8229 8230 /** 8231 * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs 8232 * @it__iter: DSQ iterator in progress 8233 * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ 8234 * 8235 * Override the vtime of the next task that will be moved from @it__iter using 8236 * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice 8237 * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the 8238 * override is ignored and cleared. 8239 */ 8240 __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, 8241 u64 vtime) 8242 { 8243 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 8244 8245 kit->vtime = vtime; 8246 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; 8247 } 8248 8249 /** 8250 * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ 8251 * @it__iter: DSQ iterator in progress 8252 * @p: task to transfer 8253 * @dsq_id: DSQ to move @p to 8254 * @enq_flags: SCX_ENQ_* 8255 * 8256 * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ 8257 * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can 8258 * be the destination. 8259 * 8260 * For the transfer to be successful, @p must still be on the DSQ and have been 8261 * queued before the DSQ iteration started. This function doesn't care whether 8262 * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have 8263 * been queued before the iteration started. 8264 * 8265 * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update. 8266 * 8267 * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq 8268 * lock (e.g. BPF timers or SYSCALL programs). 
8269 * 8270 * Returns %true if @p has been consumed, %false if @p had already been 8271 * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local 8272 * DSQ. 8273 */ 8274 __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, 8275 struct task_struct *p, u64 dsq_id, 8276 u64 enq_flags) 8277 { 8278 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 8279 p, dsq_id, enq_flags); 8280 } 8281 8282 /** 8283 * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ 8284 * @it__iter: DSQ iterator in progress 8285 * @p: task to transfer 8286 * @dsq_id: DSQ to move @p to 8287 * @enq_flags: SCX_ENQ_* 8288 * 8289 * Transfer @p which is on the DSQ currently iterated by @it__iter to the 8290 * priority queue of the DSQ specified by @dsq_id. The destination must be a 8291 * user DSQ as only user DSQs support priority queue. 8292 * 8293 * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice() 8294 * and scx_bpf_dsq_move_set_vtime() to update. 8295 * 8296 * All other aspects are identical to scx_bpf_dsq_move(). See 8297 * scx_bpf_dsq_insert_vtime() for more information on @vtime. 8298 */ 8299 __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, 8300 struct task_struct *p, u64 dsq_id, 8301 u64 enq_flags) 8302 { 8303 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 8304 p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 8305 } 8306 8307 #ifdef CONFIG_EXT_SUB_SCHED 8308 /** 8309 * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler 8310 * @cgroup_id: cgroup ID of the child scheduler to dispatch 8311 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8312 * 8313 * Allows a parent scheduler to trigger dispatching on one of its direct 8314 * child schedulers. The child scheduler runs its dispatch operation to 8315 * move tasks from dispatch queues to the local runqueue. 
8316 * 8317 * Returns: true on success, false if cgroup_id is invalid, not a direct 8318 * child, or caller lacks dispatch permission. 8319 */ 8320 __bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux) 8321 { 8322 struct rq *this_rq = this_rq(); 8323 struct scx_sched *parent, *child; 8324 8325 guard(rcu)(); 8326 parent = scx_prog_sched(aux); 8327 if (unlikely(!parent)) 8328 return false; 8329 8330 if (!scx_kf_allowed(parent, SCX_KF_DISPATCH)) 8331 return false; 8332 8333 child = scx_find_sub_sched(cgroup_id); 8334 8335 if (unlikely(!child)) 8336 return false; 8337 8338 if (unlikely(scx_parent(child) != parent)) { 8339 scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu", 8340 cgroup_id); 8341 return false; 8342 } 8343 8344 return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev, 8345 true); 8346 } 8347 #endif /* CONFIG_EXT_SUB_SCHED */ 8348 8349 __bpf_kfunc_end_defs(); 8350 8351 BTF_KFUNCS_START(scx_kfunc_ids_dispatch) 8352 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots, KF_IMPLICIT_ARGS) 8353 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel, KF_IMPLICIT_ARGS) 8354 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local, KF_IMPLICIT_ARGS) 8355 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 8356 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 8357 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 8358 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 8359 #ifdef CONFIG_EXT_SUB_SCHED 8360 BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS) 8361 #endif 8362 BTF_KFUNCS_END(scx_kfunc_ids_dispatch) 8363 8364 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { 8365 .owner = THIS_MODULE, 8366 .set = &scx_kfunc_ids_dispatch, 8367 }; 8368 8369 __bpf_kfunc_start_defs(); 8370 8371 /** 8372 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 8373 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8374 * 8375 * Iterate over all of the tasks currently enqueued on the 
local DSQ of the 8376 * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of 8377 * processed tasks. Can only be called from ops.cpu_release(). 8378 */ 8379 __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux) 8380 { 8381 struct scx_sched *sch; 8382 struct rq *rq; 8383 8384 guard(rcu)(); 8385 sch = scx_prog_sched(aux); 8386 if (unlikely(!sch)) 8387 return 0; 8388 8389 if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) 8390 return 0; 8391 8392 rq = cpu_rq(smp_processor_id()); 8393 lockdep_assert_rq_held(rq); 8394 8395 return reenq_local(sch, rq, SCX_REENQ_ANY); 8396 } 8397 8398 __bpf_kfunc_end_defs(); 8399 8400 BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) 8401 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local, KF_IMPLICIT_ARGS) 8402 BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) 8403 8404 static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { 8405 .owner = THIS_MODULE, 8406 .set = &scx_kfunc_ids_cpu_release, 8407 }; 8408 8409 __bpf_kfunc_start_defs(); 8410 8411 /** 8412 * scx_bpf_create_dsq - Create a custom DSQ 8413 * @dsq_id: DSQ to create 8414 * @node: NUMA node to allocate from 8415 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8416 * 8417 * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable 8418 * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. 8419 */ 8420 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_aux *aux) 8421 { 8422 struct scx_dispatch_q *dsq; 8423 struct scx_sched *sch; 8424 s32 ret; 8425 8426 if (unlikely(node >= (int)nr_node_ids || 8427 (node < 0 && node != NUMA_NO_NODE))) 8428 return -EINVAL; 8429 8430 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) 8431 return -EINVAL; 8432 8433 dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); 8434 if (!dsq) 8435 return -ENOMEM; 8436 8437 /* 8438 * init_dsq() must be called in GFP_KERNEL context. Init it with NULL 8439 * @sch and update afterwards. 
8440 */ 8441 ret = init_dsq(dsq, dsq_id, NULL); 8442 if (ret) { 8443 kfree(dsq); 8444 return ret; 8445 } 8446 8447 rcu_read_lock(); 8448 8449 sch = scx_prog_sched(aux); 8450 if (sch) { 8451 dsq->sched = sch; 8452 ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, 8453 dsq_hash_params); 8454 } else { 8455 ret = -ENODEV; 8456 } 8457 8458 rcu_read_unlock(); 8459 if (ret) { 8460 exit_dsq(dsq); 8461 kfree(dsq); 8462 } 8463 return ret; 8464 } 8465 8466 __bpf_kfunc_end_defs(); 8467 8468 BTF_KFUNCS_START(scx_kfunc_ids_unlocked) 8469 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_IMPLICIT_ARGS | KF_SLEEPABLE) 8470 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 8471 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 8472 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 8473 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 8474 BTF_KFUNCS_END(scx_kfunc_ids_unlocked) 8475 8476 static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { 8477 .owner = THIS_MODULE, 8478 .set = &scx_kfunc_ids_unlocked, 8479 }; 8480 8481 __bpf_kfunc_start_defs(); 8482 8483 /** 8484 * scx_bpf_task_set_slice - Set task's time slice 8485 * @p: task of interest 8486 * @slice: time slice to set in nsecs 8487 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8488 * 8489 * Set @p's time slice to @slice. Returns %true on success, %false if the 8490 * calling scheduler doesn't have authority over @p. 
8491 */ 8492 __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, 8493 const struct bpf_prog_aux *aux) 8494 { 8495 struct scx_sched *sch; 8496 8497 guard(rcu)(); 8498 sch = scx_prog_sched(aux); 8499 if (unlikely(!scx_task_on_sched(sch, p))) 8500 return false; 8501 8502 p->scx.slice = slice; 8503 return true; 8504 } 8505 8506 /** 8507 * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering 8508 * @p: task of interest 8509 * @vtime: virtual time to set 8510 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8511 * 8512 * Set @p's virtual time to @vtime. Returns %true on success, %false if the 8513 * calling scheduler doesn't have authority over @p. 8514 */ 8515 __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, 8516 const struct bpf_prog_aux *aux) 8517 { 8518 struct scx_sched *sch; 8519 8520 guard(rcu)(); 8521 sch = scx_prog_sched(aux); 8522 if (unlikely(!scx_task_on_sched(sch, p))) 8523 return false; 8524 8525 p->scx.dsq_vtime = vtime; 8526 return true; 8527 } 8528 8529 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) 8530 { 8531 struct rq *this_rq; 8532 unsigned long irq_flags; 8533 8534 if (!ops_cpu_valid(sch, cpu, NULL)) 8535 return; 8536 8537 local_irq_save(irq_flags); 8538 8539 this_rq = this_rq(); 8540 8541 /* 8542 * While bypassing for PM ops, IRQ handling may not be online which can 8543 * lead to irq_work_queue() malfunction such as infinite busy wait for 8544 * IRQ status update. Suppress kicking. 8545 */ 8546 if (scx_bypassing(sch, cpu_of(this_rq))) 8547 goto out; 8548 8549 /* 8550 * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting 8551 * rq locks. We can probably be smarter and avoid bouncing if called 8552 * from ops which don't hold a rq lock. 
8553 */ 8554 if (flags & SCX_KICK_IDLE) { 8555 struct rq *target_rq = cpu_rq(cpu); 8556 8557 if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) 8558 scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); 8559 8560 if (raw_spin_rq_trylock(target_rq)) { 8561 if (can_skip_idle_kick(target_rq)) { 8562 raw_spin_rq_unlock(target_rq); 8563 goto out; 8564 } 8565 raw_spin_rq_unlock(target_rq); 8566 } 8567 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); 8568 } else { 8569 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); 8570 8571 if (flags & SCX_KICK_PREEMPT) 8572 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); 8573 if (flags & SCX_KICK_WAIT) 8574 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); 8575 } 8576 8577 irq_work_queue(&this_rq->scx.kick_cpus_irq_work); 8578 out: 8579 local_irq_restore(irq_flags); 8580 } 8581 8582 /** 8583 * scx_bpf_kick_cpu - Trigger reschedule on a CPU 8584 * @cpu: cpu to kick 8585 * @flags: %SCX_KICK_* flags 8586 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8587 * 8588 * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or 8589 * trigger rescheduling on a busy CPU. This can be called from any online 8590 * scx_ops operation and the actual kicking is performed asynchronously through 8591 * an irq work. 8592 */ 8593 __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux *aux) 8594 { 8595 struct scx_sched *sch; 8596 8597 guard(rcu)(); 8598 sch = scx_prog_sched(aux); 8599 if (likely(sch)) 8600 scx_kick_cpu(sch, cpu, flags); 8601 } 8602 8603 /** 8604 * scx_bpf_dsq_nr_queued - Return the number of queued tasks 8605 * @dsq_id: id of the DSQ 8606 * 8607 * Return the number of tasks in the DSQ matching @dsq_id. If not found, 8608 * -%ENOENT is returned. 
8609 */ 8610 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) 8611 { 8612 struct scx_sched *sch; 8613 struct scx_dispatch_q *dsq; 8614 s32 ret; 8615 8616 preempt_disable(); 8617 8618 sch = rcu_dereference_sched(scx_root); 8619 if (unlikely(!sch)) { 8620 ret = -ENODEV; 8621 goto out; 8622 } 8623 8624 if (dsq_id == SCX_DSQ_LOCAL) { 8625 ret = READ_ONCE(this_rq()->scx.local_dsq.nr); 8626 goto out; 8627 } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 8628 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 8629 8630 if (ops_cpu_valid(sch, cpu, NULL)) { 8631 ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); 8632 goto out; 8633 } 8634 } else { 8635 dsq = find_user_dsq(sch, dsq_id); 8636 if (dsq) { 8637 ret = READ_ONCE(dsq->nr); 8638 goto out; 8639 } 8640 } 8641 ret = -ENOENT; 8642 out: 8643 preempt_enable(); 8644 return ret; 8645 } 8646 8647 /** 8648 * scx_bpf_destroy_dsq - Destroy a custom DSQ 8649 * @dsq_id: DSQ to destroy 8650 * 8651 * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with 8652 * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is 8653 * empty and no further tasks are dispatched to it. Ignored if called on a DSQ 8654 * which doesn't exist. Can be called from any online scx_ops operations. 8655 */ 8656 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) 8657 { 8658 struct scx_sched *sch; 8659 8660 rcu_read_lock(); 8661 sch = rcu_dereference(scx_root); 8662 if (sch) 8663 destroy_dsq(sch, dsq_id); 8664 rcu_read_unlock(); 8665 } 8666 8667 /** 8668 * bpf_iter_scx_dsq_new - Create a DSQ iterator 8669 * @it: iterator to initialize 8670 * @dsq_id: DSQ to iterate 8671 * @flags: %SCX_DSQ_ITER_* 8672 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8673 * 8674 * Initialize BPF iterator @it which can be used with bpf_for_each() to walk 8675 * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes 8676 * tasks which are already queued when this function is invoked. 
8677 */ 8678 __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, 8679 u64 flags, const struct bpf_prog_aux *aux) 8680 { 8681 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 8682 struct scx_sched *sch; 8683 8684 BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > 8685 sizeof(struct bpf_iter_scx_dsq)); 8686 BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != 8687 __alignof__(struct bpf_iter_scx_dsq)); 8688 BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & 8689 ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); 8690 8691 /* 8692 * next() and destroy() will be called regardless of the return value. 8693 * Always clear $kit->dsq. 8694 */ 8695 kit->dsq = NULL; 8696 8697 sch = scx_prog_sched(aux); 8698 if (unlikely(!sch)) 8699 return -ENODEV; 8700 8701 if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) 8702 return -EINVAL; 8703 8704 kit->dsq = find_user_dsq(sch, dsq_id); 8705 if (!kit->dsq) 8706 return -ENOENT; 8707 8708 kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags); 8709 8710 return 0; 8711 } 8712 8713 /** 8714 * bpf_iter_scx_dsq_next - Progress a DSQ iterator 8715 * @it: iterator to progress 8716 * 8717 * Return the next task. See bpf_iter_scx_dsq_new(). 8718 */ 8719 __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) 8720 { 8721 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 8722 8723 if (!kit->dsq) 8724 return NULL; 8725 8726 guard(raw_spinlock_irqsave)(&kit->dsq->lock); 8727 8728 return nldsq_cursor_next_task(&kit->cursor, kit->dsq); 8729 } 8730 8731 /** 8732 * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator 8733 * @it: iterator to destroy 8734 * 8735 * Undo scx_iter_scx_dsq_new(). 
8736 */ 8737 __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) 8738 { 8739 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 8740 8741 if (!kit->dsq) 8742 return; 8743 8744 if (!list_empty(&kit->cursor.node)) { 8745 unsigned long flags; 8746 8747 raw_spin_lock_irqsave(&kit->dsq->lock, flags); 8748 list_del_init(&kit->cursor.node); 8749 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); 8750 } 8751 kit->dsq = NULL; 8752 } 8753 8754 /** 8755 * scx_bpf_dsq_peek - Lockless peek at the first element. 8756 * @dsq_id: DSQ to examine. 8757 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8758 * 8759 * Read the first element in the DSQ. This is semantically equivalent to using 8760 * the DSQ iterator, but is lockfree. Of course, like any lockless operation, 8761 * this provides only a point-in-time snapshot, and the contents may change 8762 * by the time any subsequent locking operation reads the queue. 8763 * 8764 * Returns the pointer, or NULL indicates an empty queue OR internal error. 
8765 */ 8766 __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id, 8767 const struct bpf_prog_aux *aux) 8768 { 8769 struct scx_sched *sch; 8770 struct scx_dispatch_q *dsq; 8771 8772 sch = scx_prog_sched(aux); 8773 if (unlikely(!sch)) 8774 return NULL; 8775 8776 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { 8777 scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); 8778 return NULL; 8779 } 8780 8781 dsq = find_user_dsq(sch, dsq_id); 8782 if (unlikely(!dsq)) { 8783 scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); 8784 return NULL; 8785 } 8786 8787 return rcu_dereference(dsq->first_task); 8788 } 8789 8790 /** 8791 * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ 8792 * @dsq_id: DSQ to re-enqueue 8793 * @reenq_flags: %SCX_RENQ_* 8794 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8795 * 8796 * Iterate over all of the tasks currently enqueued on the DSQ identified by 8797 * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are 8798 * supported: 8799 * 8800 * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu) 8801 * - User DSQs 8802 * 8803 * Re-enqueues are performed asynchronously. Can be called from anywhere. 
8804 */ 8805 __bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags, 8806 const struct bpf_prog_aux *aux) 8807 { 8808 struct scx_sched *sch; 8809 struct scx_dispatch_q *dsq; 8810 8811 guard(preempt)(); 8812 8813 sch = scx_prog_sched(aux); 8814 if (unlikely(!sch)) 8815 return; 8816 8817 if (unlikely(reenq_flags & ~__SCX_REENQ_USER_MASK)) { 8818 scx_error(sch, "invalid SCX_REENQ flags 0x%llx", reenq_flags); 8819 return; 8820 } 8821 8822 /* not specifying any filter bits is the same as %SCX_REENQ_ANY */ 8823 if (!(reenq_flags & __SCX_REENQ_FILTER_MASK)) 8824 reenq_flags |= SCX_REENQ_ANY; 8825 8826 dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id()); 8827 schedule_dsq_reenq(sch, dsq, reenq_flags); 8828 } 8829 8830 /** 8831 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 8832 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8833 * 8834 * Iterate over all of the tasks currently enqueued on the local DSQ of the 8835 * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from 8836 * anywhere. 8837 * 8838 * This is now a special case of scx_bpf_dsq_reenq() and may be removed in the 8839 * future. 
8840 */ 8841 __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux) 8842 { 8843 scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0, aux); 8844 } 8845 8846 __bpf_kfunc_end_defs(); 8847 8848 static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, 8849 size_t line_size, char *fmt, unsigned long long *data, 8850 u32 data__sz) 8851 { 8852 struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; 8853 s32 ret; 8854 8855 if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || 8856 (data__sz && !data)) { 8857 scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); 8858 return -EINVAL; 8859 } 8860 8861 ret = copy_from_kernel_nofault(data_buf, data, data__sz); 8862 if (ret < 0) { 8863 scx_error(sch, "failed to read data fields (%d)", ret); 8864 return ret; 8865 } 8866 8867 ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, 8868 &bprintf_data); 8869 if (ret < 0) { 8870 scx_error(sch, "format preparation failed (%d)", ret); 8871 return ret; 8872 } 8873 8874 ret = bstr_printf(line_buf, line_size, fmt, 8875 bprintf_data.bin_args); 8876 bpf_bprintf_cleanup(&bprintf_data); 8877 if (ret < 0) { 8878 scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); 8879 return ret; 8880 } 8881 8882 return ret; 8883 } 8884 8885 static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, 8886 char *fmt, unsigned long long *data, u32 data__sz) 8887 { 8888 return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), 8889 fmt, data, data__sz); 8890 } 8891 8892 __bpf_kfunc_start_defs(); 8893 8894 /** 8895 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. 8896 * @exit_code: Exit value to pass to user space via struct scx_exit_info. 
8897 * @fmt: error message format string 8898 * @data: format string parameters packaged using ___bpf_fill() macro 8899 * @data__sz: @data len, must end in '__sz' for the verifier 8900 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8901 * 8902 * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops 8903 * disabling. 8904 */ 8905 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, 8906 unsigned long long *data, u32 data__sz, 8907 const struct bpf_prog_aux *aux) 8908 { 8909 struct scx_sched *sch; 8910 unsigned long flags; 8911 8912 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 8913 sch = scx_prog_sched(aux); 8914 if (likely(sch) && 8915 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 8916 scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); 8917 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 8918 } 8919 8920 /** 8921 * scx_bpf_error_bstr - Indicate fatal error 8922 * @fmt: error message format string 8923 * @data: format string parameters packaged using ___bpf_fill() macro 8924 * @data__sz: @data len, must end in '__sz' for the verifier 8925 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8926 * 8927 * Indicate that the BPF scheduler encountered a fatal error and initiate ops 8928 * disabling. 
8929 */ 8930 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, 8931 u32 data__sz, const struct bpf_prog_aux *aux) 8932 { 8933 struct scx_sched *sch; 8934 unsigned long flags; 8935 8936 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 8937 sch = scx_prog_sched(aux); 8938 if (likely(sch) && 8939 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 8940 scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); 8941 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 8942 } 8943 8944 /** 8945 * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler 8946 * @fmt: format string 8947 * @data: format string parameters packaged using ___bpf_fill() macro 8948 * @data__sz: @data len, must end in '__sz' for the verifier 8949 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8950 * 8951 * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and 8952 * dump_task() to generate extra debug dump specific to the BPF scheduler. 8953 * 8954 * The extra dump may be multiple lines. A single line may be split over 8955 * multiple calls. The last line is automatically terminated. 8956 */ 8957 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, 8958 u32 data__sz, const struct bpf_prog_aux *aux) 8959 { 8960 struct scx_sched *sch; 8961 struct scx_dump_data *dd = &scx_dump_data; 8962 struct scx_bstr_buf *buf = &dd->buf; 8963 s32 ret; 8964 8965 guard(rcu)(); 8966 8967 sch = scx_prog_sched(aux); 8968 if (unlikely(!sch)) 8969 return; 8970 8971 if (raw_smp_processor_id() != dd->cpu) { 8972 scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends"); 8973 return; 8974 } 8975 8976 /* append the formatted string to the line buf */ 8977 ret = __bstr_format(sch, buf->data, buf->line + dd->cursor, 8978 sizeof(buf->line) - dd->cursor, fmt, data, data__sz); 8979 if (ret < 0) { 8980 dump_line(dd->s, "%s[!] 
(\"%s\", %p, %u) failed to format (%d)", 8981 dd->prefix, fmt, data, data__sz, ret); 8982 return; 8983 } 8984 8985 dd->cursor += ret; 8986 dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); 8987 8988 if (!dd->cursor) 8989 return; 8990 8991 /* 8992 * If the line buf overflowed or ends in a newline, flush it into the 8993 * dump. This is to allow the caller to generate a single line over 8994 * multiple calls. As ops_dump_flush() can also handle multiple lines in 8995 * the line buf, the only case which can lead to an unexpected 8996 * truncation is when the caller keeps generating newlines in the middle 8997 * instead of the end consecutively. Don't do that. 8998 */ 8999 if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') 9000 ops_dump_flush(); 9001 } 9002 9003 /** 9004 * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU 9005 * @cpu: CPU of interest 9006 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9007 * 9008 * Return the maximum relative capacity of @cpu in relation to the most 9009 * performant CPU in the system. The return value is in the range [1, 9010 * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). 9011 */ 9012 __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux) 9013 { 9014 struct scx_sched *sch; 9015 9016 guard(rcu)(); 9017 9018 sch = scx_prog_sched(aux); 9019 if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) 9020 return arch_scale_cpu_capacity(cpu); 9021 else 9022 return SCX_CPUPERF_ONE; 9023 } 9024 9025 /** 9026 * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU 9027 * @cpu: CPU of interest 9028 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9029 * 9030 * Return the current relative performance of @cpu in relation to its maximum. 9031 * The return value is in the range [1, %SCX_CPUPERF_ONE]. 
9032 * 9033 * The current performance level of a CPU in relation to the maximum performance 9034 * available in the system can be calculated as follows: 9035 * 9036 * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE 9037 * 9038 * The result is in the range [1, %SCX_CPUPERF_ONE]. 9039 */ 9040 __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux) 9041 { 9042 struct scx_sched *sch; 9043 9044 guard(rcu)(); 9045 9046 sch = scx_prog_sched(aux); 9047 if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) 9048 return arch_scale_freq_capacity(cpu); 9049 else 9050 return SCX_CPUPERF_ONE; 9051 } 9052 9053 /** 9054 * scx_bpf_cpuperf_set - Set the relative performance target of a CPU 9055 * @cpu: CPU of interest 9056 * @perf: target performance level [0, %SCX_CPUPERF_ONE] 9057 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9058 * 9059 * Set the target performance level of @cpu to @perf. @perf is in linear 9060 * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the 9061 * schedutil cpufreq governor chooses the target frequency. 9062 * 9063 * The actual performance level chosen, CPU grouping, and the overhead and 9064 * latency of the operations are dependent on the hardware and cpufreq driver in 9065 * use. Consult hardware and cpufreq documentation for more information. The 9066 * current performance level can be monitored using scx_bpf_cpuperf_cur(). 
9067 */ 9068 __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_aux *aux) 9069 { 9070 struct scx_sched *sch; 9071 9072 guard(rcu)(); 9073 9074 sch = scx_prog_sched(aux); 9075 if (unlikely(!sch)) 9076 return; 9077 9078 if (unlikely(perf > SCX_CPUPERF_ONE)) { 9079 scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu); 9080 return; 9081 } 9082 9083 if (ops_cpu_valid(sch, cpu, NULL)) { 9084 struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); 9085 struct rq_flags rf; 9086 9087 /* 9088 * When called with an rq lock held, restrict the operation 9089 * to the corresponding CPU to prevent ABBA deadlocks. 9090 */ 9091 if (locked_rq && rq != locked_rq) { 9092 scx_error(sch, "Invalid target CPU %d", cpu); 9093 return; 9094 } 9095 9096 /* 9097 * If no rq lock is held, allow to operate on any CPU by 9098 * acquiring the corresponding rq lock. 9099 */ 9100 if (!locked_rq) { 9101 rq_lock_irqsave(rq, &rf); 9102 update_rq_clock(rq); 9103 } 9104 9105 rq->scx.cpuperf_target = perf; 9106 cpufreq_update_util(rq, 0); 9107 9108 if (!locked_rq) 9109 rq_unlock_irqrestore(rq, &rf); 9110 } 9111 } 9112 9113 /** 9114 * scx_bpf_nr_node_ids - Return the number of possible node IDs 9115 * 9116 * All valid node IDs in the system are smaller than the returned value. 9117 */ 9118 __bpf_kfunc u32 scx_bpf_nr_node_ids(void) 9119 { 9120 return nr_node_ids; 9121 } 9122 9123 /** 9124 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs 9125 * 9126 * All valid CPU IDs in the system are smaller than the returned value. 
9127 */ 9128 __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) 9129 { 9130 return nr_cpu_ids; 9131 } 9132 9133 /** 9134 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask 9135 */ 9136 __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) 9137 { 9138 return cpu_possible_mask; 9139 } 9140 9141 /** 9142 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask 9143 */ 9144 __bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void) 9145 { 9146 return cpu_online_mask; 9147 } 9148 9149 /** 9150 * scx_bpf_put_cpumask - Release a possible/online cpumask 9151 * @cpumask: cpumask to release 9152 */ 9153 __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) 9154 { 9155 /* 9156 * Empty function body because we aren't actually acquiring or releasing 9157 * a reference to a global cpumask, which is read-only in the caller and 9158 * is never released. The acquire / release semantics here are just used 9159 * to make the cpumask is a trusted pointer in the caller. 9160 */ 9161 } 9162 9163 /** 9164 * scx_bpf_task_running - Is task currently running? 
9165 * @p: task of interest 9166 */ 9167 __bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p) 9168 { 9169 return task_rq(p)->curr == p; 9170 } 9171 9172 /** 9173 * scx_bpf_task_cpu - CPU a task is currently associated with 9174 * @p: task of interest 9175 */ 9176 __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) 9177 { 9178 return task_cpu(p); 9179 } 9180 9181 /** 9182 * scx_bpf_cpu_rq - Fetch the rq of a CPU 9183 * @cpu: CPU of the rq 9184 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9185 */ 9186 __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux) 9187 { 9188 struct scx_sched *sch; 9189 9190 guard(rcu)(); 9191 9192 sch = scx_prog_sched(aux); 9193 if (unlikely(!sch)) 9194 return NULL; 9195 9196 if (!ops_cpu_valid(sch, cpu, NULL)) 9197 return NULL; 9198 9199 if (!sch->warned_deprecated_rq) { 9200 printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; " 9201 "use scx_bpf_locked_rq() when holding rq lock " 9202 "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__); 9203 sch->warned_deprecated_rq = true; 9204 } 9205 9206 return cpu_rq(cpu); 9207 } 9208 9209 /** 9210 * scx_bpf_locked_rq - Return the rq currently locked by SCX 9211 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9212 * 9213 * Returns the rq if a rq lock is currently held by SCX. 9214 * Otherwise emits an error and returns NULL. 
9215 */ 9216 __bpf_kfunc struct rq *scx_bpf_locked_rq(const struct bpf_prog_aux *aux) 9217 { 9218 struct scx_sched *sch; 9219 struct rq *rq; 9220 9221 guard(preempt)(); 9222 9223 sch = scx_prog_sched(aux); 9224 if (unlikely(!sch)) 9225 return NULL; 9226 9227 rq = scx_locked_rq(); 9228 if (!rq) { 9229 scx_error(sch, "accessing rq without holding rq lock"); 9230 return NULL; 9231 } 9232 9233 return rq; 9234 } 9235 9236 /** 9237 * scx_bpf_cpu_curr - Return remote CPU's curr task 9238 * @cpu: CPU of interest 9239 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9240 * 9241 * Callers must hold RCU read lock (KF_RCU). 9242 */ 9243 __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_aux *aux) 9244 { 9245 struct scx_sched *sch; 9246 9247 guard(rcu)(); 9248 9249 sch = scx_prog_sched(aux); 9250 if (unlikely(!sch)) 9251 return NULL; 9252 9253 if (!ops_cpu_valid(sch, cpu, NULL)) 9254 return NULL; 9255 9256 return rcu_dereference(cpu_rq(cpu)->curr); 9257 } 9258 9259 /** 9260 * scx_bpf_now - Returns a high-performance monotonically non-decreasing 9261 * clock for the current CPU. The clock returned is in nanoseconds. 9262 * 9263 * It provides the following properties: 9264 * 9265 * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently 9266 * to account for execution time and track tasks' runtime properties. 9267 * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which 9268 * eventually reads a hardware timestamp counter -- is neither performant nor 9269 * scalable. scx_bpf_now() aims to provide a high-performance clock by 9270 * using the rq clock in the scheduler core whenever possible. 9271 * 9272 * 2) High enough resolution for the BPF scheduler use cases: In most BPF 9273 * scheduler use cases, the required clock resolution is lower than the most 9274 * accurate hardware clock (e.g., rdtsc in x86). 
scx_bpf_now() basically 9275 * uses the rq clock in the scheduler core whenever it is valid. It considers 9276 * that the rq clock is valid from the time the rq clock is updated 9277 * (update_rq_clock) until the rq is unlocked (rq_unpin_lock). 9278 * 9279 * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now() 9280 * guarantees the clock never goes backward when comparing them in the same 9281 * CPU. On the other hand, when comparing clocks in different CPUs, there 9282 * is no such guarantee -- the clock can go backward. It provides a 9283 * monotonically *non-decreasing* clock so that it would provide the same 9284 * clock values in two different scx_bpf_now() calls in the same CPU 9285 * during the same period of when the rq clock is valid. 9286 */ 9287 __bpf_kfunc u64 scx_bpf_now(void) 9288 { 9289 struct rq *rq; 9290 u64 clock; 9291 9292 preempt_disable(); 9293 9294 rq = this_rq(); 9295 if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) { 9296 /* 9297 * If the rq clock is valid, use the cached rq clock. 9298 * 9299 * Note that scx_bpf_now() is re-entrant between a process 9300 * context and an interrupt context (e.g., timer interrupt). 9301 * However, we don't need to consider the race between them 9302 * because such race is not observable from a caller. 9303 */ 9304 clock = READ_ONCE(rq->scx.clock); 9305 } else { 9306 /* 9307 * Otherwise, return a fresh rq clock. 9308 * 9309 * The rq clock is updated outside of the rq lock. 9310 * In this case, keep the updated rq clock invalid so the next 9311 * kfunc call outside the rq lock gets a fresh rq clock. 9312 */ 9313 clock = sched_clock_cpu(cpu_of(rq)); 9314 } 9315 9316 preempt_enable(); 9317 9318 return clock; 9319 } 9320 9321 static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events) 9322 { 9323 struct scx_event_stats *e_cpu; 9324 int cpu; 9325 9326 /* Aggregate per-CPU event counters into @events. 
*/ 9327 memset(events, 0, sizeof(*events)); 9328 for_each_possible_cpu(cpu) { 9329 e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; 9330 scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); 9331 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 9332 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); 9333 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING); 9334 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 9335 scx_agg_event(events, e_cpu, SCX_EV_REENQ_IMMED); 9336 scx_agg_event(events, e_cpu, SCX_EV_REENQ_LOCAL_REPEAT); 9337 scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL); 9338 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION); 9339 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH); 9340 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE); 9341 scx_agg_event(events, e_cpu, SCX_EV_INSERT_NOT_OWNED); 9342 } 9343 } 9344 9345 /* 9346 * scx_bpf_events - Get a system-wide event counter to 9347 * @events: output buffer from a BPF program 9348 * @events__sz: @events len, must end in '__sz'' for the verifier 9349 */ 9350 __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, 9351 size_t events__sz) 9352 { 9353 struct scx_sched *sch; 9354 struct scx_event_stats e_sys; 9355 9356 rcu_read_lock(); 9357 sch = rcu_dereference(scx_root); 9358 if (sch) 9359 scx_read_events(sch, &e_sys); 9360 else 9361 memset(&e_sys, 0, sizeof(e_sys)); 9362 rcu_read_unlock(); 9363 9364 /* 9365 * We cannot entirely trust a BPF-provided size since a BPF program 9366 * might be compiled against a different vmlinux.h, of which 9367 * scx_event_stats would be larger (a newer vmlinux.h) or smaller 9368 * (an older vmlinux.h). Hence, we use the smaller size to avoid 9369 * memory corruption. 
9370 */ 9371 events__sz = min(events__sz, sizeof(*events)); 9372 memcpy(events, &e_sys, events__sz); 9373 } 9374 9375 #ifdef CONFIG_CGROUP_SCHED 9376 /** 9377 * scx_bpf_task_cgroup - Return the sched cgroup of a task 9378 * @p: task of interest 9379 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9380 * 9381 * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with 9382 * from the scheduler's POV. SCX operations should use this function to 9383 * determine @p's current cgroup as, unlike following @p->cgroups, 9384 * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all 9385 * rq-locked operations. Can be called on the parameter tasks of rq-locked 9386 * operations. The restriction guarantees that @p's rq is locked by the caller. 9387 */ 9388 __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p, 9389 const struct bpf_prog_aux *aux) 9390 { 9391 struct task_group *tg = p->sched_task_group; 9392 struct cgroup *cgrp = &cgrp_dfl_root.cgrp; 9393 struct scx_sched *sch; 9394 9395 guard(rcu)(); 9396 9397 sch = scx_prog_sched(aux); 9398 if (unlikely(!sch)) 9399 goto out; 9400 9401 if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p)) 9402 goto out; 9403 9404 cgrp = tg_cgrp(tg); 9405 9406 out: 9407 cgroup_get(cgrp); 9408 return cgrp; 9409 } 9410 #endif /* CONFIG_CGROUP_SCHED */ 9411 9412 __bpf_kfunc_end_defs(); 9413 9414 BTF_KFUNCS_START(scx_kfunc_ids_any) 9415 BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); 9416 BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); 9417 BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) 9418 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) 9419 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) 9420 BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) 9421 BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS) 9422 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS) 
9423 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_IMPLICIT_ARGS | KF_ITER_NEW | KF_RCU_PROTECTED) 9424 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) 9425 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) 9426 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS) 9427 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS) 9428 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS) 9429 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) 9430 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) 9431 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) 9432 BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) 9433 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) 9434 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) 9435 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) 9436 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) 9437 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) 9438 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) 9439 BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) 9440 BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL) 9441 BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) 9442 BTF_ID_FLAGS(func, scx_bpf_now) 9443 BTF_ID_FLAGS(func, scx_bpf_events) 9444 #ifdef CONFIG_CGROUP_SCHED 9445 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_IMPLICIT_ARGS | KF_RCU | KF_ACQUIRE) 9446 #endif 9447 BTF_KFUNCS_END(scx_kfunc_ids_any) 9448 9449 static const struct btf_kfunc_id_set scx_kfunc_set_any = { 9450 .owner = THIS_MODULE, 9451 .set = &scx_kfunc_ids_any, 9452 }; 9453 9454 static int __init scx_init(void) 9455 { 9456 int ret; 9457 9458 /* 9459 * kfunc registration can't be done from init_sched_ext_class() as 9460 * register_btf_kfunc_id_set() needs most of the system to be up. 9461 * 9462 * Some kfuncs are context-sensitive and can only be called from 9463 * specific SCX ops. They are grouped into BTF sets accordingly. 
9464 * Unfortunately, BPF currently doesn't have a way of enforcing such 9465 * restrictions. Eventually, the verifier should be able to enforce 9466 * them. For now, register them the same and make each kfunc explicitly 9467 * check using scx_kf_allowed(). 9468 */ 9469 if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9470 &scx_kfunc_set_enqueue_dispatch)) || 9471 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9472 &scx_kfunc_set_dispatch)) || 9473 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9474 &scx_kfunc_set_cpu_release)) || 9475 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9476 &scx_kfunc_set_unlocked)) || 9477 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 9478 &scx_kfunc_set_unlocked)) || 9479 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 9480 &scx_kfunc_set_any)) || 9481 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, 9482 &scx_kfunc_set_any)) || 9483 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 9484 &scx_kfunc_set_any))) { 9485 pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); 9486 return ret; 9487 } 9488 9489 ret = scx_idle_init(); 9490 if (ret) { 9491 pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); 9492 return ret; 9493 } 9494 9495 ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); 9496 if (ret) { 9497 pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); 9498 return ret; 9499 } 9500 9501 ret = register_pm_notifier(&scx_pm_notifier); 9502 if (ret) { 9503 pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); 9504 return ret; 9505 } 9506 9507 scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); 9508 if (!scx_kset) { 9509 pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); 9510 return -ENOMEM; 9511 } 9512 9513 ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); 9514 if (ret < 0) { 9515 pr_err("sched_ext: Failed to add global attributes\n"); 9516 
return ret; 9517 } 9518 9519 if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) || 9520 !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) { 9521 pr_err("sched_ext: Failed to allocate cpumasks\n"); 9522 return -ENOMEM; 9523 } 9524 9525 return 0; 9526 } 9527 __initcall(scx_init); 9528