1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst 4 * 5 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 6 * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 7 * Copyright (c) 2022 David Vernet <dvernet@meta.com> 8 */ 9 10 static DEFINE_RAW_SPINLOCK(scx_sched_lock); 11 12 /* 13 * NOTE: sched_ext is in the process of growing multiple scheduler support and 14 * scx_root usage is in a transitional state. Naked dereferences are safe if the 15 * caller is one of the tasks attached to SCX and explicit RCU dereference is 16 * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but 17 * are used as temporary markers to indicate that the dereferences need to be 18 * updated to point to the associated scheduler instances rather than scx_root. 19 */ 20 struct scx_sched __rcu *scx_root; 21 22 /* 23 * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock. 24 * Readers can hold either or rcu_read_lock(). 25 */ 26 static LIST_HEAD(scx_sched_all); 27 28 #ifdef CONFIG_EXT_SUB_SCHED 29 static const struct rhashtable_params scx_sched_hash_params = { 30 .key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id), 31 .key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id), 32 .head_offset = offsetof(struct scx_sched, hash_node), 33 .insecure_elasticity = true, /* inserted under scx_sched_lock */ 34 }; 35 36 static struct rhashtable scx_sched_hash; 37 #endif 38 39 /* see SCX_OPS_TID_TO_TASK */ 40 static const struct rhashtable_params scx_tid_hash_params = { 41 .key_len = sizeof_field(struct sched_ext_entity, tid), 42 .key_offset = offsetof(struct sched_ext_entity, tid), 43 .head_offset = offsetof(struct sched_ext_entity, tid_hash_node), 44 .insecure_elasticity = true, /* inserted/removed under scx_tasks_lock */ 45 }; 46 static struct rhashtable scx_tid_hash; 47 48 /* 49 * During exit, a task may schedule after losing its PIDs. 
When disabling the 50 * BPF scheduler, we need to be able to iterate tasks in every state to 51 * guarantee system safety. Maintain a dedicated task list which contains every 52 * task between its fork and eventual free. 53 */ 54 static DEFINE_RAW_SPINLOCK(scx_tasks_lock); 55 static LIST_HEAD(scx_tasks); 56 57 /* ops enable/disable */ 58 static DEFINE_MUTEX(scx_enable_mutex); 59 DEFINE_STATIC_KEY_FALSE(__scx_enabled); 60 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 61 static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); 62 static DEFINE_RAW_SPINLOCK(scx_bypass_lock); 63 static bool scx_init_task_enabled; 64 static bool scx_switching_all; 65 DEFINE_STATIC_KEY_FALSE(__scx_switched_all); 66 static DEFINE_STATIC_KEY_FALSE(__scx_tid_to_task_enabled); 67 68 /* 69 * True once SCX_OPS_TID_TO_TASK has been negotiated with the root scheduler 70 * and the tid->task table is live. Wraps the static key so callers don't 71 * take the address, and hints "likely enabled" for the common case where 72 * the feature is in use. 73 */ 74 static inline bool scx_tid_to_task_enabled(void) 75 { 76 return static_branch_likely(&__scx_tid_to_task_enabled); 77 } 78 79 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); 80 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); 81 82 /* Global cursor for the per-CPU tid allocator. Starts at 1; tid 0 is reserved. */ 83 static atomic64_t scx_tid_cursor = ATOMIC64_INIT(1); 84 85 #ifdef CONFIG_EXT_SUB_SCHED 86 /* 87 * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit 88 * tasks for the sub-sched being enabled. Use a global variable instead of a 89 * per-task field as all enables are serialized. 90 */ 91 static struct scx_sched *scx_enabling_sub_sched; 92 #else 93 #define scx_enabling_sub_sched (struct scx_sched *)NULL 94 #endif /* CONFIG_EXT_SUB_SCHED */ 95 96 /* 97 * A monotonically increasing sequence number that is incremented every time a 98 * scheduler is enabled. 
This can be used to check if any custom sched_ext 99 * scheduler has ever been used in the system. 100 */ 101 static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); 102 103 /* 104 * Watchdog interval. All scx_sched's share a single watchdog timer and the 105 * interval is half of the shortest sch->watchdog_timeout. 106 */ 107 static unsigned long scx_watchdog_interval; 108 109 /* 110 * The last time the delayed work was run. This delayed work relies on 111 * ksoftirqd being able to run to service timer interrupts, so it's possible 112 * that this work itself could get wedged. To account for this, we check that 113 * it's not stalled in the timer tick, and trigger an error if it is. 114 */ 115 static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; 116 117 static struct delayed_work scx_watchdog_work; 118 119 /* 120 * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence 121 * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu 122 * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated 123 * lazily when enabling and freed when disabling to avoid waste when sched_ext 124 * isn't active. 125 */ 126 struct scx_kick_syncs { 127 struct rcu_head rcu; 128 unsigned long syncs[]; 129 }; 130 131 static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs); 132 133 /* 134 * Per-CPU buffered allocator state for p->scx.tid. Each CPU pulls a chunk of 135 * SCX_TID_CHUNK ids from scx_tid_cursor and hands them out locally without 136 * further synchronization. See scx_alloc_tid(). 137 */ 138 struct scx_tid_alloc { 139 u64 next; 140 u64 end; 141 }; 142 static DEFINE_PER_CPU(struct scx_tid_alloc, scx_tid_alloc); 143 144 /* 145 * Direct dispatch marker. 146 * 147 * Non-NULL values are used for direct dispatch from enqueue path. A valid 148 * pointer points to the task currently being enqueued. An ERR_PTR value is used 149 * to indicate that direct dispatch has already happened. 
150 */ 151 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); 152 153 static const struct rhashtable_params dsq_hash_params = { 154 .key_len = sizeof_field(struct scx_dispatch_q, id), 155 .key_offset = offsetof(struct scx_dispatch_q, id), 156 .head_offset = offsetof(struct scx_dispatch_q, hash_node), 157 }; 158 159 static LLIST_HEAD(dsqs_to_free); 160 161 /* string formatting from BPF */ 162 struct scx_bstr_buf { 163 u64 data[MAX_BPRINTF_VARARGS]; 164 char line[SCX_EXIT_MSG_LEN]; 165 }; 166 167 static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); 168 static struct scx_bstr_buf scx_exit_bstr_buf; 169 170 /* ops debug dump */ 171 static DEFINE_RAW_SPINLOCK(scx_dump_lock); 172 173 struct scx_dump_data { 174 s32 cpu; 175 bool first; 176 s32 cursor; 177 struct seq_buf *s; 178 const char *prefix; 179 struct scx_bstr_buf buf; 180 }; 181 182 static struct scx_dump_data scx_dump_data = { 183 .cpu = -1, 184 }; 185 186 /* /sys/kernel/sched_ext interface */ 187 static struct kset *scx_kset; 188 189 /* 190 * Parameters that can be adjusted through /sys/module/sched_ext/parameters. 191 * There usually is no reason to modify these as normal scheduler operation 192 * shouldn't be affected by them. The knobs are primarily for debugging. 
193 */ 194 static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; 195 static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; 196 197 static int set_slice_us(const char *val, const struct kernel_param *kp) 198 { 199 return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC); 200 } 201 202 static const struct kernel_param_ops slice_us_param_ops = { 203 .set = set_slice_us, 204 .get = param_get_uint, 205 }; 206 207 static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp) 208 { 209 return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC); 210 } 211 212 static const struct kernel_param_ops bypass_lb_intv_us_param_ops = { 213 .set = set_bypass_lb_intv_us, 214 .get = param_get_uint, 215 }; 216 217 #undef MODULE_PARAM_PREFIX 218 #define MODULE_PARAM_PREFIX "sched_ext." 219 220 module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); 221 MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); 222 module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600); 223 MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)"); 224 225 #undef MODULE_PARAM_PREFIX 226 227 #define CREATE_TRACE_POINTS 228 #include <trace/events/sched_ext.h> 229 230 static void run_deferred(struct rq *rq); 231 static bool task_dead_and_done(struct task_struct *p); 232 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); 233 static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind); 234 235 __printf(5, 6) bool __scx_exit(struct scx_sched *sch, 236 enum scx_exit_kind kind, s64 exit_code, 237 s32 exit_cpu, const char *fmt, ...) 
238 { 239 va_list args; 240 bool ret; 241 242 va_start(args, fmt); 243 ret = scx_vexit(sch, kind, exit_code, exit_cpu, fmt, args); 244 va_end(args); 245 246 return ret; 247 } 248 249 #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) 250 251 static long jiffies_delta_msecs(unsigned long at, unsigned long now) 252 { 253 if (time_after(at, now)) 254 return jiffies_to_msecs(at - now); 255 else 256 return -(long)jiffies_to_msecs(now - at); 257 } 258 259 static bool u32_before(u32 a, u32 b) 260 { 261 return (s32)(a - b) < 0; 262 } 263 264 #ifdef CONFIG_EXT_SUB_SCHED 265 /** 266 * scx_parent - Find the parent sched 267 * @sch: sched to find the parent of 268 * 269 * Returns the parent scheduler or %NULL if @sch is root. 270 */ 271 static struct scx_sched *scx_parent(struct scx_sched *sch) 272 { 273 if (sch->level) 274 return sch->ancestors[sch->level - 1]; 275 else 276 return NULL; 277 } 278 279 /** 280 * scx_next_descendant_pre - find the next descendant for pre-order walk 281 * @pos: the current position (%NULL to initiate traversal) 282 * @root: sched whose descendants to walk 283 * 284 * To be used by scx_for_each_descendant_pre(). Find the next descendant to 285 * visit for pre-order traversal of @root's descendants. @root is included in 286 * the iteration and the first node to be visited. 
287 */ 288 static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, 289 struct scx_sched *root) 290 { 291 struct scx_sched *next; 292 293 lockdep_assert(lockdep_is_held(&scx_enable_mutex) || 294 lockdep_is_held(&scx_sched_lock)); 295 296 /* if first iteration, visit @root */ 297 if (!pos) 298 return root; 299 300 /* visit the first child if exists */ 301 next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling); 302 if (next) 303 return next; 304 305 /* no child, visit my or the closest ancestor's next sibling */ 306 while (pos != root) { 307 if (!list_is_last(&pos->sibling, &scx_parent(pos)->children)) 308 return list_next_entry(pos, sibling); 309 pos = scx_parent(pos); 310 } 311 312 return NULL; 313 } 314 315 static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) 316 { 317 return rhashtable_lookup(&scx_sched_hash, &cgroup_id, 318 scx_sched_hash_params); 319 } 320 321 static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) 322 { 323 rcu_assign_pointer(p->scx.sched, sch); 324 } 325 #else /* CONFIG_EXT_SUB_SCHED */ 326 static inline struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } 327 static inline struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } 328 static inline void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} 329 #endif /* CONFIG_EXT_SUB_SCHED */ 330 331 /** 332 * scx_is_descendant - Test whether sched is a descendant 333 * @sch: sched to test 334 * @ancestor: ancestor sched to test against 335 * 336 * Test whether @sch is a descendant of @ancestor. 
337 */ 338 static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor) 339 { 340 if (sch->level < ancestor->level) 341 return false; 342 return sch->ancestors[ancestor->level] == ancestor; 343 } 344 345 /** 346 * scx_for_each_descendant_pre - pre-order walk of a sched's descendants 347 * @pos: iteration cursor 348 * @root: sched to walk the descendants of 349 * 350 * Walk @root's descendants. @root is included in the iteration and the first 351 * node to be visited. Must be called with either scx_enable_mutex or 352 * scx_sched_lock held. 353 */ 354 #define scx_for_each_descendant_pre(pos, root) \ 355 for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos); \ 356 (pos) = scx_next_descendant_pre((pos), (root))) 357 358 static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, s32 cpu) 359 { 360 return &sch->pnode[cpu_to_node(cpu)]->global_dsq; 361 } 362 363 static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) 364 { 365 return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params); 366 } 367 368 static const struct sched_class *scx_setscheduler_class(struct task_struct *p) 369 { 370 if (p->sched_class == &stop_sched_class) 371 return &stop_sched_class; 372 373 return __setscheduler_class(p->policy, p->prio); 374 } 375 376 static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu) 377 { 378 return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq; 379 } 380 381 static struct scx_dispatch_q *bypass_enq_target_dsq(struct scx_sched *sch, s32 cpu) 382 { 383 #ifdef CONFIG_EXT_SUB_SCHED 384 /* 385 * If @sch is a sub-sched which is bypassing, its tasks should go into 386 * the bypass DSQs of the nearest ancestor which is not bypassing. The 387 * not-bypassing ancestor is responsible for scheduling all tasks from 388 * bypassing sub-trees. If all ancestors including root are bypassing, 389 * all tasks should go to the root's bypass DSQs. 
390 * 391 * Whenever a sched starts bypassing, all runnable tasks in its subtree 392 * are re-enqueued after scx_bypassing() is turned on, guaranteeing that 393 * all tasks are transferred to the right DSQs. 394 */ 395 while (scx_parent(sch) && scx_bypassing(sch, cpu)) 396 sch = scx_parent(sch); 397 #endif /* CONFIG_EXT_SUB_SCHED */ 398 399 return bypass_dsq(sch, cpu); 400 } 401 402 /** 403 * bypass_dsp_enabled - Check if bypass dispatch path is enabled 404 * @sch: scheduler to check 405 * 406 * When a descendant scheduler enters bypass mode, bypassed tasks are scheduled 407 * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors 408 * are bypassing. In the former case, the ancestor is not itself bypassing but 409 * its bypass DSQs will be populated with bypassed tasks from descendants. Thus, 410 * the ancestor's bypass dispatch path must be active even though its own 411 * bypass_depth remains zero. 412 * 413 * This function checks bypass_dsp_enable_depth which is managed separately from 414 * bypass_depth to enable this decoupling. See enable_bypass_dsp() and 415 * disable_bypass_dsp(). 416 */ 417 static bool bypass_dsp_enabled(struct scx_sched *sch) 418 { 419 return unlikely(atomic_read(&sch->bypass_dsp_enable_depth)); 420 } 421 422 /** 423 * rq_is_open - Is the rq available for immediate execution of an SCX task? 424 * @rq: rq to test 425 * @enq_flags: optional %SCX_ENQ_* of the task being enqueued 426 * 427 * Returns %true if @rq is currently open for executing an SCX task. After a 428 * %false return, @rq is guaranteed to invoke SCX dispatch path at least once 429 * before going to idle and not inserting a task into @rq's local DSQ after a 430 * %false return doesn't cause @rq to stall. 431 */ 432 static bool rq_is_open(struct rq *rq, u64 enq_flags) 433 { 434 lockdep_assert_rq_held(rq); 435 436 /* 437 * A higher-priority class task is either running or in the process of 438 * waking up on @rq. 
439 */ 440 if (sched_class_above(rq->next_class, &ext_sched_class)) 441 return false; 442 443 /* 444 * @rq is either in transition to or in idle and there is no 445 * higher-priority class task waking up on it. 446 */ 447 if (sched_class_above(&ext_sched_class, rq->next_class)) 448 return true; 449 450 /* 451 * @rq is either picking, in transition to, or running an SCX task. 452 */ 453 454 /* 455 * If we're in the dispatch path holding rq lock, $curr may or may not 456 * be ready depending on whether the on-going dispatch decides to extend 457 * $curr's slice. We say yes here and resolve it at the end of dispatch. 458 * See balance_one(). 459 */ 460 if (rq->scx.flags & SCX_RQ_IN_BALANCE) 461 return true; 462 463 /* 464 * %SCX_ENQ_PREEMPT clears $curr's slice if on SCX and kicks dispatch, 465 * so allow it to avoid spuriously triggering reenq on a combined 466 * PREEMPT|IMMED insertion. 467 */ 468 if (enq_flags & SCX_ENQ_PREEMPT) 469 return true; 470 471 /* 472 * @rq is either in transition to or running an SCX task and can't go 473 * idle without another SCX dispatch cycle. 474 */ 475 return false; 476 } 477 478 /* 479 * Track the rq currently locked. 480 * 481 * This allows kfuncs to safely operate on rq from any scx ops callback, 482 * knowing which rq is already locked. 483 */ 484 DEFINE_PER_CPU(struct rq *, scx_locked_rq_state); 485 486 static inline void update_locked_rq(struct rq *rq) 487 { 488 /* 489 * Check whether @rq is actually locked. This can help expose bugs 490 * or incorrect assumptions about the context in which a kfunc or 491 * callback is executed. 492 */ 493 if (rq) 494 lockdep_assert_rq_held(rq); 495 __this_cpu_write(scx_locked_rq_state, rq); 496 } 497 498 /* 499 * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not 500 * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit. 501 */ 502 #define SCX_CALL_OP(sch, op, locked_rq, args...) 
\ 503 do { \ 504 struct rq *__prev_locked_rq; \ 505 \ 506 if (locked_rq) { \ 507 __prev_locked_rq = scx_locked_rq(); \ 508 update_locked_rq(locked_rq); \ 509 } \ 510 (sch)->ops.op(args); \ 511 if (locked_rq) \ 512 update_locked_rq(__prev_locked_rq); \ 513 } while (0) 514 515 /* 516 * Flipped on enable per sch->is_cid_type. Declared in ext_internal.h so 517 * subsystem inlines can read it. 518 */ 519 DEFINE_STATIC_KEY_FALSE(__scx_is_cid_type); 520 521 /* 522 * scx_cpu_arg() wraps a cpu arg being handed to an SCX op. For cid-form 523 * schedulers it resolves to the matching cid; for cpu-form it passes @cpu 524 * through. scx_cpu_ret() is the inverse for a cpu/cid returned from an op 525 * (currently only ops.select_cpu); it validates the BPF-supplied cid and 526 * triggers scx_error() on @sch if invalid. 527 */ 528 static s32 scx_cpu_arg(s32 cpu) 529 { 530 if (scx_is_cid_type()) 531 return __scx_cpu_to_cid(cpu); 532 return cpu; 533 } 534 535 static s32 scx_cpu_ret(struct scx_sched *sch, s32 cpu_or_cid) 536 { 537 if (cpu_or_cid < 0 || !scx_is_cid_type()) 538 return cpu_or_cid; 539 return scx_cid_to_cpu(sch, cpu_or_cid); 540 } 541 542 #define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \ 543 ({ \ 544 struct rq *__prev_locked_rq; \ 545 __typeof__((sch)->ops.op(args)) __ret; \ 546 \ 547 if (locked_rq) { \ 548 __prev_locked_rq = scx_locked_rq(); \ 549 update_locked_rq(locked_rq); \ 550 } \ 551 __ret = (sch)->ops.op(args); \ 552 if (locked_rq) \ 553 update_locked_rq(__prev_locked_rq); \ 554 __ret; \ 555 }) 556 557 /* 558 * SCX_CALL_OP_TASK*() invokes an SCX op that takes one or two task arguments 559 * and records them in current->scx.kf_tasks[] for the duration of the call. A 560 * kfunc invoked from inside such an op can then use 561 * scx_kf_arg_task_ok() to verify that its task argument is one of 562 * those subject tasks. 
563 * 564 * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held - 565 * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's 566 * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. 567 * So if kf_tasks[] is set, @p's scheduler-protected fields are stable. 568 * 569 * kf_tasks[] can not stack, so task-based SCX ops must not nest. The 570 * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants 571 * while a previous one is still in progress. 572 */ 573 #define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...) \ 574 do { \ 575 WARN_ON_ONCE(current->scx.kf_tasks[0]); \ 576 current->scx.kf_tasks[0] = task; \ 577 SCX_CALL_OP((sch), op, locked_rq, task, ##args); \ 578 current->scx.kf_tasks[0] = NULL; \ 579 } while (0) 580 581 #define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \ 582 ({ \ 583 __typeof__((sch)->ops.op(task, ##args)) __ret; \ 584 WARN_ON_ONCE(current->scx.kf_tasks[0]); \ 585 current->scx.kf_tasks[0] = task; \ 586 __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \ 587 current->scx.kf_tasks[0] = NULL; \ 588 __ret; \ 589 }) 590 591 #define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) 
\ 592 ({ \ 593 __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ 594 WARN_ON_ONCE(current->scx.kf_tasks[0]); \ 595 current->scx.kf_tasks[0] = task0; \ 596 current->scx.kf_tasks[1] = task1; \ 597 __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \ 598 current->scx.kf_tasks[0] = NULL; \ 599 current->scx.kf_tasks[1] = NULL; \ 600 __ret; \ 601 }) 602 603 /** 604 * scx_call_op_set_cpumask - invoke ops.set_cpumask / ops_cid.set_cmask for @task 605 * @sch: scx_sched being invoked 606 * @rq: rq to update as the currently-locked rq, or NULL 607 * @task: task whose affinity is changing 608 * @cpumask: new cpumask 609 * 610 * For cid-form schedulers, translate @cpumask to a cmask via the per-cpu 611 * scratch in ext_cid.c and dispatch through the ops_cid union view. Caller 612 * must hold @rq's rq lock so this_cpu_ptr is stable across the call. 613 */ 614 static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq, 615 struct task_struct *task, 616 const struct cpumask *cpumask) 617 { 618 WARN_ON_ONCE(current->scx.kf_tasks[0]); 619 current->scx.kf_tasks[0] = task; 620 if (rq) 621 update_locked_rq(rq); 622 623 if (scx_is_cid_type()) { 624 struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch); 625 /* 626 * Build the per-CPU arena cmask and hand BPF its arena address. 627 * Caller holds the rq lock with IRQs disabled, which makes us 628 * the sole user of the scratch area. 
629 */ 630 scx_cpumask_to_cmask(cpumask, kern_va); 631 sch->ops_cid.set_cmask(task, scx_kaddr_to_arena(sch, kern_va)); 632 } else { 633 sch->ops.set_cpumask(task, cpumask); 634 } 635 636 if (rq) 637 update_locked_rq(NULL); 638 current->scx.kf_tasks[0] = NULL; 639 } 640 641 /* see SCX_CALL_OP_TASK() */ 642 static __always_inline bool scx_kf_arg_task_ok(struct scx_sched *sch, 643 struct task_struct *p) 644 { 645 if (unlikely((p != current->scx.kf_tasks[0] && 646 p != current->scx.kf_tasks[1]))) { 647 scx_error(sch, "called on a task not being operated on"); 648 return false; 649 } 650 651 return true; 652 } 653 654 enum scx_dsq_iter_flags { 655 /* iterate in the reverse dispatch order */ 656 SCX_DSQ_ITER_REV = 1U << 16, 657 658 __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, 659 __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, 660 661 __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, 662 __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | 663 __SCX_DSQ_ITER_HAS_SLICE | 664 __SCX_DSQ_ITER_HAS_VTIME, 665 }; 666 667 /** 668 * nldsq_next_task - Iterate to the next task in a non-local DSQ 669 * @dsq: non-local dsq being iterated 670 * @cur: current position, %NULL to start iteration 671 * @rev: walk backwards 672 * 673 * Returns %NULL when iteration is finished. 
674 */ 675 static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, 676 struct task_struct *cur, bool rev) 677 { 678 struct list_head *list_node; 679 struct scx_dsq_list_node *dsq_lnode; 680 681 lockdep_assert_held(&dsq->lock); 682 683 if (cur) 684 list_node = &cur->scx.dsq_list.node; 685 else 686 list_node = &dsq->list; 687 688 /* find the next task, need to skip BPF iteration cursors */ 689 do { 690 if (rev) 691 list_node = list_node->prev; 692 else 693 list_node = list_node->next; 694 695 if (list_node == &dsq->list) 696 return NULL; 697 698 dsq_lnode = container_of(list_node, struct scx_dsq_list_node, 699 node); 700 } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); 701 702 return container_of(dsq_lnode, struct task_struct, scx.dsq_list); 703 } 704 705 #define nldsq_for_each_task(p, dsq) \ 706 for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ 707 (p) = nldsq_next_task((dsq), (p), false)) 708 709 /** 710 * nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ 711 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() 712 * @dsq: non-local dsq being iterated 713 * 714 * Find the next task in a cursor based iteration. The caller must have 715 * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock 716 * between the iteration steps. 717 * 718 * Only tasks which were queued before @cursor was initialized are visible. This 719 * bounds the iteration and guarantees that vtime never jumps in the other 720 * direction while iterating. 
721 */ 722 static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor, 723 struct scx_dispatch_q *dsq) 724 { 725 bool rev = cursor->flags & SCX_DSQ_ITER_REV; 726 struct task_struct *p; 727 728 lockdep_assert_held(&dsq->lock); 729 BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR)); 730 731 if (list_empty(&cursor->node)) 732 p = NULL; 733 else 734 p = container_of(cursor, struct task_struct, scx.dsq_list); 735 736 /* skip cursors and tasks that were queued after @cursor init */ 737 do { 738 p = nldsq_next_task(dsq, p, rev); 739 } while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq))); 740 741 if (p) { 742 if (rev) 743 list_move_tail(&cursor->node, &p->scx.dsq_list.node); 744 else 745 list_move(&cursor->node, &p->scx.dsq_list.node); 746 } else { 747 list_del_init(&cursor->node); 748 } 749 750 return p; 751 } 752 753 /** 754 * nldsq_cursor_lost_task - Test whether someone else took the task since iteration 755 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() 756 * @rq: rq @p was on 757 * @dsq: dsq @p was on 758 * @p: target task 759 * 760 * @p is a task returned by nldsq_cursor_next_task(). The locks may have been 761 * dropped and re-acquired inbetween. Verify that no one else took or is in the 762 * process of taking @p from @dsq. 763 * 764 * On %false return, the caller can assume full ownership of @p. 765 */ 766 static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor, 767 struct rq *rq, struct scx_dispatch_q *dsq, 768 struct task_struct *p) 769 { 770 lockdep_assert_rq_held(rq); 771 lockdep_assert_held(&dsq->lock); 772 773 /* 774 * @p could have already left $src_dsq, got re-enqueud, or be in the 775 * process of being consumed by someone else. 
776 */ 777 if (unlikely(p->scx.dsq != dsq || 778 u32_before(cursor->priv, p->scx.dsq_seq) || 779 p->scx.holding_cpu >= 0)) 780 return true; 781 782 /* if @p has stayed on @dsq, its rq couldn't have changed */ 783 if (WARN_ON_ONCE(rq != task_rq(p))) 784 return true; 785 786 return false; 787 } 788 789 /* 790 * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] 791 * dispatch order. BPF-visible iterator is opaque and larger to allow future 792 * changes without breaking backward compatibility. Can be used with 793 * bpf_for_each(). See bpf_iter_scx_dsq_*(). 794 */ 795 struct bpf_iter_scx_dsq_kern { 796 struct scx_dsq_list_node cursor; 797 struct scx_dispatch_q *dsq; 798 u64 slice; 799 u64 vtime; 800 } __attribute__((aligned(8))); 801 802 struct bpf_iter_scx_dsq { 803 u64 __opaque[6]; 804 } __attribute__((aligned(8))); 805 806 807 static u32 scx_get_task_state(const struct task_struct *p) 808 { 809 return p->scx.flags & SCX_TASK_STATE_MASK; 810 } 811 812 static void scx_set_task_state(struct task_struct *p, u32 state) 813 { 814 u32 prev_state = scx_get_task_state(p); 815 bool warn = false; 816 817 switch (state) { 818 case SCX_TASK_NONE: 819 warn = prev_state == SCX_TASK_DEAD; 820 break; 821 case SCX_TASK_INIT_BEGIN: 822 warn = prev_state != SCX_TASK_NONE; 823 break; 824 case SCX_TASK_INIT: 825 warn = prev_state != SCX_TASK_INIT_BEGIN; 826 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 827 break; 828 case SCX_TASK_READY: 829 warn = !(prev_state == SCX_TASK_INIT || 830 prev_state == SCX_TASK_ENABLED); 831 break; 832 case SCX_TASK_ENABLED: 833 warn = prev_state != SCX_TASK_READY; 834 break; 835 case SCX_TASK_DEAD: 836 warn = !(prev_state == SCX_TASK_NONE || 837 prev_state == SCX_TASK_INIT_BEGIN); 838 break; 839 default: 840 WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", 841 prev_state, state, p->comm, p->pid); 842 return; 843 } 844 845 WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", 846 prev_state, 
state, p->comm, p->pid); 847 848 p->scx.flags &= ~SCX_TASK_STATE_MASK; 849 p->scx.flags |= state; 850 } 851 852 /* 853 * SCX task iterator. 854 */ 855 struct scx_task_iter { 856 struct sched_ext_entity cursor; 857 struct task_struct *locked_task; 858 struct rq *rq; 859 struct rq_flags rf; 860 u32 cnt; 861 bool list_locked; 862 #ifdef CONFIG_EXT_SUB_SCHED 863 struct cgroup *cgrp; 864 struct cgroup_subsys_state *css_pos; 865 struct css_task_iter css_iter; 866 #endif 867 }; 868 869 /** 870 * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration 871 * @iter: iterator to init 872 * @cgrp: Optional root of cgroup subhierarchy to iterate 873 * 874 * Initialize @iter. Once initialized, @iter must eventually be stopped with 875 * scx_task_iter_stop(). 876 * 877 * If @cgrp is %NULL, scx_tasks is used for iteration and this function returns 878 * with scx_tasks_lock held and @iter->cursor inserted into scx_tasks. 879 * 880 * If @cgrp is not %NULL, @cgrp and its descendants' tasks are walked using 881 * @iter->css_iter. The caller must be holding cgroup_lock() to prevent cgroup 882 * task migrations. 883 * 884 * The two modes of iterations are largely independent and it's likely that 885 * scx_tasks can be removed in favor of always using cgroup iteration if 886 * CONFIG_SCHED_CLASS_EXT depends on CONFIG_CGROUPS. 887 * 888 * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() 889 * between this and the first next() call or between any two next() calls. If 890 * the locks are released between two next() calls, the caller is responsible 891 * for ensuring that the task being iterated remains accessible either through 892 * RCU read lock or obtaining a reference count. 893 * 894 * All tasks which existed when the iteration started are guaranteed to be 895 * visited as long as they are not dead. 
896 */ 897 static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) 898 { 899 memset(iter, 0, sizeof(*iter)); 900 901 #ifdef CONFIG_EXT_SUB_SCHED 902 if (cgrp) { 903 lockdep_assert_held(&cgroup_mutex); 904 iter->cgrp = cgrp; 905 iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self); 906 css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, 907 &iter->css_iter); 908 return; 909 } 910 #endif 911 raw_spin_lock_irq(&scx_tasks_lock); 912 913 iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; 914 list_add(&iter->cursor.tasks_node, &scx_tasks); 915 iter->list_locked = true; 916 } 917 918 static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) 919 { 920 if (iter->locked_task) { 921 __balance_callbacks(iter->rq, &iter->rf); 922 task_rq_unlock(iter->rq, iter->locked_task, &iter->rf); 923 iter->locked_task = NULL; 924 } 925 } 926 927 /** 928 * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator 929 * @iter: iterator to unlock 930 * 931 * If @iter is in the middle of a locked iteration, it may be locking the rq of 932 * the task currently being visited in addition to scx_tasks_lock. Unlock both. 933 * This function can be safely called anytime during an iteration. The next 934 * iterator operation will automatically restore the necessary locking. 
935 */ 936 static void scx_task_iter_unlock(struct scx_task_iter *iter) 937 { 938 __scx_task_iter_rq_unlock(iter); 939 if (iter->list_locked) { 940 iter->list_locked = false; 941 raw_spin_unlock_irq(&scx_tasks_lock); 942 } 943 } 944 945 static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) 946 { 947 if (!iter->list_locked) { 948 raw_spin_lock_irq(&scx_tasks_lock); 949 iter->list_locked = true; 950 } 951 } 952 953 /** 954 * scx_task_iter_relock - Re-acquire scx_tasks_lock and, optionally, @p's rq 955 * @iter: iterator to relock 956 * @p: task whose rq to lock, or %NULL for scx_tasks_lock only 957 * 958 * Counterpart to scx_task_iter_unlock(). Locking @p's rq is optional. Once 959 * re-acquired, both locks are managed by the iterator from here on. 960 */ 961 static void scx_task_iter_relock(struct scx_task_iter *iter, 962 struct task_struct *p) 963 { 964 __scx_task_iter_maybe_relock(iter); 965 if (p) { 966 iter->rq = task_rq_lock(p, &iter->rf); 967 iter->locked_task = p; 968 } 969 } 970 971 /** 972 * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock 973 * @iter: iterator to exit 974 * 975 * Exit a previously initialized @iter. Must be called with scx_tasks_lock held 976 * which is released on return. If the iterator holds a task's rq lock, that rq 977 * lock is also released. See scx_task_iter_start() for details. 978 */ 979 static void scx_task_iter_stop(struct scx_task_iter *iter) 980 { 981 #ifdef CONFIG_EXT_SUB_SCHED 982 if (iter->cgrp) { 983 if (iter->css_pos) 984 css_task_iter_end(&iter->css_iter); 985 __scx_task_iter_rq_unlock(iter); 986 return; 987 } 988 #endif 989 __scx_task_iter_maybe_relock(iter); 990 list_del_init(&iter->cursor.tasks_node); 991 scx_task_iter_unlock(iter); 992 } 993 994 /** 995 * scx_task_iter_next - Next task 996 * @iter: iterator to walk 997 * 998 * Visit the next task. See scx_task_iter_start() for details. 
Locks are dropped 999 * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls 1000 * by holding scx_tasks_lock for too long. 1001 */ 1002 static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) 1003 { 1004 struct list_head *cursor = &iter->cursor.tasks_node; 1005 struct sched_ext_entity *pos; 1006 1007 if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { 1008 scx_task_iter_unlock(iter); 1009 cond_resched(); 1010 } 1011 1012 #ifdef CONFIG_EXT_SUB_SCHED 1013 if (iter->cgrp) { 1014 while (iter->css_pos) { 1015 struct task_struct *p; 1016 1017 p = css_task_iter_next(&iter->css_iter); 1018 if (p) 1019 return p; 1020 1021 css_task_iter_end(&iter->css_iter); 1022 iter->css_pos = css_next_descendant_pre(iter->css_pos, 1023 &iter->cgrp->self); 1024 if (iter->css_pos) 1025 css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, 1026 &iter->css_iter); 1027 } 1028 return NULL; 1029 } 1030 #endif 1031 __scx_task_iter_maybe_relock(iter); 1032 1033 list_for_each_entry(pos, cursor, tasks_node) { 1034 if (&pos->tasks_node == &scx_tasks) 1035 return NULL; 1036 if (!(pos->flags & SCX_TASK_CURSOR)) { 1037 list_move(cursor, &pos->tasks_node); 1038 return container_of(pos, struct task_struct, scx); 1039 } 1040 } 1041 1042 /* can't happen, should always terminate at scx_tasks above */ 1043 BUG(); 1044 } 1045 1046 /** 1047 * scx_task_iter_next_locked - Next non-idle task with its rq locked 1048 * @iter: iterator to walk 1049 * 1050 * Visit the non-idle task with its rq lock held. Allows callers to specify 1051 * whether they would like to filter out dead tasks. See scx_task_iter_start() 1052 * for details. 
1053 */ 1054 static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) 1055 { 1056 struct task_struct *p; 1057 1058 __scx_task_iter_rq_unlock(iter); 1059 1060 while ((p = scx_task_iter_next(iter))) { 1061 /* 1062 * scx_task_iter is used to prepare and move tasks into SCX 1063 * while loading the BPF scheduler and vice-versa while 1064 * unloading. The init_tasks ("swappers") should be excluded 1065 * from the iteration because: 1066 * 1067 * - It's unsafe to use __setschduler_prio() on an init_task to 1068 * determine the sched_class to use as it won't preserve its 1069 * idle_sched_class. 1070 * 1071 * - ops.init/exit_task() can easily be confused if called with 1072 * init_tasks as they, e.g., share PID 0. 1073 * 1074 * As init_tasks are never scheduled through SCX, they can be 1075 * skipped safely. Note that is_idle_task() which tests %PF_IDLE 1076 * doesn't work here: 1077 * 1078 * - %PF_IDLE may not be set for an init_task whose CPU hasn't 1079 * yet been onlined. 1080 * 1081 * - %PF_IDLE can be set on tasks that are not init_tasks. See 1082 * play_idle_precise() used by CONFIG_IDLE_INJECT. 1083 * 1084 * Test for idle_sched_class as only init_tasks are on it. 1085 */ 1086 if (p->sched_class == &idle_sched_class) 1087 continue; 1088 1089 iter->rq = task_rq_lock(p, &iter->rf); 1090 iter->locked_task = p; 1091 1092 /* 1093 * cgroup_task_dead() removes the dead tasks from cset->tasks 1094 * after sched_ext_dead() and cgroup iteration may see tasks 1095 * which already finished sched_ext_dead(). %SCX_TASK_DEAD is 1096 * set by sched_ext_dead() under @p's rq lock. Test it to 1097 * avoid visiting tasks which are already dead from SCX POV. 
1098 */ 1099 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 1100 __scx_task_iter_rq_unlock(iter); 1101 continue; 1102 } 1103 1104 return p; 1105 } 1106 return NULL; 1107 } 1108 1109 /** 1110 * scx_add_event - Increase an event counter for 'name' by 'cnt' 1111 * @sch: scx_sched to account events for 1112 * @name: an event name defined in struct scx_event_stats 1113 * @cnt: the number of the event occurred 1114 * 1115 * This can be used when preemption is not disabled. 1116 */ 1117 #define scx_add_event(sch, name, cnt) do { \ 1118 this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ 1119 trace_sched_ext_event(#name, (cnt)); \ 1120 } while(0) 1121 1122 /** 1123 * __scx_add_event - Increase an event counter for 'name' by 'cnt' 1124 * @sch: scx_sched to account events for 1125 * @name: an event name defined in struct scx_event_stats 1126 * @cnt: the number of the event occurred 1127 * 1128 * This should be used only when preemption is disabled. 1129 */ 1130 #define __scx_add_event(sch, name, cnt) do { \ 1131 __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ 1132 trace_sched_ext_event(#name, cnt); \ 1133 } while(0) 1134 1135 /** 1136 * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e' 1137 * @dst_e: destination event stats 1138 * @src_e: source event stats 1139 * @kind: a kind of event to be aggregated 1140 */ 1141 #define scx_agg_event(dst_e, src_e, kind) do { \ 1142 (dst_e)->kind += READ_ONCE((src_e)->kind); \ 1143 } while(0) 1144 1145 /** 1146 * scx_dump_event - Dump an event 'kind' in 'events' to 's' 1147 * @s: output seq_buf 1148 * @events: event stats 1149 * @kind: a kind of event to dump 1150 */ 1151 #define scx_dump_event(s, events, kind) do { \ 1152 dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \ 1153 } while (0) 1154 1155 1156 static void scx_read_events(struct scx_sched *sch, 1157 struct scx_event_stats *events); 1158 1159 static enum scx_enable_state scx_enable_state(void) 1160 { 1161 return 
atomic_read(&scx_enable_state_var); 1162 } 1163 1164 static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to) 1165 { 1166 return atomic_xchg(&scx_enable_state_var, to); 1167 } 1168 1169 static bool scx_tryset_enable_state(enum scx_enable_state to, 1170 enum scx_enable_state from) 1171 { 1172 int from_v = from; 1173 1174 return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to); 1175 } 1176 1177 /** 1178 * wait_ops_state - Busy-wait the specified ops state to end 1179 * @p: target task 1180 * @opss: state to wait the end of 1181 * 1182 * Busy-wait for @p to transition out of @opss. This can only be used when the 1183 * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also 1184 * has load_acquire semantics to ensure that the caller can see the updates made 1185 * in the enqueueing and dispatching paths. 1186 */ 1187 static void wait_ops_state(struct task_struct *p, unsigned long opss) 1188 { 1189 do { 1190 cpu_relax(); 1191 } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); 1192 } 1193 1194 static inline bool __cpu_valid(s32 cpu) 1195 { 1196 return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); 1197 } 1198 1199 /** 1200 * scx_cpu_valid - Verify a cpu number, to be used on ops input args 1201 * @sch: scx_sched to abort on error 1202 * @cpu: cpu number which came from a BPF ops 1203 * @where: extra information reported on error 1204 * 1205 * @cpu is a cpu number which came from the BPF scheduler and can be any value. 1206 * Verify that it is in range and one of the possible cpus. If invalid, trigger 1207 * an ops error. 1208 */ 1209 bool scx_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) 1210 { 1211 if (__cpu_valid(cpu)) { 1212 return true; 1213 } else { 1214 scx_error(sch, "invalid CPU %d%s%s", cpu, where ? 
" " : "", where ?: ""); 1215 return false; 1216 } 1217 } 1218 1219 /** 1220 * ops_sanitize_err - Sanitize a -errno value 1221 * @sch: scx_sched to error out on error 1222 * @ops_name: operation to blame on failure 1223 * @err: -errno value to sanitize 1224 * 1225 * Verify @err is a valid -errno. If not, trigger scx_error() and return 1226 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can 1227 * cause misbehaviors. For an example, a large negative return from 1228 * ops.init_task() triggers an oops when passed up the call chain because the 1229 * value fails IS_ERR() test after being encoded with ERR_PTR() and then is 1230 * handled as a pointer. 1231 */ 1232 static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err) 1233 { 1234 if (err < 0 && err >= -MAX_ERRNO) 1235 return err; 1236 1237 scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err); 1238 return -EPROTO; 1239 } 1240 1241 static void deferred_bal_cb_workfn(struct rq *rq) 1242 { 1243 run_deferred(rq); 1244 } 1245 1246 static void deferred_irq_workfn(struct irq_work *irq_work) 1247 { 1248 struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work); 1249 1250 raw_spin_rq_lock(rq); 1251 run_deferred(rq); 1252 raw_spin_rq_unlock(rq); 1253 } 1254 1255 /** 1256 * schedule_deferred - Schedule execution of deferred actions on an rq 1257 * @rq: target rq 1258 * 1259 * Schedule execution of deferred actions on @rq. Deferred actions are executed 1260 * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks 1261 * to other rqs. 1262 */ 1263 static void schedule_deferred(struct rq *rq) 1264 { 1265 /* 1266 * This is the fallback when schedule_deferred_locked() can't use 1267 * the cheaper balance callback or wakeup hook paths (the target 1268 * CPU is not in balance or wakeup). Currently, this is primarily 1269 * hit by reenqueue operations targeting a remote CPU. 1270 * 1271 * Queue on the target CPU. 
The deferred work can run from any CPU 1272 * correctly - the _locked() path already processes remote rqs from 1273 * the calling CPU - but targeting the owning CPU allows IPI delivery 1274 * without waiting for the calling CPU to re-enable IRQs and is 1275 * cheaper as the reenqueue runs locally. 1276 */ 1277 irq_work_queue_on(&rq->scx.deferred_irq_work, cpu_of(rq)); 1278 } 1279 1280 /** 1281 * schedule_deferred_locked - Schedule execution of deferred actions on an rq 1282 * @rq: target rq 1283 * 1284 * Schedule execution of deferred actions on @rq. Equivalent to 1285 * schedule_deferred() but requires @rq to be locked and can be more efficient. 1286 */ 1287 static void schedule_deferred_locked(struct rq *rq) 1288 { 1289 lockdep_assert_rq_held(rq); 1290 1291 /* 1292 * If in the middle of waking up a task, task_woken_scx() will be called 1293 * afterwards which will then run the deferred actions, no need to 1294 * schedule anything. 1295 */ 1296 if (rq->scx.flags & SCX_RQ_IN_WAKEUP) 1297 return; 1298 1299 /* Don't do anything if there already is a deferred operation. */ 1300 if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING) 1301 return; 1302 1303 /* 1304 * If in balance, the balance callbacks will be called before rq lock is 1305 * released. Schedule one. 1306 * 1307 * 1308 * We can't directly insert the callback into the 1309 * rq's list: The call can drop its lock and make the pending balance 1310 * callback visible to unrelated code paths that call rq_pin_lock(). 1311 * 1312 * Just let balance_one() know that it must do it itself. 1313 */ 1314 if (rq->scx.flags & SCX_RQ_IN_BALANCE) { 1315 rq->scx.flags |= SCX_RQ_BAL_CB_PENDING; 1316 return; 1317 } 1318 1319 /* 1320 * No scheduler hooks available. Use the generic irq_work path. The 1321 * above WAKEUP and BALANCE paths should cover most of the cases and the 1322 * time to IRQ re-enable shouldn't be long. 
1323 */ 1324 schedule_deferred(rq); 1325 } 1326 1327 static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq, 1328 u64 reenq_flags, struct rq *locked_rq) 1329 { 1330 struct rq *rq; 1331 1332 /* 1333 * Allowing reenqueues doesn't make sense while bypassing. This also 1334 * blocks from new reenqueues to be scheduled on dead scheds. 1335 */ 1336 if (unlikely(READ_ONCE(sch->bypass_depth))) 1337 return; 1338 1339 if (dsq->id == SCX_DSQ_LOCAL) { 1340 rq = container_of(dsq, struct rq, scx.local_dsq); 1341 1342 struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq)); 1343 struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local; 1344 1345 /* 1346 * Pairs with smp_mb() in process_deferred_reenq_locals() and 1347 * guarantees that there is a reenq_local() afterwards. 1348 */ 1349 smp_mb(); 1350 1351 if (list_empty(&drl->node) || 1352 (READ_ONCE(drl->flags) & reenq_flags) != reenq_flags) { 1353 1354 guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); 1355 1356 if (list_empty(&drl->node)) 1357 list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals); 1358 WRITE_ONCE(drl->flags, drl->flags | reenq_flags); 1359 } 1360 } else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) { 1361 rq = this_rq(); 1362 1363 struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq)); 1364 struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user; 1365 1366 /* 1367 * Pairs with smp_mb() in process_deferred_reenq_users() and 1368 * guarantees that there is a reenq_user() afterwards. 
1369 */ 1370 smp_mb(); 1371 1372 if (list_empty(&dru->node) || 1373 (READ_ONCE(dru->flags) & reenq_flags) != reenq_flags) { 1374 1375 guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); 1376 1377 if (list_empty(&dru->node)) 1378 list_move_tail(&dru->node, &rq->scx.deferred_reenq_users); 1379 WRITE_ONCE(dru->flags, dru->flags | reenq_flags); 1380 } 1381 } else { 1382 scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id); 1383 return; 1384 } 1385 1386 if (rq == locked_rq) 1387 schedule_deferred_locked(rq); 1388 else 1389 schedule_deferred(rq); 1390 } 1391 1392 static void schedule_reenq_local(struct rq *rq, u64 reenq_flags) 1393 { 1394 struct scx_sched *root = rcu_dereference_sched(scx_root); 1395 1396 if (WARN_ON_ONCE(!root)) 1397 return; 1398 1399 schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags, rq); 1400 } 1401 1402 /** 1403 * touch_core_sched - Update timestamp used for core-sched task ordering 1404 * @rq: rq to read clock from, must be locked 1405 * @p: task to update the timestamp for 1406 * 1407 * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to 1408 * implement global or local-DSQ FIFO ordering for core-sched. Should be called 1409 * when a task becomes runnable and its turn on the CPU ends (e.g. slice 1410 * exhaustion). 1411 */ 1412 static void touch_core_sched(struct rq *rq, struct task_struct *p) 1413 { 1414 lockdep_assert_rq_held(rq); 1415 1416 #ifdef CONFIG_SCHED_CORE 1417 /* 1418 * It's okay to update the timestamp spuriously. Use 1419 * sched_core_disabled() which is cheaper than enabled(). 1420 * 1421 * As this is used to determine ordering between tasks of sibling CPUs, 1422 * it may be better to use per-core dispatch sequence instead. 
1423 */ 1424 if (!sched_core_disabled()) 1425 p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq)); 1426 #endif 1427 } 1428 1429 /** 1430 * touch_core_sched_dispatch - Update core-sched timestamp on dispatch 1431 * @rq: rq to read clock from, must be locked 1432 * @p: task being dispatched 1433 * 1434 * If the BPF scheduler implements custom core-sched ordering via 1435 * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO 1436 * ordering within each local DSQ. This function is called from dispatch paths 1437 * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. 1438 */ 1439 static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) 1440 { 1441 lockdep_assert_rq_held(rq); 1442 1443 #ifdef CONFIG_SCHED_CORE 1444 if (unlikely(SCX_HAS_OP(scx_root, core_sched_before))) 1445 touch_core_sched(rq, p); 1446 #endif 1447 } 1448 1449 static void update_curr_scx(struct rq *rq) 1450 { 1451 struct task_struct *curr = rq->curr; 1452 s64 delta_exec; 1453 1454 delta_exec = update_curr_common(rq); 1455 if (unlikely(delta_exec <= 0)) 1456 return; 1457 1458 if (curr->scx.slice != SCX_SLICE_INF) { 1459 curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec); 1460 if (!curr->scx.slice) 1461 touch_core_sched(rq, curr); 1462 } 1463 1464 dl_server_update(&rq->ext_server, delta_exec); 1465 } 1466 1467 static bool scx_dsq_priq_less(struct rb_node *node_a, 1468 const struct rb_node *node_b) 1469 { 1470 const struct task_struct *a = 1471 container_of(node_a, struct task_struct, scx.dsq_priq); 1472 const struct task_struct *b = 1473 container_of(node_b, struct task_struct, scx.dsq_priq); 1474 1475 return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); 1476 } 1477 1478 static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) 1479 { 1480 /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ 1481 WRITE_ONCE(dsq->nr, dsq->nr + 1); 1482 1483 /* 1484 * Once @p reaches a local DSQ, 
it can only leave it by being dispatched 1485 * to the CPU or dequeued. In both cases, the only way @p can go back to 1486 * the BPF sched is through enqueueing. If being inserted into a local 1487 * DSQ with IMMED, persist the state until the next enqueueing event in 1488 * do_enqueue_task() so that we can maintain IMMED protection through 1489 * e.g. SAVE/RESTORE cycles and slice extensions. 1490 */ 1491 if (enq_flags & SCX_ENQ_IMMED) { 1492 if (unlikely(dsq->id != SCX_DSQ_LOCAL)) { 1493 WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK)); 1494 return; 1495 } 1496 p->scx.flags |= SCX_TASK_IMMED; 1497 } 1498 1499 if (p->scx.flags & SCX_TASK_IMMED) { 1500 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 1501 1502 if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) 1503 return; 1504 1505 rq->scx.nr_immed++; 1506 1507 /* 1508 * If @rq already had other tasks or the current task is not 1509 * done yet, @p can't go on the CPU immediately. Re-enqueue. 1510 */ 1511 if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags))) 1512 schedule_reenq_local(rq, 0); 1513 } 1514 } 1515 1516 static void dsq_dec_nr(struct scx_dispatch_q *dsq, struct task_struct *p) 1517 { 1518 /* see dsq_inc_nr() */ 1519 WRITE_ONCE(dsq->nr, dsq->nr - 1); 1520 1521 if (p->scx.flags & SCX_TASK_IMMED) { 1522 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 1523 1524 if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) || 1525 WARN_ON_ONCE(rq->scx.nr_immed <= 0)) 1526 return; 1527 1528 rq->scx.nr_immed--; 1529 } 1530 } 1531 1532 static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) 1533 { 1534 p->scx.slice = READ_ONCE(sch->slice_dfl); 1535 __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); 1536 } 1537 1538 /* 1539 * Return true if @p is moving due to an internal SCX migration, false 1540 * otherwise. 
1541 */ 1542 static inline bool task_scx_migrating(struct task_struct *p) 1543 { 1544 /* 1545 * We only need to check sticky_cpu: it is set to the destination 1546 * CPU in move_remote_task_to_local_dsq() before deactivate_task() 1547 * and cleared when the task is enqueued on the destination, so it 1548 * is only non-negative during an internal SCX migration. 1549 */ 1550 return p->scx.sticky_cpu >= 0; 1551 } 1552 1553 /* 1554 * Call ops.dequeue() if the task is in BPF custody and not migrating. 1555 * Clears %SCX_TASK_IN_CUSTODY when the callback is invoked. 1556 */ 1557 static void call_task_dequeue(struct scx_sched *sch, struct rq *rq, 1558 struct task_struct *p, u64 deq_flags) 1559 { 1560 if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p)) 1561 return; 1562 1563 if (SCX_HAS_OP(sch, dequeue)) 1564 SCX_CALL_OP_TASK(sch, dequeue, rq, p, deq_flags); 1565 1566 p->scx.flags &= ~SCX_TASK_IN_CUSTODY; 1567 } 1568 1569 static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq, 1570 struct task_struct *p, u64 enq_flags) 1571 { 1572 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 1573 1574 call_task_dequeue(sch, rq, p, 0); 1575 1576 /* 1577 * Note that @rq's lock may be dropped between this enqueue and @p 1578 * actually getting on CPU. This gives higher-class tasks (e.g. RT) 1579 * an opportunity to wake up on @rq and prevent @p from running. 1580 * Here are some concrete examples: 1581 * 1582 * Example 1: 1583 * 1584 * We dispatch two tasks from a single ops.dispatch(): 1585 * - First, a local task to this CPU's local DSQ; 1586 * - Second, a local/remote task to a remote CPU's local DSQ. 1587 * We must drop the local rq lock in order to finish the second 1588 * dispatch. In that time, an RT task can wake up on the local rq. 1589 * 1590 * Example 2: 1591 * 1592 * We dispatch a local/remote task to a remote CPU's local DSQ. 
1593 * We must drop the remote rq lock before the dispatched task can run, 1594 * which gives an RT task an opportunity to wake up on the remote rq. 1595 * 1596 * Both examples work the same if we replace dispatching with moving 1597 * the tasks from a user-created DSQ. 1598 * 1599 * We must detect these wakeups so that we can re-enqueue IMMED tasks 1600 * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this 1601 * purpose, but for it to be invoked, we must ensure that we bump 1602 * @rq->next_class to &ext_sched_class if it's currently idle. 1603 * 1604 * wakeup_preempt() does the bumping, and since we only invoke it if 1605 * @rq->next_class is below &ext_sched_class, it will also 1606 * resched_curr(rq). 1607 */ 1608 if (sched_class_above(p->sched_class, rq->next_class)) 1609 wakeup_preempt(rq, p, 0); 1610 1611 /* 1612 * If @rq is in balance, the CPU is already vacant and looking for the 1613 * next task to run. No need to preempt or trigger resched after moving 1614 * @p into its local DSQ. 1615 * Note that the wakeup_preempt() above may have already triggered 1616 * a resched if @rq->next_class was idle. It's harmless, since 1617 * need_resched is cleared immediately after task pick. 1618 */ 1619 if (rq->scx.flags & SCX_RQ_IN_BALANCE) 1620 return; 1621 1622 if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && 1623 rq->curr->sched_class == &ext_sched_class) { 1624 rq->curr->scx.slice = 0; 1625 resched_curr(rq); 1626 } 1627 } 1628 1629 static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, 1630 struct scx_dispatch_q *dsq, struct task_struct *p, 1631 u64 enq_flags) 1632 { 1633 bool is_local = dsq->id == SCX_DSQ_LOCAL; 1634 1635 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); 1636 WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || 1637 !RB_EMPTY_NODE(&p->scx.dsq_priq)); 1638 1639 if (!is_local) { 1640 raw_spin_lock_nested(&dsq->lock, 1641 (enq_flags & SCX_ENQ_NESTED) ? 
SINGLE_DEPTH_NESTING : 0); 1642 1643 if (unlikely(dsq->id == SCX_DSQ_INVALID)) { 1644 scx_error(sch, "attempting to dispatch to a destroyed dsq"); 1645 /* fall back to the global dsq */ 1646 raw_spin_unlock(&dsq->lock); 1647 dsq = find_global_dsq(sch, task_cpu(p)); 1648 raw_spin_lock(&dsq->lock); 1649 } 1650 } 1651 1652 if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && 1653 (enq_flags & SCX_ENQ_DSQ_PRIQ))) { 1654 /* 1655 * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from 1656 * their FIFO queues. To avoid confusion and accidentally 1657 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we 1658 * disallow any internal DSQ from doing vtime ordering of 1659 * tasks. 1660 */ 1661 scx_error(sch, "cannot use vtime ordering for built-in DSQs"); 1662 enq_flags &= ~SCX_ENQ_DSQ_PRIQ; 1663 } 1664 1665 if (enq_flags & SCX_ENQ_DSQ_PRIQ) { 1666 struct rb_node *rbp; 1667 1668 /* 1669 * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are 1670 * linked to both the rbtree and list on PRIQs, this can only be 1671 * tested easily when adding the first task. 1672 */ 1673 if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && 1674 nldsq_next_task(dsq, NULL, false))) 1675 scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks", 1676 dsq->id); 1677 1678 p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; 1679 rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less); 1680 1681 /* 1682 * Find the previous task and insert after it on the list so 1683 * that @dsq->list is vtime ordered. 
1684 */ 1685 rbp = rb_prev(&p->scx.dsq_priq); 1686 if (rbp) { 1687 struct task_struct *prev = 1688 container_of(rbp, struct task_struct, 1689 scx.dsq_priq); 1690 list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); 1691 /* first task unchanged - no update needed */ 1692 } else { 1693 list_add(&p->scx.dsq_list.node, &dsq->list); 1694 /* not builtin and new task is at head - use fastpath */ 1695 rcu_assign_pointer(dsq->first_task, p); 1696 } 1697 } else { 1698 /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ 1699 if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) 1700 scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", 1701 dsq->id); 1702 1703 if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) { 1704 list_add(&p->scx.dsq_list.node, &dsq->list); 1705 /* new task inserted at head - use fastpath */ 1706 if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) 1707 rcu_assign_pointer(dsq->first_task, p); 1708 } else { 1709 /* 1710 * dsq->list can contain parked BPF iterator cursors, so 1711 * list_empty() here isn't a reliable proxy for "no real 1712 * task in the DSQ". Test dsq->first_task directly. 1713 */ 1714 list_add_tail(&p->scx.dsq_list.node, &dsq->list); 1715 if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) 1716 rcu_assign_pointer(dsq->first_task, p); 1717 } 1718 } 1719 1720 /* seq records the order tasks are queued, used by BPF DSQ iterator */ 1721 WRITE_ONCE(dsq->seq, dsq->seq + 1); 1722 p->scx.dsq_seq = dsq->seq; 1723 1724 dsq_inc_nr(dsq, p, enq_flags); 1725 p->scx.dsq = dsq; 1726 1727 /* 1728 * Update custody and call ops.dequeue() before clearing ops_state: 1729 * once ops_state is cleared, waiters in ops_dequeue() can proceed 1730 * and dequeue_task_scx() will RMW p->scx.flags. If we clear 1731 * ops_state first, both sides would modify p->scx.flags 1732 * concurrently in a non-atomic way. 
1733 */ 1734 if (is_local) { 1735 local_dsq_post_enq(sch, dsq, p, enq_flags); 1736 } else { 1737 /* 1738 * Task on global/bypass DSQ: leave custody, task on 1739 * non-terminal DSQ: enter custody. 1740 */ 1741 if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS) 1742 call_task_dequeue(sch, rq, p, 0); 1743 else 1744 p->scx.flags |= SCX_TASK_IN_CUSTODY; 1745 1746 raw_spin_unlock(&dsq->lock); 1747 } 1748 1749 /* 1750 * We're transitioning out of QUEUEING or DISPATCHING. store_release to 1751 * match waiters' load_acquire. 1752 */ 1753 if (enq_flags & SCX_ENQ_CLEAR_OPSS) 1754 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1755 } 1756 1757 static void task_unlink_from_dsq(struct task_struct *p, 1758 struct scx_dispatch_q *dsq) 1759 { 1760 WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node)); 1761 1762 if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { 1763 rb_erase(&p->scx.dsq_priq, &dsq->priq); 1764 RB_CLEAR_NODE(&p->scx.dsq_priq); 1765 p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; 1766 } 1767 1768 list_del_init(&p->scx.dsq_list.node); 1769 dsq_dec_nr(dsq, p); 1770 1771 if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { 1772 struct task_struct *first_task; 1773 1774 first_task = nldsq_next_task(dsq, NULL, false); 1775 rcu_assign_pointer(dsq->first_task, first_task); 1776 } 1777 } 1778 1779 static void dispatch_dequeue(struct rq *rq, struct task_struct *p) 1780 { 1781 struct scx_dispatch_q *dsq = p->scx.dsq; 1782 bool is_local = dsq == &rq->scx.local_dsq; 1783 1784 lockdep_assert_rq_held(rq); 1785 1786 if (!dsq) { 1787 /* 1788 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. 1789 * Unlinking is all that's needed to cancel. 
1790 */ 1791 if (unlikely(!list_empty(&p->scx.dsq_list.node))) 1792 list_del_init(&p->scx.dsq_list.node); 1793 1794 /* 1795 * When dispatching directly from the BPF scheduler to a local 1796 * DSQ, the task isn't associated with any DSQ but 1797 * @p->scx.holding_cpu may be set under the protection of 1798 * %SCX_OPSS_DISPATCHING. 1799 */ 1800 if (p->scx.holding_cpu >= 0) 1801 p->scx.holding_cpu = -1; 1802 1803 return; 1804 } 1805 1806 if (!is_local) 1807 raw_spin_lock(&dsq->lock); 1808 1809 /* 1810 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't 1811 * change underneath us. 1812 */ 1813 if (p->scx.holding_cpu < 0) { 1814 /* @p must still be on @dsq, dequeue */ 1815 task_unlink_from_dsq(p, dsq); 1816 } else { 1817 /* 1818 * We're racing against dispatch_to_local_dsq() which already 1819 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the 1820 * holding_cpu which tells dispatch_to_local_dsq() that it lost 1821 * the race. 1822 */ 1823 WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node)); 1824 p->scx.holding_cpu = -1; 1825 } 1826 p->scx.dsq = NULL; 1827 1828 if (!is_local) 1829 raw_spin_unlock(&dsq->lock); 1830 } 1831 1832 /* 1833 * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq 1834 * and dsq are locked. 
1835 */ 1836 static void dispatch_dequeue_locked(struct task_struct *p, 1837 struct scx_dispatch_q *dsq) 1838 { 1839 lockdep_assert_rq_held(task_rq(p)); 1840 lockdep_assert_held(&dsq->lock); 1841 1842 task_unlink_from_dsq(p, dsq); 1843 p->scx.dsq = NULL; 1844 } 1845 1846 static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, 1847 struct rq *rq, u64 dsq_id, 1848 s32 tcpu) 1849 { 1850 struct scx_dispatch_q *dsq; 1851 1852 if (dsq_id == SCX_DSQ_LOCAL) 1853 return &rq->scx.local_dsq; 1854 1855 if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 1856 s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK); 1857 1858 if (!scx_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) 1859 return find_global_dsq(sch, tcpu); 1860 1861 return &cpu_rq(cpu)->scx.local_dsq; 1862 } 1863 1864 if (dsq_id == SCX_DSQ_GLOBAL) 1865 dsq = find_global_dsq(sch, tcpu); 1866 else 1867 dsq = find_user_dsq(sch, dsq_id); 1868 1869 if (unlikely(!dsq)) { 1870 scx_error(sch, "non-existent DSQ 0x%llx", dsq_id); 1871 return find_global_dsq(sch, tcpu); 1872 } 1873 1874 return dsq; 1875 } 1876 1877 static void mark_direct_dispatch(struct scx_sched *sch, 1878 struct task_struct *ddsp_task, 1879 struct task_struct *p, u64 dsq_id, 1880 u64 enq_flags) 1881 { 1882 /* 1883 * Mark that dispatch already happened from ops.select_cpu() or 1884 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value 1885 * which can never match a valid task pointer. 
1886 */ 1887 __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); 1888 1889 /* @p must match the task on the enqueue path */ 1890 if (unlikely(p != ddsp_task)) { 1891 if (IS_ERR(ddsp_task)) 1892 scx_error(sch, "%s[%d] already direct-dispatched", 1893 p->comm, p->pid); 1894 else 1895 scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]", 1896 ddsp_task->comm, ddsp_task->pid, 1897 p->comm, p->pid); 1898 return; 1899 } 1900 1901 WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); 1902 WARN_ON_ONCE(p->scx.ddsp_enq_flags); 1903 1904 p->scx.ddsp_dsq_id = dsq_id; 1905 p->scx.ddsp_enq_flags = enq_flags; 1906 } 1907 1908 /* 1909 * Clear @p direct dispatch state when leaving the scheduler. 1910 * 1911 * Direct dispatch state must be cleared in the following cases: 1912 * - direct_dispatch(): cleared on the synchronous enqueue path, deferred 1913 * dispatch keeps the state until consumed 1914 * - process_ddsp_deferred_locals(): cleared after consuming deferred state, 1915 * - do_enqueue_task(): cleared on enqueue fallbacks where the dispatch 1916 * verdict is ignored (local/global/bypass) 1917 * - dequeue_task_scx(): cleared after dispatch_dequeue(), covering deferred 1918 * cancellation and holding_cpu races 1919 * - scx_disable_task(): cleared for queued wakeup tasks, which are excluded by 1920 * the scx_bypass() loop, so that stale state is not reused by a subsequent 1921 * scheduler instance 1922 */ 1923 static inline void clear_direct_dispatch(struct task_struct *p) 1924 { 1925 p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; 1926 p->scx.ddsp_enq_flags = 0; 1927 } 1928 1929 static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, 1930 u64 enq_flags) 1931 { 1932 struct rq *rq = task_rq(p); 1933 struct scx_dispatch_q *dsq = 1934 find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p)); 1935 u64 ddsp_enq_flags; 1936 1937 touch_core_sched_dispatch(rq, p); 1938 1939 p->scx.ddsp_enq_flags |= enq_flags; 1940 1941 /* 1942 * We are in the 
enqueue path with @rq locked and pinned, and thus can't 1943 * double lock a remote rq and enqueue to its local DSQ. For 1944 * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer 1945 * the enqueue so that it's executed when @rq can be unlocked. 1946 */ 1947 if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { 1948 unsigned long opss; 1949 1950 opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; 1951 1952 switch (opss & SCX_OPSS_STATE_MASK) { 1953 case SCX_OPSS_NONE: 1954 break; 1955 case SCX_OPSS_QUEUEING: 1956 /* 1957 * As @p was never passed to the BPF side, _release is 1958 * not strictly necessary. Still do it for consistency. 1959 */ 1960 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1961 break; 1962 default: 1963 WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()", 1964 p->comm, p->pid, opss); 1965 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1966 break; 1967 } 1968 1969 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); 1970 list_add_tail(&p->scx.dsq_list.node, 1971 &rq->scx.ddsp_deferred_locals); 1972 schedule_deferred_locked(rq); 1973 return; 1974 } 1975 1976 ddsp_enq_flags = p->scx.ddsp_enq_flags; 1977 clear_direct_dispatch(p); 1978 1979 dispatch_enqueue(sch, rq, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); 1980 } 1981 1982 static bool scx_rq_online(struct rq *rq) 1983 { 1984 /* 1985 * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates 1986 * the online state as seen from the BPF scheduler. cpu_active() test 1987 * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will 1988 * stay set until the current scheduling operation is complete even if 1989 * we aren't locking @rq. 
1990 */ 1991 return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq))); 1992 } 1993 1994 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, 1995 int sticky_cpu) 1996 { 1997 struct scx_sched *sch = scx_task_sched(p); 1998 struct task_struct **ddsp_taskp; 1999 struct scx_dispatch_q *dsq; 2000 unsigned long qseq; 2001 2002 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); 2003 2004 /* internal movements - rq migration / RESTORE */ 2005 if (sticky_cpu == cpu_of(rq)) 2006 goto local_norefill; 2007 2008 /* 2009 * Clear persistent TASK_IMMED for fresh enqueues, see dsq_inc_nr(). 2010 * Note that exiting and migration-disabled tasks that skip 2011 * ops.enqueue() below will lose IMMED protection unless 2012 * %SCX_OPS_ENQ_EXITING / %SCX_OPS_ENQ_MIGRATION_DISABLED are set. 2013 */ 2014 p->scx.flags &= ~SCX_TASK_IMMED; 2015 2016 /* 2017 * If !scx_rq_online(), we already told the BPF scheduler that the CPU 2018 * is offline and are just running the hotplug path. Don't bother the 2019 * BPF scheduler. 
2020 */ 2021 if (!scx_rq_online(rq)) 2022 goto local; 2023 2024 if (scx_bypassing(sch, cpu_of(rq))) { 2025 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 2026 goto bypass; 2027 } 2028 2029 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 2030 goto direct; 2031 2032 /* see %SCX_OPS_ENQ_EXITING */ 2033 if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) && 2034 unlikely(p->flags & PF_EXITING)) { 2035 __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1); 2036 goto local; 2037 } 2038 2039 /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ 2040 if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) && 2041 is_migration_disabled(p)) { 2042 __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); 2043 goto local; 2044 } 2045 2046 if (unlikely(!SCX_HAS_OP(sch, enqueue))) 2047 goto global; 2048 2049 /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ 2050 qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; 2051 2052 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 2053 atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); 2054 2055 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 2056 WARN_ON_ONCE(*ddsp_taskp); 2057 *ddsp_taskp = p; 2058 2059 SCX_CALL_OP_TASK(sch, enqueue, rq, p, enq_flags); 2060 2061 *ddsp_taskp = NULL; 2062 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 2063 goto direct; 2064 2065 /* 2066 * Task is now in BPF scheduler's custody. Set %SCX_TASK_IN_CUSTODY 2067 * so ops.dequeue() is called when it leaves custody. 2068 */ 2069 p->scx.flags |= SCX_TASK_IN_CUSTODY; 2070 2071 /* 2072 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or 2073 * dequeue may be waiting. The store_release matches their load_acquire. 
2074 */ 2075 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); 2076 return; 2077 2078 direct: 2079 direct_dispatch(sch, p, enq_flags); 2080 return; 2081 local_norefill: 2082 dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags); 2083 return; 2084 local: 2085 dsq = &rq->scx.local_dsq; 2086 goto enqueue; 2087 global: 2088 dsq = find_global_dsq(sch, task_cpu(p)); 2089 goto enqueue; 2090 bypass: 2091 dsq = bypass_enq_target_dsq(sch, task_cpu(p)); 2092 goto enqueue; 2093 2094 enqueue: 2095 /* 2096 * For task-ordering, slice refill must be treated as implying the end 2097 * of the current slice. Otherwise, the longer @p stays on the CPU, the 2098 * higher priority it becomes from scx_prio_less()'s POV. 2099 */ 2100 touch_core_sched(rq, p); 2101 refill_task_slice_dfl(sch, p); 2102 clear_direct_dispatch(p); 2103 dispatch_enqueue(sch, rq, dsq, p, enq_flags); 2104 } 2105 2106 static bool task_runnable(const struct task_struct *p) 2107 { 2108 return !list_empty(&p->scx.runnable_node); 2109 } 2110 2111 static void set_task_runnable(struct rq *rq, struct task_struct *p) 2112 { 2113 lockdep_assert_rq_held(rq); 2114 2115 if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { 2116 p->scx.runnable_at = jiffies; 2117 p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; 2118 } 2119 2120 /* 2121 * list_add_tail() must be used. scx_bypass() depends on tasks being 2122 * appended to the runnable_list. 
2123 */ 2124 list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); 2125 } 2126 2127 static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) 2128 { 2129 list_del_init(&p->scx.runnable_node); 2130 if (reset_runnable_at) 2131 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 2132 } 2133 2134 static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags) 2135 { 2136 struct scx_sched *sch = scx_task_sched(p); 2137 int sticky_cpu = p->scx.sticky_cpu; 2138 u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags; 2139 2140 if (enq_flags & ENQUEUE_WAKEUP) 2141 rq->scx.flags |= SCX_RQ_IN_WAKEUP; 2142 2143 /* 2144 * Restoring a running task will be immediately followed by 2145 * set_next_task_scx() which expects the task to not be on the BPF 2146 * scheduler as tasks can only start running through local DSQs. Force 2147 * direct-dispatch into the local DSQ by setting the sticky_cpu. 2148 */ 2149 if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) 2150 sticky_cpu = cpu_of(rq); 2151 2152 if (p->scx.flags & SCX_TASK_QUEUED) { 2153 WARN_ON_ONCE(!task_runnable(p)); 2154 goto out; 2155 } 2156 2157 set_task_runnable(rq, p); 2158 p->scx.flags |= SCX_TASK_QUEUED; 2159 rq->scx.nr_running++; 2160 add_nr_running(rq, 1); 2161 2162 if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p)) 2163 SCX_CALL_OP_TASK(sch, runnable, rq, p, enq_flags); 2164 2165 if (enq_flags & SCX_ENQ_WAKEUP) 2166 touch_core_sched(rq, p); 2167 2168 /* Start dl_server if this is the first task being enqueued */ 2169 if (rq->scx.nr_running == 1) 2170 dl_server_start(&rq->ext_server); 2171 2172 do_enqueue_task(rq, p, enq_flags, sticky_cpu); 2173 2174 if (sticky_cpu >= 0) 2175 p->scx.sticky_cpu = -1; 2176 out: 2177 rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; 2178 2179 if ((enq_flags & SCX_ENQ_CPU_SELECTED) && 2180 unlikely(cpu_of(rq) != p->scx.selected_cpu)) 2181 __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1); 2182 } 2183 2184 static void ops_dequeue(struct 
rq *rq, struct task_struct *p, u64 deq_flags) 2185 { 2186 struct scx_sched *sch = scx_task_sched(p); 2187 unsigned long opss; 2188 2189 /* dequeue is always temporary, don't reset runnable_at */ 2190 clr_task_runnable(p, false); 2191 2192 retry: 2193 /* acquire ensures that we see the preceding updates on QUEUED */ 2194 opss = atomic_long_read_acquire(&p->scx.ops_state); 2195 2196 switch (opss & SCX_OPSS_STATE_MASK) { 2197 case SCX_OPSS_NONE: 2198 break; 2199 case SCX_OPSS_QUEUEING: 2200 /* 2201 * QUEUEING is started and finished while holding @p's rq lock. 2202 * As we're holding the rq lock now, we shouldn't see QUEUEING. 2203 */ 2204 BUG(); 2205 case SCX_OPSS_QUEUED: 2206 /* 2207 * A queued task must always be in BPF scheduler's custody. If 2208 * SCX_TASK_IN_CUSTODY is clear, finish_dispatch() on another 2209 * CPU has already passed call_task_dequeue() (which clears the 2210 * flag), but has not yet written SCX_OPSS_NONE. That final 2211 * store does not require this rq's lock, so retrying with 2212 * cpu_relax() is bounded: we will observe NONE (or DISPATCHING, 2213 * handled by the fallthrough) on a subsequent iteration. 2214 */ 2215 if (unlikely(!(READ_ONCE(p->scx.flags) & SCX_TASK_IN_CUSTODY))) { 2216 cpu_relax(); 2217 goto retry; 2218 } 2219 2220 if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 2221 SCX_OPSS_NONE)) 2222 break; 2223 fallthrough; 2224 case SCX_OPSS_DISPATCHING: 2225 /* 2226 * If @p is being dispatched from the BPF scheduler to a DSQ, 2227 * wait for the transfer to complete so that @p doesn't get 2228 * added to its DSQ after dequeueing is complete. 2229 * 2230 * As we're waiting on DISPATCHING with the rq locked, the 2231 * dispatching side shouldn't try to lock the rq while 2232 * DISPATCHING is set. See dispatch_to_local_dsq(). 2233 * 2234 * DISPATCHING shouldn't have qseq set and control can reach 2235 * here with NONE @opss from the above QUEUED case block. 2236 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 
2237 */ 2238 wait_ops_state(p, SCX_OPSS_DISPATCHING); 2239 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 2240 break; 2241 } 2242 2243 /* 2244 * Call ops.dequeue() if the task is still in BPF custody. 2245 * 2246 * The code that clears ops_state to %SCX_OPSS_NONE does not always 2247 * clear %SCX_TASK_IN_CUSTODY: in dispatch_to_local_dsq(), when 2248 * we're moving a task that was in %SCX_OPSS_DISPATCHING to a 2249 * remote CPU's local DSQ, we only set ops_state to %SCX_OPSS_NONE 2250 * so that a concurrent dequeue can proceed, but we clear 2251 * %SCX_TASK_IN_CUSTODY only when we later enqueue or move the 2252 * task. So we can see NONE + IN_CUSTODY here and we must handle 2253 * it. Similarly, after waiting on %SCX_OPSS_DISPATCHING we see 2254 * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until 2255 * it is enqueued on the destination. 2256 */ 2257 call_task_dequeue(sch, rq, p, deq_flags); 2258 } 2259 2260 static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int core_deq_flags) 2261 { 2262 struct scx_sched *sch = scx_task_sched(p); 2263 u64 deq_flags = core_deq_flags; 2264 2265 /* 2266 * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property 2267 * change (not sleep or core-sched pick). 2268 */ 2269 if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC))) 2270 deq_flags |= SCX_DEQ_SCHED_CHANGE; 2271 2272 if (!(p->scx.flags & SCX_TASK_QUEUED)) { 2273 WARN_ON_ONCE(task_runnable(p)); 2274 return true; 2275 } 2276 2277 ops_dequeue(rq, p, deq_flags); 2278 2279 /* 2280 * A currently running task which is going off @rq first gets dequeued 2281 * and then stops running. As we want running <-> stopping transitions 2282 * to be contained within runnable <-> quiescent transitions, trigger 2283 * ->stopping() early here instead of in put_prev_task_scx(). 
2284 * 2285 * @p may go through multiple stopping <-> running transitions between 2286 * here and put_prev_task_scx() if task attribute changes occur while 2287 * balance_one() leaves @rq unlocked. However, they don't contain any 2288 * information meaningful to the BPF scheduler and can be suppressed by 2289 * skipping the callbacks if the task is !QUEUED. 2290 */ 2291 if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) { 2292 update_curr_scx(rq); 2293 SCX_CALL_OP_TASK(sch, stopping, rq, p, false); 2294 } 2295 2296 if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p)) 2297 SCX_CALL_OP_TASK(sch, quiescent, rq, p, deq_flags); 2298 2299 if (deq_flags & SCX_DEQ_SLEEP) 2300 p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; 2301 else 2302 p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; 2303 2304 p->scx.flags &= ~SCX_TASK_QUEUED; 2305 rq->scx.nr_running--; 2306 sub_nr_running(rq, 1); 2307 2308 dispatch_dequeue(rq, p); 2309 clear_direct_dispatch(p); 2310 return true; 2311 } 2312 2313 static void yield_task_scx(struct rq *rq) 2314 { 2315 struct task_struct *p = rq->donor; 2316 struct scx_sched *sch = scx_task_sched(p); 2317 2318 if (SCX_HAS_OP(sch, yield)) 2319 SCX_CALL_OP_2TASKS_RET(sch, yield, rq, p, NULL); 2320 else 2321 p->scx.slice = 0; 2322 } 2323 2324 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) 2325 { 2326 struct task_struct *from = rq->donor; 2327 struct scx_sched *sch = scx_task_sched(from); 2328 2329 if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to)) 2330 return SCX_CALL_OP_2TASKS_RET(sch, yield, rq, from, to); 2331 else 2332 return false; 2333 } 2334 2335 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) 2336 { 2337 /* 2338 * Preemption between SCX tasks is implemented by resetting the victim 2339 * task's slice to 0 and triggering reschedule on the target CPU. 2340 * Nothing to do. 
2341 */ 2342 if (p->sched_class == &ext_sched_class) 2343 return; 2344 2345 /* 2346 * Getting preempted by a higher-priority class. Reenqueue IMMED tasks. 2347 * This captures all preemption cases including: 2348 * 2349 * - A SCX task is currently running. 2350 * 2351 * - @rq is waking from idle due to a SCX task waking to it. 2352 * 2353 * - A higher-priority wakes up while SCX dispatch is in progress. 2354 */ 2355 if (rq->scx.nr_immed) 2356 schedule_reenq_local(rq, 0); 2357 } 2358 2359 static void move_local_task_to_local_dsq(struct scx_sched *sch, 2360 struct task_struct *p, u64 enq_flags, 2361 struct scx_dispatch_q *src_dsq, 2362 struct rq *dst_rq) 2363 { 2364 struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq; 2365 2366 /* @dsq is locked and @p is on @dst_rq */ 2367 lockdep_assert_held(&src_dsq->lock); 2368 lockdep_assert_rq_held(dst_rq); 2369 2370 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 2371 2372 if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) 2373 list_add(&p->scx.dsq_list.node, &dst_dsq->list); 2374 else 2375 list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); 2376 2377 dsq_inc_nr(dst_dsq, p, enq_flags); 2378 p->scx.dsq = dst_dsq; 2379 2380 local_dsq_post_enq(sch, dst_dsq, p, enq_flags); 2381 } 2382 2383 /** 2384 * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ 2385 * @p: task to move 2386 * @enq_flags: %SCX_ENQ_* 2387 * @src_rq: rq to move the task from, locked on entry, released on return 2388 * @dst_rq: rq to move the task into, locked on return 2389 * 2390 * Move @p which is currently on @src_rq to @dst_rq's local DSQ. 2391 */ 2392 static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, 2393 struct rq *src_rq, struct rq *dst_rq) 2394 { 2395 lockdep_assert_rq_held(src_rq); 2396 2397 /* 2398 * Set sticky_cpu before deactivate_task() to properly mark the 2399 * beginning of an SCX-internal migration. 
2400 */ 2401 p->scx.sticky_cpu = cpu_of(dst_rq); 2402 deactivate_task(src_rq, p, 0); 2403 set_task_cpu(p, cpu_of(dst_rq)); 2404 2405 raw_spin_rq_unlock(src_rq); 2406 raw_spin_rq_lock(dst_rq); 2407 2408 /* 2409 * We want to pass scx-specific enq_flags but activate_task() will 2410 * truncate the upper 32 bit. As we own @rq, we can pass them through 2411 * @rq->scx.extra_enq_flags instead. 2412 */ 2413 WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); 2414 WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); 2415 dst_rq->scx.extra_enq_flags = enq_flags; 2416 activate_task(dst_rq, p, 0); 2417 dst_rq->scx.extra_enq_flags = 0; 2418 } 2419 2420 /* 2421 * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two 2422 * differences: 2423 * 2424 * - is_cpu_allowed() asks "Can this task run on this CPU?" while 2425 * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to 2426 * this CPU?". 2427 * 2428 * While migration is disabled, is_cpu_allowed() has to say "yes" as the task 2429 * must be allowed to finish on the CPU that it's currently on regardless of 2430 * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the 2431 * BPF scheduler shouldn't attempt to migrate a task which has migration 2432 * disabled. 2433 * 2434 * - The BPF scheduler is bypassed while the rq is offline and we can always say 2435 * no to the BPF scheduler initiated migrations while offline. 2436 * 2437 * The caller must ensure that @p and @rq are on different CPUs. 2438 */ 2439 static bool task_can_run_on_remote_rq(struct scx_sched *sch, 2440 struct task_struct *p, struct rq *rq, 2441 bool enforce) 2442 { 2443 s32 cpu = cpu_of(rq); 2444 2445 WARN_ON_ONCE(task_cpu(p) == cpu); 2446 2447 /* 2448 * If @p has migration disabled, @p->cpus_ptr is updated to contain only 2449 * the pinned CPU in migrate_disable_switch() while @p is being switched 2450 * out. 
However, put_prev_task_scx() is called before @p->cpus_ptr is 2451 * updated and thus another CPU may see @p on a DSQ inbetween leading to 2452 * @p passing the below task_allowed_on_cpu() check while migration is 2453 * disabled. 2454 * 2455 * Test the migration disabled state first as the race window is narrow 2456 * and the BPF scheduler failing to check migration disabled state can 2457 * easily be masked if task_allowed_on_cpu() is done first. 2458 */ 2459 if (unlikely(is_migration_disabled(p))) { 2460 if (enforce) 2461 scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", 2462 p->comm, p->pid, task_cpu(p), cpu); 2463 return false; 2464 } 2465 2466 /* 2467 * We don't require the BPF scheduler to avoid dispatching to offline 2468 * CPUs mostly for convenience but also because CPUs can go offline 2469 * between scx_bpf_dsq_insert() calls and here. Trigger error iff the 2470 * picked CPU is outside the allowed mask. 2471 */ 2472 if (!task_allowed_on_cpu(p, cpu)) { 2473 if (enforce) 2474 scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", 2475 cpu, p->comm, p->pid); 2476 return false; 2477 } 2478 2479 if (!scx_rq_online(rq)) { 2480 if (enforce) 2481 __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); 2482 return false; 2483 } 2484 2485 return true; 2486 } 2487 2488 /** 2489 * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq 2490 * @p: target task 2491 * @dsq: locked DSQ @p is currently on 2492 * @src_rq: rq @p is currently on, stable with @dsq locked 2493 * 2494 * Called with @dsq locked but no rq's locked. We want to move @p to a different 2495 * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is 2496 * required when transferring into a local DSQ. 
Even when transferring into a 2497 * non-local DSQ, it's better to use the same mechanism to protect against 2498 * dequeues and maintain the invariant that @p->scx.dsq can only change while 2499 * @src_rq is locked, which e.g. scx_dump_task() depends on. 2500 * 2501 * We want to grab @src_rq but that can deadlock if we try while locking @dsq, 2502 * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As 2503 * this may race with dequeue, which can't drop the rq lock or fail, do a little 2504 * dancing from our side. 2505 * 2506 * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets 2507 * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu 2508 * would be cleared to -1. While other cpus may have updated it to different 2509 * values afterwards, as this operation can't be preempted or recurse, the 2510 * holding_cpu can never become this CPU again before we're done. Thus, we can 2511 * tell whether we lost to dequeue by testing whether the holding_cpu still 2512 * points to this CPU. See dispatch_dequeue() for the counterpart. 2513 * 2514 * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is 2515 * still valid. %false if lost to dequeue. 
2516 */ 2517 static bool unlink_dsq_and_lock_src_rq(struct task_struct *p, 2518 struct scx_dispatch_q *dsq, 2519 struct rq *src_rq) 2520 { 2521 s32 cpu = raw_smp_processor_id(); 2522 2523 lockdep_assert_held(&dsq->lock); 2524 2525 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 2526 task_unlink_from_dsq(p, dsq); 2527 p->scx.holding_cpu = cpu; 2528 2529 raw_spin_unlock(&dsq->lock); 2530 raw_spin_rq_lock(src_rq); 2531 2532 /* task_rq couldn't have changed if we're still the holding cpu */ 2533 return likely(p->scx.holding_cpu == cpu) && 2534 !WARN_ON_ONCE(src_rq != task_rq(p)); 2535 } 2536 2537 static bool consume_remote_task(struct rq *this_rq, 2538 struct task_struct *p, u64 enq_flags, 2539 struct scx_dispatch_q *dsq, struct rq *src_rq) 2540 { 2541 raw_spin_rq_unlock(this_rq); 2542 2543 if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) { 2544 move_remote_task_to_local_dsq(p, enq_flags, src_rq, this_rq); 2545 return true; 2546 } else { 2547 raw_spin_rq_unlock(src_rq); 2548 raw_spin_rq_lock(this_rq); 2549 return false; 2550 } 2551 } 2552 2553 /** 2554 * move_task_between_dsqs() - Move a task from one DSQ to another 2555 * @sch: scx_sched being operated on 2556 * @p: target task 2557 * @enq_flags: %SCX_ENQ_* 2558 * @src_dsq: DSQ @p is currently on, must not be a local DSQ 2559 * @dst_dsq: DSQ @p is being moved to, can be any DSQ 2560 * 2561 * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local 2562 * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq 2563 * will change. As @p's task_rq is locked, this function doesn't need to use the 2564 * holding_cpu mechanism. 2565 * 2566 * On return, @src_dsq is unlocked and only @p's new task_rq, which is the 2567 * return value, is locked. 
2568 */ 2569 static struct rq *move_task_between_dsqs(struct scx_sched *sch, 2570 struct task_struct *p, u64 enq_flags, 2571 struct scx_dispatch_q *src_dsq, 2572 struct scx_dispatch_q *dst_dsq) 2573 { 2574 struct rq *src_rq = task_rq(p), *dst_rq; 2575 2576 BUG_ON(src_dsq->id == SCX_DSQ_LOCAL); 2577 lockdep_assert_held(&src_dsq->lock); 2578 lockdep_assert_rq_held(src_rq); 2579 2580 if (dst_dsq->id == SCX_DSQ_LOCAL) { 2581 dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); 2582 if (src_rq != dst_rq && 2583 unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { 2584 dst_dsq = find_global_dsq(sch, task_cpu(p)); 2585 dst_rq = src_rq; 2586 enq_flags |= SCX_ENQ_GDSQ_FALLBACK; 2587 } 2588 } else { 2589 /* no need to migrate if destination is a non-local DSQ */ 2590 dst_rq = src_rq; 2591 } 2592 2593 /* 2594 * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different 2595 * CPU, @p will be migrated. 2596 */ 2597 if (dst_dsq->id == SCX_DSQ_LOCAL) { 2598 /* @p is going from a non-local DSQ to a local DSQ */ 2599 if (src_rq == dst_rq) { 2600 task_unlink_from_dsq(p, src_dsq); 2601 move_local_task_to_local_dsq(sch, p, enq_flags, 2602 src_dsq, dst_rq); 2603 raw_spin_unlock(&src_dsq->lock); 2604 } else { 2605 raw_spin_unlock(&src_dsq->lock); 2606 move_remote_task_to_local_dsq(p, enq_flags, 2607 src_rq, dst_rq); 2608 } 2609 } else { 2610 /* 2611 * @p is going from a non-local DSQ to a non-local DSQ. As 2612 * $src_dsq is already locked, do an abbreviated dequeue. 
2613 */ 2614 dispatch_dequeue_locked(p, src_dsq); 2615 raw_spin_unlock(&src_dsq->lock); 2616 2617 dispatch_enqueue(sch, dst_rq, dst_dsq, p, enq_flags); 2618 } 2619 2620 return dst_rq; 2621 } 2622 2623 static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, 2624 struct scx_dispatch_q *dsq, u64 enq_flags) 2625 { 2626 struct task_struct *p; 2627 retry: 2628 /* 2629 * The caller can't expect to successfully consume a task if the task's 2630 * addition to @dsq isn't guaranteed to be visible somehow. Test 2631 * @dsq->list without locking and skip if it seems empty. 2632 */ 2633 if (list_empty(&dsq->list)) 2634 return false; 2635 2636 raw_spin_lock(&dsq->lock); 2637 2638 nldsq_for_each_task(p, dsq) { 2639 struct rq *task_rq = task_rq(p); 2640 2641 /* 2642 * This loop can lead to multiple lockup scenarios, e.g. the BPF 2643 * scheduler can put an enormous number of affinitized tasks into 2644 * a contended DSQ, or the outer retry loop can repeatedly race 2645 * against scx_bypass() dequeueing tasks from @dsq trying to put 2646 * the system into the bypass mode. This can easily live-lock the 2647 * machine. If aborting, exit from all non-bypass DSQs. 
2648 */ 2649 if (unlikely(READ_ONCE(sch->aborting)) && dsq->id != SCX_DSQ_BYPASS) 2650 break; 2651 2652 if (rq == task_rq) { 2653 task_unlink_from_dsq(p, dsq); 2654 move_local_task_to_local_dsq(sch, p, enq_flags, dsq, rq); 2655 raw_spin_unlock(&dsq->lock); 2656 return true; 2657 } 2658 2659 if (task_can_run_on_remote_rq(sch, p, rq, false)) { 2660 if (likely(consume_remote_task(rq, p, enq_flags, dsq, task_rq))) 2661 return true; 2662 goto retry; 2663 } 2664 } 2665 2666 raw_spin_unlock(&dsq->lock); 2667 return false; 2668 } 2669 2670 static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq) 2671 { 2672 int node = cpu_to_node(cpu_of(rq)); 2673 2674 return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq, 0); 2675 } 2676 2677 /** 2678 * dispatch_to_local_dsq - Dispatch a task to a local dsq 2679 * @sch: scx_sched being operated on 2680 * @rq: current rq which is locked 2681 * @dst_dsq: destination DSQ 2682 * @p: task to dispatch 2683 * @enq_flags: %SCX_ENQ_* 2684 * 2685 * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local 2686 * DSQ. This function performs all the synchronization dancing needed because 2687 * local DSQs are protected with rq locks. 2688 * 2689 * The caller must have exclusive ownership of @p (e.g. through 2690 * %SCX_OPSS_DISPATCHING). 2691 */ 2692 static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, 2693 struct scx_dispatch_q *dst_dsq, 2694 struct task_struct *p, u64 enq_flags) 2695 { 2696 struct rq *src_rq = task_rq(p); 2697 struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); 2698 struct rq *locked_rq = rq; 2699 2700 /* 2701 * We're synchronized against dequeue through DISPATCHING. As @p can't 2702 * be dequeued, its task_rq and cpus_allowed are stable too. 2703 * 2704 * If dispatching to @rq that @p is already on, no lock dancing needed. 
2705 */ 2706 if (rq == src_rq && rq == dst_rq) { 2707 dispatch_enqueue(sch, rq, dst_dsq, p, 2708 enq_flags | SCX_ENQ_CLEAR_OPSS); 2709 return; 2710 } 2711 2712 if (src_rq != dst_rq && 2713 unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { 2714 dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p, 2715 enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK); 2716 return; 2717 } 2718 2719 /* 2720 * @p is on a possibly remote @src_rq which we need to lock to move the 2721 * task. If dequeue is in progress, it'd be locking @src_rq and waiting 2722 * on DISPATCHING, so we can't grab @src_rq lock while holding 2723 * DISPATCHING. 2724 * 2725 * As DISPATCHING guarantees that @p is wholly ours, we can pretend that 2726 * we're moving from a DSQ and use the same mechanism - mark the task 2727 * under transfer with holding_cpu, release DISPATCHING and then follow 2728 * the same protocol. See unlink_dsq_and_lock_src_rq(). 2729 */ 2730 p->scx.holding_cpu = raw_smp_processor_id(); 2731 2732 /* store_release ensures that dequeue sees the above */ 2733 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 2734 2735 /* switch to @src_rq lock */ 2736 if (locked_rq != src_rq) { 2737 raw_spin_rq_unlock(locked_rq); 2738 locked_rq = src_rq; 2739 raw_spin_rq_lock(src_rq); 2740 } 2741 2742 /* task_rq couldn't have changed if we're still the holding cpu */ 2743 if (likely(p->scx.holding_cpu == raw_smp_processor_id()) && 2744 !WARN_ON_ONCE(src_rq != task_rq(p))) { 2745 /* 2746 * If @p is staying on the same rq, there's no need to go 2747 * through the full deactivate/activate cycle. Optimize by 2748 * abbreviating move_remote_task_to_local_dsq(). 
2749 */ 2750 if (src_rq == dst_rq) { 2751 p->scx.holding_cpu = -1; 2752 dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p, 2753 enq_flags); 2754 } else { 2755 move_remote_task_to_local_dsq(p, enq_flags, 2756 src_rq, dst_rq); 2757 /* task has been moved to dst_rq, which is now locked */ 2758 locked_rq = dst_rq; 2759 } 2760 2761 /* if the destination CPU is idle, wake it up */ 2762 if (sched_class_above(p->sched_class, dst_rq->curr->sched_class)) 2763 resched_curr(dst_rq); 2764 } 2765 2766 /* switch back to @rq lock */ 2767 if (locked_rq != rq) { 2768 raw_spin_rq_unlock(locked_rq); 2769 raw_spin_rq_lock(rq); 2770 } 2771 } 2772 2773 /** 2774 * finish_dispatch - Asynchronously finish dispatching a task 2775 * @rq: current rq which is locked 2776 * @p: task to finish dispatching 2777 * @qseq_at_dispatch: qseq when @p started getting dispatched 2778 * @dsq_id: destination DSQ ID 2779 * @enq_flags: %SCX_ENQ_* 2780 * 2781 * Dispatching to local DSQs may need to wait for queueing to complete or 2782 * require rq lock dancing. As we don't wanna do either while inside 2783 * ops.dispatch() to avoid locking order inversion, we split dispatching into 2784 * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the 2785 * task and its qseq. Once ops.dispatch() returns, this function is called to 2786 * finish up. 2787 * 2788 * There is no guarantee that @p is still valid for dispatching or even that it 2789 * was valid in the first place. Make sure that the task is still owned by the 2790 * BPF scheduler and claim the ownership before dispatching. 2791 */ 2792 static void finish_dispatch(struct scx_sched *sch, struct rq *rq, 2793 struct task_struct *p, 2794 unsigned long qseq_at_dispatch, 2795 u64 dsq_id, u64 enq_flags) 2796 { 2797 struct scx_dispatch_q *dsq; 2798 unsigned long opss; 2799 2800 touch_core_sched_dispatch(rq, p); 2801 retry: 2802 /* 2803 * No need for _acquire here. @p is accessed only after a successful 2804 * try_cmpxchg to DISPATCHING. 
2805 */ 2806 opss = atomic_long_read(&p->scx.ops_state); 2807 2808 switch (opss & SCX_OPSS_STATE_MASK) { 2809 case SCX_OPSS_DISPATCHING: 2810 case SCX_OPSS_NONE: 2811 /* someone else already got to it */ 2812 return; 2813 case SCX_OPSS_QUEUED: 2814 /* 2815 * If qseq doesn't match, @p has gone through at least one 2816 * dispatch/dequeue and re-enqueue cycle between 2817 * scx_bpf_dsq_insert() and here and we have no claim on it. 2818 */ 2819 if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) 2820 return; 2821 2822 /* see SCX_EV_INSERT_NOT_OWNED definition */ 2823 if (unlikely(!scx_task_on_sched(sch, p))) { 2824 __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); 2825 return; 2826 } 2827 2828 /* 2829 * While we know @p is accessible, we don't yet have a claim on 2830 * it - the BPF scheduler is allowed to dispatch tasks 2831 * spuriously and there can be a racing dequeue attempt. Let's 2832 * claim @p by atomically transitioning it from QUEUED to 2833 * DISPATCHING. 2834 */ 2835 if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 2836 SCX_OPSS_DISPATCHING))) 2837 break; 2838 goto retry; 2839 case SCX_OPSS_QUEUEING: 2840 /* 2841 * do_enqueue_task() is in the process of transferring the task 2842 * to the BPF scheduler while holding @p's rq lock. As we aren't 2843 * holding any kernel or BPF resource that the enqueue path may 2844 * depend upon, it's safe to wait. 
2845 */ 2846 wait_ops_state(p, opss); 2847 goto retry; 2848 } 2849 2850 BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); 2851 2852 dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, task_cpu(p)); 2853 2854 if (dsq->id == SCX_DSQ_LOCAL) 2855 dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); 2856 else 2857 dispatch_enqueue(sch, rq, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); 2858 } 2859 2860 static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq) 2861 { 2862 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 2863 u32 u; 2864 2865 for (u = 0; u < dspc->cursor; u++) { 2866 struct scx_dsp_buf_ent *ent = &dspc->buf[u]; 2867 2868 finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id, 2869 ent->enq_flags); 2870 } 2871 2872 dspc->nr_tasks += dspc->cursor; 2873 dspc->cursor = 0; 2874 } 2875 2876 static inline void maybe_queue_balance_callback(struct rq *rq) 2877 { 2878 lockdep_assert_rq_held(rq); 2879 2880 if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING)) 2881 return; 2882 2883 queue_balance_callback(rq, &rq->scx.deferred_bal_cb, 2884 deferred_bal_cb_workfn); 2885 2886 rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING; 2887 } 2888 2889 /* 2890 * One user of this function is scx_bpf_dispatch() which can be called 2891 * recursively as sub-sched dispatches nest. Always inline to reduce stack usage 2892 * from the call frame. 
2893 */ 2894 static __always_inline bool 2895 scx_dispatch_sched(struct scx_sched *sch, struct rq *rq, 2896 struct task_struct *prev, bool nested) 2897 { 2898 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 2899 int nr_loops = SCX_DSP_MAX_LOOPS; 2900 s32 cpu = cpu_of(rq); 2901 bool prev_on_sch = (prev->sched_class == &ext_sched_class) && 2902 scx_task_on_sched(sch, prev); 2903 2904 if (consume_global_dsq(sch, rq)) 2905 return true; 2906 2907 if (bypass_dsp_enabled(sch)) { 2908 /* if @sch is bypassing, only the bypass DSQs are active */ 2909 if (scx_bypassing(sch, cpu)) 2910 return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0); 2911 2912 #ifdef CONFIG_EXT_SUB_SCHED 2913 /* 2914 * If @sch isn't bypassing but its children are, @sch is 2915 * responsible for making forward progress for both its own 2916 * tasks that aren't bypassing and the bypassing descendants' 2917 * tasks. The following implements a simple built-in behavior - 2918 * let each CPU try to run the bypass DSQ every Nth time. 2919 * 2920 * Later, if necessary, we can add an ops flag to suppress the 2921 * auto-consumption and a kfunc to consume the bypass DSQ and, 2922 * so that the BPF scheduler can fully control scheduling of 2923 * bypassed tasks. 2924 */ 2925 struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); 2926 2927 if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) && 2928 consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0)) { 2929 __scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1); 2930 return true; 2931 } 2932 #endif /* CONFIG_EXT_SUB_SCHED */ 2933 } 2934 2935 if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq)) 2936 return false; 2937 2938 dspc->rq = rq; 2939 2940 /* 2941 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, 2942 * the local DSQ might still end up empty after a successful 2943 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() 2944 * produced some tasks, retry. 
The BPF scheduler may depend on this 2945 * looping behavior to simplify its implementation. 2946 */ 2947 do { 2948 dspc->nr_tasks = 0; 2949 2950 if (nested) { 2951 SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu), 2952 prev_on_sch ? prev : NULL); 2953 } else { 2954 /* stash @prev so that nested invocations can access it */ 2955 rq->scx.sub_dispatch_prev = prev; 2956 SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu), 2957 prev_on_sch ? prev : NULL); 2958 rq->scx.sub_dispatch_prev = NULL; 2959 } 2960 2961 flush_dispatch_buf(sch, rq); 2962 2963 if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) { 2964 rq->scx.flags |= SCX_RQ_BAL_KEEP; 2965 return true; 2966 } 2967 if (rq->scx.local_dsq.nr) 2968 return true; 2969 if (consume_global_dsq(sch, rq)) 2970 return true; 2971 2972 /* 2973 * ops.dispatch() can trap us in this loop by repeatedly 2974 * dispatching ineligible tasks. Break out once in a while to 2975 * allow the watchdog to run. As IRQ can't be enabled in 2976 * balance(), we want to complete this scheduling cycle and then 2977 * start a new one. IOW, we want to call resched_curr() on the 2978 * next, most likely idle, task, not the current one. Use 2979 * __scx_bpf_kick_cpu() for deferred kicking. 2980 */ 2981 if (unlikely(!--nr_loops)) { 2982 scx_kick_cpu(sch, cpu, 0); 2983 break; 2984 } 2985 } while (dspc->nr_tasks); 2986 2987 /* 2988 * Prevent the CPU from going idle while bypassed descendants have tasks 2989 * queued. Without this fallback, bypassed tasks could stall if the host 2990 * scheduler's ops.dispatch() doesn't yield any tasks. 
2991 */ 2992 if (bypass_dsp_enabled(sch)) 2993 return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0); 2994 2995 return false; 2996 } 2997 2998 static int balance_one(struct rq *rq, struct task_struct *prev) 2999 { 3000 struct scx_sched *sch = scx_root; 3001 s32 cpu = cpu_of(rq); 3002 3003 lockdep_assert_rq_held(rq); 3004 rq->scx.flags |= SCX_RQ_IN_BALANCE; 3005 rq->scx.flags &= ~SCX_RQ_BAL_KEEP; 3006 3007 if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && 3008 unlikely(rq->scx.cpu_released)) { 3009 /* 3010 * If the previous sched_class for the current CPU was not SCX, 3011 * notify the BPF scheduler that it again has control of the 3012 * core. This callback complements ->cpu_release(), which is 3013 * emitted in switch_class(). 3014 */ 3015 if (sch->ops.cpu_acquire) 3016 SCX_CALL_OP(sch, cpu_acquire, rq, cpu, NULL); 3017 rq->scx.cpu_released = false; 3018 } 3019 3020 if (prev->sched_class == &ext_sched_class) { 3021 update_curr_scx(rq); 3022 3023 /* 3024 * If @prev is runnable & has slice left, it has priority and 3025 * fetching more just increases latency for the fetched tasks. 3026 * Tell pick_task_scx() to keep running @prev. If the BPF 3027 * scheduler wants to handle this explicitly, it should 3028 * implement ->cpu_release(). 3029 * 3030 * See scx_disable_workfn() for the explanation on the bypassing 3031 * test. 3032 */ 3033 if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice && 3034 !scx_bypassing(sch, cpu)) { 3035 rq->scx.flags |= SCX_RQ_BAL_KEEP; 3036 goto has_tasks; 3037 } 3038 } 3039 3040 /* if there already are tasks to run, nothing to do */ 3041 if (rq->scx.local_dsq.nr) 3042 goto has_tasks; 3043 3044 if (scx_dispatch_sched(sch, rq, prev, false)) 3045 goto has_tasks; 3046 3047 /* 3048 * Didn't find another task to run. Keep running @prev unless 3049 * %SCX_OPS_ENQ_LAST is in effect. 
3050 */ 3051 if ((prev->scx.flags & SCX_TASK_QUEUED) && 3052 (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) { 3053 rq->scx.flags |= SCX_RQ_BAL_KEEP; 3054 __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1); 3055 goto has_tasks; 3056 } 3057 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; 3058 return false; 3059 3060 has_tasks: 3061 /* 3062 * @rq may have extra IMMED tasks without reenq scheduled: 3063 * 3064 * - rq_is_open() can't reliably tell when and how slice is going to be 3065 * modified for $curr and allows IMMED tasks to be queued while 3066 * dispatch is in progress. 3067 * 3068 * - A non-IMMED HEAD task can get queued in front of an IMMED task 3069 * between the IMMED queueing and the subsequent scheduling event. 3070 */ 3071 if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed)) 3072 schedule_reenq_local(rq, 0); 3073 3074 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; 3075 return true; 3076 } 3077 3078 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) 3079 { 3080 struct scx_sched *sch = scx_task_sched(p); 3081 3082 if (p->scx.flags & SCX_TASK_QUEUED) { 3083 /* 3084 * Core-sched might decide to execute @p before it is 3085 * dispatched. Call ops_dequeue() to notify the BPF scheduler. 3086 */ 3087 ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC); 3088 dispatch_dequeue(rq, p); 3089 } 3090 3091 p->se.exec_start = rq_clock_task(rq); 3092 3093 /* see dequeue_task_scx() on why we skip when !QUEUED */ 3094 if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED)) 3095 SCX_CALL_OP_TASK(sch, running, rq, p); 3096 3097 clr_task_runnable(p, true); 3098 3099 /* 3100 * @p is getting newly scheduled or got kicked after someone updated its 3101 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). 
3102 */ 3103 if ((p->scx.slice == SCX_SLICE_INF) != 3104 (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { 3105 if (p->scx.slice == SCX_SLICE_INF) 3106 rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; 3107 else 3108 rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; 3109 3110 sched_update_tick_dependency(rq); 3111 3112 /* 3113 * For now, let's refresh the load_avgs just when transitioning 3114 * in and out of nohz. In the future, we might want to add a 3115 * mechanism which calls the following periodically on 3116 * tick-stopped CPUs. 3117 */ 3118 update_other_load_avgs(rq); 3119 } 3120 } 3121 3122 static enum scx_cpu_preempt_reason 3123 preempt_reason_from_class(const struct sched_class *class) 3124 { 3125 if (class == &stop_sched_class) 3126 return SCX_CPU_PREEMPT_STOP; 3127 if (class == &dl_sched_class) 3128 return SCX_CPU_PREEMPT_DL; 3129 if (class == &rt_sched_class) 3130 return SCX_CPU_PREEMPT_RT; 3131 return SCX_CPU_PREEMPT_UNKNOWN; 3132 } 3133 3134 static void switch_class(struct rq *rq, struct task_struct *next) 3135 { 3136 struct scx_sched *sch = scx_root; 3137 const struct sched_class *next_class = next->sched_class; 3138 3139 if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) 3140 return; 3141 3142 /* 3143 * The callback is conceptually meant to convey that the CPU is no 3144 * longer under the control of SCX. Therefore, don't invoke the callback 3145 * if the next class is below SCX (in which case the BPF scheduler has 3146 * actively decided not to schedule any tasks on the CPU). 3147 */ 3148 if (sched_class_above(&ext_sched_class, next_class)) 3149 return; 3150 3151 /* 3152 * At this point we know that SCX was preempted by a higher priority 3153 * sched_class, so invoke the ->cpu_release() callback if we have not 3154 * done so already. We only send the callback once between SCX being 3155 * preempted, and it regaining control of the CPU. 3156 * 3157 * ->cpu_release() complements ->cpu_acquire(), which is emitted the 3158 * next time that balance_one() is invoked. 
3159 */ 3160 if (!rq->scx.cpu_released) { 3161 if (sch->ops.cpu_release) { 3162 struct scx_cpu_release_args args = { 3163 .reason = preempt_reason_from_class(next_class), 3164 .task = next, 3165 }; 3166 3167 SCX_CALL_OP(sch, cpu_release, rq, cpu_of(rq), &args); 3168 } 3169 rq->scx.cpu_released = true; 3170 } 3171 } 3172 3173 static void put_prev_task_scx(struct rq *rq, struct task_struct *p, 3174 struct task_struct *next) 3175 { 3176 struct scx_sched *sch = scx_task_sched(p); 3177 3178 /* see kick_sync_wait_bal_cb() */ 3179 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3180 3181 update_curr_scx(rq); 3182 3183 /* see dequeue_task_scx() on why we skip when !QUEUED */ 3184 if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED)) 3185 SCX_CALL_OP_TASK(sch, stopping, rq, p, true); 3186 3187 if (p->scx.flags & SCX_TASK_QUEUED) { 3188 set_task_runnable(rq, p); 3189 3190 /* 3191 * If @p has slice left and is being put, @p is getting 3192 * preempted by a higher priority scheduler class or core-sched 3193 * forcing a different task. Leave it at the head of the local 3194 * DSQ unless it was an IMMED task. IMMED tasks should not 3195 * linger on a busy CPU, reenqueue them to the BPF scheduler. 3196 */ 3197 if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) { 3198 if (p->scx.flags & SCX_TASK_IMMED) { 3199 p->scx.flags |= SCX_TASK_REENQ_PREEMPTED; 3200 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); 3201 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 3202 } else { 3203 dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD); 3204 } 3205 goto switch_class; 3206 } 3207 3208 /* 3209 * If @p is runnable but we're about to enter a lower 3210 * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell 3211 * ops.enqueue() that @p is the only one available for this cpu, 3212 * which should trigger an explicit follow-up scheduling event. 
3213 */ 3214 if (next && sched_class_above(&ext_sched_class, next->sched_class)) { 3215 WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); 3216 do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); 3217 } else { 3218 do_enqueue_task(rq, p, 0, -1); 3219 } 3220 } 3221 3222 switch_class: 3223 if (next && next->sched_class != &ext_sched_class) 3224 switch_class(rq, next); 3225 } 3226 3227 static void kick_sync_wait_bal_cb(struct rq *rq) 3228 { 3229 struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); 3230 unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; 3231 bool waited; 3232 s32 cpu; 3233 3234 /* 3235 * Drop rq lock and enable IRQs while waiting. IRQs must be enabled 3236 * — a target CPU may be waiting for us to process an IPI (e.g. TLB 3237 * flush) while we wait for its kick_sync to advance. 3238 * 3239 * Also, keep advancing our own kick_sync so that new kick_sync waits 3240 * targeting us, which can start after we drop the lock, cannot form 3241 * cyclic dependencies. 3242 */ 3243 retry: 3244 waited = false; 3245 for_each_cpu(cpu, rq->scx.cpus_to_sync) { 3246 /* 3247 * smp_load_acquire() pairs with smp_store_release() on 3248 * kick_sync updates on the target CPUs. 
3249 */ 3250 if (cpu == cpu_of(rq) || 3251 smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { 3252 cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); 3253 continue; 3254 } 3255 3256 raw_spin_rq_unlock_irq(rq); 3257 while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { 3258 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3259 cpu_relax(); 3260 } 3261 raw_spin_rq_lock_irq(rq); 3262 waited = true; 3263 } 3264 3265 if (waited) 3266 goto retry; 3267 } 3268 3269 static struct task_struct *first_local_task(struct rq *rq) 3270 { 3271 return list_first_entry_or_null(&rq->scx.local_dsq.list, 3272 struct task_struct, scx.dsq_list.node); 3273 } 3274 3275 static struct task_struct * 3276 do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) 3277 { 3278 struct task_struct *prev = rq->curr; 3279 bool keep_prev; 3280 struct task_struct *p; 3281 3282 /* see kick_sync_wait_bal_cb() */ 3283 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3284 3285 rq_modified_begin(rq, &ext_sched_class); 3286 3287 rq_unpin_lock(rq, rf); 3288 balance_one(rq, prev); 3289 rq_repin_lock(rq, rf); 3290 maybe_queue_balance_callback(rq); 3291 3292 /* 3293 * Defer to a balance callback which can drop rq lock and enable 3294 * IRQs. Waiting directly in the pick path would deadlock against 3295 * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. 3296 */ 3297 if (unlikely(rq->scx.kick_sync_pending)) { 3298 rq->scx.kick_sync_pending = false; 3299 queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, 3300 kick_sync_wait_bal_cb); 3301 } 3302 3303 /* 3304 * If any higher-priority sched class enqueued a runnable task on 3305 * this rq during balance_one(), abort and return RETRY_TASK, so 3306 * that the scheduler loop can restart. 3307 * 3308 * If @force_scx is true, always try to pick a SCHED_EXT task, 3309 * regardless of any higher-priority sched classes activity. 
3310 */ 3311 if (!force_scx && rq_modified_above(rq, &ext_sched_class)) 3312 return RETRY_TASK; 3313 3314 keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; 3315 if (unlikely(keep_prev && 3316 prev->sched_class != &ext_sched_class)) { 3317 WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); 3318 keep_prev = false; 3319 } 3320 3321 /* 3322 * If balance_one() is telling us to keep running @prev, replenish slice 3323 * if necessary and keep running @prev. Otherwise, pop the first one 3324 * from the local DSQ. 3325 */ 3326 if (keep_prev) { 3327 p = prev; 3328 if (!p->scx.slice) 3329 refill_task_slice_dfl(scx_task_sched(p), p); 3330 } else { 3331 p = first_local_task(rq); 3332 if (!p) 3333 return NULL; 3334 3335 if (unlikely(!p->scx.slice)) { 3336 struct scx_sched *sch = scx_task_sched(p); 3337 3338 if (!scx_bypassing(sch, cpu_of(rq)) && 3339 !sch->warned_zero_slice) { 3340 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", 3341 p->comm, p->pid, __func__); 3342 sch->warned_zero_slice = true; 3343 } 3344 refill_task_slice_dfl(sch, p); 3345 } 3346 } 3347 3348 return p; 3349 } 3350 3351 static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) 3352 { 3353 return do_pick_task_scx(rq, rf, false); 3354 } 3355 3356 /* 3357 * Select the next task to run from the ext scheduling class. 3358 * 3359 * Use do_pick_task_scx() directly with @force_scx enabled, since the 3360 * dl_server must always select a sched_ext task. 3361 */ 3362 static struct task_struct * 3363 ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) 3364 { 3365 if (!scx_enabled()) 3366 return NULL; 3367 3368 return do_pick_task_scx(dl_se->rq, rf, true); 3369 } 3370 3371 /* 3372 * Initialize the ext server deadline entity. 
3373 */ 3374 void ext_server_init(struct rq *rq) 3375 { 3376 struct sched_dl_entity *dl_se = &rq->ext_server; 3377 3378 init_dl_entity(dl_se); 3379 3380 dl_server_init(dl_se, rq, ext_server_pick_task); 3381 } 3382 3383 #ifdef CONFIG_SCHED_CORE 3384 /** 3385 * scx_prio_less - Task ordering for core-sched 3386 * @a: task A 3387 * @b: task B 3388 * @in_fi: in forced idle state 3389 * 3390 * Core-sched is implemented as an additional scheduling layer on top of the 3391 * usual sched_class'es and needs to find out the expected task ordering. For 3392 * SCX, core-sched calls this function to interrogate the task ordering. 3393 * 3394 * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used 3395 * to implement the default task ordering. The older the timestamp, the higher 3396 * priority the task - the global FIFO ordering matching the default scheduling 3397 * behavior. 3398 * 3399 * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to 3400 * implement FIFO ordering within each local DSQ. See pick_task_scx(). 3401 */ 3402 bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, 3403 bool in_fi) 3404 { 3405 struct scx_sched *sch_a = scx_task_sched(a); 3406 struct scx_sched *sch_b = scx_task_sched(b); 3407 3408 /* 3409 * The const qualifiers are dropped from task_struct pointers when 3410 * calling ops.core_sched_before(). Accesses are controlled by the 3411 * verifier. 
3412 */ 3413 if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) && 3414 !scx_bypassing(sch_a, task_cpu(a))) 3415 return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before, 3416 task_rq(a), 3417 (struct task_struct *)a, 3418 (struct task_struct *)b); 3419 else 3420 return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); 3421 } 3422 #endif /* CONFIG_SCHED_CORE */ 3423 3424 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) 3425 { 3426 struct scx_sched *sch = scx_task_sched(p); 3427 bool bypassing; 3428 3429 /* 3430 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it 3431 * can be a good migration opportunity with low cache and memory 3432 * footprint. Returning a CPU different than @prev_cpu triggers 3433 * immediate rq migration. However, for SCX, as the current rq 3434 * association doesn't dictate where the task is going to run, this 3435 * doesn't fit well. If necessary, we can later add a dedicated method 3436 * which can decide to preempt self to force it through the regular 3437 * scheduling path. 
3438 */ 3439 if (unlikely(wake_flags & WF_EXEC)) 3440 return prev_cpu; 3441 3442 bypassing = scx_bypassing(sch, task_cpu(p)); 3443 if (likely(SCX_HAS_OP(sch, select_cpu)) && !bypassing) { 3444 s32 cpu; 3445 struct task_struct **ddsp_taskp; 3446 3447 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 3448 WARN_ON_ONCE(*ddsp_taskp); 3449 *ddsp_taskp = p; 3450 3451 this_rq()->scx.in_select_cpu = true; 3452 cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, 3453 scx_cpu_arg(prev_cpu), wake_flags); 3454 cpu = scx_cpu_ret(sch, cpu); 3455 this_rq()->scx.in_select_cpu = false; 3456 p->scx.selected_cpu = cpu; 3457 *ddsp_taskp = NULL; 3458 if (scx_cpu_valid(sch, cpu, "from ops.select_cpu()")) 3459 return cpu; 3460 else 3461 return prev_cpu; 3462 } else { 3463 s32 cpu; 3464 3465 cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); 3466 if (cpu >= 0) { 3467 refill_task_slice_dfl(sch, p); 3468 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; 3469 } else { 3470 cpu = prev_cpu; 3471 } 3472 p->scx.selected_cpu = cpu; 3473 3474 if (bypassing) 3475 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 3476 return cpu; 3477 } 3478 } 3479 3480 static void task_woken_scx(struct rq *rq, struct task_struct *p) 3481 { 3482 run_deferred(rq); 3483 } 3484 3485 static void set_cpus_allowed_scx(struct task_struct *p, 3486 struct affinity_context *ac) 3487 { 3488 struct scx_sched *sch = scx_task_sched(p); 3489 3490 set_cpus_allowed_common(p, ac); 3491 3492 if (task_dead_and_done(p)) 3493 return; 3494 3495 /* 3496 * The effective cpumask is stored in @p->cpus_ptr which may temporarily 3497 * differ from the configured one in @p->cpus_mask. Always tell the bpf 3498 * scheduler the effective one. 3499 * 3500 * Fine-grained memory write control is enforced by BPF making the const 3501 * designation pointless. Cast it away when calling the operation. 
3502 */ 3503 if (SCX_HAS_OP(sch, set_cpumask)) 3504 scx_call_op_set_cpumask(sch, task_rq(p), p, (struct cpumask *)p->cpus_ptr); 3505 } 3506 3507 static void handle_hotplug(struct rq *rq, bool online) 3508 { 3509 struct scx_sched *sch = scx_root; 3510 s32 cpu = cpu_of(rq); 3511 3512 atomic_long_inc(&scx_hotplug_seq); 3513 3514 /* 3515 * scx_root updates are protected by cpus_read_lock() and will stay 3516 * stable here. Note that we can't depend on scx_enabled() test as the 3517 * hotplug ops need to be enabled before __scx_enabled is set. 3518 */ 3519 if (unlikely(!sch)) 3520 return; 3521 3522 if (scx_enabled()) 3523 scx_idle_update_selcpu_topology(&sch->ops); 3524 3525 if (online && SCX_HAS_OP(sch, cpu_online)) 3526 SCX_CALL_OP(sch, cpu_online, NULL, scx_cpu_arg(cpu)); 3527 else if (!online && SCX_HAS_OP(sch, cpu_offline)) 3528 SCX_CALL_OP(sch, cpu_offline, NULL, scx_cpu_arg(cpu)); 3529 else 3530 scx_exit(sch, SCX_EXIT_UNREG_KERN, 3531 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 3532 "cpu %d going %s, exiting scheduler", cpu, 3533 online ? 
"online" : "offline"); 3534 } 3535 3536 void scx_rq_activate(struct rq *rq) 3537 { 3538 handle_hotplug(rq, true); 3539 } 3540 3541 void scx_rq_deactivate(struct rq *rq) 3542 { 3543 handle_hotplug(rq, false); 3544 } 3545 3546 static void rq_online_scx(struct rq *rq) 3547 { 3548 rq->scx.flags |= SCX_RQ_ONLINE; 3549 } 3550 3551 static void rq_offline_scx(struct rq *rq) 3552 { 3553 rq->scx.flags &= ~SCX_RQ_ONLINE; 3554 } 3555 3556 static bool check_rq_for_timeouts(struct rq *rq) 3557 { 3558 struct scx_sched *sch; 3559 struct task_struct *p; 3560 struct rq_flags rf; 3561 bool timed_out = false; 3562 3563 rq_lock_irqsave(rq, &rf); 3564 sch = rcu_dereference_bh(scx_root); 3565 if (unlikely(!sch)) 3566 goto out_unlock; 3567 3568 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { 3569 struct scx_sched *sch = scx_task_sched(p); 3570 unsigned long last_runnable = p->scx.runnable_at; 3571 3572 if (unlikely(time_after(jiffies, 3573 last_runnable + READ_ONCE(sch->watchdog_timeout)))) { 3574 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); 3575 3576 __scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, cpu_of(rq), 3577 "%s[%d] failed to run for %u.%03us", 3578 p->comm, p->pid, dur_ms / 1000, 3579 dur_ms % 1000); 3580 timed_out = true; 3581 break; 3582 } 3583 } 3584 out_unlock: 3585 rq_unlock_irqrestore(rq, &rf); 3586 return timed_out; 3587 } 3588 3589 static void scx_watchdog_workfn(struct work_struct *work) 3590 { 3591 unsigned long intv; 3592 int cpu; 3593 3594 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 3595 3596 for_each_online_cpu(cpu) { 3597 if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) 3598 break; 3599 3600 cond_resched(); 3601 } 3602 3603 intv = READ_ONCE(scx_watchdog_interval); 3604 if (intv < ULONG_MAX) 3605 queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv); 3606 } 3607 3608 void scx_tick(struct rq *rq) 3609 { 3610 struct scx_sched *root; 3611 unsigned long last_check; 3612 3613 if (!scx_enabled()) 3614 return; 3615 3616 root = 
rcu_dereference_bh(scx_root); 3617 if (unlikely(!root)) 3618 return; 3619 3620 last_check = READ_ONCE(scx_watchdog_timestamp); 3621 if (unlikely(time_after(jiffies, 3622 last_check + READ_ONCE(root->watchdog_timeout)))) { 3623 u32 dur_ms = jiffies_to_msecs(jiffies - last_check); 3624 3625 scx_exit(root, SCX_EXIT_ERROR_STALL, 0, 3626 "watchdog failed to check in for %u.%03us", 3627 dur_ms / 1000, dur_ms % 1000); 3628 } 3629 3630 update_other_load_avgs(rq); 3631 } 3632 3633 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) 3634 { 3635 struct scx_sched *sch = scx_task_sched(curr); 3636 3637 update_curr_scx(rq); 3638 3639 /* 3640 * While disabling, always resched and refresh core-sched timestamp as 3641 * we can't trust the slice management or ops.core_sched_before(). 3642 */ 3643 if (scx_bypassing(sch, cpu_of(rq))) { 3644 curr->scx.slice = 0; 3645 touch_core_sched(rq, curr); 3646 } else if (SCX_HAS_OP(sch, tick)) { 3647 SCX_CALL_OP_TASK(sch, tick, rq, curr); 3648 } 3649 3650 if (!curr->scx.slice) 3651 resched_curr(rq); 3652 } 3653 3654 #ifdef CONFIG_EXT_GROUP_SCHED 3655 static struct cgroup *tg_cgrp(struct task_group *tg) 3656 { 3657 /* 3658 * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, 3659 * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the 3660 * root cgroup. 
3661 */ 3662 if (tg && tg->css.cgroup) 3663 return tg->css.cgroup; 3664 else 3665 return &cgrp_dfl_root.cgrp; 3666 } 3667 3668 #define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), 3669 3670 #else /* CONFIG_EXT_GROUP_SCHED */ 3671 3672 #define SCX_INIT_TASK_ARGS_CGROUP(tg) 3673 3674 #endif /* CONFIG_EXT_GROUP_SCHED */ 3675 3676 static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) 3677 { 3678 int ret; 3679 3680 p->scx.disallow = false; 3681 3682 if (SCX_HAS_OP(sch, init_task)) { 3683 struct scx_init_task_args args = { 3684 SCX_INIT_TASK_ARGS_CGROUP(task_group(p)) 3685 .fork = fork, 3686 }; 3687 3688 ret = SCX_CALL_OP_RET(sch, init_task, NULL, p, &args); 3689 if (unlikely(ret)) { 3690 ret = ops_sanitize_err(sch, "init_task", ret); 3691 return ret; 3692 } 3693 } 3694 3695 if (p->scx.disallow) { 3696 if (unlikely(scx_parent(sch))) { 3697 scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]", 3698 p->comm, p->pid); 3699 } else if (unlikely(fork)) { 3700 scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", 3701 p->comm, p->pid); 3702 } else { 3703 struct rq *rq; 3704 struct rq_flags rf; 3705 3706 rq = task_rq_lock(p, &rf); 3707 3708 /* 3709 * We're in the load path and @p->policy will be applied 3710 * right after. Reverting @p->policy here and rejecting 3711 * %SCHED_EXT transitions from scx_check_setscheduler() 3712 * guarantees that if ops.init_task() sets @p->disallow, 3713 * @p can never be in SCX. 3714 */ 3715 if (p->policy == SCHED_EXT) { 3716 p->policy = SCHED_NORMAL; 3717 atomic_long_inc(&scx_nr_rejected); 3718 } 3719 3720 task_rq_unlock(rq, p, &rf); 3721 } 3722 } 3723 3724 return 0; 3725 } 3726 3727 static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p) 3728 { 3729 struct rq *rq = task_rq(p); 3730 u32 weight; 3731 3732 lockdep_assert_rq_held(rq); 3733 3734 /* 3735 * Verify the task is not in BPF scheduler's custody. 
If flag 3736 * transitions are consistent, the flag should always be clear 3737 * here. 3738 */ 3739 WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); 3740 3741 /* 3742 * Set the weight before calling ops.enable() so that the scheduler 3743 * doesn't see a stale value if they inspect the task struct. 3744 */ 3745 if (task_has_idle_policy(p)) 3746 weight = WEIGHT_IDLEPRIO; 3747 else 3748 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; 3749 3750 p->scx.weight = sched_weight_to_cgroup(weight); 3751 3752 if (SCX_HAS_OP(sch, enable)) 3753 SCX_CALL_OP_TASK(sch, enable, rq, p); 3754 3755 if (SCX_HAS_OP(sch, set_weight)) 3756 SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); 3757 } 3758 3759 static void scx_enable_task(struct scx_sched *sch, struct task_struct *p) 3760 { 3761 __scx_enable_task(sch, p); 3762 scx_set_task_state(p, SCX_TASK_ENABLED); 3763 } 3764 3765 static void scx_disable_task(struct scx_sched *sch, struct task_struct *p) 3766 { 3767 struct rq *rq = task_rq(p); 3768 3769 lockdep_assert_rq_held(rq); 3770 WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); 3771 3772 clear_direct_dispatch(p); 3773 3774 if (SCX_HAS_OP(sch, disable)) 3775 SCX_CALL_OP_TASK(sch, disable, rq, p); 3776 scx_set_task_state(p, SCX_TASK_READY); 3777 3778 /* 3779 * Verify the task is not in BPF scheduler's custody. If flag 3780 * transitions are consistent, the flag should always be clear 3781 * here. 
3782 */ 3783 WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); 3784 } 3785 3786 static void __scx_disable_and_exit_task(struct scx_sched *sch, 3787 struct task_struct *p) 3788 { 3789 struct scx_exit_task_args args = { 3790 .cancelled = false, 3791 }; 3792 3793 lockdep_assert_held(&p->pi_lock); 3794 lockdep_assert_rq_held(task_rq(p)); 3795 3796 switch (scx_get_task_state(p)) { 3797 case SCX_TASK_NONE: 3798 return; 3799 case SCX_TASK_INIT: 3800 args.cancelled = true; 3801 break; 3802 case SCX_TASK_READY: 3803 break; 3804 case SCX_TASK_ENABLED: 3805 scx_disable_task(sch, p); 3806 break; 3807 default: 3808 WARN_ON_ONCE(true); 3809 return; 3810 } 3811 3812 if (SCX_HAS_OP(sch, exit_task)) 3813 SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); 3814 } 3815 3816 /* 3817 * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never 3818 * ran. The task state has not been transitioned, so this mirrors the 3819 * SCX_TASK_INIT branch in __scx_disable_and_exit_task(). 3820 */ 3821 static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p) 3822 { 3823 struct scx_exit_task_args args = { .cancelled = true }; 3824 3825 lockdep_assert_held(&p->pi_lock); 3826 lockdep_assert_rq_held(task_rq(p)); 3827 3828 if (SCX_HAS_OP(sch, exit_task)) 3829 SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); 3830 } 3831 3832 static void scx_disable_and_exit_task(struct scx_sched *sch, 3833 struct task_struct *p) 3834 { 3835 __scx_disable_and_exit_task(sch, p); 3836 3837 /* 3838 * If set, @p exited between __scx_init_task() and scx_enable_task() in 3839 * scx_sub_enable() and is initialized for both the associated sched and 3840 * its parent. Exit for the child too - scx_enable_task() never ran for 3841 * it, so undo only init_task. The flag is only set on the sub-enable 3842 * path, so it's always clear when @p arrives here in %SCX_TASK_NONE. 
3843 */ 3844 if (p->scx.flags & SCX_TASK_SUB_INIT) { 3845 if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) 3846 scx_sub_init_cancel_task(scx_enabling_sub_sched, p); 3847 p->scx.flags &= ~SCX_TASK_SUB_INIT; 3848 } 3849 3850 scx_set_task_sched(p, NULL); 3851 scx_set_task_state(p, SCX_TASK_NONE); 3852 } 3853 3854 void init_scx_entity(struct sched_ext_entity *scx) 3855 { 3856 memset(scx, 0, sizeof(*scx)); 3857 INIT_LIST_HEAD(&scx->dsq_list.node); 3858 RB_CLEAR_NODE(&scx->dsq_priq); 3859 scx->sticky_cpu = -1; 3860 scx->holding_cpu = -1; 3861 INIT_LIST_HEAD(&scx->runnable_node); 3862 scx->runnable_at = jiffies; 3863 scx->ddsp_dsq_id = SCX_DSQ_INVALID; 3864 scx->slice = SCX_SLICE_DFL; 3865 } 3866 3867 /* See scx_tid_alloc / scx_tid_cursor. */ 3868 static u64 scx_alloc_tid(void) 3869 { 3870 struct scx_tid_alloc *ta; 3871 3872 guard(preempt)(); 3873 ta = this_cpu_ptr(&scx_tid_alloc); 3874 3875 if (unlikely(ta->next >= ta->end)) { 3876 ta->next = atomic64_fetch_add(SCX_TID_CHUNK, &scx_tid_cursor); 3877 ta->end = ta->next + SCX_TID_CHUNK; 3878 } 3879 return ta->next++; 3880 } 3881 3882 static void scx_tid_hash_insert(struct task_struct *p) 3883 { 3884 int ret; 3885 3886 lockdep_assert_held(&scx_tasks_lock); 3887 3888 ret = rhashtable_lookup_insert_fast(&scx_tid_hash, 3889 &p->scx.tid_hash_node, 3890 scx_tid_hash_params); 3891 WARN_ON_ONCE(ret); 3892 } 3893 3894 void scx_pre_fork(struct task_struct *p) 3895 { 3896 /* 3897 * BPF scheduler enable/disable paths want to be able to iterate and 3898 * update all tasks which can become complex when racing forks. As 3899 * enable/disable are very cold paths, let's use a percpu_rwsem to 3900 * exclude forks. 
3901 */ 3902 percpu_down_read(&scx_fork_rwsem); 3903 } 3904 3905 int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) 3906 { 3907 s32 ret; 3908 3909 percpu_rwsem_assert_held(&scx_fork_rwsem); 3910 3911 p->scx.tid = scx_alloc_tid(); 3912 3913 if (scx_init_task_enabled) { 3914 #ifdef CONFIG_EXT_SUB_SCHED 3915 struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched; 3916 #else 3917 struct scx_sched *sch = scx_root; 3918 #endif 3919 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 3920 ret = __scx_init_task(sch, p, true); 3921 if (unlikely(ret)) { 3922 scx_set_task_state(p, SCX_TASK_NONE); 3923 return ret; 3924 } 3925 scx_set_task_state(p, SCX_TASK_INIT); 3926 scx_set_task_sched(p, sch); 3927 } 3928 3929 return 0; 3930 } 3931 3932 void scx_post_fork(struct task_struct *p) 3933 { 3934 if (scx_init_task_enabled) { 3935 scx_set_task_state(p, SCX_TASK_READY); 3936 3937 /* 3938 * Enable the task immediately if it's running on sched_ext. 3939 * Otherwise, it'll be enabled in switching_to_scx() if and 3940 * when it's ever configured to run with a SCHED_EXT policy. 3941 */ 3942 if (p->sched_class == &ext_sched_class) { 3943 struct rq_flags rf; 3944 struct rq *rq; 3945 3946 rq = task_rq_lock(p, &rf); 3947 scx_enable_task(scx_task_sched(p), p); 3948 task_rq_unlock(rq, p, &rf); 3949 } 3950 } 3951 3952 scoped_guard(raw_spinlock_irq, &scx_tasks_lock) { 3953 list_add_tail(&p->scx.tasks_node, &scx_tasks); 3954 if (scx_tid_to_task_enabled()) 3955 scx_tid_hash_insert(p); 3956 } 3957 3958 percpu_up_read(&scx_fork_rwsem); 3959 } 3960 3961 void scx_cancel_fork(struct task_struct *p) 3962 { 3963 if (scx_enabled()) { 3964 struct rq *rq; 3965 struct rq_flags rf; 3966 3967 rq = task_rq_lock(p, &rf); 3968 WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); 3969 scx_disable_and_exit_task(scx_task_sched(p), p); 3970 task_rq_unlock(rq, p, &rf); 3971 } 3972 3973 percpu_up_read(&scx_fork_rwsem); 3974 } 3975 3976 /** 3977 * task_dead_and_done - Is a task dead and done running? 
3978 * @p: target task 3979 * 3980 * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the 3981 * task no longer exists from SCX's POV. However, certain sched_class ops may be 3982 * invoked on these dead tasks leading to failures - e.g. sched_setscheduler() 3983 * may try to switch a task which finished sched_ext_dead() back into SCX 3984 * triggering invalid SCX task state transitions and worse. 3985 * 3986 * Once a task has finished the final switch, sched_ext_dead() is the only thing 3987 * that needs to happen on the task. Use this test to short-circuit sched_class 3988 * operations which may be called on dead tasks. 3989 */ 3990 static bool task_dead_and_done(struct task_struct *p) 3991 { 3992 struct rq *rq = task_rq(p); 3993 3994 lockdep_assert_rq_held(rq); 3995 3996 /* 3997 * In do_task_dead(), a dying task sets %TASK_DEAD with preemption 3998 * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p 3999 * won't ever run again. 4000 */ 4001 return unlikely(READ_ONCE(p->__state) == TASK_DEAD) && 4002 !task_on_cpu(rq, p); 4003 } 4004 4005 void sched_ext_dead(struct task_struct *p) 4006 { 4007 /* 4008 * By the time control reaches here, @p has %TASK_DEAD set, switched out 4009 * for the last time and then dropped the rq lock - task_dead_and_done() 4010 * should be returning %true nullifying the straggling sched_class ops. 4011 * Remove from scx_tasks and exit @p. 4012 */ 4013 scoped_guard(raw_spinlock_irqsave, &scx_tasks_lock) { 4014 list_del_init(&p->scx.tasks_node); 4015 if (scx_tid_to_task_enabled()) 4016 rhashtable_remove_fast(&scx_tid_hash, 4017 &p->scx.tid_hash_node, 4018 scx_tid_hash_params); 4019 } 4020 4021 /* 4022 * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> 4023 * ENABLED transitions can't race us. Disable ops for @p. 4024 * 4025 * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see 4026 * scx_task_iter_next_locked(). 
NONE tasks need no marking: cgroup 4027 * iteration is only used from sub-sched paths, which require root 4028 * enabled. Root enable transitions every live task to at least READY. 4029 * 4030 * %INIT_BEGIN means ops.init_task() is running for @p. Don't call 4031 * into ops; transition to %DEAD so the post-init recheck unwinds 4032 * via scx_sub_init_cancel_task(). 4033 */ 4034 if (scx_get_task_state(p) != SCX_TASK_NONE) { 4035 struct rq_flags rf; 4036 struct rq *rq; 4037 4038 rq = task_rq_lock(p, &rf); 4039 if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN) 4040 scx_disable_and_exit_task(scx_task_sched(p), p); 4041 scx_set_task_state(p, SCX_TASK_DEAD); 4042 task_rq_unlock(rq, p, &rf); 4043 } 4044 } 4045 4046 static void reweight_task_scx(struct rq *rq, struct task_struct *p, 4047 const struct load_weight *lw) 4048 { 4049 struct scx_sched *sch = scx_task_sched(p); 4050 4051 lockdep_assert_rq_held(task_rq(p)); 4052 4053 if (task_dead_and_done(p)) 4054 return; 4055 4056 p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); 4057 if (SCX_HAS_OP(sch, set_weight)) 4058 SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); 4059 } 4060 4061 static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) 4062 { 4063 } 4064 4065 static void switching_to_scx(struct rq *rq, struct task_struct *p) 4066 { 4067 struct scx_sched *sch = scx_task_sched(p); 4068 4069 if (task_dead_and_done(p)) 4070 return; 4071 4072 scx_enable_task(sch, p); 4073 4074 /* 4075 * set_cpus_allowed_scx() is not called while @p is associated with a 4076 * different scheduler class. Keep the BPF scheduler up-to-date. 4077 */ 4078 if (SCX_HAS_OP(sch, set_cpumask)) 4079 scx_call_op_set_cpumask(sch, rq, p, (struct cpumask *)p->cpus_ptr); 4080 } 4081 4082 static void switched_from_scx(struct rq *rq, struct task_struct *p) 4083 { 4084 if (task_dead_and_done(p)) 4085 return; 4086 4087 /* 4088 * %NONE means SCX is no longer tracking @p at the task level (e.g. 
4089 * scx_fail_parent() handed @p back to the parent at NONE pending the 4090 * parent's own teardown). There is nothing to disable; calling 4091 * scx_disable_task() would WARN on the non-%ENABLED state and trigger a 4092 * NONE -> READY validation failure. 4093 */ 4094 if (scx_get_task_state(p) == SCX_TASK_NONE) 4095 return; 4096 4097 scx_disable_task(scx_task_sched(p), p); 4098 } 4099 4100 static void switched_to_scx(struct rq *rq, struct task_struct *p) {} 4101 4102 int scx_check_setscheduler(struct task_struct *p, int policy) 4103 { 4104 lockdep_assert_rq_held(task_rq(p)); 4105 4106 /* if disallow, reject transitioning into SCX */ 4107 if (scx_enabled() && READ_ONCE(p->scx.disallow) && 4108 p->policy != policy && policy == SCHED_EXT) 4109 return -EACCES; 4110 4111 return 0; 4112 } 4113 4114 static void process_ddsp_deferred_locals(struct rq *rq) 4115 { 4116 struct task_struct *p; 4117 4118 lockdep_assert_rq_held(rq); 4119 4120 /* 4121 * Now that @rq can be unlocked, execute the deferred enqueueing of 4122 * tasks directly dispatched to the local DSQs of other CPUs. See 4123 * direct_dispatch(). Keep popping from the head instead of using 4124 * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq 4125 * temporarily. 4126 */ 4127 while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, 4128 struct task_struct, scx.dsq_list.node))) { 4129 struct scx_sched *sch = scx_task_sched(p); 4130 struct scx_dispatch_q *dsq; 4131 u64 dsq_id = p->scx.ddsp_dsq_id; 4132 u64 enq_flags = p->scx.ddsp_enq_flags; 4133 4134 list_del_init(&p->scx.dsq_list.node); 4135 clear_direct_dispatch(p); 4136 4137 dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p)); 4138 if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) 4139 dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); 4140 } 4141 } 4142 4143 /* 4144 * Determine whether @p should be reenqueued from a local DSQ. 
 *
 * @reenq_flags is mutable and accumulates state across the DSQ walk:
 *
 * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First"
 *   tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at
 *   the head consumes the first slot.
 *
 * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if
 *   rq_is_open() is true.
 *
 * An IMMED task is kept (returns %false) only if it's the first task in the DSQ
 * AND the current task is done — i.e. it will execute immediately. All other
 * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head,
 * every IMMED task behind it gets reenqueued.
 *
 * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ |
 * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local
 * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers
 * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT
 * in process_deferred_reenq_locals().
4165 */ 4166 static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason) 4167 { 4168 bool first; 4169 4170 first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST); 4171 *reenq_flags |= SCX_REENQ_TSR_NOT_FIRST; 4172 4173 *reason = SCX_TASK_REENQ_KFUNC; 4174 4175 if ((p->scx.flags & SCX_TASK_IMMED) && 4176 (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) { 4177 __scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1); 4178 *reason = SCX_TASK_REENQ_IMMED; 4179 return true; 4180 } 4181 4182 return *reenq_flags & SCX_REENQ_ANY; 4183 } 4184 4185 static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) 4186 { 4187 LIST_HEAD(tasks); 4188 u32 nr_enqueued = 0; 4189 struct task_struct *p, *n; 4190 4191 lockdep_assert_rq_held(rq); 4192 4193 if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK)) 4194 reenq_flags &= ~__SCX_REENQ_TSR_MASK; 4195 if (rq_is_open(rq, 0)) 4196 reenq_flags |= SCX_REENQ_TSR_RQ_OPEN; 4197 4198 /* 4199 * The BPF scheduler may choose to dispatch tasks back to 4200 * @rq->scx.local_dsq. Move all candidate tasks off to a private list 4201 * first to avoid processing the same tasks repeatedly. 4202 */ 4203 list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, 4204 scx.dsq_list.node) { 4205 struct scx_sched *task_sch = scx_task_sched(p); 4206 u32 reason; 4207 4208 /* 4209 * If @p is being migrated, @p's current CPU may not agree with 4210 * its allowed CPUs and the migration_cpu_stop is about to 4211 * deactivate and re-activate @p anyway. Skip re-enqueueing. 4212 * 4213 * While racing sched property changes may also dequeue and 4214 * re-enqueue a migrating task while its current CPU and allowed 4215 * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to 4216 * the current local DSQ for running tasks and thus are not 4217 * visible to the BPF scheduler. 
4218 */ 4219 if (p->migration_pending) 4220 continue; 4221 4222 if (!scx_is_descendant(task_sch, sch)) 4223 continue; 4224 4225 if (!local_task_should_reenq(p, &reenq_flags, &reason)) 4226 continue; 4227 4228 dispatch_dequeue(rq, p); 4229 4230 if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) 4231 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4232 p->scx.flags |= reason; 4233 4234 list_add_tail(&p->scx.dsq_list.node, &tasks); 4235 } 4236 4237 list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { 4238 list_del_init(&p->scx.dsq_list.node); 4239 4240 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); 4241 4242 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4243 nr_enqueued++; 4244 } 4245 4246 return nr_enqueued; 4247 } 4248 4249 static void process_deferred_reenq_locals(struct rq *rq) 4250 { 4251 u64 seq = ++rq->scx.deferred_reenq_locals_seq; 4252 4253 lockdep_assert_rq_held(rq); 4254 4255 while (true) { 4256 struct scx_sched *sch; 4257 u64 reenq_flags; 4258 bool skip = false; 4259 4260 scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { 4261 struct scx_deferred_reenq_local *drl = 4262 list_first_entry_or_null(&rq->scx.deferred_reenq_locals, 4263 struct scx_deferred_reenq_local, 4264 node); 4265 struct scx_sched_pcpu *sch_pcpu; 4266 4267 if (!drl) 4268 return; 4269 4270 sch_pcpu = container_of(drl, struct scx_sched_pcpu, 4271 deferred_reenq_local); 4272 sch = sch_pcpu->sch; 4273 4274 reenq_flags = drl->flags; 4275 WRITE_ONCE(drl->flags, 0); 4276 list_del_init(&drl->node); 4277 4278 if (likely(drl->seq != seq)) { 4279 drl->seq = seq; 4280 drl->cnt = 0; 4281 } else { 4282 if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) { 4283 scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times", 4284 drl->cnt); 4285 skip = true; 4286 } 4287 4288 __scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1); 4289 } 4290 } 4291 4292 if (!skip) { 4293 /* see schedule_dsq_reenq() */ 4294 smp_mb(); 4295 4296 reenq_local(sch, rq, reenq_flags); 4297 } 4298 } 4299 } 4300 
4301 static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason) 4302 { 4303 *reason = SCX_TASK_REENQ_KFUNC; 4304 return reenq_flags & SCX_REENQ_ANY; 4305 } 4306 4307 static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags) 4308 { 4309 struct rq *locked_rq = rq; 4310 struct scx_sched *sch = dsq->sched; 4311 struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0); 4312 struct task_struct *p; 4313 s32 nr_enqueued = 0; 4314 4315 lockdep_assert_rq_held(rq); 4316 4317 raw_spin_lock(&dsq->lock); 4318 4319 while (likely(!READ_ONCE(sch->bypass_depth))) { 4320 struct rq *task_rq; 4321 u32 reason; 4322 4323 p = nldsq_cursor_next_task(&cursor, dsq); 4324 if (!p) 4325 break; 4326 4327 if (!user_task_should_reenq(p, reenq_flags, &reason)) 4328 continue; 4329 4330 task_rq = task_rq(p); 4331 4332 if (locked_rq != task_rq) { 4333 if (locked_rq) 4334 raw_spin_rq_unlock(locked_rq); 4335 if (unlikely(!raw_spin_rq_trylock(task_rq))) { 4336 raw_spin_unlock(&dsq->lock); 4337 raw_spin_rq_lock(task_rq); 4338 raw_spin_lock(&dsq->lock); 4339 } 4340 locked_rq = task_rq; 4341 4342 /* did we lose @p while switching locks? 
*/ 4343 if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p)) 4344 continue; 4345 } 4346 4347 /* @p is on @dsq, its rq and @dsq are locked */ 4348 dispatch_dequeue_locked(p, dsq); 4349 raw_spin_unlock(&dsq->lock); 4350 4351 if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) 4352 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4353 p->scx.flags |= reason; 4354 4355 do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1); 4356 4357 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4358 4359 if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) { 4360 raw_spin_rq_unlock(locked_rq); 4361 locked_rq = NULL; 4362 cpu_relax(); 4363 } 4364 4365 raw_spin_lock(&dsq->lock); 4366 } 4367 4368 list_del_init(&cursor.node); 4369 raw_spin_unlock(&dsq->lock); 4370 4371 if (locked_rq != rq) { 4372 if (locked_rq) 4373 raw_spin_rq_unlock(locked_rq); 4374 raw_spin_rq_lock(rq); 4375 } 4376 } 4377 4378 static void process_deferred_reenq_users(struct rq *rq) 4379 { 4380 lockdep_assert_rq_held(rq); 4381 4382 while (true) { 4383 struct scx_dispatch_q *dsq; 4384 u64 reenq_flags; 4385 4386 scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { 4387 struct scx_deferred_reenq_user *dru = 4388 list_first_entry_or_null(&rq->scx.deferred_reenq_users, 4389 struct scx_deferred_reenq_user, 4390 node); 4391 struct scx_dsq_pcpu *dsq_pcpu; 4392 4393 if (!dru) 4394 return; 4395 4396 dsq_pcpu = container_of(dru, struct scx_dsq_pcpu, 4397 deferred_reenq_user); 4398 dsq = dsq_pcpu->dsq; 4399 reenq_flags = dru->flags; 4400 WRITE_ONCE(dru->flags, 0); 4401 list_del_init(&dru->node); 4402 } 4403 4404 /* see schedule_dsq_reenq() */ 4405 smp_mb(); 4406 4407 BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN); 4408 reenq_user(rq, dsq, reenq_flags); 4409 } 4410 } 4411 4412 static void run_deferred(struct rq *rq) 4413 { 4414 process_ddsp_deferred_locals(rq); 4415 4416 if (!list_empty(&rq->scx.deferred_reenq_locals)) 4417 process_deferred_reenq_locals(rq); 4418 4419 if (!list_empty(&rq->scx.deferred_reenq_users)) 4420 
process_deferred_reenq_users(rq); 4421 } 4422 4423 #ifdef CONFIG_NO_HZ_FULL 4424 bool scx_can_stop_tick(struct rq *rq) 4425 { 4426 struct task_struct *p = rq->curr; 4427 struct scx_sched *sch = scx_task_sched(p); 4428 4429 if (p->sched_class != &ext_sched_class) 4430 return true; 4431 4432 if (scx_bypassing(sch, cpu_of(rq))) 4433 return false; 4434 4435 /* 4436 * @rq can dispatch from different DSQs, so we can't tell whether it 4437 * needs the tick or not by looking at nr_running. Allow stopping ticks 4438 * iff the BPF scheduler indicated so. See set_next_task_scx(). 4439 */ 4440 return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; 4441 } 4442 #endif 4443 4444 #ifdef CONFIG_EXT_GROUP_SCHED 4445 4446 DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem); 4447 static bool scx_cgroup_enabled; 4448 4449 void scx_tg_init(struct task_group *tg) 4450 { 4451 tg->scx.weight = CGROUP_WEIGHT_DFL; 4452 tg->scx.bw_period_us = default_bw_period_us(); 4453 tg->scx.bw_quota_us = RUNTIME_INF; 4454 tg->scx.idle = false; 4455 } 4456 4457 int scx_tg_online(struct task_group *tg) 4458 { 4459 struct scx_sched *sch = scx_root; 4460 int ret = 0; 4461 4462 WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); 4463 4464 if (scx_cgroup_enabled) { 4465 if (SCX_HAS_OP(sch, cgroup_init)) { 4466 struct scx_cgroup_init_args args = 4467 { .weight = tg->scx.weight, 4468 .bw_period_us = tg->scx.bw_period_us, 4469 .bw_quota_us = tg->scx.bw_quota_us, 4470 .bw_burst_us = tg->scx.bw_burst_us }; 4471 4472 ret = SCX_CALL_OP_RET(sch, cgroup_init, 4473 NULL, tg->css.cgroup, &args); 4474 if (ret) 4475 ret = ops_sanitize_err(sch, "cgroup_init", ret); 4476 } 4477 if (ret == 0) 4478 tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED; 4479 } else { 4480 tg->scx.flags |= SCX_TG_ONLINE; 4481 } 4482 4483 return ret; 4484 } 4485 4486 void scx_tg_offline(struct task_group *tg) 4487 { 4488 struct scx_sched *sch = scx_root; 4489 4490 WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); 4491 4492 if (scx_cgroup_enabled && 
SCX_HAS_OP(sch, cgroup_exit) && 4493 (tg->scx.flags & SCX_TG_INITED)) 4494 SCX_CALL_OP(sch, cgroup_exit, NULL, tg->css.cgroup); 4495 tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); 4496 } 4497 4498 int scx_cgroup_can_attach(struct cgroup_taskset *tset) 4499 { 4500 struct scx_sched *sch = scx_root; 4501 struct cgroup_subsys_state *css; 4502 struct task_struct *p; 4503 int ret; 4504 4505 if (!scx_cgroup_enabled) 4506 return 0; 4507 4508 cgroup_taskset_for_each(p, css, tset) { 4509 struct cgroup *from = tg_cgrp(task_group(p)); 4510 struct cgroup *to = tg_cgrp(css_tg(css)); 4511 4512 WARN_ON_ONCE(p->scx.cgrp_moving_from); 4513 4514 /* 4515 * sched_move_task() omits identity migrations. Let's match the 4516 * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() 4517 * always match one-to-one. 4518 */ 4519 if (from == to) 4520 continue; 4521 4522 if (SCX_HAS_OP(sch, cgroup_prep_move)) { 4523 ret = SCX_CALL_OP_RET(sch, cgroup_prep_move, NULL, 4524 p, from, css->cgroup); 4525 if (ret) 4526 goto err; 4527 } 4528 4529 p->scx.cgrp_moving_from = from; 4530 } 4531 4532 return 0; 4533 4534 err: 4535 cgroup_taskset_for_each(p, css, tset) { 4536 if (SCX_HAS_OP(sch, cgroup_cancel_move) && 4537 p->scx.cgrp_moving_from) 4538 SCX_CALL_OP(sch, cgroup_cancel_move, NULL, 4539 p, p->scx.cgrp_moving_from, css->cgroup); 4540 p->scx.cgrp_moving_from = NULL; 4541 } 4542 4543 return ops_sanitize_err(sch, "cgroup_prep_move", ret); 4544 } 4545 4546 void scx_cgroup_move_task(struct task_struct *p) 4547 { 4548 struct scx_sched *sch = scx_root; 4549 4550 if (!scx_cgroup_enabled) 4551 return; 4552 4553 /* 4554 * scx_cgroup_can_attach() sets cgrp_moving_from only when the task's 4555 * cgroup changes. Migration keys off css rather than cgroup identity, 4556 * so it can hand an unchanged-cgroup task here with cgrp_moving_from 4557 * NULL. Nothing to report to the BPF scheduler then, so skip it and 4558 * keep prep_move and move paired. 
4559 */ 4560 if (SCX_HAS_OP(sch, cgroup_move) && p->scx.cgrp_moving_from) 4561 SCX_CALL_OP_TASK(sch, cgroup_move, task_rq(p), 4562 p, p->scx.cgrp_moving_from, 4563 tg_cgrp(task_group(p))); 4564 p->scx.cgrp_moving_from = NULL; 4565 } 4566 4567 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) 4568 { 4569 struct scx_sched *sch = scx_root; 4570 struct cgroup_subsys_state *css; 4571 struct task_struct *p; 4572 4573 if (!scx_cgroup_enabled) 4574 return; 4575 4576 cgroup_taskset_for_each(p, css, tset) { 4577 if (SCX_HAS_OP(sch, cgroup_cancel_move) && 4578 p->scx.cgrp_moving_from) 4579 SCX_CALL_OP(sch, cgroup_cancel_move, NULL, 4580 p, p->scx.cgrp_moving_from, css->cgroup); 4581 p->scx.cgrp_moving_from = NULL; 4582 } 4583 } 4584 4585 void scx_group_set_weight(struct task_group *tg, unsigned long weight) 4586 { 4587 struct scx_sched *sch; 4588 4589 percpu_down_read(&scx_cgroup_ops_rwsem); 4590 sch = scx_root; 4591 4592 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && 4593 tg->scx.weight != weight) 4594 SCX_CALL_OP(sch, cgroup_set_weight, NULL, tg_cgrp(tg), weight); 4595 4596 tg->scx.weight = weight; 4597 4598 percpu_up_read(&scx_cgroup_ops_rwsem); 4599 } 4600 4601 void scx_group_set_idle(struct task_group *tg, bool idle) 4602 { 4603 struct scx_sched *sch; 4604 4605 percpu_down_read(&scx_cgroup_ops_rwsem); 4606 sch = scx_root; 4607 4608 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) 4609 SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle); 4610 4611 /* Update the task group's idle state */ 4612 tg->scx.idle = idle; 4613 4614 percpu_up_read(&scx_cgroup_ops_rwsem); 4615 } 4616 4617 void scx_group_set_bandwidth(struct task_group *tg, 4618 u64 period_us, u64 quota_us, u64 burst_us) 4619 { 4620 struct scx_sched *sch; 4621 4622 percpu_down_read(&scx_cgroup_ops_rwsem); 4623 sch = scx_root; 4624 4625 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && 4626 (tg->scx.bw_period_us != period_us || 4627 tg->scx.bw_quota_us != 
quota_us || 4628 tg->scx.bw_burst_us != burst_us)) 4629 SCX_CALL_OP(sch, cgroup_set_bandwidth, NULL, 4630 tg_cgrp(tg), period_us, quota_us, burst_us); 4631 4632 tg->scx.bw_period_us = period_us; 4633 tg->scx.bw_quota_us = quota_us; 4634 tg->scx.bw_burst_us = burst_us; 4635 4636 percpu_up_read(&scx_cgroup_ops_rwsem); 4637 } 4638 #endif /* CONFIG_EXT_GROUP_SCHED */ 4639 4640 #if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) 4641 static struct cgroup *root_cgroup(void) 4642 { 4643 return &cgrp_dfl_root.cgrp; 4644 } 4645 4646 static void scx_cgroup_lock(void) 4647 { 4648 #ifdef CONFIG_EXT_GROUP_SCHED 4649 percpu_down_write(&scx_cgroup_ops_rwsem); 4650 #endif 4651 cgroup_lock(); 4652 } 4653 4654 static void scx_cgroup_unlock(void) 4655 { 4656 cgroup_unlock(); 4657 #ifdef CONFIG_EXT_GROUP_SCHED 4658 percpu_up_write(&scx_cgroup_ops_rwsem); 4659 #endif 4660 } 4661 #else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ 4662 static inline struct cgroup *root_cgroup(void) { return NULL; } 4663 static inline void scx_cgroup_lock(void) {} 4664 static inline void scx_cgroup_unlock(void) {} 4665 #endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ 4666 4667 #ifdef CONFIG_EXT_SUB_SCHED 4668 static struct cgroup *sch_cgroup(struct scx_sched *sch) 4669 { 4670 return sch->cgrp; 4671 } 4672 4673 /* for each descendant of @cgrp including self, set ->scx_sched to @sch */ 4674 static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) 4675 { 4676 struct cgroup *pos; 4677 struct cgroup_subsys_state *css; 4678 4679 cgroup_for_each_live_descendant_pre(pos, css, cgrp) 4680 rcu_assign_pointer(pos->scx_sched, sch); 4681 } 4682 #else /* CONFIG_EXT_SUB_SCHED */ 4683 static inline struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } 4684 static inline void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} 4685 #endif /* CONFIG_EXT_SUB_SCHED */ 4686 4687 /* 4688 * Omitted operations: 4689 * 4690 * - migrate_task_rq: Unnecessary 
as task to cpu mapping is transient. 4691 * 4692 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of 4693 * their current sched_class. Call them directly from sched core instead. 4694 */ 4695 DEFINE_SCHED_CLASS(ext) = { 4696 .enqueue_task = enqueue_task_scx, 4697 .dequeue_task = dequeue_task_scx, 4698 .yield_task = yield_task_scx, 4699 .yield_to_task = yield_to_task_scx, 4700 4701 .wakeup_preempt = wakeup_preempt_scx, 4702 4703 .pick_task = pick_task_scx, 4704 4705 .put_prev_task = put_prev_task_scx, 4706 .set_next_task = set_next_task_scx, 4707 4708 .select_task_rq = select_task_rq_scx, 4709 .task_woken = task_woken_scx, 4710 .set_cpus_allowed = set_cpus_allowed_scx, 4711 4712 .rq_online = rq_online_scx, 4713 .rq_offline = rq_offline_scx, 4714 4715 .task_tick = task_tick_scx, 4716 4717 .switching_to = switching_to_scx, 4718 .switched_from = switched_from_scx, 4719 .switched_to = switched_to_scx, 4720 .reweight_task = reweight_task_scx, 4721 .prio_changed = prio_changed_scx, 4722 4723 .update_curr = update_curr_scx, 4724 4725 #ifdef CONFIG_UCLAMP_TASK 4726 .uclamp_enabled = 1, 4727 #endif 4728 }; 4729 4730 static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, 4731 struct scx_sched *sch) 4732 { 4733 s32 cpu; 4734 4735 memset(dsq, 0, sizeof(*dsq)); 4736 4737 raw_spin_lock_init(&dsq->lock); 4738 INIT_LIST_HEAD(&dsq->list); 4739 dsq->id = dsq_id; 4740 dsq->sched = sch; 4741 4742 dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu); 4743 if (!dsq->pcpu) 4744 return -ENOMEM; 4745 4746 for_each_possible_cpu(cpu) { 4747 struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); 4748 4749 pcpu->dsq = dsq; 4750 INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node); 4751 } 4752 4753 return 0; 4754 } 4755 4756 static void exit_dsq(struct scx_dispatch_q *dsq) 4757 { 4758 s32 cpu; 4759 4760 for_each_possible_cpu(cpu) { 4761 struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); 4762 struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user; 4763 
struct rq *rq = cpu_rq(cpu); 4764 4765 /* 4766 * There must have been a RCU grace period since the last 4767 * insertion and @dsq should be off the deferred list by now. 4768 */ 4769 if (WARN_ON_ONCE(!list_empty(&dru->node))) { 4770 guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); 4771 list_del_init(&dru->node); 4772 } 4773 } 4774 4775 free_percpu(dsq->pcpu); 4776 } 4777 4778 static void free_dsq_rcufn(struct rcu_head *rcu) 4779 { 4780 struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu); 4781 4782 exit_dsq(dsq); 4783 kfree(dsq); 4784 } 4785 4786 static void free_dsq_irq_workfn(struct irq_work *irq_work) 4787 { 4788 struct llist_node *to_free = llist_del_all(&dsqs_to_free); 4789 struct scx_dispatch_q *dsq, *tmp_dsq; 4790 4791 llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) 4792 call_rcu(&dsq->rcu, free_dsq_rcufn); 4793 } 4794 4795 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); 4796 4797 static void destroy_dsq(struct scx_sched *sch, u64 dsq_id) 4798 { 4799 struct scx_dispatch_q *dsq; 4800 unsigned long flags; 4801 4802 rcu_read_lock(); 4803 4804 dsq = find_user_dsq(sch, dsq_id); 4805 if (!dsq) 4806 goto out_unlock_rcu; 4807 4808 raw_spin_lock_irqsave(&dsq->lock, flags); 4809 4810 if (dsq->nr) { 4811 scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)", 4812 dsq->id, dsq->nr); 4813 goto out_unlock_dsq; 4814 } 4815 4816 if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node, 4817 dsq_hash_params)) 4818 goto out_unlock_dsq; 4819 4820 /* 4821 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from 4822 * queueing more tasks. As this function can be called from anywhere, 4823 * freeing is bounced through an irq work to avoid nesting RCU 4824 * operations inside scheduler locks. 
4825 */ 4826 dsq->id = SCX_DSQ_INVALID; 4827 if (llist_add(&dsq->free_node, &dsqs_to_free)) 4828 irq_work_queue(&free_dsq_irq_work); 4829 4830 out_unlock_dsq: 4831 raw_spin_unlock_irqrestore(&dsq->lock, flags); 4832 out_unlock_rcu: 4833 rcu_read_unlock(); 4834 } 4835 4836 #ifdef CONFIG_EXT_GROUP_SCHED 4837 static void scx_cgroup_exit(struct scx_sched *sch) 4838 { 4839 struct cgroup_subsys_state *css; 4840 4841 scx_cgroup_enabled = false; 4842 4843 /* 4844 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk 4845 * cgroups and exit all the inited ones, all online cgroups are exited. 4846 */ 4847 css_for_each_descendant_post(css, &root_task_group.css) { 4848 struct task_group *tg = css_tg(css); 4849 4850 if (!(tg->scx.flags & SCX_TG_INITED)) 4851 continue; 4852 tg->scx.flags &= ~SCX_TG_INITED; 4853 4854 if (!sch->ops.cgroup_exit) 4855 continue; 4856 4857 SCX_CALL_OP(sch, cgroup_exit, NULL, css->cgroup); 4858 } 4859 } 4860 4861 static int scx_cgroup_init(struct scx_sched *sch) 4862 { 4863 struct cgroup_subsys_state *css; 4864 int ret; 4865 4866 /* 4867 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk 4868 * cgroups and init, all online cgroups are initialized. 
4869 */ 4870 css_for_each_descendant_pre(css, &root_task_group.css) { 4871 struct task_group *tg = css_tg(css); 4872 struct scx_cgroup_init_args args = { 4873 .weight = tg->scx.weight, 4874 .bw_period_us = tg->scx.bw_period_us, 4875 .bw_quota_us = tg->scx.bw_quota_us, 4876 .bw_burst_us = tg->scx.bw_burst_us, 4877 }; 4878 4879 if ((tg->scx.flags & 4880 (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) 4881 continue; 4882 4883 if (!sch->ops.cgroup_init) { 4884 tg->scx.flags |= SCX_TG_INITED; 4885 continue; 4886 } 4887 4888 ret = SCX_CALL_OP_RET(sch, cgroup_init, NULL, 4889 css->cgroup, &args); 4890 if (ret) { 4891 scx_error(sch, "ops.cgroup_init() failed (%d)", ret); 4892 return ret; 4893 } 4894 tg->scx.flags |= SCX_TG_INITED; 4895 } 4896 4897 WARN_ON_ONCE(scx_cgroup_enabled); 4898 scx_cgroup_enabled = true; 4899 4900 return 0; 4901 } 4902 4903 #else 4904 static void scx_cgroup_exit(struct scx_sched *sch) {} 4905 static int scx_cgroup_init(struct scx_sched *sch) { return 0; } 4906 #endif 4907 4908 4909 /******************************************************************************** 4910 * Sysfs interface and ops enable/disable. 
4911 */ 4912 4913 #define SCX_ATTR(_name) \ 4914 static struct kobj_attribute scx_attr_##_name = { \ 4915 .attr = { .name = __stringify(_name), .mode = 0444 }, \ 4916 .show = scx_attr_##_name##_show, \ 4917 } 4918 4919 static ssize_t scx_attr_state_show(struct kobject *kobj, 4920 struct kobj_attribute *ka, char *buf) 4921 { 4922 return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]); 4923 } 4924 SCX_ATTR(state); 4925 4926 static ssize_t scx_attr_switch_all_show(struct kobject *kobj, 4927 struct kobj_attribute *ka, char *buf) 4928 { 4929 return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); 4930 } 4931 SCX_ATTR(switch_all); 4932 4933 static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, 4934 struct kobj_attribute *ka, char *buf) 4935 { 4936 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); 4937 } 4938 SCX_ATTR(nr_rejected); 4939 4940 static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, 4941 struct kobj_attribute *ka, char *buf) 4942 { 4943 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); 4944 } 4945 SCX_ATTR(hotplug_seq); 4946 4947 static ssize_t scx_attr_enable_seq_show(struct kobject *kobj, 4948 struct kobj_attribute *ka, char *buf) 4949 { 4950 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq)); 4951 } 4952 SCX_ATTR(enable_seq); 4953 4954 static struct attribute *scx_global_attrs[] = { 4955 &scx_attr_state.attr, 4956 &scx_attr_switch_all.attr, 4957 &scx_attr_nr_rejected.attr, 4958 &scx_attr_hotplug_seq.attr, 4959 &scx_attr_enable_seq.attr, 4960 NULL, 4961 }; 4962 4963 static const struct attribute_group scx_global_attr_group = { 4964 .attrs = scx_global_attrs, 4965 }; 4966 4967 static void free_pnode(struct scx_sched_pnode *pnode); 4968 static void free_exit_info(struct scx_exit_info *ei); 4969 4970 static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch) 4971 { 4972 size_t size = struct_size_t(struct scx_cmask, bits, 4973 
SCX_CMASK_NR_WORDS(num_possible_cpus()));
	int cpu;

	if (!sch->is_cid_type || !sch->arena_pool)
		return 0;

	sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *);
	if (!sch->set_cmask_scratch)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);

		*slot = scx_arena_alloc(sch, size);
		if (!*slot)
			return -ENOMEM;
		scx_cmask_init(*slot, 0, num_possible_cpus());
	}
	return 0;
}

/*
 * Release the per-CPU cmask scratch buffers of @sch: free the arena buffer of
 * every possible CPU and then the per-CPU pointer array itself. No-op if the
 * pointer array was never allocated.
 *
 * NOTE(review): the allocation side returns on the first failed slot, so some
 * slots may still be NULL here. This presumably relies on scx_arena_free()
 * tolerating NULL - confirm.
 */
static void scx_set_cmask_scratch_free(struct scx_sched *sch)
{
	size_t size = struct_size_t(struct scx_cmask, bits,
				    SCX_CMASK_NR_WORDS(num_possible_cpus()));
	int cpu;

	if (!sch->set_cmask_scratch)
		return;

	for_each_possible_cpu(cpu) {
		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);

		scx_arena_free(sch, *slot, size);
	}
	free_percpu(sch->set_cmask_scratch);
	sch->set_cmask_scratch = NULL;
}

/*
 * RCU work function queued by scx_kobj_release(). Stops the asynchronous
 * machinery owned by @sch (disable irq_work, helper kthread worker, bypass LB
 * timer), releases its per-CPU and per-node state and all remaining DSQs, and
 * finally frees @sch itself.
 */
static void scx_sched_free_rcu_work(struct work_struct *work)
{
	struct rcu_work *rcu_work = to_rcu_work(work);
	struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
	struct rhashtable_iter rht_iter;
	struct scx_dispatch_q *dsq;
	int cpu, node;

	irq_work_sync(&sch->disable_irq_work);
	kthread_destroy_worker(sch->helper);
	timer_shutdown_sync(&sch->bypass_lb_timer);
	free_cpumask_var(sch->bypass_lb_donee_cpumask);
	free_cpumask_var(sch->bypass_lb_resched_cpumask);

#ifdef CONFIG_EXT_SUB_SCHED
	kfree(sch->cgrp_path);
	if (sch_cgroup(sch))
		cgroup_put(sch_cgroup(sch));
	if (sch->sub_kset)
		kobject_put(&sch->sub_kset->kobj);
#endif /* CONFIG_EXT_SUB_SCHED */

	for_each_possible_cpu(cpu) {
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

		/*
		 * $sch would have entered bypass mode before the RCU grace
		 * period. As that blocks new deferrals, all
		 * deferred_reenq_local_node's must be off-list by now.
		 */
		WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node));

		exit_dsq(bypass_dsq(sch, cpu));
	}

	free_percpu(sch->pcpu);

	for_each_node_state(node, N_POSSIBLE)
		free_pnode(sch->pnode[node]);
	kfree(sch->pnode);

	/*
	 * Destroy every DSQ still in the hash. The walk is restarted for as
	 * long as rhashtable_walk_next() reports -EAGAIN.
	 */
	rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
	do {
		rhashtable_walk_start(&rht_iter);

		while (!IS_ERR_OR_NULL((dsq = rhashtable_walk_next(&rht_iter))))
			destroy_dsq(sch, dsq->id);

		rhashtable_walk_stop(&rht_iter);
	} while (dsq == ERR_PTR(-EAGAIN));
	rhashtable_walk_exit(&rht_iter);

	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
	free_exit_info(sch->exit_info);
	scx_set_cmask_scratch_free(sch);
	scx_arena_pool_destroy(sch);
	if (sch->arena_map)
		bpf_map_put(sch->arena_map);
	kfree(sch);
}

/*
 * kobject release callback for scx_sched. The actual freeing is deferred to
 * scx_sched_free_rcu_work() through queue_rcu_work().
 */
static void scx_kobj_release(struct kobject *kobj)
{
	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);

	INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
	queue_rcu_work(system_dfl_wq, &sch->rcu_work);
}

/* sysfs "ops" attribute: emits the name of the scheduler's ops */
static ssize_t scx_attr_ops_show(struct kobject *kobj,
				 struct kobj_attribute *ka, char *buf)
{
	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);

	return sysfs_emit(buf, "%s\n", sch->ops.name);
}
SCX_ATTR(ops);

/*
 * Emit one "<event name> <value>" line for the @kind member of @events into
 * @buf at offset @at. Evaluates to the return value of sysfs_emit_at().
 */
#define scx_attr_event_show(buf, at, events, kind) ({				\
	sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind);		\
})

/* sysfs "events" attribute: one line per event counter read from @sch */
static ssize_t scx_attr_events_show(struct kobject *kobj,
				    struct kobj_attribute *ka, char *buf)
{
	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
	struct scx_event_stats events;
	int at = 0;

	scx_read_events(sch, &events);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_IMMED);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_LOCAL_REPEAT);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED);
	at += scx_attr_event_show(buf, at, &events, SCX_EV_SUB_BYPASS_DISPATCH);
	return at;
}
SCX_ATTR(events);

static struct attribute *scx_sched_attrs[] = {
	&scx_attr_ops.attr,
	&scx_attr_events.attr,
	NULL,
};
ATTRIBUTE_GROUPS(scx_sched);

static const struct kobj_type scx_ktype = {
	.release = scx_kobj_release,
	.sysfs_ops = &kobj_sysfs_ops,
	.default_groups = scx_sched_groups,
};

/* uevent callback: adds SCXOPS=<ops name> for scx_sched kobjects */
static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
	const struct scx_sched *sch;

	/*
	 * scx_uevent() can be reached by both scx_sched kobjects (scx_ktype)
	 * and sub-scheduler kset kobjects (kset_ktype) through the parent
	 * chain walk. Filter out the latter to avoid invalid casts.
	 */
	if (kobj->ktype != &scx_ktype)
		return 0;

	sch = container_of(kobj, struct scx_sched, kobj);

	return add_uevent_var(env, "SCXOPS=%s", sch->ops.name);
}

static const struct kset_uevent_ops scx_uevent_ops = {
	.uevent = scx_uevent,
};

/*
 * Used by sched_fork() and __setscheduler_prio() to pick the matching
 * sched_class. dl/rt are already handled.
 */
bool task_should_scx(int policy)
{
	/* if disabled, nothing should be on it */
	if (!scx_enabled())
		return false;

	/* scx is taking over all SCHED_OTHER and SCHED_EXT tasks */
	if (READ_ONCE(scx_switching_all))
		return true;

	/*
	 * scx is tearing down - keep new SCHED_EXT tasks out.
	 *
	 * Must come after scx_switching_all test, which serves as a proxy
	 * for __scx_switched_all. While __scx_switched_all is set, we must
	 * return true via the branch above: a fork routed to fair would
	 * stall because next_active_class() skips fair.
	 *
	 * This can develop into a deadlock - scx holds scx_enable_mutex across
	 * kthread_create() in scx_alloc_and_add_sched(); if the new kthread is
	 * the stalled task, the disable path can never grab the mutex to clear
	 * scx_switching_all.
	 */
	if (unlikely(scx_enable_state() == SCX_DISABLING))
		return false;

	return policy == SCHED_EXT;
}

/*
 * Returns %true if @p's wakeup may be queued: sched_ext is disabled, @p has no
 * associated scheduler, the scheduler set %SCX_OPS_ALLOW_QUEUED_WAKEUP, or @p
 * is not in ext_sched_class. Returns %false otherwise.
 */
bool scx_allow_ttwu_queue(const struct task_struct *p)
{
	struct scx_sched *sch;

	if (!scx_enabled())
		return true;

	sch = scx_task_sched(p);
	if (unlikely(!sch))
		return true;

	if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
		return true;

	if (unlikely(p->sched_class != &ext_sched_class))
		return true;

	return false;
}

/**
 * handle_lockup - sched_ext common lockup handler
 * @fmt: format string
 *
 * Called on system stall or lockup condition and initiates abort of sched_ext
 * if enabled, which may resolve the reported lockup.
 *
 * Returns %true if sched_ext is enabled and abort was initiated, which may
 * resolve the lockup. %false if sched_ext is not enabled or abort was already
 * initiated by someone else.
 */
static __printf(1, 2) bool handle_lockup(const char *fmt, ...)
{
	struct scx_sched *sch;
	va_list args;
	bool ret;

	/* scx_root is dereferenced under RCU for the rest of the function */
	guard(rcu)();

	sch = rcu_dereference(scx_root);
	if (unlikely(!sch))
		return false;

	/* only abort while the scheduler is being enabled or is enabled */
	switch (scx_enable_state()) {
	case SCX_ENABLING:
	case SCX_ENABLED:
		va_start(args, fmt);
		ret = scx_verror(sch, fmt, args);
		va_end(args);
		return ret;
	default:
		return false;
	}
}

/**
 * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
 *
 * While there are various reasons why RCU CPU stalls can occur on a system
 * that may not be caused by the current BPF scheduler, try kicking out the
 * current scheduler in an attempt to recover the system to a good state before
 * issuing panics.
 *
 * Returns %true if sched_ext is enabled and abort was initiated, which may
 * resolve the reported RCU stall.
%false if sched_ext is not enabled or someone
 * else already initiated abort.
 */
bool scx_rcu_cpu_stall(void)
{
	return handle_lockup("RCU CPU stall detected!");
}

/**
 * scx_softlockup - sched_ext softlockup handler
 * @dur_s: number of seconds of CPU stuck due to soft lockup
 *
 * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
 * live-lock the system by making many CPUs target the same DSQ to the point
 * where soft-lockup detection triggers. This function is called from
 * soft-lockup watchdog when the triggering point is close and tries to unjam
 * the system by aborting the BPF scheduler.
 */
void scx_softlockup(u32 dur_s)
{
	/* nothing to report unless this call is the one that initiated the abort */
	if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s))
		return;

	printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n",
			smp_processor_id(), dur_s);
}

/*
 * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(),
 * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing
 * it from NMI context can lead to deadlocks. Defer via irq_work; the
 * disable path runs off irq_work anyway.
 *
 * Holds the CPU to report, or -1 when no report is pending.
 */
static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1);

/*
 * irq_work callback for scx_hardlockup(). Consumes the recorded CPU, resetting
 * the slot to -1, and runs handle_lockup() for it.
 */
static void scx_hardlockup_irq_workfn(struct irq_work *work)
{
	int cpu = atomic_xchg(&scx_hardlockup_cpu, -1);

	if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu))
		printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
				cpu);
}

static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn);

/**
 * scx_hardlockup - sched_ext hardlockup handler
 * @cpu: CPU to report; recorded only if no other report is pending
 *
 * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting
 * numerous affinitized tasks in a single queue and directing all CPUs at it.
5301 * Try kicking out the current scheduler in an attempt to recover the system to 5302 * a good state before taking more drastic actions. 5303 * 5304 * Queues an irq_work; the handle_lockup() call happens in IRQ context (see 5305 * scx_hardlockup_irq_workfn). 5306 * 5307 * Returns %true if sched_ext is enabled and the work was queued, %false 5308 * otherwise. 5309 */ 5310 bool scx_hardlockup(int cpu) 5311 { 5312 if (!rcu_access_pointer(scx_root)) 5313 return false; 5314 5315 atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu); 5316 irq_work_queue(&scx_hardlockup_irq_work); 5317 return true; 5318 } 5319 5320 static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor, 5321 struct cpumask *donee_mask, struct cpumask *resched_mask, 5322 u32 nr_donor_target, u32 nr_donee_target) 5323 { 5324 struct rq *donor_rq = cpu_rq(donor); 5325 struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor); 5326 struct task_struct *p, *n; 5327 struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0); 5328 s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; 5329 u32 nr_balanced = 0, min_delta_us; 5330 5331 /* 5332 * All we want to guarantee is reasonable forward progress. No reason to 5333 * fine tune. Assuming every task on @donor_dsq runs their full slice, 5334 * consider offloading iff the total queued duration is over the 5335 * threshold. 
5336 */ 5337 min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV; 5338 if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us))) 5339 return 0; 5340 5341 raw_spin_rq_lock_irq(donor_rq); 5342 raw_spin_lock(&donor_dsq->lock); 5343 list_add(&cursor.node, &donor_dsq->list); 5344 resume: 5345 n = container_of(&cursor, struct task_struct, scx.dsq_list); 5346 n = nldsq_next_task(donor_dsq, n, false); 5347 5348 while ((p = n)) { 5349 struct scx_dispatch_q *donee_dsq; 5350 int donee; 5351 5352 n = nldsq_next_task(donor_dsq, n, false); 5353 5354 if (donor_dsq->nr <= nr_donor_target) 5355 break; 5356 5357 if (cpumask_empty(donee_mask)) 5358 break; 5359 5360 /* 5361 * If an earlier pass placed @p on @donor_dsq from a different 5362 * CPU and the donee hasn't consumed it yet, @p is still on the 5363 * previous CPU and task_rq(@p) != @donor_rq. @p can't be moved 5364 * without its rq locked. Skip. 5365 */ 5366 if (task_rq(p) != donor_rq) 5367 continue; 5368 5369 donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); 5370 if (donee >= nr_cpu_ids) 5371 continue; 5372 5373 donee_dsq = bypass_dsq(sch, donee); 5374 5375 /* 5376 * $p's rq is not locked but $p's DSQ lock protects its 5377 * scheduling properties making this test safe. 5378 */ 5379 if (!task_can_run_on_remote_rq(sch, p, cpu_rq(donee), false)) 5380 continue; 5381 5382 /* 5383 * Moving $p from one non-local DSQ to another. The source rq 5384 * and DSQ are already locked. Do an abbreviated dequeue and 5385 * then perform enqueue without unlocking $donor_dsq. 5386 * 5387 * We don't want to drop and reacquire the lock on each 5388 * iteration as @donor_dsq can be very long and potentially 5389 * highly contended. Donee DSQs are less likely to be contended. 5390 * The nested locking is safe as only this LB moves tasks 5391 * between bypass DSQs. 
5392 */ 5393 dispatch_dequeue_locked(p, donor_dsq); 5394 dispatch_enqueue(sch, cpu_rq(donee), donee_dsq, p, SCX_ENQ_NESTED); 5395 5396 /* 5397 * $donee might have been idle and need to be woken up. No need 5398 * to be clever. Kick every CPU that receives tasks. 5399 */ 5400 cpumask_set_cpu(donee, resched_mask); 5401 5402 if (READ_ONCE(donee_dsq->nr) >= nr_donee_target) 5403 cpumask_clear_cpu(donee, donee_mask); 5404 5405 nr_balanced++; 5406 if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { 5407 list_move_tail(&cursor.node, &n->scx.dsq_list.node); 5408 raw_spin_unlock(&donor_dsq->lock); 5409 raw_spin_rq_unlock_irq(donor_rq); 5410 cpu_relax(); 5411 raw_spin_rq_lock_irq(donor_rq); 5412 raw_spin_lock(&donor_dsq->lock); 5413 goto resume; 5414 } 5415 } 5416 5417 list_del_init(&cursor.node); 5418 raw_spin_unlock(&donor_dsq->lock); 5419 raw_spin_rq_unlock_irq(donor_rq); 5420 5421 return nr_balanced; 5422 } 5423 5424 static void bypass_lb_node(struct scx_sched *sch, int node) 5425 { 5426 const struct cpumask *node_mask = cpumask_of_node(node); 5427 struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask; 5428 struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask; 5429 u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; 5430 u32 nr_target, nr_donor_target; 5431 u32 before_min = U32_MAX, before_max = 0; 5432 u32 after_min = U32_MAX, after_max = 0; 5433 int cpu; 5434 5435 /* count the target tasks and CPUs */ 5436 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5437 u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr); 5438 5439 nr_tasks += nr; 5440 nr_cpus++; 5441 5442 before_min = min(nr, before_min); 5443 before_max = max(nr, before_max); 5444 } 5445 5446 if (!nr_cpus) 5447 return; 5448 5449 /* 5450 * We don't want CPUs to have more than $nr_donor_target tasks and 5451 * balancing to fill donee CPUs upto $nr_target. Once targets are 5452 * calculated, find the donee CPUs. 
5453 */ 5454 nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); 5455 nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); 5456 5457 cpumask_clear(donee_mask); 5458 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5459 if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target) 5460 cpumask_set_cpu(cpu, donee_mask); 5461 } 5462 5463 /* iterate !donee CPUs and see if they should be offloaded */ 5464 cpumask_clear(resched_mask); 5465 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5466 if (cpumask_empty(donee_mask)) 5467 break; 5468 if (cpumask_test_cpu(cpu, donee_mask)) 5469 continue; 5470 if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target) 5471 continue; 5472 5473 nr_balanced += bypass_lb_cpu(sch, cpu, donee_mask, resched_mask, 5474 nr_donor_target, nr_target); 5475 } 5476 5477 for_each_cpu(cpu, resched_mask) 5478 resched_cpu(cpu); 5479 5480 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5481 u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr); 5482 5483 after_min = min(nr, after_min); 5484 after_max = max(nr, after_max); 5485 5486 } 5487 5488 trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced, 5489 before_min, before_max, after_min, after_max); 5490 } 5491 5492 /* 5493 * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine 5494 * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some 5495 * bypass DSQs can be overloaded. If there are enough tasks to saturate other 5496 * lightly loaded CPUs, such imbalance can lead to very high execution latency 5497 * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such 5498 * outcomes, a simple load balancing mechanism is implemented by the following 5499 * timer which runs periodically while bypass mode is in effect. 
*/
static void scx_bypass_lb_timerfn(struct timer_list *timer)
{
	struct scx_sched *sch = container_of(timer, struct scx_sched, bypass_lb_timer);
	int node;
	u32 intv_us;

	/* not re-armed once bypass dispatch is disabled */
	if (!bypass_dsp_enabled(sch))
		return;

	for_each_node_with_cpus(node)
		bypass_lb_node(sch, node);

	/* a zero interval stops the periodic balancing */
	intv_us = READ_ONCE(scx_bypass_lb_intv_us);
	if (intv_us)
		mod_timer(timer, jiffies + usecs_to_jiffies(intv_us));
}

/*
 * Increment @sch->bypass_depth with scx_bypass_lock held. Returns %true only on
 * the 0 -> 1 transition, in which case the default slice is switched to the
 * bypass slice, the bypass start timestamp is recorded and
 * %SCX_EV_BYPASS_ACTIVATE is counted.
 */
static bool inc_bypass_depth(struct scx_sched *sch)
{
	lockdep_assert_held(&scx_bypass_lock);

	WARN_ON_ONCE(sch->bypass_depth < 0);
	WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1);
	if (sch->bypass_depth != 1)
		return false;

	WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
	sch->bypass_timestamp = ktime_get_ns();
	scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
	return true;
}

/*
 * Decrement @sch->bypass_depth with scx_bypass_lock held. Returns %true only on
 * the 1 -> 0 transition, in which case the default slice is restored to
 * %SCX_SLICE_DFL and the time spent bypassing is added to
 * %SCX_EV_BYPASS_DURATION.
 */
static bool dec_bypass_depth(struct scx_sched *sch)
{
	lockdep_assert_held(&scx_bypass_lock);

	WARN_ON_ONCE(sch->bypass_depth < 1);
	WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1);
	if (sch->bypass_depth != 0)
		return false;

	WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL);
	scx_add_event(sch, SCX_EV_BYPASS_DURATION,
		      ktime_get_ns() - sch->bypass_timestamp);
	return true;
}

/*
 * Enable bypass dispatch for @sch and for the sched hosting its bypass DSQs
 * (the parent if there is one, @sch itself otherwise), and start the host's
 * load balancing timer if it isn't already pending. Undone by
 * disable_bypass_dsp().
 */
static void enable_bypass_dsp(struct scx_sched *sch)
{
	struct scx_sched *host = scx_parent(sch) ?: sch;
	u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us);
	s32 ret;

	/*
	 * @sch->bypass_depth transitioning from 0 to 1 triggers enabling.
	 * Shouldn't stagger.
	 */
	if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim)))
		return;

	/*
	 * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of
	 * the nearest non-bypassing ancestor or root. As enable_bypass_dsp() is
	 * called iff @sch is not already bypassed due to an ancestor bypassing,
	 * we can assume that the parent is not bypassing and thus will be the
	 * host of the bypass DSQs.
	 *
	 * While the situation may change in the future, the following
	 * guarantees that the nearest non-bypassing ancestor or root has bypass
	 * dispatch enabled while a descendant is bypassing, which is all that's
	 * required.
	 *
	 * bypass_dsp_enabled() test is used to determine whether to enter the
	 * bypass dispatch handling path from both bypassing and hosting scheds.
	 * Bump enable depth on both @sch and bypass dispatch host.
	 */
	ret = atomic_inc_return(&sch->bypass_dsp_enable_depth);
	WARN_ON_ONCE(ret <= 0);

	if (host != sch) {
		ret = atomic_inc_return(&host->bypass_dsp_enable_depth);
		WARN_ON_ONCE(ret <= 0);
	}

	/*
	 * The LB timer will stop running if bypass dispatch is disabled. Start
	 * after enabling bypass dispatch.
	 */
	if (intv_us && !timer_pending(&host->bypass_lb_timer))
		mod_timer(&host->bypass_lb_timer,
			  jiffies + usecs_to_jiffies(intv_us));
}

/*
 * Drop the bypass dispatch enable counts taken by enable_bypass_dsp(). The
 * claim bit makes this a no-op unless a matching enable is outstanding.
 *
 * may be called without holding scx_bypass_lock
 */
static void disable_bypass_dsp(struct scx_sched *sch)
{
	s32 ret;

	if (!test_and_clear_bit(0, &sch->bypass_dsp_claim))
		return;

	ret = atomic_dec_return(&sch->bypass_dsp_enable_depth);
	WARN_ON_ONCE(ret < 0);

	if (scx_parent(sch)) {
		ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth);
		WARN_ON_ONCE(ret < 0);
	}
}

/**
 * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
 * @sch: sched to bypass
 * @bypass: true for bypass, false for unbypass
 *
 * Bypassing guarantees that all runnable tasks make forward progress without
 * trusting the BPF scheduler.
We can't grab any mutexes or rwsems as they might 5618 * be held by tasks that the BPF scheduler is forgetting to run, which 5619 * unfortunately also excludes toggling the static branches. 5620 * 5621 * Let's work around by overriding a couple ops and modifying behaviors based on 5622 * the DISABLING state and then cycling the queued tasks through dequeue/enqueue 5623 * to force global FIFO scheduling. 5624 * 5625 * - ops.select_cpu() is ignored and the default select_cpu() is used. 5626 * 5627 * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. 5628 * %SCX_OPS_ENQ_LAST is also ignored. 5629 * 5630 * - ops.dispatch() is ignored. 5631 * 5632 * - balance_one() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice 5633 * can't be trusted. Whenever a tick triggers, the running task is rotated to 5634 * the tail of the queue with core_sched_at touched. 5635 * 5636 * - pick_next_task() suppresses zero slice warning. 5637 * 5638 * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM 5639 * operations. 5640 * 5641 * - scx_prio_less() reverts to the default core_sched_at order. 5642 */ 5643 static void scx_bypass(struct scx_sched *sch, bool bypass) 5644 { 5645 struct scx_sched *pos; 5646 unsigned long flags; 5647 int cpu; 5648 5649 raw_spin_lock_irqsave(&scx_bypass_lock, flags); 5650 5651 if (bypass) { 5652 if (!inc_bypass_depth(sch)) 5653 goto unlock; 5654 5655 enable_bypass_dsp(sch); 5656 } else { 5657 if (!dec_bypass_depth(sch)) 5658 goto unlock; 5659 } 5660 5661 /* 5662 * Bypass state is propagated to all descendants - an scx_sched bypasses 5663 * if itself or any of its ancestors are in bypass mode. 5664 */ 5665 raw_spin_lock(&scx_sched_lock); 5666 scx_for_each_descendant_pre(pos, sch) { 5667 if (pos == sch) 5668 continue; 5669 if (bypass) 5670 inc_bypass_depth(pos); 5671 else 5672 dec_bypass_depth(pos); 5673 } 5674 raw_spin_unlock(&scx_sched_lock); 5675 5676 /* 5677 * No task property is changing. 
We just need to make sure all currently 5678 * queued tasks are re-queued according to the new scx_bypassing() 5679 * state. As an optimization, walk each rq's runnable_list instead of 5680 * the scx_tasks list. 5681 * 5682 * This function can't trust the scheduler and thus can't use 5683 * cpus_read_lock(). Walk all possible CPUs instead of online. 5684 */ 5685 for_each_possible_cpu(cpu) { 5686 struct rq *rq = cpu_rq(cpu); 5687 struct task_struct *p, *n; 5688 5689 raw_spin_rq_lock(rq); 5690 raw_spin_lock(&scx_sched_lock); 5691 5692 scx_for_each_descendant_pre(pos, sch) { 5693 struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu); 5694 5695 if (pos->bypass_depth) 5696 pcpu->flags |= SCX_SCHED_PCPU_BYPASSING; 5697 else 5698 pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING; 5699 } 5700 5701 raw_spin_unlock(&scx_sched_lock); 5702 5703 /* 5704 * We need to guarantee that no tasks are on the BPF scheduler 5705 * while bypassing. Either we see enabled or the enable path 5706 * sees scx_bypassing() before moving tasks to SCX. 5707 */ 5708 if (!scx_enabled()) { 5709 raw_spin_rq_unlock(rq); 5710 continue; 5711 } 5712 5713 /* 5714 * The use of list_for_each_entry_safe_reverse() is required 5715 * because each task is going to be removed from and added back 5716 * to the runnable_list during iteration. Because they're added 5717 * to the tail of the list, safe reverse iteration can still 5718 * visit all nodes. 
5719 */ 5720 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, 5721 scx.runnable_node) { 5722 if (!scx_is_descendant(scx_task_sched(p), sch)) 5723 continue; 5724 5725 /* cycling deq/enq is enough, see the function comment */ 5726 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 5727 /* nothing */ ; 5728 } 5729 } 5730 5731 /* resched to restore ticks and idle state */ 5732 if (cpu_online(cpu) || cpu == smp_processor_id()) 5733 resched_curr(rq); 5734 5735 raw_spin_rq_unlock(rq); 5736 } 5737 5738 /* disarming must come after moving all tasks out of the bypass DSQs */ 5739 if (!bypass) 5740 disable_bypass_dsp(sch); 5741 unlock: 5742 raw_spin_unlock_irqrestore(&scx_bypass_lock, flags); 5743 } 5744 5745 static void free_exit_info(struct scx_exit_info *ei) 5746 { 5747 kvfree(ei->dump); 5748 kfree(ei->msg); 5749 kfree(ei->bt); 5750 kfree(ei); 5751 } 5752 5753 static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) 5754 { 5755 struct scx_exit_info *ei; 5756 5757 ei = kzalloc_obj(*ei); 5758 if (!ei) 5759 return NULL; 5760 5761 ei->exit_cpu = -1; 5762 ei->bt = kzalloc_objs(ei->bt[0], SCX_EXIT_BT_LEN); 5763 ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); 5764 ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); 5765 5766 if (!ei->bt || !ei->msg || !ei->dump) { 5767 free_exit_info(ei); 5768 return NULL; 5769 } 5770 5771 return ei; 5772 } 5773 5774 static const char *scx_exit_reason(enum scx_exit_kind kind) 5775 { 5776 switch (kind) { 5777 case SCX_EXIT_UNREG: 5778 return "unregistered from user space"; 5779 case SCX_EXIT_UNREG_BPF: 5780 return "unregistered from BPF"; 5781 case SCX_EXIT_UNREG_KERN: 5782 return "unregistered from the main kernel"; 5783 case SCX_EXIT_SYSRQ: 5784 return "disabled by sysrq-S"; 5785 case SCX_EXIT_PARENT: 5786 return "parent exiting"; 5787 case SCX_EXIT_ERROR: 5788 return "runtime error"; 5789 case SCX_EXIT_ERROR_BPF: 5790 return "scx_bpf_error"; 5791 case SCX_EXIT_ERROR_STALL: 5792 return "runnable task stall"; 5793 
default: 5794 return "<UNKNOWN>"; 5795 } 5796 } 5797 5798 static void free_kick_syncs(void) 5799 { 5800 int cpu; 5801 5802 for_each_possible_cpu(cpu) { 5803 struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); 5804 struct scx_kick_syncs *to_free; 5805 5806 to_free = rcu_replace_pointer(*ksyncs, NULL, true); 5807 if (to_free) 5808 kvfree_rcu(to_free, rcu); 5809 } 5810 } 5811 5812 static void refresh_watchdog(void) 5813 { 5814 struct scx_sched *sch; 5815 unsigned long intv = ULONG_MAX; 5816 5817 /* take the shortest timeout and use its half for watchdog interval */ 5818 rcu_read_lock(); 5819 list_for_each_entry_rcu(sch, &scx_sched_all, all) 5820 intv = max(min(intv, sch->watchdog_timeout / 2), 1); 5821 rcu_read_unlock(); 5822 5823 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 5824 WRITE_ONCE(scx_watchdog_interval, intv); 5825 5826 if (intv < ULONG_MAX) 5827 mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv); 5828 else 5829 cancel_delayed_work_sync(&scx_watchdog_work); 5830 } 5831 5832 static s32 scx_link_sched(struct scx_sched *sch) 5833 { 5834 const char *err_msg = ""; 5835 s32 ret = 0; 5836 5837 scoped_guard(raw_spinlock_irq, &scx_sched_lock) { 5838 #ifdef CONFIG_EXT_SUB_SCHED 5839 struct scx_sched *parent = scx_parent(sch); 5840 5841 if (parent) { 5842 /* 5843 * scx_claim_exit() propagates exit_kind transition to 5844 * its sub-scheds while holding scx_sched_lock - either 5845 * we can see the parent's non-NONE exit_kind or the 5846 * parent can shoot us down. 
5847 */ 5848 if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) { 5849 err_msg = "parent disabled"; 5850 ret = -ENOENT; 5851 break; 5852 } 5853 5854 ret = rhashtable_lookup_insert_fast(&scx_sched_hash, 5855 &sch->hash_node, scx_sched_hash_params); 5856 if (ret) { 5857 err_msg = "failed to insert into scx_sched_hash"; 5858 break; 5859 } 5860 5861 list_add_tail(&sch->sibling, &parent->children); 5862 } 5863 #endif /* CONFIG_EXT_SUB_SCHED */ 5864 5865 list_add_tail_rcu(&sch->all, &scx_sched_all); 5866 } 5867 5868 /* 5869 * scx_error() takes scx_sched_lock via scx_claim_exit(), so it must run after 5870 * the guard above is released. 5871 */ 5872 if (ret) { 5873 scx_error(sch, "%s (%d)", err_msg, ret); 5874 return ret; 5875 } 5876 5877 refresh_watchdog(); 5878 return 0; 5879 } 5880 5881 static void scx_unlink_sched(struct scx_sched *sch) 5882 { 5883 scoped_guard(raw_spinlock_irq, &scx_sched_lock) { 5884 #ifdef CONFIG_EXT_SUB_SCHED 5885 if (scx_parent(sch)) { 5886 rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node, 5887 scx_sched_hash_params); 5888 list_del_init(&sch->sibling); 5889 } 5890 #endif /* CONFIG_EXT_SUB_SCHED */ 5891 list_del_rcu(&sch->all); 5892 } 5893 5894 refresh_watchdog(); 5895 } 5896 5897 /* 5898 * Called to disable future dumps and wait for in-progress one while disabling 5899 * @sch. Once @sch becomes empty during disable, there's no point in dumping it. 5900 * This prevents calling dump ops on a dead sch. 5901 */ 5902 static void scx_disable_dump(struct scx_sched *sch) 5903 { 5904 guard(raw_spinlock_irqsave)(&scx_dump_lock); 5905 sch->dump_disabled = true; 5906 } 5907 5908 static void scx_log_sched_disable(struct scx_sched *sch) 5909 { 5910 struct scx_exit_info *ei = sch->exit_info; 5911 const char *type = scx_parent(sch) ? 
"sub-scheduler" : "scheduler"; 5912 5913 if (ei->kind >= SCX_EXIT_ERROR) { 5914 pr_err("sched_ext: BPF %s \"%s\" disabled (%s)\n", type, 5915 sch->ops.name, ei->reason); 5916 5917 if (ei->msg[0] != '\0') 5918 pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg); 5919 #ifdef CONFIG_STACKTRACE 5920 stack_trace_print(ei->bt, ei->bt_len, 2); 5921 #endif 5922 } else { 5923 pr_info("sched_ext: BPF %s \"%s\" disabled (%s)\n", type, 5924 sch->ops.name, ei->reason); 5925 } 5926 } 5927 5928 #ifdef CONFIG_EXT_SUB_SCHED 5929 static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); 5930 5931 static void drain_descendants(struct scx_sched *sch) 5932 { 5933 /* 5934 * Child scheds that finished the critical part of disabling will take 5935 * themselves off @sch->children. Wait for it to drain. As propagation 5936 * is recursive, empty @sch->children means that all proper descendant 5937 * scheds reached unlinking stage. 5938 */ 5939 wait_event(scx_unlink_waitq, list_empty(&sch->children)); 5940 } 5941 5942 static void scx_fail_parent(struct scx_sched *sch, 5943 struct task_struct *failed, s32 fail_code) 5944 { 5945 struct scx_sched *parent = scx_parent(sch); 5946 struct scx_task_iter sti; 5947 struct task_struct *p; 5948 5949 scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler", 5950 fail_code, failed->comm, failed->pid); 5951 5952 /* 5953 * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into 5954 * it. This may cause downstream failures on the BPF side but $parent is 5955 * dying anyway. 
5956 */ 5957 scx_bypass(parent, true); 5958 5959 scx_task_iter_start(&sti, sch->cgrp); 5960 while ((p = scx_task_iter_next_locked(&sti))) { 5961 if (scx_task_on_sched(parent, p)) 5962 continue; 5963 5964 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 5965 scx_disable_and_exit_task(sch, p); 5966 scx_set_task_sched(p, parent); 5967 } 5968 } 5969 scx_task_iter_stop(&sti); 5970 } 5971 5972 static void scx_sub_disable(struct scx_sched *sch) 5973 { 5974 struct scx_sched *parent = scx_parent(sch); 5975 struct scx_task_iter sti; 5976 struct task_struct *p; 5977 int ret; 5978 5979 /* 5980 * Guarantee forward progress and wait for descendants to be disabled. 5981 * To limit disruptions, $parent is not bypassed. Tasks are fully 5982 * prepped and then inserted back into $parent. 5983 */ 5984 scx_bypass(sch, true); 5985 drain_descendants(sch); 5986 5987 /* 5988 * Here, every runnable task is guaranteed to make forward progress and 5989 * we can safely use blocking synchronization constructs. Actually 5990 * disable ops. 5991 */ 5992 mutex_lock(&scx_enable_mutex); 5993 percpu_down_write(&scx_fork_rwsem); 5994 scx_cgroup_lock(); 5995 5996 set_cgroup_sched(sch_cgroup(sch), parent); 5997 5998 scx_task_iter_start(&sti, sch->cgrp); 5999 while ((p = scx_task_iter_next_locked(&sti))) { 6000 struct rq *rq; 6001 struct rq_flags rf; 6002 6003 /* filter out duplicate visits */ 6004 if (scx_task_on_sched(parent, p)) 6005 continue; 6006 6007 /* 6008 * By the time control reaches here, all descendant schedulers 6009 * should already have been disabled. 6010 */ 6011 WARN_ON_ONCE(!scx_task_on_sched(sch, p)); 6012 6013 /* 6014 * @p is pinned by the iter: css_task_iter_next() takes a 6015 * reference and holds it until the next iter_next() call, so 6016 * @p->usage is guaranteed > 0. 6017 */ 6018 get_task_struct(p); 6019 6020 scx_task_iter_unlock(&sti); 6021 6022 /* 6023 * $p is READY or ENABLED on @sch. 
Initialize for $parent, 6024 * disable and exit from @sch, and then switch over to $parent. 6025 * 6026 * If a task fails to initialize for $parent, the only available 6027 * action is disabling $parent too. While this allows disabling 6028 * of a child sched to cause the parent scheduler to fail, the 6029 * failure can only originate from ops.init_task() of the 6030 * parent. A child can't directly affect the parent through its 6031 * own failures. 6032 */ 6033 ret = __scx_init_task(parent, p, false); 6034 if (ret) { 6035 scx_fail_parent(sch, p, ret); 6036 put_task_struct(p); 6037 break; 6038 } 6039 6040 rq = task_rq_lock(p, &rf); 6041 6042 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 6043 /* 6044 * sched_ext_dead() raced us between __scx_init_task() 6045 * and this rq lock and ran exit_task() on @sch (the 6046 * sched @p was on at that point), not on $parent. 6047 * $parent's just-completed init is owed an exit_task() 6048 * and we issue it here. 6049 */ 6050 scx_sub_init_cancel_task(parent, p); 6051 task_rq_unlock(rq, p, &rf); 6052 put_task_struct(p); 6053 continue; 6054 } 6055 6056 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 6057 /* 6058 * $p is initialized for $parent and still attached to 6059 * @sch. Disable and exit for @sch, switch over to 6060 * $parent, override the state to READY to account for 6061 * $p having already been initialized, and then enable. 6062 */ 6063 scx_disable_and_exit_task(sch, p); 6064 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 6065 scx_set_task_state(p, SCX_TASK_INIT); 6066 scx_set_task_sched(p, parent); 6067 scx_set_task_state(p, SCX_TASK_READY); 6068 scx_enable_task(parent, p); 6069 } 6070 6071 task_rq_unlock(rq, p, &rf); 6072 put_task_struct(p); 6073 } 6074 scx_task_iter_stop(&sti); 6075 6076 scx_disable_dump(sch); 6077 6078 scx_cgroup_unlock(); 6079 percpu_up_write(&scx_fork_rwsem); 6080 6081 /* 6082 * All tasks are moved off of @sch but there may still be on-going 6083 * operations (e.g. ops.select_cpu()). 
Drain them by flushing RCU. Use
	 * the expedited version as ancestors may be waiting in bypass mode.
	 * Also, tell the parent that there is no need to keep running bypass
	 * DSQs for us.
	 */
	synchronize_rcu_expedited();
	disable_bypass_dsp(sch);

	scx_unlink_sched(sch);

	mutex_unlock(&scx_enable_mutex);

	/*
	 * @sch is now unlinked from the parent's children list. Notify and call
	 * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
	 * after unlinking and releasing all locks. See scx_claim_exit().
	 */
	wake_up_all(&scx_unlink_waitq);

	if (parent->ops.sub_detach && sch->sub_attached) {
		struct scx_sub_detach_args sub_detach_args = {
			.ops = &sch->ops,
			.cgroup_path = sch->cgrp_path,
		};
		SCX_CALL_OP(parent, sub_detach, NULL,
			    &sub_detach_args);
	}

	scx_log_sched_disable(sch);

	if (sch->ops.exit)
		SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
	if (sch->sub_kset)
		kobject_del(&sch->sub_kset->kobj);
	kobject_del(&sch->kobj);
}
#else	/* CONFIG_EXT_SUB_SCHED */
/* Without sub-sched support there are no descendants and no sub-scheds. */
static inline void drain_descendants(struct scx_sched *sch) { }
static inline void scx_sub_disable(struct scx_sched *sch) { }
#endif	/* CONFIG_EXT_SUB_SCHED */

/*
 * Tear down the root scheduler @sch.
 *
 * Bypass mode is entered and descendants are drained first. Then, with
 * scx_enable_mutex held, cgroup support is shut down, every task is switched
 * to the class returned by scx_setscheduler_class() and exited from its
 * scheduler, the ext/fair dl_server bandwidth is rebalanced, the static keys
 * are turned off, ops.exit() is invoked if implemented, and @sch is unlinked
 * and scx_root cleared. Bypass mode is dropped before returning on every path.
 */
static void scx_root_disable(struct scx_sched *sch)
{
	struct scx_task_iter sti;
	struct task_struct *p;
	bool was_switched_all;
	int cpu;

	/* guarantee forward progress and wait for descendants to be disabled */
	scx_bypass(sch, true);
	drain_descendants(sch);

	/* the switch operates on the state returned by the transition to DISABLING */
	switch (scx_set_enable_state(SCX_DISABLING)) {
	case SCX_DISABLING:
		WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
		break;
	case SCX_DISABLED:
		pr_warn("sched_ext: ops error detected without ops (%s)\n",
			sch->exit_info->msg);
		/* nothing was enabled; restore DISABLED and only drop bypass */
		WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
		goto done;
	default:
		break;
	}

	/*
	 * Here, every runnable task is guaranteed to make forward progress and
	 * we can safely use blocking synchronization constructs. Actually
	 * disable ops.
	 */
	mutex_lock(&scx_enable_mutex);

	/* sample before the key is disabled; consumed by the dl_server loop below */
	was_switched_all = scx_switched_all();

	static_branch_disable(&__scx_switched_all);
	WRITE_ONCE(scx_switching_all, false);

	/*
	 * Shut down cgroup support before tasks so that the cgroup attach path
	 * doesn't race against scx_disable_and_exit_task().
	 */
	scx_cgroup_lock();
	scx_cgroup_exit(sch);
	scx_cgroup_unlock();

	/*
	 * The BPF scheduler is going away. All tasks including %TASK_DEAD ones
	 * must be switched out and exited synchronously.
	 */
	percpu_down_write(&scx_fork_rwsem);

	scx_init_task_enabled = false;

	scx_task_iter_start(&sti, NULL);
	while ((p = scx_task_iter_next_locked(&sti))) {
		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
		const struct sched_class *old_class = p->sched_class;
		const struct sched_class *new_class = scx_setscheduler_class(p);

		/* DEQUEUE_NOCLOCK is passed above, so update the clock explicitly */
		update_rq_clock(task_rq(p));

		if (old_class != new_class)
			queue_flags |= DEQUEUE_CLASS;

		scoped_guard (sched_change, p, queue_flags) {
			p->sched_class = new_class;
		}

		scx_disable_and_exit_task(scx_task_sched(p), p);
	}
	scx_task_iter_stop(&sti);

	scx_disable_dump(sch);

	scx_cgroup_lock();
	set_cgroup_sched(sch_cgroup(sch), NULL);
	scx_cgroup_unlock();

	percpu_up_write(&scx_fork_rwsem);

	/*
	 * Invalidate all the rq clocks to prevent getting outdated
	 * rq clocks from a previous scx scheduler.
	 *
	 * Also re-balance the dl_server bandwidth reservations: detach
	 * ext_server (no more sched_ext tasks) and reinstate fair_server if it
	 * was previously detached because we were running in full mode.
	 *
	 * Unlike the enable path, this runs on a recovery path that cannot
	 * fail, so we use dl_server_swap_bw() to atomically free ext_server's
	 * bandwidth and reclaim it for fair_server under the same dl_b lock.
	 *
	 * The swap can still fail with -EBUSY if someone bumped ext_server's
	 * runtime via debugfs between enable and disable; in that narrow case
	 * both servers end up detached and we just WARN.
	 */
	for_each_possible_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);

		scx_rq_clock_invalidate(rq);

		scoped_guard(rq_lock_irqsave, rq) {
			update_rq_clock(rq);
			if (was_switched_all) {
				if (WARN_ON_ONCE(dl_server_swap_bw(&rq->ext_server,
								   &rq->fair_server)))
					pr_warn("failed to re-attach fair_server on CPU %d\n", cpu);
			} else {
				dl_server_detach_bw(&rq->ext_server);
			}
		}
	}

	/* no task is on scx, turn off all the switches and flush in-progress calls */
	static_branch_disable(&__scx_enabled);
	static_branch_disable(&__scx_is_cid_type);
	if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
		static_branch_disable(&__scx_tid_to_task_enabled);
	bitmap_zero(sch->has_op, SCX_OPI_END);
	scx_idle_disable();
	synchronize_rcu();
	/* the tid hash is destroyed only after the RCU grace period above */
	if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
		rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL);

	scx_log_sched_disable(sch);

	if (sch->ops.exit)
		SCX_CALL_OP(sch, exit, NULL, sch->exit_info);

	scx_unlink_sched(sch);

	/*
	 * scx_root clearing must be inside cpus_read_lock(). See
	 * handle_hotplug().
	 */
	cpus_read_lock();
	RCU_INIT_POINTER(scx_root, NULL);
	cpus_read_unlock();

	/*
	 * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
	 * could observe an object of the same name still in the hierarchy when
	 * the next scheduler is loaded.
	 */
#ifdef CONFIG_EXT_SUB_SCHED
	if (sch->sub_kset)
		kobject_del(&sch->sub_kset->kobj);
#endif
	kobject_del(&sch->kobj);

	free_kick_syncs();

	mutex_unlock(&scx_enable_mutex);

	WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
done:
	scx_bypass(sch, false);
}

/*
 * Claim the exit on @sch. The caller must ensure that the helper kthread work
 * is kicked before the current task can be preempted. Once exit_kind is
 * claimed, scx_error() can no longer trigger, so if the current task gets
 * preempted and the BPF scheduler fails to schedule it back, the helper work
 * will never be kicked and the whole system can wedge.
 */
static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
{
	int none = SCX_EXIT_NONE;

	lockdep_assert_preemption_disabled();

	if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
		kind = SCX_EXIT_ERROR;

	if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
		return false;

	/*
	 * Some CPUs may be trapped in the dispatch paths. Set the aborting
	 * flag to break potential live-lock scenarios, ensuring we can
	 * successfully reach scx_bypass().
	 */
	WRITE_ONCE(sch->aborting, true);

	/*
	 * Propagate exits to descendants immediately. Each has a dedicated
	 * helper kthread and can run in parallel. While most of disabling is
	 * serialized, running them in separate threads allows parallelizing
	 * ops.exit(), which can take arbitrarily long prolonging bypass mode.
	 *
	 * To guarantee forward progress, this propagation must be in-line so
	 * that ->aborting is synchronously asserted for all sub-scheds. The
	 * propagation is also the interlocking point against sub-sched
	 * attachment. See scx_link_sched().
*
	 * This doesn't cause recursions as propagation only takes place for
	 * non-propagation exits.
	 */
	if (kind != SCX_EXIT_PARENT) {
		scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) {
			struct scx_sched *pos;
			scx_for_each_descendant_pre(pos, sch)
				scx_disable(pos, SCX_EXIT_PARENT);
		}
	}

	return true;
}

/*
 * Kthread work function which performs the actual disabling of a scheduler.
 *
 * Atomically moves sch->exit_kind to %SCX_EXIT_DONE, returning early if it is
 * already there, records the previous kind and its reason string in the exit
 * info, and then runs the sub-sched or root disable path depending on whether
 * the scheduler has a parent.
 */
static void scx_disable_workfn(struct kthread_work *work)
{
	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
	struct scx_exit_info *ei = sch->exit_info;
	int kind;

	kind = atomic_read(&sch->exit_kind);
	while (true) {
		if (kind == SCX_EXIT_DONE)	/* already disabled? */
			return;
		WARN_ON_ONCE(kind == SCX_EXIT_NONE);
		/* on failure, @kind is refreshed with the current value and we retry */
		if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
			break;
	}
	ei->kind = kind;
	ei->reason = scx_exit_reason(ei->kind);

	if (scx_parent(sch))
		scx_sub_disable(sch);
	else
		scx_root_disable(sch);
}

/*
 * Claim the exit on @sch with @kind and, if the claim succeeds, queue the
 * disable irq_work. Preemption stays disabled across both steps, which is
 * what scx_claim_exit() asserts.
 */
static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind)
{
	guard(preempt)();
	if (scx_claim_exit(sch, kind))
		irq_work_queue(&sch->disable_irq_work);
}

/**
 * scx_flush_disable_work - flush the disable work and wait for it to finish
 * @sch: the scheduler
 *
 * sch->disable_work might not be queued yet, in which case
 * kthread_flush_work() is a no-op. Syncing the irq_work first is required to
 * guarantee that the kthread work has been queued before waiting for it.
 *
 * The sequence is repeated until sch->exit_kind reads either %SCX_EXIT_NONE
 * or %SCX_EXIT_DONE.
 */
static void scx_flush_disable_work(struct scx_sched *sch)
{
	int kind;

	do {
		irq_work_sync(&sch->disable_irq_work);
		kthread_flush_work(&sch->disable_work);
		kind = atomic_read(&sch->exit_kind);
	} while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE);
}

/* Emit an empty line to the sched_ext_dump tracepoint and, if sized, to @s. */
static void dump_newline(struct seq_buf *s)
{
	trace_sched_ext_dump("");

	/* @s may be zero sized and seq_buf triggers WARN if so */
	if (s->size)
		seq_buf_putc(s, '\n');
}

/*
 * Format one line according to @fmt. The line is sent to the sched_ext_dump
 * tracepoint when that tracepoint is enabled and appended to @s followed by
 * a newline when @s is not zero sized.
 */
static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...)
{
	va_list args;

#ifdef CONFIG_TRACEPOINTS
	if (trace_sched_ext_dump_enabled()) {
		/* protected by scx_dump_lock */
		static char line_buf[SCX_EXIT_MSG_LEN];

		va_start(args, fmt);
		vscnprintf(line_buf, sizeof(line_buf), fmt, args);
		va_end(args);

		trace_call__sched_ext_dump(line_buf);
	}
#endif
	/* @s may be zero sized and seq_buf triggers WARN if so */
	if (s->size) {
		va_start(args, fmt);
		seq_buf_vprintf(s, fmt, args);
		va_end(args);

		seq_buf_putc(s, '\n');
	}
}

/* Emit the @len entries of @bt, one symbolized (%pS) entry per line after @prefix. */
static void dump_stack_trace(struct seq_buf *s, const char *prefix,
			     const unsigned long *bt, unsigned int len)
{
	unsigned int i;

	for (i = 0; i < len; i++)
		dump_line(s, "%s%pS", prefix, (void *)bt[i]);
}

/*
 * Point scx_dump_data at @s and @prefix and reset its cursor before an
 * ops.dump*() callback is invoked. Must be called with IRQs disabled.
 */
static void ops_dump_init(struct seq_buf *s, const char *prefix)
{
	struct scx_dump_data *dd = &scx_dump_data;

	lockdep_assert_irqs_disabled();

	dd->cpu = smp_processor_id();		/* allow scx_bpf_dump() */
	dd->first = true;
	dd->cursor = 0;
	dd->s = s;
	dd->prefix = prefix;
}

/*
 * Emit whatever has accumulated in scx_dump_data's line buffer through
 * dump_line(), one line at a time with the configured prefix, and reset the
 * cursor. Does nothing if the cursor is zero.
 */
static void ops_dump_flush(void)
{
	struct scx_dump_data *dd = &scx_dump_data;
	char *line = dd->buf.line;

	if (!dd->cursor)
		return;

	/*
	 * There's something to flush and this is the
first line. Insert a blank
	 * line to distinguish ops dump.
	 */
	if (dd->first) {
		dump_newline(dd->s);
		dd->first = false;
	}

	/*
	 * There may be multiple lines in $line. Scan and emit each line
	 * separately.
	 */
	while (true) {
		char *end = line;
		char c;

		while (*end != '\n' && *end != '\0')
			end++;

		/*
		 * If $line overflowed, it may not have newline at the end.
		 * Always emit with a newline.
		 */
		c = *end;
		*end = '\0';
		dump_line(dd->s, "%s%s", dd->prefix, line);
		if (c == '\0')
			break;

		/* move to the next line */
		end++;
		if (*end == '\0')
			break;
		line = end;
	}

	dd->cursor = 0;
}

/*
 * Flush any pending ops dump output and reset scx_dump_data.cpu to -1.
 * NOTE(review): -1 presumably disallows scx_bpf_dump() again, mirroring the
 * "allow" set up in ops_dump_init() - confirm against scx_bpf_dump().
 */
static void ops_dump_exit(void)
{
	ops_dump_flush();
	scx_dump_data.cpu = -1;
}

/*
 * Dump the state of task @p into @s on behalf of @sch.
 *
 * The header line carries @marker, the task state character, comm/pid, the
 * scheduler the task belongs to ("root" or "sub<level>-<cgroup id>", prefixed
 * with "*" when that scheduler is @sch) and the delta between the task's
 * runnable_at and the dump timestamp. This is followed by the SCX fields of
 * the task, the output of ops.dump_task() if @sch implements it, and, with
 * CONFIG_STACKTRACE, the task's stack trace.
 */
static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx,
			  struct rq *rq, struct task_struct *p, char marker)
{
	/* NOTE(review): static buffer - presumably serialized by scx_dump_lock; confirm */
	static unsigned long bt[SCX_EXIT_BT_LEN];
	struct scx_sched *task_sch = scx_task_sched(p);
	const char *own_marker;
	char sch_id_buf[32];
	char dsq_id_buf[19] = "(n/a)";
	unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
	unsigned int bt_len = 0;

	own_marker = task_sch == sch ? "*" : "";

	if (task_sch->level == 0)
		scnprintf(sch_id_buf, sizeof(sch_id_buf), "root");
	else
		scnprintf(sch_id_buf, sizeof(sch_id_buf), "sub%d-%llu",
			  task_sch->level, task_sch->ops.sub_cgroup_id);

	/* "(n/a)" is kept when the task is not on any DSQ */
	if (p->scx.dsq)
		scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
			  (unsigned long long)p->scx.dsq->id);

	dump_newline(s);
	dump_line(s, " %c%c %s[%d] %s%s %+ldms",
		  marker, task_state_to_char(p), p->comm, p->pid,
		  own_marker, sch_id_buf,
		  jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
	dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu",
		  scx_get_task_state(p) >> SCX_TASK_STATE_SHIFT,
		  p->scx.flags & ~SCX_TASK_STATE_MASK,
		  p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
		  ops_state >> SCX_OPSS_QSEQ_SHIFT);
	dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
		  p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
	dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
		  p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
	dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr),
		  p->migration_disabled);

	if (SCX_HAS_OP(sch, dump_task)) {
		ops_dump_init(s, " ");
		SCX_CALL_OP(sch, dump_task, rq, dctx, p);
		ops_dump_exit();
	}

#ifdef CONFIG_STACKTRACE
	bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
#endif
	if (bt_len) {
		dump_newline(s);
		dump_stack_trace(s, " ", bt, bt_len);
	}
}

/*
 * Dump the state of @cpu's rq into @s with the rq locked.
 *
 * A CPU is considered idle when its runnable list is empty and it is running
 * the idle class. Idle CPUs are skipped unless ops.dump_cpu() is implemented
 * and produces output. After the CPU summary, the current task (if it is in
 * the ext class) and the tasks on the runnable list are dumped when
 * @dump_all_tasks is set or the task is on @sch.
 */
static void scx_dump_cpu(struct scx_sched *sch, struct seq_buf *s,
			 struct scx_dump_ctx *dctx, int cpu,
			 bool dump_all_tasks)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;
	struct task_struct *p;
	struct seq_buf ns;
	size_t avail, used;
	char *buf;
	bool idle;

	rq_lock_irqsave(rq, &rf);

	idle = list_empty(&rq->scx.runnable_list) &&
		rq->curr->sched_class == &idle_sched_class;

	if (idle && !SCX_HAS_OP(sch, dump_cpu))
		goto next;

	/*
	 * We don't yet know whether ops.dump_cpu() will produce output
	 * and we may want to skip the default CPU dump if it doesn't.
	 * Use a nested seq_buf to generate the standard dump so that we
	 * can decide whether to commit later.
	 */
	avail = seq_buf_get_buf(s, &buf);
	seq_buf_init(&ns, buf, avail);

	dump_newline(&ns);
	dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu",
		  cpu, rq->scx.nr_running, rq->scx.flags,
		  rq->scx.cpu_released, rq->scx.ops_qseq,
		  rq->scx.kick_sync);
	dump_line(&ns, " curr=%s[%d] class=%ps",
		  rq->curr->comm, rq->curr->pid,
		  rq->curr->sched_class);
	if (!cpumask_empty(rq->scx.cpus_to_kick))
		dump_line(&ns, " cpus_to_kick : %*pb",
			  cpumask_pr_args(rq->scx.cpus_to_kick));
	if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
		dump_line(&ns, " idle_to_kick : %*pb",
			  cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
	if (!cpumask_empty(rq->scx.cpus_to_preempt))
		dump_line(&ns, " cpus_to_preempt: %*pb",
			  cpumask_pr_args(rq->scx.cpus_to_preempt));
	if (!cpumask_empty(rq->scx.cpus_to_wait))
		dump_line(&ns, " cpus_to_wait : %*pb",
			  cpumask_pr_args(rq->scx.cpus_to_wait));
	if (!cpumask_empty(rq->scx.cpus_to_sync))
		dump_line(&ns, " cpus_to_sync : %*pb",
			  cpumask_pr_args(rq->scx.cpus_to_sync));

	/* remember how much the standard dump used to detect ops output below */
	used = seq_buf_used(&ns);
	if (SCX_HAS_OP(sch, dump_cpu)) {
		ops_dump_init(&ns, " ");
		SCX_CALL_OP(sch, dump_cpu, rq, dctx, scx_cpu_arg(cpu), idle);
		ops_dump_exit();
	}

	/*
	 * If idle && nothing generated by ops.dump_cpu(), there's
	 * nothing interesting. Skip.
	 */
	if (idle && used == seq_buf_used(&ns))
		goto next;

	/*
	 * $s may already have overflowed when $ns was created. If so,
	 * calling commit on it will trigger BUG.
	 */
	if (avail) {
		seq_buf_commit(s, seq_buf_used(&ns));
		if (seq_buf_has_overflowed(&ns))
			seq_buf_set_overflow(s);
	}

	if (rq->curr->sched_class == &ext_sched_class &&
	    (dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
		scx_dump_task(sch, s, dctx, rq, rq->curr, '*');

	list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
		if (dump_all_tasks || scx_task_on_sched(sch, p))
			scx_dump_task(sch, s, dctx, rq, p, ' ');
next:
	rq_unlock_irqrestore(rq, &rf);
}

/*
 * Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless
 * of which scheduler they belong to. If false, only dump tasks owned by @sch.
 * For SysRq-D dumps, @dump_all_tasks=false since all schedulers are dumped
 * separately. For error dumps, @dump_all_tasks=true since only the failing
 * scheduler is dumped.
 *
 * The dump is written into @ei->dump, which is @dump_len bytes long, while
 * holding scx_dump_lock. Nothing is written if @sch has dumps disabled. If
 * the output overflowed and the buffer is large enough, the tail of the
 * buffer is overwritten with a truncation marker.
 */
static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
			   size_t dump_len, bool dump_all_tasks)
{
	static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
	struct scx_dump_ctx dctx = {
		.kind = ei->kind,
		.exit_code = ei->exit_code,
		.reason = ei->reason,
		.at_ns = ktime_get_ns(),
		.at_jiffies = jiffies,
	};
	struct seq_buf s;
	struct scx_event_stats events;
	int cpu;

	guard(raw_spinlock_irqsave)(&scx_dump_lock);

	if (sch->dump_disabled)
		return;

	seq_buf_init(&s, ei->dump, dump_len);

#ifdef CONFIG_EXT_SUB_SCHED
	if (sch->level == 0)
		dump_line(&s, "%s: root", sch->ops.name);
	else
		dump_line(&s, "%s: sub%d-%llu %s",
			  sch->ops.name, sch->level, sch->ops.sub_cgroup_id,
			  sch->cgrp_path);
#endif
	if (ei->kind == SCX_EXIT_NONE) {
		dump_line(&s, "Debug dump triggered by %s", ei->reason);
	} else {
		if (ei->exit_cpu >= 0)
			dump_line(&s, "%s[%d] triggered exit kind %d on CPU %d:",
				  current->comm, current->pid, ei->kind,
				  ei->exit_cpu);
		else
			dump_line(&s, "%s[%d] triggered exit kind %d:",
				  current->comm, current->pid, ei->kind);
		dump_line(&s, " %s (%s)", ei->reason, ei->msg);
		dump_newline(&s);
		dump_line(&s, "Backtrace:");
		dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
	}

	if (SCX_HAS_OP(sch, dump)) {
		ops_dump_init(&s, "");
		SCX_CALL_OP(sch, dump, NULL, &dctx);
		ops_dump_exit();
	}

	dump_newline(&s);
	dump_line(&s, "CPU states");
	dump_line(&s, "----------");

	/*
	 * Dump the exit CPU first so it isn't lost to dump truncation, then
	 * walk the rest in order, skipping the one already dumped.
	 */
	if (ei->exit_cpu >= 0)
		scx_dump_cpu(sch, &s, &dctx, ei->exit_cpu, dump_all_tasks);
	for_each_possible_cpu(cpu) {
		if (cpu != ei->exit_cpu)
			scx_dump_cpu(sch, &s, &dctx, cpu, dump_all_tasks);
	}

	dump_newline(&s);
	dump_line(&s, "Event counters");
	dump_line(&s, "--------------");

	scx_read_events(sch, &events);
	scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
	scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
	scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
	scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
	scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
	scx_dump_event(s, &events, SCX_EV_REENQ_IMMED);
	scx_dump_event(s, &events, SCX_EV_REENQ_LOCAL_REPEAT);
	scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL);
	scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
	scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
	scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
	scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED);
	scx_dump_event(s, &events, SCX_EV_SUB_BYPASS_DISPATCH);

	/* sizeof() includes the NUL, so the marker also terminates the buffer */
	if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
		memcpy(ei->dump + dump_len - sizeof(trunc_marker),
		       trunc_marker, sizeof(trunc_marker));
}

/*
 * irq_work handler for a claimed exit. Generates the debug dump when the exit
 * kind is %SCX_EXIT_ERROR or above and then hands the rest of the disabling
 * off to the scheduler's helper kthread.
 */
static void scx_disable_irq_workfn(struct irq_work *irq_work)
{
	struct scx_sched *sch = container_of(irq_work, struct scx_sched, disable_irq_work);
	struct scx_exit_info *ei = sch->exit_info;

	if (ei->kind >= SCX_EXIT_ERROR)
		scx_dump_state(sch, ei, sch->ops.exit_dump_len, true);

	kthread_queue_work(sch->helper, &sch->disable_work);
}

/*
 * Initiate an exit of @sch with @kind.
 *
 * If the exit can be claimed, the exit code, formatted message, exit CPU and,
 * with CONFIG_STACKTRACE and @kind >= %SCX_EXIT_ERROR, a backtrace are recorded
 * in the exit info and the disable irq_work is queued. Returns %true in that
 * case and %false if scx_claim_exit() failed.
 */
bool scx_vexit(struct scx_sched *sch,
	       enum scx_exit_kind kind, s64 exit_code, s32 exit_cpu,
	       const char *fmt, va_list args)
{
	struct scx_exit_info *ei = sch->exit_info;

	guard(preempt)();

	if (!scx_claim_exit(sch, kind))
		return false;

	ei->exit_code = exit_code;
#ifdef CONFIG_STACKTRACE
	if (kind >= SCX_EXIT_ERROR)
		ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
#endif
	vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);

	/*
	 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
	 * in scx_disable_workfn().
	 */
	ei->kind = kind;
	ei->reason = scx_exit_reason(ei->kind);
	ei->exit_cpu = exit_cpu;

	irq_work_queue(&sch->disable_irq_work);
	return true;
}

/*
 * Allocate the scx_kick_syncs array for every possible CPU. Returns 0 on
 * success and -ENOMEM on allocation failure, in which case
 * free_kick_syncs() has already been called.
 */
static int alloc_kick_syncs(void)
{
	int cpu;

	/*
	 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
	 * can exceed percpu allocator limits on large machines.
*/
	for_each_possible_cpu(cpu) {
		struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu);
		struct scx_kick_syncs *new_ksyncs;

		/* the slot is expected to be empty at this point */
		WARN_ON_ONCE(rcu_access_pointer(*ksyncs));

		new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids),
					   GFP_KERNEL, cpu_to_node(cpu));
		if (!new_ksyncs) {
			free_kick_syncs();
			return -ENOMEM;
		}

		rcu_assign_pointer(*ksyncs, new_ksyncs);
	}

	return 0;
}

/* Tear down @pnode's global DSQ and free @pnode. %NULL @pnode is a no-op. */
static void free_pnode(struct scx_sched_pnode *pnode)
{
	if (!pnode)
		return;
	exit_dsq(&pnode->global_dsq);
	kfree(pnode);
}

/*
 * Allocate a zeroed per-node structure for @sch on @node and initialize its
 * global DSQ. Returns the new pnode or %NULL if either step fails.
 */
static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node)
{
	struct scx_sched_pnode *pnode;

	pnode = kzalloc_node(sizeof(*pnode), GFP_KERNEL, node);
	if (!pnode)
		return NULL;

	if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) {
		kfree(pnode);
		return NULL;
	}

	return pnode;
}

/*
 * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
 * starvation. During the READY -> ENABLED task switching loop, the calling
 * thread's sched_class gets switched from fair to ext. As fair has higher
 * priority than ext, the calling thread can be indefinitely starved under
 * fair-class saturation, leading to a system hang.
 */
struct scx_enable_cmd {
	struct kthread_work	work;
	/* which member is valid is indicated by @is_cid_type */
	union {
		struct sched_ext_ops		*ops;
		struct sched_ext_ops_cid	*ops_cid;
	};
	bool			is_cid_type;
	struct bpf_map		*arena_map;	/* arena ref to transfer to sch */
	int			ret;
};

/*
 * Allocate and initialize a new scx_sched. @cgrp's reference is always
 * consumed whether the function succeeds or fails.
*/
static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
						 struct cgroup *cgrp,
						 struct scx_sched *parent)
{
	struct sched_ext_ops *ops = cmd->ops;
	struct scx_sched *sch;
	/* the root sits at level 0; a sub-sched is one level below its parent */
	s32 level = parent ? parent->level + 1 : 0;
	s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids;

	/* ancestors[] holds one entry per level, 0 through @level inclusive */
	sch = kzalloc_flex(*sch, ancestors, level + 1);
	if (!sch) {
		ret = -ENOMEM;
		goto err_put_cgrp;
	}

	sch->exit_info = alloc_exit_info(ops->exit_dump_len);
	if (!sch->exit_info) {
		ret = -ENOMEM;
		goto err_free_sch;
	}

	ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params);
	if (ret < 0)
		goto err_free_ei;

	sch->pnode = kzalloc_objs(sch->pnode[0], nr_node_ids);
	if (!sch->pnode) {
		ret = -ENOMEM;
		goto err_free_hash;
	}

	for_each_node_state(node, N_POSSIBLE) {
		sch->pnode[node] = alloc_pnode(sch, node);
		if (!sch->pnode[node]) {
			ret = -ENOMEM;
			goto err_free_pnode;
		}
	}

	sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
	sch->pcpu = __alloc_percpu(struct_size_t(struct scx_sched_pcpu,
						 dsp_ctx.buf, sch->dsp_max_batch),
				   __alignof__(struct scx_sched_pcpu));
	if (!sch->pcpu) {
		ret = -ENOMEM;
		goto err_free_pnode;
	}

	for_each_possible_cpu(cpu) {
		ret = init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
		if (ret) {
			/* tells the error path where to stop tearing down */
			bypass_fail_cpu = cpu;
			goto err_free_pcpu;
		}
	}

	for_each_possible_cpu(cpu) {
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

		pcpu->sch = sch;
		INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node);
	}

	sch->helper = kthread_run_worker(0, "sched_ext_helper");
	if (IS_ERR(sch->helper)) {
		ret = PTR_ERR(sch->helper);
		goto err_free_pcpu;
	}

	sched_set_fifo(sch->helper->task);

	/* inherit the parent's ancestor chain and append ourselves at @level */
	if (parent)
		memcpy(sch->ancestors, parent->ancestors,
		       level * sizeof(parent->ancestors[0]));
	sch->ancestors[level] = sch;
	sch->level = level;

	if (ops->timeout_ms)
		sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
	else
		sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;

	sch->slice_dfl = SCX_SLICE_DFL;
	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
	sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn);
	kthread_init_work(&sch->disable_work, scx_disable_workfn);
	timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);

	if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) {
		ret = -ENOMEM;
		goto err_stop_helper;
	}
	if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) {
		ret = -ENOMEM;
		goto err_free_lb_cpumask;
	}
	/*
	 * Copy ops through the right union view. For cid-form the source is
	 * struct sched_ext_ops_cid which lacks the trailing cpu_acquire/
	 * cpu_release; those stay zero from kzalloc.
	 */
	if (cmd->is_cid_type) {
		sch->ops_cid = *cmd->ops_cid;
		sch->is_cid_type = true;
	} else {
		sch->ops = *cmd->ops;
	}

	/* every failure path below must clear ops->priv again */
	rcu_assign_pointer(ops->priv, sch);

	sch->kobj.kset = scx_kset;
	INIT_LIST_HEAD(&sch->all);

#ifdef CONFIG_EXT_SUB_SCHED
	/* PATH_MAX scratch buffer; only the kstrdup()'d copy is kept */
	char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
	if (!buf) {
		ret = -ENOMEM;
		goto err_free_lb_resched;
	}
	cgroup_path(cgrp, buf, PATH_MAX);
	sch->cgrp_path = kstrdup(buf, GFP_KERNEL);
	kfree(buf);
	if (!sch->cgrp_path) {
		ret = -ENOMEM;
		goto err_free_lb_resched;
	}

	sch->cgrp = cgrp;
	INIT_LIST_HEAD(&sch->children);
	INIT_LIST_HEAD(&sch->sibling);

	if (parent)
		ret = kobject_init_and_add(&sch->kobj, &scx_ktype,
					   &parent->sub_kset->kobj,
					   "sub-%llu", cgroup_id(cgrp));
	else
		ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");

	/*
	 * NOTE(review): from here on failures drop the kobject reference
	 * instead of unwinding through the err_* labels; presumably the
	 * scx_ktype release path frees everything owned by @sch, including
	 * the @cgrp reference - confirm against scx_ktype.
	 */
	if (ret < 0) {
		RCU_INIT_POINTER(ops->priv, NULL);
		kobject_put(&sch->kobj);
		return ERR_PTR(ret);
	}

	if (ops->sub_attach) {
		sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj);
		if (!sch->sub_kset) {
			RCU_INIT_POINTER(ops->priv, NULL);
			kobject_put(&sch->kobj);
			return ERR_PTR(-ENOMEM);
		}
	}
#else	/* CONFIG_EXT_SUB_SCHED */
	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
	if (ret < 0) {
		RCU_INIT_POINTER(ops->priv, NULL);
		kobject_put(&sch->kobj);
		return ERR_PTR(ret);
	}
#endif	/* CONFIG_EXT_SUB_SCHED */

	/*
	 * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so
	 * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid
	 * drops the ref. After this point, sch owns the ref and any cleanup
	 * runs through scx_sched_free_rcu_work() which puts it.
	 */
	sch->arena_map = cmd->arena_map;
	/* BPF arena is only available on MMU && 64BIT */
#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
	if (sch->arena_map)
		sch->arena_kern_base = bpf_arena_map_kern_vm_start(sch->arena_map);
#endif
	cmd->arena_map = NULL;
	return sch;

	/* error unwinding, in reverse order of setup */
#ifdef CONFIG_EXT_SUB_SCHED
err_free_lb_resched:
	RCU_INIT_POINTER(ops->priv, NULL);
	free_cpumask_var(sch->bypass_lb_resched_cpumask);
#endif
err_free_lb_cpumask:
	free_cpumask_var(sch->bypass_lb_donee_cpumask);
err_stop_helper:
	kthread_destroy_worker(sch->helper);
err_free_pcpu:
	/*
	 * bypass_fail_cpu stays at nr_cpu_ids unless a bypass DSQ init failed,
	 * so only the bypass DSQs which were initialized are torn down.
	 */
	for_each_possible_cpu(cpu) {
		if (cpu == bypass_fail_cpu)
			break;
		exit_dsq(bypass_dsq(sch, cpu));
	}
	free_percpu(sch->pcpu);
err_free_pnode:
	/* free_pnode() ignores the slots which were never allocated */
	for_each_node_state(node, N_POSSIBLE)
		free_pnode(sch->pnode[node]);
	kfree(sch->pnode);
err_free_hash:
	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
err_free_ei:
	free_exit_info(sch->exit_info);
err_free_sch:
	kfree(sch);
err_put_cgrp:
#ifdef CONFIG_EXT_SUB_SCHED
	cgroup_put(cgrp);
#endif
	return ERR_PTR(ret);
}

/*
 * Verify that no CPU hotplug event has been missed by the scheduler being
 * enabled. If @ops->hotplug_seq is non-zero and differs from the current
 * scx_hotplug_seq, an %SCX_EXIT_UNREG_KERN exit asking for a restart is
 * triggered on @sch and -EBUSY is returned. Returns 0 otherwise.
 */
static int check_hotplug_seq(struct scx_sched *sch,
			     const struct sched_ext_ops *ops)
{
	unsigned long long global_hotplug_seq;

	/*
	 * If a hotplug event has occurred between when a scheduler was
	 * initialized, and when we were able to attach, exit and notify user
	 * space about it.
*/
	if (ops->hotplug_seq) {
		global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
		if (ops->hotplug_seq != global_hotplug_seq) {
			scx_exit(sch, SCX_EXIT_UNREG_KERN,
				 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
				 "expected hotplug seq %llu did not match actual %llu",
				 ops->hotplug_seq, global_hotplug_seq);
			return -EBUSY;
		}
	}

	return 0;
}

/*
 * Check @ops for unsupported flag and callback combinations. Returns 0 if
 * @ops is acceptable. Otherwise, an error is raised on @sch through
 * scx_error() and -EINVAL is returned. Use of the deprecated
 * cpu_acquire/release() callbacks only produces a warning.
 */
static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
{
	/*
	 * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the
	 * ops.enqueue() callback isn't implemented.
	 */
	if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
		scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
		return -EINVAL;
	}

	/*
	 * SCX_OPS_TID_TO_TASK is enabled by the root scheduler. A sub-sched
	 * may set it to declare a dependency; reject if the root hasn't
	 * enabled it.
	 */
	if ((ops->flags & SCX_OPS_TID_TO_TASK) && scx_parent(sch) &&
	    !(scx_root->ops.flags & SCX_OPS_TID_TO_TASK)) {
		scx_error(sch, "SCX_OPS_TID_TO_TASK requires root scheduler to enable it");
		return -EINVAL;
	}

	/*
	 * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
	 * selection policy to be enabled.
	 */
	if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
	    (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
		scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
		return -EINVAL;
	}

	/*
	 * cid-form's struct is shorter and doesn't include the cpu_acquire /
	 * cpu_release tail; reading those fields off a cid-form @ops would
	 * run past the BPF allocation. Skip for cid-form.
	 */
	if (!sch->is_cid_type && (ops->cpu_acquire || ops->cpu_release))
		pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");

	/*
	 * Sub-scheduler support is tied to the cid-form struct_ops. A sub-sched
	 * attaches through a cid-form-only interface (sub_attach/sub_detach),
	 * and a root that accepts sub-scheds must expose cid-form state to
	 * them. Reject cpu-form schedulers on either side.
	 */
	if (!sch->is_cid_type) {
		if (scx_parent(sch)) {
			scx_error(sch, "sub-sched requires cid-form struct_ops");
			return -EINVAL;
		}
		if (ops->sub_attach || ops->sub_detach) {
			scx_error(sch, "sub_attach/sub_detach requires cid-form struct_ops");
			return -EINVAL;
		}
	}

	return 0;
}

static void scx_root_enable_workfn(struct kthread_work *work)
{
	struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
	struct sched_ext_ops *ops = cmd->ops;
	struct cgroup *cgrp = root_cgroup();
	struct scx_sched *sch;
	struct scx_task_iter sti;
	struct task_struct *p;
	int i, cpu, ret;

	mutex_lock(&scx_enable_mutex);

	if (scx_enable_state() != SCX_DISABLED) {
		ret = -EBUSY;
		goto err_unlock;
	}

	/*
	 * @ops->priv binds @ops to its scx_sched instance. It is set here by
	 * scx_alloc_and_add_sched() and cleared at the tail of bpf_scx_unreg(),
	 * which runs after scx_root_disable() has dropped scx_enable_mutex. If
	 * it's still non-NULL here, a previous attachment on @ops has not
	 * finished tearing down; proceeding would let the in-flight unreg's
	 * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign.
7165 */ 7166 if (rcu_access_pointer(ops->priv)) { 7167 ret = -EBUSY; 7168 goto err_unlock; 7169 } 7170 7171 ret = alloc_kick_syncs(); 7172 if (ret) 7173 goto err_unlock; 7174 7175 if (ops->flags & SCX_OPS_TID_TO_TASK) { 7176 ret = rhashtable_init(&scx_tid_hash, &scx_tid_hash_params); 7177 if (ret) 7178 goto err_free_ksyncs; 7179 } 7180 7181 #ifdef CONFIG_EXT_SUB_SCHED 7182 cgroup_get(cgrp); 7183 #endif 7184 sch = scx_alloc_and_add_sched(cmd, cgrp, NULL); 7185 if (IS_ERR(sch)) { 7186 ret = PTR_ERR(sch); 7187 goto err_free_tid_hash; 7188 } 7189 7190 if (sch->is_cid_type) 7191 static_branch_enable(&__scx_is_cid_type); 7192 7193 /* 7194 * Transition to ENABLING and clear exit info to arm the disable path. 7195 * Failure triggers full disabling from here on. 7196 */ 7197 WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); 7198 WARN_ON_ONCE(scx_root); 7199 7200 atomic_long_set(&scx_nr_rejected, 0); 7201 7202 for_each_possible_cpu(cpu) { 7203 struct rq *rq = cpu_rq(cpu); 7204 7205 rq->scx.local_dsq.sched = sch; 7206 rq->scx.cpuperf_target = SCX_CPUPERF_ONE; 7207 } 7208 7209 /* 7210 * Keep CPUs stable during enable so that the BPF scheduler can track 7211 * online CPUs by watching ->on/offline_cpu() after ->init(). 7212 */ 7213 cpus_read_lock(); 7214 7215 /* 7216 * Build the cid mapping before publishing scx_root. The cid kfuncs 7217 * dereference the cid arrays unconditionally once scx_prog_sched() 7218 * returns non-NULL; the rcu_assign_pointer() below pairs with their 7219 * rcu_dereference() to make the populated arrays visible. 7220 */ 7221 ret = scx_cid_init(sch); 7222 if (ret) { 7223 cpus_read_unlock(); 7224 goto err_disable; 7225 } 7226 7227 /* 7228 * Make the scheduler instance visible. Must be inside cpus_read_lock(). 7229 * See handle_hotplug(). 
7230 */ 7231 rcu_assign_pointer(scx_root, sch); 7232 7233 ret = scx_link_sched(sch); 7234 if (ret) { 7235 cpus_read_unlock(); 7236 goto err_disable; 7237 } 7238 7239 scx_idle_enable(ops); 7240 7241 if (sch->ops.init) { 7242 ret = SCX_CALL_OP_RET(sch, init, NULL); 7243 if (ret) { 7244 ret = ops_sanitize_err(sch, "init", ret); 7245 cpus_read_unlock(); 7246 scx_error(sch, "ops.init() failed (%d)", ret); 7247 goto err_disable; 7248 } 7249 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 7250 } 7251 7252 ret = scx_arena_pool_init(sch); 7253 if (ret) { 7254 cpus_read_unlock(); 7255 goto err_disable; 7256 } 7257 7258 ret = scx_set_cmask_scratch_alloc(sch); 7259 if (ret) { 7260 cpus_read_unlock(); 7261 goto err_disable; 7262 } 7263 7264 for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) 7265 if (((void (**)(void))ops)[i]) 7266 set_bit(i, sch->has_op); 7267 7268 ret = check_hotplug_seq(sch, ops); 7269 if (ret) { 7270 cpus_read_unlock(); 7271 goto err_disable; 7272 } 7273 scx_idle_update_selcpu_topology(ops); 7274 7275 cpus_read_unlock(); 7276 7277 ret = validate_ops(sch, ops); 7278 if (ret) 7279 goto err_disable; 7280 7281 /* 7282 * Attach the ext_server bandwidth reservation before anything is 7283 * committed so that we can fail the enable if the root domain cannot 7284 * accommodate it. The matching fair_server detach is deferred to the 7285 * tail of this function, after the switch is fully committed and can no 7286 * longer fail. 7287 * 7288 * On failure, err_disable funnels into scx_root_disable() which 7289 * detaches ext_server, so partially-attached state is cleaned up 7290 * automatically. 
7291 */ 7292 for_each_possible_cpu(cpu) { 7293 struct rq *rq = cpu_rq(cpu); 7294 7295 scoped_guard(rq_lock_irqsave, rq) { 7296 update_rq_clock(rq); 7297 ret = dl_server_attach_bw(&rq->ext_server); 7298 } 7299 if (ret) { 7300 pr_warn("sched_ext: failed to attach ext_server on CPU %d (%d)\n", 7301 cpu, ret); 7302 goto err_disable; 7303 } 7304 } 7305 7306 /* 7307 * Once __scx_enabled is set, %current can be switched to SCX anytime. 7308 * This can lead to stalls as some BPF schedulers (e.g. userspace 7309 * scheduling) may not function correctly before all tasks are switched. 7310 * Init in bypass mode to guarantee forward progress. 7311 */ 7312 scx_bypass(sch, true); 7313 7314 for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) 7315 if (((void (**)(void))ops)[i]) 7316 set_bit(i, sch->has_op); 7317 7318 if (sch->ops.cpu_acquire || sch->ops.cpu_release) 7319 sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT; 7320 7321 /* 7322 * Lock out forks, cgroup on/offlining and moves before opening the 7323 * floodgate so that they don't wander into the operations prematurely. 7324 */ 7325 percpu_down_write(&scx_fork_rwsem); 7326 7327 WARN_ON_ONCE(scx_init_task_enabled); 7328 scx_init_task_enabled = true; 7329 7330 /* flip under fork_rwsem; the iter below covers existing tasks */ 7331 if (ops->flags & SCX_OPS_TID_TO_TASK) 7332 static_branch_enable(&__scx_tid_to_task_enabled); 7333 7334 /* 7335 * Enable ops for every task. Fork is excluded by scx_fork_rwsem 7336 * preventing new tasks from being added. No need to exclude tasks 7337 * leaving as sched_ext_free() can handle both prepped and enabled 7338 * tasks. Prep all tasks first and then enable them with preemption 7339 * disabled. 7340 * 7341 * All cgroups should be initialized before scx_init_task() so that the 7342 * BPF scheduler can reliably track each task's cgroup membership from 7343 * scx_init_task(). 
Lock out cgroup on/offlining and task migrations 7344 * while tasks are being initialized so that scx_cgroup_can_attach() 7345 * never sees uninitialized tasks. 7346 */ 7347 scx_cgroup_lock(); 7348 set_cgroup_sched(sch_cgroup(sch), sch); 7349 ret = scx_cgroup_init(sch); 7350 if (ret) 7351 goto err_disable_unlock_all; 7352 7353 scx_task_iter_start(&sti, NULL); 7354 while ((p = scx_task_iter_next_locked(&sti))) { 7355 /* 7356 * @p is in scx_tasks under scx_tasks_lock, and SCX_TASK_DEAD 7357 * tasks are filtered by scx_task_iter_next_locked(). 7358 * sched_ext_dead() removes @p from scx_tasks under the same 7359 * lock before put_task_struct_rcu_user() runs, so @p->usage 7360 * is guaranteed > 0 here. 7361 */ 7362 get_task_struct(p); 7363 7364 /* 7365 * Set %INIT_BEGIN under the iter's rq lock so that a concurrent 7366 * sched_ext_dead() does not call ops.exit_task() on @p while 7367 * ops.init_task() is running. If sched_ext_dead() runs before 7368 * this store, it has already removed @p from scx_tasks and the 7369 * iter won't visit @p; if it runs after, it observes 7370 * %INIT_BEGIN and transitions to %DEAD without calling ops, 7371 * leaving the post-init recheck below to unwind. 7372 */ 7373 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 7374 scx_task_iter_unlock(&sti); 7375 7376 ret = __scx_init_task(sch, p, false); 7377 7378 scx_task_iter_relock(&sti, p); 7379 7380 if (unlikely(ret)) { 7381 if (scx_get_task_state(p) != SCX_TASK_DEAD) 7382 scx_set_task_state(p, SCX_TASK_NONE); 7383 scx_task_iter_stop(&sti); 7384 scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", 7385 ret, p->comm, p->pid); 7386 put_task_struct(p); 7387 goto err_disable_unlock_all; 7388 } 7389 7390 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 7391 /* 7392 * sched_ext_dead() observed %INIT_BEGIN and set %DEAD. 7393 * ops.exit_task() is owed to the sched __scx_init_task() 7394 * ran against; call it now. 
7395 */ 7396 scx_sub_init_cancel_task(sch, p); 7397 } else { 7398 scx_set_task_state(p, SCX_TASK_INIT); 7399 scx_set_task_sched(p, sch); 7400 scx_set_task_state(p, SCX_TASK_READY); 7401 } 7402 7403 /* 7404 * Insert into the tid hash. scx_tasks_lock is held by the iter; 7405 * list_empty() guards against sched_ext_dead() having taken @p 7406 * off the list while init ran unlocked. 7407 */ 7408 if (scx_tid_to_task_enabled() && !list_empty(&p->scx.tasks_node)) 7409 scx_tid_hash_insert(p); 7410 7411 put_task_struct(p); 7412 } 7413 scx_task_iter_stop(&sti); 7414 scx_cgroup_unlock(); 7415 percpu_up_write(&scx_fork_rwsem); 7416 7417 /* 7418 * All tasks are READY. It's safe to turn on scx_enabled() and switch 7419 * all eligible tasks. 7420 */ 7421 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 7422 static_branch_enable(&__scx_enabled); 7423 7424 /* 7425 * We're fully committed and can't fail. The task READY -> ENABLED 7426 * transitions here are synchronized against sched_ext_free() through 7427 * scx_tasks_lock. 
7428 */ 7429 percpu_down_write(&scx_fork_rwsem); 7430 scx_task_iter_start(&sti, NULL); 7431 while ((p = scx_task_iter_next_locked(&sti))) { 7432 unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 7433 const struct sched_class *old_class = p->sched_class; 7434 const struct sched_class *new_class = scx_setscheduler_class(p); 7435 7436 if (scx_get_task_state(p) != SCX_TASK_READY) 7437 continue; 7438 7439 if (old_class != new_class) 7440 queue_flags |= DEQUEUE_CLASS; 7441 7442 scoped_guard (sched_change, p, queue_flags) { 7443 p->scx.slice = READ_ONCE(sch->slice_dfl); 7444 p->sched_class = new_class; 7445 } 7446 } 7447 scx_task_iter_stop(&sti); 7448 percpu_up_write(&scx_fork_rwsem); 7449 7450 scx_bypass(sch, false); 7451 7452 if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { 7453 WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); 7454 goto err_disable; 7455 } 7456 7457 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) 7458 static_branch_enable(&__scx_switched_all); 7459 7460 /* 7461 * Detach the fair_server bandwidth reservation now that the switch 7462 * is fully committed. In full mode (!SCX_OPS_SWITCH_PARTIAL) no 7463 * task will ever run in the fair class, so give that bandwidth 7464 * back to the RT class. The matching ext_server attach already 7465 * happened earlier; this only releases bandwidth and cannot fail. 7466 * 7467 * In partial mode keep fair_server attached. 7468 */ 7469 if (scx_switched_all()) { 7470 for_each_possible_cpu(cpu) { 7471 struct rq *rq = cpu_rq(cpu); 7472 7473 guard(rq_lock_irqsave)(rq); 7474 update_rq_clock(rq); 7475 dl_server_detach_bw(&rq->fair_server); 7476 } 7477 } 7478 7479 pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", 7480 sch->ops.name, scx_switched_all() ? 
"" : " (partial)"); 7481 kobject_uevent(&sch->kobj, KOBJ_ADD); 7482 mutex_unlock(&scx_enable_mutex); 7483 7484 atomic_long_inc(&scx_enable_seq); 7485 7486 cmd->ret = 0; 7487 return; 7488 7489 err_free_tid_hash: 7490 if (ops->flags & SCX_OPS_TID_TO_TASK) 7491 rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL); 7492 err_free_ksyncs: 7493 free_kick_syncs(); 7494 err_unlock: 7495 mutex_unlock(&scx_enable_mutex); 7496 cmd->ret = ret; 7497 return; 7498 7499 err_disable_unlock_all: 7500 scx_cgroup_unlock(); 7501 percpu_up_write(&scx_fork_rwsem); 7502 /* we'll soon enter disable path, keep bypass on */ 7503 err_disable: 7504 mutex_unlock(&scx_enable_mutex); 7505 /* 7506 * Returning an error code here would not pass all the error information 7507 * to userspace. Record errno using scx_error() for cases scx_error() 7508 * wasn't already invoked and exit indicating success so that the error 7509 * is notified through ops.exit() with all the details. 7510 * 7511 * Flush scx_disable_work to ensure that error is reported before init 7512 * completion. sch's base reference will be put by bpf_scx_unreg(). 7513 */ 7514 scx_error(sch, "scx_root_enable() failed (%d)", ret); 7515 scx_flush_disable_work(sch); 7516 cmd->ret = 0; 7517 } 7518 7519 #ifdef CONFIG_EXT_SUB_SCHED 7520 /* verify that a scheduler can be attached to @cgrp and return the parent */ 7521 static struct scx_sched *find_parent_sched(struct cgroup *cgrp) 7522 { 7523 struct scx_sched *parent = cgrp->scx_sched; 7524 struct scx_sched *pos; 7525 7526 lockdep_assert_held(&scx_sched_lock); 7527 7528 /* can't attach twice to the same cgroup */ 7529 if (parent->cgrp == cgrp) 7530 return ERR_PTR(-EBUSY); 7531 7532 /* does $parent allow sub-scheds? 
*/ 7533 if (!parent->ops.sub_attach) 7534 return ERR_PTR(-EOPNOTSUPP); 7535 7536 /* can't insert between $parent and its exiting children */ 7537 list_for_each_entry(pos, &parent->children, sibling) 7538 if (cgroup_is_descendant(pos->cgrp, cgrp)) 7539 return ERR_PTR(-EBUSY); 7540 7541 return parent; 7542 } 7543 7544 static bool assert_task_ready_or_enabled(struct task_struct *p) 7545 { 7546 u32 state = scx_get_task_state(p); 7547 7548 switch (state) { 7549 case SCX_TASK_READY: 7550 case SCX_TASK_ENABLED: 7551 return true; 7552 default: 7553 WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched", 7554 state, p->comm, p->pid); 7555 return false; 7556 } 7557 } 7558 7559 static void scx_sub_enable_workfn(struct kthread_work *work) 7560 { 7561 struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); 7562 struct sched_ext_ops *ops = cmd->ops; 7563 struct cgroup *cgrp; 7564 struct scx_sched *parent, *sch; 7565 struct scx_task_iter sti; 7566 struct task_struct *p; 7567 s32 i, ret; 7568 7569 mutex_lock(&scx_enable_mutex); 7570 7571 if (!scx_enabled()) { 7572 ret = -ENODEV; 7573 goto out_unlock; 7574 } 7575 7576 /* See scx_root_enable_workfn() for the @ops->priv check. 
*/ 7577 if (rcu_access_pointer(ops->priv)) { 7578 ret = -EBUSY; 7579 goto out_unlock; 7580 } 7581 7582 cgrp = cgroup_get_from_id(ops->sub_cgroup_id); 7583 if (IS_ERR(cgrp)) { 7584 ret = PTR_ERR(cgrp); 7585 goto out_unlock; 7586 } 7587 7588 raw_spin_lock_irq(&scx_sched_lock); 7589 parent = find_parent_sched(cgrp); 7590 if (IS_ERR(parent)) { 7591 raw_spin_unlock_irq(&scx_sched_lock); 7592 ret = PTR_ERR(parent); 7593 goto out_put_cgrp; 7594 } 7595 kobject_get(&parent->kobj); 7596 raw_spin_unlock_irq(&scx_sched_lock); 7597 7598 /* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */ 7599 sch = scx_alloc_and_add_sched(cmd, cgrp, parent); 7600 kobject_put(&parent->kobj); 7601 if (IS_ERR(sch)) { 7602 ret = PTR_ERR(sch); 7603 goto out_unlock; 7604 } 7605 7606 ret = scx_link_sched(sch); 7607 if (ret) 7608 goto err_disable; 7609 7610 if (sch->level >= SCX_SUB_MAX_DEPTH) { 7611 scx_error(sch, "max nesting depth %d violated", 7612 SCX_SUB_MAX_DEPTH); 7613 goto err_disable; 7614 } 7615 7616 if (sch->ops.init) { 7617 ret = SCX_CALL_OP_RET(sch, init, NULL); 7618 if (ret) { 7619 ret = ops_sanitize_err(sch, "init", ret); 7620 scx_error(sch, "ops.init() failed (%d)", ret); 7621 goto err_disable; 7622 } 7623 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 7624 } 7625 7626 ret = scx_arena_pool_init(sch); 7627 if (ret) 7628 goto err_disable; 7629 7630 ret = scx_set_cmask_scratch_alloc(sch); 7631 if (ret) 7632 goto err_disable; 7633 7634 if (validate_ops(sch, ops)) 7635 goto err_disable; 7636 7637 struct scx_sub_attach_args sub_attach_args = { 7638 .ops = &sch->ops, 7639 .cgroup_path = sch->cgrp_path, 7640 }; 7641 7642 ret = SCX_CALL_OP_RET(parent, sub_attach, NULL, 7643 &sub_attach_args); 7644 if (ret) { 7645 ret = ops_sanitize_err(sch, "sub_attach", ret); 7646 scx_error(sch, "parent rejected (%d)", ret); 7647 goto err_disable; 7648 } 7649 sch->sub_attached = true; 7650 7651 scx_bypass(sch, true); 7652 7653 for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) 7654 if (((void 
(**)(void))ops)[i]) 7655 set_bit(i, sch->has_op); 7656 7657 percpu_down_write(&scx_fork_rwsem); 7658 scx_cgroup_lock(); 7659 7660 /* 7661 * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see 7662 * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down. 7663 */ 7664 set_cgroup_sched(sch_cgroup(sch), sch); 7665 if (!(cgrp->self.flags & CSS_ONLINE)) { 7666 scx_error(sch, "cgroup is not online"); 7667 goto err_unlock_and_disable; 7668 } 7669 7670 /* 7671 * Initialize tasks for the new child $sch without exiting them for 7672 * $parent so that the tasks can always be reverted back to $parent 7673 * sched on child init failure. 7674 */ 7675 WARN_ON_ONCE(scx_enabling_sub_sched); 7676 scx_enabling_sub_sched = sch; 7677 7678 scx_task_iter_start(&sti, sch->cgrp); 7679 while ((p = scx_task_iter_next_locked(&sti))) { 7680 struct rq *rq; 7681 struct rq_flags rf; 7682 7683 /* 7684 * Task iteration may visit the same task twice when racing 7685 * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which 7686 * finished __scx_init_task() and skip if set. 7687 * 7688 * A task may exit and get freed between __scx_init_task() 7689 * completion and scx_enable_task(). In such cases, 7690 * scx_disable_and_exit_task() must exit the task for both the 7691 * parent and child scheds. 7692 */ 7693 if (p->scx.flags & SCX_TASK_SUB_INIT) 7694 continue; 7695 7696 /* @p is pinned by the iter; see scx_sub_disable() */ 7697 get_task_struct(p); 7698 7699 if (!assert_task_ready_or_enabled(p)) { 7700 ret = -EINVAL; 7701 goto abort; 7702 } 7703 7704 scx_task_iter_unlock(&sti); 7705 7706 /* 7707 * As $p is still on $parent, it can't be transitioned to INIT. 7708 * Let's worry about task state later. Use __scx_init_task(). 
7709 */ 7710 ret = __scx_init_task(sch, p, false); 7711 if (ret) 7712 goto abort; 7713 7714 rq = task_rq_lock(p, &rf); 7715 7716 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 7717 /* 7718 * sched_ext_dead() raced us between __scx_init_task() 7719 * and this rq lock and ran exit_task() on $parent (the 7720 * sched @p was on at that point), not on @sch. @sch's 7721 * just-completed init is owed an exit_task() and we 7722 * issue it here. 7723 */ 7724 scx_sub_init_cancel_task(sch, p); 7725 task_rq_unlock(rq, p, &rf); 7726 put_task_struct(p); 7727 continue; 7728 } 7729 7730 p->scx.flags |= SCX_TASK_SUB_INIT; 7731 task_rq_unlock(rq, p, &rf); 7732 7733 put_task_struct(p); 7734 } 7735 scx_task_iter_stop(&sti); 7736 7737 /* 7738 * All tasks are prepped. Disable/exit tasks for $parent and enable for 7739 * the new @sch. 7740 */ 7741 scx_task_iter_start(&sti, sch->cgrp); 7742 while ((p = scx_task_iter_next_locked(&sti))) { 7743 /* 7744 * Use clearing of %SCX_TASK_SUB_INIT to detect and skip 7745 * duplicate iterations. 7746 */ 7747 if (!(p->scx.flags & SCX_TASK_SUB_INIT)) 7748 continue; 7749 7750 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 7751 /* 7752 * $p must be either READY or ENABLED. If ENABLED, 7753 * __scx_disabled_and_exit_task() first disables and 7754 * makes it READY. However, after exiting $p, it will 7755 * leave $p as READY. 7756 */ 7757 assert_task_ready_or_enabled(p); 7758 __scx_disable_and_exit_task(parent, p); 7759 7760 /* 7761 * $p is now only initialized for @sch and READY, which 7762 * is what we want. Assign it to @sch and enable. 
7763 */ 7764 scx_set_task_sched(p, sch); 7765 scx_enable_task(sch, p); 7766 7767 p->scx.flags &= ~SCX_TASK_SUB_INIT; 7768 } 7769 } 7770 scx_task_iter_stop(&sti); 7771 7772 scx_enabling_sub_sched = NULL; 7773 7774 scx_cgroup_unlock(); 7775 percpu_up_write(&scx_fork_rwsem); 7776 7777 scx_bypass(sch, false); 7778 7779 pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); 7780 kobject_uevent(&sch->kobj, KOBJ_ADD); 7781 ret = 0; 7782 goto out_unlock; 7783 7784 out_put_cgrp: 7785 cgroup_put(cgrp); 7786 out_unlock: 7787 mutex_unlock(&scx_enable_mutex); 7788 cmd->ret = ret; 7789 return; 7790 7791 abort: 7792 put_task_struct(p); 7793 scx_task_iter_stop(&sti); 7794 7795 /* 7796 * Undo __scx_init_task() for tasks we marked. scx_enable_task() never 7797 * ran for @sch on them, so calling scx_disable_task() here would invoke 7798 * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched 7799 * must stay set until SUB_INIT is cleared from every marked task - 7800 * scx_disable_and_exit_task() reads it when a task exits concurrently. 
7801 */ 7802 scx_task_iter_start(&sti, sch->cgrp); 7803 while ((p = scx_task_iter_next_locked(&sti))) { 7804 if (p->scx.flags & SCX_TASK_SUB_INIT) { 7805 scx_sub_init_cancel_task(sch, p); 7806 p->scx.flags &= ~SCX_TASK_SUB_INIT; 7807 } 7808 } 7809 scx_task_iter_stop(&sti); 7810 scx_enabling_sub_sched = NULL; 7811 err_unlock_and_disable: 7812 /* we'll soon enter disable path, keep bypass on */ 7813 scx_cgroup_unlock(); 7814 percpu_up_write(&scx_fork_rwsem); 7815 err_disable: 7816 mutex_unlock(&scx_enable_mutex); 7817 scx_flush_disable_work(sch); 7818 cmd->ret = 0; 7819 } 7820 7821 static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb, 7822 unsigned long action, void *data) 7823 { 7824 struct cgroup *cgrp = data; 7825 struct cgroup *parent = cgroup_parent(cgrp); 7826 7827 if (!cgroup_on_dfl(cgrp)) 7828 return NOTIFY_OK; 7829 7830 switch (action) { 7831 case CGROUP_LIFETIME_ONLINE: 7832 /* inherit ->scx_sched from $parent */ 7833 if (parent) 7834 rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched); 7835 break; 7836 case CGROUP_LIFETIME_OFFLINE: 7837 /* if there is a sched attached, shoot it down */ 7838 if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp) 7839 scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN, 7840 SCX_ECODE_RSN_CGROUP_OFFLINE, 7841 "cgroup %llu going offline", cgroup_id(cgrp)); 7842 break; 7843 } 7844 7845 return NOTIFY_OK; 7846 } 7847 7848 static struct notifier_block scx_cgroup_lifetime_nb = { 7849 .notifier_call = scx_cgroup_lifetime_notify, 7850 }; 7851 7852 static s32 __init scx_cgroup_lifetime_notifier_init(void) 7853 { 7854 return blocking_notifier_chain_register(&cgroup_lifetime_notifier, 7855 &scx_cgroup_lifetime_nb); 7856 } 7857 core_initcall(scx_cgroup_lifetime_notifier_init); 7858 #endif /* CONFIG_EXT_SUB_SCHED */ 7859 7860 static s32 scx_enable(struct scx_enable_cmd *cmd, struct bpf_link *link) 7861 { 7862 static struct kthread_worker *helper; 7863 static DEFINE_MUTEX(helper_mutex); 7864 7865 if 
(housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) { 7866 pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); 7867 return -EINVAL; 7868 } 7869 7870 if (!READ_ONCE(helper)) { 7871 mutex_lock(&helper_mutex); 7872 if (!helper) { 7873 struct kthread_worker *w = 7874 kthread_run_worker(0, "scx_enable_helper"); 7875 if (IS_ERR_OR_NULL(w)) { 7876 mutex_unlock(&helper_mutex); 7877 return -ENOMEM; 7878 } 7879 sched_set_fifo(w->task); 7880 WRITE_ONCE(helper, w); 7881 } 7882 mutex_unlock(&helper_mutex); 7883 } 7884 7885 #ifdef CONFIG_EXT_SUB_SCHED 7886 if (cmd->ops->sub_cgroup_id > 1) 7887 kthread_init_work(&cmd->work, scx_sub_enable_workfn); 7888 else 7889 #endif /* CONFIG_EXT_SUB_SCHED */ 7890 kthread_init_work(&cmd->work, scx_root_enable_workfn); 7891 7892 kthread_queue_work(READ_ONCE(helper), &cmd->work); 7893 kthread_flush_work(&cmd->work); 7894 return cmd->ret; 7895 } 7896 7897 7898 /******************************************************************************** 7899 * bpf_struct_ops plumbing. 7900 */ 7901 #include <linux/bpf_verifier.h> 7902 #include <linux/bpf.h> 7903 #include <linux/btf.h> 7904 7905 static const struct btf_type *task_struct_type; 7906 7907 static bool bpf_scx_is_valid_access(int off, int size, 7908 enum bpf_access_type type, 7909 const struct bpf_prog *prog, 7910 struct bpf_insn_access_aux *info) 7911 { 7912 if (type != BPF_READ) 7913 return false; 7914 if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) 7915 return false; 7916 if (off % size != 0) 7917 return false; 7918 7919 return btf_ctx_access(off, size, type, prog, info); 7920 } 7921 7922 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, 7923 const struct bpf_reg_state *reg, int off, 7924 int size) 7925 { 7926 const struct btf_type *t; 7927 7928 t = btf_type_by_id(reg->btf, reg->btf_id); 7929 if (t == task_struct_type) { 7930 /* 7931 * COMPAT: Will be removed in v6.23. 
7932 */ 7933 if ((off >= offsetof(struct task_struct, scx.slice) && 7934 off + size <= offsetofend(struct task_struct, scx.slice)) || 7935 (off >= offsetof(struct task_struct, scx.dsq_vtime) && 7936 off + size <= offsetofend(struct task_struct, scx.dsq_vtime))) { 7937 pr_warn("sched_ext: Writing directly to p->scx.slice/dsq_vtime is deprecated, use scx_bpf_task_set_slice/dsq_vtime()"); 7938 return SCALAR_VALUE; 7939 } 7940 7941 if (off >= offsetof(struct task_struct, scx.disallow) && 7942 off + size <= offsetofend(struct task_struct, scx.disallow)) 7943 return SCALAR_VALUE; 7944 } 7945 7946 return -EACCES; 7947 } 7948 7949 static const struct bpf_verifier_ops bpf_scx_verifier_ops = { 7950 .get_func_proto = bpf_base_func_proto, 7951 .is_valid_access = bpf_scx_is_valid_access, 7952 .btf_struct_access = bpf_scx_btf_struct_access, 7953 }; 7954 7955 static int bpf_scx_init_member(const struct btf_type *t, 7956 const struct btf_member *member, 7957 void *kdata, const void *udata) 7958 { 7959 const struct sched_ext_ops *uops = udata; 7960 struct sched_ext_ops *ops = kdata; 7961 u32 moff = __btf_member_bit_offset(t, member) / 8; 7962 int ret; 7963 7964 switch (moff) { 7965 case offsetof(struct sched_ext_ops, dispatch_max_batch): 7966 if (*(u32 *)(udata + moff) > INT_MAX) 7967 return -E2BIG; 7968 ops->dispatch_max_batch = *(u32 *)(udata + moff); 7969 return 1; 7970 case offsetof(struct sched_ext_ops, flags): 7971 if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) 7972 return -EINVAL; 7973 ops->flags = *(u64 *)(udata + moff); 7974 return 1; 7975 case offsetof(struct sched_ext_ops, name): 7976 ret = bpf_obj_name_cpy(ops->name, uops->name, 7977 sizeof(ops->name)); 7978 if (ret < 0) 7979 return ret; 7980 if (ret == 0) 7981 return -EINVAL; 7982 return 1; 7983 case offsetof(struct sched_ext_ops, timeout_ms): 7984 if (msecs_to_jiffies(*(u32 *)(udata + moff)) > 7985 SCX_WATCHDOG_MAX_TIMEOUT) 7986 return -E2BIG; 7987 ops->timeout_ms = *(u32 *)(udata + moff); 7988 return 1; 7989 case 
offsetof(struct sched_ext_ops, exit_dump_len): 7990 ops->exit_dump_len = 7991 *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; 7992 return 1; 7993 case offsetof(struct sched_ext_ops, hotplug_seq): 7994 ops->hotplug_seq = *(u64 *)(udata + moff); 7995 return 1; 7996 #ifdef CONFIG_EXT_SUB_SCHED 7997 case offsetof(struct sched_ext_ops, sub_cgroup_id): 7998 ops->sub_cgroup_id = *(u64 *)(udata + moff); 7999 return 1; 8000 #endif /* CONFIG_EXT_SUB_SCHED */ 8001 } 8002 8003 return 0; 8004 } 8005 8006 #ifdef CONFIG_EXT_SUB_SCHED 8007 static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog) 8008 { 8009 struct scx_sched *sch; 8010 8011 guard(rcu)(); 8012 sch = scx_prog_sched(prog->aux); 8013 if (unlikely(!sch)) 8014 return; 8015 8016 scx_error(sch, "dispatch recursion detected"); 8017 } 8018 #endif /* CONFIG_EXT_SUB_SCHED */ 8019 8020 static int bpf_scx_check_member(const struct btf_type *t, 8021 const struct btf_member *member, 8022 const struct bpf_prog *prog) 8023 { 8024 u32 moff = __btf_member_bit_offset(t, member) / 8; 8025 8026 switch (moff) { 8027 case offsetof(struct sched_ext_ops, init_task): 8028 #ifdef CONFIG_EXT_GROUP_SCHED 8029 case offsetof(struct sched_ext_ops, cgroup_init): 8030 case offsetof(struct sched_ext_ops, cgroup_exit): 8031 case offsetof(struct sched_ext_ops, cgroup_prep_move): 8032 #endif 8033 case offsetof(struct sched_ext_ops, cpu_online): 8034 case offsetof(struct sched_ext_ops, cpu_offline): 8035 case offsetof(struct sched_ext_ops, init): 8036 case offsetof(struct sched_ext_ops, exit): 8037 case offsetof(struct sched_ext_ops, sub_attach): 8038 case offsetof(struct sched_ext_ops, sub_detach): 8039 break; 8040 default: 8041 if (prog->sleepable) 8042 return -EINVAL; 8043 } 8044 8045 #ifdef CONFIG_EXT_SUB_SCHED 8046 /* 8047 * Enable private stack for operations that can nest along the 8048 * hierarchy. 
8049 * 8050 * XXX - Ideally, we should only do this for scheds that allow 8051 * sub-scheds and sub-scheds themselves but I don't know how to access 8052 * struct_ops from here. 8053 */ 8054 switch (moff) { 8055 case offsetof(struct sched_ext_ops, dispatch): 8056 prog->aux->priv_stack_requested = true; 8057 prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch; 8058 } 8059 #endif /* CONFIG_EXT_SUB_SCHED */ 8060 8061 return 0; 8062 } 8063 8064 static int bpf_scx_reg(void *kdata, struct bpf_link *link) 8065 { 8066 struct scx_enable_cmd cmd = { .ops = kdata }; 8067 8068 return scx_enable(&cmd, link); 8069 } 8070 8071 struct scx_arena_scan { 8072 struct bpf_map *arena; 8073 int err; 8074 }; 8075 8076 /* 8077 * The verifier enforces one arena per BPF program, so each struct_ops 8078 * member prog contributes at most one arena via bpf_prog_arena(). 8079 * Require all non-NULL contributions to match. 8080 */ 8081 static int scx_arena_scan_prog(struct bpf_prog *prog, void *data) 8082 { 8083 struct scx_arena_scan *s = data; 8084 struct bpf_map *arena = NULL; 8085 8086 /* arena.o, which defines these, is built only on MMU && 64BIT */ 8087 #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) 8088 arena = bpf_prog_arena(prog); 8089 #endif 8090 if (!arena) 8091 return 0; 8092 if (s->arena && s->arena != arena) { 8093 s->err = -EINVAL; 8094 return 1; 8095 } 8096 s->arena = arena; 8097 return 0; 8098 } 8099 8100 static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link) 8101 { 8102 struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true }; 8103 struct scx_arena_scan scan = {}; 8104 int ret; 8105 8106 bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan); 8107 if (scan.err) { 8108 pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n"); 8109 return scan.err; 8110 } 8111 if (!scan.arena) { 8112 pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n"); 8113 return -EINVAL; 8114 } 8115 8116 bpf_map_inc(scan.arena); 8117 
cmd.arena_map = scan.arena; 8118 ret = scx_enable(&cmd, link); 8119 if (cmd.arena_map) /* not consumed by scx_alloc_and_add_sched() */ 8120 bpf_map_put(cmd.arena_map); 8121 return ret; 8122 } 8123 8124 static void bpf_scx_unreg(void *kdata, struct bpf_link *link) 8125 { 8126 struct sched_ext_ops *ops = kdata; 8127 struct scx_sched *sch = rcu_dereference_protected(ops->priv, true); 8128 8129 scx_disable(sch, SCX_EXIT_UNREG); 8130 scx_flush_disable_work(sch); 8131 RCU_INIT_POINTER(ops->priv, NULL); 8132 kobject_put(&sch->kobj); 8133 } 8134 8135 static int bpf_scx_init(struct btf *btf) 8136 { 8137 task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]); 8138 8139 return 0; 8140 } 8141 8142 static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) 8143 { 8144 /* 8145 * sched_ext does not support updating the actively-loaded BPF 8146 * scheduler, as registering a BPF scheduler can always fail if the 8147 * scheduler returns an error code for e.g. ops.init(), ops.init_task(), 8148 * etc. Similarly, we can always race with unregistration happening 8149 * elsewhere, such as with sysrq. 
	 */
	return -EOPNOTSUPP;
}

/* struct_ops ->validate(): nothing is checked here, always returns 0 */
static int bpf_scx_validate(void *kdata)
{
	return 0;
}

/*
 * Stub implementations, one per struct sched_ext_ops callback. They do nothing
 * beyond returning a constant and are only referenced from the
 * __bpf_ops_sched_ext_ops and __bpf_ops_sched_ext_ops_cid tables below.
 */
static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {}
static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {}
static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {}
static void sched_ext_ops__tick(struct task_struct *p) {}
static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {}
static void sched_ext_ops__running(struct task_struct *p) {}
static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {}
static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {}
static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; }
static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; }
static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {}
static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {}
static void sched_ext_ops__update_idle(s32 cpu, bool idle) {}
static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {}
static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {}
static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {}
static void sched_ext_ops__enable(struct task_struct *p) {}
static void sched_ext_ops__disable(struct task_struct *p) {}
#ifdef CONFIG_EXT_GROUP_SCHED
static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {}
static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {}
#endif	/* CONFIG_EXT_GROUP_SCHED */
static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; }
static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {}
static void sched_ext_ops__cpu_online(s32 cpu) {}
static void sched_ext_ops__cpu_offline(s32 cpu) {}
static s32 sched_ext_ops__init(void) { return -EINVAL; }
static void sched_ext_ops__exit(struct scx_exit_info *info) {}
static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {}
static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {}

/* cpu-form stub table; installed as bpf_sched_ext_ops.cfi_stubs below */
static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
	.select_cpu		= sched_ext_ops__select_cpu,
	.enqueue		= sched_ext_ops__enqueue,
	.dequeue		= sched_ext_ops__dequeue,
	.dispatch		= sched_ext_ops__dispatch,
	.tick			= sched_ext_ops__tick,
	.runnable		= sched_ext_ops__runnable,
	.running		= sched_ext_ops__running,
	.stopping		= sched_ext_ops__stopping,
	.quiescent		= sched_ext_ops__quiescent,
	.yield			= sched_ext_ops__yield,
	.core_sched_before	= sched_ext_ops__core_sched_before,
	.set_weight		= sched_ext_ops__set_weight,
	.set_cpumask		= sched_ext_ops__set_cpumask,
	.update_idle		= sched_ext_ops__update_idle,
	.cpu_acquire		= sched_ext_ops__cpu_acquire,
	.cpu_release		= sched_ext_ops__cpu_release,
	.init_task		= sched_ext_ops__init_task,
	.exit_task		= sched_ext_ops__exit_task,
	.enable			= sched_ext_ops__enable,
	.disable		= sched_ext_ops__disable,
#ifdef CONFIG_EXT_GROUP_SCHED
	.cgroup_init		= sched_ext_ops__cgroup_init,
	.cgroup_exit		= sched_ext_ops__cgroup_exit,
	.cgroup_prep_move	= sched_ext_ops__cgroup_prep_move,
	.cgroup_move		= sched_ext_ops__cgroup_move,
	.cgroup_cancel_move	= sched_ext_ops__cgroup_cancel_move,
	.cgroup_set_weight	= sched_ext_ops__cgroup_set_weight,
	.cgroup_set_bandwidth	= sched_ext_ops__cgroup_set_bandwidth,
	.cgroup_set_idle	= sched_ext_ops__cgroup_set_idle,
#endif
	.sub_attach		= sched_ext_ops__sub_attach,
	.sub_detach		= sched_ext_ops__sub_detach,
	.cpu_online		= sched_ext_ops__cpu_online,
	.cpu_offline		= sched_ext_ops__cpu_offline,
	.init			= sched_ext_ops__init,
	.exit			= sched_ext_ops__exit,
	.dump			= sched_ext_ops__dump,
	.dump_cpu		= sched_ext_ops__dump_cpu,
	.dump_task		= sched_ext_ops__dump_task,
};

/* bpf_struct_ops descriptor for "sched_ext_ops", wiring up the callbacks above */
static struct bpf_struct_ops bpf_sched_ext_ops = {
	.verifier_ops = &bpf_scx_verifier_ops,
	.reg = bpf_scx_reg,
	.unreg = bpf_scx_unreg,
	.check_member = bpf_scx_check_member,
	.init_member = bpf_scx_init_member,
	.init = bpf_scx_init,
	.update = bpf_scx_update,
	.validate = bpf_scx_validate,
	.name = "sched_ext_ops",
	.owner = THIS_MODULE,
	.cfi_stubs = &__bpf_ops_sched_ext_ops
};

/*
 * cid-form cfi stubs. Stubs whose signatures match the cpu-form (param types
 * identical, only param names differ across structs) are reused; only
 * set_cmask needs a fresh stub since the second argument type differs.
 */
static void sched_ext_ops_cid__set_cmask(struct task_struct *p,
					 const struct scx_cmask *cmask) {}

static struct sched_ext_ops_cid __bpf_ops_sched_ext_ops_cid = {
	.select_cid		= sched_ext_ops__select_cpu,
	.enqueue		= sched_ext_ops__enqueue,
	.dequeue		= sched_ext_ops__dequeue,
	.dispatch		= sched_ext_ops__dispatch,
	.tick			= sched_ext_ops__tick,
	.runnable		= sched_ext_ops__runnable,
	.running		= sched_ext_ops__running,
	.stopping		= sched_ext_ops__stopping,
	.quiescent		= sched_ext_ops__quiescent,
	.yield			= sched_ext_ops__yield,
	.core_sched_before	= sched_ext_ops__core_sched_before,
	.set_weight		= sched_ext_ops__set_weight,
	.set_cmask		= sched_ext_ops_cid__set_cmask,
	.update_idle		= sched_ext_ops__update_idle,
	.init_task		= sched_ext_ops__init_task,
	.exit_task		= sched_ext_ops__exit_task,
	.enable			= sched_ext_ops__enable,
	.disable		= sched_ext_ops__disable,
#ifdef CONFIG_EXT_GROUP_SCHED
	.cgroup_init		= sched_ext_ops__cgroup_init,
	.cgroup_exit		= sched_ext_ops__cgroup_exit,
	.cgroup_prep_move	= sched_ext_ops__cgroup_prep_move,
	.cgroup_move		= sched_ext_ops__cgroup_move,
	.cgroup_cancel_move	= sched_ext_ops__cgroup_cancel_move,
	.cgroup_set_weight	= sched_ext_ops__cgroup_set_weight,
	.cgroup_set_bandwidth	= sched_ext_ops__cgroup_set_bandwidth,
	.cgroup_set_idle	= sched_ext_ops__cgroup_set_idle,
#endif
	.sub_attach		= sched_ext_ops__sub_attach,
	.sub_detach		= sched_ext_ops__sub_detach,
	.cid_online		= sched_ext_ops__cpu_online,
	.cid_offline		= sched_ext_ops__cpu_offline,
	.init			= sched_ext_ops__init,
	.exit			= sched_ext_ops__exit,
	.dump			= sched_ext_ops__dump,
	.dump_cid		= sched_ext_ops__dump_cpu,
.dump_task = sched_ext_ops__dump_task, 8301 }; 8302 8303 /* 8304 * The cid-form struct_ops shares the bpf_struct_ops hooks with the cpu form 8305 * except for .reg, which is bpf_scx_reg_cid(). The shared hooks process kdata 8306 * as the byte block verified to match by the BUILD_BUG_ON checks in scx_init(). 8307 */ 8308 static struct bpf_struct_ops bpf_sched_ext_ops_cid = { 8309 .verifier_ops = &bpf_scx_verifier_ops, 8310 .reg = bpf_scx_reg_cid, 8311 .unreg = bpf_scx_unreg, 8312 .check_member = bpf_scx_check_member, 8313 .init_member = bpf_scx_init_member, 8314 .init = bpf_scx_init, 8315 .update = bpf_scx_update, 8316 .validate = bpf_scx_validate, 8317 .name = "sched_ext_ops_cid", 8318 .owner = THIS_MODULE, 8319 .cfi_stubs = &__bpf_ops_sched_ext_ops_cid 8320 }; 8321 8322 8323 /******************************************************************************** 8324 * System integration and init. 8325 */ 8326 8327 /* SysRq handler for reset-sched-ext: disable the root scheduler with SCX_EXIT_SYSRQ if one is loaded, otherwise only log that none is. NOTE(review): scx_root is read with rcu_dereference() and no RCU read lock is taken here - presumably the sysrq core provides it; confirm. */ static void sysrq_handle_sched_ext_reset(u8 key) 8328 { 8329 struct scx_sched *sch; 8330 8331 sch = rcu_dereference(scx_root); 8332 if (likely(sch)) 8333 scx_disable(sch, SCX_EXIT_SYSRQ); 8334 else 8335 pr_info("sched_ext: BPF schedulers not loaded\n"); 8336 } 8337 8338 static const struct sysrq_key_op sysrq_sched_ext_reset_op = { 8339 .handler = sysrq_handle_sched_ext_reset, 8340 .help_msg = "reset-sched-ext(S)", 8341 .action_msg = "Disable sched_ext and revert all tasks to CFS", 8342 .enable_mask = SYSRQ_ENABLE_RTNICE, 8343 }; 8344 8345 /* SysRq handler for dump-sched-ext: call scx_dump_state() on every scheduler linked on scx_sched_all, passing an exit info of kind SCX_EXIT_NONE whose reason is "SysRq-D". */ static void sysrq_handle_sched_ext_dump(u8 key) 8346 { 8347 struct scx_exit_info ei = { 8348 .kind = SCX_EXIT_NONE, 8349 .exit_cpu = -1, 8350 .reason = "SysRq-D", 8351 }; 8352 struct scx_sched *sch; 8353 8354 list_for_each_entry_rcu(sch, &scx_sched_all, all) 8355 scx_dump_state(sch, &ei, 0, false); 8356 } 8357 8358 static const struct sysrq_key_op sysrq_sched_ext_dump_op = { 8359 .handler = sysrq_handle_sched_ext_dump, 8360 .help_msg = "dump-sched-ext(D)", 8361 .action_msg = "Trigger sched_ext debug dump", 8362 .enable_mask =
SYSRQ_ENABLE_RTNICE, 8363 }; 8364 8365 static bool can_skip_idle_kick(struct rq *rq) 8366 { 8367 lockdep_assert_rq_held(rq); 8368 8369 /* 8370 * We can skip idle kicking if @rq is going to go through at least one 8371 * full SCX scheduling cycle before going idle. Just checking whether 8372 * curr is not idle is insufficient because we could be racing 8373 * balance_one() trying to pull the next task from a remote rq, which 8374 * may fail, and @rq may become idle afterwards. 8375 * 8376 * The race window is small and we don't and can't guarantee that @rq is 8377 * only kicked while idle anyway. Skip only when sure. 8378 */ 8379 return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); 8380 } 8381 8382 static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) 8383 { 8384 struct rq *rq = cpu_rq(cpu); 8385 struct scx_rq *this_scx = &this_rq->scx; 8386 const struct sched_class *cur_class; 8387 bool should_wait = false; 8388 unsigned long flags; 8389 8390 raw_spin_rq_lock_irqsave(rq, flags); 8391 cur_class = rq->curr->sched_class; 8392 8393 /* 8394 * During CPU hotplug, a CPU may depend on kicking itself to make 8395 * forward progress. Allow kicking self regardless of online state. If 8396 * @cpu is running a higher class task, we have no control over @cpu. 8397 * Skip kicking. 
8398 */ 8399 if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) && 8400 !sched_class_above(cur_class, &ext_sched_class)) { 8401 if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { 8402 if (cur_class == &ext_sched_class) 8403 rq->curr->scx.slice = 0; 8404 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 8405 } 8406 8407 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { 8408 if (cur_class == &ext_sched_class) { 8409 cpumask_set_cpu(cpu, this_scx->cpus_to_sync); 8410 ksyncs[cpu] = rq->scx.kick_sync; 8411 should_wait = true; 8412 } 8413 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 8414 } 8415 8416 resched_curr(rq); 8417 } else { 8418 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 8419 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 8420 } 8421 8422 raw_spin_rq_unlock_irqrestore(rq, flags); 8423 8424 return should_wait; 8425 } 8426 8427 static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) 8428 { 8429 struct rq *rq = cpu_rq(cpu); 8430 unsigned long flags; 8431 8432 raw_spin_rq_lock_irqsave(rq, flags); 8433 8434 if (!can_skip_idle_kick(rq) && 8435 (cpu_online(cpu) || cpu == cpu_of(this_rq))) 8436 resched_curr(rq); 8437 8438 raw_spin_rq_unlock_irqrestore(rq, flags); 8439 } 8440 8441 static void kick_cpus_irq_workfn(struct irq_work *irq_work) 8442 { 8443 struct rq *this_rq = this_rq(); 8444 struct scx_rq *this_scx = &this_rq->scx; 8445 struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs); 8446 bool should_wait = false; 8447 unsigned long *ksyncs; 8448 s32 cpu; 8449 8450 /* can race with free_kick_syncs() during scheduler disable */ 8451 if (unlikely(!ksyncs_pcpu)) 8452 return; 8453 8454 ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs; 8455 8456 for_each_cpu(cpu, this_scx->cpus_to_kick) { 8457 should_wait |= kick_one_cpu(cpu, this_rq, ksyncs); 8458 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); 8459 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 8460 } 8461 8462 for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { 8463 
kick_one_cpu_if_idle(cpu, this_rq); 8464 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 8465 } 8466 8467 /* 8468 * Can't wait in hardirq — kick_sync can't advance, deadlocking if 8469 * CPUs wait for each other. Defer to kick_sync_wait_bal_cb(). 8470 */ 8471 if (should_wait) { 8472 raw_spin_rq_lock(this_rq); 8473 this_scx->kick_sync_pending = true; 8474 resched_curr(this_rq); 8475 raw_spin_rq_unlock(this_rq); 8476 } 8477 } 8478 8479 /** 8480 * print_scx_info - print out sched_ext scheduler state 8481 * @log_lvl: the log level to use when printing 8482 * @p: target task 8483 * 8484 * If a sched_ext scheduler is enabled, print the name and state of the 8485 * scheduler. If @p is on sched_ext, print further information about the task. 8486 * 8487 * This function can be safely called on any task as long as the task_struct 8488 * itself is accessible. While safe, this function isn't synchronized and may 8489 * print out mixups or garbages of limited length. 8490 */ 8491 void print_scx_info(const char *log_lvl, struct task_struct *p) 8492 { 8493 struct scx_sched *sch; 8494 enum scx_enable_state state = scx_enable_state(); 8495 const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; 8496 char runnable_at_buf[22] = "?"; 8497 struct sched_class *class; 8498 unsigned long runnable_at; 8499 8500 guard(rcu)(); 8501 8502 sch = scx_task_sched_rcu(p); 8503 8504 if (!sch) 8505 return; 8506 8507 /* 8508 * Carefully check if the task was running on sched_ext, and then 8509 * carefully copy the time it's been runnable, and its state. 
8510 */ 8511 if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || 8512 class != &ext_sched_class) { 8513 /* @p's class can't be read or it isn't on sched_ext: report only the scheduler */ printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name, 8514 scx_enable_state_str[state], all); 8515 return; 8516 } 8517 8518 if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, 8519 sizeof(runnable_at))) 8520 scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", 8521 jiffies_delta_msecs(runnable_at, jiffies)); 8522 8523 /* print everything onto one line to conserve console space */ 8524 printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", 8525 log_lvl, sch->ops.name, scx_enable_state_str[state], all, 8526 runnable_at_buf); 8527 } 8528 8529 /* PM notifier callback. When a root scheduler is loaded, turn bypass on for the PM_*_PREPARE events and off again for the matching PM_POST_* events; any other event is ignored. Always returns NOTIFY_OK. */ static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) 8530 { 8531 struct scx_sched *sch; 8532 8533 guard(rcu)(); 8534 8535 sch = rcu_dereference(scx_root); 8536 if (!sch) 8537 return NOTIFY_OK; 8538 8539 /* 8540 * SCX schedulers often have userspace components which are sometimes 8541 * involved in critical scheduling paths. PM operations involve freezing 8542 * userspace which can lead to scheduling misbehaviors including stalls. 8543 * Let's bypass while PM operations are in progress. 8544 */ 8545 switch (event) { 8546 case PM_HIBERNATION_PREPARE: 8547 case PM_SUSPEND_PREPARE: 8548 case PM_RESTORE_PREPARE: 8549 scx_bypass(sch, true); 8550 break; 8551 case PM_POST_HIBERNATION: 8552 case PM_POST_SUSPEND: 8553 case PM_POST_RESTORE: 8554 scx_bypass(sch, false); 8555 break; 8556 } 8557 8558 return NOTIFY_OK; 8559 } 8560 8561 static struct notifier_block scx_pm_notifier = { 8562 .notifier_call = scx_pm_handler, 8563 }; 8564 8565 void __init init_sched_ext_class(void) 8566 { 8567 s32 cpu, v; 8568 8569 /* 8570 * The following is to prevent the compiler from optimizing out the enum 8571 * definitions so that BPF scheduler implementations can use them 8572 * through the generated vmlinux.h.
8573 */ 8574 WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | 8575 SCX_TG_ONLINE); 8576 8577 scx_idle_init_masks(); 8578 8579 for_each_possible_cpu(cpu) { 8580 struct rq *rq = cpu_rq(cpu); 8581 int n = cpu_to_node(cpu); 8582 8583 /* local_dsq's sch will be set during scx_root_enable() */ 8584 BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL)); 8585 8586 INIT_LIST_HEAD(&rq->scx.runnable_list); 8587 INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); 8588 8589 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); 8590 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); 8591 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); 8592 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); 8593 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n)); 8594 raw_spin_lock_init(&rq->scx.deferred_reenq_lock); 8595 INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals); 8596 INIT_LIST_HEAD(&rq->scx.deferred_reenq_users); 8597 rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); 8598 rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); 8599 8600 if (cpu_online(cpu)) 8601 cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; 8602 } 8603 8604 register_sysrq_key('S', &sysrq_sched_ext_reset_op); 8605 register_sysrq_key('D', &sysrq_sched_ext_dump_op); 8606 INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); 8607 8608 #ifdef CONFIG_EXT_SUB_SCHED 8609 BUG_ON(rhashtable_init(&scx_sched_hash, &scx_sched_hash_params)); 8610 #endif /* CONFIG_EXT_SUB_SCHED */ 8611 } 8612 8613 8614 /******************************************************************************** 8615 * Helpers that can be called from the BPF scheduler. 
8616 */ 8617 /* Vet and adjust *@enq_flags for an insertion into @dsq_id. A DSQ counts as local when @dsq_id is SCX_DSQ_LOCAL or has all SCX_DSQ_LOCAL_ON bits set. SCX_ENQ_IMMED on a non-local DSQ raises scx_error() and returns false. When the flag is absent, the target is local and the scheduler set SCX_OPS_ALWAYS_ENQ_IMMED, SCX_ENQ_IMMED is OR'd into *@enq_flags. Returns true in every other case. */ static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 *enq_flags) 8618 { 8619 bool is_local = dsq_id == SCX_DSQ_LOCAL || 8620 (dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON; 8621 8622 if (*enq_flags & SCX_ENQ_IMMED) { 8623 if (unlikely(!is_local)) { 8624 scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id); 8625 return false; 8626 } 8627 } else if ((sch->ops.flags & SCX_OPS_ALWAYS_ENQ_IMMED) && is_local) { 8628 *enq_flags |= SCX_ENQ_IMMED; 8629 } 8630 8631 return true; 8632 } 8633 8634 /* Checks shared by the DSQ insert paths; asserts that IRQs are disabled. Returns false after scx_error() when @p is NULL or *@enq_flags carries bits from __SCX_ENQ_INTERNAL_MASK. Returns false without raising an error, only counting SCX_EV_INSERT_NOT_OWNED, when @p is not on @sch. Otherwise returns the result of scx_vet_enq_flags(), which may modify *@enq_flags. */ static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, 8635 u64 dsq_id, u64 *enq_flags) 8636 { 8637 lockdep_assert_irqs_disabled(); 8638 8639 if (unlikely(!p)) { 8640 scx_error(sch, "called with NULL task"); 8641 return false; 8642 } 8643 8644 if (unlikely(*enq_flags & __SCX_ENQ_INTERNAL_MASK)) { 8645 scx_error(sch, "invalid enq_flags 0x%llx", *enq_flags); 8646 return false; 8647 } 8648 8649 /* see SCX_EV_INSERT_NOT_OWNED definition */ 8650 if (unlikely(!scx_task_on_sched(sch, p))) { 8651 __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); 8652 return false; 8653 } 8654 8655 if (!scx_vet_enq_flags(sch, dsq_id, enq_flags)) 8656 return false; 8657 8658 return true; 8659 } 8660 8661 /* Record an insertion that passed the preamble. If this CPU has direct_dispatch_task set, hand the request to mark_direct_dispatch() and return. Otherwise append an entry (task, the SCX_OPSS_QSEQ_MASK bits of p->scx.ops_state, DSQ id, enqueue flags) to this CPU's dispatch buffer; scx_error() is raised instead when the buffer already holds sch->dsp_max_batch entries. */ static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, 8662 u64 dsq_id, u64 enq_flags) 8663 { 8664 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8665 struct task_struct *ddsp_task; 8666 8667 ddsp_task = __this_cpu_read(direct_dispatch_task); 8668 if (ddsp_task) { 8669 mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); 8670 return; 8671 } 8672 8673 if (unlikely(dspc->cursor >= sch->dsp_max_batch)) { 8674 scx_error(sch, "dispatch buffer overflow"); 8675 return; 8676 } 8677 8678 dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ 8679 .task = p, 8680 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, 8681 .dsq_id = dsq_id, 8682 .enq_flags = enq_flags, 8683 }; 8684 } 8685 8686
__bpf_kfunc_start_defs(); 8687 8688 /** 8689 * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ 8690 * @p: task_struct to insert 8691 * @dsq_id: DSQ to insert into 8692 * @slice: duration @p can run for in nsecs, 0 to keep the current value 8693 * @enq_flags: SCX_ENQ_* 8694 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8695 * 8696 * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to 8697 * call this function spuriously. Can be called from ops.enqueue(), 8698 * ops.select_cpu(), and ops.dispatch(). 8699 * 8700 * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch 8701 * and @p must match the task being enqueued. 8702 * 8703 * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p 8704 * will be directly inserted into the corresponding dispatch queue after 8705 * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be 8706 * inserted into the local DSQ of the CPU returned by ops.select_cpu(). 8707 * @enq_flags are OR'd with the enqueue flags on the enqueue path before the 8708 * task is inserted. 8709 * 8710 * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id 8711 * and this function can be called up to ops.dispatch_max_batch times to insert 8712 * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the 8713 * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the 8714 * counter. 8715 * 8716 * This function doesn't have any locking restrictions and may be called under 8717 * BPF locks (in the future when BPF introduces more flexible locking). 8718 * 8719 * @p is allowed to run for @slice. The scheduling path is triggered on slice 8720 * exhaustion. If zero, the current residual slice is maintained (1 if none is left). If 8721 * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with 8722 * scx_bpf_kick_cpu() to trigger scheduling.
8723 * 8724 * Returns %true on successful insertion, %false on failure. On the root 8725 * scheduler, %false return triggers scheduler abort and the caller doesn't need 8726 * to check the return value. 8727 */ 8728 __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, 8729 u64 slice, u64 enq_flags, 8730 const struct bpf_prog_aux *aux) 8731 { 8732 struct scx_sched *sch; 8733 8734 guard(rcu)(); 8735 sch = scx_prog_sched(aux); 8736 if (unlikely(!sch)) 8737 return false; 8738 8739 if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) 8740 return false; 8741 8742 if (slice) 8743 p->scx.slice = slice; 8744 else 8745 p->scx.slice = p->scx.slice ?: 1; 8746 8747 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); 8748 8749 return true; 8750 } 8751 8752 /* 8753 * COMPAT: Will be removed in v6.23 along with the ___v2 suffix. 8754 */ 8755 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, 8756 u64 slice, u64 enq_flags, 8757 const struct bpf_prog_aux *aux) 8758 { 8759 scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags, aux); 8760 } 8761 8762 static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, 8763 u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) 8764 { 8765 if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) 8766 return false; 8767 8768 if (slice) 8769 p->scx.slice = slice; 8770 else 8771 p->scx.slice = p->scx.slice ?: 1; 8772 8773 p->scx.dsq_vtime = vtime; 8774 8775 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 8776 8777 return true; 8778 } 8779 8780 struct scx_bpf_dsq_insert_vtime_args { 8781 /* @p can't be packed together as KF_RCU is not transitive */ 8782 u64 dsq_id; 8783 u64 slice; 8784 u64 vtime; 8785 u64 enq_flags; 8786 }; 8787 8788 /** 8789 * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion 8790 * @p: task_struct to insert 8791 * @args: struct containing the rest of the arguments 8792 * @args->dsq_id: DSQ to insert into 8793 * @args->slice: duration @p can 
run for in nsecs, 0 to keep the current value 8794 * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ 8795 * @args->enq_flags: SCX_ENQ_* 8796 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8797 * 8798 * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument 8799 * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided 8800 * as an inline wrapper in common.bpf.h. 8801 * 8802 * Insert @p into the vtime priority queue of the DSQ identified by 8803 * @args->dsq_id. Tasks queued into the priority queue are ordered by 8804 * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert(). 8805 * 8806 * @args->vtime ordering is according to time_before64() which considers 8807 * wrapping. A numerically larger vtime may indicate an earlier position in the 8808 * ordering and vice-versa. 8809 * 8810 * A DSQ can only be used as a FIFO or priority queue at any given time and this 8811 * function must not be called on a DSQ which already has one or more FIFO tasks 8812 * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and 8813 * SCX_DSQ_GLOBAL) cannot be used as priority queues. 8814 * 8815 * Returns %true on successful insertion, %false on failure. On the root 8816 * scheduler, %false return triggers scheduler abort and the caller doesn't need 8817 * to check the return value. 8818 */ 8819 __bpf_kfunc bool 8820 __scx_bpf_dsq_insert_vtime(struct task_struct *p, 8821 struct scx_bpf_dsq_insert_vtime_args *args, 8822 const struct bpf_prog_aux *aux) 8823 { 8824 struct scx_sched *sch; 8825 8826 guard(rcu)(); 8827 8828 sch = scx_prog_sched(aux); 8829 if (unlikely(!sch)) 8830 return false; 8831 8832 return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice, 8833 args->vtime, args->enq_flags); 8834 } 8835 8836 /* 8837 * COMPAT: Will be removed in v6.23. 
8838 */ 8839 __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, 8840 u64 slice, u64 vtime, u64 enq_flags) 8841 { 8842 struct scx_sched *sch; 8843 8844 guard(rcu)(); 8845 8846 sch = rcu_dereference(scx_root); 8847 if (unlikely(!sch)) 8848 return; 8849 8850 #ifdef CONFIG_EXT_SUB_SCHED 8851 /* 8852 * Disallow if any sub-scheds are attached. There is no way to tell 8853 * which scheduler called us, just error out @p's scheduler. 8854 */ 8855 if (unlikely(!list_empty(&sch->children))) { 8856 scx_error(scx_task_sched(p), "__scx_bpf_dsq_insert_vtime() must be used"); 8857 return; 8858 } 8859 #endif 8860 8861 scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags); 8862 } 8863 8864 __bpf_kfunc_end_defs(); 8865 8866 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) 8867 BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_IMPLICIT_ARGS | KF_RCU) 8868 BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_IMPLICIT_ARGS | KF_RCU) 8869 BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_IMPLICIT_ARGS | KF_RCU) 8870 BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) 8871 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) 8872 8873 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { 8874 .owner = THIS_MODULE, 8875 .set = &scx_kfunc_ids_enqueue_dispatch, 8876 .filter = scx_kfunc_context_filter, 8877 }; 8878 8879 static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, 8880 struct task_struct *p, u64 dsq_id, u64 enq_flags) 8881 { 8882 struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; 8883 struct scx_sched *sch; 8884 struct rq *this_rq, *src_rq, *locked_rq; 8885 bool dispatched = false; 8886 bool in_balance; 8887 unsigned long flags; 8888 8889 /* 8890 * The verifier considers an iterator slot initialized on any 8891 * KF_ITER_NEW return, so a BPF program may legally reach here after 8892 * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL. 
8893 */ 8894 if (unlikely(!src_dsq)) 8895 return false; 8896 8897 sch = src_dsq->sched; 8898 8899 if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags)) 8900 return false; 8901 8902 /* 8903 * If the BPF scheduler keeps calling this function repeatedly, it can 8904 * cause similar live-lock conditions as consume_dispatch_q(). 8905 */ 8906 if (unlikely(READ_ONCE(sch->aborting))) 8907 return false; 8908 8909 if (unlikely(!scx_task_on_sched(sch, p))) { 8910 scx_error(sch, "scx_bpf_dsq_move[_vtime]() on %s[%d] but the task belongs to a different scheduler", 8911 p->comm, p->pid); 8912 return false; 8913 } 8914 8915 /* 8916 * Can be called from either ops.dispatch() locking this_rq() or any 8917 * context where no rq lock is held. If latter, lock @p's task_rq which 8918 * we'll likely need anyway. 8919 */ 8920 src_rq = task_rq(p); 8921 8922 local_irq_save(flags); 8923 this_rq = this_rq(); 8924 in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; 8925 8926 if (in_balance) { 8927 if (this_rq != src_rq) { 8928 raw_spin_rq_unlock(this_rq); 8929 raw_spin_rq_lock(src_rq); 8930 } 8931 } else { 8932 raw_spin_rq_lock(src_rq); 8933 } 8934 8935 locked_rq = src_rq; 8936 raw_spin_lock(&src_dsq->lock); 8937 8938 /* did someone else get to it while we dropped the locks? */ 8939 if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) { 8940 raw_spin_unlock(&src_dsq->lock); 8941 goto out; 8942 } 8943 8944 /* @p is still on $src_dsq and stable, determine the destination */ 8945 dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, task_cpu(p)); 8946 8947 /* 8948 * Apply vtime and slice updates before moving so that the new time is 8949 * visible before inserting into $dst_dsq. @p is still on $src_dsq but 8950 * this is safe as we're locking it. 
8951 */ 8952 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) 8953 p->scx.dsq_vtime = kit->vtime; 8954 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) 8955 p->scx.slice = kit->slice; 8956 8957 /* execute move */ 8958 locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq); 8959 dispatched = true; 8960 out: 8961 if (in_balance) { 8962 if (this_rq != locked_rq) { 8963 raw_spin_rq_unlock(locked_rq); 8964 raw_spin_rq_lock(this_rq); 8965 } 8966 } else { 8967 raw_spin_rq_unlock_irqrestore(locked_rq, flags); 8968 } 8969 8970 kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | 8971 __SCX_DSQ_ITER_HAS_VTIME); 8972 return dispatched; 8973 } 8974 8975 __bpf_kfunc_start_defs(); 8976 8977 /** 8978 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots 8979 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8980 * 8981 * Can only be called from ops.dispatch(). 8982 */ 8983 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux) 8984 { 8985 struct scx_sched *sch; 8986 8987 guard(rcu)(); 8988 8989 sch = scx_prog_sched(aux); 8990 if (unlikely(!sch)) 8991 return 0; 8992 8993 return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor); 8994 } 8995 8996 /** 8997 * scx_bpf_dispatch_cancel - Cancel the latest dispatch 8998 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8999 * 9000 * Cancel the latest dispatch. Can be called multiple times to cancel further 9001 * dispatches. Can only be called from ops.dispatch(). 
9002 */ 9003 __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux) 9004 { 9005 struct scx_sched *sch; 9006 struct scx_dsp_ctx *dspc; 9007 9008 guard(rcu)(); 9009 9010 sch = scx_prog_sched(aux); 9011 if (unlikely(!sch)) 9012 return; 9013 9014 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 9015 9016 /* drop the most recently buffered entry; cancelling with nothing buffered is reported as a scheduler error */ if (dspc->cursor > 0) 9017 dspc->cursor--; 9018 else 9019 scx_error(sch, "dispatch buffer underflow"); 9020 } 9021 9022 /** 9023 * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ 9024 * @dsq_id: DSQ to move task from. Must be a user-created DSQ 9025 * @enq_flags: %SCX_ENQ_* 9026 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9027 * 9028 * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's 9029 * local DSQ for execution with @enq_flags applied. Can only be called from 9030 * ops.dispatch(). 9031 * 9032 * Built-in DSQs (%SCX_DSQ_GLOBAL and %SCX_DSQ_LOCAL*) are not supported as 9033 * sources. Local DSQs support reenqueueing (a task can be picked up for 9034 * execution, dequeued for property changes, or reenqueued), but the BPF 9035 * scheduler cannot directly iterate or move tasks from them. %SCX_DSQ_GLOBAL 9036 * is similar but also doesn't support reenqueueing, as it maps to multiple 9037 * per-node DSQs making the scope difficult to define; this may change in the 9038 * future. 9039 * 9040 * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() 9041 * before trying to move from the specified DSQ. It may also grab rq locks and 9042 * thus can't be called under any BPF locks. 9043 * 9044 * Returns %true if a task has been moved, %false if there isn't any task to 9045 * move or on error.
9046 */ 9047 __bpf_kfunc bool scx_bpf_dsq_move_to_local___v2(u64 dsq_id, u64 enq_flags, 9048 const struct bpf_prog_aux *aux) 9049 { 9050 struct scx_dispatch_q *dsq; 9051 struct scx_sched *sch; 9052 struct scx_dsp_ctx *dspc; 9053 9054 guard(rcu)(); 9055 9056 sch = scx_prog_sched(aux); 9057 if (unlikely(!sch)) 9058 return false; 9059 9060 if (!scx_vet_enq_flags(sch, SCX_DSQ_LOCAL, &enq_flags)) 9061 return false; 9062 9063 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 9064 9065 flush_dispatch_buf(sch, dspc->rq); 9066 9067 dsq = find_user_dsq(sch, dsq_id); 9068 if (unlikely(!dsq)) { 9069 scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id); 9070 return false; 9071 } 9072 9073 if (consume_dispatch_q(sch, dspc->rq, dsq, enq_flags)) { 9074 /* 9075 * A successfully consumed task can be dequeued before it starts 9076 * running while the CPU is trying to migrate other dispatched 9077 * tasks. Bump nr_tasks to tell balance_one() to retry on empty 9078 * local DSQ. 9079 */ 9080 dspc->nr_tasks++; 9081 return true; 9082 } else { 9083 return false; 9084 } 9085 } 9086 9087 /* 9088 * COMPAT: ___v2 was introduced in v7.1. Remove this and ___v2 tag in the future. 9089 */ 9090 __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux) 9091 { 9092 return scx_bpf_dsq_move_to_local___v2(dsq_id, 0, aux); 9093 } 9094 9095 /** 9096 * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs 9097 * @it__iter: DSQ iterator in progress 9098 * @slice: duration the moved task can run for in nsecs 9099 * 9100 * Override the slice of the next task that will be moved from @it__iter using 9101 * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous 9102 * slice duration is kept. 
9103 */ 9104 __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, 9105 u64 slice) 9106 { 9107 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 9108 9109 /* stash the value on the iterator and flag a slice override as pending */ kit->slice = slice; 9110 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; 9111 } 9112 9113 /** 9114 * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs 9115 * @it__iter: DSQ iterator in progress 9116 * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ 9117 * 9118 * Override the vtime of the next task that will be moved from @it__iter using 9119 * scx_bpf_dsq_move_vtime(). If this function is not called, the previous 9120 * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the 9121 * override is ignored and cleared. 9122 */ 9123 __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, 9124 u64 vtime) 9125 { 9126 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 9127 9128 /* stash the value on the iterator and flag a vtime override as pending */ kit->vtime = vtime; 9129 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; 9130 } 9131 9132 /** 9133 * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ 9134 * @it__iter: DSQ iterator in progress 9135 * @p: task to transfer 9136 * @dsq_id: DSQ to move @p to 9137 * @enq_flags: SCX_ENQ_* 9138 * 9139 * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ 9140 * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can 9141 * be the destination. 9142 * 9143 * For the transfer to be successful, @p must still be on the DSQ and have been 9144 * queued before the DSQ iteration started. This function doesn't care whether 9145 * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have 9146 * been queued before the iteration started. 9147 * 9148 * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update. 9149 * 9150 * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq 9151 * lock (e.g. BPF timers or SYSCALL programs).
9152 * 9153 * Returns %true if @p has been consumed, %false if @p had already been 9154 * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local 9155 * DSQ. 9156 */ 9157 __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, 9158 struct task_struct *p, u64 dsq_id, 9159 u64 enq_flags) 9160 { 9161 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 9162 p, dsq_id, enq_flags); 9163 } 9164 9165 /** 9166 * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ 9167 * @it__iter: DSQ iterator in progress 9168 * @p: task to transfer 9169 * @dsq_id: DSQ to move @p to 9170 * @enq_flags: SCX_ENQ_* 9171 * 9172 * Transfer @p which is on the DSQ currently iterated by @it__iter to the 9173 * priority queue of the DSQ specified by @dsq_id. The destination must be a 9174 * user DSQ as only user DSQs support priority queue. 9175 * 9176 * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice() 9177 * and scx_bpf_dsq_move_set_vtime() to update. 9178 * 9179 * All other aspects are identical to scx_bpf_dsq_move(). See 9180 * scx_bpf_dsq_insert_vtime() for more information on @vtime. 9181 */ 9182 __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, 9183 struct task_struct *p, u64 dsq_id, 9184 u64 enq_flags) 9185 { /* same path as scx_bpf_dsq_move(), with SCX_ENQ_DSQ_PRIQ forced on */ 9186 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 9187 p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 9188 } 9189 9190 #ifdef CONFIG_EXT_SUB_SCHED 9191 /** 9192 * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler 9193 * @cgroup_id: cgroup ID of the child scheduler to dispatch 9194 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9195 * 9196 * Allows a parent scheduler to trigger dispatching on one of its direct 9197 * child schedulers. The child scheduler runs its dispatch operation to 9198 * move tasks from dispatch queues to the local runqueue.
9199 * 9200 * Returns: true on success, false if cgroup_id is invalid, not a direct 9201 * child, or caller lacks dispatch permission. 9202 */ 9203 __bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux) 9204 { 9205 struct rq *this_rq = this_rq(); 9206 struct scx_sched *parent, *child; 9207 9208 guard(rcu)(); 9209 parent = scx_prog_sched(aux); 9210 if (unlikely(!parent)) 9211 return false; 9212 9213 child = scx_find_sub_sched(cgroup_id); 9214 9215 if (unlikely(!child)) 9216 return false; 9217 /* only a direct child of the calling scheduler may be dispatched */ 9218 if (unlikely(scx_parent(child) != parent)) { 9219 scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu", 9220 cgroup_id); 9221 return false; 9222 } 9223 9224 return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev, 9225 true); 9226 } 9227 #endif /* CONFIG_EXT_SUB_SCHED */ 9228 9229 __bpf_kfunc_end_defs(); 9230 9231 BTF_KFUNCS_START(scx_kfunc_ids_dispatch) 9232 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots, KF_IMPLICIT_ARGS) 9233 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel, KF_IMPLICIT_ARGS) 9234 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local, KF_IMPLICIT_ARGS) 9235 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local___v2, KF_IMPLICIT_ARGS) 9236 /* scx_bpf_dsq_move*() also in scx_kfunc_ids_unlocked: callable from unlocked contexts */ 9237 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 9238 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 9239 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 9240 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 9241 #ifdef CONFIG_EXT_SUB_SCHED 9242 BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS) 9243 #endif 9244 BTF_KFUNCS_END(scx_kfunc_ids_dispatch) 9245 9246 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { 9247 .owner = THIS_MODULE, 9248 .set = &scx_kfunc_ids_dispatch, 9249 .filter = scx_kfunc_context_filter, 9250 }; 9251 9252 __bpf_kfunc_start_defs(); 9253 9254 /** 9255 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 9256 * @aux: 
implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9257 * 9258 * Iterate over all of the tasks currently enqueued on the local DSQ of the 9259 * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of 9260 * processed tasks. Can only be called from ops.cpu_release(). 9261 */ 9262 __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux) 9263 { 9264 struct scx_sched *sch; 9265 struct rq *rq; 9266 9267 guard(rcu)(); 9268 sch = scx_prog_sched(aux); 9269 if (unlikely(!sch)) 9270 return 0; 9271 9272 rq = cpu_rq(smp_processor_id()); 9273 lockdep_assert_rq_held(rq); 9274 9275 return reenq_local(sch, rq, SCX_REENQ_ANY); 9276 } 9277 9278 __bpf_kfunc_end_defs(); 9279 9280 BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) 9281 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local, KF_IMPLICIT_ARGS) 9282 BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) 9283 9284 static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { 9285 .owner = THIS_MODULE, 9286 .set = &scx_kfunc_ids_cpu_release, 9287 .filter = scx_kfunc_context_filter, 9288 }; 9289 9290 __bpf_kfunc_start_defs(); 9291 9292 /** 9293 * scx_bpf_create_dsq - Create a custom DSQ 9294 * @dsq_id: DSQ to create 9295 * @node: NUMA node to allocate from 9296 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9297 * 9298 * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable 9299 * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
9300 */ 9301 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_aux *aux) 9302 { 9303 struct scx_dispatch_q *dsq; 9304 struct scx_sched *sch; 9305 s32 ret; 9306 9307 if (unlikely(node >= (int)nr_node_ids || 9308 (node < 0 && node != NUMA_NO_NODE))) 9309 return -EINVAL; 9310 /* IDs carrying SCX_DSQ_FLAG_BUILTIN cannot be created through this kfunc */ 9311 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) 9312 return -EINVAL; 9313 9314 dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); 9315 if (!dsq) 9316 return -ENOMEM; 9317 9318 /* 9319 * init_dsq() must be called in GFP_KERNEL context. Init it with NULL 9320 * @sch and update afterwards. 9321 */ 9322 ret = init_dsq(dsq, dsq_id, NULL); 9323 if (ret) { 9324 kfree(dsq); 9325 return ret; 9326 } 9327 9328 rcu_read_lock(); 9329 9330 sch = scx_prog_sched(aux); 9331 if (sch) { 9332 dsq->sched = sch; 9333 ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, 9334 dsq_hash_params); 9335 } else { 9336 ret = -ENODEV; 9337 } 9338 9339 rcu_read_unlock(); /* non-zero ret: no scheduler was found or the hash insert failed, so undo init_dsq() and free */ 9340 if (ret) { 9341 exit_dsq(dsq); 9342 kfree(dsq); 9343 } 9344 return ret; 9345 } 9346 9347 __bpf_kfunc_end_defs(); 9348 9349 BTF_KFUNCS_START(scx_kfunc_ids_unlocked) 9350 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_IMPLICIT_ARGS | KF_SLEEPABLE) 9351 /* also in scx_kfunc_ids_dispatch: also callable from ops.dispatch() */ 9352 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 9353 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 9354 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 9355 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 9356 /* also in scx_kfunc_ids_select_cpu: also callable from ops.select_cpu()/ops.enqueue() */ 9357 BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) 9358 BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) 9359 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) 9360 BTF_KFUNCS_END(scx_kfunc_ids_unlocked) 9361 9362 static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { 9363 .owner = THIS_MODULE, 9364 .set = 
&scx_kfunc_ids_unlocked, 9365 .filter = scx_kfunc_context_filter, 9366 }; 9367 9368 __bpf_kfunc_start_defs(); 9369 9370 /** 9371 * scx_bpf_task_set_slice - Set task's time slice 9372 * @p: task of interest 9373 * @slice: time slice to set in nsecs 9374 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9375 * 9376 * Set @p's time slice to @slice. Returns %true on success, %false if the 9377 * calling scheduler doesn't have authority over @p. 9378 */ 9379 __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, 9380 const struct bpf_prog_aux *aux) 9381 { 9382 struct scx_sched *sch; 9383 9384 guard(rcu)(); 9385 sch = scx_prog_sched(aux); 9386 if (unlikely(!sch || !scx_task_on_sched(sch, p))) 9387 return false; 9388 9389 p->scx.slice = slice; 9390 return true; 9391 } 9392 9393 /** 9394 * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering 9395 * @p: task of interest 9396 * @vtime: virtual time to set 9397 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9398 * 9399 * Set @p's virtual time to @vtime. Returns %true on success, %false if the 9400 * calling scheduler doesn't have authority over @p. 9401 */ 9402 __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, 9403 const struct bpf_prog_aux *aux) 9404 { 9405 struct scx_sched *sch; 9406 9407 guard(rcu)(); 9408 sch = scx_prog_sched(aux); 9409 if (unlikely(!sch || !scx_task_on_sched(sch, p))) 9410 return false; 9411 9412 p->scx.dsq_vtime = vtime; 9413 return true; 9414 } 9415 /* Record a kick request for @cpu in the local rq's kick cpumasks and queue the kick irq_work; @flags are SCX_KICK_*. Runs with local IRQs disabled. */ 9416 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) 9417 { 9418 struct rq *this_rq; 9419 unsigned long irq_flags; 9420 9421 local_irq_save(irq_flags); 9422 9423 this_rq = this_rq(); 9424 9425 /* 9426 * While bypassing for PM ops, IRQ handling may not be online which can 9427 * lead to irq_work_queue() malfunction such as infinite busy wait for 9428 * IRQ status update. Suppress kicking.
9429 */ 9430 if (scx_bypassing(sch, cpu_of(this_rq))) 9431 goto out; 9432 9433 /* 9434 * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting 9435 * rq locks. We can probably be smarter and avoid bouncing if called 9436 * from ops which don't hold a rq lock. 9437 */ 9438 if (flags & SCX_KICK_IDLE) { 9439 struct rq *target_rq = cpu_rq(cpu); 9440 9441 if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) 9442 scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); 9443 /* best effort: if the target rq lock can't be taken, skip the check and queue the kick anyway */ 9444 if (raw_spin_rq_trylock(target_rq)) { 9445 if (can_skip_idle_kick(target_rq)) { 9446 raw_spin_rq_unlock(target_rq); 9447 goto out; 9448 } 9449 raw_spin_rq_unlock(target_rq); 9450 } 9451 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); 9452 } else { 9453 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); 9454 9455 if (flags & SCX_KICK_PREEMPT) 9456 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); 9457 if (flags & SCX_KICK_WAIT) 9458 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); 9459 } 9460 9461 irq_work_queue(&this_rq->scx.kick_cpus_irq_work); 9462 out: 9463 local_irq_restore(irq_flags); 9464 } 9465 9466 /** 9467 * scx_bpf_kick_cpu - Trigger reschedule on a CPU 9468 * @cpu: cpu to kick 9469 * @flags: %SCX_KICK_* flags 9470 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9471 * 9472 * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or 9473 * trigger rescheduling on a busy CPU. This can be called from any online 9474 * scx_ops operation and the actual kicking is performed asynchronously through 9475 * an irq work.
9476 */ 9477 __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux *aux) 9478 { 9479 struct scx_sched *sch; 9480 9481 guard(rcu)(); 9482 sch = scx_prog_sched(aux); 9483 if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) 9484 scx_kick_cpu(sch, cpu, flags); 9485 } 9486 9487 /** 9488 * scx_bpf_kick_cid - Trigger reschedule on the CPU mapped to @cid 9489 * @cid: cid to kick 9490 * @flags: %SCX_KICK_* flags 9491 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9492 * 9493 * cid-addressed equivalent of scx_bpf_kick_cpu(). Return 0 on success, 9494 * -errno otherwise. 9495 */ 9496 __bpf_kfunc s32 scx_bpf_kick_cid(s32 cid, u64 flags, const struct bpf_prog_aux *aux) 9497 { 9498 struct scx_sched *sch; 9499 s32 cpu; 9500 9501 guard(rcu)(); 9502 sch = scx_prog_sched(aux); 9503 if (unlikely(!sch)) 9504 return -ENODEV; 9505 cpu = scx_cid_to_cpu(sch, cid); /* a negative result is handed back to the caller unchanged */ 9506 if (cpu < 0) 9507 return cpu; 9508 scx_kick_cpu(sch, cpu, flags); 9509 return 0; 9510 } 9511 9512 /** 9513 * scx_bpf_dsq_nr_queued - Return the number of queued tasks 9514 * @dsq_id: id of the DSQ 9515 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9516 * 9517 * Return the number of tasks in the DSQ matching @dsq_id. If not found, 9518 * -%ENOENT is returned.
9519 */ 9520 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux) 9521 { 9522 struct scx_sched *sch; 9523 struct scx_dispatch_q *dsq; 9524 s32 ret; 9525 9526 preempt_disable(); 9527 9528 sch = scx_prog_sched(aux); 9529 if (unlikely(!sch)) { 9530 ret = -ENODEV; 9531 goto out; 9532 } 9533 9534 if (dsq_id == SCX_DSQ_LOCAL) { 9535 ret = READ_ONCE(this_rq()->scx.local_dsq.nr); 9536 goto out; 9537 } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 9538 s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK); 9539 9540 if (scx_cpu_valid(sch, cpu, NULL)) { 9541 ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); 9542 goto out; 9543 } 9544 } else { 9545 dsq = find_user_dsq(sch, dsq_id); 9546 if (dsq) { 9547 ret = READ_ONCE(dsq->nr); 9548 goto out; 9549 } 9550 } /* reached for an invalid SCX_DSQ_LOCAL_ON cpu or an unknown user DSQ */ 9551 ret = -ENOENT; 9552 out: 9553 preempt_enable(); 9554 return ret; 9555 } 9556 9557 /** 9558 * scx_bpf_destroy_dsq - Destroy a custom DSQ 9559 * @dsq_id: DSQ to destroy 9560 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9561 * 9562 * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with 9563 * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is 9564 * empty and no further tasks are dispatched to it. Ignored if called on a DSQ 9565 * which doesn't exist. Can be called from any online scx_ops operations. 9566 */ 9567 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux) 9568 { 9569 struct scx_sched *sch; 9570 9571 guard(rcu)(); 9572 sch = scx_prog_sched(aux); 9573 if (sch) 9574 destroy_dsq(sch, dsq_id); 9575 } 9576 9577 /** 9578 * bpf_iter_scx_dsq_new - Create a DSQ iterator 9579 * @it: iterator to initialize 9580 * @dsq_id: DSQ to iterate 9581 * @flags: %SCX_DSQ_ITER_* 9582 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9583 * 9584 * Initialize BPF iterator @it which can be used with bpf_for_each() to walk 9585 * tasks in the DSQ specified by @dsq_id.
Iteration using @it only includes 9586 * tasks which are already queued when this function is invoked. 9587 */ 9588 __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, 9589 u64 flags, const struct bpf_prog_aux *aux) 9590 { 9591 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9592 struct scx_sched *sch; 9593 9594 BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > 9595 sizeof(struct bpf_iter_scx_dsq)); 9596 BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != 9597 __alignof__(struct bpf_iter_scx_dsq)); 9598 BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & 9599 ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); 9600 9601 /* 9602 * next() and destroy() will be called regardless of the return value. 9603 * Always clear $kit->dsq. 9604 */ 9605 kit->dsq = NULL; 9606 9607 sch = scx_prog_sched(aux); 9608 if (unlikely(!sch)) 9609 return -ENODEV; 9610 9611 if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) 9612 return -EINVAL; 9613 9614 kit->dsq = find_user_dsq(sch, dsq_id); 9615 if (!kit->dsq) 9616 return -ENOENT; 9617 9618 kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags); 9619 9620 return 0; 9621 } 9622 9623 /** 9624 * bpf_iter_scx_dsq_next - Progress a DSQ iterator 9625 * @it: iterator to progress 9626 * 9627 * Return the next task. See bpf_iter_scx_dsq_new(). 9628 */ 9629 __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) 9630 { 9631 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9632 /* kit->dsq is left NULL by a failed bpf_iter_scx_dsq_new(); nothing to walk then */ 9633 if (!kit->dsq) 9634 return NULL; 9635 9636 guard(raw_spinlock_irqsave)(&kit->dsq->lock); 9637 9638 return nldsq_cursor_next_task(&kit->cursor, kit->dsq); 9639 } 9640 9641 /** 9642 * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator 9643 * @it: iterator to destroy 9644 * 9645 * Undo bpf_iter_scx_dsq_new().
9646 */ 9647 __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) 9648 { 9649 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9650 9651 if (!kit->dsq) 9652 return; 9653 9654 if (!list_empty(&kit->cursor.node)) { 9655 unsigned long flags; 9656 9657 raw_spin_lock_irqsave(&kit->dsq->lock, flags); 9658 list_del_init(&kit->cursor.node); 9659 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); 9660 } 9661 kit->dsq = NULL; 9662 } 9663 9664 /** 9665 * scx_bpf_dsq_peek - Lockless peek at the first element. 9666 * @dsq_id: DSQ to examine. 9667 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9668 * 9669 * Read the first element in the DSQ. This is semantically equivalent to using 9670 * the DSQ iterator, but is lockfree. Of course, like any lockless operation, 9671 * this provides only a point-in-time snapshot, and the contents may change 9672 * by the time any subsequent locking operation reads the queue. 9673 * 9674 * Returns the pointer, or NULL indicates an empty queue OR internal error.
9675 */ 9676 __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id, 9677 const struct bpf_prog_aux *aux) 9678 { 9679 struct scx_sched *sch; 9680 struct scx_dispatch_q *dsq; 9681 9682 sch = scx_prog_sched(aux); 9683 if (unlikely(!sch)) 9684 return NULL; 9685 9686 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { 9687 scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); 9688 return NULL; 9689 } 9690 9691 dsq = find_user_dsq(sch, dsq_id); 9692 if (unlikely(!dsq)) { 9693 scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); 9694 return NULL; 9695 } 9696 9697 return rcu_dereference(dsq->first_task); 9698 } 9699 9700 /** 9701 * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ 9702 * @dsq_id: DSQ to re-enqueue 9703 * @reenq_flags: %SCX_REENQ_* 9704 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9705 * 9706 * Iterate over all of the tasks currently enqueued on the DSQ identified by 9707 * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are 9708 * supported: 9709 * 9710 * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu) 9711 * - User DSQs 9712 * 9713 * Re-enqueues are performed asynchronously. Can be called from anywhere.
9714 */ 9715 __bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags, 9716 const struct bpf_prog_aux *aux) 9717 { 9718 struct scx_sched *sch; 9719 struct scx_dispatch_q *dsq; 9720 9721 guard(preempt)(); 9722 9723 sch = scx_prog_sched(aux); 9724 if (unlikely(!sch)) 9725 return; 9726 9727 if (unlikely(reenq_flags & ~__SCX_REENQ_USER_MASK)) { 9728 scx_error(sch, "invalid SCX_REENQ flags 0x%llx", reenq_flags); 9729 return; 9730 } 9731 9732 /* not specifying any filter bits is the same as %SCX_REENQ_ANY */ 9733 if (!(reenq_flags & __SCX_REENQ_FILTER_MASK)) 9734 reenq_flags |= SCX_REENQ_ANY; 9735 9736 dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id()); 9737 schedule_dsq_reenq(sch, dsq, reenq_flags, scx_locked_rq()); 9738 } 9739 9740 /** 9741 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 9742 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9743 * 9744 * Iterate over all of the tasks currently enqueued on the local DSQ of the 9745 * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from 9746 * anywhere. 9747 * 9748 * This is now a special case of scx_bpf_dsq_reenq() and may be removed in the 9749 * future.
9750 */ 9751 __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux) 9752 { /* local DSQ of the calling CPU, no explicit filter flags */ 9753 scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0, aux); 9754 } 9755 9756 __bpf_kfunc_end_defs(); 9757 /* Format @fmt with the packed BPF varargs in @data (@data__sz bytes, a multiple of 8 and at most MAX_BPRINTF_VARARGS * 8) into @line_buf of @line_size bytes, using @data_buf as the staging copy of @data. Returns the bstr_printf() result, or a negative errno after reporting the failure through scx_error(). */ 9758 __printf(5, 0) 9759 static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, 9760 size_t line_size, char *fmt, unsigned long long *data, 9761 u32 data__sz) 9762 { 9763 struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; 9764 s32 ret; 9765 9766 if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || 9767 (data__sz && !data)) { 9768 scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); 9769 return -EINVAL; 9770 } 9771 9772 ret = copy_from_kernel_nofault(data_buf, data, data__sz); 9773 if (ret < 0) { 9774 scx_error(sch, "failed to read data fields (%d)", ret); 9775 return ret; 9776 } 9777 9778 ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, 9779 &bprintf_data); 9780 if (ret < 0) { 9781 scx_error(sch, "format preparation failed (%d)", ret); 9782 return ret; 9783 } 9784 9785 ret = bstr_printf(line_buf, line_size, fmt, 9786 bprintf_data.bin_args); 9787 bpf_bprintf_cleanup(&bprintf_data); 9788 if (ret < 0) { 9789 scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); 9790 return ret; 9791 } 9792 9793 return ret; 9794 } 9795 /* __bstr_format() into the data and line buffers embedded in @buf */ 9796 __printf(3, 0) 9797 static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, 9798 char *fmt, unsigned long long *data, u32 data__sz) 9799 { 9800 return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), 9801 fmt, data, data__sz); 9802 } 9803 9804 __bpf_kfunc_start_defs(); 9805 9806 /** 9807 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. 9808 * @exit_code: Exit value to pass to user space via struct scx_exit_info.
9809 * @fmt: error message format string 9810 * @data: format string parameters packaged using ___bpf_fill() macro 9811 * @data__sz: @data len, must end in '__sz' for the verifier 9812 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9813 * 9814 * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops 9815 * disabling. 9816 */ 9817 __printf(2, 0) 9818 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, 9819 unsigned long long *data, u32 data__sz, 9820 const struct bpf_prog_aux *aux) 9821 { 9822 struct scx_sched *sch; 9823 unsigned long flags; 9824 /* scx_exit_bstr_buf is a single shared buffer; hold its lock across formatting and scx_exit() */ 9825 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 9826 sch = scx_prog_sched(aux); 9827 if (likely(sch) && 9828 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 9829 scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); 9830 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 9831 } 9832 9833 /** 9834 * scx_bpf_error_bstr - Indicate fatal error 9835 * @fmt: error message format string 9836 * @data: format string parameters packaged using ___bpf_fill() macro 9837 * @data__sz: @data len, must end in '__sz' for the verifier 9838 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9839 * 9840 * Indicate that the BPF scheduler encountered a fatal error and initiate ops 9841 * disabling.
9842 */ 9843 __printf(1, 0) 9844 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, 9845 u32 data__sz, const struct bpf_prog_aux *aux) 9846 { 9847 struct scx_sched *sch; 9848 unsigned long flags; 9849 /* scx_exit_bstr_buf is a single shared buffer; hold its lock across formatting and scx_exit() */ 9850 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 9851 sch = scx_prog_sched(aux); 9852 if (likely(sch) && 9853 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 9854 scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); 9855 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 9856 } 9857 9858 /** 9859 * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler 9860 * @fmt: format string 9861 * @data: format string parameters packaged using ___bpf_fill() macro 9862 * @data__sz: @data len, must end in '__sz' for the verifier 9863 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9864 * 9865 * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and 9866 * dump_task() to generate extra debug dump specific to the BPF scheduler. 9867 * 9868 * The extra dump may be multiple lines. A single line may be split over 9869 * multiple calls. The last line is automatically terminated. 9870 */ 9871 __printf(1, 0) 9872 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, 9873 u32 data__sz, const struct bpf_prog_aux *aux) 9874 { 9875 struct scx_sched *sch; 9876 struct scx_dump_data *dd = &scx_dump_data; 9877 struct scx_bstr_buf *buf = &dd->buf; 9878 s32 ret; 9879 9880 guard(rcu)(); 9881 9882 sch = scx_prog_sched(aux); 9883 if (unlikely(!sch)) 9884 return; 9885 9886 if (raw_smp_processor_id() != dd->cpu) { 9887 scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends"); 9888 return; 9889 } 9890 9891 /* append the formatted string to the line buf */ 9892 ret = __bstr_format(sch, buf->data, buf->line + dd->cursor, 9893 sizeof(buf->line) - dd->cursor, fmt, data, data__sz); 9894 if (ret < 0) { 9895 dump_line(dd->s, "%s[!]
(\"%s\", %p, %u) failed to format (%d)", 9896 dd->prefix, fmt, data, data__sz, ret); 9897 return; 9898 } 9899 /* advance past the appended text, clamped to the line buffer size */ 9900 dd->cursor += ret; 9901 dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); 9902 9903 if (!dd->cursor) 9904 return; 9905 9906 /* 9907 * If the line buf overflowed or ends in a newline, flush it into the 9908 * dump. This is to allow the caller to generate a single line over 9909 * multiple calls. As ops_dump_flush() can also handle multiple lines in 9910 * the line buf, the only case which can lead to an unexpected 9911 * truncation is when the caller keeps generating newlines in the middle 9912 * instead of the end consecutively. Don't do that. 9913 */ 9914 if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') 9915 ops_dump_flush(); 9916 } 9917 9918 /** 9919 * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU 9920 * @cpu: CPU of interest 9921 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9922 * 9923 * Return the maximum relative capacity of @cpu in relation to the most 9924 * performant CPU in the system. The return value is in the range [1, 9925 * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). 9926 */ 9927 __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux) 9928 { 9929 struct scx_sched *sch; 9930 9931 guard(rcu)(); 9932 /* without a scheduler or with an invalid @cpu, report %SCX_CPUPERF_ONE instead of an error */ 9933 sch = scx_prog_sched(aux); 9934 if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) 9935 return arch_scale_cpu_capacity(cpu); 9936 else 9937 return SCX_CPUPERF_ONE; 9938 } 9939 9940 /** 9941 * scx_bpf_cidperf_cap - Query the maximum relative capacity of the CPU at @cid 9942 * @cid: cid of the CPU to query 9943 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9944 * 9945 * cid-addressed equivalent of scx_bpf_cpuperf_cap().
9946 */ 9947 __bpf_kfunc u32 scx_bpf_cidperf_cap(s32 cid, const struct bpf_prog_aux *aux) 9948 { 9949 struct scx_sched *sch; 9950 s32 cpu; 9951 9952 guard(rcu)(); 9953 9954 sch = scx_prog_sched(aux); 9955 if (unlikely(!sch)) 9956 return SCX_CPUPERF_ONE; 9957 cpu = scx_cid_to_cpu(sch, cid); /* a cid that fails to map reports %SCX_CPUPERF_ONE, not an errno */ 9958 if (cpu < 0) 9959 return SCX_CPUPERF_ONE; 9960 return arch_scale_cpu_capacity(cpu); 9961 } 9962 9963 /** 9964 * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU 9965 * @cpu: CPU of interest 9966 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9967 * 9968 * Return the current relative performance of @cpu in relation to its maximum. 9969 * The return value is in the range [1, %SCX_CPUPERF_ONE]. 9970 * 9971 * The current performance level of a CPU in relation to the maximum performance 9972 * available in the system can be calculated as follows: 9973 * 9974 * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE 9975 * 9976 * The result is in the range [1, %SCX_CPUPERF_ONE]. 9977 */ 9978 __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux) 9979 { 9980 struct scx_sched *sch; 9981 9982 guard(rcu)(); 9983 /* without a scheduler or with an invalid @cpu, report %SCX_CPUPERF_ONE instead of an error */ 9984 sch = scx_prog_sched(aux); 9985 if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) 9986 return arch_scale_freq_capacity(cpu); 9987 else 9988 return SCX_CPUPERF_ONE; 9989 } 9990 9991 /** 9992 * scx_bpf_cidperf_cur - Query the current performance of the CPU at @cid 9993 * @cid: cid of the CPU to query 9994 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9995 * 9996 * cid-addressed equivalent of scx_bpf_cpuperf_cur().
9997 */ 9998 __bpf_kfunc u32 scx_bpf_cidperf_cur(s32 cid, const struct bpf_prog_aux *aux) 9999 { 10000 struct scx_sched *sch; 10001 s32 cpu; 10002 10003 guard(rcu)(); 10004 10005 sch = scx_prog_sched(aux); 10006 if (unlikely(!sch)) 10007 return SCX_CPUPERF_ONE; 10008 cpu = scx_cid_to_cpu(sch, cid); /* a cid that fails to map reports %SCX_CPUPERF_ONE, not an errno */ 10009 if (cpu < 0) 10010 return SCX_CPUPERF_ONE; 10011 return arch_scale_freq_capacity(cpu); 10012 } 10013 10014 /** 10015 * scx_bpf_cpuperf_set - Set the relative performance target of a CPU 10016 * @cpu: CPU of interest 10017 * @perf: target performance level [0, %SCX_CPUPERF_ONE] 10018 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10019 * 10020 * Set the target performance level of @cpu to @perf. @perf is in linear 10021 * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the 10022 * schedutil cpufreq governor chooses the target frequency. 10023 * 10024 * The actual performance level chosen, CPU grouping, and the overhead and 10025 * latency of the operations are dependent on the hardware and cpufreq driver in 10026 * use. Consult hardware and cpufreq documentation for more information. The 10027 * current performance level can be monitored using scx_bpf_cpuperf_cur(). 10028 */ 10029 __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_aux *aux) 10030 { 10031 struct scx_sched *sch; 10032 10033 guard(rcu)(); 10034 10035 sch = scx_prog_sched(aux); 10036 if (unlikely(!sch)) 10037 return; 10038 /* an out-of-range target is reported through scx_error() rather than clamped */ 10039 if (unlikely(perf > SCX_CPUPERF_ONE)) { 10040 scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu); 10041 return; 10042 } 10043 10044 if (scx_cpu_valid(sch, cpu, NULL)) { 10045 struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); 10046 struct rq_flags rf; 10047 10048 /* 10049 * When called with an rq lock held, restrict the operation 10050 * to the corresponding CPU to prevent ABBA deadlocks.
10051 */ 10052 if (locked_rq && rq != locked_rq) { 10053 scx_error(sch, "Invalid target CPU %d", cpu); 10054 return; 10055 } 10056 10057 /* 10058 * If no rq lock is held, allow to operate on any CPU by 10059 * acquiring the corresponding rq lock. 10060 */ 10061 if (!locked_rq) { 10062 rq_lock_irqsave(rq, &rf); 10063 update_rq_clock(rq); 10064 } 10065 10066 rq->scx.cpuperf_target = perf; 10067 cpufreq_update_util(rq, 0); 10068 10069 if (!locked_rq) 10070 rq_unlock_irqrestore(rq, &rf); 10071 } 10072 } 10073 10074 /** 10075 * scx_bpf_cidperf_set - Set the performance target of the CPU at @cid 10076 * @cid: cid of the CPU to target 10077 * @perf: target performance level [0, %SCX_CPUPERF_ONE] 10078 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10079 * 10080 * cid-addressed equivalent of scx_bpf_cpuperf_set(). 10081 */ 10082 __bpf_kfunc void scx_bpf_cidperf_set(s32 cid, u32 perf, 10083 const struct bpf_prog_aux *aux) 10084 { 10085 struct scx_sched *sch; 10086 s32 cpu; 10087 10088 guard(rcu)(); 10089 10090 sch = scx_prog_sched(aux); 10091 if (unlikely(!sch)) 10092 return; 10093 cpu = scx_cid_to_cpu(sch, cid); 10094 if (cpu < 0) 10095 return; /* @perf validation and rq locking are done by scx_bpf_cpuperf_set() */ 10096 scx_bpf_cpuperf_set(cpu, perf, aux); 10097 } 10098 10099 /** 10100 * scx_bpf_nr_node_ids - Return the number of possible node IDs 10101 * 10102 * All valid node IDs in the system are smaller than the returned value. 10103 */ 10104 __bpf_kfunc u32 scx_bpf_nr_node_ids(void) 10105 { 10106 return nr_node_ids; 10107 } 10108 10109 /** 10110 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs 10111 * 10112 * All valid CPU IDs in the system are smaller than the returned value. 10113 */ 10114 __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) 10115 { 10116 return nr_cpu_ids; 10117 } 10118 10119 /** 10120 * scx_bpf_nr_cids - Return the size of the cid space 10121 * 10122 * Equals num_possible_cpus(). All valid cids are in [0, return value).
10123 */ 10124 __bpf_kfunc u32 scx_bpf_nr_cids(void) 10125 { 10126 return num_possible_cpus(); 10127 } 10128 10129 /** 10130 * scx_bpf_nr_online_cids - Return current count of online CPUs in cid space 10131 * 10132 * Return num_online_cpus(). The standard model restarts the scheduler on 10133 * hotplug, which lets schedulers treat [0, nr_online_cids) as the online 10134 * range. Schedulers that prefer to handle hotplug without a restart should 10135 * install a custom mapping via scx_bpf_cid_override() and track onlining 10136 * through the ops.cid_online / ops.cid_offline callbacks. 10137 */ 10138 __bpf_kfunc u32 scx_bpf_nr_online_cids(void) 10139 { 10140 return num_online_cpus(); 10141 } 10142 10143 /** 10144 * scx_bpf_this_cid - Return the cid of the CPU this program is running on 10145 * 10146 * cid-addressed equivalent of bpf_get_smp_processor_id() for scx programs. 10147 * The current cpu is trivially valid, so this is just a table lookup. Return 10148 * -EINVAL if called from a non-SCX program before any scheduler has ever 10149 * been enabled (the cid table is still unallocated at that point).
10150 */ 10151 __bpf_kfunc s32 scx_bpf_this_cid(void) 10152 { 10153 s16 *tbl = READ_ONCE(scx_cpu_to_cid_tbl); 10154 10155 if (!tbl) 10156 return -EINVAL; 10157 return tbl[raw_smp_processor_id()]; 10158 } 10159 10160 /** 10161 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask 10162 */ 10163 __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) 10164 { 10165 return cpu_possible_mask; 10166 } 10167 10168 /** 10169 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask 10170 */ 10171 __bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void) 10172 { 10173 return cpu_online_mask; 10174 } 10175 10176 /** 10177 * scx_bpf_put_cpumask - Release a possible/online cpumask 10178 * @cpumask: cpumask to release 10179 */ 10180 __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) 10181 { 10182 /* 10183 * Empty function body because we aren't actually acquiring or releasing 10184 * a reference to a global cpumask, which is read-only in the caller and 10185 * is never released. The acquire / release semantics here are just used 10186 * to make the cpumask a trusted pointer in the caller. 10187 */ 10188 } 10189 10190 /** 10191 * scx_bpf_task_running - Is task currently running? 10192 * @p: task of interest 10193 */ 10194 __bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p) 10195 { 10196 return task_rq(p)->curr == p; 10197 } 10198 10199 /** 10200 * scx_bpf_task_cpu - CPU a task is currently associated with 10201 * @p: task of interest 10202 */ 10203 __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) 10204 { 10205 return task_cpu(p); 10206 } 10207 10208 /** 10209 * scx_bpf_task_cid - cid a task is currently associated with 10210 * @p: task of interest 10211 * 10212 * cid-addressed equivalent of scx_bpf_task_cpu(). task_cpu(p) is always a 10213 * valid cpu, so this is just a table lookup.
Return -EINVAL if called from 10214 * a non-SCX program before any scheduler has ever been enabled. 10215 */ 10216 __bpf_kfunc s32 scx_bpf_task_cid(const struct task_struct *p) 10217 { 10218 s16 *tbl = READ_ONCE(scx_cpu_to_cid_tbl); 10219 10220 if (!tbl) 10221 return -EINVAL; 10222 return tbl[task_cpu(p)]; 10223 } 10224 10225 /** 10226 * scx_bpf_cpu_rq - Fetch the rq of a CPU 10227 * @cpu: CPU of the rq 10228 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10229 */ 10230 __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux) 10231 { 10232 struct scx_sched *sch; 10233 10234 guard(rcu)(); 10235 10236 sch = scx_prog_sched(aux); 10237 if (unlikely(!sch)) 10238 return NULL; 10239 10240 if (!scx_cpu_valid(sch, cpu, NULL)) 10241 return NULL; 10242 10243 if (!sch->warned_deprecated_rq) { 10244 printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; " 10245 "use scx_bpf_locked_rq() when holding rq lock " 10246 "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__); 10247 sch->warned_deprecated_rq = true; 10248 } 10249 10250 return cpu_rq(cpu); 10251 } 10252 10253 /** 10254 * scx_bpf_locked_rq - Return the rq currently locked by SCX 10255 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10256 * 10257 * Returns the rq if a rq lock is currently held by SCX. 10258 * Otherwise emits an error and returns NULL. 
10259 */ 10260 __bpf_kfunc struct rq *scx_bpf_locked_rq(const struct bpf_prog_aux *aux) 10261 { 10262 struct scx_sched *sch; 10263 struct rq *rq; 10264 10265 guard(preempt)(); 10266 10267 sch = scx_prog_sched(aux); 10268 if (unlikely(!sch)) 10269 return NULL; 10270 10271 rq = scx_locked_rq(); 10272 if (!rq) { 10273 scx_error(sch, "accessing rq without holding rq lock"); 10274 return NULL; 10275 } 10276 10277 return rq; 10278 } 10279 10280 /** 10281 * scx_bpf_cpu_curr - Return remote CPU's curr task 10282 * @cpu: CPU of interest 10283 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10284 * 10285 * Callers must hold RCU read lock (KF_RCU). 10286 */ 10287 __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_aux *aux) 10288 { 10289 struct scx_sched *sch; 10290 10291 guard(rcu)(); 10292 10293 sch = scx_prog_sched(aux); 10294 if (unlikely(!sch)) 10295 return NULL; 10296 10297 if (!scx_cpu_valid(sch, cpu, NULL)) 10298 return NULL; 10299 10300 return rcu_dereference(cpu_rq(cpu)->curr); 10301 } 10302 10303 /** 10304 * scx_bpf_cid_curr - Return the curr task on the CPU at @cid 10305 * @cid: cid of interest 10306 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10307 * 10308 * cid-addressed equivalent of scx_bpf_cpu_curr(). Callers must hold RCU 10309 * read lock (KF_RCU). 10310 */ 10311 __bpf_kfunc struct task_struct *scx_bpf_cid_curr(s32 cid, const struct bpf_prog_aux *aux) 10312 { 10313 struct scx_sched *sch; 10314 s32 cpu; 10315 10316 guard(rcu)(); 10317 10318 sch = scx_prog_sched(aux); 10319 if (unlikely(!sch)) 10320 return NULL; 10321 cpu = scx_cid_to_cpu(sch, cid); 10322 if (cpu < 0) 10323 return NULL; 10324 return rcu_dereference(cpu_rq(cpu)->curr); 10325 } 10326 10327 /** 10328 * scx_bpf_tid_to_task - Look up a task by its scx tid 10329 * @tid: task ID previously read from p->scx.tid 10330 * 10331 * Returns the task with the given tid, or NULL if no such task exists. 
The 10332 * returned pointer is valid until the end of the current RCU read section 10333 * (KF_RCU_PROTECTED). Requires SCX_OPS_TID_TO_TASK to be set on the root 10334 * scheduler; otherwise an error is raised and NULL returned. 10335 */ 10336 __bpf_kfunc struct task_struct *scx_bpf_tid_to_task(u64 tid) 10337 { 10338 struct sched_ext_entity *scx; 10339 10340 if (!scx_tid_to_task_enabled()) { 10341 struct scx_sched *sch = rcu_dereference(scx_root); 10342 10343 if (sch) 10344 scx_error(sch, "scx_bpf_tid_to_task() called without SCX_OPS_TID_TO_TASK"); 10345 return NULL; 10346 } 10347 10348 scx = rhashtable_lookup(&scx_tid_hash, &tid, scx_tid_hash_params); 10349 if (!scx) 10350 return NULL; 10351 10352 return container_of(scx, struct task_struct, scx); 10353 } 10354 10355 /** 10356 * scx_bpf_now - Returns a high-performance monotonically non-decreasing 10357 * clock for the current CPU. The clock returned is in nanoseconds. 10358 * 10359 * It provides the following properties: 10360 * 10361 * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently 10362 * to account for execution time and track tasks' runtime properties. 10363 * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which 10364 * eventually reads a hardware timestamp counter -- is neither performant nor 10365 * scalable. scx_bpf_now() aims to provide a high-performance clock by 10366 * using the rq clock in the scheduler core whenever possible. 10367 * 10368 * 2) High enough resolution for the BPF scheduler use cases: In most BPF 10369 * scheduler use cases, the required clock resolution is lower than the most 10370 * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically 10371 * uses the rq clock in the scheduler core whenever it is valid. It considers 10372 * that the rq clock is valid from the time the rq clock is updated 10373 * (update_rq_clock) until the rq is unlocked (rq_unpin_lock). 
10374 * 10375 * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now() 10376 * guarantees the clock never goes backward when comparing them in the same 10377 * CPU. On the other hand, when comparing clocks in different CPUs, there 10378 * is no such guarantee -- the clock can go backward. It provides a 10379 * monotonically *non-decreasing* clock so that it would provide the same 10380 * clock values in two different scx_bpf_now() calls in the same CPU 10381 * during the same period of when the rq clock is valid. 10382 */ 10383 __bpf_kfunc u64 scx_bpf_now(void) 10384 { 10385 struct rq *rq; 10386 u64 clock; 10387 10388 preempt_disable(); 10389 10390 rq = this_rq(); 10391 if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) { 10392 /* 10393 * If the rq clock is valid, use the cached rq clock. 10394 * 10395 * Note that scx_bpf_now() is re-entrant between a process 10396 * context and an interrupt context (e.g., timer interrupt). 10397 * However, we don't need to consider the race between them 10398 * because such race is not observable from a caller. 10399 */ 10400 clock = READ_ONCE(rq->scx.clock); 10401 } else { 10402 /* 10403 * Otherwise, return a fresh rq clock. 10404 * 10405 * The rq clock is updated outside of the rq lock. 10406 * In this case, keep the updated rq clock invalid so the next 10407 * kfunc call outside the rq lock gets a fresh rq clock. 10408 */ 10409 clock = sched_clock_cpu(cpu_of(rq)); 10410 } 10411 10412 preempt_enable(); 10413 10414 return clock; 10415 } 10416 10417 static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events) 10418 { 10419 struct scx_event_stats *e_cpu; 10420 int cpu; 10421 10422 /* Aggregate per-CPU event counters into @events. 
*/ 10423 memset(events, 0, sizeof(*events)); 10424 for_each_possible_cpu(cpu) { 10425 e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; 10426 scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); 10427 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 10428 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); 10429 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING); 10430 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 10431 scx_agg_event(events, e_cpu, SCX_EV_REENQ_IMMED); 10432 scx_agg_event(events, e_cpu, SCX_EV_REENQ_LOCAL_REPEAT); 10433 scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL); 10434 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION); 10435 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH); 10436 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE); 10437 scx_agg_event(events, e_cpu, SCX_EV_INSERT_NOT_OWNED); 10438 scx_agg_event(events, e_cpu, SCX_EV_SUB_BYPASS_DISPATCH); 10439 } 10440 } 10441 10442 /* 10443 * scx_bpf_events - Get a system-wide event counter to 10444 * @events: output buffer from a BPF program 10445 * @events__sz: @events len, must end in '__sz'' for the verifier 10446 */ 10447 __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, 10448 size_t events__sz) 10449 { 10450 struct scx_sched *sch; 10451 struct scx_event_stats e_sys; 10452 10453 rcu_read_lock(); 10454 sch = rcu_dereference(scx_root); 10455 if (sch) 10456 scx_read_events(sch, &e_sys); 10457 else 10458 memset(&e_sys, 0, sizeof(e_sys)); 10459 rcu_read_unlock(); 10460 10461 /* 10462 * We cannot entirely trust a BPF-provided size since a BPF program 10463 * might be compiled against a different vmlinux.h, of which 10464 * scx_event_stats would be larger (a newer vmlinux.h) or smaller 10465 * (an older vmlinux.h). Hence, we use the smaller size to avoid 10466 * memory corruption. 
10467 */ 10468 events__sz = min(events__sz, sizeof(*events)); 10469 memcpy(events, &e_sys, events__sz); 10470 } 10471 10472 #ifdef CONFIG_CGROUP_SCHED 10473 /** 10474 * scx_bpf_task_cgroup - Return the sched cgroup of a task 10475 * @p: task of interest 10476 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10477 * 10478 * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with 10479 * from the scheduler's POV. SCX operations should use this function to 10480 * determine @p's current cgroup as, unlike following @p->cgroups, 10481 * @p->sched_task_group is stable for the duration of the SCX op. See 10482 * SCX_CALL_OP_TASK() for details. 10483 */ 10484 __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p, 10485 const struct bpf_prog_aux *aux) 10486 { 10487 struct task_group *tg = p->sched_task_group; 10488 struct cgroup *cgrp = &cgrp_dfl_root.cgrp; 10489 struct scx_sched *sch; 10490 10491 guard(rcu)(); 10492 10493 sch = scx_prog_sched(aux); 10494 if (unlikely(!sch)) 10495 goto out; 10496 10497 if (!scx_kf_arg_task_ok(sch, p)) 10498 goto out; 10499 10500 cgrp = tg_cgrp(tg); 10501 10502 out: 10503 cgroup_get(cgrp); 10504 return cgrp; 10505 } 10506 #endif /* CONFIG_CGROUP_SCHED */ 10507 10508 __bpf_kfunc_end_defs(); 10509 10510 BTF_KFUNCS_START(scx_kfunc_ids_any) 10511 BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); 10512 BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); 10513 BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) 10514 BTF_ID_FLAGS(func, scx_bpf_kick_cid, KF_IMPLICIT_ARGS) 10515 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS) 10516 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS) 10517 BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) 10518 BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS) 10519 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS) 10520 
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_IMPLICIT_ARGS | KF_ITER_NEW | KF_RCU_PROTECTED) 10521 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) 10522 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) 10523 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS) 10524 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS) 10525 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS) 10526 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) 10527 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) 10528 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) 10529 BTF_ID_FLAGS(func, scx_bpf_cidperf_cap, KF_IMPLICIT_ARGS) 10530 BTF_ID_FLAGS(func, scx_bpf_cidperf_cur, KF_IMPLICIT_ARGS) 10531 BTF_ID_FLAGS(func, scx_bpf_cidperf_set, KF_IMPLICIT_ARGS) 10532 BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) 10533 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) 10534 BTF_ID_FLAGS(func, scx_bpf_nr_cids) 10535 BTF_ID_FLAGS(func, scx_bpf_nr_online_cids) 10536 BTF_ID_FLAGS(func, scx_bpf_this_cid) 10537 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) 10538 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) 10539 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) 10540 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) 10541 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) 10542 BTF_ID_FLAGS(func, scx_bpf_task_cid, KF_RCU) 10543 BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) 10544 BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL) 10545 BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) 10546 BTF_ID_FLAGS(func, scx_bpf_cid_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) 10547 BTF_ID_FLAGS(func, scx_bpf_tid_to_task, KF_RET_NULL | KF_RCU_PROTECTED) 10548 BTF_ID_FLAGS(func, scx_bpf_now) 10549 BTF_ID_FLAGS(func, scx_bpf_events) 10550 #ifdef CONFIG_CGROUP_SCHED 10551 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_IMPLICIT_ARGS | KF_RCU | KF_ACQUIRE) 10552 #endif 10553 
BTF_KFUNCS_END(scx_kfunc_ids_any) 10554 10555 static const struct btf_kfunc_id_set scx_kfunc_set_any = { 10556 .owner = THIS_MODULE, 10557 .set = &scx_kfunc_ids_any, 10558 .filter = scx_kfunc_context_filter, 10559 }; 10560 10561 /* 10562 * cpu-form kfuncs that are forbidden from cid-form schedulers 10563 * (bpf_sched_ext_ops_cid). Programs targeting the cid struct_ops type must 10564 * use the cid-form alternative (cid/cmask kfuncs). 10565 * 10566 * Membership overlaps with scx_kfunc_ids_{any,idle,select_cpu}; the filter 10567 * tests this set independently and rejects matches before the per-op 10568 * allow-list check runs. 10569 * 10570 * pahole/resolve_btfids scans every BTF_ID_FLAGS() at build time and 10571 * intersects flags across duplicate entries, so each entry must carry the 10572 * same flags as the kfunc's primary declaration; otherwise the flags get 10573 * dropped globally. 10574 */ 10575 BTF_KFUNCS_START(scx_kfunc_ids_cpu_only) 10576 BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) 10577 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) 10578 BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) 10579 BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) 10580 BTF_ID_FLAGS(func, scx_bpf_cpu_node, KF_IMPLICIT_ARGS) 10581 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) 10582 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) 10583 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) 10584 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) 10585 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) 10586 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) 10587 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) 10588 BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) 10589 BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) 10590 BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_IMPLICIT_ARGS | KF_ACQUIRE) 10591 BTF_ID_FLAGS(func, 
scx_bpf_get_idle_cpumask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE) 10592 BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_IMPLICIT_ARGS | KF_ACQUIRE) 10593 BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE) 10594 BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) 10595 BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle, KF_IMPLICIT_ARGS) 10596 BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_IMPLICIT_ARGS | KF_RCU) 10597 BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_IMPLICIT_ARGS | KF_RCU) 10598 BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_IMPLICIT_ARGS | KF_RCU) 10599 BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_IMPLICIT_ARGS | KF_RCU) 10600 BTF_KFUNCS_END(scx_kfunc_ids_cpu_only) 10601 10602 /* 10603 * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc 10604 * group; an op may permit zero or more groups, with the union expressed in 10605 * scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter()) 10606 * consults this table to decide whether a context-sensitive kfunc is callable 10607 * from a given SCX op. 10608 */ 10609 enum scx_kf_allow_flags { 10610 SCX_KF_ALLOW_UNLOCKED = 1 << 0, 10611 SCX_KF_ALLOW_INIT = 1 << 1, 10612 SCX_KF_ALLOW_CPU_RELEASE = 1 << 2, 10613 SCX_KF_ALLOW_DISPATCH = 1 << 3, 10614 SCX_KF_ALLOW_ENQUEUE = 1 << 4, 10615 SCX_KF_ALLOW_SELECT_CPU = 1 << 5, 10616 }; 10617 10618 /* 10619 * Map each SCX op to the union of kfunc groups it permits, indexed by 10620 * SCX_OP_IDX(op). Ops not listed only permit kfuncs that are not 10621 * context-sensitive. 
10622 */ 10623 static const u32 scx_kf_allow_flags[] = { 10624 [SCX_OP_IDX(select_cpu)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, 10625 [SCX_OP_IDX(enqueue)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, 10626 [SCX_OP_IDX(dispatch)] = SCX_KF_ALLOW_ENQUEUE | SCX_KF_ALLOW_DISPATCH, 10627 [SCX_OP_IDX(cpu_release)] = SCX_KF_ALLOW_CPU_RELEASE, 10628 [SCX_OP_IDX(init_task)] = SCX_KF_ALLOW_UNLOCKED, 10629 [SCX_OP_IDX(dump)] = SCX_KF_ALLOW_UNLOCKED, 10630 #ifdef CONFIG_EXT_GROUP_SCHED 10631 [SCX_OP_IDX(cgroup_init)] = SCX_KF_ALLOW_UNLOCKED, 10632 [SCX_OP_IDX(cgroup_exit)] = SCX_KF_ALLOW_UNLOCKED, 10633 [SCX_OP_IDX(cgroup_prep_move)] = SCX_KF_ALLOW_UNLOCKED, 10634 [SCX_OP_IDX(cgroup_cancel_move)] = SCX_KF_ALLOW_UNLOCKED, 10635 [SCX_OP_IDX(cgroup_set_weight)] = SCX_KF_ALLOW_UNLOCKED, 10636 [SCX_OP_IDX(cgroup_set_bandwidth)] = SCX_KF_ALLOW_UNLOCKED, 10637 [SCX_OP_IDX(cgroup_set_idle)] = SCX_KF_ALLOW_UNLOCKED, 10638 #endif /* CONFIG_EXT_GROUP_SCHED */ 10639 [SCX_OP_IDX(sub_attach)] = SCX_KF_ALLOW_UNLOCKED, 10640 [SCX_OP_IDX(sub_detach)] = SCX_KF_ALLOW_UNLOCKED, 10641 [SCX_OP_IDX(cpu_online)] = SCX_KF_ALLOW_UNLOCKED, 10642 [SCX_OP_IDX(cpu_offline)] = SCX_KF_ALLOW_UNLOCKED, 10643 [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED | SCX_KF_ALLOW_INIT, 10644 [SCX_OP_IDX(exit)] = SCX_KF_ALLOW_UNLOCKED, 10645 }; 10646 10647 /* 10648 * Verifier-time filter for SCX kfuncs. Registered via the .filter field on 10649 * each per-group btf_kfunc_id_set. The BPF core invokes this for every kfunc 10650 * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or 10651 * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the 10652 * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by 10653 * falling through to "allow" when none of the SCX sets contain the kfunc. 
10654 */ 10655 int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) 10656 { 10657 bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id); 10658 bool in_init = btf_id_set8_contains(&scx_kfunc_ids_init, kfunc_id); 10659 bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id); 10660 bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); 10661 bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); 10662 bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); 10663 bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id); 10664 bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id); 10665 bool in_cpu_only = btf_id_set8_contains(&scx_kfunc_ids_cpu_only, kfunc_id); 10666 u32 moff, flags; 10667 10668 /* Not an SCX kfunc - allow. */ 10669 if (!(in_unlocked || in_init || in_select_cpu || in_enqueue || in_dispatch || 10670 in_cpu_release || in_idle || in_any)) 10671 return 0; 10672 10673 /* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */ 10674 if (prog->type == BPF_PROG_TYPE_SYSCALL) 10675 return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES; 10676 10677 if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) 10678 return (in_any || in_idle) ? 0 : -EACCES; 10679 10680 /* 10681 * add_subprog_and_kfunc() collects all kfunc calls, including dead code 10682 * guarded by bpf_ksym_exists(), before check_attach_btf_id() sets 10683 * prog->aux->st_ops. Allow all kfuncs when st_ops is not yet set; 10684 * do_check_main() re-runs the filter with st_ops set and enforces the 10685 * actual restrictions. 10686 */ 10687 if (!prog->aux->st_ops) 10688 return 0; 10689 10690 /* 10691 * Non-SCX struct_ops: SCX kfuncs are not permitted. 10692 * 10693 * Both bpf_sched_ext_ops (cpu-form) and bpf_sched_ext_ops_cid 10694 * (cid-form) are valid SCX struct_ops. 
Member offsets match between 10695 * the two (verified by BUILD_BUG_ON in scx_init()), so the shared 10696 * scx_kf_allow_flags[] table indexed by SCX_MOFF_IDX(moff) applies to 10697 * both. 10698 */ 10699 if (prog->aux->st_ops != &bpf_sched_ext_ops && 10700 prog->aux->st_ops != &bpf_sched_ext_ops_cid) 10701 return -EACCES; 10702 10703 /* 10704 * cid-form schedulers must use cid/cmask kfuncs. cid and cpu are both 10705 * small s32s and trivially confused, so cpu-only kfuncs are rejected at 10706 * load time. The reverse (cpu-form calling cid-form kfuncs) is 10707 * intentionally permissive to ease gradual cpumask -> cid migration. 10708 */ 10709 if (prog->aux->st_ops == &bpf_sched_ext_ops_cid && in_cpu_only) 10710 return -EACCES; 10711 10712 /* SCX struct_ops: check the per-op allow list. */ 10713 if (in_any || in_idle) 10714 return 0; 10715 10716 moff = prog->aux->attach_st_ops_member_off; 10717 flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)]; 10718 10719 if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked) 10720 return 0; 10721 if ((flags & SCX_KF_ALLOW_INIT) && in_init) 10722 return 0; 10723 if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release) 10724 return 0; 10725 if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch) 10726 return 0; 10727 if ((flags & SCX_KF_ALLOW_ENQUEUE) && in_enqueue) 10728 return 0; 10729 if ((flags & SCX_KF_ALLOW_SELECT_CPU) && in_select_cpu) 10730 return 0; 10731 10732 return -EACCES; 10733 } 10734 10735 static int __init scx_init(void) 10736 { 10737 int ret; 10738 10739 /* 10740 * sched_ext_ops_cid mirrors sched_ext_ops up to and including @priv. 10741 * Both bpf_scx_init_member() and bpf_scx_check_member() use offsets 10742 * from struct sched_ext_ops; sched_ext_ops_cid relies on those offsets 10743 * matching for the shared fields. Catch any drift at boot. 
10744 */ 10745 #define CID_OFFSET_MATCH(cpu_field, cid_field) \ 10746 BUILD_BUG_ON(offsetof(struct sched_ext_ops, cpu_field) != \ 10747 offsetof(struct sched_ext_ops_cid, cid_field)) 10748 /* data fields used by bpf_scx_init_member() */ 10749 CID_OFFSET_MATCH(dispatch_max_batch, dispatch_max_batch); 10750 CID_OFFSET_MATCH(flags, flags); 10751 CID_OFFSET_MATCH(name, name); 10752 CID_OFFSET_MATCH(timeout_ms, timeout_ms); 10753 CID_OFFSET_MATCH(exit_dump_len, exit_dump_len); 10754 CID_OFFSET_MATCH(hotplug_seq, hotplug_seq); 10755 CID_OFFSET_MATCH(sub_cgroup_id, sub_cgroup_id); 10756 /* shared callbacks: the union view requires byte-for-byte offset match */ 10757 CID_OFFSET_MATCH(enqueue, enqueue); 10758 CID_OFFSET_MATCH(dequeue, dequeue); 10759 CID_OFFSET_MATCH(dispatch, dispatch); 10760 CID_OFFSET_MATCH(tick, tick); 10761 CID_OFFSET_MATCH(runnable, runnable); 10762 CID_OFFSET_MATCH(running, running); 10763 CID_OFFSET_MATCH(stopping, stopping); 10764 CID_OFFSET_MATCH(quiescent, quiescent); 10765 CID_OFFSET_MATCH(yield, yield); 10766 CID_OFFSET_MATCH(core_sched_before, core_sched_before); 10767 CID_OFFSET_MATCH(set_weight, set_weight); 10768 CID_OFFSET_MATCH(update_idle, update_idle); 10769 CID_OFFSET_MATCH(init_task, init_task); 10770 CID_OFFSET_MATCH(exit_task, exit_task); 10771 CID_OFFSET_MATCH(enable, enable); 10772 CID_OFFSET_MATCH(disable, disable); 10773 CID_OFFSET_MATCH(dump, dump); 10774 CID_OFFSET_MATCH(dump_task, dump_task); 10775 CID_OFFSET_MATCH(sub_attach, sub_attach); 10776 CID_OFFSET_MATCH(sub_detach, sub_detach); 10777 CID_OFFSET_MATCH(init, init); 10778 CID_OFFSET_MATCH(exit, exit); 10779 #ifdef CONFIG_EXT_GROUP_SCHED 10780 CID_OFFSET_MATCH(cgroup_init, cgroup_init); 10781 CID_OFFSET_MATCH(cgroup_exit, cgroup_exit); 10782 CID_OFFSET_MATCH(cgroup_prep_move, cgroup_prep_move); 10783 CID_OFFSET_MATCH(cgroup_move, cgroup_move); 10784 CID_OFFSET_MATCH(cgroup_cancel_move, cgroup_cancel_move); 10785 CID_OFFSET_MATCH(cgroup_set_weight, cgroup_set_weight); 
10786 CID_OFFSET_MATCH(cgroup_set_bandwidth, cgroup_set_bandwidth); 10787 CID_OFFSET_MATCH(cgroup_set_idle, cgroup_set_idle); 10788 #endif 10789 /* renamed callbacks must occupy the same slot as their cpu-form sibling */ 10790 CID_OFFSET_MATCH(select_cpu, select_cid); 10791 CID_OFFSET_MATCH(set_cpumask, set_cmask); 10792 CID_OFFSET_MATCH(cpu_online, cid_online); 10793 CID_OFFSET_MATCH(cpu_offline, cid_offline); 10794 CID_OFFSET_MATCH(dump_cpu, dump_cid); 10795 /* @priv tail must align since both share the same data block */ 10796 CID_OFFSET_MATCH(priv, priv); 10797 /* 10798 * cid-form must end exactly at @priv - validate_ops() skips 10799 * cpu_acquire/cpu_release for cid-form because reading those fields 10800 * past the BPF allocation would be UB. 10801 */ 10802 BUILD_BUG_ON(offsetof(struct sched_ext_ops_cid, __end) != 10803 offsetofend(struct sched_ext_ops, priv)); 10804 #undef CID_OFFSET_MATCH 10805 10806 /* 10807 * kfunc registration can't be done from init_sched_ext_class() as 10808 * register_btf_kfunc_id_set() needs most of the system to be up. 10809 * 10810 * Some kfuncs are context-sensitive and can only be called from 10811 * specific SCX ops. They are grouped into per-context BTF sets, each 10812 * registered with scx_kfunc_context_filter as its .filter callback. The 10813 * BPF core dedups identical filter pointers per hook 10814 * (btf_populate_kfunc_set()), so the filter is invoked exactly once per 10815 * kfunc lookup; it consults scx_kf_allow_flags[] to enforce per-op 10816 * restrictions at verify time. 
10817 */ 10818 if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10819 &scx_kfunc_set_enqueue_dispatch)) || 10820 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10821 &scx_kfunc_set_dispatch)) || 10822 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10823 &scx_kfunc_set_cpu_release)) || 10824 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10825 &scx_kfunc_set_unlocked)) || 10826 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 10827 &scx_kfunc_set_unlocked)) || 10828 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10829 &scx_kfunc_set_any)) || 10830 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, 10831 &scx_kfunc_set_any)) || 10832 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 10833 &scx_kfunc_set_any))) { 10834 pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); 10835 return ret; 10836 } 10837 10838 ret = scx_idle_init(); 10839 if (ret) { 10840 pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); 10841 return ret; 10842 } 10843 10844 ret = scx_cid_kfunc_init(); 10845 if (ret) { 10846 pr_err("sched_ext: Failed to register cid kfuncs (%d)\n", ret); 10847 return ret; 10848 } 10849 10850 ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); 10851 if (ret) { 10852 pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); 10853 return ret; 10854 } 10855 10856 ret = register_bpf_struct_ops(&bpf_sched_ext_ops_cid, sched_ext_ops_cid); 10857 if (ret) { 10858 pr_err("sched_ext: Failed to register cid struct_ops (%d)\n", ret); 10859 return ret; 10860 } 10861 10862 ret = register_pm_notifier(&scx_pm_notifier); 10863 if (ret) { 10864 pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); 10865 return ret; 10866 } 10867 10868 scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); 10869 if (!scx_kset) { 10870 pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); 10871 return -ENOMEM; 10872 } 10873 10874 ret = 
sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); 10875 if (ret < 0) { 10876 pr_err("sched_ext: Failed to add global attributes\n"); 10877 return ret; 10878 } 10879 10880 return 0; 10881 } 10882 __initcall(scx_init); 10883