/* SPDX-License-Identifier: GPL-2.0 */
/*
 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <linux/bitmap.h>
#include <linux/btf_ids.h>
#include <linux/rhashtable.h>
#include <linux/sched/clock.h>
#include <linux/sched/isolation.h>
#include <linux/suspend.h>
#include <linux/sysrq.h>

#include "../pelt.h"
#include "internal.h"
#include "cid.h"
#include "arena.h"
#include "idle.h"

/*
 * Held, together with scx_enable_mutex, by writers of scx_sched_all. Also
 * covers insertions into scx_sched_hash (see scx_sched_hash_params).
 */
static DEFINE_RAW_SPINLOCK(scx_sched_lock);

/*
 * NOTE: sched_ext is in the process of growing multiple scheduler support and
 * scx_root usage is in a transitional state. Naked dereferences are safe if the
 * caller is one of the tasks attached to SCX and explicit RCU dereference is
 * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but
 * are used as temporary markers to indicate that the dereferences need to be
 * updated to point to the associated scheduler instances rather than scx_root.
 */
struct scx_sched __rcu *scx_root;

/*
 * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock.
 * Readers can hold either or rcu_read_lock().
 */
static LIST_HEAD(scx_sched_all);

#ifdef CONFIG_EXT_SUB_SCHED
/* scx_sched instances keyed by ops.sub_cgroup_id, see scx_find_sub_sched() */
static const struct rhashtable_params scx_sched_hash_params = {
	.key_len		= sizeof_field(struct scx_sched, ops.sub_cgroup_id),
	.key_offset		= offsetof(struct scx_sched, ops.sub_cgroup_id),
	.head_offset		= offsetof(struct scx_sched, hash_node),
	.insecure_elasticity	= true,	/* inserted under scx_sched_lock */
};

static struct rhashtable scx_sched_hash;
#endif

/* see SCX_OPS_TID_TO_TASK */
static const struct rhashtable_params scx_tid_hash_params = {
	.key_len		= sizeof_field(struct sched_ext_entity, tid),
	.key_offset		= offsetof(struct sched_ext_entity, tid),
	.head_offset		= offsetof(struct sched_ext_entity, tid_hash_node),
	.insecure_elasticity	= true,	/* inserted/removed under scx_tasks_lock */
};
static struct rhashtable scx_tid_hash;

/*
 * During exit, a task may schedule after losing its PIDs. When disabling the
 * BPF scheduler, we need to be able to iterate tasks in every state to
 * guarantee system safety. Maintain a dedicated task list which contains every
 * task between its fork and eventual free.
 */
static DEFINE_RAW_SPINLOCK(scx_tasks_lock);
static LIST_HEAD(scx_tasks);

/* ops enable/disable */
static DEFINE_MUTEX(scx_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
/* current enable state, accessed through scx_enable_state() and friends */
static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
static DEFINE_RAW_SPINLOCK(scx_bypass_lock);
static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
static DEFINE_STATIC_KEY_FALSE(__scx_tid_to_task_enabled);

/*
 * True once SCX_OPS_TID_TO_TASK has been negotiated with the root scheduler
 * and the tid->task table is live. Wraps the static key so callers don't
 * take the address, and hints "likely enabled" for the common case where
 * the feature is in use.
*/
static inline bool scx_tid_to_task_enabled(void)
{
	/* the key is defined FALSE; static_branch_likely() only biases codegen */
	return static_branch_likely(&__scx_tid_to_task_enabled);
}

static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);

/* Global cursor for the per-CPU tid allocator. Starts at 1; tid 0 is reserved. */
static atomic64_t scx_tid_cursor = ATOMIC64_INIT(1);

#ifdef CONFIG_EXT_SUB_SCHED
/*
 * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit
 * tasks for the sub-sched being enabled. Use a global variable instead of a
 * per-task field as all enables are serialized.
 */
static struct scx_sched *scx_enabling_sub_sched;
#else
/* no sub-scheds: always evaluates to a NULL scx_sched pointer */
#define scx_enabling_sub_sched	(struct scx_sched *)NULL
#endif	/* CONFIG_EXT_SUB_SCHED */

/*
 * A monotonically increasing sequence number that is incremented every time a
 * scheduler is enabled. This can be used to check if any custom sched_ext
 * scheduler has ever been used in the system.
 */
static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);

/*
 * Watchdog interval. All scx_sched's share a single watchdog timer and the
 * interval is half of the shortest sch->watchdog_timeout.
 */
static unsigned long scx_watchdog_interval;

/*
 * The last time the delayed work was run. This delayed work relies on
 * ksoftirqd being able to run to service timer interrupts, so it's possible
 * that this work itself could get wedged. To account for this, we check that
 * it's not stalled in the timer tick, and trigger an error if it is.
 */
static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;

/* NOTE(review): presumably the delayed work described above - confirm at its init site */
static struct delayed_work scx_watchdog_work;

/*
 * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence
 * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
 * allocator limits on large machines.
* O(nr_cpu_ids^2) allocation, allocated
 * lazily when enabling and freed when disabling to avoid waste when sched_ext
 * isn't active.
 */
struct scx_kick_syncs {
	struct rcu_head		rcu;
	unsigned long		syncs[];
};

static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);

/*
 * Per-CPU buffered allocator state for p->scx.tid. Each CPU pulls a chunk of
 * SCX_TID_CHUNK ids from scx_tid_cursor and hands them out locally without
 * further synchronization. See scx_alloc_tid().
 */
struct scx_tid_alloc {
	u64	next;
	u64	end;
};
static DEFINE_PER_CPU(struct scx_tid_alloc, scx_tid_alloc);

/*
 * Direct dispatch marker.
 *
 * Non-NULL values are used for direct dispatch from enqueue path. A valid
 * pointer points to the task currently being enqueued. An ERR_PTR value is used
 * to indicate that direct dispatch has already happened.
 */
static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);

/* DSQs hashed by scx_dispatch_q.id, looked up by find_user_dsq() */
static const struct rhashtable_params dsq_hash_params = {
	.key_len		= sizeof_field(struct scx_dispatch_q, id),
	.key_offset		= offsetof(struct scx_dispatch_q, id),
	.head_offset		= offsetof(struct scx_dispatch_q, hash_node),
};

static LLIST_HEAD(dsqs_to_free);

/* string formatting from BPF */
struct scx_bstr_buf {
	u64			data[MAX_BPRINTF_VARARGS];
	char			line[SCX_EXIT_MSG_LEN];
};

static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock);
static struct scx_bstr_buf scx_exit_bstr_buf;

/* ops debug dump */
static DEFINE_RAW_SPINLOCK(scx_dump_lock);

struct scx_dump_data {
	s32			cpu;
	bool			first;
	s32			cursor;
	struct seq_buf		*s;
	const char		*prefix;
	struct scx_bstr_buf	buf;
};

/* .cpu starts out as -1; every other field is zero-initialized */
static struct scx_dump_data scx_dump_data = {
	.cpu			= -1,
};

/* /sys/kernel/sched_ext interface */
static struct kset *scx_kset;

/*
 * Parameters that can be adjusted
* through /sys/module/sched_ext/parameters.
 * There usually is no reason to modify these as normal scheduler operation
 * shouldn't be affected by them. The knobs are primarily for debugging.
 */
static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US;

/* Setter for slice_bypass_us: accepts 100 to 100 * USEC_PER_MSEC, inclusive. */
static int set_slice_us(const char *val, const struct kernel_param *kp)
{
	return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC);
}

static const struct kernel_param_ops slice_us_param_ops = {
	.set = set_slice_us,
	.get = param_get_uint,
};

/* Setter for bypass_lb_intv_us: accepts 0 to 10 * USEC_PER_SEC, inclusive. */
static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp)
{
	return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC);
}

static const struct kernel_param_ops bypass_lb_intv_us_param_ops = {
	.set = set_bypass_lb_intv_us,
	.get = param_get_uint,
};

/* expose the knobs below with the "sched_ext." prefix */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX	"sched_ext."

module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600);
MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)");
module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600);
MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)");

#undef MODULE_PARAM_PREFIX

#define CREATE_TRACE_POINTS
#include <trace/events/sched_ext.h>

static void run_deferred(struct rq *rq);
static bool task_dead_and_done(struct task_struct *p);
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind);

/*
 * Variadic front-end of scx_vexit(): packs the printf-style arguments into a
 * va_list, forwards everything to scx_vexit() and returns its result.
 */
__printf(5, 6) bool __scx_exit(struct scx_sched *sch,
			       enum scx_exit_kind kind, s64 exit_code,
			       s32 exit_cpu, const char *fmt, ...)
{
	va_list args;
	bool ret;

	va_start(args, fmt);
	ret = scx_vexit(sch, kind, exit_code, exit_cpu, fmt, args);
	va_end(args);

	return ret;
}

/*
 * Signed distance from @now to @at in milliseconds: positive when @at is after
 * @now according to time_after(), zero or negative otherwise.
 */
static long jiffies_delta_msecs(unsigned long at, unsigned long now)
{
	if (time_after(at, now))
		return jiffies_to_msecs(at - now);
	else
		return -(long)jiffies_to_msecs(now - at);
}

/*
 * Wraparound-tolerant ordering test for u32 sequence numbers: true when the
 * difference @a - @b is negative once reinterpreted as s32.
 */
static bool u32_before(u32 a, u32 b)
{
	return (s32)(a - b) < 0;
}

#ifdef CONFIG_EXT_SUB_SCHED
/**
 * scx_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: sched whose descendants to walk
 *
 * To be used by scx_for_each_descendant_pre(). Find the next descendant to
 * visit for pre-order traversal of @root's descendants. @root is included in
 * the iteration and the first node to be visited.
 */
static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos,
						 struct scx_sched *root)
{
	struct scx_sched *next;

	lockdep_assert(lockdep_is_held(&scx_enable_mutex) ||
		       lockdep_is_held(&scx_sched_lock));

	/* if first iteration, visit @root */
	if (!pos)
		return root;

	/* visit the first child if exists */
	next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling);
	if (next)
		return next;

	/* no child, visit my or the closest ancestor's next sibling */
	while (pos != root) {
		if (!list_is_last(&pos->sibling, &scx_parent(pos)->children))
			return list_next_entry(pos, sibling);
		pos = scx_parent(pos);
	}

	/* climbed back up to @root without finding a sibling, walk is over */
	return NULL;
}

/* Look up the sched hashed under @cgroup_id in scx_sched_hash. */
static struct scx_sched *scx_find_sub_sched(u64 cgroup_id)
{
	return rhashtable_lookup(&scx_sched_hash, &cgroup_id,
				 scx_sched_hash_params);
}

/* Publish @sch as @p's sched through rcu_assign_pointer(). */
static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch)
{
	rcu_assign_pointer(p->scx.sched, sch);
}
#else	/*
CONFIG_EXT_SUB_SCHED */ 323 static inline struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } 324 static inline void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} 325 #endif /* CONFIG_EXT_SUB_SCHED */ 326 327 /** 328 * scx_is_descendant - Test whether sched is a descendant 329 * @sch: sched to test 330 * @ancestor: ancestor sched to test against 331 * 332 * Test whether @sch is a descendant of @ancestor. 333 */ 334 static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor) 335 { 336 if (sch->level < ancestor->level) 337 return false; 338 return sch->ancestors[ancestor->level] == ancestor; 339 } 340 341 /** 342 * scx_for_each_descendant_pre - pre-order walk of a sched's descendants 343 * @pos: iteration cursor 344 * @root: sched to walk the descendants of 345 * 346 * Walk @root's descendants. @root is included in the iteration and the first 347 * node to be visited. Must be called with either scx_enable_mutex or 348 * scx_sched_lock held. 
*/
#define scx_for_each_descendant_pre(pos, root)					\
	for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos);		\
	     (pos) = scx_next_descendant_pre((pos), (root)))

/* Return @sch's global DSQ of the NUMA node that @cpu maps to. */
static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, s32 cpu)
{
	return &sch->pnode[cpu_to_node(cpu)]->global_dsq;
}

/* Look @dsq_id up in @sch's DSQ hash table. */
static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
{
	return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
}

/*
 * Pick the sched_class for @p from its policy and prio. A task currently on
 * stop_sched_class keeps stop_sched_class.
 */
static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
{
	if (p->sched_class == &stop_sched_class)
		return &stop_sched_class;

	return __setscheduler_class(p->policy, p->prio);
}

/* Return @sch's own per-CPU bypass DSQ for @cpu. */
static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu)
{
	return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
}

/*
 * Return the bypass DSQ that tasks of @sch should be enqueued on for @cpu,
 * which may belong to an ancestor of @sch. See the comment in the body.
 */
static struct scx_dispatch_q *bypass_enq_target_dsq(struct scx_sched *sch, s32 cpu)
{
#ifdef CONFIG_EXT_SUB_SCHED
	/*
	 * If @sch is a sub-sched which is bypassing, its tasks should go into
	 * the bypass DSQs of the nearest ancestor which is not bypassing. The
	 * not-bypassing ancestor is responsible for scheduling all tasks from
	 * bypassing sub-trees. If all ancestors including root are bypassing,
	 * all tasks should go to the root's bypass DSQs.
	 *
	 * Whenever a sched starts bypassing, all runnable tasks in its subtree
	 * are re-enqueued after scx_bypassing() is turned on, guaranteeing that
	 * all tasks are transferred to the right DSQs.
	 */
	while (scx_parent(sch) && scx_bypassing(sch, cpu))
		sch = scx_parent(sch);
#endif	/* CONFIG_EXT_SUB_SCHED */

	return bypass_dsq(sch, cpu);
}

/**
 * bypass_dsp_enabled - Check if bypass dispatch path is enabled
 * @sch: scheduler to check
 *
 * When a descendant scheduler enters bypass mode, bypassed tasks are scheduled
 * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors
 * are bypassing. In the former case, the ancestor is not itself bypassing but
 * its bypass DSQs will be populated with bypassed tasks from descendants. Thus,
 * the ancestor's bypass dispatch path must be active even though its own
 * bypass_depth remains zero.
 *
 * This function checks bypass_dsp_enable_depth which is managed separately from
 * bypass_depth to enable this decoupling. See enable_bypass_dsp() and
 * disable_bypass_dsp().
 */
static bool bypass_dsp_enabled(struct scx_sched *sch)
{
	return unlikely(atomic_read(&sch->bypass_dsp_enable_depth));
}

/**
 * rq_is_open - Is the rq available for immediate execution of an SCX task?
 * @rq: rq to test
 * @enq_flags: optional %SCX_ENQ_* of the task being enqueued
 *
 * Returns %true if @rq is currently open for executing an SCX task. After a
 * %false return, @rq is guaranteed to invoke SCX dispatch path at least once
 * before going to idle and not inserting a task into @rq's local DSQ after a
 * %false return doesn't cause @rq to stall.
 */
static bool rq_is_open(struct rq *rq, u64 enq_flags)
{
	lockdep_assert_rq_held(rq);

	/*
	 * A higher-priority class task is either running or in the process of
	 * waking up on @rq.
	 */
	if (sched_class_above(rq->next_class, &ext_sched_class))
		return false;

	/*
	 * @rq is either in transition to or in idle and there is no
	 * higher-priority class task waking up on it.
	 */
	if (sched_class_above(&ext_sched_class, rq->next_class))
		return true;

	/*
	 * @rq is either picking, in transition to, or running an SCX task.
	 */

	/*
	 * If we're in the dispatch path holding rq lock, $curr may or may not
	 * be ready depending on whether the on-going dispatch decides to extend
	 * $curr's slice. We say yes here and resolve it at the end of dispatch.
	 * See balance_one().
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE)
		return true;

	/*
	 * %SCX_ENQ_PREEMPT clears $curr's slice if on SCX and kicks dispatch,
	 * so allow it to avoid spuriously triggering reenq on a combined
	 * PREEMPT|IMMED insertion.
	 */
	if (enq_flags & SCX_ENQ_PREEMPT)
		return true;

	/*
	 * @rq is either in transition to or running an SCX task and can't go
	 * idle without another SCX dispatch cycle.
	 */
	return false;
}

/*
 * Track the rq currently locked.
 *
 * This allows kfuncs to safely operate on rq from any scx ops callback,
 * knowing which rq is already locked.
 */
DEFINE_PER_CPU(struct rq *, scx_locked_rq_state);

/*
 * Flipped on enable per sch->is_cid_type. Declared in internal.h so
 * subsystem inlines can read it.
 */
DEFINE_STATIC_KEY_FALSE(__scx_is_cid_type);

/**
 * scx_call_op_set_cpumask - invoke ops.set_cpumask / ops_cid.set_cmask for @task
 * @sch: scx_sched being invoked
 * @rq: rq to update as the currently-locked rq, or NULL
 * @task: task whose affinity is changing
 * @cpumask: new cpumask
 *
 * For cid-form schedulers, translate @cpumask to a cmask via the per-cpu
 * scratch in cid.c and dispatch through the ops_cid union view. Caller
 * must hold @rq's rq lock so this_cpu_ptr is stable across the call.
*/
static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
					   struct task_struct *task,
					   const struct cpumask *cpumask)
{
	/* slot 0 of kf_tasks[] must be free; it names @task while the op runs */
	WARN_ON_ONCE(current->scx.kf_tasks[0]);
	current->scx.kf_tasks[0] = task;
	if (rq)
		update_locked_rq(rq);

	if (scx_is_cid_type()) {
		struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch);
		/*
		 * Build the per-CPU arena cmask and hand BPF its arena address.
		 * Caller holds the rq lock with IRQs disabled, which makes us
		 * the sole user of the scratch area.
		 */
		scx_cpumask_to_cmask(cpumask, kern_va);
		sch->ops_cid.set_cmask(task, scx_kaddr_to_arena(sch, kern_va));
	} else {
		sch->ops.set_cpumask(task, cpumask);
	}

	/* undo the bookkeeping set up before the call, in reverse order */
	if (rq)
		update_locked_rq(NULL);
	current->scx.kf_tasks[0] = NULL;
}

enum scx_dsq_iter_flags {
	/* iterate in the reverse dispatch order */
	SCX_DSQ_ITER_REV		= 1U << 16,

	__SCX_DSQ_ITER_HAS_SLICE	= 1U << 30,
	__SCX_DSQ_ITER_HAS_VTIME	= 1U << 31,

	/* the only flag included in the user mask is SCX_DSQ_ITER_REV */
	__SCX_DSQ_ITER_USER_FLAGS	= SCX_DSQ_ITER_REV,
	__SCX_DSQ_ITER_ALL_FLAGS	= __SCX_DSQ_ITER_USER_FLAGS |
					  __SCX_DSQ_ITER_HAS_SLICE |
					  __SCX_DSQ_ITER_HAS_VTIME,
};

/**
 * nldsq_next_task - Iterate to the next task in a non-local DSQ
 * @dsq: non-local dsq being iterated
 * @cur: current position, %NULL to start iteration
 * @rev: walk backwards
 *
 * Returns %NULL when iteration is finished.
546 */ 547 static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, 548 struct task_struct *cur, bool rev) 549 { 550 struct list_head *list_node; 551 struct scx_dsq_list_node *dsq_lnode; 552 553 lockdep_assert_held(&dsq->lock); 554 555 if (cur) 556 list_node = &cur->scx.dsq_list.node; 557 else 558 list_node = &dsq->list; 559 560 /* find the next task, need to skip BPF iteration cursors */ 561 do { 562 if (rev) 563 list_node = list_node->prev; 564 else 565 list_node = list_node->next; 566 567 if (list_node == &dsq->list) 568 return NULL; 569 570 dsq_lnode = container_of(list_node, struct scx_dsq_list_node, 571 node); 572 } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); 573 574 return container_of(dsq_lnode, struct task_struct, scx.dsq_list); 575 } 576 577 #define nldsq_for_each_task(p, dsq) \ 578 for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ 579 (p) = nldsq_next_task((dsq), (p), false)) 580 581 /** 582 * nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ 583 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() 584 * @dsq: non-local dsq being iterated 585 * 586 * Find the next task in a cursor based iteration. The caller must have 587 * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock 588 * between the iteration steps. 589 * 590 * Only tasks which were queued before @cursor was initialized are visible. This 591 * bounds the iteration and guarantees that vtime never jumps in the other 592 * direction while iterating. 
593 */ 594 static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor, 595 struct scx_dispatch_q *dsq) 596 { 597 bool rev = cursor->flags & SCX_DSQ_ITER_REV; 598 struct task_struct *p; 599 600 lockdep_assert_held(&dsq->lock); 601 BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR)); 602 603 if (list_empty(&cursor->node)) 604 p = NULL; 605 else 606 p = container_of(cursor, struct task_struct, scx.dsq_list); 607 608 /* skip cursors and tasks that were queued after @cursor init */ 609 do { 610 p = nldsq_next_task(dsq, p, rev); 611 } while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq))); 612 613 if (p) { 614 if (rev) 615 list_move_tail(&cursor->node, &p->scx.dsq_list.node); 616 else 617 list_move(&cursor->node, &p->scx.dsq_list.node); 618 } else { 619 list_del_init(&cursor->node); 620 } 621 622 return p; 623 } 624 625 /** 626 * nldsq_cursor_lost_task - Test whether someone else took the task since iteration 627 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() 628 * @rq: rq @p was on 629 * @dsq: dsq @p was on 630 * @p: target task 631 * 632 * @p is a task returned by nldsq_cursor_next_task(). The locks may have been 633 * dropped and re-acquired inbetween. Verify that no one else took or is in the 634 * process of taking @p from @dsq. 635 * 636 * On %false return, the caller can assume full ownership of @p. 637 */ 638 static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor, 639 struct rq *rq, struct scx_dispatch_q *dsq, 640 struct task_struct *p) 641 { 642 lockdep_assert_rq_held(rq); 643 lockdep_assert_held(&dsq->lock); 644 645 /* 646 * @p could have already left $src_dsq, got re-enqueud, or be in the 647 * process of being consumed by someone else. 
648 */ 649 if (unlikely(p->scx.dsq != dsq || 650 u32_before(cursor->priv, p->scx.dsq_seq) || 651 p->scx.holding_cpu >= 0)) 652 return true; 653 654 /* if @p has stayed on @dsq, its rq couldn't have changed */ 655 if (WARN_ON_ONCE(rq != task_rq(p))) 656 return true; 657 658 return false; 659 } 660 661 /* 662 * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] 663 * dispatch order. BPF-visible iterator is opaque and larger to allow future 664 * changes without breaking backward compatibility. Can be used with 665 * bpf_for_each(). See bpf_iter_scx_dsq_*(). 666 */ 667 struct bpf_iter_scx_dsq_kern { 668 struct scx_dsq_list_node cursor; 669 struct scx_dispatch_q *dsq; 670 u64 slice; 671 u64 vtime; 672 } __attribute__((aligned(8))); 673 674 struct bpf_iter_scx_dsq { 675 u64 __opaque[6]; 676 } __attribute__((aligned(8))); 677 678 679 static u32 scx_get_task_state(const struct task_struct *p) 680 { 681 return p->scx.flags & SCX_TASK_STATE_MASK; 682 } 683 684 static void scx_set_task_state(struct task_struct *p, u32 state) 685 { 686 u32 prev_state = scx_get_task_state(p); 687 bool warn = false; 688 689 switch (state) { 690 case SCX_TASK_NONE: 691 warn = prev_state == SCX_TASK_DEAD; 692 break; 693 case SCX_TASK_INIT_BEGIN: 694 warn = prev_state != SCX_TASK_NONE; 695 break; 696 case SCX_TASK_INIT: 697 warn = prev_state != SCX_TASK_INIT_BEGIN; 698 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 699 break; 700 case SCX_TASK_READY: 701 warn = !(prev_state == SCX_TASK_INIT || 702 prev_state == SCX_TASK_ENABLED); 703 break; 704 case SCX_TASK_ENABLED: 705 warn = prev_state != SCX_TASK_READY; 706 break; 707 case SCX_TASK_DEAD: 708 warn = !(prev_state == SCX_TASK_NONE || 709 prev_state == SCX_TASK_INIT_BEGIN); 710 break; 711 default: 712 WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", 713 prev_state, state, p->comm, p->pid); 714 return; 715 } 716 717 WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", 718 prev_state, 
state, p->comm, p->pid); 719 720 p->scx.flags &= ~SCX_TASK_STATE_MASK; 721 p->scx.flags |= state; 722 } 723 724 /* 725 * SCX task iterator. 726 */ 727 struct scx_task_iter { 728 struct sched_ext_entity cursor; 729 struct task_struct *locked_task; 730 struct rq *rq; 731 struct rq_flags rf; 732 u32 cnt; 733 bool list_locked; 734 #ifdef CONFIG_EXT_SUB_SCHED 735 struct cgroup *cgrp; 736 struct cgroup_subsys_state *css_pos; 737 struct css_task_iter css_iter; 738 #endif 739 }; 740 741 /** 742 * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration 743 * @iter: iterator to init 744 * @cgrp: Optional root of cgroup subhierarchy to iterate 745 * 746 * Initialize @iter. Once initialized, @iter must eventually be stopped with 747 * scx_task_iter_stop(). 748 * 749 * If @cgrp is %NULL, scx_tasks is used for iteration and this function returns 750 * with scx_tasks_lock held and @iter->cursor inserted into scx_tasks. 751 * 752 * If @cgrp is not %NULL, @cgrp and its descendants' tasks are walked using 753 * @iter->css_iter. The caller must be holding cgroup_lock() to prevent cgroup 754 * task migrations. 755 * 756 * The two modes of iterations are largely independent and it's likely that 757 * scx_tasks can be removed in favor of always using cgroup iteration if 758 * CONFIG_SCHED_CLASS_EXT depends on CONFIG_CGROUPS. 759 * 760 * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() 761 * between this and the first next() call or between any two next() calls. If 762 * the locks are released between two next() calls, the caller is responsible 763 * for ensuring that the task being iterated remains accessible either through 764 * RCU read lock or obtaining a reference count. 765 * 766 * All tasks which existed when the iteration started are guaranteed to be 767 * visited as long as they are not dead. 
*/
static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp)
{
	memset(iter, 0, sizeof(*iter));

#ifdef CONFIG_EXT_SUB_SCHED
	if (cgrp) {
		/* cgroup mode: walk @cgrp's subtree, scx_tasks_lock isn't taken */
		lockdep_assert_held(&cgroup_mutex);
		iter->cgrp = cgrp;
		iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self);
		css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
				    &iter->css_iter);
		return;
	}
#endif
	raw_spin_lock_irq(&scx_tasks_lock);

	/* the cursor entry is told apart from real tasks by SCX_TASK_CURSOR */
	iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
	list_add(&iter->cursor.tasks_node, &scx_tasks);
	iter->list_locked = true;
}

/*
 * If @iter holds the rq lock of a task, run the pending balance callbacks,
 * release the lock and forget the task. No-op otherwise.
 */
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
	if (iter->locked_task) {
		__balance_callbacks(iter->rq, &iter->rf);
		task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
		iter->locked_task = NULL;
	}
}

/**
 * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
 * @iter: iterator to unlock
 *
 * If @iter is in the middle of a locked iteration, it may be locking the rq of
 * the task currently being visited in addition to scx_tasks_lock. Unlock both.
 * This function can be safely called anytime during an iteration. The next
 * iterator operation will automatically restore the necessary locking.
*/
static void scx_task_iter_unlock(struct scx_task_iter *iter)
{
	__scx_task_iter_rq_unlock(iter);
	/* list_locked tracks whether this iterator currently owns scx_tasks_lock */
	if (iter->list_locked) {
		iter->list_locked = false;
		raw_spin_unlock_irq(&scx_tasks_lock);
	}
}

/* Take scx_tasks_lock unless @iter already holds it. */
static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
{
	if (!iter->list_locked) {
		raw_spin_lock_irq(&scx_tasks_lock);
		iter->list_locked = true;
	}
}

/**
 * scx_task_iter_relock - Re-acquire scx_tasks_lock and, optionally, @p's rq
 * @iter: iterator to relock
 * @p: task whose rq to lock, or %NULL for scx_tasks_lock only
 *
 * Counterpart to scx_task_iter_unlock(). Locking @p's rq is optional. Once
 * re-acquired, both locks are managed by the iterator from here on.
 */
static void scx_task_iter_relock(struct scx_task_iter *iter,
				 struct task_struct *p)
{
	__scx_task_iter_maybe_relock(iter);
	if (p) {
		iter->rq = task_rq_lock(p, &iter->rf);
		iter->locked_task = p;
	}
}

/**
 * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
 * @iter: iterator to exit
 *
 * Exit a previously initialized @iter. For scx_tasks based iteration,
 * scx_tasks_lock is re-acquired if it was dropped in the meantime, the cursor
 * is removed from scx_tasks and the lock is released on return. For cgroup
 * based iteration, the css task iteration in progress, if any, is ended. If
 * the iterator holds a task's rq lock, that rq lock is also released. See
 * scx_task_iter_start() for details.
 */
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
#ifdef CONFIG_EXT_SUB_SCHED
	if (iter->cgrp) {
		/* css_pos is NULL once the walk ran off the end of the subtree */
		if (iter->css_pos)
			css_task_iter_end(&iter->css_iter);
		__scx_task_iter_rq_unlock(iter);
		return;
	}
#endif
	/* the cursor can only be unlinked with scx_tasks_lock held */
	__scx_task_iter_maybe_relock(iter);
	list_del_init(&iter->cursor.tasks_node);
	scx_task_iter_unlock(iter);
}

/**
 * scx_task_iter_next - Next task
 * @iter: iterator to walk
 *
 * Visit the next task. See scx_task_iter_start() for details.
* Locks are dropped
 * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls
 * by holding scx_tasks_lock for too long.
 */
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
	struct list_head *cursor = &iter->cursor.tasks_node;
	struct sched_ext_entity *pos;

	/* periodic breather: drop whatever locks are held and maybe reschedule */
	if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
		scx_task_iter_unlock(iter);
		cond_resched();
	}

#ifdef CONFIG_EXT_SUB_SCHED
	if (iter->cgrp) {
		while (iter->css_pos) {
			struct task_struct *p;

			p = css_task_iter_next(&iter->css_iter);
			if (p)
				return p;

			/* current css exhausted, advance to the next one in pre-order */
			css_task_iter_end(&iter->css_iter);
			iter->css_pos = css_next_descendant_pre(iter->css_pos,
								&iter->cgrp->self);
			if (iter->css_pos)
				css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
						    &iter->css_iter);
		}
		return NULL;
	}
#endif
	__scx_task_iter_maybe_relock(iter);

	/*
	 * Walk forward starting from our own cursor. Reaching the scx_tasks
	 * list head ends the iteration. Entries flagged %SCX_TASK_CURSOR are
	 * cursors, not tasks, and are skipped.
	 */
	list_for_each_entry(pos, cursor, tasks_node) {
		if (&pos->tasks_node == &scx_tasks)
			return NULL;
		if (!(pos->flags & SCX_TASK_CURSOR)) {
			/* park the cursor right after the task being returned */
			list_move(cursor, &pos->tasks_node);
			return container_of(pos, struct task_struct, scx);
		}
	}

	/* can't happen, should always terminate at scx_tasks above */
	BUG();
}

/**
 * scx_task_iter_next_locked - Next non-idle task with its rq locked
 * @iter: iterator to walk
 *
 * Visit the next non-idle task with its rq lock held. The rq lock of the
 * previously visited task, if any, is released first. Tasks which are already
 * in %SCX_TASK_DEAD state are skipped. See scx_task_iter_start() for details.
 */
static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
{
	struct task_struct *p;

	__scx_task_iter_rq_unlock(iter);

	while ((p = scx_task_iter_next(iter))) {
		/*
		 * scx_task_iter is used to prepare and move tasks into SCX
		 * while loading the BPF scheduler and vice-versa while
		 * unloading. The init_tasks ("swappers") should be excluded
		 * from the iteration because:
		 *
		 * - It's unsafe to use __setscheduler_class() on an init_task to
		 *   determine the sched_class to use as it won't preserve its
		 *   idle_sched_class.
		 *
		 * - ops.init/exit_task() can easily be confused if called with
		 *   init_tasks as they, e.g., share PID 0.
		 *
		 * As init_tasks are never scheduled through SCX, they can be
		 * skipped safely. Note that is_idle_task() which tests %PF_IDLE
		 * doesn't work here:
		 *
		 * - %PF_IDLE may not be set for an init_task whose CPU hasn't
		 *   yet been onlined.
		 *
		 * - %PF_IDLE can be set on tasks that are not init_tasks. See
		 *   play_idle_precise() used by CONFIG_IDLE_INJECT.
		 *
		 * Test for idle_sched_class as only init_tasks are on it.
		 */
		if (p->sched_class == &idle_sched_class)
			continue;

		iter->rq = task_rq_lock(p, &iter->rf);
		iter->locked_task = p;

		/*
		 * cgroup_task_dead() removes the dead tasks from cset->tasks
		 * after sched_ext_dead() and cgroup iteration may see tasks
		 * which already finished sched_ext_dead(). %SCX_TASK_DEAD is
		 * set by sched_ext_dead() under @p's rq lock. Test it to
		 * avoid visiting tasks which are already dead from SCX POV.
		 */
		if (scx_get_task_state(p) == SCX_TASK_DEAD) {
			__scx_task_iter_rq_unlock(iter);
			continue;
		}

		return p;
	}
	return NULL;
}

/**
 * scx_add_event - Increase an event counter for 'name' by 'cnt'
 * @sch: scx_sched to account events for
 * @name: an event name defined in struct scx_event_stats
 * @cnt: the number of the event occurred
 *
 * This can be used when preemption is not disabled.
988 */ 989 #define scx_add_event(sch, name, cnt) do { \ 990 this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ 991 trace_sched_ext_event(#name, (cnt)); \ 992 } while(0) 993 994 /** 995 * __scx_add_event - Increase an event counter for 'name' by 'cnt' 996 * @sch: scx_sched to account events for 997 * @name: an event name defined in struct scx_event_stats 998 * @cnt: the number of the event occurred 999 * 1000 * This should be used only when preemption is disabled. 1001 */ 1002 #define __scx_add_event(sch, name, cnt) do { \ 1003 __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ 1004 trace_sched_ext_event(#name, cnt); \ 1005 } while(0) 1006 1007 /** 1008 * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e' 1009 * @dst_e: destination event stats 1010 * @src_e: source event stats 1011 * @kind: a kind of event to be aggregated 1012 */ 1013 #define scx_agg_event(dst_e, src_e, kind) do { \ 1014 (dst_e)->kind += READ_ONCE((src_e)->kind); \ 1015 } while(0) 1016 1017 /** 1018 * scx_dump_event - Dump an event 'kind' in 'events' to 's' 1019 * @s: output seq_buf 1020 * @events: event stats 1021 * @kind: a kind of event to dump 1022 */ 1023 #define scx_dump_event(s, events, kind) do { \ 1024 dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \ 1025 } while (0) 1026 1027 1028 static void scx_read_events(struct scx_sched *sch, 1029 struct scx_event_stats *events); 1030 1031 static enum scx_enable_state scx_enable_state(void) 1032 { 1033 return atomic_read(&scx_enable_state_var); 1034 } 1035 1036 static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to) 1037 { 1038 return atomic_xchg(&scx_enable_state_var, to); 1039 } 1040 1041 static bool scx_tryset_enable_state(enum scx_enable_state to, 1042 enum scx_enable_state from) 1043 { 1044 int from_v = from; 1045 1046 return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to); 1047 } 1048 1049 /** 1050 * wait_ops_state - Busy-wait the specified ops state to end 1051 * @p: target 
task 1052 * @opss: state to wait the end of 1053 * 1054 * Busy-wait for @p to transition out of @opss. This can only be used when the 1055 * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also 1056 * has load_acquire semantics to ensure that the caller can see the updates made 1057 * in the enqueueing and dispatching paths. 1058 */ 1059 static void wait_ops_state(struct task_struct *p, unsigned long opss) 1060 { 1061 do { 1062 cpu_relax(); 1063 } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); 1064 } 1065 1066 static inline bool __cpu_valid(s32 cpu) 1067 { 1068 return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); 1069 } 1070 1071 /** 1072 * scx_cpu_valid - Verify a cpu number, to be used on ops input args 1073 * @sch: scx_sched to abort on error 1074 * @cpu: cpu number which came from a BPF ops 1075 * @where: extra information reported on error 1076 * 1077 * @cpu is a cpu number which came from the BPF scheduler and can be any value. 1078 * Verify that it is in range and one of the possible cpus. If invalid, trigger 1079 * an ops error. 1080 */ 1081 bool scx_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) 1082 { 1083 if (__cpu_valid(cpu)) { 1084 return true; 1085 } else { 1086 scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); 1087 return false; 1088 } 1089 } 1090 1091 /** 1092 * ops_sanitize_err - Sanitize a -errno value 1093 * @sch: scx_sched to error out on error 1094 * @ops_name: operation to blame on failure 1095 * @err: -errno value to sanitize 1096 * 1097 * Verify @err is a valid -errno. If not, trigger scx_error() and return 1098 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can 1099 * cause misbehaviors. For an example, a large negative return from 1100 * ops.init_task() triggers an oops when passed up the call chain because the 1101 * value fails IS_ERR() test after being encoded with ERR_PTR() and then is 1102 * handled as a pointer. 
1103 */ 1104 static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err) 1105 { 1106 if (err < 0 && err >= -MAX_ERRNO) 1107 return err; 1108 1109 scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err); 1110 return -EPROTO; 1111 } 1112 1113 static void deferred_bal_cb_workfn(struct rq *rq) 1114 { 1115 run_deferred(rq); 1116 } 1117 1118 static void deferred_irq_workfn(struct irq_work *irq_work) 1119 { 1120 struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work); 1121 1122 raw_spin_rq_lock(rq); 1123 run_deferred(rq); 1124 raw_spin_rq_unlock(rq); 1125 } 1126 1127 /** 1128 * schedule_deferred - Schedule execution of deferred actions on an rq 1129 * @rq: target rq 1130 * 1131 * Schedule execution of deferred actions on @rq. Deferred actions are executed 1132 * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks 1133 * to other rqs. 1134 */ 1135 static void schedule_deferred(struct rq *rq) 1136 { 1137 /* 1138 * This is the fallback when schedule_deferred_locked() can't use 1139 * the cheaper balance callback or wakeup hook paths (the target 1140 * CPU is not in balance or wakeup). Currently, this is primarily 1141 * hit by reenqueue operations targeting a remote CPU. 1142 * 1143 * Queue on the target CPU. The deferred work can run from any CPU 1144 * correctly - the _locked() path already processes remote rqs from 1145 * the calling CPU - but targeting the owning CPU allows IPI delivery 1146 * without waiting for the calling CPU to re-enable IRQs and is 1147 * cheaper as the reenqueue runs locally. 1148 */ 1149 irq_work_queue_on(&rq->scx.deferred_irq_work, cpu_of(rq)); 1150 } 1151 1152 /** 1153 * schedule_deferred_locked - Schedule execution of deferred actions on an rq 1154 * @rq: target rq 1155 * 1156 * Schedule execution of deferred actions on @rq. Equivalent to 1157 * schedule_deferred() but requires @rq to be locked and can be more efficient. 
/**
 * schedule_deferred_locked - Schedule execution of deferred actions on an rq
 * @rq: target rq
 *
 * Schedule execution of deferred actions on @rq. Equivalent to
 * schedule_deferred() but requires @rq to be locked and can be more efficient.
 */
static void schedule_deferred_locked(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	/*
	 * If in the middle of waking up a task, task_woken_scx() will be called
	 * afterwards which will then run the deferred actions, no need to
	 * schedule anything.
	 */
	if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
		return;

	/* Don't do anything if there already is a deferred operation. */
	if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
		return;

	/*
	 * If in balance, the balance callbacks will be called before rq lock is
	 * released. Schedule one.
	 *
	 * We can't directly insert the callback into the rq's list: The call
	 * can drop its lock and make the pending balance callback visible to
	 * unrelated code paths that call rq_pin_lock().
	 *
	 * Just let balance_one() know that it must do it itself.
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
		rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
		return;
	}

	/*
	 * No scheduler hooks available. Use the generic irq_work path. The
	 * above WAKEUP and BALANCE paths should cover most of the cases and the
	 * time to IRQ re-enable shouldn't be long.
	 */
	schedule_deferred(rq);
}

/*
 * Queue a deferred reenqueue of @dsq on behalf of @sch and schedule the
 * deferred processing on the rq the request was queued on.
 *
 * @reenq_flags is OR'd into the pending request's flags. @locked_rq is
 * compared against the rq the request is queued on: if they match,
 * schedule_deferred_locked() is used, otherwise schedule_deferred().
 *
 * Only %SCX_DSQ_LOCAL and non-builtin DSQs are accepted. Any other DSQ
 * triggers scx_error().
 */
static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq,
			       u64 reenq_flags, struct rq *locked_rq)
{
	struct rq *rq;

	/*
	 * Allowing reenqueues doesn't make sense while bypassing. This also
	 * prevents new reenqueues from being scheduled on dead scheds.
	 */
	if (unlikely(READ_ONCE(sch->bypass_depth)))
		return;

	if (dsq->id == SCX_DSQ_LOCAL) {
		/* a local DSQ is embedded in its rq, queue on that rq */
		rq = container_of(dsq, struct rq, scx.local_dsq);

		struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq));
		struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local;

		/*
		 * Pairs with smp_mb() in process_deferred_reenq_locals() and
		 * guarantees that there is a reenq_local() afterwards.
		 */
		smp_mb();

		/*
		 * Lockless pre-check: take the lock only if the request isn't
		 * queued yet or doesn't carry all of @reenq_flags. The queued
		 * state is re-tested under the lock.
		 */
		if (list_empty(&drl->node) ||
		    (READ_ONCE(drl->flags) & reenq_flags) != reenq_flags) {

			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);

			if (list_empty(&drl->node))
				list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals);
			WRITE_ONCE(drl->flags, drl->flags | reenq_flags);
		}
	} else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) {
		/* user DSQs are queued on the current CPU's rq */
		rq = this_rq();

		struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq));
		struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user;

		/*
		 * Pairs with smp_mb() in process_deferred_reenq_users() and
		 * guarantees that there is a reenq_user() afterwards.
		 */
		smp_mb();

		/* same lockless pre-check and locked re-test as above */
		if (list_empty(&dru->node) ||
		    (READ_ONCE(dru->flags) & reenq_flags) != reenq_flags) {

			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);

			if (list_empty(&dru->node))
				list_move_tail(&dru->node, &rq->scx.deferred_reenq_users);
			WRITE_ONCE(dru->flags, dru->flags | reenq_flags);
		}
	} else {
		scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
		return;
	}

	if (rq == locked_rq)
		schedule_deferred_locked(rq);
	else
		schedule_deferred(rq);
}

/*
 * Schedule a reenqueue of @rq's local DSQ for the root scheduler. @rq is
 * passed to schedule_dsq_reenq() as the locked rq. Warns once and does nothing
 * if there is no root scheduler.
 */
static void schedule_reenq_local(struct rq *rq, u64 reenq_flags)
{
	struct scx_sched *root = rcu_dereference_sched(scx_root);

	if (WARN_ON_ONCE(!root))
		return;

	schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags, rq);
}

/**
 * touch_core_sched - Update timestamp used for core-sched task ordering
 * @rq: rq to read clock from, must be locked
 * @p: task to update the timestamp for
 *
 * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
 * implement global or local-DSQ FIFO ordering for core-sched. Should be called
 * when a task becomes runnable and its turn on the CPU ends (e.g. slice
 * exhaustion).
 */
static void touch_core_sched(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

#ifdef CONFIG_SCHED_CORE
	/*
	 * It's okay to update the timestamp spuriously. Use
	 * sched_core_disabled() which is cheaper than enabled().
	 *
	 * As this is used to determine ordering between tasks of sibling CPUs,
	 * it may be better to use per-core dispatch sequence instead.
	 */
	if (!sched_core_disabled())
		p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq));
#endif
}

/**
 * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
 * @rq: rq to read clock from, must be locked
 * @p: task being dispatched
 *
 * If the BPF scheduler implements custom core-sched ordering via
 * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
 * ordering within each local DSQ. This function is called from dispatch paths
 * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
 */
static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

#ifdef CONFIG_SCHED_CORE
	if (unlikely(SCX_HAS_OP(scx_root, core_sched_before)))
		touch_core_sched(rq, p);
#endif
}

/*
 * Account the execution time of @rq's current task. The elapsed time reported
 * by update_curr_common() is deducted from the task's slice, unless the slice
 * is %SCX_SLICE_INF, and is also reported to @rq's ext dl_server.
 */
static void update_curr_scx(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	s64 delta_exec;

	delta_exec = update_curr_common(rq);
	if (unlikely(delta_exec <= 0))
		return;

	if (curr->scx.slice != SCX_SLICE_INF) {
		/* clamp so that the slice never underflows */
		curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec);
		/* slice exhausted, @curr's turn on the CPU is ending */
		if (!curr->scx.slice)
			touch_core_sched(rq, curr);
	}

	dl_server_update(&rq->ext_server, delta_exec);
}

/*
 * rbtree "less" callback for a DSQ's priority queue. Orders tasks by
 * scx.dsq_vtime using the wraparound-safe time_before64().
 */
static bool scx_dsq_priq_less(struct rb_node *node_a,
			      const struct rb_node *node_b)
{
	const struct task_struct *a =
		container_of(node_a, struct task_struct, scx.dsq_priq);
	const struct task_struct *b =
		container_of(node_b, struct task_struct, scx.dsq_priq);

	return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime);
}

/*
 * Account @p being added to @dsq: bump @dsq->nr and maintain the IMMED state
 * of @p and the nr_immed count of the rq owning @dsq.
 */
static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags)
{
	/* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
	WRITE_ONCE(dsq->nr, dsq->nr + 1);

	/*
	 * Once @p reaches a local DSQ, it can only leave it by being dispatched
	 * to the CPU or dequeued. In both cases, the only way @p can go back to
	 * the BPF sched is through enqueueing. If being inserted into a local
	 * DSQ with IMMED, persist the state until the next enqueueing event in
	 * do_enqueue_task() so that we can maintain IMMED protection through
	 * e.g. SAVE/RESTORE cycles and slice extensions.
	 */
	if (enq_flags & SCX_ENQ_IMMED) {
		/* IMMED on a non-local DSQ is only expected with GDSQ_FALLBACK */
		if (unlikely(dsq->id != SCX_DSQ_LOCAL)) {
			WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK));
			return;
		}
		p->scx.flags |= SCX_TASK_IMMED;
	}

	if (p->scx.flags & SCX_TASK_IMMED) {
		/* only meaningful for a local DSQ, which is verified right below */
		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);

		if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
			return;

		rq->scx.nr_immed++;

		/*
		 * If @rq already had other tasks or the current task is not
		 * done yet, @p can't go on the CPU immediately. Re-enqueue.
		 */
		if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags)))
			schedule_reenq_local(rq, 0);
	}
}

/*
 * Account @p being removed from @dsq: drop @dsq->nr and, for an IMMED task,
 * the nr_immed count of the rq owning @dsq.
 */
static void dsq_dec_nr(struct scx_dispatch_q *dsq, struct task_struct *p)
{
	/* see dsq_inc_nr() */
	WRITE_ONCE(dsq->nr, dsq->nr - 1);

	if (p->scx.flags & SCX_TASK_IMMED) {
		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);

		/* IMMED tasks must be on a local DSQ and must have been counted */
		if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) ||
		    WARN_ON_ONCE(rq->scx.nr_immed <= 0))
			return;

		rq->scx.nr_immed--;
	}
}

/*
 * Refill @p's slice with @sch's default slice and count the event. The
 * counter is bumped with __scx_add_event().
 */
static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
{
	p->scx.slice = READ_ONCE(sch->slice_dfl);
	__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}
/*
 * Return true if @p is moving due to an internal SCX migration, false
 * otherwise.
 */
static inline bool task_scx_migrating(struct task_struct *p)
{
	/*
	 * We only need to check sticky_cpu: it is set to the destination
	 * CPU in move_remote_task_to_local_dsq() before deactivate_task()
	 * and cleared when the task is enqueued on the destination, so it
	 * is only non-negative during an internal SCX migration.
	 */
	return p->scx.sticky_cpu >= 0;
}

/*
 * Take @p out of BPF custody. Does nothing unless @p is in custody
 * (%SCX_TASK_IN_CUSTODY) and not migrating. Otherwise, ops.dequeue() is called
 * with @deq_flags if the scheduler implements it and %SCX_TASK_IN_CUSTODY is
 * cleared regardless of whether the callback exists.
 */
static void call_task_dequeue(struct scx_sched *sch, struct rq *rq,
			      struct task_struct *p, u64 deq_flags)
{
	if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p))
		return;

	if (SCX_HAS_OP(sch, dequeue))
		SCX_CALL_OP_TASK(sch, dequeue, rq, p, deq_flags);

	p->scx.flags &= ~SCX_TASK_IN_CUSTODY;
}

/*
 * Post-processing after @p was inserted into the local DSQ @dsq: take @p out
 * of BPF custody and trigger preemption / rescheduling on the owning rq as
 * needed.
 */
static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq,
			       struct task_struct *p, u64 enq_flags)
{
	struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);

	call_task_dequeue(sch, rq, p, 0);

	/*
	 * Note that @rq's lock may be dropped between this enqueue and @p
	 * actually getting on CPU. This gives higher-class tasks (e.g. RT)
	 * an opportunity to wake up on @rq and prevent @p from running.
	 * Here are some concrete examples:
	 *
	 * Example 1:
	 *
	 * We dispatch two tasks from a single ops.dispatch():
	 * - First, a local task to this CPU's local DSQ;
	 * - Second, a local/remote task to a remote CPU's local DSQ.
	 * We must drop the local rq lock in order to finish the second
	 * dispatch. In that time, an RT task can wake up on the local rq.
	 *
	 * Example 2:
	 *
	 * We dispatch a local/remote task to a remote CPU's local DSQ.
	 * We must drop the remote rq lock before the dispatched task can run,
	 * which gives an RT task an opportunity to wake up on the remote rq.
	 *
	 * Both examples work the same if we replace dispatching with moving
	 * the tasks from a user-created DSQ.
	 *
	 * We must detect these wakeups so that we can re-enqueue IMMED tasks
	 * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this
	 * purpose, but for it to be invoked, we must ensure that we bump
	 * @rq->next_class to &ext_sched_class if it's currently idle.
	 *
	 * wakeup_preempt() does the bumping, and since we only invoke it if
	 * @rq->next_class is below &ext_sched_class, it will also
	 * resched_curr(rq).
	 */
	if (sched_class_above(p->sched_class, rq->next_class))
		wakeup_preempt(rq, p, 0);

	/*
	 * If @rq is in balance, the CPU is already vacant and looking for the
	 * next task to run. No need to preempt or trigger resched after moving
	 * @p into its local DSQ.
	 * Note that the wakeup_preempt() above may have already triggered
	 * a resched if @rq->next_class was idle. It's harmless, since
	 * need_resched is cleared immediately after task pick.
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE)
		return;

	/* PREEMPT: end the slice of the running SCX task and force a resched */
	if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
	    rq->curr->sched_class == &ext_sched_class) {
		rq->curr->scx.slice = 0;
		resched_curr(rq);
	}
}

/*
 * Insert @p into @dsq.
 *
 * @sch: scheduler to report errors to and to call ops.dequeue() on
 * @rq: rq passed to ops.dequeue() for non-local DSQs
 * @dsq: target DSQ
 * @p: task to insert, must not be on any DSQ
 * @enq_flags: %SCX_ENQ_* flags controlling the insertion
 *
 * For a non-local @dsq, @dsq->lock is acquired for the insertion and released
 * before returning. For the local DSQ, no DSQ lock is taken here. If @dsq turns
 * out to be destroyed (%SCX_DSQ_INVALID), an error is raised and @p is inserted
 * into the global DSQ instead. If %SCX_ENQ_CLEAR_OPSS is set, @p's ops_state is
 * reset to %SCX_OPSS_NONE at the end.
 */
static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
			     struct scx_dispatch_q *dsq, struct task_struct *p,
			     u64 enq_flags)
{
	bool is_local = dsq->id == SCX_DSQ_LOCAL;

	WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
	WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) ||
		     !RB_EMPTY_NODE(&p->scx.dsq_priq));

	if (!is_local) {
		raw_spin_lock_nested(&dsq->lock,
			(enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0);

		if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
			scx_error(sch, "attempting to dispatch to a destroyed dsq");
			/* fall back to the global dsq */
			raw_spin_unlock(&dsq->lock);
			dsq = find_global_dsq(sch, task_cpu(p));
			raw_spin_lock(&dsq->lock);
		}
	}

	if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
		     (enq_flags & SCX_ENQ_DSQ_PRIQ))) {
		/*
		 * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from
		 * their FIFO queues. To avoid confusion and accidentally
		 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we
		 * disallow any internal DSQ from doing vtime ordering of
		 * tasks.
		 */
		scx_error(sch, "cannot use vtime ordering for built-in DSQs");
		enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
	}

	if (enq_flags & SCX_ENQ_DSQ_PRIQ) {
		struct rb_node *rbp;

		/*
		 * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are
		 * linked to both the rbtree and list on PRIQs, this can only be
		 * tested easily when adding the first task.
		 */
		if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
			     nldsq_next_task(dsq, NULL, false)))
			scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks",
				  dsq->id);

		p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
		rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);

		/*
		 * Find the previous task and insert after it on the list so
		 * that @dsq->list is vtime ordered.
		 */
		rbp = rb_prev(&p->scx.dsq_priq);
		if (rbp) {
			struct task_struct *prev =
				container_of(rbp, struct task_struct,
					     scx.dsq_priq);
			list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
			/* first task unchanged - no update needed */
		} else {
			list_add(&p->scx.dsq_list.node, &dsq->list);
			/* not builtin and new task is at head - use fastpath */
			rcu_assign_pointer(dsq->first_task, p);
		}
	} else {
		/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
		if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
			scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
				  dsq->id);

		if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) {
			list_add(&p->scx.dsq_list.node, &dsq->list);
			/* new task inserted at head - use fastpath */
			if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
				rcu_assign_pointer(dsq->first_task, p);
		} else {
			/*
			 * dsq->list can contain parked BPF iterator cursors, so
			 * list_empty() here isn't a reliable proxy for "no real
			 * task in the DSQ". Test dsq->first_task directly.
			 */
			list_add_tail(&p->scx.dsq_list.node, &dsq->list);
			if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
				rcu_assign_pointer(dsq->first_task, p);
		}
	}

	/* seq records the order tasks are queued, used by BPF DSQ iterator */
	WRITE_ONCE(dsq->seq, dsq->seq + 1);
	p->scx.dsq_seq = dsq->seq;

	dsq_inc_nr(dsq, p, enq_flags);
	p->scx.dsq = dsq;

	/*
	 * Update custody and call ops.dequeue() before clearing ops_state:
	 * once ops_state is cleared, waiters in ops_dequeue() can proceed
	 * and dequeue_task_scx() will RMW p->scx.flags. If we clear
	 * ops_state first, both sides would modify p->scx.flags
	 * concurrently in a non-atomic way.
	 */
	if (is_local) {
		local_dsq_post_enq(sch, dsq, p, enq_flags);
	} else {
		/*
		 * Task on global/bypass DSQ: leave custody, task on
		 * non-terminal DSQ: enter custody.
		 */
		if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS)
			call_task_dequeue(sch, rq, p, 0);
		else
			p->scx.flags |= SCX_TASK_IN_CUSTODY;

		raw_spin_unlock(&dsq->lock);
	}

	/*
	 * We're transitioning out of QUEUEING or DISPATCHING. store_release to
	 * match waiters' load_acquire.
	 */
	if (enq_flags & SCX_ENQ_CLEAR_OPSS)
		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
}

/*
 * Unlink @p from @dsq: remove it from the priority rbtree if it's on one and
 * from the list, update the counters, and refresh @dsq->first_task if @p was
 * the first task of a non-builtin DSQ. @p->scx.dsq is left for the caller to
 * clear.
 */
static void task_unlink_from_dsq(struct task_struct *p,
				 struct scx_dispatch_q *dsq)
{
	WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));

	if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) {
		rb_erase(&p->scx.dsq_priq, &dsq->priq);
		RB_CLEAR_NODE(&p->scx.dsq_priq);
		p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ;
	}

	list_del_init(&p->scx.dsq_list.node);
	dsq_dec_nr(dsq, p);

	if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
		struct task_struct *first_task;

		/* @p is already unlinked, so this yields the new first task */
		first_task = nldsq_next_task(dsq, NULL, false);
		rcu_assign_pointer(dsq->first_task, first_task);
	}
}

/*
 * Remove @p from the DSQ it's associated with, if any. @rq must be locked.
 * For a DSQ other than @rq's local DSQ, the DSQ's lock is taken around the
 * removal. Also cancels a pending deferred local dispatch and resolves races
 * against dispatch_to_local_dsq() through @p->scx.holding_cpu.
 */
static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
{
	struct scx_dispatch_q *dsq = p->scx.dsq;
	bool is_local = dsq == &rq->scx.local_dsq;

	lockdep_assert_rq_held(rq);

	if (!dsq) {
		/*
		 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
		 * Unlinking is all that's needed to cancel.
		 */
		if (unlikely(!list_empty(&p->scx.dsq_list.node)))
			list_del_init(&p->scx.dsq_list.node);

		/*
		 * When dispatching directly from the BPF scheduler to a local
		 * DSQ, the task isn't associated with any DSQ but
		 * @p->scx.holding_cpu may be set under the protection of
		 * %SCX_OPSS_DISPATCHING.
		 */
		if (p->scx.holding_cpu >= 0)
			p->scx.holding_cpu = -1;

		return;
	}

	if (!is_local)
		raw_spin_lock(&dsq->lock);

	/*
	 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't
	 * change underneath us.
	 */
	if (p->scx.holding_cpu < 0) {
		/* @p must still be on @dsq, dequeue */
		task_unlink_from_dsq(p, dsq);
	} else {
		/*
		 * We're racing against dispatch_to_local_dsq() which already
		 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
		 * holding_cpu which tells dispatch_to_local_dsq() that it lost
		 * the race.
		 */
		WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
		p->scx.holding_cpu = -1;
	}
	p->scx.dsq = NULL;

	if (!is_local)
		raw_spin_unlock(&dsq->lock);
}
/*
 * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq
 * and dsq are locked.
 */
static void dispatch_dequeue_locked(struct task_struct *p,
				    struct scx_dispatch_q *dsq)
{
	lockdep_assert_rq_held(task_rq(p));
	lockdep_assert_held(&dsq->lock);

	task_unlink_from_dsq(p, dsq);
	p->scx.dsq = NULL;
}

/*
 * Translate the dispatch verdict @dsq_id into the DSQ to dispatch to.
 *
 * %SCX_DSQ_LOCAL maps to @rq's local DSQ, %SCX_DSQ_LOCAL_ON verdicts to the
 * local DSQ of the encoded CPU, %SCX_DSQ_GLOBAL to the global DSQ selected by
 * @tcpu, and anything else is looked up as a user DSQ. An invalid CPU or a
 * non-existent DSQ raises an error on @sch and falls back to the global DSQ.
 */
static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
						    struct rq *rq, u64 dsq_id,
						    s32 tcpu)
{
	struct scx_dispatch_q *dsq;

	if (dsq_id == SCX_DSQ_LOCAL)
		return &rq->scx.local_dsq;

	if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
		s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK);

		if (!scx_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
			return find_global_dsq(sch, tcpu);

		return &cpu_rq(cpu)->scx.local_dsq;
	}

	if (dsq_id == SCX_DSQ_GLOBAL)
		dsq = find_global_dsq(sch, tcpu);
	else
		dsq = find_user_dsq(sch, dsq_id);

	if (unlikely(!dsq)) {
		scx_error(sch, "non-existent DSQ 0x%llx", dsq_id);
		return find_global_dsq(sch, tcpu);
	}

	return dsq;
}

/*
 * Record a direct dispatch verdict for @p. @ddsp_task is the task currently
 * being scheduled on this CPU, or an ERR_PTR if a direct dispatch has already
 * been recorded. If @p doesn't match @ddsp_task, an error is raised and the
 * verdict is dropped. Otherwise @dsq_id and @enq_flags are stored in @p for
 * the enqueue path to consume.
 */
static void mark_direct_dispatch(struct scx_sched *sch,
				 struct task_struct *ddsp_task,
				 struct task_struct *p, u64 dsq_id,
				 u64 enq_flags)
{
	/*
	 * Mark that dispatch already happened from ops.select_cpu() or
	 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value
	 * which can never match a valid task pointer.
	 */
	__this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH));

	/* @p must match the task on the enqueue path */
	if (unlikely(p != ddsp_task)) {
		if (IS_ERR(ddsp_task))
			scx_error(sch, "%s[%d] already direct-dispatched",
				  p->comm, p->pid);
		else
			scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
				  ddsp_task->comm, ddsp_task->pid,
				  p->comm, p->pid);
		return;
	}

	WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
	WARN_ON_ONCE(p->scx.ddsp_enq_flags);

	p->scx.ddsp_dsq_id = dsq_id;
	p->scx.ddsp_enq_flags = enq_flags;
}

/*
 * Clear @p direct dispatch state when leaving the scheduler.
 *
 * Direct dispatch state must be cleared in the following cases:
 * - direct_dispatch(): cleared on the synchronous enqueue path, deferred
 *   dispatch keeps the state until consumed
 * - process_ddsp_deferred_locals(): cleared after consuming deferred state
 * - do_enqueue_task(): cleared on enqueue fallbacks where the dispatch
 *   verdict is ignored (local/global/bypass)
 * - dequeue_task_scx(): cleared after dispatch_dequeue(), covering deferred
 *   cancellation and holding_cpu races
 * - scx_disable_task(): cleared for queued wakeup tasks, which are excluded by
 *   the scx_bypass() loop, so that stale state is not reused by a subsequent
 *   scheduler instance
 */
static inline void clear_direct_dispatch(struct task_struct *p)
{
	p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
	p->scx.ddsp_enq_flags = 0;
}

/*
 * Carry out the direct dispatch verdict recorded in @p. @enq_flags of the
 * current enqueue are merged into the recorded flags. A verdict targeting the
 * local DSQ of another CPU is deferred through @rq's ddsp_deferred_locals,
 * anything else is enqueued right away.
 */
static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
			    u64 enq_flags)
{
	struct rq *rq = task_rq(p);
	struct scx_dispatch_q *dsq =
		find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
	u64 ddsp_enq_flags;

	touch_core_sched_dispatch(rq, p);

	p->scx.ddsp_enq_flags |= enq_flags;

	/*
	 * We are in the enqueue path with @rq locked and pinned, and thus can't
	 * double lock a remote rq and enqueue to its local DSQ. For
	 * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer
	 * the enqueue so that it's executed when @rq can be unlocked.
	 */
	if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) {
		unsigned long opss;

		opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;

		/* @opss is already masked above, the mask below is redundant */
		switch (opss & SCX_OPSS_STATE_MASK) {
		case SCX_OPSS_NONE:
			break;
		case SCX_OPSS_QUEUEING:
			/*
			 * As @p was never passed to the BPF side, _release is
			 * not strictly necessary. Still do it for consistency.
			 */
			atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
			break;
		default:
			WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()",
				  p->comm, p->pid, opss);
			atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
			break;
		}

		/* the ddsp state is kept in @p until the deferred enqueue runs */
		WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
		list_add_tail(&p->scx.dsq_list.node,
			      &rq->scx.ddsp_deferred_locals);
		schedule_deferred_locked(rq);
		return;
	}

	/* snapshot the flags before clearing the recorded verdict */
	ddsp_enq_flags = p->scx.ddsp_enq_flags;
	clear_direct_dispatch(p);

	dispatch_enqueue(sch, rq, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
}

/* Test whether @rq is online both from the BPF scheduler's and the core's POV. */
static bool scx_rq_online(struct rq *rq)
{
	/*
	 * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates
	 * the online state as seen from the BPF scheduler. cpu_active() test
	 * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will
	 * stay set until the current scheduling operation is complete even if
	 * we aren't locking @rq.
	 */
	return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq)));
}

/*
 * Enqueue @p, which must already be marked %SCX_TASK_QUEUED, on @rq.
 *
 * @sticky_cpu matching @rq's CPU denotes an internal movement and sends @p
 * straight to @rq's local DSQ without refilling its slice. Otherwise @p is
 * put on the local, bypass or global DSQ when the BPF scheduler shouldn't or
 * can't be consulted, dispatched directly if a direct dispatch verdict is
 * recorded, or handed to ops.enqueue().
 */
static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
			    int sticky_cpu)
{
	struct scx_sched *sch = scx_task_sched(p);
	struct task_struct **ddsp_taskp;
	struct scx_dispatch_q *dsq;
	unsigned long qseq;

	WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));

	/* internal movements - rq migration / RESTORE */
	if (sticky_cpu == cpu_of(rq))
		goto local_norefill;

	/*
	 * Clear persistent TASK_IMMED for fresh enqueues, see dsq_inc_nr().
	 * Note that exiting and migration-disabled tasks that skip
	 * ops.enqueue() below will lose IMMED protection unless
	 * %SCX_OPS_ENQ_EXITING / %SCX_OPS_ENQ_MIGRATION_DISABLED are set.
	 */
	p->scx.flags &= ~SCX_TASK_IMMED;

	/*
	 * If !scx_rq_online(), we already told the BPF scheduler that the CPU
	 * is offline and are just running the hotplug path. Don't bother the
	 * BPF scheduler.
	 */
	if (!scx_rq_online(rq))
		goto local;

	if (scx_bypassing(sch, cpu_of(rq))) {
		__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
		goto bypass;
	}

	/* a direct dispatch verdict was recorded before this enqueue */
	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
		goto direct;

	/* see %SCX_OPS_ENQ_EXITING */
	if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) &&
	    unlikely(p->flags & PF_EXITING)) {
		__scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
		goto local;
	}

	/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
	if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
	    is_migration_disabled(p)) {
		__scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
		goto local;
	}

	if (unlikely(!SCX_HAS_OP(sch, enqueue)))
		goto global;

	/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
	qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;

	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);

	/* publish @p as the task direct dispatches are allowed for */
	ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
	WARN_ON_ONCE(*ddsp_taskp);
	*ddsp_taskp = p;

	SCX_CALL_OP_TASK(sch, enqueue, rq, p, enq_flags);

	*ddsp_taskp = NULL;
	/* ops.enqueue() recorded a direct dispatch verdict */
	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
		goto direct;

	/*
	 * Task is now in BPF scheduler's custody. Set %SCX_TASK_IN_CUSTODY
	 * so ops.dequeue() is called when it leaves custody.
	 */
	p->scx.flags |= SCX_TASK_IN_CUSTODY;

	/*
	 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or
	 * dequeue may be waiting. The store_release matches their load_acquire.
	 */
	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
	return;

direct:
	direct_dispatch(sch, p, enq_flags);
	return;
local_norefill:
	dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags);
	return;
local:
	dsq = &rq->scx.local_dsq;
	goto enqueue;
global:
	dsq = find_global_dsq(sch, task_cpu(p));
	goto enqueue;
bypass:
	dsq = bypass_enq_target_dsq(sch, task_cpu(p));
	goto enqueue;

enqueue:
	/*
	 * For task-ordering, slice refill must be treated as implying the end
	 * of the current slice. Otherwise, the longer @p stays on the CPU, the
	 * higher priority it becomes from scx_prio_less()'s POV.
	 */
	touch_core_sched(rq, p);
	refill_task_slice_dfl(sch, p);
	clear_direct_dispatch(p);
	dispatch_enqueue(sch, rq, dsq, p, enq_flags);
}

/* Test whether @p is linked on an rq's runnable_list. */
static bool task_runnable(const struct task_struct *p)
{
	return !list_empty(&p->scx.runnable_node);
}

/*
 * Append @p to @rq's runnable_list. If %SCX_TASK_RESET_RUNNABLE_AT is set,
 * @p's runnable_at timestamp is reset to the current jiffies first and the
 * flag is cleared. @rq must be locked.
 */
static void set_task_runnable(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

	if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
		p->scx.runnable_at = jiffies;
		p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
	}

	/*
	 * list_add_tail() must be used. scx_bypass() depends on tasks being
	 * appended to the runnable_list.
	 */
	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
}

/*
 * Unlink @p from the runnable_list. If @reset_runnable_at, flag @p so that
 * set_task_runnable() resets its runnable_at timestamp the next time.
 */
static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
{
	list_del_init(&p->scx.runnable_node);
	if (reset_runnable_at)
		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
}

/*
 * Make @p runnable on @rq: link it on the runnable list, update the running
 * counts, notify ops.runnable(), start the ext dl_server for the first running
 * task and enqueue @p through do_enqueue_task(). @core_enq_flags are combined
 * with @rq's extra enqueue flags. If @p is already %SCX_TASK_QUEUED, only the
 * common exit path is executed.
 */
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags)
{
	struct scx_sched *sch = scx_task_sched(p);
	int sticky_cpu = p->scx.sticky_cpu;
	u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags;

	/* see schedule_deferred_locked(), cleared at the end of this function */
	if (enq_flags & ENQUEUE_WAKEUP)
		rq->scx.flags |= SCX_RQ_IN_WAKEUP;

	/*
	 * Restoring a running task will be immediately followed by
	 * set_next_task_scx() which expects the task to not be on the BPF
	 * scheduler as tasks can only start running through local DSQs. Force
	 * direct-dispatch into the local DSQ by setting the sticky_cpu.
	 */
	if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
		sticky_cpu = cpu_of(rq);

	if (p->scx.flags & SCX_TASK_QUEUED) {
		WARN_ON_ONCE(!task_runnable(p));
		goto out;
	}

	set_task_runnable(rq, p);
	p->scx.flags |= SCX_TASK_QUEUED;
	rq->scx.nr_running++;
	add_nr_running(rq, 1);

	if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
		SCX_CALL_OP_TASK(sch, runnable, rq, p, enq_flags);

	if (enq_flags & SCX_ENQ_WAKEUP)
		touch_core_sched(rq, p);

	/* Start dl_server if this is the first task being enqueued */
	if (rq->scx.nr_running == 1)
		dl_server_start(&rq->ext_server);

	do_enqueue_task(rq, p, enq_flags, sticky_cpu);

	/* the sticky CPU, if any, has been consumed by the enqueue above */
	if (sticky_cpu >= 0)
		p->scx.sticky_cpu = -1;
out:
	rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;

	/* count enqueues that ended up on a CPU other than the selected one */
	if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
	    unlikely(cpu_of(rq) != p->scx.selected_cpu))
		__scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1);
}
rq *rq, struct task_struct *p, u64 deq_flags) 2057 { 2058 struct scx_sched *sch = scx_task_sched(p); 2059 unsigned long opss; 2060 2061 /* dequeue is always temporary, don't reset runnable_at */ 2062 clr_task_runnable(p, false); 2063 2064 retry: 2065 /* acquire ensures that we see the preceding updates on QUEUED */ 2066 opss = atomic_long_read_acquire(&p->scx.ops_state); 2067 2068 switch (opss & SCX_OPSS_STATE_MASK) { 2069 case SCX_OPSS_NONE: 2070 break; 2071 case SCX_OPSS_QUEUEING: 2072 /* 2073 * QUEUEING is started and finished while holding @p's rq lock. 2074 * As we're holding the rq lock now, we shouldn't see QUEUEING. 2075 */ 2076 BUG(); 2077 case SCX_OPSS_QUEUED: 2078 /* 2079 * A queued task must always be in BPF scheduler's custody. If 2080 * SCX_TASK_IN_CUSTODY is clear, finish_dispatch() on another 2081 * CPU has already passed call_task_dequeue() (which clears the 2082 * flag), but has not yet written SCX_OPSS_NONE. That final 2083 * store does not require this rq's lock, so retrying with 2084 * cpu_relax() is bounded: we will observe NONE (or DISPATCHING, 2085 * handled by the fallthrough) on a subsequent iteration. 2086 */ 2087 if (unlikely(!(READ_ONCE(p->scx.flags) & SCX_TASK_IN_CUSTODY))) { 2088 cpu_relax(); 2089 goto retry; 2090 } 2091 2092 if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 2093 SCX_OPSS_NONE)) 2094 break; 2095 fallthrough; 2096 case SCX_OPSS_DISPATCHING: 2097 /* 2098 * If @p is being dispatched from the BPF scheduler to a DSQ, 2099 * wait for the transfer to complete so that @p doesn't get 2100 * added to its DSQ after dequeueing is complete. 2101 * 2102 * As we're waiting on DISPATCHING with the rq locked, the 2103 * dispatching side shouldn't try to lock the rq while 2104 * DISPATCHING is set. See dispatch_to_local_dsq(). 2105 * 2106 * DISPATCHING shouldn't have qseq set and control can reach 2107 * here with NONE @opss from the above QUEUED case block. 2108 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 
2109 */ 2110 wait_ops_state(p, SCX_OPSS_DISPATCHING); 2111 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 2112 break; 2113 } 2114 2115 /* 2116 * Call ops.dequeue() if the task is still in BPF custody. 2117 * 2118 * The code that clears ops_state to %SCX_OPSS_NONE does not always 2119 * clear %SCX_TASK_IN_CUSTODY: in dispatch_to_local_dsq(), when 2120 * we're moving a task that was in %SCX_OPSS_DISPATCHING to a 2121 * remote CPU's local DSQ, we only set ops_state to %SCX_OPSS_NONE 2122 * so that a concurrent dequeue can proceed, but we clear 2123 * %SCX_TASK_IN_CUSTODY only when we later enqueue or move the 2124 * task. So we can see NONE + IN_CUSTODY here and we must handle 2125 * it. Similarly, after waiting on %SCX_OPSS_DISPATCHING we see 2126 * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until 2127 * it is enqueued on the destination. 2128 */ 2129 call_task_dequeue(sch, rq, p, deq_flags); 2130 } 2131 2132 static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int core_deq_flags) 2133 { 2134 struct scx_sched *sch = scx_task_sched(p); 2135 u64 deq_flags = core_deq_flags; 2136 2137 /* 2138 * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property 2139 * change (not sleep or core-sched pick). 2140 */ 2141 if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC))) 2142 deq_flags |= SCX_DEQ_SCHED_CHANGE; 2143 2144 if (!(p->scx.flags & SCX_TASK_QUEUED)) { 2145 WARN_ON_ONCE(task_runnable(p)); 2146 return true; 2147 } 2148 2149 ops_dequeue(rq, p, deq_flags); 2150 2151 /* 2152 * A currently running task which is going off @rq first gets dequeued 2153 * and then stops running. As we want running <-> stopping transitions 2154 * to be contained within runnable <-> quiescent transitions, trigger 2155 * ->stopping() early here instead of in put_prev_task_scx(). 
2156 * 2157 * @p may go through multiple stopping <-> running transitions between 2158 * here and put_prev_task_scx() if task attribute changes occur while 2159 * balance_one() leaves @rq unlocked. However, they don't contain any 2160 * information meaningful to the BPF scheduler and can be suppressed by 2161 * skipping the callbacks if the task is !QUEUED. 2162 */ 2163 if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) { 2164 update_curr_scx(rq); 2165 SCX_CALL_OP_TASK(sch, stopping, rq, p, false); 2166 } 2167 2168 if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p)) 2169 SCX_CALL_OP_TASK(sch, quiescent, rq, p, deq_flags); 2170 2171 if (deq_flags & SCX_DEQ_SLEEP) 2172 p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; 2173 else 2174 p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; 2175 2176 p->scx.flags &= ~SCX_TASK_QUEUED; 2177 rq->scx.nr_running--; 2178 sub_nr_running(rq, 1); 2179 2180 dispatch_dequeue(rq, p); 2181 clear_direct_dispatch(p); 2182 return true; 2183 } 2184 2185 static void yield_task_scx(struct rq *rq) 2186 { 2187 struct task_struct *p = rq->donor; 2188 struct scx_sched *sch = scx_task_sched(p); 2189 2190 if (SCX_HAS_OP(sch, yield)) 2191 SCX_CALL_OP_2TASKS_RET(sch, yield, rq, p, NULL); 2192 else 2193 p->scx.slice = 0; 2194 } 2195 2196 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) 2197 { 2198 struct task_struct *from = rq->donor; 2199 struct scx_sched *sch = scx_task_sched(from); 2200 2201 if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to)) 2202 return SCX_CALL_OP_2TASKS_RET(sch, yield, rq, from, to); 2203 else 2204 return false; 2205 } 2206 2207 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) 2208 { 2209 /* 2210 * Preemption between SCX tasks is implemented by resetting the victim 2211 * task's slice to 0 and triggering reschedule on the target CPU. 2212 * Nothing to do. 
2213 */ 2214 if (p->sched_class == &ext_sched_class) 2215 return; 2216 2217 /* 2218 * Getting preempted by a higher-priority class. Reenqueue IMMED tasks. 2219 * This captures all preemption cases including: 2220 * 2221 * - A SCX task is currently running. 2222 * 2223 * - @rq is waking from idle due to a SCX task waking to it. 2224 * 2225 * - A higher-priority wakes up while SCX dispatch is in progress. 2226 */ 2227 if (rq->scx.nr_immed) 2228 schedule_reenq_local(rq, 0); 2229 } 2230 2231 static void move_local_task_to_local_dsq(struct scx_sched *sch, 2232 struct task_struct *p, u64 enq_flags, 2233 struct scx_dispatch_q *src_dsq, 2234 struct rq *dst_rq) 2235 { 2236 struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq; 2237 2238 /* @dsq is locked and @p is on @dst_rq */ 2239 lockdep_assert_held(&src_dsq->lock); 2240 lockdep_assert_rq_held(dst_rq); 2241 2242 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 2243 2244 if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) 2245 list_add(&p->scx.dsq_list.node, &dst_dsq->list); 2246 else 2247 list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); 2248 2249 dsq_inc_nr(dst_dsq, p, enq_flags); 2250 p->scx.dsq = dst_dsq; 2251 2252 local_dsq_post_enq(sch, dst_dsq, p, enq_flags); 2253 } 2254 2255 /** 2256 * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ 2257 * @p: task to move 2258 * @enq_flags: %SCX_ENQ_* 2259 * @src_rq: rq to move the task from, locked on entry, released on return 2260 * @dst_rq: rq to move the task into, locked on return 2261 * 2262 * Move @p which is currently on @src_rq to @dst_rq's local DSQ. 2263 */ 2264 static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, 2265 struct rq *src_rq, struct rq *dst_rq) 2266 { 2267 lockdep_assert_rq_held(src_rq); 2268 2269 /* 2270 * Set sticky_cpu before deactivate_task() to properly mark the 2271 * beginning of an SCX-internal migration. 
2272 */ 2273 p->scx.sticky_cpu = cpu_of(dst_rq); 2274 deactivate_task(src_rq, p, 0); 2275 set_task_cpu(p, cpu_of(dst_rq)); 2276 2277 raw_spin_rq_unlock(src_rq); 2278 raw_spin_rq_lock(dst_rq); 2279 2280 /* 2281 * We want to pass scx-specific enq_flags but activate_task() will 2282 * truncate the upper 32 bit. As we own @rq, we can pass them through 2283 * @rq->scx.extra_enq_flags instead. 2284 */ 2285 WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); 2286 WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); 2287 dst_rq->scx.extra_enq_flags = enq_flags; 2288 activate_task(dst_rq, p, 0); 2289 dst_rq->scx.extra_enq_flags = 0; 2290 } 2291 2292 /* 2293 * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two 2294 * differences: 2295 * 2296 * - is_cpu_allowed() asks "Can this task run on this CPU?" while 2297 * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to 2298 * this CPU?". 2299 * 2300 * While migration is disabled, is_cpu_allowed() has to say "yes" as the task 2301 * must be allowed to finish on the CPU that it's currently on regardless of 2302 * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the 2303 * BPF scheduler shouldn't attempt to migrate a task which has migration 2304 * disabled. 2305 * 2306 * - The BPF scheduler is bypassed while the rq is offline and we can always say 2307 * no to the BPF scheduler initiated migrations while offline. 2308 * 2309 * The caller must ensure that @p and @rq are on different CPUs. 2310 */ 2311 static bool task_can_run_on_remote_rq(struct scx_sched *sch, 2312 struct task_struct *p, struct rq *rq, 2313 bool enforce) 2314 { 2315 s32 cpu = cpu_of(rq); 2316 2317 WARN_ON_ONCE(task_cpu(p) == cpu); 2318 2319 /* 2320 * If @p has migration disabled, @p->cpus_ptr is updated to contain only 2321 * the pinned CPU in migrate_disable_switch() while @p is being switched 2322 * out. 
However, put_prev_task_scx() is called before @p->cpus_ptr is 2323 * updated and thus another CPU may see @p on a DSQ inbetween leading to 2324 * @p passing the below task_allowed_on_cpu() check while migration is 2325 * disabled. 2326 * 2327 * Test the migration disabled state first as the race window is narrow 2328 * and the BPF scheduler failing to check migration disabled state can 2329 * easily be masked if task_allowed_on_cpu() is done first. 2330 */ 2331 if (unlikely(is_migration_disabled(p))) { 2332 if (enforce) 2333 scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", 2334 p->comm, p->pid, task_cpu(p), cpu); 2335 return false; 2336 } 2337 2338 /* 2339 * We don't require the BPF scheduler to avoid dispatching to offline 2340 * CPUs mostly for convenience but also because CPUs can go offline 2341 * between scx_bpf_dsq_insert() calls and here. Trigger error iff the 2342 * picked CPU is outside the allowed mask. 2343 */ 2344 if (!task_allowed_on_cpu(p, cpu)) { 2345 if (enforce) 2346 scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", 2347 cpu, p->comm, p->pid); 2348 return false; 2349 } 2350 2351 if (!scx_rq_online(rq)) { 2352 if (enforce) 2353 __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); 2354 return false; 2355 } 2356 2357 return true; 2358 } 2359 2360 /** 2361 * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq 2362 * @p: target task 2363 * @dsq: locked DSQ @p is currently on 2364 * @src_rq: rq @p is currently on, stable with @dsq locked 2365 * 2366 * Called with @dsq locked but no rq's locked. We want to move @p to a different 2367 * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is 2368 * required when transferring into a local DSQ. 
Even when transferring into a 2369 * non-local DSQ, it's better to use the same mechanism to protect against 2370 * dequeues and maintain the invariant that @p->scx.dsq can only change while 2371 * @src_rq is locked, which e.g. scx_dump_task() depends on. 2372 * 2373 * We want to grab @src_rq but that can deadlock if we try while locking @dsq, 2374 * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As 2375 * this may race with dequeue, which can't drop the rq lock or fail, do a little 2376 * dancing from our side. 2377 * 2378 * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets 2379 * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu 2380 * would be cleared to -1. While other cpus may have updated it to different 2381 * values afterwards, as this operation can't be preempted or recurse, the 2382 * holding_cpu can never become this CPU again before we're done. Thus, we can 2383 * tell whether we lost to dequeue by testing whether the holding_cpu still 2384 * points to this CPU. See dispatch_dequeue() for the counterpart. 2385 * 2386 * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is 2387 * still valid. %false if lost to dequeue. 
2388 */ 2389 static bool unlink_dsq_and_lock_src_rq(struct task_struct *p, 2390 struct scx_dispatch_q *dsq, 2391 struct rq *src_rq) 2392 { 2393 s32 cpu = raw_smp_processor_id(); 2394 2395 lockdep_assert_held(&dsq->lock); 2396 2397 WARN_ON_ONCE(p->scx.holding_cpu >= 0); 2398 task_unlink_from_dsq(p, dsq); 2399 p->scx.holding_cpu = cpu; 2400 2401 raw_spin_unlock(&dsq->lock); 2402 raw_spin_rq_lock(src_rq); 2403 2404 /* task_rq couldn't have changed if we're still the holding cpu */ 2405 return likely(p->scx.holding_cpu == cpu) && 2406 !WARN_ON_ONCE(src_rq != task_rq(p)); 2407 } 2408 2409 static bool consume_remote_task(struct rq *this_rq, 2410 struct task_struct *p, u64 enq_flags, 2411 struct scx_dispatch_q *dsq, struct rq *src_rq) 2412 { 2413 raw_spin_rq_unlock(this_rq); 2414 2415 if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) { 2416 move_remote_task_to_local_dsq(p, enq_flags, src_rq, this_rq); 2417 return true; 2418 } else { 2419 raw_spin_rq_unlock(src_rq); 2420 raw_spin_rq_lock(this_rq); 2421 return false; 2422 } 2423 } 2424 2425 /** 2426 * move_task_between_dsqs() - Move a task from one DSQ to another 2427 * @sch: scx_sched being operated on 2428 * @p: target task 2429 * @enq_flags: %SCX_ENQ_* 2430 * @src_dsq: DSQ @p is currently on, must not be a local DSQ 2431 * @dst_dsq: DSQ @p is being moved to, can be any DSQ 2432 * 2433 * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local 2434 * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq 2435 * will change. As @p's task_rq is locked, this function doesn't need to use the 2436 * holding_cpu mechanism. 2437 * 2438 * On return, @src_dsq is unlocked and only @p's new task_rq, which is the 2439 * return value, is locked. 
*/
static struct rq *move_task_between_dsqs(struct scx_sched *sch,
					 struct task_struct *p, u64 enq_flags,
					 struct scx_dispatch_q *src_dsq,
					 struct scx_dispatch_q *dst_dsq)
{
	struct rq *src_rq = task_rq(p), *dst_rq;

	BUG_ON(src_dsq->id == SCX_DSQ_LOCAL);
	lockdep_assert_held(&src_dsq->lock);
	lockdep_assert_rq_held(src_rq);

	if (dst_dsq->id == SCX_DSQ_LOCAL) {
		dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
		/* @p can't go to the remote rq, fall back to the global DSQ */
		if (src_rq != dst_rq &&
		    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
			dst_dsq = find_global_dsq(sch, task_cpu(p));
			dst_rq = src_rq;
			enq_flags |= SCX_ENQ_GDSQ_FALLBACK;
		}
	} else {
		/* no need to migrate if destination is a non-local DSQ */
		dst_rq = src_rq;
	}

	/*
	 * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
	 * CPU, @p will be migrated.
	 */
	if (dst_dsq->id == SCX_DSQ_LOCAL) {
		/* @p is going from a non-local DSQ to a local DSQ */
		if (src_rq == dst_rq) {
			task_unlink_from_dsq(p, src_dsq);
			move_local_task_to_local_dsq(sch, p, enq_flags,
						     src_dsq, dst_rq);
			raw_spin_unlock(&src_dsq->lock);
		} else {
			raw_spin_unlock(&src_dsq->lock);
			move_remote_task_to_local_dsq(p, enq_flags,
						      src_rq, dst_rq);
		}
	} else {
		/*
		 * @p is going from a non-local DSQ to a non-local DSQ. As
		 * $src_dsq is already locked, do an abbreviated dequeue.
		 */
		dispatch_dequeue_locked(p, src_dsq);
		raw_spin_unlock(&src_dsq->lock);

		dispatch_enqueue(sch, dst_rq, dst_dsq, p, enq_flags);
	}

	return dst_rq;
}

/*
 * Try to move one task from the non-local @dsq to @rq's local DSQ. Tasks
 * already on @rq are moved directly; tasks on other rqs are migrated if
 * task_can_run_on_remote_rq() allows it, in which case @rq's lock is dropped
 * and re-acquired along the way. Losing a race with dequeue while migrating
 * restarts the scan from the top. Returns %true if a task was consumed.
 */
static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
			       struct scx_dispatch_q *dsq, u64 enq_flags)
{
	struct task_struct *p;
retry:
	/*
	 * The caller can't expect to successfully consume a task if the task's
	 * addition to @dsq isn't guaranteed to be visible somehow. Test
	 * @dsq->list without locking and skip if it seems empty.
	 */
	if (list_empty(&dsq->list))
		return false;

	raw_spin_lock(&dsq->lock);

	nldsq_for_each_task(p, dsq) {
		struct rq *task_rq = task_rq(p);

		/*
		 * This loop can lead to multiple lockup scenarios, e.g. the BPF
		 * scheduler can put an enormous number of affinitized tasks into
		 * a contended DSQ, or the outer retry loop can repeatedly race
		 * against scx_bypass() dequeueing tasks from @dsq trying to put
		 * the system into the bypass mode. This can easily live-lock the
		 * machine. If aborting, exit from all non-bypass DSQs.
		 */
		if (unlikely(READ_ONCE(sch->aborting)) && dsq->id != SCX_DSQ_BYPASS)
			break;

		/* @p is already on @rq, no migration necessary */
		if (rq == task_rq) {
			task_unlink_from_dsq(p, dsq);
			move_local_task_to_local_dsq(sch, p, enq_flags, dsq, rq);
			raw_spin_unlock(&dsq->lock);
			return true;
		}

		/* consume_remote_task() releases @dsq's lock on both outcomes */
		if (task_can_run_on_remote_rq(sch, p, rq, false)) {
			if (likely(consume_remote_task(rq, p, enq_flags, dsq, task_rq)))
				return true;
			goto retry;
		}
	}

	raw_spin_unlock(&dsq->lock);
	return false;
}

/* Consume from the global DSQ of the NUMA node that @rq's CPU belongs to. */
static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
{
	int node = cpu_to_node(cpu_of(rq));

	return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq, 0);
}

/**
 * dispatch_to_local_dsq - Dispatch a task to a local dsq
 * @sch: scx_sched being operated on
 * @rq: current rq which is locked
 * @dst_dsq: destination DSQ
 * @p: task to dispatch
 * @enq_flags: %SCX_ENQ_*
 *
 * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
 * DSQ. This function performs all the synchronization dancing needed because
 * local DSQs are protected with rq locks.
 *
 * The caller must have exclusive ownership of @p (e.g. through
 * %SCX_OPSS_DISPATCHING).
 */
static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
				  struct scx_dispatch_q *dst_dsq,
				  struct task_struct *p, u64 enq_flags)
{
	struct rq *src_rq = task_rq(p);
	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
	struct rq *locked_rq = rq;

	/*
	 * We're synchronized against dequeue through DISPATCHING. As @p can't
	 * be dequeued, its task_rq and cpus_allowed are stable too.
	 *
	 * If dispatching to @rq that @p is already on, no lock dancing needed.
	 */
	if (rq == src_rq && rq == dst_rq) {
		dispatch_enqueue(sch, rq, dst_dsq, p,
				 enq_flags | SCX_ENQ_CLEAR_OPSS);
		return;
	}

	/* @p can't be moved to @dst_rq, fall back to the global DSQ */
	if (src_rq != dst_rq &&
	    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
		dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p,
				 enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK);
		return;
	}

	/*
	 * @p is on a possibly remote @src_rq which we need to lock to move the
	 * task. If dequeue is in progress, it'd be locking @src_rq and waiting
	 * on DISPATCHING, so we can't grab @src_rq lock while holding
	 * DISPATCHING.
	 *
	 * As DISPATCHING guarantees that @p is wholly ours, we can pretend that
	 * we're moving from a DSQ and use the same mechanism - mark the task
	 * under transfer with holding_cpu, release DISPATCHING and then follow
	 * the same protocol. See unlink_dsq_and_lock_src_rq().
	 */
	p->scx.holding_cpu = raw_smp_processor_id();

	/* store_release ensures that dequeue sees the above */
	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);

	/* switch to @src_rq lock */
	if (locked_rq != src_rq) {
		raw_spin_rq_unlock(locked_rq);
		locked_rq = src_rq;
		raw_spin_rq_lock(src_rq);
	}

	/* task_rq couldn't have changed if we're still the holding cpu */
	if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
	    !WARN_ON_ONCE(src_rq != task_rq(p))) {
		/*
		 * If @p is staying on the same rq, there's no need to go
		 * through the full deactivate/activate cycle. Optimize by
		 * abbreviating move_remote_task_to_local_dsq().
		 */
		if (src_rq == dst_rq) {
			p->scx.holding_cpu = -1;
			dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p,
					 enq_flags);
		} else {
			move_remote_task_to_local_dsq(p, enq_flags,
						      src_rq, dst_rq);
			/* task has been moved to dst_rq, which is now locked */
			locked_rq = dst_rq;
		}

		/* if the destination CPU is idle, wake it up */
		if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
			resched_curr(dst_rq);
	}

	/* switch back to @rq lock */
	if (locked_rq != rq) {
		raw_spin_rq_unlock(locked_rq);
		raw_spin_rq_lock(rq);
	}
}

/**
 * finish_dispatch - Asynchronously finish dispatching a task
 * @sch: scx_sched being operated on
 * @rq: current rq which is locked
 * @p: task to finish dispatching
 * @qseq_at_dispatch: qseq when @p started getting dispatched
 * @dsq_id: destination DSQ ID
 * @enq_flags: %SCX_ENQ_*
 *
 * Dispatching to local DSQs may need to wait for queueing to complete or
 * require rq lock dancing. As we don't wanna do either while inside
 * ops.dispatch() to avoid locking order inversion, we split dispatching into
 * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the
 * task and its qseq. Once ops.dispatch() returns, this function is called to
 * finish up.
 *
 * There is no guarantee that @p is still valid for dispatching or even that it
 * was valid in the first place. Make sure that the task is still owned by the
 * BPF scheduler and claim the ownership before dispatching.
 */
static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
			    struct task_struct *p,
			    unsigned long qseq_at_dispatch,
			    u64 dsq_id, u64 enq_flags)
{
	struct scx_dispatch_q *dsq;
	unsigned long opss;

	touch_core_sched_dispatch(rq, p);
retry:
	/*
	 * No need for _acquire here. @p is accessed only after a successful
	 * try_cmpxchg to DISPATCHING.
	 */
	opss = atomic_long_read(&p->scx.ops_state);

	switch (opss & SCX_OPSS_STATE_MASK) {
	case SCX_OPSS_DISPATCHING:
	case SCX_OPSS_NONE:
		/* someone else already got to it */
		return;
	case SCX_OPSS_QUEUED:
		/*
		 * If qseq doesn't match, @p has gone through at least one
		 * dispatch/dequeue and re-enqueue cycle between
		 * scx_bpf_dsq_insert() and here and we have no claim on it.
		 */
		if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
			return;

		/* see SCX_EV_INSERT_NOT_OWNED definition */
		if (unlikely(!scx_task_on_sched(sch, p))) {
			__scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1);
			return;
		}

		/*
		 * While we know @p is accessible, we don't yet have a claim on
		 * it - the BPF scheduler is allowed to dispatch tasks
		 * spuriously and there can be a racing dequeue attempt. Let's
		 * claim @p by atomically transitioning it from QUEUED to
		 * DISPATCHING.
		 */
		if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
						   SCX_OPSS_DISPATCHING)))
			break;
		goto retry;
	case SCX_OPSS_QUEUEING:
		/*
		 * do_enqueue_task() is in the process of transferring the task
		 * to the BPF scheduler while holding @p's rq lock. As we aren't
		 * holding any kernel or BPF resource that the enqueue path may
		 * depend upon, it's safe to wait.
		 */
		wait_ops_state(p, opss);
		goto retry;
	}

	/* @p is now claimed through DISPATCHING and must still be queued */
	BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));

	dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, task_cpu(p));

	if (dsq->id == SCX_DSQ_LOCAL)
		dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
	else
		dispatch_enqueue(sch, rq, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
}

/*
 * Run finish_dispatch() on every entry buffered in this CPU's dispatch context
 * for @sch, then add the number of processed entries to ->nr_tasks and reset
 * the buffer cursor.
 */
static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
{
	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
	u32 u;

	for (u = 0; u < dspc->cursor; u++) {
		struct scx_dsp_buf_ent *ent = &dspc->buf[u];

		finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
				ent->enq_flags);
	}

	dspc->nr_tasks += dspc->cursor;
	dspc->cursor = 0;
}

/*
 * If %SCX_RQ_BAL_CB_PENDING is set on @rq, queue the deferred balance callback
 * and clear the flag. The caller must hold @rq's lock.
 */
static inline void maybe_queue_balance_callback(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
		return;

	queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
			       deferred_bal_cb_workfn);

	rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
}

/*
 * One user of this function is scx_bpf_dispatch() which can be called
 * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
 * from the call frame.
*/
/*
 * Try to find a task for @rq to run on behalf of @sch - from the global DSQ,
 * the bypass DSQ, or by looping over ops.dispatch(). Returns %true if @rq has
 * something to run, which includes keeping @prev (%SCX_RQ_BAL_KEEP).
 *
 * @nested is %false for the invocation from balance_one(), which stashes @prev
 * in @rq->scx.sub_dispatch_prev for the duration of ops.dispatch().
 */
static __always_inline bool
scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
		   struct task_struct *prev, bool nested)
{
	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
	int nr_loops = SCX_DSP_MAX_LOOPS;
	s32 cpu = cpu_of(rq);
	/* @prev is passed to ops.dispatch() only if it's an SCX task on @sch */
	bool prev_on_sch = (prev->sched_class == &ext_sched_class) &&
		scx_task_on_sched(sch, prev);

	if (consume_global_dsq(sch, rq))
		return true;

	if (bypass_dsp_enabled(sch)) {
		/* if @sch is bypassing, only the bypass DSQs are active */
		if (scx_bypassing(sch, cpu))
			return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0);

#ifdef CONFIG_EXT_SUB_SCHED
		/*
		 * If @sch isn't bypassing but its children are, @sch is
		 * responsible for making forward progress for both its own
		 * tasks that aren't bypassing and the bypassing descendants'
		 * tasks. The following implements a simple built-in behavior -
		 * let each CPU try to run the bypass DSQ every Nth time.
		 *
		 * Later, if necessary, we can add an ops flag to suppress the
		 * auto-consumption and a kfunc to consume the bypass DSQ so
		 * that the BPF scheduler can fully control scheduling of
		 * bypassed tasks.
		 */
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

		if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
		    consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0)) {
			__scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1);
			return true;
		}
#endif /* CONFIG_EXT_SUB_SCHED */
	}

	if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
		return false;

	dspc->rq = rq;

	/*
	 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
	 * the local DSQ might still end up empty after a successful
	 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
	 * produced some tasks, retry. The BPF scheduler may depend on this
	 * looping behavior to simplify its implementation.
	 */
	do {
		dspc->nr_tasks = 0;

		if (nested) {
			SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu),
				    prev_on_sch ? prev : NULL);
		} else {
			/* stash @prev so that nested invocations can access it */
			rq->scx.sub_dispatch_prev = prev;
			SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu),
				    prev_on_sch ? prev : NULL);
			rq->scx.sub_dispatch_prev = NULL;
		}

		flush_dispatch_buf(sch, rq);

		/* @prev is still queued with slice left, keep running it */
		if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) {
			rq->scx.flags |= SCX_RQ_BAL_KEEP;
			return true;
		}
		if (rq->scx.local_dsq.nr)
			return true;
		if (consume_global_dsq(sch, rq))
			return true;

		/*
		 * ops.dispatch() can trap us in this loop by repeatedly
		 * dispatching ineligible tasks. Break out once in a while to
		 * allow the watchdog to run. As IRQ can't be enabled in
		 * balance(), we want to complete this scheduling cycle and then
		 * start a new one. IOW, we want to call resched_curr() on the
		 * next, most likely idle, task, not the current one. Use
		 * scx_kick_cpu() for deferred kicking.
		 */
		if (unlikely(!--nr_loops)) {
			scx_kick_cpu(sch, cpu, 0);
			break;
		}
	} while (dspc->nr_tasks);

	/*
	 * Prevent the CPU from going idle while bypassed descendants have tasks
	 * queued. Without this fallback, bypassed tasks could stall if the host
	 * scheduler's ops.dispatch() doesn't yield any tasks.
	 */
	if (bypass_dsp_enabled(sch))
		return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0);

	return false;
}

/*
 * Make sure @rq has an SCX task to run, dispatching from the root scheduler
 * (scx_root) if necessary. Returns %true if there is something to run,
 * including the case where @prev should keep running (%SCX_RQ_BAL_KEEP), and
 * %false otherwise. %SCX_RQ_IN_BALANCE is set on @rq for the duration of the
 * call. The caller must hold @rq's lock.
 */
static int balance_one(struct rq *rq, struct task_struct *prev)
{
	struct scx_sched *sch = scx_root;
	s32 cpu = cpu_of(rq);

	lockdep_assert_rq_held(rq);
	rq->scx.flags |= SCX_RQ_IN_BALANCE;
	rq->scx.flags &= ~SCX_RQ_BAL_KEEP;

	if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
	    unlikely(rq->scx.cpu_released)) {
		/*
		 * If the previous sched_class for the current CPU was not SCX,
		 * notify the BPF scheduler that it again has control of the
		 * core. This callback complements ->cpu_release(), which is
		 * emitted in switch_class().
		 */
		if (sch->ops.cpu_acquire)
			SCX_CALL_OP(sch, cpu_acquire, rq, cpu, NULL);
		rq->scx.cpu_released = false;
	}

	if (prev->sched_class == &ext_sched_class) {
		update_curr_scx(rq);

		/*
		 * If @prev is runnable & has slice left, it has priority and
		 * fetching more just increases latency for the fetched tasks.
		 * Tell pick_task_scx() to keep running @prev. If the BPF
		 * scheduler wants to handle this explicitly, it should
		 * implement ->cpu_release().
		 *
		 * See scx_disable_workfn() for the explanation on the bypassing
		 * test.
		 */
		if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice &&
		    !scx_bypassing(sch, cpu)) {
			rq->scx.flags |= SCX_RQ_BAL_KEEP;
			goto has_tasks;
		}
	}

	/* if there already are tasks to run, nothing to do */
	if (rq->scx.local_dsq.nr)
		goto has_tasks;

	if (scx_dispatch_sched(sch, rq, prev, false))
		goto has_tasks;

	/*
	 * Didn't find another task to run. Keep running @prev unless
	 * %SCX_OPS_ENQ_LAST is in effect.
	 */
	if ((prev->scx.flags & SCX_TASK_QUEUED) &&
	    (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) {
		rq->scx.flags |= SCX_RQ_BAL_KEEP;
		__scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
		goto has_tasks;
	}
	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
	return false;

has_tasks:
	/*
	 * @rq may have extra IMMED tasks without reenq scheduled:
	 *
	 * - rq_is_open() can't reliably tell when and how slice is going to be
	 *   modified for $curr and allows IMMED tasks to be queued while
	 *   dispatch is in progress.
	 *
	 * - A non-IMMED HEAD task can get queued in front of an IMMED task
	 *   between the IMMED queueing and the subsequent scheduling event.
	 */
	if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed))
		schedule_reenq_local(rq, 0);

	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
	return true;
}

static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
{
	struct scx_sched *sch = scx_task_sched(p);

	if (p->scx.flags & SCX_TASK_QUEUED) {
		/*
		 * Core-sched might decide to execute @p before it is
		 * dispatched. Call ops_dequeue() to notify the BPF scheduler.
		 */
		ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC);
		dispatch_dequeue(rq, p);
	}

	p->se.exec_start = rq_clock_task(rq);

	/* see dequeue_task_scx() on why we skip when !QUEUED */
	if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
		SCX_CALL_OP_TASK(sch, running, rq, p);

	clr_task_runnable(p, true);

	/*
	 * @p is getting newly scheduled or got kicked after someone updated its
	 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick().
2974 */ 2975 if ((p->scx.slice == SCX_SLICE_INF) != 2976 (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { 2977 if (p->scx.slice == SCX_SLICE_INF) 2978 rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; 2979 else 2980 rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; 2981 2982 sched_update_tick_dependency(rq); 2983 2984 /* 2985 * For now, let's refresh the load_avgs just when transitioning 2986 * in and out of nohz. In the future, we might want to add a 2987 * mechanism which calls the following periodically on 2988 * tick-stopped CPUs. 2989 */ 2990 update_other_load_avgs(rq); 2991 } 2992 } 2993 2994 static enum scx_cpu_preempt_reason 2995 preempt_reason_from_class(const struct sched_class *class) 2996 { 2997 if (class == &stop_sched_class) 2998 return SCX_CPU_PREEMPT_STOP; 2999 if (class == &dl_sched_class) 3000 return SCX_CPU_PREEMPT_DL; 3001 if (class == &rt_sched_class) 3002 return SCX_CPU_PREEMPT_RT; 3003 return SCX_CPU_PREEMPT_UNKNOWN; 3004 } 3005 3006 static void switch_class(struct rq *rq, struct task_struct *next) 3007 { 3008 struct scx_sched *sch = scx_root; 3009 const struct sched_class *next_class = next->sched_class; 3010 3011 if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) 3012 return; 3013 3014 /* 3015 * The callback is conceptually meant to convey that the CPU is no 3016 * longer under the control of SCX. Therefore, don't invoke the callback 3017 * if the next class is below SCX (in which case the BPF scheduler has 3018 * actively decided not to schedule any tasks on the CPU). 3019 */ 3020 if (sched_class_above(&ext_sched_class, next_class)) 3021 return; 3022 3023 /* 3024 * At this point we know that SCX was preempted by a higher priority 3025 * sched_class, so invoke the ->cpu_release() callback if we have not 3026 * done so already. We only send the callback once between SCX being 3027 * preempted, and it regaining control of the CPU. 3028 * 3029 * ->cpu_release() complements ->cpu_acquire(), which is emitted the 3030 * next time that balance_one() is invoked. 
3031 */ 3032 if (!rq->scx.cpu_released) { 3033 if (sch->ops.cpu_release) { 3034 struct scx_cpu_release_args args = { 3035 .reason = preempt_reason_from_class(next_class), 3036 .task = next, 3037 }; 3038 3039 SCX_CALL_OP(sch, cpu_release, rq, cpu_of(rq), &args); 3040 } 3041 rq->scx.cpu_released = true; 3042 } 3043 } 3044 3045 static void put_prev_task_scx(struct rq *rq, struct task_struct *p, 3046 struct task_struct *next) 3047 { 3048 struct scx_sched *sch = scx_task_sched(p); 3049 3050 /* see kick_sync_wait_bal_cb() */ 3051 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3052 3053 update_curr_scx(rq); 3054 3055 /* see dequeue_task_scx() on why we skip when !QUEUED */ 3056 if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED)) 3057 SCX_CALL_OP_TASK(sch, stopping, rq, p, true); 3058 3059 if (p->scx.flags & SCX_TASK_QUEUED) { 3060 set_task_runnable(rq, p); 3061 3062 /* 3063 * If @p has slice left and is being put, @p is getting 3064 * preempted by a higher priority scheduler class or core-sched 3065 * forcing a different task. Leave it at the head of the local 3066 * DSQ unless it was an IMMED task. IMMED tasks should not 3067 * linger on a busy CPU, reenqueue them to the BPF scheduler. 3068 */ 3069 if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) { 3070 if (p->scx.flags & SCX_TASK_IMMED) { 3071 p->scx.flags |= SCX_TASK_REENQ_PREEMPTED; 3072 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); 3073 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 3074 } else { 3075 dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD); 3076 } 3077 goto switch_class; 3078 } 3079 3080 /* 3081 * If @p is runnable but we're about to enter a lower 3082 * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell 3083 * ops.enqueue() that @p is the only one available for this cpu, 3084 * which should trigger an explicit follow-up scheduling event. 
3085 */ 3086 if (next && sched_class_above(&ext_sched_class, next->sched_class)) { 3087 WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); 3088 do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); 3089 } else { 3090 do_enqueue_task(rq, p, 0, -1); 3091 } 3092 } 3093 3094 switch_class: 3095 if (next && next->sched_class != &ext_sched_class) 3096 switch_class(rq, next); 3097 } 3098 3099 static void kick_sync_wait_bal_cb(struct rq *rq) 3100 { 3101 struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); 3102 unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; 3103 bool waited; 3104 s32 cpu; 3105 3106 /* 3107 * Drop rq lock and enable IRQs while waiting. IRQs must be enabled 3108 * — a target CPU may be waiting for us to process an IPI (e.g. TLB 3109 * flush) while we wait for its kick_sync to advance. 3110 * 3111 * Also, keep advancing our own kick_sync so that new kick_sync waits 3112 * targeting us, which can start after we drop the lock, cannot form 3113 * cyclic dependencies. 3114 */ 3115 retry: 3116 waited = false; 3117 for_each_cpu(cpu, rq->scx.cpus_to_sync) { 3118 /* 3119 * smp_load_acquire() pairs with smp_store_release() on 3120 * kick_sync updates on the target CPUs. 
3121 */ 3122 if (cpu == cpu_of(rq) || 3123 smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { 3124 cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); 3125 continue; 3126 } 3127 3128 raw_spin_rq_unlock_irq(rq); 3129 while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { 3130 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3131 cpu_relax(); 3132 } 3133 raw_spin_rq_lock_irq(rq); 3134 waited = true; 3135 } 3136 3137 if (waited) 3138 goto retry; 3139 } 3140 3141 static struct task_struct *first_local_task(struct rq *rq) 3142 { 3143 return list_first_entry_or_null(&rq->scx.local_dsq.list, 3144 struct task_struct, scx.dsq_list.node); 3145 } 3146 3147 static struct task_struct * 3148 do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) 3149 { 3150 struct task_struct *prev = rq->curr; 3151 bool keep_prev; 3152 struct task_struct *p; 3153 3154 /* see kick_sync_wait_bal_cb() */ 3155 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3156 3157 rq_modified_begin(rq, &ext_sched_class); 3158 3159 rq_unpin_lock(rq, rf); 3160 balance_one(rq, prev); 3161 rq_repin_lock(rq, rf); 3162 maybe_queue_balance_callback(rq); 3163 3164 /* 3165 * Defer to a balance callback which can drop rq lock and enable 3166 * IRQs. Waiting directly in the pick path would deadlock against 3167 * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. 3168 */ 3169 if (unlikely(rq->scx.kick_sync_pending)) { 3170 rq->scx.kick_sync_pending = false; 3171 queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, 3172 kick_sync_wait_bal_cb); 3173 } 3174 3175 /* 3176 * If any higher-priority sched class enqueued a runnable task on 3177 * this rq during balance_one(), abort and return RETRY_TASK, so 3178 * that the scheduler loop can restart. 3179 * 3180 * If @force_scx is true, always try to pick a SCHED_EXT task, 3181 * regardless of any higher-priority sched classes activity. 
3182 */ 3183 if (!force_scx && rq_modified_above(rq, &ext_sched_class)) 3184 return RETRY_TASK; 3185 3186 keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; 3187 if (unlikely(keep_prev && 3188 prev->sched_class != &ext_sched_class)) { 3189 WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); 3190 keep_prev = false; 3191 } 3192 3193 /* 3194 * If balance_one() is telling us to keep running @prev, replenish slice 3195 * if necessary and keep running @prev. Otherwise, pop the first one 3196 * from the local DSQ. 3197 */ 3198 if (keep_prev) { 3199 p = prev; 3200 if (!p->scx.slice) 3201 refill_task_slice_dfl(scx_task_sched(p), p); 3202 } else { 3203 p = first_local_task(rq); 3204 if (!p) 3205 return NULL; 3206 3207 if (unlikely(!p->scx.slice)) { 3208 struct scx_sched *sch = scx_task_sched(p); 3209 3210 if (!scx_bypassing(sch, cpu_of(rq)) && 3211 !sch->warned_zero_slice) { 3212 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", 3213 p->comm, p->pid, __func__); 3214 sch->warned_zero_slice = true; 3215 } 3216 refill_task_slice_dfl(sch, p); 3217 } 3218 } 3219 3220 return p; 3221 } 3222 3223 static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) 3224 { 3225 return do_pick_task_scx(rq, rf, false); 3226 } 3227 3228 /* 3229 * Select the next task to run from the ext scheduling class. 3230 * 3231 * Use do_pick_task_scx() directly with @force_scx enabled, since the 3232 * dl_server must always select a sched_ext task. 3233 */ 3234 static struct task_struct * 3235 ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) 3236 { 3237 if (!scx_enabled()) 3238 return NULL; 3239 3240 return do_pick_task_scx(dl_se->rq, rf, true); 3241 } 3242 3243 /* 3244 * Initialize the ext server deadline entity. 
3245 */ 3246 void ext_server_init(struct rq *rq) 3247 { 3248 struct sched_dl_entity *dl_se = &rq->ext_server; 3249 3250 init_dl_entity(dl_se); 3251 3252 dl_server_init(dl_se, rq, ext_server_pick_task); 3253 } 3254 3255 #ifdef CONFIG_SCHED_CORE 3256 /** 3257 * scx_prio_less - Task ordering for core-sched 3258 * @a: task A 3259 * @b: task B 3260 * @in_fi: in forced idle state 3261 * 3262 * Core-sched is implemented as an additional scheduling layer on top of the 3263 * usual sched_class'es and needs to find out the expected task ordering. For 3264 * SCX, core-sched calls this function to interrogate the task ordering. 3265 * 3266 * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used 3267 * to implement the default task ordering. The older the timestamp, the higher 3268 * priority the task - the global FIFO ordering matching the default scheduling 3269 * behavior. 3270 * 3271 * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to 3272 * implement FIFO ordering within each local DSQ. See pick_task_scx(). 3273 */ 3274 bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, 3275 bool in_fi) 3276 { 3277 struct scx_sched *sch_a = scx_task_sched(a); 3278 struct scx_sched *sch_b = scx_task_sched(b); 3279 3280 /* 3281 * The const qualifiers are dropped from task_struct pointers when 3282 * calling ops.core_sched_before(). Accesses are controlled by the 3283 * verifier. 
3284 */ 3285 if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) && 3286 !scx_bypassing(sch_a, task_cpu(a))) 3287 return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before, 3288 task_rq(a), 3289 (struct task_struct *)a, 3290 (struct task_struct *)b); 3291 else 3292 return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); 3293 } 3294 #endif /* CONFIG_SCHED_CORE */ 3295 3296 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) 3297 { 3298 struct scx_sched *sch = scx_task_sched(p); 3299 bool bypassing; 3300 3301 /* 3302 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it 3303 * can be a good migration opportunity with low cache and memory 3304 * footprint. Returning a CPU different than @prev_cpu triggers 3305 * immediate rq migration. However, for SCX, as the current rq 3306 * association doesn't dictate where the task is going to run, this 3307 * doesn't fit well. If necessary, we can later add a dedicated method 3308 * which can decide to preempt self to force it through the regular 3309 * scheduling path. 
3310 */ 3311 if (unlikely(wake_flags & WF_EXEC)) 3312 return prev_cpu; 3313 3314 bypassing = scx_bypassing(sch, task_cpu(p)); 3315 if (likely(SCX_HAS_OP(sch, select_cpu)) && !bypassing) { 3316 s32 cpu; 3317 struct task_struct **ddsp_taskp; 3318 3319 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 3320 WARN_ON_ONCE(*ddsp_taskp); 3321 *ddsp_taskp = p; 3322 3323 this_rq()->scx.in_select_cpu = true; 3324 cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, 3325 scx_cpu_arg(prev_cpu), wake_flags); 3326 cpu = scx_cpu_ret(sch, cpu); 3327 this_rq()->scx.in_select_cpu = false; 3328 p->scx.selected_cpu = cpu; 3329 *ddsp_taskp = NULL; 3330 if (scx_cpu_valid(sch, cpu, "from ops.select_cpu()")) 3331 return cpu; 3332 else 3333 return prev_cpu; 3334 } else { 3335 s32 cpu; 3336 3337 cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); 3338 if (cpu >= 0) { 3339 refill_task_slice_dfl(sch, p); 3340 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; 3341 } else { 3342 cpu = prev_cpu; 3343 } 3344 p->scx.selected_cpu = cpu; 3345 3346 if (bypassing) 3347 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 3348 return cpu; 3349 } 3350 } 3351 3352 static void task_woken_scx(struct rq *rq, struct task_struct *p) 3353 { 3354 run_deferred(rq); 3355 } 3356 3357 static void set_cpus_allowed_scx(struct task_struct *p, 3358 struct affinity_context *ac) 3359 { 3360 struct scx_sched *sch = scx_task_sched(p); 3361 3362 set_cpus_allowed_common(p, ac); 3363 3364 if (task_dead_and_done(p)) 3365 return; 3366 3367 /* 3368 * The effective cpumask is stored in @p->cpus_ptr which may temporarily 3369 * differ from the configured one in @p->cpus_mask. Always tell the bpf 3370 * scheduler the effective one. 3371 * 3372 * Fine-grained memory write control is enforced by BPF making the const 3373 * designation pointless. Cast it away when calling the operation. 
3374 */ 3375 if (SCX_HAS_OP(sch, set_cpumask)) 3376 scx_call_op_set_cpumask(sch, task_rq(p), p, (struct cpumask *)p->cpus_ptr); 3377 } 3378 3379 static void handle_hotplug(struct rq *rq, bool online) 3380 { 3381 struct scx_sched *sch = scx_root; 3382 s32 cpu = cpu_of(rq); 3383 3384 atomic_long_inc(&scx_hotplug_seq); 3385 3386 /* 3387 * scx_root updates are protected by cpus_read_lock() and will stay 3388 * stable here. Note that we can't depend on scx_enabled() test as the 3389 * hotplug ops need to be enabled before __scx_enabled is set. 3390 */ 3391 if (unlikely(!sch)) 3392 return; 3393 3394 if (scx_enabled()) 3395 scx_idle_update_selcpu_topology(&sch->ops); 3396 3397 if (online && SCX_HAS_OP(sch, cpu_online)) 3398 SCX_CALL_OP(sch, cpu_online, NULL, scx_cpu_arg(cpu)); 3399 else if (!online && SCX_HAS_OP(sch, cpu_offline)) 3400 SCX_CALL_OP(sch, cpu_offline, NULL, scx_cpu_arg(cpu)); 3401 else 3402 scx_exit(sch, SCX_EXIT_UNREG_KERN, 3403 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 3404 "cpu %d going %s, exiting scheduler", cpu, 3405 online ? 
"online" : "offline"); 3406 } 3407 3408 void scx_rq_activate(struct rq *rq) 3409 { 3410 handle_hotplug(rq, true); 3411 } 3412 3413 void scx_rq_deactivate(struct rq *rq) 3414 { 3415 handle_hotplug(rq, false); 3416 } 3417 3418 static void rq_online_scx(struct rq *rq) 3419 { 3420 rq->scx.flags |= SCX_RQ_ONLINE; 3421 } 3422 3423 static void rq_offline_scx(struct rq *rq) 3424 { 3425 rq->scx.flags &= ~SCX_RQ_ONLINE; 3426 } 3427 3428 static bool check_rq_for_timeouts(struct rq *rq) 3429 { 3430 struct scx_sched *sch; 3431 struct task_struct *p; 3432 struct rq_flags rf; 3433 bool timed_out = false; 3434 3435 rq_lock_irqsave(rq, &rf); 3436 sch = rcu_dereference_bh(scx_root); 3437 if (unlikely(!sch)) 3438 goto out_unlock; 3439 3440 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { 3441 struct scx_sched *sch = scx_task_sched(p); 3442 unsigned long last_runnable = p->scx.runnable_at; 3443 3444 if (unlikely(time_after(jiffies, 3445 last_runnable + READ_ONCE(sch->watchdog_timeout)))) { 3446 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); 3447 3448 __scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, cpu_of(rq), 3449 "%s[%d] failed to run for %u.%03us", 3450 p->comm, p->pid, dur_ms / 1000, 3451 dur_ms % 1000); 3452 timed_out = true; 3453 break; 3454 } 3455 } 3456 out_unlock: 3457 rq_unlock_irqrestore(rq, &rf); 3458 return timed_out; 3459 } 3460 3461 static void scx_watchdog_workfn(struct work_struct *work) 3462 { 3463 unsigned long intv; 3464 int cpu; 3465 3466 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 3467 3468 for_each_online_cpu(cpu) { 3469 if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) 3470 break; 3471 3472 cond_resched(); 3473 } 3474 3475 intv = READ_ONCE(scx_watchdog_interval); 3476 if (intv < ULONG_MAX) 3477 queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv); 3478 } 3479 3480 void scx_tick(struct rq *rq) 3481 { 3482 struct scx_sched *root; 3483 unsigned long last_check; 3484 3485 if (!scx_enabled()) 3486 return; 3487 3488 root = 
rcu_dereference_bh(scx_root); 3489 if (unlikely(!root)) 3490 return; 3491 3492 last_check = READ_ONCE(scx_watchdog_timestamp); 3493 if (unlikely(time_after(jiffies, 3494 last_check + READ_ONCE(root->watchdog_timeout)))) { 3495 u32 dur_ms = jiffies_to_msecs(jiffies - last_check); 3496 3497 scx_exit(root, SCX_EXIT_ERROR_STALL, 0, 3498 "watchdog failed to check in for %u.%03us", 3499 dur_ms / 1000, dur_ms % 1000); 3500 } 3501 3502 update_other_load_avgs(rq); 3503 } 3504 3505 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) 3506 { 3507 struct scx_sched *sch = scx_task_sched(curr); 3508 3509 update_curr_scx(rq); 3510 3511 /* 3512 * While disabling, always resched and refresh core-sched timestamp as 3513 * we can't trust the slice management or ops.core_sched_before(). 3514 */ 3515 if (scx_bypassing(sch, cpu_of(rq))) { 3516 curr->scx.slice = 0; 3517 touch_core_sched(rq, curr); 3518 } else if (SCX_HAS_OP(sch, tick)) { 3519 SCX_CALL_OP_TASK(sch, tick, rq, curr); 3520 } 3521 3522 if (!curr->scx.slice) 3523 resched_curr(rq); 3524 } 3525 3526 #ifdef CONFIG_EXT_GROUP_SCHED 3527 static struct cgroup *tg_cgrp(struct task_group *tg) 3528 { 3529 /* 3530 * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, 3531 * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the 3532 * root cgroup. 
3533 */ 3534 if (tg && tg->css.cgroup) 3535 return tg->css.cgroup; 3536 else 3537 return &cgrp_dfl_root.cgrp; 3538 } 3539 3540 #define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), 3541 3542 #else /* CONFIG_EXT_GROUP_SCHED */ 3543 3544 #define SCX_INIT_TASK_ARGS_CGROUP(tg) 3545 3546 #endif /* CONFIG_EXT_GROUP_SCHED */ 3547 3548 static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) 3549 { 3550 int ret; 3551 3552 p->scx.disallow = false; 3553 3554 if (SCX_HAS_OP(sch, init_task)) { 3555 struct scx_init_task_args args = { 3556 SCX_INIT_TASK_ARGS_CGROUP(task_group(p)) 3557 .fork = fork, 3558 }; 3559 3560 ret = SCX_CALL_OP_RET(sch, init_task, NULL, p, &args); 3561 if (unlikely(ret)) { 3562 ret = ops_sanitize_err(sch, "init_task", ret); 3563 return ret; 3564 } 3565 } 3566 3567 if (p->scx.disallow) { 3568 if (unlikely(scx_parent(sch))) { 3569 scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]", 3570 p->comm, p->pid); 3571 } else if (unlikely(fork)) { 3572 scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", 3573 p->comm, p->pid); 3574 } else { 3575 struct rq *rq; 3576 struct rq_flags rf; 3577 3578 rq = task_rq_lock(p, &rf); 3579 3580 /* 3581 * We're in the load path and @p->policy will be applied 3582 * right after. Reverting @p->policy here and rejecting 3583 * %SCHED_EXT transitions from scx_check_setscheduler() 3584 * guarantees that if ops.init_task() sets @p->disallow, 3585 * @p can never be in SCX. 3586 */ 3587 if (p->policy == SCHED_EXT) { 3588 p->policy = SCHED_NORMAL; 3589 atomic_long_inc(&scx_nr_rejected); 3590 } 3591 3592 task_rq_unlock(rq, p, &rf); 3593 } 3594 } 3595 3596 return 0; 3597 } 3598 3599 static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p) 3600 { 3601 struct rq *rq = task_rq(p); 3602 u32 weight; 3603 3604 lockdep_assert_rq_held(rq); 3605 3606 /* 3607 * Verify the task is not in BPF scheduler's custody. 
If flag 3608 * transitions are consistent, the flag should always be clear 3609 * here. 3610 */ 3611 WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); 3612 3613 /* 3614 * Set the weight before calling ops.enable() so that the scheduler 3615 * doesn't see a stale value if they inspect the task struct. 3616 */ 3617 if (task_has_idle_policy(p)) 3618 weight = WEIGHT_IDLEPRIO; 3619 else 3620 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; 3621 3622 p->scx.weight = sched_weight_to_cgroup(weight); 3623 3624 if (SCX_HAS_OP(sch, enable)) 3625 SCX_CALL_OP_TASK(sch, enable, rq, p); 3626 3627 if (SCX_HAS_OP(sch, set_weight)) 3628 SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); 3629 } 3630 3631 static void scx_enable_task(struct scx_sched *sch, struct task_struct *p) 3632 { 3633 __scx_enable_task(sch, p); 3634 scx_set_task_state(p, SCX_TASK_ENABLED); 3635 } 3636 3637 static void scx_disable_task(struct scx_sched *sch, struct task_struct *p) 3638 { 3639 struct rq *rq = task_rq(p); 3640 3641 lockdep_assert_rq_held(rq); 3642 WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); 3643 3644 clear_direct_dispatch(p); 3645 3646 if (SCX_HAS_OP(sch, disable)) 3647 SCX_CALL_OP_TASK(sch, disable, rq, p); 3648 scx_set_task_state(p, SCX_TASK_READY); 3649 3650 /* 3651 * Verify the task is not in BPF scheduler's custody. If flag 3652 * transitions are consistent, the flag should always be clear 3653 * here. 
3654 */ 3655 WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); 3656 } 3657 3658 static void __scx_disable_and_exit_task(struct scx_sched *sch, 3659 struct task_struct *p) 3660 { 3661 struct scx_exit_task_args args = { 3662 .cancelled = false, 3663 }; 3664 3665 lockdep_assert_held(&p->pi_lock); 3666 lockdep_assert_rq_held(task_rq(p)); 3667 3668 switch (scx_get_task_state(p)) { 3669 case SCX_TASK_NONE: 3670 return; 3671 case SCX_TASK_INIT: 3672 args.cancelled = true; 3673 break; 3674 case SCX_TASK_READY: 3675 break; 3676 case SCX_TASK_ENABLED: 3677 scx_disable_task(sch, p); 3678 break; 3679 default: 3680 WARN_ON_ONCE(true); 3681 return; 3682 } 3683 3684 if (SCX_HAS_OP(sch, exit_task)) 3685 SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); 3686 } 3687 3688 /* 3689 * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never 3690 * ran. The task state has not been transitioned, so this mirrors the 3691 * SCX_TASK_INIT branch in __scx_disable_and_exit_task(). 3692 */ 3693 static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p) 3694 { 3695 struct scx_exit_task_args args = { .cancelled = true }; 3696 3697 lockdep_assert_held(&p->pi_lock); 3698 lockdep_assert_rq_held(task_rq(p)); 3699 3700 if (SCX_HAS_OP(sch, exit_task)) 3701 SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); 3702 } 3703 3704 static void scx_disable_and_exit_task(struct scx_sched *sch, 3705 struct task_struct *p) 3706 { 3707 __scx_disable_and_exit_task(sch, p); 3708 3709 /* 3710 * If set, @p exited between __scx_init_task() and scx_enable_task() in 3711 * scx_sub_enable() and is initialized for both the associated sched and 3712 * its parent. Exit for the child too - scx_enable_task() never ran for 3713 * it, so undo only init_task. The flag is only set on the sub-enable 3714 * path, so it's always clear when @p arrives here in %SCX_TASK_NONE. 
3715 */ 3716 if (p->scx.flags & SCX_TASK_SUB_INIT) { 3717 if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) 3718 scx_sub_init_cancel_task(scx_enabling_sub_sched, p); 3719 p->scx.flags &= ~SCX_TASK_SUB_INIT; 3720 } 3721 3722 scx_set_task_sched(p, NULL); 3723 scx_set_task_state(p, SCX_TASK_NONE); 3724 } 3725 3726 void init_scx_entity(struct sched_ext_entity *scx) 3727 { 3728 memset(scx, 0, sizeof(*scx)); 3729 INIT_LIST_HEAD(&scx->dsq_list.node); 3730 RB_CLEAR_NODE(&scx->dsq_priq); 3731 scx->sticky_cpu = -1; 3732 scx->holding_cpu = -1; 3733 INIT_LIST_HEAD(&scx->runnable_node); 3734 scx->runnable_at = jiffies; 3735 scx->ddsp_dsq_id = SCX_DSQ_INVALID; 3736 scx->slice = SCX_SLICE_DFL; 3737 } 3738 3739 /* See scx_tid_alloc / scx_tid_cursor. */ 3740 static u64 scx_alloc_tid(void) 3741 { 3742 struct scx_tid_alloc *ta; 3743 3744 guard(preempt)(); 3745 ta = this_cpu_ptr(&scx_tid_alloc); 3746 3747 if (unlikely(ta->next >= ta->end)) { 3748 ta->next = atomic64_fetch_add(SCX_TID_CHUNK, &scx_tid_cursor); 3749 ta->end = ta->next + SCX_TID_CHUNK; 3750 } 3751 return ta->next++; 3752 } 3753 3754 static void scx_tid_hash_insert(struct task_struct *p) 3755 { 3756 int ret; 3757 3758 lockdep_assert_held(&scx_tasks_lock); 3759 3760 ret = rhashtable_lookup_insert_fast(&scx_tid_hash, 3761 &p->scx.tid_hash_node, 3762 scx_tid_hash_params); 3763 WARN_ON_ONCE(ret); 3764 } 3765 3766 void scx_pre_fork(struct task_struct *p) 3767 { 3768 /* 3769 * BPF scheduler enable/disable paths want to be able to iterate and 3770 * update all tasks which can become complex when racing forks. As 3771 * enable/disable are very cold paths, let's use a percpu_rwsem to 3772 * exclude forks. 
3773 */ 3774 percpu_down_read(&scx_fork_rwsem); 3775 } 3776 3777 int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) 3778 { 3779 s32 ret; 3780 3781 percpu_rwsem_assert_held(&scx_fork_rwsem); 3782 3783 p->scx.tid = scx_alloc_tid(); 3784 3785 if (scx_init_task_enabled) { 3786 #ifdef CONFIG_EXT_SUB_SCHED 3787 struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched; 3788 #else 3789 struct scx_sched *sch = scx_root; 3790 #endif 3791 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 3792 ret = __scx_init_task(sch, p, true); 3793 if (unlikely(ret)) { 3794 scx_set_task_state(p, SCX_TASK_NONE); 3795 return ret; 3796 } 3797 scx_set_task_state(p, SCX_TASK_INIT); 3798 scx_set_task_sched(p, sch); 3799 } 3800 3801 return 0; 3802 } 3803 3804 void scx_post_fork(struct task_struct *p) 3805 { 3806 if (scx_init_task_enabled) { 3807 scx_set_task_state(p, SCX_TASK_READY); 3808 3809 /* 3810 * Enable the task immediately if it's running on sched_ext. 3811 * Otherwise, it'll be enabled in switching_to_scx() if and 3812 * when it's ever configured to run with a SCHED_EXT policy. 3813 */ 3814 if (p->sched_class == &ext_sched_class) { 3815 struct rq_flags rf; 3816 struct rq *rq; 3817 3818 rq = task_rq_lock(p, &rf); 3819 scx_enable_task(scx_task_sched(p), p); 3820 task_rq_unlock(rq, p, &rf); 3821 } 3822 } 3823 3824 scoped_guard(raw_spinlock_irq, &scx_tasks_lock) { 3825 list_add_tail(&p->scx.tasks_node, &scx_tasks); 3826 if (scx_tid_to_task_enabled()) 3827 scx_tid_hash_insert(p); 3828 } 3829 3830 percpu_up_read(&scx_fork_rwsem); 3831 } 3832 3833 void scx_cancel_fork(struct task_struct *p) 3834 { 3835 if (scx_enabled()) { 3836 struct rq *rq; 3837 struct rq_flags rf; 3838 3839 rq = task_rq_lock(p, &rf); 3840 WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); 3841 scx_disable_and_exit_task(scx_task_sched(p), p); 3842 task_rq_unlock(rq, p, &rf); 3843 } 3844 3845 percpu_up_read(&scx_fork_rwsem); 3846 } 3847 3848 /** 3849 * task_dead_and_done - Is a task dead and done running? 
3850 * @p: target task 3851 * 3852 * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the 3853 * task no longer exists from SCX's POV. However, certain sched_class ops may be 3854 * invoked on these dead tasks leading to failures - e.g. sched_setscheduler() 3855 * may try to switch a task which finished sched_ext_dead() back into SCX 3856 * triggering invalid SCX task state transitions and worse. 3857 * 3858 * Once a task has finished the final switch, sched_ext_dead() is the only thing 3859 * that needs to happen on the task. Use this test to short-circuit sched_class 3860 * operations which may be called on dead tasks. 3861 */ 3862 static bool task_dead_and_done(struct task_struct *p) 3863 { 3864 struct rq *rq = task_rq(p); 3865 3866 lockdep_assert_rq_held(rq); 3867 3868 /* 3869 * In do_task_dead(), a dying task sets %TASK_DEAD with preemption 3870 * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p 3871 * won't ever run again. 3872 */ 3873 return unlikely(READ_ONCE(p->__state) == TASK_DEAD) && 3874 !task_on_cpu(rq, p); 3875 } 3876 3877 void sched_ext_dead(struct task_struct *p) 3878 { 3879 /* 3880 * By the time control reaches here, @p has %TASK_DEAD set, switched out 3881 * for the last time and then dropped the rq lock - task_dead_and_done() 3882 * should be returning %true nullifying the straggling sched_class ops. 3883 * Remove from scx_tasks and exit @p. 3884 */ 3885 scoped_guard(raw_spinlock_irqsave, &scx_tasks_lock) { 3886 list_del_init(&p->scx.tasks_node); 3887 if (scx_tid_to_task_enabled()) 3888 rhashtable_remove_fast(&scx_tid_hash, 3889 &p->scx.tid_hash_node, 3890 scx_tid_hash_params); 3891 } 3892 3893 /* 3894 * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> 3895 * ENABLED transitions can't race us. Disable ops for @p. 3896 * 3897 * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see 3898 * scx_task_iter_next_locked(). 
NONE tasks need no marking: cgroup 3899 * iteration is only used from sub-sched paths, which require root 3900 * enabled. Root enable transitions every live task to at least READY. 3901 * 3902 * %INIT_BEGIN means ops.init_task() is running for @p. Don't call 3903 * into ops; transition to %DEAD so the post-init recheck unwinds 3904 * via scx_sub_init_cancel_task(). 3905 */ 3906 if (scx_get_task_state(p) != SCX_TASK_NONE) { 3907 struct rq_flags rf; 3908 struct rq *rq; 3909 3910 rq = task_rq_lock(p, &rf); 3911 if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN) 3912 scx_disable_and_exit_task(scx_task_sched(p), p); 3913 scx_set_task_state(p, SCX_TASK_DEAD); 3914 task_rq_unlock(rq, p, &rf); 3915 } 3916 } 3917 3918 static void reweight_task_scx(struct rq *rq, struct task_struct *p, 3919 const struct load_weight *lw) 3920 { 3921 struct scx_sched *sch = scx_task_sched(p); 3922 3923 lockdep_assert_rq_held(task_rq(p)); 3924 3925 if (task_dead_and_done(p)) 3926 return; 3927 3928 p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); 3929 if (SCX_HAS_OP(sch, set_weight)) 3930 SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); 3931 } 3932 3933 static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) 3934 { 3935 } 3936 3937 static void switching_to_scx(struct rq *rq, struct task_struct *p) 3938 { 3939 struct scx_sched *sch = scx_task_sched(p); 3940 3941 if (task_dead_and_done(p)) 3942 return; 3943 3944 scx_enable_task(sch, p); 3945 3946 /* 3947 * set_cpus_allowed_scx() is not called while @p is associated with a 3948 * different scheduler class. Keep the BPF scheduler up-to-date. 3949 */ 3950 if (SCX_HAS_OP(sch, set_cpumask)) 3951 scx_call_op_set_cpumask(sch, rq, p, (struct cpumask *)p->cpus_ptr); 3952 } 3953 3954 static void switched_from_scx(struct rq *rq, struct task_struct *p) 3955 { 3956 if (task_dead_and_done(p)) 3957 return; 3958 3959 /* 3960 * %NONE means SCX is no longer tracking @p at the task level (e.g. 
3961 * scx_fail_parent() handed @p back to the parent at NONE pending the 3962 * parent's own teardown). There is nothing to disable; calling 3963 * scx_disable_task() would WARN on the non-%ENABLED state and trigger a 3964 * NONE -> READY validation failure. 3965 */ 3966 if (scx_get_task_state(p) == SCX_TASK_NONE) 3967 return; 3968 3969 scx_disable_task(scx_task_sched(p), p); 3970 } 3971 3972 static void switched_to_scx(struct rq *rq, struct task_struct *p) {} 3973 3974 int scx_check_setscheduler(struct task_struct *p, int policy) 3975 { 3976 lockdep_assert_rq_held(task_rq(p)); 3977 3978 /* if disallow, reject transitioning into SCX */ 3979 if (scx_enabled() && READ_ONCE(p->scx.disallow) && 3980 p->policy != policy && policy == SCHED_EXT) 3981 return -EACCES; 3982 3983 return 0; 3984 } 3985 3986 static void process_ddsp_deferred_locals(struct rq *rq) 3987 { 3988 struct task_struct *p; 3989 3990 lockdep_assert_rq_held(rq); 3991 3992 /* 3993 * Now that @rq can be unlocked, execute the deferred enqueueing of 3994 * tasks directly dispatched to the local DSQs of other CPUs. See 3995 * direct_dispatch(). Keep popping from the head instead of using 3996 * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq 3997 * temporarily. 3998 */ 3999 while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, 4000 struct task_struct, scx.dsq_list.node))) { 4001 struct scx_sched *sch = scx_task_sched(p); 4002 struct scx_dispatch_q *dsq; 4003 u64 dsq_id = p->scx.ddsp_dsq_id; 4004 u64 enq_flags = p->scx.ddsp_enq_flags; 4005 4006 list_del_init(&p->scx.dsq_list.node); 4007 clear_direct_dispatch(p); 4008 4009 dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p)); 4010 if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) 4011 dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); 4012 } 4013 } 4014 4015 /* 4016 * Determine whether @p should be reenqueued from a local DSQ. 
4017 * 4018 * @reenq_flags is mutable and accumulates state across the DSQ walk: 4019 * 4020 * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First" 4021 * tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at 4022 * the head consumes the first slot. 4023 * 4024 * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if 4025 * rq_is_open() is true. 4026 * 4027 * An IMMED task is kept (returns %false) only if it's the first task in the DSQ 4028 * AND the current task is done — i.e. it will execute immediately. All other 4029 * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head, 4030 * every IMMED task behind it gets reenqueued. 4031 * 4032 * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ | 4033 * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local 4034 * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers 4035 * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT 4036 * in process_deferred_reenq_locals(). 
4037 */ 4038 static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason) 4039 { 4040 bool first; 4041 4042 first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST); 4043 *reenq_flags |= SCX_REENQ_TSR_NOT_FIRST; 4044 4045 *reason = SCX_TASK_REENQ_KFUNC; 4046 4047 if ((p->scx.flags & SCX_TASK_IMMED) && 4048 (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) { 4049 __scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1); 4050 *reason = SCX_TASK_REENQ_IMMED; 4051 return true; 4052 } 4053 4054 return *reenq_flags & SCX_REENQ_ANY; 4055 } 4056 4057 static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) 4058 { 4059 LIST_HEAD(tasks); 4060 u32 nr_enqueued = 0; 4061 struct task_struct *p, *n; 4062 4063 lockdep_assert_rq_held(rq); 4064 4065 if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK)) 4066 reenq_flags &= ~__SCX_REENQ_TSR_MASK; 4067 if (rq_is_open(rq, 0)) 4068 reenq_flags |= SCX_REENQ_TSR_RQ_OPEN; 4069 4070 /* 4071 * The BPF scheduler may choose to dispatch tasks back to 4072 * @rq->scx.local_dsq. Move all candidate tasks off to a private list 4073 * first to avoid processing the same tasks repeatedly. 4074 */ 4075 list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, 4076 scx.dsq_list.node) { 4077 struct scx_sched *task_sch = scx_task_sched(p); 4078 u32 reason; 4079 4080 /* 4081 * If @p is being migrated, @p's current CPU may not agree with 4082 * its allowed CPUs and the migration_cpu_stop is about to 4083 * deactivate and re-activate @p anyway. Skip re-enqueueing. 4084 * 4085 * While racing sched property changes may also dequeue and 4086 * re-enqueue a migrating task while its current CPU and allowed 4087 * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to 4088 * the current local DSQ for running tasks and thus are not 4089 * visible to the BPF scheduler. 
4090 */ 4091 if (p->migration_pending) 4092 continue; 4093 4094 if (!scx_is_descendant(task_sch, sch)) 4095 continue; 4096 4097 if (!local_task_should_reenq(p, &reenq_flags, &reason)) 4098 continue; 4099 4100 dispatch_dequeue(rq, p); 4101 4102 if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) 4103 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4104 p->scx.flags |= reason; 4105 4106 list_add_tail(&p->scx.dsq_list.node, &tasks); 4107 } 4108 4109 list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { 4110 list_del_init(&p->scx.dsq_list.node); 4111 4112 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); 4113 4114 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4115 nr_enqueued++; 4116 } 4117 4118 return nr_enqueued; 4119 } 4120 4121 static void process_deferred_reenq_locals(struct rq *rq) 4122 { 4123 u64 seq = ++rq->scx.deferred_reenq_locals_seq; 4124 4125 lockdep_assert_rq_held(rq); 4126 4127 while (true) { 4128 struct scx_sched *sch; 4129 u64 reenq_flags; 4130 bool skip = false; 4131 4132 scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { 4133 struct scx_deferred_reenq_local *drl = 4134 list_first_entry_or_null(&rq->scx.deferred_reenq_locals, 4135 struct scx_deferred_reenq_local, 4136 node); 4137 struct scx_sched_pcpu *sch_pcpu; 4138 4139 if (!drl) 4140 return; 4141 4142 sch_pcpu = container_of(drl, struct scx_sched_pcpu, 4143 deferred_reenq_local); 4144 sch = sch_pcpu->sch; 4145 4146 reenq_flags = drl->flags; 4147 WRITE_ONCE(drl->flags, 0); 4148 list_del_init(&drl->node); 4149 4150 if (likely(drl->seq != seq)) { 4151 drl->seq = seq; 4152 drl->cnt = 0; 4153 } else { 4154 if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) { 4155 scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times", 4156 drl->cnt); 4157 skip = true; 4158 } 4159 4160 __scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1); 4161 } 4162 } 4163 4164 if (!skip) { 4165 /* see schedule_dsq_reenq() */ 4166 smp_mb(); 4167 4168 reenq_local(sch, rq, reenq_flags); 4169 } 4170 } 4171 } 4172 
4173 static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason) 4174 { 4175 *reason = SCX_TASK_REENQ_KFUNC; 4176 return reenq_flags & SCX_REENQ_ANY; 4177 } 4178 4179 static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags) 4180 { 4181 struct rq *locked_rq = rq; 4182 struct scx_sched *sch = dsq->sched; 4183 struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0); 4184 struct task_struct *p; 4185 s32 nr_enqueued = 0; 4186 4187 lockdep_assert_rq_held(rq); 4188 4189 raw_spin_lock(&dsq->lock); 4190 4191 while (likely(!READ_ONCE(sch->bypass_depth))) { 4192 struct rq *task_rq; 4193 u32 reason; 4194 4195 p = nldsq_cursor_next_task(&cursor, dsq); 4196 if (!p) 4197 break; 4198 4199 if (!user_task_should_reenq(p, reenq_flags, &reason)) 4200 continue; 4201 4202 task_rq = task_rq(p); 4203 4204 if (locked_rq != task_rq) { 4205 if (locked_rq) 4206 raw_spin_rq_unlock(locked_rq); 4207 if (unlikely(!raw_spin_rq_trylock(task_rq))) { 4208 raw_spin_unlock(&dsq->lock); 4209 raw_spin_rq_lock(task_rq); 4210 raw_spin_lock(&dsq->lock); 4211 } 4212 locked_rq = task_rq; 4213 4214 /* did we lose @p while switching locks? 
*/ 4215 if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p)) 4216 continue; 4217 } 4218 4219 /* @p is on @dsq, its rq and @dsq are locked */ 4220 dispatch_dequeue_locked(p, dsq); 4221 raw_spin_unlock(&dsq->lock); 4222 4223 if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) 4224 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4225 p->scx.flags |= reason; 4226 4227 do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1); 4228 4229 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4230 4231 if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) { 4232 raw_spin_rq_unlock(locked_rq); 4233 locked_rq = NULL; 4234 cpu_relax(); 4235 } 4236 4237 raw_spin_lock(&dsq->lock); 4238 } 4239 4240 list_del_init(&cursor.node); 4241 raw_spin_unlock(&dsq->lock); 4242 4243 if (locked_rq != rq) { 4244 if (locked_rq) 4245 raw_spin_rq_unlock(locked_rq); 4246 raw_spin_rq_lock(rq); 4247 } 4248 } 4249 4250 static void process_deferred_reenq_users(struct rq *rq) 4251 { 4252 lockdep_assert_rq_held(rq); 4253 4254 while (true) { 4255 struct scx_dispatch_q *dsq; 4256 u64 reenq_flags; 4257 4258 scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { 4259 struct scx_deferred_reenq_user *dru = 4260 list_first_entry_or_null(&rq->scx.deferred_reenq_users, 4261 struct scx_deferred_reenq_user, 4262 node); 4263 struct scx_dsq_pcpu *dsq_pcpu; 4264 4265 if (!dru) 4266 return; 4267 4268 dsq_pcpu = container_of(dru, struct scx_dsq_pcpu, 4269 deferred_reenq_user); 4270 dsq = dsq_pcpu->dsq; 4271 reenq_flags = dru->flags; 4272 WRITE_ONCE(dru->flags, 0); 4273 list_del_init(&dru->node); 4274 } 4275 4276 /* see schedule_dsq_reenq() */ 4277 smp_mb(); 4278 4279 BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN); 4280 reenq_user(rq, dsq, reenq_flags); 4281 } 4282 } 4283 4284 static void run_deferred(struct rq *rq) 4285 { 4286 process_ddsp_deferred_locals(rq); 4287 4288 if (!list_empty(&rq->scx.deferred_reenq_locals)) 4289 process_deferred_reenq_locals(rq); 4290 4291 if (!list_empty(&rq->scx.deferred_reenq_users)) 4292 
process_deferred_reenq_users(rq); 4293 } 4294 4295 #ifdef CONFIG_NO_HZ_FULL 4296 bool scx_can_stop_tick(struct rq *rq) 4297 { 4298 struct task_struct *p = rq->curr; 4299 struct scx_sched *sch = scx_task_sched(p); 4300 4301 if (p->sched_class != &ext_sched_class) 4302 return true; 4303 4304 if (scx_bypassing(sch, cpu_of(rq))) 4305 return false; 4306 4307 /* 4308 * @rq can dispatch from different DSQs, so we can't tell whether it 4309 * needs the tick or not by looking at nr_running. Allow stopping ticks 4310 * iff the BPF scheduler indicated so. See set_next_task_scx(). 4311 */ 4312 return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; 4313 } 4314 #endif 4315 4316 #ifdef CONFIG_EXT_GROUP_SCHED 4317 4318 DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem); 4319 static bool scx_cgroup_enabled; 4320 4321 void scx_tg_init(struct task_group *tg) 4322 { 4323 tg->scx.weight = CGROUP_WEIGHT_DFL; 4324 tg->scx.bw_period_us = default_bw_period_us(); 4325 tg->scx.bw_quota_us = RUNTIME_INF; 4326 tg->scx.idle = false; 4327 } 4328 4329 int scx_tg_online(struct task_group *tg) 4330 { 4331 struct scx_sched *sch = scx_root; 4332 int ret = 0; 4333 4334 WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); 4335 4336 if (scx_cgroup_enabled) { 4337 if (SCX_HAS_OP(sch, cgroup_init)) { 4338 struct scx_cgroup_init_args args = 4339 { .weight = tg->scx.weight, 4340 .bw_period_us = tg->scx.bw_period_us, 4341 .bw_quota_us = tg->scx.bw_quota_us, 4342 .bw_burst_us = tg->scx.bw_burst_us }; 4343 4344 ret = SCX_CALL_OP_RET(sch, cgroup_init, 4345 NULL, tg->css.cgroup, &args); 4346 if (ret) 4347 ret = ops_sanitize_err(sch, "cgroup_init", ret); 4348 } 4349 if (ret == 0) 4350 tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED; 4351 } else { 4352 tg->scx.flags |= SCX_TG_ONLINE; 4353 } 4354 4355 return ret; 4356 } 4357 4358 void scx_tg_offline(struct task_group *tg) 4359 { 4360 struct scx_sched *sch = scx_root; 4361 4362 WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); 4363 4364 if (scx_cgroup_enabled && 
SCX_HAS_OP(sch, cgroup_exit) && 4365 (tg->scx.flags & SCX_TG_INITED)) 4366 SCX_CALL_OP(sch, cgroup_exit, NULL, tg->css.cgroup); 4367 tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); 4368 } 4369 4370 int scx_cgroup_can_attach(struct cgroup_taskset *tset) 4371 { 4372 struct scx_sched *sch = scx_root; 4373 struct cgroup_subsys_state *css; 4374 struct task_struct *p; 4375 int ret; 4376 4377 if (!scx_cgroup_enabled) 4378 return 0; 4379 4380 cgroup_taskset_for_each(p, css, tset) { 4381 struct cgroup *from = tg_cgrp(task_group(p)); 4382 struct cgroup *to = tg_cgrp(css_tg(css)); 4383 4384 WARN_ON_ONCE(p->scx.cgrp_moving_from); 4385 4386 /* 4387 * sched_move_task() omits identity migrations. Let's match the 4388 * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() 4389 * always match one-to-one. 4390 */ 4391 if (from == to) 4392 continue; 4393 4394 if (SCX_HAS_OP(sch, cgroup_prep_move)) { 4395 ret = SCX_CALL_OP_RET(sch, cgroup_prep_move, NULL, 4396 p, from, css->cgroup); 4397 if (ret) 4398 goto err; 4399 } 4400 4401 p->scx.cgrp_moving_from = from; 4402 } 4403 4404 return 0; 4405 4406 err: 4407 cgroup_taskset_for_each(p, css, tset) { 4408 if (SCX_HAS_OP(sch, cgroup_cancel_move) && 4409 p->scx.cgrp_moving_from) 4410 SCX_CALL_OP(sch, cgroup_cancel_move, NULL, 4411 p, p->scx.cgrp_moving_from, css->cgroup); 4412 p->scx.cgrp_moving_from = NULL; 4413 } 4414 4415 return ops_sanitize_err(sch, "cgroup_prep_move", ret); 4416 } 4417 4418 void scx_cgroup_move_task(struct task_struct *p) 4419 { 4420 struct scx_sched *sch = scx_root; 4421 4422 if (!scx_cgroup_enabled) 4423 return; 4424 4425 /* 4426 * scx_cgroup_can_attach() sets cgrp_moving_from only when the task's 4427 * cgroup changes. Migration keys off css rather than cgroup identity, 4428 * so it can hand an unchanged-cgroup task here with cgrp_moving_from 4429 * NULL. Nothing to report to the BPF scheduler then, so skip it and 4430 * keep prep_move and move paired. 
4431 */ 4432 if (SCX_HAS_OP(sch, cgroup_move) && p->scx.cgrp_moving_from) 4433 SCX_CALL_OP_TASK(sch, cgroup_move, task_rq(p), 4434 p, p->scx.cgrp_moving_from, 4435 tg_cgrp(task_group(p))); 4436 p->scx.cgrp_moving_from = NULL; 4437 } 4438 4439 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) 4440 { 4441 struct scx_sched *sch = scx_root; 4442 struct cgroup_subsys_state *css; 4443 struct task_struct *p; 4444 4445 if (!scx_cgroup_enabled) 4446 return; 4447 4448 cgroup_taskset_for_each(p, css, tset) { 4449 if (SCX_HAS_OP(sch, cgroup_cancel_move) && 4450 p->scx.cgrp_moving_from) 4451 SCX_CALL_OP(sch, cgroup_cancel_move, NULL, 4452 p, p->scx.cgrp_moving_from, css->cgroup); 4453 p->scx.cgrp_moving_from = NULL; 4454 } 4455 } 4456 4457 void scx_group_set_weight(struct task_group *tg, unsigned long weight) 4458 { 4459 struct scx_sched *sch; 4460 4461 percpu_down_read(&scx_cgroup_ops_rwsem); 4462 sch = scx_root; 4463 4464 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && 4465 tg->scx.weight != weight) 4466 SCX_CALL_OP(sch, cgroup_set_weight, NULL, tg_cgrp(tg), weight); 4467 4468 tg->scx.weight = weight; 4469 4470 percpu_up_read(&scx_cgroup_ops_rwsem); 4471 } 4472 4473 void scx_group_set_idle(struct task_group *tg, bool idle) 4474 { 4475 struct scx_sched *sch; 4476 4477 percpu_down_read(&scx_cgroup_ops_rwsem); 4478 sch = scx_root; 4479 4480 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) 4481 SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle); 4482 4483 /* Update the task group's idle state */ 4484 tg->scx.idle = idle; 4485 4486 percpu_up_read(&scx_cgroup_ops_rwsem); 4487 } 4488 4489 void scx_group_set_bandwidth(struct task_group *tg, 4490 u64 period_us, u64 quota_us, u64 burst_us) 4491 { 4492 struct scx_sched *sch; 4493 4494 percpu_down_read(&scx_cgroup_ops_rwsem); 4495 sch = scx_root; 4496 4497 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && 4498 (tg->scx.bw_period_us != period_us || 4499 tg->scx.bw_quota_us != 
quota_us || 4500 tg->scx.bw_burst_us != burst_us)) 4501 SCX_CALL_OP(sch, cgroup_set_bandwidth, NULL, 4502 tg_cgrp(tg), period_us, quota_us, burst_us); 4503 4504 tg->scx.bw_period_us = period_us; 4505 tg->scx.bw_quota_us = quota_us; 4506 tg->scx.bw_burst_us = burst_us; 4507 4508 percpu_up_read(&scx_cgroup_ops_rwsem); 4509 } 4510 #endif /* CONFIG_EXT_GROUP_SCHED */ 4511 4512 #if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) 4513 static struct cgroup *root_cgroup(void) 4514 { 4515 return &cgrp_dfl_root.cgrp; 4516 } 4517 4518 static void scx_cgroup_lock(void) 4519 { 4520 #ifdef CONFIG_EXT_GROUP_SCHED 4521 percpu_down_write(&scx_cgroup_ops_rwsem); 4522 #endif 4523 cgroup_lock(); 4524 } 4525 4526 static void scx_cgroup_unlock(void) 4527 { 4528 cgroup_unlock(); 4529 #ifdef CONFIG_EXT_GROUP_SCHED 4530 percpu_up_write(&scx_cgroup_ops_rwsem); 4531 #endif 4532 } 4533 #else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ 4534 static inline struct cgroup *root_cgroup(void) { return NULL; } 4535 static inline void scx_cgroup_lock(void) {} 4536 static inline void scx_cgroup_unlock(void) {} 4537 #endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ 4538 4539 #ifdef CONFIG_EXT_SUB_SCHED 4540 static struct cgroup *sch_cgroup(struct scx_sched *sch) 4541 { 4542 return sch->cgrp; 4543 } 4544 4545 /* for each descendant of @cgrp including self, set ->scx_sched to @sch */ 4546 static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) 4547 { 4548 struct cgroup *pos; 4549 struct cgroup_subsys_state *css; 4550 4551 cgroup_for_each_live_descendant_pre(pos, css, cgrp) 4552 rcu_assign_pointer(pos->scx_sched, sch); 4553 } 4554 #else /* CONFIG_EXT_SUB_SCHED */ 4555 static inline struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } 4556 static inline void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} 4557 #endif /* CONFIG_EXT_SUB_SCHED */ 4558 4559 /* 4560 * Omitted operations: 4561 * 4562 * - migrate_task_rq: Unnecessary 
as task to cpu mapping is transient. 4563 * 4564 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of 4565 * their current sched_class. Call them directly from sched core instead. 4566 */ 4567 DEFINE_SCHED_CLASS(ext) = { 4568 .enqueue_task = enqueue_task_scx, 4569 .dequeue_task = dequeue_task_scx, 4570 .yield_task = yield_task_scx, 4571 .yield_to_task = yield_to_task_scx, 4572 4573 .wakeup_preempt = wakeup_preempt_scx, 4574 4575 .pick_task = pick_task_scx, 4576 4577 .put_prev_task = put_prev_task_scx, 4578 .set_next_task = set_next_task_scx, 4579 4580 .select_task_rq = select_task_rq_scx, 4581 .task_woken = task_woken_scx, 4582 .set_cpus_allowed = set_cpus_allowed_scx, 4583 4584 .rq_online = rq_online_scx, 4585 .rq_offline = rq_offline_scx, 4586 4587 .task_tick = task_tick_scx, 4588 4589 .switching_to = switching_to_scx, 4590 .switched_from = switched_from_scx, 4591 .switched_to = switched_to_scx, 4592 .reweight_task = reweight_task_scx, 4593 .prio_changed = prio_changed_scx, 4594 4595 .update_curr = update_curr_scx, 4596 4597 #ifdef CONFIG_UCLAMP_TASK 4598 .uclamp_enabled = 1, 4599 #endif 4600 }; 4601 4602 static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, 4603 struct scx_sched *sch) 4604 { 4605 s32 cpu; 4606 4607 memset(dsq, 0, sizeof(*dsq)); 4608 4609 raw_spin_lock_init(&dsq->lock); 4610 INIT_LIST_HEAD(&dsq->list); 4611 dsq->id = dsq_id; 4612 dsq->sched = sch; 4613 4614 dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu); 4615 if (!dsq->pcpu) 4616 return -ENOMEM; 4617 4618 for_each_possible_cpu(cpu) { 4619 struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); 4620 4621 pcpu->dsq = dsq; 4622 INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node); 4623 } 4624 4625 return 0; 4626 } 4627 4628 static void exit_dsq(struct scx_dispatch_q *dsq) 4629 { 4630 s32 cpu; 4631 4632 for_each_possible_cpu(cpu) { 4633 struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); 4634 struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user; 4635 
struct rq *rq = cpu_rq(cpu); 4636 4637 /* 4638 * There must have been a RCU grace period since the last 4639 * insertion and @dsq should be off the deferred list by now. 4640 */ 4641 if (WARN_ON_ONCE(!list_empty(&dru->node))) { 4642 guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); 4643 list_del_init(&dru->node); 4644 } 4645 } 4646 4647 free_percpu(dsq->pcpu); 4648 } 4649 4650 static void free_dsq_rcufn(struct rcu_head *rcu) 4651 { 4652 struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu); 4653 4654 exit_dsq(dsq); 4655 kfree(dsq); 4656 } 4657 4658 static void free_dsq_irq_workfn(struct irq_work *irq_work) 4659 { 4660 struct llist_node *to_free = llist_del_all(&dsqs_to_free); 4661 struct scx_dispatch_q *dsq, *tmp_dsq; 4662 4663 llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) 4664 call_rcu(&dsq->rcu, free_dsq_rcufn); 4665 } 4666 4667 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); 4668 4669 static void destroy_dsq(struct scx_sched *sch, u64 dsq_id) 4670 { 4671 struct scx_dispatch_q *dsq; 4672 unsigned long flags; 4673 4674 rcu_read_lock(); 4675 4676 dsq = find_user_dsq(sch, dsq_id); 4677 if (!dsq) 4678 goto out_unlock_rcu; 4679 4680 raw_spin_lock_irqsave(&dsq->lock, flags); 4681 4682 if (dsq->nr) { 4683 scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)", 4684 dsq->id, dsq->nr); 4685 goto out_unlock_dsq; 4686 } 4687 4688 if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node, 4689 dsq_hash_params)) 4690 goto out_unlock_dsq; 4691 4692 /* 4693 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from 4694 * queueing more tasks. As this function can be called from anywhere, 4695 * freeing is bounced through an irq work to avoid nesting RCU 4696 * operations inside scheduler locks. 
4697 */ 4698 dsq->id = SCX_DSQ_INVALID; 4699 if (llist_add(&dsq->free_node, &dsqs_to_free)) 4700 irq_work_queue(&free_dsq_irq_work); 4701 4702 out_unlock_dsq: 4703 raw_spin_unlock_irqrestore(&dsq->lock, flags); 4704 out_unlock_rcu: 4705 rcu_read_unlock(); 4706 } 4707 4708 #ifdef CONFIG_EXT_GROUP_SCHED 4709 static void scx_cgroup_exit(struct scx_sched *sch) 4710 { 4711 struct cgroup_subsys_state *css; 4712 4713 scx_cgroup_enabled = false; 4714 4715 /* 4716 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk 4717 * cgroups and exit all the inited ones, all online cgroups are exited. 4718 */ 4719 css_for_each_descendant_post(css, &root_task_group.css) { 4720 struct task_group *tg = css_tg(css); 4721 4722 if (!(tg->scx.flags & SCX_TG_INITED)) 4723 continue; 4724 tg->scx.flags &= ~SCX_TG_INITED; 4725 4726 if (!sch->ops.cgroup_exit) 4727 continue; 4728 4729 SCX_CALL_OP(sch, cgroup_exit, NULL, css->cgroup); 4730 } 4731 } 4732 4733 static int scx_cgroup_init(struct scx_sched *sch) 4734 { 4735 struct cgroup_subsys_state *css; 4736 int ret; 4737 4738 /* 4739 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk 4740 * cgroups and init, all online cgroups are initialized. 
4741 */ 4742 css_for_each_descendant_pre(css, &root_task_group.css) { 4743 struct task_group *tg = css_tg(css); 4744 struct scx_cgroup_init_args args = { 4745 .weight = tg->scx.weight, 4746 .bw_period_us = tg->scx.bw_period_us, 4747 .bw_quota_us = tg->scx.bw_quota_us, 4748 .bw_burst_us = tg->scx.bw_burst_us, 4749 }; 4750 4751 if ((tg->scx.flags & 4752 (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) 4753 continue; 4754 4755 if (!sch->ops.cgroup_init) { 4756 tg->scx.flags |= SCX_TG_INITED; 4757 continue; 4758 } 4759 4760 ret = SCX_CALL_OP_RET(sch, cgroup_init, NULL, 4761 css->cgroup, &args); 4762 if (ret) { 4763 scx_error(sch, "ops.cgroup_init() failed (%d)", ret); 4764 return ret; 4765 } 4766 tg->scx.flags |= SCX_TG_INITED; 4767 } 4768 4769 WARN_ON_ONCE(scx_cgroup_enabled); 4770 scx_cgroup_enabled = true; 4771 4772 return 0; 4773 } 4774 4775 #else 4776 static void scx_cgroup_exit(struct scx_sched *sch) {} 4777 static int scx_cgroup_init(struct scx_sched *sch) { return 0; } 4778 #endif 4779 4780 4781 /******************************************************************************** 4782 * Sysfs interface and ops enable/disable. 
4783 */ 4784 4785 #define SCX_ATTR(_name) \ 4786 static struct kobj_attribute scx_attr_##_name = { \ 4787 .attr = { .name = __stringify(_name), .mode = 0444 }, \ 4788 .show = scx_attr_##_name##_show, \ 4789 } 4790 4791 static ssize_t scx_attr_state_show(struct kobject *kobj, 4792 struct kobj_attribute *ka, char *buf) 4793 { 4794 return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]); 4795 } 4796 SCX_ATTR(state); 4797 4798 static ssize_t scx_attr_switch_all_show(struct kobject *kobj, 4799 struct kobj_attribute *ka, char *buf) 4800 { 4801 return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); 4802 } 4803 SCX_ATTR(switch_all); 4804 4805 static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, 4806 struct kobj_attribute *ka, char *buf) 4807 { 4808 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); 4809 } 4810 SCX_ATTR(nr_rejected); 4811 4812 static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, 4813 struct kobj_attribute *ka, char *buf) 4814 { 4815 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); 4816 } 4817 SCX_ATTR(hotplug_seq); 4818 4819 static ssize_t scx_attr_enable_seq_show(struct kobject *kobj, 4820 struct kobj_attribute *ka, char *buf) 4821 { 4822 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq)); 4823 } 4824 SCX_ATTR(enable_seq); 4825 4826 static struct attribute *scx_global_attrs[] = { 4827 &scx_attr_state.attr, 4828 &scx_attr_switch_all.attr, 4829 &scx_attr_nr_rejected.attr, 4830 &scx_attr_hotplug_seq.attr, 4831 &scx_attr_enable_seq.attr, 4832 NULL, 4833 }; 4834 4835 static const struct attribute_group scx_global_attr_group = { 4836 .attrs = scx_global_attrs, 4837 }; 4838 4839 static void free_pnode(struct scx_sched_pnode *pnode); 4840 static void free_exit_info(struct scx_exit_info *ei); 4841 4842 static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch) 4843 { 4844 size_t size = struct_size_t(struct scx_cmask, bits, 4845 
SCX_CMASK_NR_WORDS(num_possible_cpus())); 4846 int cpu; 4847 4848 if (!sch->is_cid_type || !sch->arena_pool) 4849 return 0; 4850 4851 sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *); 4852 if (!sch->set_cmask_scratch) 4853 return -ENOMEM; 4854 4855 for_each_possible_cpu(cpu) { 4856 struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu); 4857 4858 *slot = scx_arena_alloc(sch, size); 4859 if (!*slot) 4860 return -ENOMEM; 4861 scx_cmask_init(*slot, 0, num_possible_cpus()); 4862 } 4863 return 0; 4864 } 4865 4866 static void scx_set_cmask_scratch_free(struct scx_sched *sch) 4867 { 4868 size_t size = struct_size_t(struct scx_cmask, bits, 4869 SCX_CMASK_NR_WORDS(num_possible_cpus())); 4870 int cpu; 4871 4872 if (!sch->set_cmask_scratch) 4873 return; 4874 4875 for_each_possible_cpu(cpu) { 4876 struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu); 4877 4878 scx_arena_free(sch, *slot, size); 4879 } 4880 free_percpu(sch->set_cmask_scratch); 4881 sch->set_cmask_scratch = NULL; 4882 } 4883 4884 static void scx_sched_free_rcu_work(struct work_struct *work) 4885 { 4886 struct rcu_work *rcu_work = to_rcu_work(work); 4887 struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work); 4888 struct rhashtable_iter rht_iter; 4889 struct scx_dispatch_q *dsq; 4890 int cpu, node; 4891 4892 irq_work_sync(&sch->disable_irq_work); 4893 kthread_destroy_worker(sch->helper); 4894 timer_shutdown_sync(&sch->bypass_lb_timer); 4895 free_cpumask_var(sch->bypass_lb_donee_cpumask); 4896 free_cpumask_var(sch->bypass_lb_resched_cpumask); 4897 4898 #ifdef CONFIG_EXT_SUB_SCHED 4899 kfree(sch->cgrp_path); 4900 if (sch_cgroup(sch)) 4901 cgroup_put(sch_cgroup(sch)); 4902 if (sch->sub_kset) 4903 kobject_put(&sch->sub_kset->kobj); 4904 #endif /* CONFIG_EXT_SUB_SCHED */ 4905 4906 for_each_possible_cpu(cpu) { 4907 struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); 4908 4909 /* 4910 * $sch would have entered bypass mode before the RCU grace 4911 * period. 
As that blocks new deferrals, all 4912 * deferred_reenq_local_node's must be off-list by now. 4913 */ 4914 WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node)); 4915 4916 exit_dsq(bypass_dsq(sch, cpu)); 4917 } 4918 4919 free_percpu(sch->pcpu); 4920 4921 for_each_node_state(node, N_POSSIBLE) 4922 free_pnode(sch->pnode[node]); 4923 kfree(sch->pnode); 4924 4925 rhashtable_walk_enter(&sch->dsq_hash, &rht_iter); 4926 do { 4927 rhashtable_walk_start(&rht_iter); 4928 4929 while (!IS_ERR_OR_NULL((dsq = rhashtable_walk_next(&rht_iter)))) 4930 destroy_dsq(sch, dsq->id); 4931 4932 rhashtable_walk_stop(&rht_iter); 4933 } while (dsq == ERR_PTR(-EAGAIN)); 4934 rhashtable_walk_exit(&rht_iter); 4935 4936 rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); 4937 free_exit_info(sch->exit_info); 4938 scx_set_cmask_scratch_free(sch); 4939 scx_arena_pool_destroy(sch); 4940 if (sch->arena_map) 4941 bpf_map_put(sch->arena_map); 4942 kfree(sch); 4943 } 4944 4945 static void scx_kobj_release(struct kobject *kobj) 4946 { 4947 struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 4948 4949 INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work); 4950 queue_rcu_work(system_dfl_wq, &sch->rcu_work); 4951 } 4952 4953 static ssize_t scx_attr_ops_show(struct kobject *kobj, 4954 struct kobj_attribute *ka, char *buf) 4955 { 4956 struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 4957 4958 return sysfs_emit(buf, "%s\n", sch->ops.name); 4959 } 4960 SCX_ATTR(ops); 4961 4962 #define scx_attr_event_show(buf, at, events, kind) ({ \ 4963 sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \ 4964 }) 4965 4966 static ssize_t scx_attr_events_show(struct kobject *kobj, 4967 struct kobj_attribute *ka, char *buf) 4968 { 4969 struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 4970 struct scx_event_stats events; 4971 int at = 0; 4972 4973 scx_read_events(sch, &events); 4974 at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); 
4975 at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 4976 at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); 4977 at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); 4978 at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 4979 at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_IMMED); 4980 at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_LOCAL_REPEAT); 4981 at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL); 4982 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); 4983 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); 4984 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); 4985 at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED); 4986 at += scx_attr_event_show(buf, at, &events, SCX_EV_SUB_BYPASS_DISPATCH); 4987 return at; 4988 } 4989 SCX_ATTR(events); 4990 4991 static struct attribute *scx_sched_attrs[] = { 4992 &scx_attr_ops.attr, 4993 &scx_attr_events.attr, 4994 NULL, 4995 }; 4996 ATTRIBUTE_GROUPS(scx_sched); 4997 4998 static const struct kobj_type scx_ktype = { 4999 .release = scx_kobj_release, 5000 .sysfs_ops = &kobj_sysfs_ops, 5001 .default_groups = scx_sched_groups, 5002 }; 5003 5004 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) 5005 { 5006 const struct scx_sched *sch; 5007 5008 /* 5009 * scx_uevent() can be reached by both scx_sched kobjects (scx_ktype) 5010 * and sub-scheduler kset kobjects (kset_ktype) through the parent 5011 * chain walk. Filter out the latter to avoid invalid casts. 
5012 */ 5013 if (kobj->ktype != &scx_ktype) 5014 return 0; 5015 5016 sch = container_of(kobj, struct scx_sched, kobj); 5017 5018 return add_uevent_var(env, "SCXOPS=%s", sch->ops.name); 5019 } 5020 5021 static const struct kset_uevent_ops scx_uevent_ops = { 5022 .uevent = scx_uevent, 5023 }; 5024 5025 /* 5026 * Used by sched_fork() and __setscheduler_prio() to pick the matching 5027 * sched_class. dl/rt are already handled. 5028 */ 5029 bool task_should_scx(int policy) 5030 { 5031 /* if disabled, nothing should be on it */ 5032 if (!scx_enabled()) 5033 return false; 5034 5035 /* scx is taking over all SCHED_OTHER and SCHED_EXT tasks */ 5036 if (READ_ONCE(scx_switching_all)) 5037 return true; 5038 5039 /* 5040 * scx is tearing down - keep new SCHED_EXT tasks out. 5041 * 5042 * Must come after scx_switching_all test, which serves as a proxy 5043 * for __scx_switched_all. While __scx_switched_all is set, we must 5044 * return true via the branch above: a fork routed to fair would 5045 * stall because next_active_class() skips fair. 5046 * 5047 * This can develop into a deadlock - scx holds scx_enable_mutex across 5048 * kthread_create() in scx_alloc_and_add_sched(); if the new kthread is 5049 * the stalled task, the disable path can never grab the mutex to clear 5050 * scx_switching_all. 
5051 */ 5052 if (unlikely(scx_enable_state() == SCX_DISABLING)) 5053 return false; 5054 5055 return policy == SCHED_EXT; 5056 } 5057 5058 bool scx_allow_ttwu_queue(const struct task_struct *p) 5059 { 5060 struct scx_sched *sch; 5061 5062 if (!scx_enabled()) 5063 return true; 5064 5065 sch = scx_task_sched(p); 5066 if (unlikely(!sch)) 5067 return true; 5068 5069 if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) 5070 return true; 5071 5072 if (unlikely(p->sched_class != &ext_sched_class)) 5073 return true; 5074 5075 return false; 5076 } 5077 5078 /** 5079 * handle_lockup - sched_ext common lockup handler 5080 * @fmt: format string 5081 * 5082 * Called on system stall or lockup condition and initiates abort of sched_ext 5083 * if enabled, which may resolve the reported lockup. 5084 * 5085 * Returns %true if sched_ext is enabled and abort was initiated, which may 5086 * resolve the lockup. %false if sched_ext is not enabled or abort was already 5087 * initiated by someone else. 5088 */ 5089 static __printf(1, 2) bool handle_lockup(const char *fmt, ...) 5090 { 5091 struct scx_sched *sch; 5092 va_list args; 5093 bool ret; 5094 5095 guard(rcu)(); 5096 5097 sch = rcu_dereference(scx_root); 5098 if (unlikely(!sch)) 5099 return false; 5100 5101 switch (scx_enable_state()) { 5102 case SCX_ENABLING: 5103 case SCX_ENABLED: 5104 va_start(args, fmt); 5105 ret = scx_verror(sch, fmt, args); 5106 va_end(args); 5107 return ret; 5108 default: 5109 return false; 5110 } 5111 } 5112 5113 /** 5114 * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler 5115 * 5116 * While there are various reasons why RCU CPU stalls can occur on a system 5117 * that may not be caused by the current BPF scheduler, try kicking out the 5118 * current scheduler in an attempt to recover the system to a good state before 5119 * issuing panics. 5120 * 5121 * Returns %true if sched_ext is enabled and abort was initiated, which may 5122 * resolve the reported RCU stall. 
%false if sched_ext is not enabled or someone 5123 * else already initiated abort. 5124 */ 5125 bool scx_rcu_cpu_stall(void) 5126 { 5127 return handle_lockup("RCU CPU stall detected!"); 5128 } 5129 5130 /** 5131 * scx_softlockup - sched_ext softlockup handler 5132 * @dur_s: number of seconds of CPU stuck due to soft lockup 5133 * 5134 * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can 5135 * live-lock the system by making many CPUs target the same DSQ to the point 5136 * where soft-lockup detection triggers. This function is called from 5137 * soft-lockup watchdog when the triggering point is close and tries to unjam 5138 * the system and aborting the BPF scheduler. 5139 */ 5140 void scx_softlockup(u32 dur_s) 5141 { 5142 if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s)) 5143 return; 5144 5145 printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n", 5146 smp_processor_id(), dur_s); 5147 } 5148 5149 /* 5150 * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(), 5151 * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing 5152 * it from NMI context can lead to deadlocks. Defer via irq_work; the 5153 * disable path runs off irq_work anyway. 5154 */ 5155 static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1); 5156 5157 static void scx_hardlockup_irq_workfn(struct irq_work *work) 5158 { 5159 int cpu = atomic_xchg(&scx_hardlockup_cpu, -1); 5160 5161 if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu)) 5162 printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", 5163 cpu); 5164 } 5165 5166 static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn); 5167 5168 /** 5169 * scx_hardlockup - sched_ext hardlockup handler 5170 * 5171 * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting 5172 * numerous affinitized tasks in a single queue and directing all CPUs at it. 
5173 * Try kicking out the current scheduler in an attempt to recover the system to 5174 * a good state before taking more drastic actions. 5175 * 5176 * Queues an irq_work; the handle_lockup() call happens in IRQ context (see 5177 * scx_hardlockup_irq_workfn). 5178 * 5179 * Returns %true if sched_ext is enabled and the work was queued, %false 5180 * otherwise. 5181 */ 5182 bool scx_hardlockup(int cpu) 5183 { 5184 if (!rcu_access_pointer(scx_root)) 5185 return false; 5186 5187 atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu); 5188 irq_work_queue(&scx_hardlockup_irq_work); 5189 return true; 5190 } 5191 5192 static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor, 5193 struct cpumask *donee_mask, struct cpumask *resched_mask, 5194 u32 nr_donor_target, u32 nr_donee_target) 5195 { 5196 struct rq *donor_rq = cpu_rq(donor); 5197 struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor); 5198 struct task_struct *p, *n; 5199 struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0); 5200 s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; 5201 u32 nr_balanced = 0, min_delta_us; 5202 5203 /* 5204 * All we want to guarantee is reasonable forward progress. No reason to 5205 * fine tune. Assuming every task on @donor_dsq runs their full slice, 5206 * consider offloading iff the total queued duration is over the 5207 * threshold. 
5208 */ 5209 min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV; 5210 if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us))) 5211 return 0; 5212 5213 raw_spin_rq_lock_irq(donor_rq); 5214 raw_spin_lock(&donor_dsq->lock); 5215 list_add(&cursor.node, &donor_dsq->list); 5216 resume: 5217 n = container_of(&cursor, struct task_struct, scx.dsq_list); 5218 n = nldsq_next_task(donor_dsq, n, false); 5219 5220 while ((p = n)) { 5221 struct scx_dispatch_q *donee_dsq; 5222 int donee; 5223 5224 n = nldsq_next_task(donor_dsq, n, false); 5225 5226 if (donor_dsq->nr <= nr_donor_target) 5227 break; 5228 5229 if (cpumask_empty(donee_mask)) 5230 break; 5231 5232 /* 5233 * If an earlier pass placed @p on @donor_dsq from a different 5234 * CPU and the donee hasn't consumed it yet, @p is still on the 5235 * previous CPU and task_rq(@p) != @donor_rq. @p can't be moved 5236 * without its rq locked. Skip. 5237 */ 5238 if (task_rq(p) != donor_rq) 5239 continue; 5240 5241 donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); 5242 if (donee >= nr_cpu_ids) 5243 continue; 5244 5245 donee_dsq = bypass_dsq(sch, donee); 5246 5247 /* 5248 * $p's rq is not locked but $p's DSQ lock protects its 5249 * scheduling properties making this test safe. 5250 */ 5251 if (!task_can_run_on_remote_rq(sch, p, cpu_rq(donee), false)) 5252 continue; 5253 5254 /* 5255 * Moving $p from one non-local DSQ to another. The source rq 5256 * and DSQ are already locked. Do an abbreviated dequeue and 5257 * then perform enqueue without unlocking $donor_dsq. 5258 * 5259 * We don't want to drop and reacquire the lock on each 5260 * iteration as @donor_dsq can be very long and potentially 5261 * highly contended. Donee DSQs are less likely to be contended. 5262 * The nested locking is safe as only this LB moves tasks 5263 * between bypass DSQs. 
5264 */ 5265 dispatch_dequeue_locked(p, donor_dsq); 5266 dispatch_enqueue(sch, cpu_rq(donee), donee_dsq, p, SCX_ENQ_NESTED); 5267 5268 /* 5269 * $donee might have been idle and need to be woken up. No need 5270 * to be clever. Kick every CPU that receives tasks. 5271 */ 5272 cpumask_set_cpu(donee, resched_mask); 5273 5274 if (READ_ONCE(donee_dsq->nr) >= nr_donee_target) 5275 cpumask_clear_cpu(donee, donee_mask); 5276 5277 nr_balanced++; 5278 if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { 5279 list_move_tail(&cursor.node, &n->scx.dsq_list.node); 5280 raw_spin_unlock(&donor_dsq->lock); 5281 raw_spin_rq_unlock_irq(donor_rq); 5282 cpu_relax(); 5283 raw_spin_rq_lock_irq(donor_rq); 5284 raw_spin_lock(&donor_dsq->lock); 5285 goto resume; 5286 } 5287 } 5288 5289 list_del_init(&cursor.node); 5290 raw_spin_unlock(&donor_dsq->lock); 5291 raw_spin_rq_unlock_irq(donor_rq); 5292 5293 return nr_balanced; 5294 } 5295 5296 static void bypass_lb_node(struct scx_sched *sch, int node) 5297 { 5298 const struct cpumask *node_mask = cpumask_of_node(node); 5299 struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask; 5300 struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask; 5301 u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; 5302 u32 nr_target, nr_donor_target; 5303 u32 before_min = U32_MAX, before_max = 0; 5304 u32 after_min = U32_MAX, after_max = 0; 5305 int cpu; 5306 5307 /* count the target tasks and CPUs */ 5308 for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5309 u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr); 5310 5311 nr_tasks += nr; 5312 nr_cpus++; 5313 5314 before_min = min(nr, before_min); 5315 before_max = max(nr, before_max); 5316 } 5317 5318 if (!nr_cpus) 5319 return; 5320 5321 /* 5322 * We don't want CPUs to have more than $nr_donor_target tasks and 5323 * balancing to fill donee CPUs upto $nr_target. Once targets are 5324 * calculated, find the donee CPUs. 
	 */
	/* per-CPU average rounded up, and the donor cutoff derived from it */
	nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus);
	nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100);

	/* donees are the online CPUs currently queued below $nr_target */
	cpumask_clear(donee_mask);
	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
		if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target)
			cpumask_set_cpu(cpu, donee_mask);
	}

	/* iterate !donee CPUs and see if they should be offloaded */
	cpumask_clear(resched_mask);
	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
		/* bypass_lb_cpu() clears donees as they fill up */
		if (cpumask_empty(donee_mask))
			break;
		if (cpumask_test_cpu(cpu, donee_mask))
			continue;
		if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target)
			continue;

		nr_balanced += bypass_lb_cpu(sch, cpu, donee_mask, resched_mask,
					     nr_donor_target, nr_target);
	}

	/* kick every CPU which received tasks */
	for_each_cpu(cpu, resched_mask)
		resched_cpu(cpu);

	/* sample the queue depths again for the tracepoint */
	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
		u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr);

		after_min = min(nr, after_min);
		after_max = max(nr, after_max);

	}

	trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced,
				  before_min, before_max, after_min, after_max);
}

/*
 * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine
 * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some
 * bypass DSQs can be overloaded. If there are enough tasks to saturate other
 * lightly loaded CPUs, such imbalance can lead to very high execution latency
 * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such
 * outcomes, a simple load balancing mechanism is implemented by the following
 * timer which runs periodically while bypass mode is in effect.
 */
static void scx_bypass_lb_timerfn(struct timer_list *timer)
{
	struct scx_sched *sch = container_of(timer, struct scx_sched, bypass_lb_timer);
	int node;
	u32 intv_us;

	/* bypass dispatch is off, let the timer lapse without rearming */
	if (!bypass_dsp_enabled(sch))
		return;

	for_each_node_with_cpus(node)
		bypass_lb_node(sch, node);

	/* rearm unless the interval is zero */
	intv_us = READ_ONCE(scx_bypass_lb_intv_us);
	if (intv_us)
		mod_timer(timer, jiffies + usecs_to_jiffies(intv_us));
}

/*
 * Increment @sch->bypass_depth. Must be called with scx_bypass_lock held.
 *
 * On the 0 -> 1 transition, switch the default slice to the bypass slice,
 * record the bypass start timestamp, count %SCX_EV_BYPASS_ACTIVATE and return
 * %true. Returns %false on any other transition.
 */
static bool inc_bypass_depth(struct scx_sched *sch)
{
	lockdep_assert_held(&scx_bypass_lock);

	WARN_ON_ONCE(sch->bypass_depth < 0);
	WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1);
	if (sch->bypass_depth != 1)
		return false;

	WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
	sch->bypass_timestamp = ktime_get_ns();
	scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
	return true;
}

/*
 * Decrement @sch->bypass_depth. Must be called with scx_bypass_lock held.
 *
 * On the 1 -> 0 transition, restore the default slice to %SCX_SLICE_DFL, add
 * the time spent bypassing to %SCX_EV_BYPASS_DURATION and return %true.
 * Returns %false on any other transition.
 */
static bool dec_bypass_depth(struct scx_sched *sch)
{
	lockdep_assert_held(&scx_bypass_lock);

	WARN_ON_ONCE(sch->bypass_depth < 1);
	WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1);
	if (sch->bypass_depth != 0)
		return false;

	WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL);
	scx_add_event(sch, SCX_EV_BYPASS_DURATION,
		      ktime_get_ns() - sch->bypass_timestamp);
	return true;
}

/*
 * Claim bypass dispatch for @sch, bump the bypass dispatch enable depth on
 * @sch and on its parent if it has one, and start the host's load balancing
 * timer if it isn't already pending.
 */
static void enable_bypass_dsp(struct scx_sched *sch)
{
	/* the parent hosts the bypass DSQs if there is one, else @sch itself */
	struct scx_sched *host = scx_parent(sch) ?: sch;
	u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us);
	s32 ret;

	/*
	 * @sch->bypass_depth transitioning from 0 to 1 triggers enabling.
	 * Shouldn't stagger.
	 */
	if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim)))
		return;

	/*
	 * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of
	 * the nearest non-bypassing ancestor or root. As enable_bypass_dsp() is
	 * called iff @sch is not already bypassed due to an ancestor bypassing,
	 * we can assume that the parent is not bypassing and thus will be the
	 * host of the bypass DSQs.
	 *
	 * While the situation may change in the future, the following
	 * guarantees that the nearest non-bypassing ancestor or root has bypass
	 * dispatch enabled while a descendant is bypassing, which is all that's
	 * required.
	 *
	 * bypass_dsp_enabled() test is used to determine whether to enter the
	 * bypass dispatch handling path from both bypassing and hosting scheds.
	 * Bump enable depth on both @sch and bypass dispatch host.
	 */
	ret = atomic_inc_return(&sch->bypass_dsp_enable_depth);
	WARN_ON_ONCE(ret <= 0);

	if (host != sch) {
		ret = atomic_inc_return(&host->bypass_dsp_enable_depth);
		WARN_ON_ONCE(ret <= 0);
	}

	/*
	 * The LB timer will stop running if bypass dispatch is disabled. Start
	 * after enabling bypass dispatch.
	 */
	if (intv_us && !timer_pending(&host->bypass_lb_timer))
		mod_timer(&host->bypass_lb_timer,
			  jiffies + usecs_to_jiffies(intv_us));
}

/*
 * Undo enable_bypass_dsp(): release @sch's claim and drop the enable depth on
 * @sch and on its parent if it has one. Does nothing if @sch doesn't hold the
 * claim, so repeated calls are harmless.
 *
 * May be called without holding scx_bypass_lock.
 */
static void disable_bypass_dsp(struct scx_sched *sch)
{
	s32 ret;

	if (!test_and_clear_bit(0, &sch->bypass_dsp_claim))
		return;

	ret = atomic_dec_return(&sch->bypass_dsp_enable_depth);
	WARN_ON_ONCE(ret < 0);

	if (scx_parent(sch)) {
		ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth);
		WARN_ON_ONCE(ret < 0);
	}
}

/**
 * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
 * @sch: sched to bypass
 * @bypass: true for bypass, false for unbypass
 *
 * Bypassing guarantees that all runnable tasks make forward progress without
 * trusting the BPF scheduler.
 * We can't grab any mutexes or rwsems as they might
 * be held by tasks that the BPF scheduler is forgetting to run, which
 * unfortunately also excludes toggling the static branches.
 *
 * Let's work around by overriding a couple ops and modifying behaviors based on
 * the DISABLING state and then cycling the queued tasks through dequeue/enqueue
 * to force global FIFO scheduling.
 *
 * - ops.select_cpu() is ignored and the default select_cpu() is used.
 *
 * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
 *   %SCX_OPS_ENQ_LAST is also ignored.
 *
 * - ops.dispatch() is ignored.
 *
 * - balance_one() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
 *   can't be trusted. Whenever a tick triggers, the running task is rotated to
 *   the tail of the queue with core_sched_at touched.
 *
 * - pick_next_task() suppresses zero slice warning.
 *
 * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM
 *   operations.
 *
 * - scx_prio_less() reverts to the default core_sched_at order.
 *
 * Bypass requests nest through @sch->bypass_depth. Only the outermost bypass
 * and the matching last unbypass go past the depth update and do the work
 * below; nested calls return right after it.
 */
static void scx_bypass(struct scx_sched *sch, bool bypass)
{
	struct scx_sched *pos;
	unsigned long flags;
	int cpu;

	raw_spin_lock_irqsave(&scx_bypass_lock, flags);

	if (bypass) {
		if (!inc_bypass_depth(sch))
			goto unlock;

		enable_bypass_dsp(sch);
	} else {
		if (!dec_bypass_depth(sch))
			goto unlock;
	}

	/*
	 * Bypass state is propagated to all descendants - an scx_sched bypasses
	 * if itself or any of its ancestors are in bypass mode.
	 */
	raw_spin_lock(&scx_sched_lock);
	scx_for_each_descendant_pre(pos, sch) {
		/* @sch's own depth was already updated above */
		if (pos == sch)
			continue;
		if (bypass)
			inc_bypass_depth(pos);
		else
			dec_bypass_depth(pos);
	}
	raw_spin_unlock(&scx_sched_lock);

	/*
	 * No task property is changing. We just need to make sure all currently
	 * queued tasks are re-queued according to the new scx_bypassing()
	 * state. As an optimization, walk each rq's runnable_list instead of
	 * the scx_tasks list.
	 *
	 * This function can't trust the scheduler and thus can't use
	 * cpus_read_lock(). Walk all possible CPUs instead of online.
	 */
	for_each_possible_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p, *n;

		raw_spin_rq_lock(rq);
		raw_spin_lock(&scx_sched_lock);

		/* sync the per-CPU bypassing flag with each sched's depth */
		scx_for_each_descendant_pre(pos, sch) {
			struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu);

			if (pos->bypass_depth)
				pcpu->flags |= SCX_SCHED_PCPU_BYPASSING;
			else
				pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
		}

		raw_spin_unlock(&scx_sched_lock);

		/*
		 * We need to guarantee that no tasks are on the BPF scheduler
		 * while bypassing. Either we see enabled or the enable path
		 * sees scx_bypassing() before moving tasks to SCX.
		 */
		if (!scx_enabled()) {
			raw_spin_rq_unlock(rq);
			continue;
		}

		/*
		 * The use of list_for_each_entry_safe_reverse() is required
		 * because each task is going to be removed from and added back
		 * to the runnable_list during iteration. Because they're added
		 * to the tail of the list, safe reverse iteration can still
		 * visit all nodes.
		 */
		list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
						 scx.runnable_node) {
			/* only tasks on @sch or one of its descendants */
			if (!scx_is_descendant(scx_task_sched(p), sch))
				continue;

			/* cycling deq/enq is enough, see the function comment */
			scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
				/* nothing */ ;
			}
		}

		/* resched to restore ticks and idle state */
		if (cpu_online(cpu) || cpu == smp_processor_id())
			resched_curr(rq);

		raw_spin_rq_unlock(rq);
	}

	/* disarming must come after moving all tasks out of the bypass DSQs */
	if (!bypass)
		disable_bypass_dsp(sch);
unlock:
	raw_spin_unlock_irqrestore(&scx_bypass_lock, flags);
}

/* Free @ei and its dump, message and backtrace buffers. @ei must not be NULL. */
static void free_exit_info(struct scx_exit_info *ei)
{
	kvfree(ei->dump);
	kfree(ei->msg);
	kfree(ei->bt);
	kfree(ei);
}

/*
 * Allocate a zeroed scx_exit_info along with its backtrace, message and
 * @exit_dump_len byte dump buffers. exit_cpu starts out as -1.
 *
 * Returns the new exit info, or %NULL if any allocation fails, in which case
 * everything already allocated is freed.
 */
static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
{
	struct scx_exit_info *ei;

	ei = kzalloc_obj(*ei);
	if (!ei)
		return NULL;

	ei->exit_cpu = -1;
	ei->bt = kzalloc_objs(ei->bt[0], SCX_EXIT_BT_LEN);
	ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
	ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);

	/* checked together; free_exit_info() is passed the NULL members as-is */
	if (!ei->bt || !ei->msg || !ei->dump) {
		free_exit_info(ei);
		return NULL;
	}

	return ei;
}

/* Map @kind to a human readable reason string, "<UNKNOWN>" if not listed. */
static const char *scx_exit_reason(enum scx_exit_kind kind)
{
	switch (kind) {
	case SCX_EXIT_UNREG:
		return "unregistered from user space";
	case SCX_EXIT_UNREG_BPF:
		return "unregistered from BPF";
	case SCX_EXIT_UNREG_KERN:
		return "unregistered from the main kernel";
	case SCX_EXIT_SYSRQ:
		return "disabled by sysrq-S";
	case SCX_EXIT_PARENT:
		return "parent exiting";
	case SCX_EXIT_ERROR:
		return "runtime error";
	case SCX_EXIT_ERROR_BPF:
		return "scx_bpf_error";
	case SCX_EXIT_ERROR_STALL:
		return "runnable task stall";
	default:
		return "<UNKNOWN>";
	}
}

/*
 * Detach every possible CPU's scx_kick_syncs pointer and free the old value
 * after an RCU grace period.
 */
static void free_kick_syncs(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu);
		struct scx_kick_syncs *to_free;

		to_free = rcu_replace_pointer(*ksyncs, NULL, true);
		if (to_free)
			kvfree_rcu(to_free, rcu);
	}
}

/*
 * Recompute the watchdog interval from the scheds on scx_sched_all and
 * requeue the watchdog work accordingly. If the list is empty, the interval
 * stays at ULONG_MAX and the work is cancelled instead.
 */
static void refresh_watchdog(void)
{
	struct scx_sched *sch;
	unsigned long intv = ULONG_MAX;

	/* take the shortest timeout and use its half for watchdog interval */
	rcu_read_lock();
	list_for_each_entry_rcu(sch, &scx_sched_all, all)
		intv = max(min(intv, sch->watchdog_timeout / 2), 1);
	rcu_read_unlock();

	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
	WRITE_ONCE(scx_watchdog_interval, intv);

	if (intv < ULONG_MAX)
		mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv);
	else
		cancel_delayed_work_sync(&scx_watchdog_work);
}

/*
 * Link @sch into the global sched list and, for a sub-sched, into
 * scx_sched_hash and its parent's children list.
 *
 * Returns 0 on success. On failure, reports the error on @sch through
 * scx_error() and returns a negative errno: -ENOENT if the parent is already
 * exiting, or the rhashtable insertion error.
 */
static s32 scx_link_sched(struct scx_sched *sch)
{
	const char *err_msg = "";
	s32 ret = 0;

	scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
#ifdef CONFIG_EXT_SUB_SCHED
		struct scx_sched *parent = scx_parent(sch);

		if (parent) {
			/*
			 * scx_claim_exit() propagates exit_kind transition to
			 * its sub-scheds while holding scx_sched_lock - either
			 * we can see the parent's non-NONE exit_kind or the
			 * parent can shoot us down.
			 */
			if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) {
				err_msg = "parent disabled";
				ret = -ENOENT;
				break;
			}

			ret = rhashtable_lookup_insert_fast(&scx_sched_hash,
					&sch->hash_node, scx_sched_hash_params);
			if (ret) {
				err_msg = "failed to insert into scx_sched_hash";
				break;
			}

			list_add_tail(&sch->sibling, &parent->children);
		}
#endif	/* CONFIG_EXT_SUB_SCHED */

		list_add_tail_rcu(&sch->all, &scx_sched_all);
	}

	/*
	 * scx_error() takes scx_sched_lock via scx_claim_exit(), so it must run after
	 * the guard above is released.
	 */
	if (ret) {
		scx_error(sch, "%s (%d)", err_msg, ret);
		return ret;
	}

	refresh_watchdog();
	return 0;
}

/*
 * Reverse of scx_link_sched(): take @sch off the global list and, for a
 * sub-sched, off scx_sched_hash and the parent's children list, and then
 * refresh the watchdog.
 */
static void scx_unlink_sched(struct scx_sched *sch)
{
	scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
#ifdef CONFIG_EXT_SUB_SCHED
		if (scx_parent(sch)) {
			rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node,
					       scx_sched_hash_params);
			list_del_init(&sch->sibling);
		}
#endif	/* CONFIG_EXT_SUB_SCHED */
		list_del_rcu(&sch->all);
	}

	refresh_watchdog();
}

/*
 * Called to disable future dumps and wait for in-progress one while disabling
 * @sch. Once @sch becomes empty during disable, there's no point in dumping it.
 * This prevents calling dump ops on a dead sch.
 *
 * The flag is set under scx_dump_lock. NOTE(review): waiting for an
 * in-progress dump presumably relies on the dump path holding the same lock -
 * confirm against the dump code.
 */
static void scx_disable_dump(struct scx_sched *sch)
{
	guard(raw_spinlock_irqsave)(&scx_dump_lock);
	sch->dump_disabled = true;
}

/*
 * Log that @sch has been disabled along with the exit reason. Exit kinds at
 * or above %SCX_EXIT_ERROR are logged as errors together with the exit message
 * and, if CONFIG_STACKTRACE, the recorded backtrace. Others are logged at info
 * level.
 */
static void scx_log_sched_disable(struct scx_sched *sch)
{
	struct scx_exit_info *ei = sch->exit_info;
	const char *type = scx_parent(sch) ? "sub-scheduler" : "scheduler";

	if (ei->kind >= SCX_EXIT_ERROR) {
		pr_err("sched_ext: BPF %s \"%s\" disabled (%s)\n", type,
		       sch->ops.name, ei->reason);

		if (ei->msg[0] != '\0')
			pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
#ifdef CONFIG_STACKTRACE
		stack_trace_print(ei->bt, ei->bt_len, 2);
#endif
	} else {
		pr_info("sched_ext: BPF %s \"%s\" disabled (%s)\n", type,
			sch->ops.name, ei->reason);
	}
}

#ifdef CONFIG_EXT_SUB_SCHED
/* woken by scx_sub_disable() once a sub-sched has been unlinked */
static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);

/* Block until @sch has no children left. */
static void drain_descendants(struct scx_sched *sch)
{
	/*
	 * Child scheds that finished the critical part of disabling will take
	 * themselves off @sch->children. Wait for it to drain. As propagation
	 * is recursive, empty @sch->children means that all proper descendant
	 * scheds reached unlinking stage.
	 */
	wait_event(scx_unlink_waitq, list_empty(&sch->children));
}

/*
 * Called while disabling sub-sched @sch when initializing @failed for the
 * parent failed with @fail_code. Trigger an error on the parent, put it in
 * bypass mode and move the tasks of @sch's cgroup which aren't on the parent
 * yet over to it.
 */
static void scx_fail_parent(struct scx_sched *sch,
			    struct task_struct *failed, s32 fail_code)
{
	struct scx_sched *parent = scx_parent(sch);
	struct scx_task_iter sti;
	struct task_struct *p;

	scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
		  fail_code, failed->comm, failed->pid);

	/*
	 * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
	 * it. This may cause downstream failures on the BPF side but $parent is
	 * dying anyway.
	 */
	scx_bypass(parent, true);

	scx_task_iter_start(&sti, sch->cgrp);
	while ((p = scx_task_iter_next_locked(&sti))) {
		/* already on $parent, nothing to do */
		if (scx_task_on_sched(parent, p))
			continue;

		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
			scx_disable_and_exit_task(sch, p);
			scx_set_task_sched(p, parent);
		}
	}
	scx_task_iter_stop(&sti);
}

/*
 * Disable sub-scheduler @sch: bypass it, wait for its descendants, hand all
 * of its tasks back to the parent, unlink it and then run the detach/exit
 * callbacks and remove its kobjects.
 */
static void scx_sub_disable(struct scx_sched *sch)
{
	struct scx_sched *parent = scx_parent(sch);
	struct scx_task_iter sti;
	struct task_struct *p;
	int ret;

	/*
	 * Guarantee forward progress and wait for descendants to be disabled.
	 * To limit disruptions, $parent is not bypassed. Tasks are fully
	 * prepped and then inserted back into $parent.
	 */
	scx_bypass(sch, true);
	drain_descendants(sch);

	/*
	 * Here, every runnable task is guaranteed to make forward progress and
	 * we can safely use blocking synchronization constructs. Actually
	 * disable ops.
	 */
	mutex_lock(&scx_enable_mutex);
	percpu_down_write(&scx_fork_rwsem);
	scx_cgroup_lock();

	set_cgroup_sched(sch_cgroup(sch), parent);

	scx_task_iter_start(&sti, sch->cgrp);
	while ((p = scx_task_iter_next_locked(&sti))) {
		struct rq *rq;
		struct rq_flags rf;

		/* filter out duplicate visits */
		if (scx_task_on_sched(parent, p))
			continue;

		/*
		 * By the time control reaches here, all descendant schedulers
		 * should already have been disabled.
		 */
		WARN_ON_ONCE(!scx_task_on_sched(sch, p));

		/*
		 * @p is pinned by the iter: css_task_iter_next() takes a
		 * reference and holds it until the next iter_next() call, so
		 * @p->usage is guaranteed > 0.
		 */
		get_task_struct(p);

		/* __scx_init_task() below runs with the iter lock released */
		scx_task_iter_unlock(&sti);

		/*
		 * $p is READY or ENABLED on @sch. Initialize for $parent,
		 * disable and exit from @sch, and then switch over to $parent.
		 *
		 * If a task fails to initialize for $parent, the only available
		 * action is disabling $parent too. While this allows disabling
		 * of a child sched to cause the parent scheduler to fail, the
		 * failure can only originate from ops.init_task() of the
		 * parent. A child can't directly affect the parent through its
		 * own failures.
		 */
		ret = __scx_init_task(parent, p, false);
		if (ret) {
			scx_fail_parent(sch, p, ret);
			put_task_struct(p);
			break;
		}

		rq = task_rq_lock(p, &rf);

		if (scx_get_task_state(p) == SCX_TASK_DEAD) {
			/*
			 * sched_ext_dead() raced us between __scx_init_task()
			 * and this rq lock and ran exit_task() on @sch (the
			 * sched @p was on at that point), not on $parent.
			 * $parent's just-completed init is owed an exit_task()
			 * and we issue it here.
			 */
			scx_sub_init_cancel_task(parent, p);
			task_rq_unlock(rq, p, &rf);
			put_task_struct(p);
			continue;
		}

		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
			/*
			 * $p is initialized for $parent and still attached to
			 * @sch. Disable and exit for @sch, switch over to
			 * $parent, override the state to READY to account for
			 * $p having already been initialized, and then enable.
			 */
			scx_disable_and_exit_task(sch, p);
			scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
			scx_set_task_state(p, SCX_TASK_INIT);
			scx_set_task_sched(p, parent);
			scx_set_task_state(p, SCX_TASK_READY);
			scx_enable_task(parent, p);
		}

		task_rq_unlock(rq, p, &rf);
		put_task_struct(p);
	}
	scx_task_iter_stop(&sti);

	scx_disable_dump(sch);

	scx_cgroup_unlock();
	percpu_up_write(&scx_fork_rwsem);

	/*
	 * All tasks are moved off of @sch but there may still be on-going
	 * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
	 * the expedited version as ancestors may be waiting in bypass mode.
	 * Also, tell the parent that there is no need to keep running bypass
	 * DSQs for us.
	 */
	synchronize_rcu_expedited();
	disable_bypass_dsp(sch);

	scx_unlink_sched(sch);

	mutex_unlock(&scx_enable_mutex);

	/*
	 * @sch is now unlinked from the parent's children list. Notify and call
	 * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
	 * after unlinking and releasing all locks. See scx_claim_exit().
	 */
	wake_up_all(&scx_unlink_waitq);

	/* sub_detach is issued only if @sch had been attached */
	if (parent->ops.sub_detach && sch->sub_attached) {
		struct scx_sub_detach_args sub_detach_args = {
			.ops = &sch->ops,
			.cgroup_path = sch->cgrp_path,
		};
		SCX_CALL_OP(parent, sub_detach, NULL,
			    &sub_detach_args);
	}

	scx_log_sched_disable(sch);

	if (sch->ops.exit)
		SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
	if (sch->sub_kset)
		kobject_del(&sch->sub_kset->kobj);
	kobject_del(&sch->kobj);
}
#else	/* CONFIG_EXT_SUB_SCHED */
/* without sub-sched support, there are no descendants to wait for or disable */
static inline void drain_descendants(struct scx_sched *sch) { }
static inline void scx_sub_disable(struct scx_sched *sch) { }
#endif	/* CONFIG_EXT_SUB_SCHED */

/*
 * Disable root scheduler @sch: bypass it, wait for descendants, switch every
 * task out of SCX, turn off the static branches, call ops.exit(), unlink
 * @sch, clear scx_root and finally leave bypass mode.
 */
static void scx_root_disable(struct scx_sched *sch)
{
	struct scx_task_iter sti;
	struct task_struct *p;
	bool was_switched_all;
	int cpu;

	/* guarantee forward progress and wait for descendants to be disabled */
	scx_bypass(sch, true);
	drain_descendants(sch);

	/*
	 * NOTE(review): the switch acts on scx_set_enable_state()'s return
	 * value, presumably the state being replaced - confirm.
	 */
	switch (scx_set_enable_state(SCX_DISABLING)) {
	case SCX_DISABLING:
		WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
		break;
	case SCX_DISABLED:
		pr_warn("sched_ext: ops error detected without ops (%s)\n",
			sch->exit_info->msg);
		WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
		goto done;
	default:
		break;
	}

	/*
	 * Here, every runnable task is guaranteed to make forward progress and
	 * we can safely use blocking synchronization constructs. Actually
	 * disable ops.
	 */
	mutex_lock(&scx_enable_mutex);

	/* sample before the static key is disabled, used for dl_server below */
	was_switched_all = scx_switched_all();

	static_branch_disable(&__scx_switched_all);
	WRITE_ONCE(scx_switching_all, false);

	/*
	 * Shut down cgroup support before tasks so that the cgroup attach path
	 * doesn't race against scx_disable_and_exit_task().
	 */
	scx_cgroup_lock();
	scx_cgroup_exit(sch);
	scx_cgroup_unlock();

	/*
	 * The BPF scheduler is going away. All tasks including %TASK_DEAD ones
	 * must be switched out and exited synchronously.
	 */
	percpu_down_write(&scx_fork_rwsem);

	scx_init_task_enabled = false;

	scx_task_iter_start(&sti, NULL);
	while ((p = scx_task_iter_next_locked(&sti))) {
		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
		const struct sched_class *old_class = p->sched_class;
		const struct sched_class *new_class = scx_setscheduler_class(p);

		/* DEQUEUE_NOCLOCK is passed, so update the clock up front */
		update_rq_clock(task_rq(p));

		if (old_class != new_class)
			queue_flags |= DEQUEUE_CLASS;

		scoped_guard (sched_change, p, queue_flags) {
			p->sched_class = new_class;
		}

		scx_disable_and_exit_task(scx_task_sched(p), p);
	}
	scx_task_iter_stop(&sti);

	scx_disable_dump(sch);

	scx_cgroup_lock();
	set_cgroup_sched(sch_cgroup(sch), NULL);
	scx_cgroup_unlock();

	percpu_up_write(&scx_fork_rwsem);

	/*
	 * Invalidate all the rq clocks to prevent getting outdated
	 * rq clocks from a previous scx scheduler.
	 *
	 * Also re-balance the dl_server bandwidth reservations: detach
	 * ext_server (no more sched_ext tasks) and reinstate fair_server if it
	 * was previously detached because we were running in full mode.
	 *
	 * Unlike the enable path, this runs on a recovery path that cannot
	 * fail, so we use dl_server_swap_bw() to atomically free ext_server's
	 * bandwidth and reclaim it for fair_server under the same dl_b lock.
	 *
	 * The swap can still fail with -EBUSY if someone bumped ext_server's
	 * runtime via debugfs between enable and disable; in that narrow case
	 * both servers end up detached and we just WARN.
	 */
	for_each_possible_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);

		scx_rq_clock_invalidate(rq);

		scoped_guard(rq_lock_irqsave, rq) {
			update_rq_clock(rq);
			if (was_switched_all) {
				if (WARN_ON_ONCE(dl_server_swap_bw(&rq->ext_server,
								   &rq->fair_server)))
					pr_warn("failed to re-attach fair_server on CPU %d\n", cpu);
			} else {
				dl_server_detach_bw(&rq->ext_server);
			}
		}
	}

	/* no task is on scx, turn off all the switches and flush in-progress calls */
	static_branch_disable(&__scx_enabled);
	static_branch_disable(&__scx_is_cid_type);
	if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
		static_branch_disable(&__scx_tid_to_task_enabled);
	bitmap_zero(sch->has_op, SCX_OPI_END);
	scx_idle_disable();
	synchronize_rcu();
	/* the tid hash is torn down only after the RCU grace period above */
	if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
		rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL);

	scx_log_sched_disable(sch);

	if (sch->ops.exit)
		SCX_CALL_OP(sch, exit, NULL, sch->exit_info);

	scx_unlink_sched(sch);

	/*
	 * scx_root clearing must be inside cpus_read_lock(). See
	 * handle_hotplug().
	 */
	cpus_read_lock();
	RCU_INIT_POINTER(scx_root, NULL);
	cpus_read_unlock();

	/*
	 * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
	 * could observe an object of the same name still in the hierarchy when
	 * the next scheduler is loaded.
	 */
#ifdef CONFIG_EXT_SUB_SCHED
	if (sch->sub_kset)
		kobject_del(&sch->sub_kset->kobj);
#endif
	kobject_del(&sch->kobj);

	free_kick_syncs();

	mutex_unlock(&scx_enable_mutex);

	WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
done:
	/* pairs with the scx_bypass(sch, true) at the top */
	scx_bypass(sch, false);
}

/*
 * Claim the exit on @sch. The caller must ensure that the helper kthread work
 * is kicked before the current task can be preempted. Once exit_kind is
 * claimed, scx_error() can no longer trigger, so if the current task gets
 * preempted and the BPF scheduler fails to schedule it back, the helper work
 * will never be kicked and the whole system can wedge.
 */
static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
{
	int none = SCX_EXIT_NONE;

	lockdep_assert_preemption_disabled();

	if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
		kind = SCX_EXIT_ERROR;

	if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
		return false;

	/*
	 * Some CPUs may be trapped in the dispatch paths. Set the aborting
	 * flag to break potential live-lock scenarios, ensuring we can
	 * successfully reach scx_bypass().
	 */
	WRITE_ONCE(sch->aborting, true);

	/*
	 * Propagate exits to descendants immediately. Each has a dedicated
	 * helper kthread and can run in parallel. While most of disabling is
	 * serialized, running them in separate threads allows parallelizing
	 * ops.exit(), which can take arbitrarily long prolonging bypass mode.
	 *
	 * To guarantee forward progress, this propagation must be in-line so
	 * that ->aborting is synchronously asserted for all sub-scheds. The
	 * propagation is also the interlocking point against sub-sched
	 * attachment. See scx_link_sched().
6190 * 6191 * This doesn't cause recursions as propagation only takes place for 6192 * non-propagation exits. 6193 */ 6194 if (kind != SCX_EXIT_PARENT) { 6195 scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) { 6196 struct scx_sched *pos; 6197 scx_for_each_descendant_pre(pos, sch) 6198 scx_disable(pos, SCX_EXIT_PARENT); 6199 } 6200 } 6201 6202 return true; 6203 } 6204 6205 static void scx_disable_workfn(struct kthread_work *work) 6206 { 6207 struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); 6208 struct scx_exit_info *ei = sch->exit_info; 6209 int kind; 6210 6211 kind = atomic_read(&sch->exit_kind); 6212 while (true) { 6213 if (kind == SCX_EXIT_DONE) /* already disabled? */ 6214 return; 6215 WARN_ON_ONCE(kind == SCX_EXIT_NONE); 6216 if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) 6217 break; 6218 } 6219 ei->kind = kind; 6220 ei->reason = scx_exit_reason(ei->kind); 6221 6222 if (scx_parent(sch)) 6223 scx_sub_disable(sch); 6224 else 6225 scx_root_disable(sch); 6226 } 6227 6228 static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) 6229 { 6230 guard(preempt)(); 6231 if (scx_claim_exit(sch, kind)) 6232 irq_work_queue(&sch->disable_irq_work); 6233 } 6234 6235 /** 6236 * scx_flush_disable_work - flush the disable work and wait for it to finish 6237 * @sch: the scheduler 6238 * 6239 * sch->disable_work might still not queued, causing kthread_flush_work() 6240 * as a noop. Syncing the irq_work first is required to guarantee the 6241 * kthread work has been queued before waiting for it. 
6242 */ 6243 static void scx_flush_disable_work(struct scx_sched *sch) 6244 { 6245 int kind; 6246 6247 do { 6248 irq_work_sync(&sch->disable_irq_work); 6249 kthread_flush_work(&sch->disable_work); 6250 kind = atomic_read(&sch->exit_kind); 6251 } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE); 6252 } 6253 6254 static void dump_newline(struct seq_buf *s) 6255 { 6256 trace_sched_ext_dump(""); 6257 6258 /* @s may be zero sized and seq_buf triggers WARN if so */ 6259 if (s->size) 6260 seq_buf_putc(s, '\n'); 6261 } 6262 6263 static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) 6264 { 6265 va_list args; 6266 6267 #ifdef CONFIG_TRACEPOINTS 6268 if (trace_sched_ext_dump_enabled()) { 6269 /* protected by scx_dump_lock */ 6270 static char line_buf[SCX_EXIT_MSG_LEN]; 6271 6272 va_start(args, fmt); 6273 vscnprintf(line_buf, sizeof(line_buf), fmt, args); 6274 va_end(args); 6275 6276 trace_call__sched_ext_dump(line_buf); 6277 } 6278 #endif 6279 /* @s may be zero sized and seq_buf triggers WARN if so */ 6280 if (s->size) { 6281 va_start(args, fmt); 6282 seq_buf_vprintf(s, fmt, args); 6283 va_end(args); 6284 6285 seq_buf_putc(s, '\n'); 6286 } 6287 } 6288 6289 static void dump_stack_trace(struct seq_buf *s, const char *prefix, 6290 const unsigned long *bt, unsigned int len) 6291 { 6292 unsigned int i; 6293 6294 for (i = 0; i < len; i++) 6295 dump_line(s, "%s%pS", prefix, (void *)bt[i]); 6296 } 6297 6298 static void ops_dump_init(struct seq_buf *s, const char *prefix) 6299 { 6300 struct scx_dump_data *dd = &scx_dump_data; 6301 6302 lockdep_assert_irqs_disabled(); 6303 6304 dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ 6305 dd->first = true; 6306 dd->cursor = 0; 6307 dd->s = s; 6308 dd->prefix = prefix; 6309 } 6310 6311 static void ops_dump_flush(void) 6312 { 6313 struct scx_dump_data *dd = &scx_dump_data; 6314 char *line = dd->buf.line; 6315 6316 if (!dd->cursor) 6317 return; 6318 6319 /* 6320 * There's something to flush and this is the 
first line. Insert a blank 6321 * line to distinguish ops dump. 6322 */ 6323 if (dd->first) { 6324 dump_newline(dd->s); 6325 dd->first = false; 6326 } 6327 6328 /* 6329 * There may be multiple lines in $line. Scan and emit each line 6330 * separately. 6331 */ 6332 while (true) { 6333 char *end = line; 6334 char c; 6335 6336 while (*end != '\n' && *end != '\0') 6337 end++; 6338 6339 /* 6340 * If $line overflowed, it may not have newline at the end. 6341 * Always emit with a newline. 6342 */ 6343 c = *end; 6344 *end = '\0'; 6345 dump_line(dd->s, "%s%s", dd->prefix, line); 6346 if (c == '\0') 6347 break; 6348 6349 /* move to the next line */ 6350 end++; 6351 if (*end == '\0') 6352 break; 6353 line = end; 6354 } 6355 6356 dd->cursor = 0; 6357 } 6358 6359 static void ops_dump_exit(void) 6360 { 6361 ops_dump_flush(); 6362 scx_dump_data.cpu = -1; 6363 } 6364 6365 static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx, 6366 struct rq *rq, struct task_struct *p, char marker) 6367 { 6368 static unsigned long bt[SCX_EXIT_BT_LEN]; 6369 struct scx_sched *task_sch = scx_task_sched(p); 6370 const char *own_marker; 6371 char sch_id_buf[32]; 6372 char dsq_id_buf[19] = "(n/a)"; 6373 unsigned long ops_state = atomic_long_read(&p->scx.ops_state); 6374 unsigned int bt_len = 0; 6375 6376 own_marker = task_sch == sch ? 
"*" : ""; 6377 6378 if (task_sch->level == 0) 6379 scnprintf(sch_id_buf, sizeof(sch_id_buf), "root"); 6380 else 6381 scnprintf(sch_id_buf, sizeof(sch_id_buf), "sub%d-%llu", 6382 task_sch->level, task_sch->ops.sub_cgroup_id); 6383 6384 if (p->scx.dsq) 6385 scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", 6386 (unsigned long long)p->scx.dsq->id); 6387 6388 dump_newline(s); 6389 dump_line(s, " %c%c %s[%d] %s%s %+ldms", 6390 marker, task_state_to_char(p), p->comm, p->pid, 6391 own_marker, sch_id_buf, 6392 jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); 6393 dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", 6394 scx_get_task_state(p) >> SCX_TASK_STATE_SHIFT, 6395 p->scx.flags & ~SCX_TASK_STATE_MASK, 6396 p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, 6397 ops_state >> SCX_OPSS_QSEQ_SHIFT); 6398 dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", 6399 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); 6400 dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", 6401 p->scx.dsq_vtime, p->scx.slice, p->scx.weight); 6402 dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr), 6403 p->migration_disabled); 6404 6405 if (SCX_HAS_OP(sch, dump_task)) { 6406 ops_dump_init(s, " "); 6407 SCX_CALL_OP(sch, dump_task, rq, dctx, p); 6408 ops_dump_exit(); 6409 } 6410 6411 #ifdef CONFIG_STACKTRACE 6412 bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); 6413 #endif 6414 if (bt_len) { 6415 dump_newline(s); 6416 dump_stack_trace(s, " ", bt, bt_len); 6417 } 6418 } 6419 6420 static void scx_dump_cpu(struct scx_sched *sch, struct seq_buf *s, 6421 struct scx_dump_ctx *dctx, int cpu, 6422 bool dump_all_tasks) 6423 { 6424 struct rq *rq = cpu_rq(cpu); 6425 struct rq_flags rf; 6426 struct task_struct *p; 6427 struct seq_buf ns; 6428 size_t avail, used; 6429 char *buf; 6430 bool idle; 6431 6432 rq_lock_irqsave(rq, &rf); 6433 6434 idle = list_empty(&rq->scx.runnable_list) && 6435 rq->curr->sched_class == &idle_sched_class; 6436 6437 if 
(idle && !SCX_HAS_OP(sch, dump_cpu)) 6438 goto next; 6439 6440 /* 6441 * We don't yet know whether ops.dump_cpu() will produce output 6442 * and we may want to skip the default CPU dump if it doesn't. 6443 * Use a nested seq_buf to generate the standard dump so that we 6444 * can decide whether to commit later. 6445 */ 6446 avail = seq_buf_get_buf(s, &buf); 6447 seq_buf_init(&ns, buf, avail); 6448 6449 dump_newline(&ns); 6450 dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu", 6451 cpu, rq->scx.nr_running, rq->scx.flags, 6452 rq->scx.cpu_released, rq->scx.ops_qseq, 6453 rq->scx.kick_sync); 6454 dump_line(&ns, " curr=%s[%d] class=%ps", 6455 rq->curr->comm, rq->curr->pid, 6456 rq->curr->sched_class); 6457 if (!cpumask_empty(rq->scx.cpus_to_kick)) 6458 dump_line(&ns, " cpus_to_kick : %*pb", 6459 cpumask_pr_args(rq->scx.cpus_to_kick)); 6460 if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) 6461 dump_line(&ns, " idle_to_kick : %*pb", 6462 cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); 6463 if (!cpumask_empty(rq->scx.cpus_to_preempt)) 6464 dump_line(&ns, " cpus_to_preempt: %*pb", 6465 cpumask_pr_args(rq->scx.cpus_to_preempt)); 6466 if (!cpumask_empty(rq->scx.cpus_to_wait)) 6467 dump_line(&ns, " cpus_to_wait : %*pb", 6468 cpumask_pr_args(rq->scx.cpus_to_wait)); 6469 if (!cpumask_empty(rq->scx.cpus_to_sync)) 6470 dump_line(&ns, " cpus_to_sync : %*pb", 6471 cpumask_pr_args(rq->scx.cpus_to_sync)); 6472 6473 used = seq_buf_used(&ns); 6474 if (SCX_HAS_OP(sch, dump_cpu)) { 6475 ops_dump_init(&ns, " "); 6476 SCX_CALL_OP(sch, dump_cpu, rq, dctx, scx_cpu_arg(cpu), idle); 6477 ops_dump_exit(); 6478 } 6479 6480 /* 6481 * If idle && nothing generated by ops.dump_cpu(), there's 6482 * nothing interesting. Skip. 6483 */ 6484 if (idle && used == seq_buf_used(&ns)) 6485 goto next; 6486 6487 /* 6488 * $s may already have overflowed when $ns was created. If so, 6489 * calling commit on it will trigger BUG. 
6490 */ 6491 if (avail) { 6492 seq_buf_commit(s, seq_buf_used(&ns)); 6493 if (seq_buf_has_overflowed(&ns)) 6494 seq_buf_set_overflow(s); 6495 } 6496 6497 if (rq->curr->sched_class == &ext_sched_class && 6498 (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) 6499 scx_dump_task(sch, s, dctx, rq, rq->curr, '*'); 6500 6501 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) 6502 if (dump_all_tasks || scx_task_on_sched(sch, p)) 6503 scx_dump_task(sch, s, dctx, rq, p, ' '); 6504 next: 6505 rq_unlock_irqrestore(rq, &rf); 6506 } 6507 6508 /* 6509 * Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless 6510 * of which scheduler they belong to. If false, only dump tasks owned by @sch. 6511 * For SysRq-D dumps, @dump_all_tasks=false since all schedulers are dumped 6512 * separately. For error dumps, @dump_all_tasks=true since only the failing 6513 * scheduler is dumped. 6514 */ 6515 static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, 6516 size_t dump_len, bool dump_all_tasks) 6517 { 6518 static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; 6519 struct scx_dump_ctx dctx = { 6520 .kind = ei->kind, 6521 .exit_code = ei->exit_code, 6522 .reason = ei->reason, 6523 .at_ns = ktime_get_ns(), 6524 .at_jiffies = jiffies, 6525 }; 6526 struct seq_buf s; 6527 struct scx_event_stats events; 6528 int cpu; 6529 6530 guard(raw_spinlock_irqsave)(&scx_dump_lock); 6531 6532 if (sch->dump_disabled) 6533 return; 6534 6535 seq_buf_init(&s, ei->dump, dump_len); 6536 6537 #ifdef CONFIG_EXT_SUB_SCHED 6538 if (sch->level == 0) 6539 dump_line(&s, "%s: root", sch->ops.name); 6540 else 6541 dump_line(&s, "%s: sub%d-%llu %s", 6542 sch->ops.name, sch->level, sch->ops.sub_cgroup_id, 6543 sch->cgrp_path); 6544 #endif 6545 if (ei->kind == SCX_EXIT_NONE) { 6546 dump_line(&s, "Debug dump triggered by %s", ei->reason); 6547 } else { 6548 if (ei->exit_cpu >= 0) 6549 dump_line(&s, "%s[%d] triggered exit kind %d on CPU %d:", 6550 
current->comm, current->pid, ei->kind, 6551 ei->exit_cpu); 6552 else 6553 dump_line(&s, "%s[%d] triggered exit kind %d:", 6554 current->comm, current->pid, ei->kind); 6555 dump_line(&s, " %s (%s)", ei->reason, ei->msg); 6556 dump_newline(&s); 6557 dump_line(&s, "Backtrace:"); 6558 dump_stack_trace(&s, " ", ei->bt, ei->bt_len); 6559 } 6560 6561 if (SCX_HAS_OP(sch, dump)) { 6562 ops_dump_init(&s, ""); 6563 SCX_CALL_OP(sch, dump, NULL, &dctx); 6564 ops_dump_exit(); 6565 } 6566 6567 dump_newline(&s); 6568 dump_line(&s, "CPU states"); 6569 dump_line(&s, "----------"); 6570 6571 /* 6572 * Dump the exit CPU first so it isn't lost to dump truncation, then 6573 * walk the rest in order, skipping the one already dumped. 6574 */ 6575 if (ei->exit_cpu >= 0) 6576 scx_dump_cpu(sch, &s, &dctx, ei->exit_cpu, dump_all_tasks); 6577 for_each_possible_cpu(cpu) { 6578 if (cpu != ei->exit_cpu) 6579 scx_dump_cpu(sch, &s, &dctx, cpu, dump_all_tasks); 6580 } 6581 6582 dump_newline(&s); 6583 dump_line(&s, "Event counters"); 6584 dump_line(&s, "--------------"); 6585 6586 scx_read_events(sch, &events); 6587 scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); 6588 scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 6589 scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); 6590 scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); 6591 scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 6592 scx_dump_event(s, &events, SCX_EV_REENQ_IMMED); 6593 scx_dump_event(s, &events, SCX_EV_REENQ_LOCAL_REPEAT); 6594 scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL); 6595 scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); 6596 scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); 6597 scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); 6598 scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED); 6599 scx_dump_event(s, &events, SCX_EV_SUB_BYPASS_DISPATCH); 6600 6601 if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) 6602 memcpy(ei->dump + dump_len - 
sizeof(trunc_marker), 6603 trunc_marker, sizeof(trunc_marker)); 6604 } 6605 6606 static void scx_disable_irq_workfn(struct irq_work *irq_work) 6607 { 6608 struct scx_sched *sch = container_of(irq_work, struct scx_sched, disable_irq_work); 6609 struct scx_exit_info *ei = sch->exit_info; 6610 6611 if (ei->kind >= SCX_EXIT_ERROR) 6612 scx_dump_state(sch, ei, sch->ops.exit_dump_len, true); 6613 6614 kthread_queue_work(sch->helper, &sch->disable_work); 6615 } 6616 6617 bool scx_vexit(struct scx_sched *sch, 6618 enum scx_exit_kind kind, s64 exit_code, s32 exit_cpu, 6619 const char *fmt, va_list args) 6620 { 6621 struct scx_exit_info *ei = sch->exit_info; 6622 6623 guard(preempt)(); 6624 6625 if (!scx_claim_exit(sch, kind)) 6626 return false; 6627 6628 ei->exit_code = exit_code; 6629 #ifdef CONFIG_STACKTRACE 6630 if (kind >= SCX_EXIT_ERROR) 6631 ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); 6632 #endif 6633 vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); 6634 6635 /* 6636 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again 6637 * in scx_disable_workfn(). 6638 */ 6639 ei->kind = kind; 6640 ei->reason = scx_exit_reason(ei->kind); 6641 ei->exit_cpu = exit_cpu; 6642 6643 irq_work_queue(&sch->disable_irq_work); 6644 return true; 6645 } 6646 6647 static int alloc_kick_syncs(void) 6648 { 6649 int cpu; 6650 6651 /* 6652 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size 6653 * can exceed percpu allocator limits on large machines. 
6654 */ 6655 for_each_possible_cpu(cpu) { 6656 struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); 6657 struct scx_kick_syncs *new_ksyncs; 6658 6659 WARN_ON_ONCE(rcu_access_pointer(*ksyncs)); 6660 6661 new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids), 6662 GFP_KERNEL, cpu_to_node(cpu)); 6663 if (!new_ksyncs) { 6664 free_kick_syncs(); 6665 return -ENOMEM; 6666 } 6667 6668 rcu_assign_pointer(*ksyncs, new_ksyncs); 6669 } 6670 6671 return 0; 6672 } 6673 6674 static void free_pnode(struct scx_sched_pnode *pnode) 6675 { 6676 if (!pnode) 6677 return; 6678 exit_dsq(&pnode->global_dsq); 6679 kfree(pnode); 6680 } 6681 6682 static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node) 6683 { 6684 struct scx_sched_pnode *pnode; 6685 6686 pnode = kzalloc_node(sizeof(*pnode), GFP_KERNEL, node); 6687 if (!pnode) 6688 return NULL; 6689 6690 if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) { 6691 kfree(pnode); 6692 return NULL; 6693 } 6694 6695 return pnode; 6696 } 6697 6698 /* 6699 * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid 6700 * starvation. During the READY -> ENABLED task switching loop, the calling 6701 * thread's sched_class gets switched from fair to ext. As fair has higher 6702 * priority than ext, the calling thread can be indefinitely starved under 6703 * fair-class saturation, leading to a system hang. 6704 */ 6705 struct scx_enable_cmd { 6706 struct kthread_work work; 6707 union { 6708 struct sched_ext_ops *ops; 6709 struct sched_ext_ops_cid *ops_cid; 6710 }; 6711 bool is_cid_type; 6712 struct bpf_map *arena_map; /* arena ref to transfer to sch */ 6713 int ret; 6714 }; 6715 6716 /* 6717 * Allocate and initialize a new scx_sched. @cgrp's reference is always 6718 * consumed whether the function succeeds or fails. 
6719 */ 6720 static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd, 6721 struct cgroup *cgrp, 6722 struct scx_sched *parent) 6723 { 6724 struct sched_ext_ops *ops = cmd->ops; 6725 struct scx_sched *sch; 6726 s32 level = parent ? parent->level + 1 : 0; 6727 s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids; 6728 6729 sch = kzalloc_flex(*sch, ancestors, level + 1); 6730 if (!sch) { 6731 ret = -ENOMEM; 6732 goto err_put_cgrp; 6733 } 6734 6735 sch->exit_info = alloc_exit_info(ops->exit_dump_len); 6736 if (!sch->exit_info) { 6737 ret = -ENOMEM; 6738 goto err_free_sch; 6739 } 6740 6741 ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params); 6742 if (ret < 0) 6743 goto err_free_ei; 6744 6745 sch->pnode = kzalloc_objs(sch->pnode[0], nr_node_ids); 6746 if (!sch->pnode) { 6747 ret = -ENOMEM; 6748 goto err_free_hash; 6749 } 6750 6751 for_each_node_state(node, N_POSSIBLE) { 6752 sch->pnode[node] = alloc_pnode(sch, node); 6753 if (!sch->pnode[node]) { 6754 ret = -ENOMEM; 6755 goto err_free_pnode; 6756 } 6757 } 6758 6759 sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; 6760 sch->pcpu = __alloc_percpu(struct_size_t(struct scx_sched_pcpu, 6761 dsp_ctx.buf, sch->dsp_max_batch), 6762 __alignof__(struct scx_sched_pcpu)); 6763 if (!sch->pcpu) { 6764 ret = -ENOMEM; 6765 goto err_free_pnode; 6766 } 6767 6768 for_each_possible_cpu(cpu) { 6769 ret = init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch); 6770 if (ret) { 6771 bypass_fail_cpu = cpu; 6772 goto err_free_pcpu; 6773 } 6774 } 6775 6776 for_each_possible_cpu(cpu) { 6777 struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); 6778 6779 pcpu->sch = sch; 6780 INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node); 6781 } 6782 6783 sch->helper = kthread_run_worker(0, "sched_ext_helper"); 6784 if (IS_ERR(sch->helper)) { 6785 ret = PTR_ERR(sch->helper); 6786 goto err_free_pcpu; 6787 } 6788 6789 sched_set_fifo(sch->helper->task); 6790 6791 if (parent) 6792 memcpy(sch->ancestors, parent->ancestors, 
6793 level * sizeof(parent->ancestors[0])); 6794 sch->ancestors[level] = sch; 6795 sch->level = level; 6796 6797 if (ops->timeout_ms) 6798 sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms); 6799 else 6800 sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT; 6801 6802 sch->slice_dfl = SCX_SLICE_DFL; 6803 atomic_set(&sch->exit_kind, SCX_EXIT_NONE); 6804 sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn); 6805 kthread_init_work(&sch->disable_work, scx_disable_workfn); 6806 timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); 6807 6808 if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) { 6809 ret = -ENOMEM; 6810 goto err_stop_helper; 6811 } 6812 if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) { 6813 ret = -ENOMEM; 6814 goto err_free_lb_cpumask; 6815 } 6816 /* 6817 * Copy ops through the right union view. For cid-form the source is 6818 * struct sched_ext_ops_cid which lacks the trailing cpu_acquire/ 6819 * cpu_release; those stay zero from kzalloc. 
6820 */ 6821 if (cmd->is_cid_type) { 6822 sch->ops_cid = *cmd->ops_cid; 6823 sch->is_cid_type = true; 6824 } else { 6825 sch->ops = *cmd->ops; 6826 } 6827 6828 rcu_assign_pointer(ops->priv, sch); 6829 6830 sch->kobj.kset = scx_kset; 6831 INIT_LIST_HEAD(&sch->all); 6832 6833 #ifdef CONFIG_EXT_SUB_SCHED 6834 char *buf = kzalloc(PATH_MAX, GFP_KERNEL); 6835 if (!buf) { 6836 ret = -ENOMEM; 6837 goto err_free_lb_resched; 6838 } 6839 cgroup_path(cgrp, buf, PATH_MAX); 6840 sch->cgrp_path = kstrdup(buf, GFP_KERNEL); 6841 kfree(buf); 6842 if (!sch->cgrp_path) { 6843 ret = -ENOMEM; 6844 goto err_free_lb_resched; 6845 } 6846 6847 sch->cgrp = cgrp; 6848 INIT_LIST_HEAD(&sch->children); 6849 INIT_LIST_HEAD(&sch->sibling); 6850 6851 if (parent) 6852 ret = kobject_init_and_add(&sch->kobj, &scx_ktype, 6853 &parent->sub_kset->kobj, 6854 "sub-%llu", cgroup_id(cgrp)); 6855 else 6856 ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); 6857 6858 if (ret < 0) { 6859 RCU_INIT_POINTER(ops->priv, NULL); 6860 kobject_put(&sch->kobj); 6861 return ERR_PTR(ret); 6862 } 6863 6864 if (ops->sub_attach) { 6865 sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); 6866 if (!sch->sub_kset) { 6867 RCU_INIT_POINTER(ops->priv, NULL); 6868 kobject_put(&sch->kobj); 6869 return ERR_PTR(-ENOMEM); 6870 } 6871 } 6872 #else /* CONFIG_EXT_SUB_SCHED */ 6873 ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); 6874 if (ret < 0) { 6875 RCU_INIT_POINTER(ops->priv, NULL); 6876 kobject_put(&sch->kobj); 6877 return ERR_PTR(ret); 6878 } 6879 #endif /* CONFIG_EXT_SUB_SCHED */ 6880 6881 /* 6882 * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so 6883 * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid 6884 * drops the ref. After this point, sch owns the ref and any cleanup 6885 * runs through scx_sched_free_rcu_work() which puts it. 
6886 */ 6887 sch->arena_map = cmd->arena_map; 6888 /* BPF arena is only available on MMU && 64BIT */ 6889 #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) 6890 if (sch->arena_map) 6891 sch->arena_kern_base = bpf_arena_map_kern_vm_start(sch->arena_map); 6892 #endif 6893 cmd->arena_map = NULL; 6894 return sch; 6895 6896 #ifdef CONFIG_EXT_SUB_SCHED 6897 err_free_lb_resched: 6898 RCU_INIT_POINTER(ops->priv, NULL); 6899 free_cpumask_var(sch->bypass_lb_resched_cpumask); 6900 #endif 6901 err_free_lb_cpumask: 6902 free_cpumask_var(sch->bypass_lb_donee_cpumask); 6903 err_stop_helper: 6904 kthread_destroy_worker(sch->helper); 6905 err_free_pcpu: 6906 for_each_possible_cpu(cpu) { 6907 if (cpu == bypass_fail_cpu) 6908 break; 6909 exit_dsq(bypass_dsq(sch, cpu)); 6910 } 6911 free_percpu(sch->pcpu); 6912 err_free_pnode: 6913 for_each_node_state(node, N_POSSIBLE) 6914 free_pnode(sch->pnode[node]); 6915 kfree(sch->pnode); 6916 err_free_hash: 6917 rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); 6918 err_free_ei: 6919 free_exit_info(sch->exit_info); 6920 err_free_sch: 6921 kfree(sch); 6922 err_put_cgrp: 6923 #ifdef CONFIG_EXT_SUB_SCHED 6924 cgroup_put(cgrp); 6925 #endif 6926 return ERR_PTR(ret); 6927 } 6928 6929 static int check_hotplug_seq(struct scx_sched *sch, 6930 const struct sched_ext_ops *ops) 6931 { 6932 unsigned long long global_hotplug_seq; 6933 6934 /* 6935 * If a hotplug event has occurred between when a scheduler was 6936 * initialized, and when we were able to attach, exit and notify user 6937 * space about it. 
6938 */ 6939 if (ops->hotplug_seq) { 6940 global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); 6941 if (ops->hotplug_seq != global_hotplug_seq) { 6942 scx_exit(sch, SCX_EXIT_UNREG_KERN, 6943 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 6944 "expected hotplug seq %llu did not match actual %llu", 6945 ops->hotplug_seq, global_hotplug_seq); 6946 return -EBUSY; 6947 } 6948 } 6949 6950 return 0; 6951 } 6952 6953 static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) 6954 { 6955 /* 6956 * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the 6957 * ops.enqueue() callback isn't implemented. 6958 */ 6959 if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { 6960 scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); 6961 return -EINVAL; 6962 } 6963 6964 /* 6965 * SCX_OPS_TID_TO_TASK is enabled by the root scheduler. A sub-sched 6966 * may set it to declare a dependency; reject if the root hasn't 6967 * enabled it. 6968 */ 6969 if ((ops->flags & SCX_OPS_TID_TO_TASK) && scx_parent(sch) && 6970 !(scx_root->ops.flags & SCX_OPS_TID_TO_TASK)) { 6971 scx_error(sch, "SCX_OPS_TID_TO_TASK requires root scheduler to enable it"); 6972 return -EINVAL; 6973 } 6974 6975 /* 6976 * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle 6977 * selection policy to be enabled. 6978 */ 6979 if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && 6980 (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { 6981 scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); 6982 return -EINVAL; 6983 } 6984 6985 /* 6986 * cid-form's struct is shorter and doesn't include the cpu_acquire / 6987 * cpu_release tail; reading those fields off a cid-form @ops would 6988 * run past the BPF allocation. Skip for cid-form. 
6989 */ 6990 if (!sch->is_cid_type && (ops->cpu_acquire || ops->cpu_release)) 6991 pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); 6992 6993 /* 6994 * Sub-scheduler support is tied to the cid-form struct_ops. A sub-sched 6995 * attaches through a cid-form-only interface (sub_attach/sub_detach), 6996 * and a root that accepts sub-scheds must expose cid-form state to 6997 * them. Reject cpu-form schedulers on either side. 6998 */ 6999 if (!sch->is_cid_type) { 7000 if (scx_parent(sch)) { 7001 scx_error(sch, "sub-sched requires cid-form struct_ops"); 7002 return -EINVAL; 7003 } 7004 if (ops->sub_attach || ops->sub_detach) { 7005 scx_error(sch, "sub_attach/sub_detach requires cid-form struct_ops"); 7006 return -EINVAL; 7007 } 7008 } 7009 7010 return 0; 7011 } 7012 7013 static void scx_root_enable_workfn(struct kthread_work *work) 7014 { 7015 struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); 7016 struct sched_ext_ops *ops = cmd->ops; 7017 struct cgroup *cgrp = root_cgroup(); 7018 struct scx_sched *sch; 7019 struct scx_task_iter sti; 7020 struct task_struct *p; 7021 int i, cpu, ret; 7022 7023 mutex_lock(&scx_enable_mutex); 7024 7025 if (scx_enable_state() != SCX_DISABLED) { 7026 ret = -EBUSY; 7027 goto err_unlock; 7028 } 7029 7030 /* 7031 * @ops->priv binds @ops to its scx_sched instance. It is set here by 7032 * scx_alloc_and_add_sched() and cleared at the tail of bpf_scx_unreg(), 7033 * which runs after scx_root_disable() has dropped scx_enable_mutex. If 7034 * it's still non-NULL here, a previous attachment on @ops has not 7035 * finished tearing down; proceeding would let the in-flight unreg's 7036 * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign. 
7037 */ 7038 if (rcu_access_pointer(ops->priv)) { 7039 ret = -EBUSY; 7040 goto err_unlock; 7041 } 7042 7043 ret = alloc_kick_syncs(); 7044 if (ret) 7045 goto err_unlock; 7046 7047 if (ops->flags & SCX_OPS_TID_TO_TASK) { 7048 ret = rhashtable_init(&scx_tid_hash, &scx_tid_hash_params); 7049 if (ret) 7050 goto err_free_ksyncs; 7051 } 7052 7053 #ifdef CONFIG_EXT_SUB_SCHED 7054 cgroup_get(cgrp); 7055 #endif 7056 sch = scx_alloc_and_add_sched(cmd, cgrp, NULL); 7057 if (IS_ERR(sch)) { 7058 ret = PTR_ERR(sch); 7059 goto err_free_tid_hash; 7060 } 7061 7062 if (sch->is_cid_type) 7063 static_branch_enable(&__scx_is_cid_type); 7064 7065 /* 7066 * Transition to ENABLING and clear exit info to arm the disable path. 7067 * Failure triggers full disabling from here on. 7068 */ 7069 WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); 7070 WARN_ON_ONCE(scx_root); 7071 7072 atomic_long_set(&scx_nr_rejected, 0); 7073 7074 for_each_possible_cpu(cpu) { 7075 struct rq *rq = cpu_rq(cpu); 7076 7077 rq->scx.local_dsq.sched = sch; 7078 rq->scx.cpuperf_target = SCX_CPUPERF_ONE; 7079 } 7080 7081 /* 7082 * Keep CPUs stable during enable so that the BPF scheduler can track 7083 * online CPUs by watching ->on/offline_cpu() after ->init(). 7084 */ 7085 cpus_read_lock(); 7086 7087 /* 7088 * Build the cid mapping before publishing scx_root. The cid kfuncs 7089 * dereference the cid arrays unconditionally once scx_prog_sched() 7090 * returns non-NULL; the rcu_assign_pointer() below pairs with their 7091 * rcu_dereference() to make the populated arrays visible. 7092 */ 7093 ret = scx_cid_init(sch); 7094 if (ret) { 7095 cpus_read_unlock(); 7096 goto err_disable; 7097 } 7098 7099 /* 7100 * Make the scheduler instance visible. Must be inside cpus_read_lock(). 7101 * See handle_hotplug(). 
7102 */ 7103 rcu_assign_pointer(scx_root, sch); 7104 7105 ret = scx_link_sched(sch); 7106 if (ret) { 7107 cpus_read_unlock(); 7108 goto err_disable; 7109 } 7110 7111 scx_idle_enable(ops); 7112 7113 if (sch->ops.init) { 7114 ret = SCX_CALL_OP_RET(sch, init, NULL); 7115 if (ret) { 7116 ret = ops_sanitize_err(sch, "init", ret); 7117 cpus_read_unlock(); 7118 scx_error(sch, "ops.init() failed (%d)", ret); 7119 goto err_disable; 7120 } 7121 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 7122 } 7123 7124 ret = scx_arena_pool_init(sch); 7125 if (ret) { 7126 cpus_read_unlock(); 7127 goto err_disable; 7128 } 7129 7130 ret = scx_set_cmask_scratch_alloc(sch); 7131 if (ret) { 7132 cpus_read_unlock(); 7133 goto err_disable; 7134 } 7135 7136 for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) 7137 if (((void (**)(void))ops)[i]) 7138 set_bit(i, sch->has_op); 7139 7140 ret = check_hotplug_seq(sch, ops); 7141 if (ret) { 7142 cpus_read_unlock(); 7143 goto err_disable; 7144 } 7145 scx_idle_update_selcpu_topology(ops); 7146 7147 cpus_read_unlock(); 7148 7149 ret = validate_ops(sch, ops); 7150 if (ret) 7151 goto err_disable; 7152 7153 /* 7154 * Attach the ext_server bandwidth reservation before anything is 7155 * committed so that we can fail the enable if the root domain cannot 7156 * accommodate it. The matching fair_server detach is deferred to the 7157 * tail of this function, after the switch is fully committed and can no 7158 * longer fail. 7159 * 7160 * On failure, err_disable funnels into scx_root_disable() which 7161 * detaches ext_server, so partially-attached state is cleaned up 7162 * automatically. 
7163 */ 7164 for_each_possible_cpu(cpu) { 7165 struct rq *rq = cpu_rq(cpu); 7166 7167 scoped_guard(rq_lock_irqsave, rq) { 7168 update_rq_clock(rq); 7169 ret = dl_server_attach_bw(&rq->ext_server); 7170 } 7171 if (ret) { 7172 pr_warn("sched_ext: failed to attach ext_server on CPU %d (%d)\n", 7173 cpu, ret); 7174 goto err_disable; 7175 } 7176 } 7177 7178 /* 7179 * Once __scx_enabled is set, %current can be switched to SCX anytime. 7180 * This can lead to stalls as some BPF schedulers (e.g. userspace 7181 * scheduling) may not function correctly before all tasks are switched. 7182 * Init in bypass mode to guarantee forward progress. 7183 */ 7184 scx_bypass(sch, true); 7185 7186 for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) 7187 if (((void (**)(void))ops)[i]) 7188 set_bit(i, sch->has_op); 7189 7190 if (sch->ops.cpu_acquire || sch->ops.cpu_release) 7191 sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT; 7192 7193 /* 7194 * Lock out forks, cgroup on/offlining and moves before opening the 7195 * floodgate so that they don't wander into the operations prematurely. 7196 */ 7197 percpu_down_write(&scx_fork_rwsem); 7198 7199 WARN_ON_ONCE(scx_init_task_enabled); 7200 scx_init_task_enabled = true; 7201 7202 /* flip under fork_rwsem; the iter below covers existing tasks */ 7203 if (ops->flags & SCX_OPS_TID_TO_TASK) 7204 static_branch_enable(&__scx_tid_to_task_enabled); 7205 7206 /* 7207 * Enable ops for every task. Fork is excluded by scx_fork_rwsem 7208 * preventing new tasks from being added. No need to exclude tasks 7209 * leaving as sched_ext_free() can handle both prepped and enabled 7210 * tasks. Prep all tasks first and then enable them with preemption 7211 * disabled. 7212 * 7213 * All cgroups should be initialized before scx_init_task() so that the 7214 * BPF scheduler can reliably track each task's cgroup membership from 7215 * scx_init_task(). 
Lock out cgroup on/offlining and task migrations 7216 * while tasks are being initialized so that scx_cgroup_can_attach() 7217 * never sees uninitialized tasks. 7218 */ 7219 scx_cgroup_lock(); 7220 set_cgroup_sched(sch_cgroup(sch), sch); 7221 ret = scx_cgroup_init(sch); 7222 if (ret) 7223 goto err_disable_unlock_all; 7224 7225 scx_task_iter_start(&sti, NULL); 7226 while ((p = scx_task_iter_next_locked(&sti))) { 7227 /* 7228 * @p is in scx_tasks under scx_tasks_lock, and SCX_TASK_DEAD 7229 * tasks are filtered by scx_task_iter_next_locked(). 7230 * sched_ext_dead() removes @p from scx_tasks under the same 7231 * lock before put_task_struct_rcu_user() runs, so @p->usage 7232 * is guaranteed > 0 here. 7233 */ 7234 get_task_struct(p); 7235 7236 /* 7237 * Set %INIT_BEGIN under the iter's rq lock so that a concurrent 7238 * sched_ext_dead() does not call ops.exit_task() on @p while 7239 * ops.init_task() is running. If sched_ext_dead() runs before 7240 * this store, it has already removed @p from scx_tasks and the 7241 * iter won't visit @p; if it runs after, it observes 7242 * %INIT_BEGIN and transitions to %DEAD without calling ops, 7243 * leaving the post-init recheck below to unwind. 7244 */ 7245 scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 7246 scx_task_iter_unlock(&sti); 7247 7248 ret = __scx_init_task(sch, p, false); 7249 7250 scx_task_iter_relock(&sti, p); 7251 7252 if (unlikely(ret)) { 7253 if (scx_get_task_state(p) != SCX_TASK_DEAD) 7254 scx_set_task_state(p, SCX_TASK_NONE); 7255 scx_task_iter_stop(&sti); 7256 scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", 7257 ret, p->comm, p->pid); 7258 put_task_struct(p); 7259 goto err_disable_unlock_all; 7260 } 7261 7262 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 7263 /* 7264 * sched_ext_dead() observed %INIT_BEGIN and set %DEAD. 7265 * ops.exit_task() is owed to the sched __scx_init_task() 7266 * ran against; call it now. 
7267 */ 7268 scx_sub_init_cancel_task(sch, p); 7269 } else { 7270 scx_set_task_state(p, SCX_TASK_INIT); 7271 scx_set_task_sched(p, sch); 7272 scx_set_task_state(p, SCX_TASK_READY); 7273 } 7274 7275 /* 7276 * Insert into the tid hash. scx_tasks_lock is held by the iter; 7277 * list_empty() guards against sched_ext_dead() having taken @p 7278 * off the list while init ran unlocked. 7279 */ 7280 if (scx_tid_to_task_enabled() && !list_empty(&p->scx.tasks_node)) 7281 scx_tid_hash_insert(p); 7282 7283 put_task_struct(p); 7284 } 7285 scx_task_iter_stop(&sti); 7286 scx_cgroup_unlock(); 7287 percpu_up_write(&scx_fork_rwsem); 7288 7289 /* 7290 * All tasks are READY. It's safe to turn on scx_enabled() and switch 7291 * all eligible tasks. 7292 */ 7293 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 7294 static_branch_enable(&__scx_enabled); 7295 7296 /* 7297 * We're fully committed and can't fail. The task READY -> ENABLED 7298 * transitions here are synchronized against sched_ext_free() through 7299 * scx_tasks_lock. 
7300 */ 7301 percpu_down_write(&scx_fork_rwsem); 7302 scx_task_iter_start(&sti, NULL); 7303 while ((p = scx_task_iter_next_locked(&sti))) { 7304 unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 7305 const struct sched_class *old_class = p->sched_class; 7306 const struct sched_class *new_class = scx_setscheduler_class(p); 7307 7308 if (scx_get_task_state(p) != SCX_TASK_READY) 7309 continue; 7310 7311 if (old_class != new_class) 7312 queue_flags |= DEQUEUE_CLASS; 7313 7314 scoped_guard (sched_change, p, queue_flags) { 7315 p->scx.slice = READ_ONCE(sch->slice_dfl); 7316 p->sched_class = new_class; 7317 } 7318 } 7319 scx_task_iter_stop(&sti); 7320 percpu_up_write(&scx_fork_rwsem); 7321 7322 scx_bypass(sch, false); 7323 7324 if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { 7325 WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); 7326 goto err_disable; 7327 } 7328 7329 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) 7330 static_branch_enable(&__scx_switched_all); 7331 7332 /* 7333 * Detach the fair_server bandwidth reservation now that the switch 7334 * is fully committed. In full mode (!SCX_OPS_SWITCH_PARTIAL) no 7335 * task will ever run in the fair class, so give that bandwidth 7336 * back to the RT class. The matching ext_server attach already 7337 * happened earlier; this only releases bandwidth and cannot fail. 7338 * 7339 * In partial mode keep fair_server attached. 7340 */ 7341 if (scx_switched_all()) { 7342 for_each_possible_cpu(cpu) { 7343 struct rq *rq = cpu_rq(cpu); 7344 7345 guard(rq_lock_irqsave)(rq); 7346 update_rq_clock(rq); 7347 dl_server_detach_bw(&rq->fair_server); 7348 } 7349 } 7350 7351 pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", 7352 sch->ops.name, scx_switched_all() ? 
"" : " (partial)"); 7353 kobject_uevent(&sch->kobj, KOBJ_ADD); 7354 mutex_unlock(&scx_enable_mutex); 7355 7356 atomic_long_inc(&scx_enable_seq); 7357 7358 cmd->ret = 0; 7359 return; 7360 7361 err_free_tid_hash: 7362 if (ops->flags & SCX_OPS_TID_TO_TASK) 7363 rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL); 7364 err_free_ksyncs: 7365 free_kick_syncs(); 7366 err_unlock: 7367 mutex_unlock(&scx_enable_mutex); 7368 cmd->ret = ret; 7369 return; 7370 7371 err_disable_unlock_all: 7372 scx_cgroup_unlock(); 7373 percpu_up_write(&scx_fork_rwsem); 7374 /* we'll soon enter disable path, keep bypass on */ 7375 err_disable: 7376 mutex_unlock(&scx_enable_mutex); 7377 /* 7378 * Returning an error code here would not pass all the error information 7379 * to userspace. Record errno using scx_error() for cases scx_error() 7380 * wasn't already invoked and exit indicating success so that the error 7381 * is notified through ops.exit() with all the details. 7382 * 7383 * Flush scx_disable_work to ensure that error is reported before init 7384 * completion. sch's base reference will be put by bpf_scx_unreg(). 7385 */ 7386 scx_error(sch, "scx_root_enable() failed (%d)", ret); 7387 scx_flush_disable_work(sch); 7388 cmd->ret = 0; 7389 } 7390 7391 #ifdef CONFIG_EXT_SUB_SCHED 7392 /* verify that a scheduler can be attached to @cgrp and return the parent */ 7393 static struct scx_sched *find_parent_sched(struct cgroup *cgrp) 7394 { 7395 struct scx_sched *parent = cgrp->scx_sched; 7396 struct scx_sched *pos; 7397 7398 lockdep_assert_held(&scx_sched_lock); 7399 7400 /* can't attach twice to the same cgroup */ 7401 if (parent->cgrp == cgrp) 7402 return ERR_PTR(-EBUSY); 7403 7404 /* does $parent allow sub-scheds? 
*/ 7405 if (!parent->ops.sub_attach) 7406 return ERR_PTR(-EOPNOTSUPP); 7407 7408 /* can't insert between $parent and its exiting children */ 7409 list_for_each_entry(pos, &parent->children, sibling) 7410 if (cgroup_is_descendant(pos->cgrp, cgrp)) 7411 return ERR_PTR(-EBUSY); 7412 7413 return parent; 7414 } 7415 7416 static bool assert_task_ready_or_enabled(struct task_struct *p) 7417 { 7418 u32 state = scx_get_task_state(p); 7419 7420 switch (state) { 7421 case SCX_TASK_READY: 7422 case SCX_TASK_ENABLED: 7423 return true; 7424 default: 7425 WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched", 7426 state, p->comm, p->pid); 7427 return false; 7428 } 7429 } 7430 7431 static void scx_sub_enable_workfn(struct kthread_work *work) 7432 { 7433 struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); 7434 struct sched_ext_ops *ops = cmd->ops; 7435 struct cgroup *cgrp; 7436 struct scx_sched *parent, *sch; 7437 struct scx_task_iter sti; 7438 struct task_struct *p; 7439 s32 i, ret; 7440 7441 mutex_lock(&scx_enable_mutex); 7442 7443 if (!scx_enabled()) { 7444 ret = -ENODEV; 7445 goto out_unlock; 7446 } 7447 7448 /* See scx_root_enable_workfn() for the @ops->priv check. 
*/ 7449 if (rcu_access_pointer(ops->priv)) { 7450 ret = -EBUSY; 7451 goto out_unlock; 7452 } 7453 7454 cgrp = cgroup_get_from_id(ops->sub_cgroup_id); 7455 if (IS_ERR(cgrp)) { 7456 ret = PTR_ERR(cgrp); 7457 goto out_unlock; 7458 } 7459 7460 raw_spin_lock_irq(&scx_sched_lock); 7461 parent = find_parent_sched(cgrp); 7462 if (IS_ERR(parent)) { 7463 raw_spin_unlock_irq(&scx_sched_lock); 7464 ret = PTR_ERR(parent); 7465 goto out_put_cgrp; 7466 } 7467 kobject_get(&parent->kobj); 7468 raw_spin_unlock_irq(&scx_sched_lock); 7469 7470 /* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */ 7471 sch = scx_alloc_and_add_sched(cmd, cgrp, parent); 7472 kobject_put(&parent->kobj); 7473 if (IS_ERR(sch)) { 7474 ret = PTR_ERR(sch); 7475 goto out_unlock; 7476 } 7477 7478 ret = scx_link_sched(sch); 7479 if (ret) 7480 goto err_disable; 7481 7482 if (sch->level >= SCX_SUB_MAX_DEPTH) { 7483 scx_error(sch, "max nesting depth %d violated", 7484 SCX_SUB_MAX_DEPTH); 7485 goto err_disable; 7486 } 7487 7488 if (sch->ops.init) { 7489 ret = SCX_CALL_OP_RET(sch, init, NULL); 7490 if (ret) { 7491 ret = ops_sanitize_err(sch, "init", ret); 7492 scx_error(sch, "ops.init() failed (%d)", ret); 7493 goto err_disable; 7494 } 7495 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 7496 } 7497 7498 ret = scx_arena_pool_init(sch); 7499 if (ret) 7500 goto err_disable; 7501 7502 ret = scx_set_cmask_scratch_alloc(sch); 7503 if (ret) 7504 goto err_disable; 7505 7506 if (validate_ops(sch, ops)) 7507 goto err_disable; 7508 7509 struct scx_sub_attach_args sub_attach_args = { 7510 .ops = &sch->ops, 7511 .cgroup_path = sch->cgrp_path, 7512 }; 7513 7514 ret = SCX_CALL_OP_RET(parent, sub_attach, NULL, 7515 &sub_attach_args); 7516 if (ret) { 7517 ret = ops_sanitize_err(sch, "sub_attach", ret); 7518 scx_error(sch, "parent rejected (%d)", ret); 7519 goto err_disable; 7520 } 7521 sch->sub_attached = true; 7522 7523 scx_bypass(sch, true); 7524 7525 for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) 7526 if (((void 
(**)(void))ops)[i]) 7527 set_bit(i, sch->has_op); 7528 7529 percpu_down_write(&scx_fork_rwsem); 7530 scx_cgroup_lock(); 7531 7532 /* 7533 * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see 7534 * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down. 7535 */ 7536 set_cgroup_sched(sch_cgroup(sch), sch); 7537 if (!(cgrp->self.flags & CSS_ONLINE)) { 7538 scx_error(sch, "cgroup is not online"); 7539 goto err_unlock_and_disable; 7540 } 7541 7542 /* 7543 * Initialize tasks for the new child $sch without exiting them for 7544 * $parent so that the tasks can always be reverted back to $parent 7545 * sched on child init failure. 7546 */ 7547 WARN_ON_ONCE(scx_enabling_sub_sched); 7548 scx_enabling_sub_sched = sch; 7549 7550 scx_task_iter_start(&sti, sch->cgrp); 7551 while ((p = scx_task_iter_next_locked(&sti))) { 7552 struct rq *rq; 7553 struct rq_flags rf; 7554 7555 /* 7556 * Task iteration may visit the same task twice when racing 7557 * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which 7558 * finished __scx_init_task() and skip if set. 7559 * 7560 * A task may exit and get freed between __scx_init_task() 7561 * completion and scx_enable_task(). In such cases, 7562 * scx_disable_and_exit_task() must exit the task for both the 7563 * parent and child scheds. 7564 */ 7565 if (p->scx.flags & SCX_TASK_SUB_INIT) 7566 continue; 7567 7568 /* @p is pinned by the iter; see scx_sub_disable() */ 7569 get_task_struct(p); 7570 7571 if (!assert_task_ready_or_enabled(p)) { 7572 ret = -EINVAL; 7573 goto abort; 7574 } 7575 7576 scx_task_iter_unlock(&sti); 7577 7578 /* 7579 * As $p is still on $parent, it can't be transitioned to INIT. 7580 * Let's worry about task state later. Use __scx_init_task(). 
7581 */ 7582 ret = __scx_init_task(sch, p, false); 7583 if (ret) 7584 goto abort; 7585 7586 rq = task_rq_lock(p, &rf); 7587 7588 if (scx_get_task_state(p) == SCX_TASK_DEAD) { 7589 /* 7590 * sched_ext_dead() raced us between __scx_init_task() 7591 * and this rq lock and ran exit_task() on $parent (the 7592 * sched @p was on at that point), not on @sch. @sch's 7593 * just-completed init is owed an exit_task() and we 7594 * issue it here. 7595 */ 7596 scx_sub_init_cancel_task(sch, p); 7597 task_rq_unlock(rq, p, &rf); 7598 put_task_struct(p); 7599 continue; 7600 } 7601 7602 p->scx.flags |= SCX_TASK_SUB_INIT; 7603 task_rq_unlock(rq, p, &rf); 7604 7605 put_task_struct(p); 7606 } 7607 scx_task_iter_stop(&sti); 7608 7609 /* 7610 * All tasks are prepped. Disable/exit tasks for $parent and enable for 7611 * the new @sch. 7612 */ 7613 scx_task_iter_start(&sti, sch->cgrp); 7614 while ((p = scx_task_iter_next_locked(&sti))) { 7615 /* 7616 * Use clearing of %SCX_TASK_SUB_INIT to detect and skip 7617 * duplicate iterations. 7618 */ 7619 if (!(p->scx.flags & SCX_TASK_SUB_INIT)) 7620 continue; 7621 7622 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 7623 /* 7624 * $p must be either READY or ENABLED. If ENABLED, 7625 * __scx_disabled_and_exit_task() first disables and 7626 * makes it READY. However, after exiting $p, it will 7627 * leave $p as READY. 7628 */ 7629 assert_task_ready_or_enabled(p); 7630 __scx_disable_and_exit_task(parent, p); 7631 7632 /* 7633 * $p is now only initialized for @sch and READY, which 7634 * is what we want. Assign it to @sch and enable. 
7635 */ 7636 scx_set_task_sched(p, sch); 7637 scx_enable_task(sch, p); 7638 7639 p->scx.flags &= ~SCX_TASK_SUB_INIT; 7640 } 7641 } 7642 scx_task_iter_stop(&sti); 7643 7644 scx_enabling_sub_sched = NULL; 7645 7646 scx_cgroup_unlock(); 7647 percpu_up_write(&scx_fork_rwsem); 7648 7649 scx_bypass(sch, false); 7650 7651 pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); 7652 kobject_uevent(&sch->kobj, KOBJ_ADD); 7653 ret = 0; 7654 goto out_unlock; 7655 7656 out_put_cgrp: 7657 cgroup_put(cgrp); 7658 out_unlock: 7659 mutex_unlock(&scx_enable_mutex); 7660 cmd->ret = ret; 7661 return; 7662 7663 abort: 7664 put_task_struct(p); 7665 scx_task_iter_stop(&sti); 7666 7667 /* 7668 * Undo __scx_init_task() for tasks we marked. scx_enable_task() never 7669 * ran for @sch on them, so calling scx_disable_task() here would invoke 7670 * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched 7671 * must stay set until SUB_INIT is cleared from every marked task - 7672 * scx_disable_and_exit_task() reads it when a task exits concurrently. 
7673 */ 7674 scx_task_iter_start(&sti, sch->cgrp); 7675 while ((p = scx_task_iter_next_locked(&sti))) { 7676 if (p->scx.flags & SCX_TASK_SUB_INIT) { 7677 scx_sub_init_cancel_task(sch, p); 7678 p->scx.flags &= ~SCX_TASK_SUB_INIT; 7679 } 7680 } 7681 scx_task_iter_stop(&sti); 7682 scx_enabling_sub_sched = NULL; 7683 err_unlock_and_disable: 7684 /* we'll soon enter disable path, keep bypass on */ 7685 scx_cgroup_unlock(); 7686 percpu_up_write(&scx_fork_rwsem); 7687 err_disable: 7688 mutex_unlock(&scx_enable_mutex); 7689 scx_flush_disable_work(sch); 7690 cmd->ret = 0; 7691 } 7692 7693 static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb, 7694 unsigned long action, void *data) 7695 { 7696 struct cgroup *cgrp = data; 7697 struct cgroup *parent = cgroup_parent(cgrp); 7698 7699 if (!cgroup_on_dfl(cgrp)) 7700 return NOTIFY_OK; 7701 7702 switch (action) { 7703 case CGROUP_LIFETIME_ONLINE: 7704 /* inherit ->scx_sched from $parent */ 7705 if (parent) 7706 rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched); 7707 break; 7708 case CGROUP_LIFETIME_OFFLINE: 7709 /* if there is a sched attached, shoot it down */ 7710 if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp) 7711 scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN, 7712 SCX_ECODE_RSN_CGROUP_OFFLINE, 7713 "cgroup %llu going offline", cgroup_id(cgrp)); 7714 break; 7715 } 7716 7717 return NOTIFY_OK; 7718 } 7719 7720 static struct notifier_block scx_cgroup_lifetime_nb = { 7721 .notifier_call = scx_cgroup_lifetime_notify, 7722 }; 7723 7724 static s32 __init scx_cgroup_lifetime_notifier_init(void) 7725 { 7726 return blocking_notifier_chain_register(&cgroup_lifetime_notifier, 7727 &scx_cgroup_lifetime_nb); 7728 } 7729 core_initcall(scx_cgroup_lifetime_notifier_init); 7730 #endif /* CONFIG_EXT_SUB_SCHED */ 7731 7732 static s32 scx_enable(struct scx_enable_cmd *cmd, struct bpf_link *link) 7733 { 7734 static struct kthread_worker *helper; 7735 static DEFINE_MUTEX(helper_mutex); 7736 7737 if 
(housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) { 7738 pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); 7739 return -EINVAL; 7740 } 7741 7742 if (!READ_ONCE(helper)) { 7743 mutex_lock(&helper_mutex); 7744 if (!helper) { 7745 struct kthread_worker *w = 7746 kthread_run_worker(0, "scx_enable_helper"); 7747 if (IS_ERR_OR_NULL(w)) { 7748 mutex_unlock(&helper_mutex); 7749 return -ENOMEM; 7750 } 7751 sched_set_fifo(w->task); 7752 WRITE_ONCE(helper, w); 7753 } 7754 mutex_unlock(&helper_mutex); 7755 } 7756 7757 #ifdef CONFIG_EXT_SUB_SCHED 7758 if (cmd->ops->sub_cgroup_id > 1) 7759 kthread_init_work(&cmd->work, scx_sub_enable_workfn); 7760 else 7761 #endif /* CONFIG_EXT_SUB_SCHED */ 7762 kthread_init_work(&cmd->work, scx_root_enable_workfn); 7763 7764 kthread_queue_work(READ_ONCE(helper), &cmd->work); 7765 kthread_flush_work(&cmd->work); 7766 return cmd->ret; 7767 } 7768 7769 7770 /******************************************************************************** 7771 * bpf_struct_ops plumbing. 7772 */ 7773 #include <linux/bpf_verifier.h> 7774 #include <linux/bpf.h> 7775 #include <linux/btf.h> 7776 7777 static const struct btf_type *task_struct_type; 7778 7779 static bool bpf_scx_is_valid_access(int off, int size, 7780 enum bpf_access_type type, 7781 const struct bpf_prog *prog, 7782 struct bpf_insn_access_aux *info) 7783 { 7784 if (type != BPF_READ) 7785 return false; 7786 if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) 7787 return false; 7788 if (off % size != 0) 7789 return false; 7790 7791 return btf_ctx_access(off, size, type, prog, info); 7792 } 7793 7794 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, 7795 const struct bpf_reg_state *reg, int off, 7796 int size) 7797 { 7798 const struct btf_type *t; 7799 7800 t = btf_type_by_id(reg->btf, reg->btf_id); 7801 if (t == task_struct_type) { 7802 /* 7803 * COMPAT: Will be removed in v6.23. 
7804 */ 7805 if ((off >= offsetof(struct task_struct, scx.slice) && 7806 off + size <= offsetofend(struct task_struct, scx.slice)) || 7807 (off >= offsetof(struct task_struct, scx.dsq_vtime) && 7808 off + size <= offsetofend(struct task_struct, scx.dsq_vtime))) { 7809 pr_warn("sched_ext: Writing directly to p->scx.slice/dsq_vtime is deprecated, use scx_bpf_task_set_slice/dsq_vtime()"); 7810 return SCALAR_VALUE; 7811 } 7812 7813 if (off >= offsetof(struct task_struct, scx.disallow) && 7814 off + size <= offsetofend(struct task_struct, scx.disallow)) 7815 return SCALAR_VALUE; 7816 } 7817 7818 return -EACCES; 7819 } 7820 7821 static const struct bpf_verifier_ops bpf_scx_verifier_ops = { 7822 .get_func_proto = bpf_base_func_proto, 7823 .is_valid_access = bpf_scx_is_valid_access, 7824 .btf_struct_access = bpf_scx_btf_struct_access, 7825 }; 7826 7827 static int bpf_scx_init_member(const struct btf_type *t, 7828 const struct btf_member *member, 7829 void *kdata, const void *udata) 7830 { 7831 const struct sched_ext_ops *uops = udata; 7832 struct sched_ext_ops *ops = kdata; 7833 u32 moff = __btf_member_bit_offset(t, member) / 8; 7834 int ret; 7835 7836 switch (moff) { 7837 case offsetof(struct sched_ext_ops, dispatch_max_batch): 7838 if (*(u32 *)(udata + moff) > INT_MAX) 7839 return -E2BIG; 7840 ops->dispatch_max_batch = *(u32 *)(udata + moff); 7841 return 1; 7842 case offsetof(struct sched_ext_ops, flags): 7843 if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) 7844 return -EINVAL; 7845 ops->flags = *(u64 *)(udata + moff); 7846 return 1; 7847 case offsetof(struct sched_ext_ops, name): 7848 ret = bpf_obj_name_cpy(ops->name, uops->name, 7849 sizeof(ops->name)); 7850 if (ret < 0) 7851 return ret; 7852 if (ret == 0) 7853 return -EINVAL; 7854 return 1; 7855 case offsetof(struct sched_ext_ops, timeout_ms): 7856 if (msecs_to_jiffies(*(u32 *)(udata + moff)) > 7857 SCX_WATCHDOG_MAX_TIMEOUT) 7858 return -E2BIG; 7859 ops->timeout_ms = *(u32 *)(udata + moff); 7860 return 1; 7861 case 
offsetof(struct sched_ext_ops, exit_dump_len): 7862 ops->exit_dump_len = 7863 *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; 7864 return 1; 7865 case offsetof(struct sched_ext_ops, hotplug_seq): 7866 ops->hotplug_seq = *(u64 *)(udata + moff); 7867 return 1; 7868 #ifdef CONFIG_EXT_SUB_SCHED 7869 case offsetof(struct sched_ext_ops, sub_cgroup_id): 7870 ops->sub_cgroup_id = *(u64 *)(udata + moff); 7871 return 1; 7872 #endif /* CONFIG_EXT_SUB_SCHED */ 7873 } 7874 7875 return 0; 7876 } 7877 7878 #ifdef CONFIG_EXT_SUB_SCHED 7879 static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog) 7880 { 7881 struct scx_sched *sch; 7882 7883 guard(rcu)(); 7884 sch = scx_prog_sched(prog->aux); 7885 if (unlikely(!sch)) 7886 return; 7887 7888 scx_error(sch, "dispatch recursion detected"); 7889 } 7890 #endif /* CONFIG_EXT_SUB_SCHED */ 7891 7892 static int bpf_scx_check_member(const struct btf_type *t, 7893 const struct btf_member *member, 7894 const struct bpf_prog *prog) 7895 { 7896 u32 moff = __btf_member_bit_offset(t, member) / 8; 7897 7898 switch (moff) { 7899 case offsetof(struct sched_ext_ops, init_task): 7900 #ifdef CONFIG_EXT_GROUP_SCHED 7901 case offsetof(struct sched_ext_ops, cgroup_init): 7902 case offsetof(struct sched_ext_ops, cgroup_exit): 7903 case offsetof(struct sched_ext_ops, cgroup_prep_move): 7904 #endif 7905 case offsetof(struct sched_ext_ops, cpu_online): 7906 case offsetof(struct sched_ext_ops, cpu_offline): 7907 case offsetof(struct sched_ext_ops, init): 7908 case offsetof(struct sched_ext_ops, exit): 7909 case offsetof(struct sched_ext_ops, sub_attach): 7910 case offsetof(struct sched_ext_ops, sub_detach): 7911 break; 7912 default: 7913 if (prog->sleepable) 7914 return -EINVAL; 7915 } 7916 7917 #ifdef CONFIG_EXT_SUB_SCHED 7918 /* 7919 * Enable private stack for operations that can nest along the 7920 * hierarchy. 
7921 * 7922 * XXX - Ideally, we should only do this for scheds that allow 7923 * sub-scheds and sub-scheds themselves but I don't know how to access 7924 * struct_ops from here. 7925 */ 7926 switch (moff) { 7927 case offsetof(struct sched_ext_ops, dispatch): 7928 prog->aux->priv_stack_requested = true; 7929 prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch; 7930 } 7931 #endif /* CONFIG_EXT_SUB_SCHED */ 7932 7933 return 0; 7934 } 7935 7936 static int bpf_scx_reg(void *kdata, struct bpf_link *link) 7937 { 7938 struct scx_enable_cmd cmd = { .ops = kdata }; 7939 7940 return scx_enable(&cmd, link); 7941 } 7942 7943 struct scx_arena_scan { 7944 struct bpf_map *arena; 7945 int err; 7946 }; 7947 7948 /* 7949 * The verifier enforces one arena per BPF program, so each struct_ops 7950 * member prog contributes at most one arena via bpf_prog_arena(). 7951 * Require all non-NULL contributions to match. 7952 */ 7953 static int scx_arena_scan_prog(struct bpf_prog *prog, void *data) 7954 { 7955 struct scx_arena_scan *s = data; 7956 struct bpf_map *arena = NULL; 7957 7958 /* arena.o, which defines these, is built only on MMU && 64BIT */ 7959 #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) 7960 arena = bpf_prog_arena(prog); 7961 #endif 7962 if (!arena) 7963 return 0; 7964 if (s->arena && s->arena != arena) { 7965 s->err = -EINVAL; 7966 return 1; 7967 } 7968 s->arena = arena; 7969 return 0; 7970 } 7971 7972 static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link) 7973 { 7974 struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true }; 7975 struct scx_arena_scan scan = {}; 7976 int ret; 7977 7978 bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan); 7979 if (scan.err) { 7980 pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n"); 7981 return scan.err; 7982 } 7983 if (!scan.arena) { 7984 pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n"); 7985 return -EINVAL; 7986 } 7987 7988 bpf_map_inc(scan.arena); 7989 
/*
 * struct_ops ->unreg() callback shared by the cpu-form and cid-form ops.
 *
 * @kdata: the kernel-side sched_ext_ops being unregistered
 * @link: the BPF link being torn down (unused)
 *
 * Looks up the scheduler instance stored in @ops->priv, requests disabling
 * with %SCX_EXIT_UNREG and flushes the disable work before touching anything
 * else. Only then is @ops->priv cleared and the reference on the scheduler's
 * kobject dropped.
 */
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{
	struct sched_ext_ops *ops = kdata;
	/* NOTE(review): the "true" condition asserts no concurrent updater of ->priv here - confirm */
	struct scx_sched *sch = rcu_dereference_protected(ops->priv, true);

	scx_disable(sch, SCX_EXIT_UNREG);
	scx_flush_disable_work(sch);
	/* @ops no longer has a scheduler instance associated with it */
	RCU_INIT_POINTER(ops->priv, NULL);
	/* must be last: @sch may go away once this reference is dropped */
	kobject_put(&sch->kobj);
}
/*
 * Stub implementations, one per sched_ext_ops callback. They do no scheduling
 * work: void stubs are empty and the others return a fixed value (-EINVAL or
 * false). Their only use in this file is to populate the ops tables that are
 * handed to bpf_struct_ops through ->cfi_stubs, so each stub's prototype has
 * to match the corresponding op's prototype.
 */
static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {}
static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {}
static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {}
static void sched_ext_ops__tick(struct task_struct *p) {}
static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {}
static void sched_ext_ops__running(struct task_struct *p) {}
static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {}
static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {}
static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; }
static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; }
static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {}
static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {}
static void sched_ext_ops__update_idle(s32 cpu, bool idle) {}
static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {}
static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {}
static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {}
static void sched_ext_ops__enable(struct task_struct *p) {}
static void sched_ext_ops__disable(struct task_struct *p) {}
/* cgroup ops stubs exist only when the cgroup ops themselves are built */
#ifdef CONFIG_EXT_GROUP_SCHED
static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {}
static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {}
#endif /* CONFIG_EXT_GROUP_SCHED */
static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; }
static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {}
static void sched_ext_ops__cpu_online(s32 cpu) {}
static void sched_ext_ops__cpu_offline(s32 cpu) {}
static s32 sched_ext_ops__init(void) { return -EINVAL; }
static void sched_ext_ops__exit(struct scx_exit_info *info) {}
static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {}
static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {}
/*
 * bpf_struct_ops descriptor for the cpu-form scheduler interface, exposed to
 * BPF under the name "sched_ext_ops". It wires the struct_ops lifecycle hooks
 * (member init/check, reg/unreg, update, validate) to the bpf_scx_*()
 * implementations in this file and points ->cfi_stubs at the stub table
 * __bpf_ops_sched_ext_ops.
 */
static struct bpf_struct_ops bpf_sched_ext_ops = {
	.verifier_ops = &bpf_scx_verifier_ops,
	.reg = bpf_scx_reg,
	.unreg = bpf_scx_unreg,
	.check_member = bpf_scx_check_member,
	.init_member = bpf_scx_init_member,
	.init = bpf_scx_init,
	/* updating a loaded scheduler in place is not supported, see bpf_scx_update() */
	.update = bpf_scx_update,
	.validate = bpf_scx_validate,
	.name = "sched_ext_ops",
	.owner = THIS_MODULE,
	.cfi_stubs = &__bpf_ops_sched_ext_ops
};
Stubs whose signatures match the cpu-form (param types 8129 * identical, only param names differ across structs) are reused; only 8130 * set_cmask needs a fresh stub since the second argument type differs. 8131 */ 8132 static void sched_ext_ops_cid__set_cmask(struct task_struct *p, 8133 const struct scx_cmask *cmask) {} 8134 8135 static struct sched_ext_ops_cid __bpf_ops_sched_ext_ops_cid = { 8136 .select_cid = sched_ext_ops__select_cpu, 8137 .enqueue = sched_ext_ops__enqueue, 8138 .dequeue = sched_ext_ops__dequeue, 8139 .dispatch = sched_ext_ops__dispatch, 8140 .tick = sched_ext_ops__tick, 8141 .runnable = sched_ext_ops__runnable, 8142 .running = sched_ext_ops__running, 8143 .stopping = sched_ext_ops__stopping, 8144 .quiescent = sched_ext_ops__quiescent, 8145 .yield = sched_ext_ops__yield, 8146 .core_sched_before = sched_ext_ops__core_sched_before, 8147 .set_weight = sched_ext_ops__set_weight, 8148 .set_cmask = sched_ext_ops_cid__set_cmask, 8149 .update_idle = sched_ext_ops__update_idle, 8150 .init_task = sched_ext_ops__init_task, 8151 .exit_task = sched_ext_ops__exit_task, 8152 .enable = sched_ext_ops__enable, 8153 .disable = sched_ext_ops__disable, 8154 #ifdef CONFIG_EXT_GROUP_SCHED 8155 .cgroup_init = sched_ext_ops__cgroup_init, 8156 .cgroup_exit = sched_ext_ops__cgroup_exit, 8157 .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, 8158 .cgroup_move = sched_ext_ops__cgroup_move, 8159 .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, 8160 .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, 8161 .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, 8162 .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, 8163 #endif 8164 .sub_attach = sched_ext_ops__sub_attach, 8165 .sub_detach = sched_ext_ops__sub_detach, 8166 .cid_online = sched_ext_ops__cpu_online, 8167 .cid_offline = sched_ext_ops__cpu_offline, 8168 .init = sched_ext_ops__init, 8169 .exit = sched_ext_ops__exit, 8170 .dump = sched_ext_ops__dump, 8171 .dump_cid = sched_ext_ops__dump_cpu, 8172 
.dump_task = sched_ext_ops__dump_task, 8173 }; 8174 8175 /* 8176 * The cid-form struct_ops shares all bpf_struct_ops hooks with the cpu form. 8177 * init_member, check_member, reg, unreg, etc. process kdata as the byte block 8178 * verified to match by the BUILD_BUG_ON checks in scx_init(). 8179 */ 8180 static struct bpf_struct_ops bpf_sched_ext_ops_cid = { 8181 .verifier_ops = &bpf_scx_verifier_ops, 8182 .reg = bpf_scx_reg_cid, 8183 .unreg = bpf_scx_unreg, 8184 .check_member = bpf_scx_check_member, 8185 .init_member = bpf_scx_init_member, 8186 .init = bpf_scx_init, 8187 .update = bpf_scx_update, 8188 .validate = bpf_scx_validate, 8189 .name = "sched_ext_ops_cid", 8190 .owner = THIS_MODULE, 8191 .cfi_stubs = &__bpf_ops_sched_ext_ops_cid 8192 }; 8193 8194 8195 /******************************************************************************** 8196 * System integration and init. 8197 */ 8198 8199 static void sysrq_handle_sched_ext_reset(u8 key) 8200 { 8201 struct scx_sched *sch; 8202 8203 sch = rcu_dereference(scx_root); 8204 if (likely(sch)) 8205 scx_disable(sch, SCX_EXIT_SYSRQ); 8206 else 8207 pr_info("sched_ext: BPF schedulers not loaded\n"); 8208 } 8209 8210 static const struct sysrq_key_op sysrq_sched_ext_reset_op = { 8211 .handler = sysrq_handle_sched_ext_reset, 8212 .help_msg = "reset-sched-ext(S)", 8213 .action_msg = "Disable sched_ext and revert all tasks to CFS", 8214 .enable_mask = SYSRQ_ENABLE_RTNICE, 8215 }; 8216 8217 static void sysrq_handle_sched_ext_dump(u8 key) 8218 { 8219 struct scx_exit_info ei = { 8220 .kind = SCX_EXIT_NONE, 8221 .exit_cpu = -1, 8222 .reason = "SysRq-D", 8223 }; 8224 struct scx_sched *sch; 8225 8226 list_for_each_entry_rcu(sch, &scx_sched_all, all) 8227 scx_dump_state(sch, &ei, 0, false); 8228 } 8229 8230 static const struct sysrq_key_op sysrq_sched_ext_dump_op = { 8231 .handler = sysrq_handle_sched_ext_dump, 8232 .help_msg = "dump-sched-ext(D)", 8233 .action_msg = "Trigger sched_ext debug dump", 8234 .enable_mask = 
SYSRQ_ENABLE_RTNICE,
};

/*
 * Return %true if @rq doesn't need to be kicked for being idle. The caller
 * must hold @rq's lock.
 */
static bool can_skip_idle_kick(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	/*
	 * We can skip idle kicking if @rq is going to go through at least one
	 * full SCX scheduling cycle before going idle. Just checking whether
	 * curr is not idle is insufficient because we could be racing
	 * balance_one() trying to pull the next task from a remote rq, which
	 * may fail, and @rq may become idle afterwards.
	 *
	 * The race window is small and we don't and can't guarantee that @rq is
	 * only kicked while idle anyway. Skip only when sure.
	 */
	return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE);
}

/*
 * Process @this_rq's pending preempt/wait requests against @cpu while holding
 * @cpu's rq lock, and reschedule @cpu if it is kickable. @cpu's bits in
 * cpus_to_preempt and cpus_to_wait are cleared in every case.
 *
 * Returns %true if a wait was requested and @cpu is currently running an SCX
 * task. In that case @cpu's kick_sync value has been saved in @ksyncs[@cpu]
 * and @cpu has been set in cpus_to_sync.
 */
static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
{
	struct rq *rq = cpu_rq(cpu);
	struct scx_rq *this_scx = &this_rq->scx;
	const struct sched_class *cur_class;
	bool should_wait = false;
	unsigned long flags;

	raw_spin_rq_lock_irqsave(rq, flags);
	cur_class = rq->curr->sched_class;

	/*
	 * During CPU hotplug, a CPU may depend on kicking itself to make
	 * forward progress. Allow kicking self regardless of online state. If
	 * @cpu is running a higher class task, we have no control over @cpu.
	 * Skip kicking.
	 */
	if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) &&
	    !sched_class_above(cur_class, &ext_sched_class)) {
		if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
			/* zero the slice only if an SCX task is running */
			if (cur_class == &ext_sched_class)
				rq->curr->scx.slice = 0;
			cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
		}

		if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
			if (cur_class == &ext_sched_class) {
				cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
				ksyncs[cpu] = rq->scx.kick_sync;
				should_wait = true;
			}
			cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
		}

		resched_curr(rq);
	} else {
		cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
	}

	raw_spin_rq_unlock_irqrestore(rq, flags);

	return should_wait;
}

/*
 * Reschedule @cpu unless can_skip_idle_kick() says the kick is unnecessary.
 * Offline CPUs are skipped unless @cpu is @this_rq's own CPU.
 */
static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_rq_lock_irqsave(rq, flags);

	if (!can_skip_idle_kick(rq) &&
	    (cpu_online(cpu) || cpu == cpu_of(this_rq)))
		resched_curr(rq);

	raw_spin_rq_unlock_irqrestore(rq, flags);
}

/*
 * irq_work handler which drains this CPU's cpus_to_kick and
 * cpus_to_kick_if_idle masks. A CPU in cpus_to_kick is handled by
 * kick_one_cpu() and also dropped from cpus_to_kick_if_idle, so it is not
 * kicked twice.
 */
static void kick_cpus_irq_workfn(struct irq_work *irq_work)
{
	struct rq *this_rq = this_rq();
	struct scx_rq *this_scx = &this_rq->scx;
	struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs);
	bool should_wait = false;
	unsigned long *ksyncs;
	s32 cpu;

	/* can race with free_kick_syncs() during scheduler disable */
	if (unlikely(!ksyncs_pcpu))
		return;

	ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;

	for_each_cpu(cpu, this_scx->cpus_to_kick) {
		should_wait |= kick_one_cpu(cpu, this_rq, ksyncs);
		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
	}

	for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) {
		kick_one_cpu_if_idle(cpu, this_rq);
		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
	}

	/*
	 * Can't wait in hardirq — kick_sync can't advance, deadlocking if
	 * CPUs wait for each other. Defer to kick_sync_wait_bal_cb().
	 */
	if (should_wait) {
		raw_spin_rq_lock(this_rq);
		this_scx->kick_sync_pending = true;
		resched_curr(this_rq);
		raw_spin_rq_unlock(this_rq);
	}
}

/**
 * print_scx_info - print out sched_ext scheduler state
 * @log_lvl: the log level to use when printing
 * @p: target task
 *
 * If a sched_ext scheduler is enabled, print the name and state of the
 * scheduler. If @p is on sched_ext, print further information about the task.
 * Nothing is printed if scx_task_sched_rcu() finds no scheduler for @p.
 *
 * This function can be safely called on any task as long as the task_struct
 * itself is accessible. While safe, this function isn't synchronized and may
 * print out mixups or garbages of limited length.
 */
void print_scx_info(const char *log_lvl, struct task_struct *p)
{
	struct scx_sched *sch;
	enum scx_enable_state state = scx_enable_state();
	const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
	char runnable_at_buf[22] = "?";
	struct sched_class *class;
	unsigned long runnable_at;

	guard(rcu)();

	sch = scx_task_sched_rcu(p);

	if (!sch)
		return;

	/*
	 * Carefully check if the task was running on sched_ext, and then
	 * carefully copy the time it's been runnable, and its state.
	 */
	if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
	    class != &ext_sched_class) {
		printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name,
		       scx_enable_state_str[state], all);
		return;
	}

	/* if the copy faults, runnable_at_buf keeps its "?" placeholder */
	if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at,
				      sizeof(runnable_at)))
		scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms",
			  jiffies_delta_msecs(runnable_at, jiffies));

	/* print everything onto one line to conserve console space */
	printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
	       log_lvl, sch->ops.name, scx_enable_state_str[state], all,
	       runnable_at_buf);
}

/*
 * PM notifier callback. Turn bypass on for the root scheduler when a
 * hibernation/suspend/restore operation is being prepared and off again once
 * it is over. Always returns %NOTIFY_OK.
 */
static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr)
{
	struct scx_sched *sch;

	guard(rcu)();

	sch = rcu_dereference(scx_root);
	if (!sch)
		return NOTIFY_OK;

	/*
	 * SCX schedulers often have userspace components which are sometimes
	 * involved in critical scheduling paths. PM operations involve freezing
	 * userspace which can lead to scheduling misbehaviors including stalls.
	 * Let's bypass while PM operations are in progress.
	 */
	switch (event) {
	case PM_HIBERNATION_PREPARE:
	case PM_SUSPEND_PREPARE:
	case PM_RESTORE_PREPARE:
		scx_bypass(sch, true);
		break;
	case PM_POST_HIBERNATION:
	case PM_POST_SUSPEND:
	case PM_POST_RESTORE:
		scx_bypass(sch, false);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block scx_pm_notifier = {
	.notifier_call = scx_pm_handler,
};

/*
 * Boot-time initialization: set up the per-rq SCX state of every possible CPU,
 * register the 'S' and 'D' SysRq keys and initialize the watchdog work. Any
 * allocation or init failure is fatal (BUG_ON).
 */
void __init init_sched_ext_class(void)
{
	s32 cpu, v;

	/*
	 * The following is to prevent the compiler from optimizing out the enum
	 * definitions so that BPF scheduler implementations can use them
	 * through the generated vmlinux.h.
	 */
	WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
		   SCX_TG_ONLINE);

	scx_idle_init_masks();

	for_each_possible_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		int  n = cpu_to_node(cpu);

		/* local_dsq's sch will be set during scx_root_enable() */
		BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL));

		INIT_LIST_HEAD(&rq->scx.runnable_list);
		INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);

		/* kick request masks, allocated zeroed on @cpu's node */
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
		raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
		INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
		INIT_LIST_HEAD(&rq->scx.deferred_reenq_users);
		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);

		if (cpu_online(cpu))
			cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
	}

	register_sysrq_key('S', &sysrq_sched_ext_reset_op);
	register_sysrq_key('D', &sysrq_sched_ext_dump_op);
	INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);

#ifdef CONFIG_EXT_SUB_SCHED
	BUG_ON(rhashtable_init(&scx_sched_hash, &scx_sched_hash_params));
#endif /* CONFIG_EXT_SUB_SCHED */
}


/********************************************************************************
 * Helpers that can be called from the BPF scheduler.
8488 */ 8489 static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 *enq_flags) 8490 { 8491 bool is_local = dsq_id == SCX_DSQ_LOCAL || 8492 (dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON; 8493 8494 if (*enq_flags & SCX_ENQ_IMMED) { 8495 if (unlikely(!is_local)) { 8496 scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id); 8497 return false; 8498 } 8499 } else if ((sch->ops.flags & SCX_OPS_ALWAYS_ENQ_IMMED) && is_local) { 8500 *enq_flags |= SCX_ENQ_IMMED; 8501 } 8502 8503 return true; 8504 } 8505 8506 static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, 8507 u64 dsq_id, u64 *enq_flags) 8508 { 8509 lockdep_assert_irqs_disabled(); 8510 8511 if (unlikely(!p)) { 8512 scx_error(sch, "called with NULL task"); 8513 return false; 8514 } 8515 8516 if (unlikely(*enq_flags & __SCX_ENQ_INTERNAL_MASK)) { 8517 scx_error(sch, "invalid enq_flags 0x%llx", *enq_flags); 8518 return false; 8519 } 8520 8521 /* see SCX_EV_INSERT_NOT_OWNED definition */ 8522 if (unlikely(!scx_task_on_sched(sch, p))) { 8523 __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); 8524 return false; 8525 } 8526 8527 if (!scx_vet_enq_flags(sch, dsq_id, enq_flags)) 8528 return false; 8529 8530 return true; 8531 } 8532 8533 static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, 8534 u64 dsq_id, u64 enq_flags) 8535 { 8536 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8537 struct task_struct *ddsp_task; 8538 8539 ddsp_task = __this_cpu_read(direct_dispatch_task); 8540 if (ddsp_task) { 8541 mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); 8542 return; 8543 } 8544 8545 if (unlikely(dspc->cursor >= sch->dsp_max_batch)) { 8546 scx_error(sch, "dispatch buffer overflow"); 8547 return; 8548 } 8549 8550 dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ 8551 .task = p, 8552 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, 8553 .dsq_id = dsq_id, 8554 .enq_flags = enq_flags, 8555 }; 8556 } 8557 8558 
__bpf_kfunc_start_defs(); 8559 8560 /** 8561 * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ 8562 * @p: task_struct to insert 8563 * @dsq_id: DSQ to insert into 8564 * @slice: duration @p can run for in nsecs, 0 to keep the current value 8565 * @enq_flags: SCX_ENQ_* 8566 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8567 * 8568 * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to 8569 * call this function spuriously. Can be called from ops.enqueue(), 8570 * ops.select_cpu(), and ops.dispatch(). 8571 * 8572 * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch 8573 * and @p must match the task being enqueued. 8574 * 8575 * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p 8576 * will be directly inserted into the corresponding dispatch queue after 8577 * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be 8578 * inserted into the local DSQ of the CPU returned by ops.select_cpu(). 8579 * @enq_flags are OR'd with the enqueue flags on the enqueue path before the 8580 * task is inserted. 8581 * 8582 * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id 8583 * and this function can be called upto ops.dispatch_max_batch times to insert 8584 * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the 8585 * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the 8586 * counter. 8587 * 8588 * This function doesn't have any locking restrictions and may be called under 8589 * BPF locks (in the future when BPF introduces more flexible locking). 8590 * 8591 * @p is allowed to run for @slice. The scheduling path is triggered on slice 8592 * exhaustion. If zero, the current residual slice is maintained. If 8593 * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with 8594 * scx_bpf_kick_cpu() to trigger scheduling. 
8595 * 8596 * Returns %true on successful insertion, %false on failure. On the root 8597 * scheduler, %false return triggers scheduler abort and the caller doesn't need 8598 * to check the return value. 8599 */ 8600 __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, 8601 u64 slice, u64 enq_flags, 8602 const struct bpf_prog_aux *aux) 8603 { 8604 struct scx_sched *sch; 8605 8606 guard(rcu)(); 8607 sch = scx_prog_sched(aux); 8608 if (unlikely(!sch)) 8609 return false; 8610 8611 if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) 8612 return false; 8613 8614 if (slice) 8615 p->scx.slice = slice; 8616 else 8617 p->scx.slice = p->scx.slice ?: 1; 8618 8619 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); 8620 8621 return true; 8622 } 8623 8624 /* 8625 * COMPAT: Will be removed in v6.23 along with the ___v2 suffix. 8626 */ 8627 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, 8628 u64 slice, u64 enq_flags, 8629 const struct bpf_prog_aux *aux) 8630 { 8631 scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags, aux); 8632 } 8633 8634 static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, 8635 u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) 8636 { 8637 if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) 8638 return false; 8639 8640 if (slice) 8641 p->scx.slice = slice; 8642 else 8643 p->scx.slice = p->scx.slice ?: 1; 8644 8645 p->scx.dsq_vtime = vtime; 8646 8647 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 8648 8649 return true; 8650 } 8651 8652 struct scx_bpf_dsq_insert_vtime_args { 8653 /* @p can't be packed together as KF_RCU is not transitive */ 8654 u64 dsq_id; 8655 u64 slice; 8656 u64 vtime; 8657 u64 enq_flags; 8658 }; 8659 8660 /** 8661 * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion 8662 * @p: task_struct to insert 8663 * @args: struct containing the rest of the arguments 8664 * @args->dsq_id: DSQ to insert into 8665 * @args->slice: duration @p can 
run for in nsecs, 0 to keep the current value 8666 * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ 8667 * @args->enq_flags: SCX_ENQ_* 8668 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8669 * 8670 * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument 8671 * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided 8672 * as an inline wrapper in common.bpf.h. 8673 * 8674 * Insert @p into the vtime priority queue of the DSQ identified by 8675 * @args->dsq_id. Tasks queued into the priority queue are ordered by 8676 * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert(). 8677 * 8678 * @args->vtime ordering is according to time_before64() which considers 8679 * wrapping. A numerically larger vtime may indicate an earlier position in the 8680 * ordering and vice-versa. 8681 * 8682 * A DSQ can only be used as a FIFO or priority queue at any given time and this 8683 * function must not be called on a DSQ which already has one or more FIFO tasks 8684 * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and 8685 * SCX_DSQ_GLOBAL) cannot be used as priority queues. 8686 * 8687 * Returns %true on successful insertion, %false on failure. On the root 8688 * scheduler, %false return triggers scheduler abort and the caller doesn't need 8689 * to check the return value. 8690 */ 8691 __bpf_kfunc bool 8692 __scx_bpf_dsq_insert_vtime(struct task_struct *p, 8693 struct scx_bpf_dsq_insert_vtime_args *args, 8694 const struct bpf_prog_aux *aux) 8695 { 8696 struct scx_sched *sch; 8697 8698 guard(rcu)(); 8699 8700 sch = scx_prog_sched(aux); 8701 if (unlikely(!sch)) 8702 return false; 8703 8704 return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice, 8705 args->vtime, args->enq_flags); 8706 } 8707 8708 /* 8709 * COMPAT: Will be removed in v6.23. 
*/
__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
					  u64 slice, u64 vtime, u64 enq_flags)
{
	struct scx_sched *sch;

	guard(rcu)();

	/* this variant takes no @aux, so it works on the root scheduler */
	sch = rcu_dereference(scx_root);
	if (unlikely(!sch))
		return;

#ifdef CONFIG_EXT_SUB_SCHED
	/*
	 * Disallow if any sub-scheds are attached. There is no way to tell
	 * which scheduler called us, just error out @p's scheduler.
	 */
	if (unlikely(!list_empty(&sch->children))) {
		scx_error(scx_task_sched(p), "__scx_bpf_dsq_insert_vtime() must be used");
		return;
	}
#endif

	scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags);
}

__bpf_kfunc_end_defs();

BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_IMPLICIT_ARGS | KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_IMPLICIT_ARGS | KF_RCU)
BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_IMPLICIT_ARGS | KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)

static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
	.owner			= THIS_MODULE,
	.set			= &scx_kfunc_ids_enqueue_dispatch,
	.filter			= scx_kfunc_context_filter,
};

/*
 * Move @p from the DSQ that @kit iterates to the DSQ identified by @dsq_id,
 * applying any slice/vtime override staged on @kit. The scheduler is taken
 * from the source DSQ.
 *
 * Returns %true if @p was moved. The override flags on @kit are cleared once
 * the locked section has been entered, whether or not the move happened. The
 * early-return checks before it leave them untouched.
 */
static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
			 struct task_struct *p, u64 dsq_id, u64 enq_flags)
{
	struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
	struct scx_sched *sch;
	struct rq *this_rq, *src_rq, *locked_rq;
	bool dispatched = false;
	bool in_balance;
	unsigned long flags;

	/*
	 * The verifier considers an iterator slot initialized on any
	 * KF_ITER_NEW return, so a BPF program may legally reach here after
	 * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL.
	 */
	if (unlikely(!src_dsq))
		return false;

	sch = src_dsq->sched;

	if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags))
		return false;

	/*
	 * If the BPF scheduler keeps calling this function repeatedly, it can
	 * cause similar live-lock conditions as consume_dispatch_q().
	 */
	if (unlikely(READ_ONCE(sch->aborting)))
		return false;

	if (unlikely(!scx_task_on_sched(sch, p))) {
		scx_error(sch, "scx_bpf_dsq_move[_vtime]() on %s[%d] but the task belongs to a different scheduler",
			  p->comm, p->pid);
		return false;
	}

	/*
	 * Can be called from either ops.dispatch() locking this_rq() or any
	 * context where no rq lock is held. If latter, lock @p's task_rq which
	 * we'll likely need anyway.
	 */
	src_rq = task_rq(p);

	local_irq_save(flags);
	this_rq = this_rq();
	in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE;

	if (in_balance) {
		/* this_rq is already locked, switch over to src_rq if different */
		if (this_rq != src_rq) {
			raw_spin_rq_unlock(this_rq);
			raw_spin_rq_lock(src_rq);
		}
	} else {
		raw_spin_rq_lock(src_rq);
	}

	locked_rq = src_rq;
	raw_spin_lock(&src_dsq->lock);

	/* did someone else get to it while we dropped the locks? */
	if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) {
		raw_spin_unlock(&src_dsq->lock);
		goto out;
	}

	/* @p is still on $src_dsq and stable, determine the destination */
	dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, task_cpu(p));

	/*
	 * Apply vtime and slice updates before moving so that the new time is
	 * visible before inserting into $dst_dsq. @p is still on $src_dsq but
	 * this is safe as we're locking it.
	 */
	if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
		p->scx.dsq_vtime = kit->vtime;
	if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
		p->scx.slice = kit->slice;

	/* execute move */
	locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq);
	dispatched = true;
out:
	/* restore the locking state the caller came in with */
	if (in_balance) {
		if (this_rq != locked_rq) {
			raw_spin_rq_unlock(locked_rq);
			raw_spin_rq_lock(this_rq);
		}
	} else {
		raw_spin_rq_unlock_irqrestore(locked_rq, flags);
	}

	/* the overrides are one-shot */
	kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE |
			       __SCX_DSQ_ITER_HAS_VTIME);
	return dispatched;
}

__bpf_kfunc_start_defs();

/**
 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
 *
 * Can only be called from ops.dispatch(). Returns 0 if no scheduler can be
 * resolved from @aux.
 */
__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux)
{
	struct scx_sched *sch;

	guard(rcu)();

	sch = scx_prog_sched(aux);
	if (unlikely(!sch))
		return 0;

	/* batch capacity minus the entries already buffered on this CPU */
	return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor);
}

/**
 * scx_bpf_dispatch_cancel - Cancel the latest dispatch
 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
 *
 * Cancel the latest dispatch. Can be called multiple times to cancel further
 * dispatches. Can only be called from ops.dispatch().
8874 */ 8875 __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux) 8876 { 8877 struct scx_sched *sch; 8878 struct scx_dsp_ctx *dspc; 8879 8880 guard(rcu)(); 8881 8882 sch = scx_prog_sched(aux); 8883 if (unlikely(!sch)) 8884 return; 8885 8886 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8887 8888 if (dspc->cursor > 0) 8889 dspc->cursor--; 8890 else 8891 scx_error(sch, "dispatch buffer underflow"); 8892 } 8893 8894 /** 8895 * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ 8896 * @dsq_id: DSQ to move task from. Must be a user-created DSQ 8897 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8898 * @enq_flags: %SCX_ENQ_* 8899 * 8900 * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's 8901 * local DSQ for execution with @enq_flags applied. Can only be called from 8902 * ops.dispatch(). 8903 * 8904 * Built-in DSQs (%SCX_DSQ_GLOBAL and %SCX_DSQ_LOCAL*) are not supported as 8905 * sources. Local DSQs support reenqueueing (a task can be picked up for 8906 * execution, dequeued for property changes, or reenqueued), but the BPF 8907 * scheduler cannot directly iterate or move tasks from them. %SCX_DSQ_GLOBAL 8908 * is similar but also doesn't support reenqueueing, as it maps to multiple 8909 * per-node DSQs making the scope difficult to define; this may change in the 8910 * future. 8911 * 8912 * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() 8913 * before trying to move from the specified DSQ. It may also grab rq locks and 8914 * thus can't be called under any BPF locks. 8915 * 8916 * Returns %true if a task has been moved, %false if there isn't any task to 8917 * move. 
8918 */ 8919 __bpf_kfunc bool scx_bpf_dsq_move_to_local___v2(u64 dsq_id, u64 enq_flags, 8920 const struct bpf_prog_aux *aux) 8921 { 8922 struct scx_dispatch_q *dsq; 8923 struct scx_sched *sch; 8924 struct scx_dsp_ctx *dspc; 8925 8926 guard(rcu)(); 8927 8928 sch = scx_prog_sched(aux); 8929 if (unlikely(!sch)) 8930 return false; 8931 8932 if (!scx_vet_enq_flags(sch, SCX_DSQ_LOCAL, &enq_flags)) 8933 return false; 8934 8935 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8936 8937 flush_dispatch_buf(sch, dspc->rq); 8938 8939 dsq = find_user_dsq(sch, dsq_id); 8940 if (unlikely(!dsq)) { 8941 scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id); 8942 return false; 8943 } 8944 8945 if (consume_dispatch_q(sch, dspc->rq, dsq, enq_flags)) { 8946 /* 8947 * A successfully consumed task can be dequeued before it starts 8948 * running while the CPU is trying to migrate other dispatched 8949 * tasks. Bump nr_tasks to tell balance_one() to retry on empty 8950 * local DSQ. 8951 */ 8952 dspc->nr_tasks++; 8953 return true; 8954 } else { 8955 return false; 8956 } 8957 } 8958 8959 /* 8960 * COMPAT: ___v2 was introduced in v7.1. Remove this and ___v2 tag in the future. 8961 */ 8962 __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux) 8963 { 8964 return scx_bpf_dsq_move_to_local___v2(dsq_id, 0, aux); 8965 } 8966 8967 /** 8968 * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs 8969 * @it__iter: DSQ iterator in progress 8970 * @slice: duration the moved task can run for in nsecs 8971 * 8972 * Override the slice of the next task that will be moved from @it__iter using 8973 * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous 8974 * slice duration is kept. 
8975 */ 8976 __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, 8977 u64 slice) 8978 { 8979 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 8980 8981 kit->slice = slice; 8982 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; 8983 } 8984 8985 /** 8986 * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs 8987 * @it__iter: DSQ iterator in progress 8988 * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ 8989 * 8990 * Override the vtime of the next task that will be moved from @it__iter using 8991 * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice 8992 * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the 8993 * override is ignored and cleared. 8994 */ 8995 __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, 8996 u64 vtime) 8997 { 8998 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 8999 9000 kit->vtime = vtime; 9001 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; 9002 } 9003 9004 /** 9005 * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ 9006 * @it__iter: DSQ iterator in progress 9007 * @p: task to transfer 9008 * @dsq_id: DSQ to move @p to 9009 * @enq_flags: SCX_ENQ_* 9010 * 9011 * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ 9012 * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can 9013 * be the destination. 9014 * 9015 * For the transfer to be successful, @p must still be on the DSQ and have been 9016 * queued before the DSQ iteration started. This function doesn't care whether 9017 * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have 9018 * been queued before the iteration started. 9019 * 9020 * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update. 9021 * 9022 * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq 9023 * lock (e.g. BPF timers or SYSCALL programs). 
9024 * 9025 * Returns %true if @p has been consumed, %false if @p had already been 9026 * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local 9027 * DSQ. 9028 */ 9029 __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, 9030 struct task_struct *p, u64 dsq_id, 9031 u64 enq_flags) 9032 { 9033 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 9034 p, dsq_id, enq_flags); 9035 } 9036 9037 /** 9038 * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ 9039 * @it__iter: DSQ iterator in progress 9040 * @p: task to transfer 9041 * @dsq_id: DSQ to move @p to 9042 * @enq_flags: SCX_ENQ_* 9043 * 9044 * Transfer @p which is on the DSQ currently iterated by @it__iter to the 9045 * priority queue of the DSQ specified by @dsq_id. The destination must be a 9046 * user DSQ as only user DSQs support priority queue. 9047 * 9048 * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice() 9049 * and scx_bpf_dsq_move_set_vtime() to update. 9050 * 9051 * All other aspects are identical to scx_bpf_dsq_move(). See 9052 * scx_bpf_dsq_insert_vtime() for more information on @vtime. 9053 */ 9054 __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, 9055 struct task_struct *p, u64 dsq_id, 9056 u64 enq_flags) 9057 { 9058 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 9059 p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 9060 } 9061 9062 #ifdef CONFIG_EXT_SUB_SCHED 9063 /** 9064 * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler 9065 * @cgroup_id: cgroup ID of the child scheduler to dispatch 9066 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9067 * 9068 * Allows a parent scheduler to trigger dispatching on one of its direct 9069 * child schedulers. The child scheduler runs its dispatch operation to 9070 * move tasks from dispatch queues to the local runqueue. 
9071 * 9072 * Returns: true on success, false if cgroup_id is invalid, not a direct 9073 * child, or caller lacks dispatch permission. 9074 */ 9075 __bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux) 9076 { 9077 struct rq *this_rq = this_rq(); 9078 struct scx_sched *parent, *child; 9079 9080 guard(rcu)(); 9081 parent = scx_prog_sched(aux); 9082 if (unlikely(!parent)) 9083 return false; 9084 9085 child = scx_find_sub_sched(cgroup_id); 9086 9087 if (unlikely(!child)) 9088 return false; 9089 9090 if (unlikely(scx_parent(child) != parent)) { 9091 scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu", 9092 cgroup_id); 9093 return false; 9094 } 9095 9096 return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev, 9097 true); 9098 } 9099 #endif /* CONFIG_EXT_SUB_SCHED */ 9100 9101 __bpf_kfunc_end_defs(); 9102 9103 BTF_KFUNCS_START(scx_kfunc_ids_dispatch) 9104 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots, KF_IMPLICIT_ARGS) 9105 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel, KF_IMPLICIT_ARGS) 9106 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local, KF_IMPLICIT_ARGS) 9107 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local___v2, KF_IMPLICIT_ARGS) 9108 /* scx_bpf_dsq_move*() also in scx_kfunc_ids_unlocked: callable from unlocked contexts */ 9109 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 9110 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 9111 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 9112 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 9113 #ifdef CONFIG_EXT_SUB_SCHED 9114 BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS) 9115 #endif 9116 BTF_KFUNCS_END(scx_kfunc_ids_dispatch) 9117 9118 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { 9119 .owner = THIS_MODULE, 9120 .set = &scx_kfunc_ids_dispatch, 9121 .filter = scx_kfunc_context_filter, 9122 }; 9123 9124 __bpf_kfunc_start_defs(); 9125 9126 /** 9127 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 9128 * @aux: 
implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9129 * 9130 * Iterate over all of the tasks currently enqueued on the local DSQ of the 9131 * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of 9132 * processed tasks. Can only be called from ops.cpu_release(). 9133 */ 9134 __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux) 9135 { 9136 struct scx_sched *sch; 9137 struct rq *rq; 9138 9139 guard(rcu)(); 9140 sch = scx_prog_sched(aux); 9141 if (unlikely(!sch)) 9142 return 0; 9143 9144 rq = cpu_rq(smp_processor_id()); 9145 lockdep_assert_rq_held(rq); 9146 9147 return reenq_local(sch, rq, SCX_REENQ_ANY); 9148 } 9149 9150 __bpf_kfunc_end_defs(); 9151 9152 BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) 9153 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local, KF_IMPLICIT_ARGS) 9154 BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) 9155 9156 static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { 9157 .owner = THIS_MODULE, 9158 .set = &scx_kfunc_ids_cpu_release, 9159 .filter = scx_kfunc_context_filter, 9160 }; 9161 9162 __bpf_kfunc_start_defs(); 9163 9164 /** 9165 * scx_bpf_create_dsq - Create a custom DSQ 9166 * @dsq_id: DSQ to create 9167 * @node: NUMA node to allocate from 9168 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9169 * 9170 * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable 9171 * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. 
9172 */ 9173 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_aux *aux) 9174 { 9175 struct scx_dispatch_q *dsq; 9176 struct scx_sched *sch; 9177 s32 ret; 9178 9179 if (unlikely(node >= (int)nr_node_ids || 9180 (node < 0 && node != NUMA_NO_NODE))) 9181 return -EINVAL; 9182 9183 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) 9184 return -EINVAL; 9185 9186 dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); 9187 if (!dsq) 9188 return -ENOMEM; 9189 9190 /* 9191 * init_dsq() must be called in GFP_KERNEL context. Init it with NULL 9192 * @sch and update afterwards. 9193 */ 9194 ret = init_dsq(dsq, dsq_id, NULL); 9195 if (ret) { 9196 kfree(dsq); 9197 return ret; 9198 } 9199 9200 rcu_read_lock(); 9201 9202 sch = scx_prog_sched(aux); 9203 if (sch) { 9204 dsq->sched = sch; 9205 ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, 9206 dsq_hash_params); 9207 } else { 9208 ret = -ENODEV; 9209 } 9210 9211 rcu_read_unlock(); 9212 if (ret) { 9213 exit_dsq(dsq); 9214 kfree(dsq); 9215 } 9216 return ret; 9217 } 9218 9219 __bpf_kfunc_end_defs(); 9220 9221 BTF_KFUNCS_START(scx_kfunc_ids_unlocked) 9222 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_IMPLICIT_ARGS | KF_SLEEPABLE) 9223 /* also in scx_kfunc_ids_dispatch: also callable from ops.dispatch() */ 9224 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 9225 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 9226 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 9227 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 9228 /* also in scx_kfunc_ids_select_cpu: also callable from ops.select_cpu()/ops.enqueue() */ 9229 BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) 9230 BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) 9231 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) 9232 BTF_KFUNCS_END(scx_kfunc_ids_unlocked) 9233 9234 static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { 9235 .owner = THIS_MODULE, 9236 .set = 
&scx_kfunc_ids_unlocked, 9237 .filter = scx_kfunc_context_filter, 9238 }; 9239 9240 __bpf_kfunc_start_defs(); 9241 9242 /** 9243 * scx_bpf_task_set_slice - Set task's time slice 9244 * @p: task of interest 9245 * @slice: time slice to set in nsecs 9246 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9247 * 9248 * Set @p's time slice to @slice. Returns %true on success, %false if the 9249 * calling scheduler doesn't have authority over @p. 9250 */ 9251 __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, 9252 const struct bpf_prog_aux *aux) 9253 { 9254 struct scx_sched *sch; 9255 9256 guard(rcu)(); 9257 sch = scx_prog_sched(aux); 9258 if (unlikely(!sch || !scx_task_on_sched(sch, p))) 9259 return false; 9260 9261 p->scx.slice = slice; 9262 return true; 9263 } 9264 9265 /** 9266 * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering 9267 * @p: task of interest 9268 * @vtime: virtual time to set 9269 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9270 * 9271 * Set @p's virtual time to @vtime. Returns %true on success, %false if the 9272 * calling scheduler doesn't have authority over @p. 9273 */ 9274 __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, 9275 const struct bpf_prog_aux *aux) 9276 { 9277 struct scx_sched *sch; 9278 9279 guard(rcu)(); 9280 sch = scx_prog_sched(aux); 9281 if (unlikely(!sch || !scx_task_on_sched(sch, p))) 9282 return false; 9283 9284 p->scx.dsq_vtime = vtime; 9285 return true; 9286 } 9287 9288 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) 9289 { 9290 struct rq *this_rq; 9291 unsigned long irq_flags; 9292 9293 local_irq_save(irq_flags); 9294 9295 this_rq = this_rq(); 9296 9297 /* 9298 * While bypassing for PM ops, IRQ handling may not be online which can 9299 * lead to irq_work_queue() malfunction such as infinite busy wait for 9300 * IRQ status update. Suppress kicking. 
9301 */ 9302 if (scx_bypassing(sch, cpu_of(this_rq))) 9303 goto out; 9304 9305 /* 9306 * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting 9307 * rq locks. We can probably be smarter and avoid bouncing if called 9308 * from ops which don't hold a rq lock. 9309 */ 9310 if (flags & SCX_KICK_IDLE) { 9311 struct rq *target_rq = cpu_rq(cpu); 9312 9313 if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) 9314 scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); 9315 9316 if (raw_spin_rq_trylock(target_rq)) { 9317 if (can_skip_idle_kick(target_rq)) { 9318 raw_spin_rq_unlock(target_rq); 9319 goto out; 9320 } 9321 raw_spin_rq_unlock(target_rq); 9322 } 9323 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); 9324 } else { 9325 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); 9326 9327 if (flags & SCX_KICK_PREEMPT) 9328 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); 9329 if (flags & SCX_KICK_WAIT) 9330 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); 9331 } 9332 9333 irq_work_queue(&this_rq->scx.kick_cpus_irq_work); 9334 out: 9335 local_irq_restore(irq_flags); 9336 } 9337 9338 /** 9339 * scx_bpf_kick_cpu - Trigger reschedule on a CPU 9340 * @cpu: cpu to kick 9341 * @flags: %SCX_KICK_* flags 9342 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9343 * 9344 * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or 9345 * trigger rescheduling on a busy CPU. This can be called from any online 9346 * scx_ops operation and the actual kicking is performed asynchronously through 9347 * an irq work. 
9348 */ 9349 __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux *aux) 9350 { 9351 struct scx_sched *sch; 9352 9353 guard(rcu)(); 9354 sch = scx_prog_sched(aux); 9355 if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) 9356 scx_kick_cpu(sch, cpu, flags); 9357 } 9358 9359 /** 9360 * scx_bpf_kick_cid - Trigger reschedule on the CPU mapped to @cid 9361 * @cid: cid to kick 9362 * @flags: %SCX_KICK_* flags 9363 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9364 * 9365 * cid-addressed equivalent of scx_bpf_kick_cpu(). Return 0 on success, 9366 * -errno otherwise. 9367 */ 9368 __bpf_kfunc s32 scx_bpf_kick_cid(s32 cid, u64 flags, const struct bpf_prog_aux *aux) 9369 { 9370 struct scx_sched *sch; 9371 s32 cpu; 9372 9373 guard(rcu)(); 9374 sch = scx_prog_sched(aux); 9375 if (unlikely(!sch)) 9376 return -ENODEV; 9377 cpu = scx_cid_to_cpu(sch, cid); 9378 if (cpu < 0) 9379 return cpu; 9380 scx_kick_cpu(sch, cpu, flags); 9381 return 0; 9382 } 9383 9384 /** 9385 * scx_bpf_dsq_nr_queued - Return the number of queued tasks 9386 * @dsq_id: id of the DSQ 9387 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9388 * 9389 * Return the number of tasks in the DSQ matching @dsq_id. If not found, 9390 * -%ENOENT is returned. 
9391 */ 9392 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux) 9393 { 9394 struct scx_sched *sch; 9395 struct scx_dispatch_q *dsq; 9396 s32 ret; 9397 9398 preempt_disable(); 9399 9400 sch = scx_prog_sched(aux); 9401 if (unlikely(!sch)) { 9402 ret = -ENODEV; 9403 goto out; 9404 } 9405 9406 if (dsq_id == SCX_DSQ_LOCAL) { 9407 ret = READ_ONCE(this_rq()->scx.local_dsq.nr); 9408 goto out; 9409 } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 9410 s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK); 9411 9412 if (scx_cpu_valid(sch, cpu, NULL)) { 9413 ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); 9414 goto out; 9415 } 9416 } else { 9417 dsq = find_user_dsq(sch, dsq_id); 9418 if (dsq) { 9419 ret = READ_ONCE(dsq->nr); 9420 goto out; 9421 } 9422 } 9423 ret = -ENOENT; 9424 out: 9425 preempt_enable(); 9426 return ret; 9427 } 9428 9429 /** 9430 * scx_bpf_destroy_dsq - Destroy a custom DSQ 9431 * @dsq_id: DSQ to destroy 9432 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9433 * 9434 * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with 9435 * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is 9436 * empty and no further tasks are dispatched to it. Ignored if called on a DSQ 9437 * which doesn't exist. Can be called from any online scx_ops operations. 9438 */ 9439 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux) 9440 { 9441 struct scx_sched *sch; 9442 9443 guard(rcu)(); 9444 sch = scx_prog_sched(aux); 9445 if (sch) 9446 destroy_dsq(sch, dsq_id); 9447 } 9448 9449 /** 9450 * bpf_iter_scx_dsq_new - Create a DSQ iterator 9451 * @it: iterator to initialize 9452 * @dsq_id: DSQ to iterate 9453 * @flags: %SCX_DSQ_ITER_* 9454 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9455 * 9456 * Initialize BPF iterator @it which can be used with bpf_for_each() to walk 9457 * tasks in the DSQ specified by @dsq_id. 
Iteration using @it only includes 9458 * tasks which are already queued when this function is invoked. 9459 */ 9460 __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, 9461 u64 flags, const struct bpf_prog_aux *aux) 9462 { 9463 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9464 struct scx_sched *sch; 9465 9466 BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > 9467 sizeof(struct bpf_iter_scx_dsq)); 9468 BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != 9469 __alignof__(struct bpf_iter_scx_dsq)); 9470 BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & 9471 ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); 9472 9473 /* 9474 * next() and destroy() will be called regardless of the return value. 9475 * Always clear $kit->dsq. 9476 */ 9477 kit->dsq = NULL; 9478 9479 sch = scx_prog_sched(aux); 9480 if (unlikely(!sch)) 9481 return -ENODEV; 9482 9483 if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) 9484 return -EINVAL; 9485 9486 kit->dsq = find_user_dsq(sch, dsq_id); 9487 if (!kit->dsq) 9488 return -ENOENT; 9489 9490 kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags); 9491 9492 return 0; 9493 } 9494 9495 /** 9496 * bpf_iter_scx_dsq_next - Progress a DSQ iterator 9497 * @it: iterator to progress 9498 * 9499 * Return the next task. See bpf_iter_scx_dsq_new(). 9500 */ 9501 __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) 9502 { 9503 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9504 9505 if (!kit->dsq) 9506 return NULL; 9507 9508 guard(raw_spinlock_irqsave)(&kit->dsq->lock); 9509 9510 return nldsq_cursor_next_task(&kit->cursor, kit->dsq); 9511 } 9512 9513 /** 9514 * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator 9515 * @it: iterator to destroy 9516 * 9517 * Undo scx_iter_scx_dsq_new(). 
9518 */ 9519 __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) 9520 { 9521 struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9522 9523 if (!kit->dsq) 9524 return; 9525 9526 if (!list_empty(&kit->cursor.node)) { 9527 unsigned long flags; 9528 9529 raw_spin_lock_irqsave(&kit->dsq->lock, flags); 9530 list_del_init(&kit->cursor.node); 9531 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); 9532 } 9533 kit->dsq = NULL; 9534 } 9535 9536 /** 9537 * scx_bpf_dsq_peek - Lockless peek at the first element. 9538 * @dsq_id: DSQ to examine. 9539 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9540 * 9541 * Read the first element in the DSQ. This is semantically equivalent to using 9542 * the DSQ iterator, but is lockfree. Of course, like any lockless operation, 9543 * this provides only a point-in-time snapshot, and the contents may change 9544 * by the time any subsequent locking operation reads the queue. 9545 * 9546 * Returns the pointer, or NULL indicates an empty queue OR internal error. 
9547 */ 9548 __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id, 9549 const struct bpf_prog_aux *aux) 9550 { 9551 struct scx_sched *sch; 9552 struct scx_dispatch_q *dsq; 9553 9554 sch = scx_prog_sched(aux); 9555 if (unlikely(!sch)) 9556 return NULL; 9557 9558 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { 9559 scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); 9560 return NULL; 9561 } 9562 9563 dsq = find_user_dsq(sch, dsq_id); 9564 if (unlikely(!dsq)) { 9565 scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); 9566 return NULL; 9567 } 9568 9569 return rcu_dereference(dsq->first_task); 9570 } 9571 9572 /** 9573 * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ 9574 * @dsq_id: DSQ to re-enqueue 9575 * @reenq_flags: %SCX_RENQ_* 9576 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9577 * 9578 * Iterate over all of the tasks currently enqueued on the DSQ identified by 9579 * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are 9580 * supported: 9581 * 9582 * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu) 9583 * - User DSQs 9584 * 9585 * Re-enqueues are performed asynchronously. Can be called from anywhere. 
9586 */ 9587 __bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags, 9588 const struct bpf_prog_aux *aux) 9589 { 9590 struct scx_sched *sch; 9591 struct scx_dispatch_q *dsq; 9592 9593 guard(preempt)(); 9594 9595 sch = scx_prog_sched(aux); 9596 if (unlikely(!sch)) 9597 return; 9598 9599 if (unlikely(reenq_flags & ~__SCX_REENQ_USER_MASK)) { 9600 scx_error(sch, "invalid SCX_REENQ flags 0x%llx", reenq_flags); 9601 return; 9602 } 9603 9604 /* not specifying any filter bits is the same as %SCX_REENQ_ANY */ 9605 if (!(reenq_flags & __SCX_REENQ_FILTER_MASK)) 9606 reenq_flags |= SCX_REENQ_ANY; 9607 9608 dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id()); 9609 schedule_dsq_reenq(sch, dsq, reenq_flags, scx_locked_rq()); 9610 } 9611 9612 /** 9613 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 9614 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9615 * 9616 * Iterate over all of the tasks currently enqueued on the local DSQ of the 9617 * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from 9618 * anywhere. 9619 * 9620 * This is now a special case of scx_bpf_dsq_reenq() and may be removed in the 9621 * future. 
9622 */ 9623 __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux) 9624 { 9625 scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0, aux); 9626 } 9627 9628 __bpf_kfunc_end_defs(); 9629 9630 __printf(5, 0) 9631 static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, 9632 size_t line_size, char *fmt, unsigned long long *data, 9633 u32 data__sz) 9634 { 9635 struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; 9636 s32 ret; 9637 9638 if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || 9639 (data__sz && !data)) { 9640 scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); 9641 return -EINVAL; 9642 } 9643 9644 ret = copy_from_kernel_nofault(data_buf, data, data__sz); 9645 if (ret < 0) { 9646 scx_error(sch, "failed to read data fields (%d)", ret); 9647 return ret; 9648 } 9649 9650 ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, 9651 &bprintf_data); 9652 if (ret < 0) { 9653 scx_error(sch, "format preparation failed (%d)", ret); 9654 return ret; 9655 } 9656 9657 ret = bstr_printf(line_buf, line_size, fmt, 9658 bprintf_data.bin_args); 9659 bpf_bprintf_cleanup(&bprintf_data); 9660 if (ret < 0) { 9661 scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); 9662 return ret; 9663 } 9664 9665 return ret; 9666 } 9667 9668 __printf(3, 0) 9669 static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, 9670 char *fmt, unsigned long long *data, u32 data__sz) 9671 { 9672 return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), 9673 fmt, data, data__sz); 9674 } 9675 9676 __bpf_kfunc_start_defs(); 9677 9678 /** 9679 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. 9680 * @exit_code: Exit value to pass to user space via struct scx_exit_info. 
9681 * @fmt: error message format string 9682 * @data: format string parameters packaged using ___bpf_fill() macro 9683 * @data__sz: @data len, must end in '__sz' for the verifier 9684 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9685 * 9686 * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops 9687 * disabling. 9688 */ 9689 __printf(2, 0) 9690 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, 9691 unsigned long long *data, u32 data__sz, 9692 const struct bpf_prog_aux *aux) 9693 { 9694 struct scx_sched *sch; 9695 unsigned long flags; 9696 9697 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 9698 sch = scx_prog_sched(aux); 9699 if (likely(sch) && 9700 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 9701 scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); 9702 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 9703 } 9704 9705 /** 9706 * scx_bpf_error_bstr - Indicate fatal error 9707 * @fmt: error message format string 9708 * @data: format string parameters packaged using ___bpf_fill() macro 9709 * @data__sz: @data len, must end in '__sz' for the verifier 9710 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9711 * 9712 * Indicate that the BPF scheduler encountered a fatal error and initiate ops 9713 * disabling. 
9714 */ 9715 __printf(1, 0) 9716 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, 9717 u32 data__sz, const struct bpf_prog_aux *aux) 9718 { 9719 struct scx_sched *sch; 9720 unsigned long flags; 9721 9722 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 9723 sch = scx_prog_sched(aux); 9724 if (likely(sch) && 9725 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 9726 scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); 9727 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 9728 } 9729 9730 /** 9731 * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler 9732 * @fmt: format string 9733 * @data: format string parameters packaged using ___bpf_fill() macro 9734 * @data__sz: @data len, must end in '__sz' for the verifier 9735 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9736 * 9737 * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and 9738 * dump_task() to generate extra debug dump specific to the BPF scheduler. 9739 * 9740 * The extra dump may be multiple lines. A single line may be split over 9741 * multiple calls. The last line is automatically terminated. 9742 */ 9743 __printf(1, 0) 9744 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, 9745 u32 data__sz, const struct bpf_prog_aux *aux) 9746 { 9747 struct scx_sched *sch; 9748 struct scx_dump_data *dd = &scx_dump_data; 9749 struct scx_bstr_buf *buf = &dd->buf; 9750 s32 ret; 9751 9752 guard(rcu)(); 9753 9754 sch = scx_prog_sched(aux); 9755 if (unlikely(!sch)) 9756 return; 9757 9758 if (raw_smp_processor_id() != dd->cpu) { 9759 scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends"); 9760 return; 9761 } 9762 9763 /* append the formatted string to the line buf */ 9764 ret = __bstr_format(sch, buf->data, buf->line + dd->cursor, 9765 sizeof(buf->line) - dd->cursor, fmt, data, data__sz); 9766 if (ret < 0) { 9767 dump_line(dd->s, "%s[!] 
(\"%s\", %p, %u) failed to format (%d)", 9768 dd->prefix, fmt, data, data__sz, ret); 9769 return; 9770 } 9771 9772 dd->cursor += ret; 9773 dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); 9774 9775 if (!dd->cursor) 9776 return; 9777 9778 /* 9779 * If the line buf overflowed or ends in a newline, flush it into the 9780 * dump. This is to allow the caller to generate a single line over 9781 * multiple calls. As ops_dump_flush() can also handle multiple lines in 9782 * the line buf, the only case which can lead to an unexpected 9783 * truncation is when the caller keeps generating newlines in the middle 9784 * instead of the end consecutively. Don't do that. 9785 */ 9786 if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') 9787 ops_dump_flush(); 9788 } 9789 9790 /** 9791 * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU 9792 * @cpu: CPU of interest 9793 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9794 * 9795 * Return the maximum relative capacity of @cpu in relation to the most 9796 * performant CPU in the system. The return value is in the range [1, 9797 * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). 9798 */ 9799 __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux) 9800 { 9801 struct scx_sched *sch; 9802 9803 guard(rcu)(); 9804 9805 sch = scx_prog_sched(aux); 9806 if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) 9807 return arch_scale_cpu_capacity(cpu); 9808 else 9809 return SCX_CPUPERF_ONE; 9810 } 9811 9812 /** 9813 * scx_bpf_cidperf_cap - Query the maximum relative capacity of the CPU at @cid 9814 * @cid: cid of the CPU to query 9815 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9816 * 9817 * cid-addressed equivalent of scx_bpf_cpuperf_cap(). 
9818 */ 9819 __bpf_kfunc u32 scx_bpf_cidperf_cap(s32 cid, const struct bpf_prog_aux *aux) 9820 { 9821 struct scx_sched *sch; 9822 s32 cpu; 9823 9824 guard(rcu)(); 9825 9826 sch = scx_prog_sched(aux); 9827 if (unlikely(!sch)) 9828 return SCX_CPUPERF_ONE; 9829 cpu = scx_cid_to_cpu(sch, cid); 9830 if (cpu < 0) 9831 return SCX_CPUPERF_ONE; 9832 return arch_scale_cpu_capacity(cpu); 9833 } 9834 9835 /** 9836 * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU 9837 * @cpu: CPU of interest 9838 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9839 * 9840 * Return the current relative performance of @cpu in relation to its maximum. 9841 * The return value is in the range [1, %SCX_CPUPERF_ONE]. 9842 * 9843 * The current performance level of a CPU in relation to the maximum performance 9844 * available in the system can be calculated as follows: 9845 * 9846 * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE 9847 * 9848 * The result is in the range [1, %SCX_CPUPERF_ONE]. 9849 */ 9850 __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux) 9851 { 9852 struct scx_sched *sch; 9853 9854 guard(rcu)(); 9855 9856 sch = scx_prog_sched(aux); 9857 if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) 9858 return arch_scale_freq_capacity(cpu); 9859 else 9860 return SCX_CPUPERF_ONE; 9861 } 9862 9863 /** 9864 * scx_bpf_cidperf_cur - Query the current performance of the CPU at @cid 9865 * @cid: cid of the CPU to query 9866 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9867 * 9868 * cid-addressed equivalent of scx_bpf_cpuperf_cur(). 
9869 */ 9870 __bpf_kfunc u32 scx_bpf_cidperf_cur(s32 cid, const struct bpf_prog_aux *aux) 9871 { 9872 struct scx_sched *sch; 9873 s32 cpu; 9874 9875 guard(rcu)(); 9876 9877 sch = scx_prog_sched(aux); 9878 if (unlikely(!sch)) 9879 return SCX_CPUPERF_ONE; 9880 cpu = scx_cid_to_cpu(sch, cid); 9881 if (cpu < 0) 9882 return SCX_CPUPERF_ONE; 9883 return arch_scale_freq_capacity(cpu); 9884 } 9885 9886 /** 9887 * scx_bpf_cpuperf_set - Set the relative performance target of a CPU 9888 * @cpu: CPU of interest 9889 * @perf: target performance level [0, %SCX_CPUPERF_ONE] 9890 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9891 * 9892 * Set the target performance level of @cpu to @perf. @perf is in linear 9893 * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the 9894 * schedutil cpufreq governor chooses the target frequency. 9895 * 9896 * The actual performance level chosen, CPU grouping, and the overhead and 9897 * latency of the operations are dependent on the hardware and cpufreq driver in 9898 * use. Consult hardware and cpufreq documentation for more information. The 9899 * current performance level can be monitored using scx_bpf_cpuperf_cur(). 9900 */ 9901 __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_aux *aux) 9902 { 9903 struct scx_sched *sch; 9904 9905 guard(rcu)(); 9906 9907 sch = scx_prog_sched(aux); 9908 if (unlikely(!sch)) 9909 return; 9910 9911 if (unlikely(perf > SCX_CPUPERF_ONE)) { 9912 scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu); 9913 return; 9914 } 9915 9916 if (scx_cpu_valid(sch, cpu, NULL)) { 9917 struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); 9918 struct rq_flags rf; 9919 9920 /* 9921 * When called with an rq lock held, restrict the operation 9922 * to the corresponding CPU to prevent ABBA deadlocks. 
9923 */ 9924 if (locked_rq && rq != locked_rq) { 9925 scx_error(sch, "Invalid target CPU %d", cpu); 9926 return; 9927 } 9928 9929 /* 9930 * If no rq lock is held, allow to operate on any CPU by 9931 * acquiring the corresponding rq lock. 9932 */ 9933 if (!locked_rq) { 9934 rq_lock_irqsave(rq, &rf); 9935 update_rq_clock(rq); 9936 } 9937 9938 rq->scx.cpuperf_target = perf; 9939 cpufreq_update_util(rq, 0); 9940 9941 if (!locked_rq) 9942 rq_unlock_irqrestore(rq, &rf); 9943 } 9944 } 9945 9946 /** 9947 * scx_bpf_cidperf_set - Set the performance target of the CPU at @cid 9948 * @cid: cid of the CPU to target 9949 * @perf: target performance level [0, %SCX_CPUPERF_ONE] 9950 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9951 * 9952 * cid-addressed equivalent of scx_bpf_cpuperf_set(). 9953 */ 9954 __bpf_kfunc void scx_bpf_cidperf_set(s32 cid, u32 perf, 9955 const struct bpf_prog_aux *aux) 9956 { 9957 struct scx_sched *sch; 9958 s32 cpu; 9959 9960 guard(rcu)(); 9961 9962 sch = scx_prog_sched(aux); 9963 if (unlikely(!sch)) 9964 return; 9965 cpu = scx_cid_to_cpu(sch, cid); 9966 if (cpu < 0) 9967 return; 9968 scx_bpf_cpuperf_set(cpu, perf, aux); 9969 } 9970 9971 /** 9972 * scx_bpf_nr_node_ids - Return the number of possible node IDs 9973 * 9974 * All valid node IDs in the system are smaller than the returned value. 9975 */ 9976 __bpf_kfunc u32 scx_bpf_nr_node_ids(void) 9977 { 9978 return nr_node_ids; 9979 } 9980 9981 /** 9982 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs 9983 * 9984 * All valid CPU IDs in the system are smaller than the returned value. 9985 */ 9986 __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) 9987 { 9988 return nr_cpu_ids; 9989 } 9990 9991 /** 9992 * scx_bpf_nr_cids - Return the size of the cid space 9993 * 9994 * Equals num_possible_cpus(). All valid cids are in [0, return value). 
9995 */ 9996 __bpf_kfunc u32 scx_bpf_nr_cids(void) 9997 { 9998 return num_possible_cpus(); 9999 } 10000 10001 /** 10002 * scx_bpf_nr_online_cids - Return current count of online CPUs in cid space 10003 * 10004 * Return num_online_cpus(). The standard model restarts the scheduler on 10005 * hotplug, which lets schedulers treat [0, nr_online_cids) as the online 10006 * range. Schedulers that prefer to handle hotplug without a restart should 10007 * install a custom mapping via scx_bpf_cid_override() and track onlining 10008 * through the ops.cid_online / ops.cid_offline callbacks. 10009 */ 10010 __bpf_kfunc u32 scx_bpf_nr_online_cids(void) 10011 { 10012 return num_online_cpus(); 10013 } 10014 10015 /** 10016 * scx_bpf_this_cid - Return the cid of the CPU this program is running on 10017 * 10018 * cid-addressed equivalent of bpf_get_smp_processor_id() for scx programs. 10019 * The current cpu is trivially valid, so this is just a table lookup. Return 10020 * -EINVAL if called from a non-SCX program before any scheduler has ever 10021 * been enabled (the cid table is still unallocated at that point). 
10022 */ 10023 __bpf_kfunc s32 scx_bpf_this_cid(void) 10024 { 10025 s16 *tbl = READ_ONCE(scx_cpu_to_cid_tbl); 10026 10027 if (!tbl) 10028 return -EINVAL; 10029 return tbl[raw_smp_processor_id()]; 10030 } 10031 10032 /** 10033 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask 10034 */ 10035 __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) 10036 { 10037 return cpu_possible_mask; 10038 } 10039 10040 /** 10041 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask 10042 */ 10043 __bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void) 10044 { 10045 return cpu_online_mask; 10046 } 10047 10048 /** 10049 * scx_bpf_put_cpumask - Release a possible/online cpumask 10050 * @cpumask: cpumask to release 10051 */ 10052 __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) 10053 { 10054 /* 10055 * Empty function body because we aren't actually acquiring or releasing 10056 * a reference to a global cpumask, which is read-only in the caller and 10057 * is never released. The acquire / release semantics here are just used 10058 * to make the cpumask is a trusted pointer in the caller. 10059 */ 10060 } 10061 10062 /** 10063 * scx_bpf_task_running - Is task currently running? 10064 * @p: task of interest 10065 */ 10066 __bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p) 10067 { 10068 return task_rq(p)->curr == p; 10069 } 10070 10071 /** 10072 * scx_bpf_task_cpu - CPU a task is currently associated with 10073 * @p: task of interest 10074 */ 10075 __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) 10076 { 10077 return task_cpu(p); 10078 } 10079 10080 /** 10081 * scx_bpf_task_cid - cid a task is currently associated with 10082 * @p: task of interest 10083 * 10084 * cid-addressed equivalent of scx_bpf_task_cpu(). task_cpu(p) is always a 10085 * valid cpu, so this is just a table lookup. 
Return -EINVAL if called from 10086 * a non-SCX program before any scheduler has ever been enabled. 10087 */ 10088 __bpf_kfunc s32 scx_bpf_task_cid(const struct task_struct *p) 10089 { 10090 s16 *tbl = READ_ONCE(scx_cpu_to_cid_tbl); 10091 10092 if (!tbl) 10093 return -EINVAL; 10094 return tbl[task_cpu(p)]; 10095 } 10096 10097 /** 10098 * scx_bpf_cpu_rq - Fetch the rq of a CPU 10099 * @cpu: CPU of the rq 10100 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10101 */ 10102 __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux) 10103 { 10104 struct scx_sched *sch; 10105 10106 guard(rcu)(); 10107 10108 sch = scx_prog_sched(aux); 10109 if (unlikely(!sch)) 10110 return NULL; 10111 10112 if (!scx_cpu_valid(sch, cpu, NULL)) 10113 return NULL; 10114 10115 if (!sch->warned_deprecated_rq) { 10116 printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; " 10117 "use scx_bpf_locked_rq() when holding rq lock " 10118 "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__); 10119 sch->warned_deprecated_rq = true; 10120 } 10121 10122 return cpu_rq(cpu); 10123 } 10124 10125 /** 10126 * scx_bpf_locked_rq - Return the rq currently locked by SCX 10127 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10128 * 10129 * Returns the rq if a rq lock is currently held by SCX. 10130 * Otherwise emits an error and returns NULL. 
10131 */ 10132 __bpf_kfunc struct rq *scx_bpf_locked_rq(const struct bpf_prog_aux *aux) 10133 { 10134 struct scx_sched *sch; 10135 struct rq *rq; 10136 10137 guard(preempt)(); 10138 10139 sch = scx_prog_sched(aux); 10140 if (unlikely(!sch)) 10141 return NULL; 10142 10143 rq = scx_locked_rq(); 10144 if (!rq) { 10145 scx_error(sch, "accessing rq without holding rq lock"); 10146 return NULL; 10147 } 10148 10149 return rq; 10150 } 10151 10152 /** 10153 * scx_bpf_cpu_curr - Return remote CPU's curr task 10154 * @cpu: CPU of interest 10155 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10156 * 10157 * Callers must hold RCU read lock (KF_RCU). 10158 */ 10159 __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_aux *aux) 10160 { 10161 struct scx_sched *sch; 10162 10163 guard(rcu)(); 10164 10165 sch = scx_prog_sched(aux); 10166 if (unlikely(!sch)) 10167 return NULL; 10168 10169 if (!scx_cpu_valid(sch, cpu, NULL)) 10170 return NULL; 10171 10172 return rcu_dereference(cpu_rq(cpu)->curr); 10173 } 10174 10175 /** 10176 * scx_bpf_cid_curr - Return the curr task on the CPU at @cid 10177 * @cid: cid of interest 10178 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10179 * 10180 * cid-addressed equivalent of scx_bpf_cpu_curr(). Callers must hold RCU 10181 * read lock (KF_RCU). 10182 */ 10183 __bpf_kfunc struct task_struct *scx_bpf_cid_curr(s32 cid, const struct bpf_prog_aux *aux) 10184 { 10185 struct scx_sched *sch; 10186 s32 cpu; 10187 10188 guard(rcu)(); 10189 10190 sch = scx_prog_sched(aux); 10191 if (unlikely(!sch)) 10192 return NULL; 10193 cpu = scx_cid_to_cpu(sch, cid); 10194 if (cpu < 0) 10195 return NULL; 10196 return rcu_dereference(cpu_rq(cpu)->curr); 10197 } 10198 10199 /** 10200 * scx_bpf_tid_to_task - Look up a task by its scx tid 10201 * @tid: task ID previously read from p->scx.tid 10202 * 10203 * Returns the task with the given tid, or NULL if no such task exists. 
The 10204 * returned pointer is valid until the end of the current RCU read section 10205 * (KF_RCU_PROTECTED). Requires SCX_OPS_TID_TO_TASK to be set on the root 10206 * scheduler; otherwise an error is raised and NULL returned. 10207 */ 10208 __bpf_kfunc struct task_struct *scx_bpf_tid_to_task(u64 tid) 10209 { 10210 struct sched_ext_entity *scx; 10211 10212 if (!scx_tid_to_task_enabled()) { 10213 struct scx_sched *sch = rcu_dereference(scx_root); 10214 10215 if (sch) 10216 scx_error(sch, "scx_bpf_tid_to_task() called without SCX_OPS_TID_TO_TASK"); 10217 return NULL; 10218 } 10219 10220 scx = rhashtable_lookup(&scx_tid_hash, &tid, scx_tid_hash_params); 10221 if (!scx) 10222 return NULL; 10223 10224 return container_of(scx, struct task_struct, scx); 10225 } 10226 10227 /** 10228 * scx_bpf_now - Returns a high-performance monotonically non-decreasing 10229 * clock for the current CPU. The clock returned is in nanoseconds. 10230 * 10231 * It provides the following properties: 10232 * 10233 * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently 10234 * to account for execution time and track tasks' runtime properties. 10235 * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which 10236 * eventually reads a hardware timestamp counter -- is neither performant nor 10237 * scalable. scx_bpf_now() aims to provide a high-performance clock by 10238 * using the rq clock in the scheduler core whenever possible. 10239 * 10240 * 2) High enough resolution for the BPF scheduler use cases: In most BPF 10241 * scheduler use cases, the required clock resolution is lower than the most 10242 * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically 10243 * uses the rq clock in the scheduler core whenever it is valid. It considers 10244 * that the rq clock is valid from the time the rq clock is updated 10245 * (update_rq_clock) until the rq is unlocked (rq_unpin_lock). 
10246 * 10247 * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now() 10248 * guarantees the clock never goes backward when comparing them in the same 10249 * CPU. On the other hand, when comparing clocks in different CPUs, there 10250 * is no such guarantee -- the clock can go backward. It provides a 10251 * monotonically *non-decreasing* clock so that it would provide the same 10252 * clock values in two different scx_bpf_now() calls in the same CPU 10253 * during the same period of when the rq clock is valid. 10254 */ 10255 __bpf_kfunc u64 scx_bpf_now(void) 10256 { 10257 struct rq *rq; 10258 u64 clock; 10259 10260 preempt_disable(); 10261 10262 rq = this_rq(); 10263 if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) { 10264 /* 10265 * If the rq clock is valid, use the cached rq clock. 10266 * 10267 * Note that scx_bpf_now() is re-entrant between a process 10268 * context and an interrupt context (e.g., timer interrupt). 10269 * However, we don't need to consider the race between them 10270 * because such race is not observable from a caller. 10271 */ 10272 clock = READ_ONCE(rq->scx.clock); 10273 } else { 10274 /* 10275 * Otherwise, return a fresh rq clock. 10276 * 10277 * The rq clock is updated outside of the rq lock. 10278 * In this case, keep the updated rq clock invalid so the next 10279 * kfunc call outside the rq lock gets a fresh rq clock. 10280 */ 10281 clock = sched_clock_cpu(cpu_of(rq)); 10282 } 10283 10284 preempt_enable(); 10285 10286 return clock; 10287 } 10288 10289 static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events) 10290 { 10291 struct scx_event_stats *e_cpu; 10292 int cpu; 10293 10294 /* Aggregate per-CPU event counters into @events. 
*/ 10295 memset(events, 0, sizeof(*events)); 10296 for_each_possible_cpu(cpu) { 10297 e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; 10298 scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); 10299 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 10300 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); 10301 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING); 10302 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 10303 scx_agg_event(events, e_cpu, SCX_EV_REENQ_IMMED); 10304 scx_agg_event(events, e_cpu, SCX_EV_REENQ_LOCAL_REPEAT); 10305 scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL); 10306 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION); 10307 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH); 10308 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE); 10309 scx_agg_event(events, e_cpu, SCX_EV_INSERT_NOT_OWNED); 10310 scx_agg_event(events, e_cpu, SCX_EV_SUB_BYPASS_DISPATCH); 10311 } 10312 } 10313 10314 /* 10315 * scx_bpf_events - Get a system-wide event counter to 10316 * @events: output buffer from a BPF program 10317 * @events__sz: @events len, must end in '__sz'' for the verifier 10318 */ 10319 __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, 10320 size_t events__sz) 10321 { 10322 struct scx_sched *sch; 10323 struct scx_event_stats e_sys; 10324 10325 rcu_read_lock(); 10326 sch = rcu_dereference(scx_root); 10327 if (sch) 10328 scx_read_events(sch, &e_sys); 10329 else 10330 memset(&e_sys, 0, sizeof(e_sys)); 10331 rcu_read_unlock(); 10332 10333 /* 10334 * We cannot entirely trust a BPF-provided size since a BPF program 10335 * might be compiled against a different vmlinux.h, of which 10336 * scx_event_stats would be larger (a newer vmlinux.h) or smaller 10337 * (an older vmlinux.h). Hence, we use the smaller size to avoid 10338 * memory corruption. 
10339 */ 10340 events__sz = min(events__sz, sizeof(*events)); 10341 memcpy(events, &e_sys, events__sz); 10342 } 10343 10344 #ifdef CONFIG_CGROUP_SCHED 10345 /** 10346 * scx_bpf_task_cgroup - Return the sched cgroup of a task 10347 * @p: task of interest 10348 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10349 * 10350 * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with 10351 * from the scheduler's POV. SCX operations should use this function to 10352 * determine @p's current cgroup as, unlike following @p->cgroups, 10353 * @p->sched_task_group is stable for the duration of the SCX op. See 10354 * SCX_CALL_OP_TASK() for details. 10355 */ 10356 __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p, 10357 const struct bpf_prog_aux *aux) 10358 { 10359 struct task_group *tg = p->sched_task_group; 10360 struct cgroup *cgrp = &cgrp_dfl_root.cgrp; 10361 struct scx_sched *sch; 10362 10363 guard(rcu)(); 10364 10365 sch = scx_prog_sched(aux); 10366 if (unlikely(!sch)) 10367 goto out; 10368 10369 if (!scx_kf_arg_task_ok(sch, p)) 10370 goto out; 10371 10372 cgrp = tg_cgrp(tg); 10373 10374 out: 10375 cgroup_get(cgrp); 10376 return cgrp; 10377 } 10378 #endif /* CONFIG_CGROUP_SCHED */ 10379 10380 __bpf_kfunc_end_defs(); 10381 10382 BTF_KFUNCS_START(scx_kfunc_ids_any) 10383 BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); 10384 BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); 10385 BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) 10386 BTF_ID_FLAGS(func, scx_bpf_kick_cid, KF_IMPLICIT_ARGS) 10387 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS) 10388 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS) 10389 BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) 10390 BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS) 10391 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS) 10392 
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_IMPLICIT_ARGS | KF_ITER_NEW | KF_RCU_PROTECTED) 10393 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) 10394 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) 10395 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS) 10396 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS) 10397 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS) 10398 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) 10399 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) 10400 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) 10401 BTF_ID_FLAGS(func, scx_bpf_cidperf_cap, KF_IMPLICIT_ARGS) 10402 BTF_ID_FLAGS(func, scx_bpf_cidperf_cur, KF_IMPLICIT_ARGS) 10403 BTF_ID_FLAGS(func, scx_bpf_cidperf_set, KF_IMPLICIT_ARGS) 10404 BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) 10405 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) 10406 BTF_ID_FLAGS(func, scx_bpf_nr_cids) 10407 BTF_ID_FLAGS(func, scx_bpf_nr_online_cids) 10408 BTF_ID_FLAGS(func, scx_bpf_this_cid) 10409 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) 10410 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) 10411 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) 10412 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) 10413 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) 10414 BTF_ID_FLAGS(func, scx_bpf_task_cid, KF_RCU) 10415 BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) 10416 BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL) 10417 BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) 10418 BTF_ID_FLAGS(func, scx_bpf_cid_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) 10419 BTF_ID_FLAGS(func, scx_bpf_tid_to_task, KF_RET_NULL | KF_RCU_PROTECTED) 10420 BTF_ID_FLAGS(func, scx_bpf_now) 10421 BTF_ID_FLAGS(func, scx_bpf_events) 10422 #ifdef CONFIG_CGROUP_SCHED 10423 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_IMPLICIT_ARGS | KF_RCU | KF_ACQUIRE) 10424 #endif 10425 
BTF_KFUNCS_END(scx_kfunc_ids_any) 10426 10427 static const struct btf_kfunc_id_set scx_kfunc_set_any = { 10428 .owner = THIS_MODULE, 10429 .set = &scx_kfunc_ids_any, 10430 .filter = scx_kfunc_context_filter, 10431 }; 10432 10433 /* 10434 * cpu-form kfuncs that are forbidden from cid-form schedulers 10435 * (bpf_sched_ext_ops_cid). Programs targeting the cid struct_ops type must 10436 * use the cid-form alternative (cid/cmask kfuncs). 10437 * 10438 * Membership overlaps with scx_kfunc_ids_{any,idle,select_cpu}; the filter 10439 * tests this set independently and rejects matches before the per-op 10440 * allow-list check runs. 10441 * 10442 * pahole/resolve_btfids scans every BTF_ID_FLAGS() at build time and 10443 * intersects flags across duplicate entries, so each entry must carry the 10444 * same flags as the kfunc's primary declaration; otherwise the flags get 10445 * dropped globally. 10446 */ 10447 BTF_KFUNCS_START(scx_kfunc_ids_cpu_only) 10448 BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) 10449 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) 10450 BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) 10451 BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) 10452 BTF_ID_FLAGS(func, scx_bpf_cpu_node, KF_IMPLICIT_ARGS) 10453 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) 10454 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) 10455 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) 10456 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) 10457 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) 10458 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) 10459 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) 10460 BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) 10461 BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) 10462 BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_IMPLICIT_ARGS | KF_ACQUIRE) 10463 BTF_ID_FLAGS(func, 
scx_bpf_get_idle_cpumask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE) 10464 BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_IMPLICIT_ARGS | KF_ACQUIRE) 10465 BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE) 10466 BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) 10467 BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle, KF_IMPLICIT_ARGS) 10468 BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_IMPLICIT_ARGS | KF_RCU) 10469 BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_IMPLICIT_ARGS | KF_RCU) 10470 BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_IMPLICIT_ARGS | KF_RCU) 10471 BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_IMPLICIT_ARGS | KF_RCU) 10472 BTF_KFUNCS_END(scx_kfunc_ids_cpu_only) 10473 10474 /* 10475 * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc 10476 * group; an op may permit zero or more groups, with the union expressed in 10477 * scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter()) 10478 * consults this table to decide whether a context-sensitive kfunc is callable 10479 * from a given SCX op. 10480 */ 10481 enum scx_kf_allow_flags { 10482 SCX_KF_ALLOW_UNLOCKED = 1 << 0, 10483 SCX_KF_ALLOW_INIT = 1 << 1, 10484 SCX_KF_ALLOW_CPU_RELEASE = 1 << 2, 10485 SCX_KF_ALLOW_DISPATCH = 1 << 3, 10486 SCX_KF_ALLOW_ENQUEUE = 1 << 4, 10487 SCX_KF_ALLOW_SELECT_CPU = 1 << 5, 10488 }; 10489 10490 /* 10491 * Map each SCX op to the union of kfunc groups it permits, indexed by 10492 * SCX_OP_IDX(op). Ops not listed only permit kfuncs that are not 10493 * context-sensitive. 
10494 */ 10495 static const u32 scx_kf_allow_flags[] = { 10496 [SCX_OP_IDX(select_cpu)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, 10497 [SCX_OP_IDX(enqueue)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, 10498 [SCX_OP_IDX(dispatch)] = SCX_KF_ALLOW_ENQUEUE | SCX_KF_ALLOW_DISPATCH, 10499 [SCX_OP_IDX(cpu_release)] = SCX_KF_ALLOW_CPU_RELEASE, 10500 [SCX_OP_IDX(init_task)] = SCX_KF_ALLOW_UNLOCKED, 10501 [SCX_OP_IDX(dump)] = SCX_KF_ALLOW_UNLOCKED, 10502 #ifdef CONFIG_EXT_GROUP_SCHED 10503 [SCX_OP_IDX(cgroup_init)] = SCX_KF_ALLOW_UNLOCKED, 10504 [SCX_OP_IDX(cgroup_exit)] = SCX_KF_ALLOW_UNLOCKED, 10505 [SCX_OP_IDX(cgroup_prep_move)] = SCX_KF_ALLOW_UNLOCKED, 10506 [SCX_OP_IDX(cgroup_cancel_move)] = SCX_KF_ALLOW_UNLOCKED, 10507 [SCX_OP_IDX(cgroup_set_weight)] = SCX_KF_ALLOW_UNLOCKED, 10508 [SCX_OP_IDX(cgroup_set_bandwidth)] = SCX_KF_ALLOW_UNLOCKED, 10509 [SCX_OP_IDX(cgroup_set_idle)] = SCX_KF_ALLOW_UNLOCKED, 10510 #endif /* CONFIG_EXT_GROUP_SCHED */ 10511 [SCX_OP_IDX(sub_attach)] = SCX_KF_ALLOW_UNLOCKED, 10512 [SCX_OP_IDX(sub_detach)] = SCX_KF_ALLOW_UNLOCKED, 10513 [SCX_OP_IDX(cpu_online)] = SCX_KF_ALLOW_UNLOCKED, 10514 [SCX_OP_IDX(cpu_offline)] = SCX_KF_ALLOW_UNLOCKED, 10515 [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED | SCX_KF_ALLOW_INIT, 10516 [SCX_OP_IDX(exit)] = SCX_KF_ALLOW_UNLOCKED, 10517 }; 10518 10519 /* 10520 * Verifier-time filter for SCX kfuncs. Registered via the .filter field on 10521 * each per-group btf_kfunc_id_set. The BPF core invokes this for every kfunc 10522 * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or 10523 * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the 10524 * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by 10525 * falling through to "allow" when none of the SCX sets contain the kfunc. 
10526 */ 10527 int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) 10528 { 10529 bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id); 10530 bool in_init = btf_id_set8_contains(&scx_kfunc_ids_init, kfunc_id); 10531 bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id); 10532 bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); 10533 bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); 10534 bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); 10535 bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id); 10536 bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id); 10537 bool in_cpu_only = btf_id_set8_contains(&scx_kfunc_ids_cpu_only, kfunc_id); 10538 u32 moff, flags; 10539 10540 /* Not an SCX kfunc - allow. */ 10541 if (!(in_unlocked || in_init || in_select_cpu || in_enqueue || in_dispatch || 10542 in_cpu_release || in_idle || in_any)) 10543 return 0; 10544 10545 /* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */ 10546 if (prog->type == BPF_PROG_TYPE_SYSCALL) 10547 return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES; 10548 10549 if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) 10550 return (in_any || in_idle) ? 0 : -EACCES; 10551 10552 /* 10553 * add_subprog_and_kfunc() collects all kfunc calls, including dead code 10554 * guarded by bpf_ksym_exists(), before check_attach_btf_id() sets 10555 * prog->aux->st_ops. Allow all kfuncs when st_ops is not yet set; 10556 * do_check_main() re-runs the filter with st_ops set and enforces the 10557 * actual restrictions. 10558 */ 10559 if (!prog->aux->st_ops) 10560 return 0; 10561 10562 /* 10563 * Non-SCX struct_ops: SCX kfuncs are not permitted. 10564 * 10565 * Both bpf_sched_ext_ops (cpu-form) and bpf_sched_ext_ops_cid 10566 * (cid-form) are valid SCX struct_ops. 
Member offsets match between 10567 * the two (verified by BUILD_BUG_ON in scx_init()), so the shared 10568 * scx_kf_allow_flags[] table indexed by SCX_MOFF_IDX(moff) applies to 10569 * both. 10570 */ 10571 if (prog->aux->st_ops != &bpf_sched_ext_ops && 10572 prog->aux->st_ops != &bpf_sched_ext_ops_cid) 10573 return -EACCES; 10574 10575 /* 10576 * cid-form schedulers must use cid/cmask kfuncs. cid and cpu are both 10577 * small s32s and trivially confused, so cpu-only kfuncs are rejected at 10578 * load time. The reverse (cpu-form calling cid-form kfuncs) is 10579 * intentionally permissive to ease gradual cpumask -> cid migration. 10580 */ 10581 if (prog->aux->st_ops == &bpf_sched_ext_ops_cid && in_cpu_only) 10582 return -EACCES; 10583 10584 /* SCX struct_ops: check the per-op allow list. */ 10585 if (in_any || in_idle) 10586 return 0; 10587 10588 moff = prog->aux->attach_st_ops_member_off; 10589 flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)]; 10590 10591 if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked) 10592 return 0; 10593 if ((flags & SCX_KF_ALLOW_INIT) && in_init) 10594 return 0; 10595 if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release) 10596 return 0; 10597 if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch) 10598 return 0; 10599 if ((flags & SCX_KF_ALLOW_ENQUEUE) && in_enqueue) 10600 return 0; 10601 if ((flags & SCX_KF_ALLOW_SELECT_CPU) && in_select_cpu) 10602 return 0; 10603 10604 return -EACCES; 10605 } 10606 10607 static int __init scx_init(void) 10608 { 10609 int ret; 10610 10611 /* 10612 * sched_ext_ops_cid mirrors sched_ext_ops up to and including @priv. 10613 * Both bpf_scx_init_member() and bpf_scx_check_member() use offsets 10614 * from struct sched_ext_ops; sched_ext_ops_cid relies on those offsets 10615 * matching for the shared fields. Catch any drift at boot. 
10616 */ 10617 #define CID_OFFSET_MATCH(cpu_field, cid_field) \ 10618 BUILD_BUG_ON(offsetof(struct sched_ext_ops, cpu_field) != \ 10619 offsetof(struct sched_ext_ops_cid, cid_field)) 10620 /* data fields used by bpf_scx_init_member() */ 10621 CID_OFFSET_MATCH(dispatch_max_batch, dispatch_max_batch); 10622 CID_OFFSET_MATCH(flags, flags); 10623 CID_OFFSET_MATCH(name, name); 10624 CID_OFFSET_MATCH(timeout_ms, timeout_ms); 10625 CID_OFFSET_MATCH(exit_dump_len, exit_dump_len); 10626 CID_OFFSET_MATCH(hotplug_seq, hotplug_seq); 10627 CID_OFFSET_MATCH(sub_cgroup_id, sub_cgroup_id); 10628 /* shared callbacks: the union view requires byte-for-byte offset match */ 10629 CID_OFFSET_MATCH(enqueue, enqueue); 10630 CID_OFFSET_MATCH(dequeue, dequeue); 10631 CID_OFFSET_MATCH(dispatch, dispatch); 10632 CID_OFFSET_MATCH(tick, tick); 10633 CID_OFFSET_MATCH(runnable, runnable); 10634 CID_OFFSET_MATCH(running, running); 10635 CID_OFFSET_MATCH(stopping, stopping); 10636 CID_OFFSET_MATCH(quiescent, quiescent); 10637 CID_OFFSET_MATCH(yield, yield); 10638 CID_OFFSET_MATCH(core_sched_before, core_sched_before); 10639 CID_OFFSET_MATCH(set_weight, set_weight); 10640 CID_OFFSET_MATCH(update_idle, update_idle); 10641 CID_OFFSET_MATCH(init_task, init_task); 10642 CID_OFFSET_MATCH(exit_task, exit_task); 10643 CID_OFFSET_MATCH(enable, enable); 10644 CID_OFFSET_MATCH(disable, disable); 10645 CID_OFFSET_MATCH(dump, dump); 10646 CID_OFFSET_MATCH(dump_task, dump_task); 10647 CID_OFFSET_MATCH(sub_attach, sub_attach); 10648 CID_OFFSET_MATCH(sub_detach, sub_detach); 10649 CID_OFFSET_MATCH(init, init); 10650 CID_OFFSET_MATCH(exit, exit); 10651 #ifdef CONFIG_EXT_GROUP_SCHED 10652 CID_OFFSET_MATCH(cgroup_init, cgroup_init); 10653 CID_OFFSET_MATCH(cgroup_exit, cgroup_exit); 10654 CID_OFFSET_MATCH(cgroup_prep_move, cgroup_prep_move); 10655 CID_OFFSET_MATCH(cgroup_move, cgroup_move); 10656 CID_OFFSET_MATCH(cgroup_cancel_move, cgroup_cancel_move); 10657 CID_OFFSET_MATCH(cgroup_set_weight, cgroup_set_weight); 
10658 CID_OFFSET_MATCH(cgroup_set_bandwidth, cgroup_set_bandwidth); 10659 CID_OFFSET_MATCH(cgroup_set_idle, cgroup_set_idle); 10660 #endif 10661 /* renamed callbacks must occupy the same slot as their cpu-form sibling */ 10662 CID_OFFSET_MATCH(select_cpu, select_cid); 10663 CID_OFFSET_MATCH(set_cpumask, set_cmask); 10664 CID_OFFSET_MATCH(cpu_online, cid_online); 10665 CID_OFFSET_MATCH(cpu_offline, cid_offline); 10666 CID_OFFSET_MATCH(dump_cpu, dump_cid); 10667 /* @priv tail must align since both share the same data block */ 10668 CID_OFFSET_MATCH(priv, priv); 10669 /* 10670 * cid-form must end exactly at @priv - validate_ops() skips 10671 * cpu_acquire/cpu_release for cid-form because reading those fields 10672 * past the BPF allocation would be UB. 10673 */ 10674 BUILD_BUG_ON(offsetof(struct sched_ext_ops_cid, __end) != 10675 offsetofend(struct sched_ext_ops, priv)); 10676 #undef CID_OFFSET_MATCH 10677 10678 /* 10679 * kfunc registration can't be done from init_sched_ext_class() as 10680 * register_btf_kfunc_id_set() needs most of the system to be up. 10681 * 10682 * Some kfuncs are context-sensitive and can only be called from 10683 * specific SCX ops. They are grouped into per-context BTF sets, each 10684 * registered with scx_kfunc_context_filter as its .filter callback. The 10685 * BPF core dedups identical filter pointers per hook 10686 * (btf_populate_kfunc_set()), so the filter is invoked exactly once per 10687 * kfunc lookup; it consults scx_kf_allow_flags[] to enforce per-op 10688 * restrictions at verify time. 
10689 */ 10690 if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10691 &scx_kfunc_set_enqueue_dispatch)) || 10692 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10693 &scx_kfunc_set_dispatch)) || 10694 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10695 &scx_kfunc_set_cpu_release)) || 10696 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10697 &scx_kfunc_set_unlocked)) || 10698 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 10699 &scx_kfunc_set_unlocked)) || 10700 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10701 &scx_kfunc_set_any)) || 10702 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, 10703 &scx_kfunc_set_any)) || 10704 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 10705 &scx_kfunc_set_any))) { 10706 pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); 10707 return ret; 10708 } 10709 10710 ret = scx_idle_init(); 10711 if (ret) { 10712 pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); 10713 return ret; 10714 } 10715 10716 ret = scx_cid_kfunc_init(); 10717 if (ret) { 10718 pr_err("sched_ext: Failed to register cid kfuncs (%d)\n", ret); 10719 return ret; 10720 } 10721 10722 ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); 10723 if (ret) { 10724 pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); 10725 return ret; 10726 } 10727 10728 ret = register_bpf_struct_ops(&bpf_sched_ext_ops_cid, sched_ext_ops_cid); 10729 if (ret) { 10730 pr_err("sched_ext: Failed to register cid struct_ops (%d)\n", ret); 10731 return ret; 10732 } 10733 10734 ret = register_pm_notifier(&scx_pm_notifier); 10735 if (ret) { 10736 pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); 10737 return ret; 10738 } 10739 10740 scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); 10741 if (!scx_kset) { 10742 pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); 10743 return -ENOMEM; 10744 } 10745 10746 ret = 
sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); 10747 if (ret < 0) { 10748 pr_err("sched_ext: Failed to add global attributes\n"); 10749 return ret; 10750 } 10751 10752 return 0; 10753 } 10754 __initcall(scx_init); 10755