Lines Matching +full:post +full:- +full:cursor

1 /* SPDX-License-Identifier: GPL-2.0 */
3 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
66 * tasks for the sub-sched being enabled. Use a global variable instead of a
67 * per-task field as all enables are serialized.
83 * interval is half of the shortest sch->watchdog_timeout.
114 * Non-NULL values are used for direct dispatch from enqueue path. A valid
143 s32 cursor;
150 .cpu = -1,
221 #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
226 return jiffies_to_msecs(at - now);
228 return -(long)jiffies_to_msecs(now - at);
233 return (s32)(a - b) < 0;
238 * scx_parent - Find the parent sched
245 if (sch->level)
246 return sch->ancestors[sch->level - 1];
252 * scx_next_descendant_pre - find the next descendant for pre-order walk
257 * visit for pre-order traversal of @root's descendants. @root is included in
273 next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling);
279 if (!list_is_last(&pos->sibling, &scx_parent(pos)->children))
295 rcu_assign_pointer(p->scx.sched, sch);
304 * scx_is_descendant - Test whether sched is a descendant
312 if (sch->level < ancestor->level)
314 return sch->ancestors[ancestor->level] == ancestor;
318 * scx_for_each_descendant_pre - pre-order walk of a sched's descendants
319 * @pos: iteration cursor
332 return &sch->pnode[cpu_to_node(cpu)]->global_dsq;
337 return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
342 if (p->sched_class == &stop_sched_class)
345 return __setscheduler_class(p->policy, p->prio);
350 return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
357 * If @sch is a sub-sched which is bypassing, its tasks should go into
359 * not-bypassing ancestor is responsible for scheduling all tasks from
360 * bypassing sub-trees. If all ancestors including root are bypassing,
364 * are re-enqueued after scx_bypassing() is turned on, guaranteeing that
375 * bypass_dsp_enabled - Check if bypass dispatch path is enabled
379 * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors
391 return unlikely(atomic_read(&sch->bypass_dsp_enable_depth));
395 * rq_is_open - Is the rq available for immediate execution of an SCX task?
409 * A higher-priority class task is either running or in the process of
412 if (sched_class_above(rq->next_class, &ext_sched_class))
417 * higher-priority class task waking up on it.
419 if (sched_class_above(&ext_sched_class, rq->next_class))
428 * be ready depending on whether the on-going dispatch decides to extend
432 if (rq->scx.flags & SCX_RQ_IN_BALANCE)
471 * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not
482 (sch)->ops.op(args); \
490 __typeof__((sch)->ops.op(args)) __ret; \
496 __ret = (sch)->ops.op(args); \
504 * and records them in current->scx.kf_tasks[] for the duration of the call. A
509 * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held -
512 * So if kf_tasks[] is set, @p's scheduler-protected fields are stable.
514 * kf_tasks[] can not stack, so task-based SCX ops must not nest. The
515 * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants
520 WARN_ON_ONCE(current->scx.kf_tasks[0]); \
521 current->scx.kf_tasks[0] = task; \
523 current->scx.kf_tasks[0] = NULL; \
528 __typeof__((sch)->ops.op(task, ##args)) __ret; \
529 WARN_ON_ONCE(current->scx.kf_tasks[0]); \
530 current->scx.kf_tasks[0] = task; \
532 current->scx.kf_tasks[0] = NULL; \
538 __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \
539 WARN_ON_ONCE(current->scx.kf_tasks[0]); \
540 current->scx.kf_tasks[0] = task0; \
541 current->scx.kf_tasks[1] = task1; \
543 current->scx.kf_tasks[0] = NULL; \
544 current->scx.kf_tasks[1] = NULL; \
552 if (unlikely((p != current->scx.kf_tasks[0] &&
553 p != current->scx.kf_tasks[1]))) {
575 * nldsq_next_task - Iterate to the next task in a non-local DSQ
576 * @dsq: non-local dsq being iterated
588 lockdep_assert_held(&dsq->lock);
591 list_node = &cur->scx.dsq_list.node;
593 list_node = &dsq->list;
598 list_node = list_node->prev;
600 list_node = list_node->next;
602 if (list_node == &dsq->list)
607 } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR);
617 * nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ
618 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR()
619 * @dsq: non-local dsq being iterated
621 * Find the next task in a cursor based iteration. The caller must have
622 * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock
625 * Only tasks which were queued before @cursor was initialized are visible. This
629 static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor,
632 bool rev = cursor->flags & SCX_DSQ_ITER_REV;
635 lockdep_assert_held(&dsq->lock);
636 BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR));
638 if (list_empty(&cursor->node))
641 p = container_of(cursor, struct task_struct, scx.dsq_list);
643 /* skip cursors and tasks that were queued after @cursor init */
646 } while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq)));
650 list_move_tail(&cursor->node, &p->scx.dsq_list.node);
652 list_move(&cursor->node, &p->scx.dsq_list.node);
654 list_del_init(&cursor->node);
661 * nldsq_cursor_lost_task - Test whether someone else took the task since iteration
662 * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR()
668 * dropped and re-acquired in between. Verify that no one else took or is in the
673 static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor,
678 lockdep_assert_held(&dsq->lock);
681 * @p could have already left @dsq, got re-enqueued, or be in the
684 if (unlikely(p->scx.dsq != dsq ||
685 u32_before(cursor->priv, p->scx.dsq_seq) ||
686 p->scx.holding_cpu >= 0))
697 * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
698 * dispatch order. BPF-visible iterator is opaque and larger to allow future
703 struct scx_dsq_list_node cursor;
716 return p->scx.flags & SCX_TASK_STATE_MASK;
733 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
747 WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]",
748 prev_state, state, p->comm, p->pid);
752 WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]",
753 prev_state, state, p->comm, p->pid);
755 p->scx.flags &= ~SCX_TASK_STATE_MASK;
756 p->scx.flags |= state;
763 struct sched_ext_entity cursor;
777 * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
785 * with scx_tasks_lock held and @iter->cursor inserted into scx_tasks.
788 * @iter->css_iter. The caller must be holding cgroup_lock() to prevent cgroup
811 iter->cgrp = cgrp;
812 iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self);
813 css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
814 &iter->css_iter);
820 iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
821 list_add(&iter->cursor.tasks_node, &scx_tasks);
822 iter->list_locked = true;
827 if (iter->locked_task) {
828 __balance_callbacks(iter->rq, &iter->rf);
829 task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
830 iter->locked_task = NULL;
835 * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
846 if (iter->list_locked) {
847 iter->list_locked = false;
854 if (!iter->list_locked) {
856 iter->list_locked = true;
861 * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
871 if (iter->cgrp) {
872 if (iter->css_pos)
873 css_task_iter_end(&iter->css_iter);
879 list_del_init(&iter->cursor.tasks_node);
884 * scx_task_iter_next - Next task
888 * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls
893 struct list_head *cursor = &iter->cursor.tasks_node;
896 if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
902 if (iter->cgrp) {
903 while (iter->css_pos) {
906 p = css_task_iter_next(&iter->css_iter);
910 css_task_iter_end(&iter->css_iter);
911 iter->css_pos = css_next_descendant_pre(iter->css_pos,
912 &iter->cgrp->self);
913 if (iter->css_pos)
914 css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
915 &iter->css_iter);
922 list_for_each_entry(pos, cursor, tasks_node) {
923 if (&pos->tasks_node == &scx_tasks)
925 if (!(pos->flags & SCX_TASK_CURSOR)) {
926 list_move(cursor, &pos->tasks_node);
936 * scx_task_iter_next_locked - Next non-idle task with its rq locked
939 * Visit the non-idle task with its rq lock held. Allows callers to specify
952 * while loading the BPF scheduler and vice-versa while
956 * - It's unsafe to use __setscheduler_prio() on an init_task to
960 * - ops.init/exit_task() can easily be confused if called with
967 * - %PF_IDLE may not be set for an init_task whose CPU hasn't
970 * - %PF_IDLE can be set on tasks that are not init_tasks. See
975 if (p->sched_class == &idle_sched_class)
978 iter->rq = task_rq_lock(p, &iter->rf);
979 iter->locked_task = p;
982 * cgroup_task_dead() removes the dead tasks from cset->tasks
999 * scx_add_event - Increase an event counter for 'name' by 'cnt'
1007 this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
1012 * __scx_add_event - Increase an event counter for 'name' by 'cnt'
1020 __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
1025 * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
1031 (dst_e)->kind += READ_ONCE((src_e)->kind); \
1035 * scx_dump_event - Dump an event 'kind' in 'events' to 's'
1041 dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \
1067 * wait_ops_state - Busy-wait the specified ops state to end
1071 * Busy-wait for @p to transition out of @opss. This can only be used when the
1080 } while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
1089 * ops_cpu_valid - Verify a cpu number, to be used on ops input args
1109 * ops_sanitize_err - Sanitize a -errno value
1112 * @err: -errno value to sanitize
1114 * Verify @err is a valid -errno. If not, trigger scx_error() and return
1115 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
1123 if (err < 0 && err >= -MAX_ERRNO)
1127 return -EPROTO;
1145 * schedule_deferred - Schedule execution of deferred actions on an rq
1161 * correctly - the _locked() path already processes remote rqs from
1162 * the calling CPU - but targeting the owning CPU allows IPI delivery
1163 * without waiting for the calling CPU to re-enable IRQs and is
1166 irq_work_queue_on(&rq->scx.deferred_irq_work, cpu_of(rq));
1170 * schedule_deferred_locked - Schedule execution of deferred actions on an rq
1185 if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
1189 if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
1203 if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
1204 rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
1211 * time to IRQ re-enable shouldn't be long.
1225 if (unlikely(READ_ONCE(sch->bypass_depth)))
1228 if (dsq->id == SCX_DSQ_LOCAL) {
1231 struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq));
1232 struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local;
1240 if (list_empty(&drl->node) ||
1241 (READ_ONCE(drl->flags) & reenq_flags) != reenq_flags) {
1243 guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
1245 if (list_empty(&drl->node))
1246 list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals);
1247 WRITE_ONCE(drl->flags, drl->flags | reenq_flags);
1249 } else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) {
1252 struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq));
1253 struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user;
1261 if (list_empty(&dru->node) ||
1262 (READ_ONCE(dru->flags) & reenq_flags) != reenq_flags) {
1264 guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
1266 if (list_empty(&dru->node))
1267 list_move_tail(&dru->node, &rq->scx.deferred_reenq_users);
1268 WRITE_ONCE(dru->flags, dru->flags | reenq_flags);
1271 scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
1288 schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags, rq);
1292 * touch_core_sched - Update timestamp used for core-sched task ordering
1296 * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
1297 * implement global or local-DSQ FIFO ordering for core-sched. Should be called
1311 * it may be better to use per-core dispatch sequence instead.
1314 p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq));
1319 * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
1323 * If the BPF scheduler implements custom core-sched ordering via
1324 * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
1326 * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
1340 struct task_struct *curr = rq->curr;
1347 if (curr->scx.slice != SCX_SLICE_INF) {
1348 curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec);
1349 if (!curr->scx.slice)
1353 dl_server_update(&rq->ext_server, delta_exec);
1364 return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime);
1369 /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
1370 WRITE_ONCE(dsq->nr, dsq->nr + 1);
1381 if (unlikely(dsq->id != SCX_DSQ_LOCAL)) {
1385 p->scx.flags |= SCX_TASK_IMMED;
1388 if (p->scx.flags & SCX_TASK_IMMED) {
1391 if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
1394 rq->scx.nr_immed++;
1398 * done yet, @p can't go on the CPU immediately. Re-enqueue.
1400 if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags)))
1408 WRITE_ONCE(dsq->nr, dsq->nr - 1);
1410 if (p->scx.flags & SCX_TASK_IMMED) {
1413 if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) ||
1414 WARN_ON_ONCE(rq->scx.nr_immed <= 0))
1417 rq->scx.nr_immed--;
1423 p->scx.slice = READ_ONCE(sch->slice_dfl);
1437 * is only non-negative during an internal SCX migration.
1439 return p->scx.sticky_cpu >= 0;
1449 if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p))
1455 p->scx.flags &= ~SCX_TASK_IN_CUSTODY;
1467 * actually getting on CPU. This gives higher-class tasks (e.g. RT)
1474 * - First, a local task to this CPU's local DSQ;
1475 * - Second, a local/remote task to a remote CPU's local DSQ.
1486 * the tasks from a user-created DSQ.
1488 * We must detect these wakeups so that we can re-enqueue IMMED tasks
1491 * @rq->next_class to &ext_sched_class if it's currently idle.
1494 * @rq->next_class is below &ext_sched_class, it will also
1497 if (sched_class_above(p->sched_class, rq->next_class))
1505 * a resched if @rq->next_class was idle. It's harmless, since
1508 if (rq->scx.flags & SCX_RQ_IN_BALANCE)
1511 if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
1512 rq->curr->sched_class == &ext_sched_class) {
1513 rq->curr->scx.slice = 0;
1522 bool is_local = dsq->id == SCX_DSQ_LOCAL;
1524 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
1525 WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) ||
1526 !RB_EMPTY_NODE(&p->scx.dsq_priq));
1529 raw_spin_lock_nested(&dsq->lock,
1532 if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
1535 raw_spin_unlock(&dsq->lock);
1537 raw_spin_lock(&dsq->lock);
1541 if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
1546 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we
1550 scx_error(sch, "cannot use vtime ordering for built-in DSQs");
1562 if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
1564 scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks",
1565 dsq->id);
1567 p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
1568 rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);
1572 * that @dsq->list is vtime ordered.
1574 rbp = rb_prev(&p->scx.dsq_priq);
1579 list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
1580 /* first task unchanged - no update needed */
1582 list_add(&p->scx.dsq_list.node, &dsq->list);
1583 /* not builtin and new task is at head - use fastpath */
1584 rcu_assign_pointer(dsq->first_task, p);
1588 if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
1589 scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
1590 dsq->id);
1593 list_add(&p->scx.dsq_list.node, &dsq->list);
1594 /* new task inserted at head - use fastpath */
1595 if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
1596 rcu_assign_pointer(dsq->first_task, p);
1599 * dsq->list can contain parked BPF iterator cursors, so
1601 * task in the DSQ". Test dsq->first_task directly.
1603 list_add_tail(&p->scx.dsq_list.node, &dsq->list);
1604 if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
1605 rcu_assign_pointer(dsq->first_task, p);
1610 WRITE_ONCE(dsq->seq, dsq->seq + 1);
1611 p->scx.dsq_seq = dsq->seq;
1614 p->scx.dsq = dsq;
1619 * and dequeue_task_scx() will RMW p->scx.flags. If we clear
1620 * ops_state first, both sides would modify p->scx.flags
1621 * concurrently in a non-atomic way.
1628 * non-terminal DSQ: enter custody.
1630 if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS)
1633 p->scx.flags |= SCX_TASK_IN_CUSTODY;
1635 raw_spin_unlock(&dsq->lock);
1643 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1649 WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));
1651 if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) {
1652 rb_erase(&p->scx.dsq_priq, &dsq->priq);
1653 RB_CLEAR_NODE(&p->scx.dsq_priq);
1654 p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ;
1657 list_del_init(&p->scx.dsq_list.node);
1660 if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
1664 rcu_assign_pointer(dsq->first_task, first_task);
1670 struct scx_dispatch_q *dsq = p->scx.dsq;
1671 bool is_local = dsq == &rq->scx.local_dsq;
1677 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
1680 if (unlikely(!list_empty(&p->scx.dsq_list.node)))
1681 list_del_init(&p->scx.dsq_list.node);
1686 * @p->scx.holding_cpu may be set under the protection of
1689 if (p->scx.holding_cpu >= 0)
1690 p->scx.holding_cpu = -1;
1696 raw_spin_lock(&dsq->lock);
1699 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't
1702 if (p->scx.holding_cpu < 0) {
1708 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
1712 WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
1713 p->scx.holding_cpu = -1;
1715 p->scx.dsq = NULL;
1718 raw_spin_unlock(&dsq->lock);
1729 lockdep_assert_held(&dsq->lock);
1732 p->scx.dsq = NULL;
1742 return &rq->scx.local_dsq;
1750 return &cpu_rq(cpu)->scx.local_dsq;
1759 scx_error(sch, "non-existent DSQ 0x%llx", dsq_id);
1773 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value
1776 __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH));
1781 scx_error(sch, "%s[%d] already direct-dispatched",
1782 p->comm, p->pid);
1784 scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
1785 ddsp_task->comm, ddsp_task->pid,
1786 p->comm, p->pid);
1790 WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
1791 WARN_ON_ONCE(p->scx.ddsp_enq_flags);
1793 p->scx.ddsp_dsq_id = dsq_id;
1794 p->scx.ddsp_enq_flags = enq_flags;
1801 * - direct_dispatch(): cleared on the synchronous enqueue path, deferred
1803 * - process_ddsp_deferred_locals(): cleared after consuming deferred state,
1804 * - do_enqueue_task(): cleared on enqueue fallbacks where the dispatch
1806 * - dequeue_task_scx(): cleared after dispatch_dequeue(), covering deferred
1808 * - scx_disable_task(): cleared for queued wakeup tasks, which are excluded by
1814 p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
1815 p->scx.ddsp_enq_flags = 0;
1823 find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
1828 p->scx.ddsp_enq_flags |= enq_flags;
1836 if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) {
1839 opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;
1849 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1853 p->comm, p->pid, opss);
1854 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1858 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
1859 list_add_tail(&p->scx.dsq_list.node,
1860 &rq->scx.ddsp_deferred_locals);
1865 ddsp_enq_flags = p->scx.ddsp_enq_flags;
1880 return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq)));
1891 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
1893 /* internal movements - rq migration / RESTORE */
1899 * Note that exiting and migration-disabled tasks that skip
1903 p->scx.flags &= ~SCX_TASK_IMMED;
1918 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
1922 if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) &&
1923 unlikely(p->flags & PF_EXITING)) {
1929 if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
1939 qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;
1941 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
1942 atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
1951 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
1958 p->scx.flags |= SCX_TASK_IN_CUSTODY;
1964 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
1971 dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags);
1974 dsq = &rq->scx.local_dsq;
1985 * For task-ordering, slice refill must be treated as implying the end
1997 return !list_empty(&p->scx.runnable_node);
2004 if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
2005 p->scx.runnable_at = jiffies;
2006 p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
2013 list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
2018 list_del_init(&p->scx.runnable_node);
2020 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
2026 int sticky_cpu = p->scx.sticky_cpu;
2027 u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags;
2030 rq->scx.flags |= SCX_RQ_IN_WAKEUP;
2036 * direct-dispatch into the local DSQ by setting the sticky_cpu.
2041 if (p->scx.flags & SCX_TASK_QUEUED) {
2047 p->scx.flags |= SCX_TASK_QUEUED;
2048 rq->scx.nr_running++;
2058 if (rq->scx.nr_running == 1)
2059 dl_server_start(&rq->ext_server);
2064 p->scx.sticky_cpu = -1;
2066 rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
2069 unlikely(cpu_of(rq) != p->scx.selected_cpu))
2083 opss = atomic_long_read_acquire(&p->scx.ops_state);
2104 if (unlikely(!(READ_ONCE(p->scx.flags) & SCX_TASK_IN_CUSTODY))) {
2109 if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
2128 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
2156 * change (not sleep or core-sched pick).
2161 if (!(p->scx.flags & SCX_TASK_QUEUED)) {
2170 * and then stops running. As we want running <-> stopping transitions
2171 * to be contained within runnable <-> quiescent transitions, trigger
2172 * ->stopping() early here instead of in put_prev_task_scx().
2174 * @p may go through multiple stopping <-> running transitions between
2189 p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
2191 p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
2193 p->scx.flags &= ~SCX_TASK_QUEUED;
2194 rq->scx.nr_running--;
2204 struct task_struct *p = rq->donor;
2210 p->scx.slice = 0;
2215 struct task_struct *from = rq->donor;
2231 if (p->sched_class == &ext_sched_class)
2235 * Getting preempted by a higher-priority class. Reenqueue IMMED tasks.
2238 * - A SCX task is currently running.
2240 * - @rq is waking from idle due to a SCX task waking to it.
2242 * - A higher-priority task wakes up while SCX dispatch is in progress.
2244 if (rq->scx.nr_immed)
2253 struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq;
2256 lockdep_assert_held(&src_dsq->lock);
2259 WARN_ON_ONCE(p->scx.holding_cpu >= 0);
2262 list_add(&p->scx.dsq_list.node, &dst_dsq->list);
2264 list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list);
2267 p->scx.dsq = dst_dsq;
2273 * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
2288 * beginning of an SCX-internal migration.
2290 p->scx.sticky_cpu = cpu_of(dst_rq);
2298 * We want to pass scx-specific enq_flags but activate_task() will
2300 * @rq->scx.extra_enq_flags instead.
2302 WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr));
2303 WARN_ON_ONCE(dst_rq->scx.extra_enq_flags);
2304 dst_rq->scx.extra_enq_flags = enq_flags;
2306 dst_rq->scx.extra_enq_flags = 0;
2313 * - is_cpu_allowed() asks "Can this task run on this CPU?" while
2323 * - The BPF scheduler is bypassed while the rq is offline and we can always say
2337 * If @p has migration disabled, @p->cpus_ptr is updated to contain only
2339 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
2351 p->comm, p->pid, task_cpu(p), cpu);
2364 cpu, p->comm, p->pid);
2378 * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
2386 * non-local DSQ, it's better to use the same mechanism to protect against
2387 * dequeues and maintain the invariant that @p->scx.dsq can only change while
2395 * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets
2397 * would be cleared to -1. While other cpus may have updated it to different
2412 lockdep_assert_held(&dsq->lock);
2414 WARN_ON_ONCE(p->scx.holding_cpu >= 0);
2416 p->scx.holding_cpu = cpu;
2418 raw_spin_unlock(&dsq->lock);
2422 return likely(p->scx.holding_cpu == cpu) &&
2443 * move_task_between_dsqs() - Move a task from one DSQ to another
2465 BUG_ON(src_dsq->id == SCX_DSQ_LOCAL);
2466 lockdep_assert_held(&src_dsq->lock);
2469 if (dst_dsq->id == SCX_DSQ_LOCAL) {
2478 /* no need to migrate if destination is a non-local DSQ */
2486 if (dst_dsq->id == SCX_DSQ_LOCAL) {
2487 /* @p is going from a non-local DSQ to a local DSQ */
2492 raw_spin_unlock(&src_dsq->lock);
2494 raw_spin_unlock(&src_dsq->lock);
2500 * @p is going from a non-local DSQ to a non-local DSQ. As
2504 raw_spin_unlock(&src_dsq->lock);
2520 * @dsq->list without locking and skip if it seems empty.
2522 if (list_empty(&dsq->list))
2525 raw_spin_lock(&dsq->lock);
2535 * the system into the bypass mode. This can easily live-lock the
2536 * machine. If aborting, exit from all non-bypass DSQs.
2538 if (unlikely(READ_ONCE(sch->aborting)) && dsq->id != SCX_DSQ_BYPASS)
2544 raw_spin_unlock(&dsq->lock);
2555 raw_spin_unlock(&dsq->lock);
2563 return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq, 0);
2567 * dispatch_to_local_dsq - Dispatch a task to a local dsq
2615 * we're moving from a DSQ and use the same mechanism - mark the task
2619 p->scx.holding_cpu = raw_smp_processor_id();
2622 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
2632 if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
2640 p->scx.holding_cpu = -1;
2641 dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p,
2651 if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
2663 * finish_dispatch - Asynchronously finish dispatching a task
2695 opss = atomic_long_read(&p->scx.ops_state);
2705 * dispatch/dequeue and re-enqueue cycle between
2719 * it - the BPF scheduler is allowed to dispatch tasks
2724 if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
2739 BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
2743 if (dsq->id == SCX_DSQ_LOCAL)
2751 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
2754 for (u = 0; u < dspc->cursor; u++) {
2755 struct scx_dsp_buf_ent *ent = &dspc->buf[u];
2757 finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
2758 ent->enq_flags);
2761 dspc->nr_tasks += dspc->cursor;
2762 dspc->cursor = 0;
2769 if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
2772 queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
2775 rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
2780 * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
2787 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
2790 bool prev_on_sch = (prev->sched_class == &ext_sched_class) &&
2806 * tasks. The following implements a simple built-in behavior -
2810 * auto-consumption and a kfunc to consume the bypass DSQ and,
2814 struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
2816 if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
2827 dspc->rq = rq;
2837 dspc->nr_tasks = 0;
2843 rq->scx.sub_dispatch_prev = prev;
2845 rq->scx.sub_dispatch_prev = NULL;
2850 if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) {
2851 rq->scx.flags |= SCX_RQ_BAL_KEEP;
2854 if (rq->scx.local_dsq.nr)
2868 if (unlikely(!--nr_loops)) {
2872 } while (dspc->nr_tasks);
2891 rq->scx.flags |= SCX_RQ_IN_BALANCE;
2892 rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
2894 if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
2895 unlikely(rq->scx.cpu_released)) {
2899 * core. This callback complements ->cpu_release(), which is
2904 rq->scx.cpu_released = false;
2907 if (prev->sched_class == &ext_sched_class) {
2915 * implement ->cpu_release().
2920 if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice &&
2922 rq->scx.flags |= SCX_RQ_BAL_KEEP;
2928 if (rq->scx.local_dsq.nr)
2938 if ((prev->scx.flags & SCX_TASK_QUEUED) &&
2939 (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) {
2940 rq->scx.flags |= SCX_RQ_BAL_KEEP;
2944 rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
2951 * - rq_is_open() can't reliably tell when and how slice is going to be
2955 * - A non-IMMED HEAD task can get queued in front of an IMMED task
2958 if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed))
2961 rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
2969 if (p->scx.flags & SCX_TASK_QUEUED) {
2971 * Core-sched might decide to execute @p before it is
2978 p->se.exec_start = rq_clock_task(rq);
2981 if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
2990 if ((p->scx.slice == SCX_SLICE_INF) !=
2991 (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
2992 if (p->scx.slice == SCX_SLICE_INF)
2993 rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
2995 rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;
3003 * tick-stopped CPUs.
3024 const struct sched_class *next_class = next->sched_class;
3026 if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
3040 * sched_class, so invoke the ->cpu_release() callback if we have not
3044 * ->cpu_release() complements ->cpu_acquire(), which is emitted the
3047 if (!rq->scx.cpu_released) {
3056 rq->scx.cpu_released = true;
3066 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
3071 if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
3074 if (p->scx.flags & SCX_TASK_QUEUED) {
3079 * preempted by a higher priority scheduler class or core-sched
3084 if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) {
3085 if (p->scx.flags & SCX_TASK_IMMED) {
3086 p->scx.flags |= SCX_TASK_REENQ_PREEMPTED;
3087 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
3088 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
3090 dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD);
3099 * which should trigger an explicit follow-up scheduling event.
3101 if (next && sched_class_above(&ext_sched_class, next->sched_class)) {
3102 WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST));
3103 do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
3105 do_enqueue_task(rq, p, 0, -1);
3110 if (next && next->sched_class != &ext_sched_class)
3117 unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
3132 for_each_cpu(cpu, rq->scx.cpus_to_sync) {
3138 smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
3139 cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
3144 while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
3145 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
3158 return list_first_entry_or_null(&rq->scx.local_dsq.list,
3165 struct task_struct *prev = rq->curr;
3170 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
3184 if (unlikely(rq->scx.kick_sync_pending)) {
3185 rq->scx.kick_sync_pending = false;
3186 queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
3191 * If any higher-priority sched class enqueued a runnable task on
3196 * regardless of any higher-priority sched classes activity.
3201 keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
3203 prev->sched_class != &ext_sched_class)) {
3215 if (!p->scx.slice)
3222 if (unlikely(!p->scx.slice)) {
3226 !sch->warned_zero_slice) {
3228 p->comm, p->pid, __func__);
3229 sch->warned_zero_slice = true;
3255 return do_pick_task_scx(dl_se->rq, rf, true);
3263 struct sched_dl_entity *dl_se = &rq->ext_server;
3272 * scx_prio_less - Task ordering for core-sched
3277 * Core-sched is implemented as an additional scheduling layer on top of the
3279 * SCX, core-sched calls this function to interrogate the task ordering.
3281 * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
3283 * priority the task - the global FIFO ordering matching the default scheduling
3286 * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
3307 return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
3338 this_rq()->scx.in_select_cpu = true;
3340 this_rq()->scx.in_select_cpu = false;
3341 p->scx.selected_cpu = cpu;
3353 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
3357 p->scx.selected_cpu = cpu;
3381 * The effective cpumask is stored in @p->cpus_ptr which may temporarily
3382 * differ from the configured one in @p->cpus_mask. Always tell the bpf
3385 * Fine-grained memory write control is enforced by BPF making the const
3389 SCX_CALL_OP_TASK(sch, set_cpumask, task_rq(p), p, (struct cpumask *)p->cpus_ptr);
3408 scx_idle_update_selcpu_topology(&sch->ops);
3433 rq->scx.flags |= SCX_RQ_ONLINE;
3438 rq->scx.flags &= ~SCX_RQ_ONLINE;
3453 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
3455 unsigned long last_runnable = p->scx.runnable_at;
3458 last_runnable + READ_ONCE(sch->watchdog_timeout)))) {
3459 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
3463 p->comm, p->pid, dur_ms / 1000, dur_ms % 1000);
3506 last_check + READ_ONCE(root->watchdog_timeout)))) {
3507 u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
3524 * While disabling, always resched and refresh core-sched timestamp as
3528 curr->scx.slice = 0;
3534 if (!curr->scx.slice)
3543 * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the
3546 if (tg && tg->css.cgroup)
3547 return tg->css.cgroup;
3564 p->scx.disallow = false;
3579 if (p->scx.disallow) {
3581 scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]",
3582 p->comm, p->pid);
3584 scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
3585 p->comm, p->pid);
3593 * We're in the load path and @p->policy will be applied
3594 * right after. Reverting @p->policy here and rejecting
3596 * guarantees that if ops.init_task() sets @p->scx.disallow,
3599 if (p->policy == SCHED_EXT) {
3600 p->policy = SCHED_NORMAL;
3623 WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY);
3632 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO];
3634 p->scx.weight = sched_weight_to_cgroup(weight);
3640 SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight);
3667 WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY);
3677 lockdep_assert_held(&p->pi_lock);
3709 lockdep_assert_held(&p->pi_lock);
3724 * its parent. Exit for the child too - scx_enable_task() never ran for
3725 * it, so undo only init_task. The flag is only set on the sub-enable
3728 if (p->scx.flags & SCX_TASK_SUB_INIT) {
3731 p->scx.flags &= ~SCX_TASK_SUB_INIT;
3741 INIT_LIST_HEAD(&scx->dsq_list.node);
3742 RB_CLEAR_NODE(&scx->dsq_priq);
3743 scx->sticky_cpu = -1;
3744 scx->holding_cpu = -1;
3745 INIT_LIST_HEAD(&scx->runnable_node);
3746 scx->runnable_at = jiffies;
3747 scx->ddsp_dsq_id = SCX_DSQ_INVALID;
3748 scx->slice = SCX_SLICE_DFL;
3770 struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
3797 if (p->sched_class == &ext_sched_class) {
3808 list_add_tail(&p->scx.tasks_node, &scx_tasks);
3830 * task_dead_and_done - Is a task dead and done running?
3835 * invoked on these dead tasks leading to failures - e.g. sched_setscheduler()
3840 * that needs to happen on the task. Use this test to short-circuit sched_class
3854 return unlikely(READ_ONCE(p->__state) == TASK_DEAD) &&
3864 * for the last time and then dropped the rq lock - task_dead_and_done()
3869 list_del_init(&p->scx.tasks_node);
3873 * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
3876 * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see
3878 * iteration is only used from sub-sched paths, which require root
3882 * into ops; transition to %DEAD so the post-init recheck unwinds
3907 p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
3909 SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight);
3927 * different scheduler class. Keep the BPF scheduler up-to-date.
3930 SCX_CALL_OP_TASK(sch, set_cpumask, rq, p, (struct cpumask *)p->cpus_ptr);
3942 * scx_disable_task() would WARN on the non-%ENABLED state and trigger a
3943 * NONE -> READY validation failure.
3958 if (scx_enabled() && READ_ONCE(p->scx.disallow) &&
3959 p->policy != policy && policy == SCHED_EXT)
3960 return -EACCES;
3978 while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
3982 u64 dsq_id = p->scx.ddsp_dsq_id;
3983 u64 enq_flags = p->scx.ddsp_enq_flags;
3985 list_del_init(&p->scx.dsq_list.node);
3989 if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
3999 * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First"
4000 * tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at
4003 * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if
4008 * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head,
4026 if ((p->scx.flags & SCX_TASK_IMMED) &&
4051 * @rq->scx.local_dsq. Move all candidate tasks off to a private list
4054 list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
4062 * deactivate and re-activate @p anyway. Skip re-enqueueing.
4065 * re-enqueue a migrating task while its current CPU and allowed
4070 if (p->migration_pending)
4081 if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
4082 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
4083 p->scx.flags |= reason;
4085 list_add_tail(&p->scx.dsq_list.node, &tasks);
4089 list_del_init(&p->scx.dsq_list.node);
4091 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
4093 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
4102 u64 seq = ++rq->scx.deferred_reenq_locals_seq;
4111 scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
4113 list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
4123 sch = sch_pcpu->sch;
4125 reenq_flags = drl->flags;
4126 WRITE_ONCE(drl->flags, 0);
4127 list_del_init(&drl->node);
4129 if (likely(drl->seq != seq)) {
4130 drl->seq = seq;
4131 drl->cnt = 0;
4133 if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) {
4135 drl->cnt);
4161 struct scx_sched *sch = dsq->sched;
4162 struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
4168 raw_spin_lock(&dsq->lock);
4170 while (likely(!READ_ONCE(sch->bypass_depth))) {
4174 p = nldsq_cursor_next_task(&cursor, dsq);
4187 raw_spin_unlock(&dsq->lock);
4189 raw_spin_lock(&dsq->lock);
4194 if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
4200 raw_spin_unlock(&dsq->lock);
4202 if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
4203 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
4204 p->scx.flags |= reason;
4206 do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);
4208 p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
4216 raw_spin_lock(&dsq->lock);
4219 list_del_init(&cursor.node);
4220 raw_spin_unlock(&dsq->lock);
4237 scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
4239 list_first_entry_or_null(&rq->scx.deferred_reenq_users,
4249 dsq = dsq_pcpu->dsq;
4250 reenq_flags = dru->flags;
4251 WRITE_ONCE(dru->flags, 0);
4252 list_del_init(&dru->node);
4258 BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
4267 if (!list_empty(&rq->scx.deferred_reenq_locals))
4270 if (!list_empty(&rq->scx.deferred_reenq_users))
4277 struct task_struct *p = rq->curr;
4280 if (p->sched_class != &ext_sched_class)
4291 return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
4302 tg->scx.weight = CGROUP_WEIGHT_DFL;
4303 tg->scx.bw_period_us = default_bw_period_us();
4304 tg->scx.bw_quota_us = RUNTIME_INF;
4305 tg->scx.idle = false;
4313 WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
4318 { .weight = tg->scx.weight,
4319 .bw_period_us = tg->scx.bw_period_us,
4320 .bw_quota_us = tg->scx.bw_quota_us,
4321 .bw_burst_us = tg->scx.bw_burst_us };
4324 NULL, tg->css.cgroup, &args);
4329 tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED;
4331 tg->scx.flags |= SCX_TG_ONLINE;
4341 WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
4344 (tg->scx.flags & SCX_TG_INITED))
4345 SCX_CALL_OP(sch, cgroup_exit, NULL, tg->css.cgroup);
4346 tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
4363 WARN_ON_ONCE(p->scx.cgrp_moving_from);
4368 * always match one-to-one.
4375 p, from, css->cgroup);
4380 p->scx.cgrp_moving_from = from;
4388 p->scx.cgrp_moving_from)
4390 p, p->scx.cgrp_moving_from, css->cgroup);
4391 p->scx.cgrp_moving_from = NULL;
4407 * so it can hand an unchanged-cgroup task here with cgrp_moving_from
4411 if (SCX_HAS_OP(sch, cgroup_move) && p->scx.cgrp_moving_from)
4413 p, p->scx.cgrp_moving_from,
4415 p->scx.cgrp_moving_from = NULL;
4429 p->scx.cgrp_moving_from)
4431 p, p->scx.cgrp_moving_from, css->cgroup);
4432 p->scx.cgrp_moving_from = NULL;
4444 tg->scx.weight != weight)
4447 tg->scx.weight = weight;
4463 tg->scx.idle = idle;
4477 (tg->scx.bw_period_us != period_us ||
4478 tg->scx.bw_quota_us != quota_us ||
4479 tg->scx.bw_burst_us != burst_us))
4483 tg->scx.bw_period_us = period_us;
4484 tg->scx.bw_quota_us = quota_us;
4485 tg->scx.bw_burst_us = burst_us;
4521 return sch->cgrp;
4524 /* for each descendant of @cgrp including self, set ->scx_sched to @sch */
4531 rcu_assign_pointer(pos->scx_sched, sch);
4541 * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
4543 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
4588 raw_spin_lock_init(&dsq->lock);
4589 INIT_LIST_HEAD(&dsq->list);
4590 dsq->id = dsq_id;
4591 dsq->sched = sch;
4593 dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu);
4594 if (!dsq->pcpu)
4595 return -ENOMEM;
4598 struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
4600 pcpu->dsq = dsq;
4601 INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node);
4612 struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
4613 struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user;
4620 if (WARN_ON_ONCE(!list_empty(&dru->node))) {
4621 guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
4622 list_del_init(&dru->node);
4626 free_percpu(dsq->pcpu);
4643 call_rcu(&dsq->rcu, free_dsq_rcufn);
4659 raw_spin_lock_irqsave(&dsq->lock, flags);
4661 if (dsq->nr) {
4662 scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)",
4663 dsq->id, dsq->nr);
4667 if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node,
4672 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from
4677 dsq->id = SCX_DSQ_INVALID;
4678 if (llist_add(&dsq->free_node, &dsqs_to_free))
4682 raw_spin_unlock_irqrestore(&dsq->lock, flags);
4701 if (!(tg->scx.flags & SCX_TG_INITED))
4703 tg->scx.flags &= ~SCX_TG_INITED;
4705 if (!sch->ops.cgroup_exit)
4708 SCX_CALL_OP(sch, cgroup_exit, NULL, css->cgroup);
4724 .weight = tg->scx.weight,
4725 .bw_period_us = tg->scx.bw_period_us,
4726 .bw_quota_us = tg->scx.bw_quota_us,
4727 .bw_burst_us = tg->scx.bw_burst_us,
4730 if ((tg->scx.flags &
4734 if (!sch->ops.cgroup_init) {
4735 tg->scx.flags |= SCX_TG_INITED;
4740 css->cgroup, &args);
4745 tg->scx.flags |= SCX_TG_INITED;
4829 irq_work_sync(&sch->disable_irq_work);
4830 kthread_destroy_worker(sch->helper);
4831 timer_shutdown_sync(&sch->bypass_lb_timer);
4832 free_cpumask_var(sch->bypass_lb_donee_cpumask);
4833 free_cpumask_var(sch->bypass_lb_resched_cpumask);
4836 kfree(sch->cgrp_path);
4839 if (sch->sub_kset)
4840 kobject_put(&sch->sub_kset->kobj);
4844 struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
4849 * deferred_reenq_local.node's must be off-list by now.
4851 WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node));
4856 free_percpu(sch->pcpu);
4859 free_pnode(sch->pnode[node]);
4860 kfree(sch->pnode);
4862 rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
4867 destroy_dsq(sch, dsq->id);
4870 } while (dsq == ERR_PTR(-EAGAIN));
4873 rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
4874 free_exit_info(sch->exit_info);
4882 INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
4883 queue_rcu_work(system_dfl_wq, &sch->rcu_work);
4891 return sysfs_emit(buf, "%s\n", sch->ops.name);
4896 sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \
4943 * and sub-scheduler kset kobjects (kset_ktype) through the parent
4946 if (kobj->ktype != &scx_ktype)
4951 return add_uevent_var(env, "SCXOPS=%s", sch->ops.name);
4973 * scx is tearing down - keep new SCHED_EXT tasks out.
4980 * This can develop into a deadlock - scx holds scx_enable_mutex across
5002 if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
5005 if (unlikely(p->sched_class != &ext_sched_class))
5012 * handle_lockup - sched_ext common lockup handler
5047 * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
5064 * scx_softlockup - sched_ext softlockup handler
5067 * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
5068 * live-lock the system by making many CPUs target the same DSQ to the point
5069 * where soft-lockup detection triggers. This function is called from
5070 * soft-lockup watchdog when the triggering point is close and tries to unjam
5075 if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s))
5078 printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n",
5084 * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing
5088 static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1);
5092 int cpu = atomic_xchg(&scx_hardlockup_cpu, -1);
5094 if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu))
5095 printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
5102 * scx_hardlockup - sched_ext hardlockup handler
5120 atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu);
5132 struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0);
5133 s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
5147 raw_spin_lock(&donor_dsq->lock);
5148 list_add(&cursor.node, &donor_dsq->list);
5150 n = container_of(&cursor, struct task_struct, scx.dsq_list);
5159 if (donor_dsq->nr <= nr_donor_target)
5174 donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
5188 * Moving $p from one non-local DSQ to another. The source rq
5207 if (READ_ONCE(donee_dsq->nr) >= nr_donee_target)
5212 list_move_tail(&cursor.node, &n->scx.dsq_list.node);
5213 raw_spin_unlock(&donor_dsq->lock);
5217 raw_spin_lock(&donor_dsq->lock);
5222 list_del_init(&cursor.node);
5223 raw_spin_unlock(&donor_dsq->lock);
5232 struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask;
5233 struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask;
5242 u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr);
5264 if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target)
5275 if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target)
5286 u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr);
5298 * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine
5299 * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some
5327 WARN_ON_ONCE(sch->bypass_depth < 0);
5328 WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1);
5329 if (sch->bypass_depth != 1)
5332 WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
5333 sch->bypass_timestamp = ktime_get_ns();
5342 WARN_ON_ONCE(sch->bypass_depth < 1);
5343 WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1);
5344 if (sch->bypass_depth != 0)
5347 WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL);
5349 ktime_get_ns() - sch->bypass_timestamp);
5360 * @sch->bypass_depth transitioning from 0 to 1 triggers enabling.
5363 if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim)))
5367 * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of
5368 * the nearest non-bypassing ancestor or root. As enable_bypass_dsp() is
5374 * guarantees that the nearest non-bypassing ancestor or root has bypass
5382 ret = atomic_inc_return(&sch->bypass_dsp_enable_depth);
5386 ret = atomic_inc_return(&host->bypass_dsp_enable_depth);
5394 if (intv_us && !timer_pending(&host->bypass_lb_timer))
5395 mod_timer(&host->bypass_lb_timer,
5404 if (!test_and_clear_bit(0, &sch->bypass_dsp_claim))
5407 ret = atomic_dec_return(&sch->bypass_dsp_enable_depth);
5411 ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth);
5417 * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
5430 * - ops.select_cpu() is ignored and the default select_cpu() is used.
5432 * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
5435 * - ops.dispatch() is ignored.
5437 * - balance_one() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
5441 * - pick_next_task() suppresses zero slice warning.
5443 * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM
5446 * - scx_prio_less() reverts to the default core_sched_at order.
5467 * Bypass state is propagated to all descendants - an scx_sched bypasses
5483 * queued tasks are re-queued according to the new scx_bypassing()
5498 struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu);
5500 if (pos->bypass_depth)
5501 pcpu->flags |= SCX_SCHED_PCPU_BYPASSING;
5503 pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
5525 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
5552 kvfree(ei->dump);
5553 kfree(ei->msg);
5554 kfree(ei->bt);
5566 ei->bt = kzalloc_objs(ei->bt[0], SCX_EXIT_BT_LEN);
5567 ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
5568 ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
5570 if (!ei->bt || !ei->msg || !ei->dump) {
5588 return "disabled by sysrq-S";
5624 intv = max(min(intv, sch->watchdog_timeout / 2), 1);
5648 * its sub-scheds while holding scx_sched_lock - either
5649 * we can see the parent's non-NONE exit_kind or the
5652 if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) {
5654 ret = -ENOENT;
5659 &sch->hash_node, scx_sched_hash_params);
5665 list_add_tail(&sch->sibling, &parent->children);
5669 list_add_tail_rcu(&sch->all, &scx_sched_all);
5690 rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node,
5692 list_del_init(&sch->sibling);
5695 list_del_rcu(&sch->all);
5702 * Called to disable future dumps and wait for in-progress one while disabling
5709 sch->dump_disabled = true;
5719 * themselves off @sch->children. Wait for it to drain. As propagation
5720 * is recursive, empty @sch->children means that all proper descendant
5723 wait_event(scx_unlink_waitq, list_empty(&sch->children));
5733 scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
5734 fail_code, failed->comm, failed->pid);
5743 scx_task_iter_start(&sti, sch->cgrp);
5782 scx_task_iter_start(&sti, sch->cgrp);
5834 * $parent's just-completed init is owed an exit_task()
5869 * All tasks are moved off of @sch but there may still be on-going
5889 if (parent->ops.sub_detach && sch->sub_attached) {
5891 .ops = &sch->ops,
5892 .cgroup_path = sch->cgrp_path,
5898 if (sch->ops.exit)
5899 SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
5900 if (sch->sub_kset)
5901 kobject_del(&sch->sub_kset->kobj);
5902 kobject_del(&sch->kobj);
5911 struct scx_exit_info *ei = sch->exit_info;
5927 sch->exit_info->msg);
5965 const struct sched_class *old_class = p->sched_class;
5974 p->sched_class = new_class;
5993 * Also re-balance the dl_server bandwidth reservations: detach
6001 * The swap can still fail with -EBUSY if someone bumped ext_server's
6013 if (WARN_ON_ONCE(dl_server_swap_bw(&rq->ext_server,
6014 &rq->fair_server)))
6015 pr_warn("failed to re-attach fair_server on CPU %d\n", cpu);
6017 dl_server_detach_bw(&rq->ext_server);
6022 /* no task is on scx, turn off all the switches and flush in-progress calls */
6024 bitmap_zero(sch->has_op, SCX_OPI_END);
6028 if (ei->kind >= SCX_EXIT_ERROR) {
6030 sch->ops.name, ei->reason);
6032 if (ei->msg[0] != '\0')
6033 pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
6035 stack_trace_print(ei->bt, ei->bt_len, 2);
6039 sch->ops.name, ei->reason);
6042 if (sch->ops.exit)
6061 if (sch->sub_kset)
6062 kobject_del(&sch->sub_kset->kobj);
6064 kobject_del(&sch->kobj);
6091 if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
6096 * flag to break potential live-lock scenarios, ensuring we can
6099 WRITE_ONCE(sch->aborting, true);
6107 * To guarantee forward progress, this propagation must be in-line so
6108 * that ->aborting is synchronously asserted for all sub-scheds. The
6109 * propagation is also the interlocking point against sub-sched
6113 * non-propagation exits.
6129 struct scx_exit_info *ei = sch->exit_info;
6132 kind = atomic_read(&sch->exit_kind);
6137 if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
6140 ei->kind = kind;
6141 ei->reason = scx_exit_reason(ei->kind);
6153 irq_work_queue(&sch->disable_irq_work);
6157 * scx_flush_disable_work - flush the disable work and wait for it to finish
6160 * sch->disable_work might still not be queued, causing kthread_flush_work()
6169 irq_work_sync(&sch->disable_irq_work);
6170 kthread_flush_work(&sch->disable_work);
6171 kind = atomic_read(&sch->exit_kind);
6180 if (s->size)
6201 if (s->size) {
6225 dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */
6226 dd->first = true;
6227 dd->cursor = 0;
6228 dd->s = s;
6229 dd->prefix = prefix;
6235 char *line = dd->buf.line;
6237 if (!dd->cursor)
6244 if (dd->first) {
6245 dump_newline(dd->s);
6246 dd->first = false;
6266 dump_line(dd->s, "%s%s", dd->prefix, line);
6277 dd->cursor = 0;
6283 scx_dump_data.cpu = -1;
6294 unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
6299 if (task_sch->level == 0)
6302 scnprintf(sch_id_buf, sizeof(sch_id_buf), "sub%d-%llu",
6303 task_sch->level, task_sch->ops.sub_cgroup_id);
6305 if (p->scx.dsq)
6307 (unsigned long long)p->scx.dsq->id);
6311 marker, task_state_to_char(p), p->comm, p->pid,
6313 jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
6316 p->scx.flags & ~SCX_TASK_STATE_MASK,
6317 p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
6320 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
6322 p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
6323 dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr),
6324 p->migration_disabled);
6344 * For SysRq-D dumps, @dump_all_tasks=false since all schedulers are dumped
6353 .kind = ei->kind,
6354 .exit_code = ei->exit_code,
6355 .reason = ei->reason,
6366 if (sch->dump_disabled)
6369 seq_buf_init(&s, ei->dump, dump_len);
6372 if (sch->level == 0)
6373 dump_line(&s, "%s: root", sch->ops.name);
6375 dump_line(&s, "%s: sub%d-%llu %s",
6376 sch->ops.name, sch->level, sch->ops.sub_cgroup_id,
6377 sch->cgrp_path);
6379 if (ei->kind == SCX_EXIT_NONE) {
6380 dump_line(&s, "Debug dump triggered by %s", ei->reason);
6383 current->comm, current->pid, ei->kind);
6384 dump_line(&s, " %s (%s)", ei->reason, ei->msg);
6387 dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
6398 dump_line(&s, "----------");
6410 idle = list_empty(&rq->scx.runnable_list) &&
6411 rq->curr->sched_class == &idle_sched_class;
6426 dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu",
6427 cpu, rq->scx.nr_running, rq->scx.flags,
6428 rq->scx.cpu_released, rq->scx.ops_qseq,
6429 rq->scx.kick_sync);
6431 rq->curr->comm, rq->curr->pid,
6432 rq->curr->sched_class);
6433 if (!cpumask_empty(rq->scx.cpus_to_kick))
6435 cpumask_pr_args(rq->scx.cpus_to_kick));
6436 if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
6438 cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
6439 if (!cpumask_empty(rq->scx.cpus_to_preempt))
6441 cpumask_pr_args(rq->scx.cpus_to_preempt));
6442 if (!cpumask_empty(rq->scx.cpus_to_wait))
6444 cpumask_pr_args(rq->scx.cpus_to_wait));
6445 if (!cpumask_empty(rq->scx.cpus_to_sync))
6447 cpumask_pr_args(rq->scx.cpus_to_sync));
6473 if (rq->curr->sched_class == &ext_sched_class &&
6474 (dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
6475 scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*');
6477 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
6486 dump_line(&s, "--------------");
6504 memcpy(ei->dump + dump_len - sizeof(trunc_marker),
6511 struct scx_exit_info *ei = sch->exit_info;
6513 if (ei->kind >= SCX_EXIT_ERROR)
6514 scx_dump_state(sch, ei, sch->ops.exit_dump_len, true);
6516 kthread_queue_work(sch->helper, &sch->disable_work);
6523 struct scx_exit_info *ei = sch->exit_info;
6530 ei->exit_code = exit_code;
6533 ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
6535 vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
6538 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
6541 ei->kind = kind;
6542 ei->reason = scx_exit_reason(ei->kind);
6544 irq_work_queue(&sch->disable_irq_work);
6553 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
6566 return -ENOMEM;
6579 exit_dsq(&pnode->global_dsq);
6591 if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) {
6608 s32 level = parent ? parent->level + 1 : 0;
6613 ret = -ENOMEM;
6617 sch->exit_info = alloc_exit_info(ops->exit_dump_len);
6618 if (!sch->exit_info) {
6619 ret = -ENOMEM;
6623 ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params);
6627 sch->pnode = kzalloc_objs(sch->pnode[0], nr_node_ids);
6628 if (!sch->pnode) {
6629 ret = -ENOMEM;
6634 sch->pnode[node] = alloc_pnode(sch, node);
6635 if (!sch->pnode[node]) {
6636 ret = -ENOMEM;
6641 sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
6642 sch->pcpu = __alloc_percpu(struct_size_t(struct scx_sched_pcpu,
6643 dsp_ctx.buf, sch->dsp_max_batch),
6645 if (!sch->pcpu) {
6646 ret = -ENOMEM;
6659 struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
6661 pcpu->sch = sch;
6662 INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node);
6665 sch->helper = kthread_run_worker(0, "sched_ext_helper");
6666 if (IS_ERR(sch->helper)) {
6667 ret = PTR_ERR(sch->helper);
6671 sched_set_fifo(sch->helper->task);
6674 memcpy(sch->ancestors, parent->ancestors,
6675 level * sizeof(parent->ancestors[0]));
6676 sch->ancestors[level] = sch;
6677 sch->level = level;
6679 if (ops->timeout_ms)
6680 sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
6682 sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;
6684 sch->slice_dfl = SCX_SLICE_DFL;
6685 atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
6686 sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn);
6687 kthread_init_work(&sch->disable_work, scx_disable_workfn);
6688 timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);
6690 if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) {
6691 ret = -ENOMEM;
6694 if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) {
6695 ret = -ENOMEM;
6698 sch->ops = *ops;
6699 rcu_assign_pointer(ops->priv, sch);
6701 sch->kobj.kset = scx_kset;
6702 INIT_LIST_HEAD(&sch->all);
6707 ret = -ENOMEM;
6711 sch->cgrp_path = kstrdup(buf, GFP_KERNEL);
6713 if (!sch->cgrp_path) {
6714 ret = -ENOMEM;
6718 sch->cgrp = cgrp;
6719 INIT_LIST_HEAD(&sch->children);
6720 INIT_LIST_HEAD(&sch->sibling);
6723 ret = kobject_init_and_add(&sch->kobj, &scx_ktype,
6724 &parent->sub_kset->kobj,
6725 "sub-%llu", cgroup_id(cgrp));
6727 ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
6730 RCU_INIT_POINTER(ops->priv, NULL);
6731 kobject_put(&sch->kobj);
6735 if (ops->sub_attach) {
6736 sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj);
6737 if (!sch->sub_kset) {
6738 RCU_INIT_POINTER(ops->priv, NULL);
6739 kobject_put(&sch->kobj);
6740 return ERR_PTR(-ENOMEM);
6744 ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
6746 RCU_INIT_POINTER(ops->priv, NULL);
6747 kobject_put(&sch->kobj);
6755 RCU_INIT_POINTER(ops->priv, NULL);
6756 free_cpumask_var(sch->bypass_lb_resched_cpumask);
6759 free_cpumask_var(sch->bypass_lb_donee_cpumask);
6761 kthread_destroy_worker(sch->helper);
6768 free_percpu(sch->pcpu);
6771 free_pnode(sch->pnode[node]);
6772 kfree(sch->pnode);
6774 rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
6776 free_exit_info(sch->exit_info);
6796 if (ops->hotplug_seq) {
6798 if (ops->hotplug_seq != global_hotplug_seq) {
6802 ops->hotplug_seq, global_hotplug_seq);
6803 return -EBUSY;
6816 if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
6818 return -EINVAL;
6822 * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
6825 if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
6826 (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
6828 return -EINVAL;
6831 if (ops->cpu_acquire || ops->cpu_release)
6832 pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");
6838 * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
6839 * starvation. During the READY -> ENABLED task switching loop, the calling
6842 * fair-class saturation, leading to a system hang.
6853 struct sched_ext_ops *ops = cmd->ops;
6863 ret = -EBUSY;
6868 * @ops->priv binds @ops to its scx_sched instance. It is set here by
6871 * it's still non-NULL here, a previous attachment on @ops has not
6872 * finished tearing down; proceeding would let the in-flight unreg's
6873 * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign.
6875 if (rcu_access_pointer(ops->priv)) {
6876 ret = -EBUSY;
6905 rq->scx.local_dsq.sched = sch;
6906 rq->scx.cpuperf_target = SCX_CPUPERF_ONE;
6911 * online CPUs by watching ->on/offline_cpu() after ->init().
6929 if (sch->ops.init) {
6937 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
6942 set_bit(i, sch->has_op);
6965 * detaches ext_server, so partially-attached state is cleaned up
6973 ret = dl_server_attach_bw(&rq->ext_server);
6992 set_bit(i, sch->has_op);
6994 if (sch->ops.cpu_acquire || sch->ops.cpu_release)
6995 sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
7045 * leaving the post-init recheck below to unwind.
7060 ret, p->comm, p->pid);
7089 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
7093 * We're fully committed and can't fail. The task READY -> ENABLED
7101 const struct sched_class *old_class = p->sched_class;
7111 p->scx.slice = READ_ONCE(sch->slice_dfl);
7112 p->sched_class = new_class;
7121 WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
7125 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
7143 dl_server_detach_bw(&rq->fair_server);
7148 sch->ops.name, scx_switched_all() ? "" : " (partial)");
7149 kobject_uevent(&sch->kobj, KOBJ_ADD);
7154 cmd->ret = 0;
7161 cmd->ret = ret;
7181 cmd->ret = 0;
7188 struct scx_sched *parent = cgrp->scx_sched;
7194 if (parent->cgrp == cgrp)
7195 return ERR_PTR(-EBUSY);
7197 /* does $parent allow sub-scheds? */
7198 if (!parent->ops.sub_attach)
7199 return ERR_PTR(-EOPNOTSUPP);
7202 list_for_each_entry(pos, &parent->children, sibling)
7203 if (cgroup_is_descendant(pos->cgrp, cgrp))
7204 return ERR_PTR(-EBUSY);
7219 state, p->comm, p->pid);
7227 struct sched_ext_ops *ops = cmd->ops;
7237 ret = -ENODEV;
7241 /* See scx_root_enable_workfn() for the @ops->priv check. */
7242 if (rcu_access_pointer(ops->priv)) {
7243 ret = -EBUSY;
7247 cgrp = cgroup_get_from_id(ops->sub_cgroup_id);
7260 kobject_get(&parent->kobj);
7265 kobject_put(&parent->kobj);
7275 if (sch->level >= SCX_SUB_MAX_DEPTH) {
7281 if (sch->ops.init) {
7288 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
7295 .ops = &sch->ops,
7296 .cgroup_path = sch->cgrp_path,
7306 sch->sub_attached = true;
7312 set_bit(i, sch->has_op);
7318 * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see
7322 if (!(cgrp->self.flags & CSS_ONLINE)) {
7335 scx_task_iter_start(&sti, sch->cgrp);
7350 if (p->scx.flags & SCX_TASK_SUB_INIT)
7358 ret = -EINVAL;
7379 * just-completed init is owed an exit_task() and we
7388 p->scx.flags |= SCX_TASK_SUB_INIT;
7399 scx_task_iter_start(&sti, sch->cgrp);
7405 if (!(p->scx.flags & SCX_TASK_SUB_INIT))
7425 p->scx.flags &= ~SCX_TASK_SUB_INIT;
7437 pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name);
7438 kobject_uevent(&sch->kobj, KOBJ_ADD);
7446 cmd->ret = ret;
7457 * must stay set until SUB_INIT is cleared from every marked task -
7460 scx_task_iter_start(&sti, sch->cgrp);
7462 if (p->scx.flags & SCX_TASK_SUB_INIT) {
7464 p->scx.flags &= ~SCX_TASK_SUB_INIT;
7476 cmd->ret = 0;
7490 /* inherit ->scx_sched from $parent */
7492 rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched);
7496 if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp)
7497 scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN,
7526 return -EINVAL;
7536 return -ENOMEM;
7538 sched_set_fifo(w->task);
7545 if (ops->sub_cgroup_id > 1)
7588 t = btf_type_by_id(reg->btf, reg->btf_id);
7597 pr_warn("sched_ext: Writing directly to p->scx.slice/dsq_vtime is deprecated, use scx_bpf_task_set_slice/dsq_vtime()");
7606 return -EACCES;
7627 return -E2BIG;
7628 ops->dispatch_max_batch = *(u32 *)(udata + moff);
7632 return -EINVAL;
7633 ops->flags = *(u64 *)(udata + moff);
7636 ret = bpf_obj_name_cpy(ops->name, uops->name,
7637 sizeof(ops->name));
7641 return -EINVAL;
7646 return -E2BIG;
7647 ops->timeout_ms = *(u32 *)(udata + moff);
7650 ops->exit_dump_len =
7654 ops->hotplug_seq = *(u64 *)(udata + moff);
7658 ops->sub_cgroup_id = *(u64 *)(udata + moff);
7672 sch = scx_prog_sched(prog->aux);
7701 if (prog->sleepable)
7702 return -EINVAL;
7710 * XXX - Ideally, we should only do this for scheds that allow
7711 * sub-scheds and sub-scheds themselves but I don't know how to access
7716 prog->aux->priv_stack_requested = true;
7717 prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch;
7732 struct scx_sched *sch = rcu_dereference_protected(ops->priv, true);
7736 RCU_INIT_POINTER(ops->priv, NULL);
7737 kobject_put(&sch->kobj);
7750 * sched_ext does not support updating the actively-loaded BPF
7756 return -EOPNOTSUPP;
7764 static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } /* Ignores @p, @prev_cpu and @wake_flags and always returns -EINVAL; presumably a placeholder stub for the select_cpu op - TODO confirm at the use site. */
7780 static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } /* Ignores @p and @args and always returns -EINVAL; presumably a placeholder stub for the init_task op - TODO confirm at the use site. */
7785 static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } /* Ignores @cgrp and @args and always returns -EINVAL; presumably a placeholder stub for the cgroup_init op - TODO confirm at the use site. */
7787 static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } /* Ignores @p, @from and @to and always returns -EINVAL; presumably a placeholder stub for the cgroup_prep_move op - TODO confirm at the use site. */
7794 static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; } /* Ignores @args and always returns -EINVAL; presumably a placeholder stub for the sub_attach op - TODO confirm at the use site. */
7798 static s32 sched_ext_ops__init(void) { return -EINVAL; } /* Takes no arguments and always returns -EINVAL; presumably a placeholder stub for the init op - TODO confirm at the use site. */
7880 .help_msg = "reset-sched-ext(S)",
7887 struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
7896 .help_msg = "dump-sched-ext(D)",
7915 return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE);
7921 struct scx_rq *this_scx = &this_rq->scx;
7927 cur_class = rq->curr->sched_class;
7937 if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
7939 rq->curr->scx.slice = 0;
7940 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
7943 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
7945 cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
7946 ksyncs[cpu] = rq->scx.kick_sync;
7949 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
7954 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
7955 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
7980 struct scx_rq *this_scx = &this_rq->scx;
7990 ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;
7992 for_each_cpu(cpu, this_scx->cpus_to_kick) {
7994 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
7995 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
7998 for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) {
8000 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
8009 this_scx->kick_sync_pending = true;
8016 * print_scx_info - print out sched_ext scheduler state
8047 if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
8049 printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name,
8054 if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at,
8061 log_lvl, sch->ops.name, scx_enable_state_str[state], all,
8120 BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL));
8122 INIT_LIST_HEAD(&rq->scx.runnable_list);
8123 INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
8125 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n));
8126 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
8127 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
8128 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
8129 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
8130 raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
8131 INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
8132 INIT_LIST_HEAD(&rq->scx.deferred_reenq_users);
8133 rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
8134 rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
8137 cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
8160 scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id);
8163 } else if ((sch->ops.flags & SCX_OPS_ALWAYS_ENQ_IMMED) && is_local) {
8200 struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
8209 if (unlikely(dspc->cursor >= sch->dsp_max_batch)) {
8214 dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){
8216 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK,
8225 * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ
8279 p->scx.slice = slice;
8281 p->scx.slice = p->scx.slice ?: 1;
8305 p->scx.slice = slice;
8307 p->scx.slice = p->scx.slice ?: 1;
8309 p->scx.dsq_vtime = vtime;
8325 * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion
8328 * @args->dsq_id: DSQ to insert into
8329 * @args->slice: duration @p can run for in nsecs, 0 to keep the current value
8330 * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
8331 * @args->enq_flags: SCX_ENQ_*
8339 * @args->dsq_id. Tasks queued into the priority queue are ordered by
8340 * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert().
8342 * @args->vtime ordering is according to time_before64() which considers
8344 * ordering and vice-versa.
8348 * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and
8368 return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice,
8369 args->vtime, args->enq_flags);
8388 * Disallow if any sub-scheds are attached. There is no way to tell
8391 if (unlikely(!list_empty(&sch->children))) {
8418 struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
8428 * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL.
8433 sch = src_dsq->sched;
8440 * cause similar live-lock conditions as consume_dispatch_q().
8442 if (unlikely(READ_ONCE(sch->aborting)))
8447 p->comm, p->pid);
8460 in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE;
8472 raw_spin_lock(&src_dsq->lock);
8475 if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) {
8476 raw_spin_unlock(&src_dsq->lock);
8488 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
8489 p->scx.dsq_vtime = kit->vtime;
8490 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
8491 p->scx.slice = kit->slice;
8506 kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE |
8514 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
8529 return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor);
8533 * scx_bpf_dispatch_cancel - Cancel the latest dispatch
8550 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
8552 if (dspc->cursor > 0)
8553 dspc->cursor--;
8559 * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
8560 * @dsq_id: DSQ to move task from. Must be a user-created DSQ
8564 * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
8568 * Built-in DSQs (%SCX_DSQ_GLOBAL and %SCX_DSQ_LOCAL*) are not supported as
8573 * per-node DSQs making the scope difficult to define; this may change in the
8576 * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
8599 dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
8601 flush_dispatch_buf(sch, dspc->rq);
8609 if (consume_dispatch_q(sch, dspc->rq, dsq, enq_flags)) {
8616 dspc->nr_tasks++;
8632 * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
8645 kit->slice = slice;
8646 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
8650 * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
8652 * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
8664 kit->vtime = vtime;
8665 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
8669 * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
8676 * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can
8690 * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local
8702 * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
8728 * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler
8755 scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu",
8760 return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev,
8791 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
8795 * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
8829 * scx_bpf_create_dsq - Create a custom DSQ
8845 return -EINVAL;
8848 return -EINVAL;
8852 return -ENOMEM;
8868 dsq->sched = sch;
8869 ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
8872 ret = -ENODEV;
8907 * scx_bpf_task_set_slice - Set task's time slice
8925 p->scx.slice = slice;
8930 * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering
8948 p->scx.dsq_vtime = vtime;
8990 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle);
8992 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
8995 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
8997 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait);
9000 irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
9006 * scx_bpf_kick_cpu - Trigger reschedule on a CPU
9027 * scx_bpf_dsq_nr_queued - Return the number of queued tasks
9032 * -%ENOENT is returned.
9044 ret = -ENODEV;
9049 ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
9055 ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
9061 ret = READ_ONCE(dsq->nr);
9065 ret = -ENOENT;
9072 * scx_bpf_destroy_dsq - Destroy a custom DSQ
9092 * bpf_iter_scx_dsq_new - Create a DSQ iterator
9113 ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
9117 * Always clear $kit->dsq.
9119 kit->dsq = NULL;
9123 return -ENODEV;
9126 return -EINVAL;
9128 kit->dsq = find_user_dsq(sch, dsq_id);
9129 if (!kit->dsq)
9130 return -ENOENT;
9132 kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags);
9138 * bpf_iter_scx_dsq_next - Progress a DSQ iterator
9147 if (!kit->dsq)
9150 guard(raw_spinlock_irqsave)(&kit->dsq->lock);
9152 return nldsq_cursor_next_task(&kit->cursor, kit->dsq);
9156 * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
9165 if (!kit->dsq)
9168 if (!list_empty(&kit->cursor.node)) {
9171 raw_spin_lock_irqsave(&kit->dsq->lock, flags);
9172 list_del_init(&kit->cursor.node);
9173 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
9175 kit->dsq = NULL;
9179 * scx_bpf_dsq_peek - Lockless peek at the first element.
9185 * this provides only a point-in-time snapshot, and the contents may change
9207 scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id);
9211 return rcu_dereference(dsq->first_task);
9215 * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ
9216 * @dsq_id: DSQ to re-enqueue
9221 * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are
9224 * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu)
9225 * - User DSQs
9227 * Re-enqueues are performed asynchronously. Can be called from anywhere.
9255 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
9259 * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
9282 return -EINVAL;
9312 return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line),
9319 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
9345 * scx_bpf_error_bstr - Indicate fatal error
9369 * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler
9386 struct scx_bstr_buf *buf = &dd->buf;
9395 if (raw_smp_processor_id() != dd->cpu) {
9401 ret = __bstr_format(sch, buf->data, buf->line + dd->cursor,
9402 sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
9404 dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
9405 dd->prefix, fmt, data, data__sz, ret);
9409 dd->cursor += ret;
9410 dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line));
9412 if (!dd->cursor)
9423 if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n')
9428 * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
9450 * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
9478 * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
9529 rq->scx.cpuperf_target = perf;
9538 * scx_bpf_nr_node_ids - Return the number of possible node IDs
9548 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
9558 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
9566 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
9574 * scx_bpf_put_cpumask - Release a possible/online cpumask
9581 * a reference to a global cpumask, which is read-only in the caller and
9588 * scx_bpf_task_running - Is task currently running?
9593 return task_rq(p)->curr == p;
9597 * scx_bpf_task_cpu - CPU a task is currently associated with
9606 * scx_bpf_cpu_rq - Fetch the rq of a CPU
9623 if (!sch->warned_deprecated_rq) {
9627 sch->warned_deprecated_rq = true;
9634 * scx_bpf_locked_rq - Return the rq currently locked by SCX
9661 * scx_bpf_cpu_curr - Return remote CPU's curr task
9680 return rcu_dereference(cpu_rq(cpu)->curr);
9684 * scx_bpf_now - Returns a high-performance monotonically non-decreasing
9691 * Unfortunately, on some hardware platforms, bpf_ktime_get_ns() -- which
9692 * eventually reads a hardware timestamp counter -- is neither performant nor
9693 * scalable. scx_bpf_now() aims to provide a high-performance clock by
9703 * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now()
9706 * is no such guarantee -- the clock can go backward. It provides a
9707 * monotonically *non-decreasing* clock so that it would provide the same
9719 if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) {
9723 * Note that scx_bpf_now() is re-entrant between a process
9728 clock = READ_ONCE(rq->scx.clock);
9750 /* Aggregate per-CPU event counters into @events. */
9753 e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;
9771 * scx_bpf_events - Get a system-wide event counter to
9790 * We cannot entirely trust a BPF-provided size since a BPF program
9802 * scx_bpf_task_cgroup - Return the sched cgroup of a task
9806 * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
9808 * determine @p's current cgroup as, unlike following @p->cgroups,
9809 * @p->sched_task_group is stable for the duration of the SCX op. See
9815 struct task_group *tg = p->sched_task_group;
9880 * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc
9882 * scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter())
9883 * consults this table to decide whether a context-sensitive kfunc is callable
9897 * context-sensitive.
9924 * Verifier-time filter for SCX kfuncs. Registered via the .filter field on
9925 * each per-group btf_kfunc_id_set. The BPF core invokes this for every kfunc
9928 * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by
9942 /* Not an SCX kfunc - allow. */
9948 if (prog->type == BPF_PROG_TYPE_SYSCALL)
9949 return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES;
9951 if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
9952 return (in_any || in_idle) ? 0 : -EACCES;
9957 * prog->aux->st_ops. Allow all kfuncs when st_ops is not yet set;
9958 * do_check_main() re-runs the filter with st_ops set and enforces the
9961 if (!prog->aux->st_ops)
9965 * Non-SCX struct_ops: SCX kfuncs are not permitted.
9967 if (prog->aux->st_ops != &bpf_sched_ext_ops)
9968 return -EACCES;
9970 /* SCX struct_ops: check the per-op allow list. */
9974 moff = prog->aux->attach_st_ops_member_off;
9988 return -EACCES;
9999 * Some kfuncs are context-sensitive and can only be called from
10000 * specific SCX ops. They are grouped into per-context BTF sets, each
10004 * kfunc lookup; it consults scx_kf_allow_flags[] to enforce per-op
10048 return -ENOMEM;
10051 ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group);