ext.c - OpenGrok cross reference for /linux/kernel/sched/ext.c

Lines Matching +full:post +full:- +full:cursor
1 /* SPDX-License-Identifier: GPL-2.0 */
3  * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
66  * tasks for the sub-sched being enabled. Use a global variable instead of a
67  * per-task field as all enables are serialized.
83  * interval is half of the shortest sch->watchdog_timeout.
114  * Non-NULL values are used for direct dispatch from enqueue path. A valid
143 	s32			cursor;
150 	.cpu			= -1,
221 #define SCX_HAS_OP(sch, op)	test_bit(SCX_OP_IDX(op), (sch)->has_op)
226 		return jiffies_to_msecs(at - now);
228 		return -(long)jiffies_to_msecs(now - at);
233 	return (s32)(a - b) < 0;
238  * scx_parent - Find the parent sched
245 	if (sch->level)
246 		return sch->ancestors[sch->level - 1];
252  * scx_next_descendant_pre - find the next descendant for pre-order walk
257  * visit for pre-order traversal of @root's descendants. @root is included in
273 	next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling);
279 		if (!list_is_last(&pos->sibling, &scx_parent(pos)->children))
295 	rcu_assign_pointer(p->scx.sched, sch);
304  * scx_is_descendant - Test whether sched is a descendant
312 	if (sch->level < ancestor->level)
314 	return sch->ancestors[ancestor->level] == ancestor;
318  * scx_for_each_descendant_pre - pre-order walk of a sched's descendants
319  * @pos: iteration cursor
332 	return &sch->pnode[cpu_to_node(cpu)]->global_dsq;
337 	return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
342 	if (p->sched_class == &stop_sched_class)
345 	return __setscheduler_class(p->policy, p->prio);
350 	return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
357 	 * If @sch is a sub-sched which is bypassing, its tasks should go into
359 	 * not-bypassing ancestor is responsible for scheduling all tasks from
360 	 * bypassing sub-trees. If all ancestors including root are bypassing,
364 	 * are re-enqueued after scx_bypassing() is turned on, guaranteeing that
375  * bypass_dsp_enabled - Check if bypass dispatch path is enabled
379  * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors
391 	return unlikely(atomic_read(&sch->bypass_dsp_enable_depth));
395  * rq_is_open - Is the rq available for immediate execution of an SCX task?
409 	 * A higher-priority class task is either running or in the process of
412 	if (sched_class_above(rq->next_class, &ext_sched_class))
417 	 * higher-priority class task waking up on it.
419 	if (sched_class_above(&ext_sched_class, rq->next_class))
428 	 * be ready depending on whether the on-going dispatch decides to extend
432 	if (rq->scx.flags & SCX_RQ_IN_BALANCE)
471  * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not
482 	(sch)->ops.op(args);							\
490 	__typeof__((sch)->ops.op(args)) __ret;					\
496 	__ret = (sch)->ops.op(args);						\
504  * and records them in current->scx.kf_tasks[] for the duration of the call. A
509  * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held -
512  * So if kf_tasks[] is set, @p's scheduler-protected fields are stable.
514  * kf_tasks[] can not stack, so task-based SCX ops must not nest. The
515  * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants
520 	WARN_ON_ONCE(current->scx.kf_tasks[0]);					\
521 	current->scx.kf_tasks[0] = task;					\
523 	current->scx.kf_tasks[0] = NULL;					\
528 	__typeof__((sch)->ops.op(task, ##args)) __ret;				\
529 	WARN_ON_ONCE(current->scx.kf_tasks[0]);					\
530 	current->scx.kf_tasks[0] = task;					\
532 	current->scx.kf_tasks[0] = NULL;					\
538 	__typeof__((sch)->ops.op(task0, task1, ##args)) __ret;			\
539 	WARN_ON_ONCE(current->scx.kf_tasks[0]);					\
540 	current->scx.kf_tasks[0] = task0;					\
541 	current->scx.kf_tasks[1] = task1;					\
543 	current->scx.kf_tasks[0] = NULL;					\
544 	current->scx.kf_tasks[1] = NULL;					\
552 	if (unlikely((p != current->scx.kf_tasks[0] &&
553 		      p != current->scx.kf_tasks[1]))) {
575  * nldsq_next_task - Iterate to the next task in a non-local DSQ
576  * @dsq: non-local dsq being iterated
588 	lockdep_assert_held(&dsq->lock);
591 		list_node = &cur->scx.dsq_list.node;
593 		list_node = &dsq->list;
598 			list_node = list_node->prev;
600 			list_node = list_node->next;
602 		if (list_node == &dsq->list)
607 	} while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR);
617  * nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ
618  * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR()
619  * @dsq: non-local dsq being iterated
621  * Find the next task in a cursor based iteration. The caller must have
622  * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock
625  * Only tasks which were queued before @cursor was initialized are visible. This
629 static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor,
632 	bool rev = cursor->flags & SCX_DSQ_ITER_REV;
635 	lockdep_assert_held(&dsq->lock);
636 	BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR));
638 	if (list_empty(&cursor->node))
641 		p = container_of(cursor, struct task_struct, scx.dsq_list);
643 	/* skip cursors and tasks that were queued after @cursor init */
646 	} while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq)));
650 			list_move_tail(&cursor->node, &p->scx.dsq_list.node);
652 			list_move(&cursor->node, &p->scx.dsq_list.node);
654 		list_del_init(&cursor->node);
661  * nldsq_cursor_lost_task - Test whether someone else took the task since iteration
662  * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR()
668  * dropped and re-acquired in between. Verify that no one else took or is in the
673 static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor,
678 	lockdep_assert_held(&dsq->lock);
681 	 * @p could have already left $src_dsq, got re-enqueued, or be in the
684 	if (unlikely(p->scx.dsq != dsq ||
685 		     u32_before(cursor->priv, p->scx.dsq_seq) ||
686 		     p->scx.holding_cpu >= 0))
697  * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
698  * dispatch order. BPF-visible iterator is opaque and larger to allow future
703 	struct scx_dsq_list_node	cursor;
716 	return p->scx.flags & SCX_TASK_STATE_MASK;
733 		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
747 		WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]",
748 			  prev_state, state, p->comm, p->pid);
752 	WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]",
753 		  prev_state, state, p->comm, p->pid);
755 	p->scx.flags &= ~SCX_TASK_STATE_MASK;
756 	p->scx.flags |= state;
763 	struct sched_ext_entity		cursor;
777  * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
785  * with scx_tasks_lock held and @iter->cursor inserted into scx_tasks.
788  * @iter->css_iter. The caller must be holding cgroup_lock() to prevent cgroup
811 		iter->cgrp = cgrp;
812 		iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self);
813 		css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
814 				    &iter->css_iter);
820 	iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
821 	list_add(&iter->cursor.tasks_node, &scx_tasks);
822 	iter->list_locked = true;
827 	if (iter->locked_task) {
828 		__balance_callbacks(iter->rq, &iter->rf);
829 		task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
830 		iter->locked_task = NULL;
835  * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
846 	if (iter->list_locked) {
847 		iter->list_locked = false;
854 	if (!iter->list_locked) {
856 		iter->list_locked = true;
861  * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
871 	if (iter->cgrp) {
872 		if (iter->css_pos)
873 			css_task_iter_end(&iter->css_iter);
879 	list_del_init(&iter->cursor.tasks_node);
884  * scx_task_iter_next - Next task
888  * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls
893 	struct list_head *cursor = &iter->cursor.tasks_node;
896 	if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
902 	if (iter->cgrp) {
903 		while (iter->css_pos) {
906 			p = css_task_iter_next(&iter->css_iter);
910 			css_task_iter_end(&iter->css_iter);
911 			iter->css_pos = css_next_descendant_pre(iter->css_pos,
912 								&iter->cgrp->self);
913 			if (iter->css_pos)
914 				css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
915 						    &iter->css_iter);
922 	list_for_each_entry(pos, cursor, tasks_node) {
923 		if (&pos->tasks_node == &scx_tasks)
925 		if (!(pos->flags & SCX_TASK_CURSOR)) {
926 			list_move(cursor, &pos->tasks_node);
936  * scx_task_iter_next_locked - Next non-idle task with its rq locked
939  * Visit the non-idle task with its rq lock held. Allows callers to specify
952 		 * while loading the BPF scheduler and vice-versa while
956 		 * - It's unsafe to use __setscheduler_prio() on an init_task to
960 		 * - ops.init/exit_task() can easily be confused if called with
967 		 * - %PF_IDLE may not be set for an init_task whose CPU hasn't
970 		 * - %PF_IDLE can be set on tasks that are not init_tasks. See
975 		if (p->sched_class == &idle_sched_class)
978 		iter->rq = task_rq_lock(p, &iter->rf);
979 		iter->locked_task = p;
982 		 * cgroup_task_dead() removes the dead tasks from cset->tasks
999  * scx_add_event - Increase an event counter for 'name' by 'cnt'
1007 	this_cpu_add((sch)->pcpu->event_stats.name, (cnt));			\
1012  * __scx_add_event - Increase an event counter for 'name' by 'cnt'
1020 	__this_cpu_add((sch)->pcpu->event_stats.name, (cnt));			\
1025  * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
1031 	(dst_e)->kind += READ_ONCE((src_e)->kind);				\
1035  * scx_dump_event - Dump an event 'kind' in 'events' to 's'
1041 	dump_line(&(s), "%40s: %16lld", #kind, (events)->kind);			\
1067  * wait_ops_state - Busy-wait the specified ops state to end
1071  * Busy-wait for @p to transition out of @opss. This can only be used when the
1080 	} while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
1089  * ops_cpu_valid - Verify a cpu number, to be used on ops input args
1109  * ops_sanitize_err - Sanitize a -errno value
1112  * @err: -errno value to sanitize
1114  * Verify @err is a valid -errno. If not, trigger scx_error() and return
1115  * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
1123 	if (err < 0 && err >= -MAX_ERRNO)
1127 	return -EPROTO;
1145  * schedule_deferred - Schedule execution of deferred actions on an rq
1161 	 * correctly - the _locked() path already processes remote rqs from
1162 	 * the calling CPU - but targeting the owning CPU allows IPI delivery
1163 	 * without waiting for the calling CPU to re-enable IRQs and is
1166 	irq_work_queue_on(&rq->scx.deferred_irq_work, cpu_of(rq));
1170  * schedule_deferred_locked - Schedule execution of deferred actions on an rq
1185 	if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
1189 	if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
1203 	if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
1204 		rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
1211 	 * time to IRQ re-enable shouldn't be long.
1225 	if (unlikely(READ_ONCE(sch->bypass_depth)))
1228 	if (dsq->id == SCX_DSQ_LOCAL) {
1231 		struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq));
1232 		struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local;
1240 		if (list_empty(&drl->node) ||
1241 		    (READ_ONCE(drl->flags) & reenq_flags) != reenq_flags) {
1243 			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
1245 			if (list_empty(&drl->node))
1246 				list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals);
1247 			WRITE_ONCE(drl->flags, drl->flags | reenq_flags);
1249 	} else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) {
1252 		struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq));
1253 		struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user;
1261 		if (list_empty(&dru->node) ||
1262 		    (READ_ONCE(dru->flags) & reenq_flags) != reenq_flags) {
1264 			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
1266 			if (list_empty(&dru->node))
1267 				list_move_tail(&dru->node, &rq->scx.deferred_reenq_users);
1268 			WRITE_ONCE(dru->flags, dru->flags | reenq_flags);
1271 		scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
1288 	schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags, rq);
1292  * touch_core_sched - Update timestamp used for core-sched task ordering
1296  * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
1297  * implement global or local-DSQ FIFO ordering for core-sched. Should be called
1311 	 * it may be better to use per-core dispatch sequence instead.
1314 		p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq));
1319  * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
1323  * If the BPF scheduler implements custom core-sched ordering via
1324  * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
1326  * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
1340 	struct task_struct *curr = rq->curr;
1347 	if (curr->scx.slice != SCX_SLICE_INF) {
1348 		curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec);
1349 		if (!curr->scx.slice)
1353 	dl_server_update(&rq->ext_server, delta_exec);
1364 	return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime);
1369 	/* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
1370 	WRITE_ONCE(dsq->nr, dsq->nr + 1);
1381 		if (unlikely(dsq->id != SCX_DSQ_LOCAL)) {
1385 		p->scx.flags |= SCX_TASK_IMMED;
1388 	if (p->scx.flags & SCX_TASK_IMMED) {
1391 		if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
1394 		rq->scx.nr_immed++;
1398 		 * done yet, @p can't go on the CPU immediately. Re-enqueue.
1400 		if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags)))
1408 	WRITE_ONCE(dsq->nr, dsq->nr - 1);
1410 	if (p->scx.flags & SCX_TASK_IMMED) {
1413 		if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) ||
1414 		    WARN_ON_ONCE(rq->scx.nr_immed <= 0))
1417 		rq->scx.nr_immed--;
1423 	p->scx.slice = READ_ONCE(sch->slice_dfl);
1437 	 * is only non-negative during an internal SCX migration.
1439 	return p->scx.sticky_cpu >= 0;
1449 	if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p))
1455 	p->scx.flags &= ~SCX_TASK_IN_CUSTODY;
1467 	 * actually getting on CPU. This gives higher-class tasks (e.g. RT)
1474 	 * - First, a local task to this CPU's local DSQ;
1475 	 * - Second, a local/remote task to a remote CPU's local DSQ.
1486 	 * the tasks from a user-created DSQ.
1488 	 * We must detect these wakeups so that we can re-enqueue IMMED tasks
1491 	 * @rq->next_class to &ext_sched_class if it's currently idle.
1494 	 * @rq->next_class is below &ext_sched_class, it will also
1497 	if (sched_class_above(p->sched_class, rq->next_class))
1505 	 * a resched if @rq->next_class was idle. It's harmless, since
1508 	if (rq->scx.flags & SCX_RQ_IN_BALANCE)
1511 	if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
1512 	    rq->curr->sched_class == &ext_sched_class) {
1513 		rq->curr->scx.slice = 0;
1522 	bool is_local = dsq->id == SCX_DSQ_LOCAL;
1524 	WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
1525 	WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) ||
1526 		     !RB_EMPTY_NODE(&p->scx.dsq_priq));
1529 		raw_spin_lock_nested(&dsq->lock,
1532 		if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
1535 			raw_spin_unlock(&dsq->lock);
1537 			raw_spin_lock(&dsq->lock);
1541 	if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
1546 		 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we
1550 		scx_error(sch, "cannot use vtime ordering for built-in DSQs");
1562 		if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
1564 			scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks",
1565 				  dsq->id);
1567 		p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
1568 		rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);
1572 		 * that @dsq->list is vtime ordered.
1574 		rbp = rb_prev(&p->scx.dsq_priq);
1579 			list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
1580 			/* first task unchanged - no update needed */
1582 			list_add(&p->scx.dsq_list.node, &dsq->list);
1583 			/* not builtin and new task is at head - use fastpath */
1584 			rcu_assign_pointer(dsq->first_task, p);
1588 		if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
1589 			scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
1590 				  dsq->id);
1593 			list_add(&p->scx.dsq_list.node, &dsq->list);
1594 			/* new task inserted at head - use fastpath */
1595 			if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
1596 				rcu_assign_pointer(dsq->first_task, p);
1599 			 * dsq->list can contain parked BPF iterator cursors, so
1601 			 * task in the DSQ". Test dsq->first_task directly.
1603 			list_add_tail(&p->scx.dsq_list.node, &dsq->list);
1604 			if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
1605 				rcu_assign_pointer(dsq->first_task, p);
1610 	WRITE_ONCE(dsq->seq, dsq->seq + 1);
1611 	p->scx.dsq_seq = dsq->seq;
1614 	p->scx.dsq = dsq;
1619 	 * and dequeue_task_scx() will RMW p->scx.flags. If we clear
1620 	 * ops_state first, both sides would modify p->scx.flags
1621 	 * concurrently in a non-atomic way.
1628 		 * non-terminal DSQ: enter custody.
1630 		if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS)
1633 			p->scx.flags |= SCX_TASK_IN_CUSTODY;
1635 		raw_spin_unlock(&dsq->lock);
1643 		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1649 	WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));
1651 	if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) {
1652 		rb_erase(&p->scx.dsq_priq, &dsq->priq);
1653 		RB_CLEAR_NODE(&p->scx.dsq_priq);
1654 		p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ;
1657 	list_del_init(&p->scx.dsq_list.node);
1660 	if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
1664 		rcu_assign_pointer(dsq->first_task, first_task);
1670 	struct scx_dispatch_q *dsq = p->scx.dsq;
1671 	bool is_local = dsq == &rq->scx.local_dsq;
1677 		 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
1680 		if (unlikely(!list_empty(&p->scx.dsq_list.node)))
1681 			list_del_init(&p->scx.dsq_list.node);
1686 		 * @p->scx.holding_cpu may be set under the protection of
1689 		if (p->scx.holding_cpu >= 0)
1690 			p->scx.holding_cpu = -1;
1696 		raw_spin_lock(&dsq->lock);
1699 	 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't
1702 	if (p->scx.holding_cpu < 0) {
1708 		 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
1712 		WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
1713 		p->scx.holding_cpu = -1;
1715 	p->scx.dsq = NULL;
1718 		raw_spin_unlock(&dsq->lock);
1729 	lockdep_assert_held(&dsq->lock);
1732 	p->scx.dsq = NULL;
1742 		return &rq->scx.local_dsq;
1750 		return &cpu_rq(cpu)->scx.local_dsq;
1759 		scx_error(sch, "non-existent DSQ 0x%llx", dsq_id);
1773 	 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value
1776 	__this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH));
1781 			scx_error(sch, "%s[%d] already direct-dispatched",
1782 				  p->comm, p->pid);
1784 			scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
1785 				  ddsp_task->comm, ddsp_task->pid,
1786 				  p->comm, p->pid);
1790 	WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
1791 	WARN_ON_ONCE(p->scx.ddsp_enq_flags);
1793 	p->scx.ddsp_dsq_id = dsq_id;
1794 	p->scx.ddsp_enq_flags = enq_flags;
1801  *  - direct_dispatch(): cleared on the synchronous enqueue path, deferred
1803  *  - process_ddsp_deferred_locals(): cleared after consuming deferred state,
1804  *  - do_enqueue_task(): cleared on enqueue fallbacks where the dispatch
1806  *  - dequeue_task_scx(): cleared after dispatch_dequeue(), covering deferred
1808  *  - scx_disable_task(): cleared for queued wakeup tasks, which are excluded by
1814 	p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
1815 	p->scx.ddsp_enq_flags = 0;
1823 		find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
1828 	p->scx.ddsp_enq_flags |= enq_flags;
1836 	if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) {
1839 		opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;
1849 			atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1853 				  p->comm, p->pid, opss);
1854 			atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1858 		WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
1859 		list_add_tail(&p->scx.dsq_list.node,
1860 			      &rq->scx.ddsp_deferred_locals);
1865 	ddsp_enq_flags = p->scx.ddsp_enq_flags;
1880 	return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq)));
1891 	WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
1893 	/* internal movements - rq migration / RESTORE */
1899 	 * Note that exiting and migration-disabled tasks that skip
1903 	p->scx.flags &= ~SCX_TASK_IMMED;
1918 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
1922 	if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) &&
1923 	    unlikely(p->flags & PF_EXITING)) {
1929 	if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
1939 	qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;
1941 	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
1942 	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
1951 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
1958 	p->scx.flags |= SCX_TASK_IN_CUSTODY;
1964 	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
1971 	dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags);
1974 	dsq = &rq->scx.local_dsq;
1985 	 * For task-ordering, slice refill must be treated as implying the end
1997 	return !list_empty(&p->scx.runnable_node);
2004 	if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
2005 		p->scx.runnable_at = jiffies;
2006 		p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
2013 	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
2018 	list_del_init(&p->scx.runnable_node);
2020 		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
2026 	int sticky_cpu = p->scx.sticky_cpu;
2027 	u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags;
2030 		rq->scx.flags |= SCX_RQ_IN_WAKEUP;
2036 	 * direct-dispatch into the local DSQ by setting the sticky_cpu.
2041 	if (p->scx.flags & SCX_TASK_QUEUED) {
2047 	p->scx.flags |= SCX_TASK_QUEUED;
2048 	rq->scx.nr_running++;
2058 	if (rq->scx.nr_running == 1)
2059 		dl_server_start(&rq->ext_server);
2064 		p->scx.sticky_cpu = -1;
2066 	rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
2069 	    unlikely(cpu_of(rq) != p->scx.selected_cpu))
2083 	opss = atomic_long_read_acquire(&p->scx.ops_state);
2104 		if (unlikely(!(READ_ONCE(p->scx.flags) & SCX_TASK_IN_CUSTODY))) {
2109 		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
2128 		BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
2156 	 * change (not sleep or core-sched pick).
2161 	if (!(p->scx.flags & SCX_TASK_QUEUED)) {
2170 	 * and then stops running. As we want running <-> stopping transitions
2171 	 * to be contained within runnable <-> quiescent transitions, trigger
2172 	 * ->stopping() early here instead of in put_prev_task_scx().
2174 	 * @p may go through multiple stopping <-> running transitions between
2189 		p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
2191 		p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
2193 	p->scx.flags &= ~SCX_TASK_QUEUED;
2194 	rq->scx.nr_running--;
2204 	struct task_struct *p = rq->donor;
2210 		p->scx.slice = 0;
2215 	struct task_struct *from = rq->donor;
2231 	if (p->sched_class == &ext_sched_class)
2235 	 * Getting preempted by a higher-priority class. Reenqueue IMMED tasks.
2238 	 * - A SCX task is currently running.
2240 	 * - @rq is waking from idle due to a SCX task waking to it.
2242 	 * - A higher-priority class task wakes up while SCX dispatch is in progress.
2244 	if (rq->scx.nr_immed)
2253 	struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq;
2256 	lockdep_assert_held(&src_dsq->lock);
2259 	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
2262 		list_add(&p->scx.dsq_list.node, &dst_dsq->list);
2264 		list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list);
2267 	p->scx.dsq = dst_dsq;
2273  * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
2288 	 * beginning of an SCX-internal migration.
2290 	p->scx.sticky_cpu = cpu_of(dst_rq);
2298 	 * We want to pass scx-specific enq_flags but activate_task() will
2300 	 * @rq->scx.extra_enq_flags instead.
2302 	WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr));
2303 	WARN_ON_ONCE(dst_rq->scx.extra_enq_flags);
2304 	dst_rq->scx.extra_enq_flags = enq_flags;
2306 	dst_rq->scx.extra_enq_flags = 0;
2313  * - is_cpu_allowed() asks "Can this task run on this CPU?" while
2323  * - The BPF scheduler is bypassed while the rq is offline and we can always say
2337 	 * If @p has migration disabled, @p->cpus_ptr is updated to contain only
2339 	 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
2351 				  p->comm, p->pid, task_cpu(p), cpu);
2364 				  cpu, p->comm, p->pid);
2378  * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
2386  * non-local DSQ, it's better to use the same mechanism to protect against
2387  * dequeues and maintain the invariant that @p->scx.dsq can only change while
2395  * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets
2397  * would be cleared to -1. While other cpus may have updated it to different
2412 	lockdep_assert_held(&dsq->lock);
2414 	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
2416 	p->scx.holding_cpu = cpu;
2418 	raw_spin_unlock(&dsq->lock);
2422 	return likely(p->scx.holding_cpu == cpu) &&
2443  * move_task_between_dsqs() - Move a task from one DSQ to another
2465 	BUG_ON(src_dsq->id == SCX_DSQ_LOCAL);
2466 	lockdep_assert_held(&src_dsq->lock);
2469 	if (dst_dsq->id == SCX_DSQ_LOCAL) {
2478 		/* no need to migrate if destination is a non-local DSQ */
2486 	if (dst_dsq->id == SCX_DSQ_LOCAL) {
2487 		/* @p is going from a non-local DSQ to a local DSQ */
2492 			raw_spin_unlock(&src_dsq->lock);
2494 			raw_spin_unlock(&src_dsq->lock);
2500 		 * @p is going from a non-local DSQ to a non-local DSQ. As
2504 		raw_spin_unlock(&src_dsq->lock);
2520 	 * @dsq->list without locking and skip if it seems empty.
2522 	if (list_empty(&dsq->list))
2525 	raw_spin_lock(&dsq->lock);
2535 		 * the system into the bypass mode. This can easily live-lock the
2536 		 * machine. If aborting, exit from all non-bypass DSQs.
2538 		if (unlikely(READ_ONCE(sch->aborting)) && dsq->id != SCX_DSQ_BYPASS)
2544 			raw_spin_unlock(&dsq->lock);
2555 	raw_spin_unlock(&dsq->lock);
2563 	return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq, 0);
2567  * dispatch_to_local_dsq - Dispatch a task to a local dsq
2615 	 * we're moving from a DSQ and use the same mechanism - mark the task
2619 	p->scx.holding_cpu = raw_smp_processor_id();
2622 	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
2632 	if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
2640 			p->scx.holding_cpu = -1;
2641 			dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p,
2651 		if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
2663  * finish_dispatch - Asynchronously finish dispatching a task
2695 	opss = atomic_long_read(&p->scx.ops_state);
2705 		 * dispatch/dequeue and re-enqueue cycle between
2719 		 * it - the BPF scheduler is allowed to dispatch tasks
2724 		if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
2739 	BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
2743 	if (dsq->id == SCX_DSQ_LOCAL)
2751 	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
2754 	for (u = 0; u < dspc->cursor; u++) {
2755 		struct scx_dsp_buf_ent *ent = &dspc->buf[u];
2757 		finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
2758 				ent->enq_flags);
2761 	dspc->nr_tasks += dspc->cursor;
2762 	dspc->cursor = 0;
2769 	if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
2772 	queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
2775 	rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
2780  * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
2787 	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
2790 	bool prev_on_sch = (prev->sched_class == &ext_sched_class) &&
2806 		 * tasks. The following implements a simple built-in behavior -
2810 		 * auto-consumption and a kfunc to consume the bypass DSQ and,
2814 		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
2816 		if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
2827 	dspc->rq = rq;
2837 		dspc->nr_tasks = 0;
2843 			rq->scx.sub_dispatch_prev = prev;
2845 			rq->scx.sub_dispatch_prev = NULL;
2850 		if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) {
2851 			rq->scx.flags |= SCX_RQ_BAL_KEEP;
2854 		if (rq->scx.local_dsq.nr)
2868 		if (unlikely(!--nr_loops)) {
2872 	} while (dspc->nr_tasks);
2891 	rq->scx.flags |= SCX_RQ_IN_BALANCE;
2892 	rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
2894 	if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
2895 	    unlikely(rq->scx.cpu_released)) {
2899 		 * core. This callback complements ->cpu_release(), which is
2904 		rq->scx.cpu_released = false;
2907 	if (prev->sched_class == &ext_sched_class) {
2915 		 * implement ->cpu_release().
2920 		if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice &&
2922 			rq->scx.flags |= SCX_RQ_BAL_KEEP;
2928 	if (rq->scx.local_dsq.nr)
2938 	if ((prev->scx.flags & SCX_TASK_QUEUED) &&
2939 	    (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) {
2940 		rq->scx.flags |= SCX_RQ_BAL_KEEP;
2944 	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
2951 	 * - rq_is_open() can't reliably tell when and how slice is going to be
2955 	 * - A non-IMMED HEAD task can get queued in front of an IMMED task
2958 	if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed))
2961 	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
2969 	if (p->scx.flags & SCX_TASK_QUEUED) {
2971 		 * Core-sched might decide to execute @p before it is
2978 	p->se.exec_start = rq_clock_task(rq);
2981 	if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
2990 	if ((p->scx.slice == SCX_SLICE_INF) !=
2991 	    (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
2992 		if (p->scx.slice == SCX_SLICE_INF)
2993 			rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
2995 			rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;
3003 		 * tick-stopped CPUs.
3024 	const struct sched_class *next_class = next->sched_class;
3026 	if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
3040 	 * sched_class, so invoke the ->cpu_release() callback if we have not
3044 	 * ->cpu_release() complements ->cpu_acquire(), which is emitted the
3047 	if (!rq->scx.cpu_released) {
3056 		rq->scx.cpu_released = true;
3066 	smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
3071 	if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
3074 	if (p->scx.flags & SCX_TASK_QUEUED) {
3079 		 * preempted by a higher priority scheduler class or core-sched
3084 		if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) {
3085 			if (p->scx.flags & SCX_TASK_IMMED) {
3086 				p->scx.flags |= SCX_TASK_REENQ_PREEMPTED;
3087 				do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
3088 				p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
3090 				dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD);
3099 		 * which should trigger an explicit follow-up scheduling event.
3101 		if (next && sched_class_above(&ext_sched_class, next->sched_class)) {
3102 			WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST));
3103 			do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
3105 			do_enqueue_task(rq, p, 0, -1);
3110 	if (next && next->sched_class != &ext_sched_class)
3117 	unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
3132 	for_each_cpu(cpu, rq->scx.cpus_to_sync) {
3138 		    smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
3139 			cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
3144 		while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
3145 			smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
3158 	return list_first_entry_or_null(&rq->scx.local_dsq.list,
3165 	struct task_struct *prev = rq->curr;
3170 	smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
3184 	if (unlikely(rq->scx.kick_sync_pending)) {
3185 		rq->scx.kick_sync_pending = false;
3186 		queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
3191 	 * If any higher-priority sched class enqueued a runnable task on
3196 	 * regardless of any higher-priority sched classes activity.
3201 	keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
3203 		     prev->sched_class != &ext_sched_class)) {
3215 		if (!p->scx.slice)
3222 		if (unlikely(!p->scx.slice)) {
3226 			    !sch->warned_zero_slice) {
3228 						p->comm, p->pid, __func__);
3229 				sch->warned_zero_slice = true;
3255 	return do_pick_task_scx(dl_se->rq, rf, true);
3263 	struct sched_dl_entity *dl_se = &rq->ext_server;
3272  * scx_prio_less - Task ordering for core-sched
3277  * Core-sched is implemented as an additional scheduling layer on top of the
3279  * SCX, core-sched calls this function to interrogate the task ordering.
3281  * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
3283  * priority the task - the global FIFO ordering matching the default scheduling
3286  * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
3307 		return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
3338 		this_rq()->scx.in_select_cpu = true;
3340 		this_rq()->scx.in_select_cpu = false;
3341 		p->scx.selected_cpu = cpu;
3353 			p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
3357 		p->scx.selected_cpu = cpu;
3381 	 * The effective cpumask is stored in @p->cpus_ptr which may temporarily
3382 	 * differ from the configured one in @p->cpus_mask. Always tell the bpf
3385 	 * Fine-grained memory write control is enforced by BPF making the const
3389 		SCX_CALL_OP_TASK(sch, set_cpumask, task_rq(p), p, (struct cpumask *)p->cpus_ptr);
3408 		scx_idle_update_selcpu_topology(&sch->ops);
3433 	rq->scx.flags |= SCX_RQ_ONLINE;
3438 	rq->scx.flags &= ~SCX_RQ_ONLINE;
3453 	list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
3455 		unsigned long last_runnable = p->scx.runnable_at;
3458 					last_runnable + READ_ONCE(sch->watchdog_timeout)))) {
3459 			u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
3463 				 p->comm, p->pid, dur_ms / 1000, dur_ms % 1000);
3506 				last_check + READ_ONCE(root->watchdog_timeout)))) {
3507 		u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
3524 	 * While disabling, always resched and refresh core-sched timestamp as
3528 		curr->scx.slice = 0;
3534 	if (!curr->scx.slice)
3543 	 * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the
3546 	if (tg && tg->css.cgroup)
3547 		return tg->css.cgroup;
3564 	p->scx.disallow = false;
3579 	if (p->scx.disallow) {
3581 			scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]",
3582 				  p->comm, p->pid);
3584 			scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
3585 				  p->comm, p->pid);
3593 			 * We're in the load path and @p->policy will be applied
3594 			 * right after. Reverting @p->policy here and rejecting
3596 			 * guarantees that if ops.init_task() sets @p->disallow,
3599 			if (p->policy == SCHED_EXT) {
3600 				p->policy = SCHED_NORMAL;
3623 	WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY);
3632 		weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO];
3634 	p->scx.weight = sched_weight_to_cgroup(weight);
3640 		SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight);
3667 	WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY);
3677 	lockdep_assert_held(&p->pi_lock);
3709 	lockdep_assert_held(&p->pi_lock);
3724 	 * its parent. Exit for the child too - scx_enable_task() never ran for
3725 	 * it, so undo only init_task. The flag is only set on the sub-enable
3728 	if (p->scx.flags & SCX_TASK_SUB_INIT) {
3731 		p->scx.flags &= ~SCX_TASK_SUB_INIT;
3741 	INIT_LIST_HEAD(&scx->dsq_list.node);
3742 	RB_CLEAR_NODE(&scx->dsq_priq);
3743 	scx->sticky_cpu = -1;
3744 	scx->holding_cpu = -1;
3745 	INIT_LIST_HEAD(&scx->runnable_node);
3746 	scx->runnable_at = jiffies;
3747 	scx->ddsp_dsq_id = SCX_DSQ_INVALID;
3748 	scx->slice = SCX_SLICE_DFL;
3770 		struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
3797 		if (p->sched_class == &ext_sched_class) {
3808 	list_add_tail(&p->scx.tasks_node, &scx_tasks);
3830  * task_dead_and_done - Is a task dead and done running?
3835  * invoked on these dead tasks leading to failures - e.g. sched_setscheduler()
3840  * that needs to happen on the task. Use this test to short-circuit sched_class
3854 	return unlikely(READ_ONCE(p->__state) == TASK_DEAD) &&
3864 	 * for the last time and then dropped the rq lock - task_dead_and_done()
3869 	list_del_init(&p->scx.tasks_node);
3873 	 * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
3876 	 * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see
3878 	 * iteration is only used from sub-sched paths, which require root
3882 	 * into ops; transition to %DEAD so the post-init recheck unwinds
3907 	p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
3909 		SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight);
3927 	 * different scheduler class. Keep the BPF scheduler up-to-date.
3930 		SCX_CALL_OP_TASK(sch, set_cpumask, rq, p, (struct cpumask *)p->cpus_ptr);
3942 	 * scx_disable_task() would WARN on the non-%ENABLED state and trigger a
3943 	 * NONE -> READY validation failure.
3958 	if (scx_enabled() && READ_ONCE(p->scx.disallow) &&
3959 	    p->policy != policy && policy == SCHED_EXT)
3960 		return -EACCES;
3978 	while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
3982 		u64 dsq_id = p->scx.ddsp_dsq_id;
3983 		u64 enq_flags = p->scx.ddsp_enq_flags;
3985 		list_del_init(&p->scx.dsq_list.node);
3989 		if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
3999  * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First"
4000  *   tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at
4003  * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if
4008  * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head,
4026 	if ((p->scx.flags & SCX_TASK_IMMED) &&
4051 	 * @rq->scx.local_dsq. Move all candidate tasks off to a private list
4054 	list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
4062 		 * deactivate and re-activate @p anyway. Skip re-enqueueing.
4065 		 * re-enqueue a migrating task while its current CPU and allowed
4070 		if (p->migration_pending)
4081 		if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
4082 			p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
4083 		p->scx.flags |= reason;
4085 		list_add_tail(&p->scx.dsq_list.node, &tasks);
4089 		list_del_init(&p->scx.dsq_list.node);
4091 		do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
4093 		p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
4102 	u64 seq = ++rq->scx.deferred_reenq_locals_seq;
4111 		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
4113 				list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
4123 			sch = sch_pcpu->sch;
4125 			reenq_flags = drl->flags;
4126 			WRITE_ONCE(drl->flags, 0);
4127 			list_del_init(&drl->node);
4129 			if (likely(drl->seq != seq)) {
4130 				drl->seq = seq;
4131 				drl->cnt = 0;
4133 				if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) {
4135 						  drl->cnt);
4161 	struct scx_sched *sch = dsq->sched;
4162 	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
4168 	raw_spin_lock(&dsq->lock);
4170 	while (likely(!READ_ONCE(sch->bypass_depth))) {
4174 		p = nldsq_cursor_next_task(&cursor, dsq);
4187 				raw_spin_unlock(&dsq->lock);
4189 				raw_spin_lock(&dsq->lock);
4194 			if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
4200 		raw_spin_unlock(&dsq->lock);
4202 		if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
4203 			p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
4204 		p->scx.flags |= reason;
4206 		do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);
4208 		p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
4216 		raw_spin_lock(&dsq->lock);
4219 	list_del_init(&cursor.node);
4220 	raw_spin_unlock(&dsq->lock);
4237 		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
4239 				list_first_entry_or_null(&rq->scx.deferred_reenq_users,
4249 			dsq = dsq_pcpu->dsq;
4250 			reenq_flags = dru->flags;
4251 			WRITE_ONCE(dru->flags, 0);
4252 			list_del_init(&dru->node);
4258 		BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
4267 	if (!list_empty(&rq->scx.deferred_reenq_locals))
4270 	if (!list_empty(&rq->scx.deferred_reenq_users))
4277 	struct task_struct *p = rq->curr;
4280 	if (p->sched_class != &ext_sched_class)
4291 	return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
4302 	tg->scx.weight = CGROUP_WEIGHT_DFL;
4303 	tg->scx.bw_period_us = default_bw_period_us();
4304 	tg->scx.bw_quota_us = RUNTIME_INF;
4305 	tg->scx.idle = false;
4313 	WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
4318 				{ .weight = tg->scx.weight,
4319 				  .bw_period_us = tg->scx.bw_period_us,
4320 				  .bw_quota_us = tg->scx.bw_quota_us,
4321 				  .bw_burst_us = tg->scx.bw_burst_us };
4324 					      NULL, tg->css.cgroup, &args);
4329 			tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED;
4331 		tg->scx.flags |= SCX_TG_ONLINE;
4341 	WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
4344 	    (tg->scx.flags & SCX_TG_INITED))
4345 		SCX_CALL_OP(sch, cgroup_exit, NULL, tg->css.cgroup);
4346 	tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
4363 		WARN_ON_ONCE(p->scx.cgrp_moving_from);
4368 		 * always match one-to-one.
4375 					      p, from, css->cgroup);
4380 		p->scx.cgrp_moving_from = from;
4388 		    p->scx.cgrp_moving_from)
4390 				    p, p->scx.cgrp_moving_from, css->cgroup);
4391 		p->scx.cgrp_moving_from = NULL;
4407 	 * so it can hand an unchanged-cgroup task here with cgrp_moving_from
4411 	if (SCX_HAS_OP(sch, cgroup_move) && p->scx.cgrp_moving_from)
4413 				 p, p->scx.cgrp_moving_from,
4415 	p->scx.cgrp_moving_from = NULL;
4429 		    p->scx.cgrp_moving_from)
4431 				    p, p->scx.cgrp_moving_from, css->cgroup);
4432 		p->scx.cgrp_moving_from = NULL;
4444 	    tg->scx.weight != weight)
4447 	tg->scx.weight = weight;
4463 	tg->scx.idle = idle;
4477 	    (tg->scx.bw_period_us != period_us ||
4478 	     tg->scx.bw_quota_us != quota_us ||
4479 	     tg->scx.bw_burst_us != burst_us))
4483 	tg->scx.bw_period_us = period_us;
4484 	tg->scx.bw_quota_us = quota_us;
4485 	tg->scx.bw_burst_us = burst_us;
4521 	return sch->cgrp;
4524 /* for each descendant of @cgrp including self, set ->scx_sched to @sch */
4531 		rcu_assign_pointer(pos->scx_sched, sch);
4541  * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
4543  * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
4588 	raw_spin_lock_init(&dsq->lock);
4589 	INIT_LIST_HEAD(&dsq->list);
4590 	dsq->id = dsq_id;
4591 	dsq->sched = sch;
4593 	dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu);
4594 	if (!dsq->pcpu)
4595 		return -ENOMEM;
4598 		struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
4600 		pcpu->dsq = dsq;
4601 		INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node);
4612 		struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
4613 		struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user;
4620 		if (WARN_ON_ONCE(!list_empty(&dru->node))) {
4621 			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
4622 			list_del_init(&dru->node);
4626 	free_percpu(dsq->pcpu);
4643 		call_rcu(&dsq->rcu, free_dsq_rcufn);
4659 	raw_spin_lock_irqsave(&dsq->lock, flags);
4661 	if (dsq->nr) {
4662 		scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)",
4663 			  dsq->id, dsq->nr);
4667 	if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node,
4672 	 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from
4677 	dsq->id = SCX_DSQ_INVALID;
4678 	if (llist_add(&dsq->free_node, &dsqs_to_free))
4682 	raw_spin_unlock_irqrestore(&dsq->lock, flags);
4701 		if (!(tg->scx.flags & SCX_TG_INITED))
4703 		tg->scx.flags &= ~SCX_TG_INITED;
4705 		if (!sch->ops.cgroup_exit)
4708 		SCX_CALL_OP(sch, cgroup_exit, NULL, css->cgroup);
4724 			.weight = tg->scx.weight,
4725 			.bw_period_us = tg->scx.bw_period_us,
4726 			.bw_quota_us = tg->scx.bw_quota_us,
4727 			.bw_burst_us = tg->scx.bw_burst_us,
4730 		if ((tg->scx.flags &
4734 		if (!sch->ops.cgroup_init) {
4735 			tg->scx.flags |= SCX_TG_INITED;
4740 				      css->cgroup, &args);
4745 		tg->scx.flags |= SCX_TG_INITED;
4829 	irq_work_sync(&sch->disable_irq_work);
4830 	kthread_destroy_worker(sch->helper);
4831 	timer_shutdown_sync(&sch->bypass_lb_timer);
4832 	free_cpumask_var(sch->bypass_lb_donee_cpumask);
4833 	free_cpumask_var(sch->bypass_lb_resched_cpumask);
4836 	kfree(sch->cgrp_path);
4839 	if (sch->sub_kset)
4840 		kobject_put(&sch->sub_kset->kobj);
4844 		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
4849 		 * deferred_reenq_local_node's must be off-list by now.
4851 		WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node));
4856 	free_percpu(sch->pcpu);
4859 		free_pnode(sch->pnode[node]);
4860 	kfree(sch->pnode);
4862 	rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
4867 			destroy_dsq(sch, dsq->id);
4870 	} while (dsq == ERR_PTR(-EAGAIN));
4873 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
4874 	free_exit_info(sch->exit_info);
4882 	INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
4883 	queue_rcu_work(system_dfl_wq, &sch->rcu_work);
4891 	return sysfs_emit(buf, "%s\n", sch->ops.name);
4896 	sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind);		\
4943 	 * and sub-scheduler kset kobjects (kset_ktype) through the parent
4946 	if (kobj->ktype != &scx_ktype)
4951 	return add_uevent_var(env, "SCXOPS=%s", sch->ops.name);
4973 	 * scx is tearing down - keep new SCHED_EXT tasks out.
4980 	 * This can develop into a deadlock - scx holds scx_enable_mutex across
5002 	if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
5005 	if (unlikely(p->sched_class != &ext_sched_class))
5012  * handle_lockup - sched_ext common lockup handler
5047  * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
5064  * scx_softlockup - sched_ext softlockup handler
5067  * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
5068  * live-lock the system by making many CPUs target the same DSQ to the point
5069  * where soft-lockup detection triggers. This function is called from
5070  * soft-lockup watchdog when the triggering point is close and tries to unjam
5075 	if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s))
5078 	printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n",
5084  * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing
5088 static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1);
5092 	int cpu = atomic_xchg(&scx_hardlockup_cpu, -1);
5094 	if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu))
5095 		printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
5102  * scx_hardlockup - sched_ext hardlockup handler
5120 	atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu);
5132 	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0);
5133 	s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
5147 	raw_spin_lock(&donor_dsq->lock);
5148 	list_add(&cursor.node, &donor_dsq->list);
5150 	n = container_of(&cursor, struct task_struct, scx.dsq_list);
5159 		if (donor_dsq->nr <= nr_donor_target)
5174 		donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
5188 		 * Moving $p from one non-local DSQ to another. The source rq
5207 		if (READ_ONCE(donee_dsq->nr) >= nr_donee_target)
5212 			list_move_tail(&cursor.node, &n->scx.dsq_list.node);
5213 			raw_spin_unlock(&donor_dsq->lock);
5217 			raw_spin_lock(&donor_dsq->lock);
5222 	list_del_init(&cursor.node);
5223 	raw_spin_unlock(&donor_dsq->lock);
5232 	struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask;
5233 	struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask;
5242 		u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr);
5264 		if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target)
5275 		if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target)
5286 		u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr);
5298  * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine
5299  * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some
5327 	WARN_ON_ONCE(sch->bypass_depth < 0);
5328 	WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1);
5329 	if (sch->bypass_depth != 1)
5332 	WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
5333 	sch->bypass_timestamp = ktime_get_ns();
5342 	WARN_ON_ONCE(sch->bypass_depth < 1);
5343 	WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1);
5344 	if (sch->bypass_depth != 0)
5347 	WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL);
5349 		      ktime_get_ns() - sch->bypass_timestamp);
5360 	 * @sch->bypass_depth transitioning from 0 to 1 triggers enabling.
5363 	if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim)))
5367 	 * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of
5368 	 * the nearest non-bypassing ancestor or root. As enable_bypass_dsp() is
5374 	 * guarantees that the nearest non-bypassing ancestor or root has bypass
5382 	ret = atomic_inc_return(&sch->bypass_dsp_enable_depth);
5386 		ret = atomic_inc_return(&host->bypass_dsp_enable_depth);
5394 	if (intv_us && !timer_pending(&host->bypass_lb_timer))
5395 		mod_timer(&host->bypass_lb_timer,
5404 	if (!test_and_clear_bit(0, &sch->bypass_dsp_claim))
5407 	ret = atomic_dec_return(&sch->bypass_dsp_enable_depth);
5411 		ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth);
5417  * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
5430  * - ops.select_cpu() is ignored and the default select_cpu() is used.
5432  * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
5435  * - ops.dispatch() is ignored.
5437  * - balance_one() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
5441  * - pick_next_task() suppresses zero slice warning.
5443  * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM
5446  * - scx_prio_less() reverts to the default core_sched_at order.
5467 	 * Bypass state is propagated to all descendants - an scx_sched bypasses
5483 	 * queued tasks are re-queued according to the new scx_bypassing()
5498 			struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu);
5500 			if (pos->bypass_depth)
5501 				pcpu->flags |= SCX_SCHED_PCPU_BYPASSING;
5503 				pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
5525 		list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
5552 	kvfree(ei->dump);
5553 	kfree(ei->msg);
5554 	kfree(ei->bt);
5566 	ei->bt = kzalloc_objs(ei->bt[0], SCX_EXIT_BT_LEN);
5567 	ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
5568 	ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
5570 	if (!ei->bt || !ei->msg || !ei->dump) {
5588 		return "disabled by sysrq-S";
5624 		intv = max(min(intv, sch->watchdog_timeout / 2), 1);
5648 			 * its sub-scheds while holding scx_sched_lock - either
5649 			 * we can see the parent's non-NONE exit_kind or the
5652 			if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) {
5654 				ret = -ENOENT;
5659 					&sch->hash_node, scx_sched_hash_params);
5665 			list_add_tail(&sch->sibling, &parent->children);
5669 		list_add_tail_rcu(&sch->all, &scx_sched_all);
5690 			rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node,
5692 			list_del_init(&sch->sibling);
5695 		list_del_rcu(&sch->all);
5702  * Called to disable future dumps and wait for in-progress one while disabling
5709 	sch->dump_disabled = true;
5719 	 * themselves off @sch->children. Wait for it to drain. As propagation
5720 	 * is recursive, empty @sch->children means that all proper descendant
5723 	wait_event(scx_unlink_waitq, list_empty(&sch->children));
5733 	scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
5734 		  fail_code, failed->comm, failed->pid);
5743 	scx_task_iter_start(&sti, sch->cgrp);
5782 	scx_task_iter_start(&sti, sch->cgrp);
5834 			 * $parent's just-completed init is owed an exit_task()
5869 	 * All tasks are moved off of @sch but there may still be on-going
5889 	if (parent->ops.sub_detach && sch->sub_attached) {
5891 			.ops = &sch->ops,
5892 			.cgroup_path = sch->cgrp_path,
5898 	if (sch->ops.exit)
5899 		SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
5900 	if (sch->sub_kset)
5901 		kobject_del(&sch->sub_kset->kobj);
5902 	kobject_del(&sch->kobj);
5911 	struct scx_exit_info *ei = sch->exit_info;
5927 			sch->exit_info->msg);
5965 		const struct sched_class *old_class = p->sched_class;
5974 			p->sched_class = new_class;
5993 	 * Also re-balance the dl_server bandwidth reservations: detach
6001 	 * The swap can still fail with -EBUSY if someone bumped ext_server's
6013 				if (WARN_ON_ONCE(dl_server_swap_bw(&rq->ext_server,
6014 								   &rq->fair_server)))
6015 					pr_warn("failed to re-attach fair_server on CPU %d\n", cpu);
6017 				dl_server_detach_bw(&rq->ext_server);
6022 	/* no task is on scx, turn off all the switches and flush in-progress calls */
6024 	bitmap_zero(sch->has_op, SCX_OPI_END);
6028 	if (ei->kind >= SCX_EXIT_ERROR) {
6030 		       sch->ops.name, ei->reason);
6032 		if (ei->msg[0] != '\0')
6033 			pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
6035 		stack_trace_print(ei->bt, ei->bt_len, 2);
6039 			sch->ops.name, ei->reason);
6042 	if (sch->ops.exit)
6061 	if (sch->sub_kset)
6062 		kobject_del(&sch->sub_kset->kobj);
6064 	kobject_del(&sch->kobj);
6091 	if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
6096 	 * flag to break potential live-lock scenarios, ensuring we can
6099 	WRITE_ONCE(sch->aborting, true);
6107 	 * To guarantee forward progress, this propagation must be in-line so
6108 	 * that ->aborting is synchronously asserted for all sub-scheds. The
6109 	 * propagation is also the interlocking point against sub-sched
6113 	 * non-propagation exits.
6129 	struct scx_exit_info *ei = sch->exit_info;
6132 	kind = atomic_read(&sch->exit_kind);
6137 		if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
6140 	ei->kind = kind;
6141 	ei->reason = scx_exit_reason(ei->kind);
6153 		irq_work_queue(&sch->disable_irq_work);
6157  * scx_flush_disable_work - flush the disable work and wait for it to finish
6160  * sch->disable_work might still not queued, causing kthread_flush_work()
6169 		irq_work_sync(&sch->disable_irq_work);
6170 		kthread_flush_work(&sch->disable_work);
6171 		kind = atomic_read(&sch->exit_kind);
6180 	if (s->size)
6201 	if (s->size) {
6225 	dd->cpu = smp_processor_id();		/* allow scx_bpf_dump() */
6226 	dd->first = true;
6227 	dd->cursor = 0;
6228 	dd->s = s;
6229 	dd->prefix = prefix;
6235 	char *line = dd->buf.line;
6237 	if (!dd->cursor)
6244 	if (dd->first) {
6245 		dump_newline(dd->s);
6246 		dd->first = false;
6266 		dump_line(dd->s, "%s%s", dd->prefix, line);
6277 	dd->cursor = 0;
6283 	scx_dump_data.cpu = -1;
6294 	unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
6299 	if (task_sch->level == 0)
6302 		scnprintf(sch_id_buf, sizeof(sch_id_buf), "sub%d-%llu",
6303 			  task_sch->level, task_sch->ops.sub_cgroup_id);
6305 	if (p->scx.dsq)
6307 			  (unsigned long long)p->scx.dsq->id);
6311 		  marker, task_state_to_char(p), p->comm, p->pid,
6313 		  jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
6316 		  p->scx.flags & ~SCX_TASK_STATE_MASK,
6317 		  p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
6320 		  p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
6322 		  p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
6323 	dump_line(s, "      cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr),
6324 		  p->migration_disabled);
6344  * For SysRq-D dumps, @dump_all_tasks=false since all schedulers are dumped
6353 		.kind = ei->kind,
6354 		.exit_code = ei->exit_code,
6355 		.reason = ei->reason,
6366 	if (sch->dump_disabled)
6369 	seq_buf_init(&s, ei->dump, dump_len);
6372 	if (sch->level == 0)
6373 		dump_line(&s, "%s: root", sch->ops.name);
6375 		dump_line(&s, "%s: sub%d-%llu %s",
6376 			  sch->ops.name, sch->level, sch->ops.sub_cgroup_id,
6377 			  sch->cgrp_path);
6379 	if (ei->kind == SCX_EXIT_NONE) {
6380 		dump_line(&s, "Debug dump triggered by %s", ei->reason);
6383 			  current->comm, current->pid, ei->kind);
6384 		dump_line(&s, "  %s (%s)", ei->reason, ei->msg);
6387 		dump_stack_trace(&s, "  ", ei->bt, ei->bt_len);
6398 	dump_line(&s, "----------");
6410 		idle = list_empty(&rq->scx.runnable_list) &&
6411 			rq->curr->sched_class == &idle_sched_class;
6426 		dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu",
6427 			  cpu, rq->scx.nr_running, rq->scx.flags,
6428 			  rq->scx.cpu_released, rq->scx.ops_qseq,
6429 			  rq->scx.kick_sync);
6431 			  rq->curr->comm, rq->curr->pid,
6432 			  rq->curr->sched_class);
6433 		if (!cpumask_empty(rq->scx.cpus_to_kick))
6435 				  cpumask_pr_args(rq->scx.cpus_to_kick));
6436 		if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
6438 				  cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
6439 		if (!cpumask_empty(rq->scx.cpus_to_preempt))
6441 				  cpumask_pr_args(rq->scx.cpus_to_preempt));
6442 		if (!cpumask_empty(rq->scx.cpus_to_wait))
6444 				  cpumask_pr_args(rq->scx.cpus_to_wait));
6445 		if (!cpumask_empty(rq->scx.cpus_to_sync))
6447 				  cpumask_pr_args(rq->scx.cpus_to_sync));
6473 		if (rq->curr->sched_class == &ext_sched_class &&
6474 		    (dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
6475 			scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*');
6477 		list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
6486 	dump_line(&s, "--------------");
6504 		memcpy(ei->dump + dump_len - sizeof(trunc_marker),
6511 	struct scx_exit_info *ei = sch->exit_info;
6513 	if (ei->kind >= SCX_EXIT_ERROR)
6514 		scx_dump_state(sch, ei, sch->ops.exit_dump_len, true);
6516 	kthread_queue_work(sch->helper, &sch->disable_work);
6523 	struct scx_exit_info *ei = sch->exit_info;
6530 	ei->exit_code = exit_code;
6533 		ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
6535 	vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
6538 	 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
6541 	ei->kind = kind;
6542 	ei->reason = scx_exit_reason(ei->kind);
6544 	irq_work_queue(&sch->disable_irq_work);
6553 	 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
6566 			return -ENOMEM;
6579 	exit_dsq(&pnode->global_dsq);
6591 	if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) {
6608 	s32 level = parent ? parent->level + 1 : 0;
6613 		ret = -ENOMEM;
6617 	sch->exit_info = alloc_exit_info(ops->exit_dump_len);
6618 	if (!sch->exit_info) {
6619 		ret = -ENOMEM;
6623 	ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params);
6627 	sch->pnode = kzalloc_objs(sch->pnode[0], nr_node_ids);
6628 	if (!sch->pnode) {
6629 		ret = -ENOMEM;
6634 		sch->pnode[node] = alloc_pnode(sch, node);
6635 		if (!sch->pnode[node]) {
6636 			ret = -ENOMEM;
6641 	sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
6642 	sch->pcpu = __alloc_percpu(struct_size_t(struct scx_sched_pcpu,
6643 						 dsp_ctx.buf, sch->dsp_max_batch),
6645 	if (!sch->pcpu) {
6646 		ret = -ENOMEM;
6659 		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
6661 		pcpu->sch = sch;
6662 		INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node);
6665 	sch->helper = kthread_run_worker(0, "sched_ext_helper");
6666 	if (IS_ERR(sch->helper)) {
6667 		ret = PTR_ERR(sch->helper);
6671 	sched_set_fifo(sch->helper->task);
6674 		memcpy(sch->ancestors, parent->ancestors,
6675 		       level * sizeof(parent->ancestors[0]));
6676 	sch->ancestors[level] = sch;
6677 	sch->level = level;
6679 	if (ops->timeout_ms)
6680 		sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
6682 		sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;
6684 	sch->slice_dfl = SCX_SLICE_DFL;
6685 	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
6686 	sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn);
6687 	kthread_init_work(&sch->disable_work, scx_disable_workfn);
6688 	timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);
6690 	if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) {
6691 		ret = -ENOMEM;
6694 	if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) {
6695 		ret = -ENOMEM;
6698 	sch->ops = *ops;
6699 	rcu_assign_pointer(ops->priv, sch);
6701 	sch->kobj.kset = scx_kset;
6702 	INIT_LIST_HEAD(&sch->all);
6707 		ret = -ENOMEM;
6711 	sch->cgrp_path = kstrdup(buf, GFP_KERNEL);
6713 	if (!sch->cgrp_path) {
6714 		ret = -ENOMEM;
6718 	sch->cgrp = cgrp;
6719 	INIT_LIST_HEAD(&sch->children);
6720 	INIT_LIST_HEAD(&sch->sibling);
6723 		ret = kobject_init_and_add(&sch->kobj, &scx_ktype,
6724 					   &parent->sub_kset->kobj,
6725 					   "sub-%llu", cgroup_id(cgrp));
6727 		ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
6730 		RCU_INIT_POINTER(ops->priv, NULL);
6731 		kobject_put(&sch->kobj);
6735 	if (ops->sub_attach) {
6736 		sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj);
6737 		if (!sch->sub_kset) {
6738 			RCU_INIT_POINTER(ops->priv, NULL);
6739 			kobject_put(&sch->kobj);
6740 			return ERR_PTR(-ENOMEM);
6744 	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
6746 		RCU_INIT_POINTER(ops->priv, NULL);
6747 		kobject_put(&sch->kobj);
6755 	RCU_INIT_POINTER(ops->priv, NULL);
6756 	free_cpumask_var(sch->bypass_lb_resched_cpumask);
6759 	free_cpumask_var(sch->bypass_lb_donee_cpumask);
6761 	kthread_destroy_worker(sch->helper);
6768 	free_percpu(sch->pcpu);
6771 		free_pnode(sch->pnode[node]);
6772 	kfree(sch->pnode);
6774 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
6776 	free_exit_info(sch->exit_info);
6796 	if (ops->hotplug_seq) {
6798 		if (ops->hotplug_seq != global_hotplug_seq) {
6802 				 ops->hotplug_seq, global_hotplug_seq);
6803 			return -EBUSY;
6816 	if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
6818 		return -EINVAL;
6822 	 * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
6825 	if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
6826 	    (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
6828 		return -EINVAL;
6831 	if (ops->cpu_acquire || ops->cpu_release)
6832 		pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");
6838  * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
6839  * starvation. During the READY -> ENABLED task switching loop, the calling
6842  * fair-class saturation, leading to a system hang.
6853 	struct sched_ext_ops *ops = cmd->ops;
6863 		ret = -EBUSY;
6868 	 * @ops->priv binds @ops to its scx_sched instance. It is set here by
6871 	 * it's still non-NULL here, a previous attachment on @ops has not
6872 	 * finished tearing down; proceeding would let the in-flight unreg's
6873 	 * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign.
6875 	if (rcu_access_pointer(ops->priv)) {
6876 		ret = -EBUSY;
6905 		rq->scx.local_dsq.sched = sch;
6906 		rq->scx.cpuperf_target = SCX_CPUPERF_ONE;
6911 	 * online CPUs by watching ->on/offline_cpu() after ->init().
6929 	if (sch->ops.init) {
6937 		sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
6942 			set_bit(i, sch->has_op);
6965 	 * detaches ext_server, so partially-attached state is cleaned up
6973 			ret = dl_server_attach_bw(&rq->ext_server);
6992 			set_bit(i, sch->has_op);
6994 	if (sch->ops.cpu_acquire || sch->ops.cpu_release)
6995 		sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
7045 		 * leaving the post-init recheck below to unwind.
7060 				  ret, p->comm, p->pid);
7089 	WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
7093 	 * We're fully committed and can't fail. The task READY -> ENABLED
7101 		const struct sched_class *old_class = p->sched_class;
7111 			p->scx.slice = READ_ONCE(sch->slice_dfl);
7112 			p->sched_class = new_class;
7121 		WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
7125 	if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
7143 			dl_server_detach_bw(&rq->fair_server);
7148 		sch->ops.name, scx_switched_all() ? "" : " (partial)");
7149 	kobject_uevent(&sch->kobj, KOBJ_ADD);
7154 	cmd->ret = 0;
7161 	cmd->ret = ret;
7181 	cmd->ret = 0;
7188 	struct scx_sched *parent = cgrp->scx_sched;
7194 	if (parent->cgrp == cgrp)
7195 		return ERR_PTR(-EBUSY);
7197 	/* does $parent allow sub-scheds? */
7198 	if (!parent->ops.sub_attach)
7199 		return ERR_PTR(-EOPNOTSUPP);
7202 	list_for_each_entry(pos, &parent->children, sibling)
7203 		if (cgroup_is_descendant(pos->cgrp, cgrp))
7204 			return ERR_PTR(-EBUSY);
7219 			  state, p->comm, p->pid);
7227 	struct sched_ext_ops *ops = cmd->ops;
7237 		ret = -ENODEV;
7241 	/* See scx_root_enable_workfn() for the @ops->priv check. */
7242 	if (rcu_access_pointer(ops->priv)) {
7243 		ret = -EBUSY;
7247 	cgrp = cgroup_get_from_id(ops->sub_cgroup_id);
7260 	kobject_get(&parent->kobj);
7265 	kobject_put(&parent->kobj);
7275 	if (sch->level >= SCX_SUB_MAX_DEPTH) {
7281 	if (sch->ops.init) {
7288 		sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
7295 		.ops = &sch->ops,
7296 		.cgroup_path = sch->cgrp_path,
7306 	sch->sub_attached = true;
7312 			set_bit(i, sch->has_op);
7318 	 * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see
7322 	if (!(cgrp->self.flags & CSS_ONLINE)) {
7335 	scx_task_iter_start(&sti, sch->cgrp);
7350 		if (p->scx.flags & SCX_TASK_SUB_INIT)
7358 			ret = -EINVAL;
7379 			 * just-completed init is owed an exit_task() and we
7388 		p->scx.flags |= SCX_TASK_SUB_INIT;
7399 	scx_task_iter_start(&sti, sch->cgrp);
7405 		if (!(p->scx.flags & SCX_TASK_SUB_INIT))
7425 			p->scx.flags &= ~SCX_TASK_SUB_INIT;
7437 	pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name);
7438 	kobject_uevent(&sch->kobj, KOBJ_ADD);
7446 	cmd->ret = ret;
7457 	 * must stay set until SUB_INIT is cleared from every marked task -
7460 	scx_task_iter_start(&sti, sch->cgrp);
7462 		if (p->scx.flags & SCX_TASK_SUB_INIT) {
7464 			p->scx.flags &= ~SCX_TASK_SUB_INIT;
7476 	cmd->ret = 0;
7490 		/* inherit ->scx_sched from $parent */
7492 			rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched);
7496 		if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp)
7497 			scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN,
7526 		return -EINVAL;
7536 				return -ENOMEM;
7538 			sched_set_fifo(w->task);
7545 	if (ops->sub_cgroup_id > 1)
7588 	t = btf_type_by_id(reg->btf, reg->btf_id);
7597 			pr_warn("sched_ext: Writing directly to p->scx.slice/dsq_vtime is deprecated, use scx_bpf_task_set_slice/dsq_vtime()");
7606 	return -EACCES;
7627 			return -E2BIG;
7628 		ops->dispatch_max_batch = *(u32 *)(udata + moff);
7632 			return -EINVAL;
7633 		ops->flags = *(u64 *)(udata + moff);
7636 		ret = bpf_obj_name_cpy(ops->name, uops->name,
7637 				       sizeof(ops->name));
7641 			return -EINVAL;
7646 			return -E2BIG;
7647 		ops->timeout_ms = *(u32 *)(udata + moff);
7650 		ops->exit_dump_len =
7654 		ops->hotplug_seq = *(u64 *)(udata + moff);
7658 		ops->sub_cgroup_id = *(u64 *)(udata + moff);
7672 	sch = scx_prog_sched(prog->aux);
7701 		if (prog->sleepable)
7702 			return -EINVAL;
7710 	 * XXX - Ideally, we should only do this for scheds that allow
7711 	 * sub-scheds and sub-scheds themselves but I don't know how to access
7716 		prog->aux->priv_stack_requested = true;
7717 		prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch;
7732 	struct scx_sched *sch = rcu_dereference_protected(ops->priv, true);
7736 	RCU_INIT_POINTER(ops->priv, NULL);
7737 	kobject_put(&sch->kobj);
7750 	 * sched_ext does not support updating the actively-loaded BPF
7756 	return -EOPNOTSUPP;
7764 static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
7780 static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
7785 static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
7787 static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
7794 static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; }
7798 static s32 sched_ext_ops__init(void) { return -EINVAL; }
7880 	.help_msg	= "reset-sched-ext(S)",
7887 	struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
7896 	.help_msg	= "dump-sched-ext(D)",
7915 	return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE);
7921 	struct scx_rq *this_scx = &this_rq->scx;
7927 	cur_class = rq->curr->sched_class;
7937 		if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
7939 				rq->curr->scx.slice = 0;
7940 			cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
7943 		if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
7945 				cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
7946 				ksyncs[cpu] = rq->scx.kick_sync;
7949 			cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
7954 		cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
7955 		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
7980 	struct scx_rq *this_scx = &this_rq->scx;
7990 	ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;
7992 	for_each_cpu(cpu, this_scx->cpus_to_kick) {
7994 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
7995 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
7998 	for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) {
8000 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
8009 		this_scx->kick_sync_pending = true;
8016  * print_scx_info - print out sched_ext scheduler state
8047 	if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
8049 		printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name,
8054 	if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at,
8061 	       log_lvl, sch->ops.name, scx_enable_state_str[state], all,
8120 		BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL));
8122 		INIT_LIST_HEAD(&rq->scx.runnable_list);
8123 		INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
8125 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n));
8126 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
8127 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
8128 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
8129 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
8130 		raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
8131 		INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
8132 		INIT_LIST_HEAD(&rq->scx.deferred_reenq_users);
8133 		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
8134 		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
8137 			cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
8160 			scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id);
8163 	} else if ((sch->ops.flags & SCX_OPS_ALWAYS_ENQ_IMMED) && is_local) {
8200 	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
8209 	if (unlikely(dspc->cursor >= sch->dsp_max_batch)) {
8214 	dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){
8216 		.qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK,
8225  * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ
8279 		p->scx.slice = slice;
8281 		p->scx.slice = p->scx.slice ?: 1;
8305 		p->scx.slice = slice;
8307 		p->scx.slice = p->scx.slice ?: 1;
8309 	p->scx.dsq_vtime = vtime;
8325  * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion
8328  *       @args->dsq_id: DSQ to insert into
8329  *       @args->slice: duration @p can run for in nsecs, 0 to keep the current value
8330  *       @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
8331  *       @args->enq_flags: SCX_ENQ_*
8339  * @args->dsq_id. Tasks queued into the priority queue are ordered by
8340  * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert().
8342  * @args->vtime ordering is according to time_before64() which considers
8344  * ordering and vice-versa.
8348  * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and
8368 	return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice,
8369 				    args->vtime, args->enq_flags);
8388 	 * Disallow if any sub-scheds are attached. There is no way to tell
8391 	if (unlikely(!list_empty(&sch->children))) {
8418 	struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
8428 	 * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL.
8433 	sch = src_dsq->sched;
8440 	 * cause similar live-lock conditions as consume_dispatch_q().
8442 	if (unlikely(READ_ONCE(sch->aborting)))
8447 			  p->comm, p->pid);
8460 	in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE;
8472 	raw_spin_lock(&src_dsq->lock);
8475 	if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) {
8476 		raw_spin_unlock(&src_dsq->lock);
8488 	if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
8489 		p->scx.dsq_vtime = kit->vtime;
8490 	if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
8491 		p->scx.slice = kit->slice;
8506 	kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE |
8514  * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
8529 	return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor);
8533  * scx_bpf_dispatch_cancel - Cancel the latest dispatch
8550 	dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
8552 	if (dspc->cursor > 0)
8553 		dspc->cursor--;
8559  * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
8560  * @dsq_id: DSQ to move task from. Must be a user-created DSQ
8564  * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
8568  * Built-in DSQs (%SCX_DSQ_GLOBAL and %SCX_DSQ_LOCAL*) are not supported as
8573  * per-node DSQs making the scope difficult to define; this may change in the
8576  * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
8599 	dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
8601 	flush_dispatch_buf(sch, dspc->rq);
8609 	if (consume_dispatch_q(sch, dspc->rq, dsq, enq_flags)) {
8616 		dspc->nr_tasks++;
8632  * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
8645 	kit->slice = slice;
8646 	kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
8650  * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
8652  * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
8664 	kit->vtime = vtime;
8665 	kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
8669  * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
8676  * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can
8690  * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local
8702  * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
8728  * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler
8755 		scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu",
8760 	return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev,
8791  * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
8795  * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
8829  * scx_bpf_create_dsq - Create a custom DSQ
8845 		return -EINVAL;
8848 		return -EINVAL;
8852 		return -ENOMEM;
8868 		dsq->sched = sch;
8869 		ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
8872 		ret = -ENODEV;
8907  * scx_bpf_task_set_slice - Set task's time slice
8925 	p->scx.slice = slice;
8930  * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering
8948 	p->scx.dsq_vtime = vtime;
8990 		cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle);
8992 		cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
8995 			cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
8997 			cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait);
9000 	irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
9006  * scx_bpf_kick_cpu - Trigger reschedule on a CPU
9027  * scx_bpf_dsq_nr_queued - Return the number of queued tasks
9032  * -%ENOENT is returned.
9044 		ret = -ENODEV;
9049 		ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
9055 			ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
9061 			ret = READ_ONCE(dsq->nr);
9065 	ret = -ENOENT;
9072  * scx_bpf_destroy_dsq - Destroy a custom DSQ
9092  * bpf_iter_scx_dsq_new - Create a DSQ iterator
9113 		     ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
9117 	 * Always clear $kit->dsq.
9119 	kit->dsq = NULL;
9123 		return -ENODEV;
9126 		return -EINVAL;
9128 	kit->dsq = find_user_dsq(sch, dsq_id);
9129 	if (!kit->dsq)
9130 		return -ENOENT;
9132 	kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags);
9138  * bpf_iter_scx_dsq_next - Progress a DSQ iterator
9147 	if (!kit->dsq)
9150 	guard(raw_spinlock_irqsave)(&kit->dsq->lock);
9152 	return nldsq_cursor_next_task(&kit->cursor, kit->dsq);
9156  * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
9165 	if (!kit->dsq)
9168 	if (!list_empty(&kit->cursor.node)) {
9171 		raw_spin_lock_irqsave(&kit->dsq->lock, flags);
9172 		list_del_init(&kit->cursor.node);
9173 		raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
9175 	kit->dsq = NULL;
9179  * scx_bpf_dsq_peek - Lockless peek at the first element.
9185  * this provides only a point-in-time snapshot, and the contents may change
9207 		scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id);
9211 	return rcu_dereference(dsq->first_task);
9215  * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ
9216  * @dsq_id: DSQ to re-enqueue
9221  * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are
9224  * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu)
9225  * - User DSQs
9227  * Re-enqueues are performed asynchronously. Can be called from anywhere.
9255  * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
9259  * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
9282 		return -EINVAL;
9312 	return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line),
9319  * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
9345  * scx_bpf_error_bstr - Indicate fatal error
9369  * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler
9386 	struct scx_bstr_buf *buf = &dd->buf;
9395 	if (raw_smp_processor_id() != dd->cpu) {
9401 	ret = __bstr_format(sch, buf->data, buf->line + dd->cursor,
9402 			    sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
9404 		dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
9405 			  dd->prefix, fmt, data, data__sz, ret);
9409 	dd->cursor += ret;
9410 	dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line));
9412 	if (!dd->cursor)
9423 	if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n')
9428  * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
9450  * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
9478  * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
9529 		rq->scx.cpuperf_target = perf;
9538  * scx_bpf_nr_node_ids - Return the number of possible node IDs
9548  * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
9558  * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
9566  * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
9574  * scx_bpf_put_cpumask - Release a possible/online cpumask
9581 	 * a reference to a global cpumask, which is read-only in the caller and
9588  * scx_bpf_task_running - Is task currently running?
9593 	return task_rq(p)->curr == p;
9597  * scx_bpf_task_cpu - CPU a task is currently associated with
9606  * scx_bpf_cpu_rq - Fetch the rq of a CPU
9623 	if (!sch->warned_deprecated_rq) {
9627 		sch->warned_deprecated_rq = true;
9634  * scx_bpf_locked_rq - Return the rq currently locked by SCX
9661  * scx_bpf_cpu_curr - Return remote CPU's curr task
9680 	return rcu_dereference(cpu_rq(cpu)->curr);
9684  * scx_bpf_now - Returns a high-performance monotonically non-decreasing
9691  *  Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which
9692  *  eventually reads a hardware timestamp counter -- is neither performant nor
9693  *  scalable. scx_bpf_now() aims to provide a high-performance clock by
9703  * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now()
9706  *  is no such guarantee -- the clock can go backward. It provides a
9707  *  monotonically *non-decreasing* clock so that it would provide the same
9719 	if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) {
9723 		 * Note that scx_bpf_now() is re-entrant between a process
9728 		clock = READ_ONCE(rq->scx.clock);
9750 	/* Aggregate per-CPU event counters into @events. */
9753 		e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;
9771  * scx_bpf_events - Get a system-wide event counter to
9790 	 * We cannot entirely trust a BPF-provided size since a BPF program
9802  * scx_bpf_task_cgroup - Return the sched cgroup of a task
9806  * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
9808  * determine @p's current cgroup as, unlike following @p->cgroups,
9809  * @p->sched_task_group is stable for the duration of the SCX op. See
9815 	struct task_group *tg = p->sched_task_group;
9880  * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc
9882  * scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter())
9883  * consults this table to decide whether a context-sensitive kfunc is callable
9897  * context-sensitive.
9924  * Verifier-time filter for SCX kfuncs. Registered via the .filter field on
9925  * each per-group btf_kfunc_id_set. The BPF core invokes this for every kfunc
9928  * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by
9942 	/* Not an SCX kfunc - allow. */
9948 	if (prog->type == BPF_PROG_TYPE_SYSCALL)
9949 		return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES;
9951 	if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
9952 		return (in_any || in_idle) ? 0 : -EACCES;
9957 	 * prog->aux->st_ops. Allow all kfuncs when st_ops is not yet set;
9958 	 * do_check_main() re-runs the filter with st_ops set and enforces the
9961 	if (!prog->aux->st_ops)
9965 	 * Non-SCX struct_ops: SCX kfuncs are not permitted.
9967 	if (prog->aux->st_ops != &bpf_sched_ext_ops)
9968 		return -EACCES;
9970 	/* SCX struct_ops: check the per-op allow list. */
9974 	moff = prog->aux->attach_st_ops_member_off;
9988 	return -EACCES;
9999 	 * Some kfuncs are context-sensitive and can only be called from
10000 	 * specific SCX ops. They are grouped into per-context BTF sets, each
10004 	 * kfunc lookup; it consults scx_kf_allow_flags[] to enforce per-op
10048 		return -ENOMEM;
10051 	ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group);