Lines Matching +full:slice +full:- +full:per +full:- +full:line
1 /* SPDX-License-Identifier: GPL-2.0 */
3 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
33 SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
34 SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
35 SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
51 * SYS ACT: System-defined exit actions
52 * SYS RSN: System-defined exit reasons
53 * USR : User-defined exit codes and reasons
56 * actions and/or system reasons with a user-defined exit code.
71 /* %SCX_EXIT_* - broad category of the exit reason */
94 * Keep built-in idle tracking even if ops.update_idle() is implemented.
100 * keeps running the current task even after its slice expires. If this
156 /* argument container for ops->cgroup_init() */
174 * Argument container for ops->cpu_acquire(). Currently empty, but may be
179 /* argument container for ops->cpu_release() */
200 * struct sched_ext_ops - Operation table for BPF scheduler implementation
209 * select_cpu - Pick the target CPU for a task which is being woken up
217 * saves a small bit of overhead down the line.
236 * enqueue - Enqueue a task on the BPF scheduler
251 * dequeue - Remove a task from the BPF scheduler
261 * on the scheduling logic, this can lead to confusing behaviors - e.g.
267 * dispatch - Dispatch tasks from the BPF scheduler and/or user DSQs
281 * When not %NULL, @prev is an SCX task with its slice depleted. If
283 * @prev->scx.flags, it is not enqueued yet and will be enqueued after
290 * tick - Periodic tick
294 * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
300 * runnable - A task is becoming runnable on its associated CPU
305 * execution state transitions. A task becomes ->runnable() on a CPU,
306 * and then goes through one or more ->running() and ->stopping() pairs
307 * as it runs on the CPU, and eventually becomes ->quiescent() when it's
312 * - waking up (%SCX_ENQ_WAKEUP)
313 * - being moved from another CPU
314 * - being restored after temporarily taken off the queue for an
317 * This and ->enqueue() are related but not coupled. This operation
318 * notifies @p's state transition and may not be followed by ->enqueue()
321 * task may be ->enqueue()'d without being preceded by this operation
322 * e.g. after exhausting its slice.
327 * running - A task is starting to run on its associated CPU
330 * See ->runnable() for an explanation of the task state notifiers.
335 * stopping - A task is stopping execution
339 * See ->runnable() for an explanation of the task state notifiers. If
340 * !@runnable, ->quiescent() will be invoked after this operation
346 * quiescent - A task is becoming not runnable on its associated CPU
350 * See ->runnable() for an explanation of the task state notifiers.
354 * - sleeping (%SCX_DEQ_SLEEP)
355 * - being moved to another CPU
356 * - being temporarily taken off the queue for an attribute change
359 * This and ->dequeue() are related but not coupled. This operation
360 * notifies @p's state transition and may not be preceded by ->dequeue()
366 * yield - Yield CPU
375 * If @to is non-NULL, @from wants to yield the CPU to @to. If the bpf
381 * core_sched_before - Task ordering for core-sched
385 * Used by core-sched to determine the ordering between two tasks. See
386 * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
387 * core-sched.
399 * set_weight - Set task weight
408 * set_cpumask - Set CPU affinity
418 * update_idle - Update the idle state of a CPU
423 * state. By default, implementing this operation disables the built-in
426 * - scx_bpf_select_cpu_dfl()
427 * - scx_bpf_test_and_clear_cpu_idle()
428 * - scx_bpf_pick_idle_cpu()
433 * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
439 * cpu_acquire - A CPU is becoming available to the BPF scheduler
449 * cpu_release - A CPU is taken away from the BPF scheduler
456 * caller should consult @args->reason to determine the cause.
461 * init_task - Initialize a task to run in a BPF scheduler
469 * Return 0 for success, -errno for failure. An error return while
476 * exit_task - Exit a previously-running task from the system
485 * enable - Enable BPF scheduling for a task
494 * disable - Disable BPF scheduling for a task
504 * dump - Dump BPF scheduler state on error
512 * dump_cpu - Dump BPF scheduler state for a CPU on error
524 * dump_task - Dump BPF scheduler state for a runnable task on error
535 * cgroup_init - Initialize a cgroup
542 * Return 0 for success, -errno for failure. An error return while
550 * cgroup_exit - Exit a cgroup
559 * cgroup_prep_move - Prepare a task to be moved to a different cgroup
567 * Return 0 for success, -errno for failure. An error return aborts the
574 * cgroup_move - Commit cgroup move
585 * cgroup_cancel_move - Cancel cgroup move
597 * cgroup_set_weight - A cgroup's weight is being changed
611 * cpu_online - A CPU became online
620 * cpu_offline - A CPU is going offline
633 * init - Initialize the BPF scheduler
638 * exit - Clean up after the BPF scheduler
648 * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch
653 * flags - %SCX_OPS_* flags
658 * timeout_ms - The maximum amount of time, in milliseconds, that a
667 * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default
673 * hotplug_seq - A sequence number that may be set by the scheduler to
682 * name - BPF scheduler's name
684 * Must be a non-zero valid BPF object name including only isalnum(),
717 * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
727 * invoked in a ->cpu_release() callback, and the task is again
728 * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
730 * of the ->cpu_acquire() callback.
740 * The BPF scheduler is responsible for triggering a follow-up
759 * The generic core-sched layer decided to execute the task even though
780 * current task of the target CPU is an SCX task, its ->scx.slice is
813 * sched_ext_entity->ops_state
818 * NONE -> QUEUEING -> QUEUED -> DISPATCHING
821 * \-------------------------------/
846 * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
854 #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
893 { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
946 * Non-NULL values are used for direct dispatch from enqueue path. A valid
955 * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
956 * to avoid live-locking in bypass mode where all tasks are dispatched to
957 * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
993 char line[SCX_EXIT_MSG_LEN]; member
1010 .cpu = -1,
1040 return jiffies_to_msecs(at - now); in jiffies_delta_msecs()
1042 return -(long)jiffies_to_msecs(now - at); in jiffies_delta_msecs()
1048 return ~((1 << fls(flags)) - 1); in higher_bits()
1060 return (s32)(a - b) < 0; in u32_before()
1084 WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, in scx_kf_allow()
1085 "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", in scx_kf_allow()
1086 current->scx.kf_mask, mask); in scx_kf_allow()
1087 current->scx.kf_mask |= mask; in scx_kf_allow()
1094 current->scx.kf_mask &= ~mask; in scx_kf_disallow()
1123 * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such
1126 * for non-nesting operations due to the way the tasks are tracked.
1135 current->scx.kf_tasks[0] = task; \
1137 current->scx.kf_tasks[0] = NULL; \
1144 current->scx.kf_tasks[0] = task; \
1146 current->scx.kf_tasks[0] = NULL; \
1154 current->scx.kf_tasks[0] = task0; \
1155 current->scx.kf_tasks[1] = task1; \
1157 current->scx.kf_tasks[0] = NULL; \
1158 current->scx.kf_tasks[1] = NULL; \
1165 if (unlikely(!(current->scx.kf_mask & mask))) { in scx_kf_allowed()
1167 mask, current->scx.kf_mask); in scx_kf_allowed()
1179 (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { in scx_kf_allowed()
1185 (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { in scx_kf_allowed()
1200 if (unlikely((p != current->scx.kf_tasks[0] && in scx_kf_allowed_on_arg_tasks()
1201 p != current->scx.kf_tasks[1]))) { in scx_kf_allowed_on_arg_tasks()
1211 return !current->scx.kf_mask; in scx_kf_allowed_if_unlocked()
1215 * nldsq_next_task - Iterate to the next task in a non-local DSQ
1228 lockdep_assert_held(&dsq->lock); in nldsq_next_task()
1231 list_node = &cur->scx.dsq_list.node; in nldsq_next_task()
1233 list_node = &dsq->list; in nldsq_next_task()
1238 list_node = list_node->prev; in nldsq_next_task()
1240 list_node = list_node->next; in nldsq_next_task()
1242 if (list_node == &dsq->list) in nldsq_next_task()
1247 } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); in nldsq_next_task()
1258 * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
1259 * dispatch order. BPF-visible iterator is opaque and larger to allow future
1279 u64 slice; member
1300 * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
1318 ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); in scx_task_iter_start()
1322 iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; in scx_task_iter_start()
1323 list_add(&iter->cursor.tasks_node, &scx_tasks); in scx_task_iter_start()
1324 iter->locked = NULL; in scx_task_iter_start()
1325 iter->cnt = 0; in scx_task_iter_start()
1330 if (iter->locked) { in __scx_task_iter_rq_unlock()
1331 task_rq_unlock(iter->rq, iter->locked, &iter->rf); in __scx_task_iter_rq_unlock()
1332 iter->locked = NULL; in __scx_task_iter_rq_unlock()
1337 * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
1351 * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
1352 * @iter: iterator to re-lock
1354 * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
1355 * doesn't re-lock the rq lock. Must be called before other iterator operations.
1363 * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
1372 list_del_init(&iter->cursor.tasks_node); in scx_task_iter_stop()
1377 * scx_task_iter_next - Next task
1381 * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
1386 struct list_head *cursor = &iter->cursor.tasks_node; in scx_task_iter_next()
1389 if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { in scx_task_iter_next()
1396 if (&pos->tasks_node == &scx_tasks) in scx_task_iter_next()
1398 if (!(pos->flags & SCX_TASK_CURSOR)) { in scx_task_iter_next()
1399 list_move(cursor, &pos->tasks_node); in scx_task_iter_next()
1409 * scx_task_iter_next_locked - Next non-idle task with its rq locked
1413 * Visit the non-idle task with its rq lock held. Allows callers to specify
1426 * while loading the BPF scheduler and vice-versa while in scx_task_iter_next_locked()
1430 * - It's unsafe to use __setscheduler_prio() on an init_task to in scx_task_iter_next_locked()
1434 * - ops.init/exit_task() can easily be confused if called with in scx_task_iter_next_locked()
1441 * - %PF_IDLE may not be set for an init_task whose CPU hasn't in scx_task_iter_next_locked()
1444 * - %PF_IDLE can be set on tasks that are not init_tasks. See in scx_task_iter_next_locked()
1449 if (p->sched_class != &idle_sched_class) in scx_task_iter_next_locked()
1455 iter->rq = task_rq_lock(p, &iter->rf); in scx_task_iter_next_locked()
1456 iter->locked = p; in scx_task_iter_next_locked()
1482 return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); in scx_rq_bypassing()
1486 * wait_ops_state - Busy-wait the specified ops state to end
1490 * Busy-wait for @p to transition out of @opss. This can only be used when the
1499 } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); in wait_ops_state()
1503 * ops_cpu_valid - Verify a cpu number
1523 * ops_sanitize_err - Sanitize a -errno value
1525 * @err: -errno value to sanitize
1527 * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return
1528 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
1536 if (err < 0 && err >= -MAX_ERRNO) in ops_sanitize_err()
1540 return -EPROTO; in ops_sanitize_err()
1565 * schedule_deferred - Schedule execution of deferred actions on an rq
1582 if (rq->scx.flags & SCX_RQ_IN_WAKEUP) in schedule_deferred()
1589 if (rq->scx.flags & SCX_RQ_IN_BALANCE) { in schedule_deferred()
1590 queue_balance_callback(rq, &rq->scx.deferred_bal_cb, in schedule_deferred()
1597 * IRQ re-enable which may take a bit longer than the scheduler hooks. in schedule_deferred()
1599 * the time to IRQ re-enable shouldn't be long. in schedule_deferred()
1601 irq_work_queue(&rq->scx.deferred_irq_work); in schedule_deferred()
1605 * touch_core_sched - Update timestamp used for core-sched task ordering
1609 * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
1610 * implement global or local-DSQ FIFO ordering for core-sched. Should be called
1611 * when a task becomes runnable and its turn on the CPU ends (e.g. slice
1624 * it may be better to use per-core dispatch sequence instead. in touch_core_sched()
1627 p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq)); in touch_core_sched()
1632 * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
1636 * If the BPF scheduler implements custom core-sched ordering via
1637 * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
1639 * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
1653 struct task_struct *curr = rq->curr; in update_curr_scx()
1660 if (curr->scx.slice != SCX_SLICE_INF) { in update_curr_scx()
1661 curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec); in update_curr_scx()
1662 if (!curr->scx.slice) in update_curr_scx()
1675 return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); in scx_dsq_priq_less()
1680 /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ in dsq_mod_nr()
1681 WRITE_ONCE(dsq->nr, dsq->nr + delta); in dsq_mod_nr()
1687 bool is_local = dsq->id == SCX_DSQ_LOCAL; in dispatch_enqueue()
1689 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); in dispatch_enqueue()
1690 WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || in dispatch_enqueue()
1691 !RB_EMPTY_NODE(&p->scx.dsq_priq)); in dispatch_enqueue()
1694 raw_spin_lock(&dsq->lock); in dispatch_enqueue()
1695 if (unlikely(dsq->id == SCX_DSQ_INVALID)) { in dispatch_enqueue()
1698 raw_spin_unlock(&dsq->lock); in dispatch_enqueue()
1700 raw_spin_lock(&dsq->lock); in dispatch_enqueue()
1704 if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && in dispatch_enqueue()
1709 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we in dispatch_enqueue()
1713 scx_ops_error("cannot use vtime ordering for built-in DSQs"); in dispatch_enqueue()
1725 if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && in dispatch_enqueue()
1727 scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", in dispatch_enqueue()
1728 dsq->id); in dispatch_enqueue()
1730 p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; in dispatch_enqueue()
1731 rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less); in dispatch_enqueue()
1735 * that @dsq->list is vtime ordered. in dispatch_enqueue()
1737 rbp = rb_prev(&p->scx.dsq_priq); in dispatch_enqueue()
1742 list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); in dispatch_enqueue()
1744 list_add(&p->scx.dsq_list.node, &dsq->list); in dispatch_enqueue()
1748 if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) in dispatch_enqueue()
1749 scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", in dispatch_enqueue()
1750 dsq->id); in dispatch_enqueue()
1753 list_add(&p->scx.dsq_list.node, &dsq->list); in dispatch_enqueue()
1755 list_add_tail(&p->scx.dsq_list.node, &dsq->list); in dispatch_enqueue()
1759 dsq->seq++; in dispatch_enqueue()
1760 p->scx.dsq_seq = dsq->seq; in dispatch_enqueue()
1763 p->scx.dsq = dsq; in dispatch_enqueue()
1771 p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; in dispatch_enqueue()
1772 p->scx.ddsp_enq_flags = 0; in dispatch_enqueue()
1779 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); in dispatch_enqueue()
1785 if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && in dispatch_enqueue()
1786 rq->curr->sched_class == &ext_sched_class) { in dispatch_enqueue()
1787 rq->curr->scx.slice = 0; in dispatch_enqueue()
1792 rq->curr->sched_class)) in dispatch_enqueue()
1795 raw_spin_unlock(&dsq->lock); in dispatch_enqueue()
1802 WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node)); in task_unlink_from_dsq()
1804 if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { in task_unlink_from_dsq()
1805 rb_erase(&p->scx.dsq_priq, &dsq->priq); in task_unlink_from_dsq()
1806 RB_CLEAR_NODE(&p->scx.dsq_priq); in task_unlink_from_dsq()
1807 p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; in task_unlink_from_dsq()
1810 list_del_init(&p->scx.dsq_list.node); in task_unlink_from_dsq()
1811 dsq_mod_nr(dsq, -1); in task_unlink_from_dsq()
1816 struct scx_dispatch_q *dsq = p->scx.dsq; in dispatch_dequeue()
1817 bool is_local = dsq == &rq->scx.local_dsq; in dispatch_dequeue()
1821 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. in dispatch_dequeue()
1824 if (unlikely(!list_empty(&p->scx.dsq_list.node))) in dispatch_dequeue()
1825 list_del_init(&p->scx.dsq_list.node); in dispatch_dequeue()
1830 * @p->scx.holding_cpu may be set under the protection of in dispatch_dequeue()
1833 if (p->scx.holding_cpu >= 0) in dispatch_dequeue()
1834 p->scx.holding_cpu = -1; in dispatch_dequeue()
1840 raw_spin_lock(&dsq->lock); in dispatch_dequeue()
1843 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't in dispatch_dequeue()
1846 if (p->scx.holding_cpu < 0) { in dispatch_dequeue()
1852 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the in dispatch_dequeue()
1856 WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node)); in dispatch_dequeue()
1857 p->scx.holding_cpu = -1; in dispatch_dequeue()
1859 p->scx.dsq = NULL; in dispatch_dequeue()
1862 raw_spin_unlock(&dsq->lock); in dispatch_dequeue()
1871 return &rq->scx.local_dsq; in find_dsq_for_dispatch()
1879 return &cpu_rq(cpu)->scx.local_dsq; in find_dsq_for_dispatch()
1888 scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", in find_dsq_for_dispatch()
1889 dsq_id, p->comm, p->pid); in find_dsq_for_dispatch()
1902 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value in mark_direct_dispatch()
1905 __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); in mark_direct_dispatch()
1910 scx_ops_error("%s[%d] already direct-dispatched", in mark_direct_dispatch()
1911 p->comm, p->pid); in mark_direct_dispatch()
1913 scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", in mark_direct_dispatch()
1914 ddsp_task->comm, ddsp_task->pid, in mark_direct_dispatch()
1915 p->comm, p->pid); in mark_direct_dispatch()
1919 WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); in mark_direct_dispatch()
1920 WARN_ON_ONCE(p->scx.ddsp_enq_flags); in mark_direct_dispatch()
1922 p->scx.ddsp_dsq_id = dsq_id; in mark_direct_dispatch()
1923 p->scx.ddsp_enq_flags = enq_flags; in mark_direct_dispatch()
1930 find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); in direct_dispatch()
1934 p->scx.ddsp_enq_flags |= enq_flags; in direct_dispatch()
1942 if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { in direct_dispatch()
1945 opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; in direct_dispatch()
1955 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); in direct_dispatch()
1959 p->comm, p->pid, opss); in direct_dispatch()
1960 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); in direct_dispatch()
1964 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); in direct_dispatch()
1965 list_add_tail(&p->scx.dsq_list.node, in direct_dispatch()
1966 &rq->scx.ddsp_deferred_locals); in direct_dispatch()
1971 dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); in direct_dispatch()
1983 return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq))); in scx_rq_online()
1992 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); in do_enqueue_task()
2009 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) in do_enqueue_task()
2014 unlikely(p->flags & PF_EXITING)) in do_enqueue_task()
2021 qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; in do_enqueue_task()
2023 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); in do_enqueue_task()
2024 atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); in do_enqueue_task()
2033 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) in do_enqueue_task()
2040 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); in do_enqueue_task()
2049 * For task-ordering, slice refill must be treated as implying the end in do_enqueue_task()
2050 * of the current slice. Otherwise, the longer @p stays on the CPU, the in do_enqueue_task()
2054 p->scx.slice = SCX_SLICE_DFL; in do_enqueue_task()
2056 dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); in do_enqueue_task()
2061 p->scx.slice = SCX_SLICE_DFL; in do_enqueue_task()
2067 return !list_empty(&p->scx.runnable_node); in task_runnable()
2074 if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { in set_task_runnable()
2075 p->scx.runnable_at = jiffies; in set_task_runnable()
2076 p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; in set_task_runnable()
2083 list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); in set_task_runnable()
2088 list_del_init(&p->scx.runnable_node); in clr_task_runnable()
2090 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; in clr_task_runnable()
2095 int sticky_cpu = p->scx.sticky_cpu; in enqueue_task_scx()
2098 rq->scx.flags |= SCX_RQ_IN_WAKEUP; in enqueue_task_scx()
2100 enq_flags |= rq->scx.extra_enq_flags; in enqueue_task_scx()
2103 p->scx.sticky_cpu = -1; in enqueue_task_scx()
2109 * direct-dispatch into the local DSQ by setting the sticky_cpu. in enqueue_task_scx()
2114 if (p->scx.flags & SCX_TASK_QUEUED) { in enqueue_task_scx()
2120 p->scx.flags |= SCX_TASK_QUEUED; in enqueue_task_scx()
2121 rq->scx.nr_running++; in enqueue_task_scx()
2132 rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; in enqueue_task_scx()
2143 opss = atomic_long_read_acquire(&p->scx.ops_state); in ops_dequeue()
2158 if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, in ops_dequeue()
2177 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); in ops_dequeue()
2184 if (!(p->scx.flags & SCX_TASK_QUEUED)) { in dequeue_task_scx()
2193 * and then stops running. As we want running <-> stopping transitions in dequeue_task_scx()
2194 * to be contained within runnable <-> quiescent transitions, trigger in dequeue_task_scx()
2195 * ->stopping() early here instead of in put_prev_task_scx(). in dequeue_task_scx()
2197 * @p may go through multiple stopping <-> running transitions between in dequeue_task_scx()
2212 p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; in dequeue_task_scx()
2214 p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; in dequeue_task_scx()
2216 p->scx.flags &= ~SCX_TASK_QUEUED; in dequeue_task_scx()
2217 rq->scx.nr_running--; in dequeue_task_scx()
2226 struct task_struct *p = rq->curr; in yield_task_scx()
2231 p->scx.slice = 0; in yield_task_scx()
2236 struct task_struct *from = rq->curr; in yield_to_task_scx()
2248 struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq; in move_local_task_to_local_dsq()
2251 lockdep_assert_held(&src_dsq->lock); in move_local_task_to_local_dsq()
2254 WARN_ON_ONCE(p->scx.holding_cpu >= 0); in move_local_task_to_local_dsq()
2257 list_add(&p->scx.dsq_list.node, &dst_dsq->list); in move_local_task_to_local_dsq()
2259 list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); in move_local_task_to_local_dsq()
2262 p->scx.dsq = dst_dsq; in move_local_task_to_local_dsq()
2267 * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
2283 p->scx.sticky_cpu = cpu_of(dst_rq); in move_remote_task_to_local_dsq()
2289 * We want to pass scx-specific enq_flags but activate_task() will in move_remote_task_to_local_dsq()
2291 * @rq->scx.extra_enq_flags instead. in move_remote_task_to_local_dsq()
2293 WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); in move_remote_task_to_local_dsq()
2294 WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); in move_remote_task_to_local_dsq()
2295 dst_rq->scx.extra_enq_flags = enq_flags; in move_remote_task_to_local_dsq()
2297 dst_rq->scx.extra_enq_flags = 0; in move_remote_task_to_local_dsq()
2304 * - is_cpu_allowed() asks "Can this task run on this CPU?" while
2314 * - The BPF scheduler is bypassed while the rq is offline and we can always say
2331 cpu_of(rq), p->comm, p->pid); in task_can_run_on_remote_rq()
2345 * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
2353 * non-local DSQ, it's better to use the same mechanism to protect against
2354 * dequeues and maintain the invariant that @p->scx.dsq can only change while
2362 * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets
2364 * would be cleared to -1. While other cpus may have updated it to different
2379 lockdep_assert_held(&dsq->lock); in unlink_dsq_and_lock_src_rq()
2381 WARN_ON_ONCE(p->scx.holding_cpu >= 0); in unlink_dsq_and_lock_src_rq()
2383 p->scx.holding_cpu = cpu; in unlink_dsq_and_lock_src_rq()
2385 raw_spin_unlock(&dsq->lock); in unlink_dsq_and_lock_src_rq()
2389 return likely(p->scx.holding_cpu == cpu) && in unlink_dsq_and_lock_src_rq()
2414 * move_task_between_dsqs() - Move a task from one DSQ to another
2434 BUG_ON(src_dsq->id == SCX_DSQ_LOCAL); in move_task_between_dsqs()
2435 lockdep_assert_held(&src_dsq->lock); in move_task_between_dsqs()
2438 if (dst_dsq->id == SCX_DSQ_LOCAL) { in move_task_between_dsqs()
2445 /* no need to migrate if destination is a non-local DSQ */ in move_task_between_dsqs()
2453 if (dst_dsq->id == SCX_DSQ_LOCAL) { in move_task_between_dsqs()
2454 /* @p is going from a non-local DSQ to a local DSQ */ in move_task_between_dsqs()
2459 raw_spin_unlock(&src_dsq->lock); in move_task_between_dsqs()
2461 raw_spin_unlock(&src_dsq->lock); in move_task_between_dsqs()
2467 * @p is going from a non-local DSQ to a non-local DSQ. As in move_task_between_dsqs()
2471 p->scx.dsq = NULL; in move_task_between_dsqs()
2472 raw_spin_unlock(&src_dsq->lock); in move_task_between_dsqs()
2481 * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
2501 while (atomic_read(&scx_ops_breather_depth) && --cnt) in scx_ops_breather()
2516 * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can in consume_dispatch_q()
2517 * live-lock the machine into soft lockups. Give a breather. in consume_dispatch_q()
2524 * @dsq->list without locking and skip if it seems empty. in consume_dispatch_q()
2526 if (list_empty(&dsq->list)) in consume_dispatch_q()
2529 raw_spin_lock(&dsq->lock); in consume_dispatch_q()
2537 raw_spin_unlock(&dsq->lock); in consume_dispatch_q()
2548 raw_spin_unlock(&dsq->lock); in consume_dispatch_q()
2560 * dispatch_to_local_dsq - Dispatch a task to a local dsq
2604 * we're moving from a DSQ and use the same mechanism - mark the task in dispatch_to_local_dsq()
2608 p->scx.holding_cpu = raw_smp_processor_id(); in dispatch_to_local_dsq()
2611 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); in dispatch_to_local_dsq()
2620 if (likely(p->scx.holding_cpu == raw_smp_processor_id()) && in dispatch_to_local_dsq()
2628 p->scx.holding_cpu = -1; in dispatch_to_local_dsq()
2629 dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags); in dispatch_to_local_dsq()
2636 if (sched_class_above(p->sched_class, dst_rq->curr->sched_class)) in dispatch_to_local_dsq()
2651 * finish_dispatch - Asynchronously finish dispatching a task
2682 opss = atomic_long_read(&p->scx.ops_state); in finish_dispatch()
2692 * dispatch/dequeue and re-enqueue cycle between in finish_dispatch()
2700 * it - the BPF scheduler is allowed to dispatch tasks in finish_dispatch()
2705 if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, in finish_dispatch()
2720 BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); in finish_dispatch()
2724 if (dsq->id == SCX_DSQ_LOCAL) in finish_dispatch()
2735 for (u = 0; u < dspc->cursor; u++) { in flush_dispatch_buf()
2736 struct scx_dsp_buf_ent *ent = &dspc->buf[u]; in flush_dispatch_buf()
2738 finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id, in flush_dispatch_buf()
2739 ent->enq_flags); in flush_dispatch_buf()
2742 dspc->nr_tasks += dspc->cursor; in flush_dispatch_buf()
2743 dspc->cursor = 0; in flush_dispatch_buf()
2749 bool prev_on_scx = prev->sched_class == &ext_sched_class; in balance_one()
2750 bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED; in balance_one()
2754 rq->scx.flags |= SCX_RQ_IN_BALANCE; in balance_one()
2755 rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); in balance_one()
2758 unlikely(rq->scx.cpu_released)) { in balance_one()
2762 * core. This callback complements ->cpu_release(), which is in balance_one()
2767 rq->scx.cpu_released = false; in balance_one()
2774 * If @prev is runnable & has slice left, it has priority and in balance_one()
2778 * implement ->cpu_release(). in balance_one()
2783 if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) { in balance_one()
2784 rq->scx.flags |= SCX_RQ_BAL_KEEP; in balance_one()
2790 if (rq->scx.local_dsq.nr) in balance_one()
2799 dspc->rq = rq; in balance_one()
2809 dspc->nr_tasks = 0; in balance_one()
2816 if (prev_on_rq && prev->scx.slice) { in balance_one()
2817 rq->scx.flags |= SCX_RQ_BAL_KEEP; in balance_one()
2820 if (rq->scx.local_dsq.nr) in balance_one()
2834 if (unlikely(!--nr_loops)) { in balance_one()
2838 } while (dspc->nr_tasks); in balance_one()
2847 rq->scx.flags |= SCX_RQ_BAL_KEEP; in balance_one()
2850 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; in balance_one()
2854 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; in balance_one()
2869 * When core-sched is enabled, this ops.balance() call will be followed in balance_scx()
2879 struct task_struct *sprev = srq->curr; in balance_scx()
2905 while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, in process_ddsp_deferred_locals()
2909 list_del_init(&p->scx.dsq_list.node); in process_ddsp_deferred_locals()
2911 dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); in process_ddsp_deferred_locals()
2912 if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) in process_ddsp_deferred_locals()
2913 dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags); in process_ddsp_deferred_locals()
2919 if (p->scx.flags & SCX_TASK_QUEUED) { in set_next_task_scx()
2921 * Core-sched might decide to execute @p before it is in set_next_task_scx()
2928 p->se.exec_start = rq_clock_task(rq); in set_next_task_scx()
2931 if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) in set_next_task_scx()
2938 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). in set_next_task_scx()
2940 if ((p->scx.slice == SCX_SLICE_INF) != in set_next_task_scx()
2941 (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { in set_next_task_scx()
2942 if (p->scx.slice == SCX_SLICE_INF) in set_next_task_scx()
2943 rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; in set_next_task_scx()
2945 rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; in set_next_task_scx()
2953 * tick-stopped CPUs. in set_next_task_scx()
2975 const struct sched_class *next_class = next->sched_class; in switch_class()
2983 smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); in switch_class()
2999 * sched_class, so invoke the ->cpu_release() callback if we have not in switch_class()
3003 * ->cpu_release() complements ->cpu_acquire(), which is emitted the in switch_class()
3006 if (!rq->scx.cpu_released) { in switch_class()
3016 rq->scx.cpu_released = true; in switch_class()
3026 if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) in put_prev_task_scx()
3029 if (p->scx.flags & SCX_TASK_QUEUED) { in put_prev_task_scx()
3033 * If @p has slice left and is being put, @p is getting in put_prev_task_scx()
3034 * preempted by a higher priority scheduler class or core-sched in put_prev_task_scx()
3038 if (p->scx.slice && !scx_rq_bypassing(rq)) { in put_prev_task_scx()
3039 dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); in put_prev_task_scx()
3047 * which should trigger an explicit follow-up scheduling event. in put_prev_task_scx()
3049 if (sched_class_above(&ext_sched_class, next->sched_class)) { in put_prev_task_scx()
3051 do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); in put_prev_task_scx()
3053 do_enqueue_task(rq, p, 0, -1); in put_prev_task_scx()
3058 if (next && next->sched_class != &ext_sched_class) in put_prev_task_scx()
3064 return list_first_entry_or_null(&rq->scx.local_dsq.list, in first_local_task()
3070 struct task_struct *prev = rq->curr; in pick_task_scx()
3072 bool prev_on_scx = prev->sched_class == &ext_sched_class; in pick_task_scx()
3073 bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; in pick_task_scx()
3091 if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { in pick_task_scx()
3105 * If balance_scx() is telling us to keep running @prev, replenish slice in pick_task_scx()
3111 if (!p->scx.slice) in pick_task_scx()
3112 p->scx.slice = SCX_SLICE_DFL; in pick_task_scx()
3121 if (unlikely(!p->scx.slice)) { in pick_task_scx()
3123 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", in pick_task_scx()
3124 p->comm, p->pid, __func__); in pick_task_scx()
3127 p->scx.slice = SCX_SLICE_DFL; in pick_task_scx()
3136 * scx_prio_less - Task ordering for core-sched
3140 * Core-sched is implemented as an additional scheduling layer on top of the
3142 * SCX, core-sched calls this function to interrogate the task ordering.
3144 * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
3146 * priority the task - the global FIFO ordering matching the default scheduling
3149 * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
3165 return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); in scx_prio_less()
3208 return -EBUSY; in scx_pick_idle_cpu()
3213 return -EBUSY; in scx_pick_idle_cpu()
3241 * - LLC 0: cpu0..cpu7 in llc_numa_mismatch()
3242 * - LLC 1: cpu8..cpu15 [offline] in llc_numa_mismatch()
3245 * - LLC 0: cpu16..cpu23 in llc_numa_mismatch()
3246 * - LLC 1: cpu24..cpu31 in llc_numa_mismatch()
3262 if (sd->span_weight != cpumask_weight(numa_cpus)) in llc_numa_mismatch()
3270 * Initialize topology-aware scheduling.
3273 * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
3300 if (sd->span_weight < num_online_cpus()) in update_selcpu_topology()
3334 * Built-in CPU idle selection policy:
3336 * 1. Prioritize full-idle cores:
3337 * - always prioritize CPUs from fully idle cores (both logical CPUs are
3341 * - prefer the last used CPU to take advantage of cached data (L1, L2) and
3344 * 3. Pick a CPU within the same LLC (Last-Level Cache):
3345 * - if the above conditions aren't met, pick a CPU that shares the same LLC
3349 * - choose a CPU from the same NUMA node to reduce memory access latency.
3380 * if a task's scheduling domain is restricted by user-space (through in scx_select_cpu_dfl()
3382 * defined by user-space. in scx_select_cpu_dfl()
3384 if (p->nr_cpus_allowed >= num_possible_cpus()) { in scx_select_cpu_dfl()
3427 !(current->flags & PF_EXITING) && in scx_select_cpu_dfl()
3428 cpu_rq(cpu)->scx.local_dsq.nr == 0) { in scx_select_cpu_dfl()
3429 if (cpumask_test_cpu(cpu, p->cpus_ptr)) in scx_select_cpu_dfl()
3469 cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); in scx_select_cpu_dfl()
3503 cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); in scx_select_cpu_dfl()
3553 p->scx.slice = SCX_SLICE_DFL; in select_task_rq_scx()
3554 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; in select_task_rq_scx()
3571 * The effective cpumask is stored in @p->cpus_ptr which may temporarily in set_cpus_allowed_scx()
3572 * differ from the configured one in @p->cpus_mask. Always tell the bpf in set_cpus_allowed_scx()
3575 * Fine-grained memory write control is enforced by BPF making the const in set_cpus_allowed_scx()
3580 (struct cpumask *)p->cpus_ptr); in set_cpus_allowed_scx()
3607 * it's only for optimization and self-correcting. in update_builtin_idle()
3655 * - for real idle transitions (do_notify == true) in __scx_update_idle()
3656 * - for idle-to-idle transitions (indicated by the previous task in __scx_update_idle()
3668 if (do_notify || is_idle_task(rq->curr)) in __scx_update_idle()
3703 rq->scx.flags |= SCX_RQ_ONLINE; in rq_online_scx()
3708 rq->scx.flags &= ~SCX_RQ_ONLINE; in rq_offline_scx()
3714 static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } in scx_pick_idle_cpu()
3726 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { in check_rq_for_timeouts()
3727 unsigned long last_runnable = p->scx.runnable_at; in check_rq_for_timeouts()
3731 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); in check_rq_for_timeouts()
3735 p->comm, p->pid, in check_rq_for_timeouts()
3772 u32 dur_ms = jiffies_to_msecs(jiffies - last_check); in scx_tick()
3787 * While disabling, always resched and refresh core-sched timestamp as in task_tick_scx()
3788 * we can't trust the slice management or ops.core_sched_before(). in task_tick_scx()
3791 curr->scx.slice = 0; in task_tick_scx()
3797 if (!curr->scx.slice) in task_tick_scx()
3806 * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the in tg_cgrp()
3809 if (tg && tg->css.cgroup) in tg_cgrp()
3810 return tg->css.cgroup; in tg_cgrp()
3825 return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; in scx_get_task_state()
3852 WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", in scx_set_task_state()
3853 prev_state, state, p->comm, p->pid); in scx_set_task_state()
3855 p->scx.flags &= ~SCX_TASK_STATE_MASK; in scx_set_task_state()
3856 p->scx.flags |= state << SCX_TASK_STATE_SHIFT; in scx_set_task_state()
3863 p->scx.disallow = false; in scx_ops_init_task()
3880 if (p->scx.disallow) { in scx_ops_init_task()
3888 * We're in the load path and @p->policy will be applied in scx_ops_init_task()
3889 * right after. Reverting @p->policy here and rejecting in scx_ops_init_task()
3891 * guarantees that if ops.init_task() sets @p->disallow, in scx_ops_init_task()
3894 if (p->policy == SCHED_EXT) { in scx_ops_init_task()
3895 p->policy = SCHED_NORMAL; in scx_ops_init_task()
3900 } else if (p->policy == SCHED_EXT) { in scx_ops_init_task()
3901 scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork", in scx_ops_init_task()
3902 p->comm, p->pid); in scx_ops_init_task()
3906 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; in scx_ops_init_task()
3923 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; in scx_ops_enable_task()
3925 p->scx.weight = sched_weight_to_cgroup(weight); in scx_ops_enable_task()
3932 SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); in scx_ops_enable_task()
3977 INIT_LIST_HEAD(&scx->dsq_list.node); in init_scx_entity()
3978 RB_CLEAR_NODE(&scx->dsq_priq); in init_scx_entity()
3979 scx->sticky_cpu = -1; in init_scx_entity()
3980 scx->holding_cpu = -1; in init_scx_entity()
3981 INIT_LIST_HEAD(&scx->runnable_node); in init_scx_entity()
3982 scx->runnable_at = jiffies; in init_scx_entity()
3983 scx->ddsp_dsq_id = SCX_DSQ_INVALID; in init_scx_entity()
3984 scx->slice = SCX_SLICE_DFL; in init_scx_entity()
4018 if (p->sched_class == &ext_sched_class) { in scx_post_fork()
4029 list_add_tail(&p->scx.tasks_node, &scx_tasks); in scx_post_fork()
4055 list_del_init(&p->scx.tasks_node); in sched_ext_free()
4059 * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> in sched_ext_free()
4077 p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); in reweight_task_scx()
4079 SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); in reweight_task_scx()
4092 * different scheduler class. Keep the BPF scheduler up-to-date. in switching_to_scx()
4096 (struct cpumask *)p->cpus_ptr); in switching_to_scx()
4112 if (scx_enabled() && READ_ONCE(p->scx.disallow) && in scx_check_setscheduler()
4113 p->policy != policy && policy == SCHED_EXT) in scx_check_setscheduler()
4114 return -EACCES; in scx_check_setscheduler()
4122 struct task_struct *p = rq->curr; in scx_can_stop_tick()
4127 if (p->sched_class != &ext_sched_class) in scx_can_stop_tick()
4135 return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; in scx_can_stop_tick()
4152 if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent) in scx_cgroup_warn_missing_weight()
4165 if (!tg->idle) in scx_cgroup_warn_missing_idle()
4177 WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); in scx_tg_online()
4186 { .weight = tg->scx_weight }; in scx_tg_online()
4189 tg->css.cgroup, &args); in scx_tg_online()
4194 tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; in scx_tg_online()
4196 tg->scx_flags |= SCX_TG_ONLINE; in scx_tg_online()
4205 WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); in scx_tg_offline()
4209 if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) in scx_tg_offline()
4210 SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup); in scx_tg_offline()
4211 tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); in scx_tg_offline()
4232 WARN_ON_ONCE(p->scx.cgrp_moving_from); in scx_cgroup_can_attach()
4237 * always match one-to-one. in scx_cgroup_can_attach()
4244 p, from, css->cgroup); in scx_cgroup_can_attach()
4249 p->scx.cgrp_moving_from = from; in scx_cgroup_can_attach()
4256 if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) in scx_cgroup_can_attach()
4258 p->scx.cgrp_moving_from, css->cgroup); in scx_cgroup_can_attach()
4259 p->scx.cgrp_moving_from = NULL; in scx_cgroup_can_attach()
4281 if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) in scx_move_task()
4288 if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) in scx_move_task()
4290 p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); in scx_move_task()
4291 p->scx.cgrp_moving_from = NULL; in scx_move_task()
4308 if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) in scx_cgroup_cancel_attach()
4310 p->scx.cgrp_moving_from, css->cgroup); in scx_cgroup_cancel_attach()
4311 p->scx.cgrp_moving_from = NULL; in scx_cgroup_cancel_attach()
4321 if (scx_cgroup_enabled && tg->scx_weight != weight) { in scx_group_set_weight()
4325 tg->scx_weight = weight; in scx_group_set_weight()
4358 * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task
4360 * the victim task's slice to 0 and triggering reschedule on the target CPU.
4362 * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
4364 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
4409 raw_spin_lock_init(&dsq->lock); in init_dsq()
4410 INIT_LIST_HEAD(&dsq->list); in init_dsq()
4411 dsq->id = dsq_id; in init_dsq()
4420 return ERR_PTR(-EINVAL); in create_dsq()
4424 return ERR_PTR(-ENOMEM); in create_dsq()
4428 ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, in create_dsq()
4459 raw_spin_lock_irqsave(&dsq->lock, flags); in destroy_dsq()
4461 if (dsq->nr) { in destroy_dsq()
4462 scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", in destroy_dsq()
4463 dsq->id, dsq->nr); in destroy_dsq()
4467 if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) in destroy_dsq()
4471 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from in destroy_dsq()
4476 dsq->id = SCX_DSQ_INVALID; in destroy_dsq()
4477 llist_add(&dsq->free_node, &dsqs_to_free); in destroy_dsq()
4481 raw_spin_unlock_irqrestore(&dsq->lock, flags); in destroy_dsq()
4503 if (!(tg->scx_flags & SCX_TG_INITED)) in scx_cgroup_exit()
4505 tg->scx_flags &= ~SCX_TG_INITED; in scx_cgroup_exit()
4514 SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); in scx_cgroup_exit()
4539 struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; in scx_cgroup_init()
4544 if ((tg->scx_flags & in scx_cgroup_init()
4549 tg->scx_flags |= SCX_TG_INITED; in scx_cgroup_init()
4558 css->cgroup, &args); in scx_cgroup_init()
4564 tg->scx_flags |= SCX_TG_INITED; in scx_cgroup_init()
4690 * scx_softlockup - sched_ext softlockup handler
4692 * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
4693 * live-lock the system by making many CPUs target the same DSQ to the point
4694 * where soft-lockup detection triggers. This function is called from
4695 * soft-lockup watchdog when the triggering point is close and tries to unjam
4712 printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", in scx_softlockup()
4722 scx_ops_error("soft lockup - CPU#%d stuck for %us", in scx_softlockup()
4733 * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
4744 * - ops.select_cpu() is ignored and the default select_cpu() is used.
4746 * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
4749 * - ops.dispatch() is ignored.
4751 * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
4755 * - pick_next_task() suppresses zero slice warning.
4757 * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
4760 * - scx_prio_less() reverts to the default core_sched_at order.
4775 scx_ops_bypass_depth--; in scx_ops_bypass()
4785 * queued tasks are re-queued according to the new scx_rq_bypassing() in scx_ops_bypass()
4799 WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); in scx_ops_bypass()
4800 rq->scx.flags |= SCX_RQ_BYPASSING; in scx_ops_bypass()
4802 WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING)); in scx_ops_bypass()
4803 rq->scx.flags &= ~SCX_RQ_BYPASSING; in scx_ops_bypass()
4823 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, in scx_ops_bypass()
4847 kfree(ei->dump); in free_exit_info()
4848 kfree(ei->msg); in free_exit_info()
4849 kfree(ei->bt); in free_exit_info()
4861 ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL); in alloc_exit_info()
4862 ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); in alloc_exit_info()
4863 ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); in alloc_exit_info()
4865 if (!ei->bt || !ei->msg || !ei->dump) { in alloc_exit_info()
4883 return "disabled by sysrq-S"; in scx_exit_reason()
4908 * disable was scheduled - don't kill the new ops. DONE in scx_ops_disable_workfn()
4916 ei->kind = kind; in scx_ops_disable_workfn()
4917 ei->reason = scx_exit_reason(ei->kind); in scx_ops_disable_workfn()
4928 scx_exit_info->msg); in scx_ops_disable_workfn()
4964 const struct sched_class *old_class = p->sched_class; in scx_ops_disable_workfn()
4966 __setscheduler_class(p->policy, p->prio); in scx_ops_disable_workfn()
4969 if (old_class != new_class && p->se.sched_delayed) in scx_ops_disable_workfn()
4974 p->sched_class = new_class; in scx_ops_disable_workfn()
4979 check_class_changed(task_rq(p), p, old_class, p->prio); in scx_ops_disable_workfn()
4985 /* no task is on scx, turn off all the switches and flush in-progress calls */ in scx_ops_disable_workfn()
4995 if (ei->kind >= SCX_EXIT_ERROR) { in scx_ops_disable_workfn()
4997 scx_ops.name, ei->reason); in scx_ops_disable_workfn()
4999 if (ei->msg[0] != '\0') in scx_ops_disable_workfn()
5000 pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg); in scx_ops_disable_workfn()
5002 stack_trace_print(ei->bt, ei->bt_len, 2); in scx_ops_disable_workfn()
5006 scx_ops.name, ei->reason); in scx_ops_disable_workfn()
5031 destroy_dsq(dsq->id); in scx_ops_disable_workfn()
5034 } while (dsq == ERR_PTR(-EAGAIN)); in scx_ops_disable_workfn()
5083 if (s->size) in dump_newline()
5104 if (s->size) { in dump_line()
5128 dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ in ops_dump_init()
5129 dd->first = true; in ops_dump_init()
5130 dd->cursor = 0; in ops_dump_init()
5131 dd->s = s; in ops_dump_init()
5132 dd->prefix = prefix; in ops_dump_init()
5138 char *line = dd->buf.line; in ops_dump_flush() local
5140 if (!dd->cursor) in ops_dump_flush()
5144 * There's something to flush and this is the first line. Insert a blank in ops_dump_flush()
5145 * line to distinguish ops dump. in ops_dump_flush()
5147 if (dd->first) { in ops_dump_flush()
5148 dump_newline(dd->s); in ops_dump_flush()
5149 dd->first = false; in ops_dump_flush()
5153 * There may be multiple lines in $line. Scan and emit each line in ops_dump_flush()
5157 char *end = line; in ops_dump_flush()
5164 * If $line overflowed, it may not have a newline at the end. in ops_dump_flush()
5169 dump_line(dd->s, "%s%s", dd->prefix, line); in ops_dump_flush()
5173 /* move to the next line */ in ops_dump_flush()
5177 line = end; in ops_dump_flush()
5180 dd->cursor = 0; in ops_dump_flush()
5186 scx_dump_data.cpu = -1; in ops_dump_exit()
5194 unsigned long ops_state = atomic_long_read(&p->scx.ops_state); in scx_dump_task()
5197 if (p->scx.dsq) in scx_dump_task()
5199 (unsigned long long)p->scx.dsq->id); in scx_dump_task()
5203 marker, task_state_to_char(p), p->comm, p->pid, in scx_dump_task()
5204 jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); in scx_dump_task()
5206 scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, in scx_dump_task()
5207 p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, in scx_dump_task()
5210 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, in scx_dump_task()
5211 p->scx.dsq_vtime); in scx_dump_task()
5212 dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); in scx_dump_task()
5234 .kind = ei->kind, in scx_dump_state()
5235 .exit_code = ei->exit_code, in scx_dump_state()
5236 .reason = ei->reason, in scx_dump_state()
5247 seq_buf_init(&s, ei->dump, dump_len); in scx_dump_state()
5249 if (ei->kind == SCX_EXIT_NONE) { in scx_dump_state()
5250 dump_line(&s, "Debug dump triggered by %s", ei->reason); in scx_dump_state()
5253 current->comm, current->pid, ei->kind); in scx_dump_state()
5254 dump_line(&s, " %s (%s)", ei->reason, ei->msg); in scx_dump_state()
5257 dump_stack_trace(&s, " ", ei->bt, ei->bt_len); in scx_dump_state()
5268 dump_line(&s, "----------"); in scx_dump_state()
5280 idle = list_empty(&rq->scx.runnable_list) && in scx_dump_state()
5281 rq->curr->sched_class == &idle_sched_class; in scx_dump_state()
5296 dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", in scx_dump_state()
5297 cpu, rq->scx.nr_running, rq->scx.flags, in scx_dump_state()
5298 rq->scx.cpu_released, rq->scx.ops_qseq, in scx_dump_state()
5299 rq->scx.pnt_seq); in scx_dump_state()
5301 rq->curr->comm, rq->curr->pid, in scx_dump_state()
5302 rq->curr->sched_class); in scx_dump_state()
5303 if (!cpumask_empty(rq->scx.cpus_to_kick)) in scx_dump_state()
5305 cpumask_pr_args(rq->scx.cpus_to_kick)); in scx_dump_state()
5306 if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) in scx_dump_state()
5308 cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); in scx_dump_state()
5309 if (!cpumask_empty(rq->scx.cpus_to_preempt)) in scx_dump_state()
5311 cpumask_pr_args(rq->scx.cpus_to_preempt)); in scx_dump_state()
5312 if (!cpumask_empty(rq->scx.cpus_to_wait)) in scx_dump_state()
5314 cpumask_pr_args(rq->scx.cpus_to_wait)); in scx_dump_state()
5340 if (rq->curr->sched_class == &ext_sched_class) in scx_dump_state()
5341 scx_dump_task(&s, &dctx, rq->curr, '*'); in scx_dump_state()
5343 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) in scx_dump_state()
5350 memcpy(ei->dump + dump_len - sizeof(trunc_marker), in scx_dump_state()
5360 if (ei->kind >= SCX_EXIT_ERROR) in scx_ops_error_irq_workfn()
5379 ei->exit_code = exit_code; in scx_ops_exit_kind()
5382 ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); in scx_ops_exit_kind()
5385 vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); in scx_ops_exit_kind()
5389 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again in scx_ops_exit_kind()
5392 ei->kind = kind; in scx_ops_exit_kind()
5393 ei->reason = scx_exit_reason(ei->kind); in scx_ops_exit_kind()
5404 sched_set_fifo(helper->task); in scx_create_rt_helper()
5417 if (ops->hotplug_seq) { in check_hotplug_seq()
5419 if (ops->hotplug_seq != global_hotplug_seq) { in check_hotplug_seq()
5422 ops->hotplug_seq, global_hotplug_seq); in check_hotplug_seq()
5433 if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { in validate_ops()
5435 return -EINVAL; in validate_ops()
5451 return -EINVAL; in scx_ops_enable()
5460 ret = -ENOMEM; in scx_ops_enable()
5470 ret = -ENOMEM; in scx_ops_enable()
5482 ret = -ENOMEM; in scx_ops_enable()
5494 ret = -EBUSY; in scx_ops_enable()
5500 ret = -ENOMEM; in scx_ops_enable()
5504 scx_root_kobj->kset = scx_kset; in scx_ops_enable()
5509 scx_exit_info = alloc_exit_info(ops->exit_dump_len); in scx_ops_enable()
5511 ret = -ENOMEM; in scx_ops_enable()
5530 cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; in scx_ops_enable()
5534 * online CPUs by watching ->on/offline_cpu() after ->init(). in scx_ops_enable()
5563 scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; in scx_ops_enable()
5568 ret = -ENOMEM; in scx_ops_enable()
5572 if (ops->timeout_ms) in scx_ops_enable()
5573 timeout = msecs_to_jiffies(ops->timeout_ms); in scx_ops_enable()
5594 if (ops->flags & SCX_OPS_ENQ_LAST) in scx_ops_enable()
5597 if (ops->flags & SCX_OPS_ENQ_EXITING) in scx_ops_enable()
5602 if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { in scx_ops_enable()
5654 ret, p->comm, p->pid); in scx_ops_enable()
5671 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); in scx_ops_enable()
5675 * We're fully committed and can't fail. The task READY -> ENABLED in scx_ops_enable()
5682 const struct sched_class *old_class = p->sched_class; in scx_ops_enable()
5684 __setscheduler_class(p->policy, p->prio); in scx_ops_enable()
5687 if (old_class != new_class && p->se.sched_delayed) in scx_ops_enable()
5692 p->scx.slice = SCX_SLICE_DFL; in scx_ops_enable()
5693 p->sched_class = new_class; in scx_ops_enable()
5698 check_class_changed(task_rq(p), p, old_class, p->prio); in scx_ops_enable()
5710 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) in scx_ops_enable()
5786 t = btf_type_by_id(reg->btf, reg->btf_id); in bpf_scx_btf_struct_access()
5788 if (off >= offsetof(struct task_struct, scx.slice) && in bpf_scx_btf_struct_access()
5789 off + size <= offsetofend(struct task_struct, scx.slice)) in bpf_scx_btf_struct_access()
5799 return -EACCES; in bpf_scx_btf_struct_access()
5833 return -E2BIG; in bpf_scx_init_member()
5834 ops->dispatch_max_batch = *(u32 *)(udata + moff); in bpf_scx_init_member()
5838 return -EINVAL; in bpf_scx_init_member()
5839 ops->flags = *(u64 *)(udata + moff); in bpf_scx_init_member()
5842 ret = bpf_obj_name_cpy(ops->name, uops->name, in bpf_scx_init_member()
5843 sizeof(ops->name)); in bpf_scx_init_member()
5847 return -EINVAL; in bpf_scx_init_member()
5852 return -E2BIG; in bpf_scx_init_member()
5853 ops->timeout_ms = *(u32 *)(udata + moff); in bpf_scx_init_member()
5856 ops->exit_dump_len = in bpf_scx_init_member()
5860 ops->hotplug_seq = *(u64 *)(udata + moff); in bpf_scx_init_member()
5886 if (prog->sleepable) in bpf_scx_check_member()
5887 return -EINVAL; in bpf_scx_check_member()
5914 * sched_ext does not support updating the actively-loaded BPF in bpf_scx_update()
5920 return -EOPNOTSUPP; in bpf_scx_update()
5928 … sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } in sched_ext_ops__select_cpu()
5944 …ched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } in sched_ext_ops__init_task()
5949 …ed_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } in sched_ext_ops__cgroup_init()
5951 …cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } in sched_ext_ops__cgroup_prep_move()
5958 static s32 sched_ext_ops__init(void) { return -EINVAL; } in sched_ext_ops__init()
6031 .help_msg = "reset-sched-ext(S)",
6038 struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; in sysrq_handle_sched_ext_dump()
6046 .help_msg = "dump-sched-ext(D)",
6065 return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); in can_skip_idle_kick()
6071 struct scx_rq *this_scx = &this_rq->scx; in kick_one_cpu()
6082 if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { in kick_one_cpu()
6083 if (rq->curr->sched_class == &ext_sched_class) in kick_one_cpu()
6084 rq->curr->scx.slice = 0; in kick_one_cpu()
6085 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); in kick_one_cpu()
6088 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { in kick_one_cpu()
6089 pseqs[cpu] = rq->scx.pnt_seq; in kick_one_cpu()
6095 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); in kick_one_cpu()
6096 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); in kick_one_cpu()
6121 struct scx_rq *this_scx = &this_rq->scx; in kick_cpus_irq_workfn()
6126 for_each_cpu(cpu, this_scx->cpus_to_kick) { in kick_cpus_irq_workfn()
6128 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); in kick_cpus_irq_workfn()
6129 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); in kick_cpus_irq_workfn()
6132 for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { in kick_cpus_irq_workfn()
6134 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); in kick_cpus_irq_workfn()
6140 for_each_cpu(cpu, this_scx->cpus_to_wait) { in kick_cpus_irq_workfn()
6141 unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; in kick_cpus_irq_workfn()
6148 * We busy-wait here to guarantee that no other task can in kick_cpus_irq_workfn()
6156 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); in kick_cpus_irq_workfn()
6161 * print_scx_info - print out sched_ext scheduler state
6187 if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || in print_scx_info()
6194 if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, in print_scx_info()
6199 /* print everything onto one line to conserve console space */ in print_scx_info()
6258 init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); in init_sched_ext_class()
6259 INIT_LIST_HEAD(&rq->scx.runnable_list); in init_sched_ext_class()
6260 INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); in init_sched_ext_class()
6262 BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); in init_sched_ext_class()
6263 BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); in init_sched_ext_class()
6264 BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); in init_sched_ext_class()
6265 BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); in init_sched_ext_class()
6266 init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); in init_sched_ext_class()
6267 init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); in init_sched_ext_class()
6270 cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; in init_sched_ext_class()
6287 * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
6293 * Can only be called from ops.select_cpu() if the built-in CPU selection is
6294 * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
6304 scx_ops_error("built-in idle tracking is disabled"); in scx_bpf_select_cpu_dfl()
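A typical ops.select_cpu() built on this helper looks roughly like the sketch below: a minimal sketch assuming the BPF_STRUCT_OPS macro and kfunc declarations from tools/sched_ext/include/scx/common.bpf.h, with the "example_" name being hypothetical. When an idle CPU is found, dispatching directly to SCX_DSQ_LOCAL lets the wakeup skip ops.enqueue() entirely.

#include <scx/common.bpf.h>

s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	/* only usable while built-in idle tracking is enabled */
	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle)
		/* an idle CPU was claimed; dispatch directly and skip ops.enqueue() */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return cpu;
}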
6363 if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { in scx_dsq_insert_commit()
6368 dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ in scx_dsq_insert_commit()
6370 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, in scx_dsq_insert_commit()
6379 * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ
6382 * @slice: duration @p can run for in nsecs, 0 to keep the current value
6409 * @p is allowed to run for @slice. The scheduling path is triggered on slice
6410 * exhaustion. If zero, the current residual slice is maintained. If
6414 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, in scx_bpf_dsq_insert() argument
6420 if (slice) in scx_bpf_dsq_insert()
6421 p->scx.slice = slice; in scx_bpf_dsq_insert()
6423 p->scx.slice = p->scx.slice ?: 1; in scx_bpf_dsq_insert()
6429 __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, in scx_bpf_dispatch() argument
6433 scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags); in scx_bpf_dispatch()
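For plain FIFO scheduling, ops.enqueue() usually just inserts into a shared user DSQ. A minimal sketch, again assuming the tools/sched_ext helpers, with SHARED_DSQ a hypothetical DSQ id created in ops.init():

#include <scx/common.bpf.h>

#define SHARED_DSQ 0	/* hypothetical user DSQ id, created in ops.init() */

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * FIFO-queue @p with the default slice. Passing 0 for @slice would
	 * keep whatever residual slice @p already has.
	 */
	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}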
6437 * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
6440 * @slice: duration @p can run for in nsecs, 0 to keep the current value
6441 * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
6450 * vice-versa.
6454 * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and
6458 u64 slice, u64 vtime, u64 enq_flags) in scx_bpf_dsq_insert_vtime() argument
6463 if (slice) in scx_bpf_dsq_insert_vtime()
6464 p->scx.slice = slice; in scx_bpf_dsq_insert_vtime()
6466 p->scx.slice = p->scx.slice ?: 1; in scx_bpf_dsq_insert_vtime()
6468 p->scx.dsq_vtime = vtime; in scx_bpf_dsq_insert_vtime()
6475 u64 slice, u64 vtime, u64 enq_flags) in scx_bpf_dispatch_vtime() argument
6478 scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags); in scx_bpf_dispatch_vtime()
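Weighted fair scheduling uses the vtime variant instead. The sketch below is modeled on the scx_simple example; vtime_now is a hypothetical global clock assumed to be advanced from ops.running(), and SHARED_DSQ is again hypothetical.

#include <scx/common.bpf.h>

#define SHARED_DSQ 0		/* hypothetical user DSQ id */

static u64 vtime_now;		/* hypothetical global vtime clock */

static bool vtime_before(u64 a, u64 b)
{
	return (s64)(a - b) < 0;
}

void BPF_STRUCT_OPS(example_enqueue_vtime, struct task_struct *p, u64 enq_flags)
{
	u64 vtime = p->scx.dsq_vtime;

	/* cap how much budget a long-sleeping task may accumulate */
	if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
		vtime = vtime_now - SCX_SLICE_DFL;

	scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime,
				 enq_flags);
}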
6498 struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; in scx_dsq_move()
6516 in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; in scx_dsq_move()
6529 * cause similar live-lock conditions as consume_dispatch_q(). Insert a in scx_dsq_move()
6535 raw_spin_lock(&src_dsq->lock); in scx_dsq_move()
6539	 * re-enqueued, or be in the process of being consumed by someone else. in scx_dsq_move()
6541 if (unlikely(p->scx.dsq != src_dsq || in scx_dsq_move()
6542 u32_before(kit->cursor.priv, p->scx.dsq_seq) || in scx_dsq_move()
6543 p->scx.holding_cpu >= 0) || in scx_dsq_move()
6545 raw_spin_unlock(&src_dsq->lock); in scx_dsq_move()
6553 * Apply vtime and slice updates before moving so that the new time is in scx_dsq_move()
6557 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) in scx_dsq_move()
6558 p->scx.dsq_vtime = kit->vtime; in scx_dsq_move()
6559 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) in scx_dsq_move()
6560 p->scx.slice = kit->slice; in scx_dsq_move()
6575 kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | in scx_dsq_move()
6583 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
6592 return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); in scx_bpf_dispatch_nr_slots()
6596 * scx_bpf_dispatch_cancel - Cancel the latest dispatch
6608 if (dspc->cursor > 0) in scx_bpf_dispatch_cancel()
6609 dspc->cursor--; in scx_bpf_dispatch_cancel()
6615 * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
6618 * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
6621 * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
6636 flush_dispatch_buf(dspc->rq); in scx_bpf_dsq_move_to_local()
6644 if (consume_dispatch_q(dspc->rq, dsq)) { in scx_bpf_dsq_move_to_local()
6651 dspc->nr_tasks++; in scx_bpf_dsq_move_to_local()
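The counterpart on the dispatch side is usually a one-liner. A sketch assuming the same hypothetical SHARED_DSQ and the tools/sched_ext helpers:

#include <scx/common.bpf.h>

#define SHARED_DSQ 0	/* hypothetical user DSQ id */

void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
{
	/*
	 * Pull the first task off the shared DSQ onto this CPU's local DSQ.
	 * Returns false if the DSQ was empty.
	 */
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}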
6666 * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
6668 * @slice: duration the moved task can run for in nsecs
6670 * Override the slice of the next task that will be moved from @it__iter using
6672 * slice duration is kept.
6675 u64 slice) in scx_bpf_dsq_move_set_slice() argument
6679 kit->slice = slice; in scx_bpf_dsq_move_set_slice()
6680 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; in scx_bpf_dsq_move_set_slice()
6685 struct bpf_iter_scx_dsq *it__iter, u64 slice) in scx_bpf_dispatch_from_dsq_set_slice() argument
6688 scx_bpf_dsq_move_set_slice(it__iter, slice); in scx_bpf_dispatch_from_dsq_set_slice()
6692 * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
6694 * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
6697  * scx_bpf_dsq_move_vtime(). If this function is not called, the previous vtime
6706 kit->vtime = vtime; in scx_bpf_dsq_move_set_vtime()
6707 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; in scx_bpf_dsq_move_set_vtime()
6719 * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
6726 * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can
6734 * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update.
6760 * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
6770 * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice()
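These move kfuncs are intended to be used from inside a DSQ iteration. The sketch below follows the spirit of the scx_qmap example and assumes bpf_for_each() and BPF_FOR_EACH_ITER from the tools/sched_ext headers plus a hypothetical SHARED_DSQ:

#include <scx/common.bpf.h>

#define SHARED_DSQ 0	/* hypothetical user DSQ id */

/* migrate at most one queued task to @cpu's local DSQ with a fresh slice */
static bool move_one_to_cpu(s32 cpu, u64 slice_ns)
{
	struct task_struct *p;

	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		/* the override only applies to the next scx_bpf_dsq_move() */
		scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns);
		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
				     SCX_DSQ_LOCAL_ON | cpu, 0))
			return true;
	}

	return false;
}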
6818 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
6821 * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
6839 * @rq->scx.local_dsq. Move all candidate tasks off to a private list in scx_bpf_reenqueue_local()
6842 list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, in scx_bpf_reenqueue_local()
6847 * deactivate and re-activate @p anyway. Skip re-enqueueing. in scx_bpf_reenqueue_local()
6850 * re-enqueue a migrating task while its current CPU and allowed in scx_bpf_reenqueue_local()
6855 if (p->migration_pending) in scx_bpf_reenqueue_local()
6859 list_add_tail(&p->scx.dsq_list.node, &tasks); in scx_bpf_reenqueue_local()
6863 list_del_init(&p->scx.dsq_list.node); in scx_bpf_reenqueue_local()
6864 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); in scx_bpf_reenqueue_local()
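This is normally called from ops.cpu_release() when a higher priority sched class takes over the CPU. A minimal sketch, assuming the tools/sched_ext helpers and a hypothetical callback name:

#include <scx/common.bpf.h>

void BPF_STRUCT_OPS(example_cpu_release, s32 cpu,
		    struct scx_cpu_release_args *args)
{
	/*
	 * Tasks already sitting on this CPU's local DSQ could stall while
	 * the CPU services a higher priority class. Push them back through
	 * ops.enqueue() (with SCX_ENQ_REENQ set) so they can go elsewhere.
	 */
	scx_bpf_reenqueue_local();
}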
6885 * scx_bpf_create_dsq - Create a custom DSQ
6896 return -EINVAL; in scx_bpf_create_dsq()
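User DSQs are usually created from the sleepable ops.init() callback. A sketch with a hypothetical SHARED_DSQ id:

#include <scx/common.bpf.h>

#define SHARED_DSQ 0	/* hypothetical user DSQ id */

s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
{
	/* -1: no NUMA node preference for the DSQ's internal allocations */
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}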
6922 * scx_bpf_kick_cpu - Trigger reschedule on a CPU
6969 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); in scx_bpf_kick_cpu()
6971 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); in scx_bpf_kick_cpu()
6974 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); in scx_bpf_kick_cpu()
6976 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); in scx_bpf_kick_cpu()
6979 irq_work_queue(&this_rq->scx.kick_cpus_irq_work); in scx_bpf_kick_cpu()
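A common use is forcing a remote CPU back into the scheduling path after queueing higher priority work. A sketch with a hypothetical helper, assuming the tools/sched_ext kfunc declarations:

#include <scx/common.bpf.h>

/* zero out @target_cpu's current slice and reschedule it immediately */
static void preempt_cpu(s32 target_cpu)
{
	/* SCX_KICK_IDLE would instead only wake the CPU if it is idle */
	scx_bpf_kick_cpu(target_cpu, SCX_KICK_PREEMPT);
}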
6985 * scx_bpf_dsq_nr_queued - Return the number of queued tasks
6989 * -%ENOENT is returned.
6999 ret = READ_ONCE(this_rq()->scx.local_dsq.nr); in scx_bpf_dsq_nr_queued()
7005 ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); in scx_bpf_dsq_nr_queued()
7011 ret = READ_ONCE(dsq->nr); in scx_bpf_dsq_nr_queued()
7015 ret = -ENOENT; in scx_bpf_dsq_nr_queued()
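A queue-depth check is often paired with a kick, e.g. to wake an idle CPU only when there is pending work. A sketch with hypothetical SHARED_DSQ and helper names:

#include <scx/common.bpf.h>

#define SHARED_DSQ 0	/* hypothetical user DSQ id */

static void kick_if_backlogged(s32 cpu)
{
	/* wake @cpu only if the shared DSQ has tasks waiting */
	if (scx_bpf_dsq_nr_queued(SHARED_DSQ) > 0)
		scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}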
7022 * scx_bpf_destroy_dsq - Destroy a custom DSQ
7036 * bpf_iter_scx_dsq_new - Create a DSQ iterator
7056 return -EINVAL; in bpf_iter_scx_dsq_new()
7058 kit->dsq = find_user_dsq(dsq_id); in bpf_iter_scx_dsq_new()
7059 if (!kit->dsq) in bpf_iter_scx_dsq_new()
7060 return -ENOENT; in bpf_iter_scx_dsq_new()
7062 INIT_LIST_HEAD(&kit->cursor.node); in bpf_iter_scx_dsq_new()
7063 kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; in bpf_iter_scx_dsq_new()
7064 kit->cursor.priv = READ_ONCE(kit->dsq->seq); in bpf_iter_scx_dsq_new()
7070 * bpf_iter_scx_dsq_next - Progress a DSQ iterator
7078 bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV; in bpf_iter_scx_dsq_next()
7082 if (!kit->dsq) in bpf_iter_scx_dsq_next()
7085 raw_spin_lock_irqsave(&kit->dsq->lock, flags); in bpf_iter_scx_dsq_next()
7087 if (list_empty(&kit->cursor.node)) in bpf_iter_scx_dsq_next()
7090 p = container_of(&kit->cursor, struct task_struct, scx.dsq_list); in bpf_iter_scx_dsq_next()
7098 p = nldsq_next_task(kit->dsq, p, rev); in bpf_iter_scx_dsq_next()
7099 } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq))); in bpf_iter_scx_dsq_next()
7103 list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node); in bpf_iter_scx_dsq_next()
7105 list_move(&kit->cursor.node, &p->scx.dsq_list.node); in bpf_iter_scx_dsq_next()
7107 list_del_init(&kit->cursor.node); in bpf_iter_scx_dsq_next()
7110 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); in bpf_iter_scx_dsq_next()
7116 * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
7125 if (!kit->dsq) in bpf_iter_scx_dsq_destroy()
7128 if (!list_empty(&kit->cursor.node)) { in bpf_iter_scx_dsq_destroy()
7131 raw_spin_lock_irqsave(&kit->dsq->lock, flags); in bpf_iter_scx_dsq_destroy()
7132 list_del_init(&kit->cursor.node); in bpf_iter_scx_dsq_destroy()
7133 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); in bpf_iter_scx_dsq_destroy()
7135 kit->dsq = NULL; in bpf_iter_scx_dsq_destroy()
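From BPF these three kfuncs are normally driven through the bpf_for_each() wrapper rather than called directly. A read-only walk might look like the sketch below; SHARED_DSQ and the helper name are hypothetical:

#include <scx/common.bpf.h>

#define SHARED_DSQ 0	/* hypothetical user DSQ id */

/* count queued tasks whose remaining slice is below @thresh_ns */
static u32 count_short_slices(u64 thresh_ns)
{
	struct task_struct *p;
	u32 nr = 0;

	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		if (p->scx.slice < thresh_ns)
			nr++;
	}

	return nr;
}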
7150 return -EINVAL; in __bstr_format()
7181 return __bstr_format(buf->data, buf->line, sizeof(buf->line), in bstr_format()
7188 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
7205 scx_exit_bstr_buf.line); in scx_bpf_exit_bstr()
7210 * scx_bpf_error_bstr - Indicate fatal error
7226 scx_exit_bstr_buf.line); in scx_bpf_error_bstr()
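BPF schedulers normally reach these through the scx_bpf_error() and scx_bpf_exit() convenience macros in tools/sched_ext/include/scx/common.bpf.h, which take care of packing the format arguments. A sketch with a hypothetical helper:

#include <scx/common.bpf.h>

static void validate_cpu(s32 cpu)
{
	/* aborting the scheduler hands all tasks back to the fair class */
	if (cpu < 0 || cpu >= (s32)scx_bpf_nr_cpu_ids())
		scx_bpf_error("invalid CPU %d", cpu);
}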
7231 * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler
7239 * The extra dump may be multiple lines. A single line may be split over
7240 * multiple calls. The last line is automatically terminated.
7246 struct scx_bstr_buf *buf = &dd->buf; in scx_bpf_dump_bstr()
7249 if (raw_smp_processor_id() != dd->cpu) { in scx_bpf_dump_bstr()
7254 /* append the formatted string to the line buf */ in scx_bpf_dump_bstr()
7255 ret = __bstr_format(buf->data, buf->line + dd->cursor, in scx_bpf_dump_bstr()
7256 sizeof(buf->line) - dd->cursor, fmt, data, data__sz); in scx_bpf_dump_bstr()
7258 dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", in scx_bpf_dump_bstr()
7259 dd->prefix, fmt, data, data__sz, ret); in scx_bpf_dump_bstr()
7263 dd->cursor += ret; in scx_bpf_dump_bstr()
7264 dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); in scx_bpf_dump_bstr()
7266 if (!dd->cursor) in scx_bpf_dump_bstr()
7270 * If the line buf overflowed or ends in a newline, flush it into the in scx_bpf_dump_bstr()
7271 * dump. This is to allow the caller to generate a single line over in scx_bpf_dump_bstr()
7273 * the line buf, the only case which can lead to an unexpected in scx_bpf_dump_bstr()
7277 if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') in scx_bpf_dump_bstr()
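The dump kfunc is meant to be called from the ops.dump*() callbacks, typically via the scx_bpf_dump() macro from the tools/sched_ext headers. A per-task sketch with a hypothetical callback name:

#include <scx/common.bpf.h>

void BPF_STRUCT_OPS(example_dump_task, struct scx_dump_ctx *dctx,
		    struct task_struct *p)
{
	/* one extra line per task in the debug dump */
	scx_bpf_dump("dsq_vtime=%llu slice=%llu",
		     p->scx.dsq_vtime, p->scx.slice);
}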
7282 * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
7298 * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
7320 * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
7344 rq->scx.cpuperf_target = perf; in scx_bpf_cpuperf_set()
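A scheduler that drives CPU frequency itself would call this from an rq-locked context such as ops.running() or ops.tick(). A sketch, assuming SCX_CPUPERF_ONE (the 1024-based scale) is visible to the program and the helper name is hypothetical:

#include <scx/common.bpf.h>

/* request roughly half of @cpu's maximum performance from cpufreq */
static void throttle_cpu(s32 cpu)
{
	scx_bpf_cpuperf_set(cpu, SCX_CPUPERF_ONE / 2);
}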
7353 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
7363 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
7371 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
7379 * scx_bpf_put_cpumask - Release a possible/online cpumask
7386 * a reference to a global cpumask, which is read-only in the caller and in scx_bpf_put_cpumask()
7393 * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
7394 * per-CPU cpumask.
7401 scx_ops_error("built-in idle tracking is disabled"); in scx_bpf_get_idle_cpumask()
7413 * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
7414 * per-physical-core cpumask. Can be used to determine if an entire physical
7422 scx_ops_error("built-in idle tracking is disabled"); in scx_bpf_get_idle_smtmask()
7437 * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
7438 * either the percpu, or SMT idle-tracking cpumask.
7444 * a reference to a global idle cpumask, which is read-only in the in scx_bpf_put_idle_cpumask()
7451 * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
7463 scx_ops_error("built-in idle tracking is disabled"); in scx_bpf_test_and_clear_cpu_idle()
7474 * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
7479  * number on success. -%EBUSY if no matching CPU was found.
7482 * example, this function may return -%EBUSY as CPUs are transitioning into the
7496 scx_ops_error("built-in idle tracking is disabled"); in scx_bpf_pick_idle_cpu()
7497 return -EBUSY; in scx_bpf_pick_idle_cpu()
7504 * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
7510 * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
7532 return -EBUSY; in scx_bpf_pick_any_cpu()
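When not using scx_bpf_select_cpu_dfl(), ops.select_cpu() can combine these idle-tracking kfuncs directly. A sketch with hypothetical naming, assuming the tools/sched_ext helpers:

#include <scx/common.bpf.h>

s32 BPF_STRUCT_OPS(example_select_cpu_idle, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	s32 cpu;

	/* sticking to the previous CPU keeps the cache warm */
	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		return prev_cpu;
	}

	/* prefer a fully idle core, then any idle CPU in @p's affinity */
	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
	if (cpu < 0)
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0) {
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		return cpu;
	}

	/* nothing idle; pick any allowed CPU and let dispatch sort it out */
	cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
	return cpu >= 0 ? cpu : prev_cpu;
}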
7536 * scx_bpf_task_running - Is task currently running?
7541 return task_rq(p)->curr == p; in scx_bpf_task_running()
7545 * scx_bpf_task_cpu - CPU a task is currently associated with
7554 * scx_bpf_cpu_rq - Fetch the rq of a CPU
7566 * scx_bpf_task_cgroup - Return the sched cgroup of a task
7569 * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
7571 * determine @p's current cgroup as, unlike following @p->cgroups,
7572 * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
7573 * rq-locked operations. Can be called on the parameter tasks of rq-locked
7579 struct task_group *tg = p->sched_task_group; in scx_bpf_task_cgroup()
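The returned cgroup is reference counted and must be dropped with bpf_cgroup_release(). For example, a cgroup-aware enqueue path (in the spirit of scx_flatcg) might resolve the cgroup ID as in this sketch; the helper name is hypothetical and the caller is assumed to hold an rq-locked task:

#include <scx/common.bpf.h>

/* return the ID of the cgroup @p is scheduled under */
static u64 task_cgroup_id(struct task_struct *p)
{
	struct cgroup *cgrp;
	u64 cgid;

	cgrp = scx_bpf_task_cgroup(p);
	cgid = cgrp->kn->id;
	bpf_cgroup_release(cgrp);

	return cgid;
}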
7639 * Some kfuncs are context-sensitive and can only be called from in scx_init()
7683 return -ENOMEM; in scx_init()
7686 ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); in scx_init()