xref: /linux/include/linux/sched/ext.h (revision 59a62ea4583e0f740bb3576ec210b23f39754327)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
4  *
5  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
6  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
7  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
8  */
9 #ifndef _LINUX_SCHED_EXT_H
10 #define _LINUX_SCHED_EXT_H
11 
12 #ifdef CONFIG_SCHED_CLASS_EXT
13 
14 #include <linux/llist.h>
15 #include <linux/rhashtable-types.h>
16 
17 enum scx_public_consts {
18 	SCX_OPS_NAME_LEN	= 128,
19 
20 	/*
21 	 * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses
22 	 * to set the slice for a task that is selected for execution.
23 	 * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice
24 	 * refill has been triggered.
25 	 *
26 	 * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass
27 	 * mode. As making forward progress for all tasks is the main goal of
28 	 * the bypass mode, a shorter slice is used.
29 	 */
30 	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
31 	SCX_SLICE_BYPASS	=  5 * 1000000, /*  5ms */
32 	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
33 };
34 
/*
 * A DSQ (dispatch queue) ID is a 64-bit value laid out as follows:
 *
 *   Bits: [63] [62 ..  0]
 *         [ B] [   ID   ]
 *
 *    B: set for built-in DSQs, clear for user DSQs created by ops
 *   ID: 63-bit ID
 *
 * The IDs of built-in DSQs are split up further:
 *
 *   Bits: [63] [62] [61..32] [31 ..  0]
 *         [ 1] [ L] [   R  ] [    V   ]
 *
 *    1: always set as the DSQ is built-in
 *    L: set for LOCAL_ON DSQ IDs, clear for all others
 *    R: NOTE(review): not described here; presumably reserved - confirm
 *    V: a CPU number for LOCAL_ON DSQ IDs, a pre-defined value otherwise
 */
enum scx_dsq_id_flags {
	/* bits 63 (B) and 62 (L) of the layout above */
	SCX_DSQ_FLAG_BUILTIN	= 1ULL << 63,
	SCX_DSQ_FLAG_LOCAL_ON	= 1ULL << 62,

	/* built-in DSQ IDs with a pre-defined V */
	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
	SCX_DSQ_BYPASS		= SCX_DSQ_FLAG_BUILTIN | 3,

	/* both B and L set; V carries the CPU number */
	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,

	/* covers bits [31..0], i.e. the V field */
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffULL,
};
64 
65 struct scx_deferred_reenq_user {
66 	struct list_head	node;
67 	u64			flags;
68 };
69 
70 struct scx_dsq_pcpu {
71 	struct scx_dispatch_q	*dsq;
72 	struct scx_deferred_reenq_user deferred_reenq_user;
73 };
74 
75 /*
76  * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
77  * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
78  * buffer between the scheduler core and the BPF scheduler. See the
79  * documentation for more details.
80  */
81 struct scx_dispatch_q {
82 	raw_spinlock_t		lock;
83 	struct task_struct __rcu *first_task; /* lockless peek at head */
84 	struct list_head	list;	/* tasks in dispatch order */
85 	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
86 	u32			nr;
87 	u32			seq;	/* used by BPF iter */
88 	u64			id;
89 	struct rhash_head	hash_node;
90 	struct llist_node	free_node;
91 	struct scx_sched	*sched;
92 	struct scx_dsq_pcpu __percpu *pcpu;
93 	struct rcu_head		rcu;
94 };
95 
/*
 * sched_ext_entity.flags
 *
 * Single-bit flags live in the low bits, the task state is a 3-bit field at
 * bits 8-10, the reenqueue reason is a 2-bit field at bits 12-13 and bit 31
 * marks iteration cursors.
 */
enum scx_ent_flags {
	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
	SCX_TASK_IN_CUSTODY	= 1 << 1, /* in custody, needs ops.dequeue() when leaving */
	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
	SCX_TASK_SUB_INIT	= 1 << 4, /* task being initialized for a sub sched */
	SCX_TASK_IMMED		= 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */

	/*
	 * Bits 8 to 10 are used to carry task state:
	 *
	 * NONE		ops.init_task() not called yet
	 * INIT_BEGIN	ops.init_task() in flight; see sched_ext_dead()
	 * INIT		ops.init_task() succeeded, but task can be cancelled
	 * READY	fully initialized, but not in sched_ext
	 * ENABLED	fully initialized and in sched_ext
	 * DEAD		terminal state set by sched_ext_dead()
	 */
	SCX_TASK_STATE_SHIFT	= 8,
	SCX_TASK_STATE_BITS	= 3,
	SCX_TASK_STATE_MASK	= ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,

	SCX_TASK_NONE		= 0 << SCX_TASK_STATE_SHIFT,
	SCX_TASK_INIT_BEGIN	= 1 << SCX_TASK_STATE_SHIFT,
	SCX_TASK_INIT		= 2 << SCX_TASK_STATE_SHIFT,
	SCX_TASK_READY		= 3 << SCX_TASK_STATE_SHIFT,
	SCX_TASK_ENABLED	= 4 << SCX_TASK_STATE_SHIFT,
	SCX_TASK_DEAD		= 5 << SCX_TASK_STATE_SHIFT,

	/*
	 * Bits 12 and 13 are used to carry the reenqueue reason. In addition
	 * to the %SCX_ENQ_REENQ flag, ops.enqueue() can also compare the
	 * reason field (%SCX_TASK_REENQ_REASON_MASK) against
	 * %SCX_TASK_REENQ_NONE and the other values below to distinguish
	 * reenqueues.
	 *
	 * NONE		not being reenqueued
	 * KFUNC	reenqueued by scx_bpf_dsq_reenq() and friends
	 * IMMED	reenqueued due to failed ENQ_IMMED
	 * PREEMPTED	preempted while running
	 */
	SCX_TASK_REENQ_REASON_SHIFT = 12,
	SCX_TASK_REENQ_REASON_BITS = 2,
	SCX_TASK_REENQ_REASON_MASK = ((1 << SCX_TASK_REENQ_REASON_BITS) - 1) << SCX_TASK_REENQ_REASON_SHIFT,

	SCX_TASK_REENQ_NONE	= 0 << SCX_TASK_REENQ_REASON_SHIFT,
	SCX_TASK_REENQ_KFUNC	= 1 << SCX_TASK_REENQ_REASON_SHIFT,
	SCX_TASK_REENQ_IMMED	= 2 << SCX_TASK_REENQ_REASON_SHIFT,
	SCX_TASK_REENQ_PREEMPTED = 3 << SCX_TASK_REENQ_REASON_SHIFT,

	/*
	 * Iteration cursor, not a task.
	 *
	 * NOTE(review): 1 << 31 shifts into the sign bit of int. It is left
	 * as is because switching to 1U << 31 may change the type of the
	 * enumerators - confirm before touching.
	 */
	SCX_TASK_CURSOR		= 1 << 31,
};
148 
/* sched_ext_entity.dsq_flags */
enum scx_ent_dsq_flags {
	/* task is queued on the priority queue of a dsq */
	SCX_TASK_DSQ_ON_PRIQ	= 1 << 0,
};
153 
/* scx_dsq_list_node.flags */
enum scx_dsq_lnode_flags {
	/* ORed into ->flags by INIT_DSQ_LIST_CURSOR() */
	SCX_DSQ_LNODE_ITER_CURSOR	= 1 << 0,

	/* the high 16 bits can be used for iter cursor flags */
	__SCX_DSQ_LNODE_PRIV_SHIFT	= 16,
};
160 
161 struct scx_dsq_list_node {
162 	struct list_head	node;
163 	u32			flags;
164 	u32			priv;		/* can be used by iter cursor */
165 };
166 
/*
 * Expand to a struct scx_dsq_list_node compound literal:
 *
 *   ->node	set up with LIST_HEAD_INIT() on @__cursor's own ->node
 *   ->flags	%SCX_DSQ_LNODE_ITER_CURSOR ORed with @__flags
 *   ->priv	READ_ONCE() snapshot of @__dsq->seq
 */
#define INIT_DSQ_LIST_CURSOR(__cursor, __dsq, __flags)				\
	(struct scx_dsq_list_node) {						\
		.node	= LIST_HEAD_INIT((__cursor).node),			\
		.flags	= SCX_DSQ_LNODE_ITER_CURSOR | (__flags),		\
		.priv	= READ_ONCE((__dsq)->seq),				\
	}
173 
174 struct scx_sched;
175 
176 /*
177  * The following is embedded in task_struct and contains all fields necessary
178  * for a task to be scheduled by SCX.
179  */
180 struct sched_ext_entity {
181 #ifdef CONFIG_CGROUPS
182 	/*
183 	 * Associated scx_sched. Updated either during fork or while holding
184 	 * both p->pi_lock and rq lock.
185 	 */
186 	struct scx_sched __rcu	*sched;
187 #endif
188 	struct scx_dispatch_q	*dsq;
189 	atomic_long_t		ops_state;
190 	u64			ddsp_dsq_id;
191 	u64			ddsp_enq_flags;
192 	struct scx_dsq_list_node dsq_list;	/* dispatch order */
193 	struct rb_node		dsq_priq;	/* p->scx.dsq_vtime order */
194 	u32			dsq_seq;
195 	u32			dsq_flags;	/* protected by DSQ lock */
196 	u32			flags;		/* protected by rq lock */
197 	u32			weight;
198 	s32			sticky_cpu;
199 	s32			holding_cpu;
200 	s32			selected_cpu;
201 	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
202 
203 	struct list_head	runnable_node;	/* rq->scx.runnable_list */
204 	unsigned long		runnable_at;
205 
206 #ifdef CONFIG_SCHED_CORE
207 	u64			core_sched_at;	/* see scx_prio_less() */
208 #endif
209 
210 	/* BPF scheduler modifiable fields */
211 
212 	/*
213 	 * Runtime budget in nsecs. This is usually set through
214 	 * scx_bpf_dsq_insert() but can also be modified directly by the BPF
215 	 * scheduler. Automatically decreased by SCX as the task executes. On
216 	 * depletion, a scheduling event is triggered.
217 	 *
218 	 * This value is cleared to zero if the task is preempted by
219 	 * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
220 	 * task ran. Use p->se.sum_exec_runtime instead.
221 	 */
222 	u64			slice;
223 
224 	/*
225 	 * Used to order tasks when dispatching to the vtime-ordered priority
226 	 * queue of a dsq. This is usually set through
227 	 * scx_bpf_dsq_insert_vtime() but can also be modified directly by the
228 	 * BPF scheduler. Modifying it while a task is queued on a dsq may
229 	 * mangle the ordering and is not recommended.
230 	 */
231 	u64			dsq_vtime;
232 
233 	/*
234 	 * If set, reject future sched_setscheduler(2) calls updating the policy
235 	 * to %SCHED_EXT with -%EACCES.
236 	 *
237 	 * Can be set from ops.init_task() while the BPF scheduler is being
238 	 * loaded (!scx_init_task_args->fork). If set and the task's policy is
239 	 * already %SCHED_EXT, the task's policy is rejected and forcefully
240 	 * reverted to %SCHED_NORMAL. The number of such events are reported
241 	 * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
242 	 * during fork is not allowed.
243 	 */
244 	bool			disallow;	/* reject switching into SCX */
245 
246 	/* cold fields */
247 #ifdef CONFIG_EXT_GROUP_SCHED
248 	struct cgroup		*cgrp_moving_from;
249 #endif
250 	struct list_head	tasks_node;
251 };
252 
253 void sched_ext_dead(struct task_struct *p);
254 void print_scx_info(const char *log_lvl, struct task_struct *p);
255 void scx_softlockup(u32 dur_s);
256 bool scx_hardlockup(int cpu);
257 bool scx_rcu_cpu_stall(void);
258 
259 #else	/* !CONFIG_SCHED_CLASS_EXT */
260 
/* !CONFIG_SCHED_CLASS_EXT stub: does nothing. */
static inline void sched_ext_dead(struct task_struct *p) {}
/* !CONFIG_SCHED_CLASS_EXT stub: prints nothing. */
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
/* !CONFIG_SCHED_CLASS_EXT stub: does nothing. */
static inline void scx_softlockup(u32 dur_s) {}
/* !CONFIG_SCHED_CLASS_EXT stub: always returns false, whatever @cpu is. */
static inline bool scx_hardlockup(int cpu) { return false; }
/* !CONFIG_SCHED_CLASS_EXT stub: always returns false. */
static inline bool scx_rcu_cpu_stall(void) { return false; }
266 
267 #endif	/* CONFIG_SCHED_CLASS_EXT */
268 
/*
 * Defined whether or not CONFIG_SCHED_CLASS_EXT is set. All members are
 * compiled out, leaving an empty struct, unless CONFIG_EXT_GROUP_SCHED is
 * enabled.
 */
struct scx_task_group {
#ifdef CONFIG_EXT_GROUP_SCHED
	/* SCX_TG_* */
	u32			flags;
	u32			weight;
	/*
	 * NOTE(review): going by the names, bandwidth period, quota and burst
	 * in usecs - confirm against the users.
	 */
	u64			bw_period_us;
	u64			bw_quota_us;
	u64			bw_burst_us;
	bool			idle;
#endif
};
279 
280 #endif	/* _LINUX_SCHED_EXT_H */
281