/* SPDX-License-Identifier: GPL-2.0 */
/*
 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#ifndef _LINUX_SCHED_EXT_H
#define _LINUX_SCHED_EXT_H

#ifdef CONFIG_SCHED_CLASS_EXT

#include <linux/llist.h>
#include <linux/rhashtable-types.h>

/* Constants that are part of the sched_ext API surface. Slice values are in nsecs. */
enum scx_public_consts {
	SCX_OPS_NAME_LEN	= 128,	/* max length of a BPF scheduler's name */

	/*
	 * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses
	 * to set the slice for a task that is selected for execution.
	 * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice
	 * refill has been triggered.
	 *
	 * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass
	 * mode. As making forward progress for all tasks is the main goal of
	 * the bypass mode, a shorter slice is used.
	 */
	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
	SCX_SLICE_BYPASS	= 5 * 1000000,	/* 5ms */
	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
};

/*
 * DSQ (dispatch queue) IDs are 64bit of the format:
 *
 *   Bits: [63] [62 ..  0]
 *         [ B] [   ID   ]
 *
 *    B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs
 *   ID: 63 bit ID
 *
 * Built-in IDs:
 *
 *   Bits: [63] [62] [61..32] [31 .. 0]
 *         [ 1] [ L] [   R  ] [    V  ]
 *
 *    1: 1 for built-in DSQs.
 *    L: 1 for LOCAL_ON DSQ IDs, 0 for others
 *    R: bits [61..32] are not assigned a meaning here — NOTE(review):
 *       presumably reserved; confirm against the users in ext.c
 *    V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
 */
enum scx_dsq_id_flags {
	SCX_DSQ_FLAG_BUILTIN	= 1LLU << 63,	/* the B bit above */
	SCX_DSQ_FLAG_LOCAL_ON	= 1LLU << 62,	/* the L bit above */

	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
	SCX_DSQ_BYPASS		= SCX_DSQ_FLAG_BUILTIN | 3,
	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,	/* extracts V (CPU number) from a LOCAL_ON ID */
};

/*
 * Embedded in struct scx_dsq_pcpu. NOTE(review): the name suggests this
 * records a deferred reenqueue of tasks from a user DSQ on this CPU —
 * confirm against the consumers in ext.c.
 */
struct scx_deferred_reenq_user {
	struct list_head	node;	/* linkage on a pending list — TODO confirm list owner */
	u64			flags;	/* presumably enqueue flags to apply on reenqueue — verify */
};

/* Per-CPU state of a dispatch queue, see scx_dispatch_q->pcpu. */
struct scx_dsq_pcpu {
	struct scx_dispatch_q		*dsq;	/* the associated DSQ */
	struct scx_deferred_reenq_user	deferred_reenq_user;
};

/*
 * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
 * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
 * buffer between the scheduler core and the BPF scheduler. See the
 * documentation for more details.
 */
struct scx_dispatch_q {
	raw_spinlock_t		lock;
	struct task_struct __rcu *first_task;	/* lockless peek at head */
	struct list_head	list;	/* tasks in dispatch order */
	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
	u32			nr;	/* presumably the number of queued tasks — confirm in ext.c */
	u32			seq;	/* used by BPF iter */
	u64			id;	/* DSQ ID, see enum scx_dsq_id_flags */
	struct rhash_head	hash_node;	/* rhashtable linkage — presumably keyed by ->id */
	struct llist_node	free_node;	/* llist linkage — presumably for deferred freeing */
	struct scx_sched	*sched;	/* associated scheduler instance */
	struct scx_dsq_pcpu __percpu *pcpu;	/* per-CPU state, see struct scx_dsq_pcpu */
	struct rcu_head		rcu;	/* for RCU-deferred destruction */
};

/* sched_ext_entity.flags */
enum scx_ent_flags {
	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
	SCX_TASK_IN_CUSTODY	= 1 << 1, /* in custody, needs ops.dequeue() when leaving */
	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
	SCX_TASK_SUB_INIT	= 1 << 4, /* task being initialized for a sub sched */
	SCX_TASK_IMMED		= 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */

	/*
	 * Bits 8 to 10 are used to carry task state:
	 *
	 * NONE       ops.init_task() not called yet
	 * INIT_BEGIN ops.init_task() in flight; see sched_ext_dead()
	 * INIT       ops.init_task() succeeded, but task can be cancelled
	 * READY      fully initialized, but not in sched_ext
	 * ENABLED    fully initialized and in sched_ext
	 * DEAD       terminal state set by sched_ext_dead()
	 */
	SCX_TASK_STATE_SHIFT	= 8,
	SCX_TASK_STATE_BITS	= 3,
	SCX_TASK_STATE_MASK	= ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,

	SCX_TASK_NONE		= 0 << SCX_TASK_STATE_SHIFT,
	SCX_TASK_INIT_BEGIN	= 1 << SCX_TASK_STATE_SHIFT,
	SCX_TASK_INIT		= 2 << SCX_TASK_STATE_SHIFT,
	SCX_TASK_READY		= 3 << SCX_TASK_STATE_SHIFT,
	SCX_TASK_ENABLED	= 4 << SCX_TASK_STATE_SHIFT,
	SCX_TASK_DEAD		= 5 << SCX_TASK_STATE_SHIFT,

	/*
	 * Bits 12 and 13 are used to carry reenqueue reason. In addition to
	 * %SCX_ENQ_REENQ flag, ops.enqueue() can also test for
	 * %SCX_TASK_REENQ_REASON_NONE to distinguish reenqueues.
	 *
	 * NONE       not being reenqueued
	 * KFUNC      reenqueued by scx_bpf_dsq_reenq() and friends
	 * IMMED      reenqueued due to failed ENQ_IMMED
	 * PREEMPTED  preempted while running
	 */
	SCX_TASK_REENQ_REASON_SHIFT = 12,
	SCX_TASK_REENQ_REASON_BITS = 2,
	SCX_TASK_REENQ_REASON_MASK = ((1 << SCX_TASK_REENQ_REASON_BITS) - 1) << SCX_TASK_REENQ_REASON_SHIFT,

	SCX_TASK_REENQ_NONE	= 0 << SCX_TASK_REENQ_REASON_SHIFT,
	SCX_TASK_REENQ_KFUNC	= 1 << SCX_TASK_REENQ_REASON_SHIFT,
	SCX_TASK_REENQ_IMMED	= 2 << SCX_TASK_REENQ_REASON_SHIFT,
	SCX_TASK_REENQ_PREEMPTED = 3 << SCX_TASK_REENQ_REASON_SHIFT,

	/*
	 * Iteration cursor, not a task. Use 1U rather than 1: left-shifting 1
	 * into the sign bit of int is undefined behavior in C; matches the
	 * unsigned-literal style of the 1LLU shifts in scx_dsq_id_flags.
	 */
	SCX_TASK_CURSOR		= 1U << 31,
};

/* scx_entity.dsq_flags, protected by the DSQ lock */
enum scx_ent_dsq_flags {
	SCX_TASK_DSQ_ON_PRIQ	= 1 << 0, /* task is queued on the priority queue of a dsq */
};

/* scx_dsq_list_node.flags */
enum scx_dsq_lnode_flags {
	SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,	/* node is an iteration cursor, not a task */

	/* high 16 bits can be for iter cursor flags */
	__SCX_DSQ_LNODE_PRIV_SHIFT = 16,
};

/* linkage on a DSQ's dispatch-order ->list; also used for iteration cursors */
struct scx_dsq_list_node {
	struct list_head	node;
	u32			flags;	/* see enum scx_dsq_lnode_flags */
	u32			priv;	/* can be used by iter cursor */
};

/*
 * Build an scx_dsq_list_node initializer marking @__cursor as an iteration
 * cursor over @__dsq. Snapshots the DSQ's current ->seq into ->priv —
 * presumably so the iterator can detect queue churn; see the BPF iter
 * implementation for the exact semantics.
 */
#define INIT_DSQ_LIST_CURSOR(__cursor, __dsq, __flags)			\
	(struct scx_dsq_list_node) {					\
		.node = LIST_HEAD_INIT((__cursor).node),		\
		.flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags),		\
		.priv = READ_ONCE((__dsq)->seq),			\
	}

struct scx_sched;

/*
 * The following is embedded in task_struct and contains all fields necessary
 * for a task to be scheduled by SCX.
 */
struct sched_ext_entity {
#ifdef CONFIG_CGROUPS
	/*
	 * Associated scx_sched. Updated either during fork or while holding
	 * both p->pi_lock and rq lock.
	 */
	struct scx_sched __rcu	*sched;
#endif
	struct scx_dispatch_q	*dsq;		/* DSQ this task is currently on, if any */
	atomic_long_t		ops_state;	/* ops state machine word — see ext.c for encoding */
	u64			ddsp_dsq_id;	/* NOTE(review): presumably direct-dispatch target DSQ ID — confirm */
	u64			ddsp_enq_flags;	/* enqueue flags accompanying ddsp_dsq_id — confirm */
	struct scx_dsq_list_node dsq_list;	/* dispatch order */
	struct rb_node		dsq_priq;	/* p->scx.dsq_vtime order */
	u32			dsq_seq;
	u32			dsq_flags;	/* protected by DSQ lock, see enum scx_ent_dsq_flags */
	u32			flags;		/* protected by rq lock, see enum scx_ent_flags */
	u32			weight;		/* scheduling weight — TODO confirm scale */
	s32			sticky_cpu;
	s32			holding_cpu;
	s32			selected_cpu;
	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */

	struct list_head	runnable_node;	/* rq->scx.runnable_list */
	unsigned long		runnable_at;	/* when the task became runnable; jiffies presumed — confirm */

#ifdef CONFIG_SCHED_CORE
	u64			core_sched_at;	/* see scx_prio_less() */
#endif

	/* BPF scheduler modifiable fields */

	/*
	 * Runtime budget in nsecs. This is usually set through
	 * scx_bpf_dsq_insert() but can also be modified directly by the BPF
	 * scheduler. Automatically decreased by SCX as the task executes. On
	 * depletion, a scheduling event is triggered.
	 *
	 * This value is cleared to zero if the task is preempted by
	 * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
	 * task ran. Use p->se.sum_exec_runtime instead.
	 */
	u64			slice;

	/*
	 * Used to order tasks when dispatching to the vtime-ordered priority
	 * queue of a dsq. This is usually set through
	 * scx_bpf_dsq_insert_vtime() but can also be modified directly by the
	 * BPF scheduler. Modifying it while a task is queued on a dsq may
	 * mangle the ordering and is not recommended.
	 */
	u64			dsq_vtime;

	/*
	 * If set, reject future sched_setscheduler(2) calls updating the policy
	 * to %SCHED_EXT with -%EACCES.
	 *
	 * Can be set from ops.init_task() while the BPF scheduler is being
	 * loaded (!scx_init_task_args->fork). If set and the task's policy is
	 * already %SCHED_EXT, the task's policy is rejected and forcefully
	 * reverted to %SCHED_NORMAL. The number of such events are reported
	 * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
	 * during fork is not allowed.
	 */
	bool			disallow;	/* reject switching into SCX */

	/* cold fields */
#ifdef CONFIG_EXT_GROUP_SCHED
	struct cgroup		*cgrp_moving_from;
#endif
	struct list_head	tasks_node;
};


void sched_ext_dead(struct task_struct *p);
void print_scx_info(const char *log_lvl, struct task_struct *p);
void scx_softlockup(u32 dur_s);
bool scx_hardlockup(int cpu);
bool scx_rcu_cpu_stall(void);

#else	/* !CONFIG_SCHED_CLASS_EXT */

/* !CONFIG_SCHED_CLASS_EXT stub */
static inline void sched_ext_dead(struct task_struct *p) {}
/* !CONFIG_SCHED_CLASS_EXT stub */
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
/* !CONFIG_SCHED_CLASS_EXT stub */
static inline void scx_softlockup(u32 dur_s) {}
/* !CONFIG_SCHED_CLASS_EXT stub: SCX never implicated in hardlockups */
static inline bool scx_hardlockup(int cpu) { return false; }
/* !CONFIG_SCHED_CLASS_EXT stub: SCX never implicated in RCU stalls */
static inline bool scx_rcu_cpu_stall(void) { return false; }

#endif	/* CONFIG_SCHED_CLASS_EXT */

/* sched_ext portion of a task group; empty unless EXT_GROUP_SCHED is enabled */
struct scx_task_group {
#ifdef CONFIG_EXT_GROUP_SCHED
	u32			flags;		/* SCX_TG_* */
	u32			weight;		/* group scheduling weight — TODO confirm scale */
	u64			bw_period_us;	/* bandwidth period in usecs */
	u64			bw_quota_us;	/* bandwidth quota in usecs */
	u64			bw_burst_us;	/* bandwidth burst in usecs */
	bool			idle;		/* NOTE(review): presumably cgroup "idle" weight mode — confirm */
#endif
};


#endif	/* _LINUX_SCHED_EXT_H */