/* SPDX-License-Identifier: GPL-2.0 */
/*
 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#ifndef _LINUX_SCHED_EXT_H
#define _LINUX_SCHED_EXT_H

#ifdef CONFIG_SCHED_CLASS_EXT

#include <linux/llist.h>
#include <linux/rhashtable-types.h>

enum scx_public_consts {
	SCX_OPS_NAME_LEN	= 128,

	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
};

/*
 * DSQ (dispatch queue) IDs are 64-bit values of the format:
 *
 *   Bits: [63] [62 ..  0]
 *         [ B] [    ID   ]
 *
 *    B: 1 for IDs of built-in DSQs, 0 for ops-created user DSQs
 *   ID: 63 bit ID
 *
 * Built-in IDs:
 *
 *   Bits: [63] [62] [61..32] [31 .. 0]
 *         [ 1] [ L] [   R  ] [    V   ]
 *
 *   1: 1 for built-in DSQs.
 *   L: 1 for LOCAL_ON DSQ IDs, 0 for others
 *   R: Reserved, unused by the IDs defined below.
 *   V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
 */
enum scx_dsq_id_flags {
	SCX_DSQ_FLAG_BUILTIN	= 1LLU << 63,
	SCX_DSQ_FLAG_LOCAL_ON	= 1LLU << 62,

	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
};
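
/*
 * Illustrative sketch (not part of this header's API): a BPF scheduler that
 * targets the local DSQ of a specific CPU encodes the CPU number into the V
 * field of a LOCAL_ON ID, and the CPU can be recovered with the mask:
 *
 *	u64 dsq_id = SCX_DSQ_LOCAL_ON | cpu;
 *	s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
 */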

/*
 * A dispatch queue (DSQ) can be either a FIFO or a p->scx.dsq_vtime ordered
 * priority queue. A built-in DSQ is always a FIFO. The built-in local DSQs
 * buffer tasks between the scheduler core and the BPF scheduler. See the
 * documentation for more details.
 */
struct scx_dispatch_q {
	raw_spinlock_t		lock;
	struct list_head	list;	/* tasks in dispatch order */
	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
	u32			nr;
	u32			seq;	/* used by BPF iter */
	u64			id;
	struct rhash_head	hash_node;
	struct llist_node	free_node;
	struct rcu_head		rcu;
};
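
/*
 * Illustrative BPF-side sketch: user DSQs are created with the
 * scx_bpf_create_dsq() kfunc, typically from ops.init(). MY_DSQ_ID is a
 * hypothetical scheduler-chosen constant with bit 63 clear; the -1 NUMA node
 * argument means any node:
 *
 *	s32 BPF_STRUCT_OPS_SLEEPABLE(myops_init)
 *	{
 *		return scx_bpf_create_dsq(MY_DSQ_ID, -1);
 *	}
 */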

/* scx_entity.flags */
enum scx_ent_flags {
	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */

	SCX_TASK_STATE_SHIFT	= 8,	/* bits 8 and 9 carry scx_task_state */
	SCX_TASK_STATE_BITS	= 2,
	SCX_TASK_STATE_MASK	= ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,

	SCX_TASK_CURSOR		= 1 << 31, /* iteration cursor, not a task */
};

/* scx_entity.flags & SCX_TASK_STATE_MASK */
enum scx_task_state {
	SCX_TASK_NONE,		/* ops.init_task() not called yet */
	SCX_TASK_INIT,		/* ops.init_task() succeeded, but task can be cancelled */
	SCX_TASK_READY,		/* fully initialized, but not in sched_ext */
	SCX_TASK_ENABLED,	/* fully initialized and in sched_ext */

	SCX_TASK_NR_STATES,
};
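
/*
 * A minimal sketch of how the state is read out of scx_entity.flags;
 * kernel/sched/ext.c implements a helper along these lines:
 *
 *	static enum scx_task_state scx_get_task_state(const struct task_struct *p)
 *	{
 *		return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT;
 *	}
 */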

/* scx_entity.dsq_flags */
enum scx_ent_dsq_flags {
	SCX_TASK_DSQ_ON_PRIQ	= 1 << 0, /* task is queued on the priority queue of a dsq */
};

/*
 * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
 * everywhere and the following bits track which kfunc sets are currently
 * allowed for %current. This simple per-task tracking works because SCX ops
 * nest in a limited way. BPF will likely implement a way to allow and disallow
 * kfuncs depending on the calling context which will replace this manual
 * mechanism. See scx_kf_allow().
 */
enum scx_kf_mask {
	SCX_KF_UNLOCKED		= 0,	  /* sleepable and not rq locked */
	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
	SCX_KF_CPU_RELEASE	= 1 << 0, /* ops.cpu_release() */
	/*
	 * ops.dispatch() may release the rq lock temporarily and thus ENQUEUE
	 * and SELECT_CPU may be nested inside it. ops.dequeue() (in REST) may
	 * also be nested inside DISPATCH.
	 */
	SCX_KF_DISPATCH		= 1 << 1, /* ops.dispatch() */
	SCX_KF_ENQUEUE		= 1 << 2, /* ops.enqueue() and ops.select_cpu() */
	SCX_KF_SELECT_CPU	= 1 << 3, /* ops.select_cpu() */
	SCX_KF_REST		= 1 << 4, /* other rq-locked operations */

	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
};
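
/*
 * Simplified sketch of the gating this mask enables (the real check in
 * kernel/sched/ext.c also enforces the nesting rules noted above): a kfunc
 * restricted to a given set of operations verifies %current before
 * proceeding, roughly:
 *
 *	static bool scx_kf_allowed(u32 mask)
 *	{
 *		return !!(current->scx.kf_mask & mask);
 *	}
 */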

enum scx_dsq_lnode_flags {
	SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,

	/* the high 16 bits are reserved for iter cursor flags */
	__SCX_DSQ_LNODE_PRIV_SHIFT = 16,
};

struct scx_dsq_list_node {
	struct list_head	node;
	u32			flags;
	u32			priv;	/* can be used by iter cursor */
};

/*
 * The following is embedded in task_struct and contains all fields necessary
 * for a task to be scheduled by SCX.
 */
struct sched_ext_entity {
	struct scx_dispatch_q	*dsq;
	struct scx_dsq_list_node dsq_list;	/* dispatch order */
	struct rb_node		dsq_priq;	/* p->scx.dsq_vtime order */
	u32			dsq_seq;
	u32			dsq_flags;	/* protected by DSQ lock */
	u32			flags;		/* protected by rq lock */
	u32			weight;
	s32			sticky_cpu;
	s32			holding_cpu;
	s32			selected_cpu;
	u32			kf_mask;	/* see scx_kf_mask above */
	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
	atomic_long_t		ops_state;

	struct list_head	runnable_node;	/* rq->scx.runnable_list */
	unsigned long		runnable_at;

#ifdef CONFIG_SCHED_CORE
	u64			core_sched_at;	/* see scx_prio_less() */
#endif
	u64			ddsp_dsq_id;
	u64			ddsp_enq_flags;

	/* BPF scheduler modifiable fields */

	/*
	 * Runtime budget in nsecs. This is usually set through
	 * scx_bpf_dsq_insert() but can also be modified directly by the BPF
	 * scheduler. Automatically decreased by SCX as the task executes. On
	 * depletion, a scheduling event is triggered.
	 *
	 * This value is cleared to zero if the task is preempted by
	 * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
	 * task ran. Use p->se.sum_exec_runtime instead.
	 */
	u64			slice;
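
	/*
	 * Illustrative BPF-side sketch: ops.enqueue() commonly sets the slice
	 * at insertion time. Here a task is queued on the global DSQ with the
	 * default budget; myops_enqueue is a hypothetical name:
	 *
	 *	void BPF_STRUCT_OPS(myops_enqueue, struct task_struct *p,
	 *			    u64 enq_flags)
	 *	{
	 *		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL,
	 *				   enq_flags);
	 *	}
	 */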

	/*
	 * Used to order tasks when dispatching to the vtime-ordered priority
	 * queue of a dsq. This is usually set through
	 * scx_bpf_dsq_insert_vtime() but can also be modified directly by the
	 * BPF scheduler. Modifying it while a task is queued on a dsq may
	 * mangle the ordering and is not recommended.
	 */
	u64			dsq_vtime;
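
	/*
	 * Illustrative BPF-side sketch of weighted fair queueing on top of
	 * dsq_vtime, as it might appear in an ops.enqueue() path; vtime_now
	 * and MY_DSQ_ID are hypothetical scheduler-side state:
	 *
	 *	if ((s64)(p->scx.dsq_vtime - vtime_now) < 0)
	 *		p->scx.dsq_vtime = vtime_now;
	 *	scx_bpf_dsq_insert_vtime(p, MY_DSQ_ID, SCX_SLICE_DFL,
	 *				 p->scx.dsq_vtime, enq_flags);
	 */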

	/*
	 * If set, reject future sched_setscheduler(2) calls updating the policy
	 * to %SCHED_EXT with -%EACCES.
	 *
	 * Can be set from ops.init_task() while the BPF scheduler is being
	 * loaded (!scx_init_task_args->fork). If set and the task's policy is
	 * already %SCHED_EXT, the task's policy is rejected and forcefully
	 * reverted to %SCHED_NORMAL. The number of such events is reported
	 * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
	 * during fork is not allowed.
	 */
	bool			disallow;	/* reject switching into SCX */
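
	/*
	 * Illustrative BPF-side sketch: a loading scheduler can opt tasks out
	 * from ops.init_task(); task_is_managed() and myops_init_task are
	 * hypothetical:
	 *
	 *	s32 BPF_STRUCT_OPS(myops_init_task, struct task_struct *p,
	 *			   struct scx_init_task_args *args)
	 *	{
	 *		if (!args->fork && !task_is_managed(p))
	 *			p->scx.disallow = true;
	 *		return 0;
	 *	}
	 */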

	/* cold fields */
#ifdef CONFIG_EXT_GROUP_SCHED
	struct cgroup		*cgrp_moving_from;
#endif
	struct list_head	tasks_node;
};

void sched_ext_free(struct task_struct *p);
void print_scx_info(const char *log_lvl, struct task_struct *p);
void scx_softlockup(u32 dur_s);
bool scx_rcu_cpu_stall(void);

#else	/* !CONFIG_SCHED_CLASS_EXT */

static inline void sched_ext_free(struct task_struct *p) {}
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
static inline void scx_softlockup(u32 dur_s) {}
static inline bool scx_rcu_cpu_stall(void) { return false; }

#endif	/* CONFIG_SCHED_CLASS_EXT */

struct scx_task_group {
#ifdef CONFIG_EXT_GROUP_SCHED
	u32			flags;		/* SCX_TG_* */
	u32			weight;
	u64			bw_period_us;	/* bandwidth period in usecs */
	u64			bw_quota_us;	/* bandwidth quota per period */
	u64			bw_burst_us;	/* allowed bandwidth burst */
#endif
};

#endif	/* _LINUX_SCHED_EXT_H */