1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst 4 * 5 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 6 * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 7 * Copyright (c) 2022 David Vernet <dvernet@meta.com> 8 */ 9 #ifndef _LINUX_SCHED_EXT_H 10 #define _LINUX_SCHED_EXT_H 11 12 #ifdef CONFIG_SCHED_CLASS_EXT 13 14 #include <linux/llist.h> 15 #include <linux/rhashtable-types.h> 16 17 enum scx_public_consts { 18 SCX_OPS_NAME_LEN = 128, 19 20 /* 21 * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses 22 * to set the slice for a task that is selected for execution. 23 * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice 24 * refill has been triggered. 25 * 26 * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass 27 * mode. As making forward progress for all tasks is the main goal of 28 * the bypass mode, a shorter slice is used. 29 */ 30 SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ 31 SCX_SLICE_BYPASS = 5 * 1000000, /* 5ms */ 32 SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ 33 }; 34 35 /* 36 * DSQ (dispatch queue) IDs are 64bit of the format: 37 * 38 * Bits: [63] [62 .. 0] 39 * [ B] [ ID ] 40 * 41 * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs 42 * ID: 63 bit ID 43 * 44 * Built-in IDs: 45 * 46 * Bits: [63] [62] [61..32] [31 .. 0] 47 * [ 1] [ L] [ R ] [ V ] 48 * 49 * 1: 1 for built-in DSQs. 50 * L: 1 for LOCAL_ON DSQ IDs, 0 for others 51 * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. 52 */ 53 enum scx_dsq_id_flags { 54 SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, 55 SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, 56 57 SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, 58 SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, 59 SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, 60 SCX_DSQ_BYPASS = SCX_DSQ_FLAG_BUILTIN | 3, 61 SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, 62 SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, 63 }; 64 65 struct scx_deferred_reenq_user { 66 struct list_head node; 67 u64 flags; 68 }; 69 70 struct scx_dsq_pcpu { 71 struct scx_dispatch_q *dsq; 72 struct scx_deferred_reenq_user deferred_reenq_user; 73 }; 74 75 /* 76 * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered 77 * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to 78 * buffer between the scheduler core and the BPF scheduler. See the 79 * documentation for more details. 80 */ 81 struct scx_dispatch_q { 82 raw_spinlock_t lock; 83 struct task_struct __rcu *first_task; /* lockless peek at head */ 84 struct list_head list; /* tasks in dispatch order */ 85 struct rb_root priq; /* used to order by p->scx.dsq_vtime */ 86 u32 nr; 87 u32 seq; /* used by BPF iter */ 88 u64 id; 89 struct rhash_head hash_node; 90 struct llist_node free_node; 91 struct scx_sched *sched; 92 struct scx_dsq_pcpu __percpu *pcpu; 93 struct rcu_head rcu; 94 }; 95 96 /* sched_ext_entity.flags */ 97 enum scx_ent_flags { 98 SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ 99 SCX_TASK_IN_CUSTODY = 1 << 1, /* in custody, needs ops.dequeue() when leaving */ 100 SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ 101 SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ 102 SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ 103 SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ 104 105 /* 106 * Bits 8 and 9 are used to carry task state: 107 * 108 * NONE ops.init_task() not called yet 109 * INIT ops.init_task() succeeded, but task can be cancelled 110 * READY fully initialized, but not in sched_ext 111 * ENABLED fully initialized and in sched_ext 112 */ 113 SCX_TASK_STATE_SHIFT = 8, /* bits 8 and 9 are used to carry task state */ 114 SCX_TASK_STATE_BITS = 2, 115 SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, 116 117 SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, 118 SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, 119 SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, 120 SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, 121 122 /* 123 * Bits 12 and 13 are used to carry reenqueue reason. In addition to 124 * %SCX_ENQ_REENQ flag, ops.enqueue() can also test for 125 * %SCX_TASK_REENQ_REASON_NONE to distinguish reenqueues. 126 * 127 * NONE not being reenqueued 128 * KFUNC reenqueued by scx_bpf_dsq_reenq() and friends 129 * IMMED reenqueued due to failed ENQ_IMMED 130 * PREEMPTED preempted while running 131 */ 132 SCX_TASK_REENQ_REASON_SHIFT = 12, 133 SCX_TASK_REENQ_REASON_BITS = 2, 134 SCX_TASK_REENQ_REASON_MASK = ((1 << SCX_TASK_REENQ_REASON_BITS) - 1) << SCX_TASK_REENQ_REASON_SHIFT, 135 136 SCX_TASK_REENQ_NONE = 0 << SCX_TASK_REENQ_REASON_SHIFT, 137 SCX_TASK_REENQ_KFUNC = 1 << SCX_TASK_REENQ_REASON_SHIFT, 138 SCX_TASK_REENQ_IMMED = 2 << SCX_TASK_REENQ_REASON_SHIFT, 139 SCX_TASK_REENQ_PREEMPTED = 3 << SCX_TASK_REENQ_REASON_SHIFT, 140 141 /* iteration cursor, not a task */ 142 SCX_TASK_CURSOR = 1 << 31, 143 }; 144 145 /* scx_entity.dsq_flags */ 146 enum scx_ent_dsq_flags { 147 SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ 148 }; 149 150 enum scx_dsq_lnode_flags { 151 SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0, 152 153 /* high 16 bits can be for iter cursor flags */ 154 __SCX_DSQ_LNODE_PRIV_SHIFT = 16, 155 }; 156 157 struct scx_dsq_list_node { 158 struct list_head node; 159 u32 flags; 160 u32 priv; /* can be used by iter cursor */ 161 }; 162 163 #define INIT_DSQ_LIST_CURSOR(__cursor, __dsq, __flags) \ 164 (struct scx_dsq_list_node) { \ 165 .node = LIST_HEAD_INIT((__cursor).node), \ 166 .flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \ 167 .priv = READ_ONCE((__dsq)->seq), \ 168 } 169 170 struct scx_sched; 171 172 /* 173 * The following is embedded in task_struct and contains all fields necessary 174 * for a task to be scheduled by SCX. 175 */ 176 struct sched_ext_entity { 177 #ifdef CONFIG_CGROUPS 178 /* 179 * Associated scx_sched. Updated either during fork or while holding 180 * both p->pi_lock and rq lock. 181 */ 182 struct scx_sched __rcu *sched; 183 #endif 184 struct scx_dispatch_q *dsq; 185 atomic_long_t ops_state; 186 u64 ddsp_dsq_id; 187 u64 ddsp_enq_flags; 188 struct scx_dsq_list_node dsq_list; /* dispatch order */ 189 struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ 190 u32 dsq_seq; 191 u32 dsq_flags; /* protected by DSQ lock */ 192 u32 flags; /* protected by rq lock */ 193 u32 weight; 194 s32 sticky_cpu; 195 s32 holding_cpu; 196 s32 selected_cpu; 197 struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ 198 199 struct list_head runnable_node; /* rq->scx.runnable_list */ 200 unsigned long runnable_at; 201 202 #ifdef CONFIG_SCHED_CORE 203 u64 core_sched_at; /* see scx_prio_less() */ 204 #endif 205 206 /* BPF scheduler modifiable fields */ 207 208 /* 209 * Runtime budget in nsecs. This is usually set through 210 * scx_bpf_dsq_insert() but can also be modified directly by the BPF 211 * scheduler. Automatically decreased by SCX as the task executes. On 212 * depletion, a scheduling event is triggered. 213 * 214 * This value is cleared to zero if the task is preempted by 215 * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the 216 * task ran. Use p->se.sum_exec_runtime instead. 217 */ 218 u64 slice; 219 220 /* 221 * Used to order tasks when dispatching to the vtime-ordered priority 222 * queue of a dsq. This is usually set through 223 * scx_bpf_dsq_insert_vtime() but can also be modified directly by the 224 * BPF scheduler. Modifying it while a task is queued on a dsq may 225 * mangle the ordering and is not recommended. 226 */ 227 u64 dsq_vtime; 228 229 /* 230 * If set, reject future sched_setscheduler(2) calls updating the policy 231 * to %SCHED_EXT with -%EACCES. 232 * 233 * Can be set from ops.init_task() while the BPF scheduler is being 234 * loaded (!scx_init_task_args->fork). If set and the task's policy is 235 * already %SCHED_EXT, the task's policy is rejected and forcefully 236 * reverted to %SCHED_NORMAL. The number of such events are reported 237 * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag 238 * during fork is not allowed. 239 */ 240 bool disallow; /* reject switching into SCX */ 241 242 /* cold fields */ 243 #ifdef CONFIG_EXT_GROUP_SCHED 244 struct cgroup *cgrp_moving_from; 245 #endif 246 struct list_head tasks_node; 247 }; 248 249 void sched_ext_dead(struct task_struct *p); 250 void print_scx_info(const char *log_lvl, struct task_struct *p); 251 void scx_softlockup(u32 dur_s); 252 bool scx_hardlockup(int cpu); 253 bool scx_rcu_cpu_stall(void); 254 255 #else /* !CONFIG_SCHED_CLASS_EXT */ 256 257 static inline void sched_ext_dead(struct task_struct *p) {} 258 static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} 259 static inline void scx_softlockup(u32 dur_s) {} 260 static inline bool scx_hardlockup(int cpu) { return false; } 261 static inline bool scx_rcu_cpu_stall(void) { return false; } 262 263 #endif /* CONFIG_SCHED_CLASS_EXT */ 264 265 struct scx_task_group { 266 #ifdef CONFIG_EXT_GROUP_SCHED 267 u32 flags; /* SCX_TG_* */ 268 u32 weight; 269 u64 bw_period_us; 270 u64 bw_quota_us; 271 u64 bw_burst_us; 272 bool idle; 273 #endif 274 }; 275 276 #endif /* _LINUX_SCHED_EXT_H */ 277