/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS	= 1000000000,
	SHARED_DSQ	= 0,
};

char _license[] SEC("license") = "GPL";

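/*
 * Tunables intended to be set by the userspace loader before the scheduler is
 * loaded; const volatile globals land in rodata, so the verifier sees the
 * loaded values as constants. stall_user_nth, stall_kernel_nth and
 * dsp_inf_loop_after deliberately inject misbehavior to exercise the watchdog
 * and error handling paths.
 */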
const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return -ESRCH;
	}

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		tctx->force_local = true;
		return prev_cpu;
	}

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return prev_cpu;
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	/* Is select_cpu() telling us to enqueue locally? */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the shared DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. On overflow, punt to the shared DSQ. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
}

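/*
 * Called when @cpu needs more tasks to run and its local DSQ is empty. First
 * try to consume from the shared DSQ. Otherwise, round-robin through the five
 * FIFOs, moving geometrically more tasks (1, 2, 4, 8, 16) from the
 * higher-indexed ones onto the shared DSQ, bounded by the batch size and the
 * number of free dispatch slots.
 */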
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (scx_bpf_consume(SHARED_DSQ))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			__sync_fetch_and_add(&nr_dispatched, 1);
			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);
			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				scx_bpf_consume(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}
}

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	u32 cnt;

	/*
	 * Called when @cpu is taken by a higher priority scheduling class. This
	 * makes @cpu no longer available for executing sched_ext tasks. As we
	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
	 * becomes available again, re-enqueue them into the shared DSQ. See
	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
	 */
	cnt = scx_bpf_reenqueue_local();
	if (cnt)
		__sync_fetch_and_add(&nr_reenqueued, cnt);
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			scx_bpf_dump(" %d", pid);
		}
		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu",
		     cpuc->dsp_idx, cpuc->dsp_cnt);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d",
		     taskc->force_local);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu = (void *)qmap_select_cpu,
	       .enqueue = (void *)qmap_enqueue,
	       .dequeue = (void *)qmap_dequeue,
	       .dispatch = (void *)qmap_dispatch,
	       .cpu_release = (void *)qmap_cpu_release,
	       .init_task = (void *)qmap_init_task,
	       .dump = (void *)qmap_dump,
	       .dump_cpu = (void *)qmap_dump_cpu,
	       .dump_task = (void *)qmap_dump_task,
	       .init = (void *)qmap_init,
	       .exit = (void *)qmap_exit,
	       .timeout_ms = 5000U,
	       .name = "qmap");