/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS		= 1000000000,
	SHARED_DSQ		= 0,
};

char _license[] SEC("license") = "GPL";

const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_batch;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_dequeued;

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return -ESRCH;
	}

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		tctx->force_local = true;
		return prev_cpu;
	}

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return prev_cpu;
}
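
/*
 * For example, a task at the default priority (nice 0) has a compound weight
 * of 100, which weight_to_idx() below maps to queue2, while a task with
 * weight 25 or less lands in queue0 and one with weight 400 or more lands in
 * queue4.
 */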
static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	/* Is select_cpu() telling us to enqueue locally? */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to the shared DSQ. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
}

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (scx_bpf_consume(SHARED_DSQ))
		return;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			__sync_fetch_and_add(&nr_dispatched, 1);
			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);
			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				scx_bpf_consume(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}
}
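
/*
 * For example, when a CPU's dispatch cursor reaches queue3 above, dsp_cnt is
 * set to 1 << 3 == 8, so up to eight PIDs are popped from that FIFO and
 * dispatched to SHARED_DSQ before the cursor advances, subject to the
 * dsp_batch cap and the free slots reported by scx_bpf_dispatch_nr_slots().
 */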
254 */ 255 if (bpf_task_storage_get(&task_ctx_stor, p, 0, 256 BPF_LOCAL_STORAGE_GET_F_CREATE)) 257 return 0; 258 else 259 return -ENOMEM; 260 } 261 262 void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) 263 { 264 s32 i, pid; 265 266 if (suppress_dump) 267 return; 268 269 bpf_for(i, 0, 5) { 270 void *fifo; 271 272 if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i))) 273 return; 274 275 scx_bpf_dump("QMAP FIFO[%d]:", i); 276 bpf_repeat(4096) { 277 if (bpf_map_pop_elem(fifo, &pid)) 278 break; 279 scx_bpf_dump(" %d", pid); 280 } 281 scx_bpf_dump("\n"); 282 } 283 } 284 285 void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle) 286 { 287 u32 zero = 0; 288 struct cpu_ctx *cpuc; 289 290 if (suppress_dump || idle) 291 return; 292 if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) 293 return; 294 295 scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu", 296 cpuc->dsp_idx, cpuc->dsp_cnt); 297 } 298 299 void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) 300 { 301 struct task_ctx *taskc; 302 303 if (suppress_dump) 304 return; 305 if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) 306 return; 307 308 scx_bpf_dump("QMAP: force_local=%d", 309 taskc->force_local); 310 } 311 312 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) 313 { 314 return scx_bpf_create_dsq(SHARED_DSQ, -1); 315 } 316 317 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) 318 { 319 UEI_RECORD(uei, ei); 320 } 321 322 SCX_OPS_DEFINE(qmap_ops, 323 .select_cpu = (void *)qmap_select_cpu, 324 .enqueue = (void *)qmap_enqueue, 325 .dequeue = (void *)qmap_dequeue, 326 .dispatch = (void *)qmap_dispatch, 327 .init_task = (void *)qmap_init_task, 328 .dump = (void *)qmap_dump, 329 .dump_cpu = (void *)qmap_dump_cpu, 330 .dump_task = (void *)qmap_dump_task, 331 .init = (void *)qmap_init, 332 .exit = (void *)qmap_exit, 333 .timeout_ms = 5000U, 334 .name = "qmap"); 335