/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS	= 1000000000,
	SHARED_DSQ	= 0,
};

char _license[] SEC("license") = "GPL";

const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 dsp_batch;

u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

/* Per-CPU round-robin dispatch state */
struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_dequeued;

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return -ESRCH;
	}

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		tctx->force_local = true;
		return prev_cpu;
	}

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return prev_cpu;
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	/* Is select_cpu() telling us to enqueue locally? */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/*
	 * Queue on the selected FIFO. If the FIFO overflows, punt to the
	 * shared DSQ.
	 */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
}

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (scx_bpf_consume(SHARED_DSQ))
		return;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			__sync_fetch_and_add(&nr_dispatched, 1);
			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);
			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				scx_bpf_consume(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}
}
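
/*
 * Worked example of the queueing and dispatch math above: a task at the
 * default weight (100) maps to queue2 (100 < 200). When a CPU's dispatch
 * cursor reaches index 2, up to 1 << 2 == 4 PIDs are popped from queue2
 * before the cursor advances, subject to the dsp_batch limit and the number
 * of free dispatch slots.
 */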

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu = (void *)qmap_select_cpu,
	       .enqueue = (void *)qmap_enqueue,
	       .dequeue = (void *)qmap_dequeue,
	       .dispatch = (void *)qmap_dispatch,
	       .init_task = (void *)qmap_init_task,
	       .init = (void *)qmap_init,
	       .exit = (void *)qmap_exit,
	       .name = "qmap");
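
/*
 * Usage sketch (illustrative only, not part of the original example):
 * qmap_ops is a struct_ops map, so a userspace loader registers this
 * scheduler by loading the BPF object and attaching that map. The real
 * example ships with a dedicated scx_qmap.c loader built on the libbpf
 * skeleton; the plain-libbpf sketch below and the "scx_qmap.bpf.o" path
 * are assumptions for illustration, with error handling omitted.
 *
 *	#include <bpf/libbpf.h>
 *
 *	struct bpf_object *obj = bpf_object__open_file("scx_qmap.bpf.o", NULL);
 *	struct bpf_map *ops_map;
 *	struct bpf_link *link;
 *
 *	bpf_object__load(obj);
 *	ops_map = bpf_object__find_map_by_name(obj, "qmap_ops");
 *	link = bpf_map__attach_struct_ops(ops_map);
 *
 *	// The scheduler stays active until the link is destroyed.
 *	bpf_link__destroy(link);
 *	bpf_object__close(obj);
 */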