// SPDX-License-Identifier: GPL-2.0
/*
 * A scheduler that validates ops.dequeue() is called correctly:
 * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
 *   scheduler entirely: no ops.dequeue() should be called
 * - Tasks dispatched to user DSQs from ops.enqueue() enter BPF custody:
 *   ops.dequeue() must be called when they leave custody
 * - Every ops.enqueue() dispatch to non-terminal DSQs is followed by
 *   exactly one ops.dequeue() (validate 1:1 pairing and state machine)
 *
 * Copyright (c) 2026 NVIDIA Corporation.
 */

#include <scx/common.bpf.h>

/* ID of the shared user DSQ created in ops.init() (scenarios 2 and 5). */
#define SHARED_DSQ 0

/*
 * BPF internal queue.
 *
 * Tasks are stored here and consumed from ops.dispatch(), validating that
 * tasks on BPF internal structures still get ops.dequeue() when they
 * leave.
 */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 32768);
	__type(value, s32);	/* task PID */
} global_queue SEC(".maps");

char _license[] SEC("license") = "GPL";

UEI_DEFINE(uei);

/*
 * Counters to track the lifecycle of tasks:
 * - enqueue_cnt: Number of times ops.enqueue() was called
 * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
 * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
 * - change_dequeue_cnt: Number of property change dequeues
 * - bpf_queue_full: Number of times the BPF internal queue was full
 */
u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt, bpf_queue_full;

/*
 * Test scenarios:
 *  0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
 *     scheduler, no dequeue callbacks)
 *  1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
 *     scheduler, no dequeue callbacks)
 *  2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
 *     dequeue callbacks expected)
 *  3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
 *     scheduler, no dequeue callbacks)
 *  4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
 *     scheduler, no dequeue callbacks)
 *  5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
 *     dequeue callbacks expected)
 *  6) BPF internal queue from ops.enqueue(): store task PIDs in ops.enqueue(),
 *     consume in ops.dispatch() and dispatch to local DSQ (validates dequeue
 *     for tasks stored in internal BPF data structures)
 *
 * Selected by user space before attaching; read-only from the BPF side.
 */
u32 test_scenario;

/*
 * Per-task state to track lifecycle and validate workflow semantics.
 * State transitions:
 *   NONE -> ENQUEUED (on enqueue)
 *   NONE -> DISPATCHED (on direct dispatch to terminal DSQ)
 *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
 *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
 *   ENQUEUED -> NONE (on property change dequeue before dispatch)
 */
enum task_state {
	TASK_NONE = 0,
	TASK_ENQUEUED,
	TASK_DISPATCHED,
};

struct task_ctx {
	enum task_state state;	/* Current state in the workflow */
	u64 enqueue_seq;	/* Sequence number for debugging */
};

/* Per-task storage holding a struct task_ctx, created in ops.init_task(). */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

/*
 * Return @p's task context, or NULL if it was never created (no
 * BPF_LOCAL_STORAGE_GET_F_CREATE here: lookup only).
 */
static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
{
	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
}

s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;

	tctx = try_lookup_task_ctx(p);
	if (!tctx)
		return prev_cpu;

	switch (test_scenario) {
	case 0:
		/*
		 * Direct dispatch to the local DSQ.
		 *
		 * Task bypasses BPF scheduler entirely: no enqueue
		 * tracking, no ops.dequeue() callbacks.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		tctx->state = TASK_DISPATCHED;
		break;
	case 1:
		/*
		 * Direct dispatch to the global DSQ.
		 *
		 * Task bypasses BPF scheduler entirely: no enqueue
		 * tracking, no ops.dequeue() callbacks.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
		tctx->state = TASK_DISPATCHED;
		break;
	case 2:
		/*
		 * Dispatch to a shared user DSQ.
		 *
		 * Task enters BPF scheduler management: track
		 * enqueue/dequeue lifecycle and validate state
		 * transitions.
		 */
		if (tctx->state == TASK_ENQUEUED)
			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
				      p->pid, p->comm, tctx->enqueue_seq);

		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);

		__sync_fetch_and_add(&enqueue_cnt, 1);

		tctx->state = TASK_ENQUEUED;
		tctx->enqueue_seq++;
		break;
	}

	/* Scenarios 3-6 do nothing here; they dispatch from ops.enqueue(). */
	return prev_cpu;
}

void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct task_ctx *tctx;
	s32 pid = p->pid;

	tctx = try_lookup_task_ctx(p);
	if (!tctx)
		return;

	switch (test_scenario) {
	case 3:
		/*
		 * Direct dispatch to the local DSQ.
		 *
		 * Task bypasses BPF scheduler entirely: no enqueue
		 * tracking, no ops.dequeue() callbacks.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
		tctx->state = TASK_DISPATCHED;
		break;
	case 4:
		/*
		 * Direct dispatch to the global DSQ.
		 *
		 * Task bypasses BPF scheduler entirely: no enqueue
		 * tracking, no ops.dequeue() callbacks.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
		tctx->state = TASK_DISPATCHED;
		break;
	case 5:
		/*
		 * Dispatch to shared user DSQ.
		 *
		 * Task enters BPF scheduler management: track
		 * enqueue/dequeue lifecycle and validate state
		 * transitions.
		 */
		if (tctx->state == TASK_ENQUEUED)
			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
				      p->pid, p->comm, tctx->enqueue_seq);

		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);

		__sync_fetch_and_add(&enqueue_cnt, 1);

		tctx->state = TASK_ENQUEUED;
		tctx->enqueue_seq++;
		break;
	case 6:
		/*
		 * Store task in BPF internal queue.
		 *
		 * Task enters BPF scheduler management: track
		 * enqueue/dequeue lifecycle and validate state
		 * transitions.
		 */
		if (tctx->state == TASK_ENQUEUED)
			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
				      p->pid, p->comm, tctx->enqueue_seq);

		if (bpf_map_push_elem(&global_queue, &pid, 0)) {
			/*
			 * Queue full: fall back to the global DSQ so the
			 * task still runs; no dequeue callback expected.
			 */
			scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
			__sync_fetch_and_add(&bpf_queue_full, 1);

			tctx->state = TASK_DISPATCHED;
		} else {
			__sync_fetch_and_add(&enqueue_cnt, 1);

			tctx->state = TASK_ENQUEUED;
			tctx->enqueue_seq++;
		}
		break;
	default:
		/* For all other scenarios, dispatch to the global DSQ */
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
		tctx->state = TASK_DISPATCHED;
		break;
	}

	/* Wake the task's CPU if idle so queued work gets dispatched. */
	scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE);
}

void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
{
	struct task_ctx *tctx;

	__sync_fetch_and_add(&dequeue_cnt, 1);

	tctx = try_lookup_task_ctx(p);
	if (!tctx)
		return;

	/*
	 * For scenarios 0, 1, 3, and 4 (terminal DSQs: local and global),
	 * ops.dequeue() should never be called because tasks bypass the
	 * BPF scheduler entirely. If we get here, it's a kernel bug.
	 */
	if (test_scenario == 0 || test_scenario == 3) {
		scx_bpf_error("%d (%s): dequeue called for local DSQ scenario",
			      p->pid, p->comm);
		return;
	}

	if (test_scenario == 1 || test_scenario == 4) {
		scx_bpf_error("%d (%s): dequeue called for global DSQ scenario",
			      p->pid, p->comm);
		return;
	}

	if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
		/*
		 * Property change interrupting the workflow. Valid from
		 * both ENQUEUED and DISPATCHED states. Transitions task
		 * back to NONE state.
		 */
		__sync_fetch_and_add(&change_dequeue_cnt, 1);

		/* Validate state transition */
		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
			scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu",
				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);

		/*
		 * Transition back to NONE: task outside scheduler control.
		 *
		 * Scenario 6: dispatch() checks tctx->state after popping a
		 * PID, if the task is in state NONE, it was dequeued by
		 * property change and must not be dispatched (this
		 * prevents "target CPU not allowed").
		 */
		tctx->state = TASK_NONE;
	} else {
		/*
		 * Regular dispatch dequeue: kernel is moving the task from
		 * BPF custody to a terminal DSQ. Normally we come from
		 * ENQUEUED state. We can also see TASK_NONE if the task
		 * was dequeued by property change (SCX_DEQ_SCHED_CHANGE)
		 * while it was already on a DSQ (dispatched but not yet
		 * consumed); in that case we just leave state as NONE.
		 */
		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);

		/*
		 * Must be ENQUEUED (normal path) or NONE (already dequeued
		 * by property change while on a DSQ).
		 */
		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_NONE)
			scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu",
				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);

		if (tctx->state == TASK_ENQUEUED)
			tctx->state = TASK_DISPATCHED;

		/* NONE: leave as-is, task was already property-change dequeued */
	}
}

void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
{
	if (test_scenario == 6) {
		struct task_ctx *tctx;
		struct task_struct *p;
		s32 pid;

		if (bpf_map_pop_elem(&global_queue, &pid))
			return;

		/* May fail if the task exited after being queued. */
		p = bpf_task_from_pid(pid);
		if (!p)
			return;

		/*
		 * If the task was dequeued by property change
		 * (ops.dequeue() set tctx->state = TASK_NONE), skip
		 * dispatch.
		 */
		tctx = try_lookup_task_ctx(p);
		if (!tctx || tctx->state == TASK_NONE) {
			bpf_task_release(p);
			return;
		}

		/*
		 * Dispatch to this CPU's local DSQ if allowed, otherwise
		 * fallback to the global DSQ.
		 */
		if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
		else
			scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);

		/* Drop the reference acquired by bpf_task_from_pid(). */
		bpf_task_release(p);
	} else {
		/* All other scenarios consume from the shared user DSQ. */
		scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
	}
}

/*
 * Create the per-task context for every task entering the scheduler.
 * Returns -ENOMEM if task storage allocation fails.
 */
s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	struct task_ctx *tctx;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
				    BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!tctx)
		return -ENOMEM;

	return 0;
}

/* Create the shared user DSQ (any NUMA node) used by scenarios 2 and 5. */
s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
{
	s32 ret;

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		return ret;

	return 0;
}

/* Record exit info so user space can check how the scheduler terminated. */
void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SEC(".struct_ops.link")
struct sched_ext_ops dequeue_ops = {
	.select_cpu		= (void *)dequeue_select_cpu,
	.enqueue		= (void *)dequeue_enqueue,
	.dequeue		= (void *)dequeue_dequeue,
	.dispatch		= (void *)dequeue_dispatch,
	.init_task		= (void *)dequeue_init_task,
	.init			= (void *)dequeue_init,
	.exit			= (void *)dequeue_exit,
	/*
	 * SCX_OPS_ENQ_LAST: ops.enqueue() is also invoked for the last
	 * runnable task on a CPU, so every scenario path keeps being
	 * exercised even on otherwise-idle CPUs.
	 */
	.flags			= SCX_OPS_ENQ_LAST,
	.name			= "dequeue_test",
};