/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS	= 1000000000,
	SHARED_DSQ	= 0,
};

char _license[] SEC("license") = "GPL";

const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/*
 * If enabled, the CPU performance target is set according to the queue index
 * per the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and is
 * used when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
	u64	core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
	u32	avg_weight;
	u32	cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
u64 nr_core_sched_execed;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return -ESRCH;
	}

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		tctx->force_local = true;
		return prev_cpu;
	}

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return prev_cpu;
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in
	 * qmap_ops.flags.
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * If qmap_select_cpu() is telling us to or this is the last runnable
	 * task on the CPU, enqueue locally.
	 */
	if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) {
		tctx->force_local = false;
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the shared DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/*
	 * Queue on the selected FIFO. If the FIFO overflows, punt to the
	 * shared DSQ.
	 */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

static void update_core_sched_head_seq(struct task_struct *p)
{
	struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	int idx = weight_to_idx(p->scx.weight);

	if (tctx)
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
	else
		scx_bpf_error("task_ctx lookup failed");
}

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (scx_bpf_consume(SHARED_DSQ))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);
			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);
			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				scx_bpf_consume(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}
}

void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As the queue index increments, the priority doubles. The queue w/
	 * index 3 is dispatched twice as frequently as the one w/ index 2.
	 * Reflect the difference by scaling qdists accordingly. Note that the
	 * shift amount needs to be flipped depending on the sign to avoid
	 * flipping the priority direction.
	 */
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode about the same ordering as
 * the regular scheduling path. Use the priority-scaled distances from the head
 * of the queues to compare the two tasks which should be consistent with the
 * dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	u32 cnt;

	/*
	 * Called when @cpu is taken by a higher priority scheduling class. This
	 * makes @cpu no longer available for executing sched_ext tasks. As we
	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
	 * becomes available again, re-enqueue them into the shared DSQ. See
	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
	 */
	cnt = scx_bpf_reenqueue_local();
	if (cnt)
		__sync_fetch_and_add(&nr_reenqueued, cnt);
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			scx_bpf_dump(" %d", pid);
		}
		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	bpf_printk("CPU %d coming online", cpu);
	/* @cpu is already online at this point */
	print_cpus();
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	bpf_printk("CPU %d going offline", cpu);
	/* @cpu is still online at this point */
	print_cpus();
}

struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}

static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	monitor_cpuperf();

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .tick			= (void *)qmap_tick,
	       .core_sched_before	= (void *)qmap_core_sched_before,
	       .cpu_release		= (void *)qmap_cpu_release,
	       .init_task		= (void *)qmap_init_task,
	       .dump			= (void *)qmap_dump,
	       .dump_cpu		= (void *)qmap_dump_cpu,
	       .dump_task		= (void *)qmap_dump_task,
	       .cpu_online		= (void *)qmap_cpu_online,
	       .cpu_offline		= (void *)qmap_cpu_offline,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .flags			= SCX_OPS_ENQ_LAST,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");