1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * A simple five-level FIFO queue scheduler. 4 * 5 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets 6 * assigned to one depending on its compound weight. Each CPU round robins 7 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from 8 * queue0, 2 from queue1, 4 from queue2 and so on. 9 * 10 * This scheduler demonstrates: 11 * 12 * - BPF-side queueing using PIDs. 13 * - Sleepable per-task storage allocation using ops.prep_enable(). 14 * - Core-sched support. 15 * 16 * This scheduler is primarily for demonstration and testing of sched_ext 17 * features and unlikely to be useful for actual workloads. 18 * 19 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 20 * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 21 * Copyright (c) 2022 David Vernet <dvernet@meta.com> 22 */ 23 #include <scx/common.bpf.h> 24 25 enum consts { 26 ONE_SEC_IN_NS = 1000000000, 27 ONE_MSEC_IN_NS = 1000000, 28 LOWPRI_INTV_NS = 10 * ONE_MSEC_IN_NS, 29 SHARED_DSQ = 0, 30 HIGHPRI_DSQ = 1, 31 LOWPRI_DSQ = 2, 32 HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */ 33 }; 34 35 char _license[] SEC("license") = "GPL"; 36 37 const volatile u64 slice_ns; 38 const volatile u32 stall_user_nth; 39 const volatile u32 stall_kernel_nth; 40 const volatile u32 dsp_inf_loop_after; 41 const volatile u32 dsp_batch; 42 const volatile bool highpri_boosting; 43 const volatile bool print_dsqs_and_events; 44 const volatile bool print_msgs; 45 const volatile u64 sub_cgroup_id; 46 const volatile s32 disallow_tgid; 47 const volatile bool suppress_dump; 48 const volatile bool always_enq_immed; 49 const volatile u32 immed_stress_nth; 50 51 u64 nr_highpri_queued; 52 u32 test_error_cnt; 53 54 #define MAX_SUB_SCHEDS 8 55 u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS]; 56 57 UEI_DEFINE(uei); 58 59 struct qmap { 60 __uint(type, BPF_MAP_TYPE_QUEUE); 61 __uint(max_entries, 4096); 62 __type(value, u32); 63 } queue0 SEC(".maps"), 64 queue1 SEC(".maps"), 65 queue2 SEC(".maps"), 66 queue3 SEC(".maps"), 67 queue4 SEC(".maps"), 68 dump_store SEC(".maps"); 69 70 struct { 71 __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); 72 __uint(max_entries, 5); 73 __type(key, int); 74 __array(values, struct qmap); 75 } queue_arr SEC(".maps") = { 76 .values = { 77 [0] = &queue0, 78 [1] = &queue1, 79 [2] = &queue2, 80 [3] = &queue3, 81 [4] = &queue4, 82 }, 83 }; 84 85 /* 86 * If enabled, CPU performance target is set according to the queue index 87 * according to the following table. 88 */ 89 static const u32 qidx_to_cpuperf_target[] = { 90 [0] = SCX_CPUPERF_ONE * 0 / 4, 91 [1] = SCX_CPUPERF_ONE * 1 / 4, 92 [2] = SCX_CPUPERF_ONE * 2 / 4, 93 [3] = SCX_CPUPERF_ONE * 3 / 4, 94 [4] = SCX_CPUPERF_ONE * 4 / 4, 95 }; 96 97 /* 98 * Per-queue sequence numbers to implement core-sched ordering. 99 * 100 * Tail seq is assigned to each queued task and incremented. Head seq tracks the 101 * sequence number of the latest dispatched task. The distance between the a 102 * task's seq and the associated queue's head seq is called the queue distance 103 * and used when comparing two tasks for ordering. See qmap_core_sched_before(). 104 */ 105 static u64 core_sched_head_seqs[5]; 106 static u64 core_sched_tail_seqs[5]; 107 108 /* Per-task scheduling context */ 109 struct task_ctx { 110 bool force_local; /* Dispatch directly to local_dsq */ 111 bool highpri; 112 u64 core_sched_seq; 113 }; 114 115 struct { 116 __uint(type, BPF_MAP_TYPE_TASK_STORAGE); 117 __uint(map_flags, BPF_F_NO_PREALLOC); 118 __type(key, int); 119 __type(value, struct task_ctx); 120 } task_ctx_stor SEC(".maps"); 121 122 struct cpu_ctx { 123 u64 dsp_idx; /* dispatch index */ 124 u64 dsp_cnt; /* remaining count */ 125 u32 avg_weight; 126 u32 cpuperf_target; 127 }; 128 129 struct { 130 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 131 __uint(max_entries, 1); 132 __type(key, u32); 133 __type(value, struct cpu_ctx); 134 } cpu_ctx_stor SEC(".maps"); 135 136 /* Statistics */ 137 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq; 138 u64 nr_core_sched_execed; 139 u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer; 140 u32 cpuperf_min, cpuperf_avg, cpuperf_max; 141 u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; 142 143 static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu) 144 { 145 s32 cpu; 146 147 if (!always_enq_immed && p->nr_cpus_allowed == 1) 148 return prev_cpu; 149 150 if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) 151 return prev_cpu; 152 153 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); 154 if (cpu >= 0) 155 return cpu; 156 157 return -1; 158 } 159 160 static struct task_ctx *lookup_task_ctx(struct task_struct *p) 161 { 162 struct task_ctx *tctx; 163 164 if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) { 165 scx_bpf_error("task_ctx lookup failed"); 166 return NULL; 167 } 168 return tctx; 169 } 170 171 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, 172 s32 prev_cpu, u64 wake_flags) 173 { 174 struct task_ctx *tctx; 175 s32 cpu; 176 177 if (!(tctx = lookup_task_ctx(p))) 178 return -ESRCH; 179 180 if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD)) 181 return prev_cpu; 182 183 cpu = pick_direct_dispatch_cpu(p, prev_cpu); 184 185 if (cpu >= 0) { 186 tctx->force_local = true; 187 return cpu; 188 } else { 189 return prev_cpu; 190 } 191 } 192 193 static int weight_to_idx(u32 weight) 194 { 195 /* Coarsely map the compound weight to a FIFO. */ 196 if (weight <= 25) 197 return 0; 198 else if (weight <= 50) 199 return 1; 200 else if (weight < 200) 201 return 2; 202 else if (weight < 400) 203 return 3; 204 else 205 return 4; 206 } 207 208 void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) 209 { 210 static u32 user_cnt, kernel_cnt; 211 struct task_ctx *tctx; 212 u32 pid = p->pid; 213 int idx = weight_to_idx(p->scx.weight); 214 void *ring; 215 s32 cpu; 216 217 if (enq_flags & SCX_ENQ_REENQ) { 218 __sync_fetch_and_add(&nr_reenqueued, 1); 219 if (scx_bpf_task_cpu(p) == 0) 220 __sync_fetch_and_add(&nr_reenqueued_cpu0, 1); 221 } 222 223 if (p->flags & PF_KTHREAD) { 224 if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) 225 return; 226 } else { 227 if (stall_user_nth && !(++user_cnt % stall_user_nth)) 228 return; 229 } 230 231 if (test_error_cnt && !--test_error_cnt) 232 scx_bpf_error("test triggering error"); 233 234 if (!(tctx = lookup_task_ctx(p))) 235 return; 236 237 /* 238 * All enqueued tasks must have their core_sched_seq updated for correct 239 * core-sched ordering. Also, take a look at the end of qmap_dispatch(). 240 */ 241 tctx->core_sched_seq = core_sched_tail_seqs[idx]++; 242 243 /* 244 * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch 245 * directly to prev_cpu's local DSQ even when busy to force dsq->nr > 1 246 * and exercise the kernel IMMED reenqueue trigger paths. 247 */ 248 if (immed_stress_nth && !(enq_flags & SCX_ENQ_REENQ)) { 249 static u32 immed_stress_cnt; 250 251 if (!(++immed_stress_cnt % immed_stress_nth)) { 252 tctx->force_local = false; 253 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p), 254 slice_ns, enq_flags); 255 return; 256 } 257 } 258 259 /* 260 * If qmap_select_cpu() is telling us to or this is the last runnable 261 * task on the CPU, enqueue locally. 262 */ 263 if (tctx->force_local) { 264 tctx->force_local = false; 265 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); 266 return; 267 } 268 269 /* see lowpri_timerfn() */ 270 if (__COMPAT_has_generic_reenq() && 271 p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) { 272 scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags); 273 return; 274 } 275 276 /* if select_cpu() wasn't called, try direct dispatch */ 277 if (!__COMPAT_is_enq_cpu_selected(enq_flags) && 278 (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { 279 __sync_fetch_and_add(&nr_ddsp_from_enq, 1); 280 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); 281 return; 282 } 283 284 /* 285 * If the task was re-enqueued due to the CPU being preempted by a 286 * higher priority scheduling class, just re-enqueue the task directly 287 * on the global DSQ. As we want another CPU to pick it up, find and 288 * kick an idle CPU. 289 */ 290 if (enq_flags & SCX_ENQ_REENQ) { 291 s32 cpu; 292 293 scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags); 294 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); 295 if (cpu >= 0) 296 scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); 297 return; 298 } 299 300 ring = bpf_map_lookup_elem(&queue_arr, &idx); 301 if (!ring) { 302 scx_bpf_error("failed to find ring %d", idx); 303 return; 304 } 305 306 /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ 307 if (bpf_map_push_elem(ring, &pid, 0)) { 308 scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags); 309 return; 310 } 311 312 if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) { 313 tctx->highpri = true; 314 __sync_fetch_and_add(&nr_highpri_queued, 1); 315 } 316 __sync_fetch_and_add(&nr_enqueued, 1); 317 } 318 319 /* 320 * The BPF queue map doesn't support removal and sched_ext can handle spurious 321 * dispatches. qmap_dequeue() is only used to collect statistics. 322 */ 323 void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) 324 { 325 __sync_fetch_and_add(&nr_dequeued, 1); 326 if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) 327 __sync_fetch_and_add(&nr_core_sched_execed, 1); 328 } 329 330 static void update_core_sched_head_seq(struct task_struct *p) 331 { 332 int idx = weight_to_idx(p->scx.weight); 333 struct task_ctx *tctx; 334 335 if ((tctx = lookup_task_ctx(p))) 336 core_sched_head_seqs[idx] = tctx->core_sched_seq; 337 } 338 339 /* 340 * To demonstrate the use of scx_bpf_dsq_move(), implement silly selective 341 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks, 342 * moving them to HIGHPRI_DSQ and then consuming them first. This makes minor 343 * difference only when dsp_batch is larger than 1. 344 * 345 * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and 346 * non-rq-lock holding BPF programs. As demonstration, this function is called 347 * from qmap_dispatch() and monitor_timerfn(). 348 */ 349 static bool dispatch_highpri(bool from_timer) 350 { 351 struct task_struct *p; 352 s32 this_cpu = bpf_get_smp_processor_id(); 353 354 /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */ 355 bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) { 356 static u64 highpri_seq; 357 struct task_ctx *tctx; 358 359 if (!(tctx = lookup_task_ctx(p))) 360 return false; 361 362 if (tctx->highpri) { 363 /* exercise the set_*() and vtime interface too */ 364 scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2); 365 scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++); 366 scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); 367 } 368 } 369 370 /* 371 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU 372 * is found. 373 */ 374 bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) { 375 bool dispatched = false; 376 s32 cpu; 377 378 if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr)) 379 cpu = this_cpu; 380 else 381 cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); 382 383 if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu, 384 SCX_ENQ_PREEMPT)) { 385 if (cpu == this_cpu) { 386 dispatched = true; 387 __sync_fetch_and_add(&nr_expedited_local, 1); 388 } else { 389 __sync_fetch_and_add(&nr_expedited_remote, 1); 390 } 391 if (from_timer) 392 __sync_fetch_and_add(&nr_expedited_from_timer, 1); 393 } else { 394 __sync_fetch_and_add(&nr_expedited_lost, 1); 395 } 396 397 if (dispatched) 398 return true; 399 } 400 401 return false; 402 } 403 404 void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) 405 { 406 struct task_struct *p; 407 struct cpu_ctx *cpuc; 408 struct task_ctx *tctx; 409 u32 zero = 0, batch = dsp_batch ?: 1; 410 void *fifo; 411 s32 i, pid; 412 413 if (dispatch_highpri(false)) 414 return; 415 416 if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0)) 417 return; 418 419 if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { 420 /* 421 * PID 2 should be kthreadd which should mostly be idle and off 422 * the scheduler. Let's keep dispatching it to force the kernel 423 * to call this function over and over again. 424 */ 425 p = bpf_task_from_pid(2); 426 if (p) { 427 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); 428 bpf_task_release(p); 429 return; 430 } 431 } 432 433 if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { 434 scx_bpf_error("failed to look up cpu_ctx"); 435 return; 436 } 437 438 for (i = 0; i < 5; i++) { 439 /* Advance the dispatch cursor and pick the fifo. */ 440 if (!cpuc->dsp_cnt) { 441 cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5; 442 cpuc->dsp_cnt = 1 << cpuc->dsp_idx; 443 } 444 445 fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx); 446 if (!fifo) { 447 scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx); 448 return; 449 } 450 451 /* Dispatch or advance. */ 452 bpf_repeat(BPF_MAX_LOOPS) { 453 struct task_ctx *tctx; 454 455 if (bpf_map_pop_elem(fifo, &pid)) 456 break; 457 458 p = bpf_task_from_pid(pid); 459 if (!p) 460 continue; 461 462 if (!(tctx = lookup_task_ctx(p))) { 463 bpf_task_release(p); 464 return; 465 } 466 467 if (tctx->highpri) 468 __sync_fetch_and_sub(&nr_highpri_queued, 1); 469 470 update_core_sched_head_seq(p); 471 __sync_fetch_and_add(&nr_dispatched, 1); 472 473 scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0); 474 475 /* 476 * scx_qmap uses a global BPF queue that any CPU's 477 * dispatch can pop from. If this CPU popped a task that 478 * can't run here, it gets stranded on SHARED_DSQ after 479 * consume_dispatch_q() skips it. Kick the task's home 480 * CPU so it drains SHARED_DSQ. 481 * 482 * There's a race between the pop and the flush of the 483 * buffered dsq_insert: 484 * 485 * CPU 0 (dispatching) CPU 1 (home, idle) 486 * ~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~ 487 * pop from BPF queue 488 * dsq_insert(buffered) 489 * balance: 490 * SHARED_DSQ empty 491 * BPF queue empty 492 * -> goes idle 493 * flush -> on SHARED 494 * kick CPU 1 495 * wakes, drains task 496 * 497 * The kick prevents indefinite stalls but a per-CPU 498 * kthread like ksoftirqd can be briefly stranded when 499 * its home CPU enters idle with softirq pending, 500 * triggering: 501 * 502 * "NOHZ tick-stop error: local softirq work is pending, handler #N!!!" 503 * 504 * from report_idle_softirq(). The kick lands shortly 505 * after and the home CPU drains the task. This could be 506 * avoided by e.g. dispatching pinned tasks to local or 507 * global DSQs, but the current code is left as-is to 508 * document this class of issue -- other schedulers 509 * seeing similar warnings can use this as a reference. 510 */ 511 if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) 512 scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0); 513 514 bpf_task_release(p); 515 516 batch--; 517 cpuc->dsp_cnt--; 518 if (!batch || !scx_bpf_dispatch_nr_slots()) { 519 if (dispatch_highpri(false)) 520 return; 521 scx_bpf_dsq_move_to_local(SHARED_DSQ, 0); 522 return; 523 } 524 if (!cpuc->dsp_cnt) 525 break; 526 } 527 528 cpuc->dsp_cnt = 0; 529 } 530 531 for (i = 0; i < MAX_SUB_SCHEDS; i++) { 532 if (sub_sched_cgroup_ids[i] && 533 scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i])) 534 return; 535 } 536 537 /* 538 * No other tasks. @prev will keep running. Update its core_sched_seq as 539 * if the task were enqueued and dispatched immediately. 540 */ 541 if (prev) { 542 tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0); 543 if (!tctx) { 544 scx_bpf_error("task_ctx lookup failed"); 545 return; 546 } 547 548 tctx->core_sched_seq = 549 core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++; 550 } 551 } 552 553 void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) 554 { 555 struct cpu_ctx *cpuc; 556 u32 zero = 0; 557 int idx; 558 559 if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { 560 scx_bpf_error("failed to look up cpu_ctx"); 561 return; 562 } 563 564 /* 565 * Use the running avg of weights to select the target cpuperf level. 566 * This is a demonstration of the cpuperf feature rather than a 567 * practical strategy to regulate CPU frequency. 568 */ 569 cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4; 570 idx = weight_to_idx(cpuc->avg_weight); 571 cpuc->cpuperf_target = qidx_to_cpuperf_target[idx]; 572 573 scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target); 574 } 575 576 /* 577 * The distance from the head of the queue scaled by the weight of the queue. 578 * The lower the number, the older the task and the higher the priority. 579 */ 580 static s64 task_qdist(struct task_struct *p) 581 { 582 int idx = weight_to_idx(p->scx.weight); 583 struct task_ctx *tctx; 584 s64 qdist; 585 586 tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); 587 if (!tctx) { 588 scx_bpf_error("task_ctx lookup failed"); 589 return 0; 590 } 591 592 qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; 593 594 /* 595 * As queue index increments, the priority doubles. The queue w/ index 3 596 * is dispatched twice more frequently than 2. Reflect the difference by 597 * scaling qdists accordingly. Note that the shift amount needs to be 598 * flipped depending on the sign to avoid flipping priority direction. 599 */ 600 if (qdist >= 0) 601 return qdist << (4 - idx); 602 else 603 return qdist << idx; 604 } 605 606 /* 607 * This is called to determine the task ordering when core-sched is picking 608 * tasks to execute on SMT siblings and should encode about the same ordering as 609 * the regular scheduling path. Use the priority-scaled distances from the head 610 * of the queues to compare the two tasks which should be consistent with the 611 * dispatch path behavior. 612 */ 613 bool BPF_STRUCT_OPS(qmap_core_sched_before, 614 struct task_struct *a, struct task_struct *b) 615 { 616 return task_qdist(a) > task_qdist(b); 617 } 618 619 /* 620 * sched_switch tracepoint and cpu_release handlers are no longer needed. 621 * With SCX_OPS_ALWAYS_ENQ_IMMED, wakeup_preempt_scx() reenqueues IMMED 622 * tasks when a higher-priority scheduling class takes the CPU. 623 */ 624 625 s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, 626 struct scx_init_task_args *args) 627 { 628 if (p->tgid == disallow_tgid) 629 p->scx.disallow = true; 630 631 /* 632 * @p is new. Let's ensure that its task_ctx is available. We can sleep 633 * in this function and the following will automatically use GFP_KERNEL. 634 */ 635 if (bpf_task_storage_get(&task_ctx_stor, p, 0, 636 BPF_LOCAL_STORAGE_GET_F_CREATE)) 637 return 0; 638 else 639 return -ENOMEM; 640 } 641 642 void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) 643 { 644 s32 i, pid; 645 646 if (suppress_dump) 647 return; 648 649 bpf_for(i, 0, 5) { 650 void *fifo; 651 652 if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i))) 653 return; 654 655 scx_bpf_dump("QMAP FIFO[%d]:", i); 656 657 /* 658 * Dump can be invoked anytime and there is no way to iterate in 659 * a non-destructive way. Pop and store in dump_store and then 660 * restore afterwards. If racing against new enqueues, ordering 661 * can get mixed up. 662 */ 663 bpf_repeat(4096) { 664 if (bpf_map_pop_elem(fifo, &pid)) 665 break; 666 bpf_map_push_elem(&dump_store, &pid, 0); 667 scx_bpf_dump(" %d", pid); 668 } 669 670 bpf_repeat(4096) { 671 if (bpf_map_pop_elem(&dump_store, &pid)) 672 break; 673 bpf_map_push_elem(fifo, &pid, 0); 674 } 675 676 scx_bpf_dump("\n"); 677 } 678 } 679 680 void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle) 681 { 682 u32 zero = 0; 683 struct cpu_ctx *cpuc; 684 685 if (suppress_dump || idle) 686 return; 687 if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) 688 return; 689 690 scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u", 691 cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight, 692 cpuc->cpuperf_target); 693 } 694 695 void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) 696 { 697 struct task_ctx *taskc; 698 699 if (suppress_dump) 700 return; 701 if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) 702 return; 703 704 scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu", 705 taskc->force_local, taskc->core_sched_seq); 706 } 707 708 s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args) 709 { 710 if (print_msgs) 711 bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", 712 cgrp->kn->id, args->weight, args->bw_period_us, 713 args->bw_quota_us, args->bw_burst_us); 714 return 0; 715 } 716 717 void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight) 718 { 719 if (print_msgs) 720 bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); 721 } 722 723 void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp, 724 u64 period_us, u64 quota_us, u64 burst_us) 725 { 726 if (print_msgs) 727 bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", 728 cgrp->kn->id, period_us, quota_us, burst_us); 729 } 730 731 /* 732 * Print out the online and possible CPU map using bpf_printk() as a 733 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). 734 */ 735 static void print_cpus(void) 736 { 737 const struct cpumask *possible, *online; 738 s32 cpu; 739 char buf[128] = "", *p; 740 int idx; 741 742 possible = scx_bpf_get_possible_cpumask(); 743 online = scx_bpf_get_online_cpumask(); 744 745 idx = 0; 746 bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { 747 if (!(p = MEMBER_VPTR(buf, [idx++]))) 748 break; 749 if (bpf_cpumask_test_cpu(cpu, online)) 750 *p++ = 'O'; 751 else if (bpf_cpumask_test_cpu(cpu, possible)) 752 *p++ = 'X'; 753 else 754 *p++ = ' '; 755 756 if ((cpu & 7) == 7) { 757 if (!(p = MEMBER_VPTR(buf, [idx++]))) 758 break; 759 *p++ = '|'; 760 } 761 } 762 buf[sizeof(buf) - 1] = '\0'; 763 764 scx_bpf_put_cpumask(online); 765 scx_bpf_put_cpumask(possible); 766 767 bpf_printk("CPUS: |%s", buf); 768 } 769 770 void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) 771 { 772 if (print_msgs) { 773 bpf_printk("CPU %d coming online", cpu); 774 /* @cpu is already online at this point */ 775 print_cpus(); 776 } 777 } 778 779 void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) 780 { 781 if (print_msgs) { 782 bpf_printk("CPU %d going offline", cpu); 783 /* @cpu is still online at this point */ 784 print_cpus(); 785 } 786 } 787 788 struct monitor_timer { 789 struct bpf_timer timer; 790 }; 791 792 struct { 793 __uint(type, BPF_MAP_TYPE_ARRAY); 794 __uint(max_entries, 1); 795 __type(key, u32); 796 __type(value, struct monitor_timer); 797 } monitor_timer SEC(".maps"); 798 799 /* 800 * Print out the min, avg and max performance levels of CPUs every second to 801 * demonstrate the cpuperf interface. 802 */ 803 static void monitor_cpuperf(void) 804 { 805 u32 zero = 0, nr_cpu_ids; 806 u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0; 807 u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0; 808 const struct cpumask *online; 809 int i, nr_online_cpus = 0; 810 811 nr_cpu_ids = scx_bpf_nr_cpu_ids(); 812 online = scx_bpf_get_online_cpumask(); 813 814 bpf_for(i, 0, nr_cpu_ids) { 815 struct cpu_ctx *cpuc; 816 u32 cap, cur; 817 818 if (!bpf_cpumask_test_cpu(i, online)) 819 continue; 820 nr_online_cpus++; 821 822 /* collect the capacity and current cpuperf */ 823 cap = scx_bpf_cpuperf_cap(i); 824 cur = scx_bpf_cpuperf_cur(i); 825 826 cur_min = cur < cur_min ? cur : cur_min; 827 cur_max = cur > cur_max ? cur : cur_max; 828 829 /* 830 * $cur is relative to $cap. Scale it down accordingly so that 831 * it's in the same scale as other CPUs and $cur_sum/$cap_sum 832 * makes sense. 833 */ 834 cur_sum += cur * cap / SCX_CPUPERF_ONE; 835 cap_sum += cap; 836 837 if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) { 838 scx_bpf_error("failed to look up cpu_ctx"); 839 goto out; 840 } 841 842 /* collect target */ 843 cur = cpuc->cpuperf_target; 844 target_sum += cur; 845 target_min = cur < target_min ? cur : target_min; 846 target_max = cur > target_max ? cur : target_max; 847 } 848 849 cpuperf_min = cur_min; 850 cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; 851 cpuperf_max = cur_max; 852 853 cpuperf_target_min = target_min; 854 cpuperf_target_avg = target_sum / nr_online_cpus; 855 cpuperf_target_max = target_max; 856 out: 857 scx_bpf_put_cpumask(online); 858 } 859 860 /* 861 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of 862 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to 863 * see meaningful dumps in the trace pipe. 864 */ 865 static void dump_shared_dsq(void) 866 { 867 struct task_struct *p; 868 s32 nr; 869 870 if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ))) 871 return; 872 873 bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr); 874 875 bpf_rcu_read_lock(); 876 bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV) 877 bpf_printk("%s[%d]", p->comm, p->pid); 878 bpf_rcu_read_unlock(); 879 } 880 881 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) 882 { 883 bpf_rcu_read_lock(); 884 dispatch_highpri(true); 885 bpf_rcu_read_unlock(); 886 887 monitor_cpuperf(); 888 889 if (print_dsqs_and_events) { 890 struct scx_event_stats events; 891 892 dump_shared_dsq(); 893 894 __COMPAT_scx_bpf_events(&events, sizeof(events)); 895 896 bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", 897 scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); 898 bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", 899 scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); 900 bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", 901 scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); 902 bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", 903 scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); 904 bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL", 905 scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL)); 906 bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", 907 scx_read_event(&events, SCX_EV_BYPASS_DURATION)); 908 bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", 909 scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); 910 bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", 911 scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); 912 } 913 914 bpf_timer_start(timer, ONE_SEC_IN_NS, 0); 915 return 0; 916 } 917 918 struct lowpri_timer { 919 struct bpf_timer timer; 920 }; 921 922 struct { 923 __uint(type, BPF_MAP_TYPE_ARRAY); 924 __uint(max_entries, 1); 925 __type(key, u32); 926 __type(value, struct lowpri_timer); 927 } lowpri_timer SEC(".maps"); 928 929 /* 930 * Nice 19 tasks are put into the lowpri DSQ. Every 10ms, reenq is triggered and 931 * the tasks are transferred to SHARED_DSQ. 932 */ 933 static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer) 934 { 935 scx_bpf_dsq_reenq(LOWPRI_DSQ, 0); 936 bpf_timer_start(timer, LOWPRI_INTV_NS, 0); 937 return 0; 938 } 939 940 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) 941 { 942 u32 key = 0; 943 struct bpf_timer *timer; 944 s32 ret; 945 946 if (print_msgs && !sub_cgroup_id) 947 print_cpus(); 948 949 ret = scx_bpf_create_dsq(SHARED_DSQ, -1); 950 if (ret) { 951 scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret); 952 return ret; 953 } 954 955 ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1); 956 if (ret) { 957 scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret); 958 return ret; 959 } 960 961 ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1); 962 if (ret) 963 return ret; 964 965 timer = bpf_map_lookup_elem(&monitor_timer, &key); 966 if (!timer) 967 return -ESRCH; 968 bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC); 969 bpf_timer_set_callback(timer, monitor_timerfn); 970 ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0); 971 if (ret) 972 return ret; 973 974 if (__COMPAT_has_generic_reenq()) { 975 /* see lowpri_timerfn() */ 976 timer = bpf_map_lookup_elem(&lowpri_timer, &key); 977 if (!timer) 978 return -ESRCH; 979 bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC); 980 bpf_timer_set_callback(timer, lowpri_timerfn); 981 ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0); 982 if (ret) 983 return ret; 984 } 985 986 return 0; 987 } 988 989 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) 990 { 991 UEI_RECORD(uei, ei); 992 } 993 994 s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args) 995 { 996 s32 i; 997 998 for (i = 0; i < MAX_SUB_SCHEDS; i++) { 999 if (!sub_sched_cgroup_ids[i]) { 1000 sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id; 1001 bpf_printk("attaching sub-sched[%d] on %s", 1002 i, args->cgroup_path); 1003 return 0; 1004 } 1005 } 1006 1007 return -ENOSPC; 1008 } 1009 1010 void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args) 1011 { 1012 s32 i; 1013 1014 for (i = 0; i < MAX_SUB_SCHEDS; i++) { 1015 if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) { 1016 sub_sched_cgroup_ids[i] = 0; 1017 bpf_printk("detaching sub-sched[%d] on %s", 1018 i, args->cgroup_path); 1019 break; 1020 } 1021 } 1022 } 1023 1024 SCX_OPS_DEFINE(qmap_ops, 1025 .select_cpu = (void *)qmap_select_cpu, 1026 .enqueue = (void *)qmap_enqueue, 1027 .dequeue = (void *)qmap_dequeue, 1028 .dispatch = (void *)qmap_dispatch, 1029 .tick = (void *)qmap_tick, 1030 .core_sched_before = (void *)qmap_core_sched_before, 1031 .init_task = (void *)qmap_init_task, 1032 .dump = (void *)qmap_dump, 1033 .dump_cpu = (void *)qmap_dump_cpu, 1034 .dump_task = (void *)qmap_dump_task, 1035 .cgroup_init = (void *)qmap_cgroup_init, 1036 .cgroup_set_weight = (void *)qmap_cgroup_set_weight, 1037 .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth, 1038 .sub_attach = (void *)qmap_sub_attach, 1039 .sub_detach = (void *)qmap_sub_detach, 1040 .cpu_online = (void *)qmap_cpu_online, 1041 .cpu_offline = (void *)qmap_cpu_offline, 1042 .init = (void *)qmap_init, 1043 .exit = (void *)qmap_exit, 1044 .timeout_ms = 5000U, 1045 .name = "qmap"); 1046