/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS	= 1000000000,
	SHARED_DSQ	= 0,
	HIGHPRI_DSQ	= 1,
	HIGHPRI_WEIGHT	= 8668,		/* this is what -20 maps to */
};

char _license[] SEC("license") = "GPL";

const volatile u64 slice_ns;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_dsqs_and_events;
const volatile bool print_msgs;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u64 nr_highpri_queued;
u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps"),
  dump_store SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/*
 * If enabled, the CPU performance target is set according to the queue index
 * using the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};
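
/*
 * For reference: SCX_CPUPERF_ONE is SCHED_CAPACITY_SCALE (1024 on current
 * kernels), so the table above asks for roughly 0%, 25%, 50%, 75% and 100% of
 * the CPU's performance range for queues 0 through 4 respectively.
 */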

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and used
 * when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
	bool	highpri;
	u64	core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
	u32	avg_weight;
	u32	cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu;

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
	struct task_ctx *tctx;

	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
		scx_bpf_error("task_ctx lookup failed");
		return NULL;
	}
	return tctx;
}

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
		return -ESRCH;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
		tctx->force_local = true;
		return cpu;
	} else {
		return prev_cpu;
	}
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}
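
/*
 * With the default sched_ext weight scale where a nice-0 task has a weight of
 * 100, nice-0 tasks land in queue 2, positively niced tasks fall into queues 0
 * and 1, and a nice -20 task (weight 8668, see HIGHPRI_WEIGHT) goes to queue 4.
 */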

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/* If qmap_select_cpu() told us to enqueue locally, do so. */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the global DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}
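
/*
 * Note that nr_enqueued only counts tasks that were actually pushed into one of
 * the FIFOs. Tasks that are enqueued locally, direct-dispatched
 * (nr_ddsp_from_enq), re-enqueued after the CPU was taken by a higher priority
 * class, or punted to SHARED_DSQ on FIFO overflow return early and are not
 * counted here.
 */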

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

static void update_core_sched_head_seq(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes a minor
 * difference only when dsp_batch is larger than 1.
 *
 * The scx_bpf_dsq_move[_vtime]() kfuncs are allowed both from ops.dispatch()
 * and non-rq-lock holding BPF programs. As a demonstration, this function is
 * called from qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
			__COMPAT_scx_bpf_dsq_move_set_slice(
				BPF_FOR_EACH_ITER, slice_ns * 2);
			__COMPAT_scx_bpf_dsq_move_set_vtime(
				BPF_FOR_EACH_ITER, highpri_seq++);
			__COMPAT_scx_bpf_dsq_move_vtime(
				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
					      SCX_DSQ_LOCAL_ON | cpu,
					      SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}
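
/*
 * The moves above can fail, e.g. when the task is dispatched or dequeued by
 * someone else between the iteration and the move; such losses are counted in
 * nr_expedited_lost. SCX_ENQ_PREEMPT makes the moved task preempt whatever is
 * currently running on the CPU it is moved to.
 */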

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (!tctx) {
			scx_bpf_error("task_ctx lookup failed");
			return;
		}

		tctx->core_sched_seq =
			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}
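
/*
 * cpuc->dsp_cnt persists across ops.dispatch() invocations, so a queue's
 * 1/2/4/8/16 quota may be consumed over multiple dispatch calls when dsp_batch
 * or the dispatch slots run out first. A full rotation through all five FIFOs
 * dispatches up to 31 tasks.
 */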

void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As the queue index increments, the priority doubles. The queue with
	 * index 3 is dispatched twice as frequently as the one with index 2.
	 * Reflect the difference by scaling qdists accordingly. Note that the
	 * shift amount needs to be flipped depending on the sign to avoid
	 * flipping the priority direction.
	 */
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}
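
/*
 * For example, a task ten entries from the head of queue 0 yields a qdist of
 * 10 << 4 = 160 while the same distance in queue 4 stays at 10, mirroring the
 * 16:1 dispatch ratio between the two queues.
 */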

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode about the same ordering as
 * the regular scheduling path. Use the priority-scaled distances from the head
 * of the queues to compare the two tasks which should be consistent with the
 * dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	u32 cnt;

	/*
	 * Called when @cpu is taken by a higher priority scheduling class. This
	 * makes @cpu no longer available for executing sched_ext tasks. As we
	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
	 * becomes available again, re-enqueue them into the global dsq. See
	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
	 */
	cnt = scx_bpf_reenqueue_local();
	if (cnt)
		__sync_fetch_and_add(&nr_reenqueued, cnt);
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);

		/*
		 * Dump can be invoked anytime and there is no way to iterate in
		 * a non-destructive way. Pop and store in dump_store and then
		 * restore afterwards. If racing against new enqueues, ordering
		 * can get mixed up.
		 */
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			bpf_map_push_elem(&dump_store, &pid, 0);
			scx_bpf_dump(" %d", pid);
		}

		bpf_repeat(4096) {
			if (bpf_map_pop_elem(&dump_store, &pid))
				break;
			bpf_map_push_elem(fifo, &pid, 0);
		}

		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
{
	if (print_msgs)
		bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, args->weight, args->bw_period_us,
			   args->bw_quota_us, args->bw_burst_us);
	return 0;
}

void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}

void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
		    u64 period_us, u64 quota_us, u64 burst_us)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, period_us, quota_us, burst_us);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}
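
/*
 * Example output on a 16-CPU system with all CPUs online:
 *
 *   CPUS: |OOOOOOOO|OOOOOOOO|
 *
 * 'O' marks an online CPU, 'X' a possible but currently offline CPU, and a '|'
 * separator is emitted after every eighth CPU.
 */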

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d coming online", cpu);
		/* @cpu is already online at this point */
		print_cpus();
	}
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d going offline", cpu);
		/* @cpu is still online at this point */
		print_cpus();
	}
}

struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and the DSQ iterator. Raise the dispatch batch count
 * to see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
		bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
}

static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_dsqs_and_events) {
		struct scx_event_stats events;

		dump_shared_dsq();

		__COMPAT_scx_bpf_events(&events, sizeof(events));

		bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
			   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
			   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
			   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
		bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
			   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
		bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
			   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
			   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
			   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
			   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
	}

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	if (print_msgs)
		print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		return ret;

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu = (void *)qmap_select_cpu,
	       .enqueue = (void *)qmap_enqueue,
	       .dequeue = (void *)qmap_dequeue,
	       .dispatch = (void *)qmap_dispatch,
	       .tick = (void *)qmap_tick,
	       .core_sched_before = (void *)qmap_core_sched_before,
	       .cpu_release = (void *)qmap_cpu_release,
	       .init_task = (void *)qmap_init_task,
	       .dump = (void *)qmap_dump,
	       .dump_cpu = (void *)qmap_dump_cpu,
	       .dump_task = (void *)qmap_dump_task,
	       .cgroup_init = (void *)qmap_cgroup_init,
	       .cgroup_set_weight = (void *)qmap_cgroup_set_weight,
	       .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth,
	       .cpu_online = (void *)qmap_cpu_online,
	       .cpu_offline = (void *)qmap_cpu_offline,
	       .init = (void *)qmap_init,
	       .exit = (void *)qmap_exit,
	       .timeout_ms = 5000U,
	       .name = "qmap");
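
/*
 * qmap_ops is loaded and attached by the accompanying userspace loader
 * (scx_qmap.c), which also fills in the const volatile parameters above via the
 * BPF skeleton before load and periodically reports the statistics counters.
 */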