/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
        ONE_SEC_IN_NS = 1000000000,
        SHARED_DSQ = 0,
        HIGHPRI_DSQ = 1,
        HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */
};

char _license[] SEC("license") = "GPL";

const volatile u64 slice_ns;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_dsqs_and_events;
const volatile bool print_msgs;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u64 nr_highpri_queued;
u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
        __uint(type, BPF_MAP_TYPE_QUEUE);
        __uint(max_entries, 4096);
        __type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps"),
  dump_store SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
        __uint(max_entries, 5);
        __type(key, int);
        __array(values, struct qmap);
} queue_arr SEC(".maps") = {
        .values = {
                [0] = &queue0,
                [1] = &queue1,
                [2] = &queue2,
                [3] = &queue3,
                [4] = &queue4,
        },
};

/*
 * If enabled, the CPU performance target is set for each queue index according
 * to the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
        [0] = SCX_CPUPERF_ONE * 0 / 4,
        [1] = SCX_CPUPERF_ONE * 1 / 4,
        [2] = SCX_CPUPERF_ONE * 2 / 4,
        [3] = SCX_CPUPERF_ONE * 3 / 4,
        [4] = SCX_CPUPERF_ONE * 4 / 4,
};

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and is
 * used when comparing two tasks for ordering. See qmap_core_sched_before().
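 *
 * Worked example with made-up numbers: if a queue's head seq is 100 and a
 * task queued on it was assigned seq 103, the task's queue distance is 3.
 * task_qdist() below scales that distance by the queue's dispatch frequency
 * before qmap_core_sched_before() compares two tasks.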
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
        bool force_local;       /* Dispatch directly to local_dsq */
        bool highpri;
        u64 core_sched_seq;
};

struct {
        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
        u64 dsp_idx;            /* dispatch index */
        u64 dsp_cnt;            /* remaining count */
        u32 avg_weight;
        u32 cpuperf_target;
};

struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(max_entries, 1);
        __type(key, u32);
        __type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
        s32 cpu;

        if (p->nr_cpus_allowed == 1 ||
            scx_bpf_test_and_clear_cpu_idle(prev_cpu))
                return prev_cpu;

        cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
        if (cpu >= 0)
                return cpu;

        return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
        struct task_ctx *tctx;

        if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
                scx_bpf_error("task_ctx lookup failed");
                return NULL;
        }
        return tctx;
}

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
                   s32 prev_cpu, u64 wake_flags)
{
        struct task_ctx *tctx;
        s32 cpu;

        if (!(tctx = lookup_task_ctx(p)))
                return -ESRCH;

        cpu = pick_direct_dispatch_cpu(p, prev_cpu);

        if (cpu >= 0) {
                tctx->force_local = true;
                return cpu;
        } else {
                return prev_cpu;
        }
}

static int weight_to_idx(u32 weight)
{
        /* Coarsely map the compound weight to a FIFO. */
        if (weight <= 25)
                return 0;
        else if (weight <= 50)
                return 1;
        else if (weight < 200)
                return 2;
        else if (weight < 400)
                return 3;
        else
                return 4;
}

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
        static u32 user_cnt, kernel_cnt;
        struct task_ctx *tctx;
        u32 pid = p->pid;
        int idx = weight_to_idx(p->scx.weight);
        void *ring;
        s32 cpu;

        if (enq_flags & SCX_ENQ_REENQ)
                __sync_fetch_and_add(&nr_reenqueued, 1);

        if (p->flags & PF_KTHREAD) {
                if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
                        return;
        } else {
                if (stall_user_nth && !(++user_cnt % stall_user_nth))
                        return;
        }

        if (test_error_cnt && !--test_error_cnt)
                scx_bpf_error("test triggering error");

        if (!(tctx = lookup_task_ctx(p)))
                return;

        /*
         * All enqueued tasks must have their core_sched_seq updated for correct
         * core-sched ordering. Also, take a look at the end of qmap_dispatch().
         */
        tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

        /*
         * If qmap_select_cpu() is telling us to or this is the last runnable
         * task on the CPU, enqueue locally.
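         *
         * SCX_DSQ_LOCAL is the built-in per-CPU DSQ that the local CPU consumes
         * directly, so a task inserted there bypasses the BPF FIFOs entirely.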
         */
        if (tctx->force_local) {
                tctx->force_local = false;
                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
                return;
        }

        /* if select_cpu() wasn't called, try direct dispatch */
        if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
            (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
                __sync_fetch_and_add(&nr_ddsp_from_enq, 1);
                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
                return;
        }

        /*
         * If the task was re-enqueued due to the CPU being preempted by a
         * higher priority scheduling class, just re-enqueue the task directly
         * on the global DSQ. As we want another CPU to pick it up, find and
         * kick an idle CPU.
         */
        if (enq_flags & SCX_ENQ_REENQ) {
                s32 cpu;

                scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
                cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
                if (cpu >= 0)
                        scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
                return;
        }

        ring = bpf_map_lookup_elem(&queue_arr, &idx);
        if (!ring) {
                scx_bpf_error("failed to find ring %d", idx);
                return;
        }

        /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
        if (bpf_map_push_elem(ring, &pid, 0)) {
                scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
                return;
        }

        if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
                tctx->highpri = true;
                __sync_fetch_and_add(&nr_highpri_queued, 1);
        }
        __sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
        __sync_fetch_and_add(&nr_dequeued, 1);
        if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
                __sync_fetch_and_add(&nr_core_sched_execed, 1);
}

static void update_core_sched_head_seq(struct task_struct *p)
{
        int idx = weight_to_idx(p->scx.weight);
        struct task_ctx *tctx;

        if ((tctx = lookup_task_ctx(p)))
                core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes a minor
 * difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dsq_move[_vtime]() can be called both from ops.dispatch() and from
 * non-rq-lock holding BPF programs. As a demonstration, this function is called
 * from qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
        struct task_struct *p;
        s32 this_cpu = bpf_get_smp_processor_id();

        /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
        bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
                static u64 highpri_seq;
                struct task_ctx *tctx;

                if (!(tctx = lookup_task_ctx(p)))
                        return false;

                if (tctx->highpri) {
                        /* exercise the set_*() and vtime interface too */
                        scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
                        scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
                        scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
                }
        }

        /*
         * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
         * is found.
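         *
         * SCX_DSQ_LOCAL_ON | cpu targets the local DSQ of that specific CPU
         * and SCX_ENQ_PREEMPT lets the moved task preempt whatever is running
         * there, so an expedited task starts executing as soon as possible.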
         */
        bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
                bool dispatched = false;
                s32 cpu;

                if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
                        cpu = this_cpu;
                else
                        cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

                if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
                                     SCX_ENQ_PREEMPT)) {
                        if (cpu == this_cpu) {
                                dispatched = true;
                                __sync_fetch_and_add(&nr_expedited_local, 1);
                        } else {
                                __sync_fetch_and_add(&nr_expedited_remote, 1);
                        }
                        if (from_timer)
                                __sync_fetch_and_add(&nr_expedited_from_timer, 1);
                } else {
                        __sync_fetch_and_add(&nr_expedited_lost, 1);
                }

                if (dispatched)
                        return true;
        }

        return false;
}

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
        struct task_struct *p;
        struct cpu_ctx *cpuc;
        struct task_ctx *tctx;
        u32 zero = 0, batch = dsp_batch ?: 1;
        void *fifo;
        s32 i, pid;

        if (dispatch_highpri(false))
                return;

        if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
                return;

        if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
                /*
                 * PID 2 should be kthreadd which should mostly be idle and off
                 * the scheduler. Let's keep dispatching it to force the kernel
                 * to call this function over and over again.
                 */
                p = bpf_task_from_pid(2);
                if (p) {
                        scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
                        bpf_task_release(p);
                        return;
                }
        }

        if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
                scx_bpf_error("failed to look up cpu_ctx");
                return;
        }

        for (i = 0; i < 5; i++) {
                /* Advance the dispatch cursor and pick the fifo. */
                if (!cpuc->dsp_cnt) {
                        cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
                        cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
                }

                fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
                if (!fifo) {
                        scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
                        return;
                }

                /* Dispatch or advance. */
                bpf_repeat(BPF_MAX_LOOPS) {
                        struct task_ctx *tctx;

                        if (bpf_map_pop_elem(fifo, &pid))
                                break;

                        p = bpf_task_from_pid(pid);
                        if (!p)
                                continue;

                        if (!(tctx = lookup_task_ctx(p))) {
                                bpf_task_release(p);
                                return;
                        }

                        if (tctx->highpri)
                                __sync_fetch_and_sub(&nr_highpri_queued, 1);

                        update_core_sched_head_seq(p);
                        __sync_fetch_and_add(&nr_dispatched, 1);

                        scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
                        bpf_task_release(p);

                        batch--;
                        cpuc->dsp_cnt--;
                        if (!batch || !scx_bpf_dispatch_nr_slots()) {
                                if (dispatch_highpri(false))
                                        return;
                                scx_bpf_dsq_move_to_local(SHARED_DSQ);
                                return;
                        }
                        if (!cpuc->dsp_cnt)
                                break;
                }

                cpuc->dsp_cnt = 0;
        }

        /*
         * No other tasks. @prev will keep running. Update its core_sched_seq as
         * if the task were enqueued and dispatched immediately.
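         *
         * Otherwise @prev would keep its old seq and could look artificially
         * high priority to qmap_core_sched_before().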
         */
        if (prev) {
                tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
                if (!tctx) {
                        scx_bpf_error("task_ctx lookup failed");
                        return;
                }

                tctx->core_sched_seq =
                        core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
        }
}

void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
        struct cpu_ctx *cpuc;
        u32 zero = 0;
        int idx;

        if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
                scx_bpf_error("failed to look up cpu_ctx");
                return;
        }

        /*
         * Use the running avg of weights to select the target cpuperf level.
         * This is a demonstration of the cpuperf feature rather than a
         * practical strategy to regulate CPU frequency.
         */
        cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
        idx = weight_to_idx(cpuc->avg_weight);
        cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

        scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
        int idx = weight_to_idx(p->scx.weight);
        struct task_ctx *tctx;
        s64 qdist;

        tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
        if (!tctx) {
                scx_bpf_error("task_ctx lookup failed");
                return 0;
        }

        qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

        /*
         * As the queue index increments, the priority doubles. The queue with
         * index 3 is dispatched twice as frequently as the one with index 2.
         * Reflect the difference by scaling qdists accordingly. Note that the
         * shift amount needs to be flipped depending on the sign to avoid
         * flipping the priority direction.
         */
        if (qdist >= 0)
                return qdist << (4 - idx);
        else
                return qdist << idx;
}

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode about the same ordering as
 * the regular scheduling path. Use the priority-scaled distances from the head
 * of the queues to compare the two tasks, which should be consistent with the
 * dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
                    struct task_struct *a, struct task_struct *b)
{
        return task_qdist(a) > task_qdist(b);
}

SEC("tp_btf/sched_switch")
int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
             struct task_struct *next, unsigned long prev_state)
{
        if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
                return 0;

        /*
         * If the CPU is taken by a higher priority scheduling class, it is no
         * longer available for executing sched_ext tasks. As we don't want the
         * tasks in the CPU's local dsq to sit there until the CPU becomes
         * available again, re-enqueue them into the global dsq. See
         * %SCX_ENQ_REENQ handling in qmap_enqueue().
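         *
         * The numeric case labels below correspond to the UAPI scheduling
         * policy constants (SCHED_FIFO == 1, SCHED_RR == 2, SCHED_DEADLINE == 6),
         * written as literals since those definitions aren't included here.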
         */
        switch (next->policy) {
        case 1: /* SCHED_FIFO */
        case 2: /* SCHED_RR */
        case 6: /* SCHED_DEADLINE */
                scx_bpf_reenqueue_local();
        }

        return 0;
}

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
        /* see qmap_sched_switch() to learn how to do this on newer kernels */
        if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
                scx_bpf_reenqueue_local();
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
                   struct scx_init_task_args *args)
{
        if (p->tgid == disallow_tgid)
                p->scx.disallow = true;

        /*
         * @p is new. Let's ensure that its task_ctx is available. We can sleep
         * in this function and the following will automatically use GFP_KERNEL.
         */
        if (bpf_task_storage_get(&task_ctx_stor, p, 0,
                                 BPF_LOCAL_STORAGE_GET_F_CREATE))
                return 0;
        else
                return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
        s32 i, pid;

        if (suppress_dump)
                return;

        bpf_for(i, 0, 5) {
                void *fifo;

                if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
                        return;

                scx_bpf_dump("QMAP FIFO[%d]:", i);

                /*
                 * Dump can be invoked anytime and there is no way to iterate in
                 * a non-destructive way. Pop and store in dump_store and then
                 * restore afterwards. If racing against new enqueues, ordering
                 * can get mixed up.
                 */
                bpf_repeat(4096) {
                        if (bpf_map_pop_elem(fifo, &pid))
                                break;
                        bpf_map_push_elem(&dump_store, &pid, 0);
                        scx_bpf_dump(" %d", pid);
                }

                bpf_repeat(4096) {
                        if (bpf_map_pop_elem(&dump_store, &pid))
                                break;
                        bpf_map_push_elem(fifo, &pid, 0);
                }

                scx_bpf_dump("\n");
        }
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
        u32 zero = 0;
        struct cpu_ctx *cpuc;

        if (suppress_dump || idle)
                return;
        if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
                return;

        scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
                     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
                     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
        struct task_ctx *taskc;

        if (suppress_dump)
                return;
        if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
                return;

        scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
                     taskc->force_local, taskc->core_sched_seq);
}

s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
{
        if (print_msgs)
                bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
                           cgrp->kn->id, args->weight, args->bw_period_us,
                           args->bw_quota_us, args->bw_burst_us);
        return 0;
}

void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
        if (print_msgs)
                bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}

void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
                    u64 period_us, u64 quota_us, u64 burst_us)
{
        if (print_msgs)
                bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
                           cgrp->kn->id, period_us, quota_us, burst_us);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
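 *
 * Output format: 'O' marks an online CPU, 'X' a possible but currently offline
 * CPU, and a blank an impossible CPU ID, with a '|' separator after every
 * eighth CPU.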
 */
static void print_cpus(void)
{
        const struct cpumask *possible, *online;
        s32 cpu;
        char buf[128] = "", *p;
        int idx;

        possible = scx_bpf_get_possible_cpumask();
        online = scx_bpf_get_online_cpumask();

        idx = 0;
        bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
                if (!(p = MEMBER_VPTR(buf, [idx++])))
                        break;
                if (bpf_cpumask_test_cpu(cpu, online))
                        *p++ = 'O';
                else if (bpf_cpumask_test_cpu(cpu, possible))
                        *p++ = 'X';
                else
                        *p++ = ' ';

                if ((cpu & 7) == 7) {
                        if (!(p = MEMBER_VPTR(buf, [idx++])))
                                break;
                        *p++ = '|';
                }
        }
        buf[sizeof(buf) - 1] = '\0';

        scx_bpf_put_cpumask(online);
        scx_bpf_put_cpumask(possible);

        bpf_printk("CPUS: |%s", buf);
}

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
        if (print_msgs) {
                bpf_printk("CPU %d coming online", cpu);
                /* @cpu is already online at this point */
                print_cpus();
        }
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
        if (print_msgs) {
                bpf_printk("CPU %d going offline", cpu);
                /* @cpu is still online at this point */
                print_cpus();
        }
}

struct monitor_timer {
        struct bpf_timer timer;
};

struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1);
        __type(key, u32);
        __type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 */
static void monitor_cpuperf(void)
{
        u32 zero = 0, nr_cpu_ids;
        u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
        u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
        const struct cpumask *online;
        int i, nr_online_cpus = 0;

        nr_cpu_ids = scx_bpf_nr_cpu_ids();
        online = scx_bpf_get_online_cpumask();

        bpf_for(i, 0, nr_cpu_ids) {
                struct cpu_ctx *cpuc;
                u32 cap, cur;

                if (!bpf_cpumask_test_cpu(i, online))
                        continue;
                nr_online_cpus++;

                /* collect the capacity and current cpuperf */
                cap = scx_bpf_cpuperf_cap(i);
                cur = scx_bpf_cpuperf_cur(i);

                cur_min = cur < cur_min ? cur : cur_min;
                cur_max = cur > cur_max ? cur : cur_max;

                /*
                 * $cur is relative to $cap. Scale it down accordingly so that
                 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
                 * makes sense.
                 */
                cur_sum += cur * cap / SCX_CPUPERF_ONE;
                cap_sum += cap;

                if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
                        scx_bpf_error("failed to look up cpu_ctx");
                        goto out;
                }

                /* collect target */
                cur = cpuc->cpuperf_target;
                target_sum += cur;
                target_min = cur < target_min ? cur : target_min;
                target_max = cur > target_max ? cur : target_max;
        }

        cpuperf_min = cur_min;
        cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
        cpuperf_max = cur_max;

        cpuperf_target_min = target_min;
        cpuperf_target_avg = target_sum / nr_online_cpus;
        cpuperf_target_max = target_max;
out:
        scx_bpf_put_cpumask(online);
}

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
 * see meaningful dumps in the trace pipe.
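 *
 * bpf_printk() output goes to the kernel trace buffer and can typically be
 * followed with something like `cat /sys/kernel/tracing/trace_pipe` (the exact
 * path depends on where tracefs is mounted).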
 */
static void dump_shared_dsq(void)
{
        struct task_struct *p;
        s32 nr;

        if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
                return;

        bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

        bpf_rcu_read_lock();
        bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
                bpf_printk("%s[%d]", p->comm, p->pid);
        bpf_rcu_read_unlock();
}

static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
        bpf_rcu_read_lock();
        dispatch_highpri(true);
        bpf_rcu_read_unlock();

        monitor_cpuperf();

        if (print_dsqs_and_events) {
                struct scx_event_stats events;

                dump_shared_dsq();

                __COMPAT_scx_bpf_events(&events, sizeof(events));

                bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
                           scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
                bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
                           scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
                bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
                           scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
                bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
                           scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
                bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
                           scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
                bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
                           scx_read_event(&events, SCX_EV_BYPASS_DURATION));
                bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
                           scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
                bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
                           scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
        }

        bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
        return 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
        u32 key = 0;
        struct bpf_timer *timer;
        s32 ret;

        if (print_msgs)
                print_cpus();

        ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
        if (ret)
                return ret;

        ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
        if (ret)
                return ret;

        timer = bpf_map_lookup_elem(&monitor_timer, &key);
        if (!timer)
                return -ESRCH;

        bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
        bpf_timer_set_callback(timer, monitor_timerfn);

        return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
        UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
               .select_cpu = (void *)qmap_select_cpu,
               .enqueue = (void *)qmap_enqueue,
               .dequeue = (void *)qmap_dequeue,
               .dispatch = (void *)qmap_dispatch,
               .tick = (void *)qmap_tick,
               .core_sched_before = (void *)qmap_core_sched_before,
               .cpu_release = (void *)qmap_cpu_release,
               .init_task = (void *)qmap_init_task,
               .dump = (void *)qmap_dump,
               .dump_cpu = (void *)qmap_dump_cpu,
               .dump_task = (void *)qmap_dump_task,
               .cgroup_init = (void *)qmap_cgroup_init,
               .cgroup_set_weight = (void *)qmap_cgroup_set_weight,
               .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth,
               .cpu_online = (void *)qmap_cpu_online,
               .cpu_offline = (void *)qmap_cpu_offline,
               .init = (void *)qmap_init,
               .exit = (void *)qmap_exit,
               .timeout_ms = 5000U,
               .name = "qmap");
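
/*
 * Userspace side (illustrative sketch, not part of this program): the
 * struct_ops map defined by SCX_OPS_DEFINE() above is loaded and attached by a
 * separate loader built on a libbpf skeleton. The helper names below
 * (scx_qmap__open(), skel->rodata, etc.) are skeleton-generated identifiers
 * and depend on how the object is built; only bpf_map__attach_struct_ops() and
 * bpf_link__destroy() are plain libbpf APIs.
 *
 *	struct scx_qmap *skel = scx_qmap__open();
 *	struct bpf_link *link;
 *
 *	skel->rodata->slice_ns = 20 * 1000 * 1000;	// example knob
 *	if (scx_qmap__load(skel))
 *		return 1;
 *	link = bpf_map__attach_struct_ops(skel->maps.qmap_ops);
 *	// ... scheduler is now active; run until interrupted ...
 *	bpf_link__destroy(link);
 *	scx_qmap__destroy(skel);
 */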