/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS		= 1000000000,
	SHARED_DSQ		= 0,
	HIGHPRI_DSQ		= 1,
	HIGHPRI_WEIGHT		= 8668,	/* this is what -20 maps to */
};

char _license[] SEC("license") = "GPL";

const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_shared_dsq;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u64 nr_highpri_queued;
u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/*
 * If enabled, the CPU performance target is set based on the queue index
 * according to the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and used
 * when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];
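
/*
 * Worked example of the queue distance described above: if queue1's head seq
 * is 100 and a task queued on queue1 was assigned seq 103, its queue distance
 * is 3, i.e. three older tasks from the same queue are ahead of it.
 */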

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
	bool	highpri;
	u64	core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
	u32	avg_weight;
	u32	cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu;

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
	struct task_ctx *tctx;

	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
		scx_bpf_error("task_ctx lookup failed");
		return NULL;
	}
	return tctx;
}

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
		return -ESRCH;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
		tctx->force_local = true;
		return cpu;
	} else {
		return prev_cpu;
	}
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}
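
/*
 * Illustrative mapping: the default compound weight of 100 (nice 0) lands in
 * queue2, while anything at or above HIGHPRI_WEIGHT (8668, nice -20) lands in
 * queue4 and, with highpri_boosting enabled, is additionally flagged for
 * expedited dispatch in qmap_enqueue().
 */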

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * If qmap_select_cpu() is telling us to or this is the last runnable
	 * task on the CPU, enqueue locally.
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */
	if (!(enq_flags & SCX_ENQ_WAKEUP) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the shared DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to the shared DSQ. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

static void update_core_sched_head_seq(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement a silly
 * selective priority boosting mechanism by scanning SHARED_DSQ looking for
 * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This
 * makes a minor difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dispatch[_vtime]_from_dsq() can be called both from ops.dispatch()
 * and from BPF programs that don't hold an rq lock. As a demonstration, this
 * function is called from both qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
			scx_bpf_dispatch_from_dsq_set_slice(
				BPF_FOR_EACH_ITER, slice_ns * 2);
			scx_bpf_dispatch_from_dsq_set_vtime(
				BPF_FOR_EACH_ITER, highpri_seq++);
			scx_bpf_dispatch_vtime_from_dsq(
				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
					      SCX_DSQ_LOCAL_ON | cpu,
					      SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}
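
/*
 * Note on the round robin below: each time the dispatch cursor advances to
 * queue N, that queue is given a budget of 1 << N slots - 1 for queue0, 2 for
 * queue1, 4 for queue2, 8 for queue3 and 16 for queue4 - which is how the
 * higher FIFOs end up dispatching more tasks per rotation.
 */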

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

	if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_consume(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (!tctx) {
			scx_bpf_error("task_ctx lookup failed");
			return;
		}

		tctx->core_sched_seq =
			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}
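
/*
 * Example of the avg_weight -> cpuperf mapping used in qmap_tick() below: a
 * CPU that only runs default-weight (100) tasks settles around avg_weight 100,
 * which weight_to_idx() maps to queue index 2 and thus a performance target of
 * SCX_CPUPERF_ONE / 2.
 */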

void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As the queue index increments, the priority doubles. The queue w/
	 * index 3 is dispatched twice as often as the queue w/ index 2. Reflect
	 * the difference by scaling qdists accordingly. Note that the shift
	 * amount needs to be flipped depending on the sign to avoid flipping
	 * the priority direction.
	 */
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode about the same ordering as
 * the regular scheduling path. Use the priority-scaled distances from the head
 * of the queues to compare the two tasks which should be consistent with the
 * dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}
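
/*
 * Worked comparison for the above: a task two entries deep in queue0 has a
 * scaled qdist of 2 << 4 = 32 while a task eight entries deep in queue3 has
 * 8 << 1 = 16. The queue3 task ends up with the lower (older-looking) qdist
 * and thus the higher priority, mirroring the 8x larger dispatch budget that
 * queue3 gets over queue0 in qmap_dispatch().
 */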

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	u32 cnt;

	/*
	 * Called when @cpu is taken by a higher priority scheduling class. This
	 * makes @cpu no longer available for executing sched_ext tasks. As we
	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
	 * becomes available again, re-enqueue them into the shared DSQ. See
	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
	 */
	cnt = scx_bpf_reenqueue_local();
	if (cnt)
		__sync_fetch_and_add(&nr_reenqueued, cnt);
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			scx_bpf_dump(" %d", pid);
		}
		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}
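
/*
 * Illustrative print_cpus() output: on an 8-CPU machine with CPU 5 offline the
 * trace pipe would show "CPUS: |OOOOOXOO|" - 'O' for online, 'X' for
 * possible-but-offline and a '|' separator after every eighth CPU.
 */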

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	bpf_printk("CPU %d coming online", cpu);
	/* @cpu is already online at this point */
	print_cpus();
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	bpf_printk("CPU %d going offline", cpu);
	/* @cpu is still online at this point */
	print_cpus();
}

struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}
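
/*
 * Scaling example for the aggregation above: a CPU whose capacity is half of
 * SCX_CPUPERF_ONE and which is running at its maximum cur contributes only
 * half of SCX_CPUPERF_ONE to cur_sum, so cpuperf_avg ends up a
 * capacity-weighted average rather than a plain mean of the raw cur values.
 */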

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
 * see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
		bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
}

static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_shared_dsq)
		dump_shared_dsq();

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		return ret;

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .tick			= (void *)qmap_tick,
	       .core_sched_before	= (void *)qmap_core_sched_before,
	       .cpu_release		= (void *)qmap_cpu_release,
	       .init_task		= (void *)qmap_init_task,
	       .dump			= (void *)qmap_dump,
	       .dump_cpu		= (void *)qmap_dump_cpu,
	       .dump_task		= (void *)qmap_dump_task,
	       .cpu_online		= (void *)qmap_cpu_online,
	       .cpu_offline		= (void *)qmap_cpu_offline,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");
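
/*
 * Minimal userspace loading sketch (illustrative only, not part of this file).
 * It assumes a bpftool-generated skeleton named "scx_qmap" exposing the usual
 * <name>__open/load/destroy() helpers and the qmap_ops map; the exact skeleton
 * and header names depend on how the object is built. Attachment uses the
 * generic libbpf struct_ops attach. Error handling is omitted.
 *
 *	#include <bpf/libbpf.h>
 *	#include "scx_qmap.bpf.skel.h"
 *
 *	int main(void)
 *	{
 *		struct scx_qmap *skel = scx_qmap__open();
 *		struct bpf_link *link;
 *
 *		skel->rodata->slice_ns = 5 * 1000 * 1000;	// example: 5ms slice
 *		scx_qmap__load(skel);
 *		link = bpf_map__attach_struct_ops(skel->maps.qmap_ops);
 *		// ... the scheduler is active until the link is destroyed ...
 *		bpf_link__destroy(link);
 *		scx_qmap__destroy(skel);
 *		return 0;
 *	}
 */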