/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and is unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS	= 1000000000,
	SHARED_DSQ	= 0,
	HIGHPRI_DSQ	= 1,
	HIGHPRI_WEIGHT	= 8668,		/* this is what -20 maps to */
};

char _license[] SEC("license") = "GPL";

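/*
 * Tunables below are left at their zero defaults here; they are presumably
 * filled in by the userspace loader through the BPF skeleton's rodata before
 * the scheduler is attached (a note added for clarity based on the usual scx
 * sample pattern, not stated in this file).
 */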
const volatile u64 slice_ns;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_dsqs_and_events;
const volatile bool print_msgs;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u64 nr_highpri_queued;
u32 test_error_cnt;

UEI_DEFINE(uei);

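/*
 * The five scheduling FIFOs, queue0-queue4, plus dump_store which is not used
 * for scheduling at all: qmap_dump() drains each FIFO into it and then pushes
 * the entries back so that dumping is (mostly) non-destructive.
 */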
struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps"),
  dump_store SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/*
 * If enabled, the CPU performance target is set according to the queue index
 * per the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and is
 * used when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
	bool	highpri;
	u64	core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
	u32	avg_weight;
	u32	cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu;

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
	struct task_ctx *tctx;

	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
		scx_bpf_error("task_ctx lookup failed");
		return NULL;
	}
	return tctx;
}

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
		return -ESRCH;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

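	/*
	 * Remember the direct-dispatch decision so that qmap_enqueue() can
	 * send the task straight to the local DSQ instead of a FIFO.
	 */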
	if (cpu >= 0) {
		tctx->force_local = true;
		return cpu;
	} else {
		return prev_cpu;
	}
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
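	/* For reference: the default weight of 100 (nice 0) falls into queue2. */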
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

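	/*
	 * Exercise stall handling: when stall_*_nth is set, silently skip
	 * enqueueing every Nth kernel or user task, which can be used to
	 * trigger the sched_ext watchdog (see .timeout_ms below).
	 */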
	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * If qmap_select_cpu() is telling us to or this is the last runnable
	 * task on the CPU, enqueue locally.
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the global DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

static void update_core_sched_head_seq(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri
 * tasks, moving them to HIGHPRI_DSQ and then consuming them first. This makes
 * a minor difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
 * from non-rq-lock holding BPF programs. As a demonstration, this function is
 * called from both qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
			__COMPAT_scx_bpf_dsq_move_set_slice(
				BPF_FOR_EACH_ITER, slice_ns * 2);
			__COMPAT_scx_bpf_dsq_move_set_vtime(
				BPF_FOR_EACH_ITER, highpri_seq++);
			__COMPAT_scx_bpf_dsq_move_vtime(
				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
					      SCX_DSQ_LOCAL_ON | cpu,
					      SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

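	/*
	 * If no high-priority tasks are pending, try to consume directly from
	 * the shared DSQ before walking the FIFOs.
	 */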
	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
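		/*
		 * Queue N is given a budget of 2^N tasks each time it's
		 * visited, producing the 1/2/4/8/16 dispatch ratios described
		 * in the header comment.
		 */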
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

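		/*
		 * The FIFO ran dry or its budget was consumed; reset dsp_cnt
		 * so the next iteration advances to the next queue.
		 */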
		cpuc->dsp_cnt = 0;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (!tctx) {
			scx_bpf_error("task_ctx lookup failed");
			return;
		}

		tctx->core_sched_seq =
			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}

void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As the queue index increments, the priority doubles. The queue with
	 * index 3 is dispatched twice as frequently as the one with index 2.
	 * Reflect the difference by scaling qdists accordingly. Note that the
	 * shift amount needs to be flipped depending on the sign to avoid
	 * flipping the priority direction.
	 */
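	/*
	 * e.g. a given positive distance in queue0 is scaled up by 16 (<< 4)
	 * while the same distance in queue4 is left as-is, so queue4 tasks
	 * compare as older and therefore higher priority.
	 */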
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode about the same ordering as
 * the regular scheduling path. Use the priority-scaled distances from the head
 * of the queues to compare the two tasks which should be consistent with the
 * dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	u32 cnt;

	/*
	 * Called when @cpu is taken by a higher priority scheduling class. This
	 * makes @cpu no longer available for executing sched_ext tasks. As we
	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
	 * becomes available again, re-enqueue them into the global dsq. See
	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
	 */
	cnt = scx_bpf_reenqueue_local();
	if (cnt)
		__sync_fetch_and_add(&nr_reenqueued, cnt);
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
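	/*
	 * Setting p->scx.disallow (which is meant to be done from
	 * ops.init_task()) keeps tasks in the disallowed tgid off sched_ext;
	 * they remain on their original scheduling class.
	 */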
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);

		/*
		 * Dump can be invoked anytime and there is no way to iterate in
		 * a non-destructive way. Pop and store in dump_store and then
		 * restore afterwards. If racing against new enqueues, ordering
		 * can get mixed up.
		 */
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			bpf_map_push_elem(&dump_store, &pid, 0);
			scx_bpf_dump(" %d", pid);
		}

		bpf_repeat(4096) {
			if (bpf_map_pop_elem(&dump_store, &pid))
				break;
			bpf_map_push_elem(fifo, &pid, 0);
		}

		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
{
	if (print_msgs)
		bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, args->weight, args->bw_period_us,
			   args->bw_quota_us, args->bw_burst_us);
	return 0;
}

void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}

void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
		    u64 period_us, u64 quota_us, u64 burst_us)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, period_us, quota_us, burst_us);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

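	/*
	 * Build a map string: 'O' for online, 'X' for possible but offline,
	 * ' ' for impossible CPUs, with a '|' separator every eight CPUs.
	 */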
	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d coming online", cpu);
		/* @cpu is already online at this point */
		print_cpus();
	}
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d going offline", cpu);
		/* @cpu is still online at this point */
		print_cpus();
	}
}

struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
 * see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
		bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
}

static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
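	/*
	 * dispatch_highpri() walks DSQs and dereferences the tasks it finds
	 * from timer (non-rq-locked) context; presumably the explicit BPF RCU
	 * read lock below is what keeps those task pointers valid here.
	 */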
	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_dsqs_and_events) {
		struct scx_event_stats events;

		dump_shared_dsq();

		__COMPAT_scx_bpf_events(&events, sizeof(events));

		bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
			   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
			   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
			   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
		bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
			   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
		bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
			   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
			   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
			   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
			   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
	}

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	if (print_msgs)
		print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		return ret;

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .tick			= (void *)qmap_tick,
	       .core_sched_before	= (void *)qmap_core_sched_before,
	       .cpu_release		= (void *)qmap_cpu_release,
	       .init_task		= (void *)qmap_init_task,
	       .dump			= (void *)qmap_dump,
	       .dump_cpu		= (void *)qmap_dump_cpu,
	       .dump_task		= (void *)qmap_dump_task,
	       .cgroup_init		= (void *)qmap_cgroup_init,
	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
	       .cpu_online		= (void *)qmap_cpu_online,
	       .cpu_offline		= (void *)qmap_cpu_offline,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");