xref: /linux/tools/sched_ext/scx_qmap.bpf.c (revision 3fd6c59042dbba50391e30862beac979491145fe)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS		= 1000000000,
	SHARED_DSQ		= 0,
	HIGHPRI_DSQ		= 1,
	HIGHPRI_WEIGHT		= 8668,		/* this is what -20 maps to */
};
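
/*
 * SHARED_DSQ and HIGHPRI_DSQ are IDs for the two custom dispatch queues used
 * below; both are created with scx_bpf_create_dsq() in qmap_init().
 */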

char _license[] SEC("license") = "GPL";

const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_shared_dsq;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u64 nr_highpri_queued;
u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/*
 * If enabled, the CPU performance target is set according to the queue index
 * using the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};
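
/*
 * For example, a CPU whose running average task weight maps to queue2 gets a
 * target of half of SCX_CPUPERF_ONE. The target is applied from qmap_tick()
 * via scx_bpf_cpuperf_set().
 */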

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and is
 * used when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
	bool	highpri;
	u64	core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
	u32	avg_weight;
	u32	cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
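
/*
 * The counters above are intended to be read by the userspace half of
 * scx_qmap, which reports them periodically while the scheduler is running.
 */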

static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu;

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
	struct task_ctx *tctx;

	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
		scx_bpf_error("task_ctx lookup failed");
		return NULL;
	}
	return tctx;
}

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
		return -ESRCH;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
		tctx->force_local = true;
		return cpu;
	} else {
		return prev_cpu;
	}
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
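	/*
	 * For example, the default weight of 100 (nice 0) lands in queue2 and
	 * HIGHPRI_WEIGHT (8668, nice -20) lands in queue4.
	 */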
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * If qmap_select_cpu() is telling us to or this is the last runnable
	 * task on the CPU, enqueue locally.
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!(enq_flags & SCX_ENQ_CPU_SELECTED) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the global DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

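	/*
	 * While nr_highpri_queued is non-zero, qmap_dispatch() skips its direct
	 * scx_bpf_dsq_move_to_local(SHARED_DSQ) fast path so that queued
	 * highpri tasks get expedited through dispatch_highpri() instead.
	 */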
	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

static void update_core_sched_head_seq(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes a minor
 * difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dispatch[_vtime]_from_dsq() can be called both from ops.dispatch()
 * and from BPF programs that don't hold an rq lock. As a demonstration, this
 * function is called from qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
			__COMPAT_scx_bpf_dsq_move_set_slice(
				BPF_FOR_EACH_ITER, slice_ns * 2);
			__COMPAT_scx_bpf_dsq_move_set_vtime(
				BPF_FOR_EACH_ITER, highpri_seq++);
			__COMPAT_scx_bpf_dsq_move_vtime(
				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
					      SCX_DSQ_LOCAL_ON | cpu,
					      SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
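		/*
		 * Each FIFO is drained in batches of 2^idx tasks - 1 from
		 * queue0, 2 from queue1, 4 from queue2 and so on - matching
		 * the description in the header comment.
		 */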
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (!tctx) {
			scx_bpf_error("task_ctx lookup failed");
			return;
		}

		tctx->core_sched_seq =
			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}

void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As the queue index increments, the priority doubles. The queue w/
	 * index 3 is dispatched twice as frequently as the one w/ index 2.
	 * Reflect the difference by scaling qdists accordingly. Note that the
	 * shift amount needs to be flipped depending on the sign to avoid
	 * flipping the priority direction.
	 */
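	/*
	 * For example, a qdist of 1 in queue0 scales to 16 while the same
	 * distance in queue4 stays 1, so the queue4 task ranks ahead,
	 * consistent with queue4 being dispatched 16 times as often.
	 */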
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode about the same ordering as
 * the regular scheduling path. Use the priority-scaled distances from the head
 * of the queues to compare the two tasks which should be consistent with the
 * dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	u32 cnt;

	/*
	 * Called when @cpu is taken by a higher priority scheduling class. This
	 * makes @cpu no longer available for executing sched_ext tasks. As we
	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
	 * becomes available again, re-enqueue them into the global dsq. See
	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
	 */
	cnt = scx_bpf_reenqueue_local();
	if (cnt)
		__sync_fetch_and_add(&nr_reenqueued, cnt);
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			scx_bpf_dump(" %d", pid);
		}
		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	bpf_printk("CPU %d coming online", cpu);
	/* @cpu is already online at this point */
	print_cpus();
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	bpf_printk("CPU %d going offline", cpu);
	/* @cpu is still online at this point */
	print_cpus();
}

struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
 * see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
		bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
}

static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_shared_dsq)
		dump_shared_dsq();

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		return ret;

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .tick			= (void *)qmap_tick,
	       .core_sched_before	= (void *)qmap_core_sched_before,
	       .cpu_release		= (void *)qmap_cpu_release,
	       .init_task		= (void *)qmap_init_task,
	       .dump			= (void *)qmap_dump,
	       .dump_cpu		= (void *)qmap_dump_cpu,
	       .dump_task		= (void *)qmap_dump_task,
	       .cpu_online		= (void *)qmap_cpu_online,
	       .cpu_offline		= (void *)qmap_cpu_offline,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");