xref: /linux/tools/sched_ext/scx_qmap.bpf.c (revision 02baaa67d9afc2e56c6e1ac6a1fb1f1dd2be366f)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS		= 1000000000,
	SHARED_DSQ		= 0,
	HIGHPRI_DSQ		= 1,
	HIGHPRI_WEIGHT		= 8668,		/* this is what -20 maps to */
};

char _license[] SEC("license") = "GPL";

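/*
 * Tunables. const volatile so that the userspace loader can set them through
 * the skeleton's read-only data before the BPF program is loaded.
 */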
const volatile u64 slice_ns;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_dsqs_and_events;
const volatile bool print_msgs;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u64 nr_highpri_queued;
u32 test_error_cnt;

UEI_DEFINE(uei);

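/*
 * One queue definition instantiated five times (queue0-queue4) for the FIFOs,
 * plus dump_store which qmap_dump() uses as scratch space.
 */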
struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps"),
  dump_store SEC(".maps");

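/* Array-of-maps so that the FIFO for a given queue index can be looked up at runtime. */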
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/*
 * If enabled, the CPU performance target is set according to the queue index
 * using the following table.
 */
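/* (see qmap_tick(), which indexes this table with weight_to_idx(cpuc->avg_weight)) */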
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and is
 * used when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
	bool	highpri;
	u64	core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
	u32	avg_weight;
	u32	cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu;

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
	struct task_ctx *tctx;

	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
		scx_bpf_error("task_ctx lookup failed");
		return NULL;
	}
	return tctx;
}

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
		return -ESRCH;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
		tctx->force_local = true;
		return cpu;
	} else {
		return prev_cpu;
	}
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
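	/* e.g. the default weight of 100 (nice 0) falls into queue 2 */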
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (enq_flags & SCX_ENQ_REENQ)
		__sync_fetch_and_add(&nr_reenqueued, 1);

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * If qmap_select_cpu() is telling us to, or this is the last runnable
	 * task on the CPU, enqueue locally.
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
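	/* (ops.select_cpu() is only called on wakeups, so not every enqueue is preceded by it) */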
	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the global DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

static void update_core_sched_head_seq(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes a minor
 * difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dsq_move[_vtime]() may be called both from ops.dispatch() and from
 * BPF programs which don't hold an rq lock. As a demonstration, this function
 * is called from both qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
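			/* BPF_FOR_EACH_ITER refers to the DSQ iterator created by the enclosing bpf_for_each() */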
			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
			scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
			scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
				     SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd, which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
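		/* queue i gets 2^i consecutive dispatch slots: 1, 2, 4, 8, 16 */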
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
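			/* flush if the batch is used up or the dispatch buffer has no free slots left */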
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (!tctx) {
			scx_bpf_error("task_ctx lookup failed");
			return;
		}

		tctx->core_sched_seq =
			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}

void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
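	/* simple exponential moving average: 3/4 previous value + 1/4 current task's weight */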
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As the queue index increments, the priority doubles. The queue with
	 * index 3 is dispatched twice as frequently as the one with index 2.
	 * Reflect the difference by scaling qdists accordingly. Note that the
	 * shift amount needs to be flipped depending on the sign to avoid
	 * flipping the priority direction.
	 */
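	/* e.g. a positive qdist of 8 becomes 128 for queue 0 but stays 8 for queue 4 */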
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode about the same ordering as
 * the regular scheduling path. Use the priority-scaled distances from the head
 * of the queues to compare the two tasks, which should be consistent with the
 * dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}

SEC("tp_btf/sched_switch")
int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
	     struct task_struct *next, unsigned long prev_state)
{
	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
		return 0;

	/*
	 * If the CPU is taken by a higher priority scheduling class, it is no
	 * longer available for executing sched_ext tasks. As we don't want the
	 * tasks in its local dsq to sit there until the CPU becomes available
	 * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ
	 * handling in qmap_enqueue().
	 */
	switch (next->policy) {
	case 1: /* SCHED_FIFO */
	case 2: /* SCHED_RR */
	case 6: /* SCHED_DEADLINE */
		scx_bpf_reenqueue_local();
	}

	return 0;
}

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	/* see qmap_sched_switch() to learn how to do this on newer kernels */
	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
		scx_bpf_reenqueue_local();
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);

		/*
		 * Dump can be invoked anytime and there is no way to iterate in
		 * a non-destructive way. Pop and store in dump_store and then
		 * restore afterwards. If racing against new enqueues, ordering
		 * can get mixed up.
		 */
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			bpf_map_push_elem(&dump_store, &pid, 0);
			scx_bpf_dump(" %d", pid);
		}

		bpf_repeat(4096) {
			if (bpf_map_pop_elem(&dump_store, &pid))
				break;
			bpf_map_push_elem(fifo, &pid, 0);
		}

		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
{
	if (print_msgs)
		bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, args->weight, args->bw_period_us,
			   args->bw_quota_us, args->bw_burst_us);
	return 0;
}

void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}

void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
		    u64 period_us, u64 quota_us, u64 burst_us)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, period_us, quota_us, burst_us);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
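	/* 'O': online, 'X': possible but offline, ' ': not possible, '|': separator every 8 CPUs */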
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d coming online", cpu);
		/* @cpu is already online at this point */
		print_cpus();
	}
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d going offline", cpu);
		/* @cpu is still online at this point */
		print_cpus();
	}
}

struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
 * see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
		bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
}

static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_dsqs_and_events) {
		struct scx_event_stats events;

		dump_shared_dsq();

		__COMPAT_scx_bpf_events(&events, sizeof(events));

		bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
			   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
			   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
			   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
		bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
			   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
		bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
			   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
			   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
			   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
			   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
	}

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	if (print_msgs)
		print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		return ret;

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .tick			= (void *)qmap_tick,
	       .core_sched_before	= (void *)qmap_core_sched_before,
	       .cpu_release		= (void *)qmap_cpu_release,
	       .init_task		= (void *)qmap_init_task,
	       .dump			= (void *)qmap_dump,
	       .dump_cpu		= (void *)qmap_dump_cpu,
	       .dump_task		= (void *)qmap_dump_task,
	       .cgroup_init		= (void *)qmap_cgroup_init,
	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
	       .cpu_online		= (void *)qmap_cpu_online,
	       .cpu_offline		= (void *)qmap_cpu_offline,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");