xref: /linux/tools/sched_ext/scx_qmap.bpf.c (revision 05909810a946222aca5d0611d37be82d18f95228)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * A simple five-level FIFO queue scheduler.
4  *
5  * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
6  * assigned to one depending on its compound weight. Each CPU round-robins
7  * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
8  * queue0, 2 from queue1, 4 from queue2 and so on.
9  *
10  * This scheduler demonstrates:
11  *
12  * - BPF-side queueing using PIDs.
13  * - Sleepable per-task storage allocation using ops.init_task().
14  * - Core-sched support.
15  *
16  * This scheduler is primarily for demonstration and testing of sched_ext
17  * features and is unlikely to be useful for actual workloads.
18  *
19  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
20  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
21  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
22  */
23 #include <scx/common.bpf.h>
24 
25 enum consts {
26 	ONE_SEC_IN_NS		= 1000000000,
27 	ONE_MSEC_IN_NS		= 1000000,
28 	LOWPRI_INTV_NS		= 10 * ONE_MSEC_IN_NS,
29 	SHARED_DSQ		= 0,
30 	HIGHPRI_DSQ		= 1,
31 	LOWPRI_DSQ		= 2,
32 	HIGHPRI_WEIGHT		= 8668,		/* this is what -20 maps to */
33 };
34 
35 char _license[] SEC("license") = "GPL";
36 
37 const volatile u64 slice_ns;
38 const volatile u32 stall_user_nth;
39 const volatile u32 stall_kernel_nth;
40 const volatile u32 dsp_inf_loop_after;
41 const volatile u32 dsp_batch;
42 const volatile bool highpri_boosting;
43 const volatile bool print_dsqs_and_events;
44 const volatile bool print_msgs;
45 const volatile u64 sub_cgroup_id;
46 const volatile s32 disallow_tgid;
47 const volatile bool suppress_dump;
48 const volatile bool always_enq_immed;
49 const volatile u32 immed_stress_nth;
50 
51 u64 nr_highpri_queued;
52 u32 test_error_cnt;
53 
54 #define MAX_SUB_SCHEDS		8
55 u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
56 
57 UEI_DEFINE(uei);
58 
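/*
 * The FIFOs backing the five priority levels (queue0-queue4) plus dump_store,
 * a scratch queue which qmap_dump() uses to iterate the FIFOs without losing
 * their contents.
 */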
59 struct qmap {
60 	__uint(type, BPF_MAP_TYPE_QUEUE);
61 	__uint(max_entries, 4096);
62 	__type(value, u32);
63 } queue0 SEC(".maps"),
64   queue1 SEC(".maps"),
65   queue2 SEC(".maps"),
66   queue3 SEC(".maps"),
67   queue4 SEC(".maps"),
68   dump_store SEC(".maps");
69 
70 struct {
71 	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
72 	__uint(max_entries, 5);
73 	__type(key, int);
74 	__array(values, struct qmap);
75 } queue_arr SEC(".maps") = {
76 	.values = {
77 		[0] = &queue0,
78 		[1] = &queue1,
79 		[2] = &queue2,
80 		[3] = &queue3,
81 		[4] = &queue4,
82 	},
83 };
84 
85 /*
86  * If enabled, the CPU performance target is set according to the queue
87  * index using the following table.
88  */
89 static const u32 qidx_to_cpuperf_target[] = {
90 	[0] = SCX_CPUPERF_ONE * 0 / 4,
91 	[1] = SCX_CPUPERF_ONE * 1 / 4,
92 	[2] = SCX_CPUPERF_ONE * 2 / 4,
93 	[3] = SCX_CPUPERF_ONE * 3 / 4,
94 	[4] = SCX_CPUPERF_ONE * 4 / 4,
95 };
96 
97 /*
98  * Per-queue sequence numbers to implement core-sched ordering.
99  *
100  * Tail seq is assigned to each queued task and incremented. Head seq tracks the
101  * sequence number of the latest dispatched task. The distance between a task's
102  * seq and the associated queue's head seq is called the queue distance and is
103  * used when comparing two tasks for ordering. See qmap_core_sched_before().
104  */
105 static u64 core_sched_head_seqs[5];
106 static u64 core_sched_tail_seqs[5];
107 
108 /* Per-task scheduling context */
109 struct task_ctx {
110 	bool	force_local;	/* Dispatch directly to local_dsq */
111 	bool	highpri;
112 	u64	core_sched_seq;
113 };
114 
115 struct {
116 	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
117 	__uint(map_flags, BPF_F_NO_PREALLOC);
118 	__type(key, int);
119 	__type(value, struct task_ctx);
120 } task_ctx_stor SEC(".maps");
121 
122 struct cpu_ctx {
123 	u64	dsp_idx;	/* dispatch index */
124 	u64	dsp_cnt;	/* remaining count */
125 	u32	avg_weight;
126 	u32	cpuperf_target;
127 };
128 
129 struct {
130 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
131 	__uint(max_entries, 1);
132 	__type(key, u32);
133 	__type(value, struct cpu_ctx);
134 } cpu_ctx_stor SEC(".maps");
135 
136 /* Statistics */
137 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
138 u64 nr_core_sched_execed;
139 u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
140 u32 cpuperf_min, cpuperf_avg, cpuperf_max;
141 u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
142 
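/*
 * Find a CPU for direct dispatch: @prev_cpu for pinned tasks (skipped when
 * always_enq_immed is set) or when @prev_cpu is idle, otherwise any idle CPU
 * in the task's cpumask. Returns -1 if there's no suitable CPU.
 */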
143 static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
144 {
145 	s32 cpu;
146 
147 	if (!always_enq_immed && p->nr_cpus_allowed == 1)
148 		return prev_cpu;
149 
150 	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
151 		return prev_cpu;
152 
153 	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
154 	if (cpu >= 0)
155 		return cpu;
156 
157 	return -1;
158 }
159 
160 static struct task_ctx *lookup_task_ctx(struct task_struct *p)
161 {
162 	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
163 }
164 
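/*
 * When an idle CPU is available, pick it and set tctx->force_local so that
 * qmap_enqueue() dispatches directly to the local DSQ. Low-weight non-kthreads
 * are left on @prev_cpu so that the enqueue path can route them to LOWPRI_DSQ
 * instead (see lowpri_timerfn()).
 */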
165 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
166 		   s32 prev_cpu, u64 wake_flags)
167 {
168 	struct task_ctx *tctx;
169 	s32 cpu;
170 
171 	if (!(tctx = lookup_task_ctx(p)))
172 		return prev_cpu;
173 
174 	if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
175 		return prev_cpu;
176 
177 	cpu = pick_direct_dispatch_cpu(p, prev_cpu);
178 
179 	if (cpu >= 0) {
180 		tctx->force_local = true;
181 		return cpu;
182 	} else {
183 		return prev_cpu;
184 	}
185 }
186 
187 static int weight_to_idx(u32 weight)
188 {
189 	/* Coarsely map the compound weight to a FIFO. */
190 	if (weight <= 25)
191 		return 0;
192 	else if (weight <= 50)
193 		return 1;
194 	else if (weight < 200)
195 		return 2;
196 	else if (weight < 400)
197 		return 3;
198 	else
199 		return 4;
200 }
201 
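/*
 * Queue tasks on the FIFO selected by weight_to_idx() unless one of the
 * direct-dispatch or testing paths (stall_*_nth, immed_stress_nth,
 * force_local, low-priority routing, re-enqueue handling) applies first.
 */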
202 void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
203 {
204 	static u32 user_cnt, kernel_cnt;
205 	struct task_ctx *tctx;
206 	u32 pid = p->pid;
207 	int idx = weight_to_idx(p->scx.weight);
208 	void *ring;
209 	s32 cpu;
210 
211 	if (enq_flags & SCX_ENQ_REENQ) {
212 		__sync_fetch_and_add(&nr_reenqueued, 1);
213 		if (scx_bpf_task_cpu(p) == 0)
214 			__sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
215 	}
216 
217 	if (p->flags & PF_KTHREAD) {
218 		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
219 			return;
220 	} else {
221 		if (stall_user_nth && !(++user_cnt % stall_user_nth))
222 			return;
223 	}
224 
225 	if (test_error_cnt && !--test_error_cnt)
226 		scx_bpf_error("test triggering error");
227 
228 	if (!(tctx = lookup_task_ctx(p)))
229 		return;
230 
231 	/*
232 	 * All enqueued tasks must have their core_sched_seq updated for correct
233 	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
234 	 */
235 	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
236 
237 	/*
238 	 * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
239 	 * directly to prev_cpu's local DSQ even when busy to force dsq->nr > 1
240 	 * and exercise the kernel IMMED reenqueue trigger paths.
241 	 */
242 	if (immed_stress_nth && !(enq_flags & SCX_ENQ_REENQ)) {
243 		static u32 immed_stress_cnt;
244 
245 		if (!(++immed_stress_cnt % immed_stress_nth)) {
246 			tctx->force_local = false;
247 			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
248 					   slice_ns, enq_flags);
249 			return;
250 		}
251 	}
252 
253 	/*
254 	 * If qmap_select_cpu() told us to dispatch directly, enqueue on the
255 	 * local DSQ.
256 	 */
257 	if (tctx->force_local) {
258 		tctx->force_local = false;
259 		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
260 		return;
261 	}
262 
263 	/* see lowpri_timerfn() */
264 	if (__COMPAT_has_generic_reenq() &&
265 	    p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) {
266 		scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags);
267 		return;
268 	}
269 
270 	/* if select_cpu() wasn't called, try direct dispatch */
271 	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
272 	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
273 		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
274 		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
275 		return;
276 	}
277 
278 	/*
279 	 * If the task was re-enqueued due to the CPU being preempted by a
280 	 * higher priority scheduling class, just re-enqueue the task directly
281 	 * on the shared DSQ. As we want another CPU to pick it up, find and
282 	 * kick an idle CPU.
283 	 */
284 	if (enq_flags & SCX_ENQ_REENQ) {
285 		s32 cpu;
286 
287 		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
288 		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
289 		if (cpu >= 0)
290 			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
291 		return;
292 	}
293 
294 	ring = bpf_map_lookup_elem(&queue_arr, &idx);
295 	if (!ring) {
296 		scx_bpf_error("failed to find ring %d", idx);
297 		return;
298 	}
299 
300 	/* Queue on the selected FIFO. If it overflows, punt to SHARED_DSQ. */
301 	if (bpf_map_push_elem(ring, &pid, 0)) {
302 		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
303 		return;
304 	}
305 
306 	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
307 		tctx->highpri = true;
308 		__sync_fetch_and_add(&nr_highpri_queued, 1);
309 	}
310 	__sync_fetch_and_add(&nr_enqueued, 1);
311 }
312 
313 /*
314  * The BPF queue map doesn't support removal and sched_ext can handle spurious
315  * dispatches. qmap_dequeue() is only used to collect statistics.
316  */
317 void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
318 {
319 	__sync_fetch_and_add(&nr_dequeued, 1);
320 	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
321 		__sync_fetch_and_add(&nr_core_sched_execed, 1);
322 }
323 
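/*
 * Record the dispatched task's sequence number as its queue's head seq. See
 * the comment above core_sched_head_seqs[].
 */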
324 static void update_core_sched_head_seq(struct task_struct *p)
325 {
326 	int idx = weight_to_idx(p->scx.weight);
327 	struct task_ctx *tctx;
328 
329 	if ((tctx = lookup_task_ctx(p)))
330 		core_sched_head_seqs[idx] = tctx->core_sched_seq;
331 }
332 
333 /*
334  * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
335  * priority boosting mechanism by scanning SHARED_DSQ for highpri tasks,
336  * moving them to HIGHPRI_DSQ and then consuming them first. This makes a
337  * minor difference only when dsp_batch is larger than 1.
338  *
339  * scx_bpf_dsq_move[_vtime]() are allowed both from ops.dispatch() and from
340  * non-rq-lock holding BPF programs. As a demonstration, this function is
341  * called from both qmap_dispatch() and monitor_timerfn().
342  */
343 static bool dispatch_highpri(bool from_timer)
344 {
345 	struct task_struct *p;
346 	s32 this_cpu = bpf_get_smp_processor_id();
347 
348 	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
349 	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
350 		static u64 highpri_seq;
351 		struct task_ctx *tctx;
352 
353 		if (!(tctx = lookup_task_ctx(p)))
354 			return false;
355 
356 		if (tctx->highpri) {
357 			/* exercise the set_*() and vtime interface too */
358 			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
359 			scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
360 			scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
361 		}
362 	}
363 
364 	/*
365 	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
366 	 * is found.
367 	 */
368 	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
369 		bool dispatched = false;
370 		s32 cpu;
371 
372 		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
373 			cpu = this_cpu;
374 		else
375 			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
376 
377 		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
378 				     SCX_ENQ_PREEMPT)) {
379 			if (cpu == this_cpu) {
380 				dispatched = true;
381 				__sync_fetch_and_add(&nr_expedited_local, 1);
382 			} else {
383 				__sync_fetch_and_add(&nr_expedited_remote, 1);
384 			}
385 			if (from_timer)
386 				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
387 		} else {
388 			__sync_fetch_and_add(&nr_expedited_lost, 1);
389 		}
390 
391 		if (dispatched)
392 			return true;
393 	}
394 
395 	return false;
396 }
397 
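/*
 * Expedite highpri tasks, drain SHARED_DSQ, then round-robin the five FIFOs
 * moving up to @dsp_batch tasks through SHARED_DSQ per invocation. If
 * everything is empty, fall back to any attached sub-schedulers and finally
 * let @prev keep running.
 */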
398 void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
399 {
400 	struct task_struct *p;
401 	struct cpu_ctx *cpuc;
402 	struct task_ctx *tctx;
403 	u32 zero = 0, batch = dsp_batch ?: 1;
404 	void *fifo;
405 	s32 i, pid;
406 
407 	if (dispatch_highpri(false))
408 		return;
409 
410 	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
411 		return;
412 
413 	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
414 		/*
415 		 * PID 2 should be kthreadd which should mostly be idle and off
416 		 * the scheduler. Let's keep dispatching it to force the kernel
417 		 * to call this function over and over again.
418 		 */
419 		p = bpf_task_from_pid(2);
420 		if (p) {
421 			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
422 			bpf_task_release(p);
423 			return;
424 		}
425 	}
426 
427 	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
428 		scx_bpf_error("failed to look up cpu_ctx");
429 		return;
430 	}
431 
432 	for (i = 0; i < 5; i++) {
433 		/* Advance the dispatch cursor and pick the fifo. */
434 		if (!cpuc->dsp_cnt) {
435 			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
436 			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
437 		}
438 
439 		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
440 		if (!fifo) {
441 			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
442 			return;
443 		}
444 
445 		/* Dispatch or advance. */
446 		bpf_repeat(BPF_MAX_LOOPS) {
447 			struct task_ctx *tctx;
448 
449 			if (bpf_map_pop_elem(fifo, &pid))
450 				break;
451 
452 			p = bpf_task_from_pid(pid);
453 			if (!p)
454 				continue;
455 
456 			if (!(tctx = lookup_task_ctx(p))) {
457 				bpf_task_release(p);
458 				return;
459 			}
460 
461 			if (tctx->highpri)
462 				__sync_fetch_and_sub(&nr_highpri_queued, 1);
463 
464 			update_core_sched_head_seq(p);
465 			__sync_fetch_and_add(&nr_dispatched, 1);
466 
467 			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
468 
469 			/*
470 			 * scx_qmap uses a global BPF queue that any CPU's
471 			 * dispatch can pop from. If this CPU popped a task that
472 			 * dispatch can pop from. If this CPU pops a task that
473 			 * consume_dispatch_q() skips it. Kick the task's home
474 			 * CPU so it drains SHARED_DSQ.
475 			 *
476 			 * There's a race between the pop and the flush of the
477 			 * buffered dsq_insert:
478 			 *
479 			 *  CPU 0 (dispatching)      CPU 1 (home, idle)
480 			 *  ~~~~~~~~~~~~~~~~~~~      ~~~~~~~~~~~~~~~~~~~
481 			 *  pop from BPF queue
482 			 *  dsq_insert(buffered)
483 			 *                           balance:
484 			 *                             SHARED_DSQ empty
485 			 *                             BPF queue empty
486 			 *                             -> goes idle
487 			 *  flush -> on SHARED
488 			 *  kick CPU 1
489 			 *                           wakes, drains task
490 			 *
491 			 * The kick prevents indefinite stalls but a per-CPU
492 			 * kthread like ksoftirqd can be briefly stranded when
493 			 * its home CPU enters idle with softirq pending,
494 			 * triggering:
495 			 *
496 			 *  "NOHZ tick-stop error: local softirq work is pending, handler #N!!!"
497 			 *
498 			 * from report_idle_softirq(). The kick lands shortly
499 			 * after and the home CPU drains the task. This could be
500 			 * avoided by e.g. dispatching pinned tasks to local or
501 			 * global DSQs, but the current code is left as-is to
502 			 * document this class of issue -- other schedulers
503 			 * seeing similar warnings can use this as a reference.
504 			 */
505 			if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
506 				scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0);
507 
508 			bpf_task_release(p);
509 
510 			batch--;
511 			cpuc->dsp_cnt--;
512 			if (!batch || !scx_bpf_dispatch_nr_slots()) {
513 				if (dispatch_highpri(false))
514 					return;
515 				scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
516 				return;
517 			}
518 			if (!cpuc->dsp_cnt)
519 				break;
520 		}
521 
522 		cpuc->dsp_cnt = 0;
523 	}
524 
525 	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
526 		if (sub_sched_cgroup_ids[i] &&
527 		    scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
528 			return;
529 	}
530 
531 	/*
532 	 * No other tasks. @prev will keep running. Update its core_sched_seq as
533 	 * if the task were enqueued and dispatched immediately.
534 	 */
535 	if (prev) {
536 		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
537 		if (tctx)
538 			tctx->core_sched_seq =
539 				core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
540 	}
541 }
542 
543 void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
544 {
545 	struct cpu_ctx *cpuc;
546 	u32 zero = 0;
547 	int idx;
548 
549 	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
550 		scx_bpf_error("failed to look up cpu_ctx");
551 		return;
552 	}
553 
554 	/*
555 	 * Use the running avg of weights to select the target cpuperf level.
556 	 * This is a demonstration of the cpuperf feature rather than a
557 	 * practical strategy to regulate CPU frequency.
558 	 */
559 	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
560 	idx = weight_to_idx(cpuc->avg_weight);
561 	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
562 
563 	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
564 }
565 
566 /*
567  * The distance from the head of the queue scaled by the weight of the queue.
568  * The lower the number, the older the task and the higher the priority.
569  */
570 static s64 task_qdist(struct task_struct *p)
571 {
572 	int idx = weight_to_idx(p->scx.weight);
573 	struct task_ctx *tctx;
574 	s64 qdist;
575 
576 	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
577 	if (!tctx)
578 		return 0;
579 
580 	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
581 
582 	/*
583 	 * As the queue index increments, the priority doubles: the queue w/ index
584 	 * 3 is dispatched twice as often as index 2. Reflect the difference by
585 	 * scaling qdists accordingly. Note that the shift amount needs to be
586 	 * flipped depending on the sign to avoid flipping priority direction.
587 	 */
588 	if (qdist >= 0)
589 		return qdist << (4 - idx);
590 	else
591 		return qdist << idx;
592 }
593 
594 /*
595  * This is called to determine the task ordering when core-sched is picking
596  * tasks to execute on SMT siblings and should encode about the same ordering as
597  * the regular scheduling path. Use the priority-scaled distances from the head
598  * of the queues to compare the two tasks, which should be consistent with the
599  * dispatch path behavior.
600  */
601 bool BPF_STRUCT_OPS(qmap_core_sched_before,
602 		    struct task_struct *a, struct task_struct *b)
603 {
604 	return task_qdist(a) > task_qdist(b);
605 }
606 
607 /*
608  * sched_switch tracepoint and cpu_release handlers are no longer needed.
609  * With SCX_OPS_ALWAYS_ENQ_IMMED, wakeup_preempt_scx() reenqueues IMMED
610  * tasks when a higher-priority scheduling class takes the CPU.
611  */
612 
613 s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
614 		   struct scx_init_task_args *args)
615 {
616 	if (p->tgid == disallow_tgid)
617 		p->scx.disallow = true;
618 
619 	/*
620 	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
621 	 * in this function and the following will automatically use GFP_KERNEL.
622 	 */
623 	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
624 				 BPF_LOCAL_STORAGE_GET_F_CREATE))
625 		return 0;
626 	else
627 		return -ENOMEM;
628 }
629 
630 void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
631 {
632 	s32 i, pid;
633 
634 	if (suppress_dump)
635 		return;
636 
637 	bpf_for(i, 0, 5) {
638 		void *fifo;
639 
640 		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
641 			return;
642 
643 		scx_bpf_dump("QMAP FIFO[%d]:", i);
644 
645 		/*
646 		 * Dump can be invoked anytime and there is no way to iterate in
647 		 * a non-destructive way. Pop and store in dump_store and then
648 		 * restore afterwards. If racing against new enqueues, ordering
649 		 * can get mixed up.
650 		 */
651 		bpf_repeat(4096) {
652 			if (bpf_map_pop_elem(fifo, &pid))
653 				break;
654 			bpf_map_push_elem(&dump_store, &pid, 0);
655 			scx_bpf_dump(" %d", pid);
656 		}
657 
658 		bpf_repeat(4096) {
659 			if (bpf_map_pop_elem(&dump_store, &pid))
660 				break;
661 			bpf_map_push_elem(fifo, &pid, 0);
662 		}
663 
664 		scx_bpf_dump("\n");
665 	}
666 }
667 
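/* Dump this CPU's dispatch cursor and cpuperf state into the debug dump. */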
668 void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
669 {
670 	u32 zero = 0;
671 	struct cpu_ctx *cpuc;
672 
673 	if (suppress_dump || idle)
674 		return;
675 	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
676 		return;
677 
678 	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
679 		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
680 		     cpuc->cpuperf_target);
681 }
682 
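/* Dump @p's scheduling context into the debug dump. */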
683 void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
684 {
685 	struct task_ctx *taskc;
686 
687 	if (suppress_dump)
688 		return;
689 	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
690 		return;
691 
692 	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
693 		     taskc->force_local, taskc->core_sched_seq);
694 }
695 
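/*
 * The cgroup callbacks below only log their arguments via bpf_printk() when
 * print_msgs is set.
 */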
696 s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
697 {
698 	if (print_msgs)
699 		bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
700 			   cgrp->kn->id, args->weight, args->bw_period_us,
701 			   args->bw_quota_us, args->bw_burst_us);
702 	return 0;
703 }
704 
705 void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
706 {
707 	if (print_msgs)
708 		bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
709 }
710 
711 void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
712 		    u64 period_us, u64 quota_us, u64 burst_us)
713 {
714 	if (print_msgs)
715 		bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
716 			   cgrp->kn->id, period_us, quota_us, burst_us);
717 }
718 
719 /*
720  * Print out the online and possible CPU map using bpf_printk() as a
721  * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
722  */
723 static void print_cpus(void)
724 {
725 	const struct cpumask *possible, *online;
726 	s32 cpu;
727 	char buf[128] = "", *p;
728 	int idx;
729 
730 	possible = scx_bpf_get_possible_cpumask();
731 	online = scx_bpf_get_online_cpumask();
732 
733 	idx = 0;
734 	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
735 		if (!(p = MEMBER_VPTR(buf, [idx++])))
736 			break;
737 		if (bpf_cpumask_test_cpu(cpu, online))
738 			*p++ = 'O';
739 		else if (bpf_cpumask_test_cpu(cpu, possible))
740 			*p++ = 'X';
741 		else
742 			*p++ = ' ';
743 
744 		if ((cpu & 7) == 7) {
745 			if (!(p = MEMBER_VPTR(buf, [idx++])))
746 				break;
747 			*p++ = '|';
748 		}
749 	}
750 	buf[sizeof(buf) - 1] = '\0';
751 
752 	scx_bpf_put_cpumask(online);
753 	scx_bpf_put_cpumask(possible);
754 
755 	bpf_printk("CPUS: |%s", buf);
756 }
757 
758 void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
759 {
760 	if (print_msgs) {
761 		bpf_printk("CPU %d coming online", cpu);
762 		/* @cpu is already online at this point */
763 		print_cpus();
764 	}
765 }
766 
767 void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
768 {
769 	if (print_msgs) {
770 		bpf_printk("CPU %d going offline", cpu);
771 		/* @cpu is still online at this point */
772 		print_cpus();
773 	}
774 }
775 
776 struct monitor_timer {
777 	struct bpf_timer timer;
778 };
779 
780 struct {
781 	__uint(type, BPF_MAP_TYPE_ARRAY);
782 	__uint(max_entries, 1);
783 	__type(key, u32);
784 	__type(value, struct monitor_timer);
785 } monitor_timer SEC(".maps");
786 
787 /*
788  * Collect the min, avg and max performance levels of CPUs every second to
789  * demonstrate the cpuperf interface.
790  */
791 static void monitor_cpuperf(void)
792 {
793 	u32 zero = 0, nr_cpu_ids;
794 	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
795 	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
796 	const struct cpumask *online;
797 	int i, nr_online_cpus = 0;
798 
799 	nr_cpu_ids = scx_bpf_nr_cpu_ids();
800 	online = scx_bpf_get_online_cpumask();
801 
802 	bpf_for(i, 0, nr_cpu_ids) {
803 		struct cpu_ctx *cpuc;
804 		u32 cap, cur;
805 
806 		if (!bpf_cpumask_test_cpu(i, online))
807 			continue;
808 		nr_online_cpus++;
809 
810 		/* collect the capacity and current cpuperf */
811 		cap = scx_bpf_cpuperf_cap(i);
812 		cur = scx_bpf_cpuperf_cur(i);
813 
814 		cur_min = cur < cur_min ? cur : cur_min;
815 		cur_max = cur > cur_max ? cur : cur_max;
816 
817 		/*
818 		 * $cur is relative to $cap. Scale it down accordingly so that
819 		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
820 		 * makes sense.
821 		 */
822 		cur_sum += cur * cap / SCX_CPUPERF_ONE;
823 		cap_sum += cap;
824 
825 		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
826 			scx_bpf_error("failed to look up cpu_ctx");
827 			goto out;
828 		}
829 
830 		/* collect target */
831 		cur = cpuc->cpuperf_target;
832 		target_sum += cur;
833 		target_min = cur < target_min ? cur : target_min;
834 		target_max = cur > target_max ? cur : target_max;
835 	}
836 
837 	cpuperf_min = cur_min;
838 	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
839 	cpuperf_max = cur_max;
840 
841 	cpuperf_target_min = target_min;
842 	cpuperf_target_avg = target_sum / nr_online_cpus;
843 	cpuperf_target_max = target_max;
844 out:
845 	scx_bpf_put_cpumask(online);
846 }
847 
848 /*
849  * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
850  * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
851  * see meaningful dumps in the trace pipe.
852  */
853 static void dump_shared_dsq(void)
854 {
855 	struct task_struct *p;
856 	s32 nr;
857 
858 	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
859 		return;
860 
861 	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);
862 
863 	bpf_rcu_read_lock();
864 	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
865 		bpf_printk("%s[%d]", p->comm, p->pid);
866 	bpf_rcu_read_unlock();
867 }
868 
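/*
 * Once a second: give highpri tasks another chance to be expedited, sample
 * the cpuperf stats and, if requested, dump SHARED_DSQ and the sched_ext
 * event counters.
 */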
869 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
870 {
871 	bpf_rcu_read_lock();
872 	dispatch_highpri(true);
873 	bpf_rcu_read_unlock();
874 
875 	monitor_cpuperf();
876 
877 	if (print_dsqs_and_events) {
878 		struct scx_event_stats events;
879 
880 		dump_shared_dsq();
881 
882 		__COMPAT_scx_bpf_events(&events, sizeof(events));
883 
884 		bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
885 			   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
886 		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
887 			   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
888 		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
889 			   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
890 		bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
891 			   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
892 		bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
893 			   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
894 		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
895 			   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
896 		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
897 			   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
898 		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
899 			   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
900 	}
901 
902 	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
903 	return 0;
904 }
905 
906 struct lowpri_timer {
907 	struct bpf_timer timer;
908 };
909 
910 struct {
911 	__uint(type, BPF_MAP_TYPE_ARRAY);
912 	__uint(max_entries, 1);
913 	__type(key, u32);
914 	__type(value, struct lowpri_timer);
915 } lowpri_timer SEC(".maps");
916 
917 /*
918  * The lowest-weight (nice 19) tasks are put into LOWPRI_DSQ. Every 10ms, a
919  * reenqueue is triggered and the tasks are transferred to SHARED_DSQ.
920  */
921 static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
922 {
923 	scx_bpf_dsq_reenq(LOWPRI_DSQ, 0);
924 	bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
925 	return 0;
926 }
927 
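/*
 * Create the shared, highpri and lowpri DSQs and start the monitor and
 * (when generic reenqueueing is available) lowpri timers.
 */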
928 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
929 {
930 	u32 key = 0;
931 	struct bpf_timer *timer;
932 	s32 ret;
933 
934 	if (print_msgs && !sub_cgroup_id)
935 		print_cpus();
936 
937 	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
938 	if (ret) {
939 		scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
940 		return ret;
941 	}
942 
943 	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
944 	if (ret) {
945 		scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret);
946 		return ret;
947 	}
948 
949 	ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1);
950 	if (ret)
951 		return ret;
952 
953 	timer = bpf_map_lookup_elem(&monitor_timer, &key);
954 	if (!timer)
955 		return -ESRCH;
956 	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
957 	bpf_timer_set_callback(timer, monitor_timerfn);
958 	ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
959 	if (ret)
960 		return ret;
961 
962 	if (__COMPAT_has_generic_reenq()) {
963 		/* see lowpri_timerfn() */
964 		timer = bpf_map_lookup_elem(&lowpri_timer, &key);
965 		if (!timer)
966 			return -ESRCH;
967 		bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC);
968 		bpf_timer_set_callback(timer, lowpri_timerfn);
969 		ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
970 		if (ret)
971 			return ret;
972 	}
973 
974 	return 0;
975 }
976 
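/* Record the exit reason so the userspace loader can report it. */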
977 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
978 {
979 	UEI_RECORD(uei, ei);
980 }
981 
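/*
 * Remember the attaching sub-scheduler's cgroup ID in a free slot so that
 * qmap_dispatch() can fall back to it via scx_bpf_sub_dispatch().
 */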
982 s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
983 {
984 	s32 i;
985 
986 	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
987 		if (!sub_sched_cgroup_ids[i]) {
988 			sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
989 			bpf_printk("attaching sub-sched[%d] on %s",
990 				   i, args->cgroup_path);
991 			return 0;
992 		}
993 	}
994 
995 	return -ENOSPC;
996 }
997 
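/* Drop the detaching sub-scheduler's cgroup ID from its slot. */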
998 void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
999 {
1000 	s32 i;
1001 
1002 	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
1003 		if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
1004 			sub_sched_cgroup_ids[i] = 0;
1005 			bpf_printk("detaching sub-sched[%d] on %s",
1006 				   i, args->cgroup_path);
1007 			break;
1008 		}
1009 	}
1010 }
1011 
1012 SCX_OPS_DEFINE(qmap_ops,
1013 	       .select_cpu		= (void *)qmap_select_cpu,
1014 	       .enqueue			= (void *)qmap_enqueue,
1015 	       .dequeue			= (void *)qmap_dequeue,
1016 	       .dispatch		= (void *)qmap_dispatch,
1017 	       .tick			= (void *)qmap_tick,
1018 	       .core_sched_before	= (void *)qmap_core_sched_before,
1019 	       .init_task		= (void *)qmap_init_task,
1020 	       .dump			= (void *)qmap_dump,
1021 	       .dump_cpu		= (void *)qmap_dump_cpu,
1022 	       .dump_task		= (void *)qmap_dump_task,
1023 	       .cgroup_init		= (void *)qmap_cgroup_init,
1024 	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
1025 	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
1026 	       .sub_attach		= (void *)qmap_sub_attach,
1027 	       .sub_detach		= (void *)qmap_sub_detach,
1028 	       .cpu_online		= (void *)qmap_cpu_online,
1029 	       .cpu_offline		= (void *)qmap_cpu_offline,
1030 	       .init			= (void *)qmap_init,
1031 	       .exit			= (void *)qmap_exit,
1032 	       .timeout_ms		= 5000U,
1033 	       .name			= "qmap");
1034