xref: /linux/tools/sched_ext/scx_qmap.bpf.c (revision a83f9edf7aba9421bfb53d181691fbcf9f34ce72)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * A simple five-level FIFO queue scheduler.
4  *
5  * There are five FIFOs implemented as arena-backed doubly-linked lists
6  * threaded through per-task context. A task gets assigned to one depending on
7  * its compound weight. Each CPU round robins through the FIFOs and dispatches
8  * more from FIFOs with higher indices - 1 from queue0, 2 from queue1, 4 from
9  * queue2 and so on.
10  *
11  * This scheduler demonstrates:
12  *
13  * - BPF-side queueing using TIDs.
14  * - BPF arena for scheduler state.
15  * - Core-sched support.
16  *
17  * This scheduler is primarily for demonstration and testing of sched_ext
18  * features and unlikely to be useful for actual workloads.
19  *
20  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
21  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
22  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
23  */
24 #include <scx/common.bpf.h>
25 
26 #include "scx_qmap.h"
27 
28 enum consts {
29 	ONE_SEC_IN_NS		= 1000000000,
30 	ONE_MSEC_IN_NS		= 1000000,
31 	LOWPRI_INTV_NS		= 10 * ONE_MSEC_IN_NS,
32 	SHARED_DSQ		= 0,
33 	HIGHPRI_DSQ		= 1,
34 	LOWPRI_DSQ		= 2,
35 	HIGHPRI_WEIGHT		= 8668,		/* this is what -20 maps to */
36 };
37 
38 char _license[] SEC("license") = "GPL";
39 
40 const volatile u64 slice_ns;
41 const volatile u32 stall_user_nth;
42 const volatile u32 stall_kernel_nth;
43 const volatile u32 dsp_inf_loop_after;
44 const volatile u32 dsp_batch;
45 const volatile bool highpri_boosting;
46 const volatile bool print_dsqs_and_events;
47 const volatile bool print_msgs;
48 const volatile u64 sub_cgroup_id;
49 const volatile s32 disallow_tgid;
50 const volatile bool suppress_dump;
51 const volatile bool always_enq_immed;
52 const volatile u32 immed_stress_nth;
53 const volatile u32 max_tasks;
54 
55 /*
56  * Optional cid-override test harness. When cid_override_mode is non-zero,
57  * qmap_init() calls scx_bpf_cid_override() with the caller-supplied
58  * cpu_to_cid array to exercise the kfunc's acceptance and error paths.
59  *
60  *   0 = disabled
61  *   1 = valid reverse mapping
62  *   2 = invalid: duplicate cid assignment
63  *   3 = invalid: out-of-range cid
64  */
65 const volatile u32 cid_override_mode;
66 /*
67  * Array lives in bss (writable) because scx_bpf_cid_override()'s BPF
68  * verifier signature treats its len-paired pointer as read/write - rodata
69  * fails verification with "write into map forbidden". Userspace populates
70  * it before SCX_OPS_LOAD, same as rodata, and nothing writes it after.
71  */
72 s32 cid_override_cpu_to_cid[SCX_QMAP_MAX_CPUS];
73 
74 UEI_DEFINE(uei);
75 
76 /*
77  * All scheduler state - per-cpu context, stats counters, core-sched sequence
78  * numbers, sub-sched cgroup ids - lives in this single BPF arena map. Userspace
79  * reaches it via skel->arena->qa.
80  */
81 struct {
82 	__uint(type, BPF_MAP_TYPE_ARENA);
83 	__uint(map_flags, BPF_F_MMAPABLE);
84 	__uint(max_entries, 1 << 16);		/* upper bound in pages */
85 #if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
86 	__ulong(map_extra, 0x1ull << 32);	/* user/BPF mmap base */
87 #else
88 	__ulong(map_extra, 0x1ull << 44);
89 #endif
90 } arena SEC(".maps");
91 
92 struct qmap_arena __arena_global qa;
93 
94 /*
95  * Global idle-cid tracking, maintained via update_idle / cpu_offline and
96  * scanned by the direct-dispatch path. Allocated in qmap_init() from one
97  * arena page, sized to the full cid space.
98  */
99 struct scx_cmask __arena *qa_idle_cids;
100 
101 /* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */
102 __hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0");
103 __hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1");
104 __hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2");
105 __hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3");
106 __hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4");
107 
108 static struct bpf_res_spin_lock *qa_q_lock(s32 qid)
109 {
110 	switch (qid) {
111 	case 0:	return &qa_q_lock0;
112 	case 1:	return &qa_q_lock1;
113 	case 2:	return &qa_q_lock2;
114 	case 3:	return &qa_q_lock3;
115 	case 4:	return &qa_q_lock4;
116 	default: return NULL;
117 	}
118 }
119 
120 /*
121  * If enabled, CPU performance target is set according to the queue index
122  * according to the following table.
123  */
124 static const u32 qidx_to_cpuperf_target[] = {
125 	[0] = SCX_CPUPERF_ONE * 0 / 4,
126 	[1] = SCX_CPUPERF_ONE * 1 / 4,
127 	[2] = SCX_CPUPERF_ONE * 2 / 4,
128 	[3] = SCX_CPUPERF_ONE * 3 / 4,
129 	[4] = SCX_CPUPERF_ONE * 4 / 4,
130 };
131 
132 /*
133  * Per-queue sequence numbers to implement core-sched ordering.
134  *
135  * Tail seq is assigned to each queued task and incremented. Head seq tracks the
136  * sequence number of the latest dispatched task. The distance between the a
137  * task's seq and the associated queue's head seq is called the queue distance
138  * and used when comparing two tasks for ordering. See qmap_core_sched_before().
139  */
140 
141 /*
142  * Per-task scheduling context. Allocated from the qa.task_ctxs[] slab in
143  * arena. While the task is alive the entry is referenced from task_ctx_stor;
144  * while it's free the entry sits on the free list singly-linked through
145  * @next_free.
146  *
147  * When the task is queued on one of the five priority FIFOs, @q_idx is the
148  * queue index and @q_next/@q_prev link it in the queue's doubly-linked list.
149  * @q_idx is -1 when the task isn't on any queue.
150  */
151 struct task_ctx {
152 	struct task_ctx __arena	*next_free;	/* only valid on free list */
153 	struct task_ctx __arena	*q_next;	/* queue link, NULL if tail */
154 	struct task_ctx __arena	*q_prev;	/* queue link, NULL if head */
155 	struct qmap_fifo __arena *fifo;		/* queue we're on, NULL if not queued */
156 	u64			tid;
157 	s32			pid;	/* for dump only */
158 	bool			force_local;	/* Dispatch directly to local_dsq */
159 	bool			highpri;
160 	u64			core_sched_seq;
161 	struct scx_cmask	cpus_allowed;	/* per-task affinity in cid space */
162 };
163 
164 /*
165  * Slab stride for task_ctx. cpus_allowed's flex array bits[] overlaps the
166  * tail bytes appended per entry; struct_size() gives the actual per-entry
167  * footprint.
168  */
169 #define TASK_CTX_STRIDE							\
170 	struct_size_t(struct task_ctx, cpus_allowed.bits,		\
171 		      CMASK_NR_WORDS(SCX_QMAP_MAX_CPUS))
172 
173 /* All task_ctx pointers are arena pointers. */
174 typedef struct task_ctx __arena task_ctx_t;
175 
176 /* Holds an arena pointer to the task's slab entry. */
177 struct task_ctx_stor_val {
178 	task_ctx_t		*taskc;
179 };
180 
181 struct {
182 	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
183 	__uint(map_flags, BPF_F_NO_PREALLOC);
184 	__type(key, int);
185 	__type(value, struct task_ctx_stor_val);
186 } task_ctx_stor SEC(".maps");
187 
188 /* Protects the task_ctx slab free list. */
189 __hidden struct bpf_res_spin_lock qa_task_lock SEC(".data.qa_task_lock");
190 
191 static int qmap_spin_lock(struct bpf_res_spin_lock *lock)
192 {
193 	if (bpf_res_spin_lock(lock)) {
194 		scx_bpf_error("res_spin_lock failed");
195 		return -EBUSY;
196 	}
197 	return 0;
198 }
199 
200 /*
201  * Try prev_cid, then scan taskc->cpus_allowed AND qa_idle_cids round-robin
202  * from prev_cid + 1. Atomic claim retries on race; bounded by
203  * IDLE_PICK_RETRIES to keep the verifier's insn budget in check.
204  */
205 #define IDLE_PICK_RETRIES	16
206 
207 static s32 pick_direct_dispatch_cid(struct task_struct *p, s32 prev_cid,
208 				    task_ctx_t *taskc)
209 {
210 	u32 nr_cids = scx_bpf_nr_cids();
211 	s32 cid;
212 	u32 i;
213 
214 	if (!always_enq_immed && p->nr_cpus_allowed == 1)
215 		return prev_cid;
216 
217 	if (cmask_test_and_clear(prev_cid, qa_idle_cids))
218 		return prev_cid;
219 
220 	cid = prev_cid;
221 	bpf_for(i, 0, IDLE_PICK_RETRIES) {
222 		cid = cmask_next_and_set_wrap(&taskc->cpus_allowed,
223 					      qa_idle_cids, cid + 1);
224 		barrier_var(cid);
225 		if (cid >= nr_cids)
226 			return -1;
227 		if (cmask_test_and_clear(cid, qa_idle_cids))
228 			return cid;
229 	}
230 	return -1;
231 }
232 
233 /*
234  * Force a reference to the arena map. The verifier associates an arena with
235  * a program by finding an LD_IMM64 instruction that loads the arena's BPF
236  * map; programs that only use arena pointers returned from task-local
237  * storage (like qmap_select_cpu) never reference @arena directly. Without
238  * this, the verifier rejects addr_space_cast with "addr_space_cast insn
239  * can only be used in a program that has an associated arena".
240  */
241 #define QMAP_TOUCH_ARENA() do { asm volatile("" :: "r"(&arena)); } while (0)
242 
243 static task_ctx_t *lookup_task_ctx(struct task_struct *p)
244 {
245 	struct task_ctx_stor_val *v;
246 
247 	QMAP_TOUCH_ARENA();
248 
249 	v = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
250 	if (!v || !v->taskc)
251 		return NULL;
252 	return v->taskc;
253 }
254 
255 /* Append @taskc to the tail of @fifo. Must not already be queued. */
256 static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo, task_ctx_t *taskc)
257 {
258 	struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
259 
260 	if (!lock || qmap_spin_lock(lock))
261 		return;
262 	taskc->fifo = fifo;
263 	taskc->q_next = NULL;
264 	taskc->q_prev = fifo->tail;
265 	if (fifo->tail)
266 		fifo->tail->q_next = taskc;
267 	else
268 		fifo->head = taskc;
269 	fifo->tail = taskc;
270 	bpf_res_spin_unlock(lock);
271 }
272 
273 /* Pop the head of @fifo. Returns NULL if empty. */
274 static task_ctx_t *qmap_fifo_pop(struct qmap_fifo __arena *fifo)
275 {
276 	struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
277 	task_ctx_t *taskc;
278 
279 	if (!lock || qmap_spin_lock(lock))
280 		return NULL;
281 	taskc = fifo->head;
282 	if (taskc) {
283 		fifo->head = taskc->q_next;
284 		if (taskc->q_next)
285 			taskc->q_next->q_prev = NULL;
286 		else
287 			fifo->tail = NULL;
288 		taskc->q_next = NULL;
289 		taskc->q_prev = NULL;
290 		taskc->fifo = NULL;
291 	}
292 	bpf_res_spin_unlock(lock);
293 	return taskc;
294 }
295 
296 /* Remove @taskc from its fifo. No-op if not queued. */
297 static void qmap_fifo_remove(task_ctx_t *taskc)
298 {
299 	struct qmap_fifo __arena *fifo = taskc->fifo;
300 	struct bpf_res_spin_lock *lock;
301 
302 	if (!fifo)
303 		return;
304 
305 	lock = qa_q_lock(fifo->idx);
306 	if (!lock || qmap_spin_lock(lock))
307 		return;
308 
309 	/* Re-check under lock — a concurrent pop may have cleared fifo. */
310 	if (taskc->fifo != fifo) {
311 		bpf_res_spin_unlock(lock);
312 		return;
313 	}
314 
315 	if (taskc->q_next)
316 		taskc->q_next->q_prev = taskc->q_prev;
317 	else
318 		fifo->tail = taskc->q_prev;
319 	if (taskc->q_prev)
320 		taskc->q_prev->q_next = taskc->q_next;
321 	else
322 		fifo->head = taskc->q_next;
323 	taskc->q_next = NULL;
324 	taskc->q_prev = NULL;
325 	taskc->fifo = NULL;
326 	bpf_res_spin_unlock(lock);
327 }
328 
329 s32 BPF_STRUCT_OPS(qmap_select_cid, struct task_struct *p,
330 		   s32 prev_cid, u64 wake_flags)
331 {
332 	task_ctx_t *taskc;
333 	s32 cid;
334 
335 	if (!(taskc = lookup_task_ctx(p)))
336 		return prev_cid;
337 
338 	if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
339 		return prev_cid;
340 
341 	cid = pick_direct_dispatch_cid(p, prev_cid, taskc);
342 
343 	if (cid >= 0) {
344 		taskc->force_local = true;
345 		return cid;
346 	} else {
347 		return prev_cid;
348 	}
349 }
350 
351 static int weight_to_idx(u32 weight)
352 {
353 	/* Coarsely map the compound weight to a FIFO. */
354 	if (weight <= 25)
355 		return 0;
356 	else if (weight <= 50)
357 		return 1;
358 	else if (weight < 200)
359 		return 2;
360 	else if (weight < 400)
361 		return 3;
362 	else
363 		return 4;
364 }
365 
366 void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
367 {
368 	static u32 user_cnt, kernel_cnt;
369 	task_ctx_t *taskc;
370 	int idx = weight_to_idx(p->scx.weight);
371 	s32 cid;
372 
373 	if (enq_flags & SCX_ENQ_REENQ) {
374 		__sync_fetch_and_add(&qa.nr_reenqueued, 1);
375 		if (scx_bpf_task_cid(p) == 0)
376 			__sync_fetch_and_add(&qa.nr_reenqueued_cid0, 1);
377 	}
378 
379 	if (p->flags & PF_KTHREAD) {
380 		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
381 			return;
382 	} else {
383 		if (stall_user_nth && !(++user_cnt % stall_user_nth))
384 			return;
385 	}
386 
387 	if (qa.test_error_cnt && !--qa.test_error_cnt)
388 		scx_bpf_error("test triggering error");
389 
390 	if (!(taskc = lookup_task_ctx(p)))
391 		return;
392 
393 	/*
394 	 * All enqueued tasks must have their core_sched_seq updated for correct
395 	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
396 	 */
397 	taskc->core_sched_seq = qa.core_sched_tail_seqs[idx]++;
398 
399 	/*
400 	 * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
401 	 * directly to prev_cpu's local DSQ even when busy to force dsq->nr > 1
402 	 * and exercise the kernel IMMED reenqueue trigger paths.
403 	 */
404 	if (immed_stress_nth && !(enq_flags & SCX_ENQ_REENQ)) {
405 		static u32 immed_stress_cnt;
406 
407 		if (!(++immed_stress_cnt % immed_stress_nth)) {
408 			taskc->force_local = false;
409 			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cid(p),
410 					   slice_ns, enq_flags);
411 			return;
412 		}
413 	}
414 
415 	/*
416 	 * If qmap_select_cid() is telling us to or this is the last runnable
417 	 * task on the CPU, enqueue locally.
418 	 */
419 	if (taskc->force_local) {
420 		taskc->force_local = false;
421 		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
422 		return;
423 	}
424 
425 	/* see lowpri_timerfn() */
426 	if (__COMPAT_has_generic_reenq() &&
427 	    p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) {
428 		scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags);
429 		return;
430 	}
431 
432 	/* if select_cid() wasn't called, try direct dispatch */
433 	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
434 	    (cid = pick_direct_dispatch_cid(p, scx_bpf_task_cid(p), taskc)) >= 0) {
435 		__sync_fetch_and_add(&qa.nr_ddsp_from_enq, 1);
436 		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cid, slice_ns, enq_flags);
437 		return;
438 	}
439 
440 	/*
441 	 * If the task was re-enqueued due to the CPU being preempted by a
442 	 * higher priority scheduling class, just re-enqueue the task directly
443 	 * on the global DSQ. As we want another CPU to pick it up, find and
444 	 * kick an idle cid.
445 	 */
446 	if (enq_flags & SCX_ENQ_REENQ) {
447 		s32 cid;
448 
449 		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
450 		cid = cmask_next_and_set_wrap(&taskc->cpus_allowed,
451 					      qa_idle_cids, 0);
452 		if (cid < scx_bpf_nr_cids())
453 			scx_bpf_kick_cid(cid, SCX_KICK_IDLE);
454 		return;
455 	}
456 
457 	/* Queue on the selected FIFO. */
458 	qmap_fifo_enqueue(&qa.fifos[idx], taskc);
459 
460 	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
461 		taskc->highpri = true;
462 		__sync_fetch_and_add(&qa.nr_highpri_queued, 1);
463 	}
464 	__sync_fetch_and_add(&qa.nr_enqueued, 1);
465 }
466 
467 void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
468 {
469 	task_ctx_t *taskc;
470 
471 	__sync_fetch_and_add(&qa.nr_dequeued, 1);
472 	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
473 		__sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
474 
475 	taskc = lookup_task_ctx(p);
476 	if (taskc && taskc->fifo) {
477 		if (taskc->highpri)
478 			__sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
479 		qmap_fifo_remove(taskc);
480 	}
481 }
482 
483 static void update_core_sched_head_seq(struct task_struct *p)
484 {
485 	int idx = weight_to_idx(p->scx.weight);
486 	task_ctx_t *taskc;
487 
488 	if ((taskc = lookup_task_ctx(p)))
489 		qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
490 }
491 
492 /*
493  * To demonstrate the use of scx_bpf_dsq_move(), implement silly selective
494  * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
495  * moving them to HIGHPRI_DSQ and then consuming them first. This makes minor
496  * difference only when dsp_batch is larger than 1.
497  *
498  * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
499  * non-rq-lock holding BPF programs. As demonstration, this function is called
500  * from qmap_dispatch() and monitor_timerfn().
501  */
502 static bool dispatch_highpri(bool from_timer)
503 {
504 	struct task_struct *p;
505 	s32 this_cid = scx_bpf_this_cid();
506 	u32 nr_cids = scx_bpf_nr_cids();
507 
508 	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
509 	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
510 		static u64 highpri_seq;
511 		task_ctx_t *taskc;
512 
513 		if (!(taskc = lookup_task_ctx(p)))
514 			return false;
515 
516 		if (taskc->highpri) {
517 			/* exercise the set_*() and vtime interface too */
518 			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
519 			scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
520 			scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
521 		}
522 	}
523 
524 	/*
525 	 * Scan HIGHPRI_DSQ and dispatch until a task that can run here is
526 	 * found. Prefer this_cid if the task allows it; otherwise RR-scan the
527 	 * task's cpus_allowed starting after this_cid.
528 	 */
529 	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
530 		task_ctx_t *taskc;
531 		bool dispatched = false;
532 		s32 cid;
533 
534 		if (!(taskc = lookup_task_ctx(p)))
535 			return false;
536 
537 		if (cmask_test(this_cid, &taskc->cpus_allowed))
538 			cid = this_cid;
539 		else
540 			cid = cmask_next_set_wrap(&taskc->cpus_allowed,
541 						  this_cid + 1);
542 		if (cid >= nr_cids)
543 			continue;
544 
545 		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cid,
546 				     SCX_ENQ_PREEMPT)) {
547 			if (cid == this_cid) {
548 				dispatched = true;
549 				__sync_fetch_and_add(&qa.nr_expedited_local, 1);
550 			} else {
551 				__sync_fetch_and_add(&qa.nr_expedited_remote, 1);
552 			}
553 			if (from_timer)
554 				__sync_fetch_and_add(&qa.nr_expedited_from_timer, 1);
555 		} else {
556 			__sync_fetch_and_add(&qa.nr_expedited_lost, 1);
557 		}
558 
559 		if (dispatched)
560 			return true;
561 	}
562 
563 	return false;
564 }
565 
566 void BPF_STRUCT_OPS(qmap_dispatch, s32 cid, struct task_struct *prev)
567 {
568 	struct task_struct *p;
569 	struct cpu_ctx __arena *cpuc;
570 	task_ctx_t *taskc;
571 	u32 batch = dsp_batch ?: 1;
572 	s32 i;
573 
574 	if (dispatch_highpri(false))
575 		return;
576 
577 	if (!qa.nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
578 		return;
579 
580 	if (dsp_inf_loop_after && qa.nr_dispatched > dsp_inf_loop_after) {
581 		/*
582 		 * PID 2 should be kthreadd which should mostly be idle and off
583 		 * the scheduler. Let's keep dispatching it to force the kernel
584 		 * to call this function over and over again.
585 		 */
586 		p = bpf_task_from_pid(2);
587 		if (p) {
588 			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
589 			bpf_task_release(p);
590 			return;
591 		}
592 	}
593 
594 	cpuc = &qa.cpu_ctxs[scx_bpf_this_cid()];
595 
596 	for (i = 0; i < 5; i++) {
597 		/* Advance the dispatch cursor and pick the fifo. */
598 		if (!cpuc->dsp_cnt) {
599 			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
600 			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
601 		}
602 
603 		/* Dispatch or advance. */
604 		bpf_repeat(BPF_MAX_LOOPS) {
605 			task_ctx_t *taskc;
606 
607 			taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]);
608 			if (!taskc)
609 				break;
610 
611 			p = scx_bpf_tid_to_task(taskc->tid);
612 			if (!p)
613 				continue;
614 
615 			if (taskc->highpri)
616 				__sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
617 
618 			update_core_sched_head_seq(p);
619 			__sync_fetch_and_add(&qa.nr_dispatched, 1);
620 
621 			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
622 
623 			/*
624 			 * scx_qmap uses a global BPF queue that any CPU's
625 			 * dispatch can pop from. If this CPU popped a task that
626 			 * can't run here, it gets stranded on SHARED_DSQ after
627 			 * consume_dispatch_q() skips it. Kick the task's home
628 			 * CPU so it drains SHARED_DSQ.
629 			 *
630 			 * There's a race between the pop and the flush of the
631 			 * buffered dsq_insert:
632 			 *
633 			 *  CPU 0 (dispatching)      CPU 1 (home, idle)
634 			 *  ~~~~~~~~~~~~~~~~~~~      ~~~~~~~~~~~~~~~~~~~
635 			 *  pop from BPF queue
636 			 *  dsq_insert(buffered)
637 			 *                           balance:
638 			 *                             SHARED_DSQ empty
639 			 *                             BPF queue empty
640 			 *                             -> goes idle
641 			 *  flush -> on SHARED
642 			 *  kick CPU 1
643 			 *                           wakes, drains task
644 			 *
645 			 * The kick prevents indefinite stalls but a per-CPU
646 			 * kthread like ksoftirqd can be briefly stranded when
647 			 * its home CPU enters idle with softirq pending,
648 			 * triggering:
649 			 *
650 			 *  "NOHZ tick-stop error: local softirq work is pending, handler #N!!!"
651 			 *
652 			 * from report_idle_softirq(). The kick lands shortly
653 			 * after and the home CPU drains the task. This could be
654 			 * avoided by e.g. dispatching pinned tasks to local or
655 			 * global DSQs, but the current code is left as-is to
656 			 * document this class of issue -- other schedulers
657 			 * seeing similar warnings can use this as a reference.
658 			 */
659 			if (!cmask_test(cid, &taskc->cpus_allowed))
660 				scx_bpf_kick_cid(scx_bpf_task_cid(p), 0);
661 
662 			batch--;
663 			cpuc->dsp_cnt--;
664 			if (!batch || !scx_bpf_dispatch_nr_slots()) {
665 				if (dispatch_highpri(false))
666 					return;
667 				scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
668 				return;
669 			}
670 			if (!cpuc->dsp_cnt)
671 				break;
672 		}
673 
674 		cpuc->dsp_cnt = 0;
675 	}
676 
677 	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
678 		if (qa.sub_sched_cgroup_ids[i] &&
679 		    scx_bpf_sub_dispatch(qa.sub_sched_cgroup_ids[i]))
680 			return;
681 	}
682 
683 	/*
684 	 * No other tasks. @prev will keep running. Update its core_sched_seq as
685 	 * if the task were enqueued and dispatched immediately.
686 	 */
687 	if (prev) {
688 		taskc = lookup_task_ctx(prev);
689 		if (!taskc)
690 			return;
691 
692 		taskc->core_sched_seq =
693 			qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
694 	}
695 }
696 
697 void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
698 {
699 	struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[scx_bpf_this_cid()];
700 	int idx;
701 
702 	/*
703 	 * Use the running avg of weights to select the target cpuperf level.
704 	 * This is a demonstration of the cpuperf feature rather than a
705 	 * practical strategy to regulate CPU frequency.
706 	 */
707 	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
708 	idx = weight_to_idx(cpuc->avg_weight);
709 	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
710 
711 	scx_bpf_cidperf_set(scx_bpf_task_cid(p), cpuc->cpuperf_target);
712 }
713 
714 /*
715  * The distance from the head of the queue scaled by the weight of the queue.
716  * The lower the number, the older the task and the higher the priority.
717  */
718 static s64 task_qdist(struct task_struct *p)
719 {
720 	int idx = weight_to_idx(p->scx.weight);
721 	task_ctx_t *taskc;
722 	s64 qdist;
723 
724 	taskc = lookup_task_ctx(p);
725 	if (!taskc)
726 		return 0;
727 
728 	qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
729 
730 	/*
731 	 * As queue index increments, the priority doubles. The queue w/ index 3
732 	 * is dispatched twice more frequently than 2. Reflect the difference by
733 	 * scaling qdists accordingly. Note that the shift amount needs to be
734 	 * flipped depending on the sign to avoid flipping priority direction.
735 	 */
736 	if (qdist >= 0)
737 		return qdist << (4 - idx);
738 	else
739 		return qdist << idx;
740 }
741 
742 /*
743  * This is called to determine the task ordering when core-sched is picking
744  * tasks to execute on SMT siblings and should encode about the same ordering as
745  * the regular scheduling path. Use the priority-scaled distances from the head
746  * of the queues to compare the two tasks which should be consistent with the
747  * dispatch path behavior.
748  */
749 bool BPF_STRUCT_OPS(qmap_core_sched_before,
750 		    struct task_struct *a, struct task_struct *b)
751 {
752 	return task_qdist(a) > task_qdist(b);
753 }
754 
755 /*
756  * sched_switch tracepoint and cpu_release handlers are no longer needed.
757  * With SCX_OPS_ALWAYS_ENQ_IMMED, wakeup_preempt_scx() reenqueues IMMED
758  * tasks when a higher-priority scheduling class takes the CPU.
759  */
760 
761 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
762 			     struct scx_init_task_args *args)
763 {
764 	struct task_ctx_stor_val *v;
765 	task_ctx_t *taskc;
766 
767 	if (p->tgid == disallow_tgid)
768 		p->scx.disallow = true;
769 
770 	/* pop a slab entry off the free list */
771 	if (qmap_spin_lock(&qa_task_lock))
772 		return -EBUSY;
773 	taskc = qa.task_free_head;
774 	if (taskc)
775 		qa.task_free_head = taskc->next_free;
776 	bpf_res_spin_unlock(&qa_task_lock);
777 	if (!taskc) {
778 		scx_bpf_error("task_ctx slab exhausted (max_tasks=%u)", max_tasks);
779 		return -ENOMEM;
780 	}
781 
782 	taskc->next_free = NULL;
783 	taskc->q_next = NULL;
784 	taskc->q_prev = NULL;
785 	taskc->fifo = NULL;
786 	taskc->tid = p->scx.tid;
787 	taskc->pid = p->pid;
788 	taskc->force_local = false;
789 	taskc->highpri = false;
790 	taskc->core_sched_seq = 0;
791 	cmask_init(&taskc->cpus_allowed, 0, scx_bpf_nr_cids());
792 	bpf_rcu_read_lock();
793 	cmask_from_cpumask(&taskc->cpus_allowed, p->cpus_ptr);
794 	bpf_rcu_read_unlock();
795 
796 	v = bpf_task_storage_get(&task_ctx_stor, p, NULL,
797 				 BPF_LOCAL_STORAGE_GET_F_CREATE);
798 	if (!v) {
799 		/* push back to the free list */
800 		if (!qmap_spin_lock(&qa_task_lock)) {
801 			taskc->next_free = qa.task_free_head;
802 			qa.task_free_head = taskc;
803 			bpf_res_spin_unlock(&qa_task_lock);
804 		}
805 		return -ENOMEM;
806 	}
807 	v->taskc = taskc;
808 	return 0;
809 }
810 
811 void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
812 		    struct scx_exit_task_args *args)
813 {
814 	struct task_ctx_stor_val *v;
815 	task_ctx_t *taskc;
816 
817 	v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
818 	if (!v || !v->taskc)
819 		return;
820 	taskc = v->taskc;
821 	v->taskc = NULL;
822 
823 	if (qmap_spin_lock(&qa_task_lock))
824 		return;
825 	taskc->next_free = qa.task_free_head;
826 	qa.task_free_head = taskc;
827 	bpf_res_spin_unlock(&qa_task_lock);
828 }
829 
830 void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
831 {
832 	task_ctx_t *taskc;
833 	s32 i;
834 
835 	QMAP_TOUCH_ARENA();
836 
837 	if (suppress_dump)
838 		return;
839 
840 	/*
841 	 * Walk the queue lists without locking - kfunc calls (scx_bpf_dump)
842 	 * aren't in the verifier's kfunc_spin_allowed() list so we can't hold
843 	 * a lock and dump. Best-effort; racing may print stale tids but the
844 	 * walk is bounded by bpf_repeat() so it always terminates.
845 	 */
846 	bpf_for(i, 0, 5) {
847 		scx_bpf_dump("QMAP FIFO[%d]:", i);
848 		taskc = qa.fifos[i].head;
849 		bpf_repeat(4096) {
850 			if (!taskc)
851 				break;
852 			scx_bpf_dump(" %d:%llu", taskc->pid, taskc->tid);
853 			taskc = taskc->q_next;
854 		}
855 		scx_bpf_dump("\n");
856 	}
857 }
858 
859 void BPF_STRUCT_OPS(qmap_dump_cid, struct scx_dump_ctx *dctx, s32 cid, bool idle)
860 {
861 	struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cid];
862 
863 	if (suppress_dump || idle)
864 		return;
865 
866 	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
867 		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
868 		     cpuc->cpuperf_target);
869 }
870 
871 void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
872 {
873 	struct task_ctx_stor_val *v;
874 	task_ctx_t *taskc;
875 
876 	QMAP_TOUCH_ARENA();
877 
878 	if (suppress_dump)
879 		return;
880 	v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
881 	if (!v || !v->taskc)
882 		return;
883 	taskc = v->taskc;
884 
885 	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
886 		     taskc->force_local, taskc->core_sched_seq);
887 }
888 
889 s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
890 {
891 	if (print_msgs)
892 		bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
893 			   cgrp->kn->id, args->weight, args->bw_period_us,
894 			   args->bw_quota_us, args->bw_burst_us);
895 	return 0;
896 }
897 
898 void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
899 {
900 	if (print_msgs)
901 		bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
902 }
903 
904 void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
905 		    u64 period_us, u64 quota_us, u64 burst_us)
906 {
907 	if (print_msgs)
908 		bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
909 			   cgrp->kn->id, period_us, quota_us, burst_us);
910 }
911 
912 void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
913 {
914 	QMAP_TOUCH_ARENA();
915 	if (idle)
916 		cmask_set(cid, qa_idle_cids);
917 	else
918 		cmask_clear(cid, qa_idle_cids);
919 }
920 
921 void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
922 		    const struct scx_cmask *cmask_in)
923 {
924 	struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in;
925 	task_ctx_t *taskc;
926 
927 	taskc = lookup_task_ctx(p);
928 	if (!taskc)
929 		return;
930 	cmask_copy(&taskc->cpus_allowed, cmask);
931 }
932 
933 struct monitor_timer {
934 	struct bpf_timer timer;
935 };
936 
937 struct {
938 	__uint(type, BPF_MAP_TYPE_ARRAY);
939 	__uint(max_entries, 1);
940 	__type(key, u32);
941 	__type(value, struct monitor_timer);
942 } monitor_timer SEC(".maps");
943 
944 /*
945  * Aggregate cidperf across the first nr_online_cids cids. Post-hotplug
946  * the first-N-are-online invariant drifts, so some cap/cur values may
947  * be stale. For this demo monitor that's fine; the scheduler exits on
948  * the enable-time hotplug_seq mismatch and userspace restarts, which
949  * rebuilds the layout.
950  */
951 static void monitor_cpuperf(void)
952 {
953 	u32 nr_online = scx_bpf_nr_online_cids();
954 	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
955 	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
956 	s32 cid;
957 
958 	QMAP_TOUCH_ARENA();
959 
960 	bpf_for(cid, 0, nr_online) {
961 		struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cid];
962 		u32 cap = scx_bpf_cidperf_cap(cid);
963 		u32 cur = scx_bpf_cidperf_cur(cid);
964 		u32 target;
965 
966 		cur_min = cur < cur_min ? cur : cur_min;
967 		cur_max = cur > cur_max ? cur : cur_max;
968 
969 		cur_sum += (u64)cur * cap / SCX_CPUPERF_ONE;
970 		cap_sum += cap;
971 
972 		target = cpuc->cpuperf_target;
973 		target_sum += target;
974 		target_min = target < target_min ? target : target_min;
975 		target_max = target > target_max ? target : target_max;
976 	}
977 
978 	if (!nr_online || !cap_sum)
979 		return;
980 
981 	qa.cpuperf_min = cur_min;
982 	qa.cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
983 	qa.cpuperf_max = cur_max;
984 
985 	qa.cpuperf_target_min = target_min;
986 	qa.cpuperf_target_avg = target_sum / nr_online;
987 	qa.cpuperf_target_max = target_max;
988 }
989 
990 /*
991  * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
992  * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
993  * see meaningful dumps in the trace pipe.
994  */
995 static void dump_shared_dsq(void)
996 {
997 	struct task_struct *p;
998 	s32 nr;
999 
1000 	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
1001 		return;
1002 
1003 	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);
1004 
1005 	bpf_rcu_read_lock();
1006 	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
1007 		bpf_printk("%s[%d]", p->comm, p->pid);
1008 	bpf_rcu_read_unlock();
1009 }
1010 
1011 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
1012 {
1013 	bpf_rcu_read_lock();
1014 	dispatch_highpri(true);
1015 	bpf_rcu_read_unlock();
1016 
1017 	monitor_cpuperf();
1018 
1019 	if (print_dsqs_and_events) {
1020 		struct scx_event_stats events;
1021 
1022 		dump_shared_dsq();
1023 
1024 		__COMPAT_scx_bpf_events(&events, sizeof(events));
1025 
1026 		bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
1027 			   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
1028 		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
1029 			   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
1030 		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
1031 			   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
1032 		bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
1033 			   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
1034 		bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
1035 			   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
1036 		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
1037 			   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
1038 		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
1039 			   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
1040 		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
1041 			   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
1042 	}
1043 
1044 	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
1045 	return 0;
1046 }
1047 
1048 struct lowpri_timer {
1049 	struct bpf_timer timer;
1050 };
1051 
1052 struct {
1053 	__uint(type, BPF_MAP_TYPE_ARRAY);
1054 	__uint(max_entries, 1);
1055 	__type(key, u32);
1056 	__type(value, struct lowpri_timer);
1057 } lowpri_timer SEC(".maps");
1058 
1059 /*
1060  * Nice 19 tasks are put into the lowpri DSQ. Every 10ms, reenq is triggered and
1061  * the tasks are transferred to SHARED_DSQ.
1062  */
1063 static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
1064 {
1065 	scx_bpf_dsq_reenq(LOWPRI_DSQ, 0);
1066 	bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
1067 	return 0;
1068 }
1069 
1070 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
1071 {
1072 	u8 __arena *slab;
1073 	u32 nr_pages, key = 0, i;
1074 	u32 nr_cids, nr_cpu_ids;
1075 	struct bpf_timer *timer;
1076 	s32 ret;
1077 
1078 	nr_cids = scx_bpf_nr_cids();
1079 	nr_cpu_ids = scx_bpf_nr_cpu_ids();
1080 
1081 	if (nr_cids > SCX_QMAP_MAX_CPUS) {
1082 		scx_bpf_error("nr_cids=%u exceeds SCX_QMAP_MAX_CPUS=%d",
1083 			      nr_cids, SCX_QMAP_MAX_CPUS);
1084 		return -EINVAL;
1085 	}
1086 	if (nr_cpu_ids > SCX_QMAP_MAX_CPUS) {
1087 		scx_bpf_error("nr_cpu_ids=%u exceeds SCX_QMAP_MAX_CPUS=%d",
1088 			      nr_cpu_ids, SCX_QMAP_MAX_CPUS);
1089 		return -EINVAL;
1090 	}
1091 
1092 	/*
1093 	 * cid-override test hook. Must run before anything that reads the
1094 	 * cid space (scx_bpf_nr_cids, cmask_init, etc.). On invalid input,
1095 	 * the kfunc calls scx_error() which aborts the scheduler.
1096 	 */
1097 	if (cid_override_mode) {
1098 		scx_bpf_cid_override((const s32 *)cid_override_cpu_to_cid,
1099 				     nr_cpu_ids * sizeof(s32));
1100 	}
1101 
1102 	/*
1103 	 * Allocate the task_ctx slab in arena and thread the entire slab onto
1104 	 * the free list. max_tasks is set by userspace before load. Each entry
1105 	 * is TASK_CTX_STRIDE bytes - task_ctx's trailing cpus_allowed flex
1106 	 * array extends into the stride tail.
1107 	 */
1108 	if (!max_tasks) {
1109 		scx_bpf_error("max_tasks must be > 0");
1110 		return -EINVAL;
1111 	}
1112 
1113 	nr_pages = (max_tasks * TASK_CTX_STRIDE + PAGE_SIZE - 1) / PAGE_SIZE;
1114 	slab = bpf_arena_alloc_pages(&arena, NULL, nr_pages, NUMA_NO_NODE, 0);
1115 	if (!slab) {
1116 		scx_bpf_error("failed to allocate task_ctx slab");
1117 		return -ENOMEM;
1118 	}
1119 	qa.task_ctxs = (task_ctx_t *)slab;
1120 
1121 	bpf_for(i, 0, 5)
1122 		qa.fifos[i].idx = i;
1123 
1124 	bpf_for(i, 0, max_tasks) {
1125 		task_ctx_t *cur = (task_ctx_t *)(slab + i * TASK_CTX_STRIDE);
1126 		task_ctx_t *next = (i + 1 < max_tasks) ?
1127 			(task_ctx_t *)(slab + (i + 1) * TASK_CTX_STRIDE) : NULL;
1128 		cur->next_free = next;
1129 	}
1130 	qa.task_free_head = (task_ctx_t *)slab;
1131 
1132 	/*
1133 	 * Allocate and initialize the idle cmask. Starts empty - update_idle
1134 	 * fills it as cpus enter idle.
1135 	 */
1136 	qa_idle_cids = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
1137 	if (!qa_idle_cids) {
1138 		scx_bpf_error("failed to allocate idle cmask");
1139 		return -ENOMEM;
1140 	}
1141 	cmask_init(qa_idle_cids, 0, nr_cids);
1142 
1143 	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
1144 	if (ret) {
1145 		scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
1146 		return ret;
1147 	}
1148 
1149 	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
1150 	if (ret) {
1151 		scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret);
1152 		return ret;
1153 	}
1154 
1155 	ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1);
1156 	if (ret)
1157 		return ret;
1158 
1159 	timer = bpf_map_lookup_elem(&monitor_timer, &key);
1160 	if (!timer)
1161 		return -ESRCH;
1162 	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
1163 	bpf_timer_set_callback(timer, monitor_timerfn);
1164 	ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
1165 	if (ret)
1166 		return ret;
1167 
1168 	if (__COMPAT_has_generic_reenq()) {
1169 		/* see lowpri_timerfn() */
1170 		timer = bpf_map_lookup_elem(&lowpri_timer, &key);
1171 		if (!timer)
1172 			return -ESRCH;
1173 		bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC);
1174 		bpf_timer_set_callback(timer, lowpri_timerfn);
1175 		ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
1176 		if (ret)
1177 			return ret;
1178 	}
1179 
1180 	return 0;
1181 }
1182 
1183 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
1184 {
1185 	UEI_RECORD(uei, ei);
1186 }
1187 
1188 s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
1189 {
1190 	s32 i;
1191 
1192 	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
1193 		if (!qa.sub_sched_cgroup_ids[i]) {
1194 			qa.sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
1195 			bpf_printk("attaching sub-sched[%d] on %s",
1196 				   i, args->cgroup_path);
1197 			return 0;
1198 		}
1199 	}
1200 
1201 	return -ENOSPC;
1202 }
1203 
1204 void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
1205 {
1206 	s32 i;
1207 
1208 	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
1209 		if (qa.sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
1210 			qa.sub_sched_cgroup_ids[i] = 0;
1211 			bpf_printk("detaching sub-sched[%d] on %s",
1212 				   i, args->cgroup_path);
1213 			break;
1214 		}
1215 	}
1216 }
1217 
1218 SCX_OPS_CID_DEFINE(qmap_ops,
1219 	       .flags			= SCX_OPS_ENQ_EXITING | SCX_OPS_TID_TO_TASK,
1220 	       .select_cid		= (void *)qmap_select_cid,
1221 	       .enqueue			= (void *)qmap_enqueue,
1222 	       .dequeue			= (void *)qmap_dequeue,
1223 	       .dispatch		= (void *)qmap_dispatch,
1224 	       .tick			= (void *)qmap_tick,
1225 	       .core_sched_before	= (void *)qmap_core_sched_before,
1226 	       .set_cmask		= (void *)qmap_set_cmask,
1227 	       .update_idle		= (void *)qmap_update_idle,
1228 	       .init_task		= (void *)qmap_init_task,
1229 	       .exit_task		= (void *)qmap_exit_task,
1230 	       .dump			= (void *)qmap_dump,
1231 	       .dump_cid		= (void *)qmap_dump_cid,
1232 	       .dump_task		= (void *)qmap_dump_task,
1233 	       .cgroup_init		= (void *)qmap_cgroup_init,
1234 	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
1235 	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
1236 	       .sub_attach		= (void *)qmap_sub_attach,
1237 	       .sub_detach		= (void *)qmap_sub_detach,
1238 	       .init			= (void *)qmap_init,
1239 	       .exit			= (void *)qmap_exit,
1240 	       .timeout_ms		= 5000U,
1241 	       .name			= "qmap");
1242