xref: /linux/tools/sched_ext/scx_qmap.bpf.c (revision 245254f7081dbe1c8da54675d0e4ddbe74cee61b)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * A simple five-level FIFO queue scheduler.
4  *
5  * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
6  * assigned to one depending on its compound weight. Each CPU round robins
7  * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
8  * queue0, 2 from queue1, 4 from queue2 and so on.
9  *
10  * This scheduler demonstrates:
11  *
12  * - BPF-side queueing using PIDs.
13  * - Sleepable per-task storage allocation using ops.prep_enable().
14  * - Using ops.cpu_release() to handle a higher priority scheduling class taking
15  *   the CPU away.
16  *
17  * This scheduler is primarily for demonstration and testing of sched_ext
18  * features and unlikely to be useful for actual workloads.
19  *
20  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
21  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
22  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
23  */
24 #include <scx/common.bpf.h>
25 
26 enum consts {
27 	ONE_SEC_IN_NS		= 1000000000,
28 	SHARED_DSQ		= 0,
29 };
30 
31 char _license[] SEC("license") = "GPL";
32 
33 const volatile u64 slice_ns = SCX_SLICE_DFL;
34 const volatile u32 stall_user_nth;
35 const volatile u32 stall_kernel_nth;
36 const volatile u32 dsp_inf_loop_after;
37 const volatile u32 dsp_batch;
38 const volatile s32 disallow_tgid;
39 const volatile bool suppress_dump;
40 
41 u32 test_error_cnt;
42 
43 UEI_DEFINE(uei);
44 
45 struct qmap {
46 	__uint(type, BPF_MAP_TYPE_QUEUE);
47 	__uint(max_entries, 4096);
48 	__type(value, u32);
49 } queue0 SEC(".maps"),
50   queue1 SEC(".maps"),
51   queue2 SEC(".maps"),
52   queue3 SEC(".maps"),
53   queue4 SEC(".maps");
54 
55 struct {
56 	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
57 	__uint(max_entries, 5);
58 	__type(key, int);
59 	__array(values, struct qmap);
60 } queue_arr SEC(".maps") = {
61 	.values = {
62 		[0] = &queue0,
63 		[1] = &queue1,
64 		[2] = &queue2,
65 		[3] = &queue3,
66 		[4] = &queue4,
67 	},
68 };
69 
70 /* Per-task scheduling context */
71 struct task_ctx {
72 	bool	force_local;	/* Dispatch directly to local_dsq */
73 };
74 
75 struct {
76 	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
77 	__uint(map_flags, BPF_F_NO_PREALLOC);
78 	__type(key, int);
79 	__type(value, struct task_ctx);
80 } task_ctx_stor SEC(".maps");
81 
82 struct cpu_ctx {
83 	u64	dsp_idx;	/* dispatch index */
84 	u64	dsp_cnt;	/* remaining count */
85 };
86 
87 struct {
88 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
89 	__uint(max_entries, 1);
90 	__type(key, u32);
91 	__type(value, struct cpu_ctx);
92 } cpu_ctx_stor SEC(".maps");
93 
94 /* Statistics */
95 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
96 
97 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
98 		   s32 prev_cpu, u64 wake_flags)
99 {
100 	struct task_ctx *tctx;
101 	s32 cpu;
102 
103 	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
104 	if (!tctx) {
105 		scx_bpf_error("task_ctx lookup failed");
106 		return -ESRCH;
107 	}
108 
109 	if (p->nr_cpus_allowed == 1 ||
110 	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
111 		tctx->force_local = true;
112 		return prev_cpu;
113 	}
114 
115 	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
116 	if (cpu >= 0)
117 		return cpu;
118 
119 	return prev_cpu;
120 }
121 
122 static int weight_to_idx(u32 weight)
123 {
124 	/* Coarsely map the compound weight to a FIFO. */
125 	if (weight <= 25)
126 		return 0;
127 	else if (weight <= 50)
128 		return 1;
129 	else if (weight < 200)
130 		return 2;
131 	else if (weight < 400)
132 		return 3;
133 	else
134 		return 4;
135 }
136 
137 void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
138 {
139 	static u32 user_cnt, kernel_cnt;
140 	struct task_ctx *tctx;
141 	u32 pid = p->pid;
142 	int idx = weight_to_idx(p->scx.weight);
143 	void *ring;
144 
145 	if (p->flags & PF_KTHREAD) {
146 		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
147 			return;
148 	} else {
149 		if (stall_user_nth && !(++user_cnt % stall_user_nth))
150 			return;
151 	}
152 
153 	if (test_error_cnt && !--test_error_cnt)
154 		scx_bpf_error("test triggering error");
155 
156 	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
157 	if (!tctx) {
158 		scx_bpf_error("task_ctx lookup failed");
159 		return;
160 	}
161 
162 	/* Is select_cpu() is telling us to enqueue locally? */
163 	if (tctx->force_local) {
164 		tctx->force_local = false;
165 		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
166 		return;
167 	}
168 
169 	/*
170 	 * If the task was re-enqueued due to the CPU being preempted by a
171 	 * higher priority scheduling class, just re-enqueue the task directly
172 	 * on the global DSQ. As we want another CPU to pick it up, find and
173 	 * kick an idle CPU.
174 	 */
175 	if (enq_flags & SCX_ENQ_REENQ) {
176 		s32 cpu;
177 
178 		scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags);
179 		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
180 		if (cpu >= 0)
181 			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
182 		return;
183 	}
184 
185 	ring = bpf_map_lookup_elem(&queue_arr, &idx);
186 	if (!ring) {
187 		scx_bpf_error("failed to find ring %d", idx);
188 		return;
189 	}
190 
191 	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
192 	if (bpf_map_push_elem(ring, &pid, 0)) {
193 		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
194 		return;
195 	}
196 
197 	__sync_fetch_and_add(&nr_enqueued, 1);
198 }
199 
200 /*
201  * The BPF queue map doesn't support removal and sched_ext can handle spurious
202  * dispatches. qmap_dequeue() is only used to collect statistics.
203  */
204 void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
205 {
206 	__sync_fetch_and_add(&nr_dequeued, 1);
207 }
208 
209 void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
210 {
211 	struct task_struct *p;
212 	struct cpu_ctx *cpuc;
213 	u32 zero = 0, batch = dsp_batch ?: 1;
214 	void *fifo;
215 	s32 i, pid;
216 
217 	if (scx_bpf_consume(SHARED_DSQ))
218 		return;
219 
220 	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
221 		/*
222 		 * PID 2 should be kthreadd which should mostly be idle and off
223 		 * the scheduler. Let's keep dispatching it to force the kernel
224 		 * to call this function over and over again.
225 		 */
226 		p = bpf_task_from_pid(2);
227 		if (p) {
228 			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
229 			bpf_task_release(p);
230 			return;
231 		}
232 	}
233 
234 	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
235 		scx_bpf_error("failed to look up cpu_ctx");
236 		return;
237 	}
238 
239 	for (i = 0; i < 5; i++) {
240 		/* Advance the dispatch cursor and pick the fifo. */
241 		if (!cpuc->dsp_cnt) {
242 			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
243 			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
244 		}
245 
246 		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
247 		if (!fifo) {
248 			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
249 			return;
250 		}
251 
252 		/* Dispatch or advance. */
253 		bpf_repeat(BPF_MAX_LOOPS) {
254 			if (bpf_map_pop_elem(fifo, &pid))
255 				break;
256 
257 			p = bpf_task_from_pid(pid);
258 			if (!p)
259 				continue;
260 
261 			__sync_fetch_and_add(&nr_dispatched, 1);
262 			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
263 			bpf_task_release(p);
264 			batch--;
265 			cpuc->dsp_cnt--;
266 			if (!batch || !scx_bpf_dispatch_nr_slots()) {
267 				scx_bpf_consume(SHARED_DSQ);
268 				return;
269 			}
270 			if (!cpuc->dsp_cnt)
271 				break;
272 		}
273 
274 		cpuc->dsp_cnt = 0;
275 	}
276 }
277 
278 void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
279 {
280 	u32 cnt;
281 
282 	/*
283 	 * Called when @cpu is taken by a higher priority scheduling class. This
284 	 * makes @cpu no longer available for executing sched_ext tasks. As we
285 	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
286 	 * becomes available again, re-enqueue them into the global dsq. See
287 	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
288 	 */
289 	cnt = scx_bpf_reenqueue_local();
290 	if (cnt)
291 		__sync_fetch_and_add(&nr_reenqueued, cnt);
292 }
293 
294 s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
295 		   struct scx_init_task_args *args)
296 {
297 	if (p->tgid == disallow_tgid)
298 		p->scx.disallow = true;
299 
300 	/*
301 	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
302 	 * in this function and the following will automatically use GFP_KERNEL.
303 	 */
304 	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
305 				 BPF_LOCAL_STORAGE_GET_F_CREATE))
306 		return 0;
307 	else
308 		return -ENOMEM;
309 }
310 
311 void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
312 {
313 	s32 i, pid;
314 
315 	if (suppress_dump)
316 		return;
317 
318 	bpf_for(i, 0, 5) {
319 		void *fifo;
320 
321 		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
322 			return;
323 
324 		scx_bpf_dump("QMAP FIFO[%d]:", i);
325 		bpf_repeat(4096) {
326 			if (bpf_map_pop_elem(fifo, &pid))
327 				break;
328 			scx_bpf_dump(" %d", pid);
329 		}
330 		scx_bpf_dump("\n");
331 	}
332 }
333 
334 void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
335 {
336 	u32 zero = 0;
337 	struct cpu_ctx *cpuc;
338 
339 	if (suppress_dump || idle)
340 		return;
341 	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
342 		return;
343 
344 	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu",
345 		     cpuc->dsp_idx, cpuc->dsp_cnt);
346 }
347 
348 void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
349 {
350 	struct task_ctx *taskc;
351 
352 	if (suppress_dump)
353 		return;
354 	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
355 		return;
356 
357 	scx_bpf_dump("QMAP: force_local=%d",
358 		     taskc->force_local);
359 }
360 
361 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
362 {
363 	return scx_bpf_create_dsq(SHARED_DSQ, -1);
364 }
365 
366 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
367 {
368 	UEI_RECORD(uei, ei);
369 }
370 
371 SCX_OPS_DEFINE(qmap_ops,
372 	       .select_cpu		= (void *)qmap_select_cpu,
373 	       .enqueue			= (void *)qmap_enqueue,
374 	       .dequeue			= (void *)qmap_dequeue,
375 	       .dispatch		= (void *)qmap_dispatch,
376 	       .cpu_release		= (void *)qmap_cpu_release,
377 	       .init_task		= (void *)qmap_init_task,
378 	       .dump			= (void *)qmap_dump,
379 	       .dump_cpu		= (void *)qmap_dump_cpu,
380 	       .dump_task		= (void *)qmap_dump_task,
381 	       .init			= (void *)qmap_init,
382 	       .exit			= (void *)qmap_exit,
383 	       .timeout_ms		= 5000U,
384 	       .name			= "qmap");
385