/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round-robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and is unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS		= 1000000000,
	SHARED_DSQ		= 0,
};

char _license[] SEC("license") = "GPL";

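/* Scheduling knobs, set by the user-space loader (scx_qmap.c) before load. */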
const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_batch;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u32 test_error_cnt;

UEI_DEFINE(uei);

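/*
 * Five FIFOs holding runnable PIDs, one per priority level. A task's weight
 * selects which FIFO it goes into via weight_to_idx().
 */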
struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

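/*
 * Array-of-maps wrapping the five FIFOs so that the enqueue, dispatch and dump
 * paths can look a queue up by index at runtime.
 */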
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

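/* Per-CPU scheduling context */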
struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_dequeued;

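/*
 * Pick a CPU for a waking task. If the task can run on only one CPU or its
 * previous CPU is idle, flag it for direct local dispatch in qmap_enqueue();
 * otherwise try to find an idle CPU and fall back to the previous one.
 */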
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return -ESRCH;
	}

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		tctx->force_local = true;
		return prev_cpu;
	}

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return prev_cpu;
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

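/*
 * Queue a runnable task. Depending on the stall_*_nth knobs, some tasks are
 * deliberately not enqueued at all to exercise the watchdog. Tasks flagged by
 * qmap_select_cpu() go straight to the local DSQ; everyone else has their PID
 * pushed onto the FIFO matching their weight, falling back to the shared DSQ
 * if that FIFO is full.
 */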
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	/* Is select_cpu() telling us to enqueue locally? */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to the shared DSQ. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
}

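/*
 * Refill @cpu when its local DSQ runs dry. First try to consume from the
 * shared DSQ. Otherwise round-robin over the FIFOs, moving up to 2^idx tasks
 * from FIFO idx into the shared DSQ before advancing, and stop early once the
 * batch or the dispatch buffer is exhausted.
 */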
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (scx_bpf_consume(SHARED_DSQ))
		return;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the FIFO. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			__sync_fetch_and_add(&nr_dispatched, 1);
			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);
			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				scx_bpf_consume(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}
}

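/*
 * A new task is being prepared for sched_ext. Mark it as disallowed if it
 * belongs to @disallow_tgid and make sure its per-task context exists.
 */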
s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

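/*
 * Debug-dump callbacks. Note that dumping the FIFOs below pops their entries,
 * so the queues are drained as a side effect.
 */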
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			scx_bpf_dump(" %d", pid);
		}
		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu",
		     cpuc->dsp_idx, cpuc->dsp_cnt);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d",
		     taskc->force_local);
}

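/* Create the shared DSQ that enqueue overflows into and dispatch stages tasks through. */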
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

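/* Record the exit info so the user-space loader can report why the scheduler unloaded. */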
void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

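/*
 * Register the callbacks. timeout_ms bounds how long a runnable task may go
 * without being scheduled before the watchdog aborts the scheduler.
 */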
SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .init_task		= (void *)qmap_init_task,
	       .dump			= (void *)qmap_dump,
	       .dump_cpu		= (void *)qmap_dump_cpu,
	       .dump_task		= (void *)qmap_dump_task,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");