/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and is unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

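/*
 * SHARED_DSQ is the ID of the custom dispatch queue created in qmap_init().
 * FIFO overflows in qmap_enqueue() and tasks popped in qmap_dispatch() are
 * dispatched to it before being consumed into the local DSQs.
 */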
enum consts {
	ONE_SEC_IN_NS		= 1000000000,
	SHARED_DSQ		= 0,
};

char _license[] SEC("license") = "GPL";

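/*
 * Tunables. const volatile globals live in the skeleton's read-only data and
 * can be overridden by the userspace loader before the program is loaded.
 */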
const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 dsp_batch;

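/* If non-zero, trigger scx_bpf_error() after this many enqueues. */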
u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

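/*
 * An array of maps so that the FIFO for a given priority bucket can be looked
 * up by the index returned from weight_to_idx().
 */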
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

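/*
 * Per-CPU round-robin state: which FIFO this CPU is currently draining and how
 * many more tasks remain in the current batch (1 << dsp_idx).
 */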
struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_dequeued;

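/*
 * If the task can only run on one CPU or its previous CPU is idle, keep it
 * there and flag it for direct dispatch to the local DSQ from qmap_enqueue().
 * Otherwise, try to find any idle CPU and fall back to the previous one.
 */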
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return -ESRCH;
	}

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		tctx->force_local = true;
		return prev_cpu;
	}

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return prev_cpu;
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

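/*
 * Queue the task's PID on the FIFO selected by its weight. Tasks flagged by
 * qmap_select_cpu() go straight to the local DSQ and FIFO overflows are
 * punted to the shared DSQ.
 */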
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	/* Is select_cpu() telling us to enqueue locally? */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to the shared DSQ. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
}

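/*
 * Called when a CPU's local DSQ needs more tasks. Consume from the shared DSQ
 * if it has anything, otherwise round robin through the FIFOs, moving up to
 * 1 << idx tasks from the idx'th FIFO to the shared DSQ and consuming it.
 */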
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (scx_bpf_consume(SHARED_DSQ))
		return;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;

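			/* The task may have exited since its PID was queued. */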
			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			__sync_fetch_and_add(&nr_dispatched, 1);
			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);
			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				scx_bpf_consume(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

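/*
 * Create the shared DSQ used for overflow and batched dispatches. The second
 * argument is the NUMA node; a negative value means no node preference.
 */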
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

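/* Record the exit reason so that the userspace loader can report it. */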
void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .init_task		= (void *)qmap_init_task,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .name			= "qmap");