1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * A minimal userland scheduler. 4 * 5 * In terms of scheduling, this provides two different types of behaviors: 6 * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity. 7 * All such tasks are direct-dispatched from the kernel, and are never 8 * enqueued in user space. 9 * 2. A primitive vruntime scheduler that is implemented in user space, for all 10 * other tasks. 11 * 12 * Some parts of this example user space scheduler could be implemented more 13 * efficiently using more complex and sophisticated data structures. For 14 * example, rather than using BPF_MAP_TYPE_QUEUE's, 15 * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between 16 * user space and kernel space. Similarly, we use a simple vruntime-sorted list 17 * in user space, but an rbtree could be used instead. 18 * 19 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 20 * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 21 * Copyright (c) 2022 David Vernet <dvernet@meta.com> 22 */ 23 #include <scx/common.bpf.h> 24 #include "scx_userland.h" 25 26 /* 27 * Maximum amount of tasks enqueued/dispatched between kernel and user-space. 28 */ 29 #define MAX_ENQUEUED_TASKS 4096 30 31 char _license[] SEC("license") = "GPL"; 32 33 const volatile s32 usersched_pid; 34 35 /* !0 for veristat, set during init */ 36 const volatile u32 num_possible_cpus = 64; 37 38 /* Stats that are printed by user space. */ 39 u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; 40 41 /* 42 * Number of tasks that are queued for scheduling. 43 * 44 * This number is incremented by the BPF component when a task is queued to the 45 * user-space scheduler and it must be decremented by the user-space scheduler 46 * when a task is consumed. 47 */ 48 volatile u64 nr_queued; 49 50 /* 51 * Number of tasks that are waiting for scheduling. 52 * 53 * This number must be updated by the user-space scheduler to keep track if 54 * there is still some scheduling work to do. 55 */ 56 volatile u64 nr_scheduled; 57 58 UEI_DEFINE(uei); 59 60 /* 61 * The map containing tasks that are enqueued in user space from the kernel. 62 * 63 * This map is drained by the user space scheduler. 64 */ 65 struct { 66 __uint(type, BPF_MAP_TYPE_QUEUE); 67 __uint(max_entries, MAX_ENQUEUED_TASKS); 68 __type(value, struct scx_userland_enqueued_task); 69 } enqueued SEC(".maps"); 70 71 /* 72 * The map containing tasks that are dispatched to the kernel from user space. 73 * 74 * Drained by the kernel in userland_dispatch(). 75 */ 76 struct { 77 __uint(type, BPF_MAP_TYPE_QUEUE); 78 __uint(max_entries, MAX_ENQUEUED_TASKS); 79 __type(value, s32); 80 } dispatched SEC(".maps"); 81 82 /* Per-task scheduling context */ 83 struct task_ctx { 84 bool force_local; /* Dispatch directly to local DSQ */ 85 }; 86 87 /* Map that contains task-local storage. */ 88 struct { 89 __uint(type, BPF_MAP_TYPE_TASK_STORAGE); 90 __uint(map_flags, BPF_F_NO_PREALLOC); 91 __type(key, int); 92 __type(value, struct task_ctx); 93 } task_ctx_stor SEC(".maps"); 94 95 /* 96 * Flag used to wake-up the user-space scheduler. 97 */ 98 static volatile u32 usersched_needed; 99 100 /* 101 * Set user-space scheduler wake-up flag (equivalent to an atomic release 102 * operation). 103 */ 104 static void set_usersched_needed(void) 105 { 106 __sync_fetch_and_or(&usersched_needed, 1); 107 } 108 109 /* 110 * Check and clear user-space scheduler wake-up flag (equivalent to an atomic 111 * acquire operation). 112 */ 113 static bool test_and_clear_usersched_needed(void) 114 { 115 return __sync_fetch_and_and(&usersched_needed, 0) == 1; 116 } 117 118 static bool is_usersched_task(const struct task_struct *p) 119 { 120 return p->pid == usersched_pid; 121 } 122 123 static bool keep_in_kernel(const struct task_struct *p) 124 { 125 return p->nr_cpus_allowed < num_possible_cpus; 126 } 127 128 static struct task_struct *usersched_task(void) 129 { 130 struct task_struct *p; 131 132 p = bpf_task_from_pid(usersched_pid); 133 /* 134 * Should never happen -- the usersched task should always be managed 135 * by sched_ext. 136 */ 137 if (!p) 138 scx_bpf_error("Failed to find usersched task %d", usersched_pid); 139 140 return p; 141 } 142 143 s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p, 144 s32 prev_cpu, u64 wake_flags) 145 { 146 if (keep_in_kernel(p)) { 147 s32 cpu; 148 struct task_ctx *tctx; 149 150 tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); 151 if (!tctx) { 152 scx_bpf_error("Failed to look up task-local storage for %s", p->comm); 153 return -ESRCH; 154 } 155 156 if (p->nr_cpus_allowed == 1 || 157 scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { 158 tctx->force_local = true; 159 return prev_cpu; 160 } 161 162 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); 163 if (cpu >= 0) { 164 tctx->force_local = true; 165 return cpu; 166 } 167 } 168 169 return prev_cpu; 170 } 171 172 static void dispatch_user_scheduler(void) 173 { 174 struct task_struct *p; 175 176 p = usersched_task(); 177 if (p) { 178 scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); 179 bpf_task_release(p); 180 } 181 } 182 183 static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) 184 { 185 struct scx_userland_enqueued_task task = {}; 186 187 task.pid = p->pid; 188 task.sum_exec_runtime = p->se.sum_exec_runtime; 189 task.weight = p->scx.weight; 190 191 if (bpf_map_push_elem(&enqueued, &task, 0)) { 192 /* 193 * If we fail to enqueue the task in user space, put it 194 * directly on the global DSQ. 195 */ 196 __sync_fetch_and_add(&nr_failed_enqueues, 1); 197 scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); 198 } else { 199 __sync_fetch_and_add(&nr_user_enqueues, 1); 200 set_usersched_needed(); 201 } 202 } 203 204 void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) 205 { 206 if (keep_in_kernel(p)) { 207 u64 dsq_id = SCX_DSQ_GLOBAL; 208 struct task_ctx *tctx; 209 210 tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); 211 if (!tctx) { 212 scx_bpf_error("Failed to lookup task ctx for %s", p->comm); 213 return; 214 } 215 216 if (tctx->force_local) 217 dsq_id = SCX_DSQ_LOCAL; 218 tctx->force_local = false; 219 scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags); 220 __sync_fetch_and_add(&nr_kernel_enqueues, 1); 221 return; 222 } else if (!is_usersched_task(p)) { 223 enqueue_task_in_user_space(p, enq_flags); 224 } 225 } 226 227 void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) 228 { 229 if (test_and_clear_usersched_needed()) 230 dispatch_user_scheduler(); 231 232 bpf_repeat(MAX_ENQUEUED_TASKS) { 233 s32 pid; 234 struct task_struct *p; 235 236 if (bpf_map_pop_elem(&dispatched, &pid)) 237 break; 238 239 /* 240 * The task could have exited by the time we get around to 241 * dispatching it. Treat this as a normal occurrence, and simply 242 * move onto the next iteration. 243 */ 244 p = bpf_task_from_pid(pid); 245 if (!p) 246 continue; 247 248 scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); 249 bpf_task_release(p); 250 } 251 } 252 253 /* 254 * A CPU is about to change its idle state. If the CPU is going idle, ensure 255 * that the user-space scheduler has a chance to run if there is any remaining 256 * work to do. 257 */ 258 void BPF_STRUCT_OPS(userland_update_idle, s32 cpu, bool idle) 259 { 260 /* 261 * Don't do anything if we exit from and idle state, a CPU owner will 262 * be assigned in .running(). 263 */ 264 if (!idle) 265 return; 266 /* 267 * A CPU is now available, notify the user-space scheduler that tasks 268 * can be dispatched, if there is at least one task waiting to be 269 * scheduled, either queued (accounted in nr_queued) or scheduled 270 * (accounted in nr_scheduled). 271 * 272 * NOTE: nr_queued is incremented by the BPF component, more exactly in 273 * enqueue(), when a task is sent to the user-space scheduler, then 274 * the scheduler drains the queued tasks (updating nr_queued) and adds 275 * them to its internal data structures / state; at this point tasks 276 * become "scheduled" and the user-space scheduler will take care of 277 * updating nr_scheduled accordingly; lastly tasks will be dispatched 278 * and the user-space scheduler will update nr_scheduled again. 279 * 280 * Checking both counters allows to determine if there is still some 281 * pending work to do for the scheduler: new tasks have been queued 282 * since last check, or there are still tasks "queued" or "scheduled" 283 * since the previous user-space scheduler run. If the counters are 284 * both zero it is pointless to wake-up the scheduler (even if a CPU 285 * becomes idle), because there is nothing to do. 286 * 287 * Keep in mind that update_idle() doesn't run concurrently with the 288 * user-space scheduler (that is single-threaded): this function is 289 * naturally serialized with the user-space scheduler code, therefore 290 * this check here is also safe from a concurrency perspective. 291 */ 292 if (nr_queued || nr_scheduled) { 293 /* 294 * Kick the CPU to make it immediately ready to accept 295 * dispatched tasks. 296 */ 297 set_usersched_needed(); 298 scx_bpf_kick_cpu(cpu, 0); 299 } 300 } 301 302 s32 BPF_STRUCT_OPS(userland_init_task, struct task_struct *p, 303 struct scx_init_task_args *args) 304 { 305 if (bpf_task_storage_get(&task_ctx_stor, p, 0, 306 BPF_LOCAL_STORAGE_GET_F_CREATE)) 307 return 0; 308 else 309 return -ENOMEM; 310 } 311 312 s32 BPF_STRUCT_OPS(userland_init) 313 { 314 if (num_possible_cpus == 0) { 315 scx_bpf_error("User scheduler # CPUs uninitialized (%d)", 316 num_possible_cpus); 317 return -EINVAL; 318 } 319 320 if (usersched_pid <= 0) { 321 scx_bpf_error("User scheduler pid uninitialized (%d)", 322 usersched_pid); 323 return -EINVAL; 324 } 325 326 return 0; 327 } 328 329 void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) 330 { 331 UEI_RECORD(uei, ei); 332 } 333 334 SCX_OPS_DEFINE(userland_ops, 335 .select_cpu = (void *)userland_select_cpu, 336 .enqueue = (void *)userland_enqueue, 337 .dispatch = (void *)userland_dispatch, 338 .update_idle = (void *)userland_update_idle, 339 .init_task = (void *)userland_init_task, 340 .init = (void *)userland_init, 341 .exit = (void *)userland_exit, 342 .flags = SCX_OPS_ENQ_LAST | 343 SCX_OPS_KEEP_BUILTIN_IDLE, 344 .name = "userland"); 345