1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * A demo sched_ext user space scheduler which provides vruntime semantics 4 * using a simple ordered-list implementation. 5 * 6 * Each CPU in the system resides in a single, global domain. This precludes 7 * the need to do any load balancing between domains. The scheduler could 8 * easily be extended to support multiple domains, with load balancing 9 * happening in user space. 10 * 11 * Any task which has any CPU affinity is scheduled entirely in BPF. This 12 * program only schedules tasks which may run on any CPU. 13 * 14 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 15 * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 16 * Copyright (c) 2022 David Vernet <dvernet@meta.com> 17 */ 18 #include <stdio.h> 19 #include <unistd.h> 20 #include <sched.h> 21 #include <signal.h> 22 #include <assert.h> 23 #include <libgen.h> 24 #include <pthread.h> 25 #include <bpf/bpf.h> 26 #include <sys/mman.h> 27 #include <sys/queue.h> 28 #include <sys/syscall.h> 29 30 #include <scx/common.h> 31 #include "scx_userland.h" 32 #include "scx_userland.bpf.skel.h" 33 34 const char help_fmt[] = 35 "A minimal userland sched_ext scheduler.\n" 36 "\n" 37 "See the top-level comment in .bpf.c for more details.\n" 38 "\n" 39 "Try to reduce `sysctl kernel.pid_max` if this program triggers OOMs.\n" 40 "\n" 41 "Usage: %s [-b BATCH]\n" 42 "\n" 43 " -b BATCH The number of tasks to batch when dispatching (default: 8)\n" 44 " -v Print libbpf debug messages\n" 45 " -h Display this help and exit\n"; 46 47 /* Defined in UAPI */ 48 #define SCHED_EXT 7 49 50 /* Number of tasks to batch when dispatching to user space. */ 51 static __u32 batch_size = 8; 52 53 static bool verbose; 54 static volatile int exit_req; 55 static int enqueued_fd, dispatched_fd; 56 57 static struct scx_userland *skel; 58 static struct bpf_link *ops_link; 59 60 /* Stats collected in user space. */ 61 static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches, nr_vruntime_failed; 62 63 /* Number of tasks currently enqueued. */ 64 static __u64 nr_curr_enqueued; 65 66 /* The data structure containing tasks that are enqueued in user space. */ 67 struct enqueued_task { 68 LIST_ENTRY(enqueued_task) entries; 69 __u64 sum_exec_runtime; 70 double vruntime; 71 }; 72 73 /* 74 * Use a vruntime-sorted list to store tasks. This could easily be extended to 75 * a more optimal data structure, such as an rbtree as is done in CFS. We 76 * currently elect to use a sorted list to simplify the example for 77 * illustrative purposes. 78 */ 79 LIST_HEAD(listhead, enqueued_task); 80 81 /* 82 * A vruntime-sorted list of tasks. The head of the list contains the task with 83 * the lowest vruntime. That is, the task that has the "highest" claim to be 84 * scheduled. 85 */ 86 static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); 87 88 /* 89 * The main array of tasks. The array is allocated all at once during 90 * initialization, based on /proc/sys/kernel/pid_max, to avoid having to 91 * dynamically allocate memory on the enqueue path, which could cause a 92 * deadlock. A more substantive user space scheduler could e.g. provide a hook 93 * for newly enabled tasks that are passed to the scheduler from the 94 * .prep_enable() callback to allows the scheduler to allocate on safe paths. 95 */ 96 struct enqueued_task *tasks; 97 static int pid_max; 98 99 static double min_vruntime; 100 101 static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) 102 { 103 if (level == LIBBPF_DEBUG && !verbose) 104 return 0; 105 return vfprintf(stderr, format, args); 106 } 107 108 static void sigint_handler(int userland) 109 { 110 exit_req = 1; 111 } 112 113 static int get_pid_max(void) 114 { 115 FILE *fp; 116 int pid_max; 117 118 fp = fopen("/proc/sys/kernel/pid_max", "r"); 119 if (fp == NULL) { 120 fprintf(stderr, "Error opening /proc/sys/kernel/pid_max\n"); 121 return -1; 122 } 123 if (fscanf(fp, "%d", &pid_max) != 1) { 124 fprintf(stderr, "Error reading from /proc/sys/kernel/pid_max\n"); 125 fclose(fp); 126 return -1; 127 } 128 fclose(fp); 129 130 return pid_max; 131 } 132 133 static int init_tasks(void) 134 { 135 pid_max = get_pid_max(); 136 if (pid_max < 0) 137 return pid_max; 138 139 tasks = calloc(pid_max, sizeof(*tasks)); 140 if (!tasks) { 141 fprintf(stderr, "Error allocating tasks array\n"); 142 return -ENOMEM; 143 } 144 145 return 0; 146 } 147 148 static __u32 task_pid(const struct enqueued_task *task) 149 { 150 return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); 151 } 152 153 static int dispatch_task(__s32 pid) 154 { 155 int err; 156 157 err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); 158 if (err) { 159 nr_vruntime_failed++; 160 } else { 161 nr_vruntime_dispatches++; 162 } 163 164 return err; 165 } 166 167 static struct enqueued_task *get_enqueued_task(__s32 pid) 168 { 169 if (pid >= pid_max) 170 return NULL; 171 172 return &tasks[pid]; 173 } 174 175 static double calc_vruntime_delta(__u64 weight, __u64 delta) 176 { 177 double weight_f = (double)weight / 100.0; 178 double delta_f = (double)delta; 179 180 return delta_f / weight_f; 181 } 182 183 static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task) 184 { 185 __u64 delta; 186 187 delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime; 188 189 enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta); 190 if (min_vruntime > enqueued->vruntime) 191 enqueued->vruntime = min_vruntime; 192 enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime; 193 } 194 195 static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) 196 { 197 struct enqueued_task *curr, *enqueued, *prev; 198 199 curr = get_enqueued_task(bpf_task->pid); 200 if (!curr) 201 return ENOENT; 202 203 update_enqueued(curr, bpf_task); 204 nr_vruntime_enqueues++; 205 nr_curr_enqueued++; 206 207 /* 208 * Enqueue the task in a vruntime-sorted list. A more optimal data 209 * structure such as an rbtree could easily be used as well. We elect 210 * to use a list here simply because it's less code, and thus the 211 * example is less convoluted and better serves to illustrate what a 212 * user space scheduler could look like. 213 */ 214 215 if (LIST_EMPTY(&vruntime_head)) { 216 LIST_INSERT_HEAD(&vruntime_head, curr, entries); 217 return 0; 218 } 219 220 LIST_FOREACH(enqueued, &vruntime_head, entries) { 221 if (curr->vruntime <= enqueued->vruntime) { 222 LIST_INSERT_BEFORE(enqueued, curr, entries); 223 return 0; 224 } 225 prev = enqueued; 226 } 227 228 LIST_INSERT_AFTER(prev, curr, entries); 229 230 return 0; 231 } 232 233 static void drain_enqueued_map(void) 234 { 235 while (1) { 236 struct scx_userland_enqueued_task task; 237 int err; 238 239 if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) { 240 skel->bss->nr_queued = 0; 241 skel->bss->nr_scheduled = nr_curr_enqueued; 242 return; 243 } 244 245 err = vruntime_enqueue(&task); 246 if (err) { 247 fprintf(stderr, "Failed to enqueue task %d: %s\n", 248 task.pid, strerror(err)); 249 exit_req = 1; 250 return; 251 } 252 } 253 } 254 255 static void dispatch_batch(void) 256 { 257 __u32 i; 258 259 for (i = 0; i < batch_size; i++) { 260 struct enqueued_task *task; 261 int err; 262 __s32 pid; 263 264 task = LIST_FIRST(&vruntime_head); 265 if (!task) 266 break; 267 268 min_vruntime = task->vruntime; 269 pid = task_pid(task); 270 LIST_REMOVE(task, entries); 271 err = dispatch_task(pid); 272 if (err) { 273 /* 274 * If we fail to dispatch, put the task back to the 275 * vruntime_head list and stop dispatching additional 276 * tasks in this batch. 277 */ 278 LIST_INSERT_HEAD(&vruntime_head, task, entries); 279 break; 280 } 281 nr_curr_enqueued--; 282 } 283 skel->bss->nr_scheduled = nr_curr_enqueued; 284 } 285 286 static void *run_stats_printer(void *arg) 287 { 288 while (!exit_req) { 289 __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total; 290 291 nr_failed_enqueues = skel->bss->nr_failed_enqueues; 292 nr_kernel_enqueues = skel->bss->nr_kernel_enqueues; 293 nr_user_enqueues = skel->bss->nr_user_enqueues; 294 total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues; 295 296 printf("o-----------------------o\n"); 297 printf("| BPF ENQUEUES |\n"); 298 printf("|-----------------------|\n"); 299 printf("| kern: %10llu |\n", nr_kernel_enqueues); 300 printf("| user: %10llu |\n", nr_user_enqueues); 301 printf("| failed: %10llu |\n", nr_failed_enqueues); 302 printf("| -------------------- |\n"); 303 printf("| total: %10llu |\n", total); 304 printf("| |\n"); 305 printf("|-----------------------|\n"); 306 printf("| VRUNTIME / USER |\n"); 307 printf("|-----------------------|\n"); 308 printf("| enq: %10llu |\n", nr_vruntime_enqueues); 309 printf("| disp: %10llu |\n", nr_vruntime_dispatches); 310 printf("| failed: %10llu |\n", nr_vruntime_failed); 311 printf("o-----------------------o\n"); 312 printf("\n\n"); 313 fflush(stdout); 314 sleep(1); 315 } 316 317 return NULL; 318 } 319 320 static int spawn_stats_thread(void) 321 { 322 pthread_t stats_printer; 323 324 return pthread_create(&stats_printer, NULL, run_stats_printer, NULL); 325 } 326 327 static void pre_bootstrap(int argc, char **argv) 328 { 329 int err; 330 __u32 opt; 331 struct sched_param sched_param = { 332 .sched_priority = sched_get_priority_max(SCHED_EXT), 333 }; 334 335 err = init_tasks(); 336 if (err) 337 exit(err); 338 339 libbpf_set_print(libbpf_print_fn); 340 signal(SIGINT, sigint_handler); 341 signal(SIGTERM, sigint_handler); 342 343 /* 344 * Enforce that the user scheduler task is managed by sched_ext. The 345 * task eagerly drains the list of enqueued tasks in its main work 346 * loop, and then yields the CPU. The BPF scheduler only schedules the 347 * user space scheduler task when at least one other task in the system 348 * needs to be scheduled. 349 */ 350 err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param); 351 SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT"); 352 353 while ((opt = getopt(argc, argv, "b:vh")) != -1) { 354 switch (opt) { 355 case 'b': 356 batch_size = strtoul(optarg, NULL, 0); 357 break; 358 case 'v': 359 verbose = true; 360 break; 361 default: 362 fprintf(stderr, help_fmt, basename(argv[0])); 363 exit(opt != 'h'); 364 } 365 } 366 367 /* 368 * It's not always safe to allocate in a user space scheduler, as an 369 * enqueued task could hold a lock that we require in order to be able 370 * to allocate. 371 */ 372 err = mlockall(MCL_CURRENT | MCL_FUTURE); 373 SCX_BUG_ON(err, "Failed to prefault and lock address space"); 374 } 375 376 static void bootstrap(char *comm) 377 { 378 skel = SCX_OPS_OPEN(userland_ops, scx_userland); 379 380 skel->rodata->num_possible_cpus = libbpf_num_possible_cpus(); 381 assert(skel->rodata->num_possible_cpus > 0); 382 skel->rodata->usersched_pid = getpid(); 383 assert(skel->rodata->usersched_pid > 0); 384 385 SCX_OPS_LOAD(skel, userland_ops, scx_userland, uei); 386 387 enqueued_fd = bpf_map__fd(skel->maps.enqueued); 388 dispatched_fd = bpf_map__fd(skel->maps.dispatched); 389 assert(enqueued_fd > 0); 390 assert(dispatched_fd > 0); 391 392 SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread"); 393 394 ops_link = SCX_OPS_ATTACH(skel, userland_ops, scx_userland); 395 } 396 397 static void sched_main_loop(void) 398 { 399 while (!exit_req) { 400 /* 401 * Perform the following work in the main user space scheduler 402 * loop: 403 * 404 * 1. Drain all tasks from the enqueued map, and enqueue them 405 * to the vruntime sorted list. 406 * 407 * 2. Dispatch a batch of tasks from the vruntime sorted list 408 * down to the kernel. 409 * 410 * 3. Yield the CPU back to the system. The BPF scheduler will 411 * reschedule the user space scheduler once another task has 412 * been enqueued to user space. 413 */ 414 drain_enqueued_map(); 415 dispatch_batch(); 416 sched_yield(); 417 } 418 } 419 420 int main(int argc, char **argv) 421 { 422 __u64 ecode; 423 424 pre_bootstrap(argc, argv); 425 restart: 426 bootstrap(argv[0]); 427 sched_main_loop(); 428 429 exit_req = 1; 430 bpf_link__destroy(ops_link); 431 ecode = UEI_REPORT(skel, uei); 432 scx_userland__destroy(skel); 433 434 if (UEI_ECODE_RESTART(ecode)) 435 goto restart; 436 return 0; 437 } 438