/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A demo sched_ext user space scheduler which provides vruntime semantics
 * using a simple ordered-list implementation.
 *
 * Each CPU in the system resides in a single, global domain. This obviates
 * the need to do any load balancing between domains. The scheduler could
 * easily be extended to support multiple domains, with load balancing
 * happening in user space.
 *
 * Any task which has any CPU affinity is scheduled entirely in BPF. This
 * program only schedules tasks which may run on any CPU.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sched.h>
#include <signal.h>
#include <assert.h>
#include <libgen.h>
#include <pthread.h>
#include <bpf/bpf.h>
#include <sys/mman.h>
#include <sys/queue.h>
#include <sys/syscall.h>

#include <scx/common.h>
#include "scx_userland.h"
#include "scx_userland.bpf.skel.h"

const char help_fmt[] =
"A minimal userland sched_ext scheduler.\n"
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Try reducing `sysctl kernel.pid_max` if this program triggers OOMs.\n"
"\n"
"Usage: %s [-b BATCH]\n"
"\n"
"  -b BATCH      The number of tasks to batch when dispatching (default: 8)\n"
"  -v            Print libbpf debug messages\n"
"  -h            Display this help and exit\n";

/* Defined in UAPI */
#define SCHED_EXT 7

/* Number of tasks to batch when dispatching to user space. */
static __u32 batch_size = 8;

static bool verbose;
static volatile int exit_req;
static int enqueued_fd, dispatched_fd;

static pthread_t stats_printer;
static struct scx_userland *skel;
static struct bpf_link *ops_link;

/* Stats collected in user space. */
static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches, nr_vruntime_failed;

/* Number of tasks currently enqueued. */
static __u64 nr_curr_enqueued;

/* The data structure containing tasks that are enqueued in user space. */
struct enqueued_task {
	LIST_ENTRY(enqueued_task) entries;
	__u64 sum_exec_runtime;
	double vruntime;
};

/*
 * Use a vruntime-sorted list to store tasks. This could easily be extended to
 * a more optimal data structure, such as an rbtree as is done in CFS. We
 * currently elect to use a sorted list to simplify the example for
 * illustrative purposes.
 */
LIST_HEAD(listhead, enqueued_task);

/*
 * A vruntime-sorted list of tasks. The head of the list contains the task
 * with the lowest vruntime. That is, the task that has the "highest" claim
 * to be scheduled.
 */
static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head);
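/*
 * Illustrative example of the list invariant (an editorial sketch, not part
 * of the scheduler logic): if tasks A, B and C have vruntimes 3.0, 5.5 and
 * 9.2, the list is kept as A -> B -> C, and dispatch_batch() below always
 * pops the head, so A is dispatched first. Insertion into the sorted list
 * is O(n); an rbtree, as suggested above, would make it O(log n).
 */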
/*
 * The main array of tasks. The array is allocated all at once during
 * initialization, based on /proc/sys/kernel/pid_max, to avoid having to
 * dynamically allocate memory on the enqueue path, which could cause a
 * deadlock. A more substantive user space scheduler could e.g. provide a
 * hook for newly enabled tasks that are passed to the scheduler from the
 * .prep_enable() callback, to allow the scheduler to allocate on safe paths.
 */
static struct enqueued_task *tasks;
static int pid_max;

static double min_vruntime;

static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (level == LIBBPF_DEBUG && !verbose)
		return 0;
	return vfprintf(stderr, format, args);
}

static void sigint_handler(int sig)
{
	exit_req = 1;
}

static int get_pid_max(void)
{
	FILE *fp;
	int pid_max;

	fp = fopen("/proc/sys/kernel/pid_max", "r");
	if (fp == NULL) {
		fprintf(stderr, "Error opening /proc/sys/kernel/pid_max\n");
		return -1;
	}
	if (fscanf(fp, "%d", &pid_max) != 1) {
		fprintf(stderr, "Error reading from /proc/sys/kernel/pid_max\n");
		fclose(fp);
		return -1;
	}
	fclose(fp);

	return pid_max;
}

static int init_tasks(void)
{
	pid_max = get_pid_max();
	if (pid_max < 0)
		return pid_max;

	tasks = calloc(pid_max, sizeof(*tasks));
	if (!tasks) {
		fprintf(stderr, "Error allocating tasks array\n");
		return -ENOMEM;
	}

	return 0;
}

static __u32 task_pid(const struct enqueued_task *task)
{
	return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task);
}

static int dispatch_task(__s32 pid)
{
	int err;

	err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0);
	if (err) {
		__atomic_add_fetch(&nr_vruntime_failed, 1, __ATOMIC_RELAXED);
	} else {
		__atomic_add_fetch(&nr_vruntime_dispatches, 1, __ATOMIC_RELAXED);
	}

	return err;
}

static struct enqueued_task *get_enqueued_task(__s32 pid)
{
	if (pid < 0 || pid >= pid_max)
		return NULL;

	return &tasks[pid];
}

static double calc_vruntime_delta(__u64 weight, __u64 delta)
{
	double weight_f = (double)weight / 100.0;
	double delta_f = (double)delta;

	return delta_f / weight_f;
}

static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task)
{
	__u64 delta;

	delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime;

	enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta);
	if (min_vruntime > enqueued->vruntime)
		enqueued->vruntime = min_vruntime;
	enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime;
}
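/*
 * Worked example for the math above (editorial, illustrative only): with
 * the default weight of 100 and delta = 1000000ns,
 * calc_vruntime_delta(100, 1000000) = 1000000 / 1.0 = 1000000, i.e.
 * vruntime advances at the rate of real execution time. A task with weight
 * 200 accrues only 1000000 / 2.0 = 500000, sorts earlier in the list, and
 * thus receives proportionally more CPU time.
 */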
static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task)
{
	struct enqueued_task *curr, *enqueued, *prev;

	curr = get_enqueued_task(bpf_task->pid);
	if (!curr)
		return ENOENT;

	update_enqueued(curr, bpf_task);
	__atomic_add_fetch(&nr_vruntime_enqueues, 1, __ATOMIC_RELAXED);
	__atomic_add_fetch(&nr_curr_enqueued, 1, __ATOMIC_RELAXED);

	/*
	 * Enqueue the task in a vruntime-sorted list. A more optimal data
	 * structure such as an rbtree could easily be used as well. We elect
	 * to use a list here simply because it's less code, and thus the
	 * example is less convoluted and better serves to illustrate what a
	 * user space scheduler could look like.
	 */
	if (LIST_EMPTY(&vruntime_head)) {
		LIST_INSERT_HEAD(&vruntime_head, curr, entries);
		return 0;
	}

	LIST_FOREACH(enqueued, &vruntime_head, entries) {
		if (curr->vruntime <= enqueued->vruntime) {
			LIST_INSERT_BEFORE(enqueued, curr, entries);
			return 0;
		}
		prev = enqueued;
	}

	LIST_INSERT_AFTER(prev, curr, entries);

	return 0;
}

static void drain_enqueued_map(void)
{
	while (1) {
		struct scx_userland_enqueued_task task;
		int err;

		if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) {
			skel->bss->nr_queued = 0;
			skel->bss->nr_scheduled = __atomic_load_n(&nr_curr_enqueued, __ATOMIC_RELAXED);
			return;
		}

		err = vruntime_enqueue(&task);
		if (err) {
			fprintf(stderr, "Failed to enqueue task %d: %s\n",
				task.pid, strerror(err));
			exit_req = 1;
			return;
		}
	}
}

static void dispatch_batch(void)
{
	__u32 i;

	for (i = 0; i < batch_size; i++) {
		struct enqueued_task *task;
		int err;
		__s32 pid;

		task = LIST_FIRST(&vruntime_head);
		if (!task)
			break;

		min_vruntime = task->vruntime;
		pid = task_pid(task);
		LIST_REMOVE(task, entries);
		err = dispatch_task(pid);
		if (err) {
			/*
			 * If we fail to dispatch, put the task back on the
			 * vruntime_head list and stop dispatching additional
			 * tasks in this batch.
			 */
			LIST_INSERT_HEAD(&vruntime_head, task, entries);
			break;
		}
		__atomic_sub_fetch(&nr_curr_enqueued, 1, __ATOMIC_RELAXED);
	}
	skel->bss->nr_scheduled = __atomic_load_n(&nr_curr_enqueued, __ATOMIC_RELAXED);
}
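/*
 * Editorial note on the min_vruntime bookkeeping above (a sketch of the
 * reasoning, not additional logic): dispatch_batch() records the vruntime
 * of each task it hands to the kernel, and update_enqueued() clamps a
 * re-enqueued task up to that floor. Without the clamp, a task waking from
 * a long sleep would return with a far lower vruntime than its peers and
 * monopolize the CPU until it caught up.
 */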
static void *run_stats_printer(void *arg)
{
	while (!exit_req) {
		__u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total;

		nr_failed_enqueues = skel->bss->nr_failed_enqueues;
		nr_kernel_enqueues = skel->bss->nr_kernel_enqueues;
		nr_user_enqueues = skel->bss->nr_user_enqueues;
		total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues;

		printf("o-----------------------o\n");
		printf("|     BPF ENQUEUES      |\n");
		printf("|-----------------------|\n");
		printf("|  kern:     %10llu |\n", nr_kernel_enqueues);
		printf("|  user:     %10llu |\n", nr_user_enqueues);
		printf("|  failed:   %10llu |\n", nr_failed_enqueues);
		printf("|  -------------------- |\n");
		printf("|  total:    %10llu |\n", total);
		printf("|                       |\n");
		printf("|-----------------------|\n");
		printf("|    VRUNTIME / USER    |\n");
		printf("|-----------------------|\n");
		printf("|  enq:      %10llu |\n", __atomic_load_n(&nr_vruntime_enqueues, __ATOMIC_RELAXED));
		printf("|  disp:     %10llu |\n", __atomic_load_n(&nr_vruntime_dispatches, __ATOMIC_RELAXED));
		printf("|  failed:   %10llu |\n", __atomic_load_n(&nr_vruntime_failed, __ATOMIC_RELAXED));
		printf("o-----------------------o\n");
		printf("\n\n");
		fflush(stdout);
		sleep(1);
	}

	return NULL;
}

static int spawn_stats_thread(void)
{
	return pthread_create(&stats_printer, NULL, run_stats_printer, NULL);
}

static void pre_bootstrap(int argc, char **argv)
{
	int err;
	__u32 opt;
	struct sched_param sched_param = {
		.sched_priority = sched_get_priority_max(SCHED_EXT),
	};

	err = init_tasks();
	if (err)
		exit(err);

	libbpf_set_print(libbpf_print_fn);
	signal(SIGINT, sigint_handler);
	signal(SIGTERM, sigint_handler);

	/*
	 * Enforce that the user scheduler task is managed by sched_ext. The
	 * task eagerly drains the list of enqueued tasks in its main work
	 * loop, and then yields the CPU. The BPF scheduler only schedules the
	 * user space scheduler task when at least one other task in the
	 * system needs to be scheduled.
	 */
	err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param);
	SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT");

	while ((opt = getopt(argc, argv, "b:vh")) != -1) {
		switch (opt) {
		case 'b':
			batch_size = strtoul(optarg, NULL, 0);
			break;
		case 'v':
			verbose = true;
			break;
		default:
			fprintf(stderr, help_fmt, basename(argv[0]));
			exit(opt != 'h');
		}
	}

	/*
	 * It's not always safe to allocate in a user space scheduler, as an
	 * enqueued task could hold a lock that we need in order to allocate.
	 */
	err = mlockall(MCL_CURRENT | MCL_FUTURE);
	SCX_BUG_ON(err, "Failed to prefault and lock address space");
}

static void bootstrap(char *comm)
{
	exit_req = 0;
	min_vruntime = 0.0;
	__atomic_store_n(&nr_vruntime_enqueues, 0, __ATOMIC_RELAXED);
	__atomic_store_n(&nr_vruntime_dispatches, 0, __ATOMIC_RELAXED);
	__atomic_store_n(&nr_vruntime_failed, 0, __ATOMIC_RELAXED);
	__atomic_store_n(&nr_curr_enqueued, 0, __ATOMIC_RELAXED);
	memset(tasks, 0, pid_max * sizeof(*tasks));
	LIST_INIT(&vruntime_head);

	skel = SCX_OPS_OPEN(userland_ops, scx_userland);

	skel->rodata->num_possible_cpus = libbpf_num_possible_cpus();
	assert(skel->rodata->num_possible_cpus > 0);
	skel->rodata->usersched_pid = getpid();
	assert(skel->rodata->usersched_pid > 0);

	SCX_OPS_LOAD(skel, userland_ops, scx_userland, uei);

	enqueued_fd = bpf_map__fd(skel->maps.enqueued);
	dispatched_fd = bpf_map__fd(skel->maps.dispatched);
	assert(enqueued_fd > 0);
	assert(dispatched_fd > 0);

	SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread");

	ops_link = SCX_OPS_ATTACH(skel, userland_ops, scx_userland);
}

static void sched_main_loop(void)
{
	while (!exit_req) {
		/*
		 * Perform the following work in the main user space scheduler
		 * loop:
		 *
		 * 1. Drain all tasks from the enqueued map, and enqueue them
		 *    on the vruntime-sorted list.
		 *
		 * 2. Dispatch a batch of tasks from the vruntime-sorted list
		 *    down to the kernel.
		 *
		 * 3. Yield the CPU back to the system. The BPF scheduler will
		 *    reschedule the user space scheduler once another task
		 *    has been enqueued to user space.
		 */
		drain_enqueued_map();
		dispatch_batch();
		sched_yield();
	}
}

int main(int argc, char **argv)
{
	__u64 ecode;

	pre_bootstrap(argc, argv);
restart:
	bootstrap(argv[0]);
	sched_main_loop();

	exit_req = 1;
	bpf_link__destroy(ops_link);
	pthread_join(stats_printer, NULL);
	ecode = UEI_REPORT(skel, uei);
	scx_userland__destroy(skel);

	if (UEI_ECODE_RESTART(ecode))
		goto restart;
	return 0;
}
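/*
 * Example invocation (editorial sketch; the batch size and the need for
 * root privileges are assumptions, not mandated by this file):
 *
 *   $ sudo ./scx_userland -b 16
 *
 * The scheduler prints stats once per second and detaches on SIGINT or
 * SIGTERM, at which point tasks fall back to the kernel's default
 * scheduler.
 */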