1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * A demo sched_ext core-scheduler which always makes every sibling CPU pair 4 * execute from the same CPU cgroup. 5 * 6 * This scheduler is a minimal implementation and would need some form of 7 * priority handling both inside each cgroup and across the cgroups to be 8 * practically useful. 9 * 10 * Each CPU in the system is paired with exactly one other CPU, according to a 11 * "stride" value that can be specified when the BPF scheduler program is first 12 * loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee 13 * that they will only ever schedule tasks that belong to the same CPU cgroup. 14 * 15 * Scheduler Initialization 16 * ------------------------ 17 * 18 * The scheduler BPF program is first initialized from user space, before it is 19 * enabled. During this initialization process, each CPU on the system is 20 * assigned several values that are constant throughout its runtime: 21 * 22 * 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling 23 * decisions. Paired CPUs always schedule tasks from the same 24 * CPU cgroup, and synchronize with each other to guarantee 25 * that this constraint is not violated. 26 * 2. *Pair ID*: Each CPU pair is assigned a Pair ID, which is used to access 27 * a struct pair_ctx object that is shared between the pair. 28 * 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the 29 * pair. Each struct pair_ctx has an active_mask field, 30 * which is a bitmap used to indicate whether each core 31 * in the pair currently has an actively running task. 32 * This index specifies which entry in the bitmap corresponds 33 * to each CPU in the pair. 34 * 35 * During this initialization, the CPUs are paired according to a "stride" that 36 * may be specified when invoking the user space program that initializes and 37 * loads the scheduler. By default, the stride is 1/2 the total number of CPUs. 
38 * 39 * Tasks and cgroups 40 * ----------------- 41 * 42 * Every cgroup in the system is registered with the scheduler using the 43 * pair_cgroup_init() callback, and every task in the system is associated with 44 * exactly one cgroup. At a high level, the idea with the pair scheduler is to 45 * always schedule tasks from the same cgroup within a given CPU pair. When a 46 * task is enqueued (i.e. passed to the pair_enqueue() callback function), its 47 * cgroup ID is read from its task struct, and then a corresponding queue map 48 * is used to FIFO-enqueue the task for that cgroup. 49 * 50 * If you look through the implementation of the scheduler, you'll notice that 51 * there is quite a bit of complexity involved with looking up the per-cgroup 52 * FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash 53 * BPF hash map that is used to map a cgroup ID to a globally unique ID that's 54 * allocated in the BPF program. This is done because we use separate maps to 55 * store the FIFO queue of tasks, and the length of that map, per cgroup. This 56 * complexity is only present because of current deficiencies in BPF that will 57 * soon be addressed. The main point to keep in mind is that newly enqueued 58 * tasks are added to their cgroup's FIFO queue. 59 * 60 * Dispatching tasks 61 * ----------------- 62 * 63 * This section will describe how enqueued tasks are dispatched and scheduled. 64 * Tasks are dispatched in pair_dispatch(), and at a high level the workflow is 65 * as follows: 66 * 67 * 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is 68 * the structure that's used to synchronize amongst the two pair CPUs in their 69 * scheduling decisions. 
After any of the following events have occurred:
 *
 * - The cgroup's slice run has expired, or
 * - The cgroup becomes empty, or
 * - Either CPU in the pair is preempted by a higher priority scheduling class
 *
 * the cgroup transitions to the draining state and stops executing new tasks
 * from the cgroup.
 *
 * 2. If the pair is still executing a task, mark the pair_ctx as draining, and
 *    wait for the pair CPU to be preempted.
 *
 * 3. Otherwise, if the pair CPU is not running a task, we can move onto
 *    scheduling new tasks. Pop the next cgroup id from the top_q queue.
 *
 * 4. Pop a task from that cgroup's FIFO task queue, and begin executing it.
 *
 * Note again that this scheduling behavior is simple, but the implementation
 * is complex mostly because it hits several BPF shortcomings and has to work
 * around them in often awkward ways. Most of the shortcomings are expected to
 * be resolved in the near future which should allow greatly simplifying this
 * scheduler.
 *
 * Dealing with preemption
 * -----------------------
 *
 * SCX is the lowest priority sched_class, and can be preempted by any other
 * sched_class at any time. To address this, the scheduler implements
 * pair_cpu_release() and pair_cpu_acquire() callbacks which are invoked by
 * the core scheduler when the scheduler loses and gains control of the CPU
 * respectively.
 *
 * In pair_cpu_release(), we mark the pair_ctx as having been preempted, and
 * then invoke:
 *
 *	scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT);
 *
 * This preempts the pair CPU, and waits until it has re-entered the scheduler
 * before returning. This is necessary to ensure that the higher priority
 * sched_class that preempted our scheduler does not schedule a task
 * concurrently with our pair CPU.
 *
 * When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption
 * in the pair_ctx, and send another resched IPI to the pair CPU to re-enable
 * pair scheduling.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>
#include "scx_pair.h"

char _license[] SEC("license") = "GPL";

/* !0 for veristat, set during init */
const volatile u32 nr_cpu_ids = 1;

/* a pair of CPUs stay on a cgroup for this duration */
const volatile u32 pair_batch_dur_ns;

/* cpu ID -> pair cpu ID */
const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu);

/* cpu ID -> pair_id */
const volatile u32 RESIZABLE_ARRAY(rodata, pair_id);

/* CPU ID -> CPU # in the pair (0 or 1) */
const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx);

/*
 * State shared by the two CPUs of a pair. All fields other than the spin
 * lock itself are read and written under @lock (see try_dispatch() and the
 * cpu_acquire/release callbacks).
 */
struct pair_ctx {
	struct bpf_spin_lock lock;

	/* the cgroup the pair is currently executing */
	u64 cgid;

	/* the pair started executing the current cgroup at */
	u64 started_at;

	/* whether the current cgroup is draining */
	bool draining;

	/* the CPUs that are currently active on the cgroup */
	u32 active_mask;

	/*
	 * the CPUs that are currently preempted and running tasks in a
	 * different scheduler.
	 */
	u32 preempted_mask;
};

/* pair ID (see pair_id above) -> the pair's shared pair_ctx */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, u32);
	__type(value, struct pair_ctx);
} pair_ctx SEC(".maps");

/* queue of cgrp_q's possibly with tasks on them */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	/*
	 * Because it's difficult to build strong synchronization encompassing
	 * multiple non-trivial operations in BPF, this queue is managed in an
	 * opportunistic way so that we guarantee that a cgroup w/ active tasks
	 * is always on it but possibly multiple times. Once we have more robust
	 * synchronization constructs and e.g. linked list, we should be able to
	 * do this in a prettier way but for now just size it big enough.
	 */
	__uint(max_entries, 4 * MAX_CGRPS);
	__type(value, u64);
} top_q SEC(".maps");

/* per-cgroup q which FIFOs the tasks from the cgroup */
struct cgrp_q {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, MAX_QUEUED);
	__type(value, u32);
};

/*
 * Ideally, we want to allocate cgrp_q and cgrq_q_len in the cgroup local
 * storage; however, a cgroup local storage can only be accessed from the BPF
 * progs attached to the cgroup. For now, work around by allocating array of
 * cgrp_q's and then allocating per-cgroup indices.
 *
 * Another caveat: It's difficult to populate a large array of maps statically
 * or from BPF. Initialize it from userland.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, MAX_CGRPS);
	__type(key, s32);
	__array(values, struct cgrp_q);
} cgrp_q_arr SEC(".maps");

/* q_idx -> number of tasks queued on the corresponding cgrp_q */
static u64 cgrp_q_len[MAX_CGRPS];

/*
 * This and cgrp_q_idx_hash combine into a poor man's IDR. This likely would be
 * useful to have as a map type.
 */
static u32 cgrp_q_idx_cursor;
static u64 cgrp_q_idx_busy[MAX_CGRPS];

/*
 * All added up, the following is what we do:
 *
 * 1.
When a cgroup is enabled, RR cgroup_q_idx_busy array doing cmpxchg looking
 *    for a free ID. If not found, fail cgroup creation with -EBUSY.
 *
 * 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following
 *    cgrp_q_idx_hash.
 *
 * 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from
 *    cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr.
 *
 * This is sadly complicated for something pretty simple. Hopefully, we should
 * be able to simplify in the future.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, MAX_CGRPS);
	__uint(key_size, sizeof(u64));		/* cgrp ID */
	__uint(value_size, sizeof(s32));	/* cgrp_q idx */
} cgrp_q_idx_hash SEC(".maps");

/* statistics */
u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions;
u64 nr_exps, nr_exp_waits, nr_exp_empty;
u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty;

UEI_DEFINE(uei);

/*
 * Enqueue @p on its cgroup's FIFO. If the cgroup's queue length transitions
 * 0 -> 1, the cgroup ID is also pushed onto top_q so dispatching CPUs can
 * find the cgroup. Any lookup or overflow failure aborts the scheduler via
 * scx_bpf_error().
 */
void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct cgroup *cgrp;
	struct cgrp_q *cgq;
	s32 pid = p->pid;
	u64 cgid;
	u32 *q_idx;
	u64 *cgq_len;

	__sync_fetch_and_add(&nr_total, 1);

	/* the cgroup ID is the key for all per-cgroup lookups below */
	cgrp = scx_bpf_task_cgroup(p);
	cgid = cgrp->kn->id;
	bpf_cgroup_release(cgrp);

	/* find the cgroup's q and push @p into it */
	q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);
	if (!q_idx) {
		scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid);
		return;
	}

	cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx);
	if (!cgq) {
		scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]",
			      cgid, *q_idx);
		return;
	}

	/* tasks are queued by PID and looked up again at dispatch time */
	if (bpf_map_push_elem(cgq, &pid, 0)) {
		scx_bpf_error("cgroup[%llu] queue overflow", cgid);
		return;
	}

	/* bump q len, if going 0 -> 1, queue cgroup into the top_q */
	cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]);
	if (!cgq_len) {
		scx_bpf_error("MEMBER_VTPR malfunction");
		return;
	}

	if (!__sync_fetch_and_add(cgq_len, 1) &&
	    bpf_map_push_elem(&top_q, &cgid, 0)) {
		scx_bpf_error("top_q overflow");
		return;
	}
}

/*
 * Resolve @cpu to its pair's shared pair_ctx (*@pairc) and this CPU's bit
 * (*@mask) in the pair's active/preempted masks. Returns 0 on success,
 * -EINVAL if any of the per-CPU tables can't be accessed.
 */
static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask)
{
	u32 *vptr;

	vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids);
	if (!vptr)
		return -EINVAL;

	*pairc = bpf_map_lookup_elem(&pair_ctx, vptr);
	if (!(*pairc))
		return -EINVAL;

	vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids);
	if (!vptr)
		return -EINVAL;

	*mask = 1U << *vptr;

	return 0;
}

/*
 * Try to dispatch one task on @cpu while keeping both CPUs of the pair on
 * the same cgroup. Returns 0 when a task was dispatched, when there is
 * nothing to run, or when we must wait for the pair CPU; -EAGAIN when the
 * caller should retry (cgroup went empty or the popped task vanished);
 * -ENOENT on lookup failures, all of which have already raised
 * scx_bpf_error().
 *
 * NOTE(review): noinline presumably keeps the verifier's per-program state
 * manageable for the bpf_repeat() loops — confirm before removing.
 */
__attribute__((noinline))
static int try_dispatch(s32 cpu)
{
	struct pair_ctx *pairc;
	struct bpf_map *cgq_map;
	struct task_struct *p;
	u64 now = scx_bpf_now();
	bool kick_pair = false;
	bool expired, pair_preempted;
	u32 *vptr, in_pair_mask;
	s32 pid, q_idx;
	u64 cgid;
	int ret;

	ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);
	if (ret) {
		scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]",
			      cpu);
		return -ENOENT;
	}

	bpf_spin_lock(&pairc->lock);
	/* this CPU is no longer running a task from the current cgroup */
	pairc->active_mask &= ~in_pair_mask;

	expired = time_before(pairc->started_at + pair_batch_dur_ns, now);
	if (expired || pairc->draining) {
		u64 new_cgid = 0;

		__sync_fetch_and_add(&nr_exps, 1);

		/*
		 * We're done with the current cgid. An obvious optimization
		 * would be not draining if the next cgroup is the current one.
		 * For now, be dumb and always expire.
		 */
		pairc->draining = true;

		pair_preempted = pairc->preempted_mask;
		if (pairc->active_mask || pair_preempted) {
			/*
			 * The other CPU is still active, or is no longer under
			 * our control due to e.g. being preempted by a higher
			 * priority sched_class. We want to wait until this
			 * cgroup expires, or until control of our pair CPU has
			 * been returned to us.
			 *
			 * If the pair controls its CPU, and the time already
			 * expired, kick. When the other CPU arrives at
			 * dispatch and clears its active mask, it'll push the
			 * pair to the next cgroup and kick this CPU.
			 */
			__sync_fetch_and_add(&nr_exp_waits, 1);
			bpf_spin_unlock(&pairc->lock);
			if (expired && !pair_preempted)
				kick_pair = true;
			goto out_maybe_kick;
		}

		bpf_spin_unlock(&pairc->lock);

		/*
		 * Pick the next cgroup. It'd be easier / cleaner to not drop
		 * pairc->lock and use stronger synchronization here especially
		 * given that we'll be switching cgroups significantly less
		 * frequently than tasks. Unfortunately, bpf_spin_lock can't
		 * really protect anything non-trivial. Let's do opportunistic
		 * operations instead.
		 */
		bpf_repeat(BPF_MAX_LOOPS) {
			u32 *q_idx;
			u64 *cgq_len;

			if (bpf_map_pop_elem(&top_q, &new_cgid)) {
				/* no active cgroup, go idle */
				__sync_fetch_and_add(&nr_exp_empty, 1);
				return 0;
			}

			/* stale top_q entry for a since-removed cgroup, skip */
			q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid);
			if (!q_idx)
				continue;

			/*
			 * This is the only place where empty cgroups are taken
			 * off the top_q.
			 */
			cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]);
			if (!cgq_len || !*cgq_len)
				continue;

			/*
			 * If it has any tasks, requeue as we may race and not
			 * execute it.
			 */
			bpf_map_push_elem(&top_q, &new_cgid, 0);
			break;
		}

		bpf_spin_lock(&pairc->lock);

		/*
		 * The other CPU may already have started on a new cgroup while
		 * we dropped the lock. Make sure that we're still draining and
		 * start on the new cgroup.
		 */
		if (pairc->draining && !pairc->active_mask) {
			__sync_fetch_and_add(&nr_cgrp_next, 1);
			pairc->cgid = new_cgid;
			pairc->started_at = now;
			pairc->draining = false;
			/* the pair CPU may be idle, wake it up for the new cgroup */
			kick_pair = true;
		} else {
			__sync_fetch_and_add(&nr_cgrp_coll, 1);
		}
	}

	cgid = pairc->cgid;
	pairc->active_mask |= in_pair_mask;
	bpf_spin_unlock(&pairc->lock);

	/* again, it'd be better to do all these with the lock held, oh well */
	vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);
	if (!vptr) {
		scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid);
		return -ENOENT;
	}
	q_idx = *vptr;

	/* claim one task from cgrp_q w/ q_idx */
	bpf_repeat(BPF_MAX_LOOPS) {
		u64 *cgq_len, len;

		cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]);
		if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) {
			/* the cgroup must be empty, expire and repeat */
			__sync_fetch_and_add(&nr_cgrp_empty, 1);
			bpf_spin_lock(&pairc->lock);
			pairc->draining = true;
			pairc->active_mask &= ~in_pair_mask;
			bpf_spin_unlock(&pairc->lock);
			return -EAGAIN;
		}

		/* cmpxchg so that concurrent claimants can't over-consume */
		if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len)
			continue;

		break;
	}

	cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx);
	if (!cgq_map) {
		scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]",
			      cgid, q_idx);
		return -ENOENT;
	}

	if (bpf_map_pop_elem(cgq_map, &pid)) {
		scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]",
			      cgid, q_idx);
		return -ENOENT;
	}

	p = bpf_task_from_pid(pid);
	if (p) {
		__sync_fetch_and_add(&nr_dispatched, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
		bpf_task_release(p);
	} else {
		/* we don't handle dequeues, retry on lost tasks */
		__sync_fetch_and_add(&nr_missing, 1);
		return -EAGAIN;
	}

out_maybe_kick:
	if (kick_pair) {
		s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);
		if (pair) {
			__sync_fetch_and_add(&nr_kicks, 1);
			scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT);
		}
	}
	return 0;
}

/* keep retrying until try_dispatch() reaches a decision other than -EAGAIN */
void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev)
{
	bpf_repeat(BPF_MAX_LOOPS) {
		if (try_dispatch(cpu) != -EAGAIN)
			break;
	}
}

/*
 * @cpu is back under our control. Clear its preempted bit and, unless the
 * pair CPU is itself still preempted, kick the pair CPU to resume pair
 * scheduling.
 */
void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args)
{
	int ret;
	u32 in_pair_mask;
	struct pair_ctx *pairc;
	bool kick_pair;

	ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);
	if (ret)
		return;

	bpf_spin_lock(&pairc->lock);
	pairc->preempted_mask &= ~in_pair_mask;
	/* Kick the pair CPU, unless it was also preempted. */
	kick_pair = !pairc->preempted_mask;
	bpf_spin_unlock(&pairc->lock);

	if (kick_pair) {
		s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);

		if (pair) {
			__sync_fetch_and_add(&nr_kicks, 1);
			scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT);
		}
	}
}

/*
 * A higher priority sched_class took @cpu from us. Mark this CPU preempted
 * and inactive, start draining the cgroup, and if the pair CPU is still
 * running one of our tasks, preempt it and wait (SCX_KICK_WAIT) until it has
 * re-entered the scheduler so it can't run concurrently with the preempting
 * class (see the "Dealing with preemption" section in the header comment).
 */
void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	int ret;
	u32 in_pair_mask;
	struct pair_ctx *pairc;
	bool kick_pair;

	ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);
	if (ret)
		return;

	bpf_spin_lock(&pairc->lock);
	pairc->preempted_mask |= in_pair_mask;
	pairc->active_mask &= ~in_pair_mask;
	/* Kick the pair CPU if it's still running. */
	kick_pair = pairc->active_mask;
	pairc->draining = true;
	bpf_spin_unlock(&pairc->lock);

	if (kick_pair) {
		s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);

		if (pair) {
			__sync_fetch_and_add(&nr_kicks, 1);
			scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT);
		}
	}
	__sync_fetch_and_add(&nr_preemptions, 1);
}

/*
 * Allocate a free cgrp_q index for @cgrp by round-robinning over
 * cgrp_q_idx_busy with cmpxchg, then record the cgid -> q_idx mapping.
 * Returns -EBUSY when all indices are taken or the hash insert fails (the
 * claimed index is released in the latter case).
 */
s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp)
{
	u64 cgid = cgrp->kn->id;
	s32 i, q_idx;

	bpf_for(i, 0, MAX_CGRPS) {
		q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS;
		if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1))
			break;
	}
	if (i == MAX_CGRPS)
		return -EBUSY;

	if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) {
		u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]);
		if (busy)
			*busy = 0;
		return -EBUSY;
	}

	return 0;
}

/* release @cgrp's q_idx back to the busy array and drop its hash entry */
void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp)
{
	u64 cgid = cgrp->kn->id;
	s32 *q_idx;

	q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);
	if (q_idx) {
		u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]);
		if (busy)
			*busy = 0;
		bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid);
	}
}

/* record exit info so userspace can report why the scheduler exited */
void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(pair_ops,
	       .enqueue		= (void *)pair_enqueue,
	       .dispatch	= (void *)pair_dispatch,
	       .cpu_acquire	= (void *)pair_cpu_acquire,
	       .cpu_release	= (void *)pair_cpu_release,
	       .cgroup_init	= (void *)pair_cgroup_init,
	       .cgroup_exit	= (void *)pair_cgroup_exit,
	       .exit		= (void *)pair_exit,
	       .name		= "pair");