/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A demo sched_ext flattened cgroup hierarchy scheduler. It implements
 * hierarchical weight-based cgroup CPU control by flattening the cgroup
 * hierarchy into a single layer by compounding the active weight share at each
 * level. Consider the following hierarchy with weights in parentheses:
 *
 * R + A (100) + B (100)
 *   |          \ C (100)
 *   \ D (200)
 *
 * Ignoring the root and threaded cgroups, only B, C and D can contain tasks.
 * Let's say all three have runnable tasks. The total share that each of these
 * three cgroups is entitled to can be calculated by compounding its share at
 * each level.
 *
 * For example, B is competing against C and in that competition its share is
 * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's
 * share in that competition is 100/(200+100) == 1/3. B's eventual share in the
 * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's
 * eventual share is the same at 1/6. D is only competing at the top level and
 * its share is 200/(100+200) == 2/3.
 *
 * So, instead of hierarchically scheduling level-by-level, we can consider it
 * as B, C and D competing with each other with respective shares of 1/6, 1/6
 * and 2/3 and keep updating the eventual shares as the cgroups' runnable
 * states change.
 *
 * This flattening of hierarchy can bring a substantial performance gain when
 * the cgroup hierarchy is nested multiple levels. In a simple benchmark using
 * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it
 * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two
 * apache instances competing with 2:1 weight ratio nested four level deep.
 *
 * However, the gain comes at the cost of not being able to properly handle
 * thundering herds of cgroups. For example, if many cgroups which are nested
 * behind a low priority parent cgroup wake up around the same time, they may be
 * able to consume more CPU cycles than they are entitled to. In many use cases,
 * this isn't a real concern especially given the performance gain. Also, there
 * are ways to mitigate the problem further by e.g. introducing an extra
 * scheduling layer on cgroup delegation boundaries.
 *
 * The scheduler first picks the cgroup to run and then schedules the tasks
 * within it using nested weighted vtime scheduling by default. The
 * cgroup-internal scheduling can be switched to FIFO with the -f option.
 */
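
/*
 * Implementation overview:
 *
 * - Each cgroup's eventual share is cached as a fixed-point "hweight" where
 *   FCG_HWEIGHT_ONE represents a full share; see cgrp_refresh_hweight().
 * - Runnable cgroups compete on a single rbtree, cgv_tree, ordered by cgroup
 *   vtime (cvtime) which advances in inverse proportion to hweight.
 * - Each cgroup gets its own dispatch queue keyed by its cgroup ID;
 *   fcg_dispatch() picks the front cgroup off cgv_tree and consumes from its
 *   dq for up to cgrp_slice_ns.
 */
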
#include <scx/common.bpf.h>
#include "scx_flatcg.h"

/*
 * Maximum amount of retries to find a valid cgroup.
 */
#define CGROUP_MAX_RETRIES 1024

char _license[] SEC("license") = "GPL";

const volatile u32 nr_cpus = 32;	/* !0 for veristat, set during init */
const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL;
const volatile bool fifo_sched;

u64 cvtime_now;
UEI_DEFINE(uei);

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, u64);
	__uint(max_entries, FCG_NR_STATS);
} stats SEC(".maps");

static void stat_inc(enum fcg_stat_idx idx)
{
	u32 idx_v = idx;

	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v);
	if (cnt_p)
		(*cnt_p)++;
}

struct fcg_cpu_ctx {
	u64			cur_cgid;
	u64			cur_at;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct fcg_cpu_ctx);
	__uint(max_entries, 1);
} cpu_ctx SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct fcg_cgrp_ctx);
} cgrp_ctx SEC(".maps");

struct cgv_node {
	struct bpf_rb_node	rb_node;
	__u64			cvtime;
	__u64			cgid;
};

private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock;
private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node);

struct cgv_node_stash {
	struct cgv_node __kptr *node;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16384);
	__type(key, __u64);
	__type(value, struct cgv_node_stash);
} cgv_node_stash SEC(".maps");

struct fcg_task_ctx {
	u64		bypassed_at;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct fcg_task_ctx);
} task_ctx SEC(".maps");

/* gets inc'd on weight tree changes to expire the cached hweights */
u64 hweight_gen = 1;

static u64 div_round_up(u64 dividend, u64 divisor)
{
	return (dividend + divisor - 1) / divisor;
}

static bool vtime_before(u64 a, u64 b)
{
	return (s64)(a - b) < 0;
}

static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
{
	struct cgv_node *cgc_a, *cgc_b;

	cgc_a = container_of(a, struct cgv_node, rb_node);
	cgc_b = container_of(b, struct cgv_node, rb_node);

	return cgc_a->cvtime < cgc_b->cvtime;
}

static struct fcg_cpu_ctx *find_cpu_ctx(void)
{
	struct fcg_cpu_ctx *cpuc;
	u32 idx = 0;

	cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx);
	if (!cpuc) {
		scx_bpf_error("cpu_ctx lookup failed");
		return NULL;
	}
	return cpuc;
}

static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp)
{
	struct fcg_cgrp_ctx *cgc;

	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
	if (!cgc) {
		scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id);
		return NULL;
	}
	return cgc;
}

static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level)
{
	struct fcg_cgrp_ctx *cgc;

	cgrp = bpf_cgroup_ancestor(cgrp, level);
	if (!cgrp) {
		scx_bpf_error("ancestor cgroup lookup failed");
		return NULL;
	}

	cgc = find_cgrp_ctx(cgrp);
	if (!cgc)
		scx_bpf_error("ancestor cgrp_ctx lookup failed");
	bpf_cgroup_release(cgrp);
	return cgc;
}
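
/*
 * Compute @cgrp's flattened hierarchical weight (hweight) by walking down from
 * the root and compounding the share at each level in FCG_HWEIGHT_ONE fixed
 * point. For the hierarchy in the header comment, with B, C and D all
 * runnable, cgroup B ends up with
 *
 *	hweight(B) = FCG_HWEIGHT_ONE * 100 / (100 + 200) * 100 / (100 + 100)
 *		   = FCG_HWEIGHT_ONE / 6
 *
 * matching the 1/6 eventual share derived there. The result is cached and
 * reused until hweight_gen moves on.
 */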
static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
{
	int level;

	if (!cgc->nr_active) {
		stat_inc(FCG_STAT_HWT_SKIP);
		return;
	}

	if (cgc->hweight_gen == hweight_gen) {
		stat_inc(FCG_STAT_HWT_CACHE);
		return;
	}

	stat_inc(FCG_STAT_HWT_UPDATES);
	bpf_for(level, 0, cgrp->level + 1) {
		struct fcg_cgrp_ctx *cgc;
		bool is_active;

		cgc = find_ancestor_cgrp_ctx(cgrp, level);
		if (!cgc)
			break;

		if (!level) {
			cgc->hweight = FCG_HWEIGHT_ONE;
			cgc->hweight_gen = hweight_gen;
		} else {
			struct fcg_cgrp_ctx *pcgc;

			pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1);
			if (!pcgc)
				break;

			/*
			 * We could be opportunistic here and not grab the
			 * cgv_tree_lock and deal with the occasional races.
			 * However, hweight updates are already cached and
			 * relatively low-frequency. Let's just do the
			 * straightforward thing.
			 */
			bpf_spin_lock(&cgv_tree_lock);
			is_active = cgc->nr_active;
			if (is_active) {
				cgc->hweight_gen = pcgc->hweight_gen;
				cgc->hweight =
					div_round_up(pcgc->hweight * cgc->weight,
						     pcgc->child_weight_sum);
			}
			bpf_spin_unlock(&cgv_tree_lock);

			if (!is_active) {
				stat_inc(FCG_STAT_HWT_RACE);
				break;
			}
		}
	}
}
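
/*
 * Cap how far behind cvtime_now a cgroup's cvtime may lag (i.e. how much
 * budget it can carry) when it's being queued. As a concrete example, on a
 * 32-CPU machine a full-hweight cgroup may lag by at most
 *
 *	cgrp_slice_ns * 32 / 2 == 16 cgroup slices
 *
 * which lets it immediately occupy at most half of the CPUs.
 */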
static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc)
{
	u64 delta, cvtime, max_budget;

	/*
	 * A node which is on the rbtree can't be pointed to from elsewhere yet
	 * and thus can't be updated and repositioned. Instead, we collect the
	 * vtime deltas separately and apply it asynchronously here.
	 */
	delta = cgc->cvtime_delta;
	__sync_fetch_and_sub(&cgc->cvtime_delta, delta);
	cvtime = cgv_node->cvtime + delta;

	/*
	 * Allow a cgroup to carry the maximum budget proportional to its
	 * hweight such that a full-hweight cgroup can immediately take up half
	 * of the CPUs at the most while staying at the front of the rbtree.
	 */
	max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) /
		(2 * FCG_HWEIGHT_ONE);
	if (vtime_before(cvtime, cvtime_now - max_budget))
		cvtime = cvtime_now - max_budget;

	cgv_node->cvtime = cvtime;
}

static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
{
	struct cgv_node_stash *stash;
	struct cgv_node *cgv_node;
	u64 cgid = cgrp->kn->id;

	/* paired with cmpxchg in try_pick_next_cgroup() */
	if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) {
		stat_inc(FCG_STAT_ENQ_SKIP);
		return;
	}

	stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
	if (!stash) {
		scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid);
		return;
	}

	/* NULL if the node is already on the rbtree */
	cgv_node = bpf_kptr_xchg(&stash->node, NULL);
	if (!cgv_node) {
		stat_inc(FCG_STAT_ENQ_RACE);
		return;
	}

	bpf_spin_lock(&cgv_tree_lock);
	cgrp_cap_budget(cgv_node, cgc);
	bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
	bpf_spin_unlock(&cgv_tree_lock);
}

static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc)
{
	/*
	 * Tell fcg_stopping() that this bypassed the regular scheduling path
	 * and should be force charged to the cgroup. 0 is used to indicate that
	 * the task isn't bypassing, so if the current runtime is 0, go back by
	 * one nanosecond.
	 */
	taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
}

s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	struct fcg_task_ctx *taskc;
	bool is_idle = false;
	s32 cpu;

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
	if (!taskc) {
		scx_bpf_error("task_ctx lookup failed");
		return cpu;
	}

	/*
	 * If select_cpu_dfl() is recommending local enqueue, the target CPU is
	 * idle. Follow it and charge the cgroup later in fcg_stopping() after
	 * the fact.
	 */
	if (is_idle) {
		set_bypassed_at(p, taskc);
		stat_inc(FCG_STAT_LOCAL);
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	}

	return cpu;
}

void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct fcg_task_ctx *taskc;
	struct cgroup *cgrp;
	struct fcg_cgrp_ctx *cgc;

	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
	if (!taskc) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	/*
	 * Use the direct dispatching and force charging to deal with tasks with
	 * custom affinities so that we don't have to worry about per-cgroup
	 * dq's containing tasks that can't be executed from some CPUs.
	 */
	if (p->nr_cpus_allowed != nr_cpus) {
		set_bypassed_at(p, taskc);

		/*
		 * The global dq is deprioritized as we don't want to let tasks
		 * boost themselves by constraining their cpumasks. The
		 * deprioritization is rather severe, so let's not apply that to
		 * per-cpu kernel threads. This is ham-fisted. We probably want
		 * to implement per-cgroup fallback dq's instead so that we have
		 * more control over when tasks with custom cpumasks get issued.
		 */
		if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) {
			stat_inc(FCG_STAT_LOCAL);
			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
		} else {
			stat_inc(FCG_STAT_GLOBAL);
			scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
		}
		return;
	}

	cgrp = scx_bpf_task_cgroup(p);
	cgc = find_cgrp_ctx(cgrp);
	if (!cgc)
		goto out_release;

	if (fifo_sched) {
		scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags);
	} else {
		u64 tvtime = p->scx.dsq_vtime;

		/*
		 * Limit the amount of budget that an idling task can accumulate
		 * to one slice.
		 */
		if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL))
			tvtime = cgc->tvtime_now - SCX_SLICE_DFL;

		scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL,
				       tvtime, enq_flags);
	}

	cgrp_enqueued(cgrp, cgc);
out_release:
	bpf_cgroup_release(cgrp);
}

/*
 * Walk the cgroup tree to update the active weight sums as tasks wake up and
 * sleep. The weight sums are used as the base when calculating the proportion a
 * given cgroup or task is entitled to at each level.
 */
static void update_active_weight_sums(struct cgroup *cgrp, bool runnable)
{
	struct fcg_cgrp_ctx *cgc;
	bool updated = false;
	int idx;

	cgc = find_cgrp_ctx(cgrp);
	if (!cgc)
		return;

	/*
	 * In most cases, a hot cgroup would have multiple threads going to
	 * sleep and waking up while the whole cgroup stays active. In leaf
	 * cgroups, ->nr_runnable which is updated with __sync operations gates
	 * ->nr_active updates, so that we don't have to grab the cgv_tree_lock
	 * repeatedly for a busy cgroup which is staying active.
	 */
	if (runnable) {
		if (__sync_fetch_and_add(&cgc->nr_runnable, 1))
			return;
		stat_inc(FCG_STAT_ACT);
	} else {
		if (__sync_sub_and_fetch(&cgc->nr_runnable, 1))
			return;
		stat_inc(FCG_STAT_DEACT);
	}

	/*
	 * If @cgrp is becoming runnable, its hweight should be refreshed after
	 * it's added to the weight tree so that enqueue has the up-to-date
	 * value. If @cgrp is becoming quiescent, the hweight should be
	 * refreshed before it's removed from the weight tree so that the usage
	 * charging which happens afterwards has access to the latest value.
	 */
	if (!runnable)
		cgrp_refresh_hweight(cgrp, cgc);

	/* propagate upwards */
	bpf_for(idx, 0, cgrp->level) {
		int level = cgrp->level - idx;
		struct fcg_cgrp_ctx *cgc, *pcgc = NULL;
		bool propagate = false;

		cgc = find_ancestor_cgrp_ctx(cgrp, level);
		if (!cgc)
			break;
		if (level) {
			pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1);
			if (!pcgc)
				break;
		}

		/*
		 * We need the propagation protected by a lock to synchronize
		 * against weight changes. There's no reason to drop the lock at
		 * each level but bpf_spin_lock() doesn't want any function
		 * calls while locked.
		 */
		bpf_spin_lock(&cgv_tree_lock);

		if (runnable) {
			if (!cgc->nr_active++) {
				updated = true;
				if (pcgc) {
					propagate = true;
					pcgc->child_weight_sum += cgc->weight;
				}
			}
		} else {
			if (!--cgc->nr_active) {
				updated = true;
				if (pcgc) {
					propagate = true;
					pcgc->child_weight_sum -= cgc->weight;
				}
			}
		}

		bpf_spin_unlock(&cgv_tree_lock);

		if (!propagate)
			break;
	}

	if (updated)
		__sync_fetch_and_add(&hweight_gen, 1);

	if (runnable)
		cgrp_refresh_hweight(cgrp, cgc);
}

void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
{
	struct cgroup *cgrp;

	cgrp = scx_bpf_task_cgroup(p);
	update_active_weight_sums(cgrp, true);
	bpf_cgroup_release(cgrp);
}

void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
{
	struct cgroup *cgrp;
	struct fcg_cgrp_ctx *cgc;

	if (fifo_sched)
		return;

	cgrp = scx_bpf_task_cgroup(p);
	cgc = find_cgrp_ctx(cgrp);
	if (cgc) {
		/*
		 * @cgc->tvtime_now always progresses forward as tasks start
		 * executing. The test and update can be performed concurrently
		 * from multiple CPUs and are thus racy. Any error should be
		 * contained and temporary. Let's just live with it.
		 */
		if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime))
			cgc->tvtime_now = p->scx.dsq_vtime;
	}
	bpf_cgroup_release(cgrp);
}
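
/*
 * The vtime charge below is normalized by 100 / p->scx.weight, 100
 * corresponding to the default scheduling weight, so a task with twice the
 * default weight accrues vtime at half the rate for the same execution time.
 */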
void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
{
	struct fcg_task_ctx *taskc;
	struct cgroup *cgrp;
	struct fcg_cgrp_ctx *cgc;

	/*
	 * Scale the execution time by the inverse of the weight and charge.
	 *
	 * Note that the default yield implementation yields by setting
	 * @p->scx.slice to zero and the following would treat the yielding task
	 * as if it has consumed all its slice. If this penalizes yielding tasks
	 * too much, determine the execution time by taking explicit timestamps
	 * instead of depending on @p->scx.slice.
	 */
	if (!fifo_sched)
		p->scx.dsq_vtime +=
			(SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;

	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
	if (!taskc) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	if (!taskc->bypassed_at)
		return;

	cgrp = scx_bpf_task_cgroup(p);
	cgc = find_cgrp_ctx(cgrp);
	if (cgc) {
		__sync_fetch_and_add(&cgc->cvtime_delta,
				     p->se.sum_exec_runtime - taskc->bypassed_at);
		taskc->bypassed_at = 0;
	}
	bpf_cgroup_release(cgrp);
}

void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
{
	struct cgroup *cgrp;

	cgrp = scx_bpf_task_cgroup(p);
	update_active_weight_sums(cgrp, false);
	bpf_cgroup_release(cgrp);
}
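
/*
 * A cgroup's weight knob changed. If @cgrp is currently active, adjust its
 * contribution to the parent's child_weight_sum under cgv_tree_lock; the
 * compounded hweights themselves are refreshed lazily through
 * cgrp_refresh_hweight().
 */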
void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	struct fcg_cgrp_ctx *cgc, *pcgc = NULL;

	cgc = find_cgrp_ctx(cgrp);
	if (!cgc)
		return;

	if (cgrp->level) {
		pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1);
		if (!pcgc)
			return;
	}

	bpf_spin_lock(&cgv_tree_lock);
	if (pcgc && cgc->nr_active)
		pcgc->child_weight_sum += (s64)weight - cgc->weight;
	cgc->weight = weight;
	bpf_spin_unlock(&cgv_tree_lock);
}
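
/*
 * Pop the front cgroup off cgv_tree and try to consume from its dq. The
 * cgv_node for a cgroup normally lives either in cgv_node_stash (not queued)
 * or on cgv_tree (queued); the ->queued flag, paired with the cmpxchg in
 * cgrp_enqueued(), closes the race between a cgroup getting re-enqueued and
 * its node being stashed again here. Returns true if dispatching is done for
 * now (a cgroup was picked or none is available), false if the caller should
 * retry.
 */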
static bool try_pick_next_cgroup(u64 *cgidp)
{
	struct bpf_rb_node *rb_node;
	struct cgv_node_stash *stash;
	struct cgv_node *cgv_node;
	struct fcg_cgrp_ctx *cgc;
	struct cgroup *cgrp;
	u64 cgid;

	/* pop the front cgroup and wind cvtime_now accordingly */
	bpf_spin_lock(&cgv_tree_lock);

	rb_node = bpf_rbtree_first(&cgv_tree);
	if (!rb_node) {
		bpf_spin_unlock(&cgv_tree_lock);
		stat_inc(FCG_STAT_PNC_NO_CGRP);
		*cgidp = 0;
		return true;
	}

	rb_node = bpf_rbtree_remove(&cgv_tree, rb_node);
	bpf_spin_unlock(&cgv_tree_lock);

	if (!rb_node) {
		/*
		 * This should never happen. bpf_rbtree_first() was called
		 * above while the tree lock was held, so the node should
		 * always be present.
		 */
		scx_bpf_error("node could not be removed");
		return true;
	}

	cgv_node = container_of(rb_node, struct cgv_node, rb_node);
	cgid = cgv_node->cgid;

	if (vtime_before(cvtime_now, cgv_node->cvtime))
		cvtime_now = cgv_node->cvtime;

	/*
	 * If lookup fails, the cgroup's gone. Free and move on. See
	 * fcg_cgroup_exit().
	 */
	cgrp = bpf_cgroup_from_id(cgid);
	if (!cgrp) {
		stat_inc(FCG_STAT_PNC_GONE);
		goto out_free;
	}

	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
	if (!cgc) {
		bpf_cgroup_release(cgrp);
		stat_inc(FCG_STAT_PNC_GONE);
		goto out_free;
	}

	if (!scx_bpf_consume(cgid)) {
		bpf_cgroup_release(cgrp);
		stat_inc(FCG_STAT_PNC_EMPTY);
		goto out_stash;
	}

	/*
	 * Successfully consumed from the cgroup. This will be our current
	 * cgroup for the new slice. Refresh its hweight.
	 */
	cgrp_refresh_hweight(cgrp, cgc);

	bpf_cgroup_release(cgrp);

	/*
	 * As the cgroup may have more tasks, add it back to the rbtree. Note
	 * that here we charge the full slice upfront and then exact later
	 * according to the actual consumption. This prevents lowpri thundering
	 * herd from saturating the machine.
	 */
	bpf_spin_lock(&cgv_tree_lock);
	cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1);
	cgrp_cap_budget(cgv_node, cgc);
	bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
	bpf_spin_unlock(&cgv_tree_lock);

	*cgidp = cgid;
	stat_inc(FCG_STAT_PNC_NEXT);
	return true;

out_stash:
	stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
	if (!stash) {
		stat_inc(FCG_STAT_PNC_GONE);
		goto out_free;
	}

	/*
	 * Paired with cmpxchg in cgrp_enqueued(). If they see the following
	 * transition, they'll enqueue the cgroup. If they are earlier, we'll
	 * see their task in the dq below and requeue the cgroup.
	 */
	__sync_val_compare_and_swap(&cgc->queued, 1, 0);

	if (scx_bpf_dsq_nr_queued(cgid)) {
		bpf_spin_lock(&cgv_tree_lock);
		bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
		bpf_spin_unlock(&cgv_tree_lock);
		stat_inc(FCG_STAT_PNC_RACE);
	} else {
		cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
		if (cgv_node) {
			scx_bpf_error("unexpected !NULL cgv_node stash");
			goto out_free;
		}
	}

	return false;

out_free:
	bpf_obj_drop(cgv_node);
	return false;
}
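
/*
 * Per-CPU dispatch path. A CPU keeps consuming from cpuc->cur_cgid until the
 * cgroup slice which started at cpuc->cur_at expires or the cgroup's dq runs
 * dry. Only then is the charge for the expiring slice settled and the next
 * cgroup picked off cgv_tree, after first consuming from SCX_DSQ_GLOBAL.
 */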
void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
{
	struct fcg_cpu_ctx *cpuc;
	struct fcg_cgrp_ctx *cgc;
	struct cgroup *cgrp;
	u64 now = bpf_ktime_get_ns();
	bool picked_next = false;

	cpuc = find_cpu_ctx();
	if (!cpuc)
		return;

	if (!cpuc->cur_cgid)
		goto pick_next_cgroup;

	if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) {
		if (scx_bpf_consume(cpuc->cur_cgid)) {
			stat_inc(FCG_STAT_CNS_KEEP);
			return;
		}
		stat_inc(FCG_STAT_CNS_EMPTY);
	} else {
		stat_inc(FCG_STAT_CNS_EXPIRE);
	}

	/*
	 * The current cgroup is expiring. It was already charged a full slice.
	 * Calculate the actual usage and accumulate the delta.
	 */
	cgrp = bpf_cgroup_from_id(cpuc->cur_cgid);
	if (!cgrp) {
		stat_inc(FCG_STAT_CNS_GONE);
		goto pick_next_cgroup;
	}

	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
	if (cgc) {
		/*
		 * We want to update the vtime delta and then look for the next
		 * cgroup to execute but the latter needs to be done in a loop
		 * and we can't keep the lock held. Oh well...
		 */
		bpf_spin_lock(&cgv_tree_lock);
		__sync_fetch_and_add(&cgc->cvtime_delta,
				     (cpuc->cur_at + cgrp_slice_ns - now) *
				     FCG_HWEIGHT_ONE / (cgc->hweight ?: 1));
		bpf_spin_unlock(&cgv_tree_lock);
	} else {
		stat_inc(FCG_STAT_CNS_GONE);
	}

	bpf_cgroup_release(cgrp);

pick_next_cgroup:
	cpuc->cur_at = now;

	if (scx_bpf_consume(SCX_DSQ_GLOBAL)) {
		cpuc->cur_cgid = 0;
		return;
	}

	bpf_repeat(CGROUP_MAX_RETRIES) {
		if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
			picked_next = true;
			break;
		}
	}

	/*
	 * This only happens if try_pick_next_cgroup() races against the enqueue
	 * path for more than CGROUP_MAX_RETRIES times, which is extremely
	 * unlikely and likely indicates an underlying bug. There shouldn't be
	 * any stall risk as the race is against enqueue.
	 */
	if (!picked_next)
		stat_inc(FCG_STAT_PNC_FAIL);
}

s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	struct fcg_task_ctx *taskc;
	struct fcg_cgrp_ctx *cgc;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	taskc = bpf_task_storage_get(&task_ctx, p, 0,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!taskc)
		return -ENOMEM;

	taskc->bypassed_at = 0;

	if (!(cgc = find_cgrp_ctx(args->cgroup)))
		return -ENOENT;

	p->scx.dsq_vtime = cgc->tvtime_now;

	return 0;
}

int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp,
			     struct scx_cgroup_init_args *args)
{
	struct fcg_cgrp_ctx *cgc;
	struct cgv_node *cgv_node;
	struct cgv_node_stash empty_stash = {}, *stash;
	u64 cgid = cgrp->kn->id;
	int ret;

	/*
	 * Technically incorrect as cgroup ID is full 64bit while dq ID is
	 * 63bit. Should not be a problem in practice and easy to spot in the
	 * unlikely case that it breaks.
	 */
	ret = scx_bpf_create_dsq(cgid, -1);
	if (ret)
		return ret;

	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0,
				   BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!cgc) {
		ret = -ENOMEM;
		goto err_destroy_dsq;
	}

	cgc->weight = args->weight;
	cgc->hweight = FCG_HWEIGHT_ONE;

	ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash,
				  BPF_NOEXIST);
	if (ret) {
		if (ret != -ENOMEM)
			scx_bpf_error("unexpected stash creation error (%d)",
				      ret);
		goto err_destroy_dsq;
	}

	stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
	if (!stash) {
		scx_bpf_error("unexpected cgv_node stash lookup failure");
		ret = -ENOENT;
		goto err_destroy_dsq;
	}

	cgv_node = bpf_obj_new(struct cgv_node);
	if (!cgv_node) {
		ret = -ENOMEM;
		goto err_del_cgv_node;
	}

	cgv_node->cgid = cgid;
	cgv_node->cvtime = cvtime_now;

	cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
	if (cgv_node) {
		scx_bpf_error("unexpected !NULL cgv_node stash");
		ret = -EBUSY;
		goto err_drop;
	}

	return 0;

err_drop:
	bpf_obj_drop(cgv_node);
err_del_cgv_node:
	bpf_map_delete_elem(&cgv_node_stash, &cgid);
err_destroy_dsq:
	scx_bpf_destroy_dsq(cgid);
	return ret;
}

void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp)
{
	u64 cgid = cgrp->kn->id;

	/*
	 * For now, there's no way to find and remove the cgv_node if it's on
	 * the cgv_tree. Let's drain them in the dispatch path as they get
	 * popped off the front of the tree.
	 */
	bpf_map_delete_elem(&cgv_node_stash, &cgid);
	scx_bpf_destroy_dsq(cgid);
}
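
/*
 * @p is moving from @from to @to. Rebase its vtime from the old cgroup's
 * tvtime_now onto the new cgroup's so that the task keeps its relative
 * position in the destination's vtime ordering.
 */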
void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p,
		    struct cgroup *from, struct cgroup *to)
{
	struct fcg_cgrp_ctx *from_cgc, *to_cgc;
	s64 vtime_delta;

	/* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */
	if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to)))
		return;

	vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now;
	p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta;
}

void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(flatcg_ops,
	       .select_cpu		= (void *)fcg_select_cpu,
	       .enqueue			= (void *)fcg_enqueue,
	       .dispatch		= (void *)fcg_dispatch,
	       .runnable		= (void *)fcg_runnable,
	       .running			= (void *)fcg_running,
	       .stopping		= (void *)fcg_stopping,
	       .quiescent		= (void *)fcg_quiescent,
	       .init_task		= (void *)fcg_init_task,
	       .cgroup_set_weight	= (void *)fcg_cgroup_set_weight,
	       .cgroup_init		= (void *)fcg_cgroup_init,
	       .cgroup_exit		= (void *)fcg_cgroup_exit,
	       .cgroup_move		= (void *)fcg_cgroup_move,
	       .exit			= (void *)fcg_exit,
	       .flags			= SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING,
	       .name			= "flatcg");