/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Arena-based task data scheduler. This is a variation of scx_simple
 * that uses a combined allocator and indexing structure to organize
 * task data. Task context allocation is done when a task enters the
 * scheduler, while freeing is done when it exits. Task contexts are
 * retrieved from task-local storage, pointing to the allocated memory.
 *
 * The main purpose of this scheduler is to demonstrate arena memory
 * management.
 *
 * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2024-2025 Emil Tsalapatis <etsal@meta.com>
 * Copyright (c) 2024-2025 Tejun Heo <tj@kernel.org>
 *
 */
#include <scx/common.bpf.h>
#include <scx/bpf_arena_common.bpf.h>

#include "scx_sdt.h"

char _license[] SEC("license") = "GPL";

UEI_DEFINE(uei);

/*
 * Backing arena for all allocator memory. The arena is mmap()-able from
 * userspace; arm64 gets a smaller arena at a lower fixed address because
 * of its smaller usable virtual address space.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
	__uint(max_entries, 1 << 16); /* number of pages */
	__ulong(map_extra, (1ull << 32)); /* start of mmap() region */
#else
	__uint(max_entries, 1 << 20); /* number of pages */
	__ulong(map_extra, (1ull << 44)); /* start of mmap() region */
#endif
} arena __weak SEC(".maps");

#define SHARED_DSQ 0

/*
 * For each metric, define an inline incrementer operating on the
 * arena-resident per-task stats plus a global counter the per-task
 * values are folded into on task exit (see scx_stat_global_update()).
 */
#define DEFINE_SDT_STAT(metric)					\
static inline void						\
stat_inc_##metric(struct scx_stats __arena *stats)		\
{								\
	cast_kern(stats);					\
	stats->metric += 1;					\
}								\
__u64 stat_##metric;						\

DEFINE_SDT_STAT(enqueue);
DEFINE_SDT_STAT(init);
DEFINE_SDT_STAT(exit);
DEFINE_SDT_STAT(select_idle_cpu);
DEFINE_SDT_STAT(select_busy_cpu);

/*
 * Necessary for cond_break/can_loop's semantics. According to kernel commit
 * 011832b, the loop counter variable must be seen as imprecise and bounded
 * by the verifier. Initializing it from a constant (e.g., i = 0;), then,
 * makes it precise and prevents may_goto from helping with converging the
 * loop. For these loops we must initialize the loop counter from a variable
 * whose value the verifier cannot reason about when checking the program, so
 * that the loop counter's value is imprecise.
 */
static __u64 zero = 0;

/*
 * XXX Hack to get the verifier to find the arena for sdt_exit_task.
 * As of 6.12-rc5, The verifier associates arenas with programs by
 * checking LD.IMM instruction operands for an arena and populating
 * the program state with the first instance it finds. This requires
 * accessing our global arena variable, but scx methods do not necessarily
 * do so while still using pointers from that arena. Insert a bpf_printk
 * statement that triggers at most once to generate an LD.IMM instruction
 * to access the arena and help the verifier.
 */
static volatile bool scx_arena_verify_once;

__hidden void scx_arena_subprog_init(void)
{
	if (scx_arena_verify_once)
		return;

	bpf_printk("%s: arena pointer %p", __func__, &arena);
	scx_arena_verify_once = true;
}


private(LOCK) struct bpf_spin_lock alloc_lock;
private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock;

/* allocation pools */
struct sdt_pool desc_pool;
struct sdt_pool chunk_pool;

/* Protected by alloc_lock. */
struct scx_alloc_stats alloc_stats;


/*
 * Allocate an element from the pool. Must be called with the pool lock held.
 * Elements are handed out by bumping pool->idx through the current slab;
 * a fresh slab is allocated from the arena when the current one is spent.
 * Arena pages arrive zeroed, so elements need no explicit initialization
 * here. Note that pool elements are never returned to the arena.
 */
static
void __arena *scx_alloc_from_pool(struct sdt_pool *pool)
{
	__u64 elem_size, max_elems;
	void __arena *slab;
	void __arena *ptr;

	elem_size = pool->elem_size;
	max_elems = pool->max_elems;

	/* If the chunk is spent, get a new one. */
	if (pool->idx >= max_elems) {
		slab = bpf_arena_alloc_pages(&arena, NULL,
			div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0);
		if (!slab)
			return NULL;

		pool->slab = slab;
		pool->idx = 0;
	}

	ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx);
	pool->idx += 1;

	return ptr;
}

/* Alloc desc and associated chunk. Called with the allocator spinlock held. */
static sdt_desc_t *scx_alloc_chunk(void)
{
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	sdt_desc_t *out;

	chunk = scx_alloc_from_pool(&chunk_pool);
	if (!chunk)
		return NULL;

	desc = scx_alloc_from_pool(&desc_pool);
	if (!desc) {
		/*
		 * Effectively frees the previous chunk allocation.
		 * Index cannot be 0, so decrementing is always
		 * valid.
		 */
		chunk_pool.idx -= 1;
		return NULL;
	}

	out = desc;

	/* Fresh descriptor: all slots of its chunk are free. */
	desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
	desc->chunk = chunk;

	alloc_stats.chunk_allocs += 1;

	return out;
}

/*
 * Configure a pool to hand out nr_pages worth of data_size-byte elements.
 * data_size must be word (8-byte) aligned. Setting idx to max_elems makes
 * the first scx_alloc_from_pool() call populate the initial slab lazily.
 */
static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages)
{
	if (unlikely(data_size % 8))
		return -EINVAL;

	if (unlikely(nr_pages == 0))
		return -EINVAL;

	pool->elem_size = data_size;
	pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size;
	/* Populate the pool slab on the first allocation. */
	pool->idx = pool->max_elems;

	return 0;
}

/* Initialize both the base pool allocators and the root chunk of the index. */
__hidden int
scx_alloc_init(struct scx_allocator *alloc, __u64 data_size)
{
	size_t min_chunk_size;
	int ret;

	_Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE,
		"chunk size must fit into a page");

	ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1);
	if (ret != 0)
		return ret;

	ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1);
	if (ret != 0)
		return ret;

	/* Wrap data into a descriptor and word align. */
	data_size += sizeof(struct sdt_data);
	data_size = round_up(data_size, 8);

	/*
	 * Ensure we allocate large enough chunks from the arena to avoid excessive
	 * internal fragmentation when carving chunks into structs.
	 */
	min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE);
	ret = pool_set_size(&alloc->pool, data_size, min_chunk_size);
	if (ret != 0)
		return ret;

	bpf_spin_lock(&alloc_lock);
	alloc->root = scx_alloc_chunk();
	bpf_spin_unlock(&alloc_lock);
	if (!alloc->root)
		return -ENOMEM;

	return 0;
}

/*
 * Set (state == true) or clear (state == false) the allocation bit for
 * slot pos in the descriptor's occupancy bitmap. Returns -EINVAL if pos
 * is out of range for a chunk.
 */
static
int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state)
{
	__u64 __arena *allocated = desc->allocated;
	__u64 bit;

	if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK))
		return -EINVAL;

	bit = (__u64)1 << (pos % 64);

	if (state)
		allocated[pos / 64] |= bit;
	else
		allocated[pos / 64] &= ~bit;

	return 0;
}

/*
 * Walk the recorded descriptor path leaf-to-root, clearing the freed
 * slot's bit and bumping nr_free at each level. Propagation stops at
 * the first ancestor that already had free slots, since its own bit in
 * its parent was already clear.
 */
static __noinline
int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS])
{
	sdt_desc_t *desc;
	__u64 u, level;
	int ret;

	/* Counter seeded from `zero` to stay imprecise for the verifier. */
	for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) {
		level = SDT_TASK_LEVELS - 1 - u;

		/* Only propagate upwards if we are the parent's only free chunk. */
		desc = lv_desc[level];

		ret = set_idx_state(desc, lv_pos[level], false);
		if (unlikely(ret != 0))
			return ret;

		desc->nr_free += 1;
		if (desc->nr_free > 1)
			return 0;
	}

	return 0;
}

/*
 * Free the allocated struct with the given index. Called with the
 * allocator lock taken.
 */
__hidden
int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
{
	const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1;
	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
	sdt_desc_t * __arena *desc_children;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	struct sdt_data __arena *data;
	__u64 level, shift, pos;
	__u64 lv_pos[SDT_TASK_LEVELS];
	int ret;
	int i;

	if (!alloc)
		return 0;

	desc = alloc->root;
	if (unlikely(!desc))
		return -EINVAL;

	/* To appease the verifier. */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		lv_desc[level] = NULL;
		lv_pos[level] = 0;
	}

	/*
	 * Find the leaf node containing the index, recording the
	 * descriptor/position at every level for mark_nodes_avail().
	 */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT;
		pos = (idx >> shift) & mask;

		lv_desc[level] = desc;
		lv_pos[level] = pos;

		if (level == SDT_TASK_LEVELS - 1)
			break;

		chunk = desc->chunk;

		desc_children = (sdt_desc_t * __arena *)chunk->descs;
		desc = desc_children[pos];

		if (unlikely(!desc))
			return -EINVAL;
	}

	chunk = desc->chunk;

	pos = idx & mask;
	data = chunk->data[pos];
	if (likely(data)) {
		/*
		 * Bump the generation counter so stale handles to this
		 * slot can be detected, and clear everything else.
		 */
		*data = (struct sdt_data) {
			.tid.genn = data->tid.genn + 1,
		};

		/* Zero out one word at a time. */
		for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) {
			data->payload[i] = 0;
		}
	}

	ret = mark_nodes_avail(lv_desc, lv_pos);
	if (unlikely(ret != 0))
		return ret;

	alloc_stats.active_allocs -= 1;
	alloc_stats.free_ops += 1;

	return 0;
}

/*
 * Index of the lowest set bit of a nonzero word, via binary search over
 * halves. Note: 0-based (ffs(1) == 0), unlike the POSIX ffs(); a zero
 * input yields 63. Callers pass a word known to have a set bit.
 */
static inline
int ffs(__u64 word)
{
	unsigned int num = 0;

	if ((word & 0xffffffff) == 0) {
		num += 32;
		word >>= 32;
	}

	if ((word & 0xffff) == 0) {
		num += 16;
		word >>= 16;
	}

	if ((word & 0xff) == 0) {
		num += 8;
		word >>= 8;
	}

	if ((word & 0xf) == 0) {
		num += 4;
		word >>= 4;
	}

	if ((word & 0x3) == 0) {
		num += 2;
		word >>= 2;
	}

	if ((word & 0x1) == 0) {
		num += 1;
		word >>= 1;
	}

	return num;
}


/*
 * Find the first empty slot in the chunk's occupancy bitmap.
 * Returns SDT_TASK_ENTS_PER_CHUNK when the chunk is full.
 */
__hidden
__u64 chunk_find_empty(sdt_desc_t __arg_arena *desc)
{
	__u64 freeslots;
	__u64 i;

	for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
		freeslots = ~desc->allocated[i];
		if (freeslots == (__u64)0)
			continue;

		return (i * 64) + ffs(freeslots);
	}

	return SDT_TASK_ENTS_PER_CHUNK;
}

/*
 * Find and return an available idx on the allocator.
 * Called with the task spinlock held.
 */
static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp)
{
	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
	sdt_desc_t * __arena *desc_children;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *tmp;
	__u64 lv_pos[SDT_TASK_LEVELS];
	__u64 u, pos, level;
	__u64 idx = 0;
	int ret;

	/* Descend from the root, building idx one level at a time. */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		pos = chunk_find_empty(desc);

		/* If we error out, something has gone very wrong. */
		if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK))
			return NULL;

		if (pos == SDT_TASK_ENTS_PER_CHUNK)
			return NULL;

		idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT;
		idx |= pos;

		/* Log the levels to complete allocation. */
		lv_desc[level] = desc;
		lv_pos[level] = pos;

		/* The rest of the loop is for internal node traversal. */
		if (level == SDT_TASK_LEVELS - 1)
			break;

		/* Allocate an internal node if necessary. */
		chunk = desc->chunk;
		desc_children = (sdt_desc_t * __arena *)chunk->descs;

		desc = desc_children[pos];
		if (!desc) {
			desc = scx_alloc_chunk();
			if (!desc)
				return NULL;

			desc_children[pos] = desc;
		}
	}

	/*
	 * Finding the descriptor along with any internal node
	 * allocations was successful. Update all levels with
	 * the new allocation.
	 */
	bpf_for(u, 0, SDT_TASK_LEVELS) {
		level = SDT_TASK_LEVELS - 1 - u;
		tmp = lv_desc[level];

		ret = set_idx_state(tmp, lv_pos[level], true);
		if (ret != 0)
			break;

		/* Stop propagating once an ancestor still has free slots. */
		tmp->nr_free -= 1;
		if (tmp->nr_free > 0)
			break;
	}

	*idxp = idx;

	return desc;
}

/*
 * Allocate a data element from the allocator, returning a pointer to its
 * arena-resident sdt_data wrapper (with the index stamped into tid.idx),
 * or NULL when the index or backing pool is exhausted.
 */
__hidden
void __arena *scx_alloc(struct scx_allocator *alloc)
{
	struct sdt_data __arena *data = NULL;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	__u64 idx, pos;

	if (!alloc)
		return NULL;

	bpf_spin_lock(&alloc_lock);

	/* We unlock if we encounter an error in the function. */
	desc = desc_find_empty(alloc->root, &idx);
	if (unlikely(desc == NULL)) {
		bpf_spin_unlock(&alloc_lock);
		return NULL;
	}

	chunk = desc->chunk;

	/* Populate the leaf node if necessary. */
	pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
	data = chunk->data[pos];
	if (!data) {
		data = scx_alloc_from_pool(&alloc->pool);
		if (!data) {
			/* Roll back the index bits desc_find_empty() set. */
			scx_alloc_free_idx(alloc, idx);
			bpf_spin_unlock(&alloc_lock);
			return NULL;
		}
	}

	chunk->data[pos] = data;

	/* The data counts as a chunk */
	alloc_stats.data_allocs += 1;
	alloc_stats.alloc_ops += 1;
	alloc_stats.active_allocs += 1;

	data->tid.idx = idx;

	bpf_spin_unlock(&alloc_lock);

	return data;
}

/*
 * Task BPF map entry recording the task's assigned ID and pointing to the data
 * area allocated in arena.
 */
struct scx_task_map_val {
	union sdt_id		tid;
	__u64			tptr;
	struct sdt_data __arena	*data;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct scx_task_map_val);
} scx_task_map SEC(".maps");

static struct scx_allocator scx_task_allocator;

/*
 * Allocate a per-task data area and record it in the task's local storage.
 * Returns a pointer to the payload area, or NULL on allocation failure.
 */
__hidden
void __arena *scx_task_alloc(struct task_struct *p)
{
	struct sdt_data __arena *data = NULL;
	struct scx_task_map_val *mval;

	mval = bpf_task_storage_get(&scx_task_map, p, 0,
				    BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!mval)
		return NULL;

	data = scx_alloc(&scx_task_allocator);
	if (unlikely(!data))
		return NULL;

	mval->tid = data->tid;
	mval->tptr = (__u64) p;
	mval->data = data;

	return (void __arena *)data->payload;
}

/* Initialize the task allocator for data_size-byte per-task payloads. */
__hidden
int scx_task_init(__u64 data_size)
{
	return scx_alloc_init(&scx_task_allocator, data_size);
}

/*
 * Look up the task's payload pointer from task-local storage.
 * Returns NULL if the task has no storage entry.
 */
__hidden
void __arena *scx_task_data(struct task_struct *p)
{
	struct sdt_data __arena *data;
	struct scx_task_map_val *mval;

	scx_arena_subprog_init();

	mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
	if (!mval)
		return NULL;

	data = mval->data;

	return (void __arena *)data->payload;
}

/* Release the task's allocator slot and delete its task-storage entry. */
__hidden
void scx_task_free(struct task_struct *p)
{
	struct scx_task_map_val *mval;

	scx_arena_subprog_init();

	mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
	if (!mval)
		return;

	bpf_spin_lock(&alloc_lock);
	scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx);
	bpf_spin_unlock(&alloc_lock);

	bpf_task_storage_delete(&scx_task_map, p);
}

/* Fold a task's arena-resident stats into the global counters. */
static inline void
scx_stat_global_update(struct scx_stats __arena *stats)
{
	cast_kern(stats);
	__sync_fetch_and_add(&stat_enqueue, stats->enqueue);
	__sync_fetch_and_add(&stat_init, stats->init);
	__sync_fetch_and_add(&stat_exit, stats->exit);
	__sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
	__sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
}

s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	struct scx_stats __arena *stats;
	bool is_idle = false;
	s32 cpu;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return 0;
	}

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle) {
		/* Idle CPU found: dispatch directly to its local DSQ. */
		stat_inc_select_idle_cpu(stats);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	} else {
		stat_inc_select_busy_cpu(stats);
	}

	return cpu;
}

void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct scx_stats __arena *stats;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}

	stat_inc_enqueue(stats);

	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}

void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
{
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
			     struct scx_init_task_args *args)
{
	struct scx_stats __arena *stats;

	stats = scx_task_alloc(p);
	if (!stats) {
		scx_bpf_error("arena allocator out of memory");
		return -ENOMEM;
	}

	stats->pid = p->pid;

	stat_inc_init(stats);

	return 0;
}

void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
		    struct scx_exit_task_args *args)
{
	struct scx_stats __arena *stats;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}

	/* Fold the task's stats into the globals before freeing them. */
	stat_inc_exit(stats);
	scx_stat_global_update(stats);

	scx_task_free(p);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
{
	int ret;

	ret = scx_task_init(sizeof(struct scx_stats));
	if (ret < 0) {
		scx_bpf_error("%s: failed with %d", __func__, ret);
		return ret;
	}

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret) {
		scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
		return ret;
	}

	return 0;
}

void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(sdt_ops,
	       .select_cpu		= (void *)sdt_select_cpu,
	       .enqueue			= (void *)sdt_enqueue,
	       .dispatch		= (void *)sdt_dispatch,
	       .init_task		= (void *)sdt_init_task,
	       .exit_task		= (void *)sdt_exit_task,
	       .init			= (void *)sdt_init,
	       .exit			= (void *)sdt_exit,
	       .name			= "sdt");