xref: /linux/tools/sched_ext/scx_sdt.bpf.c (revision c17ee635fd3a482b2ad2bf5e269755c2eae5f25e)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * Arena-based task data scheduler. This is a variation of scx_simple
4  * that uses a combined allocator and indexing structure to organize
5  * task data. Task context allocation is done when a task enters the
6  * scheduler, while freeing is done when it exits. Task contexts are
7  * retrieved from task-local storage, pointing to the allocated memory.
8  *
 * The main purpose of this scheduler is to demonstrate arena memory
10  * management.
11  *
12  * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates.
13  * Copyright (c) 2024-2025 Emil Tsalapatis <etsal@meta.com>
14  * Copyright (c) 2024-2025 Tejun Heo <tj@kernel.org>
15  *
16  */
17 #include <scx/common.bpf.h>
18 #include <scx/bpf_arena_common.bpf.h>
19 
20 #include "scx_sdt.h"
21 
22 char _license[] SEC("license") = "GPL";
23 
24 UEI_DEFINE(uei);
25 
26 struct {
27 	__uint(type, BPF_MAP_TYPE_ARENA);
28 	__uint(map_flags, BPF_F_MMAPABLE);
29 #if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
30 	__uint(max_entries, 1 << 16); /* number of pages */
31         __ulong(map_extra, (1ull << 32)); /* start of mmap() region */
32 #else
33 	__uint(max_entries, 1 << 20); /* number of pages */
34         __ulong(map_extra, (1ull << 44)); /* start of mmap() region */
35 #endif
36 } arena __weak SEC(".maps");
37 
38 #define SHARED_DSQ 0
39 
/*
 * DEFINE_SDT_STAT(metric) emits two things:
 *  - stat_inc_<metric>(): bumps the <metric> field of an arena-resident
 *    per-task scx_stats struct (cast_kern() converts the arena pointer
 *    for kernel-side dereferencing);
 *  - stat_<metric>: a global counter that userspace reads; per-task
 *    values are folded into it by scx_stat_global_update() at task exit.
 */
#define DEFINE_SDT_STAT(metric)				\
static inline void				\
stat_inc_##metric(struct scx_stats __arena *stats)	\
{							\
	cast_kern(stats);				\
	stats->metric += 1;				\
}							\
__u64 stat_##metric;					\

DEFINE_SDT_STAT(enqueue);
DEFINE_SDT_STAT(init);
DEFINE_SDT_STAT(exit);
DEFINE_SDT_STAT(select_idle_cpu);
DEFINE_SDT_STAT(select_busy_cpu);
54 
55 /*
56  * Necessary for cond_break/can_loop's semantics. According to kernel commit
57  * 011832b, the loop counter variable must be seen as imprecise and bounded
58  * by the verifier. Initializing it from a constant (e.g., i = 0;), then,
59  * makes it precise and prevents may_goto from helping with converging the
60  * loop. For these loops we must initialize the loop counter from a variable
61  * whose value the verifier cannot reason about when checking the program, so
62  * that the loop counter's value is imprecise.
63  */
static __u64 zero = 0; /* opaque zero: loop counters seeded from it stay imprecise */
65 
66 /*
67  * XXX Hack to get the verifier to find the arena for sdt_exit_task.
68  * As of 6.12-rc5, The verifier associates arenas with programs by
69  * checking LD.IMM instruction operands for an arena and populating
70  * the program state with the first instance it finds. This requires
71  * accessing our global arena variable, but scx methods do not necessarily
72  * do so while still using pointers from that arena. Insert a bpf_printk
73  * statement that triggers at most once to generate an LD.IMM instruction
74  * to access the arena and help the verifier.
75  */
76 static volatile bool scx_arena_verify_once;
77 
78 __hidden void scx_arena_subprog_init(void)
79 {
80 	if (scx_arena_verify_once)
81 		return;
82 
83 	bpf_printk("%s: arena pointer %p", __func__, &arena);
84 	scx_arena_verify_once = true;
85 }
86 
87 
/* Serializes index allocation/free and updates to alloc_stats. */
private(LOCK) struct bpf_spin_lock alloc_lock;
/*
 * NOTE(review): alloc_pool_lock is declared but never taken in this file —
 * confirm whether the pools are meant to be protected by alloc_lock instead.
 */
private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock;

/* allocation pools backing descriptors and chunks */
struct sdt_pool desc_pool;
struct sdt_pool chunk_pool;

/* Protected by alloc_lock. */
struct scx_alloc_stats alloc_stats;
97 
98 
/*
 * Allocate an element from the pool, replenishing the backing slab from the
 * arena when the current one is exhausted. Must be called with the pool's
 * lock held.
 *
 * Returns an arena pointer to the element, or NULL if the arena page
 * allocation fails.
 */
static
void __arena *scx_alloc_from_pool(struct sdt_pool *pool)
{
	__u64 elem_size, max_elems;
	void __arena *slab;
	void __arena *ptr;

	elem_size = pool->elem_size;
	max_elems = pool->max_elems;

	/* If the current slab is spent, get a new one. */
	if (pool->idx >= max_elems) {
		slab = bpf_arena_alloc_pages(&arena, NULL,
			div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0);
		if (!slab)
			return NULL;

		pool->slab = slab;
		pool->idx = 0;
	}

	/* Carve the next element out of the slab by index. */
	ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx);
	pool->idx += 1;

	return ptr;
}
126 
/*
 * Allocate a descriptor and its associated chunk. Called with the allocator
 * spinlock held. Returns NULL on failure; a chunk allocated before a failed
 * descriptor allocation is returned to its pool by rewinding the pool index.
 */
static sdt_desc_t *scx_alloc_chunk(void)
{
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	sdt_desc_t *out;

	chunk = scx_alloc_from_pool(&chunk_pool);
	if (!chunk)
		return NULL;

	desc = scx_alloc_from_pool(&desc_pool);
	if (!desc) {
		/*
		 * Effectively frees the previous chunk allocation.
		 * Index cannot be 0, so decrementing is always
		 * valid.
		 */
		chunk_pool.idx -= 1;
		return NULL;
	}

	out = desc;

	/* A fresh chunk starts with all entries free. */
	desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
	desc->chunk = chunk;

	alloc_stats.chunk_allocs += 1;

	return out;
}
158 
159 static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages)
160 {
161 	if (unlikely(data_size % 8))
162 		return -EINVAL;
163 
164 	if (unlikely(nr_pages == 0))
165 		return -EINVAL;
166 
167 	pool->elem_size = data_size;
168 	pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size;
169 	/* Populate the pool slab on the first allocation. */
170 	pool->idx = pool->max_elems;
171 
172 	return 0;
173 }
174 
/*
 * Initialize both the base pool allocators and the root chunk of the index.
 * Returns 0 on success, -EINVAL on bad pool geometry or -ENOMEM if the root
 * chunk cannot be allocated.
 */
__hidden int
scx_alloc_init(struct scx_allocator *alloc, __u64 data_size)
{
	size_t min_chunk_size;
	int ret;

	_Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE,
		"chunk size must fit into a page");

	ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1);
	if (ret != 0)
		return ret;

	ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1);
	if (ret != 0)
		return ret;

	/* Wrap data into a descriptor and word align. */
	data_size += sizeof(struct sdt_data);
	data_size = round_up(data_size, 8);

	/*
	 * Ensure we allocate large enough chunks from the arena to avoid
	 * excessive internal fragmentation when carving the chunks up into
	 * structs.
	 */
	min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE);
	ret = pool_set_size(&alloc->pool, data_size, min_chunk_size);
	if (ret != 0)
		return ret;

	/* scx_alloc_chunk() must run under the allocator lock. */
	bpf_spin_lock(&alloc_lock);
	alloc->root = scx_alloc_chunk();
	bpf_spin_unlock(&alloc_lock);
	if (!alloc->root)
		return -ENOMEM;

	return 0;
}
214 
215 static
216 int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state)
217 {
218 	__u64 __arena *allocated = desc->allocated;
219 	__u64 bit;
220 
221 	if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK))
222 		return -EINVAL;
223 
224 	bit = (__u64)1 << (pos % 64);
225 
226 	if (state)
227 		allocated[pos / 64] |= bit;
228 	else
229 		allocated[pos / 64] &= ~bit;
230 
231 	return 0;
232 }
233 
/*
 * Walk from the leaf level back up toward the root, clearing the recorded
 * allocation bit at each level. Propagation stops at the first ancestor
 * that already had free entries (nr_free > 1 after the increment), since
 * levels above it already advertise free space.
 *
 * Returns 0 on success, -EINVAL if a recorded position is out of range.
 */
static __noinline
int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS])
{
	sdt_desc_t *desc;
	__u64 u, level;
	int ret;

	/* Counter seeded from `zero` to stay imprecise for the verifier. */
	for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) {
		level = SDT_TASK_LEVELS - 1 - u;

		/* Only propagate upwards if we are the parent's only free chunk. */
		desc = lv_desc[level];

		ret = set_idx_state(desc, lv_pos[level], false);
		if (unlikely(ret != 0))
			return ret;

		desc->nr_free += 1;
		if (desc->nr_free > 1)
			return 0;
	}

	return 0;
}
258 
/*
 * Free the allocated struct with the given index. Called with the
 * allocator lock taken.
 *
 * Walks the multi-level index from the root to the leaf holding @idx,
 * recording the descriptor and position at each level, then bumps the
 * generation number and zeroes the payload of the data element (if the
 * leaf slot was populated) and marks the slot free at every level via
 * mark_nodes_avail().
 */
__hidden
int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
{
	const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1;
	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
	sdt_desc_t * __arena *desc_children;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	struct sdt_data __arena *data;
	__u64 level, shift, pos;
	__u64 lv_pos[SDT_TASK_LEVELS];
	int ret;
	int i;

	if (!alloc)
		return 0;

	desc = alloc->root;
	if (unlikely(!desc))
		return -EINVAL;

	/* To appease the verifier. */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		lv_desc[level] = NULL;
		lv_pos[level] = 0;
	}

	/* Find the leaf node containing the index. */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		/* Each level consumes SDT_TASK_ENTS_PER_PAGE_SHIFT bits of idx. */
		shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT;
		pos = (idx >> shift) & mask;

		lv_desc[level] = desc;
		lv_pos[level] = pos;

		if (level == SDT_TASK_LEVELS - 1)
			break;

		chunk = desc->chunk;

		desc_children = (sdt_desc_t * __arena *)chunk->descs;
		desc = desc_children[pos];

		if (unlikely(!desc))
			return -EINVAL;
	}

	chunk = desc->chunk;

	pos = idx & mask;
	data = chunk->data[pos];
	if (likely(data)) {
		/*
		 * Bump the generation so stale references to this index can
		 * be detected, then clear the payload for the next user.
		 */
		*data = (struct sdt_data) {
			.tid.genn = data->tid.genn + 1,
		};

		/* Zero out one word at a time. */
		for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) {
			data->payload[i] = 0;
		}
	}

	ret = mark_nodes_avail(lv_desc, lv_pos);
	if (unlikely(ret != 0))
		return ret;

	alloc_stats.active_allocs -= 1;
	alloc_stats.free_ops += 1;

	return 0;
}
334 
335 static inline
336 int ffs(__u64 word)
337 {
338 	unsigned int num = 0;
339 
340 	if ((word & 0xffffffff) == 0) {
341 		num += 32;
342 		word >>= 32;
343 	}
344 
345 	if ((word & 0xffff) == 0) {
346 		num += 16;
347 		word >>= 16;
348 	}
349 
350 	if ((word & 0xff) == 0) {
351 		num += 8;
352 		word >>= 8;
353 	}
354 
355 	if ((word & 0xf) == 0) {
356 		num += 4;
357 		word >>= 4;
358 	}
359 
360 	if ((word & 0x3) == 0) {
361 		num += 2;
362 		word >>= 2;
363 	}
364 
365 	if ((word & 0x1) == 0) {
366 		num += 1;
367 		word >>= 1;
368 	}
369 
370 	return num;
371 }
372 
373 
/*
 * Find the first empty slot in the chunk's allocation bitmap and return its
 * bit index, or SDT_TASK_ENTS_PER_CHUNK if the chunk is completely full.
 */
__hidden
__u64 chunk_find_empty(sdt_desc_t __arg_arena *desc)
{
	__u64 freeslots;
	__u64 i;

	for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
		/* Invert so that free slots become set bits. */
		freeslots = ~desc->allocated[i];
		if (freeslots == (__u64)0)
			continue;

		return (i * 64) + ffs(freeslots);
	}

	return SDT_TASK_ENTS_PER_CHUNK;
}
391 
/*
 * Find and return an available idx on the allocator.
 * Called with the task spinlock held.
 *
 * Descends the multi-level index, picking the first free slot at each
 * level and accumulating the per-level positions into *idxp. Missing
 * internal nodes are allocated on the way down. On success, allocation
 * bits and free counts are updated bottom-up, stopping at the first level
 * that still has free entries.
 *
 * Returns the leaf descriptor, or NULL if a chunk was full or a node
 * allocation failed.
 */
static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp)
{
	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
	sdt_desc_t * __arena *desc_children;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *tmp;
	__u64 lv_pos[SDT_TASK_LEVELS];
	__u64 u, pos, level;
	__u64 idx = 0;
	int ret;

	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		pos = chunk_find_empty(desc);

		/* If we error out, something has gone very wrong. */
		if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK))
			return NULL;

		/* The chunk is full; no free index at this subtree. */
		if (pos == SDT_TASK_ENTS_PER_CHUNK)
			return NULL;

		idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT;
		idx |= pos;

		/* Log the levels to complete allocation. */
		lv_desc[level] = desc;
		lv_pos[level] = pos;

		/* The rest of the loop is for internal node traversal. */
		if (level == SDT_TASK_LEVELS - 1)
			break;

		/* Allocate an internal node if necessary. */
		chunk = desc->chunk;
		desc_children = (sdt_desc_t * __arena *)chunk->descs;

		desc = desc_children[pos];
		if (!desc) {
			desc = scx_alloc_chunk();
			if (!desc)
				return NULL;

			desc_children[pos] = desc;
		}
	}

	/*
	 * Finding the descriptor along with any internal node
	 * allocations was successful. Update all levels with
	 * the new allocation.
	 */
	bpf_for(u, 0, SDT_TASK_LEVELS) {
		level = SDT_TASK_LEVELS - 1 - u;
		tmp = lv_desc[level];

		ret = set_idx_state(tmp, lv_pos[level], true);
		if (ret != 0)
			break;

		tmp->nr_free -= 1;
		/* Ancestors stay marked free while they have entries left. */
		if (tmp->nr_free > 0)
			break;
	}

	*idxp = idx;

	return desc;
}
464 
/*
 * Allocate a data element from the allocator and return an arena pointer
 * to it, or NULL if no index is available or the arena is out of pages.
 * The element's tid.idx records its position so it can be freed later via
 * scx_alloc_free_idx().
 */
__hidden
void __arena *scx_alloc(struct scx_allocator *alloc)
{
	struct sdt_data __arena *data = NULL;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	__u64 idx, pos;

	if (!alloc)
		return NULL;

	bpf_spin_lock(&alloc_lock);

	/* We unlock if we encounter an error in the function. */
	desc = desc_find_empty(alloc->root, &idx);
	if (unlikely(desc == NULL)) {
		bpf_spin_unlock(&alloc_lock);
		return NULL;
	}

	chunk = desc->chunk;

	/* Populate the leaf node if necessary. */
	pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
	data = chunk->data[pos];
	if (!data) {
		data = scx_alloc_from_pool(&alloc->pool);
		if (!data) {
			/* Roll back the index reservation made above. */
			scx_alloc_free_idx(alloc, idx);
			bpf_spin_unlock(&alloc_lock);
			return NULL;
		}
	}

	chunk->data[pos] = data;

	/* The data counts as a chunk */
	alloc_stats.data_allocs += 1;
	alloc_stats.alloc_ops += 1;
	alloc_stats.active_allocs += 1;

	data->tid.idx = idx;

	bpf_spin_unlock(&alloc_lock);

	return data;
}
512 
/*
 * Task BPF map entry recording the task's assigned ID and pointing to the data
 * area allocated in arena.
 */
struct scx_task_map_val {
	union sdt_id		tid;	/* allocator index + generation */
	__u64			tptr;	/* task_struct pointer stored as an integer */
	struct sdt_data __arena	*data;	/* arena-resident per-task data */
};

/* Task-local storage mapping each task to its scx_task_map_val. */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct scx_task_map_val);
} scx_task_map SEC(".maps");

/* Arena allocator backing all per-task data. */
static struct scx_allocator scx_task_allocator;
531 
532 __hidden
533 void __arena *scx_task_alloc(struct task_struct *p)
534 {
535 	struct sdt_data __arena *data = NULL;
536 	struct scx_task_map_val *mval;
537 
538 	mval = bpf_task_storage_get(&scx_task_map, p, 0,
539 				    BPF_LOCAL_STORAGE_GET_F_CREATE);
540 	if (!mval)
541 		return NULL;
542 
543 	data = scx_alloc(&scx_task_allocator);
544 	if (unlikely(!data))
545 		return NULL;
546 
547 	mval->tid = data->tid;
548 	mval->tptr = (__u64) p;
549 	mval->data = data;
550 
551 	return (void __arena *)data->payload;
552 }
553 
554 __hidden
555 int scx_task_init(__u64 data_size)
556 {
557 	return scx_alloc_init(&scx_task_allocator, data_size);
558 }
559 
560 __hidden
561 void __arena *scx_task_data(struct task_struct *p)
562 {
563 	struct sdt_data __arena *data;
564 	struct scx_task_map_val *mval;
565 
566 	scx_arena_subprog_init();
567 
568 	mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
569 	if (!mval)
570 		return NULL;
571 
572 	data = mval->data;
573 
574 	return (void __arena *)data->payload;
575 }
576 
/*
 * Release @p's arena allocation back to the allocator and drop the task's
 * local storage entry. No-op if the task has no storage entry.
 */
__hidden
void scx_task_free(struct task_struct *p)
{
	struct scx_task_map_val *mval;

	scx_arena_subprog_init();

	mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
	if (!mval)
		return;

	/* scx_alloc_free_idx() expects the allocator lock to be held. */
	bpf_spin_lock(&alloc_lock);
	scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx);
	bpf_spin_unlock(&alloc_lock);

	bpf_task_storage_delete(&scx_task_map, p);
}
594 
/*
 * Fold a task's arena-resident counters into the global stat_* counters
 * that userspace reads. Atomic adds since tasks exit concurrently.
 */
static inline void
scx_stat_global_update(struct scx_stats __arena *stats)
{
	cast_kern(stats);
	__sync_fetch_and_add(&stat_enqueue, stats->enqueue);
	__sync_fetch_and_add(&stat_init, stats->init);
	__sync_fetch_and_add(&stat_exit, stats->exit);
	__sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
	__sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
}
605 
/*
 * Pick a target CPU via the default idle-CPU selection helper. When an
 * idle CPU is found, insert the task directly into that CPU's local DSQ.
 */
s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	struct scx_stats __arena *stats;
	bool is_idle = false;
	s32 cpu;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return 0;
	}

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle) {
		stat_inc_select_idle_cpu(stats);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	} else {
		stat_inc_select_busy_cpu(stats);
	}

	return cpu;
}
628 
/* Count the enqueue and place the task on the shared DSQ. */
void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct scx_stats __arena *stats;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}

	stat_inc_enqueue(stats);

	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}
643 
/* Move a task from the shared DSQ to the dispatching CPU's local DSQ. */
void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
{
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}
648 
/*
 * Allocate the task's arena-resident stats area when it enters the
 * scheduler and record its pid.
 */
s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
			     struct scx_init_task_args *args)
{
	struct scx_stats __arena *stats;

	stats = scx_task_alloc(p);
	if (!stats) {
		scx_bpf_error("arena allocator out of memory");
		return -ENOMEM;
	}

	stats->pid = p->pid;

	stat_inc_init(stats);

	return 0;
}
666 
/*
 * Flush the task's counters into the global stats and free its arena
 * allocation as it leaves the scheduler.
 */
void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
			      struct scx_exit_task_args *args)
{
	struct scx_stats __arena *stats;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}

	stat_inc_exit(stats);
	scx_stat_global_update(stats);

	scx_task_free(p);
}
683 
/* Set up the per-task allocator and create the shared DSQ. */
s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
{
	int ret;

	ret = scx_task_init(sizeof(struct scx_stats));
	if (ret < 0) {
		scx_bpf_error("%s: failed with %d", __func__, ret);
		return ret;
	}

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret) {
		scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
		return ret;
	}

	return 0;
}
702 
/* Record exit info so userspace can report why the scheduler stopped. */
void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
707 
/* Scheduler operation table registered with sched_ext. */
SCX_OPS_DEFINE(sdt_ops,
	       .select_cpu		= (void *)sdt_select_cpu,
	       .enqueue			= (void *)sdt_enqueue,
	       .dispatch		= (void *)sdt_dispatch,
	       .init_task		= (void *)sdt_init_task,
	       .exit_task		= (void *)sdt_exit_task,
	       .init			= (void *)sdt_init,
	       .exit			= (void *)sdt_exit,
	       .name			= "sdt");
717