1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
4 *
5 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
6 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
7 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
8 */
9 #include <linux/btf_ids.h>
10 #include "ext_idle.h"
11
12 /*
13 * NOTE: sched_ext is in the process of growing multiple scheduler support and
14 * scx_root usage is in a transitional state. Naked dereferences are safe if the
15 * caller is one of the tasks attached to SCX and explicit RCU dereference is
16 * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but
17 * are used as temporary markers to indicate that the dereferences need to be
18 * updated to point to the associated scheduler instances rather than scx_root.
19 */
20 static struct scx_sched __rcu *scx_root;
21
22 /*
23 * During exit, a task may schedule after losing its PIDs. When disabling the
24 * BPF scheduler, we need to be able to iterate tasks in every state to
25 * guarantee system safety. Maintain a dedicated task list which contains every
26 * task between its fork and eventual free.
27 */
28 static DEFINE_RAW_SPINLOCK(scx_tasks_lock);
29 static LIST_HEAD(scx_tasks);
30
31 /* ops enable/disable */
32 static DEFINE_MUTEX(scx_enable_mutex);
33 DEFINE_STATIC_KEY_FALSE(__scx_enabled);
34 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
35 static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
36 static int scx_bypass_depth;
37 static cpumask_var_t scx_bypass_lb_donee_cpumask;
38 static cpumask_var_t scx_bypass_lb_resched_cpumask;
39 static bool scx_aborting;
40 static bool scx_init_task_enabled;
41 static bool scx_switching_all;
42 DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
43
44 /*
45 * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass
46 * depth on enable failure. Will be removed when bypass depth is moved into the
47 * sched instance.
48 */
49 static bool scx_bypassed_for_enable;
50
51 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
52 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
53
/*
 * A monotonically increasing sequence number that is incremented every time a
 * scheduler is enabled. This can be used to check whether any custom sched_ext
 * scheduler has ever been used in the system.
 */
59 static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
60
61 /*
62 * The maximum amount of time in jiffies that a task may be runnable without
63 * being scheduled on a CPU. If this timeout is exceeded, it will trigger
64 * scx_error().
65 */
66 static unsigned long scx_watchdog_timeout;
67
68 /*
69 * The last time the delayed work was run. This delayed work relies on
70 * ksoftirqd being able to run to service timer interrupts, so it's possible
71 * that this work itself could get wedged. To account for this, we check that
72 * it's not stalled in the timer tick, and trigger an error if it is.
73 */
74 static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
75
76 static struct delayed_work scx_watchdog_work;
77
/*
 * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence
 * numbers. The arrays are allocated with kvzalloc() as their size can exceed
 * percpu allocator limits on large machines. The total allocation is
 * O(nr_cpu_ids^2); it is performed lazily when enabling and freed when
 * disabling to avoid waste while sched_ext isn't active.
 */
85 struct scx_kick_syncs {
86 struct rcu_head rcu;
87 unsigned long syncs[];
88 };
89
90 static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);
91
92 /*
93 * Direct dispatch marker.
94 *
95 * Non-NULL values are used for direct dispatch from enqueue path. A valid
96 * pointer points to the task currently being enqueued. An ERR_PTR value is used
97 * to indicate that direct dispatch has already happened.
98 */
99 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
100
101 static const struct rhashtable_params dsq_hash_params = {
102 .key_len = sizeof_field(struct scx_dispatch_q, id),
103 .key_offset = offsetof(struct scx_dispatch_q, id),
104 .head_offset = offsetof(struct scx_dispatch_q, hash_node),
105 };
106
107 static LLIST_HEAD(dsqs_to_free);
108
109 /* dispatch buf */
110 struct scx_dsp_buf_ent {
111 struct task_struct *task;
112 unsigned long qseq;
113 u64 dsq_id;
114 u64 enq_flags;
115 };
116
117 static u32 scx_dsp_max_batch;
118
119 struct scx_dsp_ctx {
120 struct rq *rq;
121 u32 cursor;
122 u32 nr_tasks;
123 struct scx_dsp_buf_ent buf[];
124 };
125
126 static struct scx_dsp_ctx __percpu *scx_dsp_ctx;
127
128 /* string formatting from BPF */
129 struct scx_bstr_buf {
130 u64 data[MAX_BPRINTF_VARARGS];
131 char line[SCX_EXIT_MSG_LEN];
132 };
133
134 static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock);
135 static struct scx_bstr_buf scx_exit_bstr_buf;
136
137 /* ops debug dump */
138 struct scx_dump_data {
139 s32 cpu;
140 bool first;
141 s32 cursor;
142 struct seq_buf *s;
143 const char *prefix;
144 struct scx_bstr_buf buf;
145 };
146
147 static struct scx_dump_data scx_dump_data = {
148 .cpu = -1,
149 };
150
151 /* /sys/kernel/sched_ext interface */
152 static struct kset *scx_kset;
153
154 /*
155 * Parameters that can be adjusted through /sys/module/sched_ext/parameters.
156 * There usually is no reason to modify these as normal scheduler operation
157 * shouldn't be affected by them. The knobs are primarily for debugging.
158 */
159 static u64 scx_slice_dfl = SCX_SLICE_DFL;
160 static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
161 static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US;
162
static int set_slice_us(const char *val, const struct kernel_param *kp)
164 {
165 return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC);
166 }
167
168 static const struct kernel_param_ops slice_us_param_ops = {
169 .set = set_slice_us,
170 .get = param_get_uint,
171 };
172
static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp)
174 {
175 return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC);
176 }
177
178 static const struct kernel_param_ops bypass_lb_intv_us_param_ops = {
179 .set = set_bypass_lb_intv_us,
180 .get = param_get_uint,
181 };
182
183 #undef MODULE_PARAM_PREFIX
184 #define MODULE_PARAM_PREFIX "sched_ext."
185
186 module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600);
187 MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)");
188 module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600);
189 MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)");
190
191 #undef MODULE_PARAM_PREFIX
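
/*
 * Usage note (illustrative): with the "sched_ext." prefix above, these knobs
 * appear under /sys/module/sched_ext/parameters/. For example, the bypass
 * slice can be shortened to 2ms with:
 *
 *	# echo 2000 > /sys/module/sched_ext/parameters/slice_bypass_us
 *
 * or on the kernel command line as sched_ext.slice_bypass_us=2000. Per the
 * parameter description, the new value is applied the next time a scheduler
 * is loaded or unloaded, and out-of-range values are rejected by the setters
 * above.
 */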
192
193 #define CREATE_TRACE_POINTS
194 #include <trace/events/sched_ext.h>
195
196 static void process_ddsp_deferred_locals(struct rq *rq);
197 static bool task_dead_and_done(struct task_struct *p);
198 static u32 reenq_local(struct rq *rq);
199 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
200 static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
201 s64 exit_code, const char *fmt, va_list args);
202
static __printf(4, 5) bool scx_exit(struct scx_sched *sch,
				    enum scx_exit_kind kind, s64 exit_code,
				    const char *fmt, ...)
206 {
207 va_list args;
208 bool ret;
209
210 va_start(args, fmt);
211 ret = scx_vexit(sch, kind, exit_code, fmt, args);
212 va_end(args);
213
214 return ret;
215 }
216
217 #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
218 #define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args)
219
220 #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
221
static long jiffies_delta_msecs(unsigned long at, unsigned long now)
223 {
224 if (time_after(at, now))
225 return jiffies_to_msecs(at - now);
226 else
227 return -(long)jiffies_to_msecs(now - at);
228 }
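
/*
 * For illustration: jiffies_delta_msecs(now + HZ, now) returns 1000 (@at lies
 * one second in the future) while jiffies_delta_msecs(now - 2 * HZ, now)
 * returns -2000 (@at is two seconds in the past), independent of HZ.
 */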
229
230 /* if the highest set bit is N, return a mask with bits [N+1, 31] set */
static u32 higher_bits(u32 flags)
232 {
233 return ~((1 << fls(flags)) - 1);
234 }
235
236 /* return the mask with only the highest bit set */
static u32 highest_bit(u32 flags)
238 {
239 int bit = fls(flags);
240 return ((u64)1 << bit) >> 1;
241 }
242
static bool u32_before(u32 a, u32 b)
244 {
245 return (s32)(a - b) < 0;
246 }
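
/*
 * Worked examples for the helpers above (illustrative): with flags = 0x14,
 * fls() reports bit 4 as the highest set bit, so highest_bit(0x14) == 0x10 and
 * higher_bits(0x14) == 0xffffffe0 (bits [5, 31]). u32_before() compares
 * sequence numbers in wrap-around fashion, e.g. u32_before(0xfffffffe, 1) is
 * true.
 */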
247
static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
					      struct task_struct *p)
250 {
251 return sch->global_dsqs[cpu_to_node(task_cpu(p))];
252 }
253
static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
255 {
256 return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
257 }
258
static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
260 {
261 if (p->sched_class == &stop_sched_class)
262 return &stop_sched_class;
263
264 return __setscheduler_class(p->policy, p->prio);
265 }
266
267 /*
268 * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
269 * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
270 * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check
271 * whether it's running from an allowed context.
272 *
273 * @mask is constant, always inline to cull the mask calculations.
274 */
static __always_inline void scx_kf_allow(u32 mask)
276 {
277 /* nesting is allowed only in increasing scx_kf_mask order */
278 WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
279 "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
280 current->scx.kf_mask, mask);
281 current->scx.kf_mask |= mask;
282 barrier();
283 }
284
static void scx_kf_disallow(u32 mask)
286 {
287 barrier();
288 current->scx.kf_mask &= ~mask;
289 }
290
291 /*
292 * Track the rq currently locked.
293 *
294 * This allows kfuncs to safely operate on rq from any scx ops callback,
295 * knowing which rq is already locked.
296 */
297 DEFINE_PER_CPU(struct rq *, scx_locked_rq_state);
298
static inline void update_locked_rq(struct rq *rq)
300 {
301 /*
302 * Check whether @rq is actually locked. This can help expose bugs
303 * or incorrect assumptions about the context in which a kfunc or
304 * callback is executed.
305 */
306 if (rq)
307 lockdep_assert_rq_held(rq);
308 __this_cpu_write(scx_locked_rq_state, rq);
309 }
310
311 #define SCX_CALL_OP(sch, mask, op, rq, args...) \
312 do { \
313 if (rq) \
314 update_locked_rq(rq); \
315 if (mask) { \
316 scx_kf_allow(mask); \
317 (sch)->ops.op(args); \
318 scx_kf_disallow(mask); \
319 } else { \
320 (sch)->ops.op(args); \
321 } \
322 if (rq) \
323 update_locked_rq(NULL); \
324 } while (0)
325
326 #define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \
327 ({ \
328 __typeof__((sch)->ops.op(args)) __ret; \
329 \
330 if (rq) \
331 update_locked_rq(rq); \
332 if (mask) { \
333 scx_kf_allow(mask); \
334 __ret = (sch)->ops.op(args); \
335 scx_kf_disallow(mask); \
336 } else { \
337 __ret = (sch)->ops.op(args); \
338 } \
339 if (rq) \
340 update_locked_rq(NULL); \
341 __ret; \
342 })
343
/*
 * Some kfuncs are allowed only on the tasks that are the subjects of the
 * in-progress scx_ops operation, e.g., for locking guarantees. To enforce such
 * restrictions, the following SCX_CALL_OP_*() variants should be used when
 * invoking scx_ops operations that take task arguments. These can only be used
 * for non-nesting operations due to the way the tasks are tracked.
 *
 * kfuncs which can only operate on such tasks can in turn use
 * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
 * the specific task.
 */
355 #define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \
356 do { \
357 BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
358 current->scx.kf_tasks[0] = task; \
359 SCX_CALL_OP((sch), mask, op, rq, task, ##args); \
360 current->scx.kf_tasks[0] = NULL; \
361 } while (0)
362
363 #define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) \
364 ({ \
365 __typeof__((sch)->ops.op(task, ##args)) __ret; \
366 BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
367 current->scx.kf_tasks[0] = task; \
368 __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \
369 current->scx.kf_tasks[0] = NULL; \
370 __ret; \
371 })
372
373 #define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \
374 ({ \
375 __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \
376 BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
377 current->scx.kf_tasks[0] = task0; \
378 current->scx.kf_tasks[1] = task1; \
379 __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \
380 current->scx.kf_tasks[0] = NULL; \
381 current->scx.kf_tasks[1] = NULL; \
382 __ret; \
383 })
384
385 /* @mask is constant, always inline to cull unnecessary branches */
static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask)
387 {
388 if (unlikely(!(current->scx.kf_mask & mask))) {
389 scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x",
390 mask, current->scx.kf_mask);
391 return false;
392 }
393
394 /*
395 * Enforce nesting boundaries. e.g. A kfunc which can be called from
396 * DISPATCH must not be called if we're running DEQUEUE which is nested
397 * inside ops.dispatch(). We don't need to check boundaries for any
398 * blocking kfuncs as the verifier ensures they're only called from
399 * sleepable progs.
400 */
401 if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
402 (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
403 scx_error(sch, "cpu_release kfunc called from a nested operation");
404 return false;
405 }
406
407 if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
408 (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
409 scx_error(sch, "dispatch kfunc called from a nested operation");
410 return false;
411 }
412
413 return true;
414 }
415
416 /* see SCX_CALL_OP_TASK() */
static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
							u32 mask,
							struct task_struct *p)
420 {
421 if (!scx_kf_allowed(sch, mask))
422 return false;
423
424 if (unlikely((p != current->scx.kf_tasks[0] &&
425 p != current->scx.kf_tasks[1]))) {
426 scx_error(sch, "called on a task not being operated on");
427 return false;
428 }
429
430 return true;
431 }
432
433 /**
434 * nldsq_next_task - Iterate to the next task in a non-local DSQ
435 * @dsq: user dsq being iterated
436 * @cur: current position, %NULL to start iteration
437 * @rev: walk backwards
438 *
439 * Returns %NULL when iteration is finished.
440 */
static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
					   struct task_struct *cur, bool rev)
443 {
444 struct list_head *list_node;
445 struct scx_dsq_list_node *dsq_lnode;
446
447 lockdep_assert_held(&dsq->lock);
448
449 if (cur)
450 list_node = &cur->scx.dsq_list.node;
451 else
452 list_node = &dsq->list;
453
454 /* find the next task, need to skip BPF iteration cursors */
455 do {
456 if (rev)
457 list_node = list_node->prev;
458 else
459 list_node = list_node->next;
460
461 if (list_node == &dsq->list)
462 return NULL;
463
464 dsq_lnode = container_of(list_node, struct scx_dsq_list_node,
465 node);
466 } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR);
467
468 return container_of(dsq_lnode, struct task_struct, scx.dsq_list);
469 }
470
471 #define nldsq_for_each_task(p, dsq) \
472 for ((p) = nldsq_next_task((dsq), NULL, false); (p); \
473 (p) = nldsq_next_task((dsq), (p), false))
474
475
476 /*
477 * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
478 * dispatch order. BPF-visible iterator is opaque and larger to allow future
479 * changes without breaking backward compatibility. Can be used with
480 * bpf_for_each(). See bpf_iter_scx_dsq_*().
481 */
482 enum scx_dsq_iter_flags {
483 /* iterate in the reverse dispatch order */
484 SCX_DSQ_ITER_REV = 1U << 16,
485
486 __SCX_DSQ_ITER_HAS_SLICE = 1U << 30,
487 __SCX_DSQ_ITER_HAS_VTIME = 1U << 31,
488
489 __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV,
490 __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS |
491 __SCX_DSQ_ITER_HAS_SLICE |
492 __SCX_DSQ_ITER_HAS_VTIME,
493 };
494
495 struct bpf_iter_scx_dsq_kern {
496 struct scx_dsq_list_node cursor;
497 struct scx_dispatch_q *dsq;
498 u64 slice;
499 u64 vtime;
500 } __attribute__((aligned(8)));
501
502 struct bpf_iter_scx_dsq {
503 u64 __opaque[6];
504 } __attribute__((aligned(8)));
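
/*
 * For illustration, a BPF scheduler would typically walk a user DSQ with the
 * bpf_for_each() wrapper from the scx common headers (BPF-side sketch, not
 * compiled here; MY_DSQ_ID and vtime_cutoff are made up):
 *
 *	struct task_struct *p;
 *
 *	bpf_for_each(scx_dsq, p, MY_DSQ_ID, 0) {
 *		if (p->scx.dsq_vtime >= vtime_cutoff)
 *			break;
 *	}
 *
 * Passing SCX_DSQ_ITER_REV as the last argument walks the DSQ in reverse
 * dispatch order.
 */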
505
506
507 /*
508 * SCX task iterator.
509 */
510 struct scx_task_iter {
511 struct sched_ext_entity cursor;
512 struct task_struct *locked_task;
513 struct rq *rq;
514 struct rq_flags rf;
515 u32 cnt;
516 bool list_locked;
517 };
518
519 /**
520 * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
521 * @iter: iterator to init
522 *
523 * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter
524 * must eventually be stopped with scx_task_iter_stop().
525 *
526 * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
527 * between this and the first next() call or between any two next() calls. If
528 * the locks are released between two next() calls, the caller is responsible
529 * for ensuring that the task being iterated remains accessible either through
530 * RCU read lock or obtaining a reference count.
531 *
532 * All tasks which existed when the iteration started are guaranteed to be
533 * visited as long as they are not dead.
534 */
static void scx_task_iter_start(struct scx_task_iter *iter)
536 {
537 memset(iter, 0, sizeof(*iter));
538
539 raw_spin_lock_irq(&scx_tasks_lock);
540
541 iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
542 list_add(&iter->cursor.tasks_node, &scx_tasks);
543 iter->list_locked = true;
544 }
545
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
547 {
548 if (iter->locked_task) {
549 __balance_callbacks(iter->rq, &iter->rf);
550 task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
551 iter->locked_task = NULL;
552 }
553 }
554
555 /**
556 * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
557 * @iter: iterator to unlock
558 *
559 * If @iter is in the middle of a locked iteration, it may be locking the rq of
560 * the task currently being visited in addition to scx_tasks_lock. Unlock both.
561 * This function can be safely called anytime during an iteration. The next
562 * iterator operation will automatically restore the necessary locking.
563 */
static void scx_task_iter_unlock(struct scx_task_iter *iter)
565 {
566 __scx_task_iter_rq_unlock(iter);
567 if (iter->list_locked) {
568 iter->list_locked = false;
569 raw_spin_unlock_irq(&scx_tasks_lock);
570 }
571 }
572
static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
574 {
575 if (!iter->list_locked) {
576 raw_spin_lock_irq(&scx_tasks_lock);
577 iter->list_locked = true;
578 }
579 }
580
581 /**
582 * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
583 * @iter: iterator to exit
584 *
585 * Exit a previously initialized @iter. Must be called with scx_tasks_lock held
586 * which is released on return. If the iterator holds a task's rq lock, that rq
587 * lock is also released. See scx_task_iter_start() for details.
588 */
static void scx_task_iter_stop(struct scx_task_iter *iter)
590 {
591 __scx_task_iter_maybe_relock(iter);
592 list_del_init(&iter->cursor.tasks_node);
593 scx_task_iter_unlock(iter);
594 }
595
596 /**
597 * scx_task_iter_next - Next task
598 * @iter: iterator to walk
599 *
600 * Visit the next task. See scx_task_iter_start() for details. Locks are dropped
601 * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls
602 * by holding scx_tasks_lock for too long.
603 */
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
605 {
606 struct list_head *cursor = &iter->cursor.tasks_node;
607 struct sched_ext_entity *pos;
608
609 if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
610 scx_task_iter_unlock(iter);
611 cond_resched();
612 }
613
614 __scx_task_iter_maybe_relock(iter);
615
616 list_for_each_entry(pos, cursor, tasks_node) {
617 if (&pos->tasks_node == &scx_tasks)
618 return NULL;
619 if (!(pos->flags & SCX_TASK_CURSOR)) {
620 list_move(cursor, &pos->tasks_node);
621 return container_of(pos, struct task_struct, scx);
622 }
623 }
624
625 /* can't happen, should always terminate at scx_tasks above */
626 BUG();
627 }
628
/**
 * scx_task_iter_next_locked - Next non-idle task with its rq locked
 * @iter: iterator to walk
 *
 * Visit the next non-idle task with its rq lock held. Idle tasks (init_tasks)
 * are skipped; see the comment in the function body. See scx_task_iter_start()
 * for details on the iteration itself.
 */
static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
638 {
639 struct task_struct *p;
640
641 __scx_task_iter_rq_unlock(iter);
642
643 while ((p = scx_task_iter_next(iter))) {
644 /*
645 * scx_task_iter is used to prepare and move tasks into SCX
646 * while loading the BPF scheduler and vice-versa while
647 * unloading. The init_tasks ("swappers") should be excluded
648 * from the iteration because:
649 *
 * - It's unsafe to use __setscheduler_prio() on an init_task to
651 * determine the sched_class to use as it won't preserve its
652 * idle_sched_class.
653 *
654 * - ops.init/exit_task() can easily be confused if called with
655 * init_tasks as they, e.g., share PID 0.
656 *
657 * As init_tasks are never scheduled through SCX, they can be
658 * skipped safely. Note that is_idle_task() which tests %PF_IDLE
659 * doesn't work here:
660 *
661 * - %PF_IDLE may not be set for an init_task whose CPU hasn't
662 * yet been onlined.
663 *
664 * - %PF_IDLE can be set on tasks that are not init_tasks. See
665 * play_idle_precise() used by CONFIG_IDLE_INJECT.
666 *
667 * Test for idle_sched_class as only init_tasks are on it.
668 */
669 if (p->sched_class != &idle_sched_class)
670 break;
671 }
672 if (!p)
673 return NULL;
674
675 iter->rq = task_rq_lock(p, &iter->rf);
676 iter->locked_task = p;
677
678 return p;
679 }
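
/*
 * For illustration, the enable/disable paths use the iterator roughly as
 * follows (sketch):
 *
 *	struct scx_task_iter sti;
 *	struct task_struct *p;
 *
 *	scx_task_iter_start(&sti);
 *	while ((p = scx_task_iter_next_locked(&sti))) {
 *		... operate on @p with its rq locked ...
 *	}
 *	scx_task_iter_stop(&sti);
 *
 * scx_task_iter_unlock() may be called inside the loop to drop the locks
 * around operations that may sleep.
 */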
680
681 /**
682 * scx_add_event - Increase an event counter for 'name' by 'cnt'
683 * @sch: scx_sched to account events for
684 * @name: an event name defined in struct scx_event_stats
 * @cnt: the number of times the event occurred
686 *
687 * This can be used when preemption is not disabled.
688 */
689 #define scx_add_event(sch, name, cnt) do { \
690 this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
691 trace_sched_ext_event(#name, (cnt)); \
692 } while(0)
693
694 /**
695 * __scx_add_event - Increase an event counter for 'name' by 'cnt'
696 * @sch: scx_sched to account events for
697 * @name: an event name defined in struct scx_event_stats
 * @cnt: the number of times the event occurred
699 *
700 * This should be used only when preemption is disabled.
701 */
702 #define __scx_add_event(sch, name, cnt) do { \
703 __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
704 trace_sched_ext_event(#name, cnt); \
705 } while(0)
706
707 /**
708 * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
709 * @dst_e: destination event stats
710 * @src_e: source event stats
711 * @kind: a kind of event to be aggregated
712 */
713 #define scx_agg_event(dst_e, src_e, kind) do { \
714 (dst_e)->kind += READ_ONCE((src_e)->kind); \
715 } while(0)
716
717 /**
718 * scx_dump_event - Dump an event 'kind' in 'events' to 's'
719 * @s: output seq_buf
720 * @events: event stats
721 * @kind: a kind of event to dump
722 */
723 #define scx_dump_event(s, events, kind) do { \
724 dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \
725 } while (0)
726
727
728 static void scx_read_events(struct scx_sched *sch,
729 struct scx_event_stats *events);
730
static enum scx_enable_state scx_enable_state(void)
732 {
733 return atomic_read(&scx_enable_state_var);
734 }
735
static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to)
737 {
738 return atomic_xchg(&scx_enable_state_var, to);
739 }
740
static bool scx_tryset_enable_state(enum scx_enable_state to,
				    enum scx_enable_state from)
743 {
744 int from_v = from;
745
746 return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to);
747 }
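
/*
 * A minimal usage sketch, assuming the SCX_ENABLING/SCX_ENABLED states that
 * accompany %SCX_DISABLED in the enable-state enum:
 *
 *	WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
 *	...
 *	if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING))
 *		goto err_disable;
 *
 * i.e. the transition to ENABLED only succeeds if nothing started tearing the
 * scheduler down in the meantime.
 */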
748
749 /**
750 * wait_ops_state - Busy-wait the specified ops state to end
751 * @p: target task
752 * @opss: state to wait the end of
753 *
754 * Busy-wait for @p to transition out of @opss. This can only be used when the
755 * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also
756 * has load_acquire semantics to ensure that the caller can see the updates made
757 * in the enqueueing and dispatching paths.
758 */
static void wait_ops_state(struct task_struct *p, unsigned long opss)
760 {
761 do {
762 cpu_relax();
763 } while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
764 }
765
static inline bool __cpu_valid(s32 cpu)
767 {
768 return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu));
769 }
770
771 /**
772 * ops_cpu_valid - Verify a cpu number, to be used on ops input args
773 * @sch: scx_sched to abort on error
774 * @cpu: cpu number which came from a BPF ops
775 * @where: extra information reported on error
776 *
777 * @cpu is a cpu number which came from the BPF scheduler and can be any value.
778 * Verify that it is in range and one of the possible cpus. If invalid, trigger
779 * an ops error.
780 */
static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
782 {
783 if (__cpu_valid(cpu)) {
784 return true;
785 } else {
786 scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
787 return false;
788 }
789 }
790
791 /**
792 * ops_sanitize_err - Sanitize a -errno value
793 * @sch: scx_sched to error out on error
794 * @ops_name: operation to blame on failure
795 * @err: -errno value to sanitize
796 *
797 * Verify @err is a valid -errno. If not, trigger scx_error() and return
798 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
 * cause misbehaviors. For example, a large negative return from
 * ops.init_task() triggers an oops when passed up the call chain because the
 * value fails the IS_ERR() test after being encoded with ERR_PTR() and then is
 * handled as a pointer.
803 */
static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err)
805 {
806 if (err < 0 && err >= -MAX_ERRNO)
807 return err;
808
809 scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err);
810 return -EPROTO;
811 }
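
/*
 * Illustrative usage (sketch): a callsite propagating an errno returned by a
 * BPF op, e.g. ops.init_task() as mentioned above, funnels it through this
 * helper before acting on it:
 *
 *	ret = ops_sanitize_err(sch, "init_task", ret);
 */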
812
static void run_deferred(struct rq *rq)
814 {
815 process_ddsp_deferred_locals(rq);
816
817 if (local_read(&rq->scx.reenq_local_deferred)) {
818 local_set(&rq->scx.reenq_local_deferred, 0);
819 reenq_local(rq);
820 }
821 }
822
static void deferred_bal_cb_workfn(struct rq *rq)
824 {
825 run_deferred(rq);
826 }
827
static void deferred_irq_workfn(struct irq_work *irq_work)
829 {
830 struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work);
831
832 raw_spin_rq_lock(rq);
833 run_deferred(rq);
834 raw_spin_rq_unlock(rq);
835 }
836
837 /**
838 * schedule_deferred - Schedule execution of deferred actions on an rq
839 * @rq: target rq
840 *
841 * Schedule execution of deferred actions on @rq. Deferred actions are executed
842 * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks
843 * to other rqs.
844 */
static void schedule_deferred(struct rq *rq)
846 {
	/*
	 * Queue an irq work. It is executed on IRQ re-enable, which may take a
	 * bit longer than the scheduler hook used by schedule_deferred_locked().
	 */
851 irq_work_queue(&rq->scx.deferred_irq_work);
852 }
853
854 /**
855 * schedule_deferred_locked - Schedule execution of deferred actions on an rq
856 * @rq: target rq
857 *
858 * Schedule execution of deferred actions on @rq. Equivalent to
859 * schedule_deferred() but requires @rq to be locked and can be more efficient.
860 */
static void schedule_deferred_locked(struct rq *rq)
862 {
863 lockdep_assert_rq_held(rq);
864
865 /*
866 * If in the middle of waking up a task, task_woken_scx() will be called
867 * afterwards which will then run the deferred actions, no need to
868 * schedule anything.
869 */
870 if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
871 return;
872
873 /* Don't do anything if there already is a deferred operation. */
874 if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
875 return;
876
	/*
	 * If in balance, the balance callbacks will be called before the rq
	 * lock is released. Schedule one.
	 *
	 * We can't directly insert the callback into the rq's callback list:
	 * the call can drop the rq lock and make the pending balance callback
	 * visible to unrelated code paths that call rq_pin_lock(). Just let
	 * balance_one() know that it must do it itself.
	 */
888 if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
889 rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
890 return;
891 }
892
893 /*
894 * No scheduler hooks available. Use the generic irq_work path. The
895 * above WAKEUP and BALANCE paths should cover most of the cases and the
896 * time to IRQ re-enable shouldn't be long.
897 */
898 schedule_deferred(rq);
899 }
900
901 /**
902 * touch_core_sched - Update timestamp used for core-sched task ordering
903 * @rq: rq to read clock from, must be locked
904 * @p: task to update the timestamp for
905 *
906 * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
907 * implement global or local-DSQ FIFO ordering for core-sched. Should be called
908 * when a task becomes runnable and its turn on the CPU ends (e.g. slice
909 * exhaustion).
910 */
static void touch_core_sched(struct rq *rq, struct task_struct *p)
912 {
913 lockdep_assert_rq_held(rq);
914
915 #ifdef CONFIG_SCHED_CORE
916 /*
917 * It's okay to update the timestamp spuriously. Use
918 * sched_core_disabled() which is cheaper than enabled().
919 *
920 * As this is used to determine ordering between tasks of sibling CPUs,
921 * it may be better to use per-core dispatch sequence instead.
922 */
923 if (!sched_core_disabled())
924 p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq));
925 #endif
926 }
927
928 /**
929 * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
930 * @rq: rq to read clock from, must be locked
931 * @p: task being dispatched
932 *
933 * If the BPF scheduler implements custom core-sched ordering via
934 * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
935 * ordering within each local DSQ. This function is called from dispatch paths
936 * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
937 */
static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
939 {
940 lockdep_assert_rq_held(rq);
941
942 #ifdef CONFIG_SCHED_CORE
943 if (unlikely(SCX_HAS_OP(scx_root, core_sched_before)))
944 touch_core_sched(rq, p);
945 #endif
946 }
947
static void update_curr_scx(struct rq *rq)
949 {
950 struct task_struct *curr = rq->curr;
951 s64 delta_exec;
952
953 delta_exec = update_curr_common(rq);
954 if (unlikely(delta_exec <= 0))
955 return;
956
957 if (curr->scx.slice != SCX_SLICE_INF) {
958 curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec);
959 if (!curr->scx.slice)
960 touch_core_sched(rq, curr);
961 }
962 }
963
static bool scx_dsq_priq_less(struct rb_node *node_a,
			      const struct rb_node *node_b)
966 {
967 const struct task_struct *a =
968 container_of(node_a, struct task_struct, scx.dsq_priq);
969 const struct task_struct *b =
970 container_of(node_b, struct task_struct, scx.dsq_priq);
971
972 return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime);
973 }
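
/*
 * Note on the comparison above (illustrative): time_before64() evaluates the
 * difference as a signed 64-bit value, so the vtime ordering is wrap-around
 * safe. E.g. a task with dsq_vtime == U64_MAX - 10 still sorts before one
 * with dsq_vtime == 5 after the counter wraps.
 */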
974
static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
976 {
977 /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
978 WRITE_ONCE(dsq->nr, dsq->nr + delta);
979 }
980
static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
982 {
983 p->scx.slice = READ_ONCE(scx_slice_dfl);
984 __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
985 }
986
static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p,
			       u64 enq_flags)
989 {
990 struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
991 bool preempt = false;
992
993 /*
994 * If @rq is in balance, the CPU is already vacant and looking for the
995 * next task to run. No need to preempt or trigger resched after moving
996 * @p into its local DSQ.
997 */
998 if (rq->scx.flags & SCX_RQ_IN_BALANCE)
999 return;
1000
1001 if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
1002 rq->curr->sched_class == &ext_sched_class) {
1003 rq->curr->scx.slice = 0;
1004 preempt = true;
1005 }
1006
1007 if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class))
1008 resched_curr(rq);
1009 }
1010
static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
			     struct task_struct *p, u64 enq_flags)
1013 {
1014 bool is_local = dsq->id == SCX_DSQ_LOCAL;
1015
1016 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
1017 WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) ||
1018 !RB_EMPTY_NODE(&p->scx.dsq_priq));
1019
1020 if (!is_local) {
1021 raw_spin_lock_nested(&dsq->lock,
1022 (enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0);
1023
1024 if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
1025 scx_error(sch, "attempting to dispatch to a destroyed dsq");
1026 /* fall back to the global dsq */
1027 raw_spin_unlock(&dsq->lock);
1028 dsq = find_global_dsq(sch, p);
1029 raw_spin_lock(&dsq->lock);
1030 }
1031 }
1032
1033 if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
1034 (enq_flags & SCX_ENQ_DSQ_PRIQ))) {
1035 /*
1036 * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from
1037 * their FIFO queues. To avoid confusion and accidentally
1038 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we
1039 * disallow any internal DSQ from doing vtime ordering of
1040 * tasks.
1041 */
1042 scx_error(sch, "cannot use vtime ordering for built-in DSQs");
1043 enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
1044 }
1045
1046 if (enq_flags & SCX_ENQ_DSQ_PRIQ) {
1047 struct rb_node *rbp;
1048
1049 /*
1050 * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are
1051 * linked to both the rbtree and list on PRIQs, this can only be
1052 * tested easily when adding the first task.
1053 */
1054 if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
1055 nldsq_next_task(dsq, NULL, false)))
1056 scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks",
1057 dsq->id);
1058
1059 p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
1060 rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);
1061
1062 /*
1063 * Find the previous task and insert after it on the list so
1064 * that @dsq->list is vtime ordered.
1065 */
1066 rbp = rb_prev(&p->scx.dsq_priq);
1067 if (rbp) {
1068 struct task_struct *prev =
1069 container_of(rbp, struct task_struct,
1070 scx.dsq_priq);
1071 list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
1072 /* first task unchanged - no update needed */
1073 } else {
1074 list_add(&p->scx.dsq_list.node, &dsq->list);
1075 /* not builtin and new task is at head - use fastpath */
1076 rcu_assign_pointer(dsq->first_task, p);
1077 }
1078 } else {
1079 /* a FIFO DSQ shouldn't be using PRIQ enqueuing */
1080 if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
1081 scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
1082 dsq->id);
1083
1084 if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) {
1085 list_add(&p->scx.dsq_list.node, &dsq->list);
1086 /* new task inserted at head - use fastpath */
1087 if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
1088 rcu_assign_pointer(dsq->first_task, p);
1089 } else {
1090 bool was_empty;
1091
1092 was_empty = list_empty(&dsq->list);
1093 list_add_tail(&p->scx.dsq_list.node, &dsq->list);
1094 if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
1095 rcu_assign_pointer(dsq->first_task, p);
1096 }
1097 }
1098
1099 /* seq records the order tasks are queued, used by BPF DSQ iterator */
1100 dsq->seq++;
1101 p->scx.dsq_seq = dsq->seq;
1102
1103 dsq_mod_nr(dsq, 1);
1104 p->scx.dsq = dsq;
1105
1106 /*
1107 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
1108 * direct dispatch path, but we clear them here because the direct
1109 * dispatch verdict may be overridden on the enqueue path during e.g.
1110 * bypass.
1111 */
1112 p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
1113 p->scx.ddsp_enq_flags = 0;
1114
1115 /*
1116 * We're transitioning out of QUEUEING or DISPATCHING. store_release to
1117 * match waiters' load_acquire.
1118 */
1119 if (enq_flags & SCX_ENQ_CLEAR_OPSS)
1120 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1121
1122 if (is_local)
1123 local_dsq_post_enq(dsq, p, enq_flags);
1124 else
1125 raw_spin_unlock(&dsq->lock);
1126 }
1127
static void task_unlink_from_dsq(struct task_struct *p,
				 struct scx_dispatch_q *dsq)
1130 {
1131 WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));
1132
1133 if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) {
1134 rb_erase(&p->scx.dsq_priq, &dsq->priq);
1135 RB_CLEAR_NODE(&p->scx.dsq_priq);
1136 p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ;
1137 }
1138
1139 list_del_init(&p->scx.dsq_list.node);
1140 dsq_mod_nr(dsq, -1);
1141
1142 if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
1143 struct task_struct *first_task;
1144
1145 first_task = nldsq_next_task(dsq, NULL, false);
1146 rcu_assign_pointer(dsq->first_task, first_task);
1147 }
1148 }
1149
static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
1151 {
1152 struct scx_dispatch_q *dsq = p->scx.dsq;
1153 bool is_local = dsq == &rq->scx.local_dsq;
1154
1155 lockdep_assert_rq_held(rq);
1156
1157 if (!dsq) {
1158 /*
1159 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
1160 * Unlinking is all that's needed to cancel.
1161 */
1162 if (unlikely(!list_empty(&p->scx.dsq_list.node)))
1163 list_del_init(&p->scx.dsq_list.node);
1164
1165 /*
1166 * When dispatching directly from the BPF scheduler to a local
1167 * DSQ, the task isn't associated with any DSQ but
1168 * @p->scx.holding_cpu may be set under the protection of
1169 * %SCX_OPSS_DISPATCHING.
1170 */
1171 if (p->scx.holding_cpu >= 0)
1172 p->scx.holding_cpu = -1;
1173
1174 return;
1175 }
1176
1177 if (!is_local)
1178 raw_spin_lock(&dsq->lock);
1179
1180 /*
1181 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't
1182 * change underneath us.
1183 */
1184 if (p->scx.holding_cpu < 0) {
1185 /* @p must still be on @dsq, dequeue */
1186 task_unlink_from_dsq(p, dsq);
1187 } else {
1188 /*
1189 * We're racing against dispatch_to_local_dsq() which already
1190 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
1191 * holding_cpu which tells dispatch_to_local_dsq() that it lost
1192 * the race.
1193 */
1194 WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
1195 p->scx.holding_cpu = -1;
1196 }
1197 p->scx.dsq = NULL;
1198
1199 if (!is_local)
1200 raw_spin_unlock(&dsq->lock);
1201 }
1202
1203 /*
1204 * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq
1205 * and dsq are locked.
1206 */
static void dispatch_dequeue_locked(struct task_struct *p,
				    struct scx_dispatch_q *dsq)
1209 {
1210 lockdep_assert_rq_held(task_rq(p));
1211 lockdep_assert_held(&dsq->lock);
1212
1213 task_unlink_from_dsq(p, dsq);
1214 p->scx.dsq = NULL;
1215 }
1216
static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
						    struct rq *rq, u64 dsq_id,
						    struct task_struct *p)
1220 {
1221 struct scx_dispatch_q *dsq;
1222
1223 if (dsq_id == SCX_DSQ_LOCAL)
1224 return &rq->scx.local_dsq;
1225
1226 if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
1227 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
1228
1229 if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
1230 return find_global_dsq(sch, p);
1231
1232 return &cpu_rq(cpu)->scx.local_dsq;
1233 }
1234
1235 if (dsq_id == SCX_DSQ_GLOBAL)
1236 dsq = find_global_dsq(sch, p);
1237 else
1238 dsq = find_user_dsq(sch, dsq_id);
1239
1240 if (unlikely(!dsq)) {
1241 scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
1242 dsq_id, p->comm, p->pid);
1243 return find_global_dsq(sch, p);
1244 }
1245
1246 return dsq;
1247 }
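
/*
 * For illustration: a dsq_id of %SCX_DSQ_LOCAL targets the local DSQ of the
 * dispatching CPU, while (SCX_DSQ_LOCAL_ON | 3) targets CPU 3's local DSQ,
 * e.g. from the BPF side:
 *
 *	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
 *
 * Invalid CPU numbers and unknown user DSQ ids fall back to the global DSQ
 * after raising an ops error, as implemented above.
 */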
1248
static void mark_direct_dispatch(struct scx_sched *sch,
				 struct task_struct *ddsp_task,
				 struct task_struct *p, u64 dsq_id,
				 u64 enq_flags)
1253 {
1254 /*
1255 * Mark that dispatch already happened from ops.select_cpu() or
1256 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value
1257 * which can never match a valid task pointer.
1258 */
1259 __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH));
1260
1261 /* @p must match the task on the enqueue path */
1262 if (unlikely(p != ddsp_task)) {
1263 if (IS_ERR(ddsp_task))
1264 scx_error(sch, "%s[%d] already direct-dispatched",
1265 p->comm, p->pid);
1266 else
1267 scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
1268 ddsp_task->comm, ddsp_task->pid,
1269 p->comm, p->pid);
1270 return;
1271 }
1272
1273 WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
1274 WARN_ON_ONCE(p->scx.ddsp_enq_flags);
1275
1276 p->scx.ddsp_dsq_id = dsq_id;
1277 p->scx.ddsp_enq_flags = enq_flags;
1278 }
1279
static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
			    u64 enq_flags)
1282 {
1283 struct rq *rq = task_rq(p);
1284 struct scx_dispatch_q *dsq =
1285 find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
1286
1287 touch_core_sched_dispatch(rq, p);
1288
1289 p->scx.ddsp_enq_flags |= enq_flags;
1290
1291 /*
1292 * We are in the enqueue path with @rq locked and pinned, and thus can't
1293 * double lock a remote rq and enqueue to its local DSQ. For
1294 * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer
1295 * the enqueue so that it's executed when @rq can be unlocked.
1296 */
1297 if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) {
1298 unsigned long opss;
1299
1300 opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;
1301
1302 switch (opss & SCX_OPSS_STATE_MASK) {
1303 case SCX_OPSS_NONE:
1304 break;
1305 case SCX_OPSS_QUEUEING:
1306 /*
1307 * As @p was never passed to the BPF side, _release is
1308 * not strictly necessary. Still do it for consistency.
1309 */
1310 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1311 break;
1312 default:
1313 WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()",
1314 p->comm, p->pid, opss);
1315 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1316 break;
1317 }
1318
1319 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
1320 list_add_tail(&p->scx.dsq_list.node,
1321 &rq->scx.ddsp_deferred_locals);
1322 schedule_deferred_locked(rq);
1323 return;
1324 }
1325
1326 dispatch_enqueue(sch, dsq, p,
1327 p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
1328 }
1329
static bool scx_rq_online(struct rq *rq)
1331 {
1332 /*
1333 * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates
1334 * the online state as seen from the BPF scheduler. cpu_active() test
1335 * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will
1336 * stay set until the current scheduling operation is complete even if
1337 * we aren't locking @rq.
1338 */
1339 return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq)));
1340 }
1341
static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
			    int sticky_cpu)
1344 {
1345 struct scx_sched *sch = scx_root;
1346 struct task_struct **ddsp_taskp;
1347 struct scx_dispatch_q *dsq;
1348 unsigned long qseq;
1349
1350 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
1351
1352 /* rq migration */
1353 if (sticky_cpu == cpu_of(rq))
1354 goto local_norefill;
1355
1356 /*
1357 * If !scx_rq_online(), we already told the BPF scheduler that the CPU
1358 * is offline and are just running the hotplug path. Don't bother the
1359 * BPF scheduler.
1360 */
1361 if (!scx_rq_online(rq))
1362 goto local;
1363
1364 if (scx_rq_bypassing(rq)) {
1365 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
1366 goto bypass;
1367 }
1368
1369 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
1370 goto direct;
1371
1372 /* see %SCX_OPS_ENQ_EXITING */
1373 if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) &&
1374 unlikely(p->flags & PF_EXITING)) {
1375 __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
1376 goto local;
1377 }
1378
1379 /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
1380 if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
1381 is_migration_disabled(p)) {
1382 __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
1383 goto local;
1384 }
1385
1386 if (unlikely(!SCX_HAS_OP(sch, enqueue)))
1387 goto global;
1388
1389 /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
1390 qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;
1391
1392 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
1393 atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
1394
1395 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
1396 WARN_ON_ONCE(*ddsp_taskp);
1397 *ddsp_taskp = p;
1398
1399 SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags);
1400
1401 *ddsp_taskp = NULL;
1402 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
1403 goto direct;
1404
1405 /*
1406 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or
1407 * dequeue may be waiting. The store_release matches their load_acquire.
1408 */
1409 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
1410 return;
1411
1412 direct:
1413 direct_dispatch(sch, p, enq_flags);
1414 return;
1415 local_norefill:
1416 dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
1417 return;
1418 local:
1419 dsq = &rq->scx.local_dsq;
1420 goto enqueue;
1421 global:
1422 dsq = find_global_dsq(sch, p);
1423 goto enqueue;
1424 bypass:
1425 dsq = &task_rq(p)->scx.bypass_dsq;
1426 goto enqueue;
1427
1428 enqueue:
1429 /*
1430 * For task-ordering, slice refill must be treated as implying the end
1431 * of the current slice. Otherwise, the longer @p stays on the CPU, the
1432 * higher priority it becomes from scx_prio_less()'s POV.
1433 */
1434 touch_core_sched(rq, p);
1435 refill_task_slice_dfl(sch, p);
1436 dispatch_enqueue(sch, dsq, p, enq_flags);
1437 }
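
/*
 * For reference, the simplest BPF-side counterpart of the above is an
 * ops.enqueue() that immediately inserts into a shared DSQ, roughly as in the
 * scx_simple example scheduler (BPF-side sketch; SHARED_DSQ is a DSQ the
 * scheduler created during ops.init()):
 *
 *	void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
 *	}
 *
 * Calling scx_bpf_dsq_insert() from ops.enqueue() like this takes the
 * direct-dispatch path above: mark_direct_dispatch() records the verdict and
 * direct_dispatch() carries it out once ops.enqueue() returns.
 */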
1438
static bool task_runnable(const struct task_struct *p)
1440 {
1441 return !list_empty(&p->scx.runnable_node);
1442 }
1443
static void set_task_runnable(struct rq *rq, struct task_struct *p)
1445 {
1446 lockdep_assert_rq_held(rq);
1447
1448 if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
1449 p->scx.runnable_at = jiffies;
1450 p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
1451 }
1452
1453 /*
1454 * list_add_tail() must be used. scx_bypass() depends on tasks being
1455 * appended to the runnable_list.
1456 */
1457 list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
1458 }
1459
static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
1461 {
1462 list_del_init(&p->scx.runnable_node);
1463 if (reset_runnable_at)
1464 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
1465 }
1466
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
1468 {
1469 struct scx_sched *sch = scx_root;
1470 int sticky_cpu = p->scx.sticky_cpu;
1471
1472 if (enq_flags & ENQUEUE_WAKEUP)
1473 rq->scx.flags |= SCX_RQ_IN_WAKEUP;
1474
1475 enq_flags |= rq->scx.extra_enq_flags;
1476
1477 if (sticky_cpu >= 0)
1478 p->scx.sticky_cpu = -1;
1479
1480 /*
1481 * Restoring a running task will be immediately followed by
1482 * set_next_task_scx() which expects the task to not be on the BPF
1483 * scheduler as tasks can only start running through local DSQs. Force
1484 * direct-dispatch into the local DSQ by setting the sticky_cpu.
1485 */
1486 if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
1487 sticky_cpu = cpu_of(rq);
1488
1489 if (p->scx.flags & SCX_TASK_QUEUED) {
1490 WARN_ON_ONCE(!task_runnable(p));
1491 goto out;
1492 }
1493
1494 set_task_runnable(rq, p);
1495 p->scx.flags |= SCX_TASK_QUEUED;
1496 rq->scx.nr_running++;
1497 add_nr_running(rq, 1);
1498
1499 if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
1500 SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);
1501
1502 if (enq_flags & SCX_ENQ_WAKEUP)
1503 touch_core_sched(rq, p);
1504
1505 do_enqueue_task(rq, p, enq_flags, sticky_cpu);
1506 out:
1507 rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
1508
1509 if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
1510 unlikely(cpu_of(rq) != p->scx.selected_cpu))
1511 __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1);
1512 }
1513
static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
1515 {
1516 struct scx_sched *sch = scx_root;
1517 unsigned long opss;
1518
1519 /* dequeue is always temporary, don't reset runnable_at */
1520 clr_task_runnable(p, false);
1521
1522 /* acquire ensures that we see the preceding updates on QUEUED */
1523 opss = atomic_long_read_acquire(&p->scx.ops_state);
1524
1525 switch (opss & SCX_OPSS_STATE_MASK) {
1526 case SCX_OPSS_NONE:
1527 break;
1528 case SCX_OPSS_QUEUEING:
1529 /*
1530 * QUEUEING is started and finished while holding @p's rq lock.
1531 * As we're holding the rq lock now, we shouldn't see QUEUEING.
1532 */
1533 BUG();
1534 case SCX_OPSS_QUEUED:
1535 if (SCX_HAS_OP(sch, dequeue))
1536 SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
1537 p, deq_flags);
1538
1539 if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
1540 SCX_OPSS_NONE))
1541 break;
1542 fallthrough;
1543 case SCX_OPSS_DISPATCHING:
1544 /*
1545 * If @p is being dispatched from the BPF scheduler to a DSQ,
1546 * wait for the transfer to complete so that @p doesn't get
1547 * added to its DSQ after dequeueing is complete.
1548 *
1549 * As we're waiting on DISPATCHING with the rq locked, the
1550 * dispatching side shouldn't try to lock the rq while
1551 * DISPATCHING is set. See dispatch_to_local_dsq().
1552 *
1553 * DISPATCHING shouldn't have qseq set and control can reach
1554 * here with NONE @opss from the above QUEUED case block.
1555 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss.
1556 */
1557 wait_ops_state(p, SCX_OPSS_DISPATCHING);
1558 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
1559 break;
1560 }
1561 }
1562
static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
1564 {
1565 struct scx_sched *sch = scx_root;
1566
1567 if (!(p->scx.flags & SCX_TASK_QUEUED)) {
1568 WARN_ON_ONCE(task_runnable(p));
1569 return true;
1570 }
1571
1572 ops_dequeue(rq, p, deq_flags);
1573
1574 /*
1575 * A currently running task which is going off @rq first gets dequeued
1576 * and then stops running. As we want running <-> stopping transitions
1577 * to be contained within runnable <-> quiescent transitions, trigger
1578 * ->stopping() early here instead of in put_prev_task_scx().
1579 *
1580 * @p may go through multiple stopping <-> running transitions between
1581 * here and put_prev_task_scx() if task attribute changes occur while
1582 * balance_one() leaves @rq unlocked. However, they don't contain any
1583 * information meaningful to the BPF scheduler and can be suppressed by
1584 * skipping the callbacks if the task is !QUEUED.
1585 */
1586 if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) {
1587 update_curr_scx(rq);
1588 SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false);
1589 }
1590
1591 if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
1592 SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);
1593
1594 if (deq_flags & SCX_DEQ_SLEEP)
1595 p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
1596 else
1597 p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
1598
1599 p->scx.flags &= ~SCX_TASK_QUEUED;
1600 rq->scx.nr_running--;
1601 sub_nr_running(rq, 1);
1602
1603 dispatch_dequeue(rq, p);
1604 return true;
1605 }
1606
static void yield_task_scx(struct rq *rq)
1608 {
1609 struct scx_sched *sch = scx_root;
1610 struct task_struct *p = rq->donor;
1611
1612 if (SCX_HAS_OP(sch, yield))
1613 SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
1614 else
1615 p->scx.slice = 0;
1616 }
1617
static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
1619 {
1620 struct scx_sched *sch = scx_root;
1621 struct task_struct *from = rq->donor;
1622
1623 if (SCX_HAS_OP(sch, yield))
1624 return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
1625 from, to);
1626 else
1627 return false;
1628 }
1629
static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
					 struct scx_dispatch_q *src_dsq,
					 struct rq *dst_rq)
1633 {
1634 struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq;
1635
1636 /* @dsq is locked and @p is on @dst_rq */
1637 lockdep_assert_held(&src_dsq->lock);
1638 lockdep_assert_rq_held(dst_rq);
1639
1640 WARN_ON_ONCE(p->scx.holding_cpu >= 0);
1641
1642 if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
1643 list_add(&p->scx.dsq_list.node, &dst_dsq->list);
1644 else
1645 list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list);
1646
1647 dsq_mod_nr(dst_dsq, 1);
1648 p->scx.dsq = dst_dsq;
1649
1650 local_dsq_post_enq(dst_dsq, p, enq_flags);
1651 }
1652
1653 /**
1654 * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
1655 * @p: task to move
1656 * @enq_flags: %SCX_ENQ_*
1657 * @src_rq: rq to move the task from, locked on entry, released on return
1658 * @dst_rq: rq to move the task into, locked on return
1659 *
1660 * Move @p which is currently on @src_rq to @dst_rq's local DSQ.
1661 */
1662 static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
1663 struct rq *src_rq, struct rq *dst_rq)
1664 {
1665 lockdep_assert_rq_held(src_rq);
1666
1667 /* the following marks @p MIGRATING which excludes dequeue */
1668 deactivate_task(src_rq, p, 0);
1669 set_task_cpu(p, cpu_of(dst_rq));
1670 p->scx.sticky_cpu = cpu_of(dst_rq);
1671
1672 raw_spin_rq_unlock(src_rq);
1673 raw_spin_rq_lock(dst_rq);
1674
1675 /*
1676 * We want to pass scx-specific enq_flags but activate_task() will
1677 * truncate the upper 32 bits. As we own @rq, we can pass them through
1678 * @rq->scx.extra_enq_flags instead.
1679 */
1680 WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr));
1681 WARN_ON_ONCE(dst_rq->scx.extra_enq_flags);
1682 dst_rq->scx.extra_enq_flags = enq_flags;
1683 activate_task(dst_rq, p, 0);
1684 dst_rq->scx.extra_enq_flags = 0;
1685 }
1686
1687 /*
1688 * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two
1689 * differences:
1690 *
1691 * - is_cpu_allowed() asks "Can this task run on this CPU?" while
1692 * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to
1693 * this CPU?".
1694 *
1695 * While migration is disabled, is_cpu_allowed() has to say "yes" as the task
1696 * must be allowed to finish on the CPU that it's currently on regardless of
1697 * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the
1698 * BPF scheduler shouldn't attempt to migrate a task which has migration
1699 * disabled.
1700 *
1701 * - The BPF scheduler is bypassed while the rq is offline, so we can always say
1702 * no to BPF-scheduler-initiated migrations while offline.
1703 *
1704 * The caller must ensure that @p and @rq are on different CPUs.
1705 */
1706 static bool task_can_run_on_remote_rq(struct scx_sched *sch,
1707 struct task_struct *p, struct rq *rq,
1708 bool enforce)
1709 {
1710 int cpu = cpu_of(rq);
1711
1712 WARN_ON_ONCE(task_cpu(p) == cpu);
1713
1714 /*
1715 * If @p has migration disabled, @p->cpus_ptr is updated to contain only
1716 * the pinned CPU in migrate_disable_switch() while @p is being switched
1717 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
1718 * updated and thus another CPU may see @p on a DSQ in between, leading to
1719 * @p passing the below task_allowed_on_cpu() check while migration is
1720 * disabled.
1721 *
1722 * Test the migration disabled state first as the race window is narrow
1723 * and the BPF scheduler failing to check migration disabled state can
1724 * easily be masked if task_allowed_on_cpu() is done first.
1725 */
1726 if (unlikely(is_migration_disabled(p))) {
1727 if (enforce)
1728 scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
1729 p->comm, p->pid, task_cpu(p), cpu);
1730 return false;
1731 }
1732
1733 /*
1734 * We don't require the BPF scheduler to avoid dispatching to offline
1735 * CPUs mostly for convenience but also because CPUs can go offline
1736 * between scx_bpf_dsq_insert() calls and here. Trigger error iff the
1737 * picked CPU is outside the allowed mask.
1738 */
1739 if (!task_allowed_on_cpu(p, cpu)) {
1740 if (enforce)
1741 scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
1742 cpu, p->comm, p->pid);
1743 return false;
1744 }
1745
1746 if (!scx_rq_online(rq)) {
1747 if (enforce)
1748 __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
1749 return false;
1750 }
1751
1752 return true;
1753 }
1754
1755 /**
1756 * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
1757 * @p: target task
1758 * @dsq: locked DSQ @p is currently on
1759 * @src_rq: rq @p is currently on, stable with @dsq locked
1760 *
1761 * Called with @dsq locked but no rq's locked. We want to move @p to a different
1762 * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is
1763 * required when transferring into a local DSQ. Even when transferring into a
1764 * non-local DSQ, it's better to use the same mechanism to protect against
1765 * dequeues and maintain the invariant that @p->scx.dsq can only change while
1766 * @src_rq is locked, which e.g. scx_dump_task() depends on.
1767 *
1768 * We want to grab @src_rq but that can deadlock if we try while locking @dsq,
1769 * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As
1770 * this may race with dequeue, which can't drop the rq lock or fail, do a little
1771 * dancing from our side.
1772 *
1773 * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets
1774 * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu
1775 * would be cleared to -1. While other cpus may have updated it to different
1776 * values afterwards, as this operation can't be preempted or recurse, the
1777 * holding_cpu can never become this CPU again before we're done. Thus, we can
1778 * tell whether we lost to dequeue by testing whether the holding_cpu still
1779 * points to this CPU. See dispatch_dequeue() for the counterpart.
1780 *
1781 * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is
1782 * still valid. %false if lost to dequeue.
1783 */
1784 static bool unlink_dsq_and_lock_src_rq(struct task_struct *p,
1785 struct scx_dispatch_q *dsq,
1786 struct rq *src_rq)
1787 {
1788 s32 cpu = raw_smp_processor_id();
1789
1790 lockdep_assert_held(&dsq->lock);
1791
1792 WARN_ON_ONCE(p->scx.holding_cpu >= 0);
1793 task_unlink_from_dsq(p, dsq);
1794 p->scx.holding_cpu = cpu;
1795
1796 raw_spin_unlock(&dsq->lock);
1797 raw_spin_rq_lock(src_rq);
1798
1799 /* task_rq couldn't have changed if we're still the holding cpu */
1800 return likely(p->scx.holding_cpu == cpu) &&
1801 !WARN_ON_ONCE(src_rq != task_rq(p));
1802 }
1803
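/*
 * Try to move @p, which is on @dsq and @src_rq, into @this_rq's local DSQ.
 * @this_rq is temporarily unlocked while @src_rq is locked for the transfer and
 * is locked again before returning. Returns %true on success, %false if @p was
 * lost to a racing dequeue. @dsq is unlocked in either case.
 */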
1804 static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
1805 struct scx_dispatch_q *dsq, struct rq *src_rq)
1806 {
1807 raw_spin_rq_unlock(this_rq);
1808
1809 if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) {
1810 move_remote_task_to_local_dsq(p, 0, src_rq, this_rq);
1811 return true;
1812 } else {
1813 raw_spin_rq_unlock(src_rq);
1814 raw_spin_rq_lock(this_rq);
1815 return false;
1816 }
1817 }
1818
1819 /**
1820 * move_task_between_dsqs() - Move a task from one DSQ to another
1821 * @sch: scx_sched being operated on
1822 * @p: target task
1823 * @enq_flags: %SCX_ENQ_*
1824 * @src_dsq: DSQ @p is currently on, must not be a local DSQ
1825 * @dst_dsq: DSQ @p is being moved to, can be any DSQ
1826 *
1827 * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local
1828 * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq
1829 * will change. As @p's task_rq is locked, this function doesn't need to use the
1830 * holding_cpu mechanism.
1831 *
1832 * On return, @src_dsq is unlocked and only @p's new task_rq, which is the
1833 * return value, is locked.
1834 */
1835 static struct rq *move_task_between_dsqs(struct scx_sched *sch,
1836 struct task_struct *p, u64 enq_flags,
1837 struct scx_dispatch_q *src_dsq,
1838 struct scx_dispatch_q *dst_dsq)
1839 {
1840 struct rq *src_rq = task_rq(p), *dst_rq;
1841
1842 BUG_ON(src_dsq->id == SCX_DSQ_LOCAL);
1843 lockdep_assert_held(&src_dsq->lock);
1844 lockdep_assert_rq_held(src_rq);
1845
1846 if (dst_dsq->id == SCX_DSQ_LOCAL) {
1847 dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
1848 if (src_rq != dst_rq &&
1849 unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
1850 dst_dsq = find_global_dsq(sch, p);
1851 dst_rq = src_rq;
1852 }
1853 } else {
1854 /* no need to migrate if destination is a non-local DSQ */
1855 dst_rq = src_rq;
1856 }
1857
1858 /*
1859 * Move @p into @dst_dsq. If @dst_dsq is the local DSQ of a different
1860 * CPU, @p will be migrated.
1861 */
1862 if (dst_dsq->id == SCX_DSQ_LOCAL) {
1863 /* @p is going from a non-local DSQ to a local DSQ */
1864 if (src_rq == dst_rq) {
1865 task_unlink_from_dsq(p, src_dsq);
1866 move_local_task_to_local_dsq(p, enq_flags,
1867 src_dsq, dst_rq);
1868 raw_spin_unlock(&src_dsq->lock);
1869 } else {
1870 raw_spin_unlock(&src_dsq->lock);
1871 move_remote_task_to_local_dsq(p, enq_flags,
1872 src_rq, dst_rq);
1873 }
1874 } else {
1875 /*
1876 * @p is going from a non-local DSQ to a non-local DSQ. As
1877 * @src_dsq is already locked, do an abbreviated dequeue.
1878 */
1879 dispatch_dequeue_locked(p, src_dsq);
1880 raw_spin_unlock(&src_dsq->lock);
1881
1882 dispatch_enqueue(sch, dst_dsq, p, enq_flags);
1883 }
1884
1885 return dst_rq;
1886 }
1887
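/*
 * Try to move a task from @dsq into @rq's local DSQ. Tasks which can't run on
 * @rq are skipped. Returns %true if a task was consumed, %false if @dsq was
 * empty or no eligible task was found.
 */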
1888 static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
1889 struct scx_dispatch_q *dsq)
1890 {
1891 struct task_struct *p;
1892 retry:
1893 /*
1894 * The caller can't expect to successfully consume a task if the task's
1895 * addition to @dsq isn't guaranteed to be visible somehow. Test
1896 * @dsq->list without locking and skip if it seems empty.
1897 */
1898 if (list_empty(&dsq->list))
1899 return false;
1900
1901 raw_spin_lock(&dsq->lock);
1902
1903 nldsq_for_each_task(p, dsq) {
1904 struct rq *task_rq = task_rq(p);
1905
1906 /*
1907 * This loop can lead to multiple lockup scenarios, e.g. the BPF
1908 * scheduler can put an enormous number of affinitized tasks into
1909 * a contended DSQ, or the outer retry loop can repeatedly race
1910 * against scx_bypass() dequeueing tasks from @dsq trying to put
1911 * the system into the bypass mode. This can easily live-lock the
1912 * machine. If aborting, exit from all non-bypass DSQs.
1913 */
1914 if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS)
1915 break;
1916
1917 if (rq == task_rq) {
1918 task_unlink_from_dsq(p, dsq);
1919 move_local_task_to_local_dsq(p, 0, dsq, rq);
1920 raw_spin_unlock(&dsq->lock);
1921 return true;
1922 }
1923
1924 if (task_can_run_on_remote_rq(sch, p, rq, false)) {
1925 if (likely(consume_remote_task(rq, p, dsq, task_rq)))
1926 return true;
1927 goto retry;
1928 }
1929 }
1930
1931 raw_spin_unlock(&dsq->lock);
1932 return false;
1933 }
1934
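/* Consume from the global DSQ associated with @rq's NUMA node. */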
1935 static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
1936 {
1937 int node = cpu_to_node(cpu_of(rq));
1938
1939 return consume_dispatch_q(sch, rq, sch->global_dsqs[node]);
1940 }
1941
1942 /**
1943 * dispatch_to_local_dsq - Dispatch a task to a local dsq
1944 * @sch: scx_sched being operated on
1945 * @rq: current rq which is locked
1946 * @dst_dsq: destination DSQ
1947 * @p: task to dispatch
1948 * @enq_flags: %SCX_ENQ_*
1949 *
1950 * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
1951 * DSQ. This function performs all the synchronization dancing needed because
1952 * local DSQs are protected with rq locks.
1953 *
1954 * The caller must have exclusive ownership of @p (e.g. through
1955 * %SCX_OPSS_DISPATCHING).
1956 */
1957 static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
1958 struct scx_dispatch_q *dst_dsq,
1959 struct task_struct *p, u64 enq_flags)
1960 {
1961 struct rq *src_rq = task_rq(p);
1962 struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
1963 struct rq *locked_rq = rq;
1964
1965 /*
1966 * We're synchronized against dequeue through DISPATCHING. As @p can't
1967 * be dequeued, its task_rq and cpus_allowed are stable too.
1968 *
1969 * If dispatching to @rq that @p is already on, no lock dancing needed.
1970 */
1971 if (rq == src_rq && rq == dst_rq) {
1972 dispatch_enqueue(sch, dst_dsq, p,
1973 enq_flags | SCX_ENQ_CLEAR_OPSS);
1974 return;
1975 }
1976
1977 if (src_rq != dst_rq &&
1978 unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
1979 dispatch_enqueue(sch, find_global_dsq(sch, p), p,
1980 enq_flags | SCX_ENQ_CLEAR_OPSS);
1981 return;
1982 }
1983
1984 /*
1985 * @p is on a possibly remote @src_rq which we need to lock to move the
1986 * task. If dequeue is in progress, it'd be locking @src_rq and waiting
1987 * on DISPATCHING, so we can't grab @src_rq lock while holding
1988 * DISPATCHING.
1989 *
1990 * As DISPATCHING guarantees that @p is wholly ours, we can pretend that
1991 * we're moving from a DSQ and use the same mechanism - mark the task
1992 * under transfer with holding_cpu, release DISPATCHING and then follow
1993 * the same protocol. See unlink_dsq_and_lock_src_rq().
1994 */
1995 p->scx.holding_cpu = raw_smp_processor_id();
1996
1997 /* store_release ensures that dequeue sees the above */
1998 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1999
2000 /* switch to @src_rq lock */
2001 if (locked_rq != src_rq) {
2002 raw_spin_rq_unlock(locked_rq);
2003 locked_rq = src_rq;
2004 raw_spin_rq_lock(src_rq);
2005 }
2006
2007 /* task_rq couldn't have changed if we're still the holding cpu */
2008 if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
2009 !WARN_ON_ONCE(src_rq != task_rq(p))) {
2010 /*
2011 * If @p is staying on the same rq, there's no need to go
2012 * through the full deactivate/activate cycle. Optimize by
2013 * abbreviating move_remote_task_to_local_dsq().
2014 */
2015 if (src_rq == dst_rq) {
2016 p->scx.holding_cpu = -1;
2017 dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p,
2018 enq_flags);
2019 } else {
2020 move_remote_task_to_local_dsq(p, enq_flags,
2021 src_rq, dst_rq);
2022 /* task has been moved to dst_rq, which is now locked */
2023 locked_rq = dst_rq;
2024 }
2025
2026 /* if the destination CPU is idle, wake it up */
2027 if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
2028 resched_curr(dst_rq);
2029 }
2030
2031 /* switch back to @rq lock */
2032 if (locked_rq != rq) {
2033 raw_spin_rq_unlock(locked_rq);
2034 raw_spin_rq_lock(rq);
2035 }
2036 }
2037
2038 /**
2039 * finish_dispatch - Asynchronously finish dispatching a task
2040 * @rq: current rq which is locked
2041 * @p: task to finish dispatching
2042 * @qseq_at_dispatch: qseq when @p started getting dispatched
2043 * @dsq_id: destination DSQ ID
2044 * @enq_flags: %SCX_ENQ_*
2045 *
2046 * Dispatching to local DSQs may need to wait for queueing to complete or
2047 * require rq lock dancing. As we don't want to do either while inside
2048 * ops.dispatch() to avoid locking order inversion, we split dispatching into
2049 * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the
2050 * task and its qseq. Once ops.dispatch() returns, this function is called to
2051 * finish up.
2052 *
2053 * There is no guarantee that @p is still valid for dispatching or even that it
2054 * was valid in the first place. Make sure that the task is still owned by the
2055 * BPF scheduler and claim the ownership before dispatching.
2056 */
2057 static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
2058 struct task_struct *p,
2059 unsigned long qseq_at_dispatch,
2060 u64 dsq_id, u64 enq_flags)
2061 {
2062 struct scx_dispatch_q *dsq;
2063 unsigned long opss;
2064
2065 touch_core_sched_dispatch(rq, p);
2066 retry:
2067 /*
2068 * No need for _acquire here. @p is accessed only after a successful
2069 * try_cmpxchg to DISPATCHING.
2070 */
2071 opss = atomic_long_read(&p->scx.ops_state);
2072
2073 switch (opss & SCX_OPSS_STATE_MASK) {
2074 case SCX_OPSS_DISPATCHING:
2075 case SCX_OPSS_NONE:
2076 /* someone else already got to it */
2077 return;
2078 case SCX_OPSS_QUEUED:
2079 /*
2080 * If qseq doesn't match, @p has gone through at least one
2081 * dispatch/dequeue and re-enqueue cycle between
2082 * scx_bpf_dsq_insert() and here and we have no claim on it.
2083 */
2084 if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
2085 return;
2086
2087 /*
2088 * While we know @p is accessible, we don't yet have a claim on
2089 * it - the BPF scheduler is allowed to dispatch tasks
2090 * spuriously and there can be a racing dequeue attempt. Let's
2091 * claim @p by atomically transitioning it from QUEUED to
2092 * DISPATCHING.
2093 */
2094 if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
2095 SCX_OPSS_DISPATCHING)))
2096 break;
2097 goto retry;
2098 case SCX_OPSS_QUEUEING:
2099 /*
2100 * do_enqueue_task() is in the process of transferring the task
2101 * to the BPF scheduler while holding @p's rq lock. As we aren't
2102 * holding any kernel or BPF resource that the enqueue path may
2103 * depend upon, it's safe to wait.
2104 */
2105 wait_ops_state(p, opss);
2106 goto retry;
2107 }
2108
2109 BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
2110
2111 dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p);
2112
2113 if (dsq->id == SCX_DSQ_LOCAL)
2114 dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
2115 else
2116 dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
2117 }
2118
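/*
 * Finish all dispatches that ops.dispatch() buffered through
 * scx_bpf_dsq_insert() on this CPU and reset the dispatch buffer.
 *
 * A minimal sketch of the BPF-side counterpart, for illustration only, using
 * the BPF_STRUCT_OPS convention from the example schedulers. The callback name
 * and pick_some_task() are hypothetical; scx_bpf_dsq_insert() and the SCX_*
 * constants are the real kfunc and flags referenced above:
 *
 *	void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		struct task_struct *p = pick_some_task();
 *
 *		if (p)
 *			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 *	}
 *
 * Each such insertion lands in the per-CPU dispatch buffer and is completed by
 * finish_dispatch() above once ops.dispatch() returns.
 */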
2119 static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
2120 {
2121 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
2122 u32 u;
2123
2124 for (u = 0; u < dspc->cursor; u++) {
2125 struct scx_dsp_buf_ent *ent = &dspc->buf[u];
2126
2127 finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
2128 ent->enq_flags);
2129 }
2130
2131 dspc->nr_tasks += dspc->cursor;
2132 dspc->cursor = 0;
2133 }
2134
2135 static inline void maybe_queue_balance_callback(struct rq *rq)
2136 {
2137 lockdep_assert_rq_held(rq);
2138
2139 if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
2140 return;
2141
2142 queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
2143 deferred_bal_cb_workfn);
2144
2145 rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
2146 }
2147
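/*
 * Prepare @rq for picking the next SCX task. Keep running @prev if it still has
 * slice left; otherwise try the local DSQ, the node's global DSQ and, unless
 * bypassing, call ops.dispatch() in a bounded loop until the local DSQ has
 * something to run. Returns %true if there is a task for @rq (possibly @prev,
 * indicated by %SCX_RQ_BAL_KEEP), %false otherwise.
 */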
2148 static int balance_one(struct rq *rq, struct task_struct *prev)
2149 {
2150 struct scx_sched *sch = scx_root;
2151 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
2152 bool prev_on_scx = prev->sched_class == &ext_sched_class;
2153 bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
2154 int nr_loops = SCX_DSP_MAX_LOOPS;
2155
2156 lockdep_assert_rq_held(rq);
2157 rq->scx.flags |= SCX_RQ_IN_BALANCE;
2158 rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
2159
2160 if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
2161 unlikely(rq->scx.cpu_released)) {
2162 /*
2163 * If the previous sched_class for the current CPU was not SCX,
2164 * notify the BPF scheduler that it again has control of the
2165 * core. This callback complements ->cpu_release(), which is
2166 * emitted in switch_class().
2167 */
2168 if (SCX_HAS_OP(sch, cpu_acquire))
2169 SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq,
2170 cpu_of(rq), NULL);
2171 rq->scx.cpu_released = false;
2172 }
2173
2174 if (prev_on_scx) {
2175 update_curr_scx(rq);
2176
2177 /*
2178 * If @prev is runnable & has slice left, it has priority and
2179 * fetching more just increases latency for the fetched tasks.
2180 * Tell pick_task_scx() to keep running @prev. If the BPF
2181 * scheduler wants to handle this explicitly, it should
2182 * implement ->cpu_release().
2183 *
2184 * See scx_disable_workfn() for the explanation on the bypassing
2185 * test.
2186 */
2187 if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
2188 rq->scx.flags |= SCX_RQ_BAL_KEEP;
2189 goto has_tasks;
2190 }
2191 }
2192
2193 /* if there already are tasks to run, nothing to do */
2194 if (rq->scx.local_dsq.nr)
2195 goto has_tasks;
2196
2197 if (consume_global_dsq(sch, rq))
2198 goto has_tasks;
2199
2200 if (scx_rq_bypassing(rq)) {
2201 if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
2202 goto has_tasks;
2203 else
2204 goto no_tasks;
2205 }
2206
2207 if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
2208 goto no_tasks;
2209
2210 dspc->rq = rq;
2211
2212 /*
2213 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
2214 * the local DSQ might still end up empty after a successful
2215 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
2216 * produced some tasks, retry. The BPF scheduler may depend on this
2217 * looping behavior to simplify its implementation.
2218 */
2219 do {
2220 dspc->nr_tasks = 0;
2221
2222 SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq,
2223 cpu_of(rq), prev_on_scx ? prev : NULL);
2224
2225 flush_dispatch_buf(sch, rq);
2226
2227 if (prev_on_rq && prev->scx.slice) {
2228 rq->scx.flags |= SCX_RQ_BAL_KEEP;
2229 goto has_tasks;
2230 }
2231 if (rq->scx.local_dsq.nr)
2232 goto has_tasks;
2233 if (consume_global_dsq(sch, rq))
2234 goto has_tasks;
2235
2236 /*
2237 * ops.dispatch() can trap us in this loop by repeatedly
2238 * dispatching ineligible tasks. Break out once in a while to
2239 * allow the watchdog to run. As IRQ can't be enabled in
2240 * balance(), we want to complete this scheduling cycle and then
2241 * start a new one. IOW, we want to call resched_curr() on the
2242 * next, most likely idle, task, not the current one. Use
2243 * scx_kick_cpu() for deferred kicking.
2244 */
2245 if (unlikely(!--nr_loops)) {
2246 scx_kick_cpu(sch, cpu_of(rq), 0);
2247 break;
2248 }
2249 } while (dspc->nr_tasks);
2250
2251 no_tasks:
2252 /*
2253 * Didn't find another task to run. Keep running @prev unless
2254 * %SCX_OPS_ENQ_LAST is in effect.
2255 */
2256 if (prev_on_rq &&
2257 (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
2258 rq->scx.flags |= SCX_RQ_BAL_KEEP;
2259 __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
2260 goto has_tasks;
2261 }
2262 rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
2263 return false;
2264
2265 has_tasks:
2266 rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
2267 return true;
2268 }
2269
2270 static void process_ddsp_deferred_locals(struct rq *rq)
2271 {
2272 struct task_struct *p;
2273
2274 lockdep_assert_rq_held(rq);
2275
2276 /*
2277 * Now that @rq can be unlocked, execute the deferred enqueueing of
2278 * tasks directly dispatched to the local DSQs of other CPUs. See
2279 * direct_dispatch(). Keep popping from the head instead of using
2280 * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq
2281 * temporarily.
2282 */
2283 while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
2284 struct task_struct, scx.dsq_list.node))) {
2285 struct scx_sched *sch = scx_root;
2286 struct scx_dispatch_q *dsq;
2287
2288 list_del_init(&p->scx.dsq_list.node);
2289
2290 dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
2291 if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
2292 dispatch_to_local_dsq(sch, rq, dsq, p,
2293 p->scx.ddsp_enq_flags);
2294 }
2295 }
2296
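/*
 * @p is about to start running on @rq. Dequeue it from its DSQ if core-sched
 * picked it before it was dispatched, notify ops.running() and refresh whether
 * the tick can be stopped for %SCX_SLICE_INF.
 */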
2297 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
2298 {
2299 struct scx_sched *sch = scx_root;
2300
2301 if (p->scx.flags & SCX_TASK_QUEUED) {
2302 /*
2303 * Core-sched might decide to execute @p before it is
2304 * dispatched. Call ops_dequeue() to notify the BPF scheduler.
2305 */
2306 ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC);
2307 dispatch_dequeue(rq, p);
2308 }
2309
2310 p->se.exec_start = rq_clock_task(rq);
2311
2312 /* see dequeue_task_scx() on why we skip when !QUEUED */
2313 if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
2314 SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p);
2315
2316 clr_task_runnable(p, true);
2317
2318 /*
2319 * @p is getting newly scheduled or got kicked after someone updated its
2320 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick().
2321 */
2322 if ((p->scx.slice == SCX_SLICE_INF) !=
2323 (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
2324 if (p->scx.slice == SCX_SLICE_INF)
2325 rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
2326 else
2327 rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;
2328
2329 sched_update_tick_dependency(rq);
2330
2331 /*
2332 * For now, let's refresh the load_avgs just when transitioning
2333 * in and out of nohz. In the future, we might want to add a
2334 * mechanism which calls the following periodically on
2335 * tick-stopped CPUs.
2336 */
2337 update_other_load_avgs(rq);
2338 }
2339 }
2340
2341 static enum scx_cpu_preempt_reason
2342 preempt_reason_from_class(const struct sched_class *class)
2343 {
2344 if (class == &stop_sched_class)
2345 return SCX_CPU_PREEMPT_STOP;
2346 if (class == &dl_sched_class)
2347 return SCX_CPU_PREEMPT_DL;
2348 if (class == &rt_sched_class)
2349 return SCX_CPU_PREEMPT_RT;
2350 return SCX_CPU_PREEMPT_UNKNOWN;
2351 }
2352
2353 static void switch_class(struct rq *rq, struct task_struct *next)
2354 {
2355 struct scx_sched *sch = scx_root;
2356 const struct sched_class *next_class = next->sched_class;
2357
2358 if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
2359 return;
2360
2361 /*
2362 * The callback is conceptually meant to convey that the CPU is no
2363 * longer under the control of SCX. Therefore, don't invoke the callback
2364 * if the next class is below SCX (in which case the BPF scheduler has
2365 * actively decided not to schedule any tasks on the CPU).
2366 */
2367 if (sched_class_above(&ext_sched_class, next_class))
2368 return;
2369
2370 /*
2371 * At this point we know that SCX was preempted by a higher priority
2372 * sched_class, so invoke the ->cpu_release() callback if we have not
2373 * done so already. We only send the callback once between SCX being
2374 * preempted, and it regaining control of the CPU.
2375 *
2376 * ->cpu_release() complements ->cpu_acquire(), which is emitted the
2377 * next time that balance_one() is invoked.
2378 */
2379 if (!rq->scx.cpu_released) {
2380 if (SCX_HAS_OP(sch, cpu_release)) {
2381 struct scx_cpu_release_args args = {
2382 .reason = preempt_reason_from_class(next_class),
2383 .task = next,
2384 };
2385
2386 SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq,
2387 cpu_of(rq), &args);
2388 }
2389 rq->scx.cpu_released = true;
2390 }
2391 }
2392
2393 static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
2394 struct task_struct *next)
2395 {
2396 struct scx_sched *sch = scx_root;
2397
2398 /* see kick_cpus_irq_workfn() */
2399 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
2400
2401 update_curr_scx(rq);
2402
2403 /* see dequeue_task_scx() on why we skip when !QUEUED */
2404 if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
2405 SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true);
2406
2407 if (p->scx.flags & SCX_TASK_QUEUED) {
2408 set_task_runnable(rq, p);
2409
2410 /*
2411 * If @p has slice left and is being put, @p is getting
2412 * preempted by a higher priority scheduler class or core-sched
2413 * forcing a different task. Leave it at the head of the local
2414 * DSQ.
2415 */
2416 if (p->scx.slice && !scx_rq_bypassing(rq)) {
2417 dispatch_enqueue(sch, &rq->scx.local_dsq, p,
2418 SCX_ENQ_HEAD);
2419 goto switch_class;
2420 }
2421
2422 /*
2423 * If @p is runnable but we're about to enter a lower
2424 * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell
2425 * ops.enqueue() that @p is the only one available for this cpu,
2426 * which should trigger an explicit follow-up scheduling event.
2427 */
2428 if (next && sched_class_above(&ext_sched_class, next->sched_class)) {
2429 WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST));
2430 do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
2431 } else {
2432 do_enqueue_task(rq, p, 0, -1);
2433 }
2434 }
2435
2436 switch_class:
2437 if (next && next->sched_class != &ext_sched_class)
2438 switch_class(rq, next);
2439 }
2440
2441 static struct task_struct *first_local_task(struct rq *rq)
2442 {
2443 return list_first_entry_or_null(&rq->scx.local_dsq.list,
2444 struct task_struct, scx.dsq_list.node);
2445 }
2446
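/*
 * Common implementation behind pick_task_scx(). Runs balance_one() to populate
 * the local DSQ and then either keeps @prev or picks the first task from the
 * local DSQ, replenishing an empty slice with the default. Unless @force_scx is
 * set, returns RETRY_TASK if a higher priority sched_class became runnable
 * while balance_one() had @rq unlocked.
 */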
2447 static struct task_struct *
2448 do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
2449 {
2450 struct task_struct *prev = rq->curr;
2451 bool keep_prev;
2452 struct task_struct *p;
2453
2454 /* see kick_cpus_irq_workfn() */
2455 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
2456
2457 rq_modified_clear(rq);
2458
2459 rq_unpin_lock(rq, rf);
2460 balance_one(rq, prev);
2461 rq_repin_lock(rq, rf);
2462 maybe_queue_balance_callback(rq);
2463
2464 /*
2465 * If any higher-priority sched class enqueued a runnable task on
2466 * this rq during balance_one(), abort and return RETRY_TASK, so
2467 * that the scheduler loop can restart.
2468 *
2469 * If @force_scx is true, always try to pick a SCHED_EXT task,
2470 * regardless of any higher-priority sched classes activity.
2471 */
2472 if (!force_scx && rq_modified_above(rq, &ext_sched_class))
2473 return RETRY_TASK;
2474
2475 keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
2476 if (unlikely(keep_prev &&
2477 prev->sched_class != &ext_sched_class)) {
2478 WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
2479 keep_prev = false;
2480 }
2481
2482 /*
2483 * If balance_one() is telling us to keep running @prev, replenish slice
2484 * if necessary and keep running @prev. Otherwise, pop the first one
2485 * from the local DSQ.
2486 */
2487 if (keep_prev) {
2488 p = prev;
2489 if (!p->scx.slice)
2490 refill_task_slice_dfl(rcu_dereference_sched(scx_root), p);
2491 } else {
2492 p = first_local_task(rq);
2493 if (!p)
2494 return NULL;
2495
2496 if (unlikely(!p->scx.slice)) {
2497 struct scx_sched *sch = rcu_dereference_sched(scx_root);
2498
2499 if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
2500 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
2501 p->comm, p->pid, __func__);
2502 sch->warned_zero_slice = true;
2503 }
2504 refill_task_slice_dfl(sch, p);
2505 }
2506 }
2507
2508 return p;
2509 }
2510
2511 static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
2512 {
2513 return do_pick_task_scx(rq, rf, false);
2514 }
2515
2516 #ifdef CONFIG_SCHED_CORE
2517 /**
2518 * scx_prio_less - Task ordering for core-sched
2519 * @a: task A
2520 * @b: task B
2521 * @in_fi: in forced idle state
2522 *
2523 * Core-sched is implemented as an additional scheduling layer on top of the
2524 * usual sched_class'es and needs to find out the expected task ordering. For
2525 * SCX, core-sched calls this function to interrogate the task ordering.
2526 *
2527 * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
2528 * to implement the default task ordering. The older the timestamp, the higher
2529 * priority the task - the global FIFO ordering matching the default scheduling
2530 * behavior.
2531 *
2532 * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
2533 * implement FIFO ordering within each local DSQ. See pick_task_scx().
2534 */
2535 bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
2536 bool in_fi)
2537 {
2538 struct scx_sched *sch = scx_root;
2539
2540 /*
2541 * The const qualifiers are dropped from task_struct pointers when
2542 * calling ops.core_sched_before(). Accesses are controlled by the
2543 * verifier.
2544 */
2545 if (SCX_HAS_OP(sch, core_sched_before) &&
2546 !scx_rq_bypassing(task_rq(a)))
2547 return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before,
2548 NULL,
2549 (struct task_struct *)a,
2550 (struct task_struct *)b);
2551 else
2552 return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
2553 }
2554 #endif /* CONFIG_SCHED_CORE */
2555
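/*
 * Wakeup CPU selection. Defers to ops.select_cpu() when implemented and not
 * bypassing; otherwise falls back to scx_select_cpu_dfl() and, on success,
 * direct-dispatches @p to the selected CPU's local DSQ.
 */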
2556 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
2557 {
2558 struct scx_sched *sch = scx_root;
2559 bool rq_bypass;
2560
2561 /*
2562 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
2563 * can be a good migration opportunity with low cache and memory
2564 * footprint. Returning a CPU different than @prev_cpu triggers
2565 * immediate rq migration. However, for SCX, as the current rq
2566 * association doesn't dictate where the task is going to run, this
2567 * doesn't fit well. If necessary, we can later add a dedicated method
2568 * which can decide to preempt self to force it through the regular
2569 * scheduling path.
2570 */
2571 if (unlikely(wake_flags & WF_EXEC))
2572 return prev_cpu;
2573
2574 rq_bypass = scx_rq_bypassing(task_rq(p));
2575 if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
2576 s32 cpu;
2577 struct task_struct **ddsp_taskp;
2578
2579 ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
2580 WARN_ON_ONCE(*ddsp_taskp);
2581 *ddsp_taskp = p;
2582
2583 cpu = SCX_CALL_OP_TASK_RET(sch,
2584 SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
2585 select_cpu, NULL, p, prev_cpu,
2586 wake_flags);
2587 p->scx.selected_cpu = cpu;
2588 *ddsp_taskp = NULL;
2589 if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()"))
2590 return cpu;
2591 else
2592 return prev_cpu;
2593 } else {
2594 s32 cpu;
2595
2596 cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
2597 if (cpu >= 0) {
2598 refill_task_slice_dfl(sch, p);
2599 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
2600 } else {
2601 cpu = prev_cpu;
2602 }
2603 p->scx.selected_cpu = cpu;
2604
2605 if (rq_bypass)
2606 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
2607 return cpu;
2608 }
2609 }
2610
2611 static void task_woken_scx(struct rq *rq, struct task_struct *p)
2612 {
2613 run_deferred(rq);
2614 }
2615
2616 static void set_cpus_allowed_scx(struct task_struct *p,
2617 struct affinity_context *ac)
2618 {
2619 struct scx_sched *sch = scx_root;
2620
2621 set_cpus_allowed_common(p, ac);
2622
2623 if (task_dead_and_done(p))
2624 return;
2625
2626 /*
2627 * The effective cpumask is stored in @p->cpus_ptr which may temporarily
2628 * differ from the configured one in @p->cpus_mask. Always tell the bpf
2629 * scheduler the effective one.
2630 *
2631 * Fine-grained memory write control is enforced by BPF making the const
2632 * designation pointless. Cast it away when calling the operation.
2633 */
2634 if (SCX_HAS_OP(sch, set_cpumask))
2635 SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL,
2636 p, (struct cpumask *)p->cpus_ptr);
2637 }
2638
2639 static void handle_hotplug(struct rq *rq, bool online)
2640 {
2641 struct scx_sched *sch = scx_root;
2642 int cpu = cpu_of(rq);
2643
2644 atomic_long_inc(&scx_hotplug_seq);
2645
2646 /*
2647 * scx_root updates are protected by cpus_read_lock() and will stay
2648 * stable here. Note that we can't depend on scx_enabled() test as the
2649 * hotplug ops need to be enabled before __scx_enabled is set.
2650 */
2651 if (unlikely(!sch))
2652 return;
2653
2654 if (scx_enabled())
2655 scx_idle_update_selcpu_topology(&sch->ops);
2656
2657 if (online && SCX_HAS_OP(sch, cpu_online))
2658 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu);
2659 else if (!online && SCX_HAS_OP(sch, cpu_offline))
2660 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu);
2661 else
2662 scx_exit(sch, SCX_EXIT_UNREG_KERN,
2663 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
2664 "cpu %d going %s, exiting scheduler", cpu,
2665 online ? "online" : "offline");
2666 }
2667
2668 void scx_rq_activate(struct rq *rq)
2669 {
2670 handle_hotplug(rq, true);
2671 }
2672
2673 void scx_rq_deactivate(struct rq *rq)
2674 {
2675 handle_hotplug(rq, false);
2676 }
2677
2678 static void rq_online_scx(struct rq *rq)
2679 {
2680 rq->scx.flags |= SCX_RQ_ONLINE;
2681 }
2682
2683 static void rq_offline_scx(struct rq *rq)
2684 {
2685 rq->scx.flags &= ~SCX_RQ_ONLINE;
2686 }
2687
2688
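/*
 * Scan @rq's runnable list and abort the BPF scheduler with
 * %SCX_EXIT_ERROR_STALL if any task has been runnable for longer than
 * scx_watchdog_timeout without getting a CPU.
 */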
2689 static bool check_rq_for_timeouts(struct rq *rq)
2690 {
2691 struct scx_sched *sch;
2692 struct task_struct *p;
2693 struct rq_flags rf;
2694 bool timed_out = false;
2695
2696 rq_lock_irqsave(rq, &rf);
2697 sch = rcu_dereference_bh(scx_root);
2698 if (unlikely(!sch))
2699 goto out_unlock;
2700
2701 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
2702 unsigned long last_runnable = p->scx.runnable_at;
2703
2704 if (unlikely(time_after(jiffies,
2705 last_runnable + scx_watchdog_timeout))) {
2706 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
2707
2708 scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
2709 "%s[%d] failed to run for %u.%03us",
2710 p->comm, p->pid, dur_ms / 1000, dur_ms % 1000);
2711 timed_out = true;
2712 break;
2713 }
2714 }
2715 out_unlock:
2716 rq_unlock_irqrestore(rq, &rf);
2717 return timed_out;
2718 }
2719
2720 static void scx_watchdog_workfn(struct work_struct *work)
2721 {
2722 int cpu;
2723
2724 WRITE_ONCE(scx_watchdog_timestamp, jiffies);
2725
2726 for_each_online_cpu(cpu) {
2727 if (unlikely(check_rq_for_timeouts(cpu_rq(cpu))))
2728 break;
2729
2730 cond_resched();
2731 }
2732 queue_delayed_work(system_unbound_wq, to_delayed_work(work),
2733 scx_watchdog_timeout / 2);
2734 }
2735
2736 void scx_tick(struct rq *rq)
2737 {
2738 struct scx_sched *sch;
2739 unsigned long last_check;
2740
2741 if (!scx_enabled())
2742 return;
2743
2744 sch = rcu_dereference_bh(scx_root);
2745 if (unlikely(!sch))
2746 return;
2747
2748 last_check = READ_ONCE(scx_watchdog_timestamp);
2749 if (unlikely(time_after(jiffies,
2750 last_check + READ_ONCE(scx_watchdog_timeout)))) {
2751 u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
2752
2753 scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
2754 "watchdog failed to check in for %u.%03us",
2755 dur_ms / 1000, dur_ms % 1000);
2756 }
2757
2758 update_other_load_avgs(rq);
2759 }
2760
2761 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
2762 {
2763 struct scx_sched *sch = scx_root;
2764
2765 update_curr_scx(rq);
2766
2767 /*
2768 * While disabling, always resched and refresh core-sched timestamp as
2769 * we can't trust the slice management or ops.core_sched_before().
2770 */
2771 if (scx_rq_bypassing(rq)) {
2772 curr->scx.slice = 0;
2773 touch_core_sched(rq, curr);
2774 } else if (SCX_HAS_OP(sch, tick)) {
2775 SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr);
2776 }
2777
2778 if (!curr->scx.slice)
2779 resched_curr(rq);
2780 }
2781
2782 #ifdef CONFIG_EXT_GROUP_SCHED
2783 static struct cgroup *tg_cgrp(struct task_group *tg)
2784 {
2785 /*
2786 * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup,
2787 * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the
2788 * root cgroup.
2789 */
2790 if (tg && tg->css.cgroup)
2791 return tg->css.cgroup;
2792 else
2793 return &cgrp_dfl_root.cgrp;
2794 }
2795
2796 #define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg),
2797
2798 #else /* CONFIG_EXT_GROUP_SCHED */
2799
2800 #define SCX_INIT_TASK_ARGS_CGROUP(tg)
2801
2802 #endif /* CONFIG_EXT_GROUP_SCHED */
2803
2804 static enum scx_task_state scx_get_task_state(const struct task_struct *p)
2805 {
2806 return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT;
2807 }
2808
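/*
 * Task state machine: NONE -> INIT (ops.init_task() done) -> READY (may run in
 * SCX) <-> ENABLED (governed by the BPF scheduler). Invalid transitions trigger
 * a warning.
 */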
2809 static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
2810 {
2811 enum scx_task_state prev_state = scx_get_task_state(p);
2812 bool warn = false;
2813
2814 BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS));
2815
2816 switch (state) {
2817 case SCX_TASK_NONE:
2818 break;
2819 case SCX_TASK_INIT:
2820 warn = prev_state != SCX_TASK_NONE;
2821 break;
2822 case SCX_TASK_READY:
2823 warn = prev_state == SCX_TASK_NONE;
2824 break;
2825 case SCX_TASK_ENABLED:
2826 warn = prev_state != SCX_TASK_READY;
2827 break;
2828 default:
2829 warn = true;
2830 break;
2831 }
2832
2833 WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]",
2834 prev_state, state, p->comm, p->pid);
2835
2836 p->scx.flags &= ~SCX_TASK_STATE_MASK;
2837 p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
2838 }
2839
2840 static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork)
2841 {
2842 struct scx_sched *sch = scx_root;
2843 int ret;
2844
2845 p->scx.disallow = false;
2846
2847 if (SCX_HAS_OP(sch, init_task)) {
2848 struct scx_init_task_args args = {
2849 SCX_INIT_TASK_ARGS_CGROUP(tg)
2850 .fork = fork,
2851 };
2852
2853 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL,
2854 p, &args);
2855 if (unlikely(ret)) {
2856 ret = ops_sanitize_err(sch, "init_task", ret);
2857 return ret;
2858 }
2859 }
2860
2861 scx_set_task_state(p, SCX_TASK_INIT);
2862
2863 if (p->scx.disallow) {
2864 if (!fork) {
2865 struct rq *rq;
2866 struct rq_flags rf;
2867
2868 rq = task_rq_lock(p, &rf);
2869
2870 /*
2871 * We're in the load path and @p->policy will be applied
2872 * right after. Reverting @p->policy here and rejecting
2873 * %SCHED_EXT transitions from scx_check_setscheduler()
2874 * guarantees that if ops.init_task() sets @p->disallow,
2875 * @p can never be in SCX.
2876 */
2877 if (p->policy == SCHED_EXT) {
2878 p->policy = SCHED_NORMAL;
2879 atomic_long_inc(&scx_nr_rejected);
2880 }
2881
2882 task_rq_unlock(rq, p, &rf);
2883 } else if (p->policy == SCHED_EXT) {
2884 scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
2885 p->comm, p->pid);
2886 }
2887 }
2888
2889 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
2890 return 0;
2891 }
2892
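/*
 * Hand @p over to the BPF scheduler: initialize @p->scx.weight from its nice or
 * idle policy, invoke ops.enable() and ops.set_weight(), and mark the task
 * ENABLED.
 */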
2893 static void scx_enable_task(struct task_struct *p)
2894 {
2895 struct scx_sched *sch = scx_root;
2896 struct rq *rq = task_rq(p);
2897 u32 weight;
2898
2899 lockdep_assert_rq_held(rq);
2900
2901 /*
2902 * Set the weight before calling ops.enable() so that the scheduler
2903 * doesn't see a stale value if they inspect the task struct.
2904 */
2905 if (task_has_idle_policy(p))
2906 weight = WEIGHT_IDLEPRIO;
2907 else
2908 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO];
2909
2910 p->scx.weight = sched_weight_to_cgroup(weight);
2911
2912 if (SCX_HAS_OP(sch, enable))
2913 SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p);
2914 scx_set_task_state(p, SCX_TASK_ENABLED);
2915
2916 if (SCX_HAS_OP(sch, set_weight))
2917 SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
2918 p, p->scx.weight);
2919 }
2920
2921 static void scx_disable_task(struct task_struct *p)
2922 {
2923 struct scx_sched *sch = scx_root;
2924 struct rq *rq = task_rq(p);
2925
2926 lockdep_assert_rq_held(rq);
2927 WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
2928
2929 if (SCX_HAS_OP(sch, disable))
2930 SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p);
2931 scx_set_task_state(p, SCX_TASK_READY);
2932 }
2933
2934 static void scx_exit_task(struct task_struct *p)
2935 {
2936 struct scx_sched *sch = scx_root;
2937 struct scx_exit_task_args args = {
2938 .cancelled = false,
2939 };
2940
2941 lockdep_assert_rq_held(task_rq(p));
2942
2943 switch (scx_get_task_state(p)) {
2944 case SCX_TASK_NONE:
2945 return;
2946 case SCX_TASK_INIT:
2947 args.cancelled = true;
2948 break;
2949 case SCX_TASK_READY:
2950 break;
2951 case SCX_TASK_ENABLED:
2952 scx_disable_task(p);
2953 break;
2954 default:
2955 WARN_ON_ONCE(true);
2956 return;
2957 }
2958
2959 if (SCX_HAS_OP(sch, exit_task))
2960 SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
2961 p, &args);
2962 scx_set_task_state(p, SCX_TASK_NONE);
2963 }
2964
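/*
 * Reset @scx to its default state: not on any DSQ, no sticky or holding CPU,
 * runnable_at set to now and the default time slice.
 */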
2965 void init_scx_entity(struct sched_ext_entity *scx)
2966 {
2967 memset(scx, 0, sizeof(*scx));
2968 INIT_LIST_HEAD(&scx->dsq_list.node);
2969 RB_CLEAR_NODE(&scx->dsq_priq);
2970 scx->sticky_cpu = -1;
2971 scx->holding_cpu = -1;
2972 INIT_LIST_HEAD(&scx->runnable_node);
2973 scx->runnable_at = jiffies;
2974 scx->ddsp_dsq_id = SCX_DSQ_INVALID;
2975 scx->slice = READ_ONCE(scx_slice_dfl);
2976 }
2977
2978 void scx_pre_fork(struct task_struct *p)
2979 {
2980 /*
2981 * BPF scheduler enable/disable paths want to be able to iterate and
2982 * update all tasks, which can become complex when racing forks. As
2983 * enable/disable are very cold paths, let's use a percpu_rwsem to
2984 * exclude forks.
2985 */
2986 percpu_down_read(&scx_fork_rwsem);
2987 }
2988
2989 int scx_fork(struct task_struct *p)
2990 {
2991 percpu_rwsem_assert_held(&scx_fork_rwsem);
2992
2993 if (scx_init_task_enabled)
2994 return scx_init_task(p, task_group(p), true);
2995 else
2996 return 0;
2997 }
2998
2999 void scx_post_fork(struct task_struct *p)
3000 {
3001 if (scx_init_task_enabled) {
3002 scx_set_task_state(p, SCX_TASK_READY);
3003
3004 /*
3005 * Enable the task immediately if it's running on sched_ext.
3006 * Otherwise, it'll be enabled in switching_to_scx() if and
3007 * when it's ever configured to run with a SCHED_EXT policy.
3008 */
3009 if (p->sched_class == &ext_sched_class) {
3010 struct rq_flags rf;
3011 struct rq *rq;
3012
3013 rq = task_rq_lock(p, &rf);
3014 scx_enable_task(p);
3015 task_rq_unlock(rq, p, &rf);
3016 }
3017 }
3018
3019 raw_spin_lock_irq(&scx_tasks_lock);
3020 list_add_tail(&p->scx.tasks_node, &scx_tasks);
3021 raw_spin_unlock_irq(&scx_tasks_lock);
3022
3023 percpu_up_read(&scx_fork_rwsem);
3024 }
3025
3026 void scx_cancel_fork(struct task_struct *p)
3027 {
3028 if (scx_enabled()) {
3029 struct rq *rq;
3030 struct rq_flags rf;
3031
3032 rq = task_rq_lock(p, &rf);
3033 WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
3034 scx_exit_task(p);
3035 task_rq_unlock(rq, p, &rf);
3036 }
3037
3038 percpu_up_read(&scx_fork_rwsem);
3039 }
3040
3041 /**
3042 * task_dead_and_done - Is a task dead and done running?
3043 * @p: target task
3044 *
3045 * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the
3046 * task no longer exists from SCX's POV. However, certain sched_class ops may be
3047 * invoked on these dead tasks, leading to failures - e.g. sched_setscheduler()
3048 * may try to switch a task which finished sched_ext_dead() back into SCX
3049 * triggering invalid SCX task state transitions and worse.
3050 *
3051 * Once a task has finished the final switch, sched_ext_dead() is the only thing
3052 * that needs to happen on the task. Use this test to short-circuit sched_class
3053 * operations which may be called on dead tasks.
3054 */
3055 static bool task_dead_and_done(struct task_struct *p)
3056 {
3057 struct rq *rq = task_rq(p);
3058
3059 lockdep_assert_rq_held(rq);
3060
3061 /*
3062 * In do_task_dead(), a dying task sets %TASK_DEAD with preemption
3063 * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p
3064 * won't ever run again.
3065 */
3066 return unlikely(READ_ONCE(p->__state) == TASK_DEAD) &&
3067 !task_on_cpu(rq, p);
3068 }
3069
3070 void sched_ext_dead(struct task_struct *p)
3071 {
3072 unsigned long flags;
3073
3074 /*
3075 * By the time control reaches here, @p has %TASK_DEAD set, switched out
3076 * for the last time and then dropped the rq lock - task_dead_and_done()
3077 * should be returning %true nullifying the straggling sched_class ops.
3078 * Remove from scx_tasks and exit @p.
3079 */
3080 raw_spin_lock_irqsave(&scx_tasks_lock, flags);
3081 list_del_init(&p->scx.tasks_node);
3082 raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
3083
3084 /*
3085 * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED
3086 * transitions can't race us. Disable ops for @p.
3087 */
3088 if (scx_get_task_state(p) != SCX_TASK_NONE) {
3089 struct rq_flags rf;
3090 struct rq *rq;
3091
3092 rq = task_rq_lock(p, &rf);
3093 scx_exit_task(p);
3094 task_rq_unlock(rq, p, &rf);
3095 }
3096 }
3097
3098 static void reweight_task_scx(struct rq *rq, struct task_struct *p,
3099 const struct load_weight *lw)
3100 {
3101 struct scx_sched *sch = scx_root;
3102
3103 lockdep_assert_rq_held(task_rq(p));
3104
3105 if (task_dead_and_done(p))
3106 return;
3107
3108 p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
3109 if (SCX_HAS_OP(sch, set_weight))
3110 SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
3111 p, p->scx.weight);
3112 }
3113
3114 static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
3115 {
3116 }
3117
3118 static void switching_to_scx(struct rq *rq, struct task_struct *p)
3119 {
3120 struct scx_sched *sch = scx_root;
3121
3122 if (task_dead_and_done(p))
3123 return;
3124
3125 scx_enable_task(p);
3126
3127 /*
3128 * set_cpus_allowed_scx() is not called while @p is associated with a
3129 * different scheduler class. Keep the BPF scheduler up-to-date.
3130 */
3131 if (SCX_HAS_OP(sch, set_cpumask))
3132 SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq,
3133 p, (struct cpumask *)p->cpus_ptr);
3134 }
3135
3136 static void switched_from_scx(struct rq *rq, struct task_struct *p)
3137 {
3138 if (task_dead_and_done(p))
3139 return;
3140
3141 scx_disable_task(p);
3142 }
3143
3144 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
3145 static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
3146
3147 int scx_check_setscheduler(struct task_struct *p, int policy)
3148 {
3149 lockdep_assert_rq_held(task_rq(p));
3150
3151 /* if disallow, reject transitioning into SCX */
3152 if (scx_enabled() && READ_ONCE(p->scx.disallow) &&
3153 p->policy != policy && policy == SCHED_EXT)
3154 return -EACCES;
3155
3156 return 0;
3157 }
3158
3159 #ifdef CONFIG_NO_HZ_FULL
3160 bool scx_can_stop_tick(struct rq *rq)
3161 {
3162 struct task_struct *p = rq->curr;
3163
3164 if (scx_rq_bypassing(rq))
3165 return false;
3166
3167 if (p->sched_class != &ext_sched_class)
3168 return true;
3169
3170 /*
3171 * @rq can dispatch from different DSQs, so we can't tell whether it
3172 * needs the tick or not by looking at nr_running. Allow stopping ticks
3173 * iff the BPF scheduler indicated so. See set_next_task_scx().
3174 */
3175 return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
3176 }
3177 #endif
3178
3179 #ifdef CONFIG_EXT_GROUP_SCHED
3180
3181 DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem);
3182 static bool scx_cgroup_enabled;
3183
3184 void scx_tg_init(struct task_group *tg)
3185 {
3186 tg->scx.weight = CGROUP_WEIGHT_DFL;
3187 tg->scx.bw_period_us = default_bw_period_us();
3188 tg->scx.bw_quota_us = RUNTIME_INF;
3189 tg->scx.idle = false;
3190 }
3191
3192 int scx_tg_online(struct task_group *tg)
3193 {
3194 struct scx_sched *sch = scx_root;
3195 int ret = 0;
3196
3197 WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
3198
3199 if (scx_cgroup_enabled) {
3200 if (SCX_HAS_OP(sch, cgroup_init)) {
3201 struct scx_cgroup_init_args args =
3202 { .weight = tg->scx.weight,
3203 .bw_period_us = tg->scx.bw_period_us,
3204 .bw_quota_us = tg->scx.bw_quota_us,
3205 .bw_burst_us = tg->scx.bw_burst_us };
3206
3207 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
3208 NULL, tg->css.cgroup, &args);
3209 if (ret)
3210 ret = ops_sanitize_err(sch, "cgroup_init", ret);
3211 }
3212 if (ret == 0)
3213 tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED;
3214 } else {
3215 tg->scx.flags |= SCX_TG_ONLINE;
3216 }
3217
3218 return ret;
3219 }
3220
3221 void scx_tg_offline(struct task_group *tg)
3222 {
3223 struct scx_sched *sch = scx_root;
3224
3225 WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
3226
3227 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
3228 (tg->scx.flags & SCX_TG_INITED))
3229 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
3230 tg->css.cgroup);
3231 tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
3232 }
3233
3234 int scx_cgroup_can_attach(struct cgroup_taskset *tset)
3235 {
3236 struct scx_sched *sch = scx_root;
3237 struct cgroup_subsys_state *css;
3238 struct task_struct *p;
3239 int ret;
3240
3241 if (!scx_cgroup_enabled)
3242 return 0;
3243
3244 cgroup_taskset_for_each(p, css, tset) {
3245 struct cgroup *from = tg_cgrp(task_group(p));
3246 struct cgroup *to = tg_cgrp(css_tg(css));
3247
3248 WARN_ON_ONCE(p->scx.cgrp_moving_from);
3249
3250 /*
3251 * sched_move_task() omits identity migrations. Let's match the
3252 * behavior so that ops.cgroup_prep_move() and ops.cgroup_move()
3253 * always match one-to-one.
3254 */
3255 if (from == to)
3256 continue;
3257
3258 if (SCX_HAS_OP(sch, cgroup_prep_move)) {
3259 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED,
3260 cgroup_prep_move, NULL,
3261 p, from, css->cgroup);
3262 if (ret)
3263 goto err;
3264 }
3265
3266 p->scx.cgrp_moving_from = from;
3267 }
3268
3269 return 0;
3270
3271 err:
3272 cgroup_taskset_for_each(p, css, tset) {
3273 if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
3274 p->scx.cgrp_moving_from)
3275 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
3276 p, p->scx.cgrp_moving_from, css->cgroup);
3277 p->scx.cgrp_moving_from = NULL;
3278 }
3279
3280 return ops_sanitize_err(sch, "cgroup_prep_move", ret);
3281 }
3282
3283 void scx_cgroup_move_task(struct task_struct *p)
3284 {
3285 struct scx_sched *sch = scx_root;
3286
3287 if (!scx_cgroup_enabled)
3288 return;
3289
3290 /*
3291 * @p must have ops.cgroup_prep_move() called on it and thus
3292 * cgrp_moving_from set.
3293 */
3294 if (SCX_HAS_OP(sch, cgroup_move) &&
3295 !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
3296 SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL,
3297 p, p->scx.cgrp_moving_from,
3298 tg_cgrp(task_group(p)));
3299 p->scx.cgrp_moving_from = NULL;
3300 }
3301
3302 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
3303 {
3304 struct scx_sched *sch = scx_root;
3305 struct cgroup_subsys_state *css;
3306 struct task_struct *p;
3307
3308 if (!scx_cgroup_enabled)
3309 return;
3310
3311 cgroup_taskset_for_each(p, css, tset) {
3312 if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
3313 p->scx.cgrp_moving_from)
3314 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
3315 p, p->scx.cgrp_moving_from, css->cgroup);
3316 p->scx.cgrp_moving_from = NULL;
3317 }
3318 }
3319
3320 void scx_group_set_weight(struct task_group *tg, unsigned long weight)
3321 {
3322 struct scx_sched *sch = scx_root;
3323
3324 percpu_down_read(&scx_cgroup_ops_rwsem);
3325
3326 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
3327 tg->scx.weight != weight)
3328 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
3329 tg_cgrp(tg), weight);
3330
3331 tg->scx.weight = weight;
3332
3333 percpu_up_read(&scx_cgroup_ops_rwsem);
3334 }
3335
3336 void scx_group_set_idle(struct task_group *tg, bool idle)
3337 {
3338 struct scx_sched *sch = scx_root;
3339
3340 percpu_down_read(&scx_cgroup_ops_rwsem);
3341
3342 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
3343 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL,
3344 tg_cgrp(tg), idle);
3345
3346 /* Update the task group's idle state */
3347 tg->scx.idle = idle;
3348
3349 percpu_up_read(&scx_cgroup_ops_rwsem);
3350 }
3351
3352 void scx_group_set_bandwidth(struct task_group *tg,
3353 u64 period_us, u64 quota_us, u64 burst_us)
3354 {
3355 struct scx_sched *sch = scx_root;
3356
3357 percpu_down_read(&scx_cgroup_ops_rwsem);
3358
3359 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
3360 (tg->scx.bw_period_us != period_us ||
3361 tg->scx.bw_quota_us != quota_us ||
3362 tg->scx.bw_burst_us != burst_us))
3363 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
3364 tg_cgrp(tg), period_us, quota_us, burst_us);
3365
3366 tg->scx.bw_period_us = period_us;
3367 tg->scx.bw_quota_us = quota_us;
3368 tg->scx.bw_burst_us = burst_us;
3369
3370 percpu_up_read(&scx_cgroup_ops_rwsem);
3371 }
3372
3373 static void scx_cgroup_lock(void)
3374 {
3375 percpu_down_write(&scx_cgroup_ops_rwsem);
3376 cgroup_lock();
3377 }
3378
3379 static void scx_cgroup_unlock(void)
3380 {
3381 cgroup_unlock();
3382 percpu_up_write(&scx_cgroup_ops_rwsem);
3383 }
3384
3385 #else /* CONFIG_EXT_GROUP_SCHED */
3386
3387 static void scx_cgroup_lock(void) {}
3388 static void scx_cgroup_unlock(void) {}
3389
3390 #endif /* CONFIG_EXT_GROUP_SCHED */
3391
3392 /*
3393 * Omitted operations:
3394 *
3395 * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task
3396 * isn't tied to the CPU at that point. Preemption is implemented by resetting
3397 * the victim task's slice to 0 and triggering reschedule on the target CPU.
3398 *
3399 * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
3400 *
3401 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
3402 * their current sched_class. Call them directly from sched core instead.
3403 */
3404 DEFINE_SCHED_CLASS(ext) = {
3405 .queue_mask = 1,
3406
3407 .enqueue_task = enqueue_task_scx,
3408 .dequeue_task = dequeue_task_scx,
3409 .yield_task = yield_task_scx,
3410 .yield_to_task = yield_to_task_scx,
3411
3412 .wakeup_preempt = wakeup_preempt_scx,
3413
3414 .pick_task = pick_task_scx,
3415
3416 .put_prev_task = put_prev_task_scx,
3417 .set_next_task = set_next_task_scx,
3418
3419 .select_task_rq = select_task_rq_scx,
3420 .task_woken = task_woken_scx,
3421 .set_cpus_allowed = set_cpus_allowed_scx,
3422
3423 .rq_online = rq_online_scx,
3424 .rq_offline = rq_offline_scx,
3425
3426 .task_tick = task_tick_scx,
3427
3428 .switching_to = switching_to_scx,
3429 .switched_from = switched_from_scx,
3430 .switched_to = switched_to_scx,
3431 .reweight_task = reweight_task_scx,
3432 .prio_changed = prio_changed_scx,
3433
3434 .update_curr = update_curr_scx,
3435
3436 #ifdef CONFIG_UCLAMP_TASK
3437 .uclamp_enabled = 1,
3438 #endif
3439 };
3440
3441 static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
3442 {
3443 memset(dsq, 0, sizeof(*dsq));
3444
3445 raw_spin_lock_init(&dsq->lock);
3446 INIT_LIST_HEAD(&dsq->list);
3447 dsq->id = dsq_id;
3448 }
3449
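/*
 * Free the DSQs queued on the dsqs_to_free llist by destroy_dsq(). Runs from
 * irq_work context and releases each DSQ through kfree_rcu() so that lookups
 * still in flight under RCU remain safe.
 */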
3450 static void free_dsq_irq_workfn(struct irq_work *irq_work)
3451 {
3452 struct llist_node *to_free = llist_del_all(&dsqs_to_free);
3453 struct scx_dispatch_q *dsq, *tmp_dsq;
3454
3455 llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
3456 kfree_rcu(dsq, rcu);
3457 }
3458
3459 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
3460
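/*
 * Look up the user DSQ matching @dsq_id in @sch and remove it from the DSQ
 * hash table. Destruction is rejected with an error if the DSQ still has
 * queued tasks. Freeing is deferred to free_dsq_irq_workfn().
 */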
3461 static void destroy_dsq(struct scx_sched *sch, u64 dsq_id)
3462 {
3463 struct scx_dispatch_q *dsq;
3464 unsigned long flags;
3465
3466 rcu_read_lock();
3467
3468 dsq = find_user_dsq(sch, dsq_id);
3469 if (!dsq)
3470 goto out_unlock_rcu;
3471
3472 raw_spin_lock_irqsave(&dsq->lock, flags);
3473
3474 if (dsq->nr) {
3475 scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)",
3476 dsq->id, dsq->nr);
3477 goto out_unlock_dsq;
3478 }
3479
3480 if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node,
3481 dsq_hash_params))
3482 goto out_unlock_dsq;
3483
3484 /*
3485 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from
3486 * queueing more tasks. As this function can be called from anywhere,
3487 * freeing is bounced through an irq work to avoid nesting RCU
3488 * operations inside scheduler locks.
3489 */
3490 dsq->id = SCX_DSQ_INVALID;
3491 llist_add(&dsq->free_node, &dsqs_to_free);
3492 irq_work_queue(&free_dsq_irq_work);
3493
3494 out_unlock_dsq:
3495 raw_spin_unlock_irqrestore(&dsq->lock, flags);
3496 out_unlock_rcu:
3497 rcu_read_unlock();
3498 }
3499
3500 #ifdef CONFIG_EXT_GROUP_SCHED
3501 static void scx_cgroup_exit(struct scx_sched *sch)
3502 {
3503 struct cgroup_subsys_state *css;
3504
3505 scx_cgroup_enabled = false;
3506
3507 /*
3508 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
3509 * cgroups and exit all the inited ones, all online cgroups are exited.
3510 */
3511 css_for_each_descendant_post(css, &root_task_group.css) {
3512 struct task_group *tg = css_tg(css);
3513
3514 if (!(tg->scx.flags & SCX_TG_INITED))
3515 continue;
3516 tg->scx.flags &= ~SCX_TG_INITED;
3517
3518 if (!sch->ops.cgroup_exit)
3519 continue;
3520
3521 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
3522 css->cgroup);
3523 }
3524 }
3525
3526 static int scx_cgroup_init(struct scx_sched *sch)
3527 {
3528 struct cgroup_subsys_state *css;
3529 int ret;
3530
3531 /*
3532 * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
3533 * cgroups and init, all online cgroups are initialized.
3534 */
3535 css_for_each_descendant_pre(css, &root_task_group.css) {
3536 struct task_group *tg = css_tg(css);
3537 struct scx_cgroup_init_args args = {
3538 .weight = tg->scx.weight,
3539 .bw_period_us = tg->scx.bw_period_us,
3540 .bw_quota_us = tg->scx.bw_quota_us,
3541 .bw_burst_us = tg->scx.bw_burst_us,
3542 };
3543
3544 if ((tg->scx.flags &
3545 (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
3546 continue;
3547
3548 if (!sch->ops.cgroup_init) {
3549 tg->scx.flags |= SCX_TG_INITED;
3550 continue;
3551 }
3552
3553 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
3554 css->cgroup, &args);
3555 if (ret) {
3556 css_put(css);
3557 scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
3558 return ret;
3559 }
3560 tg->scx.flags |= SCX_TG_INITED;
3561 }
3562
3563 WARN_ON_ONCE(scx_cgroup_enabled);
3564 scx_cgroup_enabled = true;
3565
3566 return 0;
3567 }
3568
3569 #else
3570 static void scx_cgroup_exit(struct scx_sched *sch) {}
3571 static int scx_cgroup_init(struct scx_sched *sch) { return 0; }
3572 #endif
3573
3574
3575 /********************************************************************************
3576 * Sysfs interface and ops enable/disable.
3577 */
3578
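/*
 * Helpers for read-only sysfs attributes. The global attributes below (state,
 * switch_all, nr_rejected, hotplug_seq, enable_seq) are system-wide while the
 * ops and events attributes further down are published per scheduler instance
 * under its "root" kobject in scx_kset (typically /sys/kernel/sched_ext).
 */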
3579 #define SCX_ATTR(_name) \
3580 static struct kobj_attribute scx_attr_##_name = { \
3581 .attr = { .name = __stringify(_name), .mode = 0444 }, \
3582 .show = scx_attr_##_name##_show, \
3583 }
3584
3585 static ssize_t scx_attr_state_show(struct kobject *kobj,
3586 struct kobj_attribute *ka, char *buf)
3587 {
3588 return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]);
3589 }
3590 SCX_ATTR(state);
3591
3592 static ssize_t scx_attr_switch_all_show(struct kobject *kobj,
3593 struct kobj_attribute *ka, char *buf)
3594 {
3595 return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all));
3596 }
3597 SCX_ATTR(switch_all);
3598
3599 static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj,
3600 struct kobj_attribute *ka, char *buf)
3601 {
3602 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected));
3603 }
3604 SCX_ATTR(nr_rejected);
3605
3606 static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
3607 struct kobj_attribute *ka, char *buf)
3608 {
3609 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq));
3610 }
3611 SCX_ATTR(hotplug_seq);
3612
3613 static ssize_t scx_attr_enable_seq_show(struct kobject *kobj,
3614 struct kobj_attribute *ka, char *buf)
3615 {
3616 return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq));
3617 }
3618 SCX_ATTR(enable_seq);
3619
3620 static struct attribute *scx_global_attrs[] = {
3621 &scx_attr_state.attr,
3622 &scx_attr_switch_all.attr,
3623 &scx_attr_nr_rejected.attr,
3624 &scx_attr_hotplug_seq.attr,
3625 &scx_attr_enable_seq.attr,
3626 NULL,
3627 };
3628
3629 static const struct attribute_group scx_global_attr_group = {
3630 .attrs = scx_global_attrs,
3631 };
3632
3633 static void free_exit_info(struct scx_exit_info *ei);
3634
3635 static void scx_sched_free_rcu_work(struct work_struct *work)
3636 {
3637 struct rcu_work *rcu_work = to_rcu_work(work);
3638 struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
3639 struct rhashtable_iter rht_iter;
3640 struct scx_dispatch_q *dsq;
3641 int node;
3642
3643 irq_work_sync(&sch->error_irq_work);
3644 kthread_destroy_worker(sch->helper);
3645
3646 free_percpu(sch->pcpu);
3647
3648 for_each_node_state(node, N_POSSIBLE)
3649 kfree(sch->global_dsqs[node]);
3650 kfree(sch->global_dsqs);
3651
3652 rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
3653 do {
3654 rhashtable_walk_start(&rht_iter);
3655
3656 while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
3657 destroy_dsq(sch, dsq->id);
3658
3659 rhashtable_walk_stop(&rht_iter);
3660 } while (dsq == ERR_PTR(-EAGAIN));
3661 rhashtable_walk_exit(&rht_iter);
3662
3663 rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
3664 free_exit_info(sch->exit_info);
3665 kfree(sch);
3666 }
3667
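/*
 * kobject release callback for a scheduler instance. The instance may still be
 * referenced under RCU, so teardown is deferred past a grace period by
 * bouncing through scx_sched_free_rcu_work() via queue_rcu_work().
 */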
3668 static void scx_kobj_release(struct kobject *kobj)
3669 {
3670 struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
3671
3672 INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
3673 queue_rcu_work(system_unbound_wq, &sch->rcu_work);
3674 }
3675
3676 static ssize_t scx_attr_ops_show(struct kobject *kobj,
3677 struct kobj_attribute *ka, char *buf)
3678 {
3679 return sysfs_emit(buf, "%s\n", scx_root->ops.name);
3680 }
3681 SCX_ATTR(ops);
3682
3683 #define scx_attr_event_show(buf, at, events, kind) ({ \
3684 sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \
3685 })
3686
3687 static ssize_t scx_attr_events_show(struct kobject *kobj,
3688 struct kobj_attribute *ka, char *buf)
3689 {
3690 struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
3691 struct scx_event_stats events;
3692 int at = 0;
3693
3694 scx_read_events(sch, &events);
3695 at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
3696 at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
3697 at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
3698 at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
3699 at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
3700 at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL);
3701 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
3702 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
3703 at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
3704 return at;
3705 }
3706 SCX_ATTR(events);
3707
3708 static struct attribute *scx_sched_attrs[] = {
3709 &scx_attr_ops.attr,
3710 &scx_attr_events.attr,
3711 NULL,
3712 };
3713 ATTRIBUTE_GROUPS(scx_sched);
3714
3715 static const struct kobj_type scx_ktype = {
3716 .release = scx_kobj_release,
3717 .sysfs_ops = &kobj_sysfs_ops,
3718 .default_groups = scx_sched_groups,
3719 };
3720
3721 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
3722 {
3723 return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
3724 }
3725
3726 static const struct kset_uevent_ops scx_uevent_ops = {
3727 .uevent = scx_uevent,
3728 };
3729
3730 /*
3731 * Used by sched_fork() and __setscheduler_prio() to pick the matching
3732 * sched_class. dl/rt are already handled.
3733 */
3734 bool task_should_scx(int policy)
3735 {
3736 if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING))
3737 return false;
3738 if (READ_ONCE(scx_switching_all))
3739 return true;
3740 return policy == SCHED_EXT;
3741 }
3742
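/*
 * Whether try_to_wake_up() may use the remote wakeup queueing path for @p.
 * Queued wakeups are allowed if SCX is not enabled, the BPF scheduler opted in
 * with %SCX_OPS_ALLOW_QUEUED_WAKEUP, or @p isn't on the ext sched class.
 */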
3743 bool scx_allow_ttwu_queue(const struct task_struct *p)
3744 {
3745 struct scx_sched *sch;
3746
3747 if (!scx_enabled())
3748 return true;
3749
3750 sch = rcu_dereference_sched(scx_root);
3751 if (unlikely(!sch))
3752 return true;
3753
3754 if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
3755 return true;
3756
3757 if (unlikely(p->sched_class != &ext_sched_class))
3758 return true;
3759
3760 return false;
3761 }
3762
3763 /**
3764 * handle_lockup - sched_ext common lockup handler
3765 * @fmt: format string
3766 *
3767 * Called on system stall or lockup condition and initiates abort of sched_ext
3768 * if enabled, which may resolve the reported lockup.
3769 *
3770 * Returns %true if sched_ext is enabled and abort was initiated, which may
3771 * resolve the lockup. %false if sched_ext is not enabled or abort was already
3772 * initiated by someone else.
3773 */
3774 static __printf(1, 2) bool handle_lockup(const char *fmt, ...)
3775 {
3776 struct scx_sched *sch;
3777 va_list args;
3778 bool ret;
3779
3780 guard(rcu)();
3781
3782 sch = rcu_dereference(scx_root);
3783 if (unlikely(!sch))
3784 return false;
3785
3786 switch (scx_enable_state()) {
3787 case SCX_ENABLING:
3788 case SCX_ENABLED:
3789 va_start(args, fmt);
3790 ret = scx_verror(sch, fmt, args);
3791 va_end(args);
3792 return ret;
3793 default:
3794 return false;
3795 }
3796 }
3797
3798 /**
3799 * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
3800 *
3801 * While there are various reasons why RCU CPU stalls can occur on a system
3802 * that may not be caused by the current BPF scheduler, try kicking out the
3803 * current scheduler in an attempt to recover the system to a good state before
3804 * issuing panics.
3805 *
3806 * Returns %true if sched_ext is enabled and abort was initiated, which may
3807 * resolve the reported RCU stall. %false if sched_ext is not enabled or someone
3808 * else already initiated abort.
3809 */
3810 bool scx_rcu_cpu_stall(void)
3811 {
3812 return handle_lockup("RCU CPU stall detected!");
3813 }
3814
3815 /**
3816 * scx_softlockup - sched_ext softlockup handler
3817 * @dur_s: number of seconds of CPU stuck due to soft lockup
3818 *
3819 * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
3820 * live-lock the system by making many CPUs target the same DSQ to the point
3821 * where soft-lockup detection triggers. This function is called from
3822 * the soft-lockup watchdog when the triggering point is close and tries to
3823 * unjam the system by aborting the BPF scheduler.
3824 */
3825 void scx_softlockup(u32 dur_s)
3826 {
3827 if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s))
3828 return;
3829
3830 printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n",
3831 smp_processor_id(), dur_s);
3832 }
3833
3834 /**
3835 * scx_hardlockup - sched_ext hardlockup handler
3836 *
3837 * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting
3838 * numerous affinitized tasks in a single queue and directing all CPUs at it.
3839 * Try kicking out the current scheduler in an attempt to recover the system to
3840 * a good state before taking more drastic actions.
3841 *
3842 * Returns %true if sched_ext is enabled and abort was initiated, which may
3843 * resolve the reported hardlockup. %false if sched_ext is not enabled or
3844 * someone else already initiated abort.
3845 */
3846 bool scx_hardlockup(int cpu)
3847 {
3848 if (!handle_lockup("hard lockup - CPU %d", cpu))
3849 return false;
3850
3851 printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
3852 cpu);
3853 return true;
3854 }
3855
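/*
 * Move tasks from @rq's bypass DSQ to CPUs in @donee_mask until the donor
 * drops to @nr_donor_target or no donee below @nr_donee_target remains. CPUs
 * that received tasks are collected in @resched_mask for the caller to kick.
 * Returns the number of tasks moved.
 */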
3856 static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
3857 struct cpumask *donee_mask, struct cpumask *resched_mask,
3858 u32 nr_donor_target, u32 nr_donee_target)
3859 {
3860 struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
3861 struct task_struct *p, *n;
3862 struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);
3863 s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
3864 u32 nr_balanced = 0, min_delta_us;
3865
3866 /*
3867 * All we want to guarantee is reasonable forward progress. No reason to
3868 * fine tune. Assuming every task on @donor_dsq runs their full slice,
3869 * consider offloading iff the total queued duration is over the
3870 * threshold.
3871 */
3872 min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
3873 if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
3874 return 0;
3875
3876 raw_spin_rq_lock_irq(rq);
3877 raw_spin_lock(&donor_dsq->lock);
3878 list_add(&cursor.node, &donor_dsq->list);
3879 resume:
3880 n = container_of(&cursor, struct task_struct, scx.dsq_list);
3881 n = nldsq_next_task(donor_dsq, n, false);
3882
3883 while ((p = n)) {
3884 struct rq *donee_rq;
3885 struct scx_dispatch_q *donee_dsq;
3886 int donee;
3887
3888 n = nldsq_next_task(donor_dsq, n, false);
3889
3890 if (donor_dsq->nr <= nr_donor_target)
3891 break;
3892
3893 if (cpumask_empty(donee_mask))
3894 break;
3895
3896 donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
3897 if (donee >= nr_cpu_ids)
3898 continue;
3899
3900 donee_rq = cpu_rq(donee);
3901 donee_dsq = &donee_rq->scx.bypass_dsq;
3902
3903 /*
3904 * $p's rq is not locked but $p's DSQ lock protects its
3905 * scheduling properties, making this test safe.
3906 */
3907 if (!task_can_run_on_remote_rq(sch, p, donee_rq, false))
3908 continue;
3909
3910 /*
3911 * Moving $p from one non-local DSQ to another. The source rq
3912 * and DSQ are already locked. Do an abbreviated dequeue and
3913 * then perform enqueue without unlocking $donor_dsq.
3914 *
3915 * We don't want to drop and reacquire the lock on each
3916 * iteration as @donor_dsq can be very long and potentially
3917 * highly contended. Donee DSQs are less likely to be contended.
3918 * The nested locking is safe as only this LB moves tasks
3919 * between bypass DSQs.
3920 */
3921 dispatch_dequeue_locked(p, donor_dsq);
3922 dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED);
3923
3924 /*
3925 * $donee might have been idle and need to be woken up. No need
3926 * to be clever. Kick every CPU that receives tasks.
3927 */
3928 cpumask_set_cpu(donee, resched_mask);
3929
3930 if (READ_ONCE(donee_dsq->nr) >= nr_donee_target)
3931 cpumask_clear_cpu(donee, donee_mask);
3932
3933 nr_balanced++;
3934 if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) {
3935 list_move_tail(&cursor.node, &n->scx.dsq_list.node);
3936 raw_spin_unlock(&donor_dsq->lock);
3937 raw_spin_rq_unlock_irq(rq);
3938 cpu_relax();
3939 raw_spin_rq_lock_irq(rq);
3940 raw_spin_lock(&donor_dsq->lock);
3941 goto resume;
3942 }
3943 }
3944
3945 list_del_init(&cursor.node);
3946 raw_spin_unlock(&donor_dsq->lock);
3947 raw_spin_rq_unlock_irq(rq);
3948
3949 return nr_balanced;
3950 }
3951
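/*
 * Balance the per-CPU bypass DSQs of the online CPUs in @node. CPUs above the
 * donor threshold donate tasks to CPUs below the per-CPU target and the
 * before/after distribution is reported through the sched_ext_bypass_lb
 * tracepoint.
 */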
3952 static void bypass_lb_node(struct scx_sched *sch, int node)
3953 {
3954 const struct cpumask *node_mask = cpumask_of_node(node);
3955 struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask;
3956 struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask;
3957 u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0;
3958 u32 nr_target, nr_donor_target;
3959 u32 before_min = U32_MAX, before_max = 0;
3960 u32 after_min = U32_MAX, after_max = 0;
3961 int cpu;
3962
3963 /* count the target tasks and CPUs */
3964 for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
3965 u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
3966
3967 nr_tasks += nr;
3968 nr_cpus++;
3969
3970 before_min = min(nr, before_min);
3971 before_max = max(nr, before_max);
3972 }
3973
3974 if (!nr_cpus)
3975 return;
3976
3977 /*
3978 * We don't want CPUs to have more than $nr_donor_target tasks and want
3979 * balancing to fill donee CPUs up to $nr_target. Once targets are
3980 * calculated, find the donee CPUs.
3981 */
3982 nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus);
3983 nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100);
3984
3985 cpumask_clear(donee_mask);
3986 for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
3987 if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target)
3988 cpumask_set_cpu(cpu, donee_mask);
3989 }
3990
3991 /* iterate !donee CPUs and see if they should be offloaded */
3992 cpumask_clear(resched_mask);
3993 for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
3994 struct rq *rq = cpu_rq(cpu);
3995 struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
3996
3997 if (cpumask_empty(donee_mask))
3998 break;
3999 if (cpumask_test_cpu(cpu, donee_mask))
4000 continue;
4001 if (READ_ONCE(donor_dsq->nr) <= nr_donor_target)
4002 continue;
4003
4004 nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask,
4005 nr_donor_target, nr_target);
4006 }
4007
4008 for_each_cpu(cpu, resched_mask)
4009 resched_cpu(cpu);
4010
4011 for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
4012 u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
4013
4014 after_min = min(nr, after_min);
4015 after_max = max(nr, after_max);
4016
4017 }
4018
4019 trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced,
4020 before_min, before_max, after_min, after_max);
4021 }
4022
4023 /*
4024 * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine
4025 * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some
4026 * bypass DSQs can be overloaded. If there are enough tasks to saturate other
4027 * lightly loaded CPUs, such imbalance can lead to very high execution latency
4028 * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such
4029 * outcomes, a simple load balancing mechanism is implemented by the following
4030 * timer which runs periodically while bypass mode is in effect.
4031 */
4032 static void scx_bypass_lb_timerfn(struct timer_list *timer)
4033 {
4034 struct scx_sched *sch;
4035 int node;
4036 u32 intv_us;
4037
4038 sch = rcu_dereference_all(scx_root);
4039 if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth))
4040 return;
4041
4042 for_each_node_with_cpus(node)
4043 bypass_lb_node(sch, node);
4044
4045 intv_us = READ_ONCE(scx_bypass_lb_intv_us);
4046 if (intv_us)
4047 mod_timer(timer, jiffies + usecs_to_jiffies(intv_us));
4048 }
4049
4050 static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn);
4051
4052 /**
4053 * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
4054 * @bypass: true for bypass, false for unbypass
4055 *
4056 * Bypassing guarantees that all runnable tasks make forward progress without
4057 * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might
4058 * be held by tasks that the BPF scheduler is forgetting to run, which
4059 * unfortunately also excludes toggling the static branches.
4060 *
4061 * Let's work around by overriding a couple ops and modifying behaviors based on
4062 * the DISABLING state and then cycling the queued tasks through dequeue/enqueue
4063 * to force global FIFO scheduling.
4064 *
4065 * - ops.select_cpu() is ignored and the default select_cpu() is used.
4066 *
4067 * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
4068 * %SCX_OPS_ENQ_LAST is also ignored.
4069 *
4070 * - ops.dispatch() is ignored.
4071 *
4072 * - balance_one() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
4073 * can't be trusted. Whenever a tick triggers, the running task is rotated to
4074 * the tail of the queue with core_sched_at touched.
4075 *
4076 * - pick_next_task() suppresses zero slice warning.
4077 *
4078 * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM
4079 * operations.
4080 *
4081 * - scx_prio_less() reverts to the default core_sched_at order.
4082 */
4083 static void scx_bypass(bool bypass)
4084 {
4085 static DEFINE_RAW_SPINLOCK(bypass_lock);
4086 static unsigned long bypass_timestamp;
4087 struct scx_sched *sch;
4088 unsigned long flags;
4089 int cpu;
4090
4091 raw_spin_lock_irqsave(&bypass_lock, flags);
4092 sch = rcu_dereference_bh(scx_root);
4093
4094 if (bypass) {
4095 u32 intv_us;
4096
4097 WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1);
4098 WARN_ON_ONCE(scx_bypass_depth <= 0);
4099 if (scx_bypass_depth != 1)
4100 goto unlock;
4101 WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
4102 bypass_timestamp = ktime_get_ns();
4103 if (sch)
4104 scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
4105
4106 intv_us = READ_ONCE(scx_bypass_lb_intv_us);
4107 if (intv_us && !timer_pending(&scx_bypass_lb_timer)) {
4108 scx_bypass_lb_timer.expires =
4109 jiffies + usecs_to_jiffies(intv_us);
4110 add_timer_global(&scx_bypass_lb_timer);
4111 }
4112 } else {
4113 WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1);
4114 WARN_ON_ONCE(scx_bypass_depth < 0);
4115 if (scx_bypass_depth != 0)
4116 goto unlock;
4117 WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL);
4118 if (sch)
4119 scx_add_event(sch, SCX_EV_BYPASS_DURATION,
4120 ktime_get_ns() - bypass_timestamp);
4121 }
4122
4123 /*
4124 * No task property is changing. We just need to make sure all currently
4125 * queued tasks are re-queued according to the new scx_rq_bypassing()
4126 * state. As an optimization, walk each rq's runnable_list instead of
4127 * the scx_tasks list.
4128 *
4129 * This function can't trust the scheduler and thus can't use
4130 * cpus_read_lock(). Walk all possible CPUs instead of online.
4131 */
4132 for_each_possible_cpu(cpu) {
4133 struct rq *rq = cpu_rq(cpu);
4134 struct task_struct *p, *n;
4135
4136 raw_spin_rq_lock(rq);
4137
4138 if (bypass) {
4139 WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
4140 rq->scx.flags |= SCX_RQ_BYPASSING;
4141 } else {
4142 WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
4143 rq->scx.flags &= ~SCX_RQ_BYPASSING;
4144 }
4145
4146 /*
4147 * We need to guarantee that no tasks are on the BPF scheduler
4148 * while bypassing. Either we see enabled or the enable path
4149 * sees scx_rq_bypassing() before moving tasks to SCX.
4150 */
4151 if (!scx_enabled()) {
4152 raw_spin_rq_unlock(rq);
4153 continue;
4154 }
4155
4156 /*
4157 * The use of list_for_each_entry_safe_reverse() is required
4158 * because each task is going to be removed from and added back
4159 * to the runnable_list during iteration. Because they're added
4160 * to the tail of the list, safe reverse iteration can still
4161 * visit all nodes.
4162 */
4163 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
4164 scx.runnable_node) {
4165 /* cycling deq/enq is enough, see the function comment */
4166 scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
4167 /* nothing */ ;
4168 }
4169 }
4170
4171 /* resched to restore ticks and idle state */
4172 if (cpu_online(cpu) || cpu == smp_processor_id())
4173 resched_curr(rq);
4174
4175 raw_spin_rq_unlock(rq);
4176 }
4177
4178 unlock:
4179 raw_spin_unlock_irqrestore(&bypass_lock, flags);
4180 }
4181
4182 static void free_exit_info(struct scx_exit_info *ei)
4183 {
4184 kvfree(ei->dump);
4185 kfree(ei->msg);
4186 kfree(ei->bt);
4187 kfree(ei);
4188 }
4189
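/*
 * Allocate an scx_exit_info along with its backtrace buffer, message buffer
 * and a dump buffer of @exit_dump_len bytes. Returns NULL if any allocation
 * fails.
 */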
4190 static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
4191 {
4192 struct scx_exit_info *ei;
4193
4194 ei = kzalloc(sizeof(*ei), GFP_KERNEL);
4195 if (!ei)
4196 return NULL;
4197
4198 ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
4199 ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
4200 ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
4201
4202 if (!ei->bt || !ei->msg || !ei->dump) {
4203 free_exit_info(ei);
4204 return NULL;
4205 }
4206
4207 return ei;
4208 }
4209
4210 static const char *scx_exit_reason(enum scx_exit_kind kind)
4211 {
4212 switch (kind) {
4213 case SCX_EXIT_UNREG:
4214 return "unregistered from user space";
4215 case SCX_EXIT_UNREG_BPF:
4216 return "unregistered from BPF";
4217 case SCX_EXIT_UNREG_KERN:
4218 return "unregistered from the main kernel";
4219 case SCX_EXIT_SYSRQ:
4220 return "disabled by sysrq-S";
4221 case SCX_EXIT_ERROR:
4222 return "runtime error";
4223 case SCX_EXIT_ERROR_BPF:
4224 return "scx_bpf_error";
4225 case SCX_EXIT_ERROR_STALL:
4226 return "runnable task stall";
4227 default:
4228 return "<UNKNOWN>";
4229 }
4230 }
4231
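/*
 * Counterpart of alloc_kick_syncs(). Clear the per-CPU pointers first and free
 * the arrays after an RCU grace period so that concurrent readers don't see
 * freed memory.
 */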
4232 static void free_kick_syncs(void)
4233 {
4234 int cpu;
4235
4236 for_each_possible_cpu(cpu) {
4237 struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu);
4238 struct scx_kick_syncs *to_free;
4239
4240 to_free = rcu_replace_pointer(*ksyncs, NULL, true);
4241 if (to_free)
4242 kvfree_rcu(to_free, rcu);
4243 }
4244 }
4245
4246 static void scx_disable_workfn(struct kthread_work *work)
4247 {
4248 struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
4249 struct scx_exit_info *ei = sch->exit_info;
4250 struct scx_task_iter sti;
4251 struct task_struct *p;
4252 int kind, cpu;
4253
4254 kind = atomic_read(&sch->exit_kind);
4255 while (true) {
4256 if (kind == SCX_EXIT_DONE) /* already disabled? */
4257 return;
4258 WARN_ON_ONCE(kind == SCX_EXIT_NONE);
4259 if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
4260 break;
4261 }
4262 ei->kind = kind;
4263 ei->reason = scx_exit_reason(ei->kind);
4264
4265 /* guarantee forward progress by bypassing scx_ops */
4266 scx_bypass(true);
4267 WRITE_ONCE(scx_aborting, false);
4268
4269 switch (scx_set_enable_state(SCX_DISABLING)) {
4270 case SCX_DISABLING:
4271 WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
4272 break;
4273 case SCX_DISABLED:
4274 pr_warn("sched_ext: ops error detected without ops (%s)\n",
4275 sch->exit_info->msg);
4276 WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
4277 goto done;
4278 default:
4279 break;
4280 }
4281
4282 /*
4283 * Here, every runnable task is guaranteed to make forward progress and
4284 * we can safely use blocking synchronization constructs. Actually
4285 * disable ops.
4286 */
4287 mutex_lock(&scx_enable_mutex);
4288
4289 static_branch_disable(&__scx_switched_all);
4290 WRITE_ONCE(scx_switching_all, false);
4291
4292 /*
4293 * Shut down cgroup support before tasks so that the cgroup attach path
4294 * doesn't race against scx_exit_task().
4295 */
4296 scx_cgroup_lock();
4297 scx_cgroup_exit(sch);
4298 scx_cgroup_unlock();
4299
4300 /*
4301 * The BPF scheduler is going away. All tasks including %TASK_DEAD ones
4302 * must be switched out and exited synchronously.
4303 */
4304 percpu_down_write(&scx_fork_rwsem);
4305
4306 scx_init_task_enabled = false;
4307
4308 scx_task_iter_start(&sti);
4309 while ((p = scx_task_iter_next_locked(&sti))) {
4310 unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
4311 const struct sched_class *old_class = p->sched_class;
4312 const struct sched_class *new_class = scx_setscheduler_class(p);
4313
4314 update_rq_clock(task_rq(p));
4315
4316 if (old_class != new_class)
4317 queue_flags |= DEQUEUE_CLASS;
4318
4319 scoped_guard (sched_change, p, queue_flags) {
4320 p->sched_class = new_class;
4321 }
4322
4323 scx_exit_task(p);
4324 }
4325 scx_task_iter_stop(&sti);
4326 percpu_up_write(&scx_fork_rwsem);
4327
4328 /*
4329 * Invalidate all the rq clocks to prevent getting outdated
4330 * rq clocks from a previous scx scheduler.
4331 */
4332 for_each_possible_cpu(cpu) {
4333 struct rq *rq = cpu_rq(cpu);
4334 scx_rq_clock_invalidate(rq);
4335 }
4336
4337 /* no task is on scx, turn off all the switches and flush in-progress calls */
4338 static_branch_disable(&__scx_enabled);
4339 bitmap_zero(sch->has_op, SCX_OPI_END);
4340 scx_idle_disable();
4341 synchronize_rcu();
4342
4343 if (ei->kind >= SCX_EXIT_ERROR) {
4344 pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
4345 sch->ops.name, ei->reason);
4346
4347 if (ei->msg[0] != '\0')
4348 pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
4349 #ifdef CONFIG_STACKTRACE
4350 stack_trace_print(ei->bt, ei->bt_len, 2);
4351 #endif
4352 } else {
4353 pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
4354 sch->ops.name, ei->reason);
4355 }
4356
4357 if (sch->ops.exit)
4358 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
4359
4360 cancel_delayed_work_sync(&scx_watchdog_work);
4361
4362 /*
4363 * scx_root clearing must be inside cpus_read_lock(). See
4364 * handle_hotplug().
4365 */
4366 cpus_read_lock();
4367 RCU_INIT_POINTER(scx_root, NULL);
4368 cpus_read_unlock();
4369
4370 /*
4371 * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
4372 * could observe an object of the same name still in the hierarchy when
4373 * the next scheduler is loaded.
4374 */
4375 kobject_del(&sch->kobj);
4376
4377 free_percpu(scx_dsp_ctx);
4378 scx_dsp_ctx = NULL;
4379 scx_dsp_max_batch = 0;
4380 free_kick_syncs();
4381
4382 if (scx_bypassed_for_enable) {
4383 scx_bypassed_for_enable = false;
4384 scx_bypass(false);
4385 }
4386
4387 mutex_unlock(&scx_enable_mutex);
4388
4389 WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
4390 done:
4391 scx_bypass(false);
4392 }
4393
4394 static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
4395 {
4396 int none = SCX_EXIT_NONE;
4397
4398 if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
4399 return false;
4400
4401 /*
4402 * Some CPUs may be trapped in the dispatch paths. Set the aborting
4403 * flag to break potential live-lock scenarios, ensuring we can
4404 * successfully reach scx_bypass().
4405 */
4406 WRITE_ONCE(scx_aborting, true);
4407 return true;
4408 }
4409
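/*
 * Initiate disabling of the current scheduler with @kind as the exit reason.
 * The actual teardown runs from scx_disable_workfn() on the scheduler's helper
 * kthread.
 */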
4410 static void scx_disable(enum scx_exit_kind kind)
4411 {
4412 struct scx_sched *sch;
4413
4414 if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
4415 kind = SCX_EXIT_ERROR;
4416
4417 rcu_read_lock();
4418 sch = rcu_dereference(scx_root);
4419 if (sch) {
4420 scx_claim_exit(sch, kind);
4421 kthread_queue_work(sch->helper, &sch->disable_work);
4422 }
4423 rcu_read_unlock();
4424 }
4425
4426 static void dump_newline(struct seq_buf *s)
4427 {
4428 trace_sched_ext_dump("");
4429
4430 /* @s may be zero sized and seq_buf triggers WARN if so */
4431 if (s->size)
4432 seq_buf_putc(s, '\n');
4433 }
4434
4435 static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...)
4436 {
4437 va_list args;
4438
4439 #ifdef CONFIG_TRACEPOINTS
4440 if (trace_sched_ext_dump_enabled()) {
4441 /* protected by scx_dump_state()::dump_lock */
4442 static char line_buf[SCX_EXIT_MSG_LEN];
4443
4444 va_start(args, fmt);
4445 vscnprintf(line_buf, sizeof(line_buf), fmt, args);
4446 va_end(args);
4447
4448 trace_sched_ext_dump(line_buf);
4449 }
4450 #endif
4451 /* @s may be zero sized and seq_buf triggers WARN if so */
4452 if (s->size) {
4453 va_start(args, fmt);
4454 seq_buf_vprintf(s, fmt, args);
4455 va_end(args);
4456
4457 seq_buf_putc(s, '\n');
4458 }
4459 }
4460
4461 static void dump_stack_trace(struct seq_buf *s, const char *prefix,
4462 const unsigned long *bt, unsigned int len)
4463 {
4464 unsigned int i;
4465
4466 for (i = 0; i < len; i++)
4467 dump_line(s, "%s%pS", prefix, (void *)bt[i]);
4468 }
4469
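/*
 * ops_dump_init/flush/exit() bracket calls into the ops.dump*() callbacks.
 * Output generated through scx_bpf_dump() accumulates in scx_dump_data and is
 * flushed line by line with the configured prefix prepended.
 */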
4470 static void ops_dump_init(struct seq_buf *s, const char *prefix)
4471 {
4472 struct scx_dump_data *dd = &scx_dump_data;
4473
4474 lockdep_assert_irqs_disabled();
4475
4476 dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */
4477 dd->first = true;
4478 dd->cursor = 0;
4479 dd->s = s;
4480 dd->prefix = prefix;
4481 }
4482
4483 static void ops_dump_flush(void)
4484 {
4485 struct scx_dump_data *dd = &scx_dump_data;
4486 char *line = dd->buf.line;
4487
4488 if (!dd->cursor)
4489 return;
4490
4491 /*
4492 * There's something to flush and this is the first line. Insert a blank
4493 * line to distinguish ops dump.
4494 */
4495 if (dd->first) {
4496 dump_newline(dd->s);
4497 dd->first = false;
4498 }
4499
4500 /*
4501 * There may be multiple lines in $line. Scan and emit each line
4502 * separately.
4503 */
4504 while (true) {
4505 char *end = line;
4506 char c;
4507
4508 while (*end != '\n' && *end != '\0')
4509 end++;
4510
4511 /*
4512 * If $line overflowed, it may not have newline at the end.
4513 * Always emit with a newline.
4514 */
4515 c = *end;
4516 *end = '\0';
4517 dump_line(dd->s, "%s%s", dd->prefix, line);
4518 if (c == '\0')
4519 break;
4520
4521 /* move to the next line */
4522 end++;
4523 if (*end == '\0')
4524 break;
4525 line = end;
4526 }
4527
4528 dd->cursor = 0;
4529 }
4530
4531 static void ops_dump_exit(void)
4532 {
4533 ops_dump_flush();
4534 scx_dump_data.cpu = -1;
4535 }
4536
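/*
 * Dump one task's sched_ext state into @s. @marker distinguishes the currently
 * running task ('*') from other runnable tasks (' ').
 */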
4537 static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
4538 struct task_struct *p, char marker)
4539 {
4540 static unsigned long bt[SCX_EXIT_BT_LEN];
4541 struct scx_sched *sch = scx_root;
4542 char dsq_id_buf[19] = "(n/a)";
4543 unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
4544 unsigned int bt_len = 0;
4545
4546 if (p->scx.dsq)
4547 scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
4548 (unsigned long long)p->scx.dsq->id);
4549
4550 dump_newline(s);
4551 dump_line(s, " %c%c %s[%d] %+ldms",
4552 marker, task_state_to_char(p), p->comm, p->pid,
4553 jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
4554 dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu",
4555 scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
4556 p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
4557 ops_state >> SCX_OPSS_QSEQ_SHIFT);
4558 dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
4559 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
4560 dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
4561 p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
4562 dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr),
4563 p->migration_disabled);
4564
4565 if (SCX_HAS_OP(sch, dump_task)) {
4566 ops_dump_init(s, " ");
4567 SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p);
4568 ops_dump_exit();
4569 }
4570
4571 #ifdef CONFIG_STACKTRACE
4572 bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
4573 #endif
4574 if (bt_len) {
4575 dump_newline(s);
4576 dump_stack_trace(s, " ", bt, bt_len);
4577 }
4578 }
4579
4580 static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
4581 {
4582 static DEFINE_SPINLOCK(dump_lock);
4583 static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
4584 struct scx_sched *sch = scx_root;
4585 struct scx_dump_ctx dctx = {
4586 .kind = ei->kind,
4587 .exit_code = ei->exit_code,
4588 .reason = ei->reason,
4589 .at_ns = ktime_get_ns(),
4590 .at_jiffies = jiffies,
4591 };
4592 struct seq_buf s;
4593 struct scx_event_stats events;
4594 unsigned long flags;
4595 char *buf;
4596 int cpu;
4597
4598 spin_lock_irqsave(&dump_lock, flags);
4599
4600 seq_buf_init(&s, ei->dump, dump_len);
4601
4602 if (ei->kind == SCX_EXIT_NONE) {
4603 dump_line(&s, "Debug dump triggered by %s", ei->reason);
4604 } else {
4605 dump_line(&s, "%s[%d] triggered exit kind %d:",
4606 current->comm, current->pid, ei->kind);
4607 dump_line(&s, " %s (%s)", ei->reason, ei->msg);
4608 dump_newline(&s);
4609 dump_line(&s, "Backtrace:");
4610 dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
4611 }
4612
4613 if (SCX_HAS_OP(sch, dump)) {
4614 ops_dump_init(&s, "");
4615 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx);
4616 ops_dump_exit();
4617 }
4618
4619 dump_newline(&s);
4620 dump_line(&s, "CPU states");
4621 dump_line(&s, "----------");
4622
4623 for_each_possible_cpu(cpu) {
4624 struct rq *rq = cpu_rq(cpu);
4625 struct rq_flags rf;
4626 struct task_struct *p;
4627 struct seq_buf ns;
4628 size_t avail, used;
4629 bool idle;
4630
4631 rq_lock_irqsave(rq, &rf);
4632
4633 idle = list_empty(&rq->scx.runnable_list) &&
4634 rq->curr->sched_class == &idle_sched_class;
4635
4636 if (idle && !SCX_HAS_OP(sch, dump_cpu))
4637 goto next;
4638
4639 /*
4640 * We don't yet know whether ops.dump_cpu() will produce output
4641 * and we may want to skip the default CPU dump if it doesn't.
4642 * Use a nested seq_buf to generate the standard dump so that we
4643 * can decide whether to commit later.
4644 */
4645 avail = seq_buf_get_buf(&s, &buf);
4646 seq_buf_init(&ns, buf, avail);
4647
4648 dump_newline(&ns);
4649 dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu",
4650 cpu, rq->scx.nr_running, rq->scx.flags,
4651 rq->scx.cpu_released, rq->scx.ops_qseq,
4652 rq->scx.kick_sync);
4653 dump_line(&ns, " curr=%s[%d] class=%ps",
4654 rq->curr->comm, rq->curr->pid,
4655 rq->curr->sched_class);
4656 if (!cpumask_empty(rq->scx.cpus_to_kick))
4657 dump_line(&ns, " cpus_to_kick : %*pb",
4658 cpumask_pr_args(rq->scx.cpus_to_kick));
4659 if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
4660 dump_line(&ns, " idle_to_kick : %*pb",
4661 cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
4662 if (!cpumask_empty(rq->scx.cpus_to_preempt))
4663 dump_line(&ns, " cpus_to_preempt: %*pb",
4664 cpumask_pr_args(rq->scx.cpus_to_preempt));
4665 if (!cpumask_empty(rq->scx.cpus_to_wait))
4666 dump_line(&ns, " cpus_to_wait : %*pb",
4667 cpumask_pr_args(rq->scx.cpus_to_wait));
4668
4669 used = seq_buf_used(&ns);
4670 if (SCX_HAS_OP(sch, dump_cpu)) {
4671 ops_dump_init(&ns, " ");
4672 SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL,
4673 &dctx, cpu, idle);
4674 ops_dump_exit();
4675 }
4676
4677 /*
4678 * If idle && nothing generated by ops.dump_cpu(), there's
4679 * nothing interesting. Skip.
4680 */
4681 if (idle && used == seq_buf_used(&ns))
4682 goto next;
4683
4684 /*
4685 * $s may already have overflowed when $ns was created. If so,
4686 * calling commit on it will trigger BUG.
4687 */
4688 if (avail) {
4689 seq_buf_commit(&s, seq_buf_used(&ns));
4690 if (seq_buf_has_overflowed(&ns))
4691 seq_buf_set_overflow(&s);
4692 }
4693
4694 if (rq->curr->sched_class == &ext_sched_class)
4695 scx_dump_task(&s, &dctx, rq->curr, '*');
4696
4697 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
4698 scx_dump_task(&s, &dctx, p, ' ');
4699 next:
4700 rq_unlock_irqrestore(rq, &rf);
4701 }
4702
4703 dump_newline(&s);
4704 dump_line(&s, "Event counters");
4705 dump_line(&s, "--------------");
4706
4707 scx_read_events(sch, &events);
4708 scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
4709 scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
4710 scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
4711 scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
4712 scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
4713 scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL);
4714 scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
4715 scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
4716 scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
4717
4718 if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
4719 memcpy(ei->dump + dump_len - sizeof(trunc_marker),
4720 trunc_marker, sizeof(trunc_marker));
4721
4722 spin_unlock_irqrestore(&dump_lock, flags);
4723 }
4724
4725 static void scx_error_irq_workfn(struct irq_work *irq_work)
4726 {
4727 struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work);
4728 struct scx_exit_info *ei = sch->exit_info;
4729
4730 if (ei->kind >= SCX_EXIT_ERROR)
4731 scx_dump_state(ei, sch->ops.exit_dump_len);
4732
4733 kthread_queue_work(sch->helper, &sch->disable_work);
4734 }
4735
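/*
 * Record the exit code and message (and a backtrace for error exits) for @sch
 * and queue error_irq_work to kick off the disable sequence. Returns %false if
 * another exit has already been claimed.
 */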
4736 static bool scx_vexit(struct scx_sched *sch,
4737 enum scx_exit_kind kind, s64 exit_code,
4738 const char *fmt, va_list args)
4739 {
4740 struct scx_exit_info *ei = sch->exit_info;
4741
4742 if (!scx_claim_exit(sch, kind))
4743 return false;
4744
4745 ei->exit_code = exit_code;
4746 #ifdef CONFIG_STACKTRACE
4747 if (kind >= SCX_EXIT_ERROR)
4748 ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
4749 #endif
4750 vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
4751
4752 /*
4753 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
4754 * in scx_disable_workfn().
4755 */
4756 ei->kind = kind;
4757 ei->reason = scx_exit_reason(ei->kind);
4758
4759 irq_work_queue(&sch->error_irq_work);
4760 return true;
4761 }
4762
4763 static int alloc_kick_syncs(void)
4764 {
4765 int cpu;
4766
4767 /*
4768 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
4769 * can exceed percpu allocator limits on large machines.
4770 */
4771 for_each_possible_cpu(cpu) {
4772 struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu);
4773 struct scx_kick_syncs *new_ksyncs;
4774
4775 WARN_ON_ONCE(rcu_access_pointer(*ksyncs));
4776
4777 new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids),
4778 GFP_KERNEL, cpu_to_node(cpu));
4779 if (!new_ksyncs) {
4780 free_kick_syncs();
4781 return -ENOMEM;
4782 }
4783
4784 rcu_assign_pointer(*ksyncs, new_ksyncs);
4785 }
4786
4787 return 0;
4788 }
4789
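/*
 * Allocate and initialize a scheduler instance for @ops: exit info, the DSQ
 * hash table, per-node global DSQs, per-CPU data and the helper kthread, then
 * add it to sysfs as "root". Returns an ERR_PTR() on failure.
 */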
4790 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
4791 {
4792 struct scx_sched *sch;
4793 int node, ret;
4794
4795 sch = kzalloc(sizeof(*sch), GFP_KERNEL);
4796 if (!sch)
4797 return ERR_PTR(-ENOMEM);
4798
4799 sch->exit_info = alloc_exit_info(ops->exit_dump_len);
4800 if (!sch->exit_info) {
4801 ret = -ENOMEM;
4802 goto err_free_sch;
4803 }
4804
4805 ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params);
4806 if (ret < 0)
4807 goto err_free_ei;
4808
4809 sch->global_dsqs = kcalloc(nr_node_ids, sizeof(sch->global_dsqs[0]),
4810 GFP_KERNEL);
4811 if (!sch->global_dsqs) {
4812 ret = -ENOMEM;
4813 goto err_free_hash;
4814 }
4815
4816 for_each_node_state(node, N_POSSIBLE) {
4817 struct scx_dispatch_q *dsq;
4818
4819 dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
4820 if (!dsq) {
4821 ret = -ENOMEM;
4822 goto err_free_gdsqs;
4823 }
4824
4825 init_dsq(dsq, SCX_DSQ_GLOBAL);
4826 sch->global_dsqs[node] = dsq;
4827 }
4828
4829 sch->pcpu = alloc_percpu(struct scx_sched_pcpu);
4830 if (!sch->pcpu) {
4831 ret = -ENOMEM;
4832 goto err_free_gdsqs;
4833 }
4834
4835 sch->helper = kthread_run_worker(0, "sched_ext_helper");
4836 if (IS_ERR(sch->helper)) {
4837 ret = PTR_ERR(sch->helper);
4838 goto err_free_pcpu;
4839 }
4840
4841 sched_set_fifo(sch->helper->task);
4842
4843 atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
4844 init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
4845 kthread_init_work(&sch->disable_work, scx_disable_workfn);
4846 sch->ops = *ops;
4847 ops->priv = sch;
4848
4849 sch->kobj.kset = scx_kset;
4850 ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
4851 if (ret < 0)
4852 goto err_stop_helper;
4853
4854 return sch;
4855
4856 err_stop_helper:
4857 kthread_destroy_worker(sch->helper);
4858 err_free_pcpu:
4859 free_percpu(sch->pcpu);
4860 err_free_gdsqs:
4861 for_each_node_state(node, N_POSSIBLE)
4862 kfree(sch->global_dsqs[node]);
4863 kfree(sch->global_dsqs);
4864 err_free_hash:
4865 rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
4866 err_free_ei:
4867 free_exit_info(sch->exit_info);
4868 err_free_sch:
4869 kfree(sch);
4870 return ERR_PTR(ret);
4871 }
4872
4873 static int check_hotplug_seq(struct scx_sched *sch,
4874 const struct sched_ext_ops *ops)
4875 {
4876 unsigned long long global_hotplug_seq;
4877
4878 /*
4879 * If a hotplug event has occurred between when a scheduler was
4880 * initialized, and when we were able to attach, exit and notify user
4881 * space about it.
4882 */
4883 if (ops->hotplug_seq) {
4884 global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
4885 if (ops->hotplug_seq != global_hotplug_seq) {
4886 scx_exit(sch, SCX_EXIT_UNREG_KERN,
4887 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
4888 "expected hotplug seq %llu did not match actual %llu",
4889 ops->hotplug_seq, global_hotplug_seq);
4890 return -EBUSY;
4891 }
4892 }
4893
4894 return 0;
4895 }
4896
4897 static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
4898 {
4899 /*
4900 * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the
4901 * ops.enqueue() callback isn't implemented.
4902 */
4903 if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
4904 scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
4905 return -EINVAL;
4906 }
4907
4908 /*
4909 * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
4910 * selection policy to be enabled.
4911 */
4912 if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
4913 (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
4914 scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
4915 return -EINVAL;
4916 }
4917
4918 if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
4919 pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
4920
4921 if (ops->cpu_acquire || ops->cpu_release)
4922 pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");
4923
4924 return 0;
4925 }
4926
4927 static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
4928 {
4929 struct scx_sched *sch;
4930 struct scx_task_iter sti;
4931 struct task_struct *p;
4932 unsigned long timeout;
4933 int i, cpu, ret;
4934
4935 if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
4936 cpu_possible_mask)) {
4937 pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
4938 return -EINVAL;
4939 }
4940
4941 mutex_lock(&scx_enable_mutex);
4942
4943 if (scx_enable_state() != SCX_DISABLED) {
4944 ret = -EBUSY;
4945 goto err_unlock;
4946 }
4947
4948 ret = alloc_kick_syncs();
4949 if (ret)
4950 goto err_unlock;
4951
4952 sch = scx_alloc_and_add_sched(ops);
4953 if (IS_ERR(sch)) {
4954 ret = PTR_ERR(sch);
4955 goto err_free_ksyncs;
4956 }
4957
4958 /*
4959 * Transition to ENABLING and clear exit info to arm the disable path.
4960 * Failure triggers full disabling from here on.
4961 */
4962 WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
4963 WARN_ON_ONCE(scx_root);
4964 if (WARN_ON_ONCE(READ_ONCE(scx_aborting)))
4965 WRITE_ONCE(scx_aborting, false);
4966
4967 atomic_long_set(&scx_nr_rejected, 0);
4968
4969 for_each_possible_cpu(cpu)
4970 cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
4971
4972 /*
4973 * Keep CPUs stable during enable so that the BPF scheduler can track
4974 * online CPUs by watching ->on/offline_cpu() after ->init().
4975 */
4976 cpus_read_lock();
4977
4978 /*
4979 * Make the scheduler instance visible. Must be inside cpus_read_lock().
4980 * See handle_hotplug().
4981 */
4982 rcu_assign_pointer(scx_root, sch);
4983
4984 scx_idle_enable(ops);
4985
4986 if (sch->ops.init) {
4987 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
4988 if (ret) {
4989 ret = ops_sanitize_err(sch, "init", ret);
4990 cpus_read_unlock();
4991 scx_error(sch, "ops.init() failed (%d)", ret);
4992 goto err_disable;
4993 }
4994 sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
4995 }
4996
4997 for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
4998 if (((void (**)(void))ops)[i])
4999 set_bit(i, sch->has_op);
5000
5001 ret = check_hotplug_seq(sch, ops);
5002 if (ret) {
5003 cpus_read_unlock();
5004 goto err_disable;
5005 }
5006 scx_idle_update_selcpu_topology(ops);
5007
5008 cpus_read_unlock();
5009
5010 ret = validate_ops(sch, ops);
5011 if (ret)
5012 goto err_disable;
5013
5014 WARN_ON_ONCE(scx_dsp_ctx);
5015 scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
5016 scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf,
5017 scx_dsp_max_batch),
5018 __alignof__(struct scx_dsp_ctx));
5019 if (!scx_dsp_ctx) {
5020 ret = -ENOMEM;
5021 goto err_disable;
5022 }
5023
5024 if (ops->timeout_ms)
5025 timeout = msecs_to_jiffies(ops->timeout_ms);
5026 else
5027 timeout = SCX_WATCHDOG_MAX_TIMEOUT;
5028
5029 WRITE_ONCE(scx_watchdog_timeout, timeout);
5030 WRITE_ONCE(scx_watchdog_timestamp, jiffies);
5031 queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
5032 scx_watchdog_timeout / 2);
5033
5034 /*
5035 * Once __scx_enabled is set, %current can be switched to SCX anytime.
5036 * This can lead to stalls as some BPF schedulers (e.g. userspace
5037 * scheduling) may not function correctly before all tasks are switched.
5038 * Init in bypass mode to guarantee forward progress.
5039 */
5040 scx_bypass(true);
5041 scx_bypassed_for_enable = true;
5042
5043 for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
5044 if (((void (**)(void))ops)[i])
5045 set_bit(i, sch->has_op);
5046
5047 if (sch->ops.cpu_acquire || sch->ops.cpu_release)
5048 sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
5049
5050 /*
5051 * Lock out forks, cgroup on/offlining and moves before opening the
5052 * floodgate so that they don't wander into the operations prematurely.
5053 */
5054 percpu_down_write(&scx_fork_rwsem);
5055
5056 WARN_ON_ONCE(scx_init_task_enabled);
5057 scx_init_task_enabled = true;
5058
5059 /*
5060 * Enable ops for every task. Fork is excluded by scx_fork_rwsem
5061 * preventing new tasks from being added. No need to exclude tasks
5062 * leaving as sched_ext_free() can handle both prepped and enabled
5063 * tasks. Prep all tasks first and then enable them with preemption
5064 * disabled.
5065 *
5066 * All cgroups should be initialized before scx_init_task() so that the
5067 * BPF scheduler can reliably track each task's cgroup membership from
5068 * scx_init_task(). Lock out cgroup on/offlining and task migrations
5069 * while tasks are being initialized so that scx_cgroup_can_attach()
5070 * never sees uninitialized tasks.
5071 */
5072 scx_cgroup_lock();
5073 ret = scx_cgroup_init(sch);
5074 if (ret)
5075 goto err_disable_unlock_all;
5076
5077 scx_task_iter_start(&sti);
5078 while ((p = scx_task_iter_next_locked(&sti))) {
5079 /*
5080 * @p may already be dead, have lost all its usage counts and
5081 * be waiting for RCU grace period before being freed. @p can't
5082 * be initialized for SCX in such cases and should be ignored.
5083 */
5084 if (!tryget_task_struct(p))
5085 continue;
5086
5087 scx_task_iter_unlock(&sti);
5088
5089 ret = scx_init_task(p, task_group(p), false);
5090 if (ret) {
5091 put_task_struct(p);
5092 scx_task_iter_stop(&sti);
5093 scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
5094 ret, p->comm, p->pid);
5095 goto err_disable_unlock_all;
5096 }
5097
5098 scx_set_task_state(p, SCX_TASK_READY);
5099
5100 put_task_struct(p);
5101 }
5102 scx_task_iter_stop(&sti);
5103 scx_cgroup_unlock();
5104 percpu_up_write(&scx_fork_rwsem);
5105
5106 /*
5107 * All tasks are READY. It's safe to turn on scx_enabled() and switch
5108 * all eligible tasks.
5109 */
5110 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
5111 static_branch_enable(&__scx_enabled);
5112
5113 /*
5114 * We're fully committed and can't fail. The task READY -> ENABLED
5115 * transitions here are synchronized against sched_ext_free() through
5116 * scx_tasks_lock.
5117 */
5118 percpu_down_write(&scx_fork_rwsem);
5119 scx_task_iter_start(&sti);
5120 while ((p = scx_task_iter_next_locked(&sti))) {
5121 unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
5122 const struct sched_class *old_class = p->sched_class;
5123 const struct sched_class *new_class = scx_setscheduler_class(p);
5124
5125 if (scx_get_task_state(p) != SCX_TASK_READY)
5126 continue;
5127
5128 if (old_class != new_class)
5129 queue_flags |= DEQUEUE_CLASS;
5130
5131 scoped_guard (sched_change, p, queue_flags) {
5132 p->scx.slice = READ_ONCE(scx_slice_dfl);
5133 p->sched_class = new_class;
5134 }
5135 }
5136 scx_task_iter_stop(&sti);
5137 percpu_up_write(&scx_fork_rwsem);
5138
5139 scx_bypassed_for_enable = false;
5140 scx_bypass(false);
5141
5142 if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
5143 WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
5144 goto err_disable;
5145 }
5146
5147 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
5148 static_branch_enable(&__scx_switched_all);
5149
5150 pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
5151 sch->ops.name, scx_switched_all() ? "" : " (partial)");
5152 kobject_uevent(&sch->kobj, KOBJ_ADD);
5153 mutex_unlock(&scx_enable_mutex);
5154
5155 atomic_long_inc(&scx_enable_seq);
5156
5157 return 0;
5158
5159 err_free_ksyncs:
5160 free_kick_syncs();
5161 err_unlock:
5162 mutex_unlock(&scx_enable_mutex);
5163 return ret;
5164
5165 err_disable_unlock_all:
5166 scx_cgroup_unlock();
5167 percpu_up_write(&scx_fork_rwsem);
5168 /* we'll soon enter disable path, keep bypass on */
5169 err_disable:
5170 mutex_unlock(&scx_enable_mutex);
5171 /*
5172 * Returning an error code here would not pass all the error information
5173	 * to userspace. Record errno using scx_error() in case scx_error() wasn't
5174	 * already invoked and exit indicating success so that the error is
5175	 * reported through ops.exit() with all the details.
5176 *
5177 * Flush scx_disable_work to ensure that error is reported before init
5178 * completion. sch's base reference will be put by bpf_scx_unreg().
5179 */
5180 scx_error(sch, "scx_enable() failed (%d)", ret);
5181 kthread_flush_work(&sch->disable_work);
5182 return 0;
5183 }
5184
5185
5186 /********************************************************************************
5187 * bpf_struct_ops plumbing.
5188 */
5189 #include <linux/bpf_verifier.h>
5190 #include <linux/bpf.h>
5191 #include <linux/btf.h>
5192
5193 static const struct btf_type *task_struct_type;
5194
5195 static bool bpf_scx_is_valid_access(int off, int size,
5196 enum bpf_access_type type,
5197 const struct bpf_prog *prog,
5198 struct bpf_insn_access_aux *info)
5199 {
5200 if (type != BPF_READ)
5201 return false;
5202 if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
5203 return false;
5204 if (off % size != 0)
5205 return false;
5206
5207 return btf_ctx_access(off, size, type, prog, info);
5208 }
5209
5210 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
5211 const struct bpf_reg_state *reg, int off,
5212 int size)
5213 {
5214 const struct btf_type *t;
5215
5216 t = btf_type_by_id(reg->btf, reg->btf_id);
5217 if (t == task_struct_type) {
5218 if (off >= offsetof(struct task_struct, scx.slice) &&
5219 off + size <= offsetofend(struct task_struct, scx.slice))
5220 return SCALAR_VALUE;
5221 if (off >= offsetof(struct task_struct, scx.dsq_vtime) &&
5222 off + size <= offsetofend(struct task_struct, scx.dsq_vtime))
5223 return SCALAR_VALUE;
5224 if (off >= offsetof(struct task_struct, scx.disallow) &&
5225 off + size <= offsetofend(struct task_struct, scx.disallow))
5226 return SCALAR_VALUE;
5227 }
5228
5229 return -EACCES;
5230 }
5231
5232 static const struct bpf_verifier_ops bpf_scx_verifier_ops = {
5233 .get_func_proto = bpf_base_func_proto,
5234 .is_valid_access = bpf_scx_is_valid_access,
5235 .btf_struct_access = bpf_scx_btf_struct_access,
5236 };
5237
5238 static int bpf_scx_init_member(const struct btf_type *t,
5239 const struct btf_member *member,
5240 void *kdata, const void *udata)
5241 {
5242 const struct sched_ext_ops *uops = udata;
5243 struct sched_ext_ops *ops = kdata;
5244 u32 moff = __btf_member_bit_offset(t, member) / 8;
5245 int ret;
5246
5247 switch (moff) {
5248 case offsetof(struct sched_ext_ops, dispatch_max_batch):
5249 if (*(u32 *)(udata + moff) > INT_MAX)
5250 return -E2BIG;
5251 ops->dispatch_max_batch = *(u32 *)(udata + moff);
5252 return 1;
5253 case offsetof(struct sched_ext_ops, flags):
5254 if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS)
5255 return -EINVAL;
5256 ops->flags = *(u64 *)(udata + moff);
5257 return 1;
5258 case offsetof(struct sched_ext_ops, name):
5259 ret = bpf_obj_name_cpy(ops->name, uops->name,
5260 sizeof(ops->name));
5261 if (ret < 0)
5262 return ret;
5263 if (ret == 0)
5264 return -EINVAL;
5265 return 1;
5266 case offsetof(struct sched_ext_ops, timeout_ms):
5267 if (msecs_to_jiffies(*(u32 *)(udata + moff)) >
5268 SCX_WATCHDOG_MAX_TIMEOUT)
5269 return -E2BIG;
5270 ops->timeout_ms = *(u32 *)(udata + moff);
5271 return 1;
5272 case offsetof(struct sched_ext_ops, exit_dump_len):
5273 ops->exit_dump_len =
5274 *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN;
5275 return 1;
5276 case offsetof(struct sched_ext_ops, hotplug_seq):
5277 ops->hotplug_seq = *(u64 *)(udata + moff);
5278 return 1;
5279 }
5280
5281 return 0;
5282 }
5283
5284 static int bpf_scx_check_member(const struct btf_type *t,
5285 const struct btf_member *member,
5286 const struct bpf_prog *prog)
5287 {
5288 u32 moff = __btf_member_bit_offset(t, member) / 8;
5289
5290 switch (moff) {
5291 case offsetof(struct sched_ext_ops, init_task):
5292 #ifdef CONFIG_EXT_GROUP_SCHED
5293 case offsetof(struct sched_ext_ops, cgroup_init):
5294 case offsetof(struct sched_ext_ops, cgroup_exit):
5295 case offsetof(struct sched_ext_ops, cgroup_prep_move):
5296 #endif
5297 case offsetof(struct sched_ext_ops, cpu_online):
5298 case offsetof(struct sched_ext_ops, cpu_offline):
5299 case offsetof(struct sched_ext_ops, init):
5300 case offsetof(struct sched_ext_ops, exit):
5301 break;
5302 default:
5303 if (prog->sleepable)
5304 return -EINVAL;
5305 }
5306
5307 return 0;
5308 }
5309
5310 static int bpf_scx_reg(void *kdata, struct bpf_link *link)
5311 {
5312 return scx_enable(kdata, link);
5313 }
5314
5315 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
5316 {
5317 struct sched_ext_ops *ops = kdata;
5318 struct scx_sched *sch = ops->priv;
5319
5320 scx_disable(SCX_EXIT_UNREG);
5321 kthread_flush_work(&sch->disable_work);
5322 kobject_put(&sch->kobj);
5323 }
5324
5325 static int bpf_scx_init(struct btf *btf)
5326 {
5327 task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]);
5328
5329 return 0;
5330 }
5331
5332 static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link)
5333 {
5334 /*
5335 * sched_ext does not support updating the actively-loaded BPF
5336 * scheduler, as registering a BPF scheduler can always fail if the
5337 * scheduler returns an error code for e.g. ops.init(), ops.init_task(),
5338 * etc. Similarly, we can always race with unregistration happening
5339 * elsewhere, such as with sysrq.
5340 */
5341 return -EOPNOTSUPP;
5342 }
5343
5344 static int bpf_scx_validate(void *kdata)
5345 {
5346 return 0;
5347 }
5348
5349 static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
5350 static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {}
5351 static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {}
5352 static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {}
5353 static void sched_ext_ops__tick(struct task_struct *p) {}
5354 static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {}
5355 static void sched_ext_ops__running(struct task_struct *p) {}
5356 static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {}
5357 static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {}
5358 static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; }
5359 static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; }
5360 static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {}
5361 static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {}
5362 static void sched_ext_ops__update_idle(s32 cpu, bool idle) {}
5363 static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {}
5364 static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {}
5365 static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
5366 static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {}
5367 static void sched_ext_ops__enable(struct task_struct *p) {}
5368 static void sched_ext_ops__disable(struct task_struct *p) {}
5369 #ifdef CONFIG_EXT_GROUP_SCHED
5370 static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
5371 static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {}
5372 static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
5373 static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
5374 static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
5375 static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
5376 static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
5377 static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {}
5378 #endif
5379 static void sched_ext_ops__cpu_online(s32 cpu) {}
5380 static void sched_ext_ops__cpu_offline(s32 cpu) {}
5381 static s32 sched_ext_ops__init(void) { return -EINVAL; }
5382 static void sched_ext_ops__exit(struct scx_exit_info *info) {}
5383 static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {}
5384 static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
5385 static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {}
5386
5387 static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
5388 .select_cpu = sched_ext_ops__select_cpu,
5389 .enqueue = sched_ext_ops__enqueue,
5390 .dequeue = sched_ext_ops__dequeue,
5391 .dispatch = sched_ext_ops__dispatch,
5392 .tick = sched_ext_ops__tick,
5393 .runnable = sched_ext_ops__runnable,
5394 .running = sched_ext_ops__running,
5395 .stopping = sched_ext_ops__stopping,
5396 .quiescent = sched_ext_ops__quiescent,
5397 .yield = sched_ext_ops__yield,
5398 .core_sched_before = sched_ext_ops__core_sched_before,
5399 .set_weight = sched_ext_ops__set_weight,
5400 .set_cpumask = sched_ext_ops__set_cpumask,
5401 .update_idle = sched_ext_ops__update_idle,
5402 .cpu_acquire = sched_ext_ops__cpu_acquire,
5403 .cpu_release = sched_ext_ops__cpu_release,
5404 .init_task = sched_ext_ops__init_task,
5405 .exit_task = sched_ext_ops__exit_task,
5406 .enable = sched_ext_ops__enable,
5407 .disable = sched_ext_ops__disable,
5408 #ifdef CONFIG_EXT_GROUP_SCHED
5409 .cgroup_init = sched_ext_ops__cgroup_init,
5410 .cgroup_exit = sched_ext_ops__cgroup_exit,
5411 .cgroup_prep_move = sched_ext_ops__cgroup_prep_move,
5412 .cgroup_move = sched_ext_ops__cgroup_move,
5413 .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
5414 .cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
5415 .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
5416 .cgroup_set_idle = sched_ext_ops__cgroup_set_idle,
5417 #endif
5418 .cpu_online = sched_ext_ops__cpu_online,
5419 .cpu_offline = sched_ext_ops__cpu_offline,
5420 .init = sched_ext_ops__init,
5421 .exit = sched_ext_ops__exit,
5422 .dump = sched_ext_ops__dump,
5423 .dump_cpu = sched_ext_ops__dump_cpu,
5424 .dump_task = sched_ext_ops__dump_task,
5425 };
5426
5427 static struct bpf_struct_ops bpf_sched_ext_ops = {
5428 .verifier_ops = &bpf_scx_verifier_ops,
5429 .reg = bpf_scx_reg,
5430 .unreg = bpf_scx_unreg,
5431 .check_member = bpf_scx_check_member,
5432 .init_member = bpf_scx_init_member,
5433 .init = bpf_scx_init,
5434 .update = bpf_scx_update,
5435 .validate = bpf_scx_validate,
5436 .name = "sched_ext_ops",
5437 .owner = THIS_MODULE,
5438 .cfi_stubs = &__bpf_ops_sched_ext_ops
5439 };
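/*
 * A minimal sketch of how a BPF scheduler plugs into the struct_ops above
 * (illustrative only, not part of this file). The SCX_OPS_DEFINE() and
 * BPF_STRUCT_OPS() wrappers are conventions of the scx tooling headers
 * (tools/sched_ext/include/scx/common.bpf.h) and are assumptions of this
 * example rather than kernel API:
 *
 *	void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
 *	}
 *
 *	SCX_OPS_DEFINE(minimal_ops,
 *		       .enqueue	= (void *)minimal_enqueue,
 *		       .name	= "minimal");
 *
 * Attaching the resulting struct_ops map from userspace lands in
 * bpf_scx_reg() above, which calls scx_enable(); detaching goes through
 * bpf_scx_unreg().
 */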
5440
5441
5442 /********************************************************************************
5443 * System integration and init.
5444 */
5445
5446 static void sysrq_handle_sched_ext_reset(u8 key)
5447 {
5448 scx_disable(SCX_EXIT_SYSRQ);
5449 }
5450
5451 static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
5452 .handler = sysrq_handle_sched_ext_reset,
5453 .help_msg = "reset-sched-ext(S)",
5454 .action_msg = "Disable sched_ext and revert all tasks to CFS",
5455 .enable_mask = SYSRQ_ENABLE_RTNICE,
5456 };
5457
5458 static void sysrq_handle_sched_ext_dump(u8 key)
5459 {
5460 struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
5461
5462 if (scx_enabled())
5463 scx_dump_state(&ei, 0);
5464 }
5465
5466 static const struct sysrq_key_op sysrq_sched_ext_dump_op = {
5467 .handler = sysrq_handle_sched_ext_dump,
5468 .help_msg = "dump-sched-ext(D)",
5469 .action_msg = "Trigger sched_ext debug dump",
5470 .enable_mask = SYSRQ_ENABLE_RTNICE,
5471 };
5472
5473 static bool can_skip_idle_kick(struct rq *rq)
5474 {
5475 lockdep_assert_rq_held(rq);
5476
5477 /*
5478 * We can skip idle kicking if @rq is going to go through at least one
5479 * full SCX scheduling cycle before going idle. Just checking whether
5480 * curr is not idle is insufficient because we could be racing
5481 * balance_one() trying to pull the next task from a remote rq, which
5482 * may fail, and @rq may become idle afterwards.
5483 *
5484 * The race window is small and we don't and can't guarantee that @rq is
5485 * only kicked while idle anyway. Skip only when sure.
5486 */
5487 return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE);
5488 }
5489
5490 static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
5491 {
5492 struct rq *rq = cpu_rq(cpu);
5493 struct scx_rq *this_scx = &this_rq->scx;
5494 const struct sched_class *cur_class;
5495 bool should_wait = false;
5496 unsigned long flags;
5497
5498 raw_spin_rq_lock_irqsave(rq, flags);
5499 cur_class = rq->curr->sched_class;
5500
5501 /*
5502 * During CPU hotplug, a CPU may depend on kicking itself to make
5503 * forward progress. Allow kicking self regardless of online state. If
5504 * @cpu is running a higher class task, we have no control over @cpu.
5505 * Skip kicking.
5506 */
5507 if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) &&
5508 !sched_class_above(cur_class, &ext_sched_class)) {
5509 if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
5510 if (cur_class == &ext_sched_class)
5511 rq->curr->scx.slice = 0;
5512 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
5513 }
5514
5515 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
5516 if (cur_class == &ext_sched_class) {
5517 ksyncs[cpu] = rq->scx.kick_sync;
5518 should_wait = true;
5519 } else {
5520 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
5521 }
5522 }
5523
5524 resched_curr(rq);
5525 } else {
5526 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
5527 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
5528 }
5529
5530 raw_spin_rq_unlock_irqrestore(rq, flags);
5531
5532 return should_wait;
5533 }
5534
5535 static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq)
5536 {
5537 struct rq *rq = cpu_rq(cpu);
5538 unsigned long flags;
5539
5540 raw_spin_rq_lock_irqsave(rq, flags);
5541
5542 if (!can_skip_idle_kick(rq) &&
5543 (cpu_online(cpu) || cpu == cpu_of(this_rq)))
5544 resched_curr(rq);
5545
5546 raw_spin_rq_unlock_irqrestore(rq, flags);
5547 }
5548
5549 static void kick_cpus_irq_workfn(struct irq_work *irq_work)
5550 {
5551 struct rq *this_rq = this_rq();
5552 struct scx_rq *this_scx = &this_rq->scx;
5553 struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs);
5554 bool should_wait = false;
5555 unsigned long *ksyncs;
5556 s32 cpu;
5557
5558 if (unlikely(!ksyncs_pcpu)) {
5559 pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs");
5560 return;
5561 }
5562
5563 ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;
5564
5565 for_each_cpu(cpu, this_scx->cpus_to_kick) {
5566 should_wait |= kick_one_cpu(cpu, this_rq, ksyncs);
5567 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
5568 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
5569 }
5570
5571 for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) {
5572 kick_one_cpu_if_idle(cpu, this_rq);
5573 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
5574 }
5575
5576 if (!should_wait)
5577 return;
5578
5579 for_each_cpu(cpu, this_scx->cpus_to_wait) {
5580 unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
5581
5582 /*
5583 * Busy-wait until the task running at the time of kicking is no
5584 * longer running. This can be used to implement e.g. core
5585 * scheduling.
5586 *
5587 * smp_cond_load_acquire() pairs with store_releases in
5588 * pick_task_scx() and put_prev_task_scx(). The former breaks
5589 * the wait if SCX's scheduling path is entered even if the same
5590 * task is picked subsequently. The latter is necessary to break
5591 * the wait when $cpu is taken by a higher sched class.
5592 */
5593 if (cpu != cpu_of(this_rq))
5594 smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
5595
5596 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
5597 }
5598 }
5599
5600 /**
5601 * print_scx_info - print out sched_ext scheduler state
5602 * @log_lvl: the log level to use when printing
5603 * @p: target task
5604 *
5605 * If a sched_ext scheduler is enabled, print the name and state of the
5606 * scheduler. If @p is on sched_ext, print further information about the task.
5607 *
5608 * This function can be safely called on any task as long as the task_struct
5609 * itself is accessible. While safe, this function isn't synchronized and may
5610 * print out mixed-up or garbage output of limited length.
5611 */
5612 void print_scx_info(const char *log_lvl, struct task_struct *p)
5613 {
5614 struct scx_sched *sch = scx_root;
5615 enum scx_enable_state state = scx_enable_state();
5616 const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
5617 char runnable_at_buf[22] = "?";
5618 struct sched_class *class;
5619 unsigned long runnable_at;
5620
5621 if (state == SCX_DISABLED)
5622 return;
5623
5624 /*
5625 * Carefully check if the task was running on sched_ext, and then
5626 * carefully copy the time it's been runnable, and its state.
5627 */
5628 if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
5629 class != &ext_sched_class) {
5630 printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name,
5631 scx_enable_state_str[state], all);
5632 return;
5633 }
5634
5635 if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at,
5636 sizeof(runnable_at)))
5637 scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms",
5638 jiffies_delta_msecs(runnable_at, jiffies));
5639
5640 /* print everything onto one line to conserve console space */
5641 printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
5642 log_lvl, sch->ops.name, scx_enable_state_str[state], all,
5643 runnable_at_buf);
5644 }
5645
5646 static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr)
5647 {
5648 /*
5649 * SCX schedulers often have userspace components which are sometimes
5650 * involved in critical scheduling paths. PM operations involve freezing
5651 * userspace which can lead to scheduling misbehaviors including stalls.
5652 * Let's bypass while PM operations are in progress.
5653 */
5654 switch (event) {
5655 case PM_HIBERNATION_PREPARE:
5656 case PM_SUSPEND_PREPARE:
5657 case PM_RESTORE_PREPARE:
5658 scx_bypass(true);
5659 break;
5660 case PM_POST_HIBERNATION:
5661 case PM_POST_SUSPEND:
5662 case PM_POST_RESTORE:
5663 scx_bypass(false);
5664 break;
5665 }
5666
5667 return NOTIFY_OK;
5668 }
5669
5670 static struct notifier_block scx_pm_notifier = {
5671 .notifier_call = scx_pm_handler,
5672 };
5673
5674 void __init init_sched_ext_class(void)
5675 {
5676 s32 cpu, v;
5677
5678 /*
5679 * The following is to prevent the compiler from optimizing out the enum
5680 * definitions so that BPF scheduler implementations can use them
5681 * through the generated vmlinux.h.
5682 */
5683 WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
5684 SCX_TG_ONLINE);
5685
5686 scx_idle_init_masks();
5687
5688 for_each_possible_cpu(cpu) {
5689 struct rq *rq = cpu_rq(cpu);
5690 int n = cpu_to_node(cpu);
5691
5692 init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
5693 init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
5694 INIT_LIST_HEAD(&rq->scx.runnable_list);
5695 INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
5696
5697 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n));
5698 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
5699 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
5700 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
5701 rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
5702 rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
5703
5704 if (cpu_online(cpu))
5705 cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
5706 }
5707
5708 register_sysrq_key('S', &sysrq_sched_ext_reset_op);
5709 register_sysrq_key('D', &sysrq_sched_ext_dump_op);
5710 INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
5711 }
5712
5713
5714 /********************************************************************************
5715 * Helpers that can be called from the BPF scheduler.
5716 */
5717 static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p,
5718 u64 enq_flags)
5719 {
5720 if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
5721 return false;
5722
5723 lockdep_assert_irqs_disabled();
5724
5725 if (unlikely(!p)) {
5726 scx_error(sch, "called with NULL task");
5727 return false;
5728 }
5729
5730 if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
5731 scx_error(sch, "invalid enq_flags 0x%llx", enq_flags);
5732 return false;
5733 }
5734
5735 return true;
5736 }
5737
5738 static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
5739 u64 dsq_id, u64 enq_flags)
5740 {
5741 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
5742 struct task_struct *ddsp_task;
5743
5744 ddsp_task = __this_cpu_read(direct_dispatch_task);
5745 if (ddsp_task) {
5746 mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags);
5747 return;
5748 }
5749
5750 if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
5751 scx_error(sch, "dispatch buffer overflow");
5752 return;
5753 }
5754
5755 dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){
5756 .task = p,
5757 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK,
5758 .dsq_id = dsq_id,
5759 .enq_flags = enq_flags,
5760 };
5761 }
5762
5763 __bpf_kfunc_start_defs();
5764
5765 /**
5766 * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ
5767 * @p: task_struct to insert
5768 * @dsq_id: DSQ to insert into
5769 * @slice: duration @p can run for in nsecs, 0 to keep the current value
5770 * @enq_flags: SCX_ENQ_*
5771 *
5772 * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to
5773 * call this function spuriously. Can be called from ops.enqueue(),
5774 * ops.select_cpu(), and ops.dispatch().
5775 *
5776 * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
5777 * and @p must match the task being enqueued.
5778 *
5779 * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p
5780 * will be directly inserted into the corresponding dispatch queue after
5781 * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be
5782 * inserted into the local DSQ of the CPU returned by ops.select_cpu().
5783 * @enq_flags are OR'd with the enqueue flags on the enqueue path before the
5784 * task is inserted.
5785 *
5786 * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
5787 * and this function can be called up to ops.dispatch_max_batch times to insert
5788 * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of
5789 * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the
5790 * counter.
5791 *
5792 * This function doesn't have any locking restrictions and may be called under
5793 * BPF locks (in the future when BPF introduces more flexible locking).
5794 *
5795 * @p is allowed to run for @slice. The scheduling path is triggered on slice
5796 * exhaustion. If zero, the current residual slice is maintained. If
5797 * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with
5798 * scx_bpf_kick_cpu() to trigger scheduling.
5799 *
5800 * Returns %true on successful insertion, %false on failure. On the root
5801 * scheduler, %false return triggers scheduler abort and the caller doesn't need
5802 * to check the return value.
5803 */
5804 __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id,
5805 u64 slice, u64 enq_flags)
5806 {
5807 struct scx_sched *sch;
5808
5809 guard(rcu)();
5810 sch = rcu_dereference(scx_root);
5811 if (unlikely(!sch))
5812 return false;
5813
5814 if (!scx_dsq_insert_preamble(sch, p, enq_flags))
5815 return false;
5816
5817 if (slice)
5818 p->scx.slice = slice;
5819 else
5820 p->scx.slice = p->scx.slice ?: 1;
5821
5822 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags);
5823
5824 return true;
5825 }
5826
5827 /*
5828 * COMPAT: Will be removed in v6.23 along with the ___v2 suffix.
5829 */
5830 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id,
5831 u64 slice, u64 enq_flags)
5832 {
5833 scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags);
5834 }
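/*
 * Illustrative BPF-side sketch (not part of this file): direct dispatch from
 * ops.select_cpu() as described above. scx_bpf_select_cpu_dfl() is the
 * built-in idle CPU selection kfunc provided by the idle tracking code and
 * BPF_STRUCT_OPS() comes from the scx tooling headers; both are assumptions
 * of this example:
 *
 *	s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p,
 *			   s32 prev_cpu, u64 wake_flags)
 *	{
 *		bool is_idle = false;
 *		s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
 *
 *		if (is_idle)
 *			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 *		return cpu;
 *	}
 */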
5835
5836 static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p,
5837 u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags)
5838 {
5839 if (!scx_dsq_insert_preamble(sch, p, enq_flags))
5840 return false;
5841
5842 if (slice)
5843 p->scx.slice = slice;
5844 else
5845 p->scx.slice = p->scx.slice ?: 1;
5846
5847 p->scx.dsq_vtime = vtime;
5848
5849 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
5850
5851 return true;
5852 }
5853
5854 struct scx_bpf_dsq_insert_vtime_args {
5855 /* @p can't be packed together as KF_RCU is not transitive */
5856 u64 dsq_id;
5857 u64 slice;
5858 u64 vtime;
5859 u64 enq_flags;
5860 };
5861
5862 /**
5863 * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion
5864 * @p: task_struct to insert
5865 * @args: struct containing the rest of the arguments
5866 * @args->dsq_id: DSQ to insert into
5867 * @args->slice: duration @p can run for in nsecs, 0 to keep the current value
5868 * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
5869 * @args->enq_flags: SCX_ENQ_*
5870 *
5871 * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
5872 * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided
5873 * as an inline wrapper in common.bpf.h.
5874 *
5875 * Insert @p into the vtime priority queue of the DSQ identified by
5876 * @args->dsq_id. Tasks queued into the priority queue are ordered by
5877 * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert().
5878 *
5879 * @args->vtime ordering is according to time_before64() which considers
5880 * wrapping. A numerically larger vtime may indicate an earlier position in the
5881 * ordering and vice-versa.
5882 *
5883 * A DSQ can only be used as a FIFO or priority queue at any given time and this
5884 * function must not be called on a DSQ which already has one or more FIFO tasks
5885 * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and
5886 * SCX_DSQ_GLOBAL) cannot be used as priority queues.
5887 *
5888 * Returns %true on successful insertion, %false on failure. On the root
5889 * scheduler, %false return triggers scheduler abort and the caller doesn't need
5890 * to check the return value.
5891 */
5892 __bpf_kfunc bool
5893 __scx_bpf_dsq_insert_vtime(struct task_struct *p,
5894 struct scx_bpf_dsq_insert_vtime_args *args)
5895 {
5896 struct scx_sched *sch;
5897
5898 guard(rcu)();
5899
5900 sch = rcu_dereference(scx_root);
5901 if (unlikely(!sch))
5902 return false;
5903
5904 return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice,
5905 args->vtime, args->enq_flags);
5906 }
5907
5908 /*
5909 * COMPAT: Will be removed in v6.23.
5910 */
5911 __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
5912 u64 slice, u64 vtime, u64 enq_flags)
5913 {
5914 struct scx_sched *sch;
5915
5916 guard(rcu)();
5917
5918 sch = rcu_dereference(scx_root);
5919 if (unlikely(!sch))
5920 return;
5921
5922 scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags);
5923 }
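/*
 * Illustrative BPF-side sketch (not part of this file): weighted vtime
 * insertion from ops.enqueue(). SHARED_DSQ and vtime_now are assumptions of
 * the example, maintained elsewhere by the scheduler, and the clamp keeps a
 * long-idle task from being inserted too far in the past (vtime wraparound is
 * ignored for brevity). BPF programs call the scx_bpf_dsq_insert_vtime()
 * inline wrapper from common.bpf.h, which packs the trailing arguments into
 * struct scx_bpf_dsq_insert_vtime_args and calls the kfunc above:
 *
 *	void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		u64 vtime = p->scx.dsq_vtime;
 *
 *		if (vtime < vtime_now - SCX_SLICE_DFL)
 *			vtime = vtime_now - SCX_SLICE_DFL;
 *
 *		scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime,
 *					 enq_flags);
 *	}
 */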
5924
5925 __bpf_kfunc_end_defs();
5926
5927 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
5928 BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
5929 BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_RCU)
5930 BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU)
5931 BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
5932 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
5933
5934 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
5935 .owner = THIS_MODULE,
5936 .set = &scx_kfunc_ids_enqueue_dispatch,
5937 };
5938
5939 static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
5940 struct task_struct *p, u64 dsq_id, u64 enq_flags)
5941 {
5942 struct scx_sched *sch = scx_root;
5943 struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
5944 struct rq *this_rq, *src_rq, *locked_rq;
5945 bool dispatched = false;
5946 bool in_balance;
5947 unsigned long flags;
5948
5949 if (!scx_kf_allowed_if_unlocked() &&
5950 !scx_kf_allowed(sch, SCX_KF_DISPATCH))
5951 return false;
5952
5953 /*
5954 * If the BPF scheduler keeps calling this function repeatedly, it can
5955 * cause similar live-lock conditions as consume_dispatch_q().
5956 */
5957 if (unlikely(READ_ONCE(scx_aborting)))
5958 return false;
5959
5960 /*
5961 * Can be called from either ops.dispatch() locking this_rq() or any
5962 * context where no rq lock is held. If latter, lock @p's task_rq which
5963 * we'll likely need anyway.
5964 */
5965 src_rq = task_rq(p);
5966
5967 local_irq_save(flags);
5968 this_rq = this_rq();
5969 in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE;
5970
5971 if (in_balance) {
5972 if (this_rq != src_rq) {
5973 raw_spin_rq_unlock(this_rq);
5974 raw_spin_rq_lock(src_rq);
5975 }
5976 } else {
5977 raw_spin_rq_lock(src_rq);
5978 }
5979
5980 locked_rq = src_rq;
5981 raw_spin_lock(&src_dsq->lock);
5982
5983 /*
5984 * Did someone else get to it? @p could have already left $src_dsq, got
5985 * re-enqueued, or be in the process of being consumed by someone else.
5986 */
5987 if (unlikely(p->scx.dsq != src_dsq ||
5988 u32_before(kit->cursor.priv, p->scx.dsq_seq) ||
5989 p->scx.holding_cpu >= 0) ||
5990 WARN_ON_ONCE(src_rq != task_rq(p))) {
5991 raw_spin_unlock(&src_dsq->lock);
5992 goto out;
5993 }
5994
5995 /* @p is still on $src_dsq and stable, determine the destination */
5996 dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p);
5997
5998 /*
5999 * Apply vtime and slice updates before moving so that the new time is
6000 * visible before inserting into $dst_dsq. @p is still on $src_dsq but
6001 * this is safe as we're locking it.
6002 */
6003 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
6004 p->scx.dsq_vtime = kit->vtime;
6005 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
6006 p->scx.slice = kit->slice;
6007
6008 /* execute move */
6009 locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq);
6010 dispatched = true;
6011 out:
6012 if (in_balance) {
6013 if (this_rq != locked_rq) {
6014 raw_spin_rq_unlock(locked_rq);
6015 raw_spin_rq_lock(this_rq);
6016 }
6017 } else {
6018 raw_spin_rq_unlock_irqrestore(locked_rq, flags);
6019 }
6020
6021 kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE |
6022 __SCX_DSQ_ITER_HAS_VTIME);
6023 return dispatched;
6024 }
6025
6026 __bpf_kfunc_start_defs();
6027
6028 /**
6029 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
6030 *
6031 * Can only be called from ops.dispatch().
6032 */
6033 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
6034 {
6035 struct scx_sched *sch;
6036
6037 guard(rcu)();
6038
6039 sch = rcu_dereference(scx_root);
6040 if (unlikely(!sch))
6041 return 0;
6042
6043 if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
6044 return 0;
6045
6046 return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
6047 }
6048
6049 /**
6050 * scx_bpf_dispatch_cancel - Cancel the latest dispatch
6051 *
6052 * Cancel the latest dispatch. Can be called multiple times to cancel further
6053 * dispatches. Can only be called from ops.dispatch().
6054 */
6055 __bpf_kfunc void scx_bpf_dispatch_cancel(void)
6056 {
6057 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
6058 struct scx_sched *sch;
6059
6060 guard(rcu)();
6061
6062 sch = rcu_dereference(scx_root);
6063 if (unlikely(!sch))
6064 return;
6065
6066 if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
6067 return;
6068
6069 if (dspc->cursor > 0)
6070 dspc->cursor--;
6071 else
6072 scx_error(sch, "dispatch buffer underflow");
6073 }
6074
6075 /**
6076 * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
6077 * @dsq_id: DSQ to move task from
6078 *
6079 * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
6080 * local DSQ for execution. Can only be called from ops.dispatch().
6081 *
6082 * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
6083 * before trying to move from the specified DSQ. It may also grab rq locks and
6084 * thus can't be called under any BPF locks.
6085 *
6086 * Returns %true if a task has been moved, %false if there isn't any task to
6087 * move.
6088 */
6089 __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
6090 {
6091 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
6092 struct scx_dispatch_q *dsq;
6093 struct scx_sched *sch;
6094
6095 guard(rcu)();
6096
6097 sch = rcu_dereference(scx_root);
6098 if (unlikely(!sch))
6099 return false;
6100
6101 if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
6102 return false;
6103
6104 flush_dispatch_buf(sch, dspc->rq);
6105
6106 dsq = find_user_dsq(sch, dsq_id);
6107 if (unlikely(!dsq)) {
6108 scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id);
6109 return false;
6110 }
6111
6112 if (consume_dispatch_q(sch, dspc->rq, dsq)) {
6113 /*
6114 * A successfully consumed task can be dequeued before it starts
6115 * running while the CPU is trying to migrate other dispatched
6116 * tasks. Bump nr_tasks to tell balance_one() to retry on empty
6117 * local DSQ.
6118 */
6119 dspc->nr_tasks++;
6120 return true;
6121 } else {
6122 return false;
6123 }
6124 }
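/*
 * Illustrative BPF-side sketch (not part of this file): a typical
 * ops.dispatch() simply tries to refill the local DSQ from a shared user DSQ.
 * SHARED_DSQ is an assumed ID created by the scheduler in ops.init() and
 * BPF_STRUCT_OPS() comes from the scx tooling headers:
 *
 *	void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		scx_bpf_dsq_move_to_local(SHARED_DSQ);
 *	}
 */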
6125
6126 /**
6127 * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
6128 * @it__iter: DSQ iterator in progress
6129 * @slice: duration the moved task can run for in nsecs
6130 *
6131 * Override the slice of the next task that will be moved from @it__iter using
6132 * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous
6133 * slice duration is kept.
6134 */
6135 __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter,
6136 u64 slice)
6137 {
6138 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
6139
6140 kit->slice = slice;
6141 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
6142 }
6143
6144 /**
6145 * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
6146 * @it__iter: DSQ iterator in progress
6147 * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
6148 *
6149 * Override the vtime of the next task that will be moved from @it__iter using
6150 * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice
6151 * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the
6152 * override is ignored and cleared.
6153 */
6154 __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
6155 u64 vtime)
6156 {
6157 struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
6158
6159 kit->vtime = vtime;
6160 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
6161 }
6162
6163 /**
6164 * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
6165 * @it__iter: DSQ iterator in progress
6166 * @p: task to transfer
6167 * @dsq_id: DSQ to move @p to
6168 * @enq_flags: SCX_ENQ_*
6169 *
6170 * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ
6171 * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can
6172 * be the destination.
6173 *
6174 * For the transfer to be successful, @p must still be on the DSQ and have been
6175 * queued before the DSQ iteration started. This function doesn't care whether
6176 * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have
6177 * been queued before the iteration started.
6178 *
6179 * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update.
6180 *
6181 * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
6182 * lock (e.g. BPF timers or SYSCALL programs).
6183 *
6184 * Returns %true if @p has been consumed, %false if @p had already been
6185 * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local
6186 * DSQ.
6187 */
6188 __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
6189 struct task_struct *p, u64 dsq_id,
6190 u64 enq_flags)
6191 {
6192 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
6193 p, dsq_id, enq_flags);
6194 }
6195
6196 /**
6197 * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
6198 * @it__iter: DSQ iterator in progress
6199 * @p: task to transfer
6200 * @dsq_id: DSQ to move @p to
6201 * @enq_flags: SCX_ENQ_*
6202 *
6203 * Transfer @p which is on the DSQ currently iterated by @it__iter to the
6204 * priority queue of the DSQ specified by @dsq_id. The destination must be a
6205 * user DSQ as only user DSQs support priority queue.
6206 *
6207 * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice()
6208 * and scx_bpf_dsq_move_set_vtime() to update.
6209 *
6210 * All other aspects are identical to scx_bpf_dsq_move(). See
6211 * scx_bpf_dsq_insert_vtime() for more information on @vtime.
6212 */
6213 __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
6214 struct task_struct *p, u64 dsq_id,
6215 u64 enq_flags)
6216 {
6217 return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
6218 p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
6219 }
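/*
 * Illustrative BPF-side sketch (not part of this file): walk an assumed user
 * DSQ SRC_DSQ and move the first task that can run on @cpu to that CPU's
 * local DSQ. bpf_for_each() and BPF_FOR_EACH_ITER are conventions of the
 * BPF/scx tooling headers and are assumptions of this example; the iterator
 * kfuncs themselves are defined further below:
 *
 *	struct task_struct *p;
 *
 *	bpf_for_each(scx_dsq, p, SRC_DSQ, 0) {
 *		if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
 *			continue;
 *		scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, SCX_SLICE_DFL);
 *		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
 *				     SCX_DSQ_LOCAL_ON | cpu, 0))
 *			break;
 *	}
 */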
6220
6221 __bpf_kfunc_end_defs();
6222
6223 BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
6224 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
6225 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
6226 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
6227 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
6228 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
6229 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
6230 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
6231 BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
6232
6233 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
6234 .owner = THIS_MODULE,
6235 .set = &scx_kfunc_ids_dispatch,
6236 };
6237
6238 static u32 reenq_local(struct rq *rq)
6239 {
6240 LIST_HEAD(tasks);
6241 u32 nr_enqueued = 0;
6242 struct task_struct *p, *n;
6243
6244 lockdep_assert_rq_held(rq);
6245
6246 /*
6247 * The BPF scheduler may choose to dispatch tasks back to
6248 * @rq->scx.local_dsq. Move all candidate tasks off to a private list
6249 * first to avoid processing the same tasks repeatedly.
6250 */
6251 list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
6252 scx.dsq_list.node) {
6253 /*
6254 * If @p is being migrated, @p's current CPU may not agree with
6255 * its allowed CPUs and the migration_cpu_stop is about to
6256 * deactivate and re-activate @p anyway. Skip re-enqueueing.
6257 *
6258 * While racing sched property changes may also dequeue and
6259 * re-enqueue a migrating task while its current CPU and allowed
6260 * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
6261 * the current local DSQ for running tasks and thus are not
6262 * visible to the BPF scheduler.
6263 */
6264 if (p->migration_pending)
6265 continue;
6266
6267 dispatch_dequeue(rq, p);
6268 list_add_tail(&p->scx.dsq_list.node, &tasks);
6269 }
6270
6271 list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
6272 list_del_init(&p->scx.dsq_list.node);
6273 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
6274 nr_enqueued++;
6275 }
6276
6277 return nr_enqueued;
6278 }
6279
6280 __bpf_kfunc_start_defs();
6281
6282 /**
6283 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
6284 *
6285 * Iterate over all of the tasks currently enqueued on the local DSQ of the
6286 * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
6287 * processed tasks. Can only be called from ops.cpu_release().
6288 *
6289 * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void
6290 * returning variant that can be called from anywhere.
6291 */
6292 __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
6293 {
6294 struct scx_sched *sch;
6295 struct rq *rq;
6296
6297 guard(rcu)();
6298 sch = rcu_dereference(scx_root);
6299 if (unlikely(!sch))
6300 return 0;
6301
6302 if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE))
6303 return 0;
6304
6305 rq = cpu_rq(smp_processor_id());
6306 lockdep_assert_rq_held(rq);
6307
6308 return reenq_local(rq);
6309 }
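/*
 * Illustrative BPF-side sketch (not part of this file): when a higher
 * priority sched class takes over the CPU, hand the tasks already sitting on
 * its local DSQ back to the BPF scheduler so they can be placed elsewhere.
 * BPF_STRUCT_OPS() is assumed from the scx tooling headers:
 *
 *	void BPF_STRUCT_OPS(sketch_cpu_release, s32 cpu,
 *			    struct scx_cpu_release_args *args)
 *	{
 *		scx_bpf_reenqueue_local();
 *	}
 */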
6310
6311 __bpf_kfunc_end_defs();
6312
6313 BTF_KFUNCS_START(scx_kfunc_ids_cpu_release)
6314 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local)
6315 BTF_KFUNCS_END(scx_kfunc_ids_cpu_release)
6316
6317 static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = {
6318 .owner = THIS_MODULE,
6319 .set = &scx_kfunc_ids_cpu_release,
6320 };
6321
6322 __bpf_kfunc_start_defs();
6323
6324 /**
6325 * scx_bpf_create_dsq - Create a custom DSQ
6326 * @dsq_id: DSQ to create
6327 * @node: NUMA node to allocate from
6328 *
6329 * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable
6330 * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
6331 */
6332 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
6333 {
6334 struct scx_dispatch_q *dsq;
6335 struct scx_sched *sch;
6336 s32 ret;
6337
6338 if (unlikely(node >= (int)nr_node_ids ||
6339 (node < 0 && node != NUMA_NO_NODE)))
6340 return -EINVAL;
6341
6342 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN))
6343 return -EINVAL;
6344
6345 dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
6346 if (!dsq)
6347 return -ENOMEM;
6348
6349 init_dsq(dsq, dsq_id);
6350
6351 rcu_read_lock();
6352
6353 sch = rcu_dereference(scx_root);
6354 if (sch)
6355 ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
6356 dsq_hash_params);
6357 else
6358 ret = -ENODEV;
6359
6360 rcu_read_unlock();
6361 if (ret)
6362 kfree(dsq);
6363 return ret;
6364 }
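/*
 * Illustrative BPF-side sketch (not part of this file): user DSQs are
 * typically created from the sleepable ops.init() callback. SHARED_DSQ is an
 * arbitrary ID chosen by the example, -1 stands for NUMA_NO_NODE, and
 * BPF_STRUCT_OPS_SLEEPABLE() is assumed from the scx tooling headers:
 *
 *	s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_init)
 *	{
 *		return scx_bpf_create_dsq(SHARED_DSQ, -1);
 *	}
 */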
6365
6366 __bpf_kfunc_end_defs();
6367
6368 BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
6369 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
6370 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
6371 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
6372 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
6373 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
6374 BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
6375
6376 static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
6377 .owner = THIS_MODULE,
6378 .set = &scx_kfunc_ids_unlocked,
6379 };
6380
6381 __bpf_kfunc_start_defs();
6382
6383 /**
6384 * scx_bpf_task_set_slice - Set task's time slice
6385 * @p: task of interest
6386 * @slice: time slice to set in nsecs
6387 *
6388 * Set @p's time slice to @slice. Returns %true on success, %false if the
6389 * calling scheduler doesn't have authority over @p.
6390 */
6391 __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
6392 {
6393 p->scx.slice = slice;
6394 return true;
6395 }
6396
6397 /**
6398 * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering
6399 * @p: task of interest
6400 * @vtime: virtual time to set
6401 *
6402 * Set @p's virtual time to @vtime. Returns %true on success, %false if the
6403 * calling scheduler doesn't have authority over @p.
6404 */
6405 __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime)
6406 {
6407 p->scx.dsq_vtime = vtime;
6408 return true;
6409 }
6410
6411 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
6412 {
6413 struct rq *this_rq;
6414 unsigned long irq_flags;
6415
6416 if (!ops_cpu_valid(sch, cpu, NULL))
6417 return;
6418
6419 local_irq_save(irq_flags);
6420
6421 this_rq = this_rq();
6422
6423 /*
6424 * While bypassing for PM ops, IRQ handling may not be online which can
6425 * lead to irq_work_queue() malfunction such as infinite busy wait for
6426 * IRQ status update. Suppress kicking.
6427 */
6428 if (scx_rq_bypassing(this_rq))
6429 goto out;
6430
6431 /*
6432 * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting
6433 * rq locks. We can probably be smarter and avoid bouncing if called
6434 * from ops which don't hold a rq lock.
6435 */
6436 if (flags & SCX_KICK_IDLE) {
6437 struct rq *target_rq = cpu_rq(cpu);
6438
6439 if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
6440 scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
6441
6442 if (raw_spin_rq_trylock(target_rq)) {
6443 if (can_skip_idle_kick(target_rq)) {
6444 raw_spin_rq_unlock(target_rq);
6445 goto out;
6446 }
6447 raw_spin_rq_unlock(target_rq);
6448 }
6449 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle);
6450 } else {
6451 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
6452
6453 if (flags & SCX_KICK_PREEMPT)
6454 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
6455 if (flags & SCX_KICK_WAIT)
6456 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait);
6457 }
6458
6459 irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
6460 out:
6461 local_irq_restore(irq_flags);
6462 }
6463
6464 /**
6465 * scx_bpf_kick_cpu - Trigger reschedule on a CPU
6466 * @cpu: cpu to kick
6467 * @flags: %SCX_KICK_* flags
6468 *
6469 * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
6470 * trigger rescheduling on a busy CPU. This can be called from any online
6471 * scx_ops operation and the actual kicking is performed asynchronously through
6472 * an irq work.
6473 */
6474 __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
6475 {
6476 struct scx_sched *sch;
6477
6478 guard(rcu)();
6479 sch = rcu_dereference(scx_root);
6480 if (likely(sch))
6481 scx_kick_cpu(sch, cpu, flags);
6482 }
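/*
 * Illustrative BPF-side sketch (not part of this file): after queueing a task
 * on a shared DSQ, wake up a chosen CPU so it runs ops.dispatch() and can
 * pull the task. target_cpu and SHARED_DSQ are assumptions of the example;
 * SCX_KICK_IDLE leaves the CPU alone if it's already busy:
 *
 *	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
 *	scx_bpf_kick_cpu(target_cpu, SCX_KICK_IDLE);
 */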
6483
6484 /**
6485 * scx_bpf_dsq_nr_queued - Return the number of queued tasks
6486 * @dsq_id: id of the DSQ
6487 *
6488 * Return the number of tasks in the DSQ matching @dsq_id. If not found,
6489 * -%ENOENT is returned.
6490 */
6491 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
6492 {
6493 struct scx_sched *sch;
6494 struct scx_dispatch_q *dsq;
6495 s32 ret;
6496
6497 preempt_disable();
6498
6499 sch = rcu_dereference_sched(scx_root);
6500 if (unlikely(!sch)) {
6501 ret = -ENODEV;
6502 goto out;
6503 }
6504
6505 if (dsq_id == SCX_DSQ_LOCAL) {
6506 ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
6507 goto out;
6508 } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
6509 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
6510
6511 if (ops_cpu_valid(sch, cpu, NULL)) {
6512 ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
6513 goto out;
6514 }
6515 } else {
6516 dsq = find_user_dsq(sch, dsq_id);
6517 if (dsq) {
6518 ret = READ_ONCE(dsq->nr);
6519 goto out;
6520 }
6521 }
6522 ret = -ENOENT;
6523 out:
6524 preempt_enable();
6525 return ret;
6526 }
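/*
 * Illustrative BPF-side sketch (not part of this file): only bother kicking
 * a CPU when the assumed shared DSQ actually has backlog:
 *
 *	if (scx_bpf_dsq_nr_queued(SHARED_DSQ) > 0)
 *		scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
 */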
6527
6528 /**
6529 * scx_bpf_destroy_dsq - Destroy a custom DSQ
6530 * @dsq_id: DSQ to destroy
6531 *
6532 * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with
6533 * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is
6534 * empty and no further tasks are dispatched to it. Ignored if called on a DSQ
6535 * which doesn't exist. Can be called from any online scx_ops operation.
6536 */
6537 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
6538 {
6539 struct scx_sched *sch;
6540
6541 rcu_read_lock();
6542 sch = rcu_dereference(scx_root);
6543 if (sch)
6544 destroy_dsq(sch, dsq_id);
6545 rcu_read_unlock();
6546 }
6547
6548 /**
6549 * bpf_iter_scx_dsq_new - Create a DSQ iterator
6550 * @it: iterator to initialize
6551 * @dsq_id: DSQ to iterate
6552 * @flags: %SCX_DSQ_ITER_*
6553 *
6554 * Initialize BPF iterator @it which can be used with bpf_for_each() to walk
6555 * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes
6556 * tasks which are already queued when this function is invoked.
6557 */
6558 __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
6559 u64 flags)
6560 {
6561 struct bpf_iter_scx_dsq_kern *kit = (void *)it;
6562 struct scx_sched *sch;
6563
6564 BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
6565 sizeof(struct bpf_iter_scx_dsq));
6566 BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
6567 __alignof__(struct bpf_iter_scx_dsq));
6568 BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
6569 ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
6570
6571 /*
6572 * next() and destroy() will be called regardless of the return value.
6573 * Always clear $kit->dsq.
6574 */
6575 kit->dsq = NULL;
6576
6577 sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held());
6578 if (unlikely(!sch))
6579 return -ENODEV;
6580
6581 if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
6582 return -EINVAL;
6583
6584 kit->dsq = find_user_dsq(sch, dsq_id);
6585 if (!kit->dsq)
6586 return -ENOENT;
6587
6588 kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags,
6589 READ_ONCE(kit->dsq->seq));
6590
6591 return 0;
6592 }
6593
6594 /**
6595 * bpf_iter_scx_dsq_next - Progress a DSQ iterator
6596 * @it: iterator to progress
6597 *
6598 * Return the next task. See bpf_iter_scx_dsq_new().
6599 */
6600 __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
6601 {
6602 struct bpf_iter_scx_dsq_kern *kit = (void *)it;
6603 bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV;
6604 struct task_struct *p;
6605 unsigned long flags;
6606
6607 if (!kit->dsq)
6608 return NULL;
6609
6610 raw_spin_lock_irqsave(&kit->dsq->lock, flags);
6611
6612 if (list_empty(&kit->cursor.node))
6613 p = NULL;
6614 else
6615 p = container_of(&kit->cursor, struct task_struct, scx.dsq_list);
6616
6617 /*
6618 * Only tasks which were queued before the iteration started are
6619 * visible. This bounds BPF iterations and guarantees that vtime never
6620 * jumps in the other direction while iterating.
6621 */
6622 do {
6623 p = nldsq_next_task(kit->dsq, p, rev);
6624 } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq)));
6625
6626 if (p) {
6627 if (rev)
6628 list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node);
6629 else
6630 list_move(&kit->cursor.node, &p->scx.dsq_list.node);
6631 } else {
6632 list_del_init(&kit->cursor.node);
6633 }
6634
6635 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
6636
6637 return p;
6638 }
6639
6640 /**
6641 * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
6642 * @it: iterator to destroy
6643 *
6644 * Undo bpf_iter_scx_dsq_new().
6645 */
6646 __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
6647 {
6648 struct bpf_iter_scx_dsq_kern *kit = (void *)it;
6649
6650 if (!kit->dsq)
6651 return;
6652
6653 if (!list_empty(&kit->cursor.node)) {
6654 unsigned long flags;
6655
6656 raw_spin_lock_irqsave(&kit->dsq->lock, flags);
6657 list_del_init(&kit->cursor.node);
6658 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
6659 }
6660 kit->dsq = NULL;
6661 }
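
/*
 * Usage sketch for the iterator kfuncs above (illustrative only; assumes a
 * BPF scheduler built with vmlinux.h and the scx common headers, and a
 * hypothetical user DSQ id MY_DSQ_ID). bpf_for_each() expands into the
 * new/next/destroy calls defined here:
 *
 *	struct task_struct *p;
 *
 *	bpf_for_each(scx_dsq, p, MY_DSQ_ID, 0) {
 *		// only tasks queued before the iteration started are visited
 *		bpf_printk("queued: %s[%d]", p->comm, p->pid);
 *	}
 */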
6662
6663 /**
6664 * scx_bpf_dsq_peek - Lockless peek at the first element.
6665 * @dsq_id: DSQ to examine.
6666 *
6667 * Read the first element in the DSQ. This is semantically equivalent to using
6668 * the DSQ iterator, but is lock-free. Of course, like any lockless operation,
6669 * this provides only a point-in-time snapshot, and the contents may change
6670 * by the time any subsequent locking operation reads the queue.
6671 *
6672 * Returns the first task, or NULL on an empty queue or internal error.
6673 */
6674 __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id)
6675 {
6676 struct scx_sched *sch;
6677 struct scx_dispatch_q *dsq;
6678
6679 sch = rcu_dereference(scx_root);
6680 if (unlikely(!sch))
6681 return NULL;
6682
6683 if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) {
6684 scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id);
6685 return NULL;
6686 }
6687
6688 dsq = find_user_dsq(sch, dsq_id);
6689 if (unlikely(!dsq)) {
6690 scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id);
6691 return NULL;
6692 }
6693
6694 return rcu_dereference(dsq->first_task);
6695 }
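
/*
 * Illustrative BPF-side sketch (MY_DSQ_ID is hypothetical). The kfunc is
 * registered KF_RCU_PROTECTED, so it must be used in an RCU-protected
 * context, and the result is only a point-in-time snapshot:
 *
 *	struct task_struct *first = scx_bpf_dsq_peek(MY_DSQ_ID);
 *
 *	if (first)
 *		bpf_printk("DSQ head: %s[%d]", first->comm, first->pid);
 */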
6696
6697 __bpf_kfunc_end_defs();
6698
6699 static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
6700 size_t line_size, char *fmt, unsigned long long *data,
6701 u32 data__sz)
6702 {
6703 struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
6704 s32 ret;
6705
6706 if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
6707 (data__sz && !data)) {
6708 scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz);
6709 return -EINVAL;
6710 }
6711
6712 ret = copy_from_kernel_nofault(data_buf, data, data__sz);
6713 if (ret < 0) {
6714 scx_error(sch, "failed to read data fields (%d)", ret);
6715 return ret;
6716 }
6717
6718 ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
6719 &bprintf_data);
6720 if (ret < 0) {
6721 scx_error(sch, "format preparation failed (%d)", ret);
6722 return ret;
6723 }
6724
6725 ret = bstr_printf(line_buf, line_size, fmt,
6726 bprintf_data.bin_args);
6727 bpf_bprintf_cleanup(&bprintf_data);
6728 if (ret < 0) {
6729 scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
6730 return ret;
6731 }
6732
6733 return ret;
6734 }
6735
6736 static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf,
6737 char *fmt, unsigned long long *data, u32 data__sz)
6738 {
6739 return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line),
6740 fmt, data, data__sz);
6741 }
6742
6743 __bpf_kfunc_start_defs();
6744
6745 /**
6746 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
6747 * @exit_code: Exit value to pass to user space via struct scx_exit_info.
6748 * @fmt: error message format string
6749 * @data: format string parameters packaged using ___bpf_fill() macro
6750 * @data__sz: @data len, must end in '__sz' for the verifier
6751 *
6752 * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
6753 * disabling.
6754 */
6755 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
6756 unsigned long long *data, u32 data__sz)
6757 {
6758 struct scx_sched *sch;
6759 unsigned long flags;
6760
6761 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
6762 sch = rcu_dereference_bh(scx_root);
6763 if (likely(sch) &&
6764 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
6765 scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
6766 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
6767 }
6768
6769 /**
6770 * scx_bpf_error_bstr - Indicate fatal error
6771 * @fmt: error message format string
6772 * @data: format string parameters packaged using ___bpf_fill() macro
6773 * @data__sz: @data len, must end in '__sz' for the verifier
6774 *
6775 * Indicate that the BPF scheduler encountered a fatal error and initiate ops
6776 * disabling.
6777 */
6778 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
6779 u32 data__sz)
6780 {
6781 struct scx_sched *sch;
6782 unsigned long flags;
6783
6784 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
6785 sch = rcu_dereference_bh(scx_root);
6786 if (likely(sch) &&
6787 bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
6788 scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
6789 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
6790 }
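
/*
 * From BPF these two kfuncs are normally reached through the scx_bpf_exit()
 * and scx_bpf_error() convenience macros in the scx common headers, which
 * package the varargs with ___bpf_fill() into @data/@data__sz. A minimal
 * sketch (illustrative; nr_queued and QUEUE_LIMIT are hypothetical):
 *
 *	if (nr_queued > QUEUE_LIMIT)
 *		scx_bpf_error("queue overflow: %u > %u", nr_queued, QUEUE_LIMIT);
 */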
6791
6792 /**
6793 * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler
6794 * @fmt: format string
6795 * @data: format string parameters packaged using ___bpf_fill() macro
6796 * @data__sz: @data len, must end in '__sz' for the verifier
6797 *
6798 * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and
6799 * dump_task() to generate extra debug dump specific to the BPF scheduler.
6800 *
6801 * The extra dump may be multiple lines. A single line may be split over
6802 * multiple calls. The last line is automatically terminated.
6803 */
6804 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
6805 u32 data__sz)
6806 {
6807 struct scx_sched *sch;
6808 struct scx_dump_data *dd = &scx_dump_data;
6809 struct scx_bstr_buf *buf = &dd->buf;
6810 s32 ret;
6811
6812 guard(rcu)();
6813
6814 sch = rcu_dereference(scx_root);
6815 if (unlikely(!sch))
6816 return;
6817
6818 if (raw_smp_processor_id() != dd->cpu) {
6819 scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends");
6820 return;
6821 }
6822
6823 /* append the formatted string to the line buf */
6824 ret = __bstr_format(sch, buf->data, buf->line + dd->cursor,
6825 sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
6826 if (ret < 0) {
6827 dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
6828 dd->prefix, fmt, data, data__sz, ret);
6829 return;
6830 }
6831
6832 dd->cursor += ret;
6833 dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line));
6834
6835 if (!dd->cursor)
6836 return;
6837
6838 /*
6839 * If the line buf overflowed or ends in a newline, flush it into the
6840 * dump. This is to allow the caller to generate a single line over
6841 * multiple calls. As ops_dump_flush() can also handle multiple lines in
6842 * the line buf, the only case which can lead to an unexpected
6843 * truncation is when consecutive calls keep generating newlines in the
6844 * middle instead of at the end. Don't do that.
6845 */
6846 if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n')
6847 ops_dump_flush();
6848 }
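
/*
 * Illustrative BPF-side sketch; typically reached through the scx_bpf_dump()
 * macro from the scx common headers. The callback and counter names are
 * hypothetical:
 *
 *	void BPF_STRUCT_OPS(myname_dump, struct scx_dump_ctx *dctx)
 *	{
 *		scx_bpf_dump("nr_queued=%llu\n", nr_queued);
 *	}
 */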
6849
6850 /**
6851 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
6852 *
6853 * Iterate over all of the tasks currently enqueued on the local DSQ of the
6854 * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
6855 * anywhere.
6856 */
6857 __bpf_kfunc void scx_bpf_reenqueue_local___v2(void)
6858 {
6859 struct rq *rq;
6860
6861 guard(preempt)();
6862
6863 rq = this_rq();
6864 local_set(&rq->scx.reenq_local_deferred, 1);
6865 schedule_deferred(rq);
6866 }
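
/*
 * Typical use is from ops.cpu_release() to hand tasks that were queued on the
 * local DSQ back to the BPF scheduler when a higher priority sched class takes
 * over the CPU. Illustrative BPF-side sketch (callback name is hypothetical):
 *
 *	void BPF_STRUCT_OPS(myname_cpu_release, s32 cpu,
 *			    struct scx_cpu_release_args *args)
 *	{
 *		scx_bpf_reenqueue_local();
 *	}
 */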
6867
6868 /**
6869 * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
6870 * @cpu: CPU of interest
6871 *
6872 * Return the maximum relative capacity of @cpu in relation to the most
6873 * performant CPU in the system. The return value is in the range [1,
6874 * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur().
6875 */
6876 __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
6877 {
6878 struct scx_sched *sch;
6879
6880 guard(rcu)();
6881
6882 sch = rcu_dereference(scx_root);
6883 if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
6884 return arch_scale_cpu_capacity(cpu);
6885 else
6886 return SCX_CPUPERF_ONE;
6887 }
6888
6889 /**
6890 * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
6891 * @cpu: CPU of interest
6892 *
6893 * Return the current relative performance of @cpu in relation to its maximum.
6894 * The return value is in the range [1, %SCX_CPUPERF_ONE].
6895 *
6896 * The current performance level of a CPU in relation to the maximum performance
6897 * available in the system can be calculated as follows:
6898 *
6899 * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE
6900 *
6901 * The result is in the range [1, %SCX_CPUPERF_ONE].
6902 */
6903 __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
6904 {
6905 struct scx_sched *sch;
6906
6907 guard(rcu)();
6908
6909 sch = rcu_dereference(scx_root);
6910 if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
6911 return arch_scale_freq_capacity(cpu);
6912 else
6913 return SCX_CPUPERF_ONE;
6914 }
6915
6916 /**
6917 * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
6918 * @cpu: CPU of interest
6919 * @perf: target performance level [0, %SCX_CPUPERF_ONE]
6920 *
6921 * Set the target performance level of @cpu to @perf. @perf is in linear
6922 * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
6923 * schedutil cpufreq governor chooses the target frequency.
6924 *
6925 * The actual performance level chosen, CPU grouping, and the overhead and
6926 * latency of the operations are dependent on the hardware and cpufreq driver in
6927 * use. Consult hardware and cpufreq documentation for more information. The
6928 * current performance level can be monitored using scx_bpf_cpuperf_cur().
6929 */
6930 __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
6931 {
6932 struct scx_sched *sch;
6933
6934 guard(rcu)();
6935
6936 sch = rcu_dereference(scx_root);
6937 if (unlikely(!sch))
6938 return;
6939
6940 if (unlikely(perf > SCX_CPUPERF_ONE)) {
6941 scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu);
6942 return;
6943 }
6944
6945 if (ops_cpu_valid(sch, cpu, NULL)) {
6946 struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
6947 struct rq_flags rf;
6948
6949 /*
6950 * When called with an rq lock held, restrict the operation
6951 * to the corresponding CPU to prevent ABBA deadlocks.
6952 */
6953 if (locked_rq && rq != locked_rq) {
6954 scx_error(sch, "Invalid target CPU %d", cpu);
6955 return;
6956 }
6957
6958 /*
6959 * If no rq lock is held, allow to operate on any CPU by
6960 * acquiring the corresponding rq lock.
6961 */
6962 if (!locked_rq) {
6963 rq_lock_irqsave(rq, &rf);
6964 update_rq_clock(rq);
6965 }
6966
6967 rq->scx.cpuperf_target = perf;
6968 cpufreq_update_util(rq, 0);
6969
6970 if (!locked_rq)
6971 rq_unlock_irqrestore(rq, &rf);
6972 }
6973 }
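
/*
 * Illustrative BPF-side sketch tying the three cpuperf kfuncs together
 * ('cpu' is assumed to be a valid CPU number in the caller's context):
 *
 *	u64 cap = scx_bpf_cpuperf_cap(cpu);
 *	u64 cur = scx_bpf_cpuperf_cur(cpu);
 *	u64 level = cap * cur / SCX_CPUPERF_ONE;	// relative to system max
 *
 *	// ask schedutil to run @cpu at roughly half of its own maximum
 *	scx_bpf_cpuperf_set(cpu, SCX_CPUPERF_ONE / 2);
 */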
6974
6975 /**
6976 * scx_bpf_nr_node_ids - Return the number of possible node IDs
6977 *
6978 * All valid node IDs in the system are smaller than the returned value.
6979 */
6980 __bpf_kfunc u32 scx_bpf_nr_node_ids(void)
6981 {
6982 return nr_node_ids;
6983 }
6984
6985 /**
6986 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
6987 *
6988 * All valid CPU IDs in the system are smaller than the returned value.
6989 */
6990 __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
6991 {
6992 return nr_cpu_ids;
6993 }
6994
6995 /**
6996 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
6997 */
6998 __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
6999 {
7000 return cpu_possible_mask;
7001 }
7002
7003 /**
7004 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
7005 */
7006 __bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
7007 {
7008 return cpu_online_mask;
7009 }
7010
7011 /**
7012 * scx_bpf_put_cpumask - Release a possible/online cpumask
7013 * @cpumask: cpumask to release
7014 */
7015 __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
7016 {
7017 /*
7018 * Empty function body because we aren't actually acquiring or releasing
7019 * a reference to a global cpumask, which is read-only in the caller and
7020 * is never released. The acquire / release semantics here are just used
7021 * to make the cpumask a trusted pointer in the caller.
7022 */
7023 }
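
/*
 * Illustrative acquire/release pattern from BPF (sketch only; assumes the
 * bpf_cpumask_weight() kfunc is available):
 *
 *	const struct cpumask *online = scx_bpf_get_online_cpumask();
 *	u32 nr_online = bpf_cpumask_weight(online);
 *
 *	scx_bpf_put_cpumask(online);
 */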
7024
7025 /**
7026 * scx_bpf_task_running - Is task currently running?
7027 * @p: task of interest
7028 */
7029 __bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
7030 {
7031 return task_rq(p)->curr == p;
7032 }
7033
7034 /**
7035 * scx_bpf_task_cpu - CPU a task is currently associated with
7036 * @p: task of interest
7037 */
7038 __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
7039 {
7040 return task_cpu(p);
7041 }
7042
7043 /**
7044 * scx_bpf_cpu_rq - Fetch the rq of a CPU
7045 * @cpu: CPU of the rq
7046 */
7047 __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
7048 {
7049 struct scx_sched *sch;
7050
7051 guard(rcu)();
7052
7053 sch = rcu_dereference(scx_root);
7054 if (unlikely(!sch))
7055 return NULL;
7056
7057 if (!ops_cpu_valid(sch, cpu, NULL))
7058 return NULL;
7059
7060 if (!sch->warned_deprecated_rq) {
7061 printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; "
7062 "use scx_bpf_locked_rq() when holding rq lock "
7063 "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__);
7064 sch->warned_deprecated_rq = true;
7065 }
7066
7067 return cpu_rq(cpu);
7068 }
7069
7070 /**
7071 * scx_bpf_locked_rq - Return the rq currently locked by SCX
7072 *
7073 * Returns the rq if a rq lock is currently held by SCX.
7074 * Otherwise emits an error and returns NULL.
7075 */
7076 __bpf_kfunc struct rq *scx_bpf_locked_rq(void)
7077 {
7078 struct scx_sched *sch;
7079 struct rq *rq;
7080
7081 guard(preempt)();
7082
7083 sch = rcu_dereference_sched(scx_root);
7084 if (unlikely(!sch))
7085 return NULL;
7086
7087 rq = scx_locked_rq();
7088 if (!rq) {
7089 scx_error(sch, "accessing rq without holding rq lock");
7090 return NULL;
7091 }
7092
7093 return rq;
7094 }
7095
7096 /**
7097 * scx_bpf_cpu_curr - Return remote CPU's curr task
7098 * @cpu: CPU of interest
7099 *
7100 * Callers must hold RCU read lock (KF_RCU).
7101 */
7102 __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu)
7103 {
7104 struct scx_sched *sch;
7105
7106 guard(rcu)();
7107
7108 sch = rcu_dereference(scx_root);
7109 if (unlikely(!sch))
7110 return NULL;
7111
7112 if (!ops_cpu_valid(sch, cpu, NULL))
7113 return NULL;
7114
7115 return rcu_dereference(cpu_rq(cpu)->curr);
7116 }
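
/*
 * Illustrative BPF-side sketch. The kfunc is registered KF_RCU_PROTECTED, so
 * the caller wraps it in an explicit RCU read-side section ('cpu' is assumed
 * valid):
 *
 *	struct task_struct *curr;
 *
 *	bpf_rcu_read_lock();
 *	curr = scx_bpf_cpu_curr(cpu);
 *	if (curr)
 *		bpf_printk("cpu%d running %s[%d]", cpu, curr->comm, curr->pid);
 *	bpf_rcu_read_unlock();
 */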
7117
7118 /**
7119 * scx_bpf_task_cgroup - Return the sched cgroup of a task
7120 * @p: task of interest
7121 *
7122 * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
7123 * from the scheduler's POV. SCX operations should use this function to
7124 * determine @p's current cgroup as, unlike following @p->cgroups,
7125 * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
7126 * rq-locked operations. Can be called on the parameter tasks of rq-locked
7127 * operations. The restriction guarantees that @p's rq is locked by the caller.
7128 */
7129 #ifdef CONFIG_CGROUP_SCHED
7130 __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
7131 {
7132 struct task_group *tg = p->sched_task_group;
7133 struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
7134 struct scx_sched *sch;
7135
7136 guard(rcu)();
7137
7138 sch = rcu_dereference(scx_root);
7139 if (unlikely(!sch))
7140 goto out;
7141
7142 if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p))
7143 goto out;
7144
7145 cgrp = tg_cgrp(tg);
7146
7147 out:
7148 cgroup_get(cgrp);
7149 return cgrp;
7150 }
7151 #endif
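
/*
 * Illustrative BPF-side sketch for scx_bpf_task_cgroup(), e.g. from
 * ops.enqueue() on the task being enqueued. The kfunc is KF_ACQUIRE, so the
 * returned cgroup must be released with bpf_cgroup_release():
 *
 *	struct cgroup *cgrp = scx_bpf_task_cgroup(p);
 *	u64 cgid = cgrp->kn->id;
 *
 *	bpf_cgroup_release(cgrp);
 */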
7152
7153 /**
7154 * scx_bpf_now - Returns a high-performance monotonically non-decreasing
7155 * clock for the current CPU. The clock returned is in nanoseconds.
7156 *
7157 * It provides the following properties:
7158 *
7159 * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently
7160 * to account for execution time and track tasks' runtime properties.
7161 * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which
7162 * eventually reads a hardware timestamp counter -- is neither performant nor
7163 * scalable. scx_bpf_now() aims to provide a high-performance clock by
7164 * using the rq clock in the scheduler core whenever possible.
7165 *
7166 * 2) High enough resolution for the BPF scheduler use cases: In most BPF
7167 * scheduler use cases, the required clock resolution is lower than the most
7168 * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically
7169 * uses the rq clock in the scheduler core whenever it is valid. It considers
7170 * that the rq clock is valid from the time the rq clock is updated
7171 * (update_rq_clock) until the rq is unlocked (rq_unpin_lock).
7172 *
7173 * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now()
7174 * guarantees the clock never goes backward when comparing them in the same
7175 * CPU. On the other hand, when comparing clocks in different CPUs, there
7176 * is no such guarantee -- the clock can go backward. It provides a
7177 * monotonically *non-decreasing* clock, which means two different
7178 * scx_bpf_now() calls on the same CPU may return the same clock value
7179 * while the same rq clock remains valid.
7180 */
7181 __bpf_kfunc u64 scx_bpf_now(void)
7182 {
7183 struct rq *rq;
7184 u64 clock;
7185
7186 preempt_disable();
7187
7188 rq = this_rq();
7189 if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) {
7190 /*
7191 * If the rq clock is valid, use the cached rq clock.
7192 *
7193 * Note that scx_bpf_now() is re-entrant between a process
7194 * context and an interrupt context (e.g., timer interrupt).
7195 * However, we don't need to consider the race between them
7196 * because such race is not observable from a caller.
7197 */
7198 clock = READ_ONCE(rq->scx.clock);
7199 } else {
7200 /*
7201 * Otherwise, return a fresh rq clock.
7202 *
7203 * The rq clock is updated outside of the rq lock.
7204 * In this case, keep the updated rq clock invalid so the next
7205 * kfunc call outside the rq lock gets a fresh rq clock.
7206 */
7207 clock = sched_clock_cpu(cpu_of(rq));
7208 }
7209
7210 preempt_enable();
7211
7212 return clock;
7213 }
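
/*
 * Illustrative BPF-side sketch. Values are only meaningfully comparable on
 * the same CPU; 'ts_started' is a hypothetical timestamp recorded earlier on
 * this CPU with scx_bpf_now():
 *
 *	u64 delta_ns = scx_bpf_now() - ts_started;
 */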
7214
7215 static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
7216 {
7217 struct scx_event_stats *e_cpu;
7218 int cpu;
7219
7220 /* Aggregate per-CPU event counters into @events. */
7221 memset(events, 0, sizeof(*events));
7222 for_each_possible_cpu(cpu) {
7223 e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;
7224 scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
7225 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
7226 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
7227 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
7228 scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
7229 scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL);
7230 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
7231 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
7232 scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
7233 }
7234 }
7235
7236 /*
7237 * scx_bpf_events - Copy the aggregated system-wide event counters to @events
7238 * @events: output buffer from a BPF program
7239 * @events__sz: @events len, must end in '__sz' for the verifier
7240 */
7241 __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
7242 size_t events__sz)
7243 {
7244 struct scx_sched *sch;
7245 struct scx_event_stats e_sys;
7246
7247 rcu_read_lock();
7248 sch = rcu_dereference(scx_root);
7249 if (sch)
7250 scx_read_events(sch, &e_sys);
7251 else
7252 memset(&e_sys, 0, sizeof(e_sys));
7253 rcu_read_unlock();
7254
7255 /*
7256 * We cannot entirely trust a BPF-provided size since a BPF program
7257 * might be compiled against a different vmlinux.h, in which
7258 * scx_event_stats may be larger (a newer vmlinux.h) or smaller
7259 * (an older vmlinux.h). Hence, we use the smaller size to avoid
7260 * memory corruption.
7261 */
7262 events__sz = min(events__sz, sizeof(*events));
7263 memcpy(events, &e_sys, events__sz);
7264 }
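
/*
 * Illustrative BPF-side sketch. The verifier derives @events__sz from the
 * __sz-suffixed argument; field names in struct scx_event_stats mirror the
 * event names aggregated above:
 *
 *	struct scx_event_stats events;
 *
 *	scx_bpf_events(&events, sizeof(events));
 *	bpf_printk("select_cpu fallbacks: %lld",
 *		   events.SCX_EV_SELECT_CPU_FALLBACK);
 */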
7265
7266 __bpf_kfunc_end_defs();
7267
7268 BTF_KFUNCS_START(scx_kfunc_ids_any)
7269 BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU);
7270 BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU);
7271 BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
7272 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
7273 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
7274 BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL)
7275 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
7276 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
7277 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
7278 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
7279 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
7280 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
7281 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
7282 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
7283 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
7284 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
7285 BTF_ID_FLAGS(func, scx_bpf_nr_node_ids)
7286 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
7287 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
7288 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
7289 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
7290 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
7291 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
7292 BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
7293 BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL)
7294 BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED)
7295 #ifdef CONFIG_CGROUP_SCHED
7296 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
7297 #endif
7298 BTF_ID_FLAGS(func, scx_bpf_now)
7299 BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
7300 BTF_KFUNCS_END(scx_kfunc_ids_any)
7301
7302 static const struct btf_kfunc_id_set scx_kfunc_set_any = {
7303 .owner = THIS_MODULE,
7304 .set = &scx_kfunc_ids_any,
7305 };
7306
7307 static int __init scx_init(void)
7308 {
7309 int ret;
7310
7311 /*
7312 * kfunc registration can't be done from init_sched_ext_class() as
7313 * register_btf_kfunc_id_set() needs most of the system to be up.
7314 *
7315 * Some kfuncs are context-sensitive and can only be called from
7316 * specific SCX ops. They are grouped into BTF sets accordingly.
7317 * Unfortunately, BPF currently doesn't have a way of enforcing such
7318 * restrictions. Eventually, the verifier should be able to enforce
7319 * them. For now, register them the same and make each kfunc explicitly
7320 * check using scx_kf_allowed().
7321 */
7322 if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
7323 &scx_kfunc_set_enqueue_dispatch)) ||
7324 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
7325 &scx_kfunc_set_dispatch)) ||
7326 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
7327 &scx_kfunc_set_cpu_release)) ||
7328 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
7329 &scx_kfunc_set_unlocked)) ||
7330 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
7331 &scx_kfunc_set_unlocked)) ||
7332 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
7333 &scx_kfunc_set_any)) ||
7334 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
7335 &scx_kfunc_set_any)) ||
7336 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
7337 &scx_kfunc_set_any))) {
7338 pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret);
7339 return ret;
7340 }
7341
7342 ret = scx_idle_init();
7343 if (ret) {
7344 pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret);
7345 return ret;
7346 }
7347
7348 ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
7349 if (ret) {
7350 pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
7351 return ret;
7352 }
7353
7354 ret = register_pm_notifier(&scx_pm_notifier);
7355 if (ret) {
7356 pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret);
7357 return ret;
7358 }
7359
7360 scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj);
7361 if (!scx_kset) {
7362 pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n");
7363 return -ENOMEM;
7364 }
7365
7366 ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group);
7367 if (ret < 0) {
7368 pr_err("sched_ext: Failed to add global attributes\n");
7369 return ret;
7370 }
7371
7372 if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) ||
7373 !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) {
7374 pr_err("sched_ext: Failed to allocate cpumasks\n");
7375 return -ENOMEM;
7376 }
7377
7378 return 0;
7379 }
7380 __initcall(scx_init);
7381