1*bba2c361STejun Heo /* SPDX-License-Identifier: GPL-2.0 */ 2*bba2c361STejun Heo /* 3*bba2c361STejun Heo * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst 4*bba2c361STejun Heo * 5*bba2c361STejun Heo * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 6*bba2c361STejun Heo * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 7*bba2c361STejun Heo * Copyright (c) 2022 David Vernet <dvernet@meta.com> 8*bba2c361STejun Heo */ 9*bba2c361STejun Heo 10*bba2c361STejun Heo static DEFINE_RAW_SPINLOCK(scx_sched_lock); 11*bba2c361STejun Heo 12*bba2c361STejun Heo /* 13*bba2c361STejun Heo * NOTE: sched_ext is in the process of growing multiple scheduler support and 14*bba2c361STejun Heo * scx_root usage is in a transitional state. Naked dereferences are safe if the 15*bba2c361STejun Heo * caller is one of the tasks attached to SCX and explicit RCU dereference is 16*bba2c361STejun Heo * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but 17*bba2c361STejun Heo * are used as temporary markers to indicate that the dereferences need to be 18*bba2c361STejun Heo * updated to point to the associated scheduler instances rather than scx_root. 19*bba2c361STejun Heo */ 20*bba2c361STejun Heo struct scx_sched __rcu *scx_root; 21*bba2c361STejun Heo 22*bba2c361STejun Heo /* 23*bba2c361STejun Heo * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock. 24*bba2c361STejun Heo * Readers can hold either or rcu_read_lock(). 
25*bba2c361STejun Heo */ 26*bba2c361STejun Heo static LIST_HEAD(scx_sched_all); 27*bba2c361STejun Heo 28*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 29*bba2c361STejun Heo static const struct rhashtable_params scx_sched_hash_params = { 30*bba2c361STejun Heo .key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id), 31*bba2c361STejun Heo .key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id), 32*bba2c361STejun Heo .head_offset = offsetof(struct scx_sched, hash_node), 33*bba2c361STejun Heo .insecure_elasticity = true, /* inserted under scx_sched_lock */ 34*bba2c361STejun Heo }; 35*bba2c361STejun Heo 36*bba2c361STejun Heo static struct rhashtable scx_sched_hash; 37*bba2c361STejun Heo #endif 38*bba2c361STejun Heo 39*bba2c361STejun Heo /* see SCX_OPS_TID_TO_TASK */ 40*bba2c361STejun Heo static const struct rhashtable_params scx_tid_hash_params = { 41*bba2c361STejun Heo .key_len = sizeof_field(struct sched_ext_entity, tid), 42*bba2c361STejun Heo .key_offset = offsetof(struct sched_ext_entity, tid), 43*bba2c361STejun Heo .head_offset = offsetof(struct sched_ext_entity, tid_hash_node), 44*bba2c361STejun Heo .insecure_elasticity = true, /* inserted/removed under scx_tasks_lock */ 45*bba2c361STejun Heo }; 46*bba2c361STejun Heo static struct rhashtable scx_tid_hash; 47*bba2c361STejun Heo 48*bba2c361STejun Heo /* 49*bba2c361STejun Heo * During exit, a task may schedule after losing its PIDs. When disabling the 50*bba2c361STejun Heo * BPF scheduler, we need to be able to iterate tasks in every state to 51*bba2c361STejun Heo * guarantee system safety. Maintain a dedicated task list which contains every 52*bba2c361STejun Heo * task between its fork and eventual free. 
53*bba2c361STejun Heo */ 54*bba2c361STejun Heo static DEFINE_RAW_SPINLOCK(scx_tasks_lock); 55*bba2c361STejun Heo static LIST_HEAD(scx_tasks); 56*bba2c361STejun Heo 57*bba2c361STejun Heo /* ops enable/disable */ 58*bba2c361STejun Heo static DEFINE_MUTEX(scx_enable_mutex); 59*bba2c361STejun Heo DEFINE_STATIC_KEY_FALSE(__scx_enabled); 60*bba2c361STejun Heo DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 61*bba2c361STejun Heo static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); 62*bba2c361STejun Heo static DEFINE_RAW_SPINLOCK(scx_bypass_lock); 63*bba2c361STejun Heo static bool scx_init_task_enabled; 64*bba2c361STejun Heo static bool scx_switching_all; 65*bba2c361STejun Heo DEFINE_STATIC_KEY_FALSE(__scx_switched_all); 66*bba2c361STejun Heo static DEFINE_STATIC_KEY_FALSE(__scx_tid_to_task_enabled); 67*bba2c361STejun Heo 68*bba2c361STejun Heo /* 69*bba2c361STejun Heo * True once SCX_OPS_TID_TO_TASK has been negotiated with the root scheduler 70*bba2c361STejun Heo * and the tid->task table is live. Wraps the static key so callers don't 71*bba2c361STejun Heo * take the address, and hints "likely enabled" for the common case where 72*bba2c361STejun Heo * the feature is in use. 73*bba2c361STejun Heo */ 74*bba2c361STejun Heo static inline bool scx_tid_to_task_enabled(void) 75*bba2c361STejun Heo { 76*bba2c361STejun Heo return static_branch_likely(&__scx_tid_to_task_enabled); 77*bba2c361STejun Heo } 78*bba2c361STejun Heo 79*bba2c361STejun Heo static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); 80*bba2c361STejun Heo static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); 81*bba2c361STejun Heo 82*bba2c361STejun Heo /* Global cursor for the per-CPU tid allocator. Starts at 1; tid 0 is reserved. */ 83*bba2c361STejun Heo static atomic64_t scx_tid_cursor = ATOMIC64_INIT(1); 84*bba2c361STejun Heo 85*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 86*bba2c361STejun Heo /* 87*bba2c361STejun Heo * The sub sched being enabled. 
Used by scx_disable_and_exit_task() to exit 88*bba2c361STejun Heo * tasks for the sub-sched being enabled. Use a global variable instead of a 89*bba2c361STejun Heo * per-task field as all enables are serialized. 90*bba2c361STejun Heo */ 91*bba2c361STejun Heo static struct scx_sched *scx_enabling_sub_sched; 92*bba2c361STejun Heo #else 93*bba2c361STejun Heo #define scx_enabling_sub_sched (struct scx_sched *)NULL 94*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 95*bba2c361STejun Heo 96*bba2c361STejun Heo /* 97*bba2c361STejun Heo * A monotonically increasing sequence number that is incremented every time a 98*bba2c361STejun Heo * scheduler is enabled. This can be used to check if any custom sched_ext 99*bba2c361STejun Heo * scheduler has ever been used in the system. 100*bba2c361STejun Heo */ 101*bba2c361STejun Heo static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); 102*bba2c361STejun Heo 103*bba2c361STejun Heo /* 104*bba2c361STejun Heo * Watchdog interval. All scx_sched's share a single watchdog timer and the 105*bba2c361STejun Heo * interval is half of the shortest sch->watchdog_timeout. 106*bba2c361STejun Heo */ 107*bba2c361STejun Heo static unsigned long scx_watchdog_interval; 108*bba2c361STejun Heo 109*bba2c361STejun Heo /* 110*bba2c361STejun Heo * The last time the delayed work was run. This delayed work relies on 111*bba2c361STejun Heo * ksoftirqd being able to run to service timer interrupts, so it's possible 112*bba2c361STejun Heo * that this work itself could get wedged. To account for this, we check that 113*bba2c361STejun Heo * it's not stalled in the timer tick, and trigger an error if it is. 
114*bba2c361STejun Heo */ 115*bba2c361STejun Heo static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; 116*bba2c361STejun Heo 117*bba2c361STejun Heo static struct delayed_work scx_watchdog_work; 118*bba2c361STejun Heo 119*bba2c361STejun Heo /* 120*bba2c361STejun Heo * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence 121*bba2c361STejun Heo * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu 122*bba2c361STejun Heo * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated 123*bba2c361STejun Heo * lazily when enabling and freed when disabling to avoid waste when sched_ext 124*bba2c361STejun Heo * isn't active. 125*bba2c361STejun Heo */ 126*bba2c361STejun Heo struct scx_kick_syncs { 127*bba2c361STejun Heo struct rcu_head rcu; 128*bba2c361STejun Heo unsigned long syncs[]; 129*bba2c361STejun Heo }; 130*bba2c361STejun Heo 131*bba2c361STejun Heo static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs); 132*bba2c361STejun Heo 133*bba2c361STejun Heo /* 134*bba2c361STejun Heo * Per-CPU buffered allocator state for p->scx.tid. Each CPU pulls a chunk of 135*bba2c361STejun Heo * SCX_TID_CHUNK ids from scx_tid_cursor and hands them out locally without 136*bba2c361STejun Heo * further synchronization. See scx_alloc_tid(). 137*bba2c361STejun Heo */ 138*bba2c361STejun Heo struct scx_tid_alloc { 139*bba2c361STejun Heo u64 next; 140*bba2c361STejun Heo u64 end; 141*bba2c361STejun Heo }; 142*bba2c361STejun Heo static DEFINE_PER_CPU(struct scx_tid_alloc, scx_tid_alloc); 143*bba2c361STejun Heo 144*bba2c361STejun Heo /* 145*bba2c361STejun Heo * Direct dispatch marker. 146*bba2c361STejun Heo * 147*bba2c361STejun Heo * Non-NULL values are used for direct dispatch from enqueue path. A valid 148*bba2c361STejun Heo * pointer points to the task currently being enqueued. An ERR_PTR value is used 149*bba2c361STejun Heo * to indicate that direct dispatch has already happened. 
150*bba2c361STejun Heo */ 151*bba2c361STejun Heo static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); 152*bba2c361STejun Heo 153*bba2c361STejun Heo static const struct rhashtable_params dsq_hash_params = { 154*bba2c361STejun Heo .key_len = sizeof_field(struct scx_dispatch_q, id), 155*bba2c361STejun Heo .key_offset = offsetof(struct scx_dispatch_q, id), 156*bba2c361STejun Heo .head_offset = offsetof(struct scx_dispatch_q, hash_node), 157*bba2c361STejun Heo }; 158*bba2c361STejun Heo 159*bba2c361STejun Heo static LLIST_HEAD(dsqs_to_free); 160*bba2c361STejun Heo 161*bba2c361STejun Heo /* string formatting from BPF */ 162*bba2c361STejun Heo struct scx_bstr_buf { 163*bba2c361STejun Heo u64 data[MAX_BPRINTF_VARARGS]; 164*bba2c361STejun Heo char line[SCX_EXIT_MSG_LEN]; 165*bba2c361STejun Heo }; 166*bba2c361STejun Heo 167*bba2c361STejun Heo static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); 168*bba2c361STejun Heo static struct scx_bstr_buf scx_exit_bstr_buf; 169*bba2c361STejun Heo 170*bba2c361STejun Heo /* ops debug dump */ 171*bba2c361STejun Heo static DEFINE_RAW_SPINLOCK(scx_dump_lock); 172*bba2c361STejun Heo 173*bba2c361STejun Heo struct scx_dump_data { 174*bba2c361STejun Heo s32 cpu; 175*bba2c361STejun Heo bool first; 176*bba2c361STejun Heo s32 cursor; 177*bba2c361STejun Heo struct seq_buf *s; 178*bba2c361STejun Heo const char *prefix; 179*bba2c361STejun Heo struct scx_bstr_buf buf; 180*bba2c361STejun Heo }; 181*bba2c361STejun Heo 182*bba2c361STejun Heo static struct scx_dump_data scx_dump_data = { 183*bba2c361STejun Heo .cpu = -1, 184*bba2c361STejun Heo }; 185*bba2c361STejun Heo 186*bba2c361STejun Heo /* /sys/kernel/sched_ext interface */ 187*bba2c361STejun Heo static struct kset *scx_kset; 188*bba2c361STejun Heo 189*bba2c361STejun Heo /* 190*bba2c361STejun Heo * Parameters that can be adjusted through /sys/module/sched_ext/parameters. 
191*bba2c361STejun Heo * There usually is no reason to modify these as normal scheduler operation 192*bba2c361STejun Heo * shouldn't be affected by them. The knobs are primarily for debugging. 193*bba2c361STejun Heo */ 194*bba2c361STejun Heo static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; 195*bba2c361STejun Heo static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; 196*bba2c361STejun Heo 197*bba2c361STejun Heo static int set_slice_us(const char *val, const struct kernel_param *kp) 198*bba2c361STejun Heo { 199*bba2c361STejun Heo return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC); 200*bba2c361STejun Heo } 201*bba2c361STejun Heo 202*bba2c361STejun Heo static const struct kernel_param_ops slice_us_param_ops = { 203*bba2c361STejun Heo .set = set_slice_us, 204*bba2c361STejun Heo .get = param_get_uint, 205*bba2c361STejun Heo }; 206*bba2c361STejun Heo 207*bba2c361STejun Heo static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp) 208*bba2c361STejun Heo { 209*bba2c361STejun Heo return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC); 210*bba2c361STejun Heo } 211*bba2c361STejun Heo 212*bba2c361STejun Heo static const struct kernel_param_ops bypass_lb_intv_us_param_ops = { 213*bba2c361STejun Heo .set = set_bypass_lb_intv_us, 214*bba2c361STejun Heo .get = param_get_uint, 215*bba2c361STejun Heo }; 216*bba2c361STejun Heo 217*bba2c361STejun Heo #undef MODULE_PARAM_PREFIX 218*bba2c361STejun Heo #define MODULE_PARAM_PREFIX "sched_ext." 
219*bba2c361STejun Heo 220*bba2c361STejun Heo module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); 221*bba2c361STejun Heo MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); 222*bba2c361STejun Heo module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600); 223*bba2c361STejun Heo MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)"); 224*bba2c361STejun Heo 225*bba2c361STejun Heo #undef MODULE_PARAM_PREFIX 226*bba2c361STejun Heo 227*bba2c361STejun Heo #define CREATE_TRACE_POINTS 228*bba2c361STejun Heo #include <trace/events/sched_ext.h> 229*bba2c361STejun Heo 230*bba2c361STejun Heo static void run_deferred(struct rq *rq); 231*bba2c361STejun Heo static bool task_dead_and_done(struct task_struct *p); 232*bba2c361STejun Heo static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); 233*bba2c361STejun Heo static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind); 234*bba2c361STejun Heo 235*bba2c361STejun Heo __printf(5, 6) bool __scx_exit(struct scx_sched *sch, 236*bba2c361STejun Heo enum scx_exit_kind kind, s64 exit_code, 237*bba2c361STejun Heo s32 exit_cpu, const char *fmt, ...) 
238*bba2c361STejun Heo { 239*bba2c361STejun Heo va_list args; 240*bba2c361STejun Heo bool ret; 241*bba2c361STejun Heo 242*bba2c361STejun Heo va_start(args, fmt); 243*bba2c361STejun Heo ret = scx_vexit(sch, kind, exit_code, exit_cpu, fmt, args); 244*bba2c361STejun Heo va_end(args); 245*bba2c361STejun Heo 246*bba2c361STejun Heo return ret; 247*bba2c361STejun Heo } 248*bba2c361STejun Heo 249*bba2c361STejun Heo #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) 250*bba2c361STejun Heo 251*bba2c361STejun Heo static long jiffies_delta_msecs(unsigned long at, unsigned long now) 252*bba2c361STejun Heo { 253*bba2c361STejun Heo if (time_after(at, now)) 254*bba2c361STejun Heo return jiffies_to_msecs(at - now); 255*bba2c361STejun Heo else 256*bba2c361STejun Heo return -(long)jiffies_to_msecs(now - at); 257*bba2c361STejun Heo } 258*bba2c361STejun Heo 259*bba2c361STejun Heo static bool u32_before(u32 a, u32 b) 260*bba2c361STejun Heo { 261*bba2c361STejun Heo return (s32)(a - b) < 0; 262*bba2c361STejun Heo } 263*bba2c361STejun Heo 264*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 265*bba2c361STejun Heo /** 266*bba2c361STejun Heo * scx_parent - Find the parent sched 267*bba2c361STejun Heo * @sch: sched to find the parent of 268*bba2c361STejun Heo * 269*bba2c361STejun Heo * Returns the parent scheduler or %NULL if @sch is root. 
270*bba2c361STejun Heo */ 271*bba2c361STejun Heo static struct scx_sched *scx_parent(struct scx_sched *sch) 272*bba2c361STejun Heo { 273*bba2c361STejun Heo if (sch->level) 274*bba2c361STejun Heo return sch->ancestors[sch->level - 1]; 275*bba2c361STejun Heo else 276*bba2c361STejun Heo return NULL; 277*bba2c361STejun Heo } 278*bba2c361STejun Heo 279*bba2c361STejun Heo /** 280*bba2c361STejun Heo * scx_next_descendant_pre - find the next descendant for pre-order walk 281*bba2c361STejun Heo * @pos: the current position (%NULL to initiate traversal) 282*bba2c361STejun Heo * @root: sched whose descendants to walk 283*bba2c361STejun Heo * 284*bba2c361STejun Heo * To be used by scx_for_each_descendant_pre(). Find the next descendant to 285*bba2c361STejun Heo * visit for pre-order traversal of @root's descendants. @root is included in 286*bba2c361STejun Heo * the iteration and the first node to be visited. 287*bba2c361STejun Heo */ 288*bba2c361STejun Heo static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, 289*bba2c361STejun Heo struct scx_sched *root) 290*bba2c361STejun Heo { 291*bba2c361STejun Heo struct scx_sched *next; 292*bba2c361STejun Heo 293*bba2c361STejun Heo lockdep_assert(lockdep_is_held(&scx_enable_mutex) || 294*bba2c361STejun Heo lockdep_is_held(&scx_sched_lock)); 295*bba2c361STejun Heo 296*bba2c361STejun Heo /* if first iteration, visit @root */ 297*bba2c361STejun Heo if (!pos) 298*bba2c361STejun Heo return root; 299*bba2c361STejun Heo 300*bba2c361STejun Heo /* visit the first child if exists */ 301*bba2c361STejun Heo next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling); 302*bba2c361STejun Heo if (next) 303*bba2c361STejun Heo return next; 304*bba2c361STejun Heo 305*bba2c361STejun Heo /* no child, visit my or the closest ancestor's next sibling */ 306*bba2c361STejun Heo while (pos != root) { 307*bba2c361STejun Heo if (!list_is_last(&pos->sibling, &scx_parent(pos)->children)) 308*bba2c361STejun Heo return 
list_next_entry(pos, sibling); 309*bba2c361STejun Heo pos = scx_parent(pos); 310*bba2c361STejun Heo } 311*bba2c361STejun Heo 312*bba2c361STejun Heo return NULL; 313*bba2c361STejun Heo } 314*bba2c361STejun Heo 315*bba2c361STejun Heo static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) 316*bba2c361STejun Heo { 317*bba2c361STejun Heo return rhashtable_lookup(&scx_sched_hash, &cgroup_id, 318*bba2c361STejun Heo scx_sched_hash_params); 319*bba2c361STejun Heo } 320*bba2c361STejun Heo 321*bba2c361STejun Heo static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) 322*bba2c361STejun Heo { 323*bba2c361STejun Heo rcu_assign_pointer(p->scx.sched, sch); 324*bba2c361STejun Heo } 325*bba2c361STejun Heo #else /* CONFIG_EXT_SUB_SCHED */ 326*bba2c361STejun Heo static inline struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } 327*bba2c361STejun Heo static inline struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } 328*bba2c361STejun Heo static inline void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} 329*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 330*bba2c361STejun Heo 331*bba2c361STejun Heo /** 332*bba2c361STejun Heo * scx_is_descendant - Test whether sched is a descendant 333*bba2c361STejun Heo * @sch: sched to test 334*bba2c361STejun Heo * @ancestor: ancestor sched to test against 335*bba2c361STejun Heo * 336*bba2c361STejun Heo * Test whether @sch is a descendant of @ancestor. 
337*bba2c361STejun Heo */ 338*bba2c361STejun Heo static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor) 339*bba2c361STejun Heo { 340*bba2c361STejun Heo if (sch->level < ancestor->level) 341*bba2c361STejun Heo return false; 342*bba2c361STejun Heo return sch->ancestors[ancestor->level] == ancestor; 343*bba2c361STejun Heo } 344*bba2c361STejun Heo 345*bba2c361STejun Heo /** 346*bba2c361STejun Heo * scx_for_each_descendant_pre - pre-order walk of a sched's descendants 347*bba2c361STejun Heo * @pos: iteration cursor 348*bba2c361STejun Heo * @root: sched to walk the descendants of 349*bba2c361STejun Heo * 350*bba2c361STejun Heo * Walk @root's descendants. @root is included in the iteration and the first 351*bba2c361STejun Heo * node to be visited. Must be called with either scx_enable_mutex or 352*bba2c361STejun Heo * scx_sched_lock held. 353*bba2c361STejun Heo */ 354*bba2c361STejun Heo #define scx_for_each_descendant_pre(pos, root) \ 355*bba2c361STejun Heo for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos); \ 356*bba2c361STejun Heo (pos) = scx_next_descendant_pre((pos), (root))) 357*bba2c361STejun Heo 358*bba2c361STejun Heo static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, s32 cpu) 359*bba2c361STejun Heo { 360*bba2c361STejun Heo return &sch->pnode[cpu_to_node(cpu)]->global_dsq; 361*bba2c361STejun Heo } 362*bba2c361STejun Heo 363*bba2c361STejun Heo static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) 364*bba2c361STejun Heo { 365*bba2c361STejun Heo return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params); 366*bba2c361STejun Heo } 367*bba2c361STejun Heo 368*bba2c361STejun Heo static const struct sched_class *scx_setscheduler_class(struct task_struct *p) 369*bba2c361STejun Heo { 370*bba2c361STejun Heo if (p->sched_class == &stop_sched_class) 371*bba2c361STejun Heo return &stop_sched_class; 372*bba2c361STejun Heo 373*bba2c361STejun Heo return __setscheduler_class(p->policy, p->prio); 
374*bba2c361STejun Heo } 375*bba2c361STejun Heo 376*bba2c361STejun Heo static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu) 377*bba2c361STejun Heo { 378*bba2c361STejun Heo return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq; 379*bba2c361STejun Heo } 380*bba2c361STejun Heo 381*bba2c361STejun Heo static struct scx_dispatch_q *bypass_enq_target_dsq(struct scx_sched *sch, s32 cpu) 382*bba2c361STejun Heo { 383*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 384*bba2c361STejun Heo /* 385*bba2c361STejun Heo * If @sch is a sub-sched which is bypassing, its tasks should go into 386*bba2c361STejun Heo * the bypass DSQs of the nearest ancestor which is not bypassing. The 387*bba2c361STejun Heo * not-bypassing ancestor is responsible for scheduling all tasks from 388*bba2c361STejun Heo * bypassing sub-trees. If all ancestors including root are bypassing, 389*bba2c361STejun Heo * all tasks should go to the root's bypass DSQs. 390*bba2c361STejun Heo * 391*bba2c361STejun Heo * Whenever a sched starts bypassing, all runnable tasks in its subtree 392*bba2c361STejun Heo * are re-enqueued after scx_bypassing() is turned on, guaranteeing that 393*bba2c361STejun Heo * all tasks are transferred to the right DSQs. 394*bba2c361STejun Heo */ 395*bba2c361STejun Heo while (scx_parent(sch) && scx_bypassing(sch, cpu)) 396*bba2c361STejun Heo sch = scx_parent(sch); 397*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 398*bba2c361STejun Heo 399*bba2c361STejun Heo return bypass_dsq(sch, cpu); 400*bba2c361STejun Heo } 401*bba2c361STejun Heo 402*bba2c361STejun Heo /** 403*bba2c361STejun Heo * bypass_dsp_enabled - Check if bypass dispatch path is enabled 404*bba2c361STejun Heo * @sch: scheduler to check 405*bba2c361STejun Heo * 406*bba2c361STejun Heo * When a descendant scheduler enters bypass mode, bypassed tasks are scheduled 407*bba2c361STejun Heo * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors 408*bba2c361STejun Heo * are bypassing. 
In the former case, the ancestor is not itself bypassing but 409*bba2c361STejun Heo * its bypass DSQs will be populated with bypassed tasks from descendants. Thus, 410*bba2c361STejun Heo * the ancestor's bypass dispatch path must be active even though its own 411*bba2c361STejun Heo * bypass_depth remains zero. 412*bba2c361STejun Heo * 413*bba2c361STejun Heo * This function checks bypass_dsp_enable_depth which is managed separately from 414*bba2c361STejun Heo * bypass_depth to enable this decoupling. See enable_bypass_dsp() and 415*bba2c361STejun Heo * disable_bypass_dsp(). 416*bba2c361STejun Heo */ 417*bba2c361STejun Heo static bool bypass_dsp_enabled(struct scx_sched *sch) 418*bba2c361STejun Heo { 419*bba2c361STejun Heo return unlikely(atomic_read(&sch->bypass_dsp_enable_depth)); 420*bba2c361STejun Heo } 421*bba2c361STejun Heo 422*bba2c361STejun Heo /** 423*bba2c361STejun Heo * rq_is_open - Is the rq available for immediate execution of an SCX task? 424*bba2c361STejun Heo * @rq: rq to test 425*bba2c361STejun Heo * @enq_flags: optional %SCX_ENQ_* of the task being enqueued 426*bba2c361STejun Heo * 427*bba2c361STejun Heo * Returns %true if @rq is currently open for executing an SCX task. After a 428*bba2c361STejun Heo * %false return, @rq is guaranteed to invoke SCX dispatch path at least once 429*bba2c361STejun Heo * before going to idle and not inserting a task into @rq's local DSQ after a 430*bba2c361STejun Heo * %false return doesn't cause @rq to stall. 431*bba2c361STejun Heo */ 432*bba2c361STejun Heo static bool rq_is_open(struct rq *rq, u64 enq_flags) 433*bba2c361STejun Heo { 434*bba2c361STejun Heo lockdep_assert_rq_held(rq); 435*bba2c361STejun Heo 436*bba2c361STejun Heo /* 437*bba2c361STejun Heo * A higher-priority class task is either running or in the process of 438*bba2c361STejun Heo * waking up on @rq. 
439*bba2c361STejun Heo */ 440*bba2c361STejun Heo if (sched_class_above(rq->next_class, &ext_sched_class)) 441*bba2c361STejun Heo return false; 442*bba2c361STejun Heo 443*bba2c361STejun Heo /* 444*bba2c361STejun Heo * @rq is either in transition to or in idle and there is no 445*bba2c361STejun Heo * higher-priority class task waking up on it. 446*bba2c361STejun Heo */ 447*bba2c361STejun Heo if (sched_class_above(&ext_sched_class, rq->next_class)) 448*bba2c361STejun Heo return true; 449*bba2c361STejun Heo 450*bba2c361STejun Heo /* 451*bba2c361STejun Heo * @rq is either picking, in transition to, or running an SCX task. 452*bba2c361STejun Heo */ 453*bba2c361STejun Heo 454*bba2c361STejun Heo /* 455*bba2c361STejun Heo * If we're in the dispatch path holding rq lock, $curr may or may not 456*bba2c361STejun Heo * be ready depending on whether the on-going dispatch decides to extend 457*bba2c361STejun Heo * $curr's slice. We say yes here and resolve it at the end of dispatch. 458*bba2c361STejun Heo * See balance_one(). 459*bba2c361STejun Heo */ 460*bba2c361STejun Heo if (rq->scx.flags & SCX_RQ_IN_BALANCE) 461*bba2c361STejun Heo return true; 462*bba2c361STejun Heo 463*bba2c361STejun Heo /* 464*bba2c361STejun Heo * %SCX_ENQ_PREEMPT clears $curr's slice if on SCX and kicks dispatch, 465*bba2c361STejun Heo * so allow it to avoid spuriously triggering reenq on a combined 466*bba2c361STejun Heo * PREEMPT|IMMED insertion. 467*bba2c361STejun Heo */ 468*bba2c361STejun Heo if (enq_flags & SCX_ENQ_PREEMPT) 469*bba2c361STejun Heo return true; 470*bba2c361STejun Heo 471*bba2c361STejun Heo /* 472*bba2c361STejun Heo * @rq is either in transition to or running an SCX task and can't go 473*bba2c361STejun Heo * idle without another SCX dispatch cycle. 474*bba2c361STejun Heo */ 475*bba2c361STejun Heo return false; 476*bba2c361STejun Heo } 477*bba2c361STejun Heo 478*bba2c361STejun Heo /* 479*bba2c361STejun Heo * Track the rq currently locked. 
480*bba2c361STejun Heo * 481*bba2c361STejun Heo * This allows kfuncs to safely operate on rq from any scx ops callback, 482*bba2c361STejun Heo * knowing which rq is already locked. 483*bba2c361STejun Heo */ 484*bba2c361STejun Heo DEFINE_PER_CPU(struct rq *, scx_locked_rq_state); 485*bba2c361STejun Heo 486*bba2c361STejun Heo static inline void update_locked_rq(struct rq *rq) 487*bba2c361STejun Heo { 488*bba2c361STejun Heo /* 489*bba2c361STejun Heo * Check whether @rq is actually locked. This can help expose bugs 490*bba2c361STejun Heo * or incorrect assumptions about the context in which a kfunc or 491*bba2c361STejun Heo * callback is executed. 492*bba2c361STejun Heo */ 493*bba2c361STejun Heo if (rq) 494*bba2c361STejun Heo lockdep_assert_rq_held(rq); 495*bba2c361STejun Heo __this_cpu_write(scx_locked_rq_state, rq); 496*bba2c361STejun Heo } 497*bba2c361STejun Heo 498*bba2c361STejun Heo /* 499*bba2c361STejun Heo * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not 500*bba2c361STejun Heo * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit. 501*bba2c361STejun Heo */ 502*bba2c361STejun Heo #define SCX_CALL_OP(sch, op, locked_rq, args...) \ 503*bba2c361STejun Heo do { \ 504*bba2c361STejun Heo struct rq *__prev_locked_rq; \ 505*bba2c361STejun Heo \ 506*bba2c361STejun Heo if (locked_rq) { \ 507*bba2c361STejun Heo __prev_locked_rq = scx_locked_rq(); \ 508*bba2c361STejun Heo update_locked_rq(locked_rq); \ 509*bba2c361STejun Heo } \ 510*bba2c361STejun Heo (sch)->ops.op(args); \ 511*bba2c361STejun Heo if (locked_rq) \ 512*bba2c361STejun Heo update_locked_rq(__prev_locked_rq); \ 513*bba2c361STejun Heo } while (0) 514*bba2c361STejun Heo 515*bba2c361STejun Heo /* 516*bba2c361STejun Heo * Flipped on enable per sch->is_cid_type. Declared in internal.h so 517*bba2c361STejun Heo * subsystem inlines can read it. 
518*bba2c361STejun Heo */ 519*bba2c361STejun Heo DEFINE_STATIC_KEY_FALSE(__scx_is_cid_type); 520*bba2c361STejun Heo 521*bba2c361STejun Heo /* 522*bba2c361STejun Heo * scx_cpu_arg() wraps a cpu arg being handed to an SCX op. For cid-form 523*bba2c361STejun Heo * schedulers it resolves to the matching cid; for cpu-form it passes @cpu 524*bba2c361STejun Heo * through. scx_cpu_ret() is the inverse for a cpu/cid returned from an op 525*bba2c361STejun Heo * (currently only ops.select_cpu); it validates the BPF-supplied cid and 526*bba2c361STejun Heo * triggers scx_error() on @sch if invalid. 527*bba2c361STejun Heo */ 528*bba2c361STejun Heo static s32 scx_cpu_arg(s32 cpu) 529*bba2c361STejun Heo { 530*bba2c361STejun Heo if (scx_is_cid_type()) 531*bba2c361STejun Heo return __scx_cpu_to_cid(cpu); 532*bba2c361STejun Heo return cpu; 533*bba2c361STejun Heo } 534*bba2c361STejun Heo 535*bba2c361STejun Heo static s32 scx_cpu_ret(struct scx_sched *sch, s32 cpu_or_cid) 536*bba2c361STejun Heo { 537*bba2c361STejun Heo if (cpu_or_cid < 0 || !scx_is_cid_type()) 538*bba2c361STejun Heo return cpu_or_cid; 539*bba2c361STejun Heo return scx_cid_to_cpu(sch, cpu_or_cid); 540*bba2c361STejun Heo } 541*bba2c361STejun Heo 542*bba2c361STejun Heo #define SCX_CALL_OP_RET(sch, op, locked_rq, args...) 
\ 543*bba2c361STejun Heo ({ \ 544*bba2c361STejun Heo struct rq *__prev_locked_rq; \ 545*bba2c361STejun Heo __typeof__((sch)->ops.op(args)) __ret; \ 546*bba2c361STejun Heo \ 547*bba2c361STejun Heo if (locked_rq) { \ 548*bba2c361STejun Heo __prev_locked_rq = scx_locked_rq(); \ 549*bba2c361STejun Heo update_locked_rq(locked_rq); \ 550*bba2c361STejun Heo } \ 551*bba2c361STejun Heo __ret = (sch)->ops.op(args); \ 552*bba2c361STejun Heo if (locked_rq) \ 553*bba2c361STejun Heo update_locked_rq(__prev_locked_rq); \ 554*bba2c361STejun Heo __ret; \ 555*bba2c361STejun Heo }) 556*bba2c361STejun Heo 557*bba2c361STejun Heo /* 558*bba2c361STejun Heo * SCX_CALL_OP_TASK*() invokes an SCX op that takes one or two task arguments 559*bba2c361STejun Heo * and records them in current->scx.kf_tasks[] for the duration of the call. A 560*bba2c361STejun Heo * kfunc invoked from inside such an op can then use 561*bba2c361STejun Heo * scx_kf_arg_task_ok() to verify that its task argument is one of 562*bba2c361STejun Heo * those subject tasks. 563*bba2c361STejun Heo * 564*bba2c361STejun Heo * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held - 565*bba2c361STejun Heo * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's 566*bba2c361STejun Heo * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. 567*bba2c361STejun Heo * So if kf_tasks[] is set, @p's scheduler-protected fields are stable. 568*bba2c361STejun Heo * 569*bba2c361STejun Heo * kf_tasks[] can not stack, so task-based SCX ops must not nest. The 570*bba2c361STejun Heo * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants 571*bba2c361STejun Heo * while a previous one is still in progress. 572*bba2c361STejun Heo */ 573*bba2c361STejun Heo #define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...) 
\ 574*bba2c361STejun Heo do { \ 575*bba2c361STejun Heo WARN_ON_ONCE(current->scx.kf_tasks[0]); \ 576*bba2c361STejun Heo current->scx.kf_tasks[0] = task; \ 577*bba2c361STejun Heo SCX_CALL_OP((sch), op, locked_rq, task, ##args); \ 578*bba2c361STejun Heo current->scx.kf_tasks[0] = NULL; \ 579*bba2c361STejun Heo } while (0) 580*bba2c361STejun Heo 581*bba2c361STejun Heo #define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \ 582*bba2c361STejun Heo ({ \ 583*bba2c361STejun Heo __typeof__((sch)->ops.op(task, ##args)) __ret; \ 584*bba2c361STejun Heo WARN_ON_ONCE(current->scx.kf_tasks[0]); \ 585*bba2c361STejun Heo current->scx.kf_tasks[0] = task; \ 586*bba2c361STejun Heo __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \ 587*bba2c361STejun Heo current->scx.kf_tasks[0] = NULL; \ 588*bba2c361STejun Heo __ret; \ 589*bba2c361STejun Heo }) 590*bba2c361STejun Heo 591*bba2c361STejun Heo #define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) \ 592*bba2c361STejun Heo ({ \ 593*bba2c361STejun Heo __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ 594*bba2c361STejun Heo WARN_ON_ONCE(current->scx.kf_tasks[0]); \ 595*bba2c361STejun Heo current->scx.kf_tasks[0] = task0; \ 596*bba2c361STejun Heo current->scx.kf_tasks[1] = task1; \ 597*bba2c361STejun Heo __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \ 598*bba2c361STejun Heo current->scx.kf_tasks[0] = NULL; \ 599*bba2c361STejun Heo current->scx.kf_tasks[1] = NULL; \ 600*bba2c361STejun Heo __ret; \ 601*bba2c361STejun Heo }) 602*bba2c361STejun Heo 603*bba2c361STejun Heo /** 604*bba2c361STejun Heo * scx_call_op_set_cpumask - invoke ops.set_cpumask / ops_cid.set_cmask for @task 605*bba2c361STejun Heo * @sch: scx_sched being invoked 606*bba2c361STejun Heo * @rq: rq to update as the currently-locked rq, or NULL 607*bba2c361STejun Heo * @task: task whose affinity is changing 608*bba2c361STejun Heo * @cpumask: new cpumask 609*bba2c361STejun Heo * 610*bba2c361STejun Heo * For 
cid-form schedulers, translate @cpumask to a cmask via the per-cpu 611*bba2c361STejun Heo * scratch in cid.c and dispatch through the ops_cid union view. Caller 612*bba2c361STejun Heo * must hold @rq's rq lock so this_cpu_ptr is stable across the call. 613*bba2c361STejun Heo */ 614*bba2c361STejun Heo static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq, 615*bba2c361STejun Heo struct task_struct *task, 616*bba2c361STejun Heo const struct cpumask *cpumask) 617*bba2c361STejun Heo { 618*bba2c361STejun Heo WARN_ON_ONCE(current->scx.kf_tasks[0]); 619*bba2c361STejun Heo current->scx.kf_tasks[0] = task; 620*bba2c361STejun Heo if (rq) 621*bba2c361STejun Heo update_locked_rq(rq); 622*bba2c361STejun Heo 623*bba2c361STejun Heo if (scx_is_cid_type()) { 624*bba2c361STejun Heo struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch); 625*bba2c361STejun Heo /* 626*bba2c361STejun Heo * Build the per-CPU arena cmask and hand BPF its arena address. 627*bba2c361STejun Heo * Caller holds the rq lock with IRQs disabled, which makes us 628*bba2c361STejun Heo * the sole user of the scratch area. 
629*bba2c361STejun Heo */ 630*bba2c361STejun Heo scx_cpumask_to_cmask(cpumask, kern_va); 631*bba2c361STejun Heo sch->ops_cid.set_cmask(task, scx_kaddr_to_arena(sch, kern_va)); 632*bba2c361STejun Heo } else { 633*bba2c361STejun Heo sch->ops.set_cpumask(task, cpumask); 634*bba2c361STejun Heo } 635*bba2c361STejun Heo 636*bba2c361STejun Heo if (rq) 637*bba2c361STejun Heo update_locked_rq(NULL); 638*bba2c361STejun Heo current->scx.kf_tasks[0] = NULL; 639*bba2c361STejun Heo } 640*bba2c361STejun Heo 641*bba2c361STejun Heo /* see SCX_CALL_OP_TASK() */ 642*bba2c361STejun Heo static __always_inline bool scx_kf_arg_task_ok(struct scx_sched *sch, 643*bba2c361STejun Heo struct task_struct *p) 644*bba2c361STejun Heo { 645*bba2c361STejun Heo if (unlikely((p != current->scx.kf_tasks[0] && 646*bba2c361STejun Heo p != current->scx.kf_tasks[1]))) { 647*bba2c361STejun Heo scx_error(sch, "called on a task not being operated on"); 648*bba2c361STejun Heo return false; 649*bba2c361STejun Heo } 650*bba2c361STejun Heo 651*bba2c361STejun Heo return true; 652*bba2c361STejun Heo } 653*bba2c361STejun Heo 654*bba2c361STejun Heo enum scx_dsq_iter_flags { 655*bba2c361STejun Heo /* iterate in the reverse dispatch order */ 656*bba2c361STejun Heo SCX_DSQ_ITER_REV = 1U << 16, 657*bba2c361STejun Heo 658*bba2c361STejun Heo __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, 659*bba2c361STejun Heo __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, 660*bba2c361STejun Heo 661*bba2c361STejun Heo __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, 662*bba2c361STejun Heo __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | 663*bba2c361STejun Heo __SCX_DSQ_ITER_HAS_SLICE | 664*bba2c361STejun Heo __SCX_DSQ_ITER_HAS_VTIME, 665*bba2c361STejun Heo }; 666*bba2c361STejun Heo 667*bba2c361STejun Heo /** 668*bba2c361STejun Heo * nldsq_next_task - Iterate to the next task in a non-local DSQ 669*bba2c361STejun Heo * @dsq: non-local dsq being iterated 670*bba2c361STejun Heo * @cur: current position, %NULL to start iteration 671*bba2c361STejun Heo * @rev: 
walk backwards 672*bba2c361STejun Heo * 673*bba2c361STejun Heo * Returns %NULL when iteration is finished. 674*bba2c361STejun Heo */ 675*bba2c361STejun Heo static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, 676*bba2c361STejun Heo struct task_struct *cur, bool rev) 677*bba2c361STejun Heo { 678*bba2c361STejun Heo struct list_head *list_node; 679*bba2c361STejun Heo struct scx_dsq_list_node *dsq_lnode; 680*bba2c361STejun Heo 681*bba2c361STejun Heo lockdep_assert_held(&dsq->lock); 682*bba2c361STejun Heo 683*bba2c361STejun Heo if (cur) 684*bba2c361STejun Heo list_node = &cur->scx.dsq_list.node; 685*bba2c361STejun Heo else 686*bba2c361STejun Heo list_node = &dsq->list; 687*bba2c361STejun Heo 688*bba2c361STejun Heo /* find the next task, need to skip BPF iteration cursors */ 689*bba2c361STejun Heo do { 690*bba2c361STejun Heo if (rev) 691*bba2c361STejun Heo list_node = list_node->prev; 692*bba2c361STejun Heo else 693*bba2c361STejun Heo list_node = list_node->next; 694*bba2c361STejun Heo 695*bba2c361STejun Heo if (list_node == &dsq->list) 696*bba2c361STejun Heo return NULL; 697*bba2c361STejun Heo 698*bba2c361STejun Heo dsq_lnode = container_of(list_node, struct scx_dsq_list_node, 699*bba2c361STejun Heo node); 700*bba2c361STejun Heo } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); 701*bba2c361STejun Heo 702*bba2c361STejun Heo return container_of(dsq_lnode, struct task_struct, scx.dsq_list); 703*bba2c361STejun Heo } 704*bba2c361STejun Heo 705*bba2c361STejun Heo #define nldsq_for_each_task(p, dsq) \ 706*bba2c361STejun Heo for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ 707*bba2c361STejun Heo (p) = nldsq_next_task((dsq), (p), false)) 708*bba2c361STejun Heo 709*bba2c361STejun Heo /** 710*bba2c361STejun Heo * nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ 711*bba2c361STejun Heo * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() 712*bba2c361STejun Heo * @dsq: non-local dsq being iterated 
713*bba2c361STejun Heo * 714*bba2c361STejun Heo * Find the next task in a cursor based iteration. The caller must have 715*bba2c361STejun Heo * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock 716*bba2c361STejun Heo * between the iteration steps. 717*bba2c361STejun Heo * 718*bba2c361STejun Heo * Only tasks which were queued before @cursor was initialized are visible. This 719*bba2c361STejun Heo * bounds the iteration and guarantees that vtime never jumps in the other 720*bba2c361STejun Heo * direction while iterating. 721*bba2c361STejun Heo */ 722*bba2c361STejun Heo static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor, 723*bba2c361STejun Heo struct scx_dispatch_q *dsq) 724*bba2c361STejun Heo { 725*bba2c361STejun Heo bool rev = cursor->flags & SCX_DSQ_ITER_REV; 726*bba2c361STejun Heo struct task_struct *p; 727*bba2c361STejun Heo 728*bba2c361STejun Heo lockdep_assert_held(&dsq->lock); 729*bba2c361STejun Heo BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR)); 730*bba2c361STejun Heo 731*bba2c361STejun Heo if (list_empty(&cursor->node)) 732*bba2c361STejun Heo p = NULL; 733*bba2c361STejun Heo else 734*bba2c361STejun Heo p = container_of(cursor, struct task_struct, scx.dsq_list); 735*bba2c361STejun Heo 736*bba2c361STejun Heo /* skip cursors and tasks that were queued after @cursor init */ 737*bba2c361STejun Heo do { 738*bba2c361STejun Heo p = nldsq_next_task(dsq, p, rev); 739*bba2c361STejun Heo } while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq))); 740*bba2c361STejun Heo 741*bba2c361STejun Heo if (p) { 742*bba2c361STejun Heo if (rev) 743*bba2c361STejun Heo list_move_tail(&cursor->node, &p->scx.dsq_list.node); 744*bba2c361STejun Heo else 745*bba2c361STejun Heo list_move(&cursor->node, &p->scx.dsq_list.node); 746*bba2c361STejun Heo } else { 747*bba2c361STejun Heo list_del_init(&cursor->node); 748*bba2c361STejun Heo } 749*bba2c361STejun Heo 750*bba2c361STejun Heo return p; 751*bba2c361STejun Heo } 
752*bba2c361STejun Heo 753*bba2c361STejun Heo /** 754*bba2c361STejun Heo * nldsq_cursor_lost_task - Test whether someone else took the task since iteration 755*bba2c361STejun Heo * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() 756*bba2c361STejun Heo * @rq: rq @p was on 757*bba2c361STejun Heo * @dsq: dsq @p was on 758*bba2c361STejun Heo * @p: target task 759*bba2c361STejun Heo * 760*bba2c361STejun Heo * @p is a task returned by nldsq_cursor_next_task(). The locks may have been 761*bba2c361STejun Heo * dropped and re-acquired in between. Verify that no one else took or is in the 762*bba2c361STejun Heo * process of taking @p from @dsq. 763*bba2c361STejun Heo * 764*bba2c361STejun Heo * On %false return, the caller can assume full ownership of @p. 765*bba2c361STejun Heo */ 766*bba2c361STejun Heo static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor, 767*bba2c361STejun Heo struct rq *rq, struct scx_dispatch_q *dsq, 768*bba2c361STejun Heo struct task_struct *p) 769*bba2c361STejun Heo { 770*bba2c361STejun Heo lockdep_assert_rq_held(rq); 771*bba2c361STejun Heo lockdep_assert_held(&dsq->lock); 772*bba2c361STejun Heo 773*bba2c361STejun Heo /* 774*bba2c361STejun Heo * @p could have already left @dsq, been re-enqueued, or be in the 775*bba2c361STejun Heo * process of being consumed by someone else. 776*bba2c361STejun Heo */ 777*bba2c361STejun Heo if (unlikely(p->scx.dsq != dsq || 778*bba2c361STejun Heo u32_before(cursor->priv, p->scx.dsq_seq) || 779*bba2c361STejun Heo p->scx.holding_cpu >= 0)) 780*bba2c361STejun Heo return true; 781*bba2c361STejun Heo 782*bba2c361STejun Heo /* if @p has stayed on @dsq, its rq couldn't have changed */ 783*bba2c361STejun Heo if (WARN_ON_ONCE(rq != task_rq(p))) 784*bba2c361STejun Heo return true; 785*bba2c361STejun Heo 786*bba2c361STejun Heo return false; 787*bba2c361STejun Heo } 788*bba2c361STejun Heo 789*bba2c361STejun Heo /* 790*bba2c361STejun Heo * BPF DSQ iterator.
Tasks in a non-local DSQ can be iterated in [reverse] 791*bba2c361STejun Heo * dispatch order. BPF-visible iterator is opaque and larger to allow future 792*bba2c361STejun Heo * changes without breaking backward compatibility. Can be used with 793*bba2c361STejun Heo * bpf_for_each(). See bpf_iter_scx_dsq_*(). 794*bba2c361STejun Heo */ 795*bba2c361STejun Heo struct bpf_iter_scx_dsq_kern { 796*bba2c361STejun Heo struct scx_dsq_list_node cursor; 797*bba2c361STejun Heo struct scx_dispatch_q *dsq; 798*bba2c361STejun Heo u64 slice; 799*bba2c361STejun Heo u64 vtime; 800*bba2c361STejun Heo } __attribute__((aligned(8))); 801*bba2c361STejun Heo 802*bba2c361STejun Heo struct bpf_iter_scx_dsq { 803*bba2c361STejun Heo u64 __opaque[6]; 804*bba2c361STejun Heo } __attribute__((aligned(8))); 805*bba2c361STejun Heo 806*bba2c361STejun Heo 807*bba2c361STejun Heo static u32 scx_get_task_state(const struct task_struct *p) 808*bba2c361STejun Heo { 809*bba2c361STejun Heo return p->scx.flags & SCX_TASK_STATE_MASK; 810*bba2c361STejun Heo } 811*bba2c361STejun Heo 812*bba2c361STejun Heo static void scx_set_task_state(struct task_struct *p, u32 state) 813*bba2c361STejun Heo { 814*bba2c361STejun Heo u32 prev_state = scx_get_task_state(p); 815*bba2c361STejun Heo bool warn = false; 816*bba2c361STejun Heo 817*bba2c361STejun Heo switch (state) { 818*bba2c361STejun Heo case SCX_TASK_NONE: 819*bba2c361STejun Heo warn = prev_state == SCX_TASK_DEAD; 820*bba2c361STejun Heo break; 821*bba2c361STejun Heo case SCX_TASK_INIT_BEGIN: 822*bba2c361STejun Heo warn = prev_state != SCX_TASK_NONE; 823*bba2c361STejun Heo break; 824*bba2c361STejun Heo case SCX_TASK_INIT: 825*bba2c361STejun Heo warn = prev_state != SCX_TASK_INIT_BEGIN; 826*bba2c361STejun Heo p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 827*bba2c361STejun Heo break; 828*bba2c361STejun Heo case SCX_TASK_READY: 829*bba2c361STejun Heo warn = !(prev_state == SCX_TASK_INIT || 830*bba2c361STejun Heo prev_state == SCX_TASK_ENABLED); 831*bba2c361STejun Heo 
break; 832*bba2c361STejun Heo case SCX_TASK_ENABLED: 833*bba2c361STejun Heo warn = prev_state != SCX_TASK_READY; 834*bba2c361STejun Heo break; 835*bba2c361STejun Heo case SCX_TASK_DEAD: 836*bba2c361STejun Heo warn = !(prev_state == SCX_TASK_NONE || 837*bba2c361STejun Heo prev_state == SCX_TASK_INIT_BEGIN); 838*bba2c361STejun Heo break; 839*bba2c361STejun Heo default: 840*bba2c361STejun Heo WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", 841*bba2c361STejun Heo prev_state, state, p->comm, p->pid); 842*bba2c361STejun Heo return; 843*bba2c361STejun Heo } 844*bba2c361STejun Heo 845*bba2c361STejun Heo WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", 846*bba2c361STejun Heo prev_state, state, p->comm, p->pid); 847*bba2c361STejun Heo 848*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_STATE_MASK; 849*bba2c361STejun Heo p->scx.flags |= state; 850*bba2c361STejun Heo } 851*bba2c361STejun Heo 852*bba2c361STejun Heo /* 853*bba2c361STejun Heo * SCX task iterator. 854*bba2c361STejun Heo */ 855*bba2c361STejun Heo struct scx_task_iter { 856*bba2c361STejun Heo struct sched_ext_entity cursor; 857*bba2c361STejun Heo struct task_struct *locked_task; 858*bba2c361STejun Heo struct rq *rq; 859*bba2c361STejun Heo struct rq_flags rf; 860*bba2c361STejun Heo u32 cnt; 861*bba2c361STejun Heo bool list_locked; 862*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 863*bba2c361STejun Heo struct cgroup *cgrp; 864*bba2c361STejun Heo struct cgroup_subsys_state *css_pos; 865*bba2c361STejun Heo struct css_task_iter css_iter; 866*bba2c361STejun Heo #endif 867*bba2c361STejun Heo }; 868*bba2c361STejun Heo 869*bba2c361STejun Heo /** 870*bba2c361STejun Heo * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration 871*bba2c361STejun Heo * @iter: iterator to init 872*bba2c361STejun Heo * @cgrp: Optional root of cgroup subhierarchy to iterate 873*bba2c361STejun Heo * 874*bba2c361STejun Heo * Initialize @iter. 
Once initialized, @iter must eventually be stopped with 875*bba2c361STejun Heo * scx_task_iter_stop(). 876*bba2c361STejun Heo * 877*bba2c361STejun Heo * If @cgrp is %NULL, scx_tasks is used for iteration and this function returns 878*bba2c361STejun Heo * with scx_tasks_lock held and @iter->cursor inserted into scx_tasks. 879*bba2c361STejun Heo * 880*bba2c361STejun Heo * If @cgrp is not %NULL, @cgrp and its descendants' tasks are walked using 881*bba2c361STejun Heo * @iter->css_iter. The caller must be holding cgroup_lock() to prevent cgroup 882*bba2c361STejun Heo * task migrations. 883*bba2c361STejun Heo * 884*bba2c361STejun Heo * The two modes of iterations are largely independent and it's likely that 885*bba2c361STejun Heo * scx_tasks can be removed in favor of always using cgroup iteration if 886*bba2c361STejun Heo * CONFIG_SCHED_CLASS_EXT depends on CONFIG_CGROUPS. 887*bba2c361STejun Heo * 888*bba2c361STejun Heo * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() 889*bba2c361STejun Heo * between this and the first next() call or between any two next() calls. If 890*bba2c361STejun Heo * the locks are released between two next() calls, the caller is responsible 891*bba2c361STejun Heo * for ensuring that the task being iterated remains accessible either through 892*bba2c361STejun Heo * RCU read lock or obtaining a reference count. 893*bba2c361STejun Heo * 894*bba2c361STejun Heo * All tasks which existed when the iteration started are guaranteed to be 895*bba2c361STejun Heo * visited as long as they are not dead. 
896*bba2c361STejun Heo */ 897*bba2c361STejun Heo static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) 898*bba2c361STejun Heo { 899*bba2c361STejun Heo memset(iter, 0, sizeof(*iter)); 900*bba2c361STejun Heo 901*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 902*bba2c361STejun Heo if (cgrp) { 903*bba2c361STejun Heo lockdep_assert_held(&cgroup_mutex); 904*bba2c361STejun Heo iter->cgrp = cgrp; 905*bba2c361STejun Heo iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self); 906*bba2c361STejun Heo css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, 907*bba2c361STejun Heo &iter->css_iter); 908*bba2c361STejun Heo return; 909*bba2c361STejun Heo } 910*bba2c361STejun Heo #endif 911*bba2c361STejun Heo raw_spin_lock_irq(&scx_tasks_lock); 912*bba2c361STejun Heo 913*bba2c361STejun Heo iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; 914*bba2c361STejun Heo list_add(&iter->cursor.tasks_node, &scx_tasks); 915*bba2c361STejun Heo iter->list_locked = true; 916*bba2c361STejun Heo } 917*bba2c361STejun Heo 918*bba2c361STejun Heo static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) 919*bba2c361STejun Heo { 920*bba2c361STejun Heo if (iter->locked_task) { 921*bba2c361STejun Heo __balance_callbacks(iter->rq, &iter->rf); 922*bba2c361STejun Heo task_rq_unlock(iter->rq, iter->locked_task, &iter->rf); 923*bba2c361STejun Heo iter->locked_task = NULL; 924*bba2c361STejun Heo } 925*bba2c361STejun Heo } 926*bba2c361STejun Heo 927*bba2c361STejun Heo /** 928*bba2c361STejun Heo * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator 929*bba2c361STejun Heo * @iter: iterator to unlock 930*bba2c361STejun Heo * 931*bba2c361STejun Heo * If @iter is in the middle of a locked iteration, it may be locking the rq of 932*bba2c361STejun Heo * the task currently being visited in addition to scx_tasks_lock. Unlock both. 933*bba2c361STejun Heo * This function can be safely called anytime during an iteration. 
The next 934*bba2c361STejun Heo * iterator operation will automatically restore the necessary locking. 935*bba2c361STejun Heo */ 936*bba2c361STejun Heo static void scx_task_iter_unlock(struct scx_task_iter *iter) 937*bba2c361STejun Heo { 938*bba2c361STejun Heo __scx_task_iter_rq_unlock(iter); 939*bba2c361STejun Heo if (iter->list_locked) { 940*bba2c361STejun Heo iter->list_locked = false; 941*bba2c361STejun Heo raw_spin_unlock_irq(&scx_tasks_lock); 942*bba2c361STejun Heo } 943*bba2c361STejun Heo } 944*bba2c361STejun Heo 945*bba2c361STejun Heo static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) 946*bba2c361STejun Heo { 947*bba2c361STejun Heo if (!iter->list_locked) { 948*bba2c361STejun Heo raw_spin_lock_irq(&scx_tasks_lock); 949*bba2c361STejun Heo iter->list_locked = true; 950*bba2c361STejun Heo } 951*bba2c361STejun Heo } 952*bba2c361STejun Heo 953*bba2c361STejun Heo /** 954*bba2c361STejun Heo * scx_task_iter_relock - Re-acquire scx_tasks_lock and, optionally, @p's rq 955*bba2c361STejun Heo * @iter: iterator to relock 956*bba2c361STejun Heo * @p: task whose rq to lock, or %NULL for scx_tasks_lock only 957*bba2c361STejun Heo * 958*bba2c361STejun Heo * Counterpart to scx_task_iter_unlock(). Locking @p's rq is optional. Once 959*bba2c361STejun Heo * re-acquired, both locks are managed by the iterator from here on. 
960*bba2c361STejun Heo */ 961*bba2c361STejun Heo static void scx_task_iter_relock(struct scx_task_iter *iter, 962*bba2c361STejun Heo struct task_struct *p) 963*bba2c361STejun Heo { 964*bba2c361STejun Heo __scx_task_iter_maybe_relock(iter); 965*bba2c361STejun Heo if (p) { 966*bba2c361STejun Heo iter->rq = task_rq_lock(p, &iter->rf); 967*bba2c361STejun Heo iter->locked_task = p; 968*bba2c361STejun Heo } 969*bba2c361STejun Heo } 970*bba2c361STejun Heo 971*bba2c361STejun Heo /** 972*bba2c361STejun Heo * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock 973*bba2c361STejun Heo * @iter: iterator to exit 974*bba2c361STejun Heo * 975*bba2c361STejun Heo * Exit a previously initialized @iter. scx_tasks_lock may or may not be held on 976*bba2c361STejun Heo * entry - it is re-acquired if needed to unlink @iter->cursor and is not held on return. If the iterator holds a task's rq lock, that rq 977*bba2c361STejun Heo * lock is also released. See scx_task_iter_start() for details. 978*bba2c361STejun Heo */ 979*bba2c361STejun Heo static void scx_task_iter_stop(struct scx_task_iter *iter) 980*bba2c361STejun Heo { 981*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 982*bba2c361STejun Heo if (iter->cgrp) { 983*bba2c361STejun Heo if (iter->css_pos) 984*bba2c361STejun Heo css_task_iter_end(&iter->css_iter); 985*bba2c361STejun Heo __scx_task_iter_rq_unlock(iter); 986*bba2c361STejun Heo return; 987*bba2c361STejun Heo } 988*bba2c361STejun Heo #endif 989*bba2c361STejun Heo __scx_task_iter_maybe_relock(iter); 990*bba2c361STejun Heo list_del_init(&iter->cursor.tasks_node); 991*bba2c361STejun Heo scx_task_iter_unlock(iter); 992*bba2c361STejun Heo } 993*bba2c361STejun Heo 994*bba2c361STejun Heo /** 995*bba2c361STejun Heo * scx_task_iter_next - Next task 996*bba2c361STejun Heo * @iter: iterator to walk 997*bba2c361STejun Heo * 998*bba2c361STejun Heo * Visit the next task. See scx_task_iter_start() for details.
Locks are dropped 999*bba2c361STejun Heo * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls 1000*bba2c361STejun Heo * by holding scx_tasks_lock for too long. 1001*bba2c361STejun Heo */ 1002*bba2c361STejun Heo static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) 1003*bba2c361STejun Heo { 1004*bba2c361STejun Heo struct list_head *cursor = &iter->cursor.tasks_node; 1005*bba2c361STejun Heo struct sched_ext_entity *pos; 1006*bba2c361STejun Heo 1007*bba2c361STejun Heo if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { 1008*bba2c361STejun Heo scx_task_iter_unlock(iter); 1009*bba2c361STejun Heo cond_resched(); 1010*bba2c361STejun Heo } 1011*bba2c361STejun Heo 1012*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 1013*bba2c361STejun Heo if (iter->cgrp) { 1014*bba2c361STejun Heo while (iter->css_pos) { 1015*bba2c361STejun Heo struct task_struct *p; 1016*bba2c361STejun Heo 1017*bba2c361STejun Heo p = css_task_iter_next(&iter->css_iter); 1018*bba2c361STejun Heo if (p) 1019*bba2c361STejun Heo return p; 1020*bba2c361STejun Heo 1021*bba2c361STejun Heo css_task_iter_end(&iter->css_iter); 1022*bba2c361STejun Heo iter->css_pos = css_next_descendant_pre(iter->css_pos, 1023*bba2c361STejun Heo &iter->cgrp->self); 1024*bba2c361STejun Heo if (iter->css_pos) 1025*bba2c361STejun Heo css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, 1026*bba2c361STejun Heo &iter->css_iter); 1027*bba2c361STejun Heo } 1028*bba2c361STejun Heo return NULL; 1029*bba2c361STejun Heo } 1030*bba2c361STejun Heo #endif 1031*bba2c361STejun Heo __scx_task_iter_maybe_relock(iter); 1032*bba2c361STejun Heo 1033*bba2c361STejun Heo list_for_each_entry(pos, cursor, tasks_node) { 1034*bba2c361STejun Heo if (&pos->tasks_node == &scx_tasks) 1035*bba2c361STejun Heo return NULL; 1036*bba2c361STejun Heo if (!(pos->flags & SCX_TASK_CURSOR)) { 1037*bba2c361STejun Heo list_move(cursor, &pos->tasks_node); 1038*bba2c361STejun Heo return container_of(pos, struct task_struct, scx); 
1039*bba2c361STejun Heo } 1040*bba2c361STejun Heo } 1041*bba2c361STejun Heo 1042*bba2c361STejun Heo /* can't happen, should always terminate at scx_tasks above */ 1043*bba2c361STejun Heo BUG(); 1044*bba2c361STejun Heo } 1045*bba2c361STejun Heo 1046*bba2c361STejun Heo /** 1047*bba2c361STejun Heo * scx_task_iter_next_locked - Next non-idle task with its rq locked 1048*bba2c361STejun Heo * @iter: iterator to walk 1049*bba2c361STejun Heo * 1050*bba2c361STejun Heo * Visit the next non-idle task with its rq lock held. Tasks which are already 1051*bba2c361STejun Heo * %SCX_TASK_DEAD are skipped. See scx_task_iter_start() 1052*bba2c361STejun Heo * for details. 1053*bba2c361STejun Heo */ 1054*bba2c361STejun Heo static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) 1055*bba2c361STejun Heo { 1056*bba2c361STejun Heo struct task_struct *p; 1057*bba2c361STejun Heo 1058*bba2c361STejun Heo __scx_task_iter_rq_unlock(iter); 1059*bba2c361STejun Heo 1060*bba2c361STejun Heo while ((p = scx_task_iter_next(iter))) { 1061*bba2c361STejun Heo /* 1062*bba2c361STejun Heo * scx_task_iter is used to prepare and move tasks into SCX 1063*bba2c361STejun Heo * while loading the BPF scheduler and vice-versa while 1064*bba2c361STejun Heo * unloading. The init_tasks ("swappers") should be excluded 1065*bba2c361STejun Heo * from the iteration because: 1066*bba2c361STejun Heo * 1067*bba2c361STejun Heo * - It's unsafe to use __setscheduler_prio() on an init_task to 1068*bba2c361STejun Heo * determine the sched_class to use as it won't preserve its 1069*bba2c361STejun Heo * idle_sched_class. 1070*bba2c361STejun Heo * 1071*bba2c361STejun Heo * - ops.init/exit_task() can easily be confused if called with 1072*bba2c361STejun Heo * init_tasks as they, e.g., share PID 0. 1073*bba2c361STejun Heo * 1074*bba2c361STejun Heo * As init_tasks are never scheduled through SCX, they can be 1075*bba2c361STejun Heo * skipped safely.
Note that is_idle_task() which tests %PF_IDLE 1076*bba2c361STejun Heo * doesn't work here: 1077*bba2c361STejun Heo * 1078*bba2c361STejun Heo * - %PF_IDLE may not be set for an init_task whose CPU hasn't 1079*bba2c361STejun Heo * yet been onlined. 1080*bba2c361STejun Heo * 1081*bba2c361STejun Heo * - %PF_IDLE can be set on tasks that are not init_tasks. See 1082*bba2c361STejun Heo * play_idle_precise() used by CONFIG_IDLE_INJECT. 1083*bba2c361STejun Heo * 1084*bba2c361STejun Heo * Test for idle_sched_class as only init_tasks are on it. 1085*bba2c361STejun Heo */ 1086*bba2c361STejun Heo if (p->sched_class == &idle_sched_class) 1087*bba2c361STejun Heo continue; 1088*bba2c361STejun Heo 1089*bba2c361STejun Heo iter->rq = task_rq_lock(p, &iter->rf); 1090*bba2c361STejun Heo iter->locked_task = p; 1091*bba2c361STejun Heo 1092*bba2c361STejun Heo /* 1093*bba2c361STejun Heo * cgroup_task_dead() removes the dead tasks from cset->tasks 1094*bba2c361STejun Heo * after sched_ext_dead() and cgroup iteration may see tasks 1095*bba2c361STejun Heo * which already finished sched_ext_dead(). %SCX_TASK_DEAD is 1096*bba2c361STejun Heo * set by sched_ext_dead() under @p's rq lock. Test it to 1097*bba2c361STejun Heo * avoid visiting tasks which are already dead from SCX POV.
1098*bba2c361STejun Heo */ 1099*bba2c361STejun Heo if (scx_get_task_state(p) == SCX_TASK_DEAD) { 1100*bba2c361STejun Heo __scx_task_iter_rq_unlock(iter); 1101*bba2c361STejun Heo continue; 1102*bba2c361STejun Heo } 1103*bba2c361STejun Heo 1104*bba2c361STejun Heo return p; 1105*bba2c361STejun Heo } 1106*bba2c361STejun Heo return NULL; 1107*bba2c361STejun Heo } 1108*bba2c361STejun Heo 1109*bba2c361STejun Heo /** 1110*bba2c361STejun Heo * scx_add_event - Increase an event counter for 'name' by 'cnt' 1111*bba2c361STejun Heo * @sch: scx_sched to account events for 1112*bba2c361STejun Heo * @name: an event name defined in struct scx_event_stats 1113*bba2c361STejun Heo * @cnt: the amount to add to the event counter 1114*bba2c361STejun Heo * 1115*bba2c361STejun Heo * This can be used when preemption is not disabled. 1116*bba2c361STejun Heo */ 1117*bba2c361STejun Heo #define scx_add_event(sch, name, cnt) do { \ 1118*bba2c361STejun Heo this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ 1119*bba2c361STejun Heo trace_sched_ext_event(#name, (cnt)); \ 1120*bba2c361STejun Heo } while(0) 1121*bba2c361STejun Heo 1122*bba2c361STejun Heo /** 1123*bba2c361STejun Heo * __scx_add_event - Increase an event counter for 'name' by 'cnt' 1124*bba2c361STejun Heo * @sch: scx_sched to account events for 1125*bba2c361STejun Heo * @name: an event name defined in struct scx_event_stats 1126*bba2c361STejun Heo * @cnt: the amount to add to the event counter 1127*bba2c361STejun Heo * 1128*bba2c361STejun Heo * This should be used only when preemption is disabled.
1129*bba2c361STejun Heo */ 1130*bba2c361STejun Heo #define __scx_add_event(sch, name, cnt) do { \ 1131*bba2c361STejun Heo __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ 1132*bba2c361STejun Heo trace_sched_ext_event(#name, cnt); \ 1133*bba2c361STejun Heo } while(0) 1134*bba2c361STejun Heo 1135*bba2c361STejun Heo /** 1136*bba2c361STejun Heo * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e' 1137*bba2c361STejun Heo * @dst_e: destination event stats 1138*bba2c361STejun Heo * @src_e: source event stats 1139*bba2c361STejun Heo * @kind: a kind of event to be aggregated 1140*bba2c361STejun Heo */ 1141*bba2c361STejun Heo #define scx_agg_event(dst_e, src_e, kind) do { \ 1142*bba2c361STejun Heo (dst_e)->kind += READ_ONCE((src_e)->kind); \ 1143*bba2c361STejun Heo } while(0) 1144*bba2c361STejun Heo 1145*bba2c361STejun Heo /** 1146*bba2c361STejun Heo * scx_dump_event - Dump an event 'kind' in 'events' to 's' 1147*bba2c361STejun Heo * @s: output seq_buf 1148*bba2c361STejun Heo * @events: event stats 1149*bba2c361STejun Heo * @kind: a kind of event to dump 1150*bba2c361STejun Heo */ 1151*bba2c361STejun Heo #define scx_dump_event(s, events, kind) do { \ 1152*bba2c361STejun Heo dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \ 1153*bba2c361STejun Heo } while (0) 1154*bba2c361STejun Heo 1155*bba2c361STejun Heo 1156*bba2c361STejun Heo static void scx_read_events(struct scx_sched *sch, 1157*bba2c361STejun Heo struct scx_event_stats *events); 1158*bba2c361STejun Heo 1159*bba2c361STejun Heo static enum scx_enable_state scx_enable_state(void) 1160*bba2c361STejun Heo { 1161*bba2c361STejun Heo return atomic_read(&scx_enable_state_var); 1162*bba2c361STejun Heo } 1163*bba2c361STejun Heo 1164*bba2c361STejun Heo static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to) 1165*bba2c361STejun Heo { 1166*bba2c361STejun Heo return atomic_xchg(&scx_enable_state_var, to); 1167*bba2c361STejun Heo } 1168*bba2c361STejun Heo 1169*bba2c361STejun 
Heo static bool scx_tryset_enable_state(enum scx_enable_state to, 1170*bba2c361STejun Heo enum scx_enable_state from) 1171*bba2c361STejun Heo { 1172*bba2c361STejun Heo int from_v = from; 1173*bba2c361STejun Heo 1174*bba2c361STejun Heo return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to); 1175*bba2c361STejun Heo } 1176*bba2c361STejun Heo 1177*bba2c361STejun Heo /** 1178*bba2c361STejun Heo * wait_ops_state - Busy-wait the specified ops state to end 1179*bba2c361STejun Heo * @p: target task 1180*bba2c361STejun Heo * @opss: state to wait the end of 1181*bba2c361STejun Heo * 1182*bba2c361STejun Heo * Busy-wait for @p to transition out of @opss. This can only be used when the 1183*bba2c361STejun Heo * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also 1184*bba2c361STejun Heo * has load_acquire semantics to ensure that the caller can see the updates made 1185*bba2c361STejun Heo * in the enqueueing and dispatching paths. 1186*bba2c361STejun Heo */ 1187*bba2c361STejun Heo static void wait_ops_state(struct task_struct *p, unsigned long opss) 1188*bba2c361STejun Heo { 1189*bba2c361STejun Heo do { 1190*bba2c361STejun Heo cpu_relax(); 1191*bba2c361STejun Heo } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); 1192*bba2c361STejun Heo } 1193*bba2c361STejun Heo 1194*bba2c361STejun Heo static inline bool __cpu_valid(s32 cpu) 1195*bba2c361STejun Heo { 1196*bba2c361STejun Heo return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); 1197*bba2c361STejun Heo } 1198*bba2c361STejun Heo 1199*bba2c361STejun Heo /** 1200*bba2c361STejun Heo * scx_cpu_valid - Verify a cpu number, to be used on ops input args 1201*bba2c361STejun Heo * @sch: scx_sched to abort on error 1202*bba2c361STejun Heo * @cpu: cpu number which came from a BPF ops 1203*bba2c361STejun Heo * @where: extra information reported on error 1204*bba2c361STejun Heo * 1205*bba2c361STejun Heo * @cpu is a cpu number which came from the BPF scheduler and can be any value. 
1206*bba2c361STejun Heo * Verify that it is in range and one of the possible cpus. If invalid, trigger 1207*bba2c361STejun Heo * an ops error. 1208*bba2c361STejun Heo */ 1209*bba2c361STejun Heo bool scx_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) 1210*bba2c361STejun Heo { 1211*bba2c361STejun Heo if (__cpu_valid(cpu)) { 1212*bba2c361STejun Heo return true; 1213*bba2c361STejun Heo } else { 1214*bba2c361STejun Heo scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); 1215*bba2c361STejun Heo return false; 1216*bba2c361STejun Heo } 1217*bba2c361STejun Heo } 1218*bba2c361STejun Heo 1219*bba2c361STejun Heo /** 1220*bba2c361STejun Heo * ops_sanitize_err - Sanitize a -errno value 1221*bba2c361STejun Heo * @sch: scx_sched to error out on error 1222*bba2c361STejun Heo * @ops_name: operation to blame on failure 1223*bba2c361STejun Heo * @err: -errno value to sanitize 1224*bba2c361STejun Heo * 1225*bba2c361STejun Heo * Verify @err is a valid -errno. If not, trigger scx_error() and return 1226*bba2c361STejun Heo * -%EPROTO. This is necessary because returning a rogue -errno up the chain can 1227*bba2c361STejun Heo * cause misbehavior. For example, a large negative return from 1228*bba2c361STejun Heo * ops.init_task() triggers an oops when passed up the call chain because the 1229*bba2c361STejun Heo * value fails the IS_ERR() test after being encoded with ERR_PTR() and then is 1230*bba2c361STejun Heo * handled as a pointer.
1231*bba2c361STejun Heo */ 1232*bba2c361STejun Heo static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err) 1233*bba2c361STejun Heo { 1234*bba2c361STejun Heo if (err < 0 && err >= -MAX_ERRNO) 1235*bba2c361STejun Heo return err; 1236*bba2c361STejun Heo 1237*bba2c361STejun Heo scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err); 1238*bba2c361STejun Heo return -EPROTO; 1239*bba2c361STejun Heo } 1240*bba2c361STejun Heo 1241*bba2c361STejun Heo static void deferred_bal_cb_workfn(struct rq *rq) 1242*bba2c361STejun Heo { 1243*bba2c361STejun Heo run_deferred(rq); 1244*bba2c361STejun Heo } 1245*bba2c361STejun Heo 1246*bba2c361STejun Heo static void deferred_irq_workfn(struct irq_work *irq_work) 1247*bba2c361STejun Heo { 1248*bba2c361STejun Heo struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work); 1249*bba2c361STejun Heo 1250*bba2c361STejun Heo raw_spin_rq_lock(rq); 1251*bba2c361STejun Heo run_deferred(rq); 1252*bba2c361STejun Heo raw_spin_rq_unlock(rq); 1253*bba2c361STejun Heo } 1254*bba2c361STejun Heo 1255*bba2c361STejun Heo /** 1256*bba2c361STejun Heo * schedule_deferred - Schedule execution of deferred actions on an rq 1257*bba2c361STejun Heo * @rq: target rq 1258*bba2c361STejun Heo * 1259*bba2c361STejun Heo * Schedule execution of deferred actions on @rq. Deferred actions are executed 1260*bba2c361STejun Heo * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks 1261*bba2c361STejun Heo * to other rqs. 1262*bba2c361STejun Heo */ 1263*bba2c361STejun Heo static void schedule_deferred(struct rq *rq) 1264*bba2c361STejun Heo { 1265*bba2c361STejun Heo /* 1266*bba2c361STejun Heo * This is the fallback when schedule_deferred_locked() can't use 1267*bba2c361STejun Heo * the cheaper balance callback or wakeup hook paths (the target 1268*bba2c361STejun Heo * CPU is not in balance or wakeup). Currently, this is primarily 1269*bba2c361STejun Heo * hit by reenqueue operations targeting a remote CPU. 
1270*bba2c361STejun Heo * 1271*bba2c361STejun Heo * Queue on the target CPU. The deferred work can run from any CPU 1272*bba2c361STejun Heo * correctly - the _locked() path already processes remote rqs from 1273*bba2c361STejun Heo * the calling CPU - but targeting the owning CPU allows IPI delivery 1274*bba2c361STejun Heo * without waiting for the calling CPU to re-enable IRQs and is 1275*bba2c361STejun Heo * cheaper as the reenqueue runs locally. 1276*bba2c361STejun Heo */ 1277*bba2c361STejun Heo irq_work_queue_on(&rq->scx.deferred_irq_work, cpu_of(rq)); 1278*bba2c361STejun Heo } 1279*bba2c361STejun Heo 1280*bba2c361STejun Heo /** 1281*bba2c361STejun Heo * schedule_deferred_locked - Schedule execution of deferred actions on an rq 1282*bba2c361STejun Heo * @rq: target rq 1283*bba2c361STejun Heo * 1284*bba2c361STejun Heo * Schedule execution of deferred actions on @rq. Equivalent to 1285*bba2c361STejun Heo * schedule_deferred() but requires @rq to be locked and can be more efficient. 1286*bba2c361STejun Heo */ 1287*bba2c361STejun Heo static void schedule_deferred_locked(struct rq *rq) 1288*bba2c361STejun Heo { 1289*bba2c361STejun Heo lockdep_assert_rq_held(rq); 1290*bba2c361STejun Heo 1291*bba2c361STejun Heo /* 1292*bba2c361STejun Heo * If in the middle of waking up a task, task_woken_scx() will be called 1293*bba2c361STejun Heo * afterwards which will then run the deferred actions, no need to 1294*bba2c361STejun Heo * schedule anything. 1295*bba2c361STejun Heo */ 1296*bba2c361STejun Heo if (rq->scx.flags & SCX_RQ_IN_WAKEUP) 1297*bba2c361STejun Heo return; 1298*bba2c361STejun Heo 1299*bba2c361STejun Heo /* Don't do anything if there already is a deferred operation. */ 1300*bba2c361STejun Heo if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING) 1301*bba2c361STejun Heo return; 1302*bba2c361STejun Heo 1303*bba2c361STejun Heo /* 1304*bba2c361STejun Heo * If in balance, the balance callbacks will be called before rq lock is 1305*bba2c361STejun Heo * released. Schedule one. 
1306*bba2c361STejun Heo * 1307*bba2c361STejun Heo * 1308*bba2c361STejun Heo * We can't directly insert the callback into the 1309*bba2c361STejun Heo * rq's list: The call can drop its lock and make the pending balance 1310*bba2c361STejun Heo * callback visible to unrelated code paths that call rq_pin_lock(). 1311*bba2c361STejun Heo * 1312*bba2c361STejun Heo * Just let balance_one() know that it must do it itself. 1313*bba2c361STejun Heo */ 1314*bba2c361STejun Heo if (rq->scx.flags & SCX_RQ_IN_BALANCE) { 1315*bba2c361STejun Heo rq->scx.flags |= SCX_RQ_BAL_CB_PENDING; 1316*bba2c361STejun Heo return; 1317*bba2c361STejun Heo } 1318*bba2c361STejun Heo 1319*bba2c361STejun Heo /* 1320*bba2c361STejun Heo * No scheduler hooks available. Use the generic irq_work path. The 1321*bba2c361STejun Heo * above WAKEUP and BALANCE paths should cover most of the cases and the 1322*bba2c361STejun Heo * time to IRQ re-enable shouldn't be long. 1323*bba2c361STejun Heo */ 1324*bba2c361STejun Heo schedule_deferred(rq); 1325*bba2c361STejun Heo } 1326*bba2c361STejun Heo 1327*bba2c361STejun Heo static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq, 1328*bba2c361STejun Heo u64 reenq_flags, struct rq *locked_rq) 1329*bba2c361STejun Heo { 1330*bba2c361STejun Heo struct rq *rq; 1331*bba2c361STejun Heo 1332*bba2c361STejun Heo /* 1333*bba2c361STejun Heo * Allowing reenqueues doesn't make sense while bypassing. This also 1334*bba2c361STejun Heo * blocks from new reenqueues to be scheduled on dead scheds. 
1335*bba2c361STejun Heo */ 1336*bba2c361STejun Heo if (unlikely(READ_ONCE(sch->bypass_depth))) 1337*bba2c361STejun Heo return; 1338*bba2c361STejun Heo 1339*bba2c361STejun Heo if (dsq->id == SCX_DSQ_LOCAL) { 1340*bba2c361STejun Heo rq = container_of(dsq, struct rq, scx.local_dsq); 1341*bba2c361STejun Heo 1342*bba2c361STejun Heo struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq)); 1343*bba2c361STejun Heo struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local; 1344*bba2c361STejun Heo 1345*bba2c361STejun Heo /* 1346*bba2c361STejun Heo * Pairs with smp_mb() in process_deferred_reenq_locals() and 1347*bba2c361STejun Heo * guarantees that there is a reenq_local() afterwards. 1348*bba2c361STejun Heo */ 1349*bba2c361STejun Heo smp_mb(); 1350*bba2c361STejun Heo 1351*bba2c361STejun Heo if (list_empty(&drl->node) || 1352*bba2c361STejun Heo (READ_ONCE(drl->flags) & reenq_flags) != reenq_flags) { 1353*bba2c361STejun Heo 1354*bba2c361STejun Heo guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); 1355*bba2c361STejun Heo 1356*bba2c361STejun Heo if (list_empty(&drl->node)) 1357*bba2c361STejun Heo list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals); 1358*bba2c361STejun Heo WRITE_ONCE(drl->flags, drl->flags | reenq_flags); 1359*bba2c361STejun Heo } 1360*bba2c361STejun Heo } else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) { 1361*bba2c361STejun Heo rq = this_rq(); 1362*bba2c361STejun Heo 1363*bba2c361STejun Heo struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq)); 1364*bba2c361STejun Heo struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user; 1365*bba2c361STejun Heo 1366*bba2c361STejun Heo /* 1367*bba2c361STejun Heo * Pairs with smp_mb() in process_deferred_reenq_users() and 1368*bba2c361STejun Heo * guarantees that there is a reenq_user() afterwards. 
1369*bba2c361STejun Heo */ 1370*bba2c361STejun Heo smp_mb(); 1371*bba2c361STejun Heo 1372*bba2c361STejun Heo if (list_empty(&dru->node) || 1373*bba2c361STejun Heo (READ_ONCE(dru->flags) & reenq_flags) != reenq_flags) { 1374*bba2c361STejun Heo 1375*bba2c361STejun Heo guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); 1376*bba2c361STejun Heo 1377*bba2c361STejun Heo if (list_empty(&dru->node)) 1378*bba2c361STejun Heo list_move_tail(&dru->node, &rq->scx.deferred_reenq_users); 1379*bba2c361STejun Heo WRITE_ONCE(dru->flags, dru->flags | reenq_flags); 1380*bba2c361STejun Heo } 1381*bba2c361STejun Heo } else { 1382*bba2c361STejun Heo scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id); 1383*bba2c361STejun Heo return; 1384*bba2c361STejun Heo } 1385*bba2c361STejun Heo 1386*bba2c361STejun Heo if (rq == locked_rq) 1387*bba2c361STejun Heo schedule_deferred_locked(rq); 1388*bba2c361STejun Heo else 1389*bba2c361STejun Heo schedule_deferred(rq); 1390*bba2c361STejun Heo } 1391*bba2c361STejun Heo 1392*bba2c361STejun Heo static void schedule_reenq_local(struct rq *rq, u64 reenq_flags) 1393*bba2c361STejun Heo { 1394*bba2c361STejun Heo struct scx_sched *root = rcu_dereference_sched(scx_root); 1395*bba2c361STejun Heo 1396*bba2c361STejun Heo if (WARN_ON_ONCE(!root)) 1397*bba2c361STejun Heo return; 1398*bba2c361STejun Heo 1399*bba2c361STejun Heo schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags, rq); 1400*bba2c361STejun Heo } 1401*bba2c361STejun Heo 1402*bba2c361STejun Heo /** 1403*bba2c361STejun Heo * touch_core_sched - Update timestamp used for core-sched task ordering 1404*bba2c361STejun Heo * @rq: rq to read clock from, must be locked 1405*bba2c361STejun Heo * @p: task to update the timestamp for 1406*bba2c361STejun Heo * 1407*bba2c361STejun Heo * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to 1408*bba2c361STejun Heo * implement global or local-DSQ FIFO ordering for core-sched. 
Should be called 1409*bba2c361STejun Heo * when a task becomes runnable and its turn on the CPU ends (e.g. slice 1410*bba2c361STejun Heo * exhaustion). 1411*bba2c361STejun Heo */ 1412*bba2c361STejun Heo static void touch_core_sched(struct rq *rq, struct task_struct *p) 1413*bba2c361STejun Heo { 1414*bba2c361STejun Heo lockdep_assert_rq_held(rq); 1415*bba2c361STejun Heo 1416*bba2c361STejun Heo #ifdef CONFIG_SCHED_CORE 1417*bba2c361STejun Heo /* 1418*bba2c361STejun Heo * It's okay to update the timestamp spuriously. Use 1419*bba2c361STejun Heo * sched_core_disabled() which is cheaper than enabled(). 1420*bba2c361STejun Heo * 1421*bba2c361STejun Heo * As this is used to determine ordering between tasks of sibling CPUs, 1422*bba2c361STejun Heo * it may be better to use per-core dispatch sequence instead. 1423*bba2c361STejun Heo */ 1424*bba2c361STejun Heo if (!sched_core_disabled()) 1425*bba2c361STejun Heo p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq)); 1426*bba2c361STejun Heo #endif 1427*bba2c361STejun Heo } 1428*bba2c361STejun Heo 1429*bba2c361STejun Heo /** 1430*bba2c361STejun Heo * touch_core_sched_dispatch - Update core-sched timestamp on dispatch 1431*bba2c361STejun Heo * @rq: rq to read clock from, must be locked 1432*bba2c361STejun Heo * @p: task being dispatched 1433*bba2c361STejun Heo * 1434*bba2c361STejun Heo * If the BPF scheduler implements custom core-sched ordering via 1435*bba2c361STejun Heo * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO 1436*bba2c361STejun Heo * ordering within each local DSQ. This function is called from dispatch paths 1437*bba2c361STejun Heo * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. 
1438*bba2c361STejun Heo */ 1439*bba2c361STejun Heo static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) 1440*bba2c361STejun Heo { 1441*bba2c361STejun Heo lockdep_assert_rq_held(rq); 1442*bba2c361STejun Heo 1443*bba2c361STejun Heo #ifdef CONFIG_SCHED_CORE 1444*bba2c361STejun Heo if (unlikely(SCX_HAS_OP(scx_root, core_sched_before))) 1445*bba2c361STejun Heo touch_core_sched(rq, p); 1446*bba2c361STejun Heo #endif 1447*bba2c361STejun Heo } 1448*bba2c361STejun Heo 1449*bba2c361STejun Heo static void update_curr_scx(struct rq *rq) 1450*bba2c361STejun Heo { 1451*bba2c361STejun Heo struct task_struct *curr = rq->curr; 1452*bba2c361STejun Heo s64 delta_exec; 1453*bba2c361STejun Heo 1454*bba2c361STejun Heo delta_exec = update_curr_common(rq); 1455*bba2c361STejun Heo if (unlikely(delta_exec <= 0)) 1456*bba2c361STejun Heo return; 1457*bba2c361STejun Heo 1458*bba2c361STejun Heo if (curr->scx.slice != SCX_SLICE_INF) { 1459*bba2c361STejun Heo curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec); 1460*bba2c361STejun Heo if (!curr->scx.slice) 1461*bba2c361STejun Heo touch_core_sched(rq, curr); 1462*bba2c361STejun Heo } 1463*bba2c361STejun Heo 1464*bba2c361STejun Heo dl_server_update(&rq->ext_server, delta_exec); 1465*bba2c361STejun Heo } 1466*bba2c361STejun Heo 1467*bba2c361STejun Heo static bool scx_dsq_priq_less(struct rb_node *node_a, 1468*bba2c361STejun Heo const struct rb_node *node_b) 1469*bba2c361STejun Heo { 1470*bba2c361STejun Heo const struct task_struct *a = 1471*bba2c361STejun Heo container_of(node_a, struct task_struct, scx.dsq_priq); 1472*bba2c361STejun Heo const struct task_struct *b = 1473*bba2c361STejun Heo container_of(node_b, struct task_struct, scx.dsq_priq); 1474*bba2c361STejun Heo 1475*bba2c361STejun Heo return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); 1476*bba2c361STejun Heo } 1477*bba2c361STejun Heo 1478*bba2c361STejun Heo static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) 
1479*bba2c361STejun Heo { 1480*bba2c361STejun Heo /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ 1481*bba2c361STejun Heo WRITE_ONCE(dsq->nr, dsq->nr + 1); 1482*bba2c361STejun Heo 1483*bba2c361STejun Heo /* 1484*bba2c361STejun Heo * Once @p reaches a local DSQ, it can only leave it by being dispatched 1485*bba2c361STejun Heo * to the CPU or dequeued. In both cases, the only way @p can go back to 1486*bba2c361STejun Heo * the BPF sched is through enqueueing. If being inserted into a local 1487*bba2c361STejun Heo * DSQ with IMMED, persist the state until the next enqueueing event in 1488*bba2c361STejun Heo * do_enqueue_task() so that we can maintain IMMED protection through 1489*bba2c361STejun Heo * e.g. SAVE/RESTORE cycles and slice extensions. 1490*bba2c361STejun Heo */ 1491*bba2c361STejun Heo if (enq_flags & SCX_ENQ_IMMED) { 1492*bba2c361STejun Heo if (unlikely(dsq->id != SCX_DSQ_LOCAL)) { 1493*bba2c361STejun Heo WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK)); 1494*bba2c361STejun Heo return; 1495*bba2c361STejun Heo } 1496*bba2c361STejun Heo p->scx.flags |= SCX_TASK_IMMED; 1497*bba2c361STejun Heo } 1498*bba2c361STejun Heo 1499*bba2c361STejun Heo if (p->scx.flags & SCX_TASK_IMMED) { 1500*bba2c361STejun Heo struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 1501*bba2c361STejun Heo 1502*bba2c361STejun Heo if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) 1503*bba2c361STejun Heo return; 1504*bba2c361STejun Heo 1505*bba2c361STejun Heo rq->scx.nr_immed++; 1506*bba2c361STejun Heo 1507*bba2c361STejun Heo /* 1508*bba2c361STejun Heo * If @rq already had other tasks or the current task is not 1509*bba2c361STejun Heo * done yet, @p can't go on the CPU immediately. Re-enqueue. 
1510*bba2c361STejun Heo */ 1511*bba2c361STejun Heo if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags))) 1512*bba2c361STejun Heo schedule_reenq_local(rq, 0); 1513*bba2c361STejun Heo } 1514*bba2c361STejun Heo } 1515*bba2c361STejun Heo 1516*bba2c361STejun Heo static void dsq_dec_nr(struct scx_dispatch_q *dsq, struct task_struct *p) 1517*bba2c361STejun Heo { 1518*bba2c361STejun Heo /* see dsq_inc_nr() */ 1519*bba2c361STejun Heo WRITE_ONCE(dsq->nr, dsq->nr - 1); 1520*bba2c361STejun Heo 1521*bba2c361STejun Heo if (p->scx.flags & SCX_TASK_IMMED) { 1522*bba2c361STejun Heo struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 1523*bba2c361STejun Heo 1524*bba2c361STejun Heo if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) || 1525*bba2c361STejun Heo WARN_ON_ONCE(rq->scx.nr_immed <= 0)) 1526*bba2c361STejun Heo return; 1527*bba2c361STejun Heo 1528*bba2c361STejun Heo rq->scx.nr_immed--; 1529*bba2c361STejun Heo } 1530*bba2c361STejun Heo } 1531*bba2c361STejun Heo 1532*bba2c361STejun Heo static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) 1533*bba2c361STejun Heo { 1534*bba2c361STejun Heo p->scx.slice = READ_ONCE(sch->slice_dfl); 1535*bba2c361STejun Heo __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); 1536*bba2c361STejun Heo } 1537*bba2c361STejun Heo 1538*bba2c361STejun Heo /* 1539*bba2c361STejun Heo * Return true if @p is moving due to an internal SCX migration, false 1540*bba2c361STejun Heo * otherwise. 1541*bba2c361STejun Heo */ 1542*bba2c361STejun Heo static inline bool task_scx_migrating(struct task_struct *p) 1543*bba2c361STejun Heo { 1544*bba2c361STejun Heo /* 1545*bba2c361STejun Heo * We only need to check sticky_cpu: it is set to the destination 1546*bba2c361STejun Heo * CPU in move_remote_task_to_local_dsq() before deactivate_task() 1547*bba2c361STejun Heo * and cleared when the task is enqueued on the destination, so it 1548*bba2c361STejun Heo * is only non-negative during an internal SCX migration. 
1549*bba2c361STejun Heo */ 1550*bba2c361STejun Heo return p->scx.sticky_cpu >= 0; 1551*bba2c361STejun Heo } 1552*bba2c361STejun Heo 1553*bba2c361STejun Heo /* 1554*bba2c361STejun Heo * Call ops.dequeue() if the task is in BPF custody and not migrating. 1555*bba2c361STejun Heo * Clears %SCX_TASK_IN_CUSTODY when the callback is invoked. 1556*bba2c361STejun Heo */ 1557*bba2c361STejun Heo static void call_task_dequeue(struct scx_sched *sch, struct rq *rq, 1558*bba2c361STejun Heo struct task_struct *p, u64 deq_flags) 1559*bba2c361STejun Heo { 1560*bba2c361STejun Heo if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p)) 1561*bba2c361STejun Heo return; 1562*bba2c361STejun Heo 1563*bba2c361STejun Heo if (SCX_HAS_OP(sch, dequeue)) 1564*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, dequeue, rq, p, deq_flags); 1565*bba2c361STejun Heo 1566*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_IN_CUSTODY; 1567*bba2c361STejun Heo } 1568*bba2c361STejun Heo 1569*bba2c361STejun Heo static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq, 1570*bba2c361STejun Heo struct task_struct *p, u64 enq_flags) 1571*bba2c361STejun Heo { 1572*bba2c361STejun Heo struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); 1573*bba2c361STejun Heo 1574*bba2c361STejun Heo call_task_dequeue(sch, rq, p, 0); 1575*bba2c361STejun Heo 1576*bba2c361STejun Heo /* 1577*bba2c361STejun Heo * Note that @rq's lock may be dropped between this enqueue and @p 1578*bba2c361STejun Heo * actually getting on CPU. This gives higher-class tasks (e.g. RT) 1579*bba2c361STejun Heo * an opportunity to wake up on @rq and prevent @p from running. 
1580*bba2c361STejun Heo * Here are some concrete examples: 1581*bba2c361STejun Heo * 1582*bba2c361STejun Heo * Example 1: 1583*bba2c361STejun Heo * 1584*bba2c361STejun Heo * We dispatch two tasks from a single ops.dispatch(): 1585*bba2c361STejun Heo * - First, a local task to this CPU's local DSQ; 1586*bba2c361STejun Heo * - Second, a local/remote task to a remote CPU's local DSQ. 1587*bba2c361STejun Heo * We must drop the local rq lock in order to finish the second 1588*bba2c361STejun Heo * dispatch. In that time, an RT task can wake up on the local rq. 1589*bba2c361STejun Heo * 1590*bba2c361STejun Heo * Example 2: 1591*bba2c361STejun Heo * 1592*bba2c361STejun Heo * We dispatch a local/remote task to a remote CPU's local DSQ. 1593*bba2c361STejun Heo * We must drop the remote rq lock before the dispatched task can run, 1594*bba2c361STejun Heo * which gives an RT task an opportunity to wake up on the remote rq. 1595*bba2c361STejun Heo * 1596*bba2c361STejun Heo * Both examples work the same if we replace dispatching with moving 1597*bba2c361STejun Heo * the tasks from a user-created DSQ. 1598*bba2c361STejun Heo * 1599*bba2c361STejun Heo * We must detect these wakeups so that we can re-enqueue IMMED tasks 1600*bba2c361STejun Heo * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this 1601*bba2c361STejun Heo * purpose, but for it to be invoked, we must ensure that we bump 1602*bba2c361STejun Heo * @rq->next_class to &ext_sched_class if it's currently idle. 1603*bba2c361STejun Heo * 1604*bba2c361STejun Heo * wakeup_preempt() does the bumping, and since we only invoke it if 1605*bba2c361STejun Heo * @rq->next_class is below &ext_sched_class, it will also 1606*bba2c361STejun Heo * resched_curr(rq). 
1607*bba2c361STejun Heo */ 1608*bba2c361STejun Heo if (sched_class_above(p->sched_class, rq->next_class)) 1609*bba2c361STejun Heo wakeup_preempt(rq, p, 0); 1610*bba2c361STejun Heo 1611*bba2c361STejun Heo /* 1612*bba2c361STejun Heo * If @rq is in balance, the CPU is already vacant and looking for the 1613*bba2c361STejun Heo * next task to run. No need to preempt or trigger resched after moving 1614*bba2c361STejun Heo * @p into its local DSQ. 1615*bba2c361STejun Heo * Note that the wakeup_preempt() above may have already triggered 1616*bba2c361STejun Heo * a resched if @rq->next_class was idle. It's harmless, since 1617*bba2c361STejun Heo * need_resched is cleared immediately after task pick. 1618*bba2c361STejun Heo */ 1619*bba2c361STejun Heo if (rq->scx.flags & SCX_RQ_IN_BALANCE) 1620*bba2c361STejun Heo return; 1621*bba2c361STejun Heo 1622*bba2c361STejun Heo if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && 1623*bba2c361STejun Heo rq->curr->sched_class == &ext_sched_class) { 1624*bba2c361STejun Heo rq->curr->scx.slice = 0; 1625*bba2c361STejun Heo resched_curr(rq); 1626*bba2c361STejun Heo } 1627*bba2c361STejun Heo } 1628*bba2c361STejun Heo 1629*bba2c361STejun Heo static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, 1630*bba2c361STejun Heo struct scx_dispatch_q *dsq, struct task_struct *p, 1631*bba2c361STejun Heo u64 enq_flags) 1632*bba2c361STejun Heo { 1633*bba2c361STejun Heo bool is_local = dsq->id == SCX_DSQ_LOCAL; 1634*bba2c361STejun Heo 1635*bba2c361STejun Heo WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); 1636*bba2c361STejun Heo WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || 1637*bba2c361STejun Heo !RB_EMPTY_NODE(&p->scx.dsq_priq)); 1638*bba2c361STejun Heo 1639*bba2c361STejun Heo if (!is_local) { 1640*bba2c361STejun Heo raw_spin_lock_nested(&dsq->lock, 1641*bba2c361STejun Heo (enq_flags & SCX_ENQ_NESTED) ? 
SINGLE_DEPTH_NESTING : 0); 1642*bba2c361STejun Heo 1643*bba2c361STejun Heo if (unlikely(dsq->id == SCX_DSQ_INVALID)) { 1644*bba2c361STejun Heo scx_error(sch, "attempting to dispatch to a destroyed dsq"); 1645*bba2c361STejun Heo /* fall back to the global dsq */ 1646*bba2c361STejun Heo raw_spin_unlock(&dsq->lock); 1647*bba2c361STejun Heo dsq = find_global_dsq(sch, task_cpu(p)); 1648*bba2c361STejun Heo raw_spin_lock(&dsq->lock); 1649*bba2c361STejun Heo } 1650*bba2c361STejun Heo } 1651*bba2c361STejun Heo 1652*bba2c361STejun Heo if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && 1653*bba2c361STejun Heo (enq_flags & SCX_ENQ_DSQ_PRIQ))) { 1654*bba2c361STejun Heo /* 1655*bba2c361STejun Heo * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from 1656*bba2c361STejun Heo * their FIFO queues. To avoid confusion and accidentally 1657*bba2c361STejun Heo * starving vtime-dispatched tasks by FIFO-dispatched tasks, we 1658*bba2c361STejun Heo * disallow any internal DSQ from doing vtime ordering of 1659*bba2c361STejun Heo * tasks. 1660*bba2c361STejun Heo */ 1661*bba2c361STejun Heo scx_error(sch, "cannot use vtime ordering for built-in DSQs"); 1662*bba2c361STejun Heo enq_flags &= ~SCX_ENQ_DSQ_PRIQ; 1663*bba2c361STejun Heo } 1664*bba2c361STejun Heo 1665*bba2c361STejun Heo if (enq_flags & SCX_ENQ_DSQ_PRIQ) { 1666*bba2c361STejun Heo struct rb_node *rbp; 1667*bba2c361STejun Heo 1668*bba2c361STejun Heo /* 1669*bba2c361STejun Heo * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are 1670*bba2c361STejun Heo * linked to both the rbtree and list on PRIQs, this can only be 1671*bba2c361STejun Heo * tested easily when adding the first task. 
1672*bba2c361STejun Heo */ 1673*bba2c361STejun Heo if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && 1674*bba2c361STejun Heo nldsq_next_task(dsq, NULL, false))) 1675*bba2c361STejun Heo scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks", 1676*bba2c361STejun Heo dsq->id); 1677*bba2c361STejun Heo 1678*bba2c361STejun Heo p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; 1679*bba2c361STejun Heo rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less); 1680*bba2c361STejun Heo 1681*bba2c361STejun Heo /* 1682*bba2c361STejun Heo * Find the previous task and insert after it on the list so 1683*bba2c361STejun Heo * that @dsq->list is vtime ordered. 1684*bba2c361STejun Heo */ 1685*bba2c361STejun Heo rbp = rb_prev(&p->scx.dsq_priq); 1686*bba2c361STejun Heo if (rbp) { 1687*bba2c361STejun Heo struct task_struct *prev = 1688*bba2c361STejun Heo container_of(rbp, struct task_struct, 1689*bba2c361STejun Heo scx.dsq_priq); 1690*bba2c361STejun Heo list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); 1691*bba2c361STejun Heo /* first task unchanged - no update needed */ 1692*bba2c361STejun Heo } else { 1693*bba2c361STejun Heo list_add(&p->scx.dsq_list.node, &dsq->list); 1694*bba2c361STejun Heo /* not builtin and new task is at head - use fastpath */ 1695*bba2c361STejun Heo rcu_assign_pointer(dsq->first_task, p); 1696*bba2c361STejun Heo } 1697*bba2c361STejun Heo } else { 1698*bba2c361STejun Heo /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ 1699*bba2c361STejun Heo if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) 1700*bba2c361STejun Heo scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", 1701*bba2c361STejun Heo dsq->id); 1702*bba2c361STejun Heo 1703*bba2c361STejun Heo if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) { 1704*bba2c361STejun Heo list_add(&p->scx.dsq_list.node, &dsq->list); 1705*bba2c361STejun Heo /* new task inserted at head - use fastpath */ 1706*bba2c361STejun Heo if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) 1707*bba2c361STejun Heo 
rcu_assign_pointer(dsq->first_task, p); 1708*bba2c361STejun Heo } else { 1709*bba2c361STejun Heo /* 1710*bba2c361STejun Heo * dsq->list can contain parked BPF iterator cursors, so 1711*bba2c361STejun Heo * list_empty() here isn't a reliable proxy for "no real 1712*bba2c361STejun Heo * task in the DSQ". Test dsq->first_task directly. 1713*bba2c361STejun Heo */ 1714*bba2c361STejun Heo list_add_tail(&p->scx.dsq_list.node, &dsq->list); 1715*bba2c361STejun Heo if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) 1716*bba2c361STejun Heo rcu_assign_pointer(dsq->first_task, p); 1717*bba2c361STejun Heo } 1718*bba2c361STejun Heo } 1719*bba2c361STejun Heo 1720*bba2c361STejun Heo /* seq records the order tasks are queued, used by BPF DSQ iterator */ 1721*bba2c361STejun Heo WRITE_ONCE(dsq->seq, dsq->seq + 1); 1722*bba2c361STejun Heo p->scx.dsq_seq = dsq->seq; 1723*bba2c361STejun Heo 1724*bba2c361STejun Heo dsq_inc_nr(dsq, p, enq_flags); 1725*bba2c361STejun Heo p->scx.dsq = dsq; 1726*bba2c361STejun Heo 1727*bba2c361STejun Heo /* 1728*bba2c361STejun Heo * Update custody and call ops.dequeue() before clearing ops_state: 1729*bba2c361STejun Heo * once ops_state is cleared, waiters in ops_dequeue() can proceed 1730*bba2c361STejun Heo * and dequeue_task_scx() will RMW p->scx.flags. If we clear 1731*bba2c361STejun Heo * ops_state first, both sides would modify p->scx.flags 1732*bba2c361STejun Heo * concurrently in a non-atomic way. 1733*bba2c361STejun Heo */ 1734*bba2c361STejun Heo if (is_local) { 1735*bba2c361STejun Heo local_dsq_post_enq(sch, dsq, p, enq_flags); 1736*bba2c361STejun Heo } else { 1737*bba2c361STejun Heo /* 1738*bba2c361STejun Heo * Task on global/bypass DSQ: leave custody, task on 1739*bba2c361STejun Heo * non-terminal DSQ: enter custody. 
1740*bba2c361STejun Heo */ 1741*bba2c361STejun Heo if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS) 1742*bba2c361STejun Heo call_task_dequeue(sch, rq, p, 0); 1743*bba2c361STejun Heo else 1744*bba2c361STejun Heo p->scx.flags |= SCX_TASK_IN_CUSTODY; 1745*bba2c361STejun Heo 1746*bba2c361STejun Heo raw_spin_unlock(&dsq->lock); 1747*bba2c361STejun Heo } 1748*bba2c361STejun Heo 1749*bba2c361STejun Heo /* 1750*bba2c361STejun Heo * We're transitioning out of QUEUEING or DISPATCHING. store_release to 1751*bba2c361STejun Heo * match waiters' load_acquire. 1752*bba2c361STejun Heo */ 1753*bba2c361STejun Heo if (enq_flags & SCX_ENQ_CLEAR_OPSS) 1754*bba2c361STejun Heo atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1755*bba2c361STejun Heo } 1756*bba2c361STejun Heo 1757*bba2c361STejun Heo static void task_unlink_from_dsq(struct task_struct *p, 1758*bba2c361STejun Heo struct scx_dispatch_q *dsq) 1759*bba2c361STejun Heo { 1760*bba2c361STejun Heo WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node)); 1761*bba2c361STejun Heo 1762*bba2c361STejun Heo if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { 1763*bba2c361STejun Heo rb_erase(&p->scx.dsq_priq, &dsq->priq); 1764*bba2c361STejun Heo RB_CLEAR_NODE(&p->scx.dsq_priq); 1765*bba2c361STejun Heo p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; 1766*bba2c361STejun Heo } 1767*bba2c361STejun Heo 1768*bba2c361STejun Heo list_del_init(&p->scx.dsq_list.node); 1769*bba2c361STejun Heo dsq_dec_nr(dsq, p); 1770*bba2c361STejun Heo 1771*bba2c361STejun Heo if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { 1772*bba2c361STejun Heo struct task_struct *first_task; 1773*bba2c361STejun Heo 1774*bba2c361STejun Heo first_task = nldsq_next_task(dsq, NULL, false); 1775*bba2c361STejun Heo rcu_assign_pointer(dsq->first_task, first_task); 1776*bba2c361STejun Heo } 1777*bba2c361STejun Heo } 1778*bba2c361STejun Heo 1779*bba2c361STejun Heo static void dispatch_dequeue(struct rq *rq, struct task_struct *p) 1780*bba2c361STejun Heo { 
1781*bba2c361STejun Heo struct scx_dispatch_q *dsq = p->scx.dsq; 1782*bba2c361STejun Heo bool is_local = dsq == &rq->scx.local_dsq; 1783*bba2c361STejun Heo 1784*bba2c361STejun Heo lockdep_assert_rq_held(rq); 1785*bba2c361STejun Heo 1786*bba2c361STejun Heo if (!dsq) { 1787*bba2c361STejun Heo /* 1788*bba2c361STejun Heo * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. 1789*bba2c361STejun Heo * Unlinking is all that's needed to cancel. 1790*bba2c361STejun Heo */ 1791*bba2c361STejun Heo if (unlikely(!list_empty(&p->scx.dsq_list.node))) 1792*bba2c361STejun Heo list_del_init(&p->scx.dsq_list.node); 1793*bba2c361STejun Heo 1794*bba2c361STejun Heo /* 1795*bba2c361STejun Heo * When dispatching directly from the BPF scheduler to a local 1796*bba2c361STejun Heo * DSQ, the task isn't associated with any DSQ but 1797*bba2c361STejun Heo * @p->scx.holding_cpu may be set under the protection of 1798*bba2c361STejun Heo * %SCX_OPSS_DISPATCHING. 1799*bba2c361STejun Heo */ 1800*bba2c361STejun Heo if (p->scx.holding_cpu >= 0) 1801*bba2c361STejun Heo p->scx.holding_cpu = -1; 1802*bba2c361STejun Heo 1803*bba2c361STejun Heo return; 1804*bba2c361STejun Heo } 1805*bba2c361STejun Heo 1806*bba2c361STejun Heo if (!is_local) 1807*bba2c361STejun Heo raw_spin_lock(&dsq->lock); 1808*bba2c361STejun Heo 1809*bba2c361STejun Heo /* 1810*bba2c361STejun Heo * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't 1811*bba2c361STejun Heo * change underneath us. 1812*bba2c361STejun Heo */ 1813*bba2c361STejun Heo if (p->scx.holding_cpu < 0) { 1814*bba2c361STejun Heo /* @p must still be on @dsq, dequeue */ 1815*bba2c361STejun Heo task_unlink_from_dsq(p, dsq); 1816*bba2c361STejun Heo } else { 1817*bba2c361STejun Heo /* 1818*bba2c361STejun Heo * We're racing against dispatch_to_local_dsq() which already 1819*bba2c361STejun Heo * removed @p from @dsq and set @p->scx.holding_cpu. 
Clear the 1820*bba2c361STejun Heo * holding_cpu which tells dispatch_to_local_dsq() that it lost 1821*bba2c361STejun Heo * the race. 1822*bba2c361STejun Heo */ 1823*bba2c361STejun Heo WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node)); 1824*bba2c361STejun Heo p->scx.holding_cpu = -1; 1825*bba2c361STejun Heo } 1826*bba2c361STejun Heo p->scx.dsq = NULL; 1827*bba2c361STejun Heo 1828*bba2c361STejun Heo if (!is_local) 1829*bba2c361STejun Heo raw_spin_unlock(&dsq->lock); 1830*bba2c361STejun Heo } 1831*bba2c361STejun Heo 1832*bba2c361STejun Heo /* 1833*bba2c361STejun Heo * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq 1834*bba2c361STejun Heo * and dsq are locked. 1835*bba2c361STejun Heo */ 1836*bba2c361STejun Heo static void dispatch_dequeue_locked(struct task_struct *p, 1837*bba2c361STejun Heo struct scx_dispatch_q *dsq) 1838*bba2c361STejun Heo { 1839*bba2c361STejun Heo lockdep_assert_rq_held(task_rq(p)); 1840*bba2c361STejun Heo lockdep_assert_held(&dsq->lock); 1841*bba2c361STejun Heo 1842*bba2c361STejun Heo task_unlink_from_dsq(p, dsq); 1843*bba2c361STejun Heo p->scx.dsq = NULL; 1844*bba2c361STejun Heo } 1845*bba2c361STejun Heo 1846*bba2c361STejun Heo static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, 1847*bba2c361STejun Heo struct rq *rq, u64 dsq_id, 1848*bba2c361STejun Heo s32 tcpu) 1849*bba2c361STejun Heo { 1850*bba2c361STejun Heo struct scx_dispatch_q *dsq; 1851*bba2c361STejun Heo 1852*bba2c361STejun Heo if (dsq_id == SCX_DSQ_LOCAL) 1853*bba2c361STejun Heo return &rq->scx.local_dsq; 1854*bba2c361STejun Heo 1855*bba2c361STejun Heo if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 1856*bba2c361STejun Heo s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK); 1857*bba2c361STejun Heo 1858*bba2c361STejun Heo if (!scx_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) 1859*bba2c361STejun Heo return find_global_dsq(sch, tcpu); 1860*bba2c361STejun Heo 1861*bba2c361STejun Heo return 
&cpu_rq(cpu)->scx.local_dsq; 1862*bba2c361STejun Heo } 1863*bba2c361STejun Heo 1864*bba2c361STejun Heo if (dsq_id == SCX_DSQ_GLOBAL) 1865*bba2c361STejun Heo dsq = find_global_dsq(sch, tcpu); 1866*bba2c361STejun Heo else 1867*bba2c361STejun Heo dsq = find_user_dsq(sch, dsq_id); 1868*bba2c361STejun Heo 1869*bba2c361STejun Heo if (unlikely(!dsq)) { 1870*bba2c361STejun Heo scx_error(sch, "non-existent DSQ 0x%llx", dsq_id); 1871*bba2c361STejun Heo return find_global_dsq(sch, tcpu); 1872*bba2c361STejun Heo } 1873*bba2c361STejun Heo 1874*bba2c361STejun Heo return dsq; 1875*bba2c361STejun Heo } 1876*bba2c361STejun Heo 1877*bba2c361STejun Heo static void mark_direct_dispatch(struct scx_sched *sch, 1878*bba2c361STejun Heo struct task_struct *ddsp_task, 1879*bba2c361STejun Heo struct task_struct *p, u64 dsq_id, 1880*bba2c361STejun Heo u64 enq_flags) 1881*bba2c361STejun Heo { 1882*bba2c361STejun Heo /* 1883*bba2c361STejun Heo * Mark that dispatch already happened from ops.select_cpu() or 1884*bba2c361STejun Heo * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value 1885*bba2c361STejun Heo * which can never match a valid task pointer. 
1886*bba2c361STejun Heo */ 1887*bba2c361STejun Heo __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); 1888*bba2c361STejun Heo 1889*bba2c361STejun Heo /* @p must match the task on the enqueue path */ 1890*bba2c361STejun Heo if (unlikely(p != ddsp_task)) { 1891*bba2c361STejun Heo if (IS_ERR(ddsp_task)) 1892*bba2c361STejun Heo scx_error(sch, "%s[%d] already direct-dispatched", 1893*bba2c361STejun Heo p->comm, p->pid); 1894*bba2c361STejun Heo else 1895*bba2c361STejun Heo scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]", 1896*bba2c361STejun Heo ddsp_task->comm, ddsp_task->pid, 1897*bba2c361STejun Heo p->comm, p->pid); 1898*bba2c361STejun Heo return; 1899*bba2c361STejun Heo } 1900*bba2c361STejun Heo 1901*bba2c361STejun Heo WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); 1902*bba2c361STejun Heo WARN_ON_ONCE(p->scx.ddsp_enq_flags); 1903*bba2c361STejun Heo 1904*bba2c361STejun Heo p->scx.ddsp_dsq_id = dsq_id; 1905*bba2c361STejun Heo p->scx.ddsp_enq_flags = enq_flags; 1906*bba2c361STejun Heo } 1907*bba2c361STejun Heo 1908*bba2c361STejun Heo /* 1909*bba2c361STejun Heo * Clear @p direct dispatch state when leaving the scheduler. 
1910*bba2c361STejun Heo * 1911*bba2c361STejun Heo * Direct dispatch state must be cleared in the following cases: 1912*bba2c361STejun Heo * - direct_dispatch(): cleared on the synchronous enqueue path, deferred 1913*bba2c361STejun Heo * dispatch keeps the state until consumed 1914*bba2c361STejun Heo * - process_ddsp_deferred_locals(): cleared after consuming deferred state, 1915*bba2c361STejun Heo * - do_enqueue_task(): cleared on enqueue fallbacks where the dispatch 1916*bba2c361STejun Heo * verdict is ignored (local/global/bypass) 1917*bba2c361STejun Heo * - dequeue_task_scx(): cleared after dispatch_dequeue(), covering deferred 1918*bba2c361STejun Heo * cancellation and holding_cpu races 1919*bba2c361STejun Heo * - scx_disable_task(): cleared for queued wakeup tasks, which are excluded by 1920*bba2c361STejun Heo * the scx_bypass() loop, so that stale state is not reused by a subsequent 1921*bba2c361STejun Heo * scheduler instance 1922*bba2c361STejun Heo */ 1923*bba2c361STejun Heo static inline void clear_direct_dispatch(struct task_struct *p) 1924*bba2c361STejun Heo { 1925*bba2c361STejun Heo p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; 1926*bba2c361STejun Heo p->scx.ddsp_enq_flags = 0; 1927*bba2c361STejun Heo } 1928*bba2c361STejun Heo 1929*bba2c361STejun Heo static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, 1930*bba2c361STejun Heo u64 enq_flags) 1931*bba2c361STejun Heo { 1932*bba2c361STejun Heo struct rq *rq = task_rq(p); 1933*bba2c361STejun Heo struct scx_dispatch_q *dsq = 1934*bba2c361STejun Heo find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p)); 1935*bba2c361STejun Heo u64 ddsp_enq_flags; 1936*bba2c361STejun Heo 1937*bba2c361STejun Heo touch_core_sched_dispatch(rq, p); 1938*bba2c361STejun Heo 1939*bba2c361STejun Heo p->scx.ddsp_enq_flags |= enq_flags; 1940*bba2c361STejun Heo 1941*bba2c361STejun Heo /* 1942*bba2c361STejun Heo * We are in the enqueue path with @rq locked and pinned, and thus can't 1943*bba2c361STejun Heo * double 
lock a remote rq and enqueue to its local DSQ. For 1944*bba2c361STejun Heo * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer 1945*bba2c361STejun Heo * the enqueue so that it's executed when @rq can be unlocked. 1946*bba2c361STejun Heo */ 1947*bba2c361STejun Heo if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { 1948*bba2c361STejun Heo unsigned long opss; 1949*bba2c361STejun Heo 1950*bba2c361STejun Heo opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; 1951*bba2c361STejun Heo 1952*bba2c361STejun Heo switch (opss & SCX_OPSS_STATE_MASK) { 1953*bba2c361STejun Heo case SCX_OPSS_NONE: 1954*bba2c361STejun Heo break; 1955*bba2c361STejun Heo case SCX_OPSS_QUEUEING: 1956*bba2c361STejun Heo /* 1957*bba2c361STejun Heo * As @p was never passed to the BPF side, _release is 1958*bba2c361STejun Heo * not strictly necessary. Still do it for consistency. 1959*bba2c361STejun Heo */ 1960*bba2c361STejun Heo atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1961*bba2c361STejun Heo break; 1962*bba2c361STejun Heo default: 1963*bba2c361STejun Heo WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()", 1964*bba2c361STejun Heo p->comm, p->pid, opss); 1965*bba2c361STejun Heo atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 1966*bba2c361STejun Heo break; 1967*bba2c361STejun Heo } 1968*bba2c361STejun Heo 1969*bba2c361STejun Heo WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); 1970*bba2c361STejun Heo list_add_tail(&p->scx.dsq_list.node, 1971*bba2c361STejun Heo &rq->scx.ddsp_deferred_locals); 1972*bba2c361STejun Heo schedule_deferred_locked(rq); 1973*bba2c361STejun Heo return; 1974*bba2c361STejun Heo } 1975*bba2c361STejun Heo 1976*bba2c361STejun Heo ddsp_enq_flags = p->scx.ddsp_enq_flags; 1977*bba2c361STejun Heo clear_direct_dispatch(p); 1978*bba2c361STejun Heo 1979*bba2c361STejun Heo dispatch_enqueue(sch, rq, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); 1980*bba2c361STejun Heo } 
1981*bba2c361STejun Heo 1982*bba2c361STejun Heo static bool scx_rq_online(struct rq *rq) 1983*bba2c361STejun Heo { 1984*bba2c361STejun Heo /* 1985*bba2c361STejun Heo * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates 1986*bba2c361STejun Heo * the online state as seen from the BPF scheduler. cpu_active() test 1987*bba2c361STejun Heo * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will 1988*bba2c361STejun Heo * stay set until the current scheduling operation is complete even if 1989*bba2c361STejun Heo * we aren't locking @rq. 1990*bba2c361STejun Heo */ 1991*bba2c361STejun Heo return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq))); 1992*bba2c361STejun Heo } 1993*bba2c361STejun Heo 1994*bba2c361STejun Heo static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, 1995*bba2c361STejun Heo int sticky_cpu) 1996*bba2c361STejun Heo { 1997*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 1998*bba2c361STejun Heo struct task_struct **ddsp_taskp; 1999*bba2c361STejun Heo struct scx_dispatch_q *dsq; 2000*bba2c361STejun Heo unsigned long qseq; 2001*bba2c361STejun Heo 2002*bba2c361STejun Heo WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); 2003*bba2c361STejun Heo 2004*bba2c361STejun Heo /* internal movements - rq migration / RESTORE */ 2005*bba2c361STejun Heo if (sticky_cpu == cpu_of(rq)) 2006*bba2c361STejun Heo goto local_norefill; 2007*bba2c361STejun Heo 2008*bba2c361STejun Heo /* 2009*bba2c361STejun Heo * Clear persistent TASK_IMMED for fresh enqueues, see dsq_inc_nr(). 2010*bba2c361STejun Heo * Note that exiting and migration-disabled tasks that skip 2011*bba2c361STejun Heo * ops.enqueue() below will lose IMMED protection unless 2012*bba2c361STejun Heo * %SCX_OPS_ENQ_EXITING / %SCX_OPS_ENQ_MIGRATION_DISABLED are set. 
2013*bba2c361STejun Heo */ 2014*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_IMMED; 2015*bba2c361STejun Heo 2016*bba2c361STejun Heo /* 2017*bba2c361STejun Heo * If !scx_rq_online(), we already told the BPF scheduler that the CPU 2018*bba2c361STejun Heo * is offline and are just running the hotplug path. Don't bother the 2019*bba2c361STejun Heo * BPF scheduler. 2020*bba2c361STejun Heo */ 2021*bba2c361STejun Heo if (!scx_rq_online(rq)) 2022*bba2c361STejun Heo goto local; 2023*bba2c361STejun Heo 2024*bba2c361STejun Heo if (scx_bypassing(sch, cpu_of(rq))) { 2025*bba2c361STejun Heo __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 2026*bba2c361STejun Heo goto bypass; 2027*bba2c361STejun Heo } 2028*bba2c361STejun Heo 2029*bba2c361STejun Heo if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 2030*bba2c361STejun Heo goto direct; 2031*bba2c361STejun Heo 2032*bba2c361STejun Heo /* see %SCX_OPS_ENQ_EXITING */ 2033*bba2c361STejun Heo if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) && 2034*bba2c361STejun Heo unlikely(p->flags & PF_EXITING)) { 2035*bba2c361STejun Heo __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1); 2036*bba2c361STejun Heo goto local; 2037*bba2c361STejun Heo } 2038*bba2c361STejun Heo 2039*bba2c361STejun Heo /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ 2040*bba2c361STejun Heo if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) && 2041*bba2c361STejun Heo is_migration_disabled(p)) { 2042*bba2c361STejun Heo __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); 2043*bba2c361STejun Heo goto local; 2044*bba2c361STejun Heo } 2045*bba2c361STejun Heo 2046*bba2c361STejun Heo if (unlikely(!SCX_HAS_OP(sch, enqueue))) 2047*bba2c361STejun Heo goto global; 2048*bba2c361STejun Heo 2049*bba2c361STejun Heo /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ 2050*bba2c361STejun Heo qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; 2051*bba2c361STejun Heo 2052*bba2c361STejun Heo WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 2053*bba2c361STejun Heo 
atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); 2054*bba2c361STejun Heo 2055*bba2c361STejun Heo ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 2056*bba2c361STejun Heo WARN_ON_ONCE(*ddsp_taskp); 2057*bba2c361STejun Heo *ddsp_taskp = p; 2058*bba2c361STejun Heo 2059*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, enqueue, rq, p, enq_flags); 2060*bba2c361STejun Heo 2061*bba2c361STejun Heo *ddsp_taskp = NULL; 2062*bba2c361STejun Heo if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 2063*bba2c361STejun Heo goto direct; 2064*bba2c361STejun Heo 2065*bba2c361STejun Heo /* 2066*bba2c361STejun Heo * Task is now in BPF scheduler's custody. Set %SCX_TASK_IN_CUSTODY 2067*bba2c361STejun Heo * so ops.dequeue() is called when it leaves custody. 2068*bba2c361STejun Heo */ 2069*bba2c361STejun Heo p->scx.flags |= SCX_TASK_IN_CUSTODY; 2070*bba2c361STejun Heo 2071*bba2c361STejun Heo /* 2072*bba2c361STejun Heo * If not directly dispatched, QUEUEING isn't clear yet and dispatch or 2073*bba2c361STejun Heo * dequeue may be waiting. The store_release matches their load_acquire. 
2074*bba2c361STejun Heo */ 2075*bba2c361STejun Heo atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); 2076*bba2c361STejun Heo return; 2077*bba2c361STejun Heo 2078*bba2c361STejun Heo direct: 2079*bba2c361STejun Heo direct_dispatch(sch, p, enq_flags); 2080*bba2c361STejun Heo return; 2081*bba2c361STejun Heo local_norefill: 2082*bba2c361STejun Heo dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags); 2083*bba2c361STejun Heo return; 2084*bba2c361STejun Heo local: 2085*bba2c361STejun Heo dsq = &rq->scx.local_dsq; 2086*bba2c361STejun Heo goto enqueue; 2087*bba2c361STejun Heo global: 2088*bba2c361STejun Heo dsq = find_global_dsq(sch, task_cpu(p)); 2089*bba2c361STejun Heo goto enqueue; 2090*bba2c361STejun Heo bypass: 2091*bba2c361STejun Heo dsq = bypass_enq_target_dsq(sch, task_cpu(p)); 2092*bba2c361STejun Heo goto enqueue; 2093*bba2c361STejun Heo 2094*bba2c361STejun Heo enqueue: 2095*bba2c361STejun Heo /* 2096*bba2c361STejun Heo * For task-ordering, slice refill must be treated as implying the end 2097*bba2c361STejun Heo * of the current slice. Otherwise, the longer @p stays on the CPU, the 2098*bba2c361STejun Heo * higher priority it becomes from scx_prio_less()'s POV. 
2099*bba2c361STejun Heo */ 2100*bba2c361STejun Heo touch_core_sched(rq, p); 2101*bba2c361STejun Heo refill_task_slice_dfl(sch, p); 2102*bba2c361STejun Heo clear_direct_dispatch(p); 2103*bba2c361STejun Heo dispatch_enqueue(sch, rq, dsq, p, enq_flags); 2104*bba2c361STejun Heo } 2105*bba2c361STejun Heo 2106*bba2c361STejun Heo static bool task_runnable(const struct task_struct *p) 2107*bba2c361STejun Heo { 2108*bba2c361STejun Heo return !list_empty(&p->scx.runnable_node); 2109*bba2c361STejun Heo } 2110*bba2c361STejun Heo 2111*bba2c361STejun Heo static void set_task_runnable(struct rq *rq, struct task_struct *p) 2112*bba2c361STejun Heo { 2113*bba2c361STejun Heo lockdep_assert_rq_held(rq); 2114*bba2c361STejun Heo 2115*bba2c361STejun Heo if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { 2116*bba2c361STejun Heo p->scx.runnable_at = jiffies; 2117*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; 2118*bba2c361STejun Heo } 2119*bba2c361STejun Heo 2120*bba2c361STejun Heo /* 2121*bba2c361STejun Heo * list_add_tail() must be used. scx_bypass() depends on tasks being 2122*bba2c361STejun Heo * appended to the runnable_list. 
2123*bba2c361STejun Heo */ 2124*bba2c361STejun Heo list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); 2125*bba2c361STejun Heo } 2126*bba2c361STejun Heo 2127*bba2c361STejun Heo static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) 2128*bba2c361STejun Heo { 2129*bba2c361STejun Heo list_del_init(&p->scx.runnable_node); 2130*bba2c361STejun Heo if (reset_runnable_at) 2131*bba2c361STejun Heo p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 2132*bba2c361STejun Heo } 2133*bba2c361STejun Heo 2134*bba2c361STejun Heo static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags) 2135*bba2c361STejun Heo { 2136*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 2137*bba2c361STejun Heo int sticky_cpu = p->scx.sticky_cpu; 2138*bba2c361STejun Heo u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags; 2139*bba2c361STejun Heo 2140*bba2c361STejun Heo if (enq_flags & ENQUEUE_WAKEUP) 2141*bba2c361STejun Heo rq->scx.flags |= SCX_RQ_IN_WAKEUP; 2142*bba2c361STejun Heo 2143*bba2c361STejun Heo /* 2144*bba2c361STejun Heo * Restoring a running task will be immediately followed by 2145*bba2c361STejun Heo * set_next_task_scx() which expects the task to not be on the BPF 2146*bba2c361STejun Heo * scheduler as tasks can only start running through local DSQs. Force 2147*bba2c361STejun Heo * direct-dispatch into the local DSQ by setting the sticky_cpu. 
2148*bba2c361STejun Heo */ 2149*bba2c361STejun Heo if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) 2150*bba2c361STejun Heo sticky_cpu = cpu_of(rq); 2151*bba2c361STejun Heo 2152*bba2c361STejun Heo if (p->scx.flags & SCX_TASK_QUEUED) { 2153*bba2c361STejun Heo WARN_ON_ONCE(!task_runnable(p)); 2154*bba2c361STejun Heo goto out; 2155*bba2c361STejun Heo } 2156*bba2c361STejun Heo 2157*bba2c361STejun Heo set_task_runnable(rq, p); 2158*bba2c361STejun Heo p->scx.flags |= SCX_TASK_QUEUED; 2159*bba2c361STejun Heo rq->scx.nr_running++; 2160*bba2c361STejun Heo add_nr_running(rq, 1); 2161*bba2c361STejun Heo 2162*bba2c361STejun Heo if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p)) 2163*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, runnable, rq, p, enq_flags); 2164*bba2c361STejun Heo 2165*bba2c361STejun Heo if (enq_flags & SCX_ENQ_WAKEUP) 2166*bba2c361STejun Heo touch_core_sched(rq, p); 2167*bba2c361STejun Heo 2168*bba2c361STejun Heo /* Start dl_server if this is the first task being enqueued */ 2169*bba2c361STejun Heo if (rq->scx.nr_running == 1) 2170*bba2c361STejun Heo dl_server_start(&rq->ext_server); 2171*bba2c361STejun Heo 2172*bba2c361STejun Heo do_enqueue_task(rq, p, enq_flags, sticky_cpu); 2173*bba2c361STejun Heo 2174*bba2c361STejun Heo if (sticky_cpu >= 0) 2175*bba2c361STejun Heo p->scx.sticky_cpu = -1; 2176*bba2c361STejun Heo out: 2177*bba2c361STejun Heo rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; 2178*bba2c361STejun Heo 2179*bba2c361STejun Heo if ((enq_flags & SCX_ENQ_CPU_SELECTED) && 2180*bba2c361STejun Heo unlikely(cpu_of(rq) != p->scx.selected_cpu)) 2181*bba2c361STejun Heo __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1); 2182*bba2c361STejun Heo } 2183*bba2c361STejun Heo 2184*bba2c361STejun Heo static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) 2185*bba2c361STejun Heo { 2186*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 2187*bba2c361STejun Heo unsigned long opss; 2188*bba2c361STejun Heo 2189*bba2c361STejun Heo 
/* dequeue is always temporary, don't reset runnable_at */ 2190*bba2c361STejun Heo clr_task_runnable(p, false); 2191*bba2c361STejun Heo 2192*bba2c361STejun Heo retry: 2193*bba2c361STejun Heo /* acquire ensures that we see the preceding updates on QUEUED */ 2194*bba2c361STejun Heo opss = atomic_long_read_acquire(&p->scx.ops_state); 2195*bba2c361STejun Heo 2196*bba2c361STejun Heo switch (opss & SCX_OPSS_STATE_MASK) { 2197*bba2c361STejun Heo case SCX_OPSS_NONE: 2198*bba2c361STejun Heo break; 2199*bba2c361STejun Heo case SCX_OPSS_QUEUEING: 2200*bba2c361STejun Heo /* 2201*bba2c361STejun Heo * QUEUEING is started and finished while holding @p's rq lock. 2202*bba2c361STejun Heo * As we're holding the rq lock now, we shouldn't see QUEUEING. 2203*bba2c361STejun Heo */ 2204*bba2c361STejun Heo BUG(); 2205*bba2c361STejun Heo case SCX_OPSS_QUEUED: 2206*bba2c361STejun Heo /* 2207*bba2c361STejun Heo * A queued task must always be in BPF scheduler's custody. If 2208*bba2c361STejun Heo * SCX_TASK_IN_CUSTODY is clear, finish_dispatch() on another 2209*bba2c361STejun Heo * CPU has already passed call_task_dequeue() (which clears the 2210*bba2c361STejun Heo * flag), but has not yet written SCX_OPSS_NONE. That final 2211*bba2c361STejun Heo * store does not require this rq's lock, so retrying with 2212*bba2c361STejun Heo * cpu_relax() is bounded: we will observe NONE (or DISPATCHING, 2213*bba2c361STejun Heo * handled by the fallthrough) on a subsequent iteration. 
2214*bba2c361STejun Heo */ 2215*bba2c361STejun Heo if (unlikely(!(READ_ONCE(p->scx.flags) & SCX_TASK_IN_CUSTODY))) { 2216*bba2c361STejun Heo cpu_relax(); 2217*bba2c361STejun Heo goto retry; 2218*bba2c361STejun Heo } 2219*bba2c361STejun Heo 2220*bba2c361STejun Heo if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, 2221*bba2c361STejun Heo SCX_OPSS_NONE)) 2222*bba2c361STejun Heo break; 2223*bba2c361STejun Heo fallthrough; 2224*bba2c361STejun Heo case SCX_OPSS_DISPATCHING: 2225*bba2c361STejun Heo /* 2226*bba2c361STejun Heo * If @p is being dispatched from the BPF scheduler to a DSQ, 2227*bba2c361STejun Heo * wait for the transfer to complete so that @p doesn't get 2228*bba2c361STejun Heo * added to its DSQ after dequeueing is complete. 2229*bba2c361STejun Heo * 2230*bba2c361STejun Heo * As we're waiting on DISPATCHING with the rq locked, the 2231*bba2c361STejun Heo * dispatching side shouldn't try to lock the rq while 2232*bba2c361STejun Heo * DISPATCHING is set. See dispatch_to_local_dsq(). 2233*bba2c361STejun Heo * 2234*bba2c361STejun Heo * DISPATCHING shouldn't have qseq set and control can reach 2235*bba2c361STejun Heo * here with NONE @opss from the above QUEUED case block. 2236*bba2c361STejun Heo * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 2237*bba2c361STejun Heo */ 2238*bba2c361STejun Heo wait_ops_state(p, SCX_OPSS_DISPATCHING); 2239*bba2c361STejun Heo BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); 2240*bba2c361STejun Heo break; 2241*bba2c361STejun Heo } 2242*bba2c361STejun Heo 2243*bba2c361STejun Heo /* 2244*bba2c361STejun Heo * Call ops.dequeue() if the task is still in BPF custody. 
2245*bba2c361STejun Heo * 2246*bba2c361STejun Heo * The code that clears ops_state to %SCX_OPSS_NONE does not always 2247*bba2c361STejun Heo * clear %SCX_TASK_IN_CUSTODY: in dispatch_to_local_dsq(), when 2248*bba2c361STejun Heo * we're moving a task that was in %SCX_OPSS_DISPATCHING to a 2249*bba2c361STejun Heo * remote CPU's local DSQ, we only set ops_state to %SCX_OPSS_NONE 2250*bba2c361STejun Heo * so that a concurrent dequeue can proceed, but we clear 2251*bba2c361STejun Heo * %SCX_TASK_IN_CUSTODY only when we later enqueue or move the 2252*bba2c361STejun Heo * task. So we can see NONE + IN_CUSTODY here and we must handle 2253*bba2c361STejun Heo * it. Similarly, after waiting on %SCX_OPSS_DISPATCHING we see 2254*bba2c361STejun Heo * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until 2255*bba2c361STejun Heo * it is enqueued on the destination. 2256*bba2c361STejun Heo */ 2257*bba2c361STejun Heo call_task_dequeue(sch, rq, p, deq_flags); 2258*bba2c361STejun Heo } 2259*bba2c361STejun Heo 2260*bba2c361STejun Heo static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int core_deq_flags) 2261*bba2c361STejun Heo { 2262*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 2263*bba2c361STejun Heo u64 deq_flags = core_deq_flags; 2264*bba2c361STejun Heo 2265*bba2c361STejun Heo /* 2266*bba2c361STejun Heo * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property 2267*bba2c361STejun Heo * change (not sleep or core-sched pick). 
2268*bba2c361STejun Heo */ 2269*bba2c361STejun Heo if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC))) 2270*bba2c361STejun Heo deq_flags |= SCX_DEQ_SCHED_CHANGE; 2271*bba2c361STejun Heo 2272*bba2c361STejun Heo if (!(p->scx.flags & SCX_TASK_QUEUED)) { 2273*bba2c361STejun Heo WARN_ON_ONCE(task_runnable(p)); 2274*bba2c361STejun Heo return true; 2275*bba2c361STejun Heo } 2276*bba2c361STejun Heo 2277*bba2c361STejun Heo ops_dequeue(rq, p, deq_flags); 2278*bba2c361STejun Heo 2279*bba2c361STejun Heo /* 2280*bba2c361STejun Heo * A currently running task which is going off @rq first gets dequeued 2281*bba2c361STejun Heo * and then stops running. As we want running <-> stopping transitions 2282*bba2c361STejun Heo * to be contained within runnable <-> quiescent transitions, trigger 2283*bba2c361STejun Heo * ->stopping() early here instead of in put_prev_task_scx(). 2284*bba2c361STejun Heo * 2285*bba2c361STejun Heo * @p may go through multiple stopping <-> running transitions between 2286*bba2c361STejun Heo * here and put_prev_task_scx() if task attribute changes occur while 2287*bba2c361STejun Heo * balance_one() leaves @rq unlocked. However, they don't contain any 2288*bba2c361STejun Heo * information meaningful to the BPF scheduler and can be suppressed by 2289*bba2c361STejun Heo * skipping the callbacks if the task is !QUEUED. 
2290*bba2c361STejun Heo */ 2291*bba2c361STejun Heo if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) { 2292*bba2c361STejun Heo update_curr_scx(rq); 2293*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, stopping, rq, p, false); 2294*bba2c361STejun Heo } 2295*bba2c361STejun Heo 2296*bba2c361STejun Heo if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p)) 2297*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, quiescent, rq, p, deq_flags); 2298*bba2c361STejun Heo 2299*bba2c361STejun Heo if (deq_flags & SCX_DEQ_SLEEP) 2300*bba2c361STejun Heo p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; 2301*bba2c361STejun Heo else 2302*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; 2303*bba2c361STejun Heo 2304*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_QUEUED; 2305*bba2c361STejun Heo rq->scx.nr_running--; 2306*bba2c361STejun Heo sub_nr_running(rq, 1); 2307*bba2c361STejun Heo 2308*bba2c361STejun Heo dispatch_dequeue(rq, p); 2309*bba2c361STejun Heo clear_direct_dispatch(p); 2310*bba2c361STejun Heo return true; 2311*bba2c361STejun Heo } 2312*bba2c361STejun Heo 2313*bba2c361STejun Heo static void yield_task_scx(struct rq *rq) 2314*bba2c361STejun Heo { 2315*bba2c361STejun Heo struct task_struct *p = rq->donor; 2316*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 2317*bba2c361STejun Heo 2318*bba2c361STejun Heo if (SCX_HAS_OP(sch, yield)) 2319*bba2c361STejun Heo SCX_CALL_OP_2TASKS_RET(sch, yield, rq, p, NULL); 2320*bba2c361STejun Heo else 2321*bba2c361STejun Heo p->scx.slice = 0; 2322*bba2c361STejun Heo } 2323*bba2c361STejun Heo 2324*bba2c361STejun Heo static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) 2325*bba2c361STejun Heo { 2326*bba2c361STejun Heo struct task_struct *from = rq->donor; 2327*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(from); 2328*bba2c361STejun Heo 2329*bba2c361STejun Heo if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to)) 2330*bba2c361STejun Heo return SCX_CALL_OP_2TASKS_RET(sch, yield, rq, from, to); 
2331*bba2c361STejun Heo else 2332*bba2c361STejun Heo return false; 2333*bba2c361STejun Heo } 2334*bba2c361STejun Heo 2335*bba2c361STejun Heo static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) 2336*bba2c361STejun Heo { 2337*bba2c361STejun Heo /* 2338*bba2c361STejun Heo * Preemption between SCX tasks is implemented by resetting the victim 2339*bba2c361STejun Heo * task's slice to 0 and triggering reschedule on the target CPU. 2340*bba2c361STejun Heo * Nothing to do. 2341*bba2c361STejun Heo */ 2342*bba2c361STejun Heo if (p->sched_class == &ext_sched_class) 2343*bba2c361STejun Heo return; 2344*bba2c361STejun Heo 2345*bba2c361STejun Heo /* 2346*bba2c361STejun Heo * Getting preempted by a higher-priority class. Reenqueue IMMED tasks. 2347*bba2c361STejun Heo * This captures all preemption cases including: 2348*bba2c361STejun Heo * 2349*bba2c361STejun Heo * - A SCX task is currently running. 2350*bba2c361STejun Heo * 2351*bba2c361STejun Heo * - @rq is waking from idle due to a SCX task waking to it. 2352*bba2c361STejun Heo * 2353*bba2c361STejun Heo * - A higher-priority wakes up while SCX dispatch is in progress. 
2354*bba2c361STejun Heo */ 2355*bba2c361STejun Heo if (rq->scx.nr_immed) 2356*bba2c361STejun Heo schedule_reenq_local(rq, 0); 2357*bba2c361STejun Heo } 2358*bba2c361STejun Heo 2359*bba2c361STejun Heo static void move_local_task_to_local_dsq(struct scx_sched *sch, 2360*bba2c361STejun Heo struct task_struct *p, u64 enq_flags, 2361*bba2c361STejun Heo struct scx_dispatch_q *src_dsq, 2362*bba2c361STejun Heo struct rq *dst_rq) 2363*bba2c361STejun Heo { 2364*bba2c361STejun Heo struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq; 2365*bba2c361STejun Heo 2366*bba2c361STejun Heo /* @dsq is locked and @p is on @dst_rq */ 2367*bba2c361STejun Heo lockdep_assert_held(&src_dsq->lock); 2368*bba2c361STejun Heo lockdep_assert_rq_held(dst_rq); 2369*bba2c361STejun Heo 2370*bba2c361STejun Heo WARN_ON_ONCE(p->scx.holding_cpu >= 0); 2371*bba2c361STejun Heo 2372*bba2c361STejun Heo if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) 2373*bba2c361STejun Heo list_add(&p->scx.dsq_list.node, &dst_dsq->list); 2374*bba2c361STejun Heo else 2375*bba2c361STejun Heo list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); 2376*bba2c361STejun Heo 2377*bba2c361STejun Heo dsq_inc_nr(dst_dsq, p, enq_flags); 2378*bba2c361STejun Heo p->scx.dsq = dst_dsq; 2379*bba2c361STejun Heo 2380*bba2c361STejun Heo local_dsq_post_enq(sch, dst_dsq, p, enq_flags); 2381*bba2c361STejun Heo } 2382*bba2c361STejun Heo 2383*bba2c361STejun Heo /** 2384*bba2c361STejun Heo * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ 2385*bba2c361STejun Heo * @p: task to move 2386*bba2c361STejun Heo * @enq_flags: %SCX_ENQ_* 2387*bba2c361STejun Heo * @src_rq: rq to move the task from, locked on entry, released on return 2388*bba2c361STejun Heo * @dst_rq: rq to move the task into, locked on return 2389*bba2c361STejun Heo * 2390*bba2c361STejun Heo * Move @p which is currently on @src_rq to @dst_rq's local DSQ. 
2391*bba2c361STejun Heo */ 2392*bba2c361STejun Heo static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, 2393*bba2c361STejun Heo struct rq *src_rq, struct rq *dst_rq) 2394*bba2c361STejun Heo { 2395*bba2c361STejun Heo lockdep_assert_rq_held(src_rq); 2396*bba2c361STejun Heo 2397*bba2c361STejun Heo /* 2398*bba2c361STejun Heo * Set sticky_cpu before deactivate_task() to properly mark the 2399*bba2c361STejun Heo * beginning of an SCX-internal migration. 2400*bba2c361STejun Heo */ 2401*bba2c361STejun Heo p->scx.sticky_cpu = cpu_of(dst_rq); 2402*bba2c361STejun Heo deactivate_task(src_rq, p, 0); 2403*bba2c361STejun Heo set_task_cpu(p, cpu_of(dst_rq)); 2404*bba2c361STejun Heo 2405*bba2c361STejun Heo raw_spin_rq_unlock(src_rq); 2406*bba2c361STejun Heo raw_spin_rq_lock(dst_rq); 2407*bba2c361STejun Heo 2408*bba2c361STejun Heo /* 2409*bba2c361STejun Heo * We want to pass scx-specific enq_flags but activate_task() will 2410*bba2c361STejun Heo * truncate the upper 32 bit. As we own @rq, we can pass them through 2411*bba2c361STejun Heo * @rq->scx.extra_enq_flags instead. 2412*bba2c361STejun Heo */ 2413*bba2c361STejun Heo WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); 2414*bba2c361STejun Heo WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); 2415*bba2c361STejun Heo dst_rq->scx.extra_enq_flags = enq_flags; 2416*bba2c361STejun Heo activate_task(dst_rq, p, 0); 2417*bba2c361STejun Heo dst_rq->scx.extra_enq_flags = 0; 2418*bba2c361STejun Heo } 2419*bba2c361STejun Heo 2420*bba2c361STejun Heo /* 2421*bba2c361STejun Heo * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two 2422*bba2c361STejun Heo * differences: 2423*bba2c361STejun Heo * 2424*bba2c361STejun Heo * - is_cpu_allowed() asks "Can this task run on this CPU?" while 2425*bba2c361STejun Heo * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to 2426*bba2c361STejun Heo * this CPU?". 
 *
 * While migration is disabled, is_cpu_allowed() has to say "yes" as the task
 * must be allowed to finish on the CPU that it's currently on regardless of
 * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the
 * BPF scheduler shouldn't attempt to migrate a task which has migration
 * disabled.
 *
 * - The BPF scheduler is bypassed while the rq is offline and we can always say
 *   no to the BPF scheduler initiated migrations while offline.
 *
 * The caller must ensure that @p and @rq are on different CPUs.
 */
static bool task_can_run_on_remote_rq(struct scx_sched *sch,
				      struct task_struct *p, struct rq *rq,
				      bool enforce)
{
	s32 cpu = cpu_of(rq);

	WARN_ON_ONCE(task_cpu(p) == cpu);

	/*
	 * If @p has migration disabled, @p->cpus_ptr is updated to contain only
	 * the pinned CPU in migrate_disable_switch() while @p is being switched
	 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
	 * updated and thus another CPU may see @p on a DSQ in between leading to
	 * @p passing the below task_allowed_on_cpu() check while migration is
	 * disabled.
	 *
	 * Test the migration disabled state first as the race window is narrow
	 * and the BPF scheduler failing to check migration disabled state can
	 * easily be masked if task_allowed_on_cpu() is done first.
	 */
	if (unlikely(is_migration_disabled(p))) {
		if (enforce)
			scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
				  p->comm, p->pid, task_cpu(p), cpu);
		return false;
	}

	/*
	 * We don't require the BPF scheduler to avoid dispatching to offline
	 * CPUs mostly for convenience but also because CPUs can go offline
	 * between scx_bpf_dsq_insert() calls and here. Trigger error iff the
	 * picked CPU is outside the allowed mask.
	 */
	if (!task_allowed_on_cpu(p, cpu)) {
		if (enforce)
			scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
				  cpu, p->comm, p->pid);
		return false;
	}

	/*
	 * An offline @rq is not an error. Refuse the move and, when enforcing,
	 * only account it as an event.
	 */
	if (!scx_rq_online(rq)) {
		if (enforce)
			__scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
		return false;
	}

	return true;
}

/**
 * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
 * @p: target task
 * @dsq: locked DSQ @p is currently on
 * @src_rq: rq @p is currently on, stable with @dsq locked
 *
 * Called with @dsq locked but no rq's locked. We want to move @p to a different
 * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is
 * required when transferring into a local DSQ. Even when transferring into a
 * non-local DSQ, it's better to use the same mechanism to protect against
 * dequeues and maintain the invariant that @p->scx.dsq can only change while
 * @src_rq is locked, which e.g. scx_dump_task() depends on.
 *
 * We want to grab @src_rq but that can deadlock if we try while locking @dsq,
 * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As
 * this may race with dequeue, which can't drop the rq lock or fail, do a little
 * dancing from our side.
 *
 * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets
 * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu
 * would be cleared to -1. While other cpus may have updated it to different
 * values afterwards, as this operation can't be preempted or recurse, the
 * holding_cpu can never become this CPU again before we're done. Thus, we can
 * tell whether we lost to dequeue by testing whether the holding_cpu still
 * points to this CPU. See dispatch_dequeue() for the counterpart.
 *
 * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is
 * still valid. %false if lost to dequeue.
 */
static bool unlink_dsq_and_lock_src_rq(struct task_struct *p,
				       struct scx_dispatch_q *dsq,
				       struct rq *src_rq)
{
	s32 cpu = raw_smp_processor_id();

	lockdep_assert_held(&dsq->lock);

	/* nobody else may be in the middle of transferring @p */
	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
	task_unlink_from_dsq(p, dsq);
	/* must be published before @dsq is unlocked, see the comment above */
	p->scx.holding_cpu = cpu;

	raw_spin_unlock(&dsq->lock);
	raw_spin_rq_lock(src_rq);

	/* task_rq couldn't have changed if we're still the holding cpu */
	return likely(p->scx.holding_cpu == cpu) &&
		!WARN_ON_ONCE(src_rq != task_rq(p));
}

/*
 * Pull @p, which is on @dsq and belongs to the remote @src_rq, over to
 * @this_rq's local DSQ. Entered with @this_rq and @dsq locked. @this_rq is
 * dropped first and @dsq is released by unlink_dsq_and_lock_src_rq().
 *
 * Returns %true if @p was handed to move_remote_task_to_local_dsq(). Returns
 * %false if @p was lost to dequeue, in which case @src_rq is unlocked again
 * and @this_rq is re-locked before returning.
 */
static bool consume_remote_task(struct rq *this_rq,
				struct task_struct *p, u64 enq_flags,
				struct scx_dispatch_q *dsq, struct rq *src_rq)
{
	raw_spin_rq_unlock(this_rq);

	if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) {
		move_remote_task_to_local_dsq(p, enq_flags, src_rq, this_rq);
		return true;
	} else {
		/* lost to dequeue, restore the caller's locking state */
		raw_spin_rq_unlock(src_rq);
		raw_spin_rq_lock(this_rq);
		return false;
	}
}

/**
 * move_task_between_dsqs() - Move a task from one DSQ to another
 * @sch: scx_sched being operated on
 * @p: target task
 * @enq_flags: %SCX_ENQ_*
 * @src_dsq: DSQ @p is currently on, must not be a local DSQ
 * @dst_dsq: DSQ @p is being moved to, can be any DSQ
 *
 * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local
 * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq
 * will change. As @p's task_rq is locked, this function doesn't need to use the
 * holding_cpu mechanism.
 *
 * On return, @src_dsq is unlocked and only @p's new task_rq, which is the
 * return value, is locked.
 */
static struct rq *move_task_between_dsqs(struct scx_sched *sch,
					 struct task_struct *p, u64 enq_flags,
					 struct scx_dispatch_q *src_dsq,
					 struct scx_dispatch_q *dst_dsq)
{
	struct rq *src_rq = task_rq(p), *dst_rq;

	BUG_ON(src_dsq->id == SCX_DSQ_LOCAL);
	lockdep_assert_held(&src_dsq->lock);
	lockdep_assert_rq_held(src_rq);

	if (dst_dsq->id == SCX_DSQ_LOCAL) {
		dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
		/*
		 * If @p can't run on the remote @dst_rq, fall back to the
		 * global DSQ for @p's current CPU and stay on @src_rq.
		 */
		if (src_rq != dst_rq &&
		    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
			dst_dsq = find_global_dsq(sch, task_cpu(p));
			dst_rq = src_rq;
			enq_flags |= SCX_ENQ_GDSQ_FALLBACK;
		}
	} else {
		/* no need to migrate if destination is a non-local DSQ */
		dst_rq = src_rq;
	}

	/*
	 * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
	 * CPU, @p will be migrated.
	 */
	if (dst_dsq->id == SCX_DSQ_LOCAL) {
		/* @p is going from a non-local DSQ to a local DSQ */
		if (src_rq == dst_rq) {
			task_unlink_from_dsq(p, src_dsq);
			move_local_task_to_local_dsq(sch, p, enq_flags,
						     src_dsq, dst_rq);
			raw_spin_unlock(&src_dsq->lock);
		} else {
			raw_spin_unlock(&src_dsq->lock);
			move_remote_task_to_local_dsq(p, enq_flags,
						      src_rq, dst_rq);
		}
	} else {
		/*
		 * @p is going from a non-local DSQ to a non-local DSQ. As
		 * $src_dsq is already locked, do an abbreviated dequeue.
		 */
		dispatch_dequeue_locked(p, src_dsq);
		raw_spin_unlock(&src_dsq->lock);

		dispatch_enqueue(sch, dst_rq, dst_dsq, p, enq_flags);
	}

	return dst_rq;
}

/*
 * Try to move one task from @dsq to @rq's local DSQ. A task already on @rq is
 * moved directly with move_local_task_to_local_dsq(). A task on another rq is
 * pulled over through consume_remote_task() if task_can_run_on_remote_rq()
 * allows it. @enq_flags is passed through to the move helpers.
 *
 * Returns %true if a task was consumed. Returns %false if @dsq looked empty,
 * no eligible task was found, or the walk was cut short because @sch is
 * aborting.
 */
static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
			       struct scx_dispatch_q *dsq, u64 enq_flags)
{
	struct task_struct *p;
retry:
	/*
	 * The caller can't expect to successfully consume a task if the task's
	 * addition to @dsq isn't guaranteed to be visible somehow. Test
	 * @dsq->list without locking and skip if it seems empty.
	 */
	if (list_empty(&dsq->list))
		return false;

	raw_spin_lock(&dsq->lock);

	nldsq_for_each_task(p, dsq) {
		struct rq *task_rq = task_rq(p);

		/*
		 * This loop can lead to multiple lockup scenarios, e.g. the BPF
		 * scheduler can put an enormous number of affinitized tasks into
		 * a contended DSQ, or the outer retry loop can repeatedly race
		 * against scx_bypass() dequeueing tasks from @dsq trying to put
		 * the system into the bypass mode. This can easily live-lock the
		 * machine. If aborting, exit from all non-bypass DSQs.
		 */
		if (unlikely(READ_ONCE(sch->aborting)) && dsq->id != SCX_DSQ_BYPASS)
			break;

		if (rq == task_rq) {
			task_unlink_from_dsq(p, dsq);
			move_local_task_to_local_dsq(sch, p, enq_flags, dsq, rq);
			raw_spin_unlock(&dsq->lock);
			return true;
		}

		/* don't raise errors for tasks we merely skip over */
		if (task_can_run_on_remote_rq(sch, p, rq, false)) {
			if (likely(consume_remote_task(rq, p, enq_flags, dsq, task_rq)))
				return true;
			/*
			 * Lost @p to dequeue. @dsq was unlocked along the way,
			 * so the walk must restart from the top.
			 */
			goto retry;
		}
	}

	raw_spin_unlock(&dsq->lock);
	return false;
}

/*
 * Consume from the global DSQ of the NUMA node that @rq's CPU belongs to.
 * Returns whatever consume_dispatch_q() returns.
 */
static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
{
	int node = cpu_to_node(cpu_of(rq));

	return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq, 0);
}

/**
 * dispatch_to_local_dsq - Dispatch a task to a local dsq
 * @sch: scx_sched being operated on
 * @rq: current rq which is locked
 * @dst_dsq: destination DSQ
 * @p: task to dispatch
 * @enq_flags: %SCX_ENQ_*
 *
 * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
 * DSQ. This function performs all the synchronization dancing needed because
 * local DSQs are protected with rq locks.
 *
 * The caller must have exclusive ownership of @p (e.g. through
 * %SCX_OPSS_DISPATCHING).
 */
static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
				  struct scx_dispatch_q *dst_dsq,
				  struct task_struct *p, u64 enq_flags)
{
	struct rq *src_rq = task_rq(p);
	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
	/* tracks which rq lock is currently held so @rq can be restored */
	struct rq *locked_rq = rq;

	/*
	 * We're synchronized against dequeue through DISPATCHING. As @p can't
	 * be dequeued, its task_rq and cpus_allowed are stable too.
	 *
	 * If dispatching to @rq that @p is already on, no lock dancing needed.
	 */
	if (rq == src_rq && rq == dst_rq) {
		dispatch_enqueue(sch, rq, dst_dsq, p,
				 enq_flags | SCX_ENQ_CLEAR_OPSS);
		return;
	}

	/*
	 * @p can't be moved to the remote @dst_rq. Fall back to the global DSQ
	 * for @p's current CPU instead.
	 */
	if (src_rq != dst_rq &&
	    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
		dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p,
				 enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK);
		return;
	}

	/*
	 * @p is on a possibly remote @src_rq which we need to lock to move the
	 * task. If dequeue is in progress, it'd be locking @src_rq and waiting
	 * on DISPATCHING, so we can't grab @src_rq lock while holding
	 * DISPATCHING.
	 *
	 * As DISPATCHING guarantees that @p is wholly ours, we can pretend that
	 * we're moving from a DSQ and use the same mechanism - mark the task
	 * under transfer with holding_cpu, release DISPATCHING and then follow
	 * the same protocol. See unlink_dsq_and_lock_src_rq().
	 */
	p->scx.holding_cpu = raw_smp_processor_id();

	/* store_release ensures that dequeue sees the above */
	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);

	/* switch to @src_rq lock */
	if (locked_rq != src_rq) {
		raw_spin_rq_unlock(locked_rq);
		locked_rq = src_rq;
		raw_spin_rq_lock(src_rq);
	}

	/* task_rq couldn't have changed if we're still the holding cpu */
	if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
	    !WARN_ON_ONCE(src_rq != task_rq(p))) {
		/*
		 * If @p is staying on the same rq, there's no need to go
		 * through the full deactivate/activate cycle. Optimize by
		 * abbreviating move_remote_task_to_local_dsq().
		 */
		if (src_rq == dst_rq) {
			p->scx.holding_cpu = -1;
			dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p,
					 enq_flags);
		} else {
			move_remote_task_to_local_dsq(p, enq_flags,
						      src_rq, dst_rq);
			/* task has been moved to dst_rq, which is now locked */
			locked_rq = dst_rq;
		}

		/*
		 * Resched @dst_rq if @p's sched_class ranks above that of the
		 * task it is currently running, e.g. when the destination CPU
		 * is idle.
		 */
		if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
			resched_curr(dst_rq);
	}

	/* switch back to @rq lock */
	if (locked_rq != rq) {
		raw_spin_rq_unlock(locked_rq);
		raw_spin_rq_lock(rq);
	}
}

/**
 * finish_dispatch - Asynchronously finish dispatching a task
 * @sch: scx_sched being operated on
 * @rq: current rq which is locked
 * @p: task to finish dispatching
 * @qseq_at_dispatch: qseq when @p started getting dispatched
 * @dsq_id: destination DSQ ID
 * @enq_flags: %SCX_ENQ_*
 *
 * Dispatching to local DSQs may need to wait for queueing to complete or
 * require rq lock dancing. As we don't wanna do either while inside
 * ops.dispatch() to avoid locking order inversion, we split dispatching into
 * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the
 * task and its qseq. Once ops.dispatch() returns, this function is called to
 * finish up.
 *
 * There is no guarantee that @p is still valid for dispatching or even that it
 * was valid in the first place. Make sure that the task is still owned by the
 * BPF scheduler and claim the ownership before dispatching.
 */
static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
			    struct task_struct *p,
			    unsigned long qseq_at_dispatch,
			    u64 dsq_id, u64 enq_flags)
{
	struct scx_dispatch_q *dsq;
	unsigned long opss;

	touch_core_sched_dispatch(rq, p);
retry:
	/*
	 * No need for _acquire here. @p is accessed only after a successful
	 * try_cmpxchg to DISPATCHING.
	 */
	opss = atomic_long_read(&p->scx.ops_state);

	switch (opss & SCX_OPSS_STATE_MASK) {
	case SCX_OPSS_DISPATCHING:
	case SCX_OPSS_NONE:
		/* someone else already got to it */
		return;
	case SCX_OPSS_QUEUED:
		/*
		 * If qseq doesn't match, @p has gone through at least one
		 * dispatch/dequeue and re-enqueue cycle between
		 * scx_bpf_dsq_insert() and here and we have no claim on it.
		 */
		if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
			return;

		/* see SCX_EV_INSERT_NOT_OWNED definition */
		if (unlikely(!scx_task_on_sched(sch, p))) {
			__scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1);
			return;
		}

		/*
		 * While we know @p is accessible, we don't yet have a claim on
		 * it - the BPF scheduler is allowed to dispatch tasks
		 * spuriously and there can be a racing dequeue attempt. Let's
		 * claim @p by atomically transitioning it from QUEUED to
		 * DISPATCHING.
		 */
		if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
						   SCX_OPSS_DISPATCHING)))
			break;
		/* ops_state changed under us, re-evaluate from the top */
		goto retry;
	case SCX_OPSS_QUEUEING:
		/*
		 * do_enqueue_task() is in the process of transferring the task
		 * to the BPF scheduler while holding @p's rq lock. As we aren't
		 * holding any kernel or BPF resource that the enqueue path may
		 * depend upon, it's safe to wait.
		 */
		wait_ops_state(p, opss);
		goto retry;
	}

	/* only reached through the successful QUEUED -> DISPATCHING claim */
	BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));

	dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, task_cpu(p));

	if (dsq->id == SCX_DSQ_LOCAL)
		dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
	else
		dispatch_enqueue(sch, rq, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
}

/*
 * Finish every dispatch recorded in this CPU's dispatch buffer for @sch by
 * running finish_dispatch() on each entry below the cursor, then add the
 * cursor to ->nr_tasks and reset it to 0.
 */
static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
{
	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
	u32 u;

	for (u = 0; u < dspc->cursor; u++) {
		struct scx_dsp_buf_ent *ent = &dspc->buf[u];

		finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
				ent->enq_flags);
	}

	dspc->nr_tasks += dspc->cursor;
	dspc->cursor = 0;
}

/*
 * If %SCX_RQ_BAL_CB_PENDING is set on @rq, queue the deferred balance callback
 * and clear the flag. Must be called with @rq locked.
 */
static inline void maybe_queue_balance_callback(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
		return;

	queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
			       deferred_bal_cb_workfn);

	rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
}
/*
 * One user of this function is scx_bpf_dispatch() which can be called
 * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
 * from the call frame.
 *
 * Find something for @rq to run on behalf of @sch. The global DSQ is tried
 * first, then the bypass DSQ when bypass dispatching is enabled, and finally
 * ops.dispatch() is invoked in a loop bounded by %SCX_DSP_MAX_LOOPS. When
 * @nested is %false, @prev is stashed in @rq->scx.sub_dispatch_prev around the
 * ops.dispatch() call.
 *
 * Returns %true if a task was consumed, @rq's local DSQ is non-empty or @prev
 * should keep running (%SCX_RQ_BAL_KEEP is set), %false otherwise.
 */
static __always_inline bool
scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
		   struct task_struct *prev, bool nested)
{
	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
	int nr_loops = SCX_DSP_MAX_LOOPS;
	s32 cpu = cpu_of(rq);
	/* @prev is passed to ops.dispatch() only if it is an SCX task on @sch */
	bool prev_on_sch = (prev->sched_class == &ext_sched_class) &&
		scx_task_on_sched(sch, prev);

	if (consume_global_dsq(sch, rq))
		return true;

	if (bypass_dsp_enabled(sch)) {
		/* if @sch is bypassing, only the bypass DSQs are active */
		if (scx_bypassing(sch, cpu))
			return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0);

#ifdef CONFIG_EXT_SUB_SCHED
		/*
		 * If @sch isn't bypassing but its children are, @sch is
		 * responsible for making forward progress for both its own
		 * tasks that aren't bypassing and the bypassing descendants'
		 * tasks. The following implements a simple built-in behavior -
		 * let each CPU try to run the bypass DSQ every Nth time.
		 *
		 * Later, if necessary, we can add an ops flag to suppress the
		 * auto-consumption and a kfunc to consume the bypass DSQ so
		 * that the BPF scheduler can fully control scheduling of
		 * bypassed tasks.
		 */
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

		if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
		    consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0)) {
			__scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1);
			return true;
		}
#endif /* CONFIG_EXT_SUB_SCHED */
	}

	if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
		return false;

	dspc->rq = rq;

	/*
	 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
	 * the local DSQ might still end up empty after a successful
	 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
	 * produced some tasks, retry. The BPF scheduler may depend on this
	 * looping behavior to simplify its implementation.
	 */
	do {
		dspc->nr_tasks = 0;

		if (nested) {
			SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu),
				    prev_on_sch ? prev : NULL);
		} else {
			/* stash @prev so that nested invocations can access it */
			rq->scx.sub_dispatch_prev = prev;
			SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu),
				    prev_on_sch ? prev : NULL);
			rq->scx.sub_dispatch_prev = NULL;
		}

		flush_dispatch_buf(sch, rq);

		/* @prev is still queued with slice left, keep running it */
		if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) {
			rq->scx.flags |= SCX_RQ_BAL_KEEP;
			return true;
		}
		if (rq->scx.local_dsq.nr)
			return true;
		if (consume_global_dsq(sch, rq))
			return true;

		/*
		 * ops.dispatch() can trap us in this loop by repeatedly
		 * dispatching ineligible tasks. Break out once in a while to
		 * allow the watchdog to run. As IRQ can't be enabled in
		 * balance(), we want to complete this scheduling cycle and then
		 * start a new one. IOW, we want to call resched_curr() on the
		 * next, most likely idle, task, not the current one. Use
		 * scx_kick_cpu() for deferred kicking.
		 */
		if (unlikely(!--nr_loops)) {
			scx_kick_cpu(sch, cpu, 0);
			break;
		}
	} while (dspc->nr_tasks);

	/*
	 * Prevent the CPU from going idle while bypassed descendants have tasks
	 * queued. Without this fallback, bypassed tasks could stall if the host
	 * scheduler's ops.dispatch() doesn't yield any tasks.
	 */
	if (bypass_dsp_enabled(sch))
		return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0);

	return false;
}

/*
 * Make sure @rq has something to run, dispatching through scx_root if needed.
 * Must be called with @rq locked. %SCX_RQ_IN_BALANCE is set on @rq for the
 * duration of the call.
 *
 * Returns %true if there are tasks to run or @prev should keep running
 * (%SCX_RQ_BAL_KEEP is set), %false otherwise.
 */
static int balance_one(struct rq *rq, struct task_struct *prev)
{
	struct scx_sched *sch = scx_root;
	s32 cpu = cpu_of(rq);

	lockdep_assert_rq_held(rq);
	rq->scx.flags |= SCX_RQ_IN_BALANCE;
	rq->scx.flags &= ~SCX_RQ_BAL_KEEP;

	if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
	    unlikely(rq->scx.cpu_released)) {
		/*
		 * If the previous sched_class for the current CPU was not SCX,
		 * notify the BPF scheduler that it again has control of the
		 * core. This callback complements ->cpu_release(), which is
		 * emitted in switch_class().
		 */
		if (sch->ops.cpu_acquire)
			SCX_CALL_OP(sch, cpu_acquire, rq, cpu, NULL);
		rq->scx.cpu_released = false;
	}

	if (prev->sched_class == &ext_sched_class) {
		update_curr_scx(rq);

		/*
		 * If @prev is runnable & has slice left, it has priority and
		 * fetching more just increases latency for the fetched tasks.
		 * Tell pick_task_scx() to keep running @prev. If the BPF
		 * scheduler wants to handle this explicitly, it should
		 * implement ->cpu_release().
		 *
		 * See scx_disable_workfn() for the explanation on the bypassing
		 * test.
		 */
		if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice &&
		    !scx_bypassing(sch, cpu)) {
			rq->scx.flags |= SCX_RQ_BAL_KEEP;
			goto has_tasks;
		}
	}

	/* if there already are tasks to run, nothing to do */
	if (rq->scx.local_dsq.nr)
		goto has_tasks;

	if (scx_dispatch_sched(sch, rq, prev, false))
		goto has_tasks;

	/*
	 * Didn't find another task to run. Keep running @prev unless
	 * %SCX_OPS_ENQ_LAST is in effect.
	 */
	if ((prev->scx.flags & SCX_TASK_QUEUED) &&
	    (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) {
		rq->scx.flags |= SCX_RQ_BAL_KEEP;
		__scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
		goto has_tasks;
	}
	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
	return false;

has_tasks:
	/*
	 * @rq may have extra IMMED tasks without reenq scheduled:
	 *
	 * - rq_is_open() can't reliably tell when and how slice is going to be
	 *   modified for $curr and allows IMMED tasks to be queued while
	 *   dispatch is in progress.
	 *
	 * - A non-IMMED HEAD task can get queued in front of an IMMED task
	 *   between the IMMED queueing and the subsequent scheduling event.
	 */
	if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed))
		schedule_reenq_local(rq, 0);

	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
	return true;
}

/*
 * Prepare @p to run on @rq. If @p is still flagged QUEUED, dequeue it from the
 * BPF scheduler and from its DSQ. Then stamp its exec_start, invoke
 * ops.running() when implemented and @p is still QUEUED, clear its runnable
 * state, and refresh %SCX_RQ_CAN_STOP_TICK according to @p's slice.
 */
static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
{
	struct scx_sched *sch = scx_task_sched(p);

	if (p->scx.flags & SCX_TASK_QUEUED) {
		/*
		 * Core-sched might decide to execute @p before it is
		 * dispatched. Call ops_dequeue() to notify the BPF scheduler.
		 */
		ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC);
		dispatch_dequeue(rq, p);
	}

	p->se.exec_start = rq_clock_task(rq);

	/* see dequeue_task_scx() on why we skip when !QUEUED */
	if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
		SCX_CALL_OP_TASK(sch, running, rq, p);

	clr_task_runnable(p, true);

	/*
	 * @p is getting newly scheduled or got kicked after someone updated its
	 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick().
	 */
	if ((p->scx.slice == SCX_SLICE_INF) !=
	    (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
		if (p->scx.slice == SCX_SLICE_INF)
			rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
		else
			rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;

		sched_update_tick_dependency(rq);

		/*
		 * For now, let's refresh the load_avgs just when transitioning
		 * in and out of nohz. In the future, we might want to add a
		 * mechanism which calls the following periodically on
		 * tick-stopped CPUs.
		 */
		update_other_load_avgs(rq);
	}
}

/*
 * Map @class to the matching %SCX_CPU_PREEMPT_* reason. Classes other than
 * stop, deadline and RT yield %SCX_CPU_PREEMPT_UNKNOWN.
 */
static enum scx_cpu_preempt_reason
preempt_reason_from_class(const struct sched_class *class)
{
	if (class == &stop_sched_class)
		return SCX_CPU_PREEMPT_STOP;
	if (class == &dl_sched_class)
		return SCX_CPU_PREEMPT_DL;
	if (class == &rt_sched_class)
		return SCX_CPU_PREEMPT_RT;
	return SCX_CPU_PREEMPT_UNKNOWN;
}

static void switch_class(struct rq *rq, struct task_struct *next)
{
	struct scx_sched *sch = scx_root;
	const struct sched_class *next_class = next->sched_class;

	if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
		return;

	/*
	 * The callback is conceptually meant to convey that the CPU is no
	 * longer under the control of SCX. Therefore, don't invoke the callback
	 * if the next class is below SCX (in which case the BPF scheduler has
	 * actively decided not to schedule any tasks on the CPU).
	 */
	if (sched_class_above(&ext_sched_class, next_class))
		return;

	/*
	 * At this point we know that SCX was preempted by a higher priority
	 * sched_class, so invoke the ->cpu_release() callback if we have not
	 * done so already.
We only send the callback once between SCX being 3155*bba2c361STejun Heo * preempted, and it regaining control of the CPU. 3156*bba2c361STejun Heo * 3157*bba2c361STejun Heo * ->cpu_release() complements ->cpu_acquire(), which is emitted the 3158*bba2c361STejun Heo * next time that balance_one() is invoked. 3159*bba2c361STejun Heo */ 3160*bba2c361STejun Heo if (!rq->scx.cpu_released) { 3161*bba2c361STejun Heo if (sch->ops.cpu_release) { 3162*bba2c361STejun Heo struct scx_cpu_release_args args = { 3163*bba2c361STejun Heo .reason = preempt_reason_from_class(next_class), 3164*bba2c361STejun Heo .task = next, 3165*bba2c361STejun Heo }; 3166*bba2c361STejun Heo 3167*bba2c361STejun Heo SCX_CALL_OP(sch, cpu_release, rq, cpu_of(rq), &args); 3168*bba2c361STejun Heo } 3169*bba2c361STejun Heo rq->scx.cpu_released = true; 3170*bba2c361STejun Heo } 3171*bba2c361STejun Heo } 3172*bba2c361STejun Heo 3173*bba2c361STejun Heo static void put_prev_task_scx(struct rq *rq, struct task_struct *p, 3174*bba2c361STejun Heo struct task_struct *next) 3175*bba2c361STejun Heo { 3176*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 3177*bba2c361STejun Heo 3178*bba2c361STejun Heo /* see kick_sync_wait_bal_cb() */ 3179*bba2c361STejun Heo smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3180*bba2c361STejun Heo 3181*bba2c361STejun Heo update_curr_scx(rq); 3182*bba2c361STejun Heo 3183*bba2c361STejun Heo /* see dequeue_task_scx() on why we skip when !QUEUED */ 3184*bba2c361STejun Heo if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED)) 3185*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, stopping, rq, p, true); 3186*bba2c361STejun Heo 3187*bba2c361STejun Heo if (p->scx.flags & SCX_TASK_QUEUED) { 3188*bba2c361STejun Heo set_task_runnable(rq, p); 3189*bba2c361STejun Heo 3190*bba2c361STejun Heo /* 3191*bba2c361STejun Heo * If @p has slice left and is being put, @p is getting 3192*bba2c361STejun Heo * preempted by a higher priority scheduler class or core-sched 
3193*bba2c361STejun Heo * forcing a different task. Leave it at the head of the local 3194*bba2c361STejun Heo * DSQ unless it was an IMMED task. IMMED tasks should not 3195*bba2c361STejun Heo * linger on a busy CPU, reenqueue them to the BPF scheduler. 3196*bba2c361STejun Heo */ 3197*bba2c361STejun Heo if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) { 3198*bba2c361STejun Heo if (p->scx.flags & SCX_TASK_IMMED) { 3199*bba2c361STejun Heo p->scx.flags |= SCX_TASK_REENQ_PREEMPTED; 3200*bba2c361STejun Heo do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); 3201*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 3202*bba2c361STejun Heo } else { 3203*bba2c361STejun Heo dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD); 3204*bba2c361STejun Heo } 3205*bba2c361STejun Heo goto switch_class; 3206*bba2c361STejun Heo } 3207*bba2c361STejun Heo 3208*bba2c361STejun Heo /* 3209*bba2c361STejun Heo * If @p is runnable but we're about to enter a lower 3210*bba2c361STejun Heo * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell 3211*bba2c361STejun Heo * ops.enqueue() that @p is the only one available for this cpu, 3212*bba2c361STejun Heo * which should trigger an explicit follow-up scheduling event. 
3213*bba2c361STejun Heo */ 3214*bba2c361STejun Heo if (next && sched_class_above(&ext_sched_class, next->sched_class)) { 3215*bba2c361STejun Heo WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); 3216*bba2c361STejun Heo do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); 3217*bba2c361STejun Heo } else { 3218*bba2c361STejun Heo do_enqueue_task(rq, p, 0, -1); 3219*bba2c361STejun Heo } 3220*bba2c361STejun Heo } 3221*bba2c361STejun Heo 3222*bba2c361STejun Heo switch_class: 3223*bba2c361STejun Heo if (next && next->sched_class != &ext_sched_class) 3224*bba2c361STejun Heo switch_class(rq, next); 3225*bba2c361STejun Heo } 3226*bba2c361STejun Heo 3227*bba2c361STejun Heo static void kick_sync_wait_bal_cb(struct rq *rq) 3228*bba2c361STejun Heo { 3229*bba2c361STejun Heo struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); 3230*bba2c361STejun Heo unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; 3231*bba2c361STejun Heo bool waited; 3232*bba2c361STejun Heo s32 cpu; 3233*bba2c361STejun Heo 3234*bba2c361STejun Heo /* 3235*bba2c361STejun Heo * Drop rq lock and enable IRQs while waiting. IRQs must be enabled 3236*bba2c361STejun Heo * — a target CPU may be waiting for us to process an IPI (e.g. TLB 3237*bba2c361STejun Heo * flush) while we wait for its kick_sync to advance. 3238*bba2c361STejun Heo * 3239*bba2c361STejun Heo * Also, keep advancing our own kick_sync so that new kick_sync waits 3240*bba2c361STejun Heo * targeting us, which can start after we drop the lock, cannot form 3241*bba2c361STejun Heo * cyclic dependencies. 3242*bba2c361STejun Heo */ 3243*bba2c361STejun Heo retry: 3244*bba2c361STejun Heo waited = false; 3245*bba2c361STejun Heo for_each_cpu(cpu, rq->scx.cpus_to_sync) { 3246*bba2c361STejun Heo /* 3247*bba2c361STejun Heo * smp_load_acquire() pairs with smp_store_release() on 3248*bba2c361STejun Heo * kick_sync updates on the target CPUs. 
3249*bba2c361STejun Heo */ 3250*bba2c361STejun Heo if (cpu == cpu_of(rq) || 3251*bba2c361STejun Heo smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { 3252*bba2c361STejun Heo cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); 3253*bba2c361STejun Heo continue; 3254*bba2c361STejun Heo } 3255*bba2c361STejun Heo 3256*bba2c361STejun Heo raw_spin_rq_unlock_irq(rq); 3257*bba2c361STejun Heo while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { 3258*bba2c361STejun Heo smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3259*bba2c361STejun Heo cpu_relax(); 3260*bba2c361STejun Heo } 3261*bba2c361STejun Heo raw_spin_rq_lock_irq(rq); 3262*bba2c361STejun Heo waited = true; 3263*bba2c361STejun Heo } 3264*bba2c361STejun Heo 3265*bba2c361STejun Heo if (waited) 3266*bba2c361STejun Heo goto retry; 3267*bba2c361STejun Heo } 3268*bba2c361STejun Heo 3269*bba2c361STejun Heo static struct task_struct *first_local_task(struct rq *rq) 3270*bba2c361STejun Heo { 3271*bba2c361STejun Heo return list_first_entry_or_null(&rq->scx.local_dsq.list, 3272*bba2c361STejun Heo struct task_struct, scx.dsq_list.node); 3273*bba2c361STejun Heo } 3274*bba2c361STejun Heo 3275*bba2c361STejun Heo static struct task_struct * 3276*bba2c361STejun Heo do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) 3277*bba2c361STejun Heo { 3278*bba2c361STejun Heo struct task_struct *prev = rq->curr; 3279*bba2c361STejun Heo bool keep_prev; 3280*bba2c361STejun Heo struct task_struct *p; 3281*bba2c361STejun Heo 3282*bba2c361STejun Heo /* see kick_sync_wait_bal_cb() */ 3283*bba2c361STejun Heo smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3284*bba2c361STejun Heo 3285*bba2c361STejun Heo rq_modified_begin(rq, &ext_sched_class); 3286*bba2c361STejun Heo 3287*bba2c361STejun Heo rq_unpin_lock(rq, rf); 3288*bba2c361STejun Heo balance_one(rq, prev); 3289*bba2c361STejun Heo rq_repin_lock(rq, rf); 3290*bba2c361STejun Heo maybe_queue_balance_callback(rq); 3291*bba2c361STejun Heo 
3292*bba2c361STejun Heo /* 3293*bba2c361STejun Heo * Defer to a balance callback which can drop rq lock and enable 3294*bba2c361STejun Heo * IRQs. Waiting directly in the pick path would deadlock against 3295*bba2c361STejun Heo * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. 3296*bba2c361STejun Heo */ 3297*bba2c361STejun Heo if (unlikely(rq->scx.kick_sync_pending)) { 3298*bba2c361STejun Heo rq->scx.kick_sync_pending = false; 3299*bba2c361STejun Heo queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, 3300*bba2c361STejun Heo kick_sync_wait_bal_cb); 3301*bba2c361STejun Heo } 3302*bba2c361STejun Heo 3303*bba2c361STejun Heo /* 3304*bba2c361STejun Heo * If any higher-priority sched class enqueued a runnable task on 3305*bba2c361STejun Heo * this rq during balance_one(), abort and return RETRY_TASK, so 3306*bba2c361STejun Heo * that the scheduler loop can restart. 3307*bba2c361STejun Heo * 3308*bba2c361STejun Heo * If @force_scx is true, always try to pick a SCHED_EXT task, 3309*bba2c361STejun Heo * regardless of any higher-priority sched classes activity. 3310*bba2c361STejun Heo */ 3311*bba2c361STejun Heo if (!force_scx && rq_modified_above(rq, &ext_sched_class)) 3312*bba2c361STejun Heo return RETRY_TASK; 3313*bba2c361STejun Heo 3314*bba2c361STejun Heo keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; 3315*bba2c361STejun Heo if (unlikely(keep_prev && 3316*bba2c361STejun Heo prev->sched_class != &ext_sched_class)) { 3317*bba2c361STejun Heo WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); 3318*bba2c361STejun Heo keep_prev = false; 3319*bba2c361STejun Heo } 3320*bba2c361STejun Heo 3321*bba2c361STejun Heo /* 3322*bba2c361STejun Heo * If balance_one() is telling us to keep running @prev, replenish slice 3323*bba2c361STejun Heo * if necessary and keep running @prev. Otherwise, pop the first one 3324*bba2c361STejun Heo * from the local DSQ. 
3325*bba2c361STejun Heo */ 3326*bba2c361STejun Heo if (keep_prev) { 3327*bba2c361STejun Heo p = prev; 3328*bba2c361STejun Heo if (!p->scx.slice) 3329*bba2c361STejun Heo refill_task_slice_dfl(scx_task_sched(p), p); 3330*bba2c361STejun Heo } else { 3331*bba2c361STejun Heo p = first_local_task(rq); 3332*bba2c361STejun Heo if (!p) 3333*bba2c361STejun Heo return NULL; 3334*bba2c361STejun Heo 3335*bba2c361STejun Heo if (unlikely(!p->scx.slice)) { 3336*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 3337*bba2c361STejun Heo 3338*bba2c361STejun Heo if (!scx_bypassing(sch, cpu_of(rq)) && 3339*bba2c361STejun Heo !sch->warned_zero_slice) { 3340*bba2c361STejun Heo printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", 3341*bba2c361STejun Heo p->comm, p->pid, __func__); 3342*bba2c361STejun Heo sch->warned_zero_slice = true; 3343*bba2c361STejun Heo } 3344*bba2c361STejun Heo refill_task_slice_dfl(sch, p); 3345*bba2c361STejun Heo } 3346*bba2c361STejun Heo } 3347*bba2c361STejun Heo 3348*bba2c361STejun Heo return p; 3349*bba2c361STejun Heo } 3350*bba2c361STejun Heo 3351*bba2c361STejun Heo static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) 3352*bba2c361STejun Heo { 3353*bba2c361STejun Heo return do_pick_task_scx(rq, rf, false); 3354*bba2c361STejun Heo } 3355*bba2c361STejun Heo 3356*bba2c361STejun Heo /* 3357*bba2c361STejun Heo * Select the next task to run from the ext scheduling class. 3358*bba2c361STejun Heo * 3359*bba2c361STejun Heo * Use do_pick_task_scx() directly with @force_scx enabled, since the 3360*bba2c361STejun Heo * dl_server must always select a sched_ext task. 
3361*bba2c361STejun Heo */ 3362*bba2c361STejun Heo static struct task_struct * 3363*bba2c361STejun Heo ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) 3364*bba2c361STejun Heo { 3365*bba2c361STejun Heo if (!scx_enabled()) 3366*bba2c361STejun Heo return NULL; 3367*bba2c361STejun Heo 3368*bba2c361STejun Heo return do_pick_task_scx(dl_se->rq, rf, true); 3369*bba2c361STejun Heo } 3370*bba2c361STejun Heo 3371*bba2c361STejun Heo /* 3372*bba2c361STejun Heo * Initialize the ext server deadline entity. 3373*bba2c361STejun Heo */ 3374*bba2c361STejun Heo void ext_server_init(struct rq *rq) 3375*bba2c361STejun Heo { 3376*bba2c361STejun Heo struct sched_dl_entity *dl_se = &rq->ext_server; 3377*bba2c361STejun Heo 3378*bba2c361STejun Heo init_dl_entity(dl_se); 3379*bba2c361STejun Heo 3380*bba2c361STejun Heo dl_server_init(dl_se, rq, ext_server_pick_task); 3381*bba2c361STejun Heo } 3382*bba2c361STejun Heo 3383*bba2c361STejun Heo #ifdef CONFIG_SCHED_CORE 3384*bba2c361STejun Heo /** 3385*bba2c361STejun Heo * scx_prio_less - Task ordering for core-sched 3386*bba2c361STejun Heo * @a: task A 3387*bba2c361STejun Heo * @b: task B 3388*bba2c361STejun Heo * @in_fi: in forced idle state 3389*bba2c361STejun Heo * 3390*bba2c361STejun Heo * Core-sched is implemented as an additional scheduling layer on top of the 3391*bba2c361STejun Heo * usual sched_class'es and needs to find out the expected task ordering. For 3392*bba2c361STejun Heo * SCX, core-sched calls this function to interrogate the task ordering. 3393*bba2c361STejun Heo * 3394*bba2c361STejun Heo * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used 3395*bba2c361STejun Heo * to implement the default task ordering. The older the timestamp, the higher 3396*bba2c361STejun Heo * priority the task - the global FIFO ordering matching the default scheduling 3397*bba2c361STejun Heo * behavior. 
3398*bba2c361STejun Heo * 3399*bba2c361STejun Heo * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to 3400*bba2c361STejun Heo * implement FIFO ordering within each local DSQ. See pick_task_scx(). 3401*bba2c361STejun Heo */ 3402*bba2c361STejun Heo bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, 3403*bba2c361STejun Heo bool in_fi) 3404*bba2c361STejun Heo { 3405*bba2c361STejun Heo struct scx_sched *sch_a = scx_task_sched(a); 3406*bba2c361STejun Heo struct scx_sched *sch_b = scx_task_sched(b); 3407*bba2c361STejun Heo 3408*bba2c361STejun Heo /* 3409*bba2c361STejun Heo * The const qualifiers are dropped from task_struct pointers when 3410*bba2c361STejun Heo * calling ops.core_sched_before(). Accesses are controlled by the 3411*bba2c361STejun Heo * verifier. 3412*bba2c361STejun Heo */ 3413*bba2c361STejun Heo if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) && 3414*bba2c361STejun Heo !scx_bypassing(sch_a, task_cpu(a))) 3415*bba2c361STejun Heo return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before, 3416*bba2c361STejun Heo task_rq(a), 3417*bba2c361STejun Heo (struct task_struct *)a, 3418*bba2c361STejun Heo (struct task_struct *)b); 3419*bba2c361STejun Heo else 3420*bba2c361STejun Heo return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); 3421*bba2c361STejun Heo } 3422*bba2c361STejun Heo #endif /* CONFIG_SCHED_CORE */ 3423*bba2c361STejun Heo 3424*bba2c361STejun Heo static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) 3425*bba2c361STejun Heo { 3426*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 3427*bba2c361STejun Heo bool bypassing; 3428*bba2c361STejun Heo 3429*bba2c361STejun Heo /* 3430*bba2c361STejun Heo * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it 3431*bba2c361STejun Heo * can be a good migration opportunity with low cache and memory 3432*bba2c361STejun Heo * footprint. 
Returning a CPU different than @prev_cpu triggers 3433*bba2c361STejun Heo * immediate rq migration. However, for SCX, as the current rq 3434*bba2c361STejun Heo * association doesn't dictate where the task is going to run, this 3435*bba2c361STejun Heo * doesn't fit well. If necessary, we can later add a dedicated method 3436*bba2c361STejun Heo * which can decide to preempt self to force it through the regular 3437*bba2c361STejun Heo * scheduling path. 3438*bba2c361STejun Heo */ 3439*bba2c361STejun Heo if (unlikely(wake_flags & WF_EXEC)) 3440*bba2c361STejun Heo return prev_cpu; 3441*bba2c361STejun Heo 3442*bba2c361STejun Heo bypassing = scx_bypassing(sch, task_cpu(p)); 3443*bba2c361STejun Heo if (likely(SCX_HAS_OP(sch, select_cpu)) && !bypassing) { 3444*bba2c361STejun Heo s32 cpu; 3445*bba2c361STejun Heo struct task_struct **ddsp_taskp; 3446*bba2c361STejun Heo 3447*bba2c361STejun Heo ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); 3448*bba2c361STejun Heo WARN_ON_ONCE(*ddsp_taskp); 3449*bba2c361STejun Heo *ddsp_taskp = p; 3450*bba2c361STejun Heo 3451*bba2c361STejun Heo this_rq()->scx.in_select_cpu = true; 3452*bba2c361STejun Heo cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, 3453*bba2c361STejun Heo scx_cpu_arg(prev_cpu), wake_flags); 3454*bba2c361STejun Heo cpu = scx_cpu_ret(sch, cpu); 3455*bba2c361STejun Heo this_rq()->scx.in_select_cpu = false; 3456*bba2c361STejun Heo p->scx.selected_cpu = cpu; 3457*bba2c361STejun Heo *ddsp_taskp = NULL; 3458*bba2c361STejun Heo if (scx_cpu_valid(sch, cpu, "from ops.select_cpu()")) 3459*bba2c361STejun Heo return cpu; 3460*bba2c361STejun Heo else 3461*bba2c361STejun Heo return prev_cpu; 3462*bba2c361STejun Heo } else { 3463*bba2c361STejun Heo s32 cpu; 3464*bba2c361STejun Heo 3465*bba2c361STejun Heo cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); 3466*bba2c361STejun Heo if (cpu >= 0) { 3467*bba2c361STejun Heo refill_task_slice_dfl(sch, p); 3468*bba2c361STejun Heo p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; 
3469*bba2c361STejun Heo } else { 3470*bba2c361STejun Heo cpu = prev_cpu; 3471*bba2c361STejun Heo } 3472*bba2c361STejun Heo p->scx.selected_cpu = cpu; 3473*bba2c361STejun Heo 3474*bba2c361STejun Heo if (bypassing) 3475*bba2c361STejun Heo __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 3476*bba2c361STejun Heo return cpu; 3477*bba2c361STejun Heo } 3478*bba2c361STejun Heo } 3479*bba2c361STejun Heo 3480*bba2c361STejun Heo static void task_woken_scx(struct rq *rq, struct task_struct *p) 3481*bba2c361STejun Heo { 3482*bba2c361STejun Heo run_deferred(rq); 3483*bba2c361STejun Heo } 3484*bba2c361STejun Heo 3485*bba2c361STejun Heo static void set_cpus_allowed_scx(struct task_struct *p, 3486*bba2c361STejun Heo struct affinity_context *ac) 3487*bba2c361STejun Heo { 3488*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 3489*bba2c361STejun Heo 3490*bba2c361STejun Heo set_cpus_allowed_common(p, ac); 3491*bba2c361STejun Heo 3492*bba2c361STejun Heo if (task_dead_and_done(p)) 3493*bba2c361STejun Heo return; 3494*bba2c361STejun Heo 3495*bba2c361STejun Heo /* 3496*bba2c361STejun Heo * The effective cpumask is stored in @p->cpus_ptr which may temporarily 3497*bba2c361STejun Heo * differ from the configured one in @p->cpus_mask. Always tell the bpf 3498*bba2c361STejun Heo * scheduler the effective one. 3499*bba2c361STejun Heo * 3500*bba2c361STejun Heo * Fine-grained memory write control is enforced by BPF making the const 3501*bba2c361STejun Heo * designation pointless. Cast it away when calling the operation. 
3502*bba2c361STejun Heo */ 3503*bba2c361STejun Heo if (SCX_HAS_OP(sch, set_cpumask)) 3504*bba2c361STejun Heo scx_call_op_set_cpumask(sch, task_rq(p), p, (struct cpumask *)p->cpus_ptr); 3505*bba2c361STejun Heo } 3506*bba2c361STejun Heo 3507*bba2c361STejun Heo static void handle_hotplug(struct rq *rq, bool online) 3508*bba2c361STejun Heo { 3509*bba2c361STejun Heo struct scx_sched *sch = scx_root; 3510*bba2c361STejun Heo s32 cpu = cpu_of(rq); 3511*bba2c361STejun Heo 3512*bba2c361STejun Heo atomic_long_inc(&scx_hotplug_seq); 3513*bba2c361STejun Heo 3514*bba2c361STejun Heo /* 3515*bba2c361STejun Heo * scx_root updates are protected by cpus_read_lock() and will stay 3516*bba2c361STejun Heo * stable here. Note that we can't depend on scx_enabled() test as the 3517*bba2c361STejun Heo * hotplug ops need to be enabled before __scx_enabled is set. 3518*bba2c361STejun Heo */ 3519*bba2c361STejun Heo if (unlikely(!sch)) 3520*bba2c361STejun Heo return; 3521*bba2c361STejun Heo 3522*bba2c361STejun Heo if (scx_enabled()) 3523*bba2c361STejun Heo scx_idle_update_selcpu_topology(&sch->ops); 3524*bba2c361STejun Heo 3525*bba2c361STejun Heo if (online && SCX_HAS_OP(sch, cpu_online)) 3526*bba2c361STejun Heo SCX_CALL_OP(sch, cpu_online, NULL, scx_cpu_arg(cpu)); 3527*bba2c361STejun Heo else if (!online && SCX_HAS_OP(sch, cpu_offline)) 3528*bba2c361STejun Heo SCX_CALL_OP(sch, cpu_offline, NULL, scx_cpu_arg(cpu)); 3529*bba2c361STejun Heo else 3530*bba2c361STejun Heo scx_exit(sch, SCX_EXIT_UNREG_KERN, 3531*bba2c361STejun Heo SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 3532*bba2c361STejun Heo "cpu %d going %s, exiting scheduler", cpu, 3533*bba2c361STejun Heo online ? 
"online" : "offline"); 3534*bba2c361STejun Heo } 3535*bba2c361STejun Heo 3536*bba2c361STejun Heo void scx_rq_activate(struct rq *rq) 3537*bba2c361STejun Heo { 3538*bba2c361STejun Heo handle_hotplug(rq, true); 3539*bba2c361STejun Heo } 3540*bba2c361STejun Heo 3541*bba2c361STejun Heo void scx_rq_deactivate(struct rq *rq) 3542*bba2c361STejun Heo { 3543*bba2c361STejun Heo handle_hotplug(rq, false); 3544*bba2c361STejun Heo } 3545*bba2c361STejun Heo 3546*bba2c361STejun Heo static void rq_online_scx(struct rq *rq) 3547*bba2c361STejun Heo { 3548*bba2c361STejun Heo rq->scx.flags |= SCX_RQ_ONLINE; 3549*bba2c361STejun Heo } 3550*bba2c361STejun Heo 3551*bba2c361STejun Heo static void rq_offline_scx(struct rq *rq) 3552*bba2c361STejun Heo { 3553*bba2c361STejun Heo rq->scx.flags &= ~SCX_RQ_ONLINE; 3554*bba2c361STejun Heo } 3555*bba2c361STejun Heo 3556*bba2c361STejun Heo static bool check_rq_for_timeouts(struct rq *rq) 3557*bba2c361STejun Heo { 3558*bba2c361STejun Heo struct scx_sched *sch; 3559*bba2c361STejun Heo struct task_struct *p; 3560*bba2c361STejun Heo struct rq_flags rf; 3561*bba2c361STejun Heo bool timed_out = false; 3562*bba2c361STejun Heo 3563*bba2c361STejun Heo rq_lock_irqsave(rq, &rf); 3564*bba2c361STejun Heo sch = rcu_dereference_bh(scx_root); 3565*bba2c361STejun Heo if (unlikely(!sch)) 3566*bba2c361STejun Heo goto out_unlock; 3567*bba2c361STejun Heo 3568*bba2c361STejun Heo list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { 3569*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 3570*bba2c361STejun Heo unsigned long last_runnable = p->scx.runnable_at; 3571*bba2c361STejun Heo 3572*bba2c361STejun Heo if (unlikely(time_after(jiffies, 3573*bba2c361STejun Heo last_runnable + READ_ONCE(sch->watchdog_timeout)))) { 3574*bba2c361STejun Heo u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); 3575*bba2c361STejun Heo 3576*bba2c361STejun Heo __scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, cpu_of(rq), 3577*bba2c361STejun Heo "%s[%d] failed to run for 
%u.%03us", 3578*bba2c361STejun Heo p->comm, p->pid, dur_ms / 1000, 3579*bba2c361STejun Heo dur_ms % 1000); 3580*bba2c361STejun Heo timed_out = true; 3581*bba2c361STejun Heo break; 3582*bba2c361STejun Heo } 3583*bba2c361STejun Heo } 3584*bba2c361STejun Heo out_unlock: 3585*bba2c361STejun Heo rq_unlock_irqrestore(rq, &rf); 3586*bba2c361STejun Heo return timed_out; 3587*bba2c361STejun Heo } 3588*bba2c361STejun Heo 3589*bba2c361STejun Heo static void scx_watchdog_workfn(struct work_struct *work) 3590*bba2c361STejun Heo { 3591*bba2c361STejun Heo unsigned long intv; 3592*bba2c361STejun Heo int cpu; 3593*bba2c361STejun Heo 3594*bba2c361STejun Heo WRITE_ONCE(scx_watchdog_timestamp, jiffies); 3595*bba2c361STejun Heo 3596*bba2c361STejun Heo for_each_online_cpu(cpu) { 3597*bba2c361STejun Heo if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) 3598*bba2c361STejun Heo break; 3599*bba2c361STejun Heo 3600*bba2c361STejun Heo cond_resched(); 3601*bba2c361STejun Heo } 3602*bba2c361STejun Heo 3603*bba2c361STejun Heo intv = READ_ONCE(scx_watchdog_interval); 3604*bba2c361STejun Heo if (intv < ULONG_MAX) 3605*bba2c361STejun Heo queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv); 3606*bba2c361STejun Heo } 3607*bba2c361STejun Heo 3608*bba2c361STejun Heo void scx_tick(struct rq *rq) 3609*bba2c361STejun Heo { 3610*bba2c361STejun Heo struct scx_sched *root; 3611*bba2c361STejun Heo unsigned long last_check; 3612*bba2c361STejun Heo 3613*bba2c361STejun Heo if (!scx_enabled()) 3614*bba2c361STejun Heo return; 3615*bba2c361STejun Heo 3616*bba2c361STejun Heo root = rcu_dereference_bh(scx_root); 3617*bba2c361STejun Heo if (unlikely(!root)) 3618*bba2c361STejun Heo return; 3619*bba2c361STejun Heo 3620*bba2c361STejun Heo last_check = READ_ONCE(scx_watchdog_timestamp); 3621*bba2c361STejun Heo if (unlikely(time_after(jiffies, 3622*bba2c361STejun Heo last_check + READ_ONCE(root->watchdog_timeout)))) { 3623*bba2c361STejun Heo u32 dur_ms = jiffies_to_msecs(jiffies - last_check); 
3624*bba2c361STejun Heo 3625*bba2c361STejun Heo scx_exit(root, SCX_EXIT_ERROR_STALL, 0, 3626*bba2c361STejun Heo "watchdog failed to check in for %u.%03us", 3627*bba2c361STejun Heo dur_ms / 1000, dur_ms % 1000); 3628*bba2c361STejun Heo } 3629*bba2c361STejun Heo 3630*bba2c361STejun Heo update_other_load_avgs(rq); 3631*bba2c361STejun Heo } 3632*bba2c361STejun Heo 3633*bba2c361STejun Heo static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) 3634*bba2c361STejun Heo { 3635*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(curr); 3636*bba2c361STejun Heo 3637*bba2c361STejun Heo update_curr_scx(rq); 3638*bba2c361STejun Heo 3639*bba2c361STejun Heo /* 3640*bba2c361STejun Heo * While disabling, always resched and refresh core-sched timestamp as 3641*bba2c361STejun Heo * we can't trust the slice management or ops.core_sched_before(). 3642*bba2c361STejun Heo */ 3643*bba2c361STejun Heo if (scx_bypassing(sch, cpu_of(rq))) { 3644*bba2c361STejun Heo curr->scx.slice = 0; 3645*bba2c361STejun Heo touch_core_sched(rq, curr); 3646*bba2c361STejun Heo } else if (SCX_HAS_OP(sch, tick)) { 3647*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, tick, rq, curr); 3648*bba2c361STejun Heo } 3649*bba2c361STejun Heo 3650*bba2c361STejun Heo if (!curr->scx.slice) 3651*bba2c361STejun Heo resched_curr(rq); 3652*bba2c361STejun Heo } 3653*bba2c361STejun Heo 3654*bba2c361STejun Heo #ifdef CONFIG_EXT_GROUP_SCHED 3655*bba2c361STejun Heo static struct cgroup *tg_cgrp(struct task_group *tg) 3656*bba2c361STejun Heo { 3657*bba2c361STejun Heo /* 3658*bba2c361STejun Heo * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, 3659*bba2c361STejun Heo * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the 3660*bba2c361STejun Heo * root cgroup. 
3661*bba2c361STejun Heo */ 3662*bba2c361STejun Heo if (tg && tg->css.cgroup) 3663*bba2c361STejun Heo return tg->css.cgroup; 3664*bba2c361STejun Heo else 3665*bba2c361STejun Heo return &cgrp_dfl_root.cgrp; 3666*bba2c361STejun Heo } 3667*bba2c361STejun Heo 3668*bba2c361STejun Heo #define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), 3669*bba2c361STejun Heo 3670*bba2c361STejun Heo #else /* CONFIG_EXT_GROUP_SCHED */ 3671*bba2c361STejun Heo 3672*bba2c361STejun Heo #define SCX_INIT_TASK_ARGS_CGROUP(tg) 3673*bba2c361STejun Heo 3674*bba2c361STejun Heo #endif /* CONFIG_EXT_GROUP_SCHED */ 3675*bba2c361STejun Heo 3676*bba2c361STejun Heo static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) 3677*bba2c361STejun Heo { 3678*bba2c361STejun Heo int ret; 3679*bba2c361STejun Heo 3680*bba2c361STejun Heo p->scx.disallow = false; 3681*bba2c361STejun Heo 3682*bba2c361STejun Heo if (SCX_HAS_OP(sch, init_task)) { 3683*bba2c361STejun Heo struct scx_init_task_args args = { 3684*bba2c361STejun Heo SCX_INIT_TASK_ARGS_CGROUP(task_group(p)) 3685*bba2c361STejun Heo .fork = fork, 3686*bba2c361STejun Heo }; 3687*bba2c361STejun Heo 3688*bba2c361STejun Heo ret = SCX_CALL_OP_RET(sch, init_task, NULL, p, &args); 3689*bba2c361STejun Heo if (unlikely(ret)) { 3690*bba2c361STejun Heo ret = ops_sanitize_err(sch, "init_task", ret); 3691*bba2c361STejun Heo return ret; 3692*bba2c361STejun Heo } 3693*bba2c361STejun Heo } 3694*bba2c361STejun Heo 3695*bba2c361STejun Heo if (p->scx.disallow) { 3696*bba2c361STejun Heo if (unlikely(scx_parent(sch))) { 3697*bba2c361STejun Heo scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]", 3698*bba2c361STejun Heo p->comm, p->pid); 3699*bba2c361STejun Heo } else if (unlikely(fork)) { 3700*bba2c361STejun Heo scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", 3701*bba2c361STejun Heo p->comm, p->pid); 3702*bba2c361STejun Heo } else { 3703*bba2c361STejun Heo struct rq *rq; 3704*bba2c361STejun 
Heo struct rq_flags rf; 3705*bba2c361STejun Heo 3706*bba2c361STejun Heo rq = task_rq_lock(p, &rf); 3707*bba2c361STejun Heo 3708*bba2c361STejun Heo /* 3709*bba2c361STejun Heo * We're in the load path and @p->policy will be applied 3710*bba2c361STejun Heo * right after. Reverting @p->policy here and rejecting 3711*bba2c361STejun Heo * %SCHED_EXT transitions from scx_check_setscheduler() 3712*bba2c361STejun Heo * guarantees that if ops.init_task() sets @p->disallow, 3713*bba2c361STejun Heo * @p can never be in SCX. 3714*bba2c361STejun Heo */ 3715*bba2c361STejun Heo if (p->policy == SCHED_EXT) { 3716*bba2c361STejun Heo p->policy = SCHED_NORMAL; 3717*bba2c361STejun Heo atomic_long_inc(&scx_nr_rejected); 3718*bba2c361STejun Heo } 3719*bba2c361STejun Heo 3720*bba2c361STejun Heo task_rq_unlock(rq, p, &rf); 3721*bba2c361STejun Heo } 3722*bba2c361STejun Heo } 3723*bba2c361STejun Heo 3724*bba2c361STejun Heo return 0; 3725*bba2c361STejun Heo } 3726*bba2c361STejun Heo 3727*bba2c361STejun Heo static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p) 3728*bba2c361STejun Heo { 3729*bba2c361STejun Heo struct rq *rq = task_rq(p); 3730*bba2c361STejun Heo u32 weight; 3731*bba2c361STejun Heo 3732*bba2c361STejun Heo lockdep_assert_rq_held(rq); 3733*bba2c361STejun Heo 3734*bba2c361STejun Heo /* 3735*bba2c361STejun Heo * Verify the task is not in BPF scheduler's custody. If flag 3736*bba2c361STejun Heo * transitions are consistent, the flag should always be clear 3737*bba2c361STejun Heo * here. 3738*bba2c361STejun Heo */ 3739*bba2c361STejun Heo WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); 3740*bba2c361STejun Heo 3741*bba2c361STejun Heo /* 3742*bba2c361STejun Heo * Set the weight before calling ops.enable() so that the scheduler 3743*bba2c361STejun Heo * doesn't see a stale value if they inspect the task struct. 
3744*bba2c361STejun Heo */ 3745*bba2c361STejun Heo if (task_has_idle_policy(p)) 3746*bba2c361STejun Heo weight = WEIGHT_IDLEPRIO; 3747*bba2c361STejun Heo else 3748*bba2c361STejun Heo weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; 3749*bba2c361STejun Heo 3750*bba2c361STejun Heo p->scx.weight = sched_weight_to_cgroup(weight); 3751*bba2c361STejun Heo 3752*bba2c361STejun Heo if (SCX_HAS_OP(sch, enable)) 3753*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, enable, rq, p); 3754*bba2c361STejun Heo 3755*bba2c361STejun Heo if (SCX_HAS_OP(sch, set_weight)) 3756*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); 3757*bba2c361STejun Heo } 3758*bba2c361STejun Heo 3759*bba2c361STejun Heo static void scx_enable_task(struct scx_sched *sch, struct task_struct *p) 3760*bba2c361STejun Heo { 3761*bba2c361STejun Heo __scx_enable_task(sch, p); 3762*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_ENABLED); 3763*bba2c361STejun Heo } 3764*bba2c361STejun Heo 3765*bba2c361STejun Heo static void scx_disable_task(struct scx_sched *sch, struct task_struct *p) 3766*bba2c361STejun Heo { 3767*bba2c361STejun Heo struct rq *rq = task_rq(p); 3768*bba2c361STejun Heo 3769*bba2c361STejun Heo lockdep_assert_rq_held(rq); 3770*bba2c361STejun Heo WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); 3771*bba2c361STejun Heo 3772*bba2c361STejun Heo clear_direct_dispatch(p); 3773*bba2c361STejun Heo 3774*bba2c361STejun Heo if (SCX_HAS_OP(sch, disable)) 3775*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, disable, rq, p); 3776*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_READY); 3777*bba2c361STejun Heo 3778*bba2c361STejun Heo /* 3779*bba2c361STejun Heo * Verify the task is not in BPF scheduler's custody. If flag 3780*bba2c361STejun Heo * transitions are consistent, the flag should always be clear 3781*bba2c361STejun Heo * here. 
3782*bba2c361STejun Heo */ 3783*bba2c361STejun Heo WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); 3784*bba2c361STejun Heo } 3785*bba2c361STejun Heo 3786*bba2c361STejun Heo static void __scx_disable_and_exit_task(struct scx_sched *sch, 3787*bba2c361STejun Heo struct task_struct *p) 3788*bba2c361STejun Heo { 3789*bba2c361STejun Heo struct scx_exit_task_args args = { 3790*bba2c361STejun Heo .cancelled = false, 3791*bba2c361STejun Heo }; 3792*bba2c361STejun Heo 3793*bba2c361STejun Heo lockdep_assert_held(&p->pi_lock); 3794*bba2c361STejun Heo lockdep_assert_rq_held(task_rq(p)); 3795*bba2c361STejun Heo 3796*bba2c361STejun Heo switch (scx_get_task_state(p)) { 3797*bba2c361STejun Heo case SCX_TASK_NONE: 3798*bba2c361STejun Heo return; 3799*bba2c361STejun Heo case SCX_TASK_INIT: 3800*bba2c361STejun Heo args.cancelled = true; 3801*bba2c361STejun Heo break; 3802*bba2c361STejun Heo case SCX_TASK_READY: 3803*bba2c361STejun Heo break; 3804*bba2c361STejun Heo case SCX_TASK_ENABLED: 3805*bba2c361STejun Heo scx_disable_task(sch, p); 3806*bba2c361STejun Heo break; 3807*bba2c361STejun Heo default: 3808*bba2c361STejun Heo WARN_ON_ONCE(true); 3809*bba2c361STejun Heo return; 3810*bba2c361STejun Heo } 3811*bba2c361STejun Heo 3812*bba2c361STejun Heo if (SCX_HAS_OP(sch, exit_task)) 3813*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); 3814*bba2c361STejun Heo } 3815*bba2c361STejun Heo 3816*bba2c361STejun Heo /* 3817*bba2c361STejun Heo * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never 3818*bba2c361STejun Heo * ran. The task state has not been transitioned, so this mirrors the 3819*bba2c361STejun Heo * SCX_TASK_INIT branch in __scx_disable_and_exit_task(). 
3820*bba2c361STejun Heo */ 3821*bba2c361STejun Heo static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p) 3822*bba2c361STejun Heo { 3823*bba2c361STejun Heo struct scx_exit_task_args args = { .cancelled = true }; 3824*bba2c361STejun Heo 3825*bba2c361STejun Heo lockdep_assert_held(&p->pi_lock); 3826*bba2c361STejun Heo lockdep_assert_rq_held(task_rq(p)); 3827*bba2c361STejun Heo 3828*bba2c361STejun Heo if (SCX_HAS_OP(sch, exit_task)) 3829*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); 3830*bba2c361STejun Heo } 3831*bba2c361STejun Heo 3832*bba2c361STejun Heo static void scx_disable_and_exit_task(struct scx_sched *sch, 3833*bba2c361STejun Heo struct task_struct *p) 3834*bba2c361STejun Heo { 3835*bba2c361STejun Heo __scx_disable_and_exit_task(sch, p); 3836*bba2c361STejun Heo 3837*bba2c361STejun Heo /* 3838*bba2c361STejun Heo * If set, @p exited between __scx_init_task() and scx_enable_task() in 3839*bba2c361STejun Heo * scx_sub_enable() and is initialized for both the associated sched and 3840*bba2c361STejun Heo * its parent. Exit for the child too - scx_enable_task() never ran for 3841*bba2c361STejun Heo * it, so undo only init_task. The flag is only set on the sub-enable 3842*bba2c361STejun Heo * path, so it's always clear when @p arrives here in %SCX_TASK_NONE. 
3843*bba2c361STejun Heo */ 3844*bba2c361STejun Heo if (p->scx.flags & SCX_TASK_SUB_INIT) { 3845*bba2c361STejun Heo if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) 3846*bba2c361STejun Heo scx_sub_init_cancel_task(scx_enabling_sub_sched, p); 3847*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_SUB_INIT; 3848*bba2c361STejun Heo } 3849*bba2c361STejun Heo 3850*bba2c361STejun Heo scx_set_task_sched(p, NULL); 3851*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_NONE); 3852*bba2c361STejun Heo } 3853*bba2c361STejun Heo 3854*bba2c361STejun Heo void init_scx_entity(struct sched_ext_entity *scx) 3855*bba2c361STejun Heo { 3856*bba2c361STejun Heo memset(scx, 0, sizeof(*scx)); 3857*bba2c361STejun Heo INIT_LIST_HEAD(&scx->dsq_list.node); 3858*bba2c361STejun Heo RB_CLEAR_NODE(&scx->dsq_priq); 3859*bba2c361STejun Heo scx->sticky_cpu = -1; 3860*bba2c361STejun Heo scx->holding_cpu = -1; 3861*bba2c361STejun Heo INIT_LIST_HEAD(&scx->runnable_node); 3862*bba2c361STejun Heo scx->runnable_at = jiffies; 3863*bba2c361STejun Heo scx->ddsp_dsq_id = SCX_DSQ_INVALID; 3864*bba2c361STejun Heo scx->slice = SCX_SLICE_DFL; 3865*bba2c361STejun Heo } 3866*bba2c361STejun Heo 3867*bba2c361STejun Heo /* See scx_tid_alloc / scx_tid_cursor. 
*/ 3868*bba2c361STejun Heo static u64 scx_alloc_tid(void) 3869*bba2c361STejun Heo { 3870*bba2c361STejun Heo struct scx_tid_alloc *ta; 3871*bba2c361STejun Heo 3872*bba2c361STejun Heo guard(preempt)(); 3873*bba2c361STejun Heo ta = this_cpu_ptr(&scx_tid_alloc); 3874*bba2c361STejun Heo 3875*bba2c361STejun Heo if (unlikely(ta->next >= ta->end)) { 3876*bba2c361STejun Heo ta->next = atomic64_fetch_add(SCX_TID_CHUNK, &scx_tid_cursor); 3877*bba2c361STejun Heo ta->end = ta->next + SCX_TID_CHUNK; 3878*bba2c361STejun Heo } 3879*bba2c361STejun Heo return ta->next++; 3880*bba2c361STejun Heo } 3881*bba2c361STejun Heo 3882*bba2c361STejun Heo static void scx_tid_hash_insert(struct task_struct *p) 3883*bba2c361STejun Heo { 3884*bba2c361STejun Heo int ret; 3885*bba2c361STejun Heo 3886*bba2c361STejun Heo lockdep_assert_held(&scx_tasks_lock); 3887*bba2c361STejun Heo 3888*bba2c361STejun Heo ret = rhashtable_lookup_insert_fast(&scx_tid_hash, 3889*bba2c361STejun Heo &p->scx.tid_hash_node, 3890*bba2c361STejun Heo scx_tid_hash_params); 3891*bba2c361STejun Heo WARN_ON_ONCE(ret); 3892*bba2c361STejun Heo } 3893*bba2c361STejun Heo 3894*bba2c361STejun Heo void scx_pre_fork(struct task_struct *p) 3895*bba2c361STejun Heo { 3896*bba2c361STejun Heo /* 3897*bba2c361STejun Heo * BPF scheduler enable/disable paths want to be able to iterate and 3898*bba2c361STejun Heo * update all tasks which can become complex when racing forks. As 3899*bba2c361STejun Heo * enable/disable are very cold paths, let's use a percpu_rwsem to 3900*bba2c361STejun Heo * exclude forks. 
3901*bba2c361STejun Heo */ 3902*bba2c361STejun Heo percpu_down_read(&scx_fork_rwsem); 3903*bba2c361STejun Heo } 3904*bba2c361STejun Heo 3905*bba2c361STejun Heo int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) 3906*bba2c361STejun Heo { 3907*bba2c361STejun Heo s32 ret; 3908*bba2c361STejun Heo 3909*bba2c361STejun Heo percpu_rwsem_assert_held(&scx_fork_rwsem); 3910*bba2c361STejun Heo 3911*bba2c361STejun Heo p->scx.tid = scx_alloc_tid(); 3912*bba2c361STejun Heo 3913*bba2c361STejun Heo if (scx_init_task_enabled) { 3914*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 3915*bba2c361STejun Heo struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched; 3916*bba2c361STejun Heo #else 3917*bba2c361STejun Heo struct scx_sched *sch = scx_root; 3918*bba2c361STejun Heo #endif 3919*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 3920*bba2c361STejun Heo ret = __scx_init_task(sch, p, true); 3921*bba2c361STejun Heo if (unlikely(ret)) { 3922*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_NONE); 3923*bba2c361STejun Heo return ret; 3924*bba2c361STejun Heo } 3925*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_INIT); 3926*bba2c361STejun Heo scx_set_task_sched(p, sch); 3927*bba2c361STejun Heo } 3928*bba2c361STejun Heo 3929*bba2c361STejun Heo return 0; 3930*bba2c361STejun Heo } 3931*bba2c361STejun Heo 3932*bba2c361STejun Heo void scx_post_fork(struct task_struct *p) 3933*bba2c361STejun Heo { 3934*bba2c361STejun Heo if (scx_init_task_enabled) { 3935*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_READY); 3936*bba2c361STejun Heo 3937*bba2c361STejun Heo /* 3938*bba2c361STejun Heo * Enable the task immediately if it's running on sched_ext. 3939*bba2c361STejun Heo * Otherwise, it'll be enabled in switching_to_scx() if and 3940*bba2c361STejun Heo * when it's ever configured to run with a SCHED_EXT policy. 
3941*bba2c361STejun Heo */ 3942*bba2c361STejun Heo if (p->sched_class == &ext_sched_class) { 3943*bba2c361STejun Heo struct rq_flags rf; 3944*bba2c361STejun Heo struct rq *rq; 3945*bba2c361STejun Heo 3946*bba2c361STejun Heo rq = task_rq_lock(p, &rf); 3947*bba2c361STejun Heo scx_enable_task(scx_task_sched(p), p); 3948*bba2c361STejun Heo task_rq_unlock(rq, p, &rf); 3949*bba2c361STejun Heo } 3950*bba2c361STejun Heo } 3951*bba2c361STejun Heo 3952*bba2c361STejun Heo scoped_guard(raw_spinlock_irq, &scx_tasks_lock) { 3953*bba2c361STejun Heo list_add_tail(&p->scx.tasks_node, &scx_tasks); 3954*bba2c361STejun Heo if (scx_tid_to_task_enabled()) 3955*bba2c361STejun Heo scx_tid_hash_insert(p); 3956*bba2c361STejun Heo } 3957*bba2c361STejun Heo 3958*bba2c361STejun Heo percpu_up_read(&scx_fork_rwsem); 3959*bba2c361STejun Heo } 3960*bba2c361STejun Heo 3961*bba2c361STejun Heo void scx_cancel_fork(struct task_struct *p) 3962*bba2c361STejun Heo { 3963*bba2c361STejun Heo if (scx_enabled()) { 3964*bba2c361STejun Heo struct rq *rq; 3965*bba2c361STejun Heo struct rq_flags rf; 3966*bba2c361STejun Heo 3967*bba2c361STejun Heo rq = task_rq_lock(p, &rf); 3968*bba2c361STejun Heo WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); 3969*bba2c361STejun Heo scx_disable_and_exit_task(scx_task_sched(p), p); 3970*bba2c361STejun Heo task_rq_unlock(rq, p, &rf); 3971*bba2c361STejun Heo } 3972*bba2c361STejun Heo 3973*bba2c361STejun Heo percpu_up_read(&scx_fork_rwsem); 3974*bba2c361STejun Heo } 3975*bba2c361STejun Heo 3976*bba2c361STejun Heo /** 3977*bba2c361STejun Heo * task_dead_and_done - Is a task dead and done running? 3978*bba2c361STejun Heo * @p: target task 3979*bba2c361STejun Heo * 3980*bba2c361STejun Heo * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the 3981*bba2c361STejun Heo * task no longer exists from SCX's POV. However, certain sched_class ops may be 3982*bba2c361STejun Heo * invoked on these dead tasks leading to failures - e.g. 
sched_setscheduler() 3983*bba2c361STejun Heo * may try to switch a task which finished sched_ext_dead() back into SCX 3984*bba2c361STejun Heo * triggering invalid SCX task state transitions and worse. 3985*bba2c361STejun Heo * 3986*bba2c361STejun Heo * Once a task has finished the final switch, sched_ext_dead() is the only thing 3987*bba2c361STejun Heo * that needs to happen on the task. Use this test to short-circuit sched_class 3988*bba2c361STejun Heo * operations which may be called on dead tasks. 3989*bba2c361STejun Heo */ 3990*bba2c361STejun Heo static bool task_dead_and_done(struct task_struct *p) 3991*bba2c361STejun Heo { 3992*bba2c361STejun Heo struct rq *rq = task_rq(p); 3993*bba2c361STejun Heo 3994*bba2c361STejun Heo lockdep_assert_rq_held(rq); 3995*bba2c361STejun Heo 3996*bba2c361STejun Heo /* 3997*bba2c361STejun Heo * In do_task_dead(), a dying task sets %TASK_DEAD with preemption 3998*bba2c361STejun Heo * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p 3999*bba2c361STejun Heo * won't ever run again. 4000*bba2c361STejun Heo */ 4001*bba2c361STejun Heo return unlikely(READ_ONCE(p->__state) == TASK_DEAD) && 4002*bba2c361STejun Heo !task_on_cpu(rq, p); 4003*bba2c361STejun Heo } 4004*bba2c361STejun Heo 4005*bba2c361STejun Heo void sched_ext_dead(struct task_struct *p) 4006*bba2c361STejun Heo { 4007*bba2c361STejun Heo /* 4008*bba2c361STejun Heo * By the time control reaches here, @p has %TASK_DEAD set, switched out 4009*bba2c361STejun Heo * for the last time and then dropped the rq lock - task_dead_and_done() 4010*bba2c361STejun Heo * should be returning %true nullifying the straggling sched_class ops. 4011*bba2c361STejun Heo * Remove from scx_tasks and exit @p. 
4012*bba2c361STejun Heo */ 4013*bba2c361STejun Heo scoped_guard(raw_spinlock_irqsave, &scx_tasks_lock) { 4014*bba2c361STejun Heo list_del_init(&p->scx.tasks_node); 4015*bba2c361STejun Heo if (scx_tid_to_task_enabled()) 4016*bba2c361STejun Heo rhashtable_remove_fast(&scx_tid_hash, 4017*bba2c361STejun Heo &p->scx.tid_hash_node, 4018*bba2c361STejun Heo scx_tid_hash_params); 4019*bba2c361STejun Heo } 4020*bba2c361STejun Heo 4021*bba2c361STejun Heo /* 4022*bba2c361STejun Heo * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> 4023*bba2c361STejun Heo * ENABLED transitions can't race us. Disable ops for @p. 4024*bba2c361STejun Heo * 4025*bba2c361STejun Heo * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see 4026*bba2c361STejun Heo * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup 4027*bba2c361STejun Heo * iteration is only used from sub-sched paths, which require root 4028*bba2c361STejun Heo * enabled. Root enable transitions every live task to at least READY. 4029*bba2c361STejun Heo * 4030*bba2c361STejun Heo * %INIT_BEGIN means ops.init_task() is running for @p. Don't call 4031*bba2c361STejun Heo * into ops; transition to %DEAD so the post-init recheck unwinds 4032*bba2c361STejun Heo * via scx_sub_init_cancel_task(). 
4033*bba2c361STejun Heo */ 4034*bba2c361STejun Heo if (scx_get_task_state(p) != SCX_TASK_NONE) { 4035*bba2c361STejun Heo struct rq_flags rf; 4036*bba2c361STejun Heo struct rq *rq; 4037*bba2c361STejun Heo 4038*bba2c361STejun Heo rq = task_rq_lock(p, &rf); 4039*bba2c361STejun Heo if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN) 4040*bba2c361STejun Heo scx_disable_and_exit_task(scx_task_sched(p), p); 4041*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_DEAD); 4042*bba2c361STejun Heo task_rq_unlock(rq, p, &rf); 4043*bba2c361STejun Heo } 4044*bba2c361STejun Heo } 4045*bba2c361STejun Heo 4046*bba2c361STejun Heo static void reweight_task_scx(struct rq *rq, struct task_struct *p, 4047*bba2c361STejun Heo const struct load_weight *lw) 4048*bba2c361STejun Heo { 4049*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 4050*bba2c361STejun Heo 4051*bba2c361STejun Heo lockdep_assert_rq_held(task_rq(p)); 4052*bba2c361STejun Heo 4053*bba2c361STejun Heo if (task_dead_and_done(p)) 4054*bba2c361STejun Heo return; 4055*bba2c361STejun Heo 4056*bba2c361STejun Heo p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); 4057*bba2c361STejun Heo if (SCX_HAS_OP(sch, set_weight)) 4058*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); 4059*bba2c361STejun Heo } 4060*bba2c361STejun Heo 4061*bba2c361STejun Heo static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) 4062*bba2c361STejun Heo { 4063*bba2c361STejun Heo } 4064*bba2c361STejun Heo 4065*bba2c361STejun Heo static void switching_to_scx(struct rq *rq, struct task_struct *p) 4066*bba2c361STejun Heo { 4067*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 4068*bba2c361STejun Heo 4069*bba2c361STejun Heo if (task_dead_and_done(p)) 4070*bba2c361STejun Heo return; 4071*bba2c361STejun Heo 4072*bba2c361STejun Heo scx_enable_task(sch, p); 4073*bba2c361STejun Heo 4074*bba2c361STejun Heo /* 4075*bba2c361STejun Heo * set_cpus_allowed_scx() is not called while @p is 
associated with a 4076*bba2c361STejun Heo * different scheduler class. Keep the BPF scheduler up-to-date. 4077*bba2c361STejun Heo */ 4078*bba2c361STejun Heo if (SCX_HAS_OP(sch, set_cpumask)) 4079*bba2c361STejun Heo scx_call_op_set_cpumask(sch, rq, p, (struct cpumask *)p->cpus_ptr); 4080*bba2c361STejun Heo } 4081*bba2c361STejun Heo 4082*bba2c361STejun Heo static void switched_from_scx(struct rq *rq, struct task_struct *p) 4083*bba2c361STejun Heo { 4084*bba2c361STejun Heo if (task_dead_and_done(p)) 4085*bba2c361STejun Heo return; 4086*bba2c361STejun Heo 4087*bba2c361STejun Heo /* 4088*bba2c361STejun Heo * %NONE means SCX is no longer tracking @p at the task level (e.g. 4089*bba2c361STejun Heo * scx_fail_parent() handed @p back to the parent at NONE pending the 4090*bba2c361STejun Heo * parent's own teardown). There is nothing to disable; calling 4091*bba2c361STejun Heo * scx_disable_task() would WARN on the non-%ENABLED state and trigger a 4092*bba2c361STejun Heo * NONE -> READY validation failure. 
4093*bba2c361STejun Heo */ 4094*bba2c361STejun Heo if (scx_get_task_state(p) == SCX_TASK_NONE) 4095*bba2c361STejun Heo return; 4096*bba2c361STejun Heo 4097*bba2c361STejun Heo scx_disable_task(scx_task_sched(p), p); 4098*bba2c361STejun Heo } 4099*bba2c361STejun Heo 4100*bba2c361STejun Heo static void switched_to_scx(struct rq *rq, struct task_struct *p) {} 4101*bba2c361STejun Heo 4102*bba2c361STejun Heo int scx_check_setscheduler(struct task_struct *p, int policy) 4103*bba2c361STejun Heo { 4104*bba2c361STejun Heo lockdep_assert_rq_held(task_rq(p)); 4105*bba2c361STejun Heo 4106*bba2c361STejun Heo /* if disallow, reject transitioning into SCX */ 4107*bba2c361STejun Heo if (scx_enabled() && READ_ONCE(p->scx.disallow) && 4108*bba2c361STejun Heo p->policy != policy && policy == SCHED_EXT) 4109*bba2c361STejun Heo return -EACCES; 4110*bba2c361STejun Heo 4111*bba2c361STejun Heo return 0; 4112*bba2c361STejun Heo } 4113*bba2c361STejun Heo 4114*bba2c361STejun Heo static void process_ddsp_deferred_locals(struct rq *rq) 4115*bba2c361STejun Heo { 4116*bba2c361STejun Heo struct task_struct *p; 4117*bba2c361STejun Heo 4118*bba2c361STejun Heo lockdep_assert_rq_held(rq); 4119*bba2c361STejun Heo 4120*bba2c361STejun Heo /* 4121*bba2c361STejun Heo * Now that @rq can be unlocked, execute the deferred enqueueing of 4122*bba2c361STejun Heo * tasks directly dispatched to the local DSQs of other CPUs. See 4123*bba2c361STejun Heo * direct_dispatch(). Keep popping from the head instead of using 4124*bba2c361STejun Heo * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq 4125*bba2c361STejun Heo * temporarily. 
4126*bba2c361STejun Heo */ 4127*bba2c361STejun Heo while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, 4128*bba2c361STejun Heo struct task_struct, scx.dsq_list.node))) { 4129*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 4130*bba2c361STejun Heo struct scx_dispatch_q *dsq; 4131*bba2c361STejun Heo u64 dsq_id = p->scx.ddsp_dsq_id; 4132*bba2c361STejun Heo u64 enq_flags = p->scx.ddsp_enq_flags; 4133*bba2c361STejun Heo 4134*bba2c361STejun Heo list_del_init(&p->scx.dsq_list.node); 4135*bba2c361STejun Heo clear_direct_dispatch(p); 4136*bba2c361STejun Heo 4137*bba2c361STejun Heo dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p)); 4138*bba2c361STejun Heo if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) 4139*bba2c361STejun Heo dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); 4140*bba2c361STejun Heo } 4141*bba2c361STejun Heo } 4142*bba2c361STejun Heo 4143*bba2c361STejun Heo /* 4144*bba2c361STejun Heo * Determine whether @p should be reenqueued from a local DSQ. 4145*bba2c361STejun Heo * 4146*bba2c361STejun Heo * @reenq_flags is mutable and accumulates state across the DSQ walk: 4147*bba2c361STejun Heo * 4148*bba2c361STejun Heo * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First" 4149*bba2c361STejun Heo * tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at 4150*bba2c361STejun Heo * the head consumes the first slot. 4151*bba2c361STejun Heo * 4152*bba2c361STejun Heo * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if 4153*bba2c361STejun Heo * rq_is_open() is true. 4154*bba2c361STejun Heo * 4155*bba2c361STejun Heo * An IMMED task is kept (returns %false) only if it's the first task in the DSQ 4156*bba2c361STejun Heo * AND the current task is done — i.e. it will execute immediately. All other 4157*bba2c361STejun Heo * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head, 4158*bba2c361STejun Heo * every IMMED task behind it gets reenqueued. 
4159*bba2c361STejun Heo * 4160*bba2c361STejun Heo * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ | 4161*bba2c361STejun Heo * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local 4162*bba2c361STejun Heo * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers 4163*bba2c361STejun Heo * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT 4164*bba2c361STejun Heo * in process_deferred_reenq_locals(). 4165*bba2c361STejun Heo */ 4166*bba2c361STejun Heo static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason) 4167*bba2c361STejun Heo { 4168*bba2c361STejun Heo bool first; 4169*bba2c361STejun Heo 4170*bba2c361STejun Heo first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST); 4171*bba2c361STejun Heo *reenq_flags |= SCX_REENQ_TSR_NOT_FIRST; 4172*bba2c361STejun Heo 4173*bba2c361STejun Heo *reason = SCX_TASK_REENQ_KFUNC; 4174*bba2c361STejun Heo 4175*bba2c361STejun Heo if ((p->scx.flags & SCX_TASK_IMMED) && 4176*bba2c361STejun Heo (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) { 4177*bba2c361STejun Heo __scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1); 4178*bba2c361STejun Heo *reason = SCX_TASK_REENQ_IMMED; 4179*bba2c361STejun Heo return true; 4180*bba2c361STejun Heo } 4181*bba2c361STejun Heo 4182*bba2c361STejun Heo return *reenq_flags & SCX_REENQ_ANY; 4183*bba2c361STejun Heo } 4184*bba2c361STejun Heo 4185*bba2c361STejun Heo static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) 4186*bba2c361STejun Heo { 4187*bba2c361STejun Heo LIST_HEAD(tasks); 4188*bba2c361STejun Heo u32 nr_enqueued = 0; 4189*bba2c361STejun Heo struct task_struct *p, *n; 4190*bba2c361STejun Heo 4191*bba2c361STejun Heo lockdep_assert_rq_held(rq); 4192*bba2c361STejun Heo 4193*bba2c361STejun Heo if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK)) 4194*bba2c361STejun Heo reenq_flags &= ~__SCX_REENQ_TSR_MASK; 4195*bba2c361STejun Heo if (rq_is_open(rq, 0)) 
4196*bba2c361STejun Heo reenq_flags |= SCX_REENQ_TSR_RQ_OPEN; 4197*bba2c361STejun Heo 4198*bba2c361STejun Heo /* 4199*bba2c361STejun Heo * The BPF scheduler may choose to dispatch tasks back to 4200*bba2c361STejun Heo * @rq->scx.local_dsq. Move all candidate tasks off to a private list 4201*bba2c361STejun Heo * first to avoid processing the same tasks repeatedly. 4202*bba2c361STejun Heo */ 4203*bba2c361STejun Heo list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, 4204*bba2c361STejun Heo scx.dsq_list.node) { 4205*bba2c361STejun Heo struct scx_sched *task_sch = scx_task_sched(p); 4206*bba2c361STejun Heo u32 reason; 4207*bba2c361STejun Heo 4208*bba2c361STejun Heo /* 4209*bba2c361STejun Heo * If @p is being migrated, @p's current CPU may not agree with 4210*bba2c361STejun Heo * its allowed CPUs and the migration_cpu_stop is about to 4211*bba2c361STejun Heo * deactivate and re-activate @p anyway. Skip re-enqueueing. 4212*bba2c361STejun Heo * 4213*bba2c361STejun Heo * While racing sched property changes may also dequeue and 4214*bba2c361STejun Heo * re-enqueue a migrating task while its current CPU and allowed 4215*bba2c361STejun Heo * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to 4216*bba2c361STejun Heo * the current local DSQ for running tasks and thus are not 4217*bba2c361STejun Heo * visible to the BPF scheduler. 
4218*bba2c361STejun Heo */ 4219*bba2c361STejun Heo if (p->migration_pending) 4220*bba2c361STejun Heo continue; 4221*bba2c361STejun Heo 4222*bba2c361STejun Heo if (!scx_is_descendant(task_sch, sch)) 4223*bba2c361STejun Heo continue; 4224*bba2c361STejun Heo 4225*bba2c361STejun Heo if (!local_task_should_reenq(p, &reenq_flags, &reason)) 4226*bba2c361STejun Heo continue; 4227*bba2c361STejun Heo 4228*bba2c361STejun Heo dispatch_dequeue(rq, p); 4229*bba2c361STejun Heo 4230*bba2c361STejun Heo if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) 4231*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4232*bba2c361STejun Heo p->scx.flags |= reason; 4233*bba2c361STejun Heo 4234*bba2c361STejun Heo list_add_tail(&p->scx.dsq_list.node, &tasks); 4235*bba2c361STejun Heo } 4236*bba2c361STejun Heo 4237*bba2c361STejun Heo list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { 4238*bba2c361STejun Heo list_del_init(&p->scx.dsq_list.node); 4239*bba2c361STejun Heo 4240*bba2c361STejun Heo do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); 4241*bba2c361STejun Heo 4242*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4243*bba2c361STejun Heo nr_enqueued++; 4244*bba2c361STejun Heo } 4245*bba2c361STejun Heo 4246*bba2c361STejun Heo return nr_enqueued; 4247*bba2c361STejun Heo } 4248*bba2c361STejun Heo 4249*bba2c361STejun Heo static void process_deferred_reenq_locals(struct rq *rq) 4250*bba2c361STejun Heo { 4251*bba2c361STejun Heo u64 seq = ++rq->scx.deferred_reenq_locals_seq; 4252*bba2c361STejun Heo 4253*bba2c361STejun Heo lockdep_assert_rq_held(rq); 4254*bba2c361STejun Heo 4255*bba2c361STejun Heo while (true) { 4256*bba2c361STejun Heo struct scx_sched *sch; 4257*bba2c361STejun Heo u64 reenq_flags; 4258*bba2c361STejun Heo bool skip = false; 4259*bba2c361STejun Heo 4260*bba2c361STejun Heo scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { 4261*bba2c361STejun Heo struct scx_deferred_reenq_local *drl = 4262*bba2c361STejun Heo 
list_first_entry_or_null(&rq->scx.deferred_reenq_locals, 4263*bba2c361STejun Heo struct scx_deferred_reenq_local, 4264*bba2c361STejun Heo node); 4265*bba2c361STejun Heo struct scx_sched_pcpu *sch_pcpu; 4266*bba2c361STejun Heo 4267*bba2c361STejun Heo if (!drl) 4268*bba2c361STejun Heo return; 4269*bba2c361STejun Heo 4270*bba2c361STejun Heo sch_pcpu = container_of(drl, struct scx_sched_pcpu, 4271*bba2c361STejun Heo deferred_reenq_local); 4272*bba2c361STejun Heo sch = sch_pcpu->sch; 4273*bba2c361STejun Heo 4274*bba2c361STejun Heo reenq_flags = drl->flags; 4275*bba2c361STejun Heo WRITE_ONCE(drl->flags, 0); 4276*bba2c361STejun Heo list_del_init(&drl->node); 4277*bba2c361STejun Heo 4278*bba2c361STejun Heo if (likely(drl->seq != seq)) { 4279*bba2c361STejun Heo drl->seq = seq; 4280*bba2c361STejun Heo drl->cnt = 0; 4281*bba2c361STejun Heo } else { 4282*bba2c361STejun Heo if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) { 4283*bba2c361STejun Heo scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times", 4284*bba2c361STejun Heo drl->cnt); 4285*bba2c361STejun Heo skip = true; 4286*bba2c361STejun Heo } 4287*bba2c361STejun Heo 4288*bba2c361STejun Heo __scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1); 4289*bba2c361STejun Heo } 4290*bba2c361STejun Heo } 4291*bba2c361STejun Heo 4292*bba2c361STejun Heo if (!skip) { 4293*bba2c361STejun Heo /* see schedule_dsq_reenq() */ 4294*bba2c361STejun Heo smp_mb(); 4295*bba2c361STejun Heo 4296*bba2c361STejun Heo reenq_local(sch, rq, reenq_flags); 4297*bba2c361STejun Heo } 4298*bba2c361STejun Heo } 4299*bba2c361STejun Heo } 4300*bba2c361STejun Heo 4301*bba2c361STejun Heo static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason) 4302*bba2c361STejun Heo { 4303*bba2c361STejun Heo *reason = SCX_TASK_REENQ_KFUNC; 4304*bba2c361STejun Heo return reenq_flags & SCX_REENQ_ANY; 4305*bba2c361STejun Heo } 4306*bba2c361STejun Heo 4307*bba2c361STejun Heo static void reenq_user(struct rq *rq, struct scx_dispatch_q 
*dsq, u64 reenq_flags) 4308*bba2c361STejun Heo { 4309*bba2c361STejun Heo struct rq *locked_rq = rq; 4310*bba2c361STejun Heo struct scx_sched *sch = dsq->sched; 4311*bba2c361STejun Heo struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0); 4312*bba2c361STejun Heo struct task_struct *p; 4313*bba2c361STejun Heo s32 nr_enqueued = 0; 4314*bba2c361STejun Heo 4315*bba2c361STejun Heo lockdep_assert_rq_held(rq); 4316*bba2c361STejun Heo 4317*bba2c361STejun Heo raw_spin_lock(&dsq->lock); 4318*bba2c361STejun Heo 4319*bba2c361STejun Heo while (likely(!READ_ONCE(sch->bypass_depth))) { 4320*bba2c361STejun Heo struct rq *task_rq; 4321*bba2c361STejun Heo u32 reason; 4322*bba2c361STejun Heo 4323*bba2c361STejun Heo p = nldsq_cursor_next_task(&cursor, dsq); 4324*bba2c361STejun Heo if (!p) 4325*bba2c361STejun Heo break; 4326*bba2c361STejun Heo 4327*bba2c361STejun Heo if (!user_task_should_reenq(p, reenq_flags, &reason)) 4328*bba2c361STejun Heo continue; 4329*bba2c361STejun Heo 4330*bba2c361STejun Heo task_rq = task_rq(p); 4331*bba2c361STejun Heo 4332*bba2c361STejun Heo if (locked_rq != task_rq) { 4333*bba2c361STejun Heo if (locked_rq) 4334*bba2c361STejun Heo raw_spin_rq_unlock(locked_rq); 4335*bba2c361STejun Heo if (unlikely(!raw_spin_rq_trylock(task_rq))) { 4336*bba2c361STejun Heo raw_spin_unlock(&dsq->lock); 4337*bba2c361STejun Heo raw_spin_rq_lock(task_rq); 4338*bba2c361STejun Heo raw_spin_lock(&dsq->lock); 4339*bba2c361STejun Heo } 4340*bba2c361STejun Heo locked_rq = task_rq; 4341*bba2c361STejun Heo 4342*bba2c361STejun Heo /* did we lose @p while switching locks? 
*/ 4343*bba2c361STejun Heo if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p)) 4344*bba2c361STejun Heo continue; 4345*bba2c361STejun Heo } 4346*bba2c361STejun Heo 4347*bba2c361STejun Heo /* @p is on @dsq, its rq and @dsq are locked */ 4348*bba2c361STejun Heo dispatch_dequeue_locked(p, dsq); 4349*bba2c361STejun Heo raw_spin_unlock(&dsq->lock); 4350*bba2c361STejun Heo 4351*bba2c361STejun Heo if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) 4352*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4353*bba2c361STejun Heo p->scx.flags |= reason; 4354*bba2c361STejun Heo 4355*bba2c361STejun Heo do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1); 4356*bba2c361STejun Heo 4357*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; 4358*bba2c361STejun Heo 4359*bba2c361STejun Heo if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) { 4360*bba2c361STejun Heo raw_spin_rq_unlock(locked_rq); 4361*bba2c361STejun Heo locked_rq = NULL; 4362*bba2c361STejun Heo cpu_relax(); 4363*bba2c361STejun Heo } 4364*bba2c361STejun Heo 4365*bba2c361STejun Heo raw_spin_lock(&dsq->lock); 4366*bba2c361STejun Heo } 4367*bba2c361STejun Heo 4368*bba2c361STejun Heo list_del_init(&cursor.node); 4369*bba2c361STejun Heo raw_spin_unlock(&dsq->lock); 4370*bba2c361STejun Heo 4371*bba2c361STejun Heo if (locked_rq != rq) { 4372*bba2c361STejun Heo if (locked_rq) 4373*bba2c361STejun Heo raw_spin_rq_unlock(locked_rq); 4374*bba2c361STejun Heo raw_spin_rq_lock(rq); 4375*bba2c361STejun Heo } 4376*bba2c361STejun Heo } 4377*bba2c361STejun Heo 4378*bba2c361STejun Heo static void process_deferred_reenq_users(struct rq *rq) 4379*bba2c361STejun Heo { 4380*bba2c361STejun Heo lockdep_assert_rq_held(rq); 4381*bba2c361STejun Heo 4382*bba2c361STejun Heo while (true) { 4383*bba2c361STejun Heo struct scx_dispatch_q *dsq; 4384*bba2c361STejun Heo u64 reenq_flags; 4385*bba2c361STejun Heo 4386*bba2c361STejun Heo scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { 4387*bba2c361STejun Heo struct 
scx_deferred_reenq_user *dru = 4388*bba2c361STejun Heo list_first_entry_or_null(&rq->scx.deferred_reenq_users, 4389*bba2c361STejun Heo struct scx_deferred_reenq_user, 4390*bba2c361STejun Heo node); 4391*bba2c361STejun Heo struct scx_dsq_pcpu *dsq_pcpu; 4392*bba2c361STejun Heo 4393*bba2c361STejun Heo if (!dru) 4394*bba2c361STejun Heo return; 4395*bba2c361STejun Heo 4396*bba2c361STejun Heo dsq_pcpu = container_of(dru, struct scx_dsq_pcpu, 4397*bba2c361STejun Heo deferred_reenq_user); 4398*bba2c361STejun Heo dsq = dsq_pcpu->dsq; 4399*bba2c361STejun Heo reenq_flags = dru->flags; 4400*bba2c361STejun Heo WRITE_ONCE(dru->flags, 0); 4401*bba2c361STejun Heo list_del_init(&dru->node); 4402*bba2c361STejun Heo } 4403*bba2c361STejun Heo 4404*bba2c361STejun Heo /* see schedule_dsq_reenq() */ 4405*bba2c361STejun Heo smp_mb(); 4406*bba2c361STejun Heo 4407*bba2c361STejun Heo BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN); 4408*bba2c361STejun Heo reenq_user(rq, dsq, reenq_flags); 4409*bba2c361STejun Heo } 4410*bba2c361STejun Heo } 4411*bba2c361STejun Heo 4412*bba2c361STejun Heo static void run_deferred(struct rq *rq) 4413*bba2c361STejun Heo { 4414*bba2c361STejun Heo process_ddsp_deferred_locals(rq); 4415*bba2c361STejun Heo 4416*bba2c361STejun Heo if (!list_empty(&rq->scx.deferred_reenq_locals)) 4417*bba2c361STejun Heo process_deferred_reenq_locals(rq); 4418*bba2c361STejun Heo 4419*bba2c361STejun Heo if (!list_empty(&rq->scx.deferred_reenq_users)) 4420*bba2c361STejun Heo process_deferred_reenq_users(rq); 4421*bba2c361STejun Heo } 4422*bba2c361STejun Heo 4423*bba2c361STejun Heo #ifdef CONFIG_NO_HZ_FULL 4424*bba2c361STejun Heo bool scx_can_stop_tick(struct rq *rq) 4425*bba2c361STejun Heo { 4426*bba2c361STejun Heo struct task_struct *p = rq->curr; 4427*bba2c361STejun Heo struct scx_sched *sch = scx_task_sched(p); 4428*bba2c361STejun Heo 4429*bba2c361STejun Heo if (p->sched_class != &ext_sched_class) 4430*bba2c361STejun Heo return true; 4431*bba2c361STejun Heo 4432*bba2c361STejun Heo if 
(scx_bypassing(sch, cpu_of(rq))) 4433*bba2c361STejun Heo return false; 4434*bba2c361STejun Heo 4435*bba2c361STejun Heo /* 4436*bba2c361STejun Heo * @rq can dispatch from different DSQs, so we can't tell whether it 4437*bba2c361STejun Heo * needs the tick or not by looking at nr_running. Allow stopping ticks 4438*bba2c361STejun Heo * iff the BPF scheduler indicated so. See set_next_task_scx(). 4439*bba2c361STejun Heo */ 4440*bba2c361STejun Heo return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; 4441*bba2c361STejun Heo } 4442*bba2c361STejun Heo #endif 4443*bba2c361STejun Heo 4444*bba2c361STejun Heo #ifdef CONFIG_EXT_GROUP_SCHED 4445*bba2c361STejun Heo 4446*bba2c361STejun Heo DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem); 4447*bba2c361STejun Heo static bool scx_cgroup_enabled; 4448*bba2c361STejun Heo 4449*bba2c361STejun Heo void scx_tg_init(struct task_group *tg) 4450*bba2c361STejun Heo { 4451*bba2c361STejun Heo tg->scx.weight = CGROUP_WEIGHT_DFL; 4452*bba2c361STejun Heo tg->scx.bw_period_us = default_bw_period_us(); 4453*bba2c361STejun Heo tg->scx.bw_quota_us = RUNTIME_INF; 4454*bba2c361STejun Heo tg->scx.idle = false; 4455*bba2c361STejun Heo } 4456*bba2c361STejun Heo 4457*bba2c361STejun Heo int scx_tg_online(struct task_group *tg) 4458*bba2c361STejun Heo { 4459*bba2c361STejun Heo struct scx_sched *sch = scx_root; 4460*bba2c361STejun Heo int ret = 0; 4461*bba2c361STejun Heo 4462*bba2c361STejun Heo WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); 4463*bba2c361STejun Heo 4464*bba2c361STejun Heo if (scx_cgroup_enabled) { 4465*bba2c361STejun Heo if (SCX_HAS_OP(sch, cgroup_init)) { 4466*bba2c361STejun Heo struct scx_cgroup_init_args args = 4467*bba2c361STejun Heo { .weight = tg->scx.weight, 4468*bba2c361STejun Heo .bw_period_us = tg->scx.bw_period_us, 4469*bba2c361STejun Heo .bw_quota_us = tg->scx.bw_quota_us, 4470*bba2c361STejun Heo .bw_burst_us = tg->scx.bw_burst_us }; 4471*bba2c361STejun Heo 4472*bba2c361STejun Heo ret = SCX_CALL_OP_RET(sch, cgroup_init, 
4473*bba2c361STejun Heo NULL, tg->css.cgroup, &args); 4474*bba2c361STejun Heo if (ret) 4475*bba2c361STejun Heo ret = ops_sanitize_err(sch, "cgroup_init", ret); 4476*bba2c361STejun Heo } 4477*bba2c361STejun Heo if (ret == 0) 4478*bba2c361STejun Heo tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED; 4479*bba2c361STejun Heo } else { 4480*bba2c361STejun Heo tg->scx.flags |= SCX_TG_ONLINE; 4481*bba2c361STejun Heo } 4482*bba2c361STejun Heo 4483*bba2c361STejun Heo return ret; 4484*bba2c361STejun Heo } 4485*bba2c361STejun Heo 4486*bba2c361STejun Heo void scx_tg_offline(struct task_group *tg) 4487*bba2c361STejun Heo { 4488*bba2c361STejun Heo struct scx_sched *sch = scx_root; 4489*bba2c361STejun Heo 4490*bba2c361STejun Heo WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); 4491*bba2c361STejun Heo 4492*bba2c361STejun Heo if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && 4493*bba2c361STejun Heo (tg->scx.flags & SCX_TG_INITED)) 4494*bba2c361STejun Heo SCX_CALL_OP(sch, cgroup_exit, NULL, tg->css.cgroup); 4495*bba2c361STejun Heo tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); 4496*bba2c361STejun Heo } 4497*bba2c361STejun Heo 4498*bba2c361STejun Heo int scx_cgroup_can_attach(struct cgroup_taskset *tset) 4499*bba2c361STejun Heo { 4500*bba2c361STejun Heo struct scx_sched *sch = scx_root; 4501*bba2c361STejun Heo struct cgroup_subsys_state *css; 4502*bba2c361STejun Heo struct task_struct *p; 4503*bba2c361STejun Heo int ret; 4504*bba2c361STejun Heo 4505*bba2c361STejun Heo if (!scx_cgroup_enabled) 4506*bba2c361STejun Heo return 0; 4507*bba2c361STejun Heo 4508*bba2c361STejun Heo cgroup_taskset_for_each(p, css, tset) { 4509*bba2c361STejun Heo struct cgroup *from = tg_cgrp(task_group(p)); 4510*bba2c361STejun Heo struct cgroup *to = tg_cgrp(css_tg(css)); 4511*bba2c361STejun Heo 4512*bba2c361STejun Heo WARN_ON_ONCE(p->scx.cgrp_moving_from); 4513*bba2c361STejun Heo 4514*bba2c361STejun Heo /* 4515*bba2c361STejun Heo * sched_move_task() omits identity migrations. 
Let's match the 4516*bba2c361STejun Heo * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() 4517*bba2c361STejun Heo * always match one-to-one. 4518*bba2c361STejun Heo */ 4519*bba2c361STejun Heo if (from == to) 4520*bba2c361STejun Heo continue; 4521*bba2c361STejun Heo 4522*bba2c361STejun Heo if (SCX_HAS_OP(sch, cgroup_prep_move)) { 4523*bba2c361STejun Heo ret = SCX_CALL_OP_RET(sch, cgroup_prep_move, NULL, 4524*bba2c361STejun Heo p, from, css->cgroup); 4525*bba2c361STejun Heo if (ret) 4526*bba2c361STejun Heo goto err; 4527*bba2c361STejun Heo } 4528*bba2c361STejun Heo 4529*bba2c361STejun Heo p->scx.cgrp_moving_from = from; 4530*bba2c361STejun Heo } 4531*bba2c361STejun Heo 4532*bba2c361STejun Heo return 0; 4533*bba2c361STejun Heo 4534*bba2c361STejun Heo err: 4535*bba2c361STejun Heo cgroup_taskset_for_each(p, css, tset) { 4536*bba2c361STejun Heo if (SCX_HAS_OP(sch, cgroup_cancel_move) && 4537*bba2c361STejun Heo p->scx.cgrp_moving_from) 4538*bba2c361STejun Heo SCX_CALL_OP(sch, cgroup_cancel_move, NULL, 4539*bba2c361STejun Heo p, p->scx.cgrp_moving_from, css->cgroup); 4540*bba2c361STejun Heo p->scx.cgrp_moving_from = NULL; 4541*bba2c361STejun Heo } 4542*bba2c361STejun Heo 4543*bba2c361STejun Heo return ops_sanitize_err(sch, "cgroup_prep_move", ret); 4544*bba2c361STejun Heo } 4545*bba2c361STejun Heo 4546*bba2c361STejun Heo void scx_cgroup_move_task(struct task_struct *p) 4547*bba2c361STejun Heo { 4548*bba2c361STejun Heo struct scx_sched *sch = scx_root; 4549*bba2c361STejun Heo 4550*bba2c361STejun Heo if (!scx_cgroup_enabled) 4551*bba2c361STejun Heo return; 4552*bba2c361STejun Heo 4553*bba2c361STejun Heo /* 4554*bba2c361STejun Heo * scx_cgroup_can_attach() sets cgrp_moving_from only when the task's 4555*bba2c361STejun Heo * cgroup changes. Migration keys off css rather than cgroup identity, 4556*bba2c361STejun Heo * so it can hand an unchanged-cgroup task here with cgrp_moving_from 4557*bba2c361STejun Heo * NULL. 
Nothing to report to the BPF scheduler then, so skip it and 4558*bba2c361STejun Heo * keep prep_move and move paired. 4559*bba2c361STejun Heo */ 4560*bba2c361STejun Heo if (SCX_HAS_OP(sch, cgroup_move) && p->scx.cgrp_moving_from) 4561*bba2c361STejun Heo SCX_CALL_OP_TASK(sch, cgroup_move, task_rq(p), 4562*bba2c361STejun Heo p, p->scx.cgrp_moving_from, 4563*bba2c361STejun Heo tg_cgrp(task_group(p))); 4564*bba2c361STejun Heo p->scx.cgrp_moving_from = NULL; 4565*bba2c361STejun Heo } 4566*bba2c361STejun Heo 4567*bba2c361STejun Heo void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) 4568*bba2c361STejun Heo { 4569*bba2c361STejun Heo struct scx_sched *sch = scx_root; 4570*bba2c361STejun Heo struct cgroup_subsys_state *css; 4571*bba2c361STejun Heo struct task_struct *p; 4572*bba2c361STejun Heo 4573*bba2c361STejun Heo if (!scx_cgroup_enabled) 4574*bba2c361STejun Heo return; 4575*bba2c361STejun Heo 4576*bba2c361STejun Heo cgroup_taskset_for_each(p, css, tset) { 4577*bba2c361STejun Heo if (SCX_HAS_OP(sch, cgroup_cancel_move) && 4578*bba2c361STejun Heo p->scx.cgrp_moving_from) 4579*bba2c361STejun Heo SCX_CALL_OP(sch, cgroup_cancel_move, NULL, 4580*bba2c361STejun Heo p, p->scx.cgrp_moving_from, css->cgroup); 4581*bba2c361STejun Heo p->scx.cgrp_moving_from = NULL; 4582*bba2c361STejun Heo } 4583*bba2c361STejun Heo } 4584*bba2c361STejun Heo 4585*bba2c361STejun Heo void scx_group_set_weight(struct task_group *tg, unsigned long weight) 4586*bba2c361STejun Heo { 4587*bba2c361STejun Heo struct scx_sched *sch; 4588*bba2c361STejun Heo 4589*bba2c361STejun Heo percpu_down_read(&scx_cgroup_ops_rwsem); 4590*bba2c361STejun Heo sch = scx_root; 4591*bba2c361STejun Heo 4592*bba2c361STejun Heo if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && 4593*bba2c361STejun Heo tg->scx.weight != weight) 4594*bba2c361STejun Heo SCX_CALL_OP(sch, cgroup_set_weight, NULL, tg_cgrp(tg), weight); 4595*bba2c361STejun Heo 4596*bba2c361STejun Heo tg->scx.weight = weight; 4597*bba2c361STejun Heo 
4598*bba2c361STejun Heo percpu_up_read(&scx_cgroup_ops_rwsem); 4599*bba2c361STejun Heo } 4600*bba2c361STejun Heo 4601*bba2c361STejun Heo void scx_group_set_idle(struct task_group *tg, bool idle) 4602*bba2c361STejun Heo { 4603*bba2c361STejun Heo struct scx_sched *sch; 4604*bba2c361STejun Heo 4605*bba2c361STejun Heo percpu_down_read(&scx_cgroup_ops_rwsem); 4606*bba2c361STejun Heo sch = scx_root; 4607*bba2c361STejun Heo 4608*bba2c361STejun Heo if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) 4609*bba2c361STejun Heo SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle); 4610*bba2c361STejun Heo 4611*bba2c361STejun Heo /* Update the task group's idle state */ 4612*bba2c361STejun Heo tg->scx.idle = idle; 4613*bba2c361STejun Heo 4614*bba2c361STejun Heo percpu_up_read(&scx_cgroup_ops_rwsem); 4615*bba2c361STejun Heo } 4616*bba2c361STejun Heo 4617*bba2c361STejun Heo void scx_group_set_bandwidth(struct task_group *tg, 4618*bba2c361STejun Heo u64 period_us, u64 quota_us, u64 burst_us) 4619*bba2c361STejun Heo { 4620*bba2c361STejun Heo struct scx_sched *sch; 4621*bba2c361STejun Heo 4622*bba2c361STejun Heo percpu_down_read(&scx_cgroup_ops_rwsem); 4623*bba2c361STejun Heo sch = scx_root; 4624*bba2c361STejun Heo 4625*bba2c361STejun Heo if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && 4626*bba2c361STejun Heo (tg->scx.bw_period_us != period_us || 4627*bba2c361STejun Heo tg->scx.bw_quota_us != quota_us || 4628*bba2c361STejun Heo tg->scx.bw_burst_us != burst_us)) 4629*bba2c361STejun Heo SCX_CALL_OP(sch, cgroup_set_bandwidth, NULL, 4630*bba2c361STejun Heo tg_cgrp(tg), period_us, quota_us, burst_us); 4631*bba2c361STejun Heo 4632*bba2c361STejun Heo tg->scx.bw_period_us = period_us; 4633*bba2c361STejun Heo tg->scx.bw_quota_us = quota_us; 4634*bba2c361STejun Heo tg->scx.bw_burst_us = burst_us; 4635*bba2c361STejun Heo 4636*bba2c361STejun Heo percpu_up_read(&scx_cgroup_ops_rwsem); 4637*bba2c361STejun Heo } 4638*bba2c361STejun Heo #endif /* 
CONFIG_EXT_GROUP_SCHED */ 4639*bba2c361STejun Heo 4640*bba2c361STejun Heo #if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) 4641*bba2c361STejun Heo static struct cgroup *root_cgroup(void) 4642*bba2c361STejun Heo { 4643*bba2c361STejun Heo return &cgrp_dfl_root.cgrp; 4644*bba2c361STejun Heo } 4645*bba2c361STejun Heo 4646*bba2c361STejun Heo static void scx_cgroup_lock(void) 4647*bba2c361STejun Heo { 4648*bba2c361STejun Heo #ifdef CONFIG_EXT_GROUP_SCHED 4649*bba2c361STejun Heo percpu_down_write(&scx_cgroup_ops_rwsem); 4650*bba2c361STejun Heo #endif 4651*bba2c361STejun Heo cgroup_lock(); 4652*bba2c361STejun Heo } 4653*bba2c361STejun Heo 4654*bba2c361STejun Heo static void scx_cgroup_unlock(void) 4655*bba2c361STejun Heo { 4656*bba2c361STejun Heo cgroup_unlock(); 4657*bba2c361STejun Heo #ifdef CONFIG_EXT_GROUP_SCHED 4658*bba2c361STejun Heo percpu_up_write(&scx_cgroup_ops_rwsem); 4659*bba2c361STejun Heo #endif 4660*bba2c361STejun Heo } 4661*bba2c361STejun Heo #else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ 4662*bba2c361STejun Heo static inline struct cgroup *root_cgroup(void) { return NULL; } 4663*bba2c361STejun Heo static inline void scx_cgroup_lock(void) {} 4664*bba2c361STejun Heo static inline void scx_cgroup_unlock(void) {} 4665*bba2c361STejun Heo #endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ 4666*bba2c361STejun Heo 4667*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 4668*bba2c361STejun Heo static struct cgroup *sch_cgroup(struct scx_sched *sch) 4669*bba2c361STejun Heo { 4670*bba2c361STejun Heo return sch->cgrp; 4671*bba2c361STejun Heo } 4672*bba2c361STejun Heo 4673*bba2c361STejun Heo /* for each descendant of @cgrp including self, set ->scx_sched to @sch */ 4674*bba2c361STejun Heo static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) 4675*bba2c361STejun Heo { 4676*bba2c361STejun Heo struct cgroup *pos; 4677*bba2c361STejun Heo struct cgroup_subsys_state *css; 4678*bba2c361STejun Heo 4679*bba2c361STejun Heo 
cgroup_for_each_live_descendant_pre(pos, css, cgrp) 4680*bba2c361STejun Heo rcu_assign_pointer(pos->scx_sched, sch); 4681*bba2c361STejun Heo } 4682*bba2c361STejun Heo #else /* CONFIG_EXT_SUB_SCHED */ 4683*bba2c361STejun Heo static inline struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } 4684*bba2c361STejun Heo static inline void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} 4685*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 4686*bba2c361STejun Heo 4687*bba2c361STejun Heo /* 4688*bba2c361STejun Heo * Omitted operations: 4689*bba2c361STejun Heo * 4690*bba2c361STejun Heo * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. 4691*bba2c361STejun Heo * 4692*bba2c361STejun Heo * - task_fork/dead: We need fork/dead notifications for all tasks regardless of 4693*bba2c361STejun Heo * their current sched_class. Call them directly from sched core instead. 4694*bba2c361STejun Heo */ 4695*bba2c361STejun Heo DEFINE_SCHED_CLASS(ext) = { 4696*bba2c361STejun Heo .enqueue_task = enqueue_task_scx, 4697*bba2c361STejun Heo .dequeue_task = dequeue_task_scx, 4698*bba2c361STejun Heo .yield_task = yield_task_scx, 4699*bba2c361STejun Heo .yield_to_task = yield_to_task_scx, 4700*bba2c361STejun Heo 4701*bba2c361STejun Heo .wakeup_preempt = wakeup_preempt_scx, 4702*bba2c361STejun Heo 4703*bba2c361STejun Heo .pick_task = pick_task_scx, 4704*bba2c361STejun Heo 4705*bba2c361STejun Heo .put_prev_task = put_prev_task_scx, 4706*bba2c361STejun Heo .set_next_task = set_next_task_scx, 4707*bba2c361STejun Heo 4708*bba2c361STejun Heo .select_task_rq = select_task_rq_scx, 4709*bba2c361STejun Heo .task_woken = task_woken_scx, 4710*bba2c361STejun Heo .set_cpus_allowed = set_cpus_allowed_scx, 4711*bba2c361STejun Heo 4712*bba2c361STejun Heo .rq_online = rq_online_scx, 4713*bba2c361STejun Heo .rq_offline = rq_offline_scx, 4714*bba2c361STejun Heo 4715*bba2c361STejun Heo .task_tick = task_tick_scx, 4716*bba2c361STejun Heo 4717*bba2c361STejun Heo 
.switching_to = switching_to_scx, 4718*bba2c361STejun Heo .switched_from = switched_from_scx, 4719*bba2c361STejun Heo .switched_to = switched_to_scx, 4720*bba2c361STejun Heo .reweight_task = reweight_task_scx, 4721*bba2c361STejun Heo .prio_changed = prio_changed_scx, 4722*bba2c361STejun Heo 4723*bba2c361STejun Heo .update_curr = update_curr_scx, 4724*bba2c361STejun Heo 4725*bba2c361STejun Heo #ifdef CONFIG_UCLAMP_TASK 4726*bba2c361STejun Heo .uclamp_enabled = 1, 4727*bba2c361STejun Heo #endif 4728*bba2c361STejun Heo }; 4729*bba2c361STejun Heo 4730*bba2c361STejun Heo static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, 4731*bba2c361STejun Heo struct scx_sched *sch) 4732*bba2c361STejun Heo { 4733*bba2c361STejun Heo s32 cpu; 4734*bba2c361STejun Heo 4735*bba2c361STejun Heo memset(dsq, 0, sizeof(*dsq)); 4736*bba2c361STejun Heo 4737*bba2c361STejun Heo raw_spin_lock_init(&dsq->lock); 4738*bba2c361STejun Heo INIT_LIST_HEAD(&dsq->list); 4739*bba2c361STejun Heo dsq->id = dsq_id; 4740*bba2c361STejun Heo dsq->sched = sch; 4741*bba2c361STejun Heo 4742*bba2c361STejun Heo dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu); 4743*bba2c361STejun Heo if (!dsq->pcpu) 4744*bba2c361STejun Heo return -ENOMEM; 4745*bba2c361STejun Heo 4746*bba2c361STejun Heo for_each_possible_cpu(cpu) { 4747*bba2c361STejun Heo struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); 4748*bba2c361STejun Heo 4749*bba2c361STejun Heo pcpu->dsq = dsq; 4750*bba2c361STejun Heo INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node); 4751*bba2c361STejun Heo } 4752*bba2c361STejun Heo 4753*bba2c361STejun Heo return 0; 4754*bba2c361STejun Heo } 4755*bba2c361STejun Heo 4756*bba2c361STejun Heo static void exit_dsq(struct scx_dispatch_q *dsq) 4757*bba2c361STejun Heo { 4758*bba2c361STejun Heo s32 cpu; 4759*bba2c361STejun Heo 4760*bba2c361STejun Heo for_each_possible_cpu(cpu) { 4761*bba2c361STejun Heo struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); 4762*bba2c361STejun Heo struct scx_deferred_reenq_user *dru = 
&pcpu->deferred_reenq_user; 4763*bba2c361STejun Heo struct rq *rq = cpu_rq(cpu); 4764*bba2c361STejun Heo 4765*bba2c361STejun Heo /* 4766*bba2c361STejun Heo * There must have been a RCU grace period since the last 4767*bba2c361STejun Heo * insertion and @dsq should be off the deferred list by now. 4768*bba2c361STejun Heo */ 4769*bba2c361STejun Heo if (WARN_ON_ONCE(!list_empty(&dru->node))) { 4770*bba2c361STejun Heo guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); 4771*bba2c361STejun Heo list_del_init(&dru->node); 4772*bba2c361STejun Heo } 4773*bba2c361STejun Heo } 4774*bba2c361STejun Heo 4775*bba2c361STejun Heo free_percpu(dsq->pcpu); 4776*bba2c361STejun Heo } 4777*bba2c361STejun Heo 4778*bba2c361STejun Heo static void free_dsq_rcufn(struct rcu_head *rcu) 4779*bba2c361STejun Heo { 4780*bba2c361STejun Heo struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu); 4781*bba2c361STejun Heo 4782*bba2c361STejun Heo exit_dsq(dsq); 4783*bba2c361STejun Heo kfree(dsq); 4784*bba2c361STejun Heo } 4785*bba2c361STejun Heo 4786*bba2c361STejun Heo static void free_dsq_irq_workfn(struct irq_work *irq_work) 4787*bba2c361STejun Heo { 4788*bba2c361STejun Heo struct llist_node *to_free = llist_del_all(&dsqs_to_free); 4789*bba2c361STejun Heo struct scx_dispatch_q *dsq, *tmp_dsq; 4790*bba2c361STejun Heo 4791*bba2c361STejun Heo llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) 4792*bba2c361STejun Heo call_rcu(&dsq->rcu, free_dsq_rcufn); 4793*bba2c361STejun Heo } 4794*bba2c361STejun Heo 4795*bba2c361STejun Heo static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); 4796*bba2c361STejun Heo 4797*bba2c361STejun Heo static void destroy_dsq(struct scx_sched *sch, u64 dsq_id) 4798*bba2c361STejun Heo { 4799*bba2c361STejun Heo struct scx_dispatch_q *dsq; 4800*bba2c361STejun Heo unsigned long flags; 4801*bba2c361STejun Heo 4802*bba2c361STejun Heo rcu_read_lock(); 4803*bba2c361STejun Heo 4804*bba2c361STejun Heo dsq = find_user_dsq(sch, dsq_id); 
4805*bba2c361STejun Heo if (!dsq) 4806*bba2c361STejun Heo goto out_unlock_rcu; 4807*bba2c361STejun Heo 4808*bba2c361STejun Heo raw_spin_lock_irqsave(&dsq->lock, flags); 4809*bba2c361STejun Heo 4810*bba2c361STejun Heo if (dsq->nr) { 4811*bba2c361STejun Heo scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)", 4812*bba2c361STejun Heo dsq->id, dsq->nr); 4813*bba2c361STejun Heo goto out_unlock_dsq; 4814*bba2c361STejun Heo } 4815*bba2c361STejun Heo 4816*bba2c361STejun Heo if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node, 4817*bba2c361STejun Heo dsq_hash_params)) 4818*bba2c361STejun Heo goto out_unlock_dsq; 4819*bba2c361STejun Heo 4820*bba2c361STejun Heo /* 4821*bba2c361STejun Heo * Mark dead by invalidating ->id to prevent dispatch_enqueue() from 4822*bba2c361STejun Heo * queueing more tasks. As this function can be called from anywhere, 4823*bba2c361STejun Heo * freeing is bounced through an irq work to avoid nesting RCU 4824*bba2c361STejun Heo * operations inside scheduler locks. 4825*bba2c361STejun Heo */ 4826*bba2c361STejun Heo dsq->id = SCX_DSQ_INVALID; 4827*bba2c361STejun Heo if (llist_add(&dsq->free_node, &dsqs_to_free)) 4828*bba2c361STejun Heo irq_work_queue(&free_dsq_irq_work); 4829*bba2c361STejun Heo 4830*bba2c361STejun Heo out_unlock_dsq: 4831*bba2c361STejun Heo raw_spin_unlock_irqrestore(&dsq->lock, flags); 4832*bba2c361STejun Heo out_unlock_rcu: 4833*bba2c361STejun Heo rcu_read_unlock(); 4834*bba2c361STejun Heo } 4835*bba2c361STejun Heo 4836*bba2c361STejun Heo #ifdef CONFIG_EXT_GROUP_SCHED 4837*bba2c361STejun Heo static void scx_cgroup_exit(struct scx_sched *sch) 4838*bba2c361STejun Heo { 4839*bba2c361STejun Heo struct cgroup_subsys_state *css; 4840*bba2c361STejun Heo 4841*bba2c361STejun Heo scx_cgroup_enabled = false; 4842*bba2c361STejun Heo 4843*bba2c361STejun Heo /* 4844*bba2c361STejun Heo * scx_tg_on/offline() are excluded through cgroup_lock(). 
If we walk 4845*bba2c361STejun Heo * cgroups and exit all the inited ones, all online cgroups are exited. 4846*bba2c361STejun Heo */ 4847*bba2c361STejun Heo css_for_each_descendant_post(css, &root_task_group.css) { 4848*bba2c361STejun Heo struct task_group *tg = css_tg(css); 4849*bba2c361STejun Heo 4850*bba2c361STejun Heo if (!(tg->scx.flags & SCX_TG_INITED)) 4851*bba2c361STejun Heo continue; 4852*bba2c361STejun Heo tg->scx.flags &= ~SCX_TG_INITED; 4853*bba2c361STejun Heo 4854*bba2c361STejun Heo if (!sch->ops.cgroup_exit) 4855*bba2c361STejun Heo continue; 4856*bba2c361STejun Heo 4857*bba2c361STejun Heo SCX_CALL_OP(sch, cgroup_exit, NULL, css->cgroup); 4858*bba2c361STejun Heo } 4859*bba2c361STejun Heo } 4860*bba2c361STejun Heo 4861*bba2c361STejun Heo static int scx_cgroup_init(struct scx_sched *sch) 4862*bba2c361STejun Heo { 4863*bba2c361STejun Heo struct cgroup_subsys_state *css; 4864*bba2c361STejun Heo int ret; 4865*bba2c361STejun Heo 4866*bba2c361STejun Heo /* 4867*bba2c361STejun Heo * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk 4868*bba2c361STejun Heo * cgroups and init, all online cgroups are initialized. 
4869*bba2c361STejun Heo */ 4870*bba2c361STejun Heo css_for_each_descendant_pre(css, &root_task_group.css) { 4871*bba2c361STejun Heo struct task_group *tg = css_tg(css); 4872*bba2c361STejun Heo struct scx_cgroup_init_args args = { 4873*bba2c361STejun Heo .weight = tg->scx.weight, 4874*bba2c361STejun Heo .bw_period_us = tg->scx.bw_period_us, 4875*bba2c361STejun Heo .bw_quota_us = tg->scx.bw_quota_us, 4876*bba2c361STejun Heo .bw_burst_us = tg->scx.bw_burst_us, 4877*bba2c361STejun Heo }; 4878*bba2c361STejun Heo 4879*bba2c361STejun Heo if ((tg->scx.flags & 4880*bba2c361STejun Heo (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) 4881*bba2c361STejun Heo continue; 4882*bba2c361STejun Heo 4883*bba2c361STejun Heo if (!sch->ops.cgroup_init) { 4884*bba2c361STejun Heo tg->scx.flags |= SCX_TG_INITED; 4885*bba2c361STejun Heo continue; 4886*bba2c361STejun Heo } 4887*bba2c361STejun Heo 4888*bba2c361STejun Heo ret = SCX_CALL_OP_RET(sch, cgroup_init, NULL, 4889*bba2c361STejun Heo css->cgroup, &args); 4890*bba2c361STejun Heo if (ret) { 4891*bba2c361STejun Heo scx_error(sch, "ops.cgroup_init() failed (%d)", ret); 4892*bba2c361STejun Heo return ret; 4893*bba2c361STejun Heo } 4894*bba2c361STejun Heo tg->scx.flags |= SCX_TG_INITED; 4895*bba2c361STejun Heo } 4896*bba2c361STejun Heo 4897*bba2c361STejun Heo WARN_ON_ONCE(scx_cgroup_enabled); 4898*bba2c361STejun Heo scx_cgroup_enabled = true; 4899*bba2c361STejun Heo 4900*bba2c361STejun Heo return 0; 4901*bba2c361STejun Heo } 4902*bba2c361STejun Heo 4903*bba2c361STejun Heo #else 4904*bba2c361STejun Heo static void scx_cgroup_exit(struct scx_sched *sch) {} 4905*bba2c361STejun Heo static int scx_cgroup_init(struct scx_sched *sch) { return 0; } 4906*bba2c361STejun Heo #endif 4907*bba2c361STejun Heo 4908*bba2c361STejun Heo 4909*bba2c361STejun Heo /******************************************************************************** 4910*bba2c361STejun Heo * Sysfs interface and ops enable/disable. 
4911*bba2c361STejun Heo */ 4912*bba2c361STejun Heo 4913*bba2c361STejun Heo #define SCX_ATTR(_name) \ 4914*bba2c361STejun Heo static struct kobj_attribute scx_attr_##_name = { \ 4915*bba2c361STejun Heo .attr = { .name = __stringify(_name), .mode = 0444 }, \ 4916*bba2c361STejun Heo .show = scx_attr_##_name##_show, \ 4917*bba2c361STejun Heo } 4918*bba2c361STejun Heo 4919*bba2c361STejun Heo static ssize_t scx_attr_state_show(struct kobject *kobj, 4920*bba2c361STejun Heo struct kobj_attribute *ka, char *buf) 4921*bba2c361STejun Heo { 4922*bba2c361STejun Heo return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]); 4923*bba2c361STejun Heo } 4924*bba2c361STejun Heo SCX_ATTR(state); 4925*bba2c361STejun Heo 4926*bba2c361STejun Heo static ssize_t scx_attr_switch_all_show(struct kobject *kobj, 4927*bba2c361STejun Heo struct kobj_attribute *ka, char *buf) 4928*bba2c361STejun Heo { 4929*bba2c361STejun Heo return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); 4930*bba2c361STejun Heo } 4931*bba2c361STejun Heo SCX_ATTR(switch_all); 4932*bba2c361STejun Heo 4933*bba2c361STejun Heo static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, 4934*bba2c361STejun Heo struct kobj_attribute *ka, char *buf) 4935*bba2c361STejun Heo { 4936*bba2c361STejun Heo return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); 4937*bba2c361STejun Heo } 4938*bba2c361STejun Heo SCX_ATTR(nr_rejected); 4939*bba2c361STejun Heo 4940*bba2c361STejun Heo static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, 4941*bba2c361STejun Heo struct kobj_attribute *ka, char *buf) 4942*bba2c361STejun Heo { 4943*bba2c361STejun Heo return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); 4944*bba2c361STejun Heo } 4945*bba2c361STejun Heo SCX_ATTR(hotplug_seq); 4946*bba2c361STejun Heo 4947*bba2c361STejun Heo static ssize_t scx_attr_enable_seq_show(struct kobject *kobj, 4948*bba2c361STejun Heo struct kobj_attribute *ka, char *buf) 4949*bba2c361STejun Heo { 
4950*bba2c361STejun Heo return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq)); 4951*bba2c361STejun Heo } 4952*bba2c361STejun Heo SCX_ATTR(enable_seq); 4953*bba2c361STejun Heo 4954*bba2c361STejun Heo static struct attribute *scx_global_attrs[] = { 4955*bba2c361STejun Heo &scx_attr_state.attr, 4956*bba2c361STejun Heo &scx_attr_switch_all.attr, 4957*bba2c361STejun Heo &scx_attr_nr_rejected.attr, 4958*bba2c361STejun Heo &scx_attr_hotplug_seq.attr, 4959*bba2c361STejun Heo &scx_attr_enable_seq.attr, 4960*bba2c361STejun Heo NULL, 4961*bba2c361STejun Heo }; 4962*bba2c361STejun Heo 4963*bba2c361STejun Heo static const struct attribute_group scx_global_attr_group = { 4964*bba2c361STejun Heo .attrs = scx_global_attrs, 4965*bba2c361STejun Heo }; 4966*bba2c361STejun Heo 4967*bba2c361STejun Heo static void free_pnode(struct scx_sched_pnode *pnode); 4968*bba2c361STejun Heo static void free_exit_info(struct scx_exit_info *ei); 4969*bba2c361STejun Heo 4970*bba2c361STejun Heo static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch) 4971*bba2c361STejun Heo { 4972*bba2c361STejun Heo size_t size = struct_size_t(struct scx_cmask, bits, 4973*bba2c361STejun Heo SCX_CMASK_NR_WORDS(num_possible_cpus())); 4974*bba2c361STejun Heo int cpu; 4975*bba2c361STejun Heo 4976*bba2c361STejun Heo if (!sch->is_cid_type || !sch->arena_pool) 4977*bba2c361STejun Heo return 0; 4978*bba2c361STejun Heo 4979*bba2c361STejun Heo sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *); 4980*bba2c361STejun Heo if (!sch->set_cmask_scratch) 4981*bba2c361STejun Heo return -ENOMEM; 4982*bba2c361STejun Heo 4983*bba2c361STejun Heo for_each_possible_cpu(cpu) { 4984*bba2c361STejun Heo struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu); 4985*bba2c361STejun Heo 4986*bba2c361STejun Heo *slot = scx_arena_alloc(sch, size); 4987*bba2c361STejun Heo if (!*slot) 4988*bba2c361STejun Heo return -ENOMEM; 4989*bba2c361STejun Heo scx_cmask_init(*slot, 0, num_possible_cpus()); 4990*bba2c361STejun Heo } 
4991*bba2c361STejun Heo return 0; 4992*bba2c361STejun Heo } 4993*bba2c361STejun Heo 4994*bba2c361STejun Heo static void scx_set_cmask_scratch_free(struct scx_sched *sch) 4995*bba2c361STejun Heo { 4996*bba2c361STejun Heo size_t size = struct_size_t(struct scx_cmask, bits, 4997*bba2c361STejun Heo SCX_CMASK_NR_WORDS(num_possible_cpus())); 4998*bba2c361STejun Heo int cpu; 4999*bba2c361STejun Heo 5000*bba2c361STejun Heo if (!sch->set_cmask_scratch) 5001*bba2c361STejun Heo return; 5002*bba2c361STejun Heo 5003*bba2c361STejun Heo for_each_possible_cpu(cpu) { 5004*bba2c361STejun Heo struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu); 5005*bba2c361STejun Heo 5006*bba2c361STejun Heo scx_arena_free(sch, *slot, size); 5007*bba2c361STejun Heo } 5008*bba2c361STejun Heo free_percpu(sch->set_cmask_scratch); 5009*bba2c361STejun Heo sch->set_cmask_scratch = NULL; 5010*bba2c361STejun Heo } 5011*bba2c361STejun Heo 5012*bba2c361STejun Heo static void scx_sched_free_rcu_work(struct work_struct *work) 5013*bba2c361STejun Heo { 5014*bba2c361STejun Heo struct rcu_work *rcu_work = to_rcu_work(work); 5015*bba2c361STejun Heo struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work); 5016*bba2c361STejun Heo struct rhashtable_iter rht_iter; 5017*bba2c361STejun Heo struct scx_dispatch_q *dsq; 5018*bba2c361STejun Heo int cpu, node; 5019*bba2c361STejun Heo 5020*bba2c361STejun Heo irq_work_sync(&sch->disable_irq_work); 5021*bba2c361STejun Heo kthread_destroy_worker(sch->helper); 5022*bba2c361STejun Heo timer_shutdown_sync(&sch->bypass_lb_timer); 5023*bba2c361STejun Heo free_cpumask_var(sch->bypass_lb_donee_cpumask); 5024*bba2c361STejun Heo free_cpumask_var(sch->bypass_lb_resched_cpumask); 5025*bba2c361STejun Heo 5026*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 5027*bba2c361STejun Heo kfree(sch->cgrp_path); 5028*bba2c361STejun Heo if (sch_cgroup(sch)) 5029*bba2c361STejun Heo cgroup_put(sch_cgroup(sch)); 5030*bba2c361STejun Heo if (sch->sub_kset) 5031*bba2c361STejun 
Heo kobject_put(&sch->sub_kset->kobj); 5032*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 5033*bba2c361STejun Heo 5034*bba2c361STejun Heo for_each_possible_cpu(cpu) { 5035*bba2c361STejun Heo struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); 5036*bba2c361STejun Heo 5037*bba2c361STejun Heo /* 5038*bba2c361STejun Heo * $sch would have entered bypass mode before the RCU grace 5039*bba2c361STejun Heo * period. As that blocks new deferrals, all 5040*bba2c361STejun Heo * deferred_reenq_local_node's must be off-list by now. 5041*bba2c361STejun Heo */ 5042*bba2c361STejun Heo WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node)); 5043*bba2c361STejun Heo 5044*bba2c361STejun Heo exit_dsq(bypass_dsq(sch, cpu)); 5045*bba2c361STejun Heo } 5046*bba2c361STejun Heo 5047*bba2c361STejun Heo free_percpu(sch->pcpu); 5048*bba2c361STejun Heo 5049*bba2c361STejun Heo for_each_node_state(node, N_POSSIBLE) 5050*bba2c361STejun Heo free_pnode(sch->pnode[node]); 5051*bba2c361STejun Heo kfree(sch->pnode); 5052*bba2c361STejun Heo 5053*bba2c361STejun Heo rhashtable_walk_enter(&sch->dsq_hash, &rht_iter); 5054*bba2c361STejun Heo do { 5055*bba2c361STejun Heo rhashtable_walk_start(&rht_iter); 5056*bba2c361STejun Heo 5057*bba2c361STejun Heo while (!IS_ERR_OR_NULL((dsq = rhashtable_walk_next(&rht_iter)))) 5058*bba2c361STejun Heo destroy_dsq(sch, dsq->id); 5059*bba2c361STejun Heo 5060*bba2c361STejun Heo rhashtable_walk_stop(&rht_iter); 5061*bba2c361STejun Heo } while (dsq == ERR_PTR(-EAGAIN)); 5062*bba2c361STejun Heo rhashtable_walk_exit(&rht_iter); 5063*bba2c361STejun Heo 5064*bba2c361STejun Heo rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); 5065*bba2c361STejun Heo free_exit_info(sch->exit_info); 5066*bba2c361STejun Heo scx_set_cmask_scratch_free(sch); 5067*bba2c361STejun Heo scx_arena_pool_destroy(sch); 5068*bba2c361STejun Heo if (sch->arena_map) 5069*bba2c361STejun Heo bpf_map_put(sch->arena_map); 5070*bba2c361STejun Heo kfree(sch); 5071*bba2c361STejun Heo } 
5072*bba2c361STejun Heo 5073*bba2c361STejun Heo static void scx_kobj_release(struct kobject *kobj) 5074*bba2c361STejun Heo { 5075*bba2c361STejun Heo struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 5076*bba2c361STejun Heo 5077*bba2c361STejun Heo INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work); 5078*bba2c361STejun Heo queue_rcu_work(system_dfl_wq, &sch->rcu_work); 5079*bba2c361STejun Heo } 5080*bba2c361STejun Heo 5081*bba2c361STejun Heo static ssize_t scx_attr_ops_show(struct kobject *kobj, 5082*bba2c361STejun Heo struct kobj_attribute *ka, char *buf) 5083*bba2c361STejun Heo { 5084*bba2c361STejun Heo struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 5085*bba2c361STejun Heo 5086*bba2c361STejun Heo return sysfs_emit(buf, "%s\n", sch->ops.name); 5087*bba2c361STejun Heo } 5088*bba2c361STejun Heo SCX_ATTR(ops); 5089*bba2c361STejun Heo 5090*bba2c361STejun Heo #define scx_attr_event_show(buf, at, events, kind) ({ \ 5091*bba2c361STejun Heo sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \ 5092*bba2c361STejun Heo }) 5093*bba2c361STejun Heo 5094*bba2c361STejun Heo static ssize_t scx_attr_events_show(struct kobject *kobj, 5095*bba2c361STejun Heo struct kobj_attribute *ka, char *buf) 5096*bba2c361STejun Heo { 5097*bba2c361STejun Heo struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 5098*bba2c361STejun Heo struct scx_event_stats events; 5099*bba2c361STejun Heo int at = 0; 5100*bba2c361STejun Heo 5101*bba2c361STejun Heo scx_read_events(sch, &events); 5102*bba2c361STejun Heo at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); 5103*bba2c361STejun Heo at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 5104*bba2c361STejun Heo at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); 5105*bba2c361STejun Heo at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); 5106*bba2c361STejun Heo at += scx_attr_event_show(buf, at, &events, 
SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 5107*bba2c361STejun Heo at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_IMMED); 5108*bba2c361STejun Heo at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_LOCAL_REPEAT); 5109*bba2c361STejun Heo at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL); 5110*bba2c361STejun Heo at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); 5111*bba2c361STejun Heo at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); 5112*bba2c361STejun Heo at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); 5113*bba2c361STejun Heo at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED); 5114*bba2c361STejun Heo at += scx_attr_event_show(buf, at, &events, SCX_EV_SUB_BYPASS_DISPATCH); 5115*bba2c361STejun Heo return at; 5116*bba2c361STejun Heo } 5117*bba2c361STejun Heo SCX_ATTR(events); 5118*bba2c361STejun Heo 5119*bba2c361STejun Heo static struct attribute *scx_sched_attrs[] = { 5120*bba2c361STejun Heo &scx_attr_ops.attr, 5121*bba2c361STejun Heo &scx_attr_events.attr, 5122*bba2c361STejun Heo NULL, 5123*bba2c361STejun Heo }; 5124*bba2c361STejun Heo ATTRIBUTE_GROUPS(scx_sched); 5125*bba2c361STejun Heo 5126*bba2c361STejun Heo static const struct kobj_type scx_ktype = { 5127*bba2c361STejun Heo .release = scx_kobj_release, 5128*bba2c361STejun Heo .sysfs_ops = &kobj_sysfs_ops, 5129*bba2c361STejun Heo .default_groups = scx_sched_groups, 5130*bba2c361STejun Heo }; 5131*bba2c361STejun Heo 5132*bba2c361STejun Heo static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) 5133*bba2c361STejun Heo { 5134*bba2c361STejun Heo const struct scx_sched *sch; 5135*bba2c361STejun Heo 5136*bba2c361STejun Heo /* 5137*bba2c361STejun Heo * scx_uevent() can be reached by both scx_sched kobjects (scx_ktype) 5138*bba2c361STejun Heo * and sub-scheduler kset kobjects (kset_ktype) through the parent 5139*bba2c361STejun Heo * chain walk. Filter out the latter to avoid invalid casts. 
5140*bba2c361STejun Heo */ 5141*bba2c361STejun Heo if (kobj->ktype != &scx_ktype) 5142*bba2c361STejun Heo return 0; 5143*bba2c361STejun Heo 5144*bba2c361STejun Heo sch = container_of(kobj, struct scx_sched, kobj); 5145*bba2c361STejun Heo 5146*bba2c361STejun Heo return add_uevent_var(env, "SCXOPS=%s", sch->ops.name); 5147*bba2c361STejun Heo } 5148*bba2c361STejun Heo 5149*bba2c361STejun Heo static const struct kset_uevent_ops scx_uevent_ops = { 5150*bba2c361STejun Heo .uevent = scx_uevent, 5151*bba2c361STejun Heo }; 5152*bba2c361STejun Heo 5153*bba2c361STejun Heo /* 5154*bba2c361STejun Heo * Used by sched_fork() and __setscheduler_prio() to pick the matching 5155*bba2c361STejun Heo * sched_class. dl/rt are already handled. 5156*bba2c361STejun Heo */ 5157*bba2c361STejun Heo bool task_should_scx(int policy) 5158*bba2c361STejun Heo { 5159*bba2c361STejun Heo /* if disabled, nothing should be on it */ 5160*bba2c361STejun Heo if (!scx_enabled()) 5161*bba2c361STejun Heo return false; 5162*bba2c361STejun Heo 5163*bba2c361STejun Heo /* scx is taking over all SCHED_OTHER and SCHED_EXT tasks */ 5164*bba2c361STejun Heo if (READ_ONCE(scx_switching_all)) 5165*bba2c361STejun Heo return true; 5166*bba2c361STejun Heo 5167*bba2c361STejun Heo /* 5168*bba2c361STejun Heo * scx is tearing down - keep new SCHED_EXT tasks out. 5169*bba2c361STejun Heo * 5170*bba2c361STejun Heo * Must come after scx_switching_all test, which serves as a proxy 5171*bba2c361STejun Heo * for __scx_switched_all. While __scx_switched_all is set, we must 5172*bba2c361STejun Heo * return true via the branch above: a fork routed to fair would 5173*bba2c361STejun Heo * stall because next_active_class() skips fair. 
5174*bba2c361STejun Heo * 5175*bba2c361STejun Heo * This can develop into a deadlock - scx holds scx_enable_mutex across 5176*bba2c361STejun Heo * kthread_create() in scx_alloc_and_add_sched(); if the new kthread is 5177*bba2c361STejun Heo * the stalled task, the disable path can never grab the mutex to clear 5178*bba2c361STejun Heo * scx_switching_all. 5179*bba2c361STejun Heo */ 5180*bba2c361STejun Heo if (unlikely(scx_enable_state() == SCX_DISABLING)) 5181*bba2c361STejun Heo return false; 5182*bba2c361STejun Heo 5183*bba2c361STejun Heo return policy == SCHED_EXT; 5184*bba2c361STejun Heo } 5185*bba2c361STejun Heo 5186*bba2c361STejun Heo bool scx_allow_ttwu_queue(const struct task_struct *p) 5187*bba2c361STejun Heo { 5188*bba2c361STejun Heo struct scx_sched *sch; 5189*bba2c361STejun Heo 5190*bba2c361STejun Heo if (!scx_enabled()) 5191*bba2c361STejun Heo return true; 5192*bba2c361STejun Heo 5193*bba2c361STejun Heo sch = scx_task_sched(p); 5194*bba2c361STejun Heo if (unlikely(!sch)) 5195*bba2c361STejun Heo return true; 5196*bba2c361STejun Heo 5197*bba2c361STejun Heo if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) 5198*bba2c361STejun Heo return true; 5199*bba2c361STejun Heo 5200*bba2c361STejun Heo if (unlikely(p->sched_class != &ext_sched_class)) 5201*bba2c361STejun Heo return true; 5202*bba2c361STejun Heo 5203*bba2c361STejun Heo return false; 5204*bba2c361STejun Heo } 5205*bba2c361STejun Heo 5206*bba2c361STejun Heo /** 5207*bba2c361STejun Heo * handle_lockup - sched_ext common lockup handler 5208*bba2c361STejun Heo * @fmt: format string 5209*bba2c361STejun Heo * 5210*bba2c361STejun Heo * Called on system stall or lockup condition and initiates abort of sched_ext 5211*bba2c361STejun Heo * if enabled, which may resolve the reported lockup. 5212*bba2c361STejun Heo * 5213*bba2c361STejun Heo * Returns %true if sched_ext is enabled and abort was initiated, which may 5214*bba2c361STejun Heo * resolve the lockup. 
%false if sched_ext is not enabled or abort was already
 * initiated by someone else.
 */
static __printf(1, 2) bool handle_lockup(const char *fmt, ...)
{
	struct scx_sched *root;
	va_list ap;
	bool aborted;

	/* scx_root is dereferenced under RCU for the rest of the function */
	guard(rcu)();

	root = rcu_dereference(scx_root);
	if (unlikely(!root))
		return false;

	/* act only while the scheduler is being enabled or is enabled */
	switch (scx_enable_state()) {
	case SCX_ENABLING:
	case SCX_ENABLED:
		break;
	default:
		return false;
	}

	va_start(ap, fmt);
	aborted = scx_verror(root, fmt, ap);
	va_end(ap);

	return aborted;
}

/**
 * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
 *
 * While there are various reasons why RCU CPU stalls can occur on a system
 * that may not be caused by the current BPF scheduler, try kicking out the
 * current scheduler in an attempt to recover the system to a good state before
 * issuing panics.
 *
 * Returns %true if sched_ext is enabled and abort was initiated, which may
 * resolve the reported RCU stall. %false if sched_ext is not enabled or someone
 * else already initiated abort.
5252*bba2c361STejun Heo */ 5253*bba2c361STejun Heo bool scx_rcu_cpu_stall(void) 5254*bba2c361STejun Heo { 5255*bba2c361STejun Heo return handle_lockup("RCU CPU stall detected!"); 5256*bba2c361STejun Heo } 5257*bba2c361STejun Heo 5258*bba2c361STejun Heo /** 5259*bba2c361STejun Heo * scx_softlockup - sched_ext softlockup handler 5260*bba2c361STejun Heo * @dur_s: number of seconds of CPU stuck due to soft lockup 5261*bba2c361STejun Heo * 5262*bba2c361STejun Heo * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can 5263*bba2c361STejun Heo * live-lock the system by making many CPUs target the same DSQ to the point 5264*bba2c361STejun Heo * where soft-lockup detection triggers. This function is called from 5265*bba2c361STejun Heo * soft-lockup watchdog when the triggering point is close and tries to unjam 5266*bba2c361STejun Heo * the system and aborting the BPF scheduler. 5267*bba2c361STejun Heo */ 5268*bba2c361STejun Heo void scx_softlockup(u32 dur_s) 5269*bba2c361STejun Heo { 5270*bba2c361STejun Heo if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s)) 5271*bba2c361STejun Heo return; 5272*bba2c361STejun Heo 5273*bba2c361STejun Heo printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n", 5274*bba2c361STejun Heo smp_processor_id(), dur_s); 5275*bba2c361STejun Heo } 5276*bba2c361STejun Heo 5277*bba2c361STejun Heo /* 5278*bba2c361STejun Heo * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(), 5279*bba2c361STejun Heo * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing 5280*bba2c361STejun Heo * it from NMI context can lead to deadlocks. Defer via irq_work; the 5281*bba2c361STejun Heo * disable path runs off irq_work anyway. 
5282*bba2c361STejun Heo */ 5283*bba2c361STejun Heo static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1); 5284*bba2c361STejun Heo 5285*bba2c361STejun Heo static void scx_hardlockup_irq_workfn(struct irq_work *work) 5286*bba2c361STejun Heo { 5287*bba2c361STejun Heo int cpu = atomic_xchg(&scx_hardlockup_cpu, -1); 5288*bba2c361STejun Heo 5289*bba2c361STejun Heo if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu)) 5290*bba2c361STejun Heo printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", 5291*bba2c361STejun Heo cpu); 5292*bba2c361STejun Heo } 5293*bba2c361STejun Heo 5294*bba2c361STejun Heo static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn); 5295*bba2c361STejun Heo 5296*bba2c361STejun Heo /** 5297*bba2c361STejun Heo * scx_hardlockup - sched_ext hardlockup handler 5298*bba2c361STejun Heo * 5299*bba2c361STejun Heo * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting 5300*bba2c361STejun Heo * numerous affinitized tasks in a single queue and directing all CPUs at it. 5301*bba2c361STejun Heo * Try kicking out the current scheduler in an attempt to recover the system to 5302*bba2c361STejun Heo * a good state before taking more drastic actions. 5303*bba2c361STejun Heo * 5304*bba2c361STejun Heo * Queues an irq_work; the handle_lockup() call happens in IRQ context (see 5305*bba2c361STejun Heo * scx_hardlockup_irq_workfn). 5306*bba2c361STejun Heo * 5307*bba2c361STejun Heo * Returns %true if sched_ext is enabled and the work was queued, %false 5308*bba2c361STejun Heo * otherwise. 
5309*bba2c361STejun Heo */ 5310*bba2c361STejun Heo bool scx_hardlockup(int cpu) 5311*bba2c361STejun Heo { 5312*bba2c361STejun Heo if (!rcu_access_pointer(scx_root)) 5313*bba2c361STejun Heo return false; 5314*bba2c361STejun Heo 5315*bba2c361STejun Heo atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu); 5316*bba2c361STejun Heo irq_work_queue(&scx_hardlockup_irq_work); 5317*bba2c361STejun Heo return true; 5318*bba2c361STejun Heo } 5319*bba2c361STejun Heo 5320*bba2c361STejun Heo static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor, 5321*bba2c361STejun Heo struct cpumask *donee_mask, struct cpumask *resched_mask, 5322*bba2c361STejun Heo u32 nr_donor_target, u32 nr_donee_target) 5323*bba2c361STejun Heo { 5324*bba2c361STejun Heo struct rq *donor_rq = cpu_rq(donor); 5325*bba2c361STejun Heo struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor); 5326*bba2c361STejun Heo struct task_struct *p, *n; 5327*bba2c361STejun Heo struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0); 5328*bba2c361STejun Heo s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; 5329*bba2c361STejun Heo u32 nr_balanced = 0, min_delta_us; 5330*bba2c361STejun Heo 5331*bba2c361STejun Heo /* 5332*bba2c361STejun Heo * All we want to guarantee is reasonable forward progress. No reason to 5333*bba2c361STejun Heo * fine tune. Assuming every task on @donor_dsq runs their full slice, 5334*bba2c361STejun Heo * consider offloading iff the total queued duration is over the 5335*bba2c361STejun Heo * threshold. 
5336*bba2c361STejun Heo */ 5337*bba2c361STejun Heo min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV; 5338*bba2c361STejun Heo if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us))) 5339*bba2c361STejun Heo return 0; 5340*bba2c361STejun Heo 5341*bba2c361STejun Heo raw_spin_rq_lock_irq(donor_rq); 5342*bba2c361STejun Heo raw_spin_lock(&donor_dsq->lock); 5343*bba2c361STejun Heo list_add(&cursor.node, &donor_dsq->list); 5344*bba2c361STejun Heo resume: 5345*bba2c361STejun Heo n = container_of(&cursor, struct task_struct, scx.dsq_list); 5346*bba2c361STejun Heo n = nldsq_next_task(donor_dsq, n, false); 5347*bba2c361STejun Heo 5348*bba2c361STejun Heo while ((p = n)) { 5349*bba2c361STejun Heo struct scx_dispatch_q *donee_dsq; 5350*bba2c361STejun Heo int donee; 5351*bba2c361STejun Heo 5352*bba2c361STejun Heo n = nldsq_next_task(donor_dsq, n, false); 5353*bba2c361STejun Heo 5354*bba2c361STejun Heo if (donor_dsq->nr <= nr_donor_target) 5355*bba2c361STejun Heo break; 5356*bba2c361STejun Heo 5357*bba2c361STejun Heo if (cpumask_empty(donee_mask)) 5358*bba2c361STejun Heo break; 5359*bba2c361STejun Heo 5360*bba2c361STejun Heo /* 5361*bba2c361STejun Heo * If an earlier pass placed @p on @donor_dsq from a different 5362*bba2c361STejun Heo * CPU and the donee hasn't consumed it yet, @p is still on the 5363*bba2c361STejun Heo * previous CPU and task_rq(@p) != @donor_rq. @p can't be moved 5364*bba2c361STejun Heo * without its rq locked. Skip. 
5365*bba2c361STejun Heo */ 5366*bba2c361STejun Heo if (task_rq(p) != donor_rq) 5367*bba2c361STejun Heo continue; 5368*bba2c361STejun Heo 5369*bba2c361STejun Heo donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); 5370*bba2c361STejun Heo if (donee >= nr_cpu_ids) 5371*bba2c361STejun Heo continue; 5372*bba2c361STejun Heo 5373*bba2c361STejun Heo donee_dsq = bypass_dsq(sch, donee); 5374*bba2c361STejun Heo 5375*bba2c361STejun Heo /* 5376*bba2c361STejun Heo * $p's rq is not locked but $p's DSQ lock protects its 5377*bba2c361STejun Heo * scheduling properties making this test safe. 5378*bba2c361STejun Heo */ 5379*bba2c361STejun Heo if (!task_can_run_on_remote_rq(sch, p, cpu_rq(donee), false)) 5380*bba2c361STejun Heo continue; 5381*bba2c361STejun Heo 5382*bba2c361STejun Heo /* 5383*bba2c361STejun Heo * Moving $p from one non-local DSQ to another. The source rq 5384*bba2c361STejun Heo * and DSQ are already locked. Do an abbreviated dequeue and 5385*bba2c361STejun Heo * then perform enqueue without unlocking $donor_dsq. 5386*bba2c361STejun Heo * 5387*bba2c361STejun Heo * We don't want to drop and reacquire the lock on each 5388*bba2c361STejun Heo * iteration as @donor_dsq can be very long and potentially 5389*bba2c361STejun Heo * highly contended. Donee DSQs are less likely to be contended. 5390*bba2c361STejun Heo * The nested locking is safe as only this LB moves tasks 5391*bba2c361STejun Heo * between bypass DSQs. 5392*bba2c361STejun Heo */ 5393*bba2c361STejun Heo dispatch_dequeue_locked(p, donor_dsq); 5394*bba2c361STejun Heo dispatch_enqueue(sch, cpu_rq(donee), donee_dsq, p, SCX_ENQ_NESTED); 5395*bba2c361STejun Heo 5396*bba2c361STejun Heo /* 5397*bba2c361STejun Heo * $donee might have been idle and need to be woken up. No need 5398*bba2c361STejun Heo * to be clever. Kick every CPU that receives tasks. 
5399*bba2c361STejun Heo */ 5400*bba2c361STejun Heo cpumask_set_cpu(donee, resched_mask); 5401*bba2c361STejun Heo 5402*bba2c361STejun Heo if (READ_ONCE(donee_dsq->nr) >= nr_donee_target) 5403*bba2c361STejun Heo cpumask_clear_cpu(donee, donee_mask); 5404*bba2c361STejun Heo 5405*bba2c361STejun Heo nr_balanced++; 5406*bba2c361STejun Heo if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { 5407*bba2c361STejun Heo list_move_tail(&cursor.node, &n->scx.dsq_list.node); 5408*bba2c361STejun Heo raw_spin_unlock(&donor_dsq->lock); 5409*bba2c361STejun Heo raw_spin_rq_unlock_irq(donor_rq); 5410*bba2c361STejun Heo cpu_relax(); 5411*bba2c361STejun Heo raw_spin_rq_lock_irq(donor_rq); 5412*bba2c361STejun Heo raw_spin_lock(&donor_dsq->lock); 5413*bba2c361STejun Heo goto resume; 5414*bba2c361STejun Heo } 5415*bba2c361STejun Heo } 5416*bba2c361STejun Heo 5417*bba2c361STejun Heo list_del_init(&cursor.node); 5418*bba2c361STejun Heo raw_spin_unlock(&donor_dsq->lock); 5419*bba2c361STejun Heo raw_spin_rq_unlock_irq(donor_rq); 5420*bba2c361STejun Heo 5421*bba2c361STejun Heo return nr_balanced; 5422*bba2c361STejun Heo } 5423*bba2c361STejun Heo 5424*bba2c361STejun Heo static void bypass_lb_node(struct scx_sched *sch, int node) 5425*bba2c361STejun Heo { 5426*bba2c361STejun Heo const struct cpumask *node_mask = cpumask_of_node(node); 5427*bba2c361STejun Heo struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask; 5428*bba2c361STejun Heo struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask; 5429*bba2c361STejun Heo u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; 5430*bba2c361STejun Heo u32 nr_target, nr_donor_target; 5431*bba2c361STejun Heo u32 before_min = U32_MAX, before_max = 0; 5432*bba2c361STejun Heo u32 after_min = U32_MAX, after_max = 0; 5433*bba2c361STejun Heo int cpu; 5434*bba2c361STejun Heo 5435*bba2c361STejun Heo /* count the target tasks and CPUs */ 5436*bba2c361STejun Heo for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5437*bba2c361STejun Heo u32 nr = 
READ_ONCE(bypass_dsq(sch, cpu)->nr); 5438*bba2c361STejun Heo 5439*bba2c361STejun Heo nr_tasks += nr; 5440*bba2c361STejun Heo nr_cpus++; 5441*bba2c361STejun Heo 5442*bba2c361STejun Heo before_min = min(nr, before_min); 5443*bba2c361STejun Heo before_max = max(nr, before_max); 5444*bba2c361STejun Heo } 5445*bba2c361STejun Heo 5446*bba2c361STejun Heo if (!nr_cpus) 5447*bba2c361STejun Heo return; 5448*bba2c361STejun Heo 5449*bba2c361STejun Heo /* 5450*bba2c361STejun Heo * We don't want CPUs to have more than $nr_donor_target tasks and 5451*bba2c361STejun Heo * balancing to fill donee CPUs upto $nr_target. Once targets are 5452*bba2c361STejun Heo * calculated, find the donee CPUs. 5453*bba2c361STejun Heo */ 5454*bba2c361STejun Heo nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); 5455*bba2c361STejun Heo nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); 5456*bba2c361STejun Heo 5457*bba2c361STejun Heo cpumask_clear(donee_mask); 5458*bba2c361STejun Heo for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5459*bba2c361STejun Heo if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target) 5460*bba2c361STejun Heo cpumask_set_cpu(cpu, donee_mask); 5461*bba2c361STejun Heo } 5462*bba2c361STejun Heo 5463*bba2c361STejun Heo /* iterate !donee CPUs and see if they should be offloaded */ 5464*bba2c361STejun Heo cpumask_clear(resched_mask); 5465*bba2c361STejun Heo for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5466*bba2c361STejun Heo if (cpumask_empty(donee_mask)) 5467*bba2c361STejun Heo break; 5468*bba2c361STejun Heo if (cpumask_test_cpu(cpu, donee_mask)) 5469*bba2c361STejun Heo continue; 5470*bba2c361STejun Heo if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target) 5471*bba2c361STejun Heo continue; 5472*bba2c361STejun Heo 5473*bba2c361STejun Heo nr_balanced += bypass_lb_cpu(sch, cpu, donee_mask, resched_mask, 5474*bba2c361STejun Heo nr_donor_target, nr_target); 5475*bba2c361STejun Heo } 5476*bba2c361STejun Heo 5477*bba2c361STejun Heo for_each_cpu(cpu, 
resched_mask) 5478*bba2c361STejun Heo resched_cpu(cpu); 5479*bba2c361STejun Heo 5480*bba2c361STejun Heo for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 5481*bba2c361STejun Heo u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr); 5482*bba2c361STejun Heo 5483*bba2c361STejun Heo after_min = min(nr, after_min); 5484*bba2c361STejun Heo after_max = max(nr, after_max); 5485*bba2c361STejun Heo 5486*bba2c361STejun Heo } 5487*bba2c361STejun Heo 5488*bba2c361STejun Heo trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced, 5489*bba2c361STejun Heo before_min, before_max, after_min, after_max); 5490*bba2c361STejun Heo } 5491*bba2c361STejun Heo 5492*bba2c361STejun Heo /* 5493*bba2c361STejun Heo * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine 5494*bba2c361STejun Heo * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some 5495*bba2c361STejun Heo * bypass DSQs can be overloaded. If there are enough tasks to saturate other 5496*bba2c361STejun Heo * lightly loaded CPUs, such imbalance can lead to very high execution latency 5497*bba2c361STejun Heo * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such 5498*bba2c361STejun Heo * outcomes, a simple load balancing mechanism is implemented by the following 5499*bba2c361STejun Heo * timer which runs periodically while bypass mode is in effect. 
5500*bba2c361STejun Heo */ 5501*bba2c361STejun Heo static void scx_bypass_lb_timerfn(struct timer_list *timer) 5502*bba2c361STejun Heo { 5503*bba2c361STejun Heo struct scx_sched *sch = container_of(timer, struct scx_sched, bypass_lb_timer); 5504*bba2c361STejun Heo int node; 5505*bba2c361STejun Heo u32 intv_us; 5506*bba2c361STejun Heo 5507*bba2c361STejun Heo if (!bypass_dsp_enabled(sch)) 5508*bba2c361STejun Heo return; 5509*bba2c361STejun Heo 5510*bba2c361STejun Heo for_each_node_with_cpus(node) 5511*bba2c361STejun Heo bypass_lb_node(sch, node); 5512*bba2c361STejun Heo 5513*bba2c361STejun Heo intv_us = READ_ONCE(scx_bypass_lb_intv_us); 5514*bba2c361STejun Heo if (intv_us) 5515*bba2c361STejun Heo mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); 5516*bba2c361STejun Heo } 5517*bba2c361STejun Heo 5518*bba2c361STejun Heo static bool inc_bypass_depth(struct scx_sched *sch) 5519*bba2c361STejun Heo { 5520*bba2c361STejun Heo lockdep_assert_held(&scx_bypass_lock); 5521*bba2c361STejun Heo 5522*bba2c361STejun Heo WARN_ON_ONCE(sch->bypass_depth < 0); 5523*bba2c361STejun Heo WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1); 5524*bba2c361STejun Heo if (sch->bypass_depth != 1) 5525*bba2c361STejun Heo return false; 5526*bba2c361STejun Heo 5527*bba2c361STejun Heo WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC); 5528*bba2c361STejun Heo sch->bypass_timestamp = ktime_get_ns(); 5529*bba2c361STejun Heo scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); 5530*bba2c361STejun Heo return true; 5531*bba2c361STejun Heo } 5532*bba2c361STejun Heo 5533*bba2c361STejun Heo static bool dec_bypass_depth(struct scx_sched *sch) 5534*bba2c361STejun Heo { 5535*bba2c361STejun Heo lockdep_assert_held(&scx_bypass_lock); 5536*bba2c361STejun Heo 5537*bba2c361STejun Heo WARN_ON_ONCE(sch->bypass_depth < 1); 5538*bba2c361STejun Heo WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1); 5539*bba2c361STejun Heo if (sch->bypass_depth != 0) 5540*bba2c361STejun Heo return false; 
5541*bba2c361STejun Heo 5542*bba2c361STejun Heo WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL); 5543*bba2c361STejun Heo scx_add_event(sch, SCX_EV_BYPASS_DURATION, 5544*bba2c361STejun Heo ktime_get_ns() - sch->bypass_timestamp); 5545*bba2c361STejun Heo return true; 5546*bba2c361STejun Heo } 5547*bba2c361STejun Heo 5548*bba2c361STejun Heo static void enable_bypass_dsp(struct scx_sched *sch) 5549*bba2c361STejun Heo { 5550*bba2c361STejun Heo struct scx_sched *host = scx_parent(sch) ?: sch; 5551*bba2c361STejun Heo u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us); 5552*bba2c361STejun Heo s32 ret; 5553*bba2c361STejun Heo 5554*bba2c361STejun Heo /* 5555*bba2c361STejun Heo * @sch->bypass_depth transitioning from 0 to 1 triggers enabling. 5556*bba2c361STejun Heo * Shouldn't stagger. 5557*bba2c361STejun Heo */ 5558*bba2c361STejun Heo if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim))) 5559*bba2c361STejun Heo return; 5560*bba2c361STejun Heo 5561*bba2c361STejun Heo /* 5562*bba2c361STejun Heo * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of 5563*bba2c361STejun Heo * the nearest non-bypassing ancestor or root. As enable_bypass_dsp() is 5564*bba2c361STejun Heo * called iff @sch is not already bypassed due to an ancestor bypassing, 5565*bba2c361STejun Heo * we can assume that the parent is not bypassing and thus will be the 5566*bba2c361STejun Heo * host of the bypass DSQs. 5567*bba2c361STejun Heo * 5568*bba2c361STejun Heo * While the situation may change in the future, the following 5569*bba2c361STejun Heo * guarantees that the nearest non-bypassing ancestor or root has bypass 5570*bba2c361STejun Heo * dispatch enabled while a descendant is bypassing, which is all that's 5571*bba2c361STejun Heo * required. 5572*bba2c361STejun Heo * 5573*bba2c361STejun Heo * bypass_dsp_enabled() test is used to determine whether to enter the 5574*bba2c361STejun Heo * bypass dispatch handling path from both bypassing and hosting scheds. 
5575*bba2c361STejun Heo * Bump enable depth on both @sch and bypass dispatch host. 5576*bba2c361STejun Heo */ 5577*bba2c361STejun Heo ret = atomic_inc_return(&sch->bypass_dsp_enable_depth); 5578*bba2c361STejun Heo WARN_ON_ONCE(ret <= 0); 5579*bba2c361STejun Heo 5580*bba2c361STejun Heo if (host != sch) { 5581*bba2c361STejun Heo ret = atomic_inc_return(&host->bypass_dsp_enable_depth); 5582*bba2c361STejun Heo WARN_ON_ONCE(ret <= 0); 5583*bba2c361STejun Heo } 5584*bba2c361STejun Heo 5585*bba2c361STejun Heo /* 5586*bba2c361STejun Heo * The LB timer will stop running if bypass dispatch is disabled. Start 5587*bba2c361STejun Heo * after enabling bypass dispatch. 5588*bba2c361STejun Heo */ 5589*bba2c361STejun Heo if (intv_us && !timer_pending(&host->bypass_lb_timer)) 5590*bba2c361STejun Heo mod_timer(&host->bypass_lb_timer, 5591*bba2c361STejun Heo jiffies + usecs_to_jiffies(intv_us)); 5592*bba2c361STejun Heo } 5593*bba2c361STejun Heo 5594*bba2c361STejun Heo /* may be called without holding scx_bypass_lock */ 5595*bba2c361STejun Heo static void disable_bypass_dsp(struct scx_sched *sch) 5596*bba2c361STejun Heo { 5597*bba2c361STejun Heo s32 ret; 5598*bba2c361STejun Heo 5599*bba2c361STejun Heo if (!test_and_clear_bit(0, &sch->bypass_dsp_claim)) 5600*bba2c361STejun Heo return; 5601*bba2c361STejun Heo 5602*bba2c361STejun Heo ret = atomic_dec_return(&sch->bypass_dsp_enable_depth); 5603*bba2c361STejun Heo WARN_ON_ONCE(ret < 0); 5604*bba2c361STejun Heo 5605*bba2c361STejun Heo if (scx_parent(sch)) { 5606*bba2c361STejun Heo ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth); 5607*bba2c361STejun Heo WARN_ON_ONCE(ret < 0); 5608*bba2c361STejun Heo } 5609*bba2c361STejun Heo } 5610*bba2c361STejun Heo 5611*bba2c361STejun Heo /** 5612*bba2c361STejun Heo * scx_bypass - [Un]bypass scx_ops and guarantee forward progress 5613*bba2c361STejun Heo * @sch: sched to bypass 5614*bba2c361STejun Heo * @bypass: true for bypass, false for unbypass 5615*bba2c361STejun Heo * 
5616*bba2c361STejun Heo * Bypassing guarantees that all runnable tasks make forward progress without 5617*bba2c361STejun Heo * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might 5618*bba2c361STejun Heo * be held by tasks that the BPF scheduler is forgetting to run, which 5619*bba2c361STejun Heo * unfortunately also excludes toggling the static branches. 5620*bba2c361STejun Heo * 5621*bba2c361STejun Heo * Let's work around by overriding a couple ops and modifying behaviors based on 5622*bba2c361STejun Heo * the DISABLING state and then cycling the queued tasks through dequeue/enqueue 5623*bba2c361STejun Heo * to force global FIFO scheduling. 5624*bba2c361STejun Heo * 5625*bba2c361STejun Heo * - ops.select_cpu() is ignored and the default select_cpu() is used. 5626*bba2c361STejun Heo * 5627*bba2c361STejun Heo * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. 5628*bba2c361STejun Heo * %SCX_OPS_ENQ_LAST is also ignored. 5629*bba2c361STejun Heo * 5630*bba2c361STejun Heo * - ops.dispatch() is ignored. 5631*bba2c361STejun Heo * 5632*bba2c361STejun Heo * - balance_one() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice 5633*bba2c361STejun Heo * can't be trusted. Whenever a tick triggers, the running task is rotated to 5634*bba2c361STejun Heo * the tail of the queue with core_sched_at touched. 5635*bba2c361STejun Heo * 5636*bba2c361STejun Heo * - pick_next_task() suppresses zero slice warning. 5637*bba2c361STejun Heo * 5638*bba2c361STejun Heo * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM 5639*bba2c361STejun Heo * operations. 5640*bba2c361STejun Heo * 5641*bba2c361STejun Heo * - scx_prio_less() reverts to the default core_sched_at order. 
5642*bba2c361STejun Heo */ 5643*bba2c361STejun Heo static void scx_bypass(struct scx_sched *sch, bool bypass) 5644*bba2c361STejun Heo { 5645*bba2c361STejun Heo struct scx_sched *pos; 5646*bba2c361STejun Heo unsigned long flags; 5647*bba2c361STejun Heo int cpu; 5648*bba2c361STejun Heo 5649*bba2c361STejun Heo raw_spin_lock_irqsave(&scx_bypass_lock, flags); 5650*bba2c361STejun Heo 5651*bba2c361STejun Heo if (bypass) { 5652*bba2c361STejun Heo if (!inc_bypass_depth(sch)) 5653*bba2c361STejun Heo goto unlock; 5654*bba2c361STejun Heo 5655*bba2c361STejun Heo enable_bypass_dsp(sch); 5656*bba2c361STejun Heo } else { 5657*bba2c361STejun Heo if (!dec_bypass_depth(sch)) 5658*bba2c361STejun Heo goto unlock; 5659*bba2c361STejun Heo } 5660*bba2c361STejun Heo 5661*bba2c361STejun Heo /* 5662*bba2c361STejun Heo * Bypass state is propagated to all descendants - an scx_sched bypasses 5663*bba2c361STejun Heo * if itself or any of its ancestors are in bypass mode. 5664*bba2c361STejun Heo */ 5665*bba2c361STejun Heo raw_spin_lock(&scx_sched_lock); 5666*bba2c361STejun Heo scx_for_each_descendant_pre(pos, sch) { 5667*bba2c361STejun Heo if (pos == sch) 5668*bba2c361STejun Heo continue; 5669*bba2c361STejun Heo if (bypass) 5670*bba2c361STejun Heo inc_bypass_depth(pos); 5671*bba2c361STejun Heo else 5672*bba2c361STejun Heo dec_bypass_depth(pos); 5673*bba2c361STejun Heo } 5674*bba2c361STejun Heo raw_spin_unlock(&scx_sched_lock); 5675*bba2c361STejun Heo 5676*bba2c361STejun Heo /* 5677*bba2c361STejun Heo * No task property is changing. We just need to make sure all currently 5678*bba2c361STejun Heo * queued tasks are re-queued according to the new scx_bypassing() 5679*bba2c361STejun Heo * state. As an optimization, walk each rq's runnable_list instead of 5680*bba2c361STejun Heo * the scx_tasks list. 5681*bba2c361STejun Heo * 5682*bba2c361STejun Heo * This function can't trust the scheduler and thus can't use 5683*bba2c361STejun Heo * cpus_read_lock(). Walk all possible CPUs instead of online. 
5684*bba2c361STejun Heo */ 5685*bba2c361STejun Heo for_each_possible_cpu(cpu) { 5686*bba2c361STejun Heo struct rq *rq = cpu_rq(cpu); 5687*bba2c361STejun Heo struct task_struct *p, *n; 5688*bba2c361STejun Heo 5689*bba2c361STejun Heo raw_spin_rq_lock(rq); 5690*bba2c361STejun Heo raw_spin_lock(&scx_sched_lock); 5691*bba2c361STejun Heo 5692*bba2c361STejun Heo scx_for_each_descendant_pre(pos, sch) { 5693*bba2c361STejun Heo struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu); 5694*bba2c361STejun Heo 5695*bba2c361STejun Heo if (pos->bypass_depth) 5696*bba2c361STejun Heo pcpu->flags |= SCX_SCHED_PCPU_BYPASSING; 5697*bba2c361STejun Heo else 5698*bba2c361STejun Heo pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING; 5699*bba2c361STejun Heo } 5700*bba2c361STejun Heo 5701*bba2c361STejun Heo raw_spin_unlock(&scx_sched_lock); 5702*bba2c361STejun Heo 5703*bba2c361STejun Heo /* 5704*bba2c361STejun Heo * We need to guarantee that no tasks are on the BPF scheduler 5705*bba2c361STejun Heo * while bypassing. Either we see enabled or the enable path 5706*bba2c361STejun Heo * sees scx_bypassing() before moving tasks to SCX. 5707*bba2c361STejun Heo */ 5708*bba2c361STejun Heo if (!scx_enabled()) { 5709*bba2c361STejun Heo raw_spin_rq_unlock(rq); 5710*bba2c361STejun Heo continue; 5711*bba2c361STejun Heo } 5712*bba2c361STejun Heo 5713*bba2c361STejun Heo /* 5714*bba2c361STejun Heo * The use of list_for_each_entry_safe_reverse() is required 5715*bba2c361STejun Heo * because each task is going to be removed from and added back 5716*bba2c361STejun Heo * to the runnable_list during iteration. Because they're added 5717*bba2c361STejun Heo * to the tail of the list, safe reverse iteration can still 5718*bba2c361STejun Heo * visit all nodes. 
5719*bba2c361STejun Heo */ 5720*bba2c361STejun Heo list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, 5721*bba2c361STejun Heo scx.runnable_node) { 5722*bba2c361STejun Heo if (!scx_is_descendant(scx_task_sched(p), sch)) 5723*bba2c361STejun Heo continue; 5724*bba2c361STejun Heo 5725*bba2c361STejun Heo /* cycling deq/enq is enough, see the function comment */ 5726*bba2c361STejun Heo scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 5727*bba2c361STejun Heo /* nothing */ ; 5728*bba2c361STejun Heo } 5729*bba2c361STejun Heo } 5730*bba2c361STejun Heo 5731*bba2c361STejun Heo /* resched to restore ticks and idle state */ 5732*bba2c361STejun Heo if (cpu_online(cpu) || cpu == smp_processor_id()) 5733*bba2c361STejun Heo resched_curr(rq); 5734*bba2c361STejun Heo 5735*bba2c361STejun Heo raw_spin_rq_unlock(rq); 5736*bba2c361STejun Heo } 5737*bba2c361STejun Heo 5738*bba2c361STejun Heo /* disarming must come after moving all tasks out of the bypass DSQs */ 5739*bba2c361STejun Heo if (!bypass) 5740*bba2c361STejun Heo disable_bypass_dsp(sch); 5741*bba2c361STejun Heo unlock: 5742*bba2c361STejun Heo raw_spin_unlock_irqrestore(&scx_bypass_lock, flags); 5743*bba2c361STejun Heo } 5744*bba2c361STejun Heo 5745*bba2c361STejun Heo static void free_exit_info(struct scx_exit_info *ei) 5746*bba2c361STejun Heo { 5747*bba2c361STejun Heo kvfree(ei->dump); 5748*bba2c361STejun Heo kfree(ei->msg); 5749*bba2c361STejun Heo kfree(ei->bt); 5750*bba2c361STejun Heo kfree(ei); 5751*bba2c361STejun Heo } 5752*bba2c361STejun Heo 5753*bba2c361STejun Heo static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) 5754*bba2c361STejun Heo { 5755*bba2c361STejun Heo struct scx_exit_info *ei; 5756*bba2c361STejun Heo 5757*bba2c361STejun Heo ei = kzalloc_obj(*ei); 5758*bba2c361STejun Heo if (!ei) 5759*bba2c361STejun Heo return NULL; 5760*bba2c361STejun Heo 5761*bba2c361STejun Heo ei->exit_cpu = -1; 5762*bba2c361STejun Heo ei->bt = kzalloc_objs(ei->bt[0], SCX_EXIT_BT_LEN); 
5763*bba2c361STejun Heo ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); 5764*bba2c361STejun Heo ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); 5765*bba2c361STejun Heo 5766*bba2c361STejun Heo if (!ei->bt || !ei->msg || !ei->dump) { 5767*bba2c361STejun Heo free_exit_info(ei); 5768*bba2c361STejun Heo return NULL; 5769*bba2c361STejun Heo } 5770*bba2c361STejun Heo 5771*bba2c361STejun Heo return ei; 5772*bba2c361STejun Heo } 5773*bba2c361STejun Heo 5774*bba2c361STejun Heo static const char *scx_exit_reason(enum scx_exit_kind kind) 5775*bba2c361STejun Heo { 5776*bba2c361STejun Heo switch (kind) { 5777*bba2c361STejun Heo case SCX_EXIT_UNREG: 5778*bba2c361STejun Heo return "unregistered from user space"; 5779*bba2c361STejun Heo case SCX_EXIT_UNREG_BPF: 5780*bba2c361STejun Heo return "unregistered from BPF"; 5781*bba2c361STejun Heo case SCX_EXIT_UNREG_KERN: 5782*bba2c361STejun Heo return "unregistered from the main kernel"; 5783*bba2c361STejun Heo case SCX_EXIT_SYSRQ: 5784*bba2c361STejun Heo return "disabled by sysrq-S"; 5785*bba2c361STejun Heo case SCX_EXIT_PARENT: 5786*bba2c361STejun Heo return "parent exiting"; 5787*bba2c361STejun Heo case SCX_EXIT_ERROR: 5788*bba2c361STejun Heo return "runtime error"; 5789*bba2c361STejun Heo case SCX_EXIT_ERROR_BPF: 5790*bba2c361STejun Heo return "scx_bpf_error"; 5791*bba2c361STejun Heo case SCX_EXIT_ERROR_STALL: 5792*bba2c361STejun Heo return "runnable task stall"; 5793*bba2c361STejun Heo default: 5794*bba2c361STejun Heo return "<UNKNOWN>"; 5795*bba2c361STejun Heo } 5796*bba2c361STejun Heo } 5797*bba2c361STejun Heo 5798*bba2c361STejun Heo static void free_kick_syncs(void) 5799*bba2c361STejun Heo { 5800*bba2c361STejun Heo int cpu; 5801*bba2c361STejun Heo 5802*bba2c361STejun Heo for_each_possible_cpu(cpu) { 5803*bba2c361STejun Heo struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); 5804*bba2c361STejun Heo struct scx_kick_syncs *to_free; 5805*bba2c361STejun Heo 5806*bba2c361STejun Heo to_free = 
rcu_replace_pointer(*ksyncs, NULL, true); 5807*bba2c361STejun Heo if (to_free) 5808*bba2c361STejun Heo kvfree_rcu(to_free, rcu); 5809*bba2c361STejun Heo } 5810*bba2c361STejun Heo } 5811*bba2c361STejun Heo 5812*bba2c361STejun Heo static void refresh_watchdog(void) 5813*bba2c361STejun Heo { 5814*bba2c361STejun Heo struct scx_sched *sch; 5815*bba2c361STejun Heo unsigned long intv = ULONG_MAX; 5816*bba2c361STejun Heo 5817*bba2c361STejun Heo /* take the shortest timeout and use its half for watchdog interval */ 5818*bba2c361STejun Heo rcu_read_lock(); 5819*bba2c361STejun Heo list_for_each_entry_rcu(sch, &scx_sched_all, all) 5820*bba2c361STejun Heo intv = max(min(intv, sch->watchdog_timeout / 2), 1); 5821*bba2c361STejun Heo rcu_read_unlock(); 5822*bba2c361STejun Heo 5823*bba2c361STejun Heo WRITE_ONCE(scx_watchdog_timestamp, jiffies); 5824*bba2c361STejun Heo WRITE_ONCE(scx_watchdog_interval, intv); 5825*bba2c361STejun Heo 5826*bba2c361STejun Heo if (intv < ULONG_MAX) 5827*bba2c361STejun Heo mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv); 5828*bba2c361STejun Heo else 5829*bba2c361STejun Heo cancel_delayed_work_sync(&scx_watchdog_work); 5830*bba2c361STejun Heo } 5831*bba2c361STejun Heo 5832*bba2c361STejun Heo static s32 scx_link_sched(struct scx_sched *sch) 5833*bba2c361STejun Heo { 5834*bba2c361STejun Heo const char *err_msg = ""; 5835*bba2c361STejun Heo s32 ret = 0; 5836*bba2c361STejun Heo 5837*bba2c361STejun Heo scoped_guard(raw_spinlock_irq, &scx_sched_lock) { 5838*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 5839*bba2c361STejun Heo struct scx_sched *parent = scx_parent(sch); 5840*bba2c361STejun Heo 5841*bba2c361STejun Heo if (parent) { 5842*bba2c361STejun Heo /* 5843*bba2c361STejun Heo * scx_claim_exit() propagates exit_kind transition to 5844*bba2c361STejun Heo * its sub-scheds while holding scx_sched_lock - either 5845*bba2c361STejun Heo * we can see the parent's non-NONE exit_kind or the 5846*bba2c361STejun Heo * parent can shoot us down. 
5847*bba2c361STejun Heo */ 5848*bba2c361STejun Heo if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) { 5849*bba2c361STejun Heo err_msg = "parent disabled"; 5850*bba2c361STejun Heo ret = -ENOENT; 5851*bba2c361STejun Heo break; 5852*bba2c361STejun Heo } 5853*bba2c361STejun Heo 5854*bba2c361STejun Heo ret = rhashtable_lookup_insert_fast(&scx_sched_hash, 5855*bba2c361STejun Heo &sch->hash_node, scx_sched_hash_params); 5856*bba2c361STejun Heo if (ret) { 5857*bba2c361STejun Heo err_msg = "failed to insert into scx_sched_hash"; 5858*bba2c361STejun Heo break; 5859*bba2c361STejun Heo } 5860*bba2c361STejun Heo 5861*bba2c361STejun Heo list_add_tail(&sch->sibling, &parent->children); 5862*bba2c361STejun Heo } 5863*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 5864*bba2c361STejun Heo 5865*bba2c361STejun Heo list_add_tail_rcu(&sch->all, &scx_sched_all); 5866*bba2c361STejun Heo } 5867*bba2c361STejun Heo 5868*bba2c361STejun Heo /* 5869*bba2c361STejun Heo * scx_error() takes scx_sched_lock via scx_claim_exit(), so it must run after 5870*bba2c361STejun Heo * the guard above is released. 
5871*bba2c361STejun Heo */ 5872*bba2c361STejun Heo if (ret) { 5873*bba2c361STejun Heo scx_error(sch, "%s (%d)", err_msg, ret); 5874*bba2c361STejun Heo return ret; 5875*bba2c361STejun Heo } 5876*bba2c361STejun Heo 5877*bba2c361STejun Heo refresh_watchdog(); 5878*bba2c361STejun Heo return 0; 5879*bba2c361STejun Heo } 5880*bba2c361STejun Heo 5881*bba2c361STejun Heo static void scx_unlink_sched(struct scx_sched *sch) 5882*bba2c361STejun Heo { 5883*bba2c361STejun Heo scoped_guard(raw_spinlock_irq, &scx_sched_lock) { 5884*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 5885*bba2c361STejun Heo if (scx_parent(sch)) { 5886*bba2c361STejun Heo rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node, 5887*bba2c361STejun Heo scx_sched_hash_params); 5888*bba2c361STejun Heo list_del_init(&sch->sibling); 5889*bba2c361STejun Heo } 5890*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 5891*bba2c361STejun Heo list_del_rcu(&sch->all); 5892*bba2c361STejun Heo } 5893*bba2c361STejun Heo 5894*bba2c361STejun Heo refresh_watchdog(); 5895*bba2c361STejun Heo } 5896*bba2c361STejun Heo 5897*bba2c361STejun Heo /* 5898*bba2c361STejun Heo * Called to disable future dumps and wait for in-progress one while disabling 5899*bba2c361STejun Heo * @sch. Once @sch becomes empty during disable, there's no point in dumping it. 5900*bba2c361STejun Heo * This prevents calling dump ops on a dead sch. 5901*bba2c361STejun Heo */ 5902*bba2c361STejun Heo static void scx_disable_dump(struct scx_sched *sch) 5903*bba2c361STejun Heo { 5904*bba2c361STejun Heo guard(raw_spinlock_irqsave)(&scx_dump_lock); 5905*bba2c361STejun Heo sch->dump_disabled = true; 5906*bba2c361STejun Heo } 5907*bba2c361STejun Heo 5908*bba2c361STejun Heo static void scx_log_sched_disable(struct scx_sched *sch) 5909*bba2c361STejun Heo { 5910*bba2c361STejun Heo struct scx_exit_info *ei = sch->exit_info; 5911*bba2c361STejun Heo const char *type = scx_parent(sch) ? 
"sub-scheduler" : "scheduler"; 5912*bba2c361STejun Heo 5913*bba2c361STejun Heo if (ei->kind >= SCX_EXIT_ERROR) { 5914*bba2c361STejun Heo pr_err("sched_ext: BPF %s \"%s\" disabled (%s)\n", type, 5915*bba2c361STejun Heo sch->ops.name, ei->reason); 5916*bba2c361STejun Heo 5917*bba2c361STejun Heo if (ei->msg[0] != '\0') 5918*bba2c361STejun Heo pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg); 5919*bba2c361STejun Heo #ifdef CONFIG_STACKTRACE 5920*bba2c361STejun Heo stack_trace_print(ei->bt, ei->bt_len, 2); 5921*bba2c361STejun Heo #endif 5922*bba2c361STejun Heo } else { 5923*bba2c361STejun Heo pr_info("sched_ext: BPF %s \"%s\" disabled (%s)\n", type, 5924*bba2c361STejun Heo sch->ops.name, ei->reason); 5925*bba2c361STejun Heo } 5926*bba2c361STejun Heo } 5927*bba2c361STejun Heo 5928*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 5929*bba2c361STejun Heo static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); 5930*bba2c361STejun Heo 5931*bba2c361STejun Heo static void drain_descendants(struct scx_sched *sch) 5932*bba2c361STejun Heo { 5933*bba2c361STejun Heo /* 5934*bba2c361STejun Heo * Child scheds that finished the critical part of disabling will take 5935*bba2c361STejun Heo * themselves off @sch->children. Wait for it to drain. As propagation 5936*bba2c361STejun Heo * is recursive, empty @sch->children means that all proper descendant 5937*bba2c361STejun Heo * scheds reached unlinking stage. 
5938*bba2c361STejun Heo */ 5939*bba2c361STejun Heo wait_event(scx_unlink_waitq, list_empty(&sch->children)); 5940*bba2c361STejun Heo } 5941*bba2c361STejun Heo 5942*bba2c361STejun Heo static void scx_fail_parent(struct scx_sched *sch, 5943*bba2c361STejun Heo struct task_struct *failed, s32 fail_code) 5944*bba2c361STejun Heo { 5945*bba2c361STejun Heo struct scx_sched *parent = scx_parent(sch); 5946*bba2c361STejun Heo struct scx_task_iter sti; 5947*bba2c361STejun Heo struct task_struct *p; 5948*bba2c361STejun Heo 5949*bba2c361STejun Heo scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler", 5950*bba2c361STejun Heo fail_code, failed->comm, failed->pid); 5951*bba2c361STejun Heo 5952*bba2c361STejun Heo /* 5953*bba2c361STejun Heo * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into 5954*bba2c361STejun Heo * it. This may cause downstream failures on the BPF side but $parent is 5955*bba2c361STejun Heo * dying anyway. 5956*bba2c361STejun Heo */ 5957*bba2c361STejun Heo scx_bypass(parent, true); 5958*bba2c361STejun Heo 5959*bba2c361STejun Heo scx_task_iter_start(&sti, sch->cgrp); 5960*bba2c361STejun Heo while ((p = scx_task_iter_next_locked(&sti))) { 5961*bba2c361STejun Heo if (scx_task_on_sched(parent, p)) 5962*bba2c361STejun Heo continue; 5963*bba2c361STejun Heo 5964*bba2c361STejun Heo scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 5965*bba2c361STejun Heo scx_disable_and_exit_task(sch, p); 5966*bba2c361STejun Heo scx_set_task_sched(p, parent); 5967*bba2c361STejun Heo } 5968*bba2c361STejun Heo } 5969*bba2c361STejun Heo scx_task_iter_stop(&sti); 5970*bba2c361STejun Heo } 5971*bba2c361STejun Heo 5972*bba2c361STejun Heo static void scx_sub_disable(struct scx_sched *sch) 5973*bba2c361STejun Heo { 5974*bba2c361STejun Heo struct scx_sched *parent = scx_parent(sch); 5975*bba2c361STejun Heo struct scx_task_iter sti; 5976*bba2c361STejun Heo struct task_struct *p; 5977*bba2c361STejun Heo int ret; 5978*bba2c361STejun 
Heo 5979*bba2c361STejun Heo /* 5980*bba2c361STejun Heo * Guarantee forward progress and wait for descendants to be disabled. 5981*bba2c361STejun Heo * To limit disruptions, $parent is not bypassed. Tasks are fully 5982*bba2c361STejun Heo * prepped and then inserted back into $parent. 5983*bba2c361STejun Heo */ 5984*bba2c361STejun Heo scx_bypass(sch, true); 5985*bba2c361STejun Heo drain_descendants(sch); 5986*bba2c361STejun Heo 5987*bba2c361STejun Heo /* 5988*bba2c361STejun Heo * Here, every runnable task is guaranteed to make forward progress and 5989*bba2c361STejun Heo * we can safely use blocking synchronization constructs. Actually 5990*bba2c361STejun Heo * disable ops. 5991*bba2c361STejun Heo */ 5992*bba2c361STejun Heo mutex_lock(&scx_enable_mutex); 5993*bba2c361STejun Heo percpu_down_write(&scx_fork_rwsem); 5994*bba2c361STejun Heo scx_cgroup_lock(); 5995*bba2c361STejun Heo 5996*bba2c361STejun Heo set_cgroup_sched(sch_cgroup(sch), parent); 5997*bba2c361STejun Heo 5998*bba2c361STejun Heo scx_task_iter_start(&sti, sch->cgrp); 5999*bba2c361STejun Heo while ((p = scx_task_iter_next_locked(&sti))) { 6000*bba2c361STejun Heo struct rq *rq; 6001*bba2c361STejun Heo struct rq_flags rf; 6002*bba2c361STejun Heo 6003*bba2c361STejun Heo /* filter out duplicate visits */ 6004*bba2c361STejun Heo if (scx_task_on_sched(parent, p)) 6005*bba2c361STejun Heo continue; 6006*bba2c361STejun Heo 6007*bba2c361STejun Heo /* 6008*bba2c361STejun Heo * By the time control reaches here, all descendant schedulers 6009*bba2c361STejun Heo * should already have been disabled. 6010*bba2c361STejun Heo */ 6011*bba2c361STejun Heo WARN_ON_ONCE(!scx_task_on_sched(sch, p)); 6012*bba2c361STejun Heo 6013*bba2c361STejun Heo /* 6014*bba2c361STejun Heo * @p is pinned by the iter: css_task_iter_next() takes a 6015*bba2c361STejun Heo * reference and holds it until the next iter_next() call, so 6016*bba2c361STejun Heo * @p->usage is guaranteed > 0. 
6017*bba2c361STejun Heo */ 6018*bba2c361STejun Heo get_task_struct(p); 6019*bba2c361STejun Heo 6020*bba2c361STejun Heo scx_task_iter_unlock(&sti); 6021*bba2c361STejun Heo 6022*bba2c361STejun Heo /* 6023*bba2c361STejun Heo * $p is READY or ENABLED on @sch. Initialize for $parent, 6024*bba2c361STejun Heo * disable and exit from @sch, and then switch over to $parent. 6025*bba2c361STejun Heo * 6026*bba2c361STejun Heo * If a task fails to initialize for $parent, the only available 6027*bba2c361STejun Heo * action is disabling $parent too. While this allows disabling 6028*bba2c361STejun Heo * of a child sched to cause the parent scheduler to fail, the 6029*bba2c361STejun Heo * failure can only originate from ops.init_task() of the 6030*bba2c361STejun Heo * parent. A child can't directly affect the parent through its 6031*bba2c361STejun Heo * own failures. 6032*bba2c361STejun Heo */ 6033*bba2c361STejun Heo ret = __scx_init_task(parent, p, false); 6034*bba2c361STejun Heo if (ret) { 6035*bba2c361STejun Heo scx_fail_parent(sch, p, ret); 6036*bba2c361STejun Heo put_task_struct(p); 6037*bba2c361STejun Heo break; 6038*bba2c361STejun Heo } 6039*bba2c361STejun Heo 6040*bba2c361STejun Heo rq = task_rq_lock(p, &rf); 6041*bba2c361STejun Heo 6042*bba2c361STejun Heo if (scx_get_task_state(p) == SCX_TASK_DEAD) { 6043*bba2c361STejun Heo /* 6044*bba2c361STejun Heo * sched_ext_dead() raced us between __scx_init_task() 6045*bba2c361STejun Heo * and this rq lock and ran exit_task() on @sch (the 6046*bba2c361STejun Heo * sched @p was on at that point), not on $parent. 6047*bba2c361STejun Heo * $parent's just-completed init is owed an exit_task() 6048*bba2c361STejun Heo * and we issue it here. 
6049*bba2c361STejun Heo */ 6050*bba2c361STejun Heo scx_sub_init_cancel_task(parent, p); 6051*bba2c361STejun Heo task_rq_unlock(rq, p, &rf); 6052*bba2c361STejun Heo put_task_struct(p); 6053*bba2c361STejun Heo continue; 6054*bba2c361STejun Heo } 6055*bba2c361STejun Heo 6056*bba2c361STejun Heo scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 6057*bba2c361STejun Heo /* 6058*bba2c361STejun Heo * $p is initialized for $parent and still attached to 6059*bba2c361STejun Heo * @sch. Disable and exit for @sch, switch over to 6060*bba2c361STejun Heo * $parent, override the state to READY to account for 6061*bba2c361STejun Heo * $p having already been initialized, and then enable. 6062*bba2c361STejun Heo */ 6063*bba2c361STejun Heo scx_disable_and_exit_task(sch, p); 6064*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 6065*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_INIT); 6066*bba2c361STejun Heo scx_set_task_sched(p, parent); 6067*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_READY); 6068*bba2c361STejun Heo scx_enable_task(parent, p); 6069*bba2c361STejun Heo } 6070*bba2c361STejun Heo 6071*bba2c361STejun Heo task_rq_unlock(rq, p, &rf); 6072*bba2c361STejun Heo put_task_struct(p); 6073*bba2c361STejun Heo } 6074*bba2c361STejun Heo scx_task_iter_stop(&sti); 6075*bba2c361STejun Heo 6076*bba2c361STejun Heo scx_disable_dump(sch); 6077*bba2c361STejun Heo 6078*bba2c361STejun Heo scx_cgroup_unlock(); 6079*bba2c361STejun Heo percpu_up_write(&scx_fork_rwsem); 6080*bba2c361STejun Heo 6081*bba2c361STejun Heo /* 6082*bba2c361STejun Heo * All tasks are moved off of @sch but there may still be on-going 6083*bba2c361STejun Heo * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use 6084*bba2c361STejun Heo * the expedited version as ancestors may be waiting in bypass mode. 6085*bba2c361STejun Heo * Also, tell the parent that there is no need to keep running bypass 6086*bba2c361STejun Heo * DSQs for us. 
6087*bba2c361STejun Heo */ 6088*bba2c361STejun Heo synchronize_rcu_expedited(); 6089*bba2c361STejun Heo disable_bypass_dsp(sch); 6090*bba2c361STejun Heo 6091*bba2c361STejun Heo scx_unlink_sched(sch); 6092*bba2c361STejun Heo 6093*bba2c361STejun Heo mutex_unlock(&scx_enable_mutex); 6094*bba2c361STejun Heo 6095*bba2c361STejun Heo /* 6096*bba2c361STejun Heo * @sch is now unlinked from the parent's children list. Notify and call 6097*bba2c361STejun Heo * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called 6098*bba2c361STejun Heo * after unlinking and releasing all locks. See scx_claim_exit(). 6099*bba2c361STejun Heo */ 6100*bba2c361STejun Heo wake_up_all(&scx_unlink_waitq); 6101*bba2c361STejun Heo 6102*bba2c361STejun Heo if (parent->ops.sub_detach && sch->sub_attached) { 6103*bba2c361STejun Heo struct scx_sub_detach_args sub_detach_args = { 6104*bba2c361STejun Heo .ops = &sch->ops, 6105*bba2c361STejun Heo .cgroup_path = sch->cgrp_path, 6106*bba2c361STejun Heo }; 6107*bba2c361STejun Heo SCX_CALL_OP(parent, sub_detach, NULL, 6108*bba2c361STejun Heo &sub_detach_args); 6109*bba2c361STejun Heo } 6110*bba2c361STejun Heo 6111*bba2c361STejun Heo scx_log_sched_disable(sch); 6112*bba2c361STejun Heo 6113*bba2c361STejun Heo if (sch->ops.exit) 6114*bba2c361STejun Heo SCX_CALL_OP(sch, exit, NULL, sch->exit_info); 6115*bba2c361STejun Heo if (sch->sub_kset) 6116*bba2c361STejun Heo kobject_del(&sch->sub_kset->kobj); 6117*bba2c361STejun Heo kobject_del(&sch->kobj); 6118*bba2c361STejun Heo } 6119*bba2c361STejun Heo #else /* CONFIG_EXT_SUB_SCHED */ 6120*bba2c361STejun Heo static inline void drain_descendants(struct scx_sched *sch) { } 6121*bba2c361STejun Heo static inline void scx_sub_disable(struct scx_sched *sch) { } 6122*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 6123*bba2c361STejun Heo 6124*bba2c361STejun Heo static void scx_root_disable(struct scx_sched *sch) 6125*bba2c361STejun Heo { 6126*bba2c361STejun Heo struct scx_task_iter sti; 6127*bba2c361STejun Heo 
struct task_struct *p; 6128*bba2c361STejun Heo bool was_switched_all; 6129*bba2c361STejun Heo int cpu; 6130*bba2c361STejun Heo 6131*bba2c361STejun Heo /* guarantee forward progress and wait for descendants to be disabled */ 6132*bba2c361STejun Heo scx_bypass(sch, true); 6133*bba2c361STejun Heo drain_descendants(sch); 6134*bba2c361STejun Heo 6135*bba2c361STejun Heo switch (scx_set_enable_state(SCX_DISABLING)) { 6136*bba2c361STejun Heo case SCX_DISABLING: 6137*bba2c361STejun Heo WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); 6138*bba2c361STejun Heo break; 6139*bba2c361STejun Heo case SCX_DISABLED: 6140*bba2c361STejun Heo pr_warn("sched_ext: ops error detected without ops (%s)\n", 6141*bba2c361STejun Heo sch->exit_info->msg); 6142*bba2c361STejun Heo WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); 6143*bba2c361STejun Heo goto done; 6144*bba2c361STejun Heo default: 6145*bba2c361STejun Heo break; 6146*bba2c361STejun Heo } 6147*bba2c361STejun Heo 6148*bba2c361STejun Heo /* 6149*bba2c361STejun Heo * Here, every runnable task is guaranteed to make forward progress and 6150*bba2c361STejun Heo * we can safely use blocking synchronization constructs. Actually 6151*bba2c361STejun Heo * disable ops. 6152*bba2c361STejun Heo */ 6153*bba2c361STejun Heo mutex_lock(&scx_enable_mutex); 6154*bba2c361STejun Heo 6155*bba2c361STejun Heo was_switched_all = scx_switched_all(); 6156*bba2c361STejun Heo 6157*bba2c361STejun Heo static_branch_disable(&__scx_switched_all); 6158*bba2c361STejun Heo WRITE_ONCE(scx_switching_all, false); 6159*bba2c361STejun Heo 6160*bba2c361STejun Heo /* 6161*bba2c361STejun Heo * Shut down cgroup support before tasks so that the cgroup attach path 6162*bba2c361STejun Heo * doesn't race against scx_disable_and_exit_task(). 
6163*bba2c361STejun Heo */ 6164*bba2c361STejun Heo scx_cgroup_lock(); 6165*bba2c361STejun Heo scx_cgroup_exit(sch); 6166*bba2c361STejun Heo scx_cgroup_unlock(); 6167*bba2c361STejun Heo 6168*bba2c361STejun Heo /* 6169*bba2c361STejun Heo * The BPF scheduler is going away. All tasks including %TASK_DEAD ones 6170*bba2c361STejun Heo * must be switched out and exited synchronously. 6171*bba2c361STejun Heo */ 6172*bba2c361STejun Heo percpu_down_write(&scx_fork_rwsem); 6173*bba2c361STejun Heo 6174*bba2c361STejun Heo scx_init_task_enabled = false; 6175*bba2c361STejun Heo 6176*bba2c361STejun Heo scx_task_iter_start(&sti, NULL); 6177*bba2c361STejun Heo while ((p = scx_task_iter_next_locked(&sti))) { 6178*bba2c361STejun Heo unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 6179*bba2c361STejun Heo const struct sched_class *old_class = p->sched_class; 6180*bba2c361STejun Heo const struct sched_class *new_class = scx_setscheduler_class(p); 6181*bba2c361STejun Heo 6182*bba2c361STejun Heo update_rq_clock(task_rq(p)); 6183*bba2c361STejun Heo 6184*bba2c361STejun Heo if (old_class != new_class) 6185*bba2c361STejun Heo queue_flags |= DEQUEUE_CLASS; 6186*bba2c361STejun Heo 6187*bba2c361STejun Heo scoped_guard (sched_change, p, queue_flags) { 6188*bba2c361STejun Heo p->sched_class = new_class; 6189*bba2c361STejun Heo } 6190*bba2c361STejun Heo 6191*bba2c361STejun Heo scx_disable_and_exit_task(scx_task_sched(p), p); 6192*bba2c361STejun Heo } 6193*bba2c361STejun Heo scx_task_iter_stop(&sti); 6194*bba2c361STejun Heo 6195*bba2c361STejun Heo scx_disable_dump(sch); 6196*bba2c361STejun Heo 6197*bba2c361STejun Heo scx_cgroup_lock(); 6198*bba2c361STejun Heo set_cgroup_sched(sch_cgroup(sch), NULL); 6199*bba2c361STejun Heo scx_cgroup_unlock(); 6200*bba2c361STejun Heo 6201*bba2c361STejun Heo percpu_up_write(&scx_fork_rwsem); 6202*bba2c361STejun Heo 6203*bba2c361STejun Heo /* 6204*bba2c361STejun Heo * Invalidate all the rq clocks to prevent getting outdated 6205*bba2c361STejun 
Heo * rq clocks from a previous scx scheduler. 6206*bba2c361STejun Heo * 6207*bba2c361STejun Heo * Also re-balance the dl_server bandwidth reservations: detach 6208*bba2c361STejun Heo * ext_server (no more sched_ext tasks) and reinstate fair_server if it 6209*bba2c361STejun Heo * was previously detached because we were running in full mode. 6210*bba2c361STejun Heo * 6211*bba2c361STejun Heo * Unlike the enable path, this runs on a recovery path that cannot 6212*bba2c361STejun Heo * fail, so we use dl_server_swap_bw() to atomically free ext_server's 6213*bba2c361STejun Heo * bandwidth and reclaim it for fair_server under the same dl_b lock. 6214*bba2c361STejun Heo * 6215*bba2c361STejun Heo * The swap can still fail with -EBUSY if someone bumped ext_server's 6216*bba2c361STejun Heo * runtime via debugfs between enable and disable; in that narrow case 6217*bba2c361STejun Heo * both servers end up detached and we just WARN. 6218*bba2c361STejun Heo */ 6219*bba2c361STejun Heo for_each_possible_cpu(cpu) { 6220*bba2c361STejun Heo struct rq *rq = cpu_rq(cpu); 6221*bba2c361STejun Heo 6222*bba2c361STejun Heo scx_rq_clock_invalidate(rq); 6223*bba2c361STejun Heo 6224*bba2c361STejun Heo scoped_guard(rq_lock_irqsave, rq) { 6225*bba2c361STejun Heo update_rq_clock(rq); 6226*bba2c361STejun Heo if (was_switched_all) { 6227*bba2c361STejun Heo if (WARN_ON_ONCE(dl_server_swap_bw(&rq->ext_server, 6228*bba2c361STejun Heo &rq->fair_server))) 6229*bba2c361STejun Heo pr_warn("failed to re-attach fair_server on CPU %d\n", cpu); 6230*bba2c361STejun Heo } else { 6231*bba2c361STejun Heo dl_server_detach_bw(&rq->ext_server); 6232*bba2c361STejun Heo } 6233*bba2c361STejun Heo } 6234*bba2c361STejun Heo } 6235*bba2c361STejun Heo 6236*bba2c361STejun Heo /* no task is on scx, turn off all the switches and flush in-progress calls */ 6237*bba2c361STejun Heo static_branch_disable(&__scx_enabled); 6238*bba2c361STejun Heo static_branch_disable(&__scx_is_cid_type); 6239*bba2c361STejun Heo if (sch->ops.flags & 
SCX_OPS_TID_TO_TASK) 6240*bba2c361STejun Heo static_branch_disable(&__scx_tid_to_task_enabled); 6241*bba2c361STejun Heo bitmap_zero(sch->has_op, SCX_OPI_END); 6242*bba2c361STejun Heo scx_idle_disable(); 6243*bba2c361STejun Heo synchronize_rcu(); 6244*bba2c361STejun Heo if (sch->ops.flags & SCX_OPS_TID_TO_TASK) 6245*bba2c361STejun Heo rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL); 6246*bba2c361STejun Heo 6247*bba2c361STejun Heo scx_log_sched_disable(sch); 6248*bba2c361STejun Heo 6249*bba2c361STejun Heo if (sch->ops.exit) 6250*bba2c361STejun Heo SCX_CALL_OP(sch, exit, NULL, sch->exit_info); 6251*bba2c361STejun Heo 6252*bba2c361STejun Heo scx_unlink_sched(sch); 6253*bba2c361STejun Heo 6254*bba2c361STejun Heo /* 6255*bba2c361STejun Heo * scx_root clearing must be inside cpus_read_lock(). See 6256*bba2c361STejun Heo * handle_hotplug(). 6257*bba2c361STejun Heo */ 6258*bba2c361STejun Heo cpus_read_lock(); 6259*bba2c361STejun Heo RCU_INIT_POINTER(scx_root, NULL); 6260*bba2c361STejun Heo cpus_read_unlock(); 6261*bba2c361STejun Heo 6262*bba2c361STejun Heo /* 6263*bba2c361STejun Heo * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs 6264*bba2c361STejun Heo * could observe an object of the same name still in the hierarchy when 6265*bba2c361STejun Heo * the next scheduler is loaded. 
6266*bba2c361STejun Heo */ 6267*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 6268*bba2c361STejun Heo if (sch->sub_kset) 6269*bba2c361STejun Heo kobject_del(&sch->sub_kset->kobj); 6270*bba2c361STejun Heo #endif 6271*bba2c361STejun Heo kobject_del(&sch->kobj); 6272*bba2c361STejun Heo 6273*bba2c361STejun Heo free_kick_syncs(); 6274*bba2c361STejun Heo 6275*bba2c361STejun Heo mutex_unlock(&scx_enable_mutex); 6276*bba2c361STejun Heo 6277*bba2c361STejun Heo WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); 6278*bba2c361STejun Heo done: 6279*bba2c361STejun Heo scx_bypass(sch, false); 6280*bba2c361STejun Heo } 6281*bba2c361STejun Heo 6282*bba2c361STejun Heo /* 6283*bba2c361STejun Heo * Claim the exit on @sch. The caller must ensure that the helper kthread work 6284*bba2c361STejun Heo * is kicked before the current task can be preempted. Once exit_kind is 6285*bba2c361STejun Heo * claimed, scx_error() can no longer trigger, so if the current task gets 6286*bba2c361STejun Heo * preempted and the BPF scheduler fails to schedule it back, the helper work 6287*bba2c361STejun Heo * will never be kicked and the whole system can wedge. 6288*bba2c361STejun Heo */ 6289*bba2c361STejun Heo static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) 6290*bba2c361STejun Heo { 6291*bba2c361STejun Heo int none = SCX_EXIT_NONE; 6292*bba2c361STejun Heo 6293*bba2c361STejun Heo lockdep_assert_preemption_disabled(); 6294*bba2c361STejun Heo 6295*bba2c361STejun Heo if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) 6296*bba2c361STejun Heo kind = SCX_EXIT_ERROR; 6297*bba2c361STejun Heo 6298*bba2c361STejun Heo if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) 6299*bba2c361STejun Heo return false; 6300*bba2c361STejun Heo 6301*bba2c361STejun Heo /* 6302*bba2c361STejun Heo * Some CPUs may be trapped in the dispatch paths. 
Set the aborting 6303*bba2c361STejun Heo * flag to break potential live-lock scenarios, ensuring we can 6304*bba2c361STejun Heo * successfully reach scx_bypass(). 6305*bba2c361STejun Heo */ 6306*bba2c361STejun Heo WRITE_ONCE(sch->aborting, true); 6307*bba2c361STejun Heo 6308*bba2c361STejun Heo /* 6309*bba2c361STejun Heo * Propagate exits to descendants immediately. Each has a dedicated 6310*bba2c361STejun Heo * helper kthread and can run in parallel. While most of disabling is 6311*bba2c361STejun Heo * serialized, running them in separate threads allows parallelizing 6312*bba2c361STejun Heo * ops.exit(), which can take arbitrarily long prolonging bypass mode. 6313*bba2c361STejun Heo * 6314*bba2c361STejun Heo * To guarantee forward progress, this propagation must be in-line so 6315*bba2c361STejun Heo * that ->aborting is synchronously asserted for all sub-scheds. The 6316*bba2c361STejun Heo * propagation is also the interlocking point against sub-sched 6317*bba2c361STejun Heo * attachment. See scx_link_sched(). 6318*bba2c361STejun Heo * 6319*bba2c361STejun Heo * This doesn't cause recursions as propagation only takes place for 6320*bba2c361STejun Heo * non-propagation exits. 
6321*bba2c361STejun Heo */ 6322*bba2c361STejun Heo if (kind != SCX_EXIT_PARENT) { 6323*bba2c361STejun Heo scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) { 6324*bba2c361STejun Heo struct scx_sched *pos; 6325*bba2c361STejun Heo scx_for_each_descendant_pre(pos, sch) 6326*bba2c361STejun Heo scx_disable(pos, SCX_EXIT_PARENT); 6327*bba2c361STejun Heo } 6328*bba2c361STejun Heo } 6329*bba2c361STejun Heo 6330*bba2c361STejun Heo return true; 6331*bba2c361STejun Heo } 6332*bba2c361STejun Heo 6333*bba2c361STejun Heo static void scx_disable_workfn(struct kthread_work *work) 6334*bba2c361STejun Heo { 6335*bba2c361STejun Heo struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); 6336*bba2c361STejun Heo struct scx_exit_info *ei = sch->exit_info; 6337*bba2c361STejun Heo int kind; 6338*bba2c361STejun Heo 6339*bba2c361STejun Heo kind = atomic_read(&sch->exit_kind); 6340*bba2c361STejun Heo while (true) { 6341*bba2c361STejun Heo if (kind == SCX_EXIT_DONE) /* already disabled? */ 6342*bba2c361STejun Heo return; 6343*bba2c361STejun Heo WARN_ON_ONCE(kind == SCX_EXIT_NONE); 6344*bba2c361STejun Heo if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) 6345*bba2c361STejun Heo break; 6346*bba2c361STejun Heo } 6347*bba2c361STejun Heo ei->kind = kind; 6348*bba2c361STejun Heo ei->reason = scx_exit_reason(ei->kind); 6349*bba2c361STejun Heo 6350*bba2c361STejun Heo if (scx_parent(sch)) 6351*bba2c361STejun Heo scx_sub_disable(sch); 6352*bba2c361STejun Heo else 6353*bba2c361STejun Heo scx_root_disable(sch); 6354*bba2c361STejun Heo } 6355*bba2c361STejun Heo 6356*bba2c361STejun Heo static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) 6357*bba2c361STejun Heo { 6358*bba2c361STejun Heo guard(preempt)(); 6359*bba2c361STejun Heo if (scx_claim_exit(sch, kind)) 6360*bba2c361STejun Heo irq_work_queue(&sch->disable_irq_work); 6361*bba2c361STejun Heo } 6362*bba2c361STejun Heo 6363*bba2c361STejun Heo /** 6364*bba2c361STejun Heo * scx_flush_disable_work - 
flush the disable work and wait for it to finish 6365*bba2c361STejun Heo * @sch: the scheduler 6366*bba2c361STejun Heo * 6367*bba2c361STejun Heo * sch->disable_work might not be queued yet, making kthread_flush_work() 6368*bba2c361STejun Heo * a noop. Syncing the irq_work first is required to guarantee the 6369*bba2c361STejun Heo * kthread work has been queued before waiting for it. 6370*bba2c361STejun Heo */ 6371*bba2c361STejun Heo static void scx_flush_disable_work(struct scx_sched *sch) 6372*bba2c361STejun Heo { 6373*bba2c361STejun Heo int kind; 6374*bba2c361STejun Heo 6375*bba2c361STejun Heo do { 6376*bba2c361STejun Heo irq_work_sync(&sch->disable_irq_work); 6377*bba2c361STejun Heo kthread_flush_work(&sch->disable_work); 6378*bba2c361STejun Heo kind = atomic_read(&sch->exit_kind); 6379*bba2c361STejun Heo } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE); 6380*bba2c361STejun Heo } 6381*bba2c361STejun Heo 6382*bba2c361STejun Heo static void dump_newline(struct seq_buf *s) 6383*bba2c361STejun Heo { 6384*bba2c361STejun Heo trace_sched_ext_dump(""); 6385*bba2c361STejun Heo 6386*bba2c361STejun Heo /* @s may be zero sized and seq_buf triggers WARN if so */ 6387*bba2c361STejun Heo if (s->size) 6388*bba2c361STejun Heo seq_buf_putc(s, '\n'); 6389*bba2c361STejun Heo } 6390*bba2c361STejun Heo 6391*bba2c361STejun Heo static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) 
6392*bba2c361STejun Heo { 6393*bba2c361STejun Heo va_list args; 6394*bba2c361STejun Heo 6395*bba2c361STejun Heo /* @fmt may be formatted twice, so @args is restarted for each consumer: first the sched_ext_dump tracepoint (only when it is enabled), then @s */ #ifdef CONFIG_TRACEPOINTS 6396*bba2c361STejun Heo if (trace_sched_ext_dump_enabled()) { 6397*bba2c361STejun Heo /* protected by scx_dump_lock */ 6398*bba2c361STejun Heo static char line_buf[SCX_EXIT_MSG_LEN]; 6399*bba2c361STejun Heo 6400*bba2c361STejun Heo va_start(args, fmt); 6401*bba2c361STejun Heo vscnprintf(line_buf, sizeof(line_buf), fmt, args); 6402*bba2c361STejun Heo va_end(args); 6403*bba2c361STejun Heo 6404*bba2c361STejun Heo trace_call__sched_ext_dump(line_buf); 6405*bba2c361STejun Heo } 6406*bba2c361STejun Heo #endif 6407*bba2c361STejun Heo /* @s may be zero sized and seq_buf triggers WARN if so */ 6408*bba2c361STejun Heo if (s->size) { 6409*bba2c361STejun Heo va_start(args, fmt); 6410*bba2c361STejun Heo seq_buf_vprintf(s, fmt, args); 6411*bba2c361STejun Heo va_end(args); 6412*bba2c361STejun Heo 6413*bba2c361STejun Heo seq_buf_putc(s, '\n'); 6414*bba2c361STejun Heo } 6415*bba2c361STejun Heo } 6416*bba2c361STejun Heo 6417*bba2c361STejun Heo /* Emit one dump line per backtrace entry: for each of the @len addresses in @bt, print @prefix followed by the %pS rendering of the address. */ static void dump_stack_trace(struct seq_buf *s, const char *prefix, 6418*bba2c361STejun Heo const unsigned long *bt, unsigned int len) 6419*bba2c361STejun Heo { 6420*bba2c361STejun Heo unsigned int i; 6421*bba2c361STejun Heo 6422*bba2c361STejun Heo for (i = 0; i < len; i++) 6423*bba2c361STejun Heo dump_line(s, "%s%pS", prefix, (void *)bt[i]); 6424*bba2c361STejun Heo } 6425*bba2c361STejun Heo 6426*bba2c361STejun Heo /* Prepare the global scx_dump_data for an ops dump callback: record the current CPU, mark the next flush as the first one, reset the cursor, and remember the destination seq_buf @s and the per-line @prefix. Asserts that IRQs are disabled. */ static void ops_dump_init(struct seq_buf *s, const char *prefix) 6427*bba2c361STejun Heo { 6428*bba2c361STejun Heo struct scx_dump_data *dd = &scx_dump_data; 6429*bba2c361STejun Heo 6430*bba2c361STejun Heo lockdep_assert_irqs_disabled(); 6431*bba2c361STejun Heo 6432*bba2c361STejun Heo dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ 6433*bba2c361STejun Heo dd->first = true; 6434*bba2c361STejun Heo dd->cursor = 0; 6435*bba2c361STejun Heo dd->s = s; 6436*bba2c361STejun Heo dd->prefix = 
prefix; 6437*bba2c361STejun Heo } 6438*bba2c361STejun Heo 6439*bba2c361STejun Heo /* Emit the text buffered in scx_dump_data.buf.line to dd->s through dump_line(), one output line per embedded line with dd->prefix prepended, then reset the cursor to zero. */ static void ops_dump_flush(void) 6440*bba2c361STejun Heo { 6441*bba2c361STejun Heo struct scx_dump_data *dd = &scx_dump_data; 6442*bba2c361STejun Heo char *line = dd->buf.line; 6443*bba2c361STejun Heo 6444*bba2c361STejun Heo /* a zero cursor means nothing is buffered */ if (!dd->cursor) 6445*bba2c361STejun Heo return; 6446*bba2c361STejun Heo 6447*bba2c361STejun Heo /* 6448*bba2c361STejun Heo * There's something to flush and this is the first line. Insert a blank 6449*bba2c361STejun Heo * line to distinguish ops dump. 6450*bba2c361STejun Heo */ 6451*bba2c361STejun Heo if (dd->first) { 6452*bba2c361STejun Heo dump_newline(dd->s); 6453*bba2c361STejun Heo dd->first = false; 6454*bba2c361STejun Heo } 6455*bba2c361STejun Heo 6456*bba2c361STejun Heo /* 6457*bba2c361STejun Heo * There may be multiple lines in $line. Scan and emit each line 6458*bba2c361STejun Heo * separately. 6459*bba2c361STejun Heo */ 6460*bba2c361STejun Heo while (true) { 6461*bba2c361STejun Heo char *end = line; 6462*bba2c361STejun Heo char c; 6463*bba2c361STejun Heo 6464*bba2c361STejun Heo while (*end != '\n' && *end != '\0') 6465*bba2c361STejun Heo end++; 6466*bba2c361STejun Heo 6467*bba2c361STejun Heo /* 6468*bba2c361STejun Heo * If $line overflowed, it may not have newline at the end. 6469*bba2c361STejun Heo * Always emit with a newline. 
6470*bba2c361STejun Heo */ 6471*bba2c361STejun Heo c = *end; 6472*bba2c361STejun Heo *end = '\0'; 6473*bba2c361STejun Heo dump_line(dd->s, "%s%s", dd->prefix, line); 6474*bba2c361STejun Heo if (c == '\0') 6475*bba2c361STejun Heo break; 6476*bba2c361STejun Heo 6477*bba2c361STejun Heo /* move to the next line */ 6478*bba2c361STejun Heo end++; 6479*bba2c361STejun Heo if (*end == '\0') 6480*bba2c361STejun Heo break; 6481*bba2c361STejun Heo line = end; 6482*bba2c361STejun Heo } 6483*bba2c361STejun Heo 6484*bba2c361STejun Heo dd->cursor = 0; 6485*bba2c361STejun Heo } 6486*bba2c361STejun Heo 6487*bba2c361STejun Heo /* Finish an ops dump callback: flush whatever is still buffered and set scx_dump_data.cpu to -1. */ static void ops_dump_exit(void) 6488*bba2c361STejun Heo { 6489*bba2c361STejun Heo ops_dump_flush(); 6490*bba2c361STejun Heo scx_dump_data.cpu = -1; 6491*bba2c361STejun Heo } 6492*bba2c361STejun Heo 6493*bba2c361STejun Heo /* Dump the state of task @p into @s: a header line starting with @marker, lines with SCX state, flags, DSQ and cpumask details, the output of ops.dump_task() when @sch implements it, and a stack trace when one could be captured. The owning scheduler is printed with a leading asterisk when it is @sch itself. */ static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx, 6494*bba2c361STejun Heo struct rq *rq, struct task_struct *p, char marker) 6495*bba2c361STejun Heo { 6496*bba2c361STejun Heo static unsigned long bt[SCX_EXIT_BT_LEN]; 6497*bba2c361STejun Heo struct scx_sched *task_sch = scx_task_sched(p); 6498*bba2c361STejun Heo const char *own_marker; 6499*bba2c361STejun Heo char sch_id_buf[32]; 6500*bba2c361STejun Heo /* sized for 0x + 16 hex digits + NUL; keeps the placeholder when @p is on no DSQ */ char dsq_id_buf[19] = "(n/a)"; 6501*bba2c361STejun Heo unsigned long ops_state = atomic_long_read(&p->scx.ops_state); 6502*bba2c361STejun Heo unsigned int bt_len = 0; 6503*bba2c361STejun Heo 6504*bba2c361STejun Heo own_marker = task_sch == sch ? 
"*" : ""; 6505*bba2c361STejun Heo 6506*bba2c361STejun Heo if (task_sch->level == 0) 6507*bba2c361STejun Heo scnprintf(sch_id_buf, sizeof(sch_id_buf), "root"); 6508*bba2c361STejun Heo else 6509*bba2c361STejun Heo scnprintf(sch_id_buf, sizeof(sch_id_buf), "sub%d-%llu", 6510*bba2c361STejun Heo task_sch->level, task_sch->ops.sub_cgroup_id); 6511*bba2c361STejun Heo 6512*bba2c361STejun Heo if (p->scx.dsq) 6513*bba2c361STejun Heo scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", 6514*bba2c361STejun Heo (unsigned long long)p->scx.dsq->id); 6515*bba2c361STejun Heo 6516*bba2c361STejun Heo dump_newline(s); 6517*bba2c361STejun Heo dump_line(s, " %c%c %s[%d] %s%s %+ldms", 6518*bba2c361STejun Heo marker, task_state_to_char(p), p->comm, p->pid, 6519*bba2c361STejun Heo own_marker, sch_id_buf, 6520*bba2c361STejun Heo jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); 6521*bba2c361STejun Heo dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", 6522*bba2c361STejun Heo scx_get_task_state(p) >> SCX_TASK_STATE_SHIFT, 6523*bba2c361STejun Heo p->scx.flags & ~SCX_TASK_STATE_MASK, 6524*bba2c361STejun Heo p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, 6525*bba2c361STejun Heo ops_state >> SCX_OPSS_QSEQ_SHIFT); 6526*bba2c361STejun Heo dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", 6527*bba2c361STejun Heo p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); 6528*bba2c361STejun Heo dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", 6529*bba2c361STejun Heo p->scx.dsq_vtime, p->scx.slice, p->scx.weight); 6530*bba2c361STejun Heo dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr), 6531*bba2c361STejun Heo p->migration_disabled); 6532*bba2c361STejun Heo 6533*bba2c361STejun Heo if (SCX_HAS_OP(sch, dump_task)) { 6534*bba2c361STejun Heo ops_dump_init(s, " "); 6535*bba2c361STejun Heo SCX_CALL_OP(sch, dump_task, rq, dctx, p); 6536*bba2c361STejun Heo ops_dump_exit(); 6537*bba2c361STejun Heo } 6538*bba2c361STejun Heo 6539*bba2c361STejun Heo #ifdef 
CONFIG_STACKTRACE 6540*bba2c361STejun Heo bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); 6541*bba2c361STejun Heo #endif 6542*bba2c361STejun Heo /* bt_len stays 0 without CONFIG_STACKTRACE, so no trace section is printed */ if (bt_len) { 6543*bba2c361STejun Heo dump_newline(s); 6544*bba2c361STejun Heo dump_stack_trace(s, " ", bt, bt_len); 6545*bba2c361STejun Heo } 6546*bba2c361STejun Heo } 6547*bba2c361STejun Heo 6548*bba2c361STejun Heo /* Dump the state of @cpu into @s while holding its rq lock: the standard per-CPU lines, the output of ops.dump_cpu() when @sch implements it, then the current task and every task on the runnable list. Tasks are limited to those on @sch unless @dump_all_tasks is set. */ static void scx_dump_cpu(struct scx_sched *sch, struct seq_buf *s, 6549*bba2c361STejun Heo struct scx_dump_ctx *dctx, int cpu, 6550*bba2c361STejun Heo bool dump_all_tasks) 6551*bba2c361STejun Heo { 6552*bba2c361STejun Heo struct rq *rq = cpu_rq(cpu); 6553*bba2c361STejun Heo struct rq_flags rf; 6554*bba2c361STejun Heo struct task_struct *p; 6555*bba2c361STejun Heo struct seq_buf ns; 6556*bba2c361STejun Heo size_t avail, used; 6557*bba2c361STejun Heo char *buf; 6558*bba2c361STejun Heo bool idle; 6559*bba2c361STejun Heo 6560*bba2c361STejun Heo rq_lock_irqsave(rq, &rf); 6561*bba2c361STejun Heo 6562*bba2c361STejun Heo /* idle: the SCX runnable list is empty and the idle class task is current */ idle = list_empty(&rq->scx.runnable_list) && 6563*bba2c361STejun Heo rq->curr->sched_class == &idle_sched_class; 6564*bba2c361STejun Heo 6565*bba2c361STejun Heo /* idle CPU and no ops.dump_cpu() to consult: nothing to report */ if (idle && !SCX_HAS_OP(sch, dump_cpu)) 6566*bba2c361STejun Heo goto next; 6567*bba2c361STejun Heo 6568*bba2c361STejun Heo /* 6569*bba2c361STejun Heo * We don't yet know whether ops.dump_cpu() will produce output 6570*bba2c361STejun Heo * and we may want to skip the default CPU dump if it doesn't. 6571*bba2c361STejun Heo * Use a nested seq_buf to generate the standard dump so that we 6572*bba2c361STejun Heo * can decide whether to commit later. 
6573*bba2c361STejun Heo */ 6574*bba2c361STejun Heo avail = seq_buf_get_buf(s, &buf); 6575*bba2c361STejun Heo seq_buf_init(&ns, buf, avail); 6576*bba2c361STejun Heo 6577*bba2c361STejun Heo dump_newline(&ns); 6578*bba2c361STejun Heo dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu", 6579*bba2c361STejun Heo cpu, rq->scx.nr_running, rq->scx.flags, 6580*bba2c361STejun Heo rq->scx.cpu_released, rq->scx.ops_qseq, 6581*bba2c361STejun Heo rq->scx.kick_sync); 6582*bba2c361STejun Heo dump_line(&ns, " curr=%s[%d] class=%ps", 6583*bba2c361STejun Heo rq->curr->comm, rq->curr->pid, 6584*bba2c361STejun Heo rq->curr->sched_class); 6585*bba2c361STejun Heo if (!cpumask_empty(rq->scx.cpus_to_kick)) 6586*bba2c361STejun Heo dump_line(&ns, " cpus_to_kick : %*pb", 6587*bba2c361STejun Heo cpumask_pr_args(rq->scx.cpus_to_kick)); 6588*bba2c361STejun Heo if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) 6589*bba2c361STejun Heo dump_line(&ns, " idle_to_kick : %*pb", 6590*bba2c361STejun Heo cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); 6591*bba2c361STejun Heo if (!cpumask_empty(rq->scx.cpus_to_preempt)) 6592*bba2c361STejun Heo dump_line(&ns, " cpus_to_preempt: %*pb", 6593*bba2c361STejun Heo cpumask_pr_args(rq->scx.cpus_to_preempt)); 6594*bba2c361STejun Heo if (!cpumask_empty(rq->scx.cpus_to_wait)) 6595*bba2c361STejun Heo dump_line(&ns, " cpus_to_wait : %*pb", 6596*bba2c361STejun Heo cpumask_pr_args(rq->scx.cpus_to_wait)); 6597*bba2c361STejun Heo if (!cpumask_empty(rq->scx.cpus_to_sync)) 6598*bba2c361STejun Heo dump_line(&ns, " cpus_to_sync : %*pb", 6599*bba2c361STejun Heo cpumask_pr_args(rq->scx.cpus_to_sync)); 6600*bba2c361STejun Heo 6601*bba2c361STejun Heo used = seq_buf_used(&ns); 6602*bba2c361STejun Heo if (SCX_HAS_OP(sch, dump_cpu)) { 6603*bba2c361STejun Heo ops_dump_init(&ns, " "); 6604*bba2c361STejun Heo SCX_CALL_OP(sch, dump_cpu, rq, dctx, scx_cpu_arg(cpu), idle); 6605*bba2c361STejun Heo ops_dump_exit(); 6606*bba2c361STejun Heo } 6607*bba2c361STejun Heo 
6608*bba2c361STejun Heo /* 6609*bba2c361STejun Heo * If idle && nothing generated by ops.dump_cpu(), there's 6610*bba2c361STejun Heo * nothing interesting. Skip. 6611*bba2c361STejun Heo */ 6612*bba2c361STejun Heo if (idle && used == seq_buf_used(&ns)) 6613*bba2c361STejun Heo goto next; 6614*bba2c361STejun Heo 6615*bba2c361STejun Heo /* 6616*bba2c361STejun Heo * $s may already have overflowed when $ns was created. If so, 6617*bba2c361STejun Heo * calling commit on it will trigger BUG. 6618*bba2c361STejun Heo */ 6619*bba2c361STejun Heo if (avail) { 6620*bba2c361STejun Heo seq_buf_commit(s, seq_buf_used(&ns)); 6621*bba2c361STejun Heo if (seq_buf_has_overflowed(&ns)) 6622*bba2c361STejun Heo seq_buf_set_overflow(s); 6623*bba2c361STejun Heo } 6624*bba2c361STejun Heo 6625*bba2c361STejun Heo if (rq->curr->sched_class == &ext_sched_class && 6626*bba2c361STejun Heo (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) 6627*bba2c361STejun Heo scx_dump_task(sch, s, dctx, rq, rq->curr, '*'); 6628*bba2c361STejun Heo 6629*bba2c361STejun Heo list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) 6630*bba2c361STejun Heo if (dump_all_tasks || scx_task_on_sched(sch, p)) 6631*bba2c361STejun Heo scx_dump_task(sch, s, dctx, rq, p, ' '); 6632*bba2c361STejun Heo next: 6633*bba2c361STejun Heo rq_unlock_irqrestore(rq, &rf); 6634*bba2c361STejun Heo } 6635*bba2c361STejun Heo 6636*bba2c361STejun Heo /* 6637*bba2c361STejun Heo * Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless 6638*bba2c361STejun Heo * of which scheduler they belong to. If false, only dump tasks owned by @sch. 6639*bba2c361STejun Heo * For SysRq-D dumps, @dump_all_tasks=false since all schedulers are dumped 6640*bba2c361STejun Heo * separately. For error dumps, @dump_all_tasks=true since only the failing 6641*bba2c361STejun Heo * scheduler is dumped. 
6642*bba2c361STejun Heo */ 6643*bba2c361STejun Heo static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, 6644*bba2c361STejun Heo size_t dump_len, bool dump_all_tasks) 6645*bba2c361STejun Heo { 6646*bba2c361STejun Heo static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; 6647*bba2c361STejun Heo struct scx_dump_ctx dctx = { 6648*bba2c361STejun Heo .kind = ei->kind, 6649*bba2c361STejun Heo .exit_code = ei->exit_code, 6650*bba2c361STejun Heo .reason = ei->reason, 6651*bba2c361STejun Heo .at_ns = ktime_get_ns(), 6652*bba2c361STejun Heo .at_jiffies = jiffies, 6653*bba2c361STejun Heo }; 6654*bba2c361STejun Heo struct seq_buf s; 6655*bba2c361STejun Heo struct scx_event_stats events; 6656*bba2c361STejun Heo int cpu; 6657*bba2c361STejun Heo 6658*bba2c361STejun Heo guard(raw_spinlock_irqsave)(&scx_dump_lock); 6659*bba2c361STejun Heo 6660*bba2c361STejun Heo if (sch->dump_disabled) 6661*bba2c361STejun Heo return; 6662*bba2c361STejun Heo 6663*bba2c361STejun Heo seq_buf_init(&s, ei->dump, dump_len); 6664*bba2c361STejun Heo 6665*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 6666*bba2c361STejun Heo if (sch->level == 0) 6667*bba2c361STejun Heo dump_line(&s, "%s: root", sch->ops.name); 6668*bba2c361STejun Heo else 6669*bba2c361STejun Heo dump_line(&s, "%s: sub%d-%llu %s", 6670*bba2c361STejun Heo sch->ops.name, sch->level, sch->ops.sub_cgroup_id, 6671*bba2c361STejun Heo sch->cgrp_path); 6672*bba2c361STejun Heo #endif 6673*bba2c361STejun Heo if (ei->kind == SCX_EXIT_NONE) { 6674*bba2c361STejun Heo dump_line(&s, "Debug dump triggered by %s", ei->reason); 6675*bba2c361STejun Heo } else { 6676*bba2c361STejun Heo if (ei->exit_cpu >= 0) 6677*bba2c361STejun Heo dump_line(&s, "%s[%d] triggered exit kind %d on CPU %d:", 6678*bba2c361STejun Heo current->comm, current->pid, ei->kind, 6679*bba2c361STejun Heo ei->exit_cpu); 6680*bba2c361STejun Heo else 6681*bba2c361STejun Heo dump_line(&s, "%s[%d] triggered exit kind %d:", 6682*bba2c361STejun Heo current->comm, 
current->pid, ei->kind); 6683*bba2c361STejun Heo dump_line(&s, " %s (%s)", ei->reason, ei->msg); 6684*bba2c361STejun Heo dump_newline(&s); 6685*bba2c361STejun Heo dump_line(&s, "Backtrace:"); 6686*bba2c361STejun Heo dump_stack_trace(&s, " ", ei->bt, ei->bt_len); 6687*bba2c361STejun Heo } 6688*bba2c361STejun Heo 6689*bba2c361STejun Heo if (SCX_HAS_OP(sch, dump)) { 6690*bba2c361STejun Heo ops_dump_init(&s, ""); 6691*bba2c361STejun Heo SCX_CALL_OP(sch, dump, NULL, &dctx); 6692*bba2c361STejun Heo ops_dump_exit(); 6693*bba2c361STejun Heo } 6694*bba2c361STejun Heo 6695*bba2c361STejun Heo dump_newline(&s); 6696*bba2c361STejun Heo dump_line(&s, "CPU states"); 6697*bba2c361STejun Heo dump_line(&s, "----------"); 6698*bba2c361STejun Heo 6699*bba2c361STejun Heo /* 6700*bba2c361STejun Heo * Dump the exit CPU first so it isn't lost to dump truncation, then 6701*bba2c361STejun Heo * walk the rest in order, skipping the one already dumped. 6702*bba2c361STejun Heo */ 6703*bba2c361STejun Heo if (ei->exit_cpu >= 0) 6704*bba2c361STejun Heo scx_dump_cpu(sch, &s, &dctx, ei->exit_cpu, dump_all_tasks); 6705*bba2c361STejun Heo for_each_possible_cpu(cpu) { 6706*bba2c361STejun Heo if (cpu != ei->exit_cpu) 6707*bba2c361STejun Heo scx_dump_cpu(sch, &s, &dctx, cpu, dump_all_tasks); 6708*bba2c361STejun Heo } 6709*bba2c361STejun Heo 6710*bba2c361STejun Heo dump_newline(&s); 6711*bba2c361STejun Heo dump_line(&s, "Event counters"); 6712*bba2c361STejun Heo dump_line(&s, "--------------"); 6713*bba2c361STejun Heo 6714*bba2c361STejun Heo scx_read_events(sch, &events); 6715*bba2c361STejun Heo scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); 6716*bba2c361STejun Heo scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 6717*bba2c361STejun Heo scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); 6718*bba2c361STejun Heo scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); 6719*bba2c361STejun Heo scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 6720*bba2c361STejun Heo 
scx_dump_event(s, &events, SCX_EV_REENQ_IMMED); 6721*bba2c361STejun Heo scx_dump_event(s, &events, SCX_EV_REENQ_LOCAL_REPEAT); 6722*bba2c361STejun Heo scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL); 6723*bba2c361STejun Heo scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); 6724*bba2c361STejun Heo scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); 6725*bba2c361STejun Heo scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); 6726*bba2c361STejun Heo scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED); 6727*bba2c361STejun Heo scx_dump_event(s, &events, SCX_EV_SUB_BYPASS_DISPATCH); 6728*bba2c361STejun Heo 6729*bba2c361STejun Heo /* on overflow, overwrite the last sizeof(trunc_marker) bytes of the dump buffer with the marker, terminating NUL included; skipped when the buffer is smaller than the marker */ if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) 6730*bba2c361STejun Heo memcpy(ei->dump + dump_len - sizeof(trunc_marker), 6731*bba2c361STejun Heo trunc_marker, sizeof(trunc_marker)); 6732*bba2c361STejun Heo } 6733*bba2c361STejun Heo 6734*bba2c361STejun Heo /* irq_work callback for sch->disable_irq_work: when the recorded exit kind is SCX_EXIT_ERROR or above, generate the debug dump (all tasks) into the exit info, then queue sch->disable_work on the scheduler helper worker. */ static void scx_disable_irq_workfn(struct irq_work *irq_work) 6735*bba2c361STejun Heo { 6736*bba2c361STejun Heo struct scx_sched *sch = container_of(irq_work, struct scx_sched, disable_irq_work); 6737*bba2c361STejun Heo struct scx_exit_info *ei = sch->exit_info; 6738*bba2c361STejun Heo 6739*bba2c361STejun Heo if (ei->kind >= SCX_EXIT_ERROR) 6740*bba2c361STejun Heo scx_dump_state(sch, ei, sch->ops.exit_dump_len, true); 6741*bba2c361STejun Heo 6742*bba2c361STejun Heo kthread_queue_work(sch->helper, &sch->disable_work); 6743*bba2c361STejun Heo } 6744*bba2c361STejun Heo 6745*bba2c361STejun Heo /* Claim the exit of @sch with @kind. If the claim succeeds, fill sch->exit_info (exit code, backtrace for kinds at or above SCX_EXIT_ERROR when CONFIG_STACKTRACE is set, message formatted from @fmt and @args, kind, reason, exit CPU) and queue sch->disable_irq_work. Returns true when this call claimed the exit, false when scx_claim_exit() refused. */ bool scx_vexit(struct scx_sched *sch, 6746*bba2c361STejun Heo enum scx_exit_kind kind, s64 exit_code, s32 exit_cpu, 6747*bba2c361STejun Heo const char *fmt, va_list args) 6748*bba2c361STejun Heo { 6749*bba2c361STejun Heo struct scx_exit_info *ei = sch->exit_info; 6750*bba2c361STejun Heo 6751*bba2c361STejun Heo guard(preempt)(); 6752*bba2c361STejun Heo 6753*bba2c361STejun Heo if (!scx_claim_exit(sch, kind)) 6754*bba2c361STejun Heo return false; 6755*bba2c361STejun Heo 
6756*bba2c361STejun Heo ei->exit_code = exit_code; 6757*bba2c361STejun Heo /* a backtrace is captured only for exit kinds at or above SCX_EXIT_ERROR */ #ifdef CONFIG_STACKTRACE 6758*bba2c361STejun Heo if (kind >= SCX_EXIT_ERROR) 6759*bba2c361STejun Heo ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); 6760*bba2c361STejun Heo #endif 6761*bba2c361STejun Heo vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); 6762*bba2c361STejun Heo 6763*bba2c361STejun Heo /* 6764*bba2c361STejun Heo * Set ei->kind and ->reason for scx_dump_state(). They'll be set again 6765*bba2c361STejun Heo * in scx_disable_workfn(). 6766*bba2c361STejun Heo */ 6767*bba2c361STejun Heo ei->kind = kind; 6768*bba2c361STejun Heo ei->reason = scx_exit_reason(ei->kind); 6769*bba2c361STejun Heo ei->exit_cpu = exit_cpu; 6770*bba2c361STejun Heo 6771*bba2c361STejun Heo irq_work_queue(&sch->disable_irq_work); 6772*bba2c361STejun Heo return true; 6773*bba2c361STejun Heo } 6774*bba2c361STejun Heo 6775*bba2c361STejun Heo /* Allocate a zeroed scx_kick_syncs array with nr_cpu_ids entries for every possible CPU, on that CPU's NUMA node, and publish it in the per-CPU scx_kick_syncs pointer. Returns 0 on success; on allocation failure calls free_kick_syncs() and returns -ENOMEM. */ static int alloc_kick_syncs(void) 6776*bba2c361STejun Heo { 6777*bba2c361STejun Heo int cpu; 6778*bba2c361STejun Heo 6779*bba2c361STejun Heo /* 6780*bba2c361STejun Heo * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size 6781*bba2c361STejun Heo * can exceed percpu allocator limits on large machines. 
6782*bba2c361STejun Heo */ 6783*bba2c361STejun Heo for_each_possible_cpu(cpu) { 6784*bba2c361STejun Heo struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); 6785*bba2c361STejun Heo struct scx_kick_syncs *new_ksyncs; 6786*bba2c361STejun Heo 6787*bba2c361STejun Heo /* the per-CPU slot is expected to be empty on entry */ WARN_ON_ONCE(rcu_access_pointer(*ksyncs)); 6788*bba2c361STejun Heo 6789*bba2c361STejun Heo new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids), 6790*bba2c361STejun Heo GFP_KERNEL, cpu_to_node(cpu)); 6791*bba2c361STejun Heo if (!new_ksyncs) { 6792*bba2c361STejun Heo free_kick_syncs(); 6793*bba2c361STejun Heo return -ENOMEM; 6794*bba2c361STejun Heo } 6795*bba2c361STejun Heo 6796*bba2c361STejun Heo rcu_assign_pointer(*ksyncs, new_ksyncs); 6797*bba2c361STejun Heo } 6798*bba2c361STejun Heo 6799*bba2c361STejun Heo return 0; 6800*bba2c361STejun Heo } 6801*bba2c361STejun Heo 6802*bba2c361STejun Heo /* Tear down the global DSQ of @pnode and free it. A NULL @pnode is a no-op, which lets error paths call this on slots that were never allocated. */ static void free_pnode(struct scx_sched_pnode *pnode) 6803*bba2c361STejun Heo { 6804*bba2c361STejun Heo if (!pnode) 6805*bba2c361STejun Heo return; 6806*bba2c361STejun Heo exit_dsq(&pnode->global_dsq); 6807*bba2c361STejun Heo kfree(pnode); 6808*bba2c361STejun Heo } 6809*bba2c361STejun Heo 6810*bba2c361STejun Heo /* Allocate a zeroed scx_sched_pnode on NUMA node @node and initialise its global DSQ with id SCX_DSQ_GLOBAL for @sch. Returns the new pnode, or NULL when the allocation or init_dsq() fails (the pnode is freed in the latter case). */ static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node) 6811*bba2c361STejun Heo { 6812*bba2c361STejun Heo struct scx_sched_pnode *pnode; 6813*bba2c361STejun Heo 6814*bba2c361STejun Heo pnode = kzalloc_node(sizeof(*pnode), GFP_KERNEL, node); 6815*bba2c361STejun Heo if (!pnode) 6816*bba2c361STejun Heo return NULL; 6817*bba2c361STejun Heo 6818*bba2c361STejun Heo if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) { 6819*bba2c361STejun Heo kfree(pnode); 6820*bba2c361STejun Heo return NULL; 6821*bba2c361STejun Heo } 6822*bba2c361STejun Heo 6823*bba2c361STejun Heo return pnode; 6824*bba2c361STejun Heo } 6825*bba2c361STejun Heo 6826*bba2c361STejun Heo /* 6827*bba2c361STejun Heo * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid 
6828*bba2c361STejun Heo * starvation. During the READY -> ENABLED task switching loop, the calling 6829*bba2c361STejun Heo * thread's sched_class gets switched from fair to ext. As fair has higher 6830*bba2c361STejun Heo * priority than ext, the calling thread can be indefinitely starved under 6831*bba2c361STejun Heo * fair-class saturation, leading to a system hang. 6832*bba2c361STejun Heo */ 6833*bba2c361STejun Heo struct scx_enable_cmd { 6834*bba2c361STejun Heo struct kthread_work work; 6835*bba2c361STejun Heo union { 6836*bba2c361STejun Heo struct sched_ext_ops *ops; 6837*bba2c361STejun Heo struct sched_ext_ops_cid *ops_cid; 6838*bba2c361STejun Heo }; 6839*bba2c361STejun Heo bool is_cid_type; 6840*bba2c361STejun Heo struct bpf_map *arena_map; /* arena ref to transfer to sch */ 6841*bba2c361STejun Heo int ret; 6842*bba2c361STejun Heo }; 6843*bba2c361STejun Heo 6844*bba2c361STejun Heo /* 6845*bba2c361STejun Heo * Allocate and initialize a new scx_sched. @cgrp's reference is always 6846*bba2c361STejun Heo * consumed whether the function succeeds or fails. 6847*bba2c361STejun Heo */ 6848*bba2c361STejun Heo static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd, 6849*bba2c361STejun Heo struct cgroup *cgrp, 6850*bba2c361STejun Heo struct scx_sched *parent) 6851*bba2c361STejun Heo { 6852*bba2c361STejun Heo struct sched_ext_ops *ops = cmd->ops; 6853*bba2c361STejun Heo struct scx_sched *sch; 6854*bba2c361STejun Heo s32 level = parent ? 
parent->level + 1 : 0; 6855*bba2c361STejun Heo s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids; 6856*bba2c361STejun Heo 6857*bba2c361STejun Heo sch = kzalloc_flex(*sch, ancestors, level + 1); 6858*bba2c361STejun Heo if (!sch) { 6859*bba2c361STejun Heo ret = -ENOMEM; 6860*bba2c361STejun Heo goto err_put_cgrp; 6861*bba2c361STejun Heo } 6862*bba2c361STejun Heo 6863*bba2c361STejun Heo sch->exit_info = alloc_exit_info(ops->exit_dump_len); 6864*bba2c361STejun Heo if (!sch->exit_info) { 6865*bba2c361STejun Heo ret = -ENOMEM; 6866*bba2c361STejun Heo goto err_free_sch; 6867*bba2c361STejun Heo } 6868*bba2c361STejun Heo 6869*bba2c361STejun Heo ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params); 6870*bba2c361STejun Heo if (ret < 0) 6871*bba2c361STejun Heo goto err_free_ei; 6872*bba2c361STejun Heo 6873*bba2c361STejun Heo sch->pnode = kzalloc_objs(sch->pnode[0], nr_node_ids); 6874*bba2c361STejun Heo if (!sch->pnode) { 6875*bba2c361STejun Heo ret = -ENOMEM; 6876*bba2c361STejun Heo goto err_free_hash; 6877*bba2c361STejun Heo } 6878*bba2c361STejun Heo 6879*bba2c361STejun Heo for_each_node_state(node, N_POSSIBLE) { 6880*bba2c361STejun Heo sch->pnode[node] = alloc_pnode(sch, node); 6881*bba2c361STejun Heo if (!sch->pnode[node]) { 6882*bba2c361STejun Heo ret = -ENOMEM; 6883*bba2c361STejun Heo goto err_free_pnode; 6884*bba2c361STejun Heo } 6885*bba2c361STejun Heo } 6886*bba2c361STejun Heo 6887*bba2c361STejun Heo sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; 6888*bba2c361STejun Heo sch->pcpu = __alloc_percpu(struct_size_t(struct scx_sched_pcpu, 6889*bba2c361STejun Heo dsp_ctx.buf, sch->dsp_max_batch), 6890*bba2c361STejun Heo __alignof__(struct scx_sched_pcpu)); 6891*bba2c361STejun Heo if (!sch->pcpu) { 6892*bba2c361STejun Heo ret = -ENOMEM; 6893*bba2c361STejun Heo goto err_free_pnode; 6894*bba2c361STejun Heo } 6895*bba2c361STejun Heo 6896*bba2c361STejun Heo for_each_possible_cpu(cpu) { 6897*bba2c361STejun Heo ret = init_dsq(bypass_dsq(sch, cpu), 
SCX_DSQ_BYPASS, sch); 6898*bba2c361STejun Heo if (ret) { 6899*bba2c361STejun Heo bypass_fail_cpu = cpu; 6900*bba2c361STejun Heo goto err_free_pcpu; 6901*bba2c361STejun Heo } 6902*bba2c361STejun Heo } 6903*bba2c361STejun Heo 6904*bba2c361STejun Heo for_each_possible_cpu(cpu) { 6905*bba2c361STejun Heo struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); 6906*bba2c361STejun Heo 6907*bba2c361STejun Heo pcpu->sch = sch; 6908*bba2c361STejun Heo INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node); 6909*bba2c361STejun Heo } 6910*bba2c361STejun Heo 6911*bba2c361STejun Heo sch->helper = kthread_run_worker(0, "sched_ext_helper"); 6912*bba2c361STejun Heo if (IS_ERR(sch->helper)) { 6913*bba2c361STejun Heo ret = PTR_ERR(sch->helper); 6914*bba2c361STejun Heo goto err_free_pcpu; 6915*bba2c361STejun Heo } 6916*bba2c361STejun Heo 6917*bba2c361STejun Heo sched_set_fifo(sch->helper->task); 6918*bba2c361STejun Heo 6919*bba2c361STejun Heo if (parent) 6920*bba2c361STejun Heo memcpy(sch->ancestors, parent->ancestors, 6921*bba2c361STejun Heo level * sizeof(parent->ancestors[0])); 6922*bba2c361STejun Heo sch->ancestors[level] = sch; 6923*bba2c361STejun Heo sch->level = level; 6924*bba2c361STejun Heo 6925*bba2c361STejun Heo if (ops->timeout_ms) 6926*bba2c361STejun Heo sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms); 6927*bba2c361STejun Heo else 6928*bba2c361STejun Heo sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT; 6929*bba2c361STejun Heo 6930*bba2c361STejun Heo sch->slice_dfl = SCX_SLICE_DFL; 6931*bba2c361STejun Heo atomic_set(&sch->exit_kind, SCX_EXIT_NONE); 6932*bba2c361STejun Heo sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn); 6933*bba2c361STejun Heo kthread_init_work(&sch->disable_work, scx_disable_workfn); 6934*bba2c361STejun Heo timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); 6935*bba2c361STejun Heo 6936*bba2c361STejun Heo if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) { 6937*bba2c361STejun Heo ret = -ENOMEM; 
6938*bba2c361STejun Heo goto err_stop_helper; 6939*bba2c361STejun Heo } 6940*bba2c361STejun Heo if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) { 6941*bba2c361STejun Heo ret = -ENOMEM; 6942*bba2c361STejun Heo goto err_free_lb_cpumask; 6943*bba2c361STejun Heo } 6944*bba2c361STejun Heo /* 6945*bba2c361STejun Heo * Copy ops through the right union view. For cid-form the source is 6946*bba2c361STejun Heo * struct sched_ext_ops_cid which lacks the trailing cpu_acquire/ 6947*bba2c361STejun Heo * cpu_release; those stay zero from kzalloc. 6948*bba2c361STejun Heo */ 6949*bba2c361STejun Heo if (cmd->is_cid_type) { 6950*bba2c361STejun Heo sch->ops_cid = *cmd->ops_cid; 6951*bba2c361STejun Heo sch->is_cid_type = true; 6952*bba2c361STejun Heo } else { 6953*bba2c361STejun Heo sch->ops = *cmd->ops; 6954*bba2c361STejun Heo } 6955*bba2c361STejun Heo 6956*bba2c361STejun Heo rcu_assign_pointer(ops->priv, sch); 6957*bba2c361STejun Heo 6958*bba2c361STejun Heo sch->kobj.kset = scx_kset; 6959*bba2c361STejun Heo INIT_LIST_HEAD(&sch->all); 6960*bba2c361STejun Heo 6961*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 6962*bba2c361STejun Heo char *buf = kzalloc(PATH_MAX, GFP_KERNEL); 6963*bba2c361STejun Heo if (!buf) { 6964*bba2c361STejun Heo ret = -ENOMEM; 6965*bba2c361STejun Heo goto err_free_lb_resched; 6966*bba2c361STejun Heo } 6967*bba2c361STejun Heo cgroup_path(cgrp, buf, PATH_MAX); 6968*bba2c361STejun Heo sch->cgrp_path = kstrdup(buf, GFP_KERNEL); 6969*bba2c361STejun Heo kfree(buf); 6970*bba2c361STejun Heo if (!sch->cgrp_path) { 6971*bba2c361STejun Heo ret = -ENOMEM; 6972*bba2c361STejun Heo goto err_free_lb_resched; 6973*bba2c361STejun Heo } 6974*bba2c361STejun Heo 6975*bba2c361STejun Heo sch->cgrp = cgrp; 6976*bba2c361STejun Heo INIT_LIST_HEAD(&sch->children); 6977*bba2c361STejun Heo INIT_LIST_HEAD(&sch->sibling); 6978*bba2c361STejun Heo 6979*bba2c361STejun Heo if (parent) 6980*bba2c361STejun Heo ret = kobject_init_and_add(&sch->kobj, &scx_ktype, 
6981*bba2c361STejun Heo &parent->sub_kset->kobj, 6982*bba2c361STejun Heo "sub-%llu", cgroup_id(cgrp)); 6983*bba2c361STejun Heo else 6984*bba2c361STejun Heo ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); 6985*bba2c361STejun Heo 6986*bba2c361STejun Heo if (ret < 0) { 6987*bba2c361STejun Heo RCU_INIT_POINTER(ops->priv, NULL); 6988*bba2c361STejun Heo kobject_put(&sch->kobj); 6989*bba2c361STejun Heo return ERR_PTR(ret); 6990*bba2c361STejun Heo } 6991*bba2c361STejun Heo 6992*bba2c361STejun Heo if (ops->sub_attach) { 6993*bba2c361STejun Heo sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); 6994*bba2c361STejun Heo if (!sch->sub_kset) { 6995*bba2c361STejun Heo RCU_INIT_POINTER(ops->priv, NULL); 6996*bba2c361STejun Heo kobject_put(&sch->kobj); 6997*bba2c361STejun Heo return ERR_PTR(-ENOMEM); 6998*bba2c361STejun Heo } 6999*bba2c361STejun Heo } 7000*bba2c361STejun Heo #else /* CONFIG_EXT_SUB_SCHED */ 7001*bba2c361STejun Heo ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); 7002*bba2c361STejun Heo if (ret < 0) { 7003*bba2c361STejun Heo RCU_INIT_POINTER(ops->priv, NULL); 7004*bba2c361STejun Heo kobject_put(&sch->kobj); 7005*bba2c361STejun Heo return ERR_PTR(ret); 7006*bba2c361STejun Heo } 7007*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 7008*bba2c361STejun Heo 7009*bba2c361STejun Heo /* 7010*bba2c361STejun Heo * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so 7011*bba2c361STejun Heo * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid 7012*bba2c361STejun Heo * drops the ref. After this point, sch owns the ref and any cleanup 7013*bba2c361STejun Heo * runs through scx_sched_free_rcu_work() which puts it. 
7014*bba2c361STejun Heo */ 7015*bba2c361STejun Heo sch->arena_map = cmd->arena_map; 7016*bba2c361STejun Heo /* BPF arena is only available on MMU && 64BIT */ 7017*bba2c361STejun Heo #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) 7018*bba2c361STejun Heo if (sch->arena_map) 7019*bba2c361STejun Heo sch->arena_kern_base = bpf_arena_map_kern_vm_start(sch->arena_map); 7020*bba2c361STejun Heo #endif 7021*bba2c361STejun Heo cmd->arena_map = NULL; 7022*bba2c361STejun Heo return sch; 7023*bba2c361STejun Heo 7024*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 7025*bba2c361STejun Heo err_free_lb_resched: 7026*bba2c361STejun Heo RCU_INIT_POINTER(ops->priv, NULL); 7027*bba2c361STejun Heo free_cpumask_var(sch->bypass_lb_resched_cpumask); 7028*bba2c361STejun Heo #endif 7029*bba2c361STejun Heo err_free_lb_cpumask: 7030*bba2c361STejun Heo free_cpumask_var(sch->bypass_lb_donee_cpumask); 7031*bba2c361STejun Heo err_stop_helper: 7032*bba2c361STejun Heo kthread_destroy_worker(sch->helper); 7033*bba2c361STejun Heo err_free_pcpu: 7034*bba2c361STejun Heo for_each_possible_cpu(cpu) { 7035*bba2c361STejun Heo if (cpu == bypass_fail_cpu) 7036*bba2c361STejun Heo break; 7037*bba2c361STejun Heo exit_dsq(bypass_dsq(sch, cpu)); 7038*bba2c361STejun Heo } 7039*bba2c361STejun Heo free_percpu(sch->pcpu); 7040*bba2c361STejun Heo err_free_pnode: 7041*bba2c361STejun Heo for_each_node_state(node, N_POSSIBLE) 7042*bba2c361STejun Heo free_pnode(sch->pnode[node]); 7043*bba2c361STejun Heo kfree(sch->pnode); 7044*bba2c361STejun Heo err_free_hash: 7045*bba2c361STejun Heo rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); 7046*bba2c361STejun Heo err_free_ei: 7047*bba2c361STejun Heo free_exit_info(sch->exit_info); 7048*bba2c361STejun Heo err_free_sch: 7049*bba2c361STejun Heo kfree(sch); 7050*bba2c361STejun Heo err_put_cgrp: 7051*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 7052*bba2c361STejun Heo cgroup_put(cgrp); 7053*bba2c361STejun Heo #endif 7054*bba2c361STejun Heo return ERR_PTR(ret); 
7055*bba2c361STejun Heo } 7056*bba2c361STejun Heo 7057*bba2c361STejun Heo static int check_hotplug_seq(struct scx_sched *sch, 7058*bba2c361STejun Heo const struct sched_ext_ops *ops) 7059*bba2c361STejun Heo { 7060*bba2c361STejun Heo unsigned long long global_hotplug_seq; 7061*bba2c361STejun Heo 7062*bba2c361STejun Heo /* 7063*bba2c361STejun Heo * If a hotplug event has occurred between when a scheduler was 7064*bba2c361STejun Heo * initialized, and when we were able to attach, exit and notify user 7065*bba2c361STejun Heo * space about it. 7066*bba2c361STejun Heo */ 7067*bba2c361STejun Heo if (ops->hotplug_seq) { 7068*bba2c361STejun Heo global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); 7069*bba2c361STejun Heo if (ops->hotplug_seq != global_hotplug_seq) { 7070*bba2c361STejun Heo scx_exit(sch, SCX_EXIT_UNREG_KERN, 7071*bba2c361STejun Heo SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 7072*bba2c361STejun Heo "expected hotplug seq %llu did not match actual %llu", 7073*bba2c361STejun Heo ops->hotplug_seq, global_hotplug_seq); 7074*bba2c361STejun Heo return -EBUSY; 7075*bba2c361STejun Heo } 7076*bba2c361STejun Heo } 7077*bba2c361STejun Heo 7078*bba2c361STejun Heo return 0; 7079*bba2c361STejun Heo } 7080*bba2c361STejun Heo 7081*bba2c361STejun Heo static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) 7082*bba2c361STejun Heo { 7083*bba2c361STejun Heo /* 7084*bba2c361STejun Heo * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the 7085*bba2c361STejun Heo * ops.enqueue() callback isn't implemented. 7086*bba2c361STejun Heo */ 7087*bba2c361STejun Heo if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { 7088*bba2c361STejun Heo scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); 7089*bba2c361STejun Heo return -EINVAL; 7090*bba2c361STejun Heo } 7091*bba2c361STejun Heo 7092*bba2c361STejun Heo /* 7093*bba2c361STejun Heo * SCX_OPS_TID_TO_TASK is enabled by the root scheduler. 
A sub-sched 7094*bba2c361STejun Heo * may set it to declare a dependency; reject if the root hasn't 7095*bba2c361STejun Heo * enabled it. 7096*bba2c361STejun Heo */ 7097*bba2c361STejun Heo if ((ops->flags & SCX_OPS_TID_TO_TASK) && scx_parent(sch) && 7098*bba2c361STejun Heo !(scx_root->ops.flags & SCX_OPS_TID_TO_TASK)) { 7099*bba2c361STejun Heo scx_error(sch, "SCX_OPS_TID_TO_TASK requires root scheduler to enable it"); 7100*bba2c361STejun Heo return -EINVAL; 7101*bba2c361STejun Heo } 7102*bba2c361STejun Heo 7103*bba2c361STejun Heo /* 7104*bba2c361STejun Heo * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle 7105*bba2c361STejun Heo * selection policy to be enabled. 7106*bba2c361STejun Heo */ 7107*bba2c361STejun Heo if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && 7108*bba2c361STejun Heo (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { 7109*bba2c361STejun Heo scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); 7110*bba2c361STejun Heo return -EINVAL; 7111*bba2c361STejun Heo } 7112*bba2c361STejun Heo 7113*bba2c361STejun Heo /* 7114*bba2c361STejun Heo * cid-form's struct is shorter and doesn't include the cpu_acquire / 7115*bba2c361STejun Heo * cpu_release tail; reading those fields off a cid-form @ops would 7116*bba2c361STejun Heo * run past the BPF allocation. Skip for cid-form. 7117*bba2c361STejun Heo */ 7118*bba2c361STejun Heo if (!sch->is_cid_type && (ops->cpu_acquire || ops->cpu_release)) 7119*bba2c361STejun Heo pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); 7120*bba2c361STejun Heo 7121*bba2c361STejun Heo /* 7122*bba2c361STejun Heo * Sub-scheduler support is tied to the cid-form struct_ops. A sub-sched 7123*bba2c361STejun Heo * attaches through a cid-form-only interface (sub_attach/sub_detach), 7124*bba2c361STejun Heo * and a root that accepts sub-scheds must expose cid-form state to 7125*bba2c361STejun Heo * them. Reject cpu-form schedulers on either side. 
7126*bba2c361STejun Heo */ 7127*bba2c361STejun Heo if (!sch->is_cid_type) { 7128*bba2c361STejun Heo if (scx_parent(sch)) { 7129*bba2c361STejun Heo scx_error(sch, "sub-sched requires cid-form struct_ops"); 7130*bba2c361STejun Heo return -EINVAL; 7131*bba2c361STejun Heo } 7132*bba2c361STejun Heo if (ops->sub_attach || ops->sub_detach) { 7133*bba2c361STejun Heo scx_error(sch, "sub_attach/sub_detach requires cid-form struct_ops"); 7134*bba2c361STejun Heo return -EINVAL; 7135*bba2c361STejun Heo } 7136*bba2c361STejun Heo } 7137*bba2c361STejun Heo 7138*bba2c361STejun Heo return 0; 7139*bba2c361STejun Heo } 7140*bba2c361STejun Heo 7141*bba2c361STejun Heo static void scx_root_enable_workfn(struct kthread_work *work) 7142*bba2c361STejun Heo { 7143*bba2c361STejun Heo struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); 7144*bba2c361STejun Heo struct sched_ext_ops *ops = cmd->ops; 7145*bba2c361STejun Heo struct cgroup *cgrp = root_cgroup(); 7146*bba2c361STejun Heo struct scx_sched *sch; 7147*bba2c361STejun Heo struct scx_task_iter sti; 7148*bba2c361STejun Heo struct task_struct *p; 7149*bba2c361STejun Heo int i, cpu, ret; 7150*bba2c361STejun Heo 7151*bba2c361STejun Heo mutex_lock(&scx_enable_mutex); 7152*bba2c361STejun Heo 7153*bba2c361STejun Heo if (scx_enable_state() != SCX_DISABLED) { 7154*bba2c361STejun Heo ret = -EBUSY; 7155*bba2c361STejun Heo goto err_unlock; 7156*bba2c361STejun Heo } 7157*bba2c361STejun Heo 7158*bba2c361STejun Heo /* 7159*bba2c361STejun Heo * @ops->priv binds @ops to its scx_sched instance. It is set here by 7160*bba2c361STejun Heo * scx_alloc_and_add_sched() and cleared at the tail of bpf_scx_unreg(), 7161*bba2c361STejun Heo * which runs after scx_root_disable() has dropped scx_enable_mutex. 
If 7162*bba2c361STejun Heo * it's still non-NULL here, a previous attachment on @ops has not 7163*bba2c361STejun Heo * finished tearing down; proceeding would let the in-flight unreg's 7164*bba2c361STejun Heo * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign. 7165*bba2c361STejun Heo */ 7166*bba2c361STejun Heo if (rcu_access_pointer(ops->priv)) { 7167*bba2c361STejun Heo ret = -EBUSY; 7168*bba2c361STejun Heo goto err_unlock; 7169*bba2c361STejun Heo } 7170*bba2c361STejun Heo 7171*bba2c361STejun Heo ret = alloc_kick_syncs(); 7172*bba2c361STejun Heo if (ret) 7173*bba2c361STejun Heo goto err_unlock; 7174*bba2c361STejun Heo 7175*bba2c361STejun Heo if (ops->flags & SCX_OPS_TID_TO_TASK) { 7176*bba2c361STejun Heo ret = rhashtable_init(&scx_tid_hash, &scx_tid_hash_params); 7177*bba2c361STejun Heo if (ret) 7178*bba2c361STejun Heo goto err_free_ksyncs; 7179*bba2c361STejun Heo } 7180*bba2c361STejun Heo 7181*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 7182*bba2c361STejun Heo cgroup_get(cgrp); 7183*bba2c361STejun Heo #endif 7184*bba2c361STejun Heo sch = scx_alloc_and_add_sched(cmd, cgrp, NULL); 7185*bba2c361STejun Heo if (IS_ERR(sch)) { 7186*bba2c361STejun Heo ret = PTR_ERR(sch); 7187*bba2c361STejun Heo goto err_free_tid_hash; 7188*bba2c361STejun Heo } 7189*bba2c361STejun Heo 7190*bba2c361STejun Heo if (sch->is_cid_type) 7191*bba2c361STejun Heo static_branch_enable(&__scx_is_cid_type); 7192*bba2c361STejun Heo 7193*bba2c361STejun Heo /* 7194*bba2c361STejun Heo * Transition to ENABLING and clear exit info to arm the disable path. 7195*bba2c361STejun Heo * Failure triggers full disabling from here on. 
7196*bba2c361STejun Heo */ 7197*bba2c361STejun Heo WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); 7198*bba2c361STejun Heo WARN_ON_ONCE(scx_root); 7199*bba2c361STejun Heo 7200*bba2c361STejun Heo atomic_long_set(&scx_nr_rejected, 0); 7201*bba2c361STejun Heo 7202*bba2c361STejun Heo for_each_possible_cpu(cpu) { 7203*bba2c361STejun Heo struct rq *rq = cpu_rq(cpu); 7204*bba2c361STejun Heo 7205*bba2c361STejun Heo rq->scx.local_dsq.sched = sch; 7206*bba2c361STejun Heo rq->scx.cpuperf_target = SCX_CPUPERF_ONE; 7207*bba2c361STejun Heo } 7208*bba2c361STejun Heo 7209*bba2c361STejun Heo /* 7210*bba2c361STejun Heo * Keep CPUs stable during enable so that the BPF scheduler can track 7211*bba2c361STejun Heo * online CPUs by watching ->on/offline_cpu() after ->init(). 7212*bba2c361STejun Heo */ 7213*bba2c361STejun Heo cpus_read_lock(); 7214*bba2c361STejun Heo 7215*bba2c361STejun Heo /* 7216*bba2c361STejun Heo * Build the cid mapping before publishing scx_root. The cid kfuncs 7217*bba2c361STejun Heo * dereference the cid arrays unconditionally once scx_prog_sched() 7218*bba2c361STejun Heo * returns non-NULL; the rcu_assign_pointer() below pairs with their 7219*bba2c361STejun Heo * rcu_dereference() to make the populated arrays visible. 7220*bba2c361STejun Heo */ 7221*bba2c361STejun Heo ret = scx_cid_init(sch); 7222*bba2c361STejun Heo if (ret) { 7223*bba2c361STejun Heo cpus_read_unlock(); 7224*bba2c361STejun Heo goto err_disable; 7225*bba2c361STejun Heo } 7226*bba2c361STejun Heo 7227*bba2c361STejun Heo /* 7228*bba2c361STejun Heo * Make the scheduler instance visible. Must be inside cpus_read_lock(). 7229*bba2c361STejun Heo * See handle_hotplug(). 
7230*bba2c361STejun Heo */ 7231*bba2c361STejun Heo rcu_assign_pointer(scx_root, sch); 7232*bba2c361STejun Heo 7233*bba2c361STejun Heo ret = scx_link_sched(sch); 7234*bba2c361STejun Heo if (ret) { 7235*bba2c361STejun Heo cpus_read_unlock(); 7236*bba2c361STejun Heo goto err_disable; 7237*bba2c361STejun Heo } 7238*bba2c361STejun Heo 7239*bba2c361STejun Heo scx_idle_enable(ops); 7240*bba2c361STejun Heo 7241*bba2c361STejun Heo if (sch->ops.init) { 7242*bba2c361STejun Heo ret = SCX_CALL_OP_RET(sch, init, NULL); 7243*bba2c361STejun Heo if (ret) { 7244*bba2c361STejun Heo ret = ops_sanitize_err(sch, "init", ret); 7245*bba2c361STejun Heo cpus_read_unlock(); 7246*bba2c361STejun Heo scx_error(sch, "ops.init() failed (%d)", ret); 7247*bba2c361STejun Heo goto err_disable; 7248*bba2c361STejun Heo } 7249*bba2c361STejun Heo sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 7250*bba2c361STejun Heo } 7251*bba2c361STejun Heo 7252*bba2c361STejun Heo ret = scx_arena_pool_init(sch); 7253*bba2c361STejun Heo if (ret) { 7254*bba2c361STejun Heo cpus_read_unlock(); 7255*bba2c361STejun Heo goto err_disable; 7256*bba2c361STejun Heo } 7257*bba2c361STejun Heo 7258*bba2c361STejun Heo ret = scx_set_cmask_scratch_alloc(sch); 7259*bba2c361STejun Heo if (ret) { 7260*bba2c361STejun Heo cpus_read_unlock(); 7261*bba2c361STejun Heo goto err_disable; 7262*bba2c361STejun Heo } 7263*bba2c361STejun Heo 7264*bba2c361STejun Heo for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) 7265*bba2c361STejun Heo if (((void (**)(void))ops)[i]) 7266*bba2c361STejun Heo set_bit(i, sch->has_op); 7267*bba2c361STejun Heo 7268*bba2c361STejun Heo ret = check_hotplug_seq(sch, ops); 7269*bba2c361STejun Heo if (ret) { 7270*bba2c361STejun Heo cpus_read_unlock(); 7271*bba2c361STejun Heo goto err_disable; 7272*bba2c361STejun Heo } 7273*bba2c361STejun Heo scx_idle_update_selcpu_topology(ops); 7274*bba2c361STejun Heo 7275*bba2c361STejun Heo cpus_read_unlock(); 7276*bba2c361STejun Heo 7277*bba2c361STejun Heo ret = 
validate_ops(sch, ops); 7278*bba2c361STejun Heo if (ret) 7279*bba2c361STejun Heo goto err_disable; 7280*bba2c361STejun Heo 7281*bba2c361STejun Heo /* 7282*bba2c361STejun Heo * Attach the ext_server bandwidth reservation before anything is 7283*bba2c361STejun Heo * committed so that we can fail the enable if the root domain cannot 7284*bba2c361STejun Heo * accommodate it. The matching fair_server detach is deferred to the 7285*bba2c361STejun Heo * tail of this function, after the switch is fully committed and can no 7286*bba2c361STejun Heo * longer fail. 7287*bba2c361STejun Heo * 7288*bba2c361STejun Heo * On failure, err_disable funnels into scx_root_disable() which 7289*bba2c361STejun Heo * detaches ext_server, so partially-attached state is cleaned up 7290*bba2c361STejun Heo * automatically. 7291*bba2c361STejun Heo */ 7292*bba2c361STejun Heo for_each_possible_cpu(cpu) { 7293*bba2c361STejun Heo struct rq *rq = cpu_rq(cpu); 7294*bba2c361STejun Heo 7295*bba2c361STejun Heo scoped_guard(rq_lock_irqsave, rq) { 7296*bba2c361STejun Heo update_rq_clock(rq); 7297*bba2c361STejun Heo ret = dl_server_attach_bw(&rq->ext_server); 7298*bba2c361STejun Heo } 7299*bba2c361STejun Heo if (ret) { 7300*bba2c361STejun Heo pr_warn("sched_ext: failed to attach ext_server on CPU %d (%d)\n", 7301*bba2c361STejun Heo cpu, ret); 7302*bba2c361STejun Heo goto err_disable; 7303*bba2c361STejun Heo } 7304*bba2c361STejun Heo } 7305*bba2c361STejun Heo 7306*bba2c361STejun Heo /* 7307*bba2c361STejun Heo * Once __scx_enabled is set, %current can be switched to SCX anytime. 7308*bba2c361STejun Heo * This can lead to stalls as some BPF schedulers (e.g. userspace 7309*bba2c361STejun Heo * scheduling) may not function correctly before all tasks are switched. 7310*bba2c361STejun Heo * Init in bypass mode to guarantee forward progress. 
7311*bba2c361STejun Heo */ 7312*bba2c361STejun Heo scx_bypass(sch, true); 7313*bba2c361STejun Heo 7314*bba2c361STejun Heo for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) 7315*bba2c361STejun Heo if (((void (**)(void))ops)[i]) 7316*bba2c361STejun Heo set_bit(i, sch->has_op); 7317*bba2c361STejun Heo 7318*bba2c361STejun Heo if (sch->ops.cpu_acquire || sch->ops.cpu_release) 7319*bba2c361STejun Heo sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT; 7320*bba2c361STejun Heo 7321*bba2c361STejun Heo /* 7322*bba2c361STejun Heo * Lock out forks, cgroup on/offlining and moves before opening the 7323*bba2c361STejun Heo * floodgate so that they don't wander into the operations prematurely. 7324*bba2c361STejun Heo */ 7325*bba2c361STejun Heo percpu_down_write(&scx_fork_rwsem); 7326*bba2c361STejun Heo 7327*bba2c361STejun Heo WARN_ON_ONCE(scx_init_task_enabled); 7328*bba2c361STejun Heo scx_init_task_enabled = true; 7329*bba2c361STejun Heo 7330*bba2c361STejun Heo /* flip under fork_rwsem; the iter below covers existing tasks */ 7331*bba2c361STejun Heo if (ops->flags & SCX_OPS_TID_TO_TASK) 7332*bba2c361STejun Heo static_branch_enable(&__scx_tid_to_task_enabled); 7333*bba2c361STejun Heo 7334*bba2c361STejun Heo /* 7335*bba2c361STejun Heo * Enable ops for every task. Fork is excluded by scx_fork_rwsem 7336*bba2c361STejun Heo * preventing new tasks from being added. No need to exclude tasks 7337*bba2c361STejun Heo * leaving as sched_ext_free() can handle both prepped and enabled 7338*bba2c361STejun Heo * tasks. Prep all tasks first and then enable them with preemption 7339*bba2c361STejun Heo * disabled. 7340*bba2c361STejun Heo * 7341*bba2c361STejun Heo * All cgroups should be initialized before scx_init_task() so that the 7342*bba2c361STejun Heo * BPF scheduler can reliably track each task's cgroup membership from 7343*bba2c361STejun Heo * scx_init_task(). 
Lock out cgroup on/offlining and task migrations 7344*bba2c361STejun Heo * while tasks are being initialized so that scx_cgroup_can_attach() 7345*bba2c361STejun Heo * never sees uninitialized tasks. 7346*bba2c361STejun Heo */ 7347*bba2c361STejun Heo scx_cgroup_lock(); 7348*bba2c361STejun Heo set_cgroup_sched(sch_cgroup(sch), sch); 7349*bba2c361STejun Heo ret = scx_cgroup_init(sch); 7350*bba2c361STejun Heo if (ret) 7351*bba2c361STejun Heo goto err_disable_unlock_all; 7352*bba2c361STejun Heo 7353*bba2c361STejun Heo scx_task_iter_start(&sti, NULL); 7354*bba2c361STejun Heo while ((p = scx_task_iter_next_locked(&sti))) { 7355*bba2c361STejun Heo /* 7356*bba2c361STejun Heo * @p is in scx_tasks under scx_tasks_lock, and SCX_TASK_DEAD 7357*bba2c361STejun Heo * tasks are filtered by scx_task_iter_next_locked(). 7358*bba2c361STejun Heo * sched_ext_dead() removes @p from scx_tasks under the same 7359*bba2c361STejun Heo * lock before put_task_struct_rcu_user() runs, so @p->usage 7360*bba2c361STejun Heo * is guaranteed > 0 here. 7361*bba2c361STejun Heo */ 7362*bba2c361STejun Heo get_task_struct(p); 7363*bba2c361STejun Heo 7364*bba2c361STejun Heo /* 7365*bba2c361STejun Heo * Set %INIT_BEGIN under the iter's rq lock so that a concurrent 7366*bba2c361STejun Heo * sched_ext_dead() does not call ops.exit_task() on @p while 7367*bba2c361STejun Heo * ops.init_task() is running. If sched_ext_dead() runs before 7368*bba2c361STejun Heo * this store, it has already removed @p from scx_tasks and the 7369*bba2c361STejun Heo * iter won't visit @p; if it runs after, it observes 7370*bba2c361STejun Heo * %INIT_BEGIN and transitions to %DEAD without calling ops, 7371*bba2c361STejun Heo * leaving the post-init recheck below to unwind. 
7372*bba2c361STejun Heo */ 7373*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_INIT_BEGIN); 7374*bba2c361STejun Heo scx_task_iter_unlock(&sti); 7375*bba2c361STejun Heo 7376*bba2c361STejun Heo ret = __scx_init_task(sch, p, false); 7377*bba2c361STejun Heo 7378*bba2c361STejun Heo scx_task_iter_relock(&sti, p); 7379*bba2c361STejun Heo 7380*bba2c361STejun Heo if (unlikely(ret)) { 7381*bba2c361STejun Heo if (scx_get_task_state(p) != SCX_TASK_DEAD) 7382*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_NONE); 7383*bba2c361STejun Heo scx_task_iter_stop(&sti); 7384*bba2c361STejun Heo scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", 7385*bba2c361STejun Heo ret, p->comm, p->pid); 7386*bba2c361STejun Heo put_task_struct(p); 7387*bba2c361STejun Heo goto err_disable_unlock_all; 7388*bba2c361STejun Heo } 7389*bba2c361STejun Heo 7390*bba2c361STejun Heo if (scx_get_task_state(p) == SCX_TASK_DEAD) { 7391*bba2c361STejun Heo /* 7392*bba2c361STejun Heo * sched_ext_dead() observed %INIT_BEGIN and set %DEAD. 7393*bba2c361STejun Heo * ops.exit_task() is owed to the sched __scx_init_task() 7394*bba2c361STejun Heo * ran against; call it now. 7395*bba2c361STejun Heo */ 7396*bba2c361STejun Heo scx_sub_init_cancel_task(sch, p); 7397*bba2c361STejun Heo } else { 7398*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_INIT); 7399*bba2c361STejun Heo scx_set_task_sched(p, sch); 7400*bba2c361STejun Heo scx_set_task_state(p, SCX_TASK_READY); 7401*bba2c361STejun Heo } 7402*bba2c361STejun Heo 7403*bba2c361STejun Heo /* 7404*bba2c361STejun Heo * Insert into the tid hash. scx_tasks_lock is held by the iter; 7405*bba2c361STejun Heo * list_empty() guards against sched_ext_dead() having taken @p 7406*bba2c361STejun Heo * off the list while init ran unlocked. 
7407*bba2c361STejun Heo */ 7408*bba2c361STejun Heo if (scx_tid_to_task_enabled() && !list_empty(&p->scx.tasks_node)) 7409*bba2c361STejun Heo scx_tid_hash_insert(p); 7410*bba2c361STejun Heo 7411*bba2c361STejun Heo put_task_struct(p); 7412*bba2c361STejun Heo } 7413*bba2c361STejun Heo scx_task_iter_stop(&sti); 7414*bba2c361STejun Heo scx_cgroup_unlock(); 7415*bba2c361STejun Heo percpu_up_write(&scx_fork_rwsem); 7416*bba2c361STejun Heo 7417*bba2c361STejun Heo /* 7418*bba2c361STejun Heo * All tasks are READY. It's safe to turn on scx_enabled() and switch 7419*bba2c361STejun Heo * all eligible tasks. 7420*bba2c361STejun Heo */ 7421*bba2c361STejun Heo WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 7422*bba2c361STejun Heo static_branch_enable(&__scx_enabled); 7423*bba2c361STejun Heo 7424*bba2c361STejun Heo /* 7425*bba2c361STejun Heo * We're fully committed and can't fail. The task READY -> ENABLED 7426*bba2c361STejun Heo * transitions here are synchronized against sched_ext_free() through 7427*bba2c361STejun Heo * scx_tasks_lock. 
7428*bba2c361STejun Heo */ 7429*bba2c361STejun Heo percpu_down_write(&scx_fork_rwsem); 7430*bba2c361STejun Heo scx_task_iter_start(&sti, NULL); 7431*bba2c361STejun Heo while ((p = scx_task_iter_next_locked(&sti))) { 7432*bba2c361STejun Heo unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 7433*bba2c361STejun Heo const struct sched_class *old_class = p->sched_class; 7434*bba2c361STejun Heo const struct sched_class *new_class = scx_setscheduler_class(p); 7435*bba2c361STejun Heo 7436*bba2c361STejun Heo if (scx_get_task_state(p) != SCX_TASK_READY) 7437*bba2c361STejun Heo continue; 7438*bba2c361STejun Heo 7439*bba2c361STejun Heo if (old_class != new_class) 7440*bba2c361STejun Heo queue_flags |= DEQUEUE_CLASS; 7441*bba2c361STejun Heo 7442*bba2c361STejun Heo scoped_guard (sched_change, p, queue_flags) { 7443*bba2c361STejun Heo p->scx.slice = READ_ONCE(sch->slice_dfl); 7444*bba2c361STejun Heo p->sched_class = new_class; 7445*bba2c361STejun Heo } 7446*bba2c361STejun Heo } 7447*bba2c361STejun Heo scx_task_iter_stop(&sti); 7448*bba2c361STejun Heo percpu_up_write(&scx_fork_rwsem); 7449*bba2c361STejun Heo 7450*bba2c361STejun Heo scx_bypass(sch, false); 7451*bba2c361STejun Heo 7452*bba2c361STejun Heo if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { 7453*bba2c361STejun Heo WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); 7454*bba2c361STejun Heo goto err_disable; 7455*bba2c361STejun Heo } 7456*bba2c361STejun Heo 7457*bba2c361STejun Heo if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) 7458*bba2c361STejun Heo static_branch_enable(&__scx_switched_all); 7459*bba2c361STejun Heo 7460*bba2c361STejun Heo /* 7461*bba2c361STejun Heo * Detach the fair_server bandwidth reservation now that the switch 7462*bba2c361STejun Heo * is fully committed. In full mode (!SCX_OPS_SWITCH_PARTIAL) no 7463*bba2c361STejun Heo * task will ever run in the fair class, so give that bandwidth 7464*bba2c361STejun Heo * back to the RT class. 
The matching ext_server attach already 7465*bba2c361STejun Heo * happened earlier; this only releases bandwidth and cannot fail. 7466*bba2c361STejun Heo * 7467*bba2c361STejun Heo * In partial mode keep fair_server attached. 7468*bba2c361STejun Heo */ 7469*bba2c361STejun Heo if (scx_switched_all()) { 7470*bba2c361STejun Heo for_each_possible_cpu(cpu) { 7471*bba2c361STejun Heo struct rq *rq = cpu_rq(cpu); 7472*bba2c361STejun Heo 7473*bba2c361STejun Heo guard(rq_lock_irqsave)(rq); 7474*bba2c361STejun Heo update_rq_clock(rq); 7475*bba2c361STejun Heo dl_server_detach_bw(&rq->fair_server); 7476*bba2c361STejun Heo } 7477*bba2c361STejun Heo } 7478*bba2c361STejun Heo 7479*bba2c361STejun Heo pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", 7480*bba2c361STejun Heo sch->ops.name, scx_switched_all() ? "" : " (partial)"); 7481*bba2c361STejun Heo kobject_uevent(&sch->kobj, KOBJ_ADD); 7482*bba2c361STejun Heo mutex_unlock(&scx_enable_mutex); 7483*bba2c361STejun Heo 7484*bba2c361STejun Heo atomic_long_inc(&scx_enable_seq); 7485*bba2c361STejun Heo 7486*bba2c361STejun Heo cmd->ret = 0; 7487*bba2c361STejun Heo return; 7488*bba2c361STejun Heo 7489*bba2c361STejun Heo err_free_tid_hash: 7490*bba2c361STejun Heo if (ops->flags & SCX_OPS_TID_TO_TASK) 7491*bba2c361STejun Heo rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL); 7492*bba2c361STejun Heo err_free_ksyncs: 7493*bba2c361STejun Heo free_kick_syncs(); 7494*bba2c361STejun Heo err_unlock: 7495*bba2c361STejun Heo mutex_unlock(&scx_enable_mutex); 7496*bba2c361STejun Heo cmd->ret = ret; 7497*bba2c361STejun Heo return; 7498*bba2c361STejun Heo 7499*bba2c361STejun Heo err_disable_unlock_all: 7500*bba2c361STejun Heo scx_cgroup_unlock(); 7501*bba2c361STejun Heo percpu_up_write(&scx_fork_rwsem); 7502*bba2c361STejun Heo /* we'll soon enter disable path, keep bypass on */ 7503*bba2c361STejun Heo err_disable: 7504*bba2c361STejun Heo mutex_unlock(&scx_enable_mutex); 7505*bba2c361STejun Heo /* 7506*bba2c361STejun Heo * Returning an error 
code here would not pass all the error information 7507*bba2c361STejun Heo * to userspace. Record errno using scx_error() for cases scx_error() 7508*bba2c361STejun Heo * wasn't already invoked and exit indicating success so that the error 7509*bba2c361STejun Heo * is notified through ops.exit() with all the details. 7510*bba2c361STejun Heo * 7511*bba2c361STejun Heo * Flush scx_disable_work to ensure that error is reported before init 7512*bba2c361STejun Heo * completion. sch's base reference will be put by bpf_scx_unreg(). 7513*bba2c361STejun Heo */ 7514*bba2c361STejun Heo scx_error(sch, "scx_root_enable() failed (%d)", ret); 7515*bba2c361STejun Heo scx_flush_disable_work(sch); 7516*bba2c361STejun Heo cmd->ret = 0; 7517*bba2c361STejun Heo } 7518*bba2c361STejun Heo 7519*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 7520*bba2c361STejun Heo /* verify that a scheduler can be attached to @cgrp and return the parent */ 7521*bba2c361STejun Heo static struct scx_sched *find_parent_sched(struct cgroup *cgrp) 7522*bba2c361STejun Heo { 7523*bba2c361STejun Heo struct scx_sched *parent = cgrp->scx_sched; 7524*bba2c361STejun Heo struct scx_sched *pos; 7525*bba2c361STejun Heo 7526*bba2c361STejun Heo lockdep_assert_held(&scx_sched_lock); 7527*bba2c361STejun Heo 7528*bba2c361STejun Heo /* can't attach twice to the same cgroup */ 7529*bba2c361STejun Heo if (parent->cgrp == cgrp) 7530*bba2c361STejun Heo return ERR_PTR(-EBUSY); 7531*bba2c361STejun Heo 7532*bba2c361STejun Heo /* does $parent allow sub-scheds? 
*/ 7533*bba2c361STejun Heo if (!parent->ops.sub_attach) 7534*bba2c361STejun Heo return ERR_PTR(-EOPNOTSUPP); 7535*bba2c361STejun Heo 7536*bba2c361STejun Heo /* can't insert between $parent and its exiting children */ 7537*bba2c361STejun Heo list_for_each_entry(pos, &parent->children, sibling) 7538*bba2c361STejun Heo if (cgroup_is_descendant(pos->cgrp, cgrp)) 7539*bba2c361STejun Heo return ERR_PTR(-EBUSY); 7540*bba2c361STejun Heo 7541*bba2c361STejun Heo return parent; 7542*bba2c361STejun Heo } 7543*bba2c361STejun Heo 7544*bba2c361STejun Heo static bool assert_task_ready_or_enabled(struct task_struct *p) 7545*bba2c361STejun Heo { 7546*bba2c361STejun Heo u32 state = scx_get_task_state(p); 7547*bba2c361STejun Heo 7548*bba2c361STejun Heo switch (state) { 7549*bba2c361STejun Heo case SCX_TASK_READY: 7550*bba2c361STejun Heo case SCX_TASK_ENABLED: 7551*bba2c361STejun Heo return true; 7552*bba2c361STejun Heo default: 7553*bba2c361STejun Heo WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched", 7554*bba2c361STejun Heo state, p->comm, p->pid); 7555*bba2c361STejun Heo return false; 7556*bba2c361STejun Heo } 7557*bba2c361STejun Heo } 7558*bba2c361STejun Heo 7559*bba2c361STejun Heo static void scx_sub_enable_workfn(struct kthread_work *work) 7560*bba2c361STejun Heo { 7561*bba2c361STejun Heo struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); 7562*bba2c361STejun Heo struct sched_ext_ops *ops = cmd->ops; 7563*bba2c361STejun Heo struct cgroup *cgrp; 7564*bba2c361STejun Heo struct scx_sched *parent, *sch; 7565*bba2c361STejun Heo struct scx_task_iter sti; 7566*bba2c361STejun Heo struct task_struct *p; 7567*bba2c361STejun Heo s32 i, ret; 7568*bba2c361STejun Heo 7569*bba2c361STejun Heo mutex_lock(&scx_enable_mutex); 7570*bba2c361STejun Heo 7571*bba2c361STejun Heo if (!scx_enabled()) { 7572*bba2c361STejun Heo ret = -ENODEV; 7573*bba2c361STejun Heo goto out_unlock; 7574*bba2c361STejun Heo } 7575*bba2c361STejun Heo 
7576*bba2c361STejun Heo /* See scx_root_enable_workfn() for the @ops->priv check. */ 7577*bba2c361STejun Heo if (rcu_access_pointer(ops->priv)) { 7578*bba2c361STejun Heo ret = -EBUSY; 7579*bba2c361STejun Heo goto out_unlock; 7580*bba2c361STejun Heo } 7581*bba2c361STejun Heo 7582*bba2c361STejun Heo cgrp = cgroup_get_from_id(ops->sub_cgroup_id); 7583*bba2c361STejun Heo if (IS_ERR(cgrp)) { 7584*bba2c361STejun Heo ret = PTR_ERR(cgrp); 7585*bba2c361STejun Heo goto out_unlock; 7586*bba2c361STejun Heo } 7587*bba2c361STejun Heo 7588*bba2c361STejun Heo raw_spin_lock_irq(&scx_sched_lock); 7589*bba2c361STejun Heo parent = find_parent_sched(cgrp); 7590*bba2c361STejun Heo if (IS_ERR(parent)) { 7591*bba2c361STejun Heo raw_spin_unlock_irq(&scx_sched_lock); 7592*bba2c361STejun Heo ret = PTR_ERR(parent); 7593*bba2c361STejun Heo goto out_put_cgrp; 7594*bba2c361STejun Heo } 7595*bba2c361STejun Heo kobject_get(&parent->kobj); 7596*bba2c361STejun Heo raw_spin_unlock_irq(&scx_sched_lock); 7597*bba2c361STejun Heo 7598*bba2c361STejun Heo /* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */ 7599*bba2c361STejun Heo sch = scx_alloc_and_add_sched(cmd, cgrp, parent); 7600*bba2c361STejun Heo kobject_put(&parent->kobj); 7601*bba2c361STejun Heo if (IS_ERR(sch)) { 7602*bba2c361STejun Heo ret = PTR_ERR(sch); 7603*bba2c361STejun Heo goto out_unlock; 7604*bba2c361STejun Heo } 7605*bba2c361STejun Heo 7606*bba2c361STejun Heo ret = scx_link_sched(sch); 7607*bba2c361STejun Heo if (ret) 7608*bba2c361STejun Heo goto err_disable; 7609*bba2c361STejun Heo 7610*bba2c361STejun Heo if (sch->level >= SCX_SUB_MAX_DEPTH) { 7611*bba2c361STejun Heo scx_error(sch, "max nesting depth %d violated", 7612*bba2c361STejun Heo SCX_SUB_MAX_DEPTH); 7613*bba2c361STejun Heo goto err_disable; 7614*bba2c361STejun Heo } 7615*bba2c361STejun Heo 7616*bba2c361STejun Heo if (sch->ops.init) { 7617*bba2c361STejun Heo ret = SCX_CALL_OP_RET(sch, init, NULL); 7618*bba2c361STejun Heo if (ret) { 7619*bba2c361STejun Heo 
ret = ops_sanitize_err(sch, "init", ret); 7620*bba2c361STejun Heo scx_error(sch, "ops.init() failed (%d)", ret); 7621*bba2c361STejun Heo goto err_disable; 7622*bba2c361STejun Heo } 7623*bba2c361STejun Heo sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 7624*bba2c361STejun Heo } 7625*bba2c361STejun Heo 7626*bba2c361STejun Heo ret = scx_arena_pool_init(sch); 7627*bba2c361STejun Heo if (ret) 7628*bba2c361STejun Heo goto err_disable; 7629*bba2c361STejun Heo 7630*bba2c361STejun Heo ret = scx_set_cmask_scratch_alloc(sch); 7631*bba2c361STejun Heo if (ret) 7632*bba2c361STejun Heo goto err_disable; 7633*bba2c361STejun Heo 7634*bba2c361STejun Heo if (validate_ops(sch, ops)) 7635*bba2c361STejun Heo goto err_disable; 7636*bba2c361STejun Heo 7637*bba2c361STejun Heo struct scx_sub_attach_args sub_attach_args = { 7638*bba2c361STejun Heo .ops = &sch->ops, 7639*bba2c361STejun Heo .cgroup_path = sch->cgrp_path, 7640*bba2c361STejun Heo }; 7641*bba2c361STejun Heo 7642*bba2c361STejun Heo ret = SCX_CALL_OP_RET(parent, sub_attach, NULL, 7643*bba2c361STejun Heo &sub_attach_args); 7644*bba2c361STejun Heo if (ret) { 7645*bba2c361STejun Heo ret = ops_sanitize_err(sch, "sub_attach", ret); 7646*bba2c361STejun Heo scx_error(sch, "parent rejected (%d)", ret); 7647*bba2c361STejun Heo goto err_disable; 7648*bba2c361STejun Heo } 7649*bba2c361STejun Heo sch->sub_attached = true; 7650*bba2c361STejun Heo 7651*bba2c361STejun Heo scx_bypass(sch, true); 7652*bba2c361STejun Heo 7653*bba2c361STejun Heo for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) 7654*bba2c361STejun Heo if (((void (**)(void))ops)[i]) 7655*bba2c361STejun Heo set_bit(i, sch->has_op); 7656*bba2c361STejun Heo 7657*bba2c361STejun Heo percpu_down_write(&scx_fork_rwsem); 7658*bba2c361STejun Heo scx_cgroup_lock(); 7659*bba2c361STejun Heo 7660*bba2c361STejun Heo /* 7661*bba2c361STejun Heo * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see 7662*bba2c361STejun Heo * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down. 
7663*bba2c361STejun Heo */ 7664*bba2c361STejun Heo set_cgroup_sched(sch_cgroup(sch), sch); 7665*bba2c361STejun Heo if (!(cgrp->self.flags & CSS_ONLINE)) { 7666*bba2c361STejun Heo scx_error(sch, "cgroup is not online"); 7667*bba2c361STejun Heo goto err_unlock_and_disable; 7668*bba2c361STejun Heo } 7669*bba2c361STejun Heo 7670*bba2c361STejun Heo /* 7671*bba2c361STejun Heo * Initialize tasks for the new child $sch without exiting them for 7672*bba2c361STejun Heo * $parent so that the tasks can always be reverted back to $parent 7673*bba2c361STejun Heo * sched on child init failure. 7674*bba2c361STejun Heo */ 7675*bba2c361STejun Heo WARN_ON_ONCE(scx_enabling_sub_sched); 7676*bba2c361STejun Heo scx_enabling_sub_sched = sch; 7677*bba2c361STejun Heo 7678*bba2c361STejun Heo scx_task_iter_start(&sti, sch->cgrp); 7679*bba2c361STejun Heo while ((p = scx_task_iter_next_locked(&sti))) { 7680*bba2c361STejun Heo struct rq *rq; 7681*bba2c361STejun Heo struct rq_flags rf; 7682*bba2c361STejun Heo 7683*bba2c361STejun Heo /* 7684*bba2c361STejun Heo * Task iteration may visit the same task twice when racing 7685*bba2c361STejun Heo * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which 7686*bba2c361STejun Heo * finished __scx_init_task() and skip if set. 7687*bba2c361STejun Heo * 7688*bba2c361STejun Heo * A task may exit and get freed between __scx_init_task() 7689*bba2c361STejun Heo * completion and scx_enable_task(). In such cases, 7690*bba2c361STejun Heo * scx_disable_and_exit_task() must exit the task for both the 7691*bba2c361STejun Heo * parent and child scheds. 
7692*bba2c361STejun Heo */ 7693*bba2c361STejun Heo if (p->scx.flags & SCX_TASK_SUB_INIT) 7694*bba2c361STejun Heo continue; 7695*bba2c361STejun Heo 7696*bba2c361STejun Heo /* @p is pinned by the iter; see scx_sub_disable() */ 7697*bba2c361STejun Heo get_task_struct(p); 7698*bba2c361STejun Heo 7699*bba2c361STejun Heo if (!assert_task_ready_or_enabled(p)) { 7700*bba2c361STejun Heo ret = -EINVAL; 7701*bba2c361STejun Heo goto abort; 7702*bba2c361STejun Heo } 7703*bba2c361STejun Heo 7704*bba2c361STejun Heo scx_task_iter_unlock(&sti); 7705*bba2c361STejun Heo 7706*bba2c361STejun Heo /* 7707*bba2c361STejun Heo * As $p is still on $parent, it can't be transitioned to INIT. 7708*bba2c361STejun Heo * Let's worry about task state later. Use __scx_init_task(). 7709*bba2c361STejun Heo */ 7710*bba2c361STejun Heo ret = __scx_init_task(sch, p, false); 7711*bba2c361STejun Heo if (ret) 7712*bba2c361STejun Heo goto abort; 7713*bba2c361STejun Heo 7714*bba2c361STejun Heo rq = task_rq_lock(p, &rf); 7715*bba2c361STejun Heo 7716*bba2c361STejun Heo if (scx_get_task_state(p) == SCX_TASK_DEAD) { 7717*bba2c361STejun Heo /* 7718*bba2c361STejun Heo * sched_ext_dead() raced us between __scx_init_task() 7719*bba2c361STejun Heo * and this rq lock and ran exit_task() on $parent (the 7720*bba2c361STejun Heo * sched @p was on at that point), not on @sch. @sch's 7721*bba2c361STejun Heo * just-completed init is owed an exit_task() and we 7722*bba2c361STejun Heo * issue it here. 
7723*bba2c361STejun Heo */ 7724*bba2c361STejun Heo scx_sub_init_cancel_task(sch, p); 7725*bba2c361STejun Heo task_rq_unlock(rq, p, &rf); 7726*bba2c361STejun Heo put_task_struct(p); 7727*bba2c361STejun Heo continue; 7728*bba2c361STejun Heo } 7729*bba2c361STejun Heo 7730*bba2c361STejun Heo p->scx.flags |= SCX_TASK_SUB_INIT; 7731*bba2c361STejun Heo task_rq_unlock(rq, p, &rf); 7732*bba2c361STejun Heo 7733*bba2c361STejun Heo put_task_struct(p); 7734*bba2c361STejun Heo } 7735*bba2c361STejun Heo scx_task_iter_stop(&sti); 7736*bba2c361STejun Heo 7737*bba2c361STejun Heo /* 7738*bba2c361STejun Heo * All tasks are prepped. Disable/exit tasks for $parent and enable for 7739*bba2c361STejun Heo * the new @sch. 7740*bba2c361STejun Heo */ 7741*bba2c361STejun Heo scx_task_iter_start(&sti, sch->cgrp); 7742*bba2c361STejun Heo while ((p = scx_task_iter_next_locked(&sti))) { 7743*bba2c361STejun Heo /* 7744*bba2c361STejun Heo * Use clearing of %SCX_TASK_SUB_INIT to detect and skip 7745*bba2c361STejun Heo * duplicate iterations. 7746*bba2c361STejun Heo */ 7747*bba2c361STejun Heo if (!(p->scx.flags & SCX_TASK_SUB_INIT)) 7748*bba2c361STejun Heo continue; 7749*bba2c361STejun Heo 7750*bba2c361STejun Heo scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 7751*bba2c361STejun Heo /* 7752*bba2c361STejun Heo * $p must be either READY or ENABLED. If ENABLED, 7753*bba2c361STejun Heo * __scx_disabled_and_exit_task() first disables and 7754*bba2c361STejun Heo * makes it READY. However, after exiting $p, it will 7755*bba2c361STejun Heo * leave $p as READY. 7756*bba2c361STejun Heo */ 7757*bba2c361STejun Heo assert_task_ready_or_enabled(p); 7758*bba2c361STejun Heo __scx_disable_and_exit_task(parent, p); 7759*bba2c361STejun Heo 7760*bba2c361STejun Heo /* 7761*bba2c361STejun Heo * $p is now only initialized for @sch and READY, which 7762*bba2c361STejun Heo * is what we want. Assign it to @sch and enable. 
7763*bba2c361STejun Heo */ 7764*bba2c361STejun Heo scx_set_task_sched(p, sch); 7765*bba2c361STejun Heo scx_enable_task(sch, p); 7766*bba2c361STejun Heo 7767*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_SUB_INIT; 7768*bba2c361STejun Heo } 7769*bba2c361STejun Heo } 7770*bba2c361STejun Heo scx_task_iter_stop(&sti); 7771*bba2c361STejun Heo 7772*bba2c361STejun Heo scx_enabling_sub_sched = NULL; 7773*bba2c361STejun Heo 7774*bba2c361STejun Heo scx_cgroup_unlock(); 7775*bba2c361STejun Heo percpu_up_write(&scx_fork_rwsem); 7776*bba2c361STejun Heo 7777*bba2c361STejun Heo scx_bypass(sch, false); 7778*bba2c361STejun Heo 7779*bba2c361STejun Heo pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); 7780*bba2c361STejun Heo kobject_uevent(&sch->kobj, KOBJ_ADD); 7781*bba2c361STejun Heo ret = 0; 7782*bba2c361STejun Heo goto out_unlock; 7783*bba2c361STejun Heo 7784*bba2c361STejun Heo out_put_cgrp: 7785*bba2c361STejun Heo cgroup_put(cgrp); 7786*bba2c361STejun Heo out_unlock: 7787*bba2c361STejun Heo mutex_unlock(&scx_enable_mutex); 7788*bba2c361STejun Heo cmd->ret = ret; 7789*bba2c361STejun Heo return; 7790*bba2c361STejun Heo 7791*bba2c361STejun Heo abort: 7792*bba2c361STejun Heo put_task_struct(p); 7793*bba2c361STejun Heo scx_task_iter_stop(&sti); 7794*bba2c361STejun Heo 7795*bba2c361STejun Heo /* 7796*bba2c361STejun Heo * Undo __scx_init_task() for tasks we marked. scx_enable_task() never 7797*bba2c361STejun Heo * ran for @sch on them, so calling scx_disable_task() here would invoke 7798*bba2c361STejun Heo * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched 7799*bba2c361STejun Heo * must stay set until SUB_INIT is cleared from every marked task - 7800*bba2c361STejun Heo * scx_disable_and_exit_task() reads it when a task exits concurrently. 
7801*bba2c361STejun Heo */ 7802*bba2c361STejun Heo scx_task_iter_start(&sti, sch->cgrp); 7803*bba2c361STejun Heo while ((p = scx_task_iter_next_locked(&sti))) { 7804*bba2c361STejun Heo if (p->scx.flags & SCX_TASK_SUB_INIT) { 7805*bba2c361STejun Heo scx_sub_init_cancel_task(sch, p); 7806*bba2c361STejun Heo p->scx.flags &= ~SCX_TASK_SUB_INIT; 7807*bba2c361STejun Heo } 7808*bba2c361STejun Heo } 7809*bba2c361STejun Heo scx_task_iter_stop(&sti); 7810*bba2c361STejun Heo scx_enabling_sub_sched = NULL; 7811*bba2c361STejun Heo err_unlock_and_disable: 7812*bba2c361STejun Heo /* we'll soon enter disable path, keep bypass on */ 7813*bba2c361STejun Heo scx_cgroup_unlock(); 7814*bba2c361STejun Heo percpu_up_write(&scx_fork_rwsem); 7815*bba2c361STejun Heo err_disable: 7816*bba2c361STejun Heo mutex_unlock(&scx_enable_mutex); 7817*bba2c361STejun Heo scx_flush_disable_work(sch); 7818*bba2c361STejun Heo cmd->ret = 0; 7819*bba2c361STejun Heo } 7820*bba2c361STejun Heo 7821*bba2c361STejun Heo static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb, 7822*bba2c361STejun Heo unsigned long action, void *data) 7823*bba2c361STejun Heo { 7824*bba2c361STejun Heo struct cgroup *cgrp = data; 7825*bba2c361STejun Heo struct cgroup *parent = cgroup_parent(cgrp); 7826*bba2c361STejun Heo 7827*bba2c361STejun Heo if (!cgroup_on_dfl(cgrp)) 7828*bba2c361STejun Heo return NOTIFY_OK; 7829*bba2c361STejun Heo 7830*bba2c361STejun Heo switch (action) { 7831*bba2c361STejun Heo case CGROUP_LIFETIME_ONLINE: 7832*bba2c361STejun Heo /* inherit ->scx_sched from $parent */ 7833*bba2c361STejun Heo if (parent) 7834*bba2c361STejun Heo rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched); 7835*bba2c361STejun Heo break; 7836*bba2c361STejun Heo case CGROUP_LIFETIME_OFFLINE: 7837*bba2c361STejun Heo /* if there is a sched attached, shoot it down */ 7838*bba2c361STejun Heo if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp) 7839*bba2c361STejun Heo scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN, 7840*bba2c361STejun 
Heo SCX_ECODE_RSN_CGROUP_OFFLINE, 7841*bba2c361STejun Heo "cgroup %llu going offline", cgroup_id(cgrp)); 7842*bba2c361STejun Heo break; 7843*bba2c361STejun Heo } 7844*bba2c361STejun Heo 7845*bba2c361STejun Heo return NOTIFY_OK; 7846*bba2c361STejun Heo } 7847*bba2c361STejun Heo 7848*bba2c361STejun Heo static struct notifier_block scx_cgroup_lifetime_nb = { 7849*bba2c361STejun Heo .notifier_call = scx_cgroup_lifetime_notify, 7850*bba2c361STejun Heo }; 7851*bba2c361STejun Heo 7852*bba2c361STejun Heo static s32 __init scx_cgroup_lifetime_notifier_init(void) 7853*bba2c361STejun Heo { 7854*bba2c361STejun Heo return blocking_notifier_chain_register(&cgroup_lifetime_notifier, 7855*bba2c361STejun Heo &scx_cgroup_lifetime_nb); 7856*bba2c361STejun Heo } 7857*bba2c361STejun Heo core_initcall(scx_cgroup_lifetime_notifier_init); 7858*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 7859*bba2c361STejun Heo 7860*bba2c361STejun Heo static s32 scx_enable(struct scx_enable_cmd *cmd, struct bpf_link *link) 7861*bba2c361STejun Heo { 7862*bba2c361STejun Heo static struct kthread_worker *helper; 7863*bba2c361STejun Heo static DEFINE_MUTEX(helper_mutex); 7864*bba2c361STejun Heo 7865*bba2c361STejun Heo if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) { 7866*bba2c361STejun Heo pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); 7867*bba2c361STejun Heo return -EINVAL; 7868*bba2c361STejun Heo } 7869*bba2c361STejun Heo 7870*bba2c361STejun Heo if (!READ_ONCE(helper)) { 7871*bba2c361STejun Heo mutex_lock(&helper_mutex); 7872*bba2c361STejun Heo if (!helper) { 7873*bba2c361STejun Heo struct kthread_worker *w = 7874*bba2c361STejun Heo kthread_run_worker(0, "scx_enable_helper"); 7875*bba2c361STejun Heo if (IS_ERR_OR_NULL(w)) { 7876*bba2c361STejun Heo mutex_unlock(&helper_mutex); 7877*bba2c361STejun Heo return -ENOMEM; 7878*bba2c361STejun Heo } 7879*bba2c361STejun Heo sched_set_fifo(w->task); 7880*bba2c361STejun Heo WRITE_ONCE(helper, w); 7881*bba2c361STejun Heo } 
7882*bba2c361STejun Heo mutex_unlock(&helper_mutex); 7883*bba2c361STejun Heo } 7884*bba2c361STejun Heo 7885*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 7886*bba2c361STejun Heo if (cmd->ops->sub_cgroup_id > 1) 7887*bba2c361STejun Heo kthread_init_work(&cmd->work, scx_sub_enable_workfn); 7888*bba2c361STejun Heo else 7889*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 7890*bba2c361STejun Heo kthread_init_work(&cmd->work, scx_root_enable_workfn); 7891*bba2c361STejun Heo 7892*bba2c361STejun Heo kthread_queue_work(READ_ONCE(helper), &cmd->work); 7893*bba2c361STejun Heo kthread_flush_work(&cmd->work); 7894*bba2c361STejun Heo return cmd->ret; 7895*bba2c361STejun Heo } 7896*bba2c361STejun Heo 7897*bba2c361STejun Heo 7898*bba2c361STejun Heo /******************************************************************************** 7899*bba2c361STejun Heo * bpf_struct_ops plumbing. 7900*bba2c361STejun Heo */ 7901*bba2c361STejun Heo #include <linux/bpf_verifier.h> 7902*bba2c361STejun Heo #include <linux/bpf.h> 7903*bba2c361STejun Heo #include <linux/btf.h> 7904*bba2c361STejun Heo 7905*bba2c361STejun Heo static const struct btf_type *task_struct_type; 7906*bba2c361STejun Heo 7907*bba2c361STejun Heo static bool bpf_scx_is_valid_access(int off, int size, 7908*bba2c361STejun Heo enum bpf_access_type type, 7909*bba2c361STejun Heo const struct bpf_prog *prog, 7910*bba2c361STejun Heo struct bpf_insn_access_aux *info) 7911*bba2c361STejun Heo { 7912*bba2c361STejun Heo if (type != BPF_READ) 7913*bba2c361STejun Heo return false; 7914*bba2c361STejun Heo if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) 7915*bba2c361STejun Heo return false; 7916*bba2c361STejun Heo if (off % size != 0) 7917*bba2c361STejun Heo return false; 7918*bba2c361STejun Heo 7919*bba2c361STejun Heo return btf_ctx_access(off, size, type, prog, info); 7920*bba2c361STejun Heo } 7921*bba2c361STejun Heo 7922*bba2c361STejun Heo static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, 7923*bba2c361STejun Heo 
const struct bpf_reg_state *reg, int off, 7924*bba2c361STejun Heo int size) 7925*bba2c361STejun Heo { 7926*bba2c361STejun Heo const struct btf_type *t; 7927*bba2c361STejun Heo 7928*bba2c361STejun Heo t = btf_type_by_id(reg->btf, reg->btf_id); 7929*bba2c361STejun Heo if (t == task_struct_type) { 7930*bba2c361STejun Heo /* 7931*bba2c361STejun Heo * COMPAT: Will be removed in v6.23. 7932*bba2c361STejun Heo */ 7933*bba2c361STejun Heo if ((off >= offsetof(struct task_struct, scx.slice) && 7934*bba2c361STejun Heo off + size <= offsetofend(struct task_struct, scx.slice)) || 7935*bba2c361STejun Heo (off >= offsetof(struct task_struct, scx.dsq_vtime) && 7936*bba2c361STejun Heo off + size <= offsetofend(struct task_struct, scx.dsq_vtime))) { 7937*bba2c361STejun Heo pr_warn("sched_ext: Writing directly to p->scx.slice/dsq_vtime is deprecated, use scx_bpf_task_set_slice/dsq_vtime()"); 7938*bba2c361STejun Heo return SCALAR_VALUE; 7939*bba2c361STejun Heo } 7940*bba2c361STejun Heo 7941*bba2c361STejun Heo if (off >= offsetof(struct task_struct, scx.disallow) && 7942*bba2c361STejun Heo off + size <= offsetofend(struct task_struct, scx.disallow)) 7943*bba2c361STejun Heo return SCALAR_VALUE; 7944*bba2c361STejun Heo } 7945*bba2c361STejun Heo 7946*bba2c361STejun Heo return -EACCES; 7947*bba2c361STejun Heo } 7948*bba2c361STejun Heo 7949*bba2c361STejun Heo static const struct bpf_verifier_ops bpf_scx_verifier_ops = { 7950*bba2c361STejun Heo .get_func_proto = bpf_base_func_proto, 7951*bba2c361STejun Heo .is_valid_access = bpf_scx_is_valid_access, 7952*bba2c361STejun Heo .btf_struct_access = bpf_scx_btf_struct_access, 7953*bba2c361STejun Heo }; 7954*bba2c361STejun Heo 7955*bba2c361STejun Heo static int bpf_scx_init_member(const struct btf_type *t, 7956*bba2c361STejun Heo const struct btf_member *member, 7957*bba2c361STejun Heo void *kdata, const void *udata) 7958*bba2c361STejun Heo { 7959*bba2c361STejun Heo const struct sched_ext_ops *uops = udata; 7960*bba2c361STejun Heo struct 
sched_ext_ops *ops = kdata; 7961*bba2c361STejun Heo u32 moff = __btf_member_bit_offset(t, member) / 8; 7962*bba2c361STejun Heo int ret; 7963*bba2c361STejun Heo 7964*bba2c361STejun Heo switch (moff) { 7965*bba2c361STejun Heo case offsetof(struct sched_ext_ops, dispatch_max_batch): 7966*bba2c361STejun Heo if (*(u32 *)(udata + moff) > INT_MAX) 7967*bba2c361STejun Heo return -E2BIG; 7968*bba2c361STejun Heo ops->dispatch_max_batch = *(u32 *)(udata + moff); 7969*bba2c361STejun Heo return 1; 7970*bba2c361STejun Heo case offsetof(struct sched_ext_ops, flags): 7971*bba2c361STejun Heo if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) 7972*bba2c361STejun Heo return -EINVAL; 7973*bba2c361STejun Heo ops->flags = *(u64 *)(udata + moff); 7974*bba2c361STejun Heo return 1; 7975*bba2c361STejun Heo case offsetof(struct sched_ext_ops, name): 7976*bba2c361STejun Heo ret = bpf_obj_name_cpy(ops->name, uops->name, 7977*bba2c361STejun Heo sizeof(ops->name)); 7978*bba2c361STejun Heo if (ret < 0) 7979*bba2c361STejun Heo return ret; 7980*bba2c361STejun Heo if (ret == 0) 7981*bba2c361STejun Heo return -EINVAL; 7982*bba2c361STejun Heo return 1; 7983*bba2c361STejun Heo case offsetof(struct sched_ext_ops, timeout_ms): 7984*bba2c361STejun Heo if (msecs_to_jiffies(*(u32 *)(udata + moff)) > 7985*bba2c361STejun Heo SCX_WATCHDOG_MAX_TIMEOUT) 7986*bba2c361STejun Heo return -E2BIG; 7987*bba2c361STejun Heo ops->timeout_ms = *(u32 *)(udata + moff); 7988*bba2c361STejun Heo return 1; 7989*bba2c361STejun Heo case offsetof(struct sched_ext_ops, exit_dump_len): 7990*bba2c361STejun Heo ops->exit_dump_len = 7991*bba2c361STejun Heo *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; 7992*bba2c361STejun Heo return 1; 7993*bba2c361STejun Heo case offsetof(struct sched_ext_ops, hotplug_seq): 7994*bba2c361STejun Heo ops->hotplug_seq = *(u64 *)(udata + moff); 7995*bba2c361STejun Heo return 1; 7996*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 7997*bba2c361STejun Heo case offsetof(struct sched_ext_ops, sub_cgroup_id): 
7998*bba2c361STejun Heo ops->sub_cgroup_id = *(u64 *)(udata + moff); 7999*bba2c361STejun Heo return 1; 8000*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 8001*bba2c361STejun Heo } 8002*bba2c361STejun Heo 8003*bba2c361STejun Heo return 0; 8004*bba2c361STejun Heo } 8005*bba2c361STejun Heo 8006*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 8007*bba2c361STejun Heo static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog) 8008*bba2c361STejun Heo { 8009*bba2c361STejun Heo struct scx_sched *sch; 8010*bba2c361STejun Heo 8011*bba2c361STejun Heo guard(rcu)(); 8012*bba2c361STejun Heo sch = scx_prog_sched(prog->aux); 8013*bba2c361STejun Heo if (unlikely(!sch)) 8014*bba2c361STejun Heo return; 8015*bba2c361STejun Heo 8016*bba2c361STejun Heo scx_error(sch, "dispatch recursion detected"); 8017*bba2c361STejun Heo } 8018*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 8019*bba2c361STejun Heo 8020*bba2c361STejun Heo static int bpf_scx_check_member(const struct btf_type *t, 8021*bba2c361STejun Heo const struct btf_member *member, 8022*bba2c361STejun Heo const struct bpf_prog *prog) 8023*bba2c361STejun Heo { 8024*bba2c361STejun Heo u32 moff = __btf_member_bit_offset(t, member) / 8; 8025*bba2c361STejun Heo 8026*bba2c361STejun Heo switch (moff) { 8027*bba2c361STejun Heo case offsetof(struct sched_ext_ops, init_task): 8028*bba2c361STejun Heo #ifdef CONFIG_EXT_GROUP_SCHED 8029*bba2c361STejun Heo case offsetof(struct sched_ext_ops, cgroup_init): 8030*bba2c361STejun Heo case offsetof(struct sched_ext_ops, cgroup_exit): 8031*bba2c361STejun Heo case offsetof(struct sched_ext_ops, cgroup_prep_move): 8032*bba2c361STejun Heo #endif 8033*bba2c361STejun Heo case offsetof(struct sched_ext_ops, cpu_online): 8034*bba2c361STejun Heo case offsetof(struct sched_ext_ops, cpu_offline): 8035*bba2c361STejun Heo case offsetof(struct sched_ext_ops, init): 8036*bba2c361STejun Heo case offsetof(struct sched_ext_ops, exit): 8037*bba2c361STejun Heo case offsetof(struct sched_ext_ops, 
sub_attach): 8038*bba2c361STejun Heo case offsetof(struct sched_ext_ops, sub_detach): 8039*bba2c361STejun Heo break; 8040*bba2c361STejun Heo default: 8041*bba2c361STejun Heo if (prog->sleepable) 8042*bba2c361STejun Heo return -EINVAL; 8043*bba2c361STejun Heo } 8044*bba2c361STejun Heo 8045*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 8046*bba2c361STejun Heo /* 8047*bba2c361STejun Heo * Enable private stack for operations that can nest along the 8048*bba2c361STejun Heo * hierarchy. 8049*bba2c361STejun Heo * 8050*bba2c361STejun Heo * XXX - Ideally, we should only do this for scheds that allow 8051*bba2c361STejun Heo * sub-scheds and sub-scheds themselves but I don't know how to access 8052*bba2c361STejun Heo * struct_ops from here. 8053*bba2c361STejun Heo */ 8054*bba2c361STejun Heo switch (moff) { 8055*bba2c361STejun Heo case offsetof(struct sched_ext_ops, dispatch): 8056*bba2c361STejun Heo prog->aux->priv_stack_requested = true; 8057*bba2c361STejun Heo prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch; 8058*bba2c361STejun Heo } 8059*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 8060*bba2c361STejun Heo 8061*bba2c361STejun Heo return 0; 8062*bba2c361STejun Heo } 8063*bba2c361STejun Heo 8064*bba2c361STejun Heo static int bpf_scx_reg(void *kdata, struct bpf_link *link) 8065*bba2c361STejun Heo { 8066*bba2c361STejun Heo struct scx_enable_cmd cmd = { .ops = kdata }; 8067*bba2c361STejun Heo 8068*bba2c361STejun Heo return scx_enable(&cmd, link); 8069*bba2c361STejun Heo } 8070*bba2c361STejun Heo 8071*bba2c361STejun Heo struct scx_arena_scan { 8072*bba2c361STejun Heo struct bpf_map *arena; 8073*bba2c361STejun Heo int err; 8074*bba2c361STejun Heo }; 8075*bba2c361STejun Heo 8076*bba2c361STejun Heo /* 8077*bba2c361STejun Heo * The verifier enforces one arena per BPF program, so each struct_ops 8078*bba2c361STejun Heo * member prog contributes at most one arena via bpf_prog_arena(). 8079*bba2c361STejun Heo * Require all non-NULL contributions to match. 
8080*bba2c361STejun Heo */ 8081*bba2c361STejun Heo static int scx_arena_scan_prog(struct bpf_prog *prog, void *data) 8082*bba2c361STejun Heo { 8083*bba2c361STejun Heo struct scx_arena_scan *s = data; 8084*bba2c361STejun Heo struct bpf_map *arena = NULL; 8085*bba2c361STejun Heo 8086*bba2c361STejun Heo /* arena.o, which defines these, is built only on MMU && 64BIT */ 8087*bba2c361STejun Heo #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) 8088*bba2c361STejun Heo arena = bpf_prog_arena(prog); 8089*bba2c361STejun Heo #endif 8090*bba2c361STejun Heo if (!arena) 8091*bba2c361STejun Heo return 0; 8092*bba2c361STejun Heo if (s->arena && s->arena != arena) { 8093*bba2c361STejun Heo s->err = -EINVAL; 8094*bba2c361STejun Heo return 1; 8095*bba2c361STejun Heo } 8096*bba2c361STejun Heo s->arena = arena; 8097*bba2c361STejun Heo return 0; 8098*bba2c361STejun Heo } 8099*bba2c361STejun Heo 8100*bba2c361STejun Heo static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link) 8101*bba2c361STejun Heo { 8102*bba2c361STejun Heo struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true }; 8103*bba2c361STejun Heo struct scx_arena_scan scan = {}; 8104*bba2c361STejun Heo int ret; 8105*bba2c361STejun Heo 8106*bba2c361STejun Heo bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan); 8107*bba2c361STejun Heo if (scan.err) { 8108*bba2c361STejun Heo pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n"); 8109*bba2c361STejun Heo return scan.err; 8110*bba2c361STejun Heo } 8111*bba2c361STejun Heo if (!scan.arena) { 8112*bba2c361STejun Heo pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n"); 8113*bba2c361STejun Heo return -EINVAL; 8114*bba2c361STejun Heo } 8115*bba2c361STejun Heo 8116*bba2c361STejun Heo bpf_map_inc(scan.arena); 8117*bba2c361STejun Heo cmd.arena_map = scan.arena; 8118*bba2c361STejun Heo ret = scx_enable(&cmd, link); 8119*bba2c361STejun Heo if (cmd.arena_map) /* not consumed by scx_alloc_and_add_sched() */ 8120*bba2c361STejun Heo 
bpf_map_put(cmd.arena_map); 8121*bba2c361STejun Heo return ret; 8122*bba2c361STejun Heo } 8123*bba2c361STejun Heo 8124*bba2c361STejun Heo static void bpf_scx_unreg(void *kdata, struct bpf_link *link) 8125*bba2c361STejun Heo { 8126*bba2c361STejun Heo struct sched_ext_ops *ops = kdata; 8127*bba2c361STejun Heo struct scx_sched *sch = rcu_dereference_protected(ops->priv, true); 8128*bba2c361STejun Heo 8129*bba2c361STejun Heo scx_disable(sch, SCX_EXIT_UNREG); 8130*bba2c361STejun Heo scx_flush_disable_work(sch); 8131*bba2c361STejun Heo RCU_INIT_POINTER(ops->priv, NULL); 8132*bba2c361STejun Heo kobject_put(&sch->kobj); 8133*bba2c361STejun Heo } 8134*bba2c361STejun Heo 8135*bba2c361STejun Heo static int bpf_scx_init(struct btf *btf) 8136*bba2c361STejun Heo { 8137*bba2c361STejun Heo task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]); 8138*bba2c361STejun Heo 8139*bba2c361STejun Heo return 0; 8140*bba2c361STejun Heo } 8141*bba2c361STejun Heo 8142*bba2c361STejun Heo static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) 8143*bba2c361STejun Heo { 8144*bba2c361STejun Heo /* 8145*bba2c361STejun Heo * sched_ext does not support updating the actively-loaded BPF 8146*bba2c361STejun Heo * scheduler, as registering a BPF scheduler can always fail if the 8147*bba2c361STejun Heo * scheduler returns an error code for e.g. ops.init(), ops.init_task(), 8148*bba2c361STejun Heo * etc. Similarly, we can always race with unregistration happening 8149*bba2c361STejun Heo * elsewhere, such as with sysrq. 
8150*bba2c361STejun Heo */ 8151*bba2c361STejun Heo return -EOPNOTSUPP; 8152*bba2c361STejun Heo } 8153*bba2c361STejun Heo 8154*bba2c361STejun Heo static int bpf_scx_validate(void *kdata) 8155*bba2c361STejun Heo { 8156*bba2c361STejun Heo return 0; 8157*bba2c361STejun Heo } 8158*bba2c361STejun Heo 8159*bba2c361STejun Heo static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } 8160*bba2c361STejun Heo static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {} 8161*bba2c361STejun Heo static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {} 8162*bba2c361STejun Heo static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {} 8163*bba2c361STejun Heo static void sched_ext_ops__tick(struct task_struct *p) {} 8164*bba2c361STejun Heo static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {} 8165*bba2c361STejun Heo static void sched_ext_ops__running(struct task_struct *p) {} 8166*bba2c361STejun Heo static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {} 8167*bba2c361STejun Heo static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {} 8168*bba2c361STejun Heo static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; } 8169*bba2c361STejun Heo static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; } 8170*bba2c361STejun Heo static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {} 8171*bba2c361STejun Heo static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {} 8172*bba2c361STejun Heo static void sched_ext_ops__update_idle(s32 cpu, bool idle) {} 8173*bba2c361STejun Heo static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {} 8174*bba2c361STejun Heo static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args 
*args) {} 8175*bba2c361STejun Heo static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } 8176*bba2c361STejun Heo static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {} 8177*bba2c361STejun Heo static void sched_ext_ops__enable(struct task_struct *p) {} 8178*bba2c361STejun Heo static void sched_ext_ops__disable(struct task_struct *p) {} 8179*bba2c361STejun Heo #ifdef CONFIG_EXT_GROUP_SCHED 8180*bba2c361STejun Heo static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } 8181*bba2c361STejun Heo static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {} 8182*bba2c361STejun Heo static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } 8183*bba2c361STejun Heo static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 8184*bba2c361STejun Heo static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 8185*bba2c361STejun Heo static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} 8186*bba2c361STejun Heo static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} 8187*bba2c361STejun Heo static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} 8188*bba2c361STejun Heo #endif /* CONFIG_EXT_GROUP_SCHED */ 8189*bba2c361STejun Heo static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; } 8190*bba2c361STejun Heo static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {} 8191*bba2c361STejun Heo static void sched_ext_ops__cpu_online(s32 cpu) {} 8192*bba2c361STejun Heo static void sched_ext_ops__cpu_offline(s32 cpu) {} 8193*bba2c361STejun Heo static s32 sched_ext_ops__init(void) { return -EINVAL; } 8194*bba2c361STejun 
Heo static void sched_ext_ops__exit(struct scx_exit_info *info) {} 8195*bba2c361STejun Heo static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {} 8196*bba2c361STejun Heo static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} 8197*bba2c361STejun Heo static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {} 8198*bba2c361STejun Heo 8199*bba2c361STejun Heo static struct sched_ext_ops __bpf_ops_sched_ext_ops = { 8200*bba2c361STejun Heo .select_cpu = sched_ext_ops__select_cpu, 8201*bba2c361STejun Heo .enqueue = sched_ext_ops__enqueue, 8202*bba2c361STejun Heo .dequeue = sched_ext_ops__dequeue, 8203*bba2c361STejun Heo .dispatch = sched_ext_ops__dispatch, 8204*bba2c361STejun Heo .tick = sched_ext_ops__tick, 8205*bba2c361STejun Heo .runnable = sched_ext_ops__runnable, 8206*bba2c361STejun Heo .running = sched_ext_ops__running, 8207*bba2c361STejun Heo .stopping = sched_ext_ops__stopping, 8208*bba2c361STejun Heo .quiescent = sched_ext_ops__quiescent, 8209*bba2c361STejun Heo .yield = sched_ext_ops__yield, 8210*bba2c361STejun Heo .core_sched_before = sched_ext_ops__core_sched_before, 8211*bba2c361STejun Heo .set_weight = sched_ext_ops__set_weight, 8212*bba2c361STejun Heo .set_cpumask = sched_ext_ops__set_cpumask, 8213*bba2c361STejun Heo .update_idle = sched_ext_ops__update_idle, 8214*bba2c361STejun Heo .cpu_acquire = sched_ext_ops__cpu_acquire, 8215*bba2c361STejun Heo .cpu_release = sched_ext_ops__cpu_release, 8216*bba2c361STejun Heo .init_task = sched_ext_ops__init_task, 8217*bba2c361STejun Heo .exit_task = sched_ext_ops__exit_task, 8218*bba2c361STejun Heo .enable = sched_ext_ops__enable, 8219*bba2c361STejun Heo .disable = sched_ext_ops__disable, 8220*bba2c361STejun Heo #ifdef CONFIG_EXT_GROUP_SCHED 8221*bba2c361STejun Heo .cgroup_init = sched_ext_ops__cgroup_init, 8222*bba2c361STejun Heo .cgroup_exit = sched_ext_ops__cgroup_exit, 8223*bba2c361STejun Heo .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, 
8224*bba2c361STejun Heo .cgroup_move = sched_ext_ops__cgroup_move, 8225*bba2c361STejun Heo .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, 8226*bba2c361STejun Heo .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, 8227*bba2c361STejun Heo .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, 8228*bba2c361STejun Heo .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, 8229*bba2c361STejun Heo #endif 8230*bba2c361STejun Heo .sub_attach = sched_ext_ops__sub_attach, 8231*bba2c361STejun Heo .sub_detach = sched_ext_ops__sub_detach, 8232*bba2c361STejun Heo .cpu_online = sched_ext_ops__cpu_online, 8233*bba2c361STejun Heo .cpu_offline = sched_ext_ops__cpu_offline, 8234*bba2c361STejun Heo .init = sched_ext_ops__init, 8235*bba2c361STejun Heo .exit = sched_ext_ops__exit, 8236*bba2c361STejun Heo .dump = sched_ext_ops__dump, 8237*bba2c361STejun Heo .dump_cpu = sched_ext_ops__dump_cpu, 8238*bba2c361STejun Heo .dump_task = sched_ext_ops__dump_task, 8239*bba2c361STejun Heo }; 8240*bba2c361STejun Heo 8241*bba2c361STejun Heo static struct bpf_struct_ops bpf_sched_ext_ops = { 8242*bba2c361STejun Heo .verifier_ops = &bpf_scx_verifier_ops, 8243*bba2c361STejun Heo .reg = bpf_scx_reg, 8244*bba2c361STejun Heo .unreg = bpf_scx_unreg, 8245*bba2c361STejun Heo .check_member = bpf_scx_check_member, 8246*bba2c361STejun Heo .init_member = bpf_scx_init_member, 8247*bba2c361STejun Heo .init = bpf_scx_init, 8248*bba2c361STejun Heo .update = bpf_scx_update, 8249*bba2c361STejun Heo .validate = bpf_scx_validate, 8250*bba2c361STejun Heo .name = "sched_ext_ops", 8251*bba2c361STejun Heo .owner = THIS_MODULE, 8252*bba2c361STejun Heo .cfi_stubs = &__bpf_ops_sched_ext_ops 8253*bba2c361STejun Heo }; 8254*bba2c361STejun Heo 8255*bba2c361STejun Heo /* 8256*bba2c361STejun Heo * cid-form cfi stubs. 
Stubs whose signatures match the cpu-form (param types 8257*bba2c361STejun Heo * identical, only param names differ across structs) are reused; only 8258*bba2c361STejun Heo * set_cmask needs a fresh stub since the second argument type differs. 8259*bba2c361STejun Heo */ 8260*bba2c361STejun Heo static void sched_ext_ops_cid__set_cmask(struct task_struct *p, 8261*bba2c361STejun Heo const struct scx_cmask *cmask) {} 8262*bba2c361STejun Heo 8263*bba2c361STejun Heo static struct sched_ext_ops_cid __bpf_ops_sched_ext_ops_cid = { 8264*bba2c361STejun Heo .select_cid = sched_ext_ops__select_cpu, 8265*bba2c361STejun Heo .enqueue = sched_ext_ops__enqueue, 8266*bba2c361STejun Heo .dequeue = sched_ext_ops__dequeue, 8267*bba2c361STejun Heo .dispatch = sched_ext_ops__dispatch, 8268*bba2c361STejun Heo .tick = sched_ext_ops__tick, 8269*bba2c361STejun Heo .runnable = sched_ext_ops__runnable, 8270*bba2c361STejun Heo .running = sched_ext_ops__running, 8271*bba2c361STejun Heo .stopping = sched_ext_ops__stopping, 8272*bba2c361STejun Heo .quiescent = sched_ext_ops__quiescent, 8273*bba2c361STejun Heo .yield = sched_ext_ops__yield, 8274*bba2c361STejun Heo .core_sched_before = sched_ext_ops__core_sched_before, 8275*bba2c361STejun Heo .set_weight = sched_ext_ops__set_weight, 8276*bba2c361STejun Heo .set_cmask = sched_ext_ops_cid__set_cmask, 8277*bba2c361STejun Heo .update_idle = sched_ext_ops__update_idle, 8278*bba2c361STejun Heo .init_task = sched_ext_ops__init_task, 8279*bba2c361STejun Heo .exit_task = sched_ext_ops__exit_task, 8280*bba2c361STejun Heo .enable = sched_ext_ops__enable, 8281*bba2c361STejun Heo .disable = sched_ext_ops__disable, 8282*bba2c361STejun Heo #ifdef CONFIG_EXT_GROUP_SCHED 8283*bba2c361STejun Heo .cgroup_init = sched_ext_ops__cgroup_init, 8284*bba2c361STejun Heo .cgroup_exit = sched_ext_ops__cgroup_exit, 8285*bba2c361STejun Heo .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, 8286*bba2c361STejun Heo .cgroup_move = sched_ext_ops__cgroup_move, 8287*bba2c361STejun 
Heo .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, 8288*bba2c361STejun Heo .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, 8289*bba2c361STejun Heo .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, 8290*bba2c361STejun Heo .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, 8291*bba2c361STejun Heo #endif 8292*bba2c361STejun Heo .sub_attach = sched_ext_ops__sub_attach, 8293*bba2c361STejun Heo .sub_detach = sched_ext_ops__sub_detach, 8294*bba2c361STejun Heo .cid_online = sched_ext_ops__cpu_online, 8295*bba2c361STejun Heo .cid_offline = sched_ext_ops__cpu_offline, 8296*bba2c361STejun Heo .init = sched_ext_ops__init, 8297*bba2c361STejun Heo .exit = sched_ext_ops__exit, 8298*bba2c361STejun Heo .dump = sched_ext_ops__dump, 8299*bba2c361STejun Heo .dump_cid = sched_ext_ops__dump_cpu, 8300*bba2c361STejun Heo .dump_task = sched_ext_ops__dump_task, 8301*bba2c361STejun Heo }; 8302*bba2c361STejun Heo 8303*bba2c361STejun Heo /* 8304*bba2c361STejun Heo * The cid-form struct_ops shares all bpf_struct_ops hooks with the cpu form. 8305*bba2c361STejun Heo * init_member, check_member, reg, unreg, etc. process kdata as the byte block 8306*bba2c361STejun Heo * verified to match by the BUILD_BUG_ON checks in scx_init(). 
8307*bba2c361STejun Heo */ 8308*bba2c361STejun Heo static struct bpf_struct_ops bpf_sched_ext_ops_cid = { 8309*bba2c361STejun Heo .verifier_ops = &bpf_scx_verifier_ops, 8310*bba2c361STejun Heo .reg = bpf_scx_reg_cid, 8311*bba2c361STejun Heo .unreg = bpf_scx_unreg, 8312*bba2c361STejun Heo .check_member = bpf_scx_check_member, 8313*bba2c361STejun Heo .init_member = bpf_scx_init_member, 8314*bba2c361STejun Heo .init = bpf_scx_init, 8315*bba2c361STejun Heo .update = bpf_scx_update, 8316*bba2c361STejun Heo .validate = bpf_scx_validate, 8317*bba2c361STejun Heo .name = "sched_ext_ops_cid", 8318*bba2c361STejun Heo .owner = THIS_MODULE, 8319*bba2c361STejun Heo .cfi_stubs = &__bpf_ops_sched_ext_ops_cid 8320*bba2c361STejun Heo }; 8321*bba2c361STejun Heo 8322*bba2c361STejun Heo 8323*bba2c361STejun Heo /******************************************************************************** 8324*bba2c361STejun Heo * System integration and init. 8325*bba2c361STejun Heo */ 8326*bba2c361STejun Heo 8327*bba2c361STejun Heo static void sysrq_handle_sched_ext_reset(u8 key) 8328*bba2c361STejun Heo { 8329*bba2c361STejun Heo struct scx_sched *sch; 8330*bba2c361STejun Heo 8331*bba2c361STejun Heo sch = rcu_dereference(scx_root); 8332*bba2c361STejun Heo if (likely(sch)) 8333*bba2c361STejun Heo scx_disable(sch, SCX_EXIT_SYSRQ); 8334*bba2c361STejun Heo else 8335*bba2c361STejun Heo pr_info("sched_ext: BPF schedulers not loaded\n"); 8336*bba2c361STejun Heo } 8337*bba2c361STejun Heo 8338*bba2c361STejun Heo static const struct sysrq_key_op sysrq_sched_ext_reset_op = { 8339*bba2c361STejun Heo .handler = sysrq_handle_sched_ext_reset, 8340*bba2c361STejun Heo .help_msg = "reset-sched-ext(S)", 8341*bba2c361STejun Heo .action_msg = "Disable sched_ext and revert all tasks to CFS", 8342*bba2c361STejun Heo .enable_mask = SYSRQ_ENABLE_RTNICE, 8343*bba2c361STejun Heo }; 8344*bba2c361STejun Heo 8345*bba2c361STejun Heo static void sysrq_handle_sched_ext_dump(u8 key) 8346*bba2c361STejun Heo { 8347*bba2c361STejun Heo 
struct scx_exit_info ei = { 8348*bba2c361STejun Heo .kind = SCX_EXIT_NONE, 8349*bba2c361STejun Heo .exit_cpu = -1, 8350*bba2c361STejun Heo .reason = "SysRq-D", 8351*bba2c361STejun Heo }; 8352*bba2c361STejun Heo struct scx_sched *sch; 8353*bba2c361STejun Heo 8354*bba2c361STejun Heo list_for_each_entry_rcu(sch, &scx_sched_all, all) 8355*bba2c361STejun Heo scx_dump_state(sch, &ei, 0, false); 8356*bba2c361STejun Heo } 8357*bba2c361STejun Heo 8358*bba2c361STejun Heo static const struct sysrq_key_op sysrq_sched_ext_dump_op = { 8359*bba2c361STejun Heo .handler = sysrq_handle_sched_ext_dump, 8360*bba2c361STejun Heo .help_msg = "dump-sched-ext(D)", 8361*bba2c361STejun Heo .action_msg = "Trigger sched_ext debug dump", 8362*bba2c361STejun Heo .enable_mask = SYSRQ_ENABLE_RTNICE, 8363*bba2c361STejun Heo }; 8364*bba2c361STejun Heo 8365*bba2c361STejun Heo static bool can_skip_idle_kick(struct rq *rq) 8366*bba2c361STejun Heo { 8367*bba2c361STejun Heo lockdep_assert_rq_held(rq); 8368*bba2c361STejun Heo 8369*bba2c361STejun Heo /* 8370*bba2c361STejun Heo * We can skip idle kicking if @rq is going to go through at least one 8371*bba2c361STejun Heo * full SCX scheduling cycle before going idle. Just checking whether 8372*bba2c361STejun Heo * curr is not idle is insufficient because we could be racing 8373*bba2c361STejun Heo * balance_one() trying to pull the next task from a remote rq, which 8374*bba2c361STejun Heo * may fail, and @rq may become idle afterwards. 8375*bba2c361STejun Heo * 8376*bba2c361STejun Heo * The race window is small and we don't and can't guarantee that @rq is 8377*bba2c361STejun Heo * only kicked while idle anyway. Skip only when sure. 
8378*bba2c361STejun Heo */ 8379*bba2c361STejun Heo return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); 8380*bba2c361STejun Heo } 8381*bba2c361STejun Heo 8382*bba2c361STejun Heo static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) 8383*bba2c361STejun Heo { 8384*bba2c361STejun Heo struct rq *rq = cpu_rq(cpu); 8385*bba2c361STejun Heo struct scx_rq *this_scx = &this_rq->scx; 8386*bba2c361STejun Heo const struct sched_class *cur_class; 8387*bba2c361STejun Heo bool should_wait = false; 8388*bba2c361STejun Heo unsigned long flags; 8389*bba2c361STejun Heo 8390*bba2c361STejun Heo raw_spin_rq_lock_irqsave(rq, flags); 8391*bba2c361STejun Heo cur_class = rq->curr->sched_class; 8392*bba2c361STejun Heo 8393*bba2c361STejun Heo /* 8394*bba2c361STejun Heo * During CPU hotplug, a CPU may depend on kicking itself to make 8395*bba2c361STejun Heo * forward progress. Allow kicking self regardless of online state. If 8396*bba2c361STejun Heo * @cpu is running a higher class task, we have no control over @cpu. 8397*bba2c361STejun Heo * Skip kicking. 
8398*bba2c361STejun Heo */ 8399*bba2c361STejun Heo if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) && 8400*bba2c361STejun Heo !sched_class_above(cur_class, &ext_sched_class)) { 8401*bba2c361STejun Heo if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { 8402*bba2c361STejun Heo if (cur_class == &ext_sched_class) 8403*bba2c361STejun Heo rq->curr->scx.slice = 0; 8404*bba2c361STejun Heo cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 8405*bba2c361STejun Heo } 8406*bba2c361STejun Heo 8407*bba2c361STejun Heo if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { 8408*bba2c361STejun Heo if (cur_class == &ext_sched_class) { 8409*bba2c361STejun Heo cpumask_set_cpu(cpu, this_scx->cpus_to_sync); 8410*bba2c361STejun Heo ksyncs[cpu] = rq->scx.kick_sync; 8411*bba2c361STejun Heo should_wait = true; 8412*bba2c361STejun Heo } 8413*bba2c361STejun Heo cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 8414*bba2c361STejun Heo } 8415*bba2c361STejun Heo 8416*bba2c361STejun Heo resched_curr(rq); 8417*bba2c361STejun Heo } else { 8418*bba2c361STejun Heo cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 8419*bba2c361STejun Heo cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 8420*bba2c361STejun Heo } 8421*bba2c361STejun Heo 8422*bba2c361STejun Heo raw_spin_rq_unlock_irqrestore(rq, flags); 8423*bba2c361STejun Heo 8424*bba2c361STejun Heo return should_wait; 8425*bba2c361STejun Heo } 8426*bba2c361STejun Heo 8427*bba2c361STejun Heo static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) 8428*bba2c361STejun Heo { 8429*bba2c361STejun Heo struct rq *rq = cpu_rq(cpu); 8430*bba2c361STejun Heo unsigned long flags; 8431*bba2c361STejun Heo 8432*bba2c361STejun Heo raw_spin_rq_lock_irqsave(rq, flags); 8433*bba2c361STejun Heo 8434*bba2c361STejun Heo if (!can_skip_idle_kick(rq) && 8435*bba2c361STejun Heo (cpu_online(cpu) || cpu == cpu_of(this_rq))) 8436*bba2c361STejun Heo resched_curr(rq); 8437*bba2c361STejun Heo 8438*bba2c361STejun Heo raw_spin_rq_unlock_irqrestore(rq, flags); 8439*bba2c361STejun 
Heo } 8440*bba2c361STejun Heo 8441*bba2c361STejun Heo static void kick_cpus_irq_workfn(struct irq_work *irq_work) 8442*bba2c361STejun Heo { 8443*bba2c361STejun Heo struct rq *this_rq = this_rq(); 8444*bba2c361STejun Heo struct scx_rq *this_scx = &this_rq->scx; 8445*bba2c361STejun Heo struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs); 8446*bba2c361STejun Heo bool should_wait = false; 8447*bba2c361STejun Heo unsigned long *ksyncs; 8448*bba2c361STejun Heo s32 cpu; 8449*bba2c361STejun Heo 8450*bba2c361STejun Heo /* can race with free_kick_syncs() during scheduler disable */ 8451*bba2c361STejun Heo if (unlikely(!ksyncs_pcpu)) 8452*bba2c361STejun Heo return; 8453*bba2c361STejun Heo 8454*bba2c361STejun Heo ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs; 8455*bba2c361STejun Heo 8456*bba2c361STejun Heo for_each_cpu(cpu, this_scx->cpus_to_kick) { 8457*bba2c361STejun Heo should_wait |= kick_one_cpu(cpu, this_rq, ksyncs); 8458*bba2c361STejun Heo cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); 8459*bba2c361STejun Heo cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 8460*bba2c361STejun Heo } 8461*bba2c361STejun Heo 8462*bba2c361STejun Heo for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { 8463*bba2c361STejun Heo kick_one_cpu_if_idle(cpu, this_rq); 8464*bba2c361STejun Heo cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 8465*bba2c361STejun Heo } 8466*bba2c361STejun Heo 8467*bba2c361STejun Heo /* 8468*bba2c361STejun Heo * Can't wait in hardirq — kick_sync can't advance, deadlocking if 8469*bba2c361STejun Heo * CPUs wait for each other. Defer to kick_sync_wait_bal_cb(). 
8470*bba2c361STejun Heo */ 8471*bba2c361STejun Heo if (should_wait) { 8472*bba2c361STejun Heo raw_spin_rq_lock(this_rq); 8473*bba2c361STejun Heo this_scx->kick_sync_pending = true; 8474*bba2c361STejun Heo resched_curr(this_rq); 8475*bba2c361STejun Heo raw_spin_rq_unlock(this_rq); 8476*bba2c361STejun Heo } 8477*bba2c361STejun Heo } 8478*bba2c361STejun Heo 8479*bba2c361STejun Heo /** 8480*bba2c361STejun Heo * print_scx_info - print out sched_ext scheduler state 8481*bba2c361STejun Heo * @log_lvl: the log level to use when printing 8482*bba2c361STejun Heo * @p: target task 8483*bba2c361STejun Heo * 8484*bba2c361STejun Heo * If a sched_ext scheduler is enabled, print the name and state of the 8485*bba2c361STejun Heo * scheduler. If @p is on sched_ext, print further information about the task. 8486*bba2c361STejun Heo * 8487*bba2c361STejun Heo * This function can be safely called on any task as long as the task_struct 8488*bba2c361STejun Heo * itself is accessible. While safe, this function isn't synchronized and may 8489*bba2c361STejun Heo * print out mixups or garbages of limited length. 8490*bba2c361STejun Heo */ 8491*bba2c361STejun Heo void print_scx_info(const char *log_lvl, struct task_struct *p) 8492*bba2c361STejun Heo { 8493*bba2c361STejun Heo struct scx_sched *sch; 8494*bba2c361STejun Heo enum scx_enable_state state = scx_enable_state(); 8495*bba2c361STejun Heo const char *all = READ_ONCE(scx_switching_all) ? 
"+all" : ""; 8496*bba2c361STejun Heo char runnable_at_buf[22] = "?"; 8497*bba2c361STejun Heo struct sched_class *class; 8498*bba2c361STejun Heo unsigned long runnable_at; 8499*bba2c361STejun Heo 8500*bba2c361STejun Heo guard(rcu)(); 8501*bba2c361STejun Heo 8502*bba2c361STejun Heo sch = scx_task_sched_rcu(p); 8503*bba2c361STejun Heo 8504*bba2c361STejun Heo if (!sch) 8505*bba2c361STejun Heo return; 8506*bba2c361STejun Heo 8507*bba2c361STejun Heo /* 8508*bba2c361STejun Heo * Carefully check if the task was running on sched_ext, and then 8509*bba2c361STejun Heo * carefully copy the time it's been runnable, and its state. 8510*bba2c361STejun Heo */ 8511*bba2c361STejun Heo if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || 8512*bba2c361STejun Heo class != &ext_sched_class) { 8513*bba2c361STejun Heo printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name, 8514*bba2c361STejun Heo scx_enable_state_str[state], all); 8515*bba2c361STejun Heo return; 8516*bba2c361STejun Heo } 8517*bba2c361STejun Heo 8518*bba2c361STejun Heo if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, 8519*bba2c361STejun Heo sizeof(runnable_at))) 8520*bba2c361STejun Heo scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", 8521*bba2c361STejun Heo jiffies_delta_msecs(runnable_at, jiffies)); 8522*bba2c361STejun Heo 8523*bba2c361STejun Heo /* print everything onto one line to conserve console space */ 8524*bba2c361STejun Heo printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", 8525*bba2c361STejun Heo log_lvl, sch->ops.name, scx_enable_state_str[state], all, 8526*bba2c361STejun Heo runnable_at_buf); 8527*bba2c361STejun Heo } 8528*bba2c361STejun Heo 8529*bba2c361STejun Heo static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) 8530*bba2c361STejun Heo { 8531*bba2c361STejun Heo struct scx_sched *sch; 8532*bba2c361STejun Heo 8533*bba2c361STejun Heo guard(rcu)(); 8534*bba2c361STejun Heo 8535*bba2c361STejun Heo sch = 
rcu_dereference(scx_root); 8536*bba2c361STejun Heo if (!sch) 8537*bba2c361STejun Heo return NOTIFY_OK; 8538*bba2c361STejun Heo 8539*bba2c361STejun Heo /* 8540*bba2c361STejun Heo * SCX schedulers often have userspace components which are sometimes 8541*bba2c361STejun Heo * involved in critial scheduling paths. PM operations involve freezing 8542*bba2c361STejun Heo * userspace which can lead to scheduling misbehaviors including stalls. 8543*bba2c361STejun Heo * Let's bypass while PM operations are in progress. 8544*bba2c361STejun Heo */ 8545*bba2c361STejun Heo switch (event) { 8546*bba2c361STejun Heo case PM_HIBERNATION_PREPARE: 8547*bba2c361STejun Heo case PM_SUSPEND_PREPARE: 8548*bba2c361STejun Heo case PM_RESTORE_PREPARE: 8549*bba2c361STejun Heo scx_bypass(sch, true); 8550*bba2c361STejun Heo break; 8551*bba2c361STejun Heo case PM_POST_HIBERNATION: 8552*bba2c361STejun Heo case PM_POST_SUSPEND: 8553*bba2c361STejun Heo case PM_POST_RESTORE: 8554*bba2c361STejun Heo scx_bypass(sch, false); 8555*bba2c361STejun Heo break; 8556*bba2c361STejun Heo } 8557*bba2c361STejun Heo 8558*bba2c361STejun Heo return NOTIFY_OK; 8559*bba2c361STejun Heo } 8560*bba2c361STejun Heo 8561*bba2c361STejun Heo static struct notifier_block scx_pm_notifier = { 8562*bba2c361STejun Heo .notifier_call = scx_pm_handler, 8563*bba2c361STejun Heo }; 8564*bba2c361STejun Heo 8565*bba2c361STejun Heo void __init init_sched_ext_class(void) 8566*bba2c361STejun Heo { 8567*bba2c361STejun Heo s32 cpu, v; 8568*bba2c361STejun Heo 8569*bba2c361STejun Heo /* 8570*bba2c361STejun Heo * The following is to prevent the compiler from optimizing out the enum 8571*bba2c361STejun Heo * definitions so that BPF scheduler implementations can use them 8572*bba2c361STejun Heo * through the generated vmlinux.h. 
8573*bba2c361STejun Heo */ 8574*bba2c361STejun Heo WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | 8575*bba2c361STejun Heo SCX_TG_ONLINE); 8576*bba2c361STejun Heo 8577*bba2c361STejun Heo scx_idle_init_masks(); 8578*bba2c361STejun Heo 8579*bba2c361STejun Heo for_each_possible_cpu(cpu) { 8580*bba2c361STejun Heo struct rq *rq = cpu_rq(cpu); 8581*bba2c361STejun Heo int n = cpu_to_node(cpu); 8582*bba2c361STejun Heo 8583*bba2c361STejun Heo /* local_dsq's sch will be set during scx_root_enable() */ 8584*bba2c361STejun Heo BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL)); 8585*bba2c361STejun Heo 8586*bba2c361STejun Heo INIT_LIST_HEAD(&rq->scx.runnable_list); 8587*bba2c361STejun Heo INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); 8588*bba2c361STejun Heo 8589*bba2c361STejun Heo BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); 8590*bba2c361STejun Heo BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); 8591*bba2c361STejun Heo BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); 8592*bba2c361STejun Heo BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); 8593*bba2c361STejun Heo BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n)); 8594*bba2c361STejun Heo raw_spin_lock_init(&rq->scx.deferred_reenq_lock); 8595*bba2c361STejun Heo INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals); 8596*bba2c361STejun Heo INIT_LIST_HEAD(&rq->scx.deferred_reenq_users); 8597*bba2c361STejun Heo rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); 8598*bba2c361STejun Heo rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); 8599*bba2c361STejun Heo 8600*bba2c361STejun Heo if (cpu_online(cpu)) 8601*bba2c361STejun Heo cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; 8602*bba2c361STejun Heo } 8603*bba2c361STejun Heo 8604*bba2c361STejun Heo register_sysrq_key('S', &sysrq_sched_ext_reset_op); 8605*bba2c361STejun Heo register_sysrq_key('D', 
&sysrq_sched_ext_dump_op); 8606*bba2c361STejun Heo INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); 8607*bba2c361STejun Heo 8608*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 8609*bba2c361STejun Heo BUG_ON(rhashtable_init(&scx_sched_hash, &scx_sched_hash_params)); 8610*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 8611*bba2c361STejun Heo } 8612*bba2c361STejun Heo 8613*bba2c361STejun Heo 8614*bba2c361STejun Heo /******************************************************************************** 8615*bba2c361STejun Heo * Helpers that can be called from the BPF scheduler. 8616*bba2c361STejun Heo */ 8617*bba2c361STejun Heo static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 *enq_flags) 8618*bba2c361STejun Heo { 8619*bba2c361STejun Heo bool is_local = dsq_id == SCX_DSQ_LOCAL || 8620*bba2c361STejun Heo (dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON; 8621*bba2c361STejun Heo 8622*bba2c361STejun Heo if (*enq_flags & SCX_ENQ_IMMED) { 8623*bba2c361STejun Heo if (unlikely(!is_local)) { 8624*bba2c361STejun Heo scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id); 8625*bba2c361STejun Heo return false; 8626*bba2c361STejun Heo } 8627*bba2c361STejun Heo } else if ((sch->ops.flags & SCX_OPS_ALWAYS_ENQ_IMMED) && is_local) { 8628*bba2c361STejun Heo *enq_flags |= SCX_ENQ_IMMED; 8629*bba2c361STejun Heo } 8630*bba2c361STejun Heo 8631*bba2c361STejun Heo return true; 8632*bba2c361STejun Heo } 8633*bba2c361STejun Heo 8634*bba2c361STejun Heo static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, 8635*bba2c361STejun Heo u64 dsq_id, u64 *enq_flags) 8636*bba2c361STejun Heo { 8637*bba2c361STejun Heo lockdep_assert_irqs_disabled(); 8638*bba2c361STejun Heo 8639*bba2c361STejun Heo if (unlikely(!p)) { 8640*bba2c361STejun Heo scx_error(sch, "called with NULL task"); 8641*bba2c361STejun Heo return false; 8642*bba2c361STejun Heo } 8643*bba2c361STejun Heo 8644*bba2c361STejun Heo if (unlikely(*enq_flags & __SCX_ENQ_INTERNAL_MASK)) 
{ 8645*bba2c361STejun Heo scx_error(sch, "invalid enq_flags 0x%llx", *enq_flags); 8646*bba2c361STejun Heo return false; 8647*bba2c361STejun Heo } 8648*bba2c361STejun Heo 8649*bba2c361STejun Heo /* see SCX_EV_INSERT_NOT_OWNED definition */ 8650*bba2c361STejun Heo if (unlikely(!scx_task_on_sched(sch, p))) { 8651*bba2c361STejun Heo __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); 8652*bba2c361STejun Heo return false; 8653*bba2c361STejun Heo } 8654*bba2c361STejun Heo 8655*bba2c361STejun Heo if (!scx_vet_enq_flags(sch, dsq_id, enq_flags)) 8656*bba2c361STejun Heo return false; 8657*bba2c361STejun Heo 8658*bba2c361STejun Heo return true; 8659*bba2c361STejun Heo } 8660*bba2c361STejun Heo 8661*bba2c361STejun Heo static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, 8662*bba2c361STejun Heo u64 dsq_id, u64 enq_flags) 8663*bba2c361STejun Heo { 8664*bba2c361STejun Heo struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 8665*bba2c361STejun Heo struct task_struct *ddsp_task; 8666*bba2c361STejun Heo 8667*bba2c361STejun Heo ddsp_task = __this_cpu_read(direct_dispatch_task); 8668*bba2c361STejun Heo if (ddsp_task) { 8669*bba2c361STejun Heo mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); 8670*bba2c361STejun Heo return; 8671*bba2c361STejun Heo } 8672*bba2c361STejun Heo 8673*bba2c361STejun Heo if (unlikely(dspc->cursor >= sch->dsp_max_batch)) { 8674*bba2c361STejun Heo scx_error(sch, "dispatch buffer overflow"); 8675*bba2c361STejun Heo return; 8676*bba2c361STejun Heo } 8677*bba2c361STejun Heo 8678*bba2c361STejun Heo dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ 8679*bba2c361STejun Heo .task = p, 8680*bba2c361STejun Heo .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, 8681*bba2c361STejun Heo .dsq_id = dsq_id, 8682*bba2c361STejun Heo .enq_flags = enq_flags, 8683*bba2c361STejun Heo }; 8684*bba2c361STejun Heo } 8685*bba2c361STejun Heo 8686*bba2c361STejun Heo __bpf_kfunc_start_defs(); 8687*bba2c361STejun Heo 
8688*bba2c361STejun Heo /** 8689*bba2c361STejun Heo * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ 8690*bba2c361STejun Heo * @p: task_struct to insert 8691*bba2c361STejun Heo * @dsq_id: DSQ to insert into 8692*bba2c361STejun Heo * @slice: duration @p can run for in nsecs, 0 to keep the current value 8693*bba2c361STejun Heo * @enq_flags: SCX_ENQ_* 8694*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8695*bba2c361STejun Heo * 8696*bba2c361STejun Heo * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to 8697*bba2c361STejun Heo * call this function spuriously. Can be called from ops.enqueue(), 8698*bba2c361STejun Heo * ops.select_cpu(), and ops.dispatch(). 8699*bba2c361STejun Heo * 8700*bba2c361STejun Heo * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch 8701*bba2c361STejun Heo * and @p must match the task being enqueued. 8702*bba2c361STejun Heo * 8703*bba2c361STejun Heo * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p 8704*bba2c361STejun Heo * will be directly inserted into the corresponding dispatch queue after 8705*bba2c361STejun Heo * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be 8706*bba2c361STejun Heo * inserted into the local DSQ of the CPU returned by ops.select_cpu(). 8707*bba2c361STejun Heo * @enq_flags are OR'd with the enqueue flags on the enqueue path before the 8708*bba2c361STejun Heo * task is inserted. 8709*bba2c361STejun Heo * 8710*bba2c361STejun Heo * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id 8711*bba2c361STejun Heo * and this function can be called upto ops.dispatch_max_batch times to insert 8712*bba2c361STejun Heo * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the 8713*bba2c361STejun Heo * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the 8714*bba2c361STejun Heo * counter. 
8715*bba2c361STejun Heo * 8716*bba2c361STejun Heo * This function doesn't have any locking restrictions and may be called under 8717*bba2c361STejun Heo * BPF locks (in the future when BPF introduces more flexible locking). 8718*bba2c361STejun Heo * 8719*bba2c361STejun Heo * @p is allowed to run for @slice. The scheduling path is triggered on slice 8720*bba2c361STejun Heo * exhaustion. If zero, the current residual slice is maintained. If 8721*bba2c361STejun Heo * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with 8722*bba2c361STejun Heo * scx_bpf_kick_cpu() to trigger scheduling. 8723*bba2c361STejun Heo * 8724*bba2c361STejun Heo * Returns %true on successful insertion, %false on failure. On the root 8725*bba2c361STejun Heo * scheduler, %false return triggers scheduler abort and the caller doesn't need 8726*bba2c361STejun Heo * to check the return value. 8727*bba2c361STejun Heo */ 8728*bba2c361STejun Heo __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, 8729*bba2c361STejun Heo u64 slice, u64 enq_flags, 8730*bba2c361STejun Heo const struct bpf_prog_aux *aux) 8731*bba2c361STejun Heo { 8732*bba2c361STejun Heo struct scx_sched *sch; 8733*bba2c361STejun Heo 8734*bba2c361STejun Heo guard(rcu)(); 8735*bba2c361STejun Heo sch = scx_prog_sched(aux); 8736*bba2c361STejun Heo if (unlikely(!sch)) 8737*bba2c361STejun Heo return false; 8738*bba2c361STejun Heo 8739*bba2c361STejun Heo if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) 8740*bba2c361STejun Heo return false; 8741*bba2c361STejun Heo 8742*bba2c361STejun Heo if (slice) 8743*bba2c361STejun Heo p->scx.slice = slice; 8744*bba2c361STejun Heo else 8745*bba2c361STejun Heo p->scx.slice = p->scx.slice ?: 1; 8746*bba2c361STejun Heo 8747*bba2c361STejun Heo scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); 8748*bba2c361STejun Heo 8749*bba2c361STejun Heo return true; 8750*bba2c361STejun Heo } 8751*bba2c361STejun Heo 8752*bba2c361STejun Heo /* 8753*bba2c361STejun Heo * COMPAT: 
Will be removed in v6.23 along with the ___v2 suffix. 8754*bba2c361STejun Heo */ 8755*bba2c361STejun Heo __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, 8756*bba2c361STejun Heo u64 slice, u64 enq_flags, 8757*bba2c361STejun Heo const struct bpf_prog_aux *aux) 8758*bba2c361STejun Heo { 8759*bba2c361STejun Heo scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags, aux); 8760*bba2c361STejun Heo } 8761*bba2c361STejun Heo 8762*bba2c361STejun Heo static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, 8763*bba2c361STejun Heo u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) 8764*bba2c361STejun Heo { 8765*bba2c361STejun Heo if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) 8766*bba2c361STejun Heo return false; 8767*bba2c361STejun Heo 8768*bba2c361STejun Heo if (slice) 8769*bba2c361STejun Heo p->scx.slice = slice; 8770*bba2c361STejun Heo else 8771*bba2c361STejun Heo p->scx.slice = p->scx.slice ?: 1; 8772*bba2c361STejun Heo 8773*bba2c361STejun Heo p->scx.dsq_vtime = vtime; 8774*bba2c361STejun Heo 8775*bba2c361STejun Heo scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 8776*bba2c361STejun Heo 8777*bba2c361STejun Heo return true; 8778*bba2c361STejun Heo } 8779*bba2c361STejun Heo 8780*bba2c361STejun Heo struct scx_bpf_dsq_insert_vtime_args { 8781*bba2c361STejun Heo /* @p can't be packed together as KF_RCU is not transitive */ 8782*bba2c361STejun Heo u64 dsq_id; 8783*bba2c361STejun Heo u64 slice; 8784*bba2c361STejun Heo u64 vtime; 8785*bba2c361STejun Heo u64 enq_flags; 8786*bba2c361STejun Heo }; 8787*bba2c361STejun Heo 8788*bba2c361STejun Heo /** 8789*bba2c361STejun Heo * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion 8790*bba2c361STejun Heo * @p: task_struct to insert 8791*bba2c361STejun Heo * @args: struct containing the rest of the arguments 8792*bba2c361STejun Heo * @args->dsq_id: DSQ to insert into 8793*bba2c361STejun Heo * @args->slice: duration @p can run for in nsecs, 0 to keep the 
current value 8794*bba2c361STejun Heo * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ 8795*bba2c361STejun Heo * @args->enq_flags: SCX_ENQ_* 8796*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8797*bba2c361STejun Heo * 8798*bba2c361STejun Heo * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument 8799*bba2c361STejun Heo * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided 8800*bba2c361STejun Heo * as an inline wrapper in common.bpf.h. 8801*bba2c361STejun Heo * 8802*bba2c361STejun Heo * Insert @p into the vtime priority queue of the DSQ identified by 8803*bba2c361STejun Heo * @args->dsq_id. Tasks queued into the priority queue are ordered by 8804*bba2c361STejun Heo * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert(). 8805*bba2c361STejun Heo * 8806*bba2c361STejun Heo * @args->vtime ordering is according to time_before64() which considers 8807*bba2c361STejun Heo * wrapping. A numerically larger vtime may indicate an earlier position in the 8808*bba2c361STejun Heo * ordering and vice-versa. 8809*bba2c361STejun Heo * 8810*bba2c361STejun Heo * A DSQ can only be used as a FIFO or priority queue at any given time and this 8811*bba2c361STejun Heo * function must not be called on a DSQ which already has one or more FIFO tasks 8812*bba2c361STejun Heo * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and 8813*bba2c361STejun Heo * SCX_DSQ_GLOBAL) cannot be used as priority queues. 8814*bba2c361STejun Heo * 8815*bba2c361STejun Heo * Returns %true on successful insertion, %false on failure. On the root 8816*bba2c361STejun Heo * scheduler, %false return triggers scheduler abort and the caller doesn't need 8817*bba2c361STejun Heo * to check the return value. 
8818*bba2c361STejun Heo */ 8819*bba2c361STejun Heo __bpf_kfunc bool 8820*bba2c361STejun Heo __scx_bpf_dsq_insert_vtime(struct task_struct *p, 8821*bba2c361STejun Heo struct scx_bpf_dsq_insert_vtime_args *args, 8822*bba2c361STejun Heo const struct bpf_prog_aux *aux) 8823*bba2c361STejun Heo { 8824*bba2c361STejun Heo struct scx_sched *sch; 8825*bba2c361STejun Heo 8826*bba2c361STejun Heo guard(rcu)(); 8827*bba2c361STejun Heo 8828*bba2c361STejun Heo sch = scx_prog_sched(aux); 8829*bba2c361STejun Heo if (unlikely(!sch)) 8830*bba2c361STejun Heo return false; 8831*bba2c361STejun Heo 8832*bba2c361STejun Heo return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice, 8833*bba2c361STejun Heo args->vtime, args->enq_flags); 8834*bba2c361STejun Heo } 8835*bba2c361STejun Heo 8836*bba2c361STejun Heo /* 8837*bba2c361STejun Heo * COMPAT: Will be removed in v6.23. 8838*bba2c361STejun Heo */ 8839*bba2c361STejun Heo __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, 8840*bba2c361STejun Heo u64 slice, u64 vtime, u64 enq_flags) 8841*bba2c361STejun Heo { 8842*bba2c361STejun Heo struct scx_sched *sch; 8843*bba2c361STejun Heo 8844*bba2c361STejun Heo guard(rcu)(); 8845*bba2c361STejun Heo 8846*bba2c361STejun Heo sch = rcu_dereference(scx_root); 8847*bba2c361STejun Heo if (unlikely(!sch)) 8848*bba2c361STejun Heo return; 8849*bba2c361STejun Heo 8850*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 8851*bba2c361STejun Heo /* 8852*bba2c361STejun Heo * Disallow if any sub-scheds are attached. There is no way to tell 8853*bba2c361STejun Heo * which scheduler called us, just error out @p's scheduler. 
8854*bba2c361STejun Heo */ 8855*bba2c361STejun Heo if (unlikely(!list_empty(&sch->children))) { 8856*bba2c361STejun Heo scx_error(scx_task_sched(p), "__scx_bpf_dsq_insert_vtime() must be used"); 8857*bba2c361STejun Heo return; 8858*bba2c361STejun Heo } 8859*bba2c361STejun Heo #endif 8860*bba2c361STejun Heo 8861*bba2c361STejun Heo scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags); 8862*bba2c361STejun Heo } 8863*bba2c361STejun Heo 8864*bba2c361STejun Heo __bpf_kfunc_end_defs(); 8865*bba2c361STejun Heo 8866*bba2c361STejun Heo BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) 8867*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_IMPLICIT_ARGS | KF_RCU) 8868*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_IMPLICIT_ARGS | KF_RCU) 8869*bba2c361STejun Heo BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_IMPLICIT_ARGS | KF_RCU) 8870*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) 8871*bba2c361STejun Heo BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) 8872*bba2c361STejun Heo 8873*bba2c361STejun Heo static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { 8874*bba2c361STejun Heo .owner = THIS_MODULE, 8875*bba2c361STejun Heo .set = &scx_kfunc_ids_enqueue_dispatch, 8876*bba2c361STejun Heo .filter = scx_kfunc_context_filter, 8877*bba2c361STejun Heo }; 8878*bba2c361STejun Heo 8879*bba2c361STejun Heo static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, 8880*bba2c361STejun Heo struct task_struct *p, u64 dsq_id, u64 enq_flags) 8881*bba2c361STejun Heo { 8882*bba2c361STejun Heo struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; 8883*bba2c361STejun Heo struct scx_sched *sch; 8884*bba2c361STejun Heo struct rq *this_rq, *src_rq, *locked_rq; 8885*bba2c361STejun Heo bool dispatched = false; 8886*bba2c361STejun Heo bool in_balance; 8887*bba2c361STejun Heo unsigned long flags; 8888*bba2c361STejun Heo 8889*bba2c361STejun Heo /* 8890*bba2c361STejun Heo * The verifier considers an iterator slot initialized 
on any 8891*bba2c361STejun Heo * KF_ITER_NEW return, so a BPF program may legally reach here after 8892*bba2c361STejun Heo * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL. 8893*bba2c361STejun Heo */ 8894*bba2c361STejun Heo if (unlikely(!src_dsq)) 8895*bba2c361STejun Heo return false; 8896*bba2c361STejun Heo 8897*bba2c361STejun Heo sch = src_dsq->sched; 8898*bba2c361STejun Heo 8899*bba2c361STejun Heo if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags)) 8900*bba2c361STejun Heo return false; 8901*bba2c361STejun Heo 8902*bba2c361STejun Heo /* 8903*bba2c361STejun Heo * If the BPF scheduler keeps calling this function repeatedly, it can 8904*bba2c361STejun Heo * cause similar live-lock conditions as consume_dispatch_q(). 8905*bba2c361STejun Heo */ 8906*bba2c361STejun Heo if (unlikely(READ_ONCE(sch->aborting))) 8907*bba2c361STejun Heo return false; 8908*bba2c361STejun Heo 8909*bba2c361STejun Heo if (unlikely(!scx_task_on_sched(sch, p))) { 8910*bba2c361STejun Heo scx_error(sch, "scx_bpf_dsq_move[_vtime]() on %s[%d] but the task belongs to a different scheduler", 8911*bba2c361STejun Heo p->comm, p->pid); 8912*bba2c361STejun Heo return false; 8913*bba2c361STejun Heo } 8914*bba2c361STejun Heo 8915*bba2c361STejun Heo /* 8916*bba2c361STejun Heo * Can be called from either ops.dispatch() locking this_rq() or any 8917*bba2c361STejun Heo * context where no rq lock is held. If latter, lock @p's task_rq which 8918*bba2c361STejun Heo * we'll likely need anyway. 
8919*bba2c361STejun Heo */ 8920*bba2c361STejun Heo src_rq = task_rq(p); 8921*bba2c361STejun Heo 8922*bba2c361STejun Heo local_irq_save(flags); 8923*bba2c361STejun Heo this_rq = this_rq(); 8924*bba2c361STejun Heo in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; 8925*bba2c361STejun Heo 8926*bba2c361STejun Heo if (in_balance) { 8927*bba2c361STejun Heo if (this_rq != src_rq) { 8928*bba2c361STejun Heo raw_spin_rq_unlock(this_rq); 8929*bba2c361STejun Heo raw_spin_rq_lock(src_rq); 8930*bba2c361STejun Heo } 8931*bba2c361STejun Heo } else { 8932*bba2c361STejun Heo raw_spin_rq_lock(src_rq); 8933*bba2c361STejun Heo } 8934*bba2c361STejun Heo 8935*bba2c361STejun Heo locked_rq = src_rq; 8936*bba2c361STejun Heo raw_spin_lock(&src_dsq->lock); 8937*bba2c361STejun Heo 8938*bba2c361STejun Heo /* did someone else get to it while we dropped the locks? */ 8939*bba2c361STejun Heo if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) { 8940*bba2c361STejun Heo raw_spin_unlock(&src_dsq->lock); 8941*bba2c361STejun Heo goto out; 8942*bba2c361STejun Heo } 8943*bba2c361STejun Heo 8944*bba2c361STejun Heo /* @p is still on $src_dsq and stable, determine the destination */ 8945*bba2c361STejun Heo dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, task_cpu(p)); 8946*bba2c361STejun Heo 8947*bba2c361STejun Heo /* 8948*bba2c361STejun Heo * Apply vtime and slice updates before moving so that the new time is 8949*bba2c361STejun Heo * visible before inserting into $dst_dsq. @p is still on $src_dsq but 8950*bba2c361STejun Heo * this is safe as we're locking it. 
8951*bba2c361STejun Heo */ 8952*bba2c361STejun Heo if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) 8953*bba2c361STejun Heo p->scx.dsq_vtime = kit->vtime; 8954*bba2c361STejun Heo if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) 8955*bba2c361STejun Heo p->scx.slice = kit->slice; 8956*bba2c361STejun Heo 8957*bba2c361STejun Heo /* execute move */ 8958*bba2c361STejun Heo locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq); 8959*bba2c361STejun Heo dispatched = true; 8960*bba2c361STejun Heo out: 8961*bba2c361STejun Heo if (in_balance) { 8962*bba2c361STejun Heo if (this_rq != locked_rq) { 8963*bba2c361STejun Heo raw_spin_rq_unlock(locked_rq); 8964*bba2c361STejun Heo raw_spin_rq_lock(this_rq); 8965*bba2c361STejun Heo } 8966*bba2c361STejun Heo } else { 8967*bba2c361STejun Heo raw_spin_rq_unlock_irqrestore(locked_rq, flags); 8968*bba2c361STejun Heo } 8969*bba2c361STejun Heo 8970*bba2c361STejun Heo kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | 8971*bba2c361STejun Heo __SCX_DSQ_ITER_HAS_VTIME); 8972*bba2c361STejun Heo return dispatched; 8973*bba2c361STejun Heo } 8974*bba2c361STejun Heo 8975*bba2c361STejun Heo __bpf_kfunc_start_defs(); 8976*bba2c361STejun Heo 8977*bba2c361STejun Heo /** 8978*bba2c361STejun Heo * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots 8979*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8980*bba2c361STejun Heo * 8981*bba2c361STejun Heo * Can only be called from ops.dispatch(). 
8982*bba2c361STejun Heo */ 8983*bba2c361STejun Heo __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux) 8984*bba2c361STejun Heo { 8985*bba2c361STejun Heo struct scx_sched *sch; 8986*bba2c361STejun Heo 8987*bba2c361STejun Heo guard(rcu)(); 8988*bba2c361STejun Heo 8989*bba2c361STejun Heo sch = scx_prog_sched(aux); 8990*bba2c361STejun Heo if (unlikely(!sch)) 8991*bba2c361STejun Heo return 0; 8992*bba2c361STejun Heo 8993*bba2c361STejun Heo return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor); 8994*bba2c361STejun Heo } 8995*bba2c361STejun Heo 8996*bba2c361STejun Heo /** 8997*bba2c361STejun Heo * scx_bpf_dispatch_cancel - Cancel the latest dispatch 8998*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 8999*bba2c361STejun Heo * 9000*bba2c361STejun Heo * Cancel the latest dispatch. Can be called multiple times to cancel further 9001*bba2c361STejun Heo * dispatches. Can only be called from ops.dispatch(). 9002*bba2c361STejun Heo */ 9003*bba2c361STejun Heo __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux) 9004*bba2c361STejun Heo { 9005*bba2c361STejun Heo struct scx_sched *sch; 9006*bba2c361STejun Heo struct scx_dsp_ctx *dspc; 9007*bba2c361STejun Heo 9008*bba2c361STejun Heo guard(rcu)(); 9009*bba2c361STejun Heo 9010*bba2c361STejun Heo sch = scx_prog_sched(aux); 9011*bba2c361STejun Heo if (unlikely(!sch)) 9012*bba2c361STejun Heo return; 9013*bba2c361STejun Heo 9014*bba2c361STejun Heo dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 9015*bba2c361STejun Heo 9016*bba2c361STejun Heo if (dspc->cursor > 0) 9017*bba2c361STejun Heo dspc->cursor--; 9018*bba2c361STejun Heo else 9019*bba2c361STejun Heo scx_error(sch, "dispatch buffer underflow"); 9020*bba2c361STejun Heo } 9021*bba2c361STejun Heo 9022*bba2c361STejun Heo /** 9023*bba2c361STejun Heo * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ 9024*bba2c361STejun Heo * @dsq_id: DSQ to move task from. 
Must be a user-created DSQ
 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
 * @enq_flags: %SCX_ENQ_*
 *
 * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
 * local DSQ for execution with @enq_flags applied. Can only be called from
 * ops.dispatch().
 *
 * Built-in DSQs (%SCX_DSQ_GLOBAL and %SCX_DSQ_LOCAL*) are not supported as
 * sources. Local DSQs support reenqueueing (a task can be picked up for
 * execution, dequeued for property changes, or reenqueued), but the BPF
 * scheduler cannot directly iterate or move tasks from them. %SCX_DSQ_GLOBAL
 * is similar but also doesn't support reenqueueing, as it maps to multiple
 * per-node DSQs making the scope difficult to define; this may change in the
 * future.
 *
 * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
 * before trying to move from the specified DSQ. It may also grab rq locks and
 * thus can't be called under any BPF locks.
 *
 * Returns %true if a task has been moved, %false if there isn't any task to
 * move.
9046*bba2c361STejun Heo */ 9047*bba2c361STejun Heo __bpf_kfunc bool scx_bpf_dsq_move_to_local___v2(u64 dsq_id, u64 enq_flags, 9048*bba2c361STejun Heo const struct bpf_prog_aux *aux) 9049*bba2c361STejun Heo { 9050*bba2c361STejun Heo struct scx_dispatch_q *dsq; 9051*bba2c361STejun Heo struct scx_sched *sch; 9052*bba2c361STejun Heo struct scx_dsp_ctx *dspc; 9053*bba2c361STejun Heo 9054*bba2c361STejun Heo guard(rcu)(); 9055*bba2c361STejun Heo 9056*bba2c361STejun Heo sch = scx_prog_sched(aux); 9057*bba2c361STejun Heo if (unlikely(!sch)) 9058*bba2c361STejun Heo return false; 9059*bba2c361STejun Heo 9060*bba2c361STejun Heo if (!scx_vet_enq_flags(sch, SCX_DSQ_LOCAL, &enq_flags)) 9061*bba2c361STejun Heo return false; 9062*bba2c361STejun Heo 9063*bba2c361STejun Heo dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; 9064*bba2c361STejun Heo 9065*bba2c361STejun Heo flush_dispatch_buf(sch, dspc->rq); 9066*bba2c361STejun Heo 9067*bba2c361STejun Heo dsq = find_user_dsq(sch, dsq_id); 9068*bba2c361STejun Heo if (unlikely(!dsq)) { 9069*bba2c361STejun Heo scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id); 9070*bba2c361STejun Heo return false; 9071*bba2c361STejun Heo } 9072*bba2c361STejun Heo 9073*bba2c361STejun Heo if (consume_dispatch_q(sch, dspc->rq, dsq, enq_flags)) { 9074*bba2c361STejun Heo /* 9075*bba2c361STejun Heo * A successfully consumed task can be dequeued before it starts 9076*bba2c361STejun Heo * running while the CPU is trying to migrate other dispatched 9077*bba2c361STejun Heo * tasks. Bump nr_tasks to tell balance_one() to retry on empty 9078*bba2c361STejun Heo * local DSQ. 9079*bba2c361STejun Heo */ 9080*bba2c361STejun Heo dspc->nr_tasks++; 9081*bba2c361STejun Heo return true; 9082*bba2c361STejun Heo } else { 9083*bba2c361STejun Heo return false; 9084*bba2c361STejun Heo } 9085*bba2c361STejun Heo } 9086*bba2c361STejun Heo 9087*bba2c361STejun Heo /* 9088*bba2c361STejun Heo * COMPAT: ___v2 was introduced in v7.1. Remove this and ___v2 tag in the future. 
9089*bba2c361STejun Heo */ 9090*bba2c361STejun Heo __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux) 9091*bba2c361STejun Heo { 9092*bba2c361STejun Heo return scx_bpf_dsq_move_to_local___v2(dsq_id, 0, aux); 9093*bba2c361STejun Heo } 9094*bba2c361STejun Heo 9095*bba2c361STejun Heo /** 9096*bba2c361STejun Heo * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs 9097*bba2c361STejun Heo * @it__iter: DSQ iterator in progress 9098*bba2c361STejun Heo * @slice: duration the moved task can run for in nsecs 9099*bba2c361STejun Heo * 9100*bba2c361STejun Heo * Override the slice of the next task that will be moved from @it__iter using 9101*bba2c361STejun Heo * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous 9102*bba2c361STejun Heo * slice duration is kept. 9103*bba2c361STejun Heo */ 9104*bba2c361STejun Heo __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, 9105*bba2c361STejun Heo u64 slice) 9106*bba2c361STejun Heo { 9107*bba2c361STejun Heo struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 9108*bba2c361STejun Heo 9109*bba2c361STejun Heo kit->slice = slice; 9110*bba2c361STejun Heo kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; 9111*bba2c361STejun Heo } 9112*bba2c361STejun Heo 9113*bba2c361STejun Heo /** 9114*bba2c361STejun Heo * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs 9115*bba2c361STejun Heo * @it__iter: DSQ iterator in progress 9116*bba2c361STejun Heo * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ 9117*bba2c361STejun Heo * 9118*bba2c361STejun Heo * Override the vtime of the next task that will be moved from @it__iter using 9119*bba2c361STejun Heo * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice 9120*bba2c361STejun Heo * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the 9121*bba2c361STejun Heo * override is ignored and cleared. 
9122*bba2c361STejun Heo */ 9123*bba2c361STejun Heo __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, 9124*bba2c361STejun Heo u64 vtime) 9125*bba2c361STejun Heo { 9126*bba2c361STejun Heo struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; 9127*bba2c361STejun Heo 9128*bba2c361STejun Heo kit->vtime = vtime; 9129*bba2c361STejun Heo kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; 9130*bba2c361STejun Heo } 9131*bba2c361STejun Heo 9132*bba2c361STejun Heo /** 9133*bba2c361STejun Heo * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ 9134*bba2c361STejun Heo * @it__iter: DSQ iterator in progress 9135*bba2c361STejun Heo * @p: task to transfer 9136*bba2c361STejun Heo * @dsq_id: DSQ to move @p to 9137*bba2c361STejun Heo * @enq_flags: SCX_ENQ_* 9138*bba2c361STejun Heo * 9139*bba2c361STejun Heo * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ 9140*bba2c361STejun Heo * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can 9141*bba2c361STejun Heo * be the destination. 9142*bba2c361STejun Heo * 9143*bba2c361STejun Heo * For the transfer to be successful, @p must still be on the DSQ and have been 9144*bba2c361STejun Heo * queued before the DSQ iteration started. This function doesn't care whether 9145*bba2c361STejun Heo * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have 9146*bba2c361STejun Heo * been queued before the iteration started. 9147*bba2c361STejun Heo * 9148*bba2c361STejun Heo * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update. 9149*bba2c361STejun Heo * 9150*bba2c361STejun Heo * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq 9151*bba2c361STejun Heo * lock (e.g. BPF timers or SYSCALL programs). 
9152*bba2c361STejun Heo * 9153*bba2c361STejun Heo * Returns %true if @p has been consumed, %false if @p had already been 9154*bba2c361STejun Heo * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local 9155*bba2c361STejun Heo * DSQ. 9156*bba2c361STejun Heo */ 9157*bba2c361STejun Heo __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, 9158*bba2c361STejun Heo struct task_struct *p, u64 dsq_id, 9159*bba2c361STejun Heo u64 enq_flags) 9160*bba2c361STejun Heo { 9161*bba2c361STejun Heo return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 9162*bba2c361STejun Heo p, dsq_id, enq_flags); 9163*bba2c361STejun Heo } 9164*bba2c361STejun Heo 9165*bba2c361STejun Heo /** 9166*bba2c361STejun Heo * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ 9167*bba2c361STejun Heo * @it__iter: DSQ iterator in progress 9168*bba2c361STejun Heo * @p: task to transfer 9169*bba2c361STejun Heo * @dsq_id: DSQ to move @p to 9170*bba2c361STejun Heo * @enq_flags: SCX_ENQ_* 9171*bba2c361STejun Heo * 9172*bba2c361STejun Heo * Transfer @p which is on the DSQ currently iterated by @it__iter to the 9173*bba2c361STejun Heo * priority queue of the DSQ specified by @dsq_id. The destination must be a 9174*bba2c361STejun Heo * user DSQ as only user DSQs support priority queue. 9175*bba2c361STejun Heo * 9176*bba2c361STejun Heo * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice() 9177*bba2c361STejun Heo * and scx_bpf_dsq_move_set_vtime() to update. 9178*bba2c361STejun Heo * 9179*bba2c361STejun Heo * All other aspects are identical to scx_bpf_dsq_move(). See 9180*bba2c361STejun Heo * scx_bpf_dsq_insert_vtime() for more information on @vtime. 
9181*bba2c361STejun Heo */ 9182*bba2c361STejun Heo __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, 9183*bba2c361STejun Heo struct task_struct *p, u64 dsq_id, 9184*bba2c361STejun Heo u64 enq_flags) 9185*bba2c361STejun Heo { 9186*bba2c361STejun Heo return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, 9187*bba2c361STejun Heo p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 9188*bba2c361STejun Heo } 9189*bba2c361STejun Heo 9190*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 9191*bba2c361STejun Heo /** 9192*bba2c361STejun Heo * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler 9193*bba2c361STejun Heo * @cgroup_id: cgroup ID of the child scheduler to dispatch 9194*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9195*bba2c361STejun Heo * 9196*bba2c361STejun Heo * Allows a parent scheduler to trigger dispatching on one of its direct 9197*bba2c361STejun Heo * child schedulers. The child scheduler runs its dispatch operation to 9198*bba2c361STejun Heo * move tasks from dispatch queues to the local runqueue. 9199*bba2c361STejun Heo * 9200*bba2c361STejun Heo * Returns: true on success, false if cgroup_id is invalid, not a direct 9201*bba2c361STejun Heo * child, or caller lacks dispatch permission. 
9202*bba2c361STejun Heo */ 9203*bba2c361STejun Heo __bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux) 9204*bba2c361STejun Heo { 9205*bba2c361STejun Heo struct rq *this_rq = this_rq(); 9206*bba2c361STejun Heo struct scx_sched *parent, *child; 9207*bba2c361STejun Heo 9208*bba2c361STejun Heo guard(rcu)(); 9209*bba2c361STejun Heo parent = scx_prog_sched(aux); 9210*bba2c361STejun Heo if (unlikely(!parent)) 9211*bba2c361STejun Heo return false; 9212*bba2c361STejun Heo 9213*bba2c361STejun Heo child = scx_find_sub_sched(cgroup_id); 9214*bba2c361STejun Heo 9215*bba2c361STejun Heo if (unlikely(!child)) 9216*bba2c361STejun Heo return false; 9217*bba2c361STejun Heo 9218*bba2c361STejun Heo if (unlikely(scx_parent(child) != parent)) { 9219*bba2c361STejun Heo scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu", 9220*bba2c361STejun Heo cgroup_id); 9221*bba2c361STejun Heo return false; 9222*bba2c361STejun Heo } 9223*bba2c361STejun Heo 9224*bba2c361STejun Heo return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev, 9225*bba2c361STejun Heo true); 9226*bba2c361STejun Heo } 9227*bba2c361STejun Heo #endif /* CONFIG_EXT_SUB_SCHED */ 9228*bba2c361STejun Heo 9229*bba2c361STejun Heo __bpf_kfunc_end_defs(); 9230*bba2c361STejun Heo 9231*bba2c361STejun Heo BTF_KFUNCS_START(scx_kfunc_ids_dispatch) 9232*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots, KF_IMPLICIT_ARGS) 9233*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel, KF_IMPLICIT_ARGS) 9234*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local, KF_IMPLICIT_ARGS) 9235*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local___v2, KF_IMPLICIT_ARGS) 9236*bba2c361STejun Heo /* scx_bpf_dsq_move*() also in scx_kfunc_ids_unlocked: callable from unlocked contexts */ 9237*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 9238*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 
9239*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 9240*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 9241*bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED 9242*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS) 9243*bba2c361STejun Heo #endif 9244*bba2c361STejun Heo BTF_KFUNCS_END(scx_kfunc_ids_dispatch) 9245*bba2c361STejun Heo 9246*bba2c361STejun Heo static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { 9247*bba2c361STejun Heo .owner = THIS_MODULE, 9248*bba2c361STejun Heo .set = &scx_kfunc_ids_dispatch, 9249*bba2c361STejun Heo .filter = scx_kfunc_context_filter, 9250*bba2c361STejun Heo }; 9251*bba2c361STejun Heo 9252*bba2c361STejun Heo __bpf_kfunc_start_defs(); 9253*bba2c361STejun Heo 9254*bba2c361STejun Heo /** 9255*bba2c361STejun Heo * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 9256*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9257*bba2c361STejun Heo * 9258*bba2c361STejun Heo * Iterate over all of the tasks currently enqueued on the local DSQ of the 9259*bba2c361STejun Heo * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of 9260*bba2c361STejun Heo * processed tasks. Can only be called from ops.cpu_release(). 
9261*bba2c361STejun Heo */ 9262*bba2c361STejun Heo __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux) 9263*bba2c361STejun Heo { 9264*bba2c361STejun Heo struct scx_sched *sch; 9265*bba2c361STejun Heo struct rq *rq; 9266*bba2c361STejun Heo 9267*bba2c361STejun Heo guard(rcu)(); 9268*bba2c361STejun Heo sch = scx_prog_sched(aux); 9269*bba2c361STejun Heo if (unlikely(!sch)) 9270*bba2c361STejun Heo return 0; 9271*bba2c361STejun Heo 9272*bba2c361STejun Heo rq = cpu_rq(smp_processor_id()); 9273*bba2c361STejun Heo lockdep_assert_rq_held(rq); 9274*bba2c361STejun Heo 9275*bba2c361STejun Heo return reenq_local(sch, rq, SCX_REENQ_ANY); 9276*bba2c361STejun Heo } 9277*bba2c361STejun Heo 9278*bba2c361STejun Heo __bpf_kfunc_end_defs(); 9279*bba2c361STejun Heo 9280*bba2c361STejun Heo BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) 9281*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_reenqueue_local, KF_IMPLICIT_ARGS) 9282*bba2c361STejun Heo BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) 9283*bba2c361STejun Heo 9284*bba2c361STejun Heo static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { 9285*bba2c361STejun Heo .owner = THIS_MODULE, 9286*bba2c361STejun Heo .set = &scx_kfunc_ids_cpu_release, 9287*bba2c361STejun Heo .filter = scx_kfunc_context_filter, 9288*bba2c361STejun Heo }; 9289*bba2c361STejun Heo 9290*bba2c361STejun Heo __bpf_kfunc_start_defs(); 9291*bba2c361STejun Heo 9292*bba2c361STejun Heo /** 9293*bba2c361STejun Heo * scx_bpf_create_dsq - Create a custom DSQ 9294*bba2c361STejun Heo * @dsq_id: DSQ to create 9295*bba2c361STejun Heo * @node: NUMA node to allocate from 9296*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9297*bba2c361STejun Heo * 9298*bba2c361STejun Heo * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable 9299*bba2c361STejun Heo * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. 
9300*bba2c361STejun Heo */ 9301*bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_aux *aux) 9302*bba2c361STejun Heo { 9303*bba2c361STejun Heo struct scx_dispatch_q *dsq; 9304*bba2c361STejun Heo struct scx_sched *sch; 9305*bba2c361STejun Heo s32 ret; 9306*bba2c361STejun Heo 9307*bba2c361STejun Heo if (unlikely(node >= (int)nr_node_ids || 9308*bba2c361STejun Heo (node < 0 && node != NUMA_NO_NODE))) 9309*bba2c361STejun Heo return -EINVAL; 9310*bba2c361STejun Heo 9311*bba2c361STejun Heo if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) 9312*bba2c361STejun Heo return -EINVAL; 9313*bba2c361STejun Heo 9314*bba2c361STejun Heo dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); 9315*bba2c361STejun Heo if (!dsq) 9316*bba2c361STejun Heo return -ENOMEM; 9317*bba2c361STejun Heo 9318*bba2c361STejun Heo /* 9319*bba2c361STejun Heo * init_dsq() must be called in GFP_KERNEL context. Init it with NULL 9320*bba2c361STejun Heo * @sch and update afterwards. 9321*bba2c361STejun Heo */ 9322*bba2c361STejun Heo ret = init_dsq(dsq, dsq_id, NULL); 9323*bba2c361STejun Heo if (ret) { 9324*bba2c361STejun Heo kfree(dsq); 9325*bba2c361STejun Heo return ret; 9326*bba2c361STejun Heo } 9327*bba2c361STejun Heo 9328*bba2c361STejun Heo rcu_read_lock(); 9329*bba2c361STejun Heo 9330*bba2c361STejun Heo sch = scx_prog_sched(aux); 9331*bba2c361STejun Heo if (sch) { 9332*bba2c361STejun Heo dsq->sched = sch; 9333*bba2c361STejun Heo ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, 9334*bba2c361STejun Heo dsq_hash_params); 9335*bba2c361STejun Heo } else { 9336*bba2c361STejun Heo ret = -ENODEV; 9337*bba2c361STejun Heo } 9338*bba2c361STejun Heo 9339*bba2c361STejun Heo rcu_read_unlock(); 9340*bba2c361STejun Heo if (ret) { 9341*bba2c361STejun Heo exit_dsq(dsq); 9342*bba2c361STejun Heo kfree(dsq); 9343*bba2c361STejun Heo } 9344*bba2c361STejun Heo return ret; 9345*bba2c361STejun Heo } 9346*bba2c361STejun Heo 9347*bba2c361STejun Heo __bpf_kfunc_end_defs(); 
9348*bba2c361STejun Heo 9349*bba2c361STejun Heo BTF_KFUNCS_START(scx_kfunc_ids_unlocked) 9350*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_IMPLICIT_ARGS | KF_SLEEPABLE) 9351*bba2c361STejun Heo /* also in scx_kfunc_ids_dispatch: also callable from ops.dispatch() */ 9352*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 9353*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 9354*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 9355*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 9356*bba2c361STejun Heo /* also in scx_kfunc_ids_select_cpu: also callable from ops.select_cpu()/ops.enqueue() */ 9357*bba2c361STejun Heo BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) 9358*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) 9359*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) 9360*bba2c361STejun Heo BTF_KFUNCS_END(scx_kfunc_ids_unlocked) 9361*bba2c361STejun Heo 9362*bba2c361STejun Heo static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { 9363*bba2c361STejun Heo .owner = THIS_MODULE, 9364*bba2c361STejun Heo .set = &scx_kfunc_ids_unlocked, 9365*bba2c361STejun Heo .filter = scx_kfunc_context_filter, 9366*bba2c361STejun Heo }; 9367*bba2c361STejun Heo 9368*bba2c361STejun Heo __bpf_kfunc_start_defs(); 9369*bba2c361STejun Heo 9370*bba2c361STejun Heo /** 9371*bba2c361STejun Heo * scx_bpf_task_set_slice - Set task's time slice 9372*bba2c361STejun Heo * @p: task of interest 9373*bba2c361STejun Heo * @slice: time slice to set in nsecs 9374*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9375*bba2c361STejun Heo * 9376*bba2c361STejun Heo * Set @p's time slice to @slice. Returns %true on success, %false if the 9377*bba2c361STejun Heo * calling scheduler doesn't have authority over @p. 
9378*bba2c361STejun Heo */ 9379*bba2c361STejun Heo __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, 9380*bba2c361STejun Heo const struct bpf_prog_aux *aux) 9381*bba2c361STejun Heo { 9382*bba2c361STejun Heo struct scx_sched *sch; 9383*bba2c361STejun Heo 9384*bba2c361STejun Heo guard(rcu)(); 9385*bba2c361STejun Heo sch = scx_prog_sched(aux); 9386*bba2c361STejun Heo if (unlikely(!sch || !scx_task_on_sched(sch, p))) 9387*bba2c361STejun Heo return false; 9388*bba2c361STejun Heo 9389*bba2c361STejun Heo p->scx.slice = slice; 9390*bba2c361STejun Heo return true; 9391*bba2c361STejun Heo } 9392*bba2c361STejun Heo 9393*bba2c361STejun Heo /** 9394*bba2c361STejun Heo * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering 9395*bba2c361STejun Heo * @p: task of interest 9396*bba2c361STejun Heo * @vtime: virtual time to set 9397*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9398*bba2c361STejun Heo * 9399*bba2c361STejun Heo * Set @p's virtual time to @vtime. Returns %true on success, %false if the 9400*bba2c361STejun Heo * calling scheduler doesn't have authority over @p. 
9401*bba2c361STejun Heo */ 9402*bba2c361STejun Heo __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, 9403*bba2c361STejun Heo const struct bpf_prog_aux *aux) 9404*bba2c361STejun Heo { 9405*bba2c361STejun Heo struct scx_sched *sch; 9406*bba2c361STejun Heo 9407*bba2c361STejun Heo guard(rcu)(); 9408*bba2c361STejun Heo sch = scx_prog_sched(aux); 9409*bba2c361STejun Heo if (unlikely(!sch || !scx_task_on_sched(sch, p))) 9410*bba2c361STejun Heo return false; 9411*bba2c361STejun Heo 9412*bba2c361STejun Heo p->scx.dsq_vtime = vtime; 9413*bba2c361STejun Heo return true; 9414*bba2c361STejun Heo } 9415*bba2c361STejun Heo 9416*bba2c361STejun Heo static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) 9417*bba2c361STejun Heo { 9418*bba2c361STejun Heo struct rq *this_rq; 9419*bba2c361STejun Heo unsigned long irq_flags; 9420*bba2c361STejun Heo 9421*bba2c361STejun Heo local_irq_save(irq_flags); 9422*bba2c361STejun Heo 9423*bba2c361STejun Heo this_rq = this_rq(); 9424*bba2c361STejun Heo 9425*bba2c361STejun Heo /* 9426*bba2c361STejun Heo * While bypassing for PM ops, IRQ handling may not be online which can 9427*bba2c361STejun Heo * lead to irq_work_queue() malfunction such as infinite busy wait for 9428*bba2c361STejun Heo * IRQ status update. Suppress kicking. 9429*bba2c361STejun Heo */ 9430*bba2c361STejun Heo if (scx_bypassing(sch, cpu_of(this_rq))) 9431*bba2c361STejun Heo goto out; 9432*bba2c361STejun Heo 9433*bba2c361STejun Heo /* 9434*bba2c361STejun Heo * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting 9435*bba2c361STejun Heo * rq locks. We can probably be smarter and avoid bouncing if called 9436*bba2c361STejun Heo * from ops which don't hold a rq lock. 
9437*bba2c361STejun Heo */ 9438*bba2c361STejun Heo if (flags & SCX_KICK_IDLE) { 9439*bba2c361STejun Heo struct rq *target_rq = cpu_rq(cpu); 9440*bba2c361STejun Heo 9441*bba2c361STejun Heo if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) 9442*bba2c361STejun Heo scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); 9443*bba2c361STejun Heo 9444*bba2c361STejun Heo if (raw_spin_rq_trylock(target_rq)) { 9445*bba2c361STejun Heo if (can_skip_idle_kick(target_rq)) { 9446*bba2c361STejun Heo raw_spin_rq_unlock(target_rq); 9447*bba2c361STejun Heo goto out; 9448*bba2c361STejun Heo } 9449*bba2c361STejun Heo raw_spin_rq_unlock(target_rq); 9450*bba2c361STejun Heo } 9451*bba2c361STejun Heo cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); 9452*bba2c361STejun Heo } else { 9453*bba2c361STejun Heo cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); 9454*bba2c361STejun Heo 9455*bba2c361STejun Heo if (flags & SCX_KICK_PREEMPT) 9456*bba2c361STejun Heo cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); 9457*bba2c361STejun Heo if (flags & SCX_KICK_WAIT) 9458*bba2c361STejun Heo cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); 9459*bba2c361STejun Heo } 9460*bba2c361STejun Heo 9461*bba2c361STejun Heo irq_work_queue(&this_rq->scx.kick_cpus_irq_work); 9462*bba2c361STejun Heo out: 9463*bba2c361STejun Heo local_irq_restore(irq_flags); 9464*bba2c361STejun Heo } 9465*bba2c361STejun Heo 9466*bba2c361STejun Heo /** 9467*bba2c361STejun Heo * scx_bpf_kick_cpu - Trigger reschedule on a CPU 9468*bba2c361STejun Heo * @cpu: cpu to kick 9469*bba2c361STejun Heo * @flags: %SCX_KICK_* flags 9470*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9471*bba2c361STejun Heo * 9472*bba2c361STejun Heo * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or 9473*bba2c361STejun Heo * trigger rescheduling on a busy CPU. 
This can be called from any online 9474*bba2c361STejun Heo * scx_ops operation and the actual kicking is performed asynchronously through 9475*bba2c361STejun Heo * an irq work. 9476*bba2c361STejun Heo */ 9477*bba2c361STejun Heo __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux *aux) 9478*bba2c361STejun Heo { 9479*bba2c361STejun Heo struct scx_sched *sch; 9480*bba2c361STejun Heo 9481*bba2c361STejun Heo guard(rcu)(); 9482*bba2c361STejun Heo sch = scx_prog_sched(aux); 9483*bba2c361STejun Heo if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) 9484*bba2c361STejun Heo scx_kick_cpu(sch, cpu, flags); 9485*bba2c361STejun Heo } 9486*bba2c361STejun Heo 9487*bba2c361STejun Heo /** 9488*bba2c361STejun Heo * scx_bpf_kick_cid - Trigger reschedule on the CPU mapped to @cid 9489*bba2c361STejun Heo * @cid: cid to kick 9490*bba2c361STejun Heo * @flags: %SCX_KICK_* flags 9491*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9492*bba2c361STejun Heo * 9493*bba2c361STejun Heo * cid-addressed equivalent of scx_bpf_kick_cpu(). Return 0 on success, 9494*bba2c361STejun Heo * -errno otherwise. 
9495*bba2c361STejun Heo */ 9496*bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_kick_cid(s32 cid, u64 flags, const struct bpf_prog_aux *aux) 9497*bba2c361STejun Heo { 9498*bba2c361STejun Heo struct scx_sched *sch; 9499*bba2c361STejun Heo s32 cpu; 9500*bba2c361STejun Heo 9501*bba2c361STejun Heo guard(rcu)(); 9502*bba2c361STejun Heo sch = scx_prog_sched(aux); 9503*bba2c361STejun Heo if (unlikely(!sch)) 9504*bba2c361STejun Heo return -ENODEV; 9505*bba2c361STejun Heo cpu = scx_cid_to_cpu(sch, cid); 9506*bba2c361STejun Heo if (cpu < 0) 9507*bba2c361STejun Heo return cpu; 9508*bba2c361STejun Heo scx_kick_cpu(sch, cpu, flags); 9509*bba2c361STejun Heo return 0; 9510*bba2c361STejun Heo } 9511*bba2c361STejun Heo 9512*bba2c361STejun Heo /** 9513*bba2c361STejun Heo * scx_bpf_dsq_nr_queued - Return the number of queued tasks 9514*bba2c361STejun Heo * @dsq_id: id of the DSQ 9515*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9516*bba2c361STejun Heo * 9517*bba2c361STejun Heo * Return the number of tasks in the DSQ matching @dsq_id. If not found, 9518*bba2c361STejun Heo * -%ENOENT is returned. 
9519*bba2c361STejun Heo */ 9520*bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux) 9521*bba2c361STejun Heo { 9522*bba2c361STejun Heo struct scx_sched *sch; 9523*bba2c361STejun Heo struct scx_dispatch_q *dsq; 9524*bba2c361STejun Heo s32 ret; 9525*bba2c361STejun Heo 9526*bba2c361STejun Heo preempt_disable(); 9527*bba2c361STejun Heo 9528*bba2c361STejun Heo sch = scx_prog_sched(aux); 9529*bba2c361STejun Heo if (unlikely(!sch)) { 9530*bba2c361STejun Heo ret = -ENODEV; 9531*bba2c361STejun Heo goto out; 9532*bba2c361STejun Heo } 9533*bba2c361STejun Heo 9534*bba2c361STejun Heo if (dsq_id == SCX_DSQ_LOCAL) { 9535*bba2c361STejun Heo ret = READ_ONCE(this_rq()->scx.local_dsq.nr); 9536*bba2c361STejun Heo goto out; 9537*bba2c361STejun Heo } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { 9538*bba2c361STejun Heo s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK); 9539*bba2c361STejun Heo 9540*bba2c361STejun Heo if (scx_cpu_valid(sch, cpu, NULL)) { 9541*bba2c361STejun Heo ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); 9542*bba2c361STejun Heo goto out; 9543*bba2c361STejun Heo } 9544*bba2c361STejun Heo } else { 9545*bba2c361STejun Heo dsq = find_user_dsq(sch, dsq_id); 9546*bba2c361STejun Heo if (dsq) { 9547*bba2c361STejun Heo ret = READ_ONCE(dsq->nr); 9548*bba2c361STejun Heo goto out; 9549*bba2c361STejun Heo } 9550*bba2c361STejun Heo } 9551*bba2c361STejun Heo ret = -ENOENT; 9552*bba2c361STejun Heo out: 9553*bba2c361STejun Heo preempt_enable(); 9554*bba2c361STejun Heo return ret; 9555*bba2c361STejun Heo } 9556*bba2c361STejun Heo 9557*bba2c361STejun Heo /** 9558*bba2c361STejun Heo * scx_bpf_destroy_dsq - Destroy a custom DSQ 9559*bba2c361STejun Heo * @dsq_id: DSQ to destroy 9560*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9561*bba2c361STejun Heo * 9562*bba2c361STejun Heo * Destroy the custom DSQ identified by @dsq_id. 
Only DSQs created with 9563*bba2c361STejun Heo * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is 9564*bba2c361STejun Heo * empty and no further tasks are dispatched to it. Ignored if called on a DSQ 9565*bba2c361STejun Heo * which doesn't exist. Can be called from any online scx_ops operations. 9566*bba2c361STejun Heo */ 9567*bba2c361STejun Heo __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux) 9568*bba2c361STejun Heo { 9569*bba2c361STejun Heo struct scx_sched *sch; 9570*bba2c361STejun Heo 9571*bba2c361STejun Heo guard(rcu)(); 9572*bba2c361STejun Heo sch = scx_prog_sched(aux); 9573*bba2c361STejun Heo if (sch) 9574*bba2c361STejun Heo destroy_dsq(sch, dsq_id); 9575*bba2c361STejun Heo } 9576*bba2c361STejun Heo 9577*bba2c361STejun Heo /** 9578*bba2c361STejun Heo * bpf_iter_scx_dsq_new - Create a DSQ iterator 9579*bba2c361STejun Heo * @it: iterator to initialize 9580*bba2c361STejun Heo * @dsq_id: DSQ to iterate 9581*bba2c361STejun Heo * @flags: %SCX_DSQ_ITER_* 9582*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9583*bba2c361STejun Heo * 9584*bba2c361STejun Heo * Initialize BPF iterator @it which can be used with bpf_for_each() to walk 9585*bba2c361STejun Heo * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes 9586*bba2c361STejun Heo * tasks which are already queued when this function is invoked. 
9587*bba2c361STejun Heo */ 9588*bba2c361STejun Heo __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, 9589*bba2c361STejun Heo u64 flags, const struct bpf_prog_aux *aux) 9590*bba2c361STejun Heo { 9591*bba2c361STejun Heo struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9592*bba2c361STejun Heo struct scx_sched *sch; 9593*bba2c361STejun Heo 9594*bba2c361STejun Heo BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > 9595*bba2c361STejun Heo sizeof(struct bpf_iter_scx_dsq)); 9596*bba2c361STejun Heo BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != 9597*bba2c361STejun Heo __alignof__(struct bpf_iter_scx_dsq)); 9598*bba2c361STejun Heo BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & 9599*bba2c361STejun Heo ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); 9600*bba2c361STejun Heo 9601*bba2c361STejun Heo /* 9602*bba2c361STejun Heo * next() and destroy() will be called regardless of the return value. 9603*bba2c361STejun Heo * Always clear $kit->dsq. 9604*bba2c361STejun Heo */ 9605*bba2c361STejun Heo kit->dsq = NULL; 9606*bba2c361STejun Heo 9607*bba2c361STejun Heo sch = scx_prog_sched(aux); 9608*bba2c361STejun Heo if (unlikely(!sch)) 9609*bba2c361STejun Heo return -ENODEV; 9610*bba2c361STejun Heo 9611*bba2c361STejun Heo if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) 9612*bba2c361STejun Heo return -EINVAL; 9613*bba2c361STejun Heo 9614*bba2c361STejun Heo kit->dsq = find_user_dsq(sch, dsq_id); 9615*bba2c361STejun Heo if (!kit->dsq) 9616*bba2c361STejun Heo return -ENOENT; 9617*bba2c361STejun Heo 9618*bba2c361STejun Heo kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags); 9619*bba2c361STejun Heo 9620*bba2c361STejun Heo return 0; 9621*bba2c361STejun Heo } 9622*bba2c361STejun Heo 9623*bba2c361STejun Heo /** 9624*bba2c361STejun Heo * bpf_iter_scx_dsq_next - Progress a DSQ iterator 9625*bba2c361STejun Heo * @it: iterator to progress 9626*bba2c361STejun Heo * 9627*bba2c361STejun Heo * Return the next task. See bpf_iter_scx_dsq_new(). 
9628*bba2c361STejun Heo */ 9629*bba2c361STejun Heo __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) 9630*bba2c361STejun Heo { 9631*bba2c361STejun Heo struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9632*bba2c361STejun Heo 9633*bba2c361STejun Heo if (!kit->dsq) 9634*bba2c361STejun Heo return NULL; 9635*bba2c361STejun Heo 9636*bba2c361STejun Heo guard(raw_spinlock_irqsave)(&kit->dsq->lock); 9637*bba2c361STejun Heo 9638*bba2c361STejun Heo return nldsq_cursor_next_task(&kit->cursor, kit->dsq); 9639*bba2c361STejun Heo } 9640*bba2c361STejun Heo 9641*bba2c361STejun Heo /** 9642*bba2c361STejun Heo * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator 9643*bba2c361STejun Heo * @it: iterator to destroy 9644*bba2c361STejun Heo * 9645*bba2c361STejun Heo * Undo scx_iter_scx_dsq_new(). 9646*bba2c361STejun Heo */ 9647*bba2c361STejun Heo __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) 9648*bba2c361STejun Heo { 9649*bba2c361STejun Heo struct bpf_iter_scx_dsq_kern *kit = (void *)it; 9650*bba2c361STejun Heo 9651*bba2c361STejun Heo if (!kit->dsq) 9652*bba2c361STejun Heo return; 9653*bba2c361STejun Heo 9654*bba2c361STejun Heo if (!list_empty(&kit->cursor.node)) { 9655*bba2c361STejun Heo unsigned long flags; 9656*bba2c361STejun Heo 9657*bba2c361STejun Heo raw_spin_lock_irqsave(&kit->dsq->lock, flags); 9658*bba2c361STejun Heo list_del_init(&kit->cursor.node); 9659*bba2c361STejun Heo raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); 9660*bba2c361STejun Heo } 9661*bba2c361STejun Heo kit->dsq = NULL; 9662*bba2c361STejun Heo } 9663*bba2c361STejun Heo 9664*bba2c361STejun Heo /** 9665*bba2c361STejun Heo * scx_bpf_dsq_peek - Lockless peek at the first element. 9666*bba2c361STejun Heo * @dsq_id: DSQ to examine. 9667*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9668*bba2c361STejun Heo * 9669*bba2c361STejun Heo * Read the first element in the DSQ. 
This is semantically equivalent to using 9670*bba2c361STejun Heo * the DSQ iterator, but is lockfree. Of course, like any lockless operation, 9671*bba2c361STejun Heo * this provides only a point-in-time snapshot, and the contents may change 9672*bba2c361STejun Heo * by the time any subsequent locking operation reads the queue. 9673*bba2c361STejun Heo * 9674*bba2c361STejun Heo * Returns the pointer, or NULL indicates an empty queue OR internal error. 9675*bba2c361STejun Heo */ 9676*bba2c361STejun Heo __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id, 9677*bba2c361STejun Heo const struct bpf_prog_aux *aux) 9678*bba2c361STejun Heo { 9679*bba2c361STejun Heo struct scx_sched *sch; 9680*bba2c361STejun Heo struct scx_dispatch_q *dsq; 9681*bba2c361STejun Heo 9682*bba2c361STejun Heo sch = scx_prog_sched(aux); 9683*bba2c361STejun Heo if (unlikely(!sch)) 9684*bba2c361STejun Heo return NULL; 9685*bba2c361STejun Heo 9686*bba2c361STejun Heo if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { 9687*bba2c361STejun Heo scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); 9688*bba2c361STejun Heo return NULL; 9689*bba2c361STejun Heo } 9690*bba2c361STejun Heo 9691*bba2c361STejun Heo dsq = find_user_dsq(sch, dsq_id); 9692*bba2c361STejun Heo if (unlikely(!dsq)) { 9693*bba2c361STejun Heo scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); 9694*bba2c361STejun Heo return NULL; 9695*bba2c361STejun Heo } 9696*bba2c361STejun Heo 9697*bba2c361STejun Heo return rcu_dereference(dsq->first_task); 9698*bba2c361STejun Heo } 9699*bba2c361STejun Heo 9700*bba2c361STejun Heo /** 9701*bba2c361STejun Heo * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ 9702*bba2c361STejun Heo * @dsq_id: DSQ to re-enqueue 9703*bba2c361STejun Heo * @reenq_flags: %SCX_RENQ_* 9704*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9705*bba2c361STejun Heo * 9706*bba2c361STejun Heo * Iterate over all of the tasks currently enqueued on the DSQ identified by 
9707*bba2c361STejun Heo * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are 9708*bba2c361STejun Heo * supported: 9709*bba2c361STejun Heo * 9710*bba2c361STejun Heo * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu) 9711*bba2c361STejun Heo * - User DSQs 9712*bba2c361STejun Heo * 9713*bba2c361STejun Heo * Re-enqueues are performed asynchronously. Can be called from anywhere. 9714*bba2c361STejun Heo */ 9715*bba2c361STejun Heo __bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags, 9716*bba2c361STejun Heo const struct bpf_prog_aux *aux) 9717*bba2c361STejun Heo { 9718*bba2c361STejun Heo struct scx_sched *sch; 9719*bba2c361STejun Heo struct scx_dispatch_q *dsq; 9720*bba2c361STejun Heo 9721*bba2c361STejun Heo guard(preempt)(); 9722*bba2c361STejun Heo 9723*bba2c361STejun Heo sch = scx_prog_sched(aux); 9724*bba2c361STejun Heo if (unlikely(!sch)) 9725*bba2c361STejun Heo return; 9726*bba2c361STejun Heo 9727*bba2c361STejun Heo if (unlikely(reenq_flags & ~__SCX_REENQ_USER_MASK)) { 9728*bba2c361STejun Heo scx_error(sch, "invalid SCX_REENQ flags 0x%llx", reenq_flags); 9729*bba2c361STejun Heo return; 9730*bba2c361STejun Heo } 9731*bba2c361STejun Heo 9732*bba2c361STejun Heo /* not specifying any filter bits is the same as %SCX_REENQ_ANY */ 9733*bba2c361STejun Heo if (!(reenq_flags & __SCX_REENQ_FILTER_MASK)) 9734*bba2c361STejun Heo reenq_flags |= SCX_REENQ_ANY; 9735*bba2c361STejun Heo 9736*bba2c361STejun Heo dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id()); 9737*bba2c361STejun Heo schedule_dsq_reenq(sch, dsq, reenq_flags, scx_locked_rq()); 9738*bba2c361STejun Heo } 9739*bba2c361STejun Heo 9740*bba2c361STejun Heo /** 9741*bba2c361STejun Heo * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 9742*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9743*bba2c361STejun Heo * 9744*bba2c361STejun Heo * Iterate over all of the tasks currently enqueued on the local DSQ of the 
9745*bba2c361STejun Heo * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from 9746*bba2c361STejun Heo * anywhere. 9747*bba2c361STejun Heo * 9748*bba2c361STejun Heo * This is now a special case of scx_bpf_dsq_reenq() and may be removed in the 9749*bba2c361STejun Heo * future. 9750*bba2c361STejun Heo */ 9751*bba2c361STejun Heo __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux) 9752*bba2c361STejun Heo { 9753*bba2c361STejun Heo scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0, aux); 9754*bba2c361STejun Heo } 9755*bba2c361STejun Heo 9756*bba2c361STejun Heo __bpf_kfunc_end_defs(); 9757*bba2c361STejun Heo 9758*bba2c361STejun Heo __printf(5, 0) 9759*bba2c361STejun Heo static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, 9760*bba2c361STejun Heo size_t line_size, char *fmt, unsigned long long *data, 9761*bba2c361STejun Heo u32 data__sz) 9762*bba2c361STejun Heo { 9763*bba2c361STejun Heo struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; 9764*bba2c361STejun Heo s32 ret; 9765*bba2c361STejun Heo 9766*bba2c361STejun Heo if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || 9767*bba2c361STejun Heo (data__sz && !data)) { 9768*bba2c361STejun Heo scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); 9769*bba2c361STejun Heo return -EINVAL; 9770*bba2c361STejun Heo } 9771*bba2c361STejun Heo 9772*bba2c361STejun Heo ret = copy_from_kernel_nofault(data_buf, data, data__sz); 9773*bba2c361STejun Heo if (ret < 0) { 9774*bba2c361STejun Heo scx_error(sch, "failed to read data fields (%d)", ret); 9775*bba2c361STejun Heo return ret; 9776*bba2c361STejun Heo } 9777*bba2c361STejun Heo 9778*bba2c361STejun Heo ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, 9779*bba2c361STejun Heo &bprintf_data); 9780*bba2c361STejun Heo if (ret < 0) { 9781*bba2c361STejun Heo scx_error(sch, "format preparation failed (%d)", ret); 9782*bba2c361STejun Heo return ret; 9783*bba2c361STejun Heo } 
9784*bba2c361STejun Heo 9785*bba2c361STejun Heo ret = bstr_printf(line_buf, line_size, fmt, 9786*bba2c361STejun Heo bprintf_data.bin_args); 9787*bba2c361STejun Heo bpf_bprintf_cleanup(&bprintf_data); 9788*bba2c361STejun Heo if (ret < 0) { 9789*bba2c361STejun Heo scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); 9790*bba2c361STejun Heo return ret; 9791*bba2c361STejun Heo } 9792*bba2c361STejun Heo 9793*bba2c361STejun Heo return ret; 9794*bba2c361STejun Heo } 9795*bba2c361STejun Heo 9796*bba2c361STejun Heo __printf(3, 0) 9797*bba2c361STejun Heo static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, 9798*bba2c361STejun Heo char *fmt, unsigned long long *data, u32 data__sz) 9799*bba2c361STejun Heo { 9800*bba2c361STejun Heo return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), 9801*bba2c361STejun Heo fmt, data, data__sz); 9802*bba2c361STejun Heo } 9803*bba2c361STejun Heo 9804*bba2c361STejun Heo __bpf_kfunc_start_defs(); 9805*bba2c361STejun Heo 9806*bba2c361STejun Heo /** 9807*bba2c361STejun Heo * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. 9808*bba2c361STejun Heo * @exit_code: Exit value to pass to user space via struct scx_exit_info. 9809*bba2c361STejun Heo * @fmt: error message format string 9810*bba2c361STejun Heo * @data: format string parameters packaged using ___bpf_fill() macro 9811*bba2c361STejun Heo * @data__sz: @data len, must end in '__sz' for the verifier 9812*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9813*bba2c361STejun Heo * 9814*bba2c361STejun Heo * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops 9815*bba2c361STejun Heo * disabling. 
9816*bba2c361STejun Heo */ 9817*bba2c361STejun Heo __printf(2, 0) 9818*bba2c361STejun Heo __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, 9819*bba2c361STejun Heo unsigned long long *data, u32 data__sz, 9820*bba2c361STejun Heo const struct bpf_prog_aux *aux) 9821*bba2c361STejun Heo { 9822*bba2c361STejun Heo struct scx_sched *sch; 9823*bba2c361STejun Heo unsigned long flags; 9824*bba2c361STejun Heo 9825*bba2c361STejun Heo raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 9826*bba2c361STejun Heo sch = scx_prog_sched(aux); 9827*bba2c361STejun Heo if (likely(sch) && 9828*bba2c361STejun Heo bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 9829*bba2c361STejun Heo scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); 9830*bba2c361STejun Heo raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 9831*bba2c361STejun Heo } 9832*bba2c361STejun Heo 9833*bba2c361STejun Heo /** 9834*bba2c361STejun Heo * scx_bpf_error_bstr - Indicate fatal error 9835*bba2c361STejun Heo * @fmt: error message format string 9836*bba2c361STejun Heo * @data: format string parameters packaged using ___bpf_fill() macro 9837*bba2c361STejun Heo * @data__sz: @data len, must end in '__sz' for the verifier 9838*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9839*bba2c361STejun Heo * 9840*bba2c361STejun Heo * Indicate that the BPF scheduler encountered a fatal error and initiate ops 9841*bba2c361STejun Heo * disabling. 
9842*bba2c361STejun Heo */ 9843*bba2c361STejun Heo __printf(1, 0) 9844*bba2c361STejun Heo __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, 9845*bba2c361STejun Heo u32 data__sz, const struct bpf_prog_aux *aux) 9846*bba2c361STejun Heo { 9847*bba2c361STejun Heo struct scx_sched *sch; 9848*bba2c361STejun Heo unsigned long flags; 9849*bba2c361STejun Heo 9850*bba2c361STejun Heo raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 9851*bba2c361STejun Heo sch = scx_prog_sched(aux); 9852*bba2c361STejun Heo if (likely(sch) && 9853*bba2c361STejun Heo bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 9854*bba2c361STejun Heo scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); 9855*bba2c361STejun Heo raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 9856*bba2c361STejun Heo } 9857*bba2c361STejun Heo 9858*bba2c361STejun Heo /** 9859*bba2c361STejun Heo * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler 9860*bba2c361STejun Heo * @fmt: format string 9861*bba2c361STejun Heo * @data: format string parameters packaged using ___bpf_fill() macro 9862*bba2c361STejun Heo * @data__sz: @data len, must end in '__sz' for the verifier 9863*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9864*bba2c361STejun Heo * 9865*bba2c361STejun Heo * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and 9866*bba2c361STejun Heo * dump_task() to generate extra debug dump specific to the BPF scheduler. 9867*bba2c361STejun Heo * 9868*bba2c361STejun Heo * The extra dump may be multiple lines. A single line may be split over 9869*bba2c361STejun Heo * multiple calls. The last line is automatically terminated. 
9870*bba2c361STejun Heo */ 9871*bba2c361STejun Heo __printf(1, 0) 9872*bba2c361STejun Heo __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, 9873*bba2c361STejun Heo u32 data__sz, const struct bpf_prog_aux *aux) 9874*bba2c361STejun Heo { 9875*bba2c361STejun Heo struct scx_sched *sch; 9876*bba2c361STejun Heo struct scx_dump_data *dd = &scx_dump_data; 9877*bba2c361STejun Heo struct scx_bstr_buf *buf = &dd->buf; 9878*bba2c361STejun Heo s32 ret; 9879*bba2c361STejun Heo 9880*bba2c361STejun Heo guard(rcu)(); 9881*bba2c361STejun Heo 9882*bba2c361STejun Heo sch = scx_prog_sched(aux); 9883*bba2c361STejun Heo if (unlikely(!sch)) 9884*bba2c361STejun Heo return; 9885*bba2c361STejun Heo 9886*bba2c361STejun Heo if (raw_smp_processor_id() != dd->cpu) { 9887*bba2c361STejun Heo scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends"); 9888*bba2c361STejun Heo return; 9889*bba2c361STejun Heo } 9890*bba2c361STejun Heo 9891*bba2c361STejun Heo /* append the formatted string to the line buf */ 9892*bba2c361STejun Heo ret = __bstr_format(sch, buf->data, buf->line + dd->cursor, 9893*bba2c361STejun Heo sizeof(buf->line) - dd->cursor, fmt, data, data__sz); 9894*bba2c361STejun Heo if (ret < 0) { 9895*bba2c361STejun Heo dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", 9896*bba2c361STejun Heo dd->prefix, fmt, data, data__sz, ret); 9897*bba2c361STejun Heo return; 9898*bba2c361STejun Heo } 9899*bba2c361STejun Heo 9900*bba2c361STejun Heo dd->cursor += ret; 9901*bba2c361STejun Heo dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); 9902*bba2c361STejun Heo 9903*bba2c361STejun Heo if (!dd->cursor) 9904*bba2c361STejun Heo return; 9905*bba2c361STejun Heo 9906*bba2c361STejun Heo /* 9907*bba2c361STejun Heo * If the line buf overflowed or ends in a newline, flush it into the 9908*bba2c361STejun Heo * dump. This is to allow the caller to generate a single line over 9909*bba2c361STejun Heo * multiple calls. 
As ops_dump_flush() can also handle multiple lines in 9910*bba2c361STejun Heo * the line buf, the only case which can lead to an unexpected 9911*bba2c361STejun Heo * truncation is when the caller keeps generating newlines in the middle 9912*bba2c361STejun Heo * instead of the end consecutively. Don't do that. 9913*bba2c361STejun Heo */ 9914*bba2c361STejun Heo if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') 9915*bba2c361STejun Heo ops_dump_flush(); 9916*bba2c361STejun Heo } 9917*bba2c361STejun Heo 9918*bba2c361STejun Heo /** 9919*bba2c361STejun Heo * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU 9920*bba2c361STejun Heo * @cpu: CPU of interest 9921*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9922*bba2c361STejun Heo * 9923*bba2c361STejun Heo * Return the maximum relative capacity of @cpu in relation to the most 9924*bba2c361STejun Heo * performant CPU in the system. The return value is in the range [1, 9925*bba2c361STejun Heo * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). 
9926*bba2c361STejun Heo */ 9927*bba2c361STejun Heo __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux) 9928*bba2c361STejun Heo { 9929*bba2c361STejun Heo struct scx_sched *sch; 9930*bba2c361STejun Heo 9931*bba2c361STejun Heo guard(rcu)(); 9932*bba2c361STejun Heo 9933*bba2c361STejun Heo sch = scx_prog_sched(aux); 9934*bba2c361STejun Heo if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) 9935*bba2c361STejun Heo return arch_scale_cpu_capacity(cpu); 9936*bba2c361STejun Heo else 9937*bba2c361STejun Heo return SCX_CPUPERF_ONE; 9938*bba2c361STejun Heo } 9939*bba2c361STejun Heo 9940*bba2c361STejun Heo /** 9941*bba2c361STejun Heo * scx_bpf_cidperf_cap - Query the maximum relative capacity of the CPU at @cid 9942*bba2c361STejun Heo * @cid: cid of the CPU to query 9943*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9944*bba2c361STejun Heo * 9945*bba2c361STejun Heo * cid-addressed equivalent of scx_bpf_cpuperf_cap(). 9946*bba2c361STejun Heo */ 9947*bba2c361STejun Heo __bpf_kfunc u32 scx_bpf_cidperf_cap(s32 cid, const struct bpf_prog_aux *aux) 9948*bba2c361STejun Heo { 9949*bba2c361STejun Heo struct scx_sched *sch; 9950*bba2c361STejun Heo s32 cpu; 9951*bba2c361STejun Heo 9952*bba2c361STejun Heo guard(rcu)(); 9953*bba2c361STejun Heo 9954*bba2c361STejun Heo sch = scx_prog_sched(aux); 9955*bba2c361STejun Heo if (unlikely(!sch)) 9956*bba2c361STejun Heo return SCX_CPUPERF_ONE; 9957*bba2c361STejun Heo cpu = scx_cid_to_cpu(sch, cid); 9958*bba2c361STejun Heo if (cpu < 0) 9959*bba2c361STejun Heo return SCX_CPUPERF_ONE; 9960*bba2c361STejun Heo return arch_scale_cpu_capacity(cpu); 9961*bba2c361STejun Heo } 9962*bba2c361STejun Heo 9963*bba2c361STejun Heo /** 9964*bba2c361STejun Heo * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU 9965*bba2c361STejun Heo * @cpu: CPU of interest 9966*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9967*bba2c361STejun Heo 
* 9968*bba2c361STejun Heo * Return the current relative performance of @cpu in relation to its maximum. 9969*bba2c361STejun Heo * The return value is in the range [1, %SCX_CPUPERF_ONE]. 9970*bba2c361STejun Heo * 9971*bba2c361STejun Heo * The current performance level of a CPU in relation to the maximum performance 9972*bba2c361STejun Heo * available in the system can be calculated as follows: 9973*bba2c361STejun Heo * 9974*bba2c361STejun Heo * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE 9975*bba2c361STejun Heo * 9976*bba2c361STejun Heo * The result is in the range [1, %SCX_CPUPERF_ONE]. 9977*bba2c361STejun Heo */ 9978*bba2c361STejun Heo __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux) 9979*bba2c361STejun Heo { 9980*bba2c361STejun Heo struct scx_sched *sch; 9981*bba2c361STejun Heo 9982*bba2c361STejun Heo guard(rcu)(); 9983*bba2c361STejun Heo 9984*bba2c361STejun Heo sch = scx_prog_sched(aux); 9985*bba2c361STejun Heo if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) 9986*bba2c361STejun Heo return arch_scale_freq_capacity(cpu); 9987*bba2c361STejun Heo else 9988*bba2c361STejun Heo return SCX_CPUPERF_ONE; 9989*bba2c361STejun Heo } 9990*bba2c361STejun Heo 9991*bba2c361STejun Heo /** 9992*bba2c361STejun Heo * scx_bpf_cidperf_cur - Query the current performance of the CPU at @cid 9993*bba2c361STejun Heo * @cid: cid of the CPU to query 9994*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 9995*bba2c361STejun Heo * 9996*bba2c361STejun Heo * cid-addressed equivalent of scx_bpf_cpuperf_cur(). 
9997*bba2c361STejun Heo */ 9998*bba2c361STejun Heo __bpf_kfunc u32 scx_bpf_cidperf_cur(s32 cid, const struct bpf_prog_aux *aux) 9999*bba2c361STejun Heo { 10000*bba2c361STejun Heo struct scx_sched *sch; 10001*bba2c361STejun Heo s32 cpu; 10002*bba2c361STejun Heo 10003*bba2c361STejun Heo guard(rcu)(); 10004*bba2c361STejun Heo 10005*bba2c361STejun Heo sch = scx_prog_sched(aux); 10006*bba2c361STejun Heo if (unlikely(!sch)) 10007*bba2c361STejun Heo return SCX_CPUPERF_ONE; 10008*bba2c361STejun Heo cpu = scx_cid_to_cpu(sch, cid); 10009*bba2c361STejun Heo if (cpu < 0) 10010*bba2c361STejun Heo return SCX_CPUPERF_ONE; 10011*bba2c361STejun Heo return arch_scale_freq_capacity(cpu); 10012*bba2c361STejun Heo } 10013*bba2c361STejun Heo 10014*bba2c361STejun Heo /** 10015*bba2c361STejun Heo * scx_bpf_cpuperf_set - Set the relative performance target of a CPU 10016*bba2c361STejun Heo * @cpu: CPU of interest 10017*bba2c361STejun Heo * @perf: target performance level [0, %SCX_CPUPERF_ONE] 10018*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10019*bba2c361STejun Heo * 10020*bba2c361STejun Heo * Set the target performance level of @cpu to @perf. @perf is in linear 10021*bba2c361STejun Heo * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the 10022*bba2c361STejun Heo * schedutil cpufreq governor chooses the target frequency. 10023*bba2c361STejun Heo * 10024*bba2c361STejun Heo * The actual performance level chosen, CPU grouping, and the overhead and 10025*bba2c361STejun Heo * latency of the operations are dependent on the hardware and cpufreq driver in 10026*bba2c361STejun Heo * use. Consult hardware and cpufreq documentation for more information. The 10027*bba2c361STejun Heo * current performance level can be monitored using scx_bpf_cpuperf_cur(). 
10028*bba2c361STejun Heo */ 10029*bba2c361STejun Heo __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_aux *aux) 10030*bba2c361STejun Heo { 10031*bba2c361STejun Heo struct scx_sched *sch; 10032*bba2c361STejun Heo 10033*bba2c361STejun Heo guard(rcu)(); 10034*bba2c361STejun Heo 10035*bba2c361STejun Heo sch = scx_prog_sched(aux); 10036*bba2c361STejun Heo if (unlikely(!sch)) 10037*bba2c361STejun Heo return; 10038*bba2c361STejun Heo 10039*bba2c361STejun Heo if (unlikely(perf > SCX_CPUPERF_ONE)) { 10040*bba2c361STejun Heo scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu); 10041*bba2c361STejun Heo return; 10042*bba2c361STejun Heo } 10043*bba2c361STejun Heo 10044*bba2c361STejun Heo if (scx_cpu_valid(sch, cpu, NULL)) { 10045*bba2c361STejun Heo struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); 10046*bba2c361STejun Heo struct rq_flags rf; 10047*bba2c361STejun Heo 10048*bba2c361STejun Heo /* 10049*bba2c361STejun Heo * When called with an rq lock held, restrict the operation 10050*bba2c361STejun Heo * to the corresponding CPU to prevent ABBA deadlocks. 10051*bba2c361STejun Heo */ 10052*bba2c361STejun Heo if (locked_rq && rq != locked_rq) { 10053*bba2c361STejun Heo scx_error(sch, "Invalid target CPU %d", cpu); 10054*bba2c361STejun Heo return; 10055*bba2c361STejun Heo } 10056*bba2c361STejun Heo 10057*bba2c361STejun Heo /* 10058*bba2c361STejun Heo * If no rq lock is held, allow to operate on any CPU by 10059*bba2c361STejun Heo * acquiring the corresponding rq lock. 
10060*bba2c361STejun Heo */ 10061*bba2c361STejun Heo if (!locked_rq) { 10062*bba2c361STejun Heo rq_lock_irqsave(rq, &rf); 10063*bba2c361STejun Heo update_rq_clock(rq); 10064*bba2c361STejun Heo } 10065*bba2c361STejun Heo 10066*bba2c361STejun Heo rq->scx.cpuperf_target = perf; 10067*bba2c361STejun Heo cpufreq_update_util(rq, 0); 10068*bba2c361STejun Heo 10069*bba2c361STejun Heo if (!locked_rq) 10070*bba2c361STejun Heo rq_unlock_irqrestore(rq, &rf); 10071*bba2c361STejun Heo } 10072*bba2c361STejun Heo } 10073*bba2c361STejun Heo 10074*bba2c361STejun Heo /** 10075*bba2c361STejun Heo * scx_bpf_cidperf_set - Set the performance target of the CPU at @cid 10076*bba2c361STejun Heo * @cid: cid of the CPU to target 10077*bba2c361STejun Heo * @perf: target performance level [0, %SCX_CPUPERF_ONE] 10078*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 10079*bba2c361STejun Heo * 10080*bba2c361STejun Heo * cid-addressed equivalent of scx_bpf_cpuperf_set(). 10081*bba2c361STejun Heo */ 10082*bba2c361STejun Heo __bpf_kfunc void scx_bpf_cidperf_set(s32 cid, u32 perf, 10083*bba2c361STejun Heo const struct bpf_prog_aux *aux) 10084*bba2c361STejun Heo { 10085*bba2c361STejun Heo struct scx_sched *sch; 10086*bba2c361STejun Heo s32 cpu; 10087*bba2c361STejun Heo 10088*bba2c361STejun Heo guard(rcu)(); 10089*bba2c361STejun Heo 10090*bba2c361STejun Heo sch = scx_prog_sched(aux); 10091*bba2c361STejun Heo if (unlikely(!sch)) 10092*bba2c361STejun Heo return; 10093*bba2c361STejun Heo cpu = scx_cid_to_cpu(sch, cid); 10094*bba2c361STejun Heo if (cpu < 0) 10095*bba2c361STejun Heo return; 10096*bba2c361STejun Heo scx_bpf_cpuperf_set(cpu, perf, aux); 10097*bba2c361STejun Heo } 10098*bba2c361STejun Heo 10099*bba2c361STejun Heo /** 10100*bba2c361STejun Heo * scx_bpf_nr_node_ids - Return the number of possible node IDs 10101*bba2c361STejun Heo * 10102*bba2c361STejun Heo * All valid node IDs in the system are smaller than the returned value. 
10103*bba2c361STejun Heo  */
10104*bba2c361STejun Heo __bpf_kfunc u32 scx_bpf_nr_node_ids(void)
10105*bba2c361STejun Heo {
10106*bba2c361STejun Heo 	return nr_node_ids;
10107*bba2c361STejun Heo }
10108*bba2c361STejun Heo
10109*bba2c361STejun Heo /**
10110*bba2c361STejun Heo  * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
10111*bba2c361STejun Heo  *
10112*bba2c361STejun Heo  * All valid CPU IDs in the system are smaller than the returned value.
10113*bba2c361STejun Heo  */
10114*bba2c361STejun Heo __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
10115*bba2c361STejun Heo {
10116*bba2c361STejun Heo 	return nr_cpu_ids;
10117*bba2c361STejun Heo }
10118*bba2c361STejun Heo
10119*bba2c361STejun Heo /**
10120*bba2c361STejun Heo  * scx_bpf_nr_cids - Return the size of the cid space
10121*bba2c361STejun Heo  *
10122*bba2c361STejun Heo  * Equals num_possible_cpus(). All valid cids are in [0, return value).
10123*bba2c361STejun Heo  */
10124*bba2c361STejun Heo __bpf_kfunc u32 scx_bpf_nr_cids(void)
10125*bba2c361STejun Heo {
10126*bba2c361STejun Heo 	return num_possible_cpus();
10127*bba2c361STejun Heo }
10128*bba2c361STejun Heo
10129*bba2c361STejun Heo /**
10130*bba2c361STejun Heo  * scx_bpf_nr_online_cids - Return current count of online CPUs in cid space
10131*bba2c361STejun Heo  *
10132*bba2c361STejun Heo  * Return num_online_cpus(). The standard model restarts the scheduler on
10133*bba2c361STejun Heo  * hotplug, which lets schedulers treat [0, nr_online_cids) as the online
10134*bba2c361STejun Heo  * range. Schedulers that prefer to handle hotplug without a restart should
10135*bba2c361STejun Heo  * install a custom mapping via scx_bpf_cid_override() and track onlining
10136*bba2c361STejun Heo  * through the ops.cid_online / ops.cid_offline callbacks.
10137*bba2c361STejun Heo  */
10138*bba2c361STejun Heo __bpf_kfunc u32 scx_bpf_nr_online_cids(void)
10139*bba2c361STejun Heo {
10140*bba2c361STejun Heo 	return num_online_cpus();
10141*bba2c361STejun Heo }
10142*bba2c361STejun Heo
10143*bba2c361STejun Heo /**
10144*bba2c361STejun Heo  * scx_bpf_this_cid - Return the cid of the CPU this program is running on
10145*bba2c361STejun Heo  *
10146*bba2c361STejun Heo  * cid-addressed equivalent of bpf_get_smp_processor_id() for scx programs.
10147*bba2c361STejun Heo  * The current cpu is trivially valid, so this is just a table lookup. Return
10148*bba2c361STejun Heo  * -EINVAL if called from a non-SCX program before any scheduler has ever
10149*bba2c361STejun Heo  * been enabled (the cid table is still unallocated at that point).
10150*bba2c361STejun Heo  */
10151*bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_this_cid(void)
10152*bba2c361STejun Heo {
                         	/* Read the table pointer once; NULL means no table exists yet. */
10153*bba2c361STejun Heo 	s16 *tbl = READ_ONCE(scx_cpu_to_cid_tbl);
10154*bba2c361STejun Heo
10155*bba2c361STejun Heo 	if (!tbl)
10156*bba2c361STejun Heo 		return -EINVAL;
10157*bba2c361STejun Heo 	return tbl[raw_smp_processor_id()];
10158*bba2c361STejun Heo }
10159*bba2c361STejun Heo
10160*bba2c361STejun Heo /**
10161*bba2c361STejun Heo  * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
                          *
                          * No reference is actually taken (see scx_bpf_put_cpumask()). The kfunc is
                          * registered with KF_ACQUIRE, so the result is paired with
                          * scx_bpf_put_cpumask().
10162*bba2c361STejun Heo  */
10163*bba2c361STejun Heo __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
10164*bba2c361STejun Heo {
10165*bba2c361STejun Heo 	return cpu_possible_mask;
10166*bba2c361STejun Heo }
10167*bba2c361STejun Heo
10168*bba2c361STejun Heo /**
10169*bba2c361STejun Heo  * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
                          *
                          * No reference is actually taken (see scx_bpf_put_cpumask()). The kfunc is
                          * registered with KF_ACQUIRE, so the result is paired with
                          * scx_bpf_put_cpumask().
10170*bba2c361STejun Heo  */
10171*bba2c361STejun Heo __bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
10172*bba2c361STejun Heo {
10173*bba2c361STejun Heo 	return cpu_online_mask;
10174*bba2c361STejun Heo }
10175*bba2c361STejun Heo
10176*bba2c361STejun Heo /**
10177*bba2c361STejun Heo  * scx_bpf_put_cpumask - Release a possible/online cpumask
10178*bba2c361STejun Heo  * @cpumask: cpumask to release
10179*bba2c361STejun Heo  */
10180*bba2c361STejun Heo __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
10181*bba2c361STejun Heo {
10182*bba2c361STejun Heo 	/*
10183*bba2c361STejun Heo 	 * Empty function body because we aren't actually acquiring or releasing
10184*bba2c361STejun Heo 	 * a reference to a global cpumask, which is read-only in the caller and
10185*bba2c361STejun Heo 	 * is never released. The acquire / release semantics here are just used
10186*bba2c361STejun Heo 	 * to make the cpumask a trusted pointer in the caller.
10187*bba2c361STejun Heo 	 */
10188*bba2c361STejun Heo }
10189*bba2c361STejun Heo
10190*bba2c361STejun Heo /**
10191*bba2c361STejun Heo  * scx_bpf_task_running - Is task currently running?
10192*bba2c361STejun Heo  * @p: task of interest
                          *
                          * Return: %true if @p is the ->curr of the rq returned by task_rq(@p).
10193*bba2c361STejun Heo  */
10194*bba2c361STejun Heo __bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
10195*bba2c361STejun Heo {
10196*bba2c361STejun Heo 	return task_rq(p)->curr == p;
10197*bba2c361STejun Heo }
10198*bba2c361STejun Heo
10199*bba2c361STejun Heo /**
10200*bba2c361STejun Heo  * scx_bpf_task_cpu - CPU a task is currently associated with
10201*bba2c361STejun Heo  * @p: task of interest
                          *
                          * Return: task_cpu(@p).
10202*bba2c361STejun Heo  */
10203*bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
10204*bba2c361STejun Heo {
10205*bba2c361STejun Heo 	return task_cpu(p);
10206*bba2c361STejun Heo }
10207*bba2c361STejun Heo
10208*bba2c361STejun Heo /**
10209*bba2c361STejun Heo  * scx_bpf_task_cid - cid a task is currently associated with
10210*bba2c361STejun Heo  * @p: task of interest
10211*bba2c361STejun Heo  *
10212*bba2c361STejun Heo  * cid-addressed equivalent of scx_bpf_task_cpu(). task_cpu(p) is always a
10213*bba2c361STejun Heo  * valid cpu, so this is just a table lookup.
Return -EINVAL if called from
10214*bba2c361STejun Heo  * a non-SCX program before any scheduler has ever been enabled.
10215*bba2c361STejun Heo  */
10216*bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_task_cid(const struct task_struct *p)
10217*bba2c361STejun Heo {
                         	/* Read the table pointer once; NULL means no table exists yet. */
10218*bba2c361STejun Heo 	s16 *tbl = READ_ONCE(scx_cpu_to_cid_tbl);
10219*bba2c361STejun Heo
10220*bba2c361STejun Heo 	if (!tbl)
10221*bba2c361STejun Heo 		return -EINVAL;
10222*bba2c361STejun Heo 	return tbl[task_cpu(p)];
10223*bba2c361STejun Heo }
10224*bba2c361STejun Heo
10225*bba2c361STejun Heo /**
10226*bba2c361STejun Heo  * scx_bpf_cpu_rq - Fetch the rq of a CPU
10227*bba2c361STejun Heo  * @cpu: CPU of the rq
10228*bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
                          *
                          * Deprecated. The first successful call per scheduler logs a warning that
                          * points at scx_bpf_locked_rq() and scx_bpf_cpu_curr(); the fact that the
                          * warning was emitted is remembered in sch->warned_deprecated_rq.
                          *
                          * Return: cpu_rq(@cpu), or NULL if scx_prog_sched() finds no scheduler for
                          * @aux or scx_cpu_valid() rejects @cpu.
                          *
                          * NOTE(review): NULL is a possible return value, yet the kfunc is registered
                          * as BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) without KF_RET_NULL
                          * - confirm this is intended.
10229*bba2c361STejun Heo  */
10230*bba2c361STejun Heo __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux)
10231*bba2c361STejun Heo {
10232*bba2c361STejun Heo 	struct scx_sched *sch;
10233*bba2c361STejun Heo
10234*bba2c361STejun Heo 	guard(rcu)();
10235*bba2c361STejun Heo
10236*bba2c361STejun Heo 	sch = scx_prog_sched(aux);
10237*bba2c361STejun Heo 	if (unlikely(!sch))
10238*bba2c361STejun Heo 		return NULL;
10239*bba2c361STejun Heo
10240*bba2c361STejun Heo 	if (!scx_cpu_valid(sch, cpu, NULL))
10241*bba2c361STejun Heo 		return NULL;
10242*bba2c361STejun Heo
                         	/* Warn once per scheduler instance; the flag is set right after. */
10243*bba2c361STejun Heo 	if (!sch->warned_deprecated_rq) {
10244*bba2c361STejun Heo 		printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; "
10245*bba2c361STejun Heo 				"use scx_bpf_locked_rq() when holding rq lock "
10246*bba2c361STejun Heo 				"or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__);
10247*bba2c361STejun Heo 		sch->warned_deprecated_rq = true;
10248*bba2c361STejun Heo 	}
10249*bba2c361STejun Heo
10250*bba2c361STejun Heo 	return cpu_rq(cpu);
10251*bba2c361STejun Heo }
10252*bba2c361STejun Heo
10253*bba2c361STejun Heo /**
10254*bba2c361STejun Heo  * scx_bpf_locked_rq - Return the rq currently locked by SCX
10255*bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
10256*bba2c361STejun Heo  *
10257*bba2c361STejun Heo  * Returns the rq if a rq lock is currently held by SCX.
10258*bba2c361STejun Heo  * Otherwise emits an error and returns NULL.
                          *
                          * Also returns NULL, without emitting an error, if scx_prog_sched() finds no
                          * scheduler for @aux.
10259*bba2c361STejun Heo  */
10260*bba2c361STejun Heo __bpf_kfunc struct rq *scx_bpf_locked_rq(const struct bpf_prog_aux *aux)
10261*bba2c361STejun Heo {
10262*bba2c361STejun Heo 	struct scx_sched *sch;
10263*bba2c361STejun Heo 	struct rq *rq;
10264*bba2c361STejun Heo
                         	/* Scope-based guard: preemption stays disabled until return. */
10265*bba2c361STejun Heo 	guard(preempt)();
10266*bba2c361STejun Heo
10267*bba2c361STejun Heo 	sch = scx_prog_sched(aux);
10268*bba2c361STejun Heo 	if (unlikely(!sch))
10269*bba2c361STejun Heo 		return NULL;
10270*bba2c361STejun Heo
10271*bba2c361STejun Heo 	rq = scx_locked_rq();
10272*bba2c361STejun Heo 	if (!rq) {
10273*bba2c361STejun Heo 		scx_error(sch, "accessing rq without holding rq lock");
10274*bba2c361STejun Heo 		return NULL;
10275*bba2c361STejun Heo 	}
10276*bba2c361STejun Heo
10277*bba2c361STejun Heo 	return rq;
10278*bba2c361STejun Heo }
10279*bba2c361STejun Heo
10280*bba2c361STejun Heo /**
10281*bba2c361STejun Heo  * scx_bpf_cpu_curr - Return remote CPU's curr task
10282*bba2c361STejun Heo  * @cpu: CPU of interest
10283*bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
10284*bba2c361STejun Heo  *
10285*bba2c361STejun Heo  * Callers must hold RCU read lock (KF_RCU_PROTECTED).
10286*bba2c361STejun Heo  */
10287*bba2c361STejun Heo __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_aux *aux)
10288*bba2c361STejun Heo {
10289*bba2c361STejun Heo 	struct scx_sched *sch;
10290*bba2c361STejun Heo
10291*bba2c361STejun Heo 	guard(rcu)();
10292*bba2c361STejun Heo
10293*bba2c361STejun Heo 	sch = scx_prog_sched(aux);
10294*bba2c361STejun Heo 	if (unlikely(!sch))
10295*bba2c361STejun Heo 		return NULL;
10296*bba2c361STejun Heo
10297*bba2c361STejun Heo 	if (!scx_cpu_valid(sch, cpu, NULL))
10298*bba2c361STejun Heo 		return NULL;
10299*bba2c361STejun Heo
10300*bba2c361STejun Heo 	return rcu_dereference(cpu_rq(cpu)->curr);
10301*bba2c361STejun Heo }
10302*bba2c361STejun Heo
10303*bba2c361STejun Heo /**
10304*bba2c361STejun Heo  * scx_bpf_cid_curr - Return the curr task on the CPU at @cid
10305*bba2c361STejun Heo  * @cid: cid of interest
10306*bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
10307*bba2c361STejun Heo  *
10308*bba2c361STejun Heo  * cid-addressed equivalent of scx_bpf_cpu_curr(). Callers must hold RCU
10309*bba2c361STejun Heo  * read lock (the kfunc is registered with KF_RCU_PROTECTED).
                          *
                          * Return: the ->curr task of the rq of the CPU that scx_cid_to_cpu() maps
                          * @cid to, or NULL if scx_prog_sched() finds no scheduler for @aux or
                          * scx_cid_to_cpu() returns a negative value.
                          *
                          * NOTE(review): unlike scx_bpf_cpu_curr() there is no scx_cpu_valid() call
                          * here; presumably scx_cid_to_cpu() only yields valid CPU numbers - verify.
10310*bba2c361STejun Heo  */
10311*bba2c361STejun Heo __bpf_kfunc struct task_struct *scx_bpf_cid_curr(s32 cid, const struct bpf_prog_aux *aux)
10312*bba2c361STejun Heo {
10313*bba2c361STejun Heo 	struct scx_sched *sch;
10314*bba2c361STejun Heo 	s32 cpu;
10315*bba2c361STejun Heo
10316*bba2c361STejun Heo 	guard(rcu)();
10317*bba2c361STejun Heo
10318*bba2c361STejun Heo 	sch = scx_prog_sched(aux);
10319*bba2c361STejun Heo 	if (unlikely(!sch))
10320*bba2c361STejun Heo 		return NULL;
10321*bba2c361STejun Heo 	cpu = scx_cid_to_cpu(sch, cid);
10322*bba2c361STejun Heo 	if (cpu < 0)
10323*bba2c361STejun Heo 		return NULL;
10324*bba2c361STejun Heo 	return rcu_dereference(cpu_rq(cpu)->curr);
10325*bba2c361STejun Heo }
10326*bba2c361STejun Heo
10327*bba2c361STejun Heo /**
10328*bba2c361STejun Heo  * scx_bpf_tid_to_task - Look up a task by its scx tid
10329*bba2c361STejun Heo  * @tid: task ID previously read from p->scx.tid
10330*bba2c361STejun Heo  *
10331*bba2c361STejun Heo  * Returns the task with the given tid, or NULL if no such task exists. The
10332*bba2c361STejun Heo  * returned pointer is valid until the end of the current RCU read section
10333*bba2c361STejun Heo  * (KF_RCU_PROTECTED). Requires SCX_OPS_TID_TO_TASK to be set on the root
10334*bba2c361STejun Heo  * scheduler; otherwise an error is raised and NULL returned.
10335*bba2c361STejun Heo  */
10336*bba2c361STejun Heo __bpf_kfunc struct task_struct *scx_bpf_tid_to_task(u64 tid)
10337*bba2c361STejun Heo {
10338*bba2c361STejun Heo 	struct sched_ext_entity *scx;
10339*bba2c361STejun Heo
10340*bba2c361STejun Heo 	if (!scx_tid_to_task_enabled()) {
10341*bba2c361STejun Heo 		struct scx_sched *sch = rcu_dereference(scx_root);
10342*bba2c361STejun Heo
                         		/* The error can only be raised when a root scheduler exists. */
10343*bba2c361STejun Heo 		if (sch)
10344*bba2c361STejun Heo 			scx_error(sch, "scx_bpf_tid_to_task() called without SCX_OPS_TID_TO_TASK");
10345*bba2c361STejun Heo 		return NULL;
10346*bba2c361STejun Heo 	}
10347*bba2c361STejun Heo
10348*bba2c361STejun Heo 	scx = rhashtable_lookup(&scx_tid_hash, &tid, scx_tid_hash_params);
10349*bba2c361STejun Heo 	if (!scx)
10350*bba2c361STejun Heo 		return NULL;
10351*bba2c361STejun Heo
                         	/* The hashed entity is the ->scx member of its task_struct. */
10352*bba2c361STejun Heo 	return container_of(scx, struct task_struct, scx);
10353*bba2c361STejun Heo }
10354*bba2c361STejun Heo
10355*bba2c361STejun Heo /**
10356*bba2c361STejun Heo  * scx_bpf_now - Returns a high-performance monotonically non-decreasing
10357*bba2c361STejun Heo  * clock for the current CPU. The clock returned is in nanoseconds.
10358*bba2c361STejun Heo  *
10359*bba2c361STejun Heo  * It provides the following properties:
10360*bba2c361STejun Heo  *
10361*bba2c361STejun Heo  * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently
10362*bba2c361STejun Heo  * to account for execution time and track tasks' runtime properties.
10363*bba2c361STejun Heo  * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which
10364*bba2c361STejun Heo  * eventually reads a hardware timestamp counter -- is neither performant nor
10365*bba2c361STejun Heo  * scalable. scx_bpf_now() aims to provide a high-performance clock by
10366*bba2c361STejun Heo  * using the rq clock in the scheduler core whenever possible.
10367*bba2c361STejun Heo  *
10368*bba2c361STejun Heo  * 2) High enough resolution for the BPF scheduler use cases: In most BPF
10369*bba2c361STejun Heo  * scheduler use cases, the required clock resolution is lower than the most
10370*bba2c361STejun Heo  * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically
10371*bba2c361STejun Heo  * uses the rq clock in the scheduler core whenever it is valid. It considers
10372*bba2c361STejun Heo  * that the rq clock is valid from the time the rq clock is updated
10373*bba2c361STejun Heo  * (update_rq_clock) until the rq is unlocked (rq_unpin_lock).
10374*bba2c361STejun Heo  *
10375*bba2c361STejun Heo  * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now()
10376*bba2c361STejun Heo  * guarantees the clock never goes backward when comparing them in the same
10377*bba2c361STejun Heo  * CPU. On the other hand, when comparing clocks in different CPUs, there
10378*bba2c361STejun Heo  * is no such guarantee -- the clock can go backward. It provides a
10379*bba2c361STejun Heo  * monotonically *non-decreasing* clock so that it would provide the same
10380*bba2c361STejun Heo  * clock values in two different scx_bpf_now() calls in the same CPU
10381*bba2c361STejun Heo  * during the same period of when the rq clock is valid.
10382*bba2c361STejun Heo  */
10383*bba2c361STejun Heo __bpf_kfunc u64 scx_bpf_now(void)
10384*bba2c361STejun Heo {
10385*bba2c361STejun Heo 	struct rq *rq;
10386*bba2c361STejun Heo 	u64 clock;
10387*bba2c361STejun Heo
                         	/* Preemption is off while this CPU's rq is being read. */
10388*bba2c361STejun Heo 	preempt_disable();
10389*bba2c361STejun Heo
10390*bba2c361STejun Heo 	rq = this_rq();
10391*bba2c361STejun Heo 	if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) {
10392*bba2c361STejun Heo 		/*
10393*bba2c361STejun Heo 		 * If the rq clock is valid, use the cached rq clock.
10394*bba2c361STejun Heo 		 *
10395*bba2c361STejun Heo 		 * Note that scx_bpf_now() is re-entrant between a process
10396*bba2c361STejun Heo 		 * context and an interrupt context (e.g., timer interrupt).
10397*bba2c361STejun Heo 		 * However, we don't need to consider the race between them
10398*bba2c361STejun Heo 		 * because such race is not observable from a caller.
10399*bba2c361STejun Heo 		 */
10400*bba2c361STejun Heo 		clock = READ_ONCE(rq->scx.clock);
10401*bba2c361STejun Heo 	} else {
10402*bba2c361STejun Heo 		/*
10403*bba2c361STejun Heo 		 * Otherwise, return a fresh rq clock.
10404*bba2c361STejun Heo 		 *
10405*bba2c361STejun Heo 		 * The rq clock is updated outside of the rq lock.
10406*bba2c361STejun Heo 		 * In this case, keep the updated rq clock invalid so the next
10407*bba2c361STejun Heo 		 * kfunc call outside the rq lock gets a fresh rq clock.
10408*bba2c361STejun Heo 		 */
10409*bba2c361STejun Heo 		clock = sched_clock_cpu(cpu_of(rq));
10410*bba2c361STejun Heo 	}
10411*bba2c361STejun Heo
10412*bba2c361STejun Heo 	preempt_enable();
10413*bba2c361STejun Heo
10414*bba2c361STejun Heo 	return clock;
10415*bba2c361STejun Heo }
10416*bba2c361STejun Heo
                         /*
                          * scx_read_events - Sum the per-CPU event counters of @sch into @events
                          * @sch: scheduler whose sch->pcpu event_stats are read
                          * @events: output; zeroed first, then accumulated over every possible CPU
                          *
                          * Each counter is folded in with scx_agg_event(), one call per event type.
                          */
10417*bba2c361STejun Heo static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
10418*bba2c361STejun Heo {
10419*bba2c361STejun Heo 	struct scx_event_stats *e_cpu;
10420*bba2c361STejun Heo 	int cpu;
10421*bba2c361STejun Heo
10422*bba2c361STejun Heo 	/* Aggregate per-CPU event counters into @events. */
10423*bba2c361STejun Heo 	memset(events, 0, sizeof(*events));
10424*bba2c361STejun Heo 	for_each_possible_cpu(cpu) {
10425*bba2c361STejun Heo 		e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;
10426*bba2c361STejun Heo 		scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
10427*bba2c361STejun Heo 		scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
10428*bba2c361STejun Heo 		scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
10429*bba2c361STejun Heo 		scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
10430*bba2c361STejun Heo 		scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
10431*bba2c361STejun Heo 		scx_agg_event(events, e_cpu, SCX_EV_REENQ_IMMED);
10432*bba2c361STejun Heo 		scx_agg_event(events, e_cpu, SCX_EV_REENQ_LOCAL_REPEAT);
10433*bba2c361STejun Heo 		scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL);
10434*bba2c361STejun Heo 		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
10435*bba2c361STejun Heo 		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
10436*bba2c361STejun Heo 		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
10437*bba2c361STejun Heo 		scx_agg_event(events, e_cpu, SCX_EV_INSERT_NOT_OWNED);
10438*bba2c361STejun Heo 		scx_agg_event(events, e_cpu, SCX_EV_SUB_BYPASS_DISPATCH);
10439*bba2c361STejun Heo 	}
10440*bba2c361STejun Heo }
10441*bba2c361STejun Heo
10442*bba2c361STejun Heo /**
10443*bba2c361STejun Heo  * scx_bpf_events - Get a system-wide snapshot of the event counters
10444*bba2c361STejun Heo  * @events: output buffer from a BPF program
10445*bba2c361STejun Heo  * @events__sz: @events len, must end in '__sz' for the verifier
                          *
                          * The counters of the root scheduler (scx_root) are summed over all possible
                          * CPUs with scx_read_events(). If there is no root scheduler, zeroes are
                          * reported. At most min(@events__sz, sizeof(struct scx_event_stats)) bytes
                          * are copied into @events.
10446*bba2c361STejun Heo  */
10447*bba2c361STejun Heo __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
10448*bba2c361STejun Heo 				size_t events__sz)
10449*bba2c361STejun Heo {
10450*bba2c361STejun Heo 	struct scx_sched *sch;
10451*bba2c361STejun Heo 	struct scx_event_stats e_sys;
10452*bba2c361STejun Heo
                         	/* Build the snapshot in a local copy while scx_root is RCU-protected. */
10453*bba2c361STejun Heo 	rcu_read_lock();
10454*bba2c361STejun Heo 	sch = rcu_dereference(scx_root);
10455*bba2c361STejun Heo 	if (sch)
10456*bba2c361STejun Heo 		scx_read_events(sch, &e_sys);
10457*bba2c361STejun Heo 	else
10458*bba2c361STejun Heo 		memset(&e_sys, 0, sizeof(e_sys));
10459*bba2c361STejun Heo 	rcu_read_unlock();
10460*bba2c361STejun Heo
10461*bba2c361STejun Heo 	/*
10462*bba2c361STejun Heo 	 * We cannot entirely trust a BPF-provided size since a BPF program
10463*bba2c361STejun Heo 	 * might be compiled against a different vmlinux.h, of which
10464*bba2c361STejun Heo 	 * scx_event_stats would be larger (a newer vmlinux.h) or smaller
10465*bba2c361STejun Heo 	 * (an older vmlinux.h). Hence, we use the smaller size to avoid
10466*bba2c361STejun Heo 	 * memory corruption.
10467*bba2c361STejun Heo 	 */
10468*bba2c361STejun Heo 	events__sz = min(events__sz, sizeof(*events));
10469*bba2c361STejun Heo 	memcpy(events, &e_sys, events__sz);
10470*bba2c361STejun Heo }
10471*bba2c361STejun Heo
10472*bba2c361STejun Heo #ifdef CONFIG_CGROUP_SCHED
10473*bba2c361STejun Heo /**
10474*bba2c361STejun Heo  * scx_bpf_task_cgroup - Return the sched cgroup of a task
10475*bba2c361STejun Heo  * @p: task of interest
10476*bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
10477*bba2c361STejun Heo  *
10478*bba2c361STejun Heo  * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
10479*bba2c361STejun Heo  * from the scheduler's POV. SCX operations should use this function to
10480*bba2c361STejun Heo  * determine @p's current cgroup as, unlike following @p->cgroups,
10481*bba2c361STejun Heo  * @p->sched_task_group is stable for the duration of the SCX op. See
10482*bba2c361STejun Heo  * SCX_CALL_OP_TASK() for details.
10483*bba2c361STejun Heo  */
10484*bba2c361STejun Heo __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p,
10485*bba2c361STejun Heo 					       const struct bpf_prog_aux *aux)
10486*bba2c361STejun Heo {
10487*bba2c361STejun Heo 	struct task_group *tg = p->sched_task_group;
                         	/* Fallback used on both failure paths below: cgrp_dfl_root's cgroup. */
10488*bba2c361STejun Heo 	struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
10489*bba2c361STejun Heo 	struct scx_sched *sch;
10490*bba2c361STejun Heo
10491*bba2c361STejun Heo 	guard(rcu)();
10492*bba2c361STejun Heo
10493*bba2c361STejun Heo 	sch = scx_prog_sched(aux);
10494*bba2c361STejun Heo 	if (unlikely(!sch))
10495*bba2c361STejun Heo 		goto out;
10496*bba2c361STejun Heo
10497*bba2c361STejun Heo 	if (!scx_kf_arg_task_ok(sch, p))
10498*bba2c361STejun Heo 		goto out;
10499*bba2c361STejun Heo
10500*bba2c361STejun Heo 	cgrp = tg_cgrp(tg);
10501*bba2c361STejun Heo
10502*bba2c361STejun Heo out:
                         	/*
                         	 * A reference is taken on whichever cgroup is returned, fallback
                         	 * included; the kfunc is registered with KF_ACQUIRE below.
                         	 */
10503*bba2c361STejun Heo 	cgroup_get(cgrp);
10504*bba2c361STejun Heo 	return cgrp;
10505*bba2c361STejun Heo }
10506*bba2c361STejun Heo #endif	/* CONFIG_CGROUP_SCHED */
10507*bba2c361STejun Heo
10508*bba2c361STejun Heo __bpf_kfunc_end_defs();
10509*bba2c361STejun Heo
                         /*
                          * kfuncs that are not tied to a particular SCX op context.
                          * scx_kfunc_context_filter() accepts members of this set for SCX struct_ops
                          * programs without consulting the per-op scx_kf_allow_flags[] table.
                          */
10510*bba2c361STejun Heo BTF_KFUNCS_START(scx_kfunc_ids_any)
10511*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU);
10512*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU);
10513*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
10514*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_kick_cid, KF_IMPLICIT_ARGS)
10515*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS)
10516*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS)
10517*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL)
10518*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS)
10519*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS)
10520*bba2c361STejun Heo BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_IMPLICIT_ARGS | KF_ITER_NEW | KF_RCU_PROTECTED)
10521*bba2c361STejun Heo BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
10522*bba2c361STejun Heo BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
10523*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS)
10524*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS)
10525*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS)
10526*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS)
10527*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS)
10528*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS)
10529*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cidperf_cap, KF_IMPLICIT_ARGS)
10530*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cidperf_cur, KF_IMPLICIT_ARGS)
10531*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cidperf_set, KF_IMPLICIT_ARGS)
10532*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_nr_node_ids)
10533*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
10534*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_nr_cids)
10535*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_nr_online_cids)
10536*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_this_cid)
10537*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
10538*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
10539*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
10540*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
10541*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
10542*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_task_cid, KF_RCU)
10543*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS)
10544*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL)
10545*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
10546*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cid_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
10547*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_tid_to_task, KF_RET_NULL | KF_RCU_PROTECTED)
10548*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_now)
10549*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_events)
10550*bba2c361STejun Heo #ifdef CONFIG_CGROUP_SCHED
10551*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_IMPLICIT_ARGS | KF_RCU | KF_ACQUIRE)
10552*bba2c361STejun Heo #endif
10553*bba2c361STejun Heo BTF_KFUNCS_END(scx_kfunc_ids_any)
10554*bba2c361STejun Heo
                         /* Pairs scx_kfunc_ids_any with scx_kfunc_context_filter() as its .filter. */
10555*bba2c361STejun Heo static const struct btf_kfunc_id_set scx_kfunc_set_any = {
10556*bba2c361STejun Heo 	.owner			= THIS_MODULE,
10557*bba2c361STejun Heo 	.set			= &scx_kfunc_ids_any,
10558*bba2c361STejun Heo 	.filter			= scx_kfunc_context_filter,
10559*bba2c361STejun Heo };
10560*bba2c361STejun Heo
10561*bba2c361STejun Heo /*
10562*bba2c361STejun Heo  * cpu-form kfuncs that are forbidden from cid-form schedulers
10563*bba2c361STejun Heo  * (bpf_sched_ext_ops_cid). Programs targeting the cid struct_ops type must
10564*bba2c361STejun Heo  * use the cid-form alternative (cid/cmask kfuncs).
10565*bba2c361STejun Heo  *
10566*bba2c361STejun Heo  * Membership overlaps with scx_kfunc_ids_{any,idle,select_cpu}; the filter
10567*bba2c361STejun Heo  * tests this set independently and rejects matches before the per-op
10568*bba2c361STejun Heo  * allow-list check runs.
10569*bba2c361STejun Heo  *
10570*bba2c361STejun Heo  * pahole/resolve_btfids scans every BTF_ID_FLAGS() at build time and
10571*bba2c361STejun Heo  * intersects flags across duplicate entries, so each entry must carry the
10572*bba2c361STejun Heo  * same flags as the kfunc's primary declaration; otherwise the flags get
10573*bba2c361STejun Heo  * dropped globally.
10574*bba2c361STejun Heo  */
10575*bba2c361STejun Heo BTF_KFUNCS_START(scx_kfunc_ids_cpu_only)
10576*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
10577*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
10578*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS)
10579*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
10580*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpu_node, KF_IMPLICIT_ARGS)
10581*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS)
10582*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS)
10583*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS)
10584*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
10585*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
10586*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
10587*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU)
10588*bba2c361STejun Heo BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU)
10589*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
10590*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_IMPLICIT_ARGS | KF_ACQUIRE)
10591*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE)
10592*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_IMPLICIT_ARGS | KF_ACQUIRE)
10593*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE)
10594*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
10595*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle, KF_IMPLICIT_ARGS)
10596*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_IMPLICIT_ARGS | KF_RCU)
10597*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
10598*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_IMPLICIT_ARGS | KF_RCU)
10599*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
10600*bba2c361STejun Heo BTF_KFUNCS_END(scx_kfunc_ids_cpu_only)
10601*bba2c361STejun Heo
10602*bba2c361STejun Heo /*
10603*bba2c361STejun Heo  * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc
10604*bba2c361STejun Heo  * group; an op may permit zero or more groups, with the union expressed in
10605*bba2c361STejun Heo  * scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter())
10606*bba2c361STejun Heo  * consults this table to decide whether a context-sensitive kfunc is callable
10607*bba2c361STejun Heo  * from a given SCX op.
                          *
                          * The trailing comments name the kfunc id set that the filter pairs with
                          * each bit.
10608*bba2c361STejun Heo  */
10609*bba2c361STejun Heo enum scx_kf_allow_flags {
10610*bba2c361STejun Heo 	SCX_KF_ALLOW_UNLOCKED		= 1 << 0,	/* scx_kfunc_ids_unlocked */
10611*bba2c361STejun Heo 	SCX_KF_ALLOW_INIT		= 1 << 1,	/* scx_kfunc_ids_init */
10612*bba2c361STejun Heo 	SCX_KF_ALLOW_CPU_RELEASE	= 1 << 2,	/* scx_kfunc_ids_cpu_release */
10613*bba2c361STejun Heo 	SCX_KF_ALLOW_DISPATCH		= 1 << 3,	/* scx_kfunc_ids_dispatch */
10614*bba2c361STejun Heo 	SCX_KF_ALLOW_ENQUEUE		= 1 << 4,	/* scx_kfunc_ids_enqueue_dispatch */
10615*bba2c361STejun Heo 	SCX_KF_ALLOW_SELECT_CPU		= 1 << 5,	/* scx_kfunc_ids_select_cpu */
10616*bba2c361STejun Heo };
10617*bba2c361STejun Heo
10618*bba2c361STejun Heo /*
10619*bba2c361STejun Heo  * Map each SCX op to the union of kfunc groups it permits, indexed by
10620*bba2c361STejun Heo  * SCX_OP_IDX(op). Ops not listed only permit kfuncs that are not
10621*bba2c361STejun Heo  * context-sensitive.
                          *
                          * The array has no explicit size, so its length follows from the largest
                          * index among the designated initializers below.
10622*bba2c361STejun Heo  */
10623*bba2c361STejun Heo static const u32 scx_kf_allow_flags[] = {
10624*bba2c361STejun Heo 	[SCX_OP_IDX(select_cpu)]		= SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE,
10625*bba2c361STejun Heo 	[SCX_OP_IDX(enqueue)]			= SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE,
10626*bba2c361STejun Heo 	[SCX_OP_IDX(dispatch)]			= SCX_KF_ALLOW_ENQUEUE | SCX_KF_ALLOW_DISPATCH,
10627*bba2c361STejun Heo 	[SCX_OP_IDX(cpu_release)]		= SCX_KF_ALLOW_CPU_RELEASE,
10628*bba2c361STejun Heo 	[SCX_OP_IDX(init_task)]			= SCX_KF_ALLOW_UNLOCKED,
10629*bba2c361STejun Heo 	[SCX_OP_IDX(dump)]			= SCX_KF_ALLOW_UNLOCKED,
10630*bba2c361STejun Heo #ifdef CONFIG_EXT_GROUP_SCHED
10631*bba2c361STejun Heo 	[SCX_OP_IDX(cgroup_init)]		= SCX_KF_ALLOW_UNLOCKED,
10632*bba2c361STejun Heo 	[SCX_OP_IDX(cgroup_exit)]		= SCX_KF_ALLOW_UNLOCKED,
10633*bba2c361STejun Heo 	[SCX_OP_IDX(cgroup_prep_move)]		= SCX_KF_ALLOW_UNLOCKED,
10634*bba2c361STejun Heo 	[SCX_OP_IDX(cgroup_cancel_move)]	= SCX_KF_ALLOW_UNLOCKED,
10635*bba2c361STejun Heo 	[SCX_OP_IDX(cgroup_set_weight)]		= SCX_KF_ALLOW_UNLOCKED,
10636*bba2c361STejun Heo 	[SCX_OP_IDX(cgroup_set_bandwidth)]	= SCX_KF_ALLOW_UNLOCKED,
10637*bba2c361STejun Heo 	[SCX_OP_IDX(cgroup_set_idle)]		= SCX_KF_ALLOW_UNLOCKED,
10638*bba2c361STejun Heo #endif	/* CONFIG_EXT_GROUP_SCHED */
10639*bba2c361STejun Heo 	[SCX_OP_IDX(sub_attach)]		= SCX_KF_ALLOW_UNLOCKED,
10640*bba2c361STejun Heo 	[SCX_OP_IDX(sub_detach)]		= SCX_KF_ALLOW_UNLOCKED,
10641*bba2c361STejun Heo 	[SCX_OP_IDX(cpu_online)]		= SCX_KF_ALLOW_UNLOCKED,
10642*bba2c361STejun Heo 	[SCX_OP_IDX(cpu_offline)]		= SCX_KF_ALLOW_UNLOCKED,
10643*bba2c361STejun Heo 	[SCX_OP_IDX(init)]			= SCX_KF_ALLOW_UNLOCKED | SCX_KF_ALLOW_INIT,
10644*bba2c361STejun Heo 	[SCX_OP_IDX(exit)]			= SCX_KF_ALLOW_UNLOCKED,
10645*bba2c361STejun Heo };
10646*bba2c361STejun Heo
10647*bba2c361STejun Heo /*
10648*bba2c361STejun Heo  * Verifier-time filter for SCX kfuncs. Registered via the .filter field on
10649*bba2c361STejun Heo  * each per-group btf_kfunc_id_set.
The BPF core invokes this for every kfunc
10650*bba2c361STejun Heo  * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or
10651*bba2c361STejun Heo  * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the
10652*bba2c361STejun Heo  * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by
10653*bba2c361STejun Heo  * falling through to "allow" when none of the SCX sets contain the kfunc.
                          *
                          * Returns 0 to allow the call and -EACCES to reject it.
10654*bba2c361STejun Heo  */
10655*bba2c361STejun Heo int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
10656*bba2c361STejun Heo {
10657*bba2c361STejun Heo 	bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id);
10658*bba2c361STejun Heo 	bool in_init = btf_id_set8_contains(&scx_kfunc_ids_init, kfunc_id);
10659*bba2c361STejun Heo 	bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id);
10660*bba2c361STejun Heo 	bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id);
10661*bba2c361STejun Heo 	bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id);
10662*bba2c361STejun Heo 	bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id);
10663*bba2c361STejun Heo 	bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id);
10664*bba2c361STejun Heo 	bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id);
10665*bba2c361STejun Heo 	bool in_cpu_only = btf_id_set8_contains(&scx_kfunc_ids_cpu_only, kfunc_id);
10666*bba2c361STejun Heo 	u32 moff, flags;
10667*bba2c361STejun Heo
                         	/*
                         	 * NOTE(review): in_cpu_only is not part of this test; presumably every
                         	 * member of scx_kfunc_ids_cpu_only also belongs to one of the sets
                         	 * checked here - verify.
                         	 */
10668*bba2c361STejun Heo 	/* Not an SCX kfunc - allow. */
10669*bba2c361STejun Heo 	if (!(in_unlocked || in_init || in_select_cpu || in_enqueue || in_dispatch ||
10670*bba2c361STejun Heo 	      in_cpu_release || in_idle || in_any))
10671*bba2c361STejun Heo 		return 0;
10672*bba2c361STejun Heo
10673*bba2c361STejun Heo 	/* SYSCALL progs (e.g. BPF test_run()) may call unlocked, select_cpu, idle and any kfuncs. */
10674*bba2c361STejun Heo 	if (prog->type == BPF_PROG_TYPE_SYSCALL)
10675*bba2c361STejun Heo 		return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES;
10676*bba2c361STejun Heo
                         	/* Any other non-struct_ops program type is limited to the any and idle sets. */
10677*bba2c361STejun Heo 	if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
10678*bba2c361STejun Heo 		return (in_any || in_idle) ? 0 : -EACCES;
10679*bba2c361STejun Heo
10680*bba2c361STejun Heo 	/*
10681*bba2c361STejun Heo 	 * add_subprog_and_kfunc() collects all kfunc calls, including dead code
10682*bba2c361STejun Heo 	 * guarded by bpf_ksym_exists(), before check_attach_btf_id() sets
10683*bba2c361STejun Heo 	 * prog->aux->st_ops. Allow all kfuncs when st_ops is not yet set;
10684*bba2c361STejun Heo 	 * do_check_main() re-runs the filter with st_ops set and enforces the
10685*bba2c361STejun Heo 	 * actual restrictions.
10686*bba2c361STejun Heo 	 */
10687*bba2c361STejun Heo 	if (!prog->aux->st_ops)
10688*bba2c361STejun Heo 		return 0;
10689*bba2c361STejun Heo
10690*bba2c361STejun Heo 	/*
10691*bba2c361STejun Heo 	 * Non-SCX struct_ops: SCX kfuncs are not permitted.
10692*bba2c361STejun Heo 	 *
10693*bba2c361STejun Heo 	 * Both bpf_sched_ext_ops (cpu-form) and bpf_sched_ext_ops_cid
10694*bba2c361STejun Heo 	 * (cid-form) are valid SCX struct_ops. Member offsets match between
10695*bba2c361STejun Heo 	 * the two (verified by BUILD_BUG_ON in scx_init()), so the shared
10696*bba2c361STejun Heo 	 * scx_kf_allow_flags[] table indexed by SCX_MOFF_IDX(moff) applies to
10697*bba2c361STejun Heo 	 * both.
10698*bba2c361STejun Heo 	 */
10699*bba2c361STejun Heo 	if (prog->aux->st_ops != &bpf_sched_ext_ops &&
10700*bba2c361STejun Heo 	    prog->aux->st_ops != &bpf_sched_ext_ops_cid)
10701*bba2c361STejun Heo 		return -EACCES;
10702*bba2c361STejun Heo
10703*bba2c361STejun Heo 	/*
10704*bba2c361STejun Heo 	 * cid-form schedulers must use cid/cmask kfuncs. cid and cpu are both
10705*bba2c361STejun Heo 	 * small s32s and trivially confused, so cpu-only kfuncs are rejected at
10706*bba2c361STejun Heo 	 * load time. The reverse (cpu-form calling cid-form kfuncs) is
10707*bba2c361STejun Heo 	 * intentionally permissive to ease gradual cpumask -> cid migration.
10708*bba2c361STejun Heo 	 */
10709*bba2c361STejun Heo 	if (prog->aux->st_ops == &bpf_sched_ext_ops_cid && in_cpu_only)
10710*bba2c361STejun Heo 		return -EACCES;
10711*bba2c361STejun Heo
10712*bba2c361STejun Heo 	/* SCX struct_ops: check the per-op allow list. */
10713*bba2c361STejun Heo 	if (in_any || in_idle)
10714*bba2c361STejun Heo 		return 0;
10715*bba2c361STejun Heo
                         	/*
                         	 * Look up the groups permitted for the op this program is attached to.
                         	 * NOTE(review): the index is not range-checked against the length of
                         	 * scx_kf_allow_flags[]; confirm that no op member can map past its last
                         	 * initialized entry.
                         	 */
10716*bba2c361STejun Heo 	moff = prog->aux->attach_st_ops_member_off;
10717*bba2c361STejun Heo 	flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)];
10718*bba2c361STejun Heo
                         	/* Allow as soon as one permitted group contains the kfunc. */
10719*bba2c361STejun Heo 	if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked)
10720*bba2c361STejun Heo 		return 0;
10721*bba2c361STejun Heo 	if ((flags & SCX_KF_ALLOW_INIT) && in_init)
10722*bba2c361STejun Heo 		return 0;
10723*bba2c361STejun Heo 	if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release)
10724*bba2c361STejun Heo 		return 0;
10725*bba2c361STejun Heo 	if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch)
10726*bba2c361STejun Heo 		return 0;
10727*bba2c361STejun Heo 	if ((flags & SCX_KF_ALLOW_ENQUEUE) && in_enqueue)
10728*bba2c361STejun Heo 		return 0;
10729*bba2c361STejun Heo 	if ((flags & SCX_KF_ALLOW_SELECT_CPU) && in_select_cpu)
10730*bba2c361STejun Heo 		return 0;
10731*bba2c361STejun Heo
10732*bba2c361STejun Heo 	return -EACCES;
10733*bba2c361STejun Heo }
10734*bba2c361STejun Heo
10735*bba2c361STejun Heo static int __init scx_init(void)
10736*bba2c361STejun Heo {
10737*bba2c361STejun Heo 	int ret;
10738*bba2c361STejun Heo
10739*bba2c361STejun Heo 	/*
10740*bba2c361STejun Heo 	 * sched_ext_ops_cid mirrors sched_ext_ops up to and including @priv.
10741*bba2c361STejun Heo 	 * Both bpf_scx_init_member() and bpf_scx_check_member() use offsets
10742*bba2c361STejun Heo 	 * from struct sched_ext_ops; sched_ext_ops_cid relies on those offsets
10743*bba2c361STejun Heo 	 * matching for the shared fields.
Catch any drift at boot. 10744*bba2c361STejun Heo */ 10745*bba2c361STejun Heo #define CID_OFFSET_MATCH(cpu_field, cid_field) \ 10746*bba2c361STejun Heo BUILD_BUG_ON(offsetof(struct sched_ext_ops, cpu_field) != \ 10747*bba2c361STejun Heo offsetof(struct sched_ext_ops_cid, cid_field)) 10748*bba2c361STejun Heo /* data fields used by bpf_scx_init_member() */ 10749*bba2c361STejun Heo CID_OFFSET_MATCH(dispatch_max_batch, dispatch_max_batch); 10750*bba2c361STejun Heo CID_OFFSET_MATCH(flags, flags); 10751*bba2c361STejun Heo CID_OFFSET_MATCH(name, name); 10752*bba2c361STejun Heo CID_OFFSET_MATCH(timeout_ms, timeout_ms); 10753*bba2c361STejun Heo CID_OFFSET_MATCH(exit_dump_len, exit_dump_len); 10754*bba2c361STejun Heo CID_OFFSET_MATCH(hotplug_seq, hotplug_seq); 10755*bba2c361STejun Heo CID_OFFSET_MATCH(sub_cgroup_id, sub_cgroup_id); 10756*bba2c361STejun Heo /* shared callbacks: the union view requires byte-for-byte offset match */ 10757*bba2c361STejun Heo CID_OFFSET_MATCH(enqueue, enqueue); 10758*bba2c361STejun Heo CID_OFFSET_MATCH(dequeue, dequeue); 10759*bba2c361STejun Heo CID_OFFSET_MATCH(dispatch, dispatch); 10760*bba2c361STejun Heo CID_OFFSET_MATCH(tick, tick); 10761*bba2c361STejun Heo CID_OFFSET_MATCH(runnable, runnable); 10762*bba2c361STejun Heo CID_OFFSET_MATCH(running, running); 10763*bba2c361STejun Heo CID_OFFSET_MATCH(stopping, stopping); 10764*bba2c361STejun Heo CID_OFFSET_MATCH(quiescent, quiescent); 10765*bba2c361STejun Heo CID_OFFSET_MATCH(yield, yield); 10766*bba2c361STejun Heo CID_OFFSET_MATCH(core_sched_before, core_sched_before); 10767*bba2c361STejun Heo CID_OFFSET_MATCH(set_weight, set_weight); 10768*bba2c361STejun Heo CID_OFFSET_MATCH(update_idle, update_idle); 10769*bba2c361STejun Heo CID_OFFSET_MATCH(init_task, init_task); 10770*bba2c361STejun Heo CID_OFFSET_MATCH(exit_task, exit_task); 10771*bba2c361STejun Heo CID_OFFSET_MATCH(enable, enable); 10772*bba2c361STejun Heo CID_OFFSET_MATCH(disable, disable); 10773*bba2c361STejun Heo CID_OFFSET_MATCH(dump, 
dump); 10774*bba2c361STejun Heo CID_OFFSET_MATCH(dump_task, dump_task); 10775*bba2c361STejun Heo CID_OFFSET_MATCH(sub_attach, sub_attach); 10776*bba2c361STejun Heo CID_OFFSET_MATCH(sub_detach, sub_detach); 10777*bba2c361STejun Heo CID_OFFSET_MATCH(init, init); 10778*bba2c361STejun Heo CID_OFFSET_MATCH(exit, exit); 10779*bba2c361STejun Heo #ifdef CONFIG_EXT_GROUP_SCHED 10780*bba2c361STejun Heo CID_OFFSET_MATCH(cgroup_init, cgroup_init); 10781*bba2c361STejun Heo CID_OFFSET_MATCH(cgroup_exit, cgroup_exit); 10782*bba2c361STejun Heo CID_OFFSET_MATCH(cgroup_prep_move, cgroup_prep_move); 10783*bba2c361STejun Heo CID_OFFSET_MATCH(cgroup_move, cgroup_move); 10784*bba2c361STejun Heo CID_OFFSET_MATCH(cgroup_cancel_move, cgroup_cancel_move); 10785*bba2c361STejun Heo CID_OFFSET_MATCH(cgroup_set_weight, cgroup_set_weight); 10786*bba2c361STejun Heo CID_OFFSET_MATCH(cgroup_set_bandwidth, cgroup_set_bandwidth); 10787*bba2c361STejun Heo CID_OFFSET_MATCH(cgroup_set_idle, cgroup_set_idle); 10788*bba2c361STejun Heo #endif 10789*bba2c361STejun Heo /* renamed callbacks must occupy the same slot as their cpu-form sibling */ 10790*bba2c361STejun Heo CID_OFFSET_MATCH(select_cpu, select_cid); 10791*bba2c361STejun Heo CID_OFFSET_MATCH(set_cpumask, set_cmask); 10792*bba2c361STejun Heo CID_OFFSET_MATCH(cpu_online, cid_online); 10793*bba2c361STejun Heo CID_OFFSET_MATCH(cpu_offline, cid_offline); 10794*bba2c361STejun Heo CID_OFFSET_MATCH(dump_cpu, dump_cid); 10795*bba2c361STejun Heo /* @priv tail must align since both share the same data block */ 10796*bba2c361STejun Heo CID_OFFSET_MATCH(priv, priv); 10797*bba2c361STejun Heo /* 10798*bba2c361STejun Heo * cid-form must end exactly at @priv - validate_ops() skips 10799*bba2c361STejun Heo * cpu_acquire/cpu_release for cid-form because reading those fields 10800*bba2c361STejun Heo * past the BPF allocation would be UB. 
10801*bba2c361STejun Heo */ 10802*bba2c361STejun Heo BUILD_BUG_ON(offsetof(struct sched_ext_ops_cid, __end) != 10803*bba2c361STejun Heo offsetofend(struct sched_ext_ops, priv)); 10804*bba2c361STejun Heo #undef CID_OFFSET_MATCH 10805*bba2c361STejun Heo 10806*bba2c361STejun Heo /* 10807*bba2c361STejun Heo * kfunc registration can't be done from init_sched_ext_class() as 10808*bba2c361STejun Heo * register_btf_kfunc_id_set() needs most of the system to be up. 10809*bba2c361STejun Heo * 10810*bba2c361STejun Heo * Some kfuncs are context-sensitive and can only be called from 10811*bba2c361STejun Heo * specific SCX ops. They are grouped into per-context BTF sets, each 10812*bba2c361STejun Heo * registered with scx_kfunc_context_filter as its .filter callback. The 10813*bba2c361STejun Heo * BPF core dedups identical filter pointers per hook 10814*bba2c361STejun Heo * (btf_populate_kfunc_set()), so the filter is invoked exactly once per 10815*bba2c361STejun Heo * kfunc lookup; it consults scx_kf_allow_flags[] to enforce per-op 10816*bba2c361STejun Heo * restrictions at verify time. 
10817*bba2c361STejun Heo */ 10818*bba2c361STejun Heo if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10819*bba2c361STejun Heo &scx_kfunc_set_enqueue_dispatch)) || 10820*bba2c361STejun Heo (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10821*bba2c361STejun Heo &scx_kfunc_set_dispatch)) || 10822*bba2c361STejun Heo (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10823*bba2c361STejun Heo &scx_kfunc_set_cpu_release)) || 10824*bba2c361STejun Heo (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10825*bba2c361STejun Heo &scx_kfunc_set_unlocked)) || 10826*bba2c361STejun Heo (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 10827*bba2c361STejun Heo &scx_kfunc_set_unlocked)) || 10828*bba2c361STejun Heo (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 10829*bba2c361STejun Heo &scx_kfunc_set_any)) || 10830*bba2c361STejun Heo (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, 10831*bba2c361STejun Heo &scx_kfunc_set_any)) || 10832*bba2c361STejun Heo (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 10833*bba2c361STejun Heo &scx_kfunc_set_any))) { 10834*bba2c361STejun Heo pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); 10835*bba2c361STejun Heo return ret; 10836*bba2c361STejun Heo } 10837*bba2c361STejun Heo 10838*bba2c361STejun Heo ret = scx_idle_init(); 10839*bba2c361STejun Heo if (ret) { 10840*bba2c361STejun Heo pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); 10841*bba2c361STejun Heo return ret; 10842*bba2c361STejun Heo } 10843*bba2c361STejun Heo 10844*bba2c361STejun Heo ret = scx_cid_kfunc_init(); 10845*bba2c361STejun Heo if (ret) { 10846*bba2c361STejun Heo pr_err("sched_ext: Failed to register cid kfuncs (%d)\n", ret); 10847*bba2c361STejun Heo return ret; 10848*bba2c361STejun Heo } 10849*bba2c361STejun Heo 10850*bba2c361STejun Heo ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); 10851*bba2c361STejun Heo if (ret) { 10852*bba2c361STejun Heo 
pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); 10853*bba2c361STejun Heo return ret; 10854*bba2c361STejun Heo } 10855*bba2c361STejun Heo 10856*bba2c361STejun Heo ret = register_bpf_struct_ops(&bpf_sched_ext_ops_cid, sched_ext_ops_cid); 10857*bba2c361STejun Heo if (ret) { 10858*bba2c361STejun Heo pr_err("sched_ext: Failed to register cid struct_ops (%d)\n", ret); 10859*bba2c361STejun Heo return ret; 10860*bba2c361STejun Heo } 10861*bba2c361STejun Heo 10862*bba2c361STejun Heo ret = register_pm_notifier(&scx_pm_notifier); 10863*bba2c361STejun Heo if (ret) { 10864*bba2c361STejun Heo pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); 10865*bba2c361STejun Heo return ret; 10866*bba2c361STejun Heo } 10867*bba2c361STejun Heo 10868*bba2c361STejun Heo scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); 10869*bba2c361STejun Heo if (!scx_kset) { 10870*bba2c361STejun Heo pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); 10871*bba2c361STejun Heo return -ENOMEM; 10872*bba2c361STejun Heo } 10873*bba2c361STejun Heo 10874*bba2c361STejun Heo ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); 10875*bba2c361STejun Heo if (ret < 0) { 10876*bba2c361STejun Heo pr_err("sched_ext: Failed to add global attributes\n"); 10877*bba2c361STejun Heo return ret; 10878*bba2c361STejun Heo } 10879*bba2c361STejun Heo 10880*bba2c361STejun Heo return 0; 10881*bba2c361STejun Heo } 10882*bba2c361STejun Heo __initcall(scx_init); 10883