1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * A simple scheduler. 4 * 5 * By default, it operates as a simple global weighted vtime scheduler and can 6 * be switched to FIFO scheduling. It also demonstrates the following niceties. 7 * 8 * - Statistics tracking how many tasks are queued to local and global dsq's. 9 * - Termination notification for userspace. 10 * 11 * While very simple, this scheduler should work reasonably well on CPUs with a 12 * uniform L3 cache topology. While preemption is not implemented, the fact that 13 * the scheduling queue is shared across all CPUs means that whatever is at the 14 * front of the queue is likely to be executed fairly quickly given enough 15 * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads 16 * but comes with the usual problems with FIFO scheduling where saturating 17 * threads can easily drown out interactive ones. 18 * 19 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 20 * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 21 * Copyright (c) 2022 David Vernet <dvernet@meta.com> 22 */ 23 #include <scx/common.bpf.h> 24 25 char _license[] SEC("license") = "GPL"; 26 27 const volatile bool fifo_sched; 28 29 static u64 vtime_now; 30 UEI_DEFINE(uei); 31 32 /* 33 * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues 34 * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). We 35 * therefore create a separate DSQ with ID 0 that we dispatch to and consume 36 * from. If scx_simple only supported global FIFO scheduling, then we could 37 * just use SCX_DSQ_GLOBAL. 38 */ 39 #define SHARED_DSQ 0 40 41 struct { 42 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 43 __uint(key_size, sizeof(u32)); 44 __uint(value_size, sizeof(u64)); 45 __uint(max_entries, 2); /* [local, global] */ 46 } stats SEC(".maps"); 47 48 static void stat_inc(u32 idx) 49 { 50 u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); 51 if (cnt_p) 52 (*cnt_p)++; 53 } 54 55 static inline bool vtime_before(u64 a, u64 b) 56 { 57 return (s64)(a - b) < 0; 58 } 59 60 s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) 61 { 62 bool is_idle = false; 63 s32 cpu; 64 65 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); 66 if (is_idle) { 67 stat_inc(0); /* count local queueing */ 68 scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); 69 } 70 71 return cpu; 72 } 73 74 void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) 75 { 76 stat_inc(1); /* count global queueing */ 77 78 if (fifo_sched) { 79 scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); 80 } else { 81 u64 vtime = p->scx.dsq_vtime; 82 83 /* 84 * Limit the amount of budget that an idling task can accumulate 85 * to one slice. 86 */ 87 if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) 88 vtime = vtime_now - SCX_SLICE_DFL; 89 90 scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, 91 enq_flags); 92 } 93 } 94 95 void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) 96 { 97 scx_bpf_consume(SHARED_DSQ); 98 } 99 100 void BPF_STRUCT_OPS(simple_running, struct task_struct *p) 101 { 102 if (fifo_sched) 103 return; 104 105 /* 106 * Global vtime always progresses forward as tasks start executing. The 107 * test and update can be performed concurrently from multiple CPUs and 108 * thus racy. Any error should be contained and temporary. Let's just 109 * live with it. 110 */ 111 if (vtime_before(vtime_now, p->scx.dsq_vtime)) 112 vtime_now = p->scx.dsq_vtime; 113 } 114 115 void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) 116 { 117 if (fifo_sched) 118 return; 119 120 /* 121 * Scale the execution time by the inverse of the weight and charge. 122 * 123 * Note that the default yield implementation yields by setting 124 * @p->scx.slice to zero and the following would treat the yielding task 125 * as if it has consumed all its slice. If this penalizes yielding tasks 126 * too much, determine the execution time by taking explicit timestamps 127 * instead of depending on @p->scx.slice. 128 */ 129 p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; 130 } 131 132 void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) 133 { 134 p->scx.dsq_vtime = vtime_now; 135 } 136 137 s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) 138 { 139 return scx_bpf_create_dsq(SHARED_DSQ, -1); 140 } 141 142 void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) 143 { 144 UEI_RECORD(uei, ei); 145 } 146 147 SCX_OPS_DEFINE(simple_ops, 148 .select_cpu = (void *)simple_select_cpu, 149 .enqueue = (void *)simple_enqueue, 150 .dispatch = (void *)simple_dispatch, 151 .running = (void *)simple_running, 152 .stopping = (void *)simple_stopping, 153 .enable = (void *)simple_enable, 154 .init = (void *)simple_init, 155 .exit = (void *)simple_exit, 156 .name = "simple"); 157