1 /*
2 * Copyright 2026 The FreeBSD Foundation
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 *
6 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
7 * under sponsorship from the FreeBSD Foundation.
8 */
9
10 #include "opt_sched.h"
11
12 #include <sys/systm.h>
13 #include <sys/kernel.h>
14 #include <sys/lock.h>
15 #include <sys/proc.h>
16 #include <sys/runq.h>
17 #include <sys/sbuf.h>
18 #include <sys/sched.h>
19 #include <sys/smp.h>
20 #include <sys/sysctl.h>
21 #include <machine/ifunc.h>
22
/*
 * The scheduler implementation in use.  Assigned by sched_instance_select();
 * the shims and the hooks below all dispatch through this pointer.
 */
const struct sched_instance *active_sched;
24
/*
 * Shim generators.
 *
 * __DEFINE_SHIM() uses DEFINE_IFUNC() to define the function __n with
 * return type __r and parameter list __p.  The body handed to
 * DEFINE_IFUNC() returns the __m member of active_sched.  The __a
 * argument (the parenthesized call-argument list) is accepted but does
 * not appear in the expansion.
 *
 * DEFINE_SHIM0/1/2 wrap __DEFINE_SHIM() for entry points that take zero,
 * one or two parameters, each given as a (type, name) pair.
 *
 * NOTE(review): DEFINE_IFUNC() comes from <machine/ifunc.h>; presumably
 * the body is a resolver that is evaluated once rather than on every
 * call -- confirm against that header.
 */
#define	__DEFINE_SHIM(__m, __r, __n, __p, __a)	\
	DEFINE_IFUNC(, __r, __n, __p)		\
	{					\
		return (active_sched->__m);	\
	}
#define	DEFINE_SHIM0(__m, __r, __n)	\
	__DEFINE_SHIM(__m, __r, __n, (void), ())
#define	DEFINE_SHIM1(__m, __r, __n, __t1, __a1)	\
	__DEFINE_SHIM(__m, __r, __n, (__t1 __a1), (__a1))
#define	DEFINE_SHIM2(__m, __r, __n, __t1, __a1, __t2, __a2)	\
	__DEFINE_SHIM(__m, __r, __n, (__t1 __a1, __t2 __a2), (__a1, __a2))
36
/*
 * Scheduler entry points.
 *
 * Each invocation lists, in order: the struct sched_instance member that
 * supplies the implementation, the return type, the name of the function
 * being defined, and then a (type, name) pair per parameter.
 *
 * The function is normally named sched_<member>.  The exceptions are
 * the "sswitch" member, which backs sched_switch(), and the "init_ap"
 * member, which backs schedinit_ap().
 */
DEFINE_SHIM0(load, int, sched_load)
DEFINE_SHIM0(rr_interval, int, sched_rr_interval)
DEFINE_SHIM0(runnable, bool, sched_runnable)
DEFINE_SHIM2(exit, void, sched_exit, struct proc *, p,
    struct thread *, childtd)
DEFINE_SHIM2(fork, void, sched_fork, struct thread *, td,
    struct thread *, childtd)
DEFINE_SHIM1(fork_exit, void, sched_fork_exit, struct thread *, td)
DEFINE_SHIM2(class, void, sched_class, struct thread *, td, int, class)
DEFINE_SHIM2(nice, void, sched_nice, struct proc *, p, int, nice)
DEFINE_SHIM0(ap_entry, void, sched_ap_entry)
DEFINE_SHIM2(exit_thread, void, sched_exit_thread, struct thread *, td,
    struct thread *, child)
DEFINE_SHIM1(estcpu, u_int, sched_estcpu, struct thread *, td)
DEFINE_SHIM2(fork_thread, void, sched_fork_thread, struct thread *, td,
    struct thread *, child)
DEFINE_SHIM2(ithread_prio, void, sched_ithread_prio, struct thread *, td,
    u_char, prio)
DEFINE_SHIM2(lend_prio, void, sched_lend_prio, struct thread *, td,
    u_char, prio)
DEFINE_SHIM2(lend_user_prio, void, sched_lend_user_prio, struct thread *, td,
    u_char, pri)
DEFINE_SHIM2(lend_user_prio_cond, void, sched_lend_user_prio_cond,
    struct thread *, td, u_char, pri)
DEFINE_SHIM1(pctcpu, fixpt_t, sched_pctcpu, struct thread *, td)
DEFINE_SHIM2(prio, void, sched_prio, struct thread *, td, u_char, prio)
DEFINE_SHIM2(sleep, void, sched_sleep, struct thread *, td, int, prio)
DEFINE_SHIM2(sswitch, void, sched_switch, struct thread *, td, int, flags)
DEFINE_SHIM1(throw, void, sched_throw, struct thread *, td)
DEFINE_SHIM2(unlend_prio, void, sched_unlend_prio, struct thread *, td,
    u_char, prio)
DEFINE_SHIM2(user_prio, void, sched_user_prio, struct thread *, td,
    u_char, prio)
DEFINE_SHIM1(userret_slowpath, void, sched_userret_slowpath,
    struct thread *, td)
DEFINE_SHIM2(add, void, sched_add, struct thread *, td, int, flags)
DEFINE_SHIM0(choose, struct thread *, sched_choose)
DEFINE_SHIM2(clock, void, sched_clock, struct thread *, td, int, cnt)
DEFINE_SHIM1(idletd, void, sched_idletd, void *, dummy)
DEFINE_SHIM1(preempt, void, sched_preempt, struct thread *, td)
DEFINE_SHIM1(relinquish, void, sched_relinquish, struct thread *, td)
DEFINE_SHIM1(rem, void, sched_rem, struct thread *, td)
DEFINE_SHIM2(wakeup, void, sched_wakeup, struct thread *, td, int, srqflags)
DEFINE_SHIM2(bind, void, sched_bind, struct thread *, td, int, cpu)
DEFINE_SHIM1(unbind, void, sched_unbind, struct thread *, td)
DEFINE_SHIM1(is_bound, int, sched_is_bound, struct thread *, td)
DEFINE_SHIM1(affinity, void, sched_affinity, struct thread *, td)
DEFINE_SHIM0(sizeof_proc, int, sched_sizeof_proc)
DEFINE_SHIM0(sizeof_thread, int, sched_sizeof_thread)
DEFINE_SHIM1(tdname, char *, sched_tdname, struct thread *, td)
DEFINE_SHIM1(clear_tdname, void, sched_clear_tdname, struct thread *, td)
DEFINE_SHIM0(do_timer_accounting, bool, sched_do_timer_accounting)
DEFINE_SHIM1(find_l2_neighbor, int, sched_find_l2_neighbor, int, cpu)
DEFINE_SHIM0(init_ap, void, schedinit_ap)
91
92
/*
 * Scheduler statistics for interrupt threads.  They are only defined
 * here; the visible part of this file does not update them.
 */
SCHED_STAT_DEFINE(ithread_demotions, "Interrupt thread priority demotions");
SCHED_STAT_DEFINE(ithread_preemptions,
    "Interrupt thread preemptions due to time-sharing");
96
/*
 * DTrace SDT provider "sched" and its probes.  The quoted strings give the
 * type of each probe argument, in order; on__cpu and remain__cpu take no
 * arguments.  The probes are only defined here; the visible part of this
 * file does not fire them.
 *
 * Per SDT(9), a double underscore in a probe name is presented to DTrace
 * as a dash (e.g. change__pri is seen as "change-pri").
 */
SDT_PROVIDER_DEFINE(sched);

SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *",
    "struct proc *", "uint8_t");
SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *",
    "struct proc *", "void *");
SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *",
    "struct proc *", "void *", "int");
SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *",
    "struct proc *", "uint8_t", "struct thread *");
SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int");
SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *",
    "struct proc *");
SDT_PROBE_DEFINE(sched, , , on__cpu);
SDT_PROBE_DEFINE(sched, , , remain__cpu);
SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *",
    "struct proc *");
114
/*
 * DTrace virtual-time variables, present only in kernels built with
 * KDTRACE_HOOKS.  They are defined here and not otherwise referenced in
 * the visible part of this file.
 * NOTE(review): presumably dtrace_vtime_active gates calls through
 * dtrace_vtime_switch_func on context switch -- confirm against the users
 * of these symbols.
 */
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
int __read_mostly dtrace_vtime_active;
dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
#endif
120
/*
 * Name of the scheduler to use.  Defaults to "ULE", may be replaced by the
 * kern.sched.name tunable, and is overwritten with the name of the fallback
 * scheduler when sched_instance_select() finds no match.  Exported
 * read-only as the kern.sched.name sysctl.
 */
static char sched_name[32] = "ULE";

/* Linker set holding one struct sched_selection per available scheduler. */
SET_DECLARE(sched_instance_set, struct sched_selection);
124
125 void
sched_instance_select(void)126 sched_instance_select(void)
127 {
128 struct sched_selection *s, **ss;
129 int i;
130
131 TUNABLE_STR_FETCH("kern.sched.name", sched_name, sizeof(sched_name));
132 SET_FOREACH(ss, sched_instance_set) {
133 s = *ss;
134 for (i = 0; s->name[i] == sched_name[i]; i++) {
135 if (s->name[i] == '\0') {
136 active_sched = s->instance;
137 return;
138 }
139 }
140 }
141
142 /*
143 * No scheduler matching the configuration was found. If
144 * there is any scheduler compiled in, at all, use the first
145 * scheduler from the linker set.
146 */
147 if (SET_BEGIN(sched_instance_set) < SET_LIMIT(sched_instance_set)) {
148 s = *SET_BEGIN(sched_instance_set);
149 active_sched = s->instance;
150 for (i = 0;; i++) {
151 sched_name[i] = s->name[i];
152 if (s->name[i] == '\0')
153 break;
154 }
155 }
156 }
157
158 void
schedinit(void)159 schedinit(void)
160 {
161 if (active_sched == NULL)
162 panic("Cannot find scheduler %s", sched_name);
163 active_sched->init();
164 }
165
/*
 * Root of the CPU topology tree.  Assigned from smp_topo() in sched_setup();
 * the topology sysctl emits a placeholder group while it is NULL.
 */
struct cpu_group __read_mostly *cpu_top;
167
/*
 * SYSINIT hook (SI_SUB_RUN_QUEUE, SI_ORDER_FIRST): store the topology
 * returned by smp_topo() in cpu_top, then call the selected scheduler's
 * setup() method.  The argument is not used.
 */
static void
sched_setup(void *dummy)
{
	/* Set cpu_top before setup() runs. */
	cpu_top = smp_topo();
	active_sched->setup();
}
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
175
/*
 * SYSINIT hook (SI_SUB_CLOCKS, SI_ORDER_THIRD): call the selected
 * scheduler's initticks() method.  The argument is not used.
 */
static void
sched_initticks(void *dummy)
{
	active_sched->initticks();
}
SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
    NULL);
183
184 static void
sched_schedcpu(void)185 sched_schedcpu(void)
186 {
187 active_sched->schedcpu();
188 }
189 SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, sched_schedcpu, NULL);
190
/* kern.sched: parent node for the scheduler sysctls defined below. */
SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "Scheduler");

/* kern.sched.name: read-only view of sched_name. */
SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, sched_name, 0,
    "Scheduler name");
196
197 static int
sysctl_kern_sched_available(SYSCTL_HANDLER_ARGS)198 sysctl_kern_sched_available(SYSCTL_HANDLER_ARGS)
199 {
200 struct sched_selection *s, **ss;
201 struct sbuf *sb, sm;
202 int error;
203 bool first;
204
205 sb = sbuf_new_for_sysctl(&sm, NULL, 0, req);
206 if (sb == NULL)
207 return (ENOMEM);
208 first = true;
209 SET_FOREACH(ss, sched_instance_set) {
210 s = *ss;
211 if (first)
212 first = false;
213 else
214 sbuf_cat(sb, ",");
215 sbuf_cat(sb, s->name);
216 }
217 error = sbuf_finish(sb);
218 sbuf_delete(sb);
219 return (error);
220 }
221
222 SYSCTL_PROC(_kern_sched, OID_AUTO, available,
223 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
224 NULL, 0, sysctl_kern_sched_available, "A",
225 "List of available schedulers");
226
/*
 * Exported read-only as the kern.ccpu sysctl, which describes it as the
 * decay factor used for updating %CPU.  It is only defined here; the
 * visible part of this file never assigns it.
 */
fixpt_t ccpu;
SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0,
    "Decay factor used for updating %CPU");
230
231 /*
232 * Build the CPU topology dump string. Is recursively called to collect
233 * the topology tree.
234 */
235 static int
sysctl_kern_sched_topology_spec_internal(struct sbuf * sb,struct cpu_group * cg,int indent)236 sysctl_kern_sched_topology_spec_internal(struct sbuf *sb,
237 struct cpu_group *cg, int indent)
238 {
239 char cpusetbuf[CPUSETBUFSIZ];
240 int i, first;
241
242 if (cpu_top == NULL) {
243 sbuf_printf(sb, "%*s<group level=\"1\" cache-level=\"1\">\n",
244 indent, "");
245 sbuf_printf(sb, "%*s</group>\n", indent, "");
246 return (0);
247 }
248
249 sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent,
250 "", 1 + indent / 2, cg->cg_level);
251 sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "",
252 cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask));
253 first = TRUE;
254 for (i = cg->cg_first; i <= cg->cg_last; i++) {
255 if (CPU_ISSET(i, &cg->cg_mask)) {
256 if (!first)
257 sbuf_cat(sb, ", ");
258 else
259 first = FALSE;
260 sbuf_printf(sb, "%d", i);
261 }
262 }
263 sbuf_cat(sb, "</cpu>\n");
264
265 if (cg->cg_flags != 0) {
266 sbuf_printf(sb, "%*s <flags>", indent, "");
267 if ((cg->cg_flags & CG_FLAG_HTT) != 0)
268 sbuf_cat(sb, "<flag name=\"HTT\">HTT group</flag>");
269 if ((cg->cg_flags & CG_FLAG_THREAD) != 0)
270 sbuf_cat(sb, "<flag name=\"THREAD\">THREAD group</flag>");
271 if ((cg->cg_flags & CG_FLAG_SMT) != 0)
272 sbuf_cat(sb, "<flag name=\"SMT\">SMT group</flag>");
273 if ((cg->cg_flags & CG_FLAG_NODE) != 0)
274 sbuf_cat(sb, "<flag name=\"NODE\">NUMA node</flag>");
275 sbuf_cat(sb, "</flags>\n");
276 }
277
278 if (cg->cg_children > 0) {
279 sbuf_printf(sb, "%*s <children>\n", indent, "");
280 for (i = 0; i < cg->cg_children; i++)
281 sysctl_kern_sched_topology_spec_internal(sb,
282 &cg->cg_child[i], indent + 2);
283 sbuf_printf(sb, "%*s </children>\n", indent, "");
284 }
285 sbuf_printf(sb, "%*s</group>\n", indent, "");
286 return (0);
287 }
288
289 /*
290 * Sysctl handler for retrieving topology dump. It's a wrapper for
291 * the recursive sysctl_kern_smp_topology_spec_internal().
292 */
293 static int
sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS)294 sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS)
295 {
296 struct sbuf *topo;
297 int err;
298
299 topo = sbuf_new_for_sysctl(NULL, NULL, 512, req);
300 if (topo == NULL)
301 return (ENOMEM);
302
303 sbuf_cat(topo, "<groups>\n");
304 err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1);
305 sbuf_cat(topo, "</groups>\n");
306
307 if (err == 0)
308 err = sbuf_finish(topo);
309 sbuf_delete(topo);
310 return (err);
311 }
312
313 SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING |
314 CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0,
315 sysctl_kern_sched_topology_spec, "A",
316 "XML dump of detected CPU topology");
317