/*
 * Copyright 2026 The FreeBSD Foundation
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 */

#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/runq.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <machine/ifunc.h>

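/*
 * The scheduler implementation chosen at boot.  Every sched_*() entry
 * point below dispatches through this pointer.
 */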
const struct sched_instance *active_sched;

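/*
 * Generate the sched_*() entry points as thin shims around the active
 * scheduler's method table.  On platforms with ifunc support, each shim
 * is an ifunc whose resolver returns the method pointer directly, so
 * calls bind to the selected scheduler without per-call indirection.
 * Otherwise a plain wrapper indirects through active_sched on every
 * call.  For example, in the fallback case
 *      DEFINE_SHIM1(pctcpu, fixpt_t, sched_pctcpu, struct thread *, td)
 * expands to
 *      fixpt_t sched_pctcpu(struct thread *td)
 *      { return (active_sched->pctcpu(td)); }
 */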
#ifndef __DO_NOT_HAVE_SYS_IFUNCS
#define __DEFINE_SHIM(__m, __r, __n, __p, __a)                  \
        DEFINE_IFUNC(, __r, __n, __p)                           \
        {                                                       \
                return (active_sched->__m);                     \
        }
#else
#define __DEFINE_SHIM(__m, __r, __n, __p, __a)                  \
        __r                                                     \
        __n __p                                                 \
        {                                                       \
                return (active_sched->__m __a);                 \
        }
#endif
#define DEFINE_SHIM0(__m, __r, __n)                             \
        __DEFINE_SHIM(__m, __r, __n, (void), ())
#define DEFINE_SHIM1(__m, __r, __n, __t1, __a1)                 \
        __DEFINE_SHIM(__m, __r, __n, (__t1 __a1), (__a1))
#define DEFINE_SHIM2(__m, __r, __n, __t1, __a1, __t2, __a2)     \
        __DEFINE_SHIM(__m, __r, __n, (__t1 __a1, __t2 __a2), (__a1, __a2))

DEFINE_SHIM0(load, int, sched_load)
DEFINE_SHIM0(rr_interval, int, sched_rr_interval)
DEFINE_SHIM0(runnable, bool, sched_runnable)
DEFINE_SHIM2(exit, void, sched_exit, struct proc *, p,
    struct thread *, childtd)
DEFINE_SHIM2(fork, void, sched_fork, struct thread *, td,
    struct thread *, childtd)
DEFINE_SHIM1(fork_exit, void, sched_fork_exit, struct thread *, td)
DEFINE_SHIM2(class, void, sched_class, struct thread *, td, int, class)
DEFINE_SHIM2(nice, void, sched_nice, struct proc *, p, int, nice)
DEFINE_SHIM0(ap_entry, void, sched_ap_entry)
DEFINE_SHIM2(exit_thread, void, sched_exit_thread, struct thread *, td,
    struct thread *, child)
DEFINE_SHIM1(estcpu, u_int, sched_estcpu, struct thread *, td)
DEFINE_SHIM2(fork_thread, void, sched_fork_thread, struct thread *, td,
    struct thread *, child)
DEFINE_SHIM2(ithread_prio, void, sched_ithread_prio, struct thread *, td,
    u_char, prio)
DEFINE_SHIM2(lend_prio, void, sched_lend_prio, struct thread *, td,
    u_char, prio)
DEFINE_SHIM2(lend_user_prio, void, sched_lend_user_prio, struct thread *, td,
    u_char, pri)
DEFINE_SHIM2(lend_user_prio_cond, void, sched_lend_user_prio_cond,
    struct thread *, td, u_char, pri)
DEFINE_SHIM1(pctcpu, fixpt_t, sched_pctcpu, struct thread *, td)
DEFINE_SHIM2(prio, void, sched_prio, struct thread *, td, u_char, prio)
DEFINE_SHIM2(sleep, void, sched_sleep, struct thread *, td, int, prio)
DEFINE_SHIM2(sswitch, void, sched_switch, struct thread *, td, int, flags)
DEFINE_SHIM1(throw, void, sched_throw, struct thread *, td)
DEFINE_SHIM2(unlend_prio, void, sched_unlend_prio, struct thread *, td,
    u_char, prio)
DEFINE_SHIM2(user_prio, void, sched_user_prio, struct thread *, td,
    u_char, prio)
DEFINE_SHIM1(userret_slowpath, void, sched_userret_slowpath,
    struct thread *, td)
DEFINE_SHIM2(add, void, sched_add, struct thread *, td, int, flags)
DEFINE_SHIM0(choose, struct thread *, sched_choose)
DEFINE_SHIM2(clock, void, sched_clock, struct thread *, td, int, cnt)
DEFINE_SHIM1(idletd, void, sched_idletd, void *, dummy)
DEFINE_SHIM1(preempt, void, sched_preempt, struct thread *, td)
DEFINE_SHIM1(relinquish, void, sched_relinquish, struct thread *, td)
DEFINE_SHIM1(rem, void, sched_rem, struct thread *, td)
DEFINE_SHIM2(wakeup, void, sched_wakeup, struct thread *, td, int, srqflags)
DEFINE_SHIM2(bind, void, sched_bind, struct thread *, td, int, cpu)
DEFINE_SHIM1(unbind, void, sched_unbind, struct thread *, td)
DEFINE_SHIM1(is_bound, int, sched_is_bound, struct thread *, td)
DEFINE_SHIM1(affinity, void, sched_affinity, struct thread *, td)
DEFINE_SHIM0(sizeof_proc, int, sched_sizeof_proc)
DEFINE_SHIM0(sizeof_thread, int, sched_sizeof_thread)
DEFINE_SHIM1(tdname, char *, sched_tdname, struct thread *, td)
DEFINE_SHIM1(clear_tdname, void, sched_clear_tdname, struct thread *, td)
DEFINE_SHIM0(do_timer_accounting, bool, sched_do_timer_accounting)
DEFINE_SHIM1(find_l2_neighbor, int, sched_find_l2_neighbor, int, cpu)
DEFINE_SHIM0(init_ap, void, schedinit_ap)

SCHED_STAT_DEFINE(ithread_demotions, "Interrupt thread priority demotions");
SCHED_STAT_DEFINE(ithread_preemptions,
    "Interrupt thread preemptions due to time-sharing");

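/*
 * Static DTrace probes fired from the scheduler paths; they are shared
 * by all scheduler implementations.
 */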
SDT_PROVIDER_DEFINE(sched);

SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *",
    "struct proc *", "uint8_t");
SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *",
    "struct proc *", "void *");
SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *",
    "struct proc *", "void *", "int");
SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *",
    "struct proc *", "uint8_t", "struct thread *");
SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int");
SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *",
    "struct proc *");
SDT_PROBE_DEFINE(sched, , , on__cpu);
SDT_PROBE_DEFINE(sched, , , remain__cpu);
SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *",
    "struct proc *");

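/*
 * Hooks used by DTrace to account for virtual time across context
 * switches.
 */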
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
int __read_mostly dtrace_vtime_active;
dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
#endif

static char sched_name[32] = "ULE";

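/*
 * All compiled-in schedulers register themselves by contributing a
 * struct sched_selection to this linker set.  A hypothetical
 * registration for ULE could look like (sched_ule_instance is
 * illustrative, not an existing symbol):
 *
 *      static const struct sched_selection ule_selection = {
 *              .name = "ULE",
 *              .instance = &sched_ule_instance,
 *      };
 *      DATA_SET(sched_instance_set, ule_selection);
 */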
SET_DECLARE(sched_instance_set, struct sched_selection);

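/*
 * Select the active scheduler.  The kern.sched.name loader tunable
 * (e.g. kern.sched.name="4BSD" in loader.conf(5)) names the scheduler
 * to use; if it does not match any compiled-in scheduler, fall back to
 * the first entry in the linker set and rewrite sched_name to match.
 */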
void
sched_instance_select(void)
{
        struct sched_selection *s, **ss;

        TUNABLE_STR_FETCH("kern.sched.name", sched_name, sizeof(sched_name));
        SET_FOREACH(ss, sched_instance_set) {
                s = *ss;
                if (strcmp(s->name, sched_name) == 0) {
                        active_sched = s->instance;
                        return;
                }
        }

        /*
         * No scheduler matching the configuration was found.  If any
         * scheduler was compiled in at all, use the first one from the
         * linker set.
         */
        if (SET_BEGIN(sched_instance_set) < SET_LIMIT(sched_instance_set)) {
                s = *SET_BEGIN(sched_instance_set);
                active_sched = s->instance;
                strlcpy(sched_name, s->name, sizeof(sched_name));
        }
}

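/*
 * Verify that scheduler selection succeeded and let the chosen
 * scheduler perform its early initialization.  The panic fires when
 * the configured scheduler was not compiled into the kernel and the
 * linker set was empty.
 */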
void
schedinit(void)
{
        if (active_sched == NULL)
                panic("Cannot find scheduler %s", sched_name);
        active_sched->init();
}

struct cpu_group __read_mostly *cpu_top;        /* CPU topology */

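/*
 * Resolve the machine's CPU topology once SMP is up, then let the
 * active scheduler finish its own setup.
 */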
static void
sched_setup(void *dummy)
{
        cpu_top = smp_topo();
        active_sched->setup();
}
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);

static void
sched_initticks(void *dummy)
{
        active_sched->initticks();
}
SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
    NULL);

static void
sched_schedcpu(void *dummy)
{
        active_sched->schedcpu();
}
SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, sched_schedcpu, NULL);

SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, sched_name, 0,
    "Scheduler name");

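/*
 * Report the comma-separated list of compiled-in schedulers.
 * Illustrative output on a kernel built with both schedulers:
 *
 *      $ sysctl kern.sched.available
 *      kern.sched.available: ULE,4BSD
 */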
static int
sysctl_kern_sched_available(SYSCTL_HANDLER_ARGS)
{
        struct sched_selection *s, **ss;
        struct sbuf *sb, sm;
        int error;
        bool first;

        sb = sbuf_new_for_sysctl(&sm, NULL, 0, req);
        if (sb == NULL)
                return (ENOMEM);
        first = true;
        SET_FOREACH(ss, sched_instance_set) {
                s = *ss;
                if (first)
                        first = false;
                else
                        sbuf_cat(sb, ",");
                sbuf_cat(sb, s->name);
        }
        error = sbuf_finish(sb);
        sbuf_delete(sb);
        return (error);
}

SYSCTL_PROC(_kern_sched, OID_AUTO, available,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_kern_sched_available, "A",
    "List of available schedulers");

fixpt_t ccpu;
SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0,
    "Decay factor used for updating %CPU");

/*
 * Build the CPU topology dump string.  This function calls itself
 * recursively to walk the topology tree.
 */
static int
sysctl_kern_sched_topology_spec_internal(struct sbuf *sb,
    struct cpu_group *cg, int indent)
{
        char cpusetbuf[CPUSETBUFSIZ];
        int i, first;

        if (cpu_top == NULL) {
                sbuf_printf(sb, "%*s<group level=\"1\" cache-level=\"1\">\n",
                    indent, "");
                sbuf_printf(sb, "%*s</group>\n", indent, "");
                return (0);
        }

        sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent,
            "", 1 + indent / 2, cg->cg_level);
        sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "",
            cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask));
        first = TRUE;
        for (i = cg->cg_first; i <= cg->cg_last; i++) {
                if (CPU_ISSET(i, &cg->cg_mask)) {
                        if (!first)
                                sbuf_cat(sb, ", ");
                        else
                                first = FALSE;
                        sbuf_printf(sb, "%d", i);
                }
        }
        sbuf_cat(sb, "</cpu>\n");

        if (cg->cg_flags != 0) {
                sbuf_printf(sb, "%*s <flags>", indent, "");
                if ((cg->cg_flags & CG_FLAG_HTT) != 0)
                        sbuf_cat(sb, "<flag name=\"HTT\">HTT group</flag>");
                if ((cg->cg_flags & CG_FLAG_THREAD) != 0)
                        sbuf_cat(sb, "<flag name=\"THREAD\">THREAD group</flag>");
                if ((cg->cg_flags & CG_FLAG_SMT) != 0)
                        sbuf_cat(sb, "<flag name=\"SMT\">SMT group</flag>");
                if ((cg->cg_flags & CG_FLAG_NODE) != 0)
                        sbuf_cat(sb, "<flag name=\"NODE\">NUMA node</flag>");
                sbuf_cat(sb, "</flags>\n");
        }

        if (cg->cg_children > 0) {
                sbuf_printf(sb, "%*s <children>\n", indent, "");
                for (i = 0; i < cg->cg_children; i++)
                        sysctl_kern_sched_topology_spec_internal(sb,
                            &cg->cg_child[i], indent + 2);
                sbuf_printf(sb, "%*s </children>\n", indent, "");
        }
        sbuf_printf(sb, "%*s</group>\n", indent, "");
        return (0);
}

/*
 * Sysctl handler for retrieving the topology dump.  It is a wrapper
 * around the recursive sysctl_kern_sched_topology_spec_internal().
 */
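/*
 * Illustrative output for a single two-CPU group (the mask value is
 * elided here; the exact shape depends on the detected topology):
 *
 *      <groups>
 *       <group level="1" cache-level="0">
 *        <cpu count="2" mask="...">0, 1</cpu>
 *       </group>
 *      </groups>
 */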
static int
sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS)
{
        struct sbuf *topo;
        int err;

        topo = sbuf_new_for_sysctl(NULL, NULL, 512, req);
        if (topo == NULL)
                return (ENOMEM);

        sbuf_cat(topo, "<groups>\n");
        err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1);
        sbuf_cat(topo, "</groups>\n");

        if (err == 0)
                err = sbuf_finish(topo);
        sbuf_delete(topo);
        return (err);
}

SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING |
    CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0,
    sysctl_kern_sched_topology_spec, "A",
    "XML dump of detected CPU topology");