1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 #include "opt_sched.h"
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kdb.h>
35 #include <sys/kernel.h>
36 #include <sys/ktr.h>
37 #include <sys/lock.h>
38 #include <sys/mutex.h>
39 #include <sys/proc.h>
40 #include <sys/queue.h>
41 #include <sys/sched.h>
42 #include <sys/smp.h>
43 #include <sys/sysctl.h>
44
45 #include <machine/cpu.h>
46
47 /* Uncomment this to enable logging of critical_enter/exit. */
48 #if 0
49 #define KTR_CRITICAL KTR_SCHED
50 #else
51 #define KTR_CRITICAL 0
52 #endif
53
54 #ifdef FULL_PREEMPTION
55 #ifndef PREEMPTION
56 #error "The FULL_PREEMPTION option requires the PREEMPTION option"
57 #endif
58 #endif
59
60 CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
61
62 /*
63 * kern.sched.preemption allows user space to determine if preemption support
64 * is compiled in or not. It is not currently a boot or runtime flag that
65 * can be changed.
66 */
67 #ifdef PREEMPTION
68 static int kern_sched_preemption = 1;
69 #else
70 static int kern_sched_preemption = 0;
71 #endif
72 SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
73 &kern_sched_preemption, 0, "Kernel preemption enabled");
74
75 /*
76 * Support for scheduler stats exported via kern.sched.stats. All stats may
77 * be reset with kern.sched.stats.reset = 1. Stats may be defined elsewhere
78 * with SCHED_STAT_DEFINE().
79 */
80 #ifdef SCHED_STATS
81 SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
82 "switch stats");
83
84 /* Switch reasons from mi_switch(9). */
85 DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]);
86 SCHED_STAT_DEFINE_VAR(owepreempt,
87 &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), "");
88 SCHED_STAT_DEFINE_VAR(turnstile,
89 &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
90 SCHED_STAT_DEFINE_VAR(sleepq,
91 &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
92 SCHED_STAT_DEFINE_VAR(relinquish,
93 &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
94 SCHED_STAT_DEFINE_VAR(needresched,
95 &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), "");
96 SCHED_STAT_DEFINE_VAR(idle,
97 &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), "");
98 SCHED_STAT_DEFINE_VAR(iwait,
99 &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), "");
100 SCHED_STAT_DEFINE_VAR(suspend,
101 &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), "");
102 SCHED_STAT_DEFINE_VAR(remotepreempt,
103 &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), "");
104 SCHED_STAT_DEFINE_VAR(remotewakeidle,
105 &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), "");
106 SCHED_STAT_DEFINE_VAR(bind,
107 &DPCPU_NAME(sched_switch_stats[SWT_BIND]), "");
108
109 static int
sysctl_stats_reset(SYSCTL_HANDLER_ARGS)110 sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
111 {
112 struct sysctl_oid *p;
113 uintptr_t counter;
114 int error;
115 int val;
116 int i;
117
118 val = 0;
119 error = sysctl_handle_int(oidp, &val, 0, req);
120 if (error != 0 || req->newptr == NULL)
121 return (error);
122 if (val == 0)
123 return (0);
124 /*
125 * Traverse the list of children of _kern_sched_stats and reset each
126 * to 0. Skip the reset entry.
127 */
128 RB_FOREACH(p, sysctl_oid_list, oidp->oid_parent) {
129 if (p == oidp || p->oid_arg1 == NULL)
130 continue;
131 counter = (uintptr_t)p->oid_arg1;
132 CPU_FOREACH(i) {
133 *(long *)(dpcpu_off[i] + counter) = 0;
134 }
135 }
136 return (0);
137 }
138
139 SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset,
140 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE, NULL, 0,
141 sysctl_stats_reset, "I",
142 "Reset scheduler statistics");
143 #endif
144
145 /************************************************************************
146 * Functions that manipulate runnability from a thread perspective. *
147 ************************************************************************/
148 /*
149 * Select the thread that will be run next.
150 */
151
152 static __noinline struct thread *
choosethread_panic(struct thread * td)153 choosethread_panic(struct thread *td)
154 {
155
156 /*
157 * If we are in panic, only allow system threads,
158 * plus the one we are running in, to be run.
159 */
160 retry:
161 if (((td->td_proc->p_flag & P_SYSTEM) == 0 &&
162 (td->td_flags & TDF_INPANIC) == 0)) {
163 /* note that it is no longer on the run queue */
164 TD_SET_CAN_RUN(td);
165 td = sched_choose();
166 goto retry;
167 }
168
169 TD_SET_RUNNING(td);
170 return (td);
171 }
172
173 struct thread *
choosethread(void)174 choosethread(void)
175 {
176 struct thread *td;
177
178 td = sched_choose();
179
180 if (KERNEL_PANICKED())
181 return (choosethread_panic(td));
182
183 TD_SET_RUNNING(td);
184 return (td);
185 }
186
187 /*
188 * Kernel thread preemption implementation. Critical sections mark
189 * regions of code in which preemptions are not allowed.
190 *
191 * It might seem a good idea to inline critical_enter() but, in order
192 * to prevent instructions reordering by the compiler, a __compiler_membar()
193 * would have to be used here (the same as sched_pin()). The performance
194 * penalty imposed by the membar could, then, produce slower code than
195 * the function call itself, for most cases.
196 */
197 void
critical_enter_KBI(void)198 critical_enter_KBI(void)
199 {
200 #ifdef KTR
201 struct thread *td = curthread;
202 #endif
203 critical_enter();
204 CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
205 (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
206 }
207
208 void __noinline
critical_exit_preempt(void)209 critical_exit_preempt(void)
210 {
211 struct thread *td;
212 int flags;
213
214 /*
215 * If td_critnest is 0, it is possible that we are going to get
216 * preempted again before reaching the code below. This happens
217 * rarely and is harmless. However, this means td_owepreempt may
218 * now be unset.
219 */
220 td = curthread;
221 if (td->td_critnest != 0)
222 return;
223 if (kdb_active)
224 return;
225
226 /*
227 * Microoptimization: we committed to switch,
228 * disable preemption in interrupt handlers
229 * while spinning for the thread lock.
230 */
231 td->td_critnest = 1;
232 thread_lock(td);
233 td->td_critnest--;
234 flags = SW_INVOL | SW_PREEMPT;
235 if (TD_IS_IDLETHREAD(td))
236 flags |= SWT_IDLE;
237 else
238 flags |= SWT_OWEPREEMPT;
239 mi_switch(flags);
240 }
241
242 void
critical_exit_KBI(void)243 critical_exit_KBI(void)
244 {
245 #ifdef KTR
246 struct thread *td = curthread;
247 #endif
248 critical_exit();
249 CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
250 (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
251 }
252
253 /************************************************************************
254 * SYSTEM RUN QUEUE manipulations and tests *
255 ************************************************************************/
256 /*
257 * Initialize a run structure.
258 */
259 void
runq_init(struct runq * rq)260 runq_init(struct runq *rq)
261 {
262 int i;
263
264 bzero(rq, sizeof *rq);
265 for (i = 0; i < RQ_NQS; i++)
266 TAILQ_INIT(&rq->rq_queues[i]);
267 }
268
269 /*
270 * Clear the status bit of the queue corresponding to priority level pri,
271 * indicating that it is empty.
272 */
273 static __inline void
runq_clrbit(struct runq * rq,int pri)274 runq_clrbit(struct runq *rq, int pri)
275 {
276 struct rqbits *rqb;
277
278 rqb = &rq->rq_status;
279 CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
280 rqb->rqb_bits[RQB_WORD(pri)],
281 rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
282 RQB_BIT(pri), RQB_WORD(pri));
283 rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
284 }
285
286 /*
287 * Find the index of the first non-empty run queue. This is done by
288 * scanning the status bits, a set bit indicates a non-empty queue.
289 */
290 static __inline int
runq_findbit(struct runq * rq)291 runq_findbit(struct runq *rq)
292 {
293 struct rqbits *rqb;
294 int pri;
295 int i;
296
297 rqb = &rq->rq_status;
298 for (i = 0; i < RQB_LEN; i++)
299 if (rqb->rqb_bits[i]) {
300 pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
301 CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
302 rqb->rqb_bits[i], i, pri);
303 return (pri);
304 }
305
306 return (-1);
307 }
308
309 static __inline int
runq_findbit_from(struct runq * rq,u_char pri)310 runq_findbit_from(struct runq *rq, u_char pri)
311 {
312 struct rqbits *rqb;
313 rqb_word_t mask;
314 int i;
315
316 /*
317 * Set the mask for the first word so we ignore priorities before 'pri'.
318 */
319 mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
320 rqb = &rq->rq_status;
321 again:
322 for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
323 mask = rqb->rqb_bits[i] & mask;
324 if (mask == 0)
325 continue;
326 pri = RQB_FFS(mask) + (i << RQB_L2BPW);
327 CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
328 mask, i, pri);
329 return (pri);
330 }
331 if (pri == 0)
332 return (-1);
333 /*
334 * Wrap back around to the beginning of the list just once so we
335 * scan the whole thing.
336 */
337 pri = 0;
338 goto again;
339 }
340
341 /*
342 * Set the status bit of the queue corresponding to priority level pri,
343 * indicating that it is non-empty.
344 */
345 static __inline void
runq_setbit(struct runq * rq,int pri)346 runq_setbit(struct runq *rq, int pri)
347 {
348 struct rqbits *rqb;
349
350 rqb = &rq->rq_status;
351 CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
352 rqb->rqb_bits[RQB_WORD(pri)],
353 rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
354 RQB_BIT(pri), RQB_WORD(pri));
355 rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
356 }
357
358 /*
359 * Add the thread to the queue specified by its priority, and set the
360 * corresponding status bit.
361 */
362 void
runq_add(struct runq * rq,struct thread * td,int flags)363 runq_add(struct runq *rq, struct thread *td, int flags)
364 {
365 struct rqhead *rqh;
366 int pri;
367
368 pri = td->td_priority / RQ_PPQ;
369 td->td_rqindex = pri;
370 runq_setbit(rq, pri);
371 rqh = &rq->rq_queues[pri];
372 CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p",
373 td, td->td_priority, pri, rqh);
374 if (flags & SRQ_PREEMPTED) {
375 TAILQ_INSERT_HEAD(rqh, td, td_runq);
376 } else {
377 TAILQ_INSERT_TAIL(rqh, td, td_runq);
378 }
379 }
380
381 void
runq_add_pri(struct runq * rq,struct thread * td,u_char pri,int flags)382 runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags)
383 {
384 struct rqhead *rqh;
385
386 KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
387 td->td_rqindex = pri;
388 runq_setbit(rq, pri);
389 rqh = &rq->rq_queues[pri];
390 CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p",
391 td, td->td_priority, pri, rqh);
392 if (flags & SRQ_PREEMPTED) {
393 TAILQ_INSERT_HEAD(rqh, td, td_runq);
394 } else {
395 TAILQ_INSERT_TAIL(rqh, td, td_runq);
396 }
397 }
398 /*
399 * Return true if there are runnable processes of any priority on the run
400 * queue, false otherwise. Has no side effects, does not modify the run
401 * queue structure.
402 */
403 int
runq_check(struct runq * rq)404 runq_check(struct runq *rq)
405 {
406 struct rqbits *rqb;
407 int i;
408
409 rqb = &rq->rq_status;
410 for (i = 0; i < RQB_LEN; i++)
411 if (rqb->rqb_bits[i]) {
412 CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
413 rqb->rqb_bits[i], i);
414 return (1);
415 }
416 CTR0(KTR_RUNQ, "runq_check: empty");
417
418 return (0);
419 }
420
421 /*
422 * Find the highest priority process on the run queue.
423 */
424 struct thread *
runq_choose_fuzz(struct runq * rq,int fuzz)425 runq_choose_fuzz(struct runq *rq, int fuzz)
426 {
427 struct rqhead *rqh;
428 struct thread *td;
429 int pri;
430
431 while ((pri = runq_findbit(rq)) != -1) {
432 rqh = &rq->rq_queues[pri];
433 /* fuzz == 1 is normal.. 0 or less are ignored */
434 if (fuzz > 1) {
435 /*
436 * In the first couple of entries, check if
437 * there is one for our CPU as a preference.
438 */
439 int count = fuzz;
440 int cpu = PCPU_GET(cpuid);
441 struct thread *td2;
442 td2 = td = TAILQ_FIRST(rqh);
443
444 while (count-- && td2) {
445 if (td2->td_lastcpu == cpu) {
446 td = td2;
447 break;
448 }
449 td2 = TAILQ_NEXT(td2, td_runq);
450 }
451 } else
452 td = TAILQ_FIRST(rqh);
453 KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue"));
454 CTR3(KTR_RUNQ,
455 "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh);
456 return (td);
457 }
458 CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri);
459
460 return (NULL);
461 }
462
463 /*
464 * Find the highest priority process on the run queue.
465 */
466 struct thread *
runq_choose(struct runq * rq)467 runq_choose(struct runq *rq)
468 {
469 struct rqhead *rqh;
470 struct thread *td;
471 int pri;
472
473 while ((pri = runq_findbit(rq)) != -1) {
474 rqh = &rq->rq_queues[pri];
475 td = TAILQ_FIRST(rqh);
476 KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
477 CTR3(KTR_RUNQ,
478 "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh);
479 return (td);
480 }
481 CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri);
482
483 return (NULL);
484 }
485
486 struct thread *
runq_choose_from(struct runq * rq,u_char idx)487 runq_choose_from(struct runq *rq, u_char idx)
488 {
489 struct rqhead *rqh;
490 struct thread *td;
491 int pri;
492
493 if ((pri = runq_findbit_from(rq, idx)) != -1) {
494 rqh = &rq->rq_queues[pri];
495 td = TAILQ_FIRST(rqh);
496 KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
497 CTR4(KTR_RUNQ,
498 "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p",
499 pri, td, td->td_rqindex, rqh);
500 return (td);
501 }
502 CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri);
503
504 return (NULL);
505 }
506 /*
507 * Remove the thread from the queue specified by its priority, and clear the
508 * corresponding status bit if the queue becomes empty.
509 * Caller must set state afterwards.
510 */
511 void
runq_remove(struct runq * rq,struct thread * td)512 runq_remove(struct runq *rq, struct thread *td)
513 {
514
515 runq_remove_idx(rq, td, NULL);
516 }
517
518 void
runq_remove_idx(struct runq * rq,struct thread * td,u_char * idx)519 runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx)
520 {
521 struct rqhead *rqh;
522 u_char pri;
523
524 KASSERT(td->td_flags & TDF_INMEM,
525 ("runq_remove_idx: thread swapped out"));
526 pri = td->td_rqindex;
527 KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
528 rqh = &rq->rq_queues[pri];
529 CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p",
530 td, td->td_priority, pri, rqh);
531 TAILQ_REMOVE(rqh, td, td_runq);
532 if (TAILQ_EMPTY(rqh)) {
533 CTR0(KTR_RUNQ, "runq_remove_idx: empty");
534 runq_clrbit(rq, pri);
535 if (idx != NULL && *idx == pri)
536 *idx = (pri + 1) % RQ_NQS;
537 }
538 }
539