xref: /freebsd/sys/kern/kern_switch.c (revision b2d2a78ad80ec68d4a17f5aef97d21686cb1e29b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include "opt_sched.h"
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kdb.h>
35 #include <sys/kernel.h>
36 #include <sys/ktr.h>
37 #include <sys/lock.h>
38 #include <sys/mutex.h>
39 #include <sys/proc.h>
40 #include <sys/queue.h>
41 #include <sys/sched.h>
42 #include <sys/smp.h>
43 #include <sys/sysctl.h>
44 
45 #include <machine/cpu.h>
46 
47 /* Uncomment this to enable logging of critical_enter/exit. */
48 #if 0
49 #define	KTR_CRITICAL	KTR_SCHED
50 #else
51 #define	KTR_CRITICAL	0
52 #endif
53 
54 #ifdef FULL_PREEMPTION
55 #ifndef PREEMPTION
56 #error "The FULL_PREEMPTION option requires the PREEMPTION option"
57 #endif
58 #endif
59 
60 CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
61 
62 /*
63  * kern.sched.preemption allows user space to determine if preemption support
64  * is compiled in or not.  It is not currently a boot or runtime flag that
65  * can be changed.
66  */
67 #ifdef PREEMPTION
68 static int kern_sched_preemption = 1;
69 #else
70 static int kern_sched_preemption = 0;
71 #endif
72 SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
73     &kern_sched_preemption, 0, "Kernel preemption enabled");
74 
75 /*
76  * Support for scheduler stats exported via kern.sched.stats.  All stats may
77  * be reset with kern.sched.stats.reset = 1.  Stats may be defined elsewhere
78  * with SCHED_STAT_DEFINE().
79  */
80 #ifdef SCHED_STATS
81 SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
82     "switch stats");
83 
84 /* Switch reasons from mi_switch(9). */
85 DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]);
86 SCHED_STAT_DEFINE_VAR(owepreempt,
87     &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), "");
88 SCHED_STAT_DEFINE_VAR(turnstile,
89     &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
90 SCHED_STAT_DEFINE_VAR(sleepq,
91     &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
92 SCHED_STAT_DEFINE_VAR(relinquish,
93     &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
94 SCHED_STAT_DEFINE_VAR(needresched,
95     &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), "");
96 SCHED_STAT_DEFINE_VAR(idle,
97     &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), "");
98 SCHED_STAT_DEFINE_VAR(iwait,
99     &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), "");
100 SCHED_STAT_DEFINE_VAR(suspend,
101     &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), "");
102 SCHED_STAT_DEFINE_VAR(remotepreempt,
103     &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), "");
104 SCHED_STAT_DEFINE_VAR(remotewakeidle,
105     &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), "");
106 SCHED_STAT_DEFINE_VAR(bind,
107     &DPCPU_NAME(sched_switch_stats[SWT_BIND]), "");
108 
109 static int
110 sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
111 {
112 	struct sysctl_oid *p;
113 	uintptr_t counter;
114         int error;
115 	int val;
116 	int i;
117 
118         val = 0;
119         error = sysctl_handle_int(oidp, &val, 0, req);
120         if (error != 0 || req->newptr == NULL)
121                 return (error);
122         if (val == 0)
123                 return (0);
124 	/*
125 	 * Traverse the list of children of _kern_sched_stats and reset each
126 	 * to 0.  Skip the reset entry.
127 	 */
128 	RB_FOREACH(p, sysctl_oid_list, oidp->oid_parent) {
129 		if (p == oidp || p->oid_arg1 == NULL)
130 			continue;
131 		counter = (uintptr_t)p->oid_arg1;
132 		CPU_FOREACH(i) {
133 			*(long *)(dpcpu_off[i] + counter) = 0;
134 		}
135 	}
136 	return (0);
137 }
138 
139 SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset,
140     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE, NULL, 0,
141     sysctl_stats_reset, "I",
142     "Reset scheduler statistics");
143 #endif
144 
145 /************************************************************************
146  * Functions that manipulate runnability from a thread perspective.	*
147  ************************************************************************/
148 /*
149  * Select the thread that will be run next.
150  */
151 
152 static __noinline struct thread *
153 choosethread_panic(struct thread *td)
154 {
155 
156 	/*
157 	 * If we are in panic, only allow system threads,
158 	 * plus the one we are running in, to be run.
159 	 */
160 retry:
161 	if (((td->td_proc->p_flag & P_SYSTEM) == 0 &&
162 	    (td->td_flags & TDF_INPANIC) == 0)) {
163 		/* note that it is no longer on the run queue */
164 		TD_SET_CAN_RUN(td);
165 		td = sched_choose();
166 		goto retry;
167 	}
168 
169 	TD_SET_RUNNING(td);
170 	return (td);
171 }
172 
173 struct thread *
174 choosethread(void)
175 {
176 	struct thread *td;
177 
178 	td = sched_choose();
179 
180 	if (KERNEL_PANICKED())
181 		return (choosethread_panic(td));
182 
183 	TD_SET_RUNNING(td);
184 	return (td);
185 }
186 
187 /*
188  * Kernel thread preemption implementation.  Critical sections mark
189  * regions of code in which preemptions are not allowed.
190  *
191  * It might seem a good idea to inline critical_enter() but, in order
192  * to prevent instructions reordering by the compiler, a __compiler_membar()
193  * would have to be used here (the same as sched_pin()).  The performance
194  * penalty imposed by the membar could, then, produce slower code than
195  * the function call itself, for most cases.
196  */
197 void
198 critical_enter_KBI(void)
199 {
200 #ifdef KTR
201 	struct thread *td = curthread;
202 #endif
203 	critical_enter();
204 	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
205 	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
206 }
207 
208 void __noinline
209 critical_exit_preempt(void)
210 {
211 	struct thread *td;
212 	int flags;
213 
214 	/*
215 	 * If td_critnest is 0, it is possible that we are going to get
216 	 * preempted again before reaching the code below. This happens
217 	 * rarely and is harmless. However, this means td_owepreempt may
218 	 * now be unset.
219 	 */
220 	td = curthread;
221 	if (td->td_critnest != 0)
222 		return;
223 	if (kdb_active)
224 		return;
225 
226 	/*
227 	 * Microoptimization: we committed to switch,
228 	 * disable preemption in interrupt handlers
229 	 * while spinning for the thread lock.
230 	 */
231 	td->td_critnest = 1;
232 	thread_lock(td);
233 	td->td_critnest--;
234 	flags = SW_INVOL | SW_PREEMPT;
235 	if (TD_IS_IDLETHREAD(td))
236 		flags |= SWT_IDLE;
237 	else
238 		flags |= SWT_OWEPREEMPT;
239 	mi_switch(flags);
240 }
241 
242 void
243 critical_exit_KBI(void)
244 {
245 #ifdef KTR
246 	struct thread *td = curthread;
247 #endif
248 	critical_exit();
249 	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
250 	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
251 }
252 
253 /************************************************************************
254  * SYSTEM RUN QUEUE manipulations and tests				*
255  ************************************************************************/
256 /*
257  * Initialize a run structure.
258  */
259 void
260 runq_init(struct runq *rq)
261 {
262 	int i;
263 
264 	bzero(rq, sizeof *rq);
265 	for (i = 0; i < RQ_NQS; i++)
266 		TAILQ_INIT(&rq->rq_queues[i]);
267 }
268 
269 /*
270  * Clear the status bit of the queue corresponding to priority level pri,
271  * indicating that it is empty.
272  */
273 static __inline void
274 runq_clrbit(struct runq *rq, int pri)
275 {
276 	struct rqbits *rqb;
277 
278 	rqb = &rq->rq_status;
279 	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
280 	    rqb->rqb_bits[RQB_WORD(pri)],
281 	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
282 	    RQB_BIT(pri), RQB_WORD(pri));
283 	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
284 }
285 
286 /*
287  * Find the index of the first non-empty run queue.  This is done by
288  * scanning the status bits, a set bit indicates a non-empty queue.
289  */
290 static __inline int
291 runq_findbit(struct runq *rq)
292 {
293 	struct rqbits *rqb;
294 	int pri;
295 	int i;
296 
297 	rqb = &rq->rq_status;
298 	for (i = 0; i < RQB_LEN; i++)
299 		if (rqb->rqb_bits[i]) {
300 			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
301 			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
302 			    rqb->rqb_bits[i], i, pri);
303 			return (pri);
304 		}
305 
306 	return (-1);
307 }
308 
309 static __inline int
310 runq_findbit_from(struct runq *rq, u_char pri)
311 {
312 	struct rqbits *rqb;
313 	rqb_word_t mask;
314 	int i;
315 
316 	/*
317 	 * Set the mask for the first word so we ignore priorities before 'pri'.
318 	 */
319 	mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
320 	rqb = &rq->rq_status;
321 again:
322 	for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
323 		mask = rqb->rqb_bits[i] & mask;
324 		if (mask == 0)
325 			continue;
326 		pri = RQB_FFS(mask) + (i << RQB_L2BPW);
327 		CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
328 		    mask, i, pri);
329 		return (pri);
330 	}
331 	if (pri == 0)
332 		return (-1);
333 	/*
334 	 * Wrap back around to the beginning of the list just once so we
335 	 * scan the whole thing.
336 	 */
337 	pri = 0;
338 	goto again;
339 }
340 
341 /*
342  * Set the status bit of the queue corresponding to priority level pri,
343  * indicating that it is non-empty.
344  */
345 static __inline void
346 runq_setbit(struct runq *rq, int pri)
347 {
348 	struct rqbits *rqb;
349 
350 	rqb = &rq->rq_status;
351 	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
352 	    rqb->rqb_bits[RQB_WORD(pri)],
353 	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
354 	    RQB_BIT(pri), RQB_WORD(pri));
355 	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
356 }
357 
358 /*
359  * Add the thread to the queue specified by its priority, and set the
360  * corresponding status bit.
361  */
362 void
363 runq_add(struct runq *rq, struct thread *td, int flags)
364 {
365 	struct rqhead *rqh;
366 	int pri;
367 
368 	pri = td->td_priority / RQ_PPQ;
369 	td->td_rqindex = pri;
370 	runq_setbit(rq, pri);
371 	rqh = &rq->rq_queues[pri];
372 	CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p",
373 	    td, td->td_priority, pri, rqh);
374 	if (flags & SRQ_PREEMPTED) {
375 		TAILQ_INSERT_HEAD(rqh, td, td_runq);
376 	} else {
377 		TAILQ_INSERT_TAIL(rqh, td, td_runq);
378 	}
379 }
380 
381 void
382 runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags)
383 {
384 	struct rqhead *rqh;
385 
386 	KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
387 	td->td_rqindex = pri;
388 	runq_setbit(rq, pri);
389 	rqh = &rq->rq_queues[pri];
390 	CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p",
391 	    td, td->td_priority, pri, rqh);
392 	if (flags & SRQ_PREEMPTED) {
393 		TAILQ_INSERT_HEAD(rqh, td, td_runq);
394 	} else {
395 		TAILQ_INSERT_TAIL(rqh, td, td_runq);
396 	}
397 }
398 /*
399  * Return true if there are runnable processes of any priority on the run
400  * queue, false otherwise.  Has no side effects, does not modify the run
401  * queue structure.
402  */
403 int
404 runq_check(struct runq *rq)
405 {
406 	struct rqbits *rqb;
407 	int i;
408 
409 	rqb = &rq->rq_status;
410 	for (i = 0; i < RQB_LEN; i++)
411 		if (rqb->rqb_bits[i]) {
412 			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
413 			    rqb->rqb_bits[i], i);
414 			return (1);
415 		}
416 	CTR0(KTR_RUNQ, "runq_check: empty");
417 
418 	return (0);
419 }
420 
421 /*
422  * Find the highest priority process on the run queue.
423  */
424 struct thread *
425 runq_choose_fuzz(struct runq *rq, int fuzz)
426 {
427 	struct rqhead *rqh;
428 	struct thread *td;
429 	int pri;
430 
431 	while ((pri = runq_findbit(rq)) != -1) {
432 		rqh = &rq->rq_queues[pri];
433 		/* fuzz == 1 is normal.. 0 or less are ignored */
434 		if (fuzz > 1) {
435 			/*
436 			 * In the first couple of entries, check if
437 			 * there is one for our CPU as a preference.
438 			 */
439 			int count = fuzz;
440 			int cpu = PCPU_GET(cpuid);
441 			struct thread *td2;
442 			td2 = td = TAILQ_FIRST(rqh);
443 
444 			while (count-- && td2) {
445 				if (td2->td_lastcpu == cpu) {
446 					td = td2;
447 					break;
448 				}
449 				td2 = TAILQ_NEXT(td2, td_runq);
450 			}
451 		} else
452 			td = TAILQ_FIRST(rqh);
453 		KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue"));
454 		CTR3(KTR_RUNQ,
455 		    "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh);
456 		return (td);
457 	}
458 	CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri);
459 
460 	return (NULL);
461 }
462 
463 /*
464  * Find the highest priority process on the run queue.
465  */
466 struct thread *
467 runq_choose(struct runq *rq)
468 {
469 	struct rqhead *rqh;
470 	struct thread *td;
471 	int pri;
472 
473 	while ((pri = runq_findbit(rq)) != -1) {
474 		rqh = &rq->rq_queues[pri];
475 		td = TAILQ_FIRST(rqh);
476 		KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
477 		CTR3(KTR_RUNQ,
478 		    "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh);
479 		return (td);
480 	}
481 	CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri);
482 
483 	return (NULL);
484 }
485 
486 struct thread *
487 runq_choose_from(struct runq *rq, u_char idx)
488 {
489 	struct rqhead *rqh;
490 	struct thread *td;
491 	int pri;
492 
493 	if ((pri = runq_findbit_from(rq, idx)) != -1) {
494 		rqh = &rq->rq_queues[pri];
495 		td = TAILQ_FIRST(rqh);
496 		KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
497 		CTR4(KTR_RUNQ,
498 		    "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p",
499 		    pri, td, td->td_rqindex, rqh);
500 		return (td);
501 	}
502 	CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri);
503 
504 	return (NULL);
505 }
506 /*
507  * Remove the thread from the queue specified by its priority, and clear the
508  * corresponding status bit if the queue becomes empty.
509  * Caller must set state afterwards.
510  */
511 void
512 runq_remove(struct runq *rq, struct thread *td)
513 {
514 
515 	runq_remove_idx(rq, td, NULL);
516 }
517 
518 void
519 runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx)
520 {
521 	struct rqhead *rqh;
522 	u_char pri;
523 
524 	KASSERT(td->td_flags & TDF_INMEM,
525 		("runq_remove_idx: thread swapped out"));
526 	pri = td->td_rqindex;
527 	KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
528 	rqh = &rq->rq_queues[pri];
529 	CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p",
530 	    td, td->td_priority, pri, rqh);
531 	TAILQ_REMOVE(rqh, td, td_runq);
532 	if (TAILQ_EMPTY(rqh)) {
533 		CTR0(KTR_RUNQ, "runq_remove_idx: empty");
534 		runq_clrbit(rq, pri);
535 		if (idx != NULL && *idx == pri)
536 			*idx = (pri + 1) % RQ_NQS;
537 	}
538 }
539