xref: /freebsd/sys/kern/kern_switch.c (revision 2b743a9e9ddc6736208dc8ca1ce06ce64ad20a19)
/*-
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_sched.h"

#ifndef KERN_SWITCH_INCLUDE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#else  /* KERN_SWITCH_INCLUDE */
#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
#include <sys/smp.h>
#endif
#if defined(SMP) && defined(SCHED_4BSD)
#include <sys/sysctl.h>
#endif

/* Uncomment this to enable logging of critical_enter/exit. */
#if 0
#define	KTR_CRITICAL	KTR_SCHED
#else
#define	KTR_CRITICAL	0
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

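/*
 * Illustrative note: the run queue status is a bitmap with one bit per
 * queue, split into RQB_LEN words of RQB_BPW bits each (both come from
 * <sys/runq.h>).  The assertion below checks that the bitmap covers exactly
 * one bit for each of the RQ_NQS queues; e.g. 64 queues fit in two 32-bit
 * status words.
 */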
CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

/*
 * kern.sched.preemption allows user space to determine if preemption support
 * is compiled in or not.  It is not currently a boot or runtime flag that
 * can be changed.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");
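/*
 * For example, "sysctl kern.sched.preemption" run from user space reports 1
 * on kernels built with "options PREEMPTION" and 0 otherwise.
 */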

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
/*
 * Select the thread that will be run next.
 */
struct thread *
choosethread(void)
{
	struct thread *td;

#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
		/* Shutting down; run the idle thread on APs. */
		td = PCPU_GET(idlethread);
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
		TD_SET_RUNNING(td);
		return (td);
	}
#endif

retry:
	td = sched_choose();

	/*
	 * If we are panicking, only allow system threads,
	 * plus the one we are running in, to be run.
	 */
	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* Note that it is no longer on the run queue. */
		TD_SET_CAN_RUN(td);
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemptions are not allowed.
 */
void
critical_enter(void)
{
	struct thread *td;

	td = curthread;
	td->td_critnest++;
	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
}

void
critical_exit(void)
{
	struct thread *td;

	td = curthread;
	KASSERT(td->td_critnest != 0,
	    ("critical_exit: td_critnest == 0"));
#ifdef PREEMPTION
	if (td->td_critnest == 1) {
		td->td_critnest = 0;
		mtx_assert(&sched_lock, MA_NOTOWNED);
		if (td->td_owepreempt) {
			td->td_critnest = 1;
			mtx_lock_spin(&sched_lock);
			td->td_critnest--;
			mi_switch(SW_INVOL, NULL);
			mtx_unlock_spin(&sched_lock);
		}
	} else
#endif
		td->td_critnest--;

	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
}
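
/*
 * Typical usage (illustration only, not part of this file): code that
 * touches per-CPU data brackets the access with a critical section so that
 * it cannot be preempted, and thus migrated, in the middle:
 *
 *	critical_enter();
 *	... access PCPU data ...
 *	critical_exit();
 */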

/*
 * This function is called when a thread is about to be put on a run queue
 * because it has been made runnable or its priority has been adjusted.  It
 * determines whether the current thread should immediately be preempted in
 * favor of the new thread.  If so, it switches to the new thread and
 * eventually returns true.  If not, it returns false so that the caller may
 * place the thread on an appropriate run queue.
 */
int
maybe_preempt(struct thread *td)
{
#ifdef PREEMPTION
	struct thread *ctd;
	int cpri, pri;
#endif

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef PREEMPTION
	/*
	 * The new thread should not preempt the current thread if any of the
	 * following conditions are true:
	 *
	 *  - The kernel is in the throes of crashing (panicstr).
	 *  - The current thread has a higher (numerically lower) or
	 *    equivalent priority.  Note that this prevents curthread from
	 *    trying to preempt to itself.
	 *  - It is too early in the boot for context switches (cold is set).
	 *  - The current thread has an inhibitor set or is in the process of
	 *    exiting.  In this case, the current thread is about to switch
	 *    out anyway, so there's no point in preempting.  If we did,
	 *    the current thread would not be properly resumed either, so
	 *    just avoid that whole landmine.
	 *  - The new thread's priority is not a realtime priority, the
	 *    current thread's priority is not an idle priority, and
	 *    FULL_PREEMPTION is disabled.
	 *
	 * If all of these conditions are false, but the current thread is in
	 * a nested critical section, then we have to defer the preemption
	 * until we exit the critical section.  Otherwise, switch immediately
	 * to the new thread.
	 */
	ctd = curthread;
	KASSERT((ctd->td_sched != NULL && ctd->td_sched->ts_thread == ctd),
	    ("thread has no (or wrong) sched-private part."));
	KASSERT((td->td_inhibitors == 0),
	    ("maybe_preempt: trying to run inhibited thread"));
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
	    TD_IS_INHIBITED(ctd))
		return (0);
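	/*
	 * Without FULL_PREEMPTION, preempt only when the new thread runs at
	 * interrupt-thread priority or better (numerically <= PRI_MAX_ITHD),
	 * or when the current thread runs in the idle priority class.  For
	 * example, a woken interrupt thread preempts a timesharing thread,
	 * but one timesharing thread does not preempt another.
	 */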
#ifndef FULL_PREEMPTION
	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
		return (0);
#endif

	if (ctd->td_critnest > 1) {
		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
		    ctd->td_critnest);
		ctd->td_owepreempt = 1;
		return (0);
	}

	/*
	 * Thread is runnable but not yet put on the system run queue.
	 */
	MPASS(TD_ON_RUNQ(td));
	TD_SET_RUNNING(td);
	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
	    td->td_proc->p_pid, td->td_proc->p_comm);
	mi_switch(SW_INVOL|SW_PREEMPT, td);
	return (1);
#else
	return (0);
#endif
}

#if 0
#ifndef PREEMPTION
/* XXX: There should be a non-static version of this. */
static void
printf_caddr_t(void *data)
{
	printf("%s", (char *)data);
}
static char preempt_warning[] =
    "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
    preempt_warning)
#endif
#endif

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
/*
 * Initialize a run queue structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits; a set bit indicates a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}

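/*
 * As runq_findbit(), but start the search at queue index "start"; if no
 * non-empty queue is found from there to the end of the bitmap, wrap around
 * and scan again from the beginning.  Returns -1 only if every queue is
 * empty.
 */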
static __inline int
runq_findbit_from(struct runq *rq, u_char start)
{
	struct rqbits *rqb;
	int bit;
	int pri;
	int i;

	rqb = &rq->rq_status;
	bit = start & (RQB_BPW - 1);
	pri = 0;
	CTR1(KTR_RUNQ, "runq_findbit_from: start %d", start);
again:
	for (i = RQB_WORD(start); i < RQB_LEN; i++) {
		CTR3(KTR_RUNQ, "runq_findbit_from: bits %d = %#x bit = %d",
		    i, rqb->rqb_bits[i], bit);
		if (rqb->rqb_bits[i]) {
			if (bit != 0) {
				for (pri = bit; pri < RQB_BPW; pri++)
					if (rqb->rqb_bits[i] & (1ul << pri))
						break;
				bit = 0;
				if (pri >= RQB_BPW)
					continue;
			} else
				pri = RQB_FFS(rqb->rqb_bits[i]);
			pri += (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}
		bit = 0;
	}
	if (start != 0) {
		CTR0(KTR_RUNQ, "runq_findbit_from: restarting");
		start = 0;
		goto again;
	}

	return (-1);
}

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the thread to the queue specified by its priority, and set the
 * corresponding status bit.
 */
void
runq_add(struct runq *rq, struct td_sched *ts, int flags)
{
	struct rqhead *rqh;
	int pri;

	pri = ts->ts_thread->td_priority / RQ_PPQ;
	ts->ts_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_add: td=%p ts=%p pri=%d %d rqh=%p",
	    ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
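	/*
	 * A preempted thread goes back on the head of its queue so that it
	 * resumes ahead of threads queued normally at the same priority.
	 */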
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, ts, ts_procq);
	} else {
		TAILQ_INSERT_TAIL(rqh, ts, ts_procq);
	}
}

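/*
 * As runq_add(), but place the thread on the queue for the caller-supplied
 * index "pri" rather than deriving the index from its priority.
 */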
void
runq_add_pri(struct runq *rq, struct td_sched *ts, u_char pri, int flags)
{
	struct rqhead *rqh;

	KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
	ts->ts_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_add_pri: td=%p ts=%p pri=%d idx=%d rqh=%p",
	    ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, ts, ts_procq);
	} else {
		TAILQ_INSERT_TAIL(rqh, ts, ts_procq);
	}
}

/*
 * Return true if there are runnable processes of any priority on the run
 * queue, false otherwise.  Has no side effects and does not modify the run
 * queue structure.
 */
int
runq_check(struct runq *rq)
{
	struct rqbits *rqb;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
			    rqb->rqb_bits[i], i);
			return (1);
		}
	CTR0(KTR_RUNQ, "runq_check: empty");

	return (0);
}

#if defined(SMP) && defined(SCHED_4BSD)
int runq_fuzz = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
#endif

/*
 * Find the highest-priority process on the run queue.
 */
struct td_sched *
runq_choose(struct runq *rq)
{
	struct rqhead *rqh;
	struct td_sched *ts;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
#if defined(SMP) && defined(SCHED_4BSD)
		/* fuzz == 1 is normal; 0 or less are ignored. */
		if (runq_fuzz > 1) {
			/*
			 * In the first couple of entries, check if
			 * there is one for our CPU as a preference.
			 */
			int count = runq_fuzz;
			int cpu = PCPU_GET(cpuid);
			struct td_sched *ts2;
			ts2 = ts = TAILQ_FIRST(rqh);

			while (count-- && ts2) {
				if (ts2->ts_thread->td_lastcpu == cpu) {
					ts = ts2;
					break;
				}
				ts2 = TAILQ_NEXT(ts2, ts_procq);
			}
		} else
#endif
			ts = TAILQ_FIRST(rqh);
		KASSERT(ts != NULL, ("runq_choose: no proc on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose: pri=%d td_sched=%p rqh=%p", pri, ts, rqh);
		return (ts);
	}
	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);

	return (NULL);
}

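/*
 * As runq_choose(), but begin the search for a non-empty queue at index
 * "idx", wrapping around if necessary (see runq_findbit_from()).
 */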
struct td_sched *
runq_choose_from(struct runq *rq, u_char idx)
{
	struct rqhead *rqh;
	struct td_sched *ts;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	if ((pri = runq_findbit_from(rq, idx)) != -1) {
		rqh = &rq->rq_queues[pri];
		ts = TAILQ_FIRST(rqh);
		KASSERT(ts != NULL, ("runq_choose_from: no proc on busy queue"));
		CTR4(KTR_RUNQ,
		    "runq_choose_from: pri=%d ts=%p idx=%d rqh=%p",
		    pri, ts, ts->ts_rqindex, rqh);
		return (ts);
	}
	CTR1(KTR_RUNQ, "runq_choose_from: idleproc pri=%d", pri);

	return (NULL);
}

/*
 * Remove the thread from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * Caller must set state afterwards.
 */
void
runq_remove(struct runq *rq, struct td_sched *ts)
{

	runq_remove_idx(rq, ts, NULL);
}

void
runq_remove_idx(struct runq *rq, struct td_sched *ts, u_char *idx)
{
	struct rqhead *rqh;
	u_char pri;

	KASSERT(ts->ts_thread->td_proc->p_sflag & PS_INMEM,
	    ("runq_remove_idx: process swapped out"));
	pri = ts->ts_rqindex;
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_remove_idx: td=%p, ts=%p pri=%d %d rqh=%p",
	    ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
	TAILQ_REMOVE(rqh, ts, ts_procq);
	if (TAILQ_EMPTY(rqh)) {
		CTR0(KTR_RUNQ, "runq_remove_idx: empty");
		runq_clrbit(rq, pri);
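		/*
		 * If the caller tracks a search index and it pointed at the
		 * queue that just drained, advance it (modulo RQ_NQS) so a
		 * later circular scan does not start on an empty queue.
		 */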
		if (idx != NULL && *idx == pri)
			*idx = (pri + 1) % RQ_NQS;
	}
}

/****** Functions that are temporarily here ***********/
#include <vm/uma.h>
extern struct mtx kse_zombie_lock;

/*
 * Allocate scheduler-specific per-process resources.
 * The thread and proc have already been linked in.
 *
 * Called from:
 *  proc_init() (UMA init method)
 */
void
sched_newproc(struct proc *p, struct thread *td)
{
}

/*
 * A thread is being either created or recycled.
 * Fix up the per-scheduler resources associated with it.
 * Called from:
 *  sched_fork_thread()
 *  thread_dtor()  (*may go away)
 *  thread_init()  (*may go away)
 */
void
sched_newthread(struct thread *td)
{
	struct td_sched *ts;

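	/*
	 * The scheduler-private data lives directly after the struct thread
	 * in the same allocation, so it can be located with simple pointer
	 * arithmetic.
	 */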
	ts = (struct td_sched *) (td + 1);
	bzero(ts, sizeof(*ts));
	td->td_sched = ts;
	ts->ts_thread = td;
}

/*
 * Called from:
 *  thr_create()
 *  proc_init() (UMA) via sched_newproc()
 */
void
sched_init_concurrency(struct proc *p)
{
}

/*
 * Change the concurrency of an existing proc to N.
 * Called from:
 *  kse_create()
 *  kse_exit()
 *  thread_exit()
 *  thread_single()
 */
void
sched_set_concurrency(struct proc *p, int concurrency)
{
}

/*
 * Called from thread_exit() for every exiting thread.
 *
 * Not to be confused with sched_exit_thread(), which is called from
 * thread_exit() only for threads exiting without the rest of the process
 * exiting; sched_exit_thread() is also called from sched_exit(), and we
 * would not want to call it twice.
 * XXX This can probably be fixed.
 */
void
sched_thread_exit(struct thread *td)
{
}

#endif /* KERN_SWITCH_INCLUDE */