xref: /freebsd/sys/kern/kern_switch.c (revision 44692526be9c523c310e7d55810f3bad56883567)
1 /*
2  * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 /***
28 Here is the logic:
29 
30 If there are N processors, then there are at most N KSEs (kernel
31 schedulable entities) working to process threads that belong to a
32 KSEGROUP (kg). If there are X of these KSEs actually running at the
33 moment in question, then there are at most M = (N - X) of these KSEs on
34 the run queue, as running KSEs are not on the queue.
35 
36 Runnable threads are queued off the KSEGROUP in priority order.
37 If there are M or more threads runnable, the top M threads
38 (by priority) are 'preassigned' to the M KSEs not running. The KSEs take
39 their priority from those threads and are put on the run queue.
40 
41 The last thread that had a priority high enough to have a KSE associated
42 with it, AND IS ON THE RUN QUEUE, is pointed to by
43 kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
44 assigned, either because all the available KSEs are actively running or
45 because there are no threads queued, that pointer is NULL.
46 
47 When a KSE is removed from the run queue to run, we know it was
48 associated with the highest priority thread in the queue (at the head
49 of the queue). If it is also the last assigned, we know M was 1 and must
50 now be 0. Since the thread is no longer queued, the pointer must no
51 longer refer to it. Since we know there were no more KSEs available
52 (M was 1 and is now 0), and since we are not FREEING our KSE
53 but using it, we know there are STILL no more KSEs available, so we can
54 prove that the next thread in the ksegrp list will not have a KSE to
55 assign to it, and thus that the pointer must be made 'invalid' (NULL).
56 
57 The pointer exists so that when a new thread is made runnable, it can
58 have its priority compared with that of the last assigned thread to see
59 whether it should 'steal' its KSE, i.e. whether it lands 'earlier'
60 on the list than that thread or later. If it lands earlier, the KSE is
61 removed from the last assigned thread (which is now not assigned a KSE)
62 and reassigned to the new thread, which is placed earlier in the list.
63 The pointer is then backed up to the previous thread (which may or may
64 not be the new thread).
65 
66 When a thread sleeps or is removed, the KSE becomes available, and if
67 there are queued threads that are not assigned KSEs, the highest priority
68 one of them is assigned the KSE, which is then placed back on the run
69 queue at the appropriate place, and the kg->kg_last_assigned pointer is
70 adjusted down to point to it.
71 
72 The following diagram shows 2 KSEs and 3 threads from a single process.
73 
74  RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
75               \    \____
76                \        \
77     KSEGROUP---thread--thread--thread    (queued in priority order)
78         \                 /
79          \_______________/
80           (last_assigned)
81 
82 The result of this scheme is that the M available KSEs are always
83 queued at the priorities they have inherited from the M highest priority
84 threads for that KSEGROUP. If this situation changes, the KSEs are
85 reassigned to keep this true.
86 ***/
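
/*
 * Illustrative sketch only (not compiled into the kernel): a minimal
 * user-space model of the assignment rule described above.  The names
 * below (model_thread, model_assign) are assumptions for illustration
 * and are not kernel structures or APIs.
 */
#if 0
#include <stddef.h>

struct model_thread {
	int			 mt_pri;	/* lower value = higher priority */
	int			 mt_has_kse;	/* nonzero if a KSE is assigned */
	struct model_thread	*mt_next;	/* next thread, in priority order */
};

/*
 * Hand the M available KSEs to the first M threads on the priority-ordered
 * list and return the last thread that received one (the analogue of
 * kg_last_assigned), or NULL when no thread could be assigned a KSE.
 */
static struct model_thread *
model_assign(struct model_thread *head, int nkse)
{
	struct model_thread *td, *last;

	last = NULL;
	for (td = head; td != NULL; td = td->mt_next) {
		td->mt_has_kse = (nkse > 0);
		if (td->mt_has_kse) {
			last = td;
			nkse--;
		}
	}
	return (last);
}
#endif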
87 
88 #include <sys/cdefs.h>
89 __FBSDID("$FreeBSD$");
90 
91 #include "opt_sched.h"
92 
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/kdb.h>
96 #include <sys/kernel.h>
97 #include <sys/ktr.h>
98 #include <sys/lock.h>
99 #include <sys/mutex.h>
100 #include <sys/proc.h>
101 #include <sys/queue.h>
102 #include <sys/sched.h>
103 #if defined(SMP) && (defined(__i386__) || defined(__amd64__))
104 #include <sys/smp.h>
105 #endif
106 #include <machine/critical.h>
107 #if defined(SMP) && defined(SCHED_4BSD)
108 #include <sys/sysctl.h>
109 #endif
110 
111 #ifdef FULL_PREEMPTION
112 #ifndef PREEMPTION
113 #error "The FULL_PREEMPTION option requires the PREEMPTION option"
114 #endif
115 #endif
116 
117 CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
118 
119 /************************************************************************
120  * Functions that manipulate runnability from a thread perspective.	*
121  ************************************************************************/
122 /*
123  * Select the KSE that will be run next.  From that find the thread, and
124  * remove it from the KSEGRP's run queue.  If there is thread clustering,
125  * this will be what does it.
126  */
127 struct thread *
128 choosethread(void)
129 {
130 	struct kse *ke;
131 	struct thread *td;
132 	struct ksegrp *kg;
133 
134 #if defined(SMP) && (defined(__i386__) || defined(__amd64__))
135 	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
136 		/* Shutting down, run idlethread on APs */
137 		td = PCPU_GET(idlethread);
138 		ke = td->td_kse;
139 		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
140 		ke->ke_flags |= KEF_DIDRUN;
141 		TD_SET_RUNNING(td);
142 		return (td);
143 	}
144 #endif
145 
146 retry:
147 	ke = sched_choose();
148 	if (ke) {
149 		td = ke->ke_thread;
150 		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
151 		kg = ke->ke_ksegrp;
152 		if (td->td_proc->p_flag & P_SA) {
153 			if (kg->kg_last_assigned == td) {
154 				kg->kg_last_assigned = TAILQ_PREV(td,
155 				    threadqueue, td_runq);
156 			}
157 			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
158 			kg->kg_runnable--;
159 		}
160 		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
161 		    td, td->td_priority);
162 	} else {
163 		/* Simulate runq_choose() having returned the idle thread */
164 		td = PCPU_GET(idlethread);
165 		ke = td->td_kse;
166 		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
167 	}
168 	ke->ke_flags |= KEF_DIDRUN;
169 
170 	/*
171 	 * If we are panicking, only allow system threads,
172 	 * plus the one we are running in, to be run.
173 	 */
174 	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
175 	    (td->td_flags & TDF_INPANIC) == 0)) {
176 		/* note that it is no longer on the run queue */
177 		TD_SET_CAN_RUN(td);
178 		goto retry;
179 	}
180 
181 	TD_SET_RUNNING(td);
182 	return (td);
183 }
184 
185 /*
186  * Given a surplus KSE, either assign a new runnable thread to it
187  * (and put it in the run queue) or put it in the ksegrp's idle KSE list.
188  * Assumes that the original thread is not runnable.
189  */
190 void
191 kse_reassign(struct kse *ke)
192 {
193 	struct ksegrp *kg;
194 	struct thread *td;
195 	struct thread *original;
196 
197 	mtx_assert(&sched_lock, MA_OWNED);
198 	original = ke->ke_thread;
199 	KASSERT(original == NULL || TD_IS_INHIBITED(original),
200     	    ("reassigning KSE with runnable thread"));
201 	kg = ke->ke_ksegrp;
202 	if (original)
203 		original->td_kse = NULL;
204 
205 	/*
206 	 * Find the first unassigned thread
207 	 */
208 	if ((td = kg->kg_last_assigned) != NULL)
209 		td = TAILQ_NEXT(td, td_runq);
210 	else
211 		td = TAILQ_FIRST(&kg->kg_runq);
212 
213 	/*
214 	 * If we found one, assign it the kse, otherwise idle the kse.
215 	 */
216 	if (td) {
217 		kg->kg_last_assigned = td;
218 		td->td_kse = ke;
219 		ke->ke_thread = td;
220 		CTR2(KTR_RUNQ, "kse_reassign: ke%p -> td%p", ke, td);
221 		sched_add(td, SRQ_BORING);
222 		return;
223 	}
224 
225 	ke->ke_state = KES_IDLE;
226 	ke->ke_thread = NULL;
227 	TAILQ_INSERT_TAIL(&kg->kg_iq, ke, ke_kgrlist);
228 	kg->kg_idle_kses++;
229 	CTR1(KTR_RUNQ, "kse_reassign: ke%p on idle queue", ke);
230 	return;
231 }
232 
233 #if 0
234 /*
235  * Remove a thread from its KSEGRP's run queue.
236  * This in turn may remove it from a KSE if it was already assigned
237  * to one, possibly causing a new thread to be assigned to the KSE
238  * and the KSE getting a new priority.
239  */
240 static void
241 remrunqueue(struct thread *td)
242 {
243 	struct thread *td2, *td3;
244 	struct ksegrp *kg;
245 	struct kse *ke;
246 
247 	mtx_assert(&sched_lock, MA_OWNED);
248 	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
249 	kg = td->td_ksegrp;
250 	ke = td->td_kse;
251 	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
252 	TD_SET_CAN_RUN(td);
253 	/*
254 	 * If it is not a threaded process, take the shortcut.
255 	 */
256 	if ((td->td_proc->p_flag & P_SA) == 0) {
257 		/* Bring its kse with it, leave the thread attached */
258 		sched_rem(td);
259 		ke->ke_state = KES_THREAD;
260 		return;
261 	}
262    	td3 = TAILQ_PREV(td, threadqueue, td_runq);
263 	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
264 	kg->kg_runnable--;
265 	if (ke) {
266 		/*
267 		 * This thread has been assigned to a KSE.
268 		 * We need to dissociate it and try to assign the
269 		 * KSE to the next available thread. Then, we should
270 		 * see if we need to move the KSE in the run queues.
271 		 */
272 		sched_rem(td);
273 		ke->ke_state = KES_THREAD;
274 		td2 = kg->kg_last_assigned;
275 		KASSERT((td2 != NULL), ("last assigned has wrong value"));
276 		if (td2 == td)
277 			kg->kg_last_assigned = td3;
278 		kse_reassign(ke);
279 	}
280 }
281 #endif
282 
283 /*
284  * Change the priority of a thread that is on the run queue.
285  */
286 void
287 adjustrunqueue(struct thread *td, int newpri)
288 {
289 	struct ksegrp *kg;
290 	struct kse *ke;
291 
292 	mtx_assert(&sched_lock, MA_OWNED);
293 	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));
294 
295 	ke = td->td_kse;
296 	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
297 	/*
298 	 * If it is not a threaded process, take the shortcut.
299 	 */
300 	if ((td->td_proc->p_flag & P_SA) == 0) {
301 		/* We only care about the kse in the run queue. */
302 		td->td_priority = newpri;
303 		if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
304 			sched_rem(td);
305 			sched_add(td, SRQ_BORING);
306 		}
307 		return;
308 	}
309 
310 	/* It is a threaded process */
311 	kg = td->td_ksegrp;
312 	TD_SET_CAN_RUN(td);
313 	if (ke) {
314 		if (kg->kg_last_assigned == td) {
315 			kg->kg_last_assigned =
316 			    TAILQ_PREV(td, threadqueue, td_runq);
317 		}
318 		sched_rem(td);
319 	}
320 	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
321 	kg->kg_runnable--;
322 	td->td_priority = newpri;
323 	setrunqueue(td, SRQ_BORING);
324 }
325 
326 void
327 setrunqueue(struct thread *td, int flags)
328 {
329 	struct kse *ke;
330 	struct ksegrp *kg;
331 	struct thread *td2;
332 	struct thread *tda;
333 	int count;
334 
335 	CTR4(KTR_RUNQ, "setrunqueue: td:%p ke:%p kg:%p pid:%d",
336 	    td, td->td_kse, td->td_ksegrp, td->td_proc->p_pid);
337 	mtx_assert(&sched_lock, MA_OWNED);
338 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
339 	    ("setrunqueue: bad thread state"));
340 	TD_SET_RUNQ(td);
341 	kg = td->td_ksegrp;
342 	if ((td->td_proc->p_flag & P_SA) == 0) {
343 		/*
344 		 * Common path optimisation: Only one of everything
345 		 * and the KSE is always already attached.
346 		 * Totally ignore the ksegrp run queue.
347 		 */
348 		sched_add(td, flags);
349 		return;
350 	}
351 
352 	tda = kg->kg_last_assigned;
353 	if ((ke = td->td_kse) == NULL) {
354 		if (kg->kg_idle_kses) {
355 			/*
356 			 * There is a free one, so it's ours for the asking.
357 			 */
358 			ke = TAILQ_FIRST(&kg->kg_iq);
359 			CTR2(KTR_RUNQ, "setrunqueue: kg:%p: Use free ke:%p",
360 			    kg, ke);
361 			TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
362 			ke->ke_state = KES_THREAD;
363 			kg->kg_idle_kses--;
364 		} else if (tda && (tda->td_priority > td->td_priority)) {
365 			/*
366 			 * None free, but there is one we can commandeer.
367 			 */
368 			ke = tda->td_kse;
369 			CTR3(KTR_RUNQ,
370 			    "setrunqueue: kg:%p: take ke:%p from td: %p",
371 			    kg, ke, tda);
372 			sched_rem(tda);
373 			tda->td_kse = NULL;
374 			ke->ke_thread = NULL;
375 			tda = kg->kg_last_assigned =
376 		    	    TAILQ_PREV(tda, threadqueue, td_runq);
377 		}
378 	} else {
379 		/*
380 		 * Temporarily disassociate so it looks like the other cases.
381 		 */
382 		ke->ke_thread = NULL;
383 		td->td_kse = NULL;
384 	}
385 
386 	/*
387 	 * Add the thread to the ksegrp's run queue at
388 	 * the appropriate place.
389 	 */
390 	count = 0;
391 	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
392 		if (td2->td_priority > td->td_priority) {
393 			kg->kg_runnable++;
394 			TAILQ_INSERT_BEFORE(td2, td, td_runq);
395 			break;
396 		}
397 		/* XXX Debugging hack */
398 		if (++count > 10000) {
399 			printf("setrunqueue(): corrupt kg_runq, td=%p\n", td);
400 			panic("deadlock in setrunqueue");
401 		}
402 	}
403 	if (td2 == NULL) {
404 		/* We ran off the end of the TAILQ or it was empty. */
405 		kg->kg_runnable++;
406 		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
407 	}
408 
409 	/*
410 	 * If we have a ke to use, then put it on the run queue and,
411 	 * if needed, readjust the last_assigned pointer.
412 	 */
413 	if (ke) {
414 		if (tda == NULL) {
415 			/*
416 			 * No pre-existing last assigned, so whoever is first
417 			 * gets the KSE we brought in (maybe us).
418 			 */
419 			td2 = TAILQ_FIRST(&kg->kg_runq);
420 			KASSERT((td2->td_kse == NULL),
421 			    ("unexpected ke present"));
422 			td2->td_kse = ke;
423 			ke->ke_thread = td2;
424 			kg->kg_last_assigned = td2;
425 		} else if (tda->td_priority > td->td_priority) {
426 			/*
427 			 * It's ours, grab it, but last_assigned is past us
428 			 * so don't change it.
429 			 */
430 			td->td_kse = ke;
431 			ke->ke_thread = td;
432 		} else {
433 			/*
434 			 * We are past last_assigned, so
435 			 * put the new kse on whatever is next,
436 			 * which may or may not be us.
437 			 */
438 			td2 = TAILQ_NEXT(tda, td_runq);
439 			kg->kg_last_assigned = td2;
440 			td2->td_kse = ke;
441 			ke->ke_thread = td2;
442 		}
443 		sched_add(ke->ke_thread, flags);
444 	} else {
445 		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
446 			td, td->td_ksegrp, td->td_proc->p_pid);
447 	}
448 }
449 
450 /*
451  * Kernel thread preemption implementation.  Critical sections mark
452  * regions of code in which preemptions are not allowed.
453  */
454 void
455 critical_enter(void)
456 {
457 	struct thread *td;
458 
459 	td = curthread;
460 	if (td->td_critnest == 0)
461 		cpu_critical_enter(td);
462 	td->td_critnest++;
463 }
464 
465 void
466 critical_exit(void)
467 {
468 	struct thread *td;
469 
470 	td = curthread;
471 	KASSERT(td->td_critnest != 0,
472 	    ("critical_exit: td_critnest == 0"));
473 	if (td->td_critnest == 1) {
474 #ifdef PREEMPTION
475 		mtx_assert(&sched_lock, MA_NOTOWNED);
476 		if (td->td_pflags & TDP_OWEPREEMPT) {
477 			mtx_lock_spin(&sched_lock);
478 			mi_switch(SW_INVOL, NULL);
479 			mtx_unlock_spin(&sched_lock);
480 		}
481 #endif
482 		td->td_critnest = 0;
483 		cpu_critical_exit(td);
484 	} else {
485 		td->td_critnest--;
486 	}
487 }
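
/*
 * Typical use of the pair above (illustrative sketch only, not part of
 * this file): bracket a short region that must not be preempted, such as
 * an update of per-CPU state.  The function name is an assumption.
 */
#if 0
static void
example_percpu_update(void)
{

	critical_enter();
	/*
	 * No preemption occurs in here; if one becomes due, it is
	 * deferred until the matching critical_exit() below.
	 */
	critical_exit();
}
#endif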
488 
489 /*
490  * This function is called when a thread is about to be put on run queue
491  * because it has been made runnable or its priority has been adjusted.  It
492  * determines if the new thread should be immediately preempted to.  If so,
493  * it switches to it and eventually returns true.  If not, it returns false
494  * so that the caller may place the thread on an appropriate run queue.
495  */
496 int
497 maybe_preempt(struct thread *td)
498 {
499 #ifdef PREEMPTION
500 	struct thread *ctd;
501 	int cpri, pri;
502 #endif
503 
504 	mtx_assert(&sched_lock, MA_OWNED);
505 #ifdef PREEMPTION
506 	/*
507 	 * The new thread should not preempt the current thread if any of the
508 	 * following conditions are true:
509 	 *
510 	 *  - The current thread has a higher (numerically lower) or
511 	 *    equivalent priority.  Note that this prevents curthread from
512 	 *    trying to preempt to itself.
513 	 *  - It is too early in the boot for context switches (cold is set).
514 	 *  - The current thread has an inhibitor set or is in the process of
515 	 *    exiting.  In this case, the current thread is about to switch
516 	 *    out anyway, so there's no point in preempting.  If we did,
517 	 *    the current thread would not be properly resumed either, so
518 	 *    just avoid that whole landmine.
519 	 *  - The new thread's priority is not a realtime priority, the
520 	 *    current thread's priority is not an idle priority, and
521 	 *    FULL_PREEMPTION is disabled.
522 	 *
523 	 * If all of these conditions are false, but the current thread is in
524 	 * a nested critical section, then we have to defer the preemption
525 	 * until we exit the critical section.  Otherwise, switch immediately
526 	 * to the new thread.
527 	 */
528 	ctd = curthread;
529 	if (ctd->td_kse == NULL || ctd->td_kse->ke_thread != ctd)
530 		return (0);
531 	pri = td->td_priority;
532 	cpri = ctd->td_priority;
533 	if (pri >= cpri || cold /* || dumping */ || TD_IS_INHIBITED(ctd) ||
534 	    td->td_kse->ke_state != KES_THREAD)
535 		return (0);
536 #ifndef FULL_PREEMPTION
537 	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD) &&
538 	    !(cpri >= PRI_MIN_IDLE))
539 		return (0);
540 #endif
541 	if (ctd->td_critnest > 1) {
542 		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
543 		    ctd->td_critnest);
544 		ctd->td_pflags |= TDP_OWEPREEMPT;
545 		return (0);
546 	}
547 
548 	/*
549 	 * Our thread state says that we are already on a run queue, so
550 	 * update our state as if we had been dequeued by choosethread().
551 	 */
552 	MPASS(TD_ON_RUNQ(td));
553 	TD_SET_RUNNING(td);
554 	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
555 	    td->td_proc->p_pid, td->td_proc->p_comm);
556 	mi_switch(SW_INVOL, td);
557 	return (1);
558 #else
559 	return (0);
560 #endif
561 }
562 
563 #if 0
564 #ifndef PREEMPTION
565 /* XXX: There should be a non-static version of this. */
566 static void
567 printf_caddr_t(void *data)
568 {
569 	printf("%s", (char *)data);
570 }
571 static char preempt_warning[] =
572     "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
573 SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
574     preempt_warning)
575 #endif
576 #endif
577 
578 /************************************************************************
579  * SYSTEM RUN QUEUE manipulations and tests				*
580  ************************************************************************/
581 /*
582  * Initialize a run structure.
583  */
584 void
585 runq_init(struct runq *rq)
586 {
587 	int i;
588 
589 	bzero(rq, sizeof *rq);
590 	for (i = 0; i < RQ_NQS; i++)
591 		TAILQ_INIT(&rq->rq_queues[i]);
592 }
593 
594 /*
595  * Clear the status bit of the queue corresponding to priority level pri,
596  * indicating that it is empty.
597  */
598 static __inline void
599 runq_clrbit(struct runq *rq, int pri)
600 {
601 	struct rqbits *rqb;
602 
603 	rqb = &rq->rq_status;
604 	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
605 	    rqb->rqb_bits[RQB_WORD(pri)],
606 	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
607 	    RQB_BIT(pri), RQB_WORD(pri));
608 	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
609 }
610 
611 /*
612  * Find the index of the first non-empty run queue.  This is done by
613  * scanning the status bits; a set bit indicates a non-empty queue.
614  */
615 static __inline int
616 runq_findbit(struct runq *rq)
617 {
618 	struct rqbits *rqb;
619 	int pri;
620 	int i;
621 
622 	rqb = &rq->rq_status;
623 	for (i = 0; i < RQB_LEN; i++)
624 		if (rqb->rqb_bits[i]) {
625 			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
626 			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
627 			    rqb->rqb_bits[i], i, pri);
628 			return (pri);
629 		}
630 
631 	return (-1);
632 }
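
/*
 * A minimal user-space model of the scan above (illustrative only; the
 * constants and names are assumptions, not the kernel's RQB_* macros).
 * ffs() returns the 1-based index of the first set bit, so subtracting
 * one and adding the word offset yields the priority index.
 */
#if 0
#include <strings.h>

#define	MODEL_NWORDS	2
#define	MODEL_BPW	32	/* bits per status word in this model */

static int
model_findbit(const unsigned int bits[MODEL_NWORDS])
{
	int i;

	for (i = 0; i < MODEL_NWORDS; i++)
		if (bits[i] != 0)
			return (ffs((int)bits[i]) - 1 + i * MODEL_BPW);
	return (-1);
}
#endif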
633 
634 /*
635  * Set the status bit of the queue corresponding to priority level pri,
636  * indicating that it is non-empty.
637  */
638 static __inline void
639 runq_setbit(struct runq *rq, int pri)
640 {
641 	struct rqbits *rqb;
642 
643 	rqb = &rq->rq_status;
644 	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
645 	    rqb->rqb_bits[RQB_WORD(pri)],
646 	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
647 	    RQB_BIT(pri), RQB_WORD(pri));
648 	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
649 }
650 
651 /*
652  * Add the KSE to the queue specified by its priority, and set the
653  * corresponding status bit.
654  */
655 void
656 runq_add(struct runq *rq, struct kse *ke)
657 {
658 	struct rqhead *rqh;
659 	int pri;
660 
661 	pri = ke->ke_thread->td_priority / RQ_PPQ;
662 	ke->ke_rqindex = pri;
663 	runq_setbit(rq, pri);
664 	rqh = &rq->rq_queues[pri];
665 	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
666 	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
667 	TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
668 }
669 
670 /*
671  * Return true if there are runnable processes of any priority on the run
672  * queue, false otherwise.  Has no side effects and does not modify the
673  * run queue structure.
674  */
675 int
676 runq_check(struct runq *rq)
677 {
678 	struct rqbits *rqb;
679 	int i;
680 
681 	rqb = &rq->rq_status;
682 	for (i = 0; i < RQB_LEN; i++)
683 		if (rqb->rqb_bits[i]) {
684 			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
685 			    rqb->rqb_bits[i], i);
686 			return (1);
687 		}
688 	CTR0(KTR_RUNQ, "runq_check: empty");
689 
690 	return (0);
691 }
692 
693 #if defined(SMP) && defined(SCHED_4BSD)
694 int runq_fuzz = 1;
695 SYSCTL_DECL(_kern_sched);
696 SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
697 #endif
698 
699 /*
700  * Find the highest priority process on the run queue.
701  */
702 struct kse *
703 runq_choose(struct runq *rq)
704 {
705 	struct rqhead *rqh;
706 	struct kse *ke;
707 	int pri;
708 
709 	mtx_assert(&sched_lock, MA_OWNED);
710 	while ((pri = runq_findbit(rq)) != -1) {
711 		rqh = &rq->rq_queues[pri];
712 #if defined(SMP) && defined(SCHED_4BSD)
713 		/* fuzz == 1 is normal; 0 or less is ignored */
714 		if (runq_fuzz > 1) {
715 			/*
716 			 * In the first couple of entries, check if
717 			 * there is one for our CPU as a preference.
718 			 */
719 			int count = runq_fuzz;
720 			int cpu = PCPU_GET(cpuid);
721 			struct kse *ke2;
722 			ke2 = ke = TAILQ_FIRST(rqh);
723 
724 			while (count-- && ke2) {
725 				if (ke2->ke_thread->td_lastcpu == cpu) {
726 					ke = ke2;
727 					break;
728 				}
729 				ke2 = TAILQ_NEXT(ke2, ke_procq);
730 			}
731 		} else
732 #endif
733 			ke = TAILQ_FIRST(rqh);
734 		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
735 		CTR3(KTR_RUNQ,
736 		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
737 		return (ke);
738 	}
739 	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
740 
741 	return (NULL);
742 }
743 
744 /*
745  * Remove the KSE from the queue specified by its priority, and clear the
746  * corresponding status bit if the queue becomes empty.
747  * Caller must set ke->ke_state afterwards.
748  */
749 void
750 runq_remove(struct runq *rq, struct kse *ke)
751 {
752 	struct rqhead *rqh;
753 	int pri;
754 
755 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
756 		("runq_remove: process swapped out"));
757 	pri = ke->ke_rqindex;
758 	rqh = &rq->rq_queues[pri];
759 	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
760 	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
761 	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
762 	TAILQ_REMOVE(rqh, ke, ke_procq);
763 	if (TAILQ_EMPTY(rqh)) {
764 		CTR0(KTR_RUNQ, "runq_remove: empty");
765 		runq_clrbit(rq, pri);
766 	}
767 }
768 
769