xref: /freebsd/sys/kern/kern_switch.c (revision 779186434a3d7c3ca8c09ef90f75cf757f369a96)
19454b2d8SWarner Losh /*-
2d5a08a60SJake Burkholder  * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
3d5a08a60SJake Burkholder  * All rights reserved.
4dba6c5a6SPeter Wemm  *
5dba6c5a6SPeter Wemm  * Redistribution and use in source and binary forms, with or without
6dba6c5a6SPeter Wemm  * modification, are permitted provided that the following conditions
7dba6c5a6SPeter Wemm  * are met:
8dba6c5a6SPeter Wemm  * 1. Redistributions of source code must retain the above copyright
9dba6c5a6SPeter Wemm  *    notice, this list of conditions and the following disclaimer.
10dba6c5a6SPeter Wemm  * 2. Redistributions in binary form must reproduce the above copyright
11dba6c5a6SPeter Wemm  *    notice, this list of conditions and the following disclaimer in the
12dba6c5a6SPeter Wemm  *    documentation and/or other materials provided with the distribution.
13dba6c5a6SPeter Wemm  *
14dba6c5a6SPeter Wemm  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15dba6c5a6SPeter Wemm  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16dba6c5a6SPeter Wemm  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17dba6c5a6SPeter Wemm  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18dba6c5a6SPeter Wemm  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19dba6c5a6SPeter Wemm  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20dba6c5a6SPeter Wemm  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21dba6c5a6SPeter Wemm  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22dba6c5a6SPeter Wemm  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23dba6c5a6SPeter Wemm  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24dba6c5a6SPeter Wemm  * SUCH DAMAGE.
25dba6c5a6SPeter Wemm  */
26dba6c5a6SPeter Wemm 
27e602ba25SJulian Elischer /***
28e602ba25SJulian Elischer Here is the logic..
29e602ba25SJulian Elischer 
30e602ba25SJulian Elischer If there are N processors, then there are at most N KSEs (kernel
31e602ba25SJulian Elischer schedulable entities) working to process threads that belong to a
3209a4a69cSRobert Watson KSEGROUP (kg). If there are X of these KSEs actually running at the
33e602ba25SJulian Elischer moment in question, then there are at most M (= N - X) of these KSEs on
34e602ba25SJulian Elischer the run queue, as running KSEs are not on the queue.
35e602ba25SJulian Elischer 
36e602ba25SJulian Elischer Runnable threads are queued off the KSEGROUP in priority order.
37e602ba25SJulian Elischer If there are M or more threads runnable, the top M threads
38e602ba25SJulian Elischer (by priority) are 'preassigned' to the M KSEs not running. The KSEs take
39e602ba25SJulian Elischer their priority from those threads and are put on the run queue.
40e602ba25SJulian Elischer 
41e602ba25SJulian Elischer The last thread that had a priority high enough to have a KSE associated
42e602ba25SJulian Elischer with it, AND IS ON THE RUN QUEUE is pointed to by
43e602ba25SJulian Elischer kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
44e602ba25SJulian Elischer assigned, either because all the available KSEs are actively running or
45e602ba25SJulian Elischer because there are no threads queued, that pointer is NULL.
46e602ba25SJulian Elischer 
47e602ba25SJulian Elischer When a KSE is removed from the run queue to be run, we know it was
48e602ba25SJulian Elischer associated with the highest priority thread in the queue (at the head
49e602ba25SJulian Elischer of the queue). If it is also the last assigned, we know M was 1 and must
50e602ba25SJulian Elischer now be 0. Since the thread is no longer queued, the last_assigned pointer
51e602ba25SJulian Elischer must no longer point at it. Since we know there were no more KSEs
52e602ba25SJulian Elischer available (M was 1 and is now 0), and since we are not FREEING our KSE
53e602ba25SJulian Elischer but using it, we know there are STILL no more KSEs available. We can
54e602ba25SJulian Elischer therefore prove that the next thread in the ksegrp list will not have a
55e602ba25SJulian Elischer KSE to assign to it, so the pointer must be made 'invalid' (NULL).
56e602ba25SJulian Elischer 
57e602ba25SJulian Elischer The pointer exists so that when a new thread is made runnable, it can
58e602ba25SJulian Elischer have its priority compared with that of the last assigned thread to see
59e602ba25SJulian Elischer whether it should 'steal' its KSE, i.e. whether it is 'earlier'
60e602ba25SJulian Elischer on the list than that thread or later.  If it is earlier, then the KSE is
61e602ba25SJulian Elischer removed from the last assigned thread (which is now not assigned a KSE)
62e602ba25SJulian Elischer and reassigned to the new thread, which is placed earlier in the list.
63e602ba25SJulian Elischer The pointer is then backed up to the previous thread (which may or may not
64e602ba25SJulian Elischer be the new thread).
65e602ba25SJulian Elischer 
66e602ba25SJulian Elischer When a thread sleeps or is removed, the KSE becomes available and if there
67e602ba25SJulian Elischer are queued threads that are not assigned KSEs, the highest priority one of
68e602ba25SJulian Elischer them is assigned the KSE, which is then placed back on the run queue at
69e602ba25SJulian Elischer the appropriate place, and the kg->kg_last_assigned pointer is adjusted down
70e602ba25SJulian Elischer to point to it.
71e602ba25SJulian Elischer 
72e602ba25SJulian Elischer The following diagram shows 2 KSEs and 3 threads from a single process.
73e602ba25SJulian Elischer 
74e602ba25SJulian Elischer  RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
75e602ba25SJulian Elischer               \    \____
76e602ba25SJulian Elischer                \        \
77e602ba25SJulian Elischer     KSEGROUP---thread--thread--thread    (queued in priority order)
78e602ba25SJulian Elischer         \                 /
79e602ba25SJulian Elischer          \_______________/
80e602ba25SJulian Elischer           (last_assigned)
81e602ba25SJulian Elischer 
82e602ba25SJulian Elischer The result of this scheme is that the M available KSEs are always
83e602ba25SJulian Elischer queued at the priorities they have inherited from the M highest priority
84e602ba25SJulian Elischer threads for that KSEGROUP. If this situation changes, the KSEs are
85e602ba25SJulian Elischer reassigned to keep this true.
86677b542eSDavid E. O'Brien ***/
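/*
 * A concrete walk-through of the scheme above (illustrative only; the
 * numbers are invented for this example): suppose N = 2, one KSE is
 * running (X = 1), so M = 1 KSE sits on the system run queue, and the
 * ksegrp has runnable threads at priorities 80, 100 and 120 (lower is
 * better).  The queued KSE inherits priority 80 and kg_last_assigned
 * points at the priority-80 thread; the 100 and 120 threads wait with no
 * KSE assigned.  If a priority-70 thread now becomes runnable, it is
 * earlier in the ksegrp queue than the last assigned thread, so it steals
 * the KSE from the priority-80 thread and kg_last_assigned is backed up
 * to point at it.  A priority-90 thread, by contrast, would simply be
 * queued between 80 and 100 to wait for a KSE.
 */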
87e602ba25SJulian Elischer 
88677b542eSDavid E. O'Brien #include <sys/cdefs.h>
89677b542eSDavid E. O'Brien __FBSDID("$FreeBSD$");
90e602ba25SJulian Elischer 
916804a3abSJulian Elischer #include "opt_sched.h"
920c0b25aeSJohn Baldwin 
93ed062c8dSJulian Elischer #ifndef KERN_SWITCH_INCLUDE
94dba6c5a6SPeter Wemm #include <sys/param.h>
95dba6c5a6SPeter Wemm #include <sys/systm.h>
962d50560aSMarcel Moolenaar #include <sys/kdb.h>
97dba6c5a6SPeter Wemm #include <sys/kernel.h>
980384fff8SJason Evans #include <sys/ktr.h>
99f34fa851SJohn Baldwin #include <sys/lock.h>
10035e0e5b3SJohn Baldwin #include <sys/mutex.h>
101dba6c5a6SPeter Wemm #include <sys/proc.h>
102dba6c5a6SPeter Wemm #include <sys/queue.h>
103b43179fbSJeff Roberson #include <sys/sched.h>
104ed062c8dSJulian Elischer #else  /* KERN_SWITCH_INCLUDE */
1050d2a2989SPeter Wemm #if defined(SMP) && (defined(__i386__) || defined(__amd64__))
106cc66ebe2SPeter Wemm #include <sys/smp.h>
107cc66ebe2SPeter Wemm #endif
1086804a3abSJulian Elischer #if defined(SMP) && defined(SCHED_4BSD)
1096804a3abSJulian Elischer #include <sys/sysctl.h>
1106804a3abSJulian Elischer #endif
1116804a3abSJulian Elischer 
1129923b511SScott Long #ifdef FULL_PREEMPTION
1139923b511SScott Long #ifndef PREEMPTION
1149923b511SScott Long #error "The FULL_PREEMPTION option requires the PREEMPTION option"
1159923b511SScott Long #endif
1169923b511SScott Long #endif
117dba6c5a6SPeter Wemm 
118d2ac2316SJake Burkholder CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
119d2ac2316SJake Burkholder 
120ed062c8dSJulian Elischer #define td_kse td_sched
121ed062c8dSJulian Elischer 
1226220dcbaSRobert Watson /*
1236220dcbaSRobert Watson  * kern.sched.preemption allows user space to determine if preemption support
1246220dcbaSRobert Watson  * is compiled in or not.  It is not currently a boot or runtime flag that
1256220dcbaSRobert Watson  * can be changed.
1266220dcbaSRobert Watson  */
1276220dcbaSRobert Watson #ifdef PREEMPTION
1286220dcbaSRobert Watson static int kern_sched_preemption = 1;
1296220dcbaSRobert Watson #else
1306220dcbaSRobert Watson static int kern_sched_preemption = 0;
1316220dcbaSRobert Watson #endif
1326220dcbaSRobert Watson SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
1336220dcbaSRobert Watson     &kern_sched_preemption, 0, "Kernel preemption enabled");
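/*
 * Example (illustrative, user-space code rather than kernel code): the
 * flag above can be read with sysctl(8) as "sysctl kern.sched.preemption",
 * or programmatically via sysctlbyname(3):
 *
 *	int preemption;
 *	size_t len = sizeof(preemption);
 *
 *	if (sysctlbyname("kern.sched.preemption", &preemption, &len,
 *	    NULL, 0) == 0)
 *		printf("preemption compiled in: %d\n", preemption);
 */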
1346220dcbaSRobert Watson 
135e602ba25SJulian Elischer /************************************************************************
136e602ba25SJulian Elischer  * Functions that manipulate runnability from a thread perspective.	*
137e602ba25SJulian Elischer  ************************************************************************/
138e602ba25SJulian Elischer /*
1395215b187SJeff Roberson  * Select the KSE that will be run next.  From that find the thread, and
140e602ba25SJulian Elischer  * remove it from the KSEGRP's run queue.  If there is thread clustering,
141e602ba25SJulian Elischer  * this will be what does it.
142e602ba25SJulian Elischer  */
143b40ce416SJulian Elischer struct thread *
144b40ce416SJulian Elischer choosethread(void)
145dba6c5a6SPeter Wemm {
146e602ba25SJulian Elischer 	struct kse *ke;
147e602ba25SJulian Elischer 	struct thread *td;
148e602ba25SJulian Elischer 	struct ksegrp *kg;
149e602ba25SJulian Elischer 
1500d2a2989SPeter Wemm #if defined(SMP) && (defined(__i386__) || defined(__amd64__))
151cc66ebe2SPeter Wemm 	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
152cc66ebe2SPeter Wemm 		/* Shutting down, run idlethread on AP's */
153cc66ebe2SPeter Wemm 		td = PCPU_GET(idlethread);
154cc66ebe2SPeter Wemm 		ke = td->td_kse;
155cc66ebe2SPeter Wemm 		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
156cc66ebe2SPeter Wemm 		ke->ke_flags |= KEF_DIDRUN;
157cc66ebe2SPeter Wemm 		TD_SET_RUNNING(td);
158cc66ebe2SPeter Wemm 		return (td);
159cc66ebe2SPeter Wemm 	}
160cc66ebe2SPeter Wemm #endif
161cc66ebe2SPeter Wemm 
162fe799533SAndrew Gallatin retry:
163cc66ebe2SPeter Wemm 	ke = sched_choose();
164cc66ebe2SPeter Wemm 	if (ke) {
165e602ba25SJulian Elischer 		td = ke->ke_thread;
166e602ba25SJulian Elischer 		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
167e602ba25SJulian Elischer 		kg = ke->ke_ksegrp;
168ed062c8dSJulian Elischer 		if (td->td_proc->p_flag & P_HADTHREADS) {
16933c06e1dSJulian Elischer 			if (kg->kg_last_assigned == td) {
170e602ba25SJulian Elischer 				kg->kg_last_assigned = TAILQ_PREV(td,
171e602ba25SJulian Elischer 				    threadqueue, td_runq);
17233c06e1dSJulian Elischer 			}
173d03c79eeSDavid Xu 			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
1741a5cd27bSJulian Elischer 		}
175e602ba25SJulian Elischer 		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
176e602ba25SJulian Elischer 		    td, td->td_priority);
177e602ba25SJulian Elischer 	} else {
17840e55026SJulian Elischer 		/* Simulate runq_choose() having returned the idle thread */
179e602ba25SJulian Elischer 		td = PCPU_GET(idlethread);
180472be958SJulian Elischer 		ke = td->td_kse;
181e602ba25SJulian Elischer 		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
182e602ba25SJulian Elischer 	}
183472be958SJulian Elischer 	ke->ke_flags |= KEF_DIDRUN;
18493a7aa79SJulian Elischer 
18593a7aa79SJulian Elischer 	/*
186faaa20f6SJulian Elischer 	 * If we are in panic, only allow system threads,
187faaa20f6SJulian Elischer 	 * plus the one we are running in, to be run.
18893a7aa79SJulian Elischer 	 */
189fe799533SAndrew Gallatin 	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
190faaa20f6SJulian Elischer 	    (td->td_flags & TDF_INPANIC) == 0)) {
191faaa20f6SJulian Elischer 		/* note that it is no longer on the run queue */
192faaa20f6SJulian Elischer 		TD_SET_CAN_RUN(td);
193fe799533SAndrew Gallatin 		goto retry;
194faaa20f6SJulian Elischer 	}
19593a7aa79SJulian Elischer 
19671fad9fdSJulian Elischer 	TD_SET_RUNNING(td);
197e602ba25SJulian Elischer 	return (td);
198e602ba25SJulian Elischer }
199e602ba25SJulian Elischer 
200e602ba25SJulian Elischer /*
201ed062c8dSJulian Elischer  * Given a surplus system slot, try to assign a new runnable thread to it.
202ed062c8dSJulian Elischer  * Called from:
203ed062c8dSJulian Elischer  *  sched_thread_exit()  (local)
204ed062c8dSJulian Elischer  *  sched_switch()  (local)
20614f0e2e9SJulian Elischer  *  remrunqueue()  (local)  (not at the moment)
207e602ba25SJulian Elischer  */
208ed062c8dSJulian Elischer static void
209ed062c8dSJulian Elischer slot_fill(struct ksegrp *kg)
210e602ba25SJulian Elischer {
211e602ba25SJulian Elischer 	struct thread *td;
212e602ba25SJulian Elischer 
21333c06e1dSJulian Elischer 	mtx_assert(&sched_lock, MA_OWNED);
214ed062c8dSJulian Elischer 	while (kg->kg_avail_opennings > 0) {
215e602ba25SJulian Elischer 		/*
2166f8132a8SJulian Elischer 		 * Find the first unassigned thread
2176f8132a8SJulian Elischer 		 */
2185215b187SJeff Roberson 		if ((td = kg->kg_last_assigned) != NULL)
2196f8132a8SJulian Elischer 			td = TAILQ_NEXT(td, td_runq);
2205215b187SJeff Roberson 		else
2216f8132a8SJulian Elischer 			td = TAILQ_FIRST(&kg->kg_runq);
2226f8132a8SJulian Elischer 
2236f8132a8SJulian Elischer 		/*
224ed062c8dSJulian Elischer 		 * If we found one, send it to the system scheduler.
225e602ba25SJulian Elischer 		 */
226e602ba25SJulian Elischer 		if (td) {
227e602ba25SJulian Elischer 			kg->kg_last_assigned = td;
22884f9d4b1SStephan Uphoff 			sched_add(td, SRQ_YIELDING);
229ed062c8dSJulian Elischer 			CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
230ed062c8dSJulian Elischer 		} else {
231ed062c8dSJulian Elischer 			/* no threads to use up the slots. quit now */
232ed062c8dSJulian Elischer 			break;
23348bfcdddSJulian Elischer 		}
234ed062c8dSJulian Elischer 	}
235d5a08a60SJake Burkholder }
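/*
 * Example (illustrative): if a ksegrp with concurrency 2 has one thread
 * running, one thread blocked, and two more threads waiting on the ksegrp
 * run queue, then when the blocked thread gives up its slot (e.g. on the
 * sched_switch() path), slot_fill() advances kg_last_assigned to the first
 * unassigned waiter and hands that thread to sched_add(), preserving the
 * invariant that the highest priority waiters own the available slots.
 */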
236d5a08a60SJake Burkholder 
237e8807f22SJulian Elischer #ifdef	SCHED_4BSD
238e602ba25SJulian Elischer /*
239e602ba25SJulian Elischer  * Remove a thread from its KSEGRP's run queue.
240e602ba25SJulian Elischer  * This in turn may remove it from a KSE if it was already assigned
241e602ba25SJulian Elischer  * to one, possibly causing a new thread to be assigned to the KSE
2425215b187SJeff Roberson  * and the KSE getting a new priority.
243e602ba25SJulian Elischer  */
2441f955e2dSJulian Elischer static void
245b40ce416SJulian Elischer remrunqueue(struct thread *td)
246d5a08a60SJake Burkholder {
24748bfcdddSJulian Elischer 	struct thread *td2, *td3;
248e602ba25SJulian Elischer 	struct ksegrp *kg;
249e602ba25SJulian Elischer 	struct kse *ke;
250e602ba25SJulian Elischer 
251e602ba25SJulian Elischer 	mtx_assert(&sched_lock, MA_OWNED);
25271fad9fdSJulian Elischer 	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
253e602ba25SJulian Elischer 	kg = td->td_ksegrp;
254e602ba25SJulian Elischer 	ke = td->td_kse;
255e602ba25SJulian Elischer 	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
25671fad9fdSJulian Elischer 	TD_SET_CAN_RUN(td);
2575215b187SJeff Roberson 	/*
2585215b187SJeff Roberson 	 * If it is not a threaded process, take the shortcut.
2595215b187SJeff Roberson 	 */
260ed062c8dSJulian Elischer 	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
2613389af30SJulian Elischer 		/* remove from sys run queue and free up a slot */
2627cf90fb3SJeff Roberson 		sched_rem(td);
263c3b98db0SJulian Elischer 		ke->ke_state = KES_THREAD;
264e602ba25SJulian Elischer 		return;
265d5a08a60SJake Burkholder 	}
26648bfcdddSJulian Elischer    	td3 = TAILQ_PREV(td, threadqueue, td_runq);
26748bfcdddSJulian Elischer 	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
268ed062c8dSJulian Elischer 	if (ke->ke_state == KES_ONRUNQ) {
269e602ba25SJulian Elischer 		/*
2703389af30SJulian Elischer 		 * This thread has been assigned to the system run queue.
271e602ba25SJulian Elischer 		 * We need to dissociate it and try to assign the
272e602ba25SJulian Elischer 		 * KSE to the next available thread. Then, we should
273e602ba25SJulian Elischer 		 * see if we need to move the KSE in the run queues.
274e602ba25SJulian Elischer 		 */
2757cf90fb3SJeff Roberson 		sched_rem(td);
27693a7aa79SJulian Elischer 		ke->ke_state = KES_THREAD;
277e602ba25SJulian Elischer 		td2 = kg->kg_last_assigned;
278e602ba25SJulian Elischer 		KASSERT((td2 != NULL), ("last assigned has wrong value"));
27948bfcdddSJulian Elischer 		if (td2 == td)
280e602ba25SJulian Elischer 			kg->kg_last_assigned = td3;
2813389af30SJulian Elischer 		/* slot_fill(kg); */ /* will replace it with another */
282e602ba25SJulian Elischer 	}
283e602ba25SJulian Elischer }
284e8807f22SJulian Elischer #endif
2851f955e2dSJulian Elischer 
2861f955e2dSJulian Elischer /*
2871f955e2dSJulian Elischer  * Change the priority of a thread that is on the run queue.
2881f955e2dSJulian Elischer  */
2891f955e2dSJulian Elischer void
2901f955e2dSJulian Elischer adjustrunqueue( struct thread *td, int newpri)
2911f955e2dSJulian Elischer {
2921f955e2dSJulian Elischer 	struct ksegrp *kg;
2931f955e2dSJulian Elischer 	struct kse *ke;
2941f955e2dSJulian Elischer 
2951f955e2dSJulian Elischer 	mtx_assert(&sched_lock, MA_OWNED);
2961f955e2dSJulian Elischer 	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));
2975215b187SJeff Roberson 
2981f955e2dSJulian Elischer 	ke = td->td_kse;
2991f955e2dSJulian Elischer 	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
3005215b187SJeff Roberson 	/*
3015215b187SJeff Roberson 	 * If it is not a threaded process, take the shortcut.
3025215b187SJeff Roberson 	 */
303ed062c8dSJulian Elischer 	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
3041f955e2dSJulian Elischer 		/* We only care about the kse in the run queue. */
30524c5baaeSJulian Elischer 		td->td_priority = newpri;
3061f955e2dSJulian Elischer 		if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
3077cf90fb3SJeff Roberson 			sched_rem(td);
3082630e4c9SJulian Elischer 			sched_add(td, SRQ_BORING);
3091f955e2dSJulian Elischer 		}
3101f955e2dSJulian Elischer 		return;
3111f955e2dSJulian Elischer 	}
3125215b187SJeff Roberson 
3135215b187SJeff Roberson 	/* It is a threaded process */
3141f955e2dSJulian Elischer 	kg = td->td_ksegrp;
315ed062c8dSJulian Elischer 	if (ke->ke_state == KES_ONRUNQ) {
3161f955e2dSJulian Elischer 		if (kg->kg_last_assigned == td) {
3171f955e2dSJulian Elischer 			kg->kg_last_assigned =
3181f955e2dSJulian Elischer 			    TAILQ_PREV(td, threadqueue, td_runq);
3191f955e2dSJulian Elischer 		}
3207cf90fb3SJeff Roberson 		sched_rem(td);
3211f955e2dSJulian Elischer 	}
3221f955e2dSJulian Elischer 	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
32314f0e2e9SJulian Elischer 	TD_SET_CAN_RUN(td);
3241f955e2dSJulian Elischer 	td->td_priority = newpri;
3252630e4c9SJulian Elischer 	setrunqueue(td, SRQ_BORING);
3261f955e2dSJulian Elischer }
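/*
 * Note on the unthreaded shortcut in adjustrunqueue() above: the KSE only
 * has to be moved on the system run queue when the new priority maps to a
 * different queue index.  For example (assuming RQ_PPQ is 4, its usual
 * value), changing a thread's priority from 130 to 129 leaves it in queue
 * 32, so only td_priority is updated, while changing it from 130 to 120
 * moves it from queue 32 to queue 30 and needs sched_rem()/sched_add().
 */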
32784f9d4b1SStephan Uphoff 
32884f9d4b1SStephan Uphoff /*
32984f9d4b1SStephan Uphoff  * This function is called when a thread is about to be put on a
33084f9d4b1SStephan Uphoff  * ksegrp run queue because it has been made runnable or its
33184f9d4b1SStephan Uphoff  * priority has been adjusted and the ksegrp does not have a
33284f9d4b1SStephan Uphoff  * free kse slot.  It determines if a thread from the same ksegrp
33384f9d4b1SStephan Uphoff  * should be preempted.  If so, it tries to switch threads
33484f9d4b1SStephan Uphoff  * if the thread is on the same cpu or notifies another cpu that
33584f9d4b1SStephan Uphoff  * it should switch threads.
33684f9d4b1SStephan Uphoff  */
33784f9d4b1SStephan Uphoff 
33884f9d4b1SStephan Uphoff static void
33984f9d4b1SStephan Uphoff maybe_preempt_in_ksegrp(struct thread *td)
3407c71b645SStephan Uphoff #if  !defined(SMP)
34184f9d4b1SStephan Uphoff {
34213e7430fSPoul-Henning Kamp 	struct thread *running_thread;
3437c71b645SStephan Uphoff 
3447c71b645SStephan Uphoff #ifndef FULL_PREEMPTION
3457c71b645SStephan Uphoff 	int pri;
3467c71b645SStephan Uphoff 	pri = td->td_priority;
3477c71b645SStephan Uphoff 	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD))
3487c71b645SStephan Uphoff 		return;
3497c71b645SStephan Uphoff #endif
3507c71b645SStephan Uphoff 	mtx_assert(&sched_lock, MA_OWNED);
3517c71b645SStephan Uphoff 	running_thread = curthread;
3527c71b645SStephan Uphoff 
3537c71b645SStephan Uphoff 	if (running_thread->td_ksegrp != td->td_ksegrp)
3547c71b645SStephan Uphoff 		return;
3557c71b645SStephan Uphoff 
3567c71b645SStephan Uphoff 	if (td->td_priority > running_thread->td_priority)
3577c71b645SStephan Uphoff 		return;
3587c71b645SStephan Uphoff #ifdef PREEMPTION
3597c71b645SStephan Uphoff 	if (running_thread->td_critnest > 1)
36077918643SStephan Uphoff 		running_thread->td_owepreempt = 1;
3617c71b645SStephan Uphoff 	 else
3627c71b645SStephan Uphoff 		 mi_switch(SW_INVOL, NULL);
3637c71b645SStephan Uphoff 
3647c71b645SStephan Uphoff #else
3657c71b645SStephan Uphoff 	running_thread->td_flags |= TDF_NEEDRESCHED;
3667c71b645SStephan Uphoff #endif
3677c71b645SStephan Uphoff 	return;
3687c71b645SStephan Uphoff }
3697c71b645SStephan Uphoff 
3707c71b645SStephan Uphoff #else /* SMP */
3717c71b645SStephan Uphoff {
3727c71b645SStephan Uphoff 	struct thread *running_thread;
37384f9d4b1SStephan Uphoff 	int worst_pri;
37484f9d4b1SStephan Uphoff 	struct ksegrp *kg;
37584f9d4b1SStephan Uphoff 	cpumask_t cpumask,dontuse;
37684f9d4b1SStephan Uphoff 	struct pcpu *pc;
37784f9d4b1SStephan Uphoff 	struct pcpu *best_pcpu;
37884f9d4b1SStephan Uphoff 	struct thread *cputhread;
37984f9d4b1SStephan Uphoff 
38084f9d4b1SStephan Uphoff #ifndef FULL_PREEMPTION
38184f9d4b1SStephan Uphoff 	int pri;
38284f9d4b1SStephan Uphoff 	pri = td->td_priority;
38384f9d4b1SStephan Uphoff 	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD))
38484f9d4b1SStephan Uphoff 		return;
38584f9d4b1SStephan Uphoff #endif
38684f9d4b1SStephan Uphoff 
38784f9d4b1SStephan Uphoff 	mtx_assert(&sched_lock, MA_OWNED);
38884f9d4b1SStephan Uphoff 
38984f9d4b1SStephan Uphoff 	running_thread = curthread;
39084f9d4b1SStephan Uphoff 
39184f9d4b1SStephan Uphoff #if !defined(KSEG_PEEMPT_BEST_CPU)
39284f9d4b1SStephan Uphoff 	if (running_thread->td_ksegrp != td->td_ksegrp) {
39384f9d4b1SStephan Uphoff #endif
39484f9d4b1SStephan Uphoff 		kg = td->td_ksegrp;
39584f9d4b1SStephan Uphoff 
39684f9d4b1SStephan Uphoff 		/* if someone is ahead of this thread, wait our turn */
39784f9d4b1SStephan Uphoff 		if (td != TAILQ_FIRST(&kg->kg_runq))
39884f9d4b1SStephan Uphoff 			return;
39984f9d4b1SStephan Uphoff 
40084f9d4b1SStephan Uphoff 		worst_pri = td->td_priority;
40184f9d4b1SStephan Uphoff 		best_pcpu = NULL;
40284f9d4b1SStephan Uphoff 		dontuse   = stopped_cpus | idle_cpus_mask;
40384f9d4b1SStephan Uphoff 
40484f9d4b1SStephan Uphoff 		/*
40584f9d4b1SStephan Uphoff 		 * Find the cpu with the worst priority that is running a thread
40684f9d4b1SStephan Uphoff 		 * from the same ksegrp - if multiple exist, prefer first the
40784f9d4b1SStephan Uphoff 		 * thread's last-run cpu and then the current cpu.
40884f9d4b1SStephan Uphoff 		 */
40984f9d4b1SStephan Uphoff 
41084f9d4b1SStephan Uphoff 		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
41184f9d4b1SStephan Uphoff 			cpumask   = pc->pc_cpumask;
41284f9d4b1SStephan Uphoff 			cputhread = pc->pc_curthread;
41384f9d4b1SStephan Uphoff 
41484f9d4b1SStephan Uphoff 			if ((cpumask & dontuse)  ||
41584f9d4b1SStephan Uphoff 			    cputhread->td_ksegrp != kg)
41684f9d4b1SStephan Uphoff 				continue;
41784f9d4b1SStephan Uphoff 
41884f9d4b1SStephan Uphoff 			if (cputhread->td_priority > worst_pri) {
41984f9d4b1SStephan Uphoff 				worst_pri = cputhread->td_priority;
42084f9d4b1SStephan Uphoff 				best_pcpu = pc;
42184f9d4b1SStephan Uphoff 				continue;
42284f9d4b1SStephan Uphoff 			}
42384f9d4b1SStephan Uphoff 
42484f9d4b1SStephan Uphoff 			if (cputhread->td_priority == worst_pri &&
42584f9d4b1SStephan Uphoff 			    best_pcpu != NULL &&
42684f9d4b1SStephan Uphoff 			    (td->td_lastcpu == pc->pc_cpuid ||
42784f9d4b1SStephan Uphoff 				(PCPU_GET(cpumask) == cpumask &&
42884f9d4b1SStephan Uphoff 				    td->td_lastcpu != best_pcpu->pc_cpuid)))
42984f9d4b1SStephan Uphoff 			    best_pcpu = pc;
43084f9d4b1SStephan Uphoff 		}
43184f9d4b1SStephan Uphoff 
43284f9d4b1SStephan Uphoff 		/* Check if we need to preempt someone */
43384f9d4b1SStephan Uphoff 		if (best_pcpu == NULL)
43484f9d4b1SStephan Uphoff 			return;
43584f9d4b1SStephan Uphoff 
43684f9d4b1SStephan Uphoff 		if (PCPU_GET(cpuid) != best_pcpu->pc_cpuid) {
43784f9d4b1SStephan Uphoff 			best_pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
43884f9d4b1SStephan Uphoff 			ipi_selected(best_pcpu->pc_cpumask, IPI_AST);
43984f9d4b1SStephan Uphoff 			return;
44084f9d4b1SStephan Uphoff 		}
44184f9d4b1SStephan Uphoff #if !defined(KSEG_PEEMPT_BEST_CPU)
44284f9d4b1SStephan Uphoff 	}
44384f9d4b1SStephan Uphoff #endif
44484f9d4b1SStephan Uphoff 
44584f9d4b1SStephan Uphoff 	if (td->td_priority > running_thread->td_priority)
44684f9d4b1SStephan Uphoff 		return;
44784f9d4b1SStephan Uphoff #ifdef PREEMPTION
44884f9d4b1SStephan Uphoff 	if (running_thread->td_critnest > 1)
44977918643SStephan Uphoff 		running_thread->td_owepreempt = 1;
45084f9d4b1SStephan Uphoff 	 else
45184f9d4b1SStephan Uphoff 		 mi_switch(SW_INVOL, NULL);
45284f9d4b1SStephan Uphoff 
45384f9d4b1SStephan Uphoff #else
45484f9d4b1SStephan Uphoff 	running_thread->td_flags |= TDF_NEEDRESCHED;
45584f9d4b1SStephan Uphoff #endif
45684f9d4b1SStephan Uphoff 	return;
45784f9d4b1SStephan Uphoff }
4587c71b645SStephan Uphoff #endif /* !SMP */
4597c71b645SStephan Uphoff 
46084f9d4b1SStephan Uphoff 
461ed062c8dSJulian Elischer int limitcount;
462d5a08a60SJake Burkholder void
4632630e4c9SJulian Elischer setrunqueue(struct thread *td, int flags)
464d5a08a60SJake Burkholder {
465e602ba25SJulian Elischer 	struct ksegrp *kg;
466e602ba25SJulian Elischer 	struct thread *td2;
467e602ba25SJulian Elischer 	struct thread *tda;
468e602ba25SJulian Elischer 
469ed062c8dSJulian Elischer 	CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
470ed062c8dSJulian Elischer 	    td, td->td_ksegrp, td->td_proc->p_pid);
47185da7a56SJeff Roberson 	CTR5(KTR_SCHED, "setrunqueue: %p(%s) prio %d by %p(%s)",
47285da7a56SJeff Roberson             td, td->td_proc->p_comm, td->td_priority, curthread,
47385da7a56SJeff Roberson             curthread->td_proc->p_comm);
474e602ba25SJulian Elischer 	mtx_assert(&sched_lock, MA_OWNED);
475b2578c6cSJulian Elischer 	KASSERT((td->td_inhibitors == 0),
476b2578c6cSJulian Elischer 			("setrunqueue: trying to run inhibited thread"));
47771fad9fdSJulian Elischer 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
47871fad9fdSJulian Elischer 	    ("setrunqueue: bad thread state"));
47971fad9fdSJulian Elischer 	TD_SET_RUNQ(td);
480e602ba25SJulian Elischer 	kg = td->td_ksegrp;
481ed062c8dSJulian Elischer 	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
48248bfcdddSJulian Elischer 		/*
48348bfcdddSJulian Elischer 		 * Common path optimisation: Only one of everything
48448bfcdddSJulian Elischer 		 * and the KSE is always already attached.
48548bfcdddSJulian Elischer 		 * Totally ignore the ksegrp run queue.
48648bfcdddSJulian Elischer 		 */
487ed062c8dSJulian Elischer 		if (kg->kg_avail_opennings != 1) {
48854983505SJulian Elischer 			if (limitcount < 1) {
489ed062c8dSJulian Elischer 				limitcount++;
49054983505SJulian Elischer 				printf("pid %d: corrected slot count (%d->1)\n",
491ed062c8dSJulian Elischer 				    td->td_proc->p_pid, kg->kg_avail_opennings);
492ed062c8dSJulian Elischer 
493ed062c8dSJulian Elischer 			}
494ed062c8dSJulian Elischer 			kg->kg_avail_opennings = 1;
495ed062c8dSJulian Elischer 		}
4962630e4c9SJulian Elischer 		sched_add(td, flags);
49748bfcdddSJulian Elischer 		return;
49848bfcdddSJulian Elischer 	}
49948bfcdddSJulian Elischer 
50014f0e2e9SJulian Elischer 	/*
50114f0e2e9SJulian Elischer 	 * If the concurrency has been reduced, and we would land in the
50214f0e2e9SJulian Elischer 	 * assigned section, then keep removing entries from the
50314f0e2e9SJulian Elischer 	 * system run queue until we are no longer in that section
50414f0e2e9SJulian Elischer 	 * or there is room for us to be put in that section.
50514f0e2e9SJulian Elischer 	 * What we MUST avoid is the case where threads of lower
50614f0e2e9SJulian Elischer 	 * priority than the new one are scheduled, but the new one can not
50714f0e2e9SJulian Elischer 	 * be scheduled itself. That would lead to a non-contiguous set
50814f0e2e9SJulian Elischer 	 * of scheduled threads, and everything would break.
50914f0e2e9SJulian Elischer 	 */
510e602ba25SJulian Elischer 	tda = kg->kg_last_assigned;
51114f0e2e9SJulian Elischer 	while ((kg->kg_avail_opennings <= 0) &&
512ed062c8dSJulian Elischer 	    (tda && (tda->td_priority > td->td_priority))) {
513e602ba25SJulian Elischer 		/*
514e602ba25SJulian Elischer 		 * None free, but there is one we can commandeer.
515e602ba25SJulian Elischer 		 */
516ed062c8dSJulian Elischer 		CTR2(KTR_RUNQ,
517ed062c8dSJulian Elischer 		    "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
51894816f6dSJeff Roberson 		sched_rem(tda);
519e602ba25SJulian Elischer 		tda = kg->kg_last_assigned =
520e602ba25SJulian Elischer 		    TAILQ_PREV(tda, threadqueue, td_runq);
521d5a08a60SJake Burkholder 	}
522d5a08a60SJake Burkholder 
523e602ba25SJulian Elischer 	/*
524e602ba25SJulian Elischer 	 * Add the thread to the ksegrp's run queue at
525e602ba25SJulian Elischer 	 * the appropriate place.
526e602ba25SJulian Elischer 	 */
527e602ba25SJulian Elischer 	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
528e602ba25SJulian Elischer 		if (td2->td_priority > td->td_priority) {
529e602ba25SJulian Elischer 			TAILQ_INSERT_BEFORE(td2, td, td_runq);
530e602ba25SJulian Elischer 			break;
531e602ba25SJulian Elischer 		}
532e602ba25SJulian Elischer 	}
533e602ba25SJulian Elischer 	if (td2 == NULL) {
534e602ba25SJulian Elischer 		/* We ran off the end of the TAILQ or it was empty. */
535e602ba25SJulian Elischer 		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
536e602ba25SJulian Elischer 	}
537e602ba25SJulian Elischer 
538e602ba25SJulian Elischer 	/*
539ed062c8dSJulian Elischer 	 * If we have a slot to use, then put the thread on the system
540ed062c8dSJulian Elischer 	 * run queue and if needed, readjust the last_assigned pointer.
54114f0e2e9SJulian Elischer 	 * It may be that we need to schedule something anyhow,
54214f0e2e9SJulian Elischer 	 * even if the available slots are negative, so that
54314f0e2e9SJulian Elischer 	 * all the items < last_assigned are scheduled.
544e602ba25SJulian Elischer 	 */
545ed062c8dSJulian Elischer 	if (kg->kg_avail_opennings > 0) {
546e602ba25SJulian Elischer 		if (tda == NULL) {
547e602ba25SJulian Elischer 			/*
548e602ba25SJulian Elischer 			 * No pre-existing last assigned so whoever is first
54914f0e2e9SJulian Elischer 			 * gets the slot.. (maybe us)
550e602ba25SJulian Elischer 			 */
551e602ba25SJulian Elischer 			td2 = TAILQ_FIRST(&kg->kg_runq);
552e602ba25SJulian Elischer 			kg->kg_last_assigned = td2;
553e602ba25SJulian Elischer 		} else if (tda->td_priority > td->td_priority) {
554ed062c8dSJulian Elischer 			td2 = td;
555e602ba25SJulian Elischer 		} else {
556e602ba25SJulian Elischer 			/*
557e602ba25SJulian Elischer 			 * We are past last_assigned, so
55814f0e2e9SJulian Elischer 			 * give the next slot to whatever is next,
559e602ba25SJulian Elischer 			 * which may or may not be us.
560e602ba25SJulian Elischer 			 */
561e602ba25SJulian Elischer 			td2 = TAILQ_NEXT(tda, td_runq);
562e602ba25SJulian Elischer 			kg->kg_last_assigned = td2;
563e602ba25SJulian Elischer 		}
564ed062c8dSJulian Elischer 		sched_add(td2, flags);
565732d9528SJulian Elischer 	} else {
566732d9528SJulian Elischer 		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
567732d9528SJulian Elischer 			td, td->td_ksegrp, td->td_proc->p_pid);
56884f9d4b1SStephan Uphoff 		if ((flags & SRQ_YIELDING) == 0)
56984f9d4b1SStephan Uphoff 			maybe_preempt_in_ksegrp(td);
570e602ba25SJulian Elischer 	}
571e602ba25SJulian Elischer }
572e602ba25SJulian Elischer 
5730c0b25aeSJohn Baldwin /*
5740c0b25aeSJohn Baldwin  * Kernel thread preemption implementation.  Critical sections mark
5750c0b25aeSJohn Baldwin  * regions of code in which preemptions are not allowed.
5760c0b25aeSJohn Baldwin  */
5777e1f6dfeSJohn Baldwin void
5787e1f6dfeSJohn Baldwin critical_enter(void)
5797e1f6dfeSJohn Baldwin {
5807e1f6dfeSJohn Baldwin 	struct thread *td;
5817e1f6dfeSJohn Baldwin 
5827e1f6dfeSJohn Baldwin 	td = curthread;
5837e1f6dfeSJohn Baldwin 	td->td_critnest++;
584f42a43faSRobert Watson 	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
585f42a43faSRobert Watson 	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
5867e1f6dfeSJohn Baldwin }
5877e1f6dfeSJohn Baldwin 
5887e1f6dfeSJohn Baldwin void
5897e1f6dfeSJohn Baldwin critical_exit(void)
5907e1f6dfeSJohn Baldwin {
5917e1f6dfeSJohn Baldwin 	struct thread *td;
5927e1f6dfeSJohn Baldwin 
5937e1f6dfeSJohn Baldwin 	td = curthread;
594b209e5e3SJeff Roberson 	KASSERT(td->td_critnest != 0,
595b209e5e3SJeff Roberson 	    ("critical_exit: td_critnest == 0"));
5967e1f6dfeSJohn Baldwin 	if (td->td_critnest == 1) {
597b96741f4SScott Long 		if (td->td_pflags & TDP_WAKEPROC0) {
598b96741f4SScott Long 			td->td_pflags &= ~TDP_WAKEPROC0;
599b96741f4SScott Long 			wakeup(&proc0);
600b96741f4SScott Long 		}
60177918643SStephan Uphoff 
60277918643SStephan Uphoff 		td->td_critnest = 0;
60377918643SStephan Uphoff 
6040c0b25aeSJohn Baldwin #ifdef PREEMPTION
60552eb8464SJohn Baldwin 		mtx_assert(&sched_lock, MA_NOTOWNED);
60677918643SStephan Uphoff 		if (td->td_owepreempt) {
60777918643SStephan Uphoff 			td->td_critnest = 1;
6080c0b25aeSJohn Baldwin 			mtx_lock_spin(&sched_lock);
60977918643SStephan Uphoff 			td->td_critnest--;
6100c0b25aeSJohn Baldwin 			mi_switch(SW_INVOL, NULL);
6110c0b25aeSJohn Baldwin 			mtx_unlock_spin(&sched_lock);
6120c0b25aeSJohn Baldwin 		}
61377918643SStephan Uphoff 
6140c0b25aeSJohn Baldwin #endif
61577918643SStephan Uphoff 
616d74ac681SMatthew Dillon 	} else {
6177e1f6dfeSJohn Baldwin 		td->td_critnest--;
6187e1f6dfeSJohn Baldwin 	}
619f42a43faSRobert Watson 	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
620f42a43faSRobert Watson 	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
621d74ac681SMatthew Dillon }
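/*
 * Typical usage of the pair above (an illustrative sketch): a critical
 * section defers preemption of the current thread, and with it any
 * migration to another CPU, so it is commonly wrapped around short
 * accesses to per-CPU state:
 *
 *	critical_enter();
 *	... touch PCPU data or other state that must not be preempted ...
 *	critical_exit();
 *
 * Sections nest via td_critnest; a preemption requested while nested is
 * recorded in td_owepreempt and performed by the outermost critical_exit().
 */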
6227e1f6dfeSJohn Baldwin 
6230c0b25aeSJohn Baldwin /*
6240c0b25aeSJohn Baldwin  * This function is called when a thread is about to be put on run queue
6250c0b25aeSJohn Baldwin  * because it has been made runnable or its priority has been adjusted.  It
6260c0b25aeSJohn Baldwin  * determines if the new thread should immediately preempt the current one.
6270c0b25aeSJohn Baldwin  * If so, it switches to it and eventually returns true.  If not, it returns false
6280c0b25aeSJohn Baldwin  * so that the caller may place the thread on an appropriate run queue.
6290c0b25aeSJohn Baldwin  */
6300c0b25aeSJohn Baldwin int
6310c0b25aeSJohn Baldwin maybe_preempt(struct thread *td)
6320c0b25aeSJohn Baldwin {
6338b44a2e2SMarcel Moolenaar #ifdef PREEMPTION
6340c0b25aeSJohn Baldwin 	struct thread *ctd;
6350c0b25aeSJohn Baldwin 	int cpri, pri;
6368b44a2e2SMarcel Moolenaar #endif
6370c0b25aeSJohn Baldwin 
6380c0b25aeSJohn Baldwin 	mtx_assert(&sched_lock, MA_OWNED);
6390c0b25aeSJohn Baldwin #ifdef PREEMPTION
6400c0b25aeSJohn Baldwin 	/*
6410c0b25aeSJohn Baldwin 	 * The new thread should not preempt the current thread if any of the
6420c0b25aeSJohn Baldwin 	 * following conditions are true:
6430c0b25aeSJohn Baldwin 	 *
644bc608306SRobert Watson 	 *  - The kernel is in the throes of crashing (panicstr).
64552eb8464SJohn Baldwin 	 *  - The current thread has a higher (numerically lower) or
64652eb8464SJohn Baldwin 	 *    equivalent priority.  Note that this prevents curthread from
64752eb8464SJohn Baldwin 	 *    trying to preempt to itself.
6480c0b25aeSJohn Baldwin 	 *  - It is too early in the boot for context switches (cold is set).
6490c0b25aeSJohn Baldwin 	 *  - The current thread has an inhibitor set or is in the process of
6500c0b25aeSJohn Baldwin 	 *    exiting.  In this case, the current thread is about to switch
6510c0b25aeSJohn Baldwin 	 *    out anyways, so there's no point in preempting.  If we did,
6520c0b25aeSJohn Baldwin 	 *    the current thread would not be properly resumed as well, so
6530c0b25aeSJohn Baldwin 	 *    just avoid that whole landmine.
6540c0b25aeSJohn Baldwin 	 *  - If the new thread's priority is not a realtime priority and
6550c0b25aeSJohn Baldwin 	 *    the current thread's priority is not an idle priority and
6560c0b25aeSJohn Baldwin 	 *    FULL_PREEMPTION is disabled.
6570c0b25aeSJohn Baldwin 	 *
6580c0b25aeSJohn Baldwin 	 * If all of these conditions are false, but the current thread is in
6590c0b25aeSJohn Baldwin 	 * a nested critical section, then we have to defer the preemption
6600c0b25aeSJohn Baldwin 	 * until we exit the critical section.  Otherwise, switch immediately
6610c0b25aeSJohn Baldwin 	 * to the new thread.
6620c0b25aeSJohn Baldwin 	 */
6630c0b25aeSJohn Baldwin 	ctd = curthread;
6646a574b2aSJulian Elischer 	KASSERT ((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd),
6656a574b2aSJulian Elischer 	  ("thread has no (or wrong) sched-private part."));
666b2578c6cSJulian Elischer 	KASSERT((td->td_inhibitors == 0),
667b2578c6cSJulian Elischer 			("maybe_preempt: trying to run inhibited thread"));
6680c0b25aeSJohn Baldwin 	pri = td->td_priority;
6690c0b25aeSJohn Baldwin 	cpri = ctd->td_priority;
670bc608306SRobert Watson 	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
671bc608306SRobert Watson 	    TD_IS_INHIBITED(ctd) || td->td_kse->ke_state != KES_THREAD)
6720c0b25aeSJohn Baldwin 		return (0);
6730c0b25aeSJohn Baldwin #ifndef FULL_PREEMPTION
6740c0b25aeSJohn Baldwin 	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD) &&
6750c0b25aeSJohn Baldwin 	    !(cpri >= PRI_MIN_IDLE))
6760c0b25aeSJohn Baldwin 		return (0);
6770c0b25aeSJohn Baldwin #endif
6780c0b25aeSJohn Baldwin 	if (ctd->td_critnest > 1) {
6790c0b25aeSJohn Baldwin 		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
6800c0b25aeSJohn Baldwin 		    ctd->td_critnest);
68177918643SStephan Uphoff 		ctd->td_owepreempt = 1;
6820c0b25aeSJohn Baldwin 		return (0);
6830c0b25aeSJohn Baldwin 	}
6840c0b25aeSJohn Baldwin 
6850c0b25aeSJohn Baldwin 	/*
686c20c691bSJulian Elischer 	 * Thread is runnable but not yet put on system run queue.
6870c0b25aeSJohn Baldwin 	 */
6880c0b25aeSJohn Baldwin 	MPASS(TD_ON_RUNQ(td));
6891f9f5df6SJulian Elischer 	MPASS(td->td_sched->ke_state != KES_ONRUNQ);
6901f9f5df6SJulian Elischer 	if (td->td_proc->p_flag & P_HADTHREADS) {
6911f9f5df6SJulian Elischer 		/*
6921f9f5df6SJulian Elischer 		 * If this is a threaded process we actually ARE on the
6931f9f5df6SJulian Elischer 		 * ksegrp run queue so take it off that first.
6949da3e923SJulian Elischer 		 * Also undo any damage done to the last_assigned pointer.
6959da3e923SJulian Elischer 		 * XXX Fix setrunqueue so this isn't needed
6961f9f5df6SJulian Elischer 		 */
6979da3e923SJulian Elischer 		struct ksegrp *kg;
6989da3e923SJulian Elischer 
6999da3e923SJulian Elischer 		kg = td->td_ksegrp;
7009da3e923SJulian Elischer 		if (kg->kg_last_assigned == td)
7019da3e923SJulian Elischer 			kg->kg_last_assigned =
7029da3e923SJulian Elischer 			    TAILQ_PREV(td, threadqueue, td_runq);
7039da3e923SJulian Elischer 		TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
7041f9f5df6SJulian Elischer 	}
7051f9f5df6SJulian Elischer 
7060c0b25aeSJohn Baldwin 	TD_SET_RUNNING(td);
7070c0b25aeSJohn Baldwin 	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
7080c0b25aeSJohn Baldwin 	    td->td_proc->p_pid, td->td_proc->p_comm);
709c20c691bSJulian Elischer 	mi_switch(SW_INVOL|SW_PREEMPT, td);
7100c0b25aeSJohn Baldwin 	return (1);
7110c0b25aeSJohn Baldwin #else
7120c0b25aeSJohn Baldwin 	return (0);
7130c0b25aeSJohn Baldwin #endif
7140c0b25aeSJohn Baldwin }
7150c0b25aeSJohn Baldwin 
71644fe3c1fSJohn Baldwin #if 0
7170c0b25aeSJohn Baldwin #ifndef PREEMPTION
7180c0b25aeSJohn Baldwin /* XXX: There should be a non-static version of this. */
7190c0b25aeSJohn Baldwin static void
7200c0b25aeSJohn Baldwin printf_caddr_t(void *data)
7210c0b25aeSJohn Baldwin {
7220c0b25aeSJohn Baldwin 	printf("%s", (char *)data);
7230c0b25aeSJohn Baldwin }
7240c0b25aeSJohn Baldwin static char preempt_warning[] =
7250c0b25aeSJohn Baldwin     "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
7260c0b25aeSJohn Baldwin SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
7270c0b25aeSJohn Baldwin     preempt_warning)
7280c0b25aeSJohn Baldwin #endif
72944fe3c1fSJohn Baldwin #endif
730e602ba25SJulian Elischer 
731e602ba25SJulian Elischer /************************************************************************
732e602ba25SJulian Elischer  * SYSTEM RUN QUEUE manipulations and tests				*
733e602ba25SJulian Elischer  ************************************************************************/
734e602ba25SJulian Elischer /*
735e602ba25SJulian Elischer  * Initialize a run structure.
736e602ba25SJulian Elischer  */
737e602ba25SJulian Elischer void
738e602ba25SJulian Elischer runq_init(struct runq *rq)
739e602ba25SJulian Elischer {
740e602ba25SJulian Elischer 	int i;
741e602ba25SJulian Elischer 
742e602ba25SJulian Elischer 	bzero(rq, sizeof *rq);
743e602ba25SJulian Elischer 	for (i = 0; i < RQ_NQS; i++)
744e602ba25SJulian Elischer 		TAILQ_INIT(&rq->rq_queues[i]);
745e602ba25SJulian Elischer }
746e602ba25SJulian Elischer 
747d5a08a60SJake Burkholder /*
748d5a08a60SJake Burkholder  * Clear the status bit of the queue corresponding to priority level pri,
749d5a08a60SJake Burkholder  * indicating that it is empty.
750d5a08a60SJake Burkholder  */
751d5a08a60SJake Burkholder static __inline void
752d5a08a60SJake Burkholder runq_clrbit(struct runq *rq, int pri)
753d5a08a60SJake Burkholder {
754d5a08a60SJake Burkholder 	struct rqbits *rqb;
755d5a08a60SJake Burkholder 
756d5a08a60SJake Burkholder 	rqb = &rq->rq_status;
757d5a08a60SJake Burkholder 	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
758d5a08a60SJake Burkholder 	    rqb->rqb_bits[RQB_WORD(pri)],
759d5a08a60SJake Burkholder 	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
760d5a08a60SJake Burkholder 	    RQB_BIT(pri), RQB_WORD(pri));
761d5a08a60SJake Burkholder 	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
762d5a08a60SJake Burkholder }
763d5a08a60SJake Burkholder 
764d5a08a60SJake Burkholder /*
765d5a08a60SJake Burkholder  * Find the index of the first non-empty run queue.  This is done by
766d5a08a60SJake Burkholder  * scanning the status bits, a set bit indicates a non-empty queue.
767d5a08a60SJake Burkholder  */
768d5a08a60SJake Burkholder static __inline int
769d5a08a60SJake Burkholder runq_findbit(struct runq *rq)
770d5a08a60SJake Burkholder {
771d5a08a60SJake Burkholder 	struct rqbits *rqb;
772d5a08a60SJake Burkholder 	int pri;
773d5a08a60SJake Burkholder 	int i;
774d5a08a60SJake Burkholder 
775d5a08a60SJake Burkholder 	rqb = &rq->rq_status;
776d5a08a60SJake Burkholder 	for (i = 0; i < RQB_LEN; i++)
777d5a08a60SJake Burkholder 		if (rqb->rqb_bits[i]) {
7782f9267ecSPeter Wemm 			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
779d5a08a60SJake Burkholder 			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
780d5a08a60SJake Burkholder 			    rqb->rqb_bits[i], i, pri);
781d5a08a60SJake Burkholder 			return (pri);
782d5a08a60SJake Burkholder 		}
783d5a08a60SJake Burkholder 
784d5a08a60SJake Burkholder 	return (-1);
785d5a08a60SJake Burkholder }
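/*
 * Worked example of the status bitmap (illustrative; it assumes the 64 run
 * queues are split across two 32-bit status words, i.e. RQB_BPW == 32 and
 * RQB_LEN == 2): an entry on queue 37 sets bit 37 & 31 == 5 of word
 * 37 >> 5 == 1.  runq_findbit() scans word 0 (empty), finds the lowest set
 * bit of word 1 and reconstructs the queue index as 5 + (1 << RQB_L2BPW)
 * == 37, which is the best non-empty queue because lower indices
 * correspond to better (numerically lower) priorities.
 */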
786d5a08a60SJake Burkholder 
787d5a08a60SJake Burkholder /*
788d5a08a60SJake Burkholder  * Set the status bit of the queue corresponding to priority level pri,
789d5a08a60SJake Burkholder  * indicating that it is non-empty.
790d5a08a60SJake Burkholder  */
791d5a08a60SJake Burkholder static __inline void
792d5a08a60SJake Burkholder runq_setbit(struct runq *rq, int pri)
793d5a08a60SJake Burkholder {
794d5a08a60SJake Burkholder 	struct rqbits *rqb;
795d5a08a60SJake Burkholder 
796d5a08a60SJake Burkholder 	rqb = &rq->rq_status;
797d5a08a60SJake Burkholder 	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
798d5a08a60SJake Burkholder 	    rqb->rqb_bits[RQB_WORD(pri)],
799d5a08a60SJake Burkholder 	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
800d5a08a60SJake Burkholder 	    RQB_BIT(pri), RQB_WORD(pri));
801d5a08a60SJake Burkholder 	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
802d5a08a60SJake Burkholder }
803d5a08a60SJake Burkholder 
804d5a08a60SJake Burkholder /*
805e602ba25SJulian Elischer  * Add the KSE to the queue specified by its priority, and set the
806d5a08a60SJake Burkholder  * corresponding status bit.
807d5a08a60SJake Burkholder  */
808d5a08a60SJake Burkholder void
809c20c691bSJulian Elischer runq_add(struct runq *rq, struct kse *ke, int flags)
810d5a08a60SJake Burkholder {
811d5a08a60SJake Burkholder 	struct rqhead *rqh;
812d5a08a60SJake Burkholder 	int pri;
813dba6c5a6SPeter Wemm 
8142c100766SJulian Elischer 	pri = ke->ke_thread->td_priority / RQ_PPQ;
815b40ce416SJulian Elischer 	ke->ke_rqindex = pri;
816d5a08a60SJake Burkholder 	runq_setbit(rq, pri);
817d5a08a60SJake Burkholder 	rqh = &rq->rq_queues[pri];
818732d9528SJulian Elischer 	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
819732d9528SJulian Elischer 	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
820c20c691bSJulian Elischer 	if (flags & SRQ_PREEMPTED) {
821c20c691bSJulian Elischer 		TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
822c20c691bSJulian Elischer 	} else {
823b40ce416SJulian Elischer 		TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
824dba6c5a6SPeter Wemm 	}
825c20c691bSJulian Elischer }
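/*
 * Example of the priority-to-queue mapping used by runq_add() (assuming
 * RQ_PPQ is 4, its usual value): priorities 0 through 3 land on queue 0,
 * priority 100 lands on queue 25, and priority 255 on queue 63, so the
 * RQ_NQS queues cover the whole 0-255 priority range with four priorities
 * sharing each queue.  SRQ_PREEMPTED entries are inserted at the head of
 * their queue, so a preempted thread runs again before equal-priority
 * peers that were already waiting.
 */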
826d5a08a60SJake Burkholder 
827d5a08a60SJake Burkholder /*
828d5a08a60SJake Burkholder  * Return true if there are runnable processes of any priority on the run
829d5a08a60SJake Burkholder  * queue, false otherwise.  Has no side effects, does not modify the run
830d5a08a60SJake Burkholder  * queue structure.
831d5a08a60SJake Burkholder  */
832d5a08a60SJake Burkholder int
833d5a08a60SJake Burkholder runq_check(struct runq *rq)
834d5a08a60SJake Burkholder {
835d5a08a60SJake Burkholder 	struct rqbits *rqb;
836d5a08a60SJake Burkholder 	int i;
837d5a08a60SJake Burkholder 
838d5a08a60SJake Burkholder 	rqb = &rq->rq_status;
839d5a08a60SJake Burkholder 	for (i = 0; i < RQB_LEN; i++)
840d5a08a60SJake Burkholder 		if (rqb->rqb_bits[i]) {
841d5a08a60SJake Burkholder 			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
842d5a08a60SJake Burkholder 			    rqb->rqb_bits[i], i);
843d5a08a60SJake Burkholder 			return (1);
844dba6c5a6SPeter Wemm 		}
845d5a08a60SJake Burkholder 	CTR0(KTR_RUNQ, "runq_check: empty");
846d5a08a60SJake Burkholder 
847d5a08a60SJake Burkholder 	return (0);
848dba6c5a6SPeter Wemm }
849d5a08a60SJake Burkholder 
8506804a3abSJulian Elischer #if defined(SMP) && defined(SCHED_4BSD)
8516804a3abSJulian Elischer int runq_fuzz = 1;
8526804a3abSJulian Elischer SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
8536804a3abSJulian Elischer #endif
8546804a3abSJulian Elischer 
855d5a08a60SJake Burkholder /*
856b43179fbSJeff Roberson  * Find the highest priority process on the run queue.
857d5a08a60SJake Burkholder  */
858b40ce416SJulian Elischer struct kse *
859d5a08a60SJake Burkholder runq_choose(struct runq *rq)
860d5a08a60SJake Burkholder {
861d5a08a60SJake Burkholder 	struct rqhead *rqh;
862b40ce416SJulian Elischer 	struct kse *ke;
863d5a08a60SJake Burkholder 	int pri;
864d5a08a60SJake Burkholder 
865d5a08a60SJake Burkholder 	mtx_assert(&sched_lock, MA_OWNED);
866e602ba25SJulian Elischer 	while ((pri = runq_findbit(rq)) != -1) {
867d5a08a60SJake Burkholder 		rqh = &rq->rq_queues[pri];
8686804a3abSJulian Elischer #if defined(SMP) && defined(SCHED_4BSD)
8696804a3abSJulian Elischer 		/* fuzz == 1 is normal.. 0 or less are ignored */
8706804a3abSJulian Elischer 		if (runq_fuzz > 1) {
8716804a3abSJulian Elischer 			/*
8726804a3abSJulian Elischer 			 * In the first couple of entries, check if
8736804a3abSJulian Elischer 			 * there is one for our CPU as a preference.
8746804a3abSJulian Elischer 			 */
8756804a3abSJulian Elischer 			int count = runq_fuzz;
8766804a3abSJulian Elischer 			int cpu = PCPU_GET(cpuid);
8776804a3abSJulian Elischer 			struct kse *ke2;
8786804a3abSJulian Elischer 			ke2 = ke = TAILQ_FIRST(rqh);
8796804a3abSJulian Elischer 
8806804a3abSJulian Elischer 			while (count-- && ke2) {
8816804a3abSJulian Elischer 				if (ke2->ke_thread->td_lastcpu == cpu) {
8826804a3abSJulian Elischer 					ke = ke2;
8836804a3abSJulian Elischer 					break;
8846804a3abSJulian Elischer 				}
8856804a3abSJulian Elischer 				ke2 = TAILQ_NEXT(ke2, ke_procq);
8866804a3abSJulian Elischer 			}
8876804a3abSJulian Elischer 		} else
8886804a3abSJulian Elischer #endif
889b40ce416SJulian Elischer 			ke = TAILQ_FIRST(rqh);
890b40ce416SJulian Elischer 		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
891e602ba25SJulian Elischer 		CTR3(KTR_RUNQ,
892e602ba25SJulian Elischer 		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
893b40ce416SJulian Elischer 		return (ke);
894d5a08a60SJake Burkholder 	}
895d5a08a60SJake Burkholder 	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
896d5a08a60SJake Burkholder 
897e602ba25SJulian Elischer 	return (NULL);
898d5a08a60SJake Burkholder }
899d5a08a60SJake Burkholder 
900d5a08a60SJake Burkholder /*
901e602ba25SJulian Elischer  * Remove the KSE from the queue specified by its priority, and clear the
902d5a08a60SJake Burkholder  * corresponding status bit if the queue becomes empty.
903e602ba25SJulian Elischer  * Caller must set ke->ke_state afterwards.
904d5a08a60SJake Burkholder  */
905d5a08a60SJake Burkholder void
906b40ce416SJulian Elischer runq_remove(struct runq *rq, struct kse *ke)
907d5a08a60SJake Burkholder {
908d5a08a60SJake Burkholder 	struct rqhead *rqh;
909d5a08a60SJake Burkholder 	int pri;
910d5a08a60SJake Burkholder 
9119eb881f8SSeigo Tanimura 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
9129eb881f8SSeigo Tanimura 		("runq_remove: process swapped out"));
913b40ce416SJulian Elischer 	pri = ke->ke_rqindex;
914d5a08a60SJake Burkholder 	rqh = &rq->rq_queues[pri];
915732d9528SJulian Elischer 	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
916732d9528SJulian Elischer 	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
917b40ce416SJulian Elischer 	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
918b40ce416SJulian Elischer 	TAILQ_REMOVE(rqh, ke, ke_procq);
919d5a08a60SJake Burkholder 	if (TAILQ_EMPTY(rqh)) {
920d5a08a60SJake Burkholder 		CTR0(KTR_RUNQ, "runq_remove: empty");
921d5a08a60SJake Burkholder 		runq_clrbit(rq, pri);
922d5a08a60SJake Burkholder 	}
923dba6c5a6SPeter Wemm }
924e602ba25SJulian Elischer 
925ed062c8dSJulian Elischer /****** functions that are temporarily here ***********/
926ed062c8dSJulian Elischer #include <vm/uma.h>
927ed062c8dSJulian Elischer extern struct mtx kse_zombie_lock;
928ed062c8dSJulian Elischer 
929ed062c8dSJulian Elischer /*
930ed062c8dSJulian Elischer  *  Allocate scheduler specific per-process resources.
931ed062c8dSJulian Elischer  * The thread and ksegrp have already been linked in.
932ed062c8dSJulian Elischer  * In this case just set the default concurrency value.
933ed062c8dSJulian Elischer  *
934ed062c8dSJulian Elischer  * Called from:
935ed062c8dSJulian Elischer  *  proc_init() (UMA init method)
936ed062c8dSJulian Elischer  */
937ed062c8dSJulian Elischer void
938ed062c8dSJulian Elischer sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
939ed062c8dSJulian Elischer {
940ed062c8dSJulian Elischer 
941ed062c8dSJulian Elischer 	/* This can go in sched_fork */
942ed062c8dSJulian Elischer 	sched_init_concurrency(kg);
943ed062c8dSJulian Elischer }
944ed062c8dSJulian Elischer 
945ed062c8dSJulian Elischer /*
946ed062c8dSJulian Elischer  * thread is being either created or recycled.
947ed062c8dSJulian Elischer  * Fix up the per-scheduler resources associated with it.
948ed062c8dSJulian Elischer  * Called from:
949ed062c8dSJulian Elischer  *  sched_fork_thread()
950ed062c8dSJulian Elischer  *  thread_dtor()  (*may go away)
951ed062c8dSJulian Elischer  *  thread_init()  (*may go away)
952ed062c8dSJulian Elischer  */
953ed062c8dSJulian Elischer void
954ed062c8dSJulian Elischer sched_newthread(struct thread *td)
955ed062c8dSJulian Elischer {
956ed062c8dSJulian Elischer 	struct td_sched *ke;
957ed062c8dSJulian Elischer 
958ed062c8dSJulian Elischer 	ke = (struct td_sched *) (td + 1);
959ed062c8dSJulian Elischer 	bzero(ke, sizeof(*ke));
960ed062c8dSJulian Elischer 	td->td_sched     = ke;
961ed062c8dSJulian Elischer 	ke->ke_thread	= td;
962ed062c8dSJulian Elischer 	ke->ke_state	= KES_THREAD;
963ed062c8dSJulian Elischer }
964ed062c8dSJulian Elischer 
965ed062c8dSJulian Elischer /*
966ed062c8dSJulian Elischer  * Set up an initial concurrency of 1
967ed062c8dSJulian Elischer  * and set the given thread (if given) to be using that
968ed062c8dSJulian Elischer  * concurrency slot.
969ed062c8dSJulian Elischer  * May be used "offline", before the ksegrp is attached to the world,
970ed062c8dSJulian Elischer  * in which case schedlock would not be needed.
971ed062c8dSJulian Elischer  * Called from:
972ed062c8dSJulian Elischer  *  thr_create()
973ed062c8dSJulian Elischer  *  proc_init() (UMA) via sched_newproc()
974ed062c8dSJulian Elischer  */
975ed062c8dSJulian Elischer void
976ed062c8dSJulian Elischer sched_init_concurrency(struct ksegrp *kg)
977ed062c8dSJulian Elischer {
978ed062c8dSJulian Elischer 
979d39063f2SJulian Elischer 	CTR1(KTR_RUNQ,"kg %p init slots and concurrency to 1", kg);
980ed062c8dSJulian Elischer 	kg->kg_concurrency = 1;
981ed062c8dSJulian Elischer 	kg->kg_avail_opennings = 1;
982ed062c8dSJulian Elischer }
983ed062c8dSJulian Elischer 
984ed062c8dSJulian Elischer /*
985ed062c8dSJulian Elischer  * Change the concurrency of an existing ksegrp to N
986ed062c8dSJulian Elischer  * Called from:
987ed062c8dSJulian Elischer  *  kse_create()
988ed062c8dSJulian Elischer  *  kse_exit()
989ed062c8dSJulian Elischer  *  thread_exit()
990ed062c8dSJulian Elischer  *  thread_single()
991ed062c8dSJulian Elischer  */
992ed062c8dSJulian Elischer void
993ed062c8dSJulian Elischer sched_set_concurrency(struct ksegrp *kg, int concurrency)
994ed062c8dSJulian Elischer {
995ed062c8dSJulian Elischer 
996d39063f2SJulian Elischer 	CTR4(KTR_RUNQ,"kg %p set concurrency to %d, slots %d -> %d",
997d39063f2SJulian Elischer 	    kg,
998d39063f2SJulian Elischer 	    concurrency,
999d39063f2SJulian Elischer 	    kg->kg_avail_opennings,
1000d39063f2SJulian Elischer 	    kg->kg_avail_opennings + (concurrency - kg->kg_concurrency));
1001ed062c8dSJulian Elischer 	kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
1002ed062c8dSJulian Elischer 	kg->kg_concurrency = concurrency;
1003ed062c8dSJulian Elischer }
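/*
 * Example of the adjustment above (illustrative numbers): if a ksegrp with
 * concurrency 1 currently has its single slot in use (kg_avail_opennings
 * == 0) and the concurrency is raised to 4, kg_avail_opennings becomes
 * 0 + (4 - 1) == 3; only the delta is applied, so slots already in use
 * stay accounted for.  Lowering the concurrency can likewise drive
 * kg_avail_opennings negative until enough threads give their slots back.
 */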
1004ed062c8dSJulian Elischer 
1005ed062c8dSJulian Elischer /*
1006ed062c8dSJulian Elischer  * Called from thread_exit() for all exiting threads.
1007ed062c8dSJulian Elischer  *
1008ed062c8dSJulian Elischer  * Not to be confused with sched_exit_thread(), which
1009ed062c8dSJulian Elischer  * is only called from thread_exit() for threads exiting
1010ed062c8dSJulian Elischer  * without the rest of the process exiting, because it is also called from
1011ed062c8dSJulian Elischer  * sched_exit() and we wouldn't want to call it twice.
1012ed062c8dSJulian Elischer  * XXX This can probably be fixed.
1013ed062c8dSJulian Elischer  */
1014ed062c8dSJulian Elischer void
1015ed062c8dSJulian Elischer sched_thread_exit(struct thread *td)
1016ed062c8dSJulian Elischer {
1017ed062c8dSJulian Elischer 
1018d39063f2SJulian Elischer 	SLOT_RELEASE(td->td_ksegrp);
1019ed062c8dSJulian Elischer 	slot_fill(td->td_ksegrp);
1020ed062c8dSJulian Elischer }
1021ed062c8dSJulian Elischer 
1022ed062c8dSJulian Elischer #endif /* KERN_SWITCH_INCLUDE */
1023