xref: /freebsd/sys/kern/sched_4bsd.c (revision 14f0e2e9bf2529b504c3094971c553cadf813e83)
1b43179fbSJeff Roberson /*-
2b43179fbSJeff Roberson  * Copyright (c) 1982, 1986, 1990, 1991, 1993
3b43179fbSJeff Roberson  *	The Regents of the University of California.  All rights reserved.
4b43179fbSJeff Roberson  * (c) UNIX System Laboratories, Inc.
5b43179fbSJeff Roberson  * All or some portions of this file are derived from material licensed
6b43179fbSJeff Roberson  * to the University of California by American Telephone and Telegraph
7b43179fbSJeff Roberson  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8b43179fbSJeff Roberson  * the permission of UNIX System Laboratories, Inc.
9b43179fbSJeff Roberson  *
10b43179fbSJeff Roberson  * Redistribution and use in source and binary forms, with or without
11b43179fbSJeff Roberson  * modification, are permitted provided that the following conditions
12b43179fbSJeff Roberson  * are met:
13b43179fbSJeff Roberson  * 1. Redistributions of source code must retain the above copyright
14b43179fbSJeff Roberson  *    notice, this list of conditions and the following disclaimer.
15b43179fbSJeff Roberson  * 2. Redistributions in binary form must reproduce the above copyright
16b43179fbSJeff Roberson  *    notice, this list of conditions and the following disclaimer in the
17b43179fbSJeff Roberson  *    documentation and/or other materials provided with the distribution.
18b43179fbSJeff Roberson  * 4. Neither the name of the University nor the names of its contributors
19b43179fbSJeff Roberson  *    may be used to endorse or promote products derived from this software
20b43179fbSJeff Roberson  *    without specific prior written permission.
21b43179fbSJeff Roberson  *
22b43179fbSJeff Roberson  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23b43179fbSJeff Roberson  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24b43179fbSJeff Roberson  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25b43179fbSJeff Roberson  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26b43179fbSJeff Roberson  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27b43179fbSJeff Roberson  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28b43179fbSJeff Roberson  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29b43179fbSJeff Roberson  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30b43179fbSJeff Roberson  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31b43179fbSJeff Roberson  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32b43179fbSJeff Roberson  * SUCH DAMAGE.
33b43179fbSJeff Roberson  */
34b43179fbSJeff Roberson 
35677b542eSDavid E. O'Brien #include <sys/cdefs.h>
36677b542eSDavid E. O'Brien __FBSDID("$FreeBSD$");
37677b542eSDavid E. O'Brien 
38ed062c8dSJulian Elischer #define kse td_sched
39ed062c8dSJulian Elischer 
40b43179fbSJeff Roberson #include <sys/param.h>
41b43179fbSJeff Roberson #include <sys/systm.h>
42b43179fbSJeff Roberson #include <sys/kernel.h>
43b43179fbSJeff Roberson #include <sys/ktr.h>
44b43179fbSJeff Roberson #include <sys/lock.h>
45c55bbb6cSJohn Baldwin #include <sys/kthread.h>
46b43179fbSJeff Roberson #include <sys/mutex.h>
47b43179fbSJeff Roberson #include <sys/proc.h>
48b43179fbSJeff Roberson #include <sys/resourcevar.h>
49b43179fbSJeff Roberson #include <sys/sched.h>
50b43179fbSJeff Roberson #include <sys/smp.h>
51b43179fbSJeff Roberson #include <sys/sysctl.h>
52b43179fbSJeff Roberson #include <sys/sx.h>
53293968d8SJulian Elischer #include <machine/smp.h>
54b43179fbSJeff Roberson 
5506439a04SJeff Roberson /*
5606439a04SJeff Roberson  * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
5706439a04SJeff Roberson  * the range 100-256 Hz (approximately).
5806439a04SJeff Roberson  */
5906439a04SJeff Roberson #define	ESTCPULIM(e) \
6006439a04SJeff Roberson     min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
6106439a04SJeff Roberson     RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
62b698380fSBruce Evans #ifdef SMP
63b698380fSBruce Evans #define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
64b698380fSBruce Evans #else
6506439a04SJeff Roberson #define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
66b698380fSBruce Evans #endif
6706439a04SJeff Roberson #define	NICE_WEIGHT		1	/* Priorities per nice level. */
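/*
 * For a feel for the numbers (a sketch, assuming the stock values
 * PRIO_MIN == -20, PRIO_MAX == 20 and RQ_PPQ == 4 from the headers of
 * this era): on a UP kernel INVERSE_ESTCPU_WEIGHT is 8, so ESTCPULIM()
 * caps kg_estcpu at 8 * (1 * 40 - 4) + 8 - 1 == 295.  That limits the
 * estcpu contribution to the priority computed by resetpriority() to
 * 295 / 8 == 36 slots, one run queue (RQ_PPQ) short of the full 40-slot
 * nice span, so nice still carries weight even against a long-running
 * CPU hog.
 */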
6806439a04SJeff Roberson 
69ed062c8dSJulian Elischer /*
70ed062c8dSJulian Elischer  * The schedulable entity that can be given a context to run.
71ed062c8dSJulian Elischer  * A process may have several of these. Probably one per processor
72ed062c8dSJulian Elischer  * but possibly a few more. In this universe they are grouped
73ed062c8dSJulian Elischer  * with a KSEGRP that contains the priority and niceness
74ed062c8dSJulian Elischer  * for the group.
75ed062c8dSJulian Elischer  */
76ed062c8dSJulian Elischer struct kse {
77ed062c8dSJulian Elischer 	TAILQ_ENTRY(kse) ke_kglist;	/* (*) Queue of KSEs in ke_ksegrp. */
78ed062c8dSJulian Elischer 	TAILQ_ENTRY(kse) ke_kgrlist;	/* (*) Queue of KSEs in this state. */
79ed062c8dSJulian Elischer 	TAILQ_ENTRY(kse) ke_procq;	/* (j/z) Run queue. */
80ed062c8dSJulian Elischer 	struct thread	*ke_thread;	/* (*) Active associated thread. */
81ed062c8dSJulian Elischer 	fixpt_t		ke_pctcpu;	/* (j) %cpu during p_swtime. */
82ed062c8dSJulian Elischer 	u_char		ke_oncpu;	/* (j) Which cpu we are on. */
83ed062c8dSJulian Elischer 	char		ke_rqindex;	/* (j) Run queue index. */
84ed062c8dSJulian Elischer 	enum {
85ed062c8dSJulian Elischer 		KES_THREAD = 0x0,	/* slaved to thread state */
86ed062c8dSJulian Elischer 		KES_ONRUNQ
87ed062c8dSJulian Elischer 	} ke_state;			/* (j) KSE status. */
88ed062c8dSJulian Elischer 	int		ke_cpticks;	/* (j) Ticks of cpu time. */
89ed062c8dSJulian Elischer 	struct runq	*ke_runq;	/* runq the kse is currently on */
90bcb06d59SJeff Roberson };
91ed062c8dSJulian Elischer 
92ed062c8dSJulian Elischer #define ke_proc		ke_thread->td_proc
93ed062c8dSJulian Elischer #define ke_ksegrp	ke_thread->td_ksegrp
94ed062c8dSJulian Elischer 
95ed062c8dSJulian Elischer #define td_kse td_sched
96ed062c8dSJulian Elischer 
97ed062c8dSJulian Elischer /* flags kept in td_flags */
98ed062c8dSJulian Elischer #define TDF_DIDRUN	TDF_SCHED0	/* KSE actually ran. */
99ed062c8dSJulian Elischer #define TDF_EXIT	TDF_SCHED1	/* KSE is being killed. */
100ed062c8dSJulian Elischer #define TDF_BOUND	TDF_SCHED2	/* Thread is bound to one CPU. */
101ed062c8dSJulian Elischer 
102ed062c8dSJulian Elischer #define ke_flags	ke_thread->td_flags
103ed062c8dSJulian Elischer #define KEF_DIDRUN	TDF_DIDRUN /* KSE actually ran. */
104ed062c8dSJulian Elischer #define KEF_EXIT	TDF_EXIT /* KSE is being killed. */
105ed062c8dSJulian Elischer #define KEF_BOUND	TDF_BOUND /* stuck to one CPU */
106bcb06d59SJeff Roberson 
107e17c57b1SJeff Roberson #define SKE_RUNQ_PCPU(ke)						\
108e17c57b1SJeff Roberson     ((ke)->ke_runq != 0 && (ke)->ke_runq != &runq)
109e17c57b1SJeff Roberson 
110ed062c8dSJulian Elischer struct kg_sched {
111ed062c8dSJulian Elischer 	struct thread	*skg_last_assigned; /* (j) Last thread assigned to */
112ed062c8dSJulian Elischer 					   /* the system scheduler. */
113ed062c8dSJulian Elischer 	int	skg_avail_opennings;	/* (j) Num unfilled slots in group. */
114ed062c8dSJulian Elischer 	int	skg_concurrency;	/* (j) Num KSEs requested in group. */
115ed062c8dSJulian Elischer 	int	skg_runq_kses;		/* (j) Num KSEs on runq. */
116ed062c8dSJulian Elischer };
117ed062c8dSJulian Elischer #define kg_last_assigned	kg_sched->skg_last_assigned
118ed062c8dSJulian Elischer #define kg_avail_opennings	kg_sched->skg_avail_opennings
119ed062c8dSJulian Elischer #define kg_concurrency		kg_sched->skg_concurrency
120ed062c8dSJulian Elischer #define kg_runq_kses		kg_sched->skg_runq_kses
121ed062c8dSJulian Elischer 
122e17c57b1SJeff Roberson /*
123e17c57b1SJeff Roberson  * KSE_CAN_MIGRATE macro returns true if the kse can migrate between
124f2f51f8aSJeff Roberson  * cpus.
125e17c57b1SJeff Roberson  */
126e17c57b1SJeff Roberson #define KSE_CAN_MIGRATE(ke)						\
1271e7fad6bSScott Long     ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
128bcb06d59SJeff Roberson 
129ed062c8dSJulian Elischer static struct kse kse0;
130ed062c8dSJulian Elischer static struct kg_sched kg_sched0;
131b43179fbSJeff Roberson 
132ca59f152SJeff Roberson static int	sched_tdcnt;	/* Total runnable threads in the system. */
133b43179fbSJeff Roberson static int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
1344974b53eSMaxime Henrion #define	SCHED_QUANTUM	(hz / 10)	/* Default sched quantum */
135b43179fbSJeff Roberson 
136b43179fbSJeff Roberson static struct callout roundrobin_callout;
137b43179fbSJeff Roberson 
138ed062c8dSJulian Elischer static void	slot_fill(struct ksegrp *kg);
139ed062c8dSJulian Elischer static struct kse *sched_choose(void);		/* XXX Should be thread * */
140ed062c8dSJulian Elischer 
141e17c57b1SJeff Roberson static void	setup_runqs(void);
142b43179fbSJeff Roberson static void	roundrobin(void *arg);
143c55bbb6cSJohn Baldwin static void	schedcpu(void);
144e17c57b1SJeff Roberson static void	schedcpu_thread(void);
145b43179fbSJeff Roberson static void	sched_setup(void *dummy);
146b43179fbSJeff Roberson static void	maybe_resched(struct thread *td);
147b43179fbSJeff Roberson static void	updatepri(struct ksegrp *kg);
148b43179fbSJeff Roberson static void	resetpriority(struct ksegrp *kg);
14900b0483dSJulian Elischer #ifdef SMP
15082a1dfc1SJulian Elischer static int	forward_wakeup(int  cpunum);
15100b0483dSJulian Elischer #endif
152b43179fbSJeff Roberson 
153e17c57b1SJeff Roberson static struct kproc_desc sched_kp = {
154e17c57b1SJeff Roberson         "schedcpu",
155e17c57b1SJeff Roberson         schedcpu_thread,
156e17c57b1SJeff Roberson         NULL
157e17c57b1SJeff Roberson };
158e17c57b1SJeff Roberson SYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start, &sched_kp)
159e17c57b1SJeff Roberson SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
160b43179fbSJeff Roberson 
161b43179fbSJeff Roberson /*
162b43179fbSJeff Roberson  * Global run queue.
163b43179fbSJeff Roberson  */
164b43179fbSJeff Roberson static struct runq runq;
165e17c57b1SJeff Roberson 
166e17c57b1SJeff Roberson #ifdef SMP
167e17c57b1SJeff Roberson /*
168e17c57b1SJeff Roberson  * Per-CPU run queues
169e17c57b1SJeff Roberson  */
170e17c57b1SJeff Roberson static struct runq runq_pcpu[MAXCPU];
171e17c57b1SJeff Roberson #endif
172e17c57b1SJeff Roberson 
173e17c57b1SJeff Roberson static void
174e17c57b1SJeff Roberson setup_runqs(void)
175e17c57b1SJeff Roberson {
176e17c57b1SJeff Roberson #ifdef SMP
177e17c57b1SJeff Roberson 	int i;
178e17c57b1SJeff Roberson 
179e17c57b1SJeff Roberson 	for (i = 0; i < MAXCPU; ++i)
180e17c57b1SJeff Roberson 		runq_init(&runq_pcpu[i]);
181e17c57b1SJeff Roberson #endif
182e17c57b1SJeff Roberson 
183e17c57b1SJeff Roberson 	runq_init(&runq);
184e17c57b1SJeff Roberson }
185b43179fbSJeff Roberson 
186b43179fbSJeff Roberson static int
187b43179fbSJeff Roberson sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
188b43179fbSJeff Roberson {
189b43179fbSJeff Roberson 	int error, new_val;
190b43179fbSJeff Roberson 
191b43179fbSJeff Roberson 	new_val = sched_quantum * tick;
192b43179fbSJeff Roberson 	error = sysctl_handle_int(oidp, &new_val, 0, req);
193b43179fbSJeff Roberson         if (error != 0 || req->newptr == NULL)
194b43179fbSJeff Roberson 		return (error);
195b43179fbSJeff Roberson 	if (new_val < tick)
196b43179fbSJeff Roberson 		return (EINVAL);
197b43179fbSJeff Roberson 	sched_quantum = new_val / tick;
198b43179fbSJeff Roberson 	hogticks = 2 * sched_quantum;
199b43179fbSJeff Roberson 	return (0);
200b43179fbSJeff Roberson }
201b43179fbSJeff Roberson 
202e038d354SScott Long SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");
203dc095794SScott Long 
204e038d354SScott Long SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
205e038d354SScott Long     "Scheduler name");
206dc095794SScott Long 
207dc095794SScott Long SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
208b43179fbSJeff Roberson     0, sizeof sched_quantum, sysctl_kern_quantum, "I",
209b43179fbSJeff Roberson     "Roundrobin scheduling quantum in microseconds");
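/*
 * A quick unit check on the sysctl above (a sketch; hz == 1000 is just an
 * example value): sched_quantum is kept in ticks while the sysctl speaks
 * microseconds, with tick == 1000000 / hz.  At hz == 1000 the default
 * SCHED_QUANTUM of hz / 10 is 100 ticks, exported as 100000 us (100 ms),
 * and hogticks becomes 200 ticks; any request smaller than one tick is
 * rejected with EINVAL by sysctl_kern_quantum().
 */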
210b43179fbSJeff Roberson 
21137c28a02SJulian Elischer #ifdef SMP
21282a1dfc1SJulian Elischer /* Enable forwarding of wakeups to all other cpus */
21382a1dfc1SJulian Elischer SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP");
21482a1dfc1SJulian Elischer 
215bce73aedSJulian Elischer static int forward_wakeup_enabled = 1;
21682a1dfc1SJulian Elischer SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
21782a1dfc1SJulian Elischer 	   &forward_wakeup_enabled, 0,
21882a1dfc1SJulian Elischer 	   "Forwarding of wakeup to idle CPUs");
21982a1dfc1SJulian Elischer 
22082a1dfc1SJulian Elischer static int forward_wakeups_requested = 0;
22182a1dfc1SJulian Elischer SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
22282a1dfc1SJulian Elischer 	   &forward_wakeups_requested, 0,
22382a1dfc1SJulian Elischer 	   "Requests for Forwarding of wakeup to idle CPUs");
22482a1dfc1SJulian Elischer 
22582a1dfc1SJulian Elischer static int forward_wakeups_delivered = 0;
22682a1dfc1SJulian Elischer SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
22782a1dfc1SJulian Elischer 	   &forward_wakeups_delivered, 0,
22882a1dfc1SJulian Elischer 	   "Completed Forwarding of wakeup to idle CPUs");
22982a1dfc1SJulian Elischer 
230bce73aedSJulian Elischer static int forward_wakeup_use_mask = 1;
23182a1dfc1SJulian Elischer SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
23282a1dfc1SJulian Elischer 	   &forward_wakeup_use_mask, 0,
23382a1dfc1SJulian Elischer 	   "Use the mask of idle cpus");
23482a1dfc1SJulian Elischer 
23582a1dfc1SJulian Elischer static int forward_wakeup_use_loop = 0;
23682a1dfc1SJulian Elischer SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
23782a1dfc1SJulian Elischer 	   &forward_wakeup_use_loop, 0,
23882a1dfc1SJulian Elischer 	   "Use a loop to find idle cpus");
23982a1dfc1SJulian Elischer 
24082a1dfc1SJulian Elischer static int forward_wakeup_use_single = 0;
24182a1dfc1SJulian Elischer SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW,
24282a1dfc1SJulian Elischer 	   &forward_wakeup_use_single, 0,
24382a1dfc1SJulian Elischer 	   "Only signal one idle cpu");
24482a1dfc1SJulian Elischer 
24582a1dfc1SJulian Elischer static int forward_wakeup_use_htt = 0;
24682a1dfc1SJulian Elischer SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW,
24782a1dfc1SJulian Elischer 	   &forward_wakeup_use_htt, 0,
24882a1dfc1SJulian Elischer 	   "account for htt");
2493389af30SJulian Elischer 
25037c28a02SJulian Elischer #endif
2513389af30SJulian Elischer static int sched_followon = 0;
2523389af30SJulian Elischer SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
2533389af30SJulian Elischer 	   &sched_followon, 0,
2543389af30SJulian Elischer 	   "allow threads to share a quantum");
2553389af30SJulian Elischer 
2563389af30SJulian Elischer static int sched_pfollowons = 0;
2573389af30SJulian Elischer SYSCTL_INT(_kern_sched, OID_AUTO, pfollowons, CTLFLAG_RD,
2583389af30SJulian Elischer 	   &sched_pfollowons, 0,
2593389af30SJulian Elischer 	   "number of followons done to a different ksegrp");
2603389af30SJulian Elischer 
2613389af30SJulian Elischer static int sched_kgfollowons = 0;
2623389af30SJulian Elischer SYSCTL_INT(_kern_sched, OID_AUTO, kgfollowons, CTLFLAG_RD,
2633389af30SJulian Elischer 	   &sched_kgfollowons, 0,
2643389af30SJulian Elischer 	   "number of followons done in a ksegrp");
26582a1dfc1SJulian Elischer 
266b43179fbSJeff Roberson /*
267b43179fbSJeff Roberson  * Arrange to reschedule if necessary, taking the priorities and
268b43179fbSJeff Roberson  * schedulers into account.
269b43179fbSJeff Roberson  */
270b43179fbSJeff Roberson static void
271b43179fbSJeff Roberson maybe_resched(struct thread *td)
272b43179fbSJeff Roberson {
273b43179fbSJeff Roberson 
274b43179fbSJeff Roberson 	mtx_assert(&sched_lock, MA_OWNED);
275ed062c8dSJulian Elischer 	if (td->td_priority < curthread->td_priority)
2764a338afdSJulian Elischer 		curthread->td_flags |= TDF_NEEDRESCHED;
277b43179fbSJeff Roberson }
278b43179fbSJeff Roberson 
279b43179fbSJeff Roberson /*
280b43179fbSJeff Roberson  * Force switch among equal priority processes every 100ms.
281b43179fbSJeff Roberson  * We don't actually need to force a context switch of the current process.
282b43179fbSJeff Roberson  * The act of firing the event triggers a context switch to softclock() and
283b43179fbSJeff Roberson  * then switching back out again, which is equivalent to a preemption; thus
284b43179fbSJeff Roberson  * no further work is needed on the local CPU.
285b43179fbSJeff Roberson  */
286b43179fbSJeff Roberson /* ARGSUSED */
287b43179fbSJeff Roberson static void
288b43179fbSJeff Roberson roundrobin(void *arg)
289b43179fbSJeff Roberson {
290b43179fbSJeff Roberson 
291b43179fbSJeff Roberson #ifdef SMP
292b43179fbSJeff Roberson 	mtx_lock_spin(&sched_lock);
293b43179fbSJeff Roberson 	forward_roundrobin();
294b43179fbSJeff Roberson 	mtx_unlock_spin(&sched_lock);
295b43179fbSJeff Roberson #endif
296b43179fbSJeff Roberson 
297b43179fbSJeff Roberson 	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
298b43179fbSJeff Roberson }
299b43179fbSJeff Roberson 
300b43179fbSJeff Roberson /*
301b43179fbSJeff Roberson  * Constants for digital decay and forget:
30270fca427SJohn Baldwin  *	90% of (kg_estcpu) usage in 5 * loadav time
30370fca427SJohn Baldwin  *	95% of (ke_pctcpu) usage in 60 seconds (load insensitive)
304b43179fbSJeff Roberson  *          Note that, as ps(1) mentions, this can let percentages
305b43179fbSJeff Roberson  *          total over 100% (I've seen 137.9% for 3 processes).
306b43179fbSJeff Roberson  *
30770fca427SJohn Baldwin  * Note that sched_clock() updates kg_estcpu and ke_cpticks asynchronously.
308b43179fbSJeff Roberson  *
30970fca427SJohn Baldwin  * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds.
310b43179fbSJeff Roberson  * That is, the system wants to compute a value of decay such
311b43179fbSJeff Roberson  * that the following for loop:
312b43179fbSJeff Roberson  * 	for (i = 0; i < (5 * loadavg); i++)
31370fca427SJohn Baldwin  * 		kg_estcpu *= decay;
314b43179fbSJeff Roberson  * will compute
31570fca427SJohn Baldwin  * 	kg_estcpu *= 0.1;
316b43179fbSJeff Roberson  * for all values of loadavg:
317b43179fbSJeff Roberson  *
318b43179fbSJeff Roberson  * Mathematically this loop can be expressed by saying:
319b43179fbSJeff Roberson  * 	decay ** (5 * loadavg) ~= .1
320b43179fbSJeff Roberson  *
321b43179fbSJeff Roberson  * The system computes decay as:
322b43179fbSJeff Roberson  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
323b43179fbSJeff Roberson  *
324b43179fbSJeff Roberson  * We wish to prove that the system's computation of decay
325b43179fbSJeff Roberson  * will always fulfill the equation:
326b43179fbSJeff Roberson  * 	decay ** (5 * loadavg) ~= .1
327b43179fbSJeff Roberson  *
328b43179fbSJeff Roberson  * If we compute b as:
329b43179fbSJeff Roberson  * 	b = 2 * loadavg
330b43179fbSJeff Roberson  * then
331b43179fbSJeff Roberson  * 	decay = b / (b + 1)
332b43179fbSJeff Roberson  *
333b43179fbSJeff Roberson  * We now need to prove two things:
334b43179fbSJeff Roberson  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
335b43179fbSJeff Roberson  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
336b43179fbSJeff Roberson  *
337b43179fbSJeff Roberson  * Facts:
338b43179fbSJeff Roberson  *         For x close to zero, exp(x) =~ 1 + x, since
339b43179fbSJeff Roberson  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
340b43179fbSJeff Roberson  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
341b43179fbSJeff Roberson  *         For x close to zero, ln(1+x) =~ x, since
342b43179fbSJeff Roberson  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
343b43179fbSJeff Roberson  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
344b43179fbSJeff Roberson  *         ln(.1) =~ -2.30
345b43179fbSJeff Roberson  *
346b43179fbSJeff Roberson  * Proof of (1):
347b43179fbSJeff Roberson  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
348b43179fbSJeff Roberson  *	solving for factor,
349b43179fbSJeff Roberson  *      ln(factor) =~ -2.30/(5*loadav), or
350b43179fbSJeff Roberson  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
351b43179fbSJeff Roberson  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
352b43179fbSJeff Roberson  *
353b43179fbSJeff Roberson  * Proof of (2):
354b43179fbSJeff Roberson  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
355b43179fbSJeff Roberson  *	solving for power,
356b43179fbSJeff Roberson  *      power*ln(b/(b+1)) =~ -2.30, or
357b43179fbSJeff Roberson  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
358b43179fbSJeff Roberson  *
359b43179fbSJeff Roberson  * Actual power values for the implemented algorithm are as follows:
360b43179fbSJeff Roberson  *      loadav: 1       2       3       4
361b43179fbSJeff Roberson  *      power:  5.68    10.32   14.94   19.55
362b43179fbSJeff Roberson  */
363b43179fbSJeff Roberson 
364b43179fbSJeff Roberson /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
365b43179fbSJeff Roberson #define	loadfactor(loadav)	(2 * (loadav))
366b43179fbSJeff Roberson #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
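/*
 * A worked instance of the decay above (a sketch, assuming the stock
 * FSHIFT of 11, i.e. FSCALE == 2048): with a load average of 1.0,
 * ldavg[0] == FSCALE, loadfactor() yields 2 * FSCALE == 4096, and each
 * decay_cpu() call multiplies kg_estcpu by 4096 / (4096 + 2048) == 2/3.
 * Starting from the ESTCPULIM ceiling of 295, five once-a-second decays
 * give 295 -> 196 -> 130 -> 86 -> 57 -> 38, i.e. close to the "90% in
 * 5 * loadav seconds" target (the exact power for 90% at loadav 1 is
 * 5.68, per the table above).
 */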
367b43179fbSJeff Roberson 
36870fca427SJohn Baldwin /* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
369b43179fbSJeff Roberson static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
370b43179fbSJeff Roberson SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
371b43179fbSJeff Roberson 
372b43179fbSJeff Roberson /*
373b43179fbSJeff Roberson  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
374b43179fbSJeff Roberson  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
375b43179fbSJeff Roberson  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
376b43179fbSJeff Roberson  *
377b43179fbSJeff Roberson  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
378b43179fbSJeff Roberson  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
379b43179fbSJeff Roberson  *
380b43179fbSJeff Roberson  * If you don't want to bother with the faster/more-accurate formula, you
381b43179fbSJeff Roberson  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
382b43179fbSJeff Roberson  * (more general) method of calculating the %age of CPU used by a process.
383b43179fbSJeff Roberson  */
384b43179fbSJeff Roberson #define	CCPU_SHIFT	11
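/*
 * Rough numbers for the %cpu estimator (a sketch, assuming FSHIFT == 11
 * so the fast path applies): ccpu / FSCALE is exp(-1/20) ~= 0.951 and
 * schedcpu() applies it once per second, so an idle thread's ke_pctcpu
 * falls to about 0.951^60 ~= exp(-3) ~= 5% of its old value after a
 * minute -- the "95% in 60 seconds" decay quoted above.  Each second also
 * adds roughly (1 - ccpu/FSCALE) ~= 4.9% scaled by the fraction of that
 * second the thread actually ran, so a fully CPU-bound thread converges
 * on 100%, and several busy threads can transiently sum to more.
 */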
385b43179fbSJeff Roberson 
386b43179fbSJeff Roberson /*
387b43179fbSJeff Roberson  * Recompute process priorities, every hz ticks.
388b43179fbSJeff Roberson  * MP-safe, called without the Giant mutex.
389b43179fbSJeff Roberson  */
390b43179fbSJeff Roberson /* ARGSUSED */
391b43179fbSJeff Roberson static void
392c55bbb6cSJohn Baldwin schedcpu(void)
393b43179fbSJeff Roberson {
394b43179fbSJeff Roberson 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
395b43179fbSJeff Roberson 	struct thread *td;
396b43179fbSJeff Roberson 	struct proc *p;
397b43179fbSJeff Roberson 	struct kse *ke;
398b43179fbSJeff Roberson 	struct ksegrp *kg;
39970fca427SJohn Baldwin 	int awake, realstathz;
400b43179fbSJeff Roberson 
401b43179fbSJeff Roberson 	realstathz = stathz ? stathz : hz;
402b43179fbSJeff Roberson 	sx_slock(&allproc_lock);
403b43179fbSJeff Roberson 	FOREACH_PROC_IN_SYSTEM(p) {
40470fca427SJohn Baldwin 		/*
40570fca427SJohn Baldwin 		 * Prevent state changes and protect run queue.
40670fca427SJohn Baldwin 		 */
407b43179fbSJeff Roberson 		mtx_lock_spin(&sched_lock);
40870fca427SJohn Baldwin 		/*
40970fca427SJohn Baldwin 		 * Increment time in/out of memory.  We ignore overflow; with
41070fca427SJohn Baldwin 		 * 16-bit int's (remember them?) overflow takes 45 days.
41170fca427SJohn Baldwin 		 */
412b43179fbSJeff Roberson 		p->p_swtime++;
413b43179fbSJeff Roberson 		FOREACH_KSEGRP_IN_PROC(p, kg) {
414b43179fbSJeff Roberson 			awake = 0;
415ed062c8dSJulian Elischer 			FOREACH_THREAD_IN_GROUP(kg, td) {
416ed062c8dSJulian Elischer 				ke = td->td_kse;
417b43179fbSJeff Roberson 				/*
41870fca427SJohn Baldwin 				 * Increment sleep time (if sleeping).  We
41970fca427SJohn Baldwin 				 * ignore overflow, as above.
420b43179fbSJeff Roberson 				 */
421b43179fbSJeff Roberson 				/*
422b43179fbSJeff Roberson 				 * The kse slptimes are not touched in wakeup
423b43179fbSJeff Roberson 				 * because the thread may not HAVE a KSE.
424b43179fbSJeff Roberson 				 */
425b43179fbSJeff Roberson 				if (ke->ke_state == KES_ONRUNQ) {
426b43179fbSJeff Roberson 					awake = 1;
427b43179fbSJeff Roberson 					ke->ke_flags &= ~KEF_DIDRUN;
428b43179fbSJeff Roberson 				} else if ((ke->ke_state == KES_THREAD) &&
429ed062c8dSJulian Elischer 				    (TD_IS_RUNNING(td))) {
430b43179fbSJeff Roberson 					awake = 1;
431b43179fbSJeff Roberson 					/* Do not clear KEF_DIDRUN */
432b43179fbSJeff Roberson 				} else if (ke->ke_flags & KEF_DIDRUN) {
433b43179fbSJeff Roberson 					awake = 1;
434b43179fbSJeff Roberson 					ke->ke_flags &= ~KEF_DIDRUN;
435b43179fbSJeff Roberson 				}
436b43179fbSJeff Roberson 
437b43179fbSJeff Roberson 				/*
43870fca427SJohn Baldwin 				 * ke_pctcpu is only for ps and ttyinfo().
43970fca427SJohn Baldwin 				 * Do it per kse, and add them up at the end?
440b43179fbSJeff Roberson 				 * XXXKSE
441b43179fbSJeff Roberson 				 */
44270fca427SJohn Baldwin 				ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >>
443bcb06d59SJeff Roberson 				    FSHIFT;
444b43179fbSJeff Roberson 				/*
445b43179fbSJeff Roberson 				 * If the kse has been idle the entire second,
446b43179fbSJeff Roberson 				 * stop recalculating its priority until
447b43179fbSJeff Roberson 				 * it wakes up.
448b43179fbSJeff Roberson 				 */
449ad59c36bSJulian Elischer 				if (ke->ke_cpticks == 0)
450b43179fbSJeff Roberson 					continue;
451b43179fbSJeff Roberson #if	(FSHIFT >= CCPU_SHIFT)
4528fb913faSJeff Roberson 				ke->ke_pctcpu += (realstathz == 100)
453ad59c36bSJulian Elischer 				    ? ((fixpt_t) ke->ke_cpticks) <<
454b43179fbSJeff Roberson 				    (FSHIFT - CCPU_SHIFT) :
455ad59c36bSJulian Elischer 				    100 * (((fixpt_t) ke->ke_cpticks)
456bcb06d59SJeff Roberson 				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
457b43179fbSJeff Roberson #else
4588fb913faSJeff Roberson 				ke->ke_pctcpu += ((FSCALE - ccpu) *
459ad59c36bSJulian Elischer 				    (ke->ke_cpticks *
460bcb06d59SJeff Roberson 				    FSCALE / realstathz)) >> FSHIFT;
461b43179fbSJeff Roberson #endif
462ad59c36bSJulian Elischer 				ke->ke_cpticks = 0;
463b43179fbSJeff Roberson 			} /* end of kse loop */
464b43179fbSJeff Roberson 			/*
465b43179fbSJeff Roberson 			 * If there are ANY running threads in this KSEGRP,
466b43179fbSJeff Roberson 			 * then don't count it as sleeping.
467b43179fbSJeff Roberson 			 */
468b43179fbSJeff Roberson 			if (awake) {
469b43179fbSJeff Roberson 				if (kg->kg_slptime > 1) {
470b43179fbSJeff Roberson 					/*
471b43179fbSJeff Roberson 					 * In an ideal world, this should not
472b43179fbSJeff Roberson 					 * happen, because whoever woke us
473b43179fbSJeff Roberson 					 * up from the long sleep should have
474b43179fbSJeff Roberson 					 * unwound the slptime and reset our
475b43179fbSJeff Roberson 					 * priority before we run at the stale
476b43179fbSJeff Roberson 					 * priority.  Should KASSERT at some
477b43179fbSJeff Roberson 					 * point when all the cases are fixed.
478b43179fbSJeff Roberson 					 */
479b43179fbSJeff Roberson 					updatepri(kg);
480b43179fbSJeff Roberson 				}
481b43179fbSJeff Roberson 				kg->kg_slptime = 0;
48270fca427SJohn Baldwin 			} else
483b43179fbSJeff Roberson 				kg->kg_slptime++;
484b43179fbSJeff Roberson 			if (kg->kg_slptime > 1)
485b43179fbSJeff Roberson 				continue;
486b43179fbSJeff Roberson 			kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu);
487b43179fbSJeff Roberson 		      	resetpriority(kg);
488b43179fbSJeff Roberson 			FOREACH_THREAD_IN_GROUP(kg, td) {
489b43179fbSJeff Roberson 				if (td->td_priority >= PUSER) {
4901f955e2dSJulian Elischer 					sched_prio(td, kg->kg_user_pri);
491b43179fbSJeff Roberson 				}
492b43179fbSJeff Roberson 			}
493b43179fbSJeff Roberson 		} /* end of ksegrp loop */
494b43179fbSJeff Roberson 		mtx_unlock_spin(&sched_lock);
495b43179fbSJeff Roberson 	} /* end of process loop */
496b43179fbSJeff Roberson 	sx_sunlock(&allproc_lock);
497c55bbb6cSJohn Baldwin }
498c55bbb6cSJohn Baldwin 
499c55bbb6cSJohn Baldwin /*
500c55bbb6cSJohn Baldwin  * Main loop for a kthread that executes schedcpu once a second.
501c55bbb6cSJohn Baldwin  */
502c55bbb6cSJohn Baldwin static void
503e17c57b1SJeff Roberson schedcpu_thread(void)
504c55bbb6cSJohn Baldwin {
505c55bbb6cSJohn Baldwin 	int nowake;
506c55bbb6cSJohn Baldwin 
507c55bbb6cSJohn Baldwin 	for (;;) {
508c55bbb6cSJohn Baldwin 		schedcpu();
509c55bbb6cSJohn Baldwin 		tsleep(&nowake, curthread->td_priority, "-", hz);
510c55bbb6cSJohn Baldwin 	}
511b43179fbSJeff Roberson }
512b43179fbSJeff Roberson 
513b43179fbSJeff Roberson /*
514b43179fbSJeff Roberson  * Recalculate the priority of a process after it has slept for a while.
51570fca427SJohn Baldwin  * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at
51670fca427SJohn Baldwin  * least six times the loadfactor will decay kg_estcpu to zero.
517b43179fbSJeff Roberson  */
518b43179fbSJeff Roberson static void
519b43179fbSJeff Roberson updatepri(struct ksegrp *kg)
520b43179fbSJeff Roberson {
52170fca427SJohn Baldwin 	register fixpt_t loadfac;
522b43179fbSJeff Roberson 	register unsigned int newcpu;
523b43179fbSJeff Roberson 
52470fca427SJohn Baldwin 	loadfac = loadfactor(averunnable.ldavg[0]);
525b43179fbSJeff Roberson 	if (kg->kg_slptime > 5 * loadfac)
526b43179fbSJeff Roberson 		kg->kg_estcpu = 0;
527b43179fbSJeff Roberson 	else {
52870fca427SJohn Baldwin 		newcpu = kg->kg_estcpu;
52970fca427SJohn Baldwin 		kg->kg_slptime--;	/* was incremented in schedcpu() */
530b43179fbSJeff Roberson 		while (newcpu && --kg->kg_slptime)
531b43179fbSJeff Roberson 			newcpu = decay_cpu(loadfac, newcpu);
532b43179fbSJeff Roberson 		kg->kg_estcpu = newcpu;
533b43179fbSJeff Roberson 	}
534b43179fbSJeff Roberson 	resetpriority(kg);
535b43179fbSJeff Roberson }
536b43179fbSJeff Roberson 
537b43179fbSJeff Roberson /*
538b43179fbSJeff Roberson  * Compute the priority of a process when running in user mode.
539b43179fbSJeff Roberson  * Arrange to reschedule if the resulting priority is better
540b43179fbSJeff Roberson  * than that of the current process.
541b43179fbSJeff Roberson  */
542b43179fbSJeff Roberson static void
543b43179fbSJeff Roberson resetpriority(struct ksegrp *kg)
544b43179fbSJeff Roberson {
545b43179fbSJeff Roberson 	register unsigned int newpriority;
546b43179fbSJeff Roberson 	struct thread *td;
547b43179fbSJeff Roberson 
548b43179fbSJeff Roberson 	if (kg->kg_pri_class == PRI_TIMESHARE) {
549b43179fbSJeff Roberson 		newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT +
550fa885116SJulian Elischer 		    NICE_WEIGHT * (kg->kg_proc->p_nice - PRIO_MIN);
551b43179fbSJeff Roberson 		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
552b43179fbSJeff Roberson 		    PRI_MAX_TIMESHARE);
553b43179fbSJeff Roberson 		kg->kg_user_pri = newpriority;
554b43179fbSJeff Roberson 	}
555b43179fbSJeff Roberson 	FOREACH_THREAD_IN_GROUP(kg, td) {
556b43179fbSJeff Roberson 		maybe_resched(td);			/* XXXKSE silly */
557b43179fbSJeff Roberson 	}
558b43179fbSJeff Roberson }
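/*
 * A worked example of the formula above (a sketch, assuming the priority
 * layout of this era: PUSER == PRI_MIN_TIMESHARE == 160,
 * PRI_MAX_TIMESHARE == 223, PRIO_MIN == -20): a thread with
 * kg_estcpu == 80 and nice 0 gets 160 + 80 / 8 + 1 * (0 - -20) == 190.
 * A hog at the ESTCPULIM ceiling of 295 with nice +20 would compute
 * 160 + 36 + 40 == 236 and be clamped to PRI_MAX_TIMESHARE (223).
 * Larger values are worse priorities, so busy and heavily niced threads
 * sink toward the bottom of the timeshare range.
 */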
559b43179fbSJeff Roberson 
560b43179fbSJeff Roberson /* ARGSUSED */
561b43179fbSJeff Roberson static void
562b43179fbSJeff Roberson sched_setup(void *dummy)
563b43179fbSJeff Roberson {
564e17c57b1SJeff Roberson 	setup_runqs();
56570fca427SJohn Baldwin 
566b43179fbSJeff Roberson 	if (sched_quantum == 0)
567b43179fbSJeff Roberson 		sched_quantum = SCHED_QUANTUM;
568b43179fbSJeff Roberson 	hogticks = 2 * sched_quantum;
569b43179fbSJeff Roberson 
5708cbec0c8SRobert Watson 	callout_init(&roundrobin_callout, CALLOUT_MPSAFE);
571b43179fbSJeff Roberson 
572b43179fbSJeff Roberson 	/* Kick off timeout driven events by calling first time. */
573b43179fbSJeff Roberson 	roundrobin(NULL);
574ca59f152SJeff Roberson 
575ca59f152SJeff Roberson 	/* Account for thread0. */
576ca59f152SJeff Roberson 	sched_tdcnt++;
577b43179fbSJeff Roberson }
578b43179fbSJeff Roberson 
579b43179fbSJeff Roberson /* External interfaces start here */
580ed062c8dSJulian Elischer /*
581ed062c8dSJulian Elischer  * Very early in the boot, some setup of scheduler-specific
582ed062c8dSJulian Elischer  * parts of proc0 and of some scheduler resources needs to be done.
583ed062c8dSJulian Elischer  * Called from:
584ed062c8dSJulian Elischer  *  proc0_init()
585ed062c8dSJulian Elischer  */
586ed062c8dSJulian Elischer void
587ed062c8dSJulian Elischer schedinit(void)
588ed062c8dSJulian Elischer {
589ed062c8dSJulian Elischer 	/*
590ed062c8dSJulian Elischer 	 * Set up the scheduler specific parts of proc0.
591ed062c8dSJulian Elischer 	 */
592ed062c8dSJulian Elischer 	proc0.p_sched = NULL; /* XXX */
593ed062c8dSJulian Elischer 	ksegrp0.kg_sched = &kg_sched0;
594ed062c8dSJulian Elischer 	thread0.td_sched = &kse0;
595ed062c8dSJulian Elischer 	kse0.ke_thread = &thread0;
596ed062c8dSJulian Elischer 	kse0.ke_oncpu = NOCPU; /* wrong.. can we use PCPU(cpuid) yet? */
597ed062c8dSJulian Elischer 	kse0.ke_state = KES_THREAD;
598ed062c8dSJulian Elischer 	kg_sched0.skg_concurrency = 1;
599ed062c8dSJulian Elischer 	kg_sched0.skg_avail_opennings = 0; /* we are already running */
600ed062c8dSJulian Elischer }
601ed062c8dSJulian Elischer 
602b43179fbSJeff Roberson int
603b43179fbSJeff Roberson sched_runnable(void)
604b43179fbSJeff Roberson {
605e17c57b1SJeff Roberson #ifdef SMP
606e17c57b1SJeff Roberson 	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
607e17c57b1SJeff Roberson #else
608b43179fbSJeff Roberson 	return runq_check(&runq);
609e17c57b1SJeff Roberson #endif
610b43179fbSJeff Roberson }
611b43179fbSJeff Roberson 
612b43179fbSJeff Roberson int
613b43179fbSJeff Roberson sched_rr_interval(void)
614b43179fbSJeff Roberson {
615b43179fbSJeff Roberson 	if (sched_quantum == 0)
616b43179fbSJeff Roberson 		sched_quantum = SCHED_QUANTUM;
617b43179fbSJeff Roberson 	return (sched_quantum);
618b43179fbSJeff Roberson }
619b43179fbSJeff Roberson 
620b43179fbSJeff Roberson /*
621b43179fbSJeff Roberson  * We adjust the priority of the current process.  The priority of
622b43179fbSJeff Roberson  * a process gets worse as it accumulates CPU time.  The cpu usage
62370fca427SJohn Baldwin  * estimator (kg_estcpu) is increased here.  resetpriority() will
62470fca427SJohn Baldwin  * compute a different priority each time kg_estcpu increases by
625b43179fbSJeff Roberson  * INVERSE_ESTCPU_WEIGHT
626b43179fbSJeff Roberson  * (until MAXPRI is reached).  The cpu usage estimator ramps up
627b43179fbSJeff Roberson  * quite quickly when the process is running (linearly), and decays
628b43179fbSJeff Roberson  * away exponentially, at a rate which is proportionally slower when
629b43179fbSJeff Roberson  * the system is busy.  The basic principle is that the system will
630b43179fbSJeff Roberson  * 90% forget that the process used a lot of CPU time in 5 * loadav
631b43179fbSJeff Roberson  * seconds.  This causes the system to favor processes which haven't
632b43179fbSJeff Roberson  * run much recently, and to round-robin among other processes.
633b43179fbSJeff Roberson  */
634b43179fbSJeff Roberson void
6357cf90fb3SJeff Roberson sched_clock(struct thread *td)
636b43179fbSJeff Roberson {
637b43179fbSJeff Roberson 	struct ksegrp *kg;
6387cf90fb3SJeff Roberson 	struct kse *ke;
639b43179fbSJeff Roberson 
6402056d0a1SJohn Baldwin 	mtx_assert(&sched_lock, MA_OWNED);
6417cf90fb3SJeff Roberson 	kg = td->td_ksegrp;
6427cf90fb3SJeff Roberson 	ke = td->td_kse;
643f7f9e7f3SJeff Roberson 
644ad59c36bSJulian Elischer 	ke->ke_cpticks++;
645b43179fbSJeff Roberson 	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1);
646b43179fbSJeff Roberson 	if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
647b43179fbSJeff Roberson 		resetpriority(kg);
648b43179fbSJeff Roberson 		if (td->td_priority >= PUSER)
649b43179fbSJeff Roberson 			td->td_priority = kg->kg_user_pri;
650b43179fbSJeff Roberson 	}
651b43179fbSJeff Roberson }
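/*
 * Cadence of the above (a sketch; stathz == 128 is just a typical value):
 * sched_clock() is driven by the stat clock for whichever thread is
 * running, so kg_estcpu climbs by about stathz per second of CPU time and
 * resetpriority() is re-run every INVERSE_ESTCPU_WEIGHT (8) increments,
 * i.e. the user priority worsens by one step roughly every 62 ms of
 * continuous running until ESTCPULIM() or the once-a-second decay in
 * schedcpu() pushes back.
 */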
65270fca427SJohn Baldwin 
653b43179fbSJeff Roberson /*
654b43179fbSJeff Roberson  * Charge child's scheduling CPU usage to parent.
655b43179fbSJeff Roberson  *
656b43179fbSJeff Roberson  * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp.
657b43179fbSJeff Roberson  * Charge it to the ksegrp that did the wait since process estcpu is sum of
658b43179fbSJeff Roberson  * all ksegrps, this is strictly as expected.  Assume that the child process
659b43179fbSJeff Roberson  * aggregated all the estcpu into the 'built-in' ksegrp.
660b43179fbSJeff Roberson  */
661b43179fbSJeff Roberson void
66255d44f79SJulian Elischer sched_exit(struct proc *p, struct thread *td)
663f7f9e7f3SJeff Roberson {
66455d44f79SJulian Elischer 	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
66555d44f79SJulian Elischer 	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
666f7f9e7f3SJeff Roberson }
667f7f9e7f3SJeff Roberson 
668f7f9e7f3SJeff Roberson void
66955d44f79SJulian Elischer sched_exit_ksegrp(struct ksegrp *kg, struct thread *childtd)
670b43179fbSJeff Roberson {
6712056d0a1SJohn Baldwin 
6722056d0a1SJohn Baldwin 	mtx_assert(&sched_lock, MA_OWNED);
67355d44f79SJulian Elischer 	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + childtd->td_ksegrp->kg_estcpu);
674b43179fbSJeff Roberson }
675b43179fbSJeff Roberson 
676b43179fbSJeff Roberson void
677f7f9e7f3SJeff Roberson sched_exit_thread(struct thread *td, struct thread *child)
678b43179fbSJeff Roberson {
6797d5ea13fSDoug Rabson 	if ((child->td_proc->p_flag & P_NOLOAD) == 0)
680ca59f152SJeff Roberson 		sched_tdcnt--;
681f7f9e7f3SJeff Roberson }
682bcb06d59SJeff Roberson 
683f7f9e7f3SJeff Roberson void
684ed062c8dSJulian Elischer sched_fork(struct thread *td, struct thread *childtd)
685f7f9e7f3SJeff Roberson {
686ed062c8dSJulian Elischer 	sched_fork_ksegrp(td, childtd->td_ksegrp);
687ed062c8dSJulian Elischer 	sched_fork_thread(td, childtd);
688f7f9e7f3SJeff Roberson }
689f7f9e7f3SJeff Roberson 
690f7f9e7f3SJeff Roberson void
69155d44f79SJulian Elischer sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
692f7f9e7f3SJeff Roberson {
6932056d0a1SJohn Baldwin 	mtx_assert(&sched_lock, MA_OWNED);
69455d44f79SJulian Elischer 	child->kg_estcpu = td->td_ksegrp->kg_estcpu;
695f7f9e7f3SJeff Roberson }
696bcb06d59SJeff Roberson 
697f7f9e7f3SJeff Roberson void
698ed062c8dSJulian Elischer sched_fork_thread(struct thread *td, struct thread *childtd)
699f7f9e7f3SJeff Roberson {
700ed062c8dSJulian Elischer 	sched_newthread(childtd);
701b43179fbSJeff Roberson }
702b43179fbSJeff Roberson 
703b43179fbSJeff Roberson void
704fa885116SJulian Elischer sched_nice(struct proc *p, int nice)
705b43179fbSJeff Roberson {
706fa885116SJulian Elischer 	struct ksegrp *kg;
7070b5318c8SJohn Baldwin 
708fa885116SJulian Elischer 	PROC_LOCK_ASSERT(p, MA_OWNED);
7090b5318c8SJohn Baldwin 	mtx_assert(&sched_lock, MA_OWNED);
710fa885116SJulian Elischer 	p->p_nice = nice;
711fa885116SJulian Elischer 	FOREACH_KSEGRP_IN_PROC(p, kg) {
712b43179fbSJeff Roberson 		resetpriority(kg);
713b43179fbSJeff Roberson 	}
714fa885116SJulian Elischer }
715b43179fbSJeff Roberson 
716f7f9e7f3SJeff Roberson void
717f7f9e7f3SJeff Roberson sched_class(struct ksegrp *kg, int class)
718f7f9e7f3SJeff Roberson {
7192056d0a1SJohn Baldwin 	mtx_assert(&sched_lock, MA_OWNED);
720f7f9e7f3SJeff Roberson 	kg->kg_pri_class = class;
721f7f9e7f3SJeff Roberson }
722f7f9e7f3SJeff Roberson 
7231f955e2dSJulian Elischer /*
7241f955e2dSJulian Elischer  * Adjust the priority of a thread.
7251f955e2dSJulian Elischer  * This may include moving the thread within the KSEGRP,
7261f955e2dSJulian Elischer  * changing the assignment of a kse to the thread,
7271f955e2dSJulian Elischer  * and moving a KSE in the system run queue.
7281f955e2dSJulian Elischer  */
729b43179fbSJeff Roberson void
730b43179fbSJeff Roberson sched_prio(struct thread *td, u_char prio)
731b43179fbSJeff Roberson {
732b43179fbSJeff Roberson 
7332056d0a1SJohn Baldwin 	mtx_assert(&sched_lock, MA_OWNED);
734b43179fbSJeff Roberson 	if (TD_ON_RUNQ(td)) {
7351f955e2dSJulian Elischer 		adjustrunqueue(td, prio);
7361f955e2dSJulian Elischer 	} else {
7371f955e2dSJulian Elischer 		td->td_priority = prio;
738b43179fbSJeff Roberson 	}
739b43179fbSJeff Roberson }
740b43179fbSJeff Roberson 
741b43179fbSJeff Roberson void
74244f3b092SJohn Baldwin sched_sleep(struct thread *td)
743b43179fbSJeff Roberson {
7442056d0a1SJohn Baldwin 
7452056d0a1SJohn Baldwin 	mtx_assert(&sched_lock, MA_OWNED);
746b43179fbSJeff Roberson 	td->td_ksegrp->kg_slptime = 0;
74744f3b092SJohn Baldwin 	td->td_base_pri = td->td_priority;
748b43179fbSJeff Roberson }
749b43179fbSJeff Roberson 
7503389af30SJulian Elischer static void remrunqueue(struct thread *td);
7513389af30SJulian Elischer 
752b43179fbSJeff Roberson void
7533389af30SJulian Elischer sched_switch(struct thread *td, struct thread *newtd, int flags)
754b43179fbSJeff Roberson {
755b43179fbSJeff Roberson 	struct kse *ke;
7563389af30SJulian Elischer 	struct ksegrp *kg;
757b43179fbSJeff Roberson 	struct proc *p;
758b43179fbSJeff Roberson 
759b43179fbSJeff Roberson 	ke = td->td_kse;
760b43179fbSJeff Roberson 	p = td->td_proc;
761b43179fbSJeff Roberson 
7622056d0a1SJohn Baldwin 	mtx_assert(&sched_lock, MA_OWNED);
763b43179fbSJeff Roberson 
764f2f51f8aSJeff Roberson 	if ((p->p_flag & P_NOLOAD) == 0)
765ca59f152SJeff Roberson 		sched_tdcnt--;
7663389af30SJulian Elischer 	/*
7673389af30SJulian Elischer 	 * We are volunteering to switch out so we get to nominate
7673389af30SJulian Elischer 	 * a successor for the rest of our quantum.
7683389af30SJulian Elischer 	 * First, try another thread in our ksegrp, and then look for
7703389af30SJulian Elischer 	 * other ksegrps in our process.
7713389af30SJulian Elischer 	 */
7723389af30SJulian Elischer 	if (sched_followon &&
7733389af30SJulian Elischer 	    (p->p_flag & P_HADTHREADS) &&
7743389af30SJulian Elischer 	    (flags & SW_VOL) &&
7753389af30SJulian Elischer 	    newtd == NULL) {
7763389af30SJulian Elischer 		/* Let's schedule another thread from this process. */
7773389af30SJulian Elischer 		 kg = td->td_ksegrp;
7783389af30SJulian Elischer 		 if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
7793389af30SJulian Elischer 			remrunqueue(newtd);
7803389af30SJulian Elischer 			sched_kgfollowons++;
7813389af30SJulian Elischer 		 } else {
7823389af30SJulian Elischer 			FOREACH_KSEGRP_IN_PROC(p, kg) {
7833389af30SJulian Elischer 				if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
7843389af30SJulian Elischer 					sched_pfollowons++;
7853389af30SJulian Elischer 					remrunqueue(newtd);
7863389af30SJulian Elischer 					break;
7873389af30SJulian Elischer 				}
7883389af30SJulian Elischer 			}
7893389af30SJulian Elischer 		}
7903389af30SJulian Elischer 	}
7913389af30SJulian Elischer 
792ed062c8dSJulian Elischer 	/*
793ed062c8dSJulian Elischer 	 * The thread we are about to run needs to be counted as if it had been
794ed062c8dSJulian Elischer 	 * added to the run queue and selected.
79514f0e2e9SJulian Elischer 	 * It came from one of:
79614f0e2e9SJulian Elischer 	 * A preemption
79714f0e2e9SJulian Elischer 	 * An upcall
79814f0e2e9SJulian Elischer 	 * A followon
79914f0e2e9SJulian Elischer 	 * Do this before saving curthread so that the slot count
80014f0e2e9SJulian Elischer 	 * doesn't give an overly optimistic view when that happens.
801ed062c8dSJulian Elischer 	 */
802ed062c8dSJulian Elischer 	if (newtd) {
803b2578c6cSJulian Elischer 		KASSERT((newtd->td_inhibitors == 0),
804b2578c6cSJulian Elischer 			("trying to run inhibited thread"));
805ed062c8dSJulian Elischer 		newtd->td_ksegrp->kg_avail_opennings--;
806ed062c8dSJulian Elischer 		newtd->td_kse->ke_flags |= KEF_DIDRUN;
807ed062c8dSJulian Elischer         	TD_SET_RUNNING(newtd);
8080fe38d47SJulian Elischer 		if ((newtd->td_proc->p_flag & P_NOLOAD) == 0)
8090fe38d47SJulian Elischer 			sched_tdcnt++;
810ed062c8dSJulian Elischer 	}
8113389af30SJulian Elischer 
812060563ecSJulian Elischer 	td->td_lastcpu = td->td_oncpu;
81352eb8464SJohn Baldwin 	td->td_flags &= ~TDF_NEEDRESCHED;
81452eb8464SJohn Baldwin 	td->td_pflags &= ~TDP_OWEPREEMPT;
815ca59f152SJeff Roberson 	td->td_oncpu = NOCPU;
816b43179fbSJeff Roberson 	/*
817b43179fbSJeff Roberson 	 * At the last moment, if this thread is still marked RUNNING,
818b43179fbSJeff Roberson 	 * then put it back on the run queue as it has not been suspended
819bf0acc27SJohn Baldwin 	 * or stopped or anything else similar.  We never put the idle
820bf0acc27SJohn Baldwin 	 * threads on the run queue, however.
821b43179fbSJeff Roberson 	 */
822bf0acc27SJohn Baldwin 	if (td == PCPU_GET(idlethread))
823bf0acc27SJohn Baldwin 		TD_SET_CAN_RUN(td);
824ed062c8dSJulian Elischer 	else {
825ed062c8dSJulian Elischer 		td->td_ksegrp->kg_avail_opennings++;
826ed062c8dSJulian Elischer 		if (TD_IS_RUNNING(td)) {
827b43179fbSJeff Roberson 			/* Put us back on the run queue (kse and all). */
8282630e4c9SJulian Elischer 			setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING);
829ed062c8dSJulian Elischer 		} else if (p->p_flag & P_HADTHREADS) {
830b43179fbSJeff Roberson 			/*
831b43179fbSJeff Roberson 			 * We will not be on the run queue. So we must be
832b43179fbSJeff Roberson 			 * sleeping or similar. As it's available,
833b43179fbSJeff Roberson 			 * someone else can use the KSE if they need it.
834b43179fbSJeff Roberson 			 */
835ed062c8dSJulian Elischer 			slot_fill(td->td_ksegrp);
836ed062c8dSJulian Elischer 		}
837b43179fbSJeff Roberson 	}
838bf0acc27SJohn Baldwin 	if (newtd == NULL)
839ae53b483SJeff Roberson 		newtd = choosethread();
840ae53b483SJeff Roberson 	if (td != newtd)
841ae53b483SJeff Roberson 		cpu_switch(td, newtd);
842ae53b483SJeff Roberson 	sched_lock.mtx_lock = (uintptr_t)td;
843ae53b483SJeff Roberson 	td->td_oncpu = PCPU_GET(cpuid);
844b43179fbSJeff Roberson }
845b43179fbSJeff Roberson 
846b43179fbSJeff Roberson void
847b43179fbSJeff Roberson sched_wakeup(struct thread *td)
848b43179fbSJeff Roberson {
849b43179fbSJeff Roberson 	struct ksegrp *kg;
850b43179fbSJeff Roberson 
8512056d0a1SJohn Baldwin 	mtx_assert(&sched_lock, MA_OWNED);
852b43179fbSJeff Roberson 	kg = td->td_ksegrp;
853b43179fbSJeff Roberson 	if (kg->kg_slptime > 1)
854b43179fbSJeff Roberson 		updatepri(kg);
855b43179fbSJeff Roberson 	kg->kg_slptime = 0;
8562630e4c9SJulian Elischer 	setrunqueue(td, SRQ_BORING);
857b43179fbSJeff Roberson }
858b43179fbSJeff Roberson 
85937c28a02SJulian Elischer #ifdef SMP
86082a1dfc1SJulian Elischer /* Enable HTT_2 if you have a 2-way HTT cpu. */
86182a1dfc1SJulian Elischer static int
86282a1dfc1SJulian Elischer forward_wakeup(int  cpunum)
86382a1dfc1SJulian Elischer {
86482a1dfc1SJulian Elischer 	cpumask_t map, me, dontuse;
86582a1dfc1SJulian Elischer 	cpumask_t map2;
86682a1dfc1SJulian Elischer 	struct pcpu *pc;
86782a1dfc1SJulian Elischer 	cpumask_t id, map3;
86882a1dfc1SJulian Elischer 
86982a1dfc1SJulian Elischer 	mtx_assert(&sched_lock, MA_OWNED);
87082a1dfc1SJulian Elischer 
871ed062c8dSJulian Elischer 	CTR0(KTR_RUNQ, "forward_wakeup()");
87282a1dfc1SJulian Elischer 
87382a1dfc1SJulian Elischer 	if ((!forward_wakeup_enabled) ||
87482a1dfc1SJulian Elischer 	     (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
87582a1dfc1SJulian Elischer 		return (0);
87682a1dfc1SJulian Elischer 	if (!smp_started || cold || panicstr)
87782a1dfc1SJulian Elischer 		return (0);
87882a1dfc1SJulian Elischer 
87982a1dfc1SJulian Elischer 	forward_wakeups_requested++;
88082a1dfc1SJulian Elischer 
88182a1dfc1SJulian Elischer /*
88282a1dfc1SJulian Elischer  * check the idle mask we received against what we calculated before
88382a1dfc1SJulian Elischer  * in the old version.
88482a1dfc1SJulian Elischer  */
88582a1dfc1SJulian Elischer 	me = PCPU_GET(cpumask);
88682a1dfc1SJulian Elischer 	/*
88782a1dfc1SJulian Elischer 	 * Don't bother if we should be doing it ourselves.
88882a1dfc1SJulian Elischer 	 */
88982a1dfc1SJulian Elischer 	if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum)))
89082a1dfc1SJulian Elischer 		return (0);
89182a1dfc1SJulian Elischer 
89282a1dfc1SJulian Elischer 	dontuse = me | stopped_cpus | hlt_cpus_mask;
89382a1dfc1SJulian Elischer 	map3 = 0;
89482a1dfc1SJulian Elischer 	if (forward_wakeup_use_loop) {
89582a1dfc1SJulian Elischer 		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
89682a1dfc1SJulian Elischer 			id = pc->pc_cpumask;
89782a1dfc1SJulian Elischer 			if ( (id & dontuse) == 0 &&
89882a1dfc1SJulian Elischer 			    pc->pc_curthread == pc->pc_idlethread) {
89982a1dfc1SJulian Elischer 				map3 |= id;
90082a1dfc1SJulian Elischer 			}
90182a1dfc1SJulian Elischer 		}
90282a1dfc1SJulian Elischer 	}
90382a1dfc1SJulian Elischer 
90482a1dfc1SJulian Elischer 	if (forward_wakeup_use_mask) {
90582a1dfc1SJulian Elischer 		map = 0;
90682a1dfc1SJulian Elischer 		map = idle_cpus_mask & ~dontuse;
90782a1dfc1SJulian Elischer 
90882a1dfc1SJulian Elischer 		/* If they are both on, compare and use loop if different */
90982a1dfc1SJulian Elischer 		if (forward_wakeup_use_loop) {
91082a1dfc1SJulian Elischer 			if (map != map3) {
91182a1dfc1SJulian Elischer 				printf("map (%02X) != map3 (%02X)\n",
91282a1dfc1SJulian Elischer 						map, map3);
91382a1dfc1SJulian Elischer 				map = map3;
91482a1dfc1SJulian Elischer 			}
91582a1dfc1SJulian Elischer 		}
91682a1dfc1SJulian Elischer 	} else {
91782a1dfc1SJulian Elischer 		map = map3;
91882a1dfc1SJulian Elischer 	}
91982a1dfc1SJulian Elischer 	/* If we only allow a specific CPU, then mask off all the others */
92082a1dfc1SJulian Elischer 	if (cpunum != NOCPU) {
92182a1dfc1SJulian Elischer 		KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
92282a1dfc1SJulian Elischer 		map &= (1 << cpunum);
92382a1dfc1SJulian Elischer 	} else {
92482a1dfc1SJulian Elischer 		/* Try to choose an idle die. */
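		/*
		 * A note on the mask trick below (assuming 2-way HTT with
		 * sibling logical CPUs in adjacent mask bits, as the htt2
		 * knob presumes): (map & (map >> 1)) leaves a bit set only
		 * where both siblings of a die are idle, and masking with
		 * 0x5555 keeps one bit per die, so we prefer kicking a die
		 * that is wholly idle.
		 */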
92582a1dfc1SJulian Elischer 		if (forward_wakeup_use_htt) {
92682a1dfc1SJulian Elischer 			map2 =  (map & (map >> 1)) & 0x5555;
92782a1dfc1SJulian Elischer 			if (map2) {
92882a1dfc1SJulian Elischer 				map = map2;
92982a1dfc1SJulian Elischer 			}
93082a1dfc1SJulian Elischer 		}
93182a1dfc1SJulian Elischer 
93282a1dfc1SJulian Elischer 		/* set only one bit */
93382a1dfc1SJulian Elischer 		if (forward_wakeup_use_single) {
93482a1dfc1SJulian Elischer 			map = map & ((~map) + 1);
93582a1dfc1SJulian Elischer 		}
93682a1dfc1SJulian Elischer 	}
93782a1dfc1SJulian Elischer 	if (map) {
93882a1dfc1SJulian Elischer 		forward_wakeups_delivered++;
93982a1dfc1SJulian Elischer 		ipi_selected(map, IPI_AST);
94082a1dfc1SJulian Elischer 		return (1);
94182a1dfc1SJulian Elischer 	}
94282a1dfc1SJulian Elischer 	if (cpunum == NOCPU)
94382a1dfc1SJulian Elischer 		printf("forward_wakeup: Idle processor not found\n");
94482a1dfc1SJulian Elischer 	return (0);
94582a1dfc1SJulian Elischer }
94637c28a02SJulian Elischer #endif
94782a1dfc1SJulian Elischer 
948b43179fbSJeff Roberson void
9492630e4c9SJulian Elischer sched_add(struct thread *td, int flags)
950b43179fbSJeff Roberson {
9517cf90fb3SJeff Roberson 	struct kse *ke;
9526804a3abSJulian Elischer #ifdef SMP
9536804a3abSJulian Elischer 	int forwarded = 0;
9546804a3abSJulian Elischer 	int cpu;
9556804a3abSJulian Elischer #endif
9567cf90fb3SJeff Roberson 
9577cf90fb3SJeff Roberson 	ke = td->td_kse;
958b43179fbSJeff Roberson 	mtx_assert(&sched_lock, MA_OWNED);
959b43179fbSJeff Roberson 	KASSERT(ke->ke_state != KES_ONRUNQ,
9605a2b158dSJeff Roberson 	    ("sched_add: kse %p (%s) already in run queue", ke,
961b43179fbSJeff Roberson 	    ke->ke_proc->p_comm));
962b43179fbSJeff Roberson 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
9635a2b158dSJeff Roberson 	    ("sched_add: process swapped out"));
9640c0b25aeSJohn Baldwin 
9650c0b25aeSJohn Baldwin #ifdef SMP
966e17c57b1SJeff Roberson 	if (KSE_CAN_MIGRATE(ke)) {
9676804a3abSJulian Elischer 		CTR2(KTR_RUNQ,
9686804a3abSJulian Elischer 		    "sched_add: adding kse:%p (td:%p) to gbl runq", ke, td);
9696804a3abSJulian Elischer 		cpu = NOCPU;
970e17c57b1SJeff Roberson 		ke->ke_runq = &runq;
971e17c57b1SJeff Roberson 	} else {
972e17c57b1SJeff Roberson 		if (!SKE_RUNQ_PCPU(ke))
9736804a3abSJulian Elischer 			ke->ke_runq = &runq_pcpu[(cpu = PCPU_GET(cpuid))];
9746804a3abSJulian Elischer 		else
9756804a3abSJulian Elischer 			cpu = td->td_lastcpu;
9766804a3abSJulian Elischer 		CTR3(KTR_RUNQ,
9776804a3abSJulian Elischer 		    "sched_add: Put kse:%p(td:%p) on cpu%d runq", ke, td, cpu);
978e17c57b1SJeff Roberson 	}
979e17c57b1SJeff Roberson #else
980732d9528SJulian Elischer 	CTR2(KTR_RUNQ, "sched_add: adding kse:%p (td:%p) to runq", ke, td);
981e17c57b1SJeff Roberson 	ke->ke_runq = &runq;
9826804a3abSJulian Elischer 
983e17c57b1SJeff Roberson #endif
9846804a3abSJulian Elischer 	/*
9856804a3abSJulian Elischer 	 * If we are yielding (on the way out anyhow)
9866804a3abSJulian Elischer 	 * or the thread being saved is US,
9876804a3abSJulian Elischer 	 * then don't try to be smart about preemption
9886804a3abSJulian Elischer 	 * or kicking off another CPU,
9896804a3abSJulian Elischer 	 * as it won't help and may hinder.
9906804a3abSJulian Elischer 	 * In the YIELDING case, we are about to run whoever is
9916804a3abSJulian Elischer 	 * being put in the queue anyhow, and in the
9926804a3abSJulian Elischer 	 * OURSELF case, we are putting ourselves on the run queue,
9936804a3abSJulian Elischer 	 * which also only happens when we are about to yield.
9946804a3abSJulian Elischer 	 */
9956804a3abSJulian Elischer 	if ((flags & SRQ_YIELDING) == 0) {
9966804a3abSJulian Elischer #ifdef SMP
9976804a3abSJulian Elischer 		cpumask_t me = PCPU_GET(cpumask);
9986804a3abSJulian Elischer 		int idle = idle_cpus_mask & me;
9996804a3abSJulian Elischer 		/*
10006804a3abSJulian Elischer 		 * Only try to kick off another CPU if
10016804a3abSJulian Elischer 		 * the thread is unpinned
10026804a3abSJulian Elischer 		 * or pinned to another cpu,
10036804a3abSJulian Elischer 		 * and there are other available and idle CPUs.
10046a574b2aSJulian Elischer 		 * If we are idle, or it's an interrupt,
10056a574b2aSJulian Elischer 		 * then skip straight to preemption.
10066804a3abSJulian Elischer 		 */
10076a574b2aSJulian Elischer 		if ( (! idle) && ((flags & SRQ_INTR) == 0) &&
10086804a3abSJulian Elischer 		    (idle_cpus_mask & ~(hlt_cpus_mask | me)) &&
10096804a3abSJulian Elischer 		    ( KSE_CAN_MIGRATE(ke) ||
10106804a3abSJulian Elischer 		      ke->ke_runq != &runq_pcpu[PCPU_GET(cpuid)])) {
10116804a3abSJulian Elischer 			forwarded = forward_wakeup(cpu);
10126804a3abSJulian Elischer 		}
10136804a3abSJulian Elischer 		/*
10146804a3abSJulian Elischer 		 * If we failed to kick off another cpu, then look to
10156804a3abSJulian Elischer 		 * see if we should preempt this CPU. Only allow this
10166804a3abSJulian Elischer 		 * if it is not pinned or IS pinned to this CPU.
10176804a3abSJulian Elischer 		 * If we are the idle thread, we also try to preempt,
10186804a3abSJulian Elischer 		 * as it will be quicker and, being idle, we won't
10196804a3abSJulian Elischer 		 * lose in doing so.
10206804a3abSJulian Elischer 		 */
10216804a3abSJulian Elischer 		if ((!forwarded) &&
10226804a3abSJulian Elischer 		    (ke->ke_runq == &runq ||
10236804a3abSJulian Elischer 		     ke->ke_runq == &runq_pcpu[PCPU_GET(cpuid)]))
10246804a3abSJulian Elischer #endif
10256804a3abSJulian Elischer 
10266804a3abSJulian Elischer 		{
10276804a3abSJulian Elischer 			if (maybe_preempt(td))
10286804a3abSJulian Elischer 				return;
10296804a3abSJulian Elischer 		}
10306804a3abSJulian Elischer 	}
1031f2f51f8aSJeff Roberson 	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
1032ca59f152SJeff Roberson 		sched_tdcnt++;
103314f0e2e9SJulian Elischer 	td->td_ksegrp->kg_avail_opennings--;
1034e17c57b1SJeff Roberson 	runq_add(ke->ke_runq, ke);
10350f54f482SJulian Elischer 	ke->ke_ksegrp->kg_runq_kses++;
10360f54f482SJulian Elischer 	ke->ke_state = KES_ONRUNQ;
10376942d433SJohn Baldwin 	maybe_resched(td);
1038b43179fbSJeff Roberson }
1039b43179fbSJeff Roberson 
1040b43179fbSJeff Roberson void
10417cf90fb3SJeff Roberson sched_rem(struct thread *td)
1042b43179fbSJeff Roberson {
10437cf90fb3SJeff Roberson 	struct kse *ke;
10447cf90fb3SJeff Roberson 
10457cf90fb3SJeff Roberson 	ke = td->td_kse;
1046b43179fbSJeff Roberson 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
10475a2b158dSJeff Roberson 	    ("sched_rem: process swapped out"));
10485a2b158dSJeff Roberson 	KASSERT((ke->ke_state == KES_ONRUNQ),
10495a2b158dSJeff Roberson 	    ("sched_rem: KSE not on run queue"));
1050b43179fbSJeff Roberson 	mtx_assert(&sched_lock, MA_OWNED);
1051b43179fbSJeff Roberson 
1052f2f51f8aSJeff Roberson 	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
1053ca59f152SJeff Roberson 		sched_tdcnt--;
105414f0e2e9SJulian Elischer 	td->td_ksegrp->kg_avail_opennings++;
1055ad59c36bSJulian Elischer 	runq_remove(ke->ke_runq, ke);
1056e17c57b1SJeff Roberson 
1057b43179fbSJeff Roberson 	ke->ke_state = KES_THREAD;
105814f0e2e9SJulian Elischer 	td->td_ksegrp->kg_runq_kses--;
1059b43179fbSJeff Roberson }
1060b43179fbSJeff Roberson 
106114f0e2e9SJulian Elischer /*
106214f0e2e9SJulian Elischer  * Select threads to run.
106314f0e2e9SJulian Elischer  * Notice that the running threads still consume a slot.
106414f0e2e9SJulian Elischer  */
1065b43179fbSJeff Roberson struct kse *
1066b43179fbSJeff Roberson sched_choose(void)
1067b43179fbSJeff Roberson {
1068b43179fbSJeff Roberson 	struct kse *ke;
1069e17c57b1SJeff Roberson 	struct runq *rq;
1070b43179fbSJeff Roberson 
1071e17c57b1SJeff Roberson #ifdef SMP
1072e17c57b1SJeff Roberson 	struct kse *kecpu;
1073e17c57b1SJeff Roberson 
1074e17c57b1SJeff Roberson 	rq = &runq;
1075b43179fbSJeff Roberson 	ke = runq_choose(&runq);
1076e17c57b1SJeff Roberson 	kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
1077e17c57b1SJeff Roberson 
1078e17c57b1SJeff Roberson 	if (ke == NULL ||
1079e17c57b1SJeff Roberson 	    (kecpu != NULL &&
1080e17c57b1SJeff Roberson 	     kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) {
1081732d9528SJulian Elischer 		CTR2(KTR_RUNQ, "choosing kse %p from pcpu runq %d", kecpu,
1082e17c57b1SJeff Roberson 		     PCPU_GET(cpuid));
1083e17c57b1SJeff Roberson 		ke = kecpu;
1084e17c57b1SJeff Roberson 		rq = &runq_pcpu[PCPU_GET(cpuid)];
1085e17c57b1SJeff Roberson 	} else {
1086732d9528SJulian Elischer 		CTR1(KTR_RUNQ, "choosing kse %p from main runq", ke);
1087e17c57b1SJeff Roberson 	}
1088e17c57b1SJeff Roberson 
1089e17c57b1SJeff Roberson #else
1090e17c57b1SJeff Roberson 	rq = &runq;
1091e17c57b1SJeff Roberson 	ke = runq_choose(&runq);
1092e17c57b1SJeff Roberson #endif
1093b43179fbSJeff Roberson 
1094b43179fbSJeff Roberson 	if (ke != NULL) {
1095e17c57b1SJeff Roberson 		runq_remove(rq, ke);
1096b43179fbSJeff Roberson 		ke->ke_state = KES_THREAD;
10970f54f482SJulian Elischer 		ke->ke_ksegrp->kg_runq_kses--;
1098b43179fbSJeff Roberson 
1099b43179fbSJeff Roberson 		KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
11005a2b158dSJeff Roberson 		    ("sched_choose: process swapped out"));
1101b43179fbSJeff Roberson 	}
1102b43179fbSJeff Roberson 	return (ke);
1103b43179fbSJeff Roberson }
1104b43179fbSJeff Roberson 
1105b43179fbSJeff Roberson void
1106b43179fbSJeff Roberson sched_userret(struct thread *td)
1107b43179fbSJeff Roberson {
1108b43179fbSJeff Roberson 	struct ksegrp *kg;
1109b43179fbSJeff Roberson 	/*
1110b43179fbSJeff Roberson 	 * XXX we cheat slightly on the locking here to avoid locking in
1111b43179fbSJeff Roberson 	 * the usual case.  Setting td_priority here is essentially an
1112b43179fbSJeff Roberson 	 * incomplete workaround for not setting it properly elsewhere.
1113b43179fbSJeff Roberson 	 * Now that some interrupt handlers are threads, not setting it
1114b43179fbSJeff Roberson 	 * properly elsewhere can clobber it in the window between setting
1115b43179fbSJeff Roberson 	 * it here and returning to user mode, so don't waste time setting
1116b43179fbSJeff Roberson 	 * it perfectly here.
1117b43179fbSJeff Roberson 	 */
1118b43179fbSJeff Roberson 	kg = td->td_ksegrp;
1119b43179fbSJeff Roberson 	if (td->td_priority != kg->kg_user_pri) {
1120b43179fbSJeff Roberson 		mtx_lock_spin(&sched_lock);
1121b43179fbSJeff Roberson 		td->td_priority = kg->kg_user_pri;
1122b43179fbSJeff Roberson 		mtx_unlock_spin(&sched_lock);
1123b43179fbSJeff Roberson 	}
1124b43179fbSJeff Roberson }
1125de028f5aSJeff Roberson 
1126e17c57b1SJeff Roberson void
1127e17c57b1SJeff Roberson sched_bind(struct thread *td, int cpu)
1128e17c57b1SJeff Roberson {
1129e17c57b1SJeff Roberson 	struct kse *ke;
1130e17c57b1SJeff Roberson 
1131e17c57b1SJeff Roberson 	mtx_assert(&sched_lock, MA_OWNED);
1132e17c57b1SJeff Roberson 	KASSERT(TD_IS_RUNNING(td),
1133e17c57b1SJeff Roberson 	    ("sched_bind: cannot bind non-running thread"));
1134e17c57b1SJeff Roberson 
1135e17c57b1SJeff Roberson 	ke = td->td_kse;
1136e17c57b1SJeff Roberson 
1137e17c57b1SJeff Roberson 	ke->ke_flags |= KEF_BOUND;
1138e17c57b1SJeff Roberson #ifdef SMP
1139e17c57b1SJeff Roberson 	ke->ke_runq = &runq_pcpu[cpu];
1140e17c57b1SJeff Roberson 	if (PCPU_GET(cpuid) == cpu)
1141e17c57b1SJeff Roberson 		return;
1142e17c57b1SJeff Roberson 
1143e17c57b1SJeff Roberson 	ke->ke_state = KES_THREAD;
1144e17c57b1SJeff Roberson 
1145bf0acc27SJohn Baldwin 	mi_switch(SW_VOL, NULL);
1146e17c57b1SJeff Roberson #endif
1147e17c57b1SJeff Roberson }
1148e17c57b1SJeff Roberson 
1149e17c57b1SJeff Roberson void
1150e17c57b1SJeff Roberson sched_unbind(struct thread* td)
1151e17c57b1SJeff Roberson {
1152e17c57b1SJeff Roberson 	mtx_assert(&sched_lock, MA_OWNED);
1153e17c57b1SJeff Roberson 	td->td_kse->ke_flags &= ~KEF_BOUND;
1154e17c57b1SJeff Roberson }
1155e17c57b1SJeff Roberson 
1156de028f5aSJeff Roberson int
1157ca59f152SJeff Roberson sched_load(void)
1158ca59f152SJeff Roberson {
1159ca59f152SJeff Roberson 	return (sched_tdcnt);
1160ca59f152SJeff Roberson }
1161ca59f152SJeff Roberson 
1162ca59f152SJeff Roberson int
1163de028f5aSJeff Roberson sched_sizeof_ksegrp(void)
1164de028f5aSJeff Roberson {
1165ed062c8dSJulian Elischer 	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1166de028f5aSJeff Roberson }
1167de028f5aSJeff Roberson int
1168de028f5aSJeff Roberson sched_sizeof_proc(void)
1169de028f5aSJeff Roberson {
1170de028f5aSJeff Roberson 	return (sizeof(struct proc));
1171de028f5aSJeff Roberson }
1172de028f5aSJeff Roberson int
1173de028f5aSJeff Roberson sched_sizeof_thread(void)
1174de028f5aSJeff Roberson {
1175ed062c8dSJulian Elischer 	return (sizeof(struct thread) + sizeof(struct kse));
1176de028f5aSJeff Roberson }
117779acfc49SJeff Roberson 
117879acfc49SJeff Roberson fixpt_t
11797cf90fb3SJeff Roberson sched_pctcpu(struct thread *td)
118079acfc49SJeff Roberson {
118155f2099aSJeff Roberson 	struct kse *ke;
118255f2099aSJeff Roberson 
118355f2099aSJeff Roberson 	ke = td->td_kse;
118455f2099aSJeff Roberson 	return (ke->ke_pctcpu);
118779acfc49SJeff Roberson }
1188ed062c8dSJulian Elischer #define KERN_SWITCH_INCLUDE 1
1189ed062c8dSJulian Elischer #include "kern/kern_switch.c"
1190