xref: /freebsd/sys/kern/sched_4bsd.c (revision d51f8d20247c373ab2c2db8aed596b8ac44e7a34)
1 /*-
2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include "opt_hwpmc_hooks.h"
39 #include "opt_sched.h"
40 #include "opt_kdtrace.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/cpuset.h>
45 #include <sys/kernel.h>
46 #include <sys/ktr.h>
47 #include <sys/lock.h>
48 #include <sys/kthread.h>
49 #include <sys/mutex.h>
50 #include <sys/proc.h>
51 #include <sys/resourcevar.h>
52 #include <sys/sched.h>
53 #include <sys/smp.h>
54 #include <sys/sysctl.h>
55 #include <sys/sx.h>
56 #include <sys/turnstile.h>
57 #include <sys/umtx.h>
58 #include <machine/pcb.h>
59 #include <machine/smp.h>
60 
61 #ifdef HWPMC_HOOKS
62 #include <sys/pmckern.h>
63 #endif
64 
65 #ifdef KDTRACE_HOOKS
66 #include <sys/dtrace_bsd.h>
67 int				dtrace_vtime_active;
68 dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;
69 #endif
70 
71 /*
72  * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
73  * the range 100-256 Hz (approximately).
74  */
75 #define	ESTCPULIM(e) \
76     min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
77     RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
78 #ifdef SMP
79 #define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
80 #else
81 #define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
82 #endif
83 #define	NICE_WEIGHT		1	/* Priorities per nice level. */
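/*
 * Worked example (assuming the stock values PRIO_MIN == -20, PRIO_MAX == 20
 * and RQ_PPQ == 4): on UP, ESTCPULIM() clamps estcpu to
 * 8 * (1 * 40 - 4) + 8 - 1 = 295, so the estcpu term computed in
 * resetpriority() adds at most 295 / 8 = 36 priority levels, i.e. one
 * run queue's worth (RQ_PPQ) less than the full nice range.
 */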
84 
85 #define	TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
86 
87 /*
88  * The schedulable entity that runs a context.
89  * This is an extension to the thread structure and is tailored to
90  * the requirements of this scheduler.
91  */
92 struct td_sched {
93 	fixpt_t		ts_pctcpu;	/* (j) %cpu during p_swtime. */
94 	int		ts_cpticks;	/* (j) Ticks of cpu time. */
95 	int		ts_slptime;	/* (j) Seconds !RUNNING. */
96 	int		ts_flags;
97 	struct runq	*ts_runq;	/* runq the thread is currently on */
98 #ifdef KTR
99 	char		ts_name[TS_NAME_LEN];
100 #endif
101 };
102 
103 /* flags kept in td_flags */
104 #define TDF_DIDRUN	TDF_SCHED0	/* thread actually ran. */
105 #define TDF_BOUND	TDF_SCHED1	/* Bound to one CPU. */
106 
107 /* flags kept in ts_flags */
108 #define	TSF_AFFINITY	0x0001		/* Has a non-"full" CPU set. */
109 
110 #define SKE_RUNQ_PCPU(ts)						\
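/*
 * True once ts_runq has been set and refers to one of the per-CPU run
 * queues rather than the global runq.
 */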
111     ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)
112 
113 #define	THREAD_CAN_SCHED(td, cpu)	\
114     CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
115 
116 static struct td_sched td_sched0;
117 struct mtx sched_lock;
118 
119 static int	sched_tdcnt;	/* Total runnable threads in the system. */
120 static int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
121 #define	SCHED_QUANTUM	(hz / 10)	/* Default sched quantum */
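/*
 * Sketch of the arithmetic, assuming the common setting hz = 1000:
 * SCHED_QUANTUM is then 100 ticks, i.e. a 100 ms round-robin slice, and
 * hogticks stays at twice that, giving long-running kernel code a larger
 * budget before it is expected to yield voluntarily.
 */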
122 
123 static void	setup_runqs(void);
124 static void	schedcpu(void);
125 static void	schedcpu_thread(void);
126 static void	sched_priority(struct thread *td, u_char prio);
127 static void	sched_setup(void *dummy);
128 static void	maybe_resched(struct thread *td);
129 static void	updatepri(struct thread *td);
130 static void	resetpriority(struct thread *td);
131 static void	resetpriority_thread(struct thread *td);
132 #ifdef SMP
133 static int	sched_pickcpu(struct thread *td);
134 static int	forward_wakeup(int cpunum);
135 static void	kick_other_cpu(int pri, int cpuid);
136 #endif
137 
138 static struct kproc_desc sched_kp = {
139         "schedcpu",
140         schedcpu_thread,
141         NULL
142 };
143 SYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start,
144     &sched_kp);
145 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
146 
147 /*
148  * Global run queue.
149  */
150 static struct runq runq;
151 
152 #ifdef SMP
153 /*
154  * Per-CPU run queues
155  */
156 static struct runq runq_pcpu[MAXCPU];
157 long runq_length[MAXCPU];
158 #endif
159 
160 struct pcpuidlestat {
161 	u_int idlecalls;
162 	u_int oldidlecalls;
163 };
164 static DPCPU_DEFINE(struct pcpuidlestat, idlestat);
165 
166 static void
167 setup_runqs(void)
168 {
169 #ifdef SMP
170 	int i;
171 
172 	for (i = 0; i < MAXCPU; ++i)
173 		runq_init(&runq_pcpu[i]);
174 #endif
175 
176 	runq_init(&runq);
177 }
178 
179 static int
180 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
181 {
182 	int error, new_val;
183 
184 	new_val = sched_quantum * tick;
185 	error = sysctl_handle_int(oidp, &new_val, 0, req);
186 	if (error != 0 || req->newptr == NULL)
187 		return (error);
188 	if (new_val < tick)
189 		return (EINVAL);
190 	sched_quantum = new_val / tick;
191 	hogticks = 2 * sched_quantum;
192 	return (0);
193 }
194 
195 SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");
196 
197 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
198     "Scheduler name");
199 
200 SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
201     0, sizeof sched_quantum, sysctl_kern_quantum, "I",
202     "Roundrobin scheduling quantum in microseconds");
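/*
 * Usage example (illustrative values): with hz = 1000, setting
 * kern.sched.quantum to 50000 requests a 50 ms quantum; the handler above
 * converts microseconds to ticks (50000 / tick == 50) and rejects any
 * value smaller than one tick.
 */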
203 
204 #ifdef SMP
205 /* Enable forwarding of wakeups to all other cpus */
206 SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP");
207 
208 static int runq_fuzz = 1;
209 SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
210 
211 static int forward_wakeup_enabled = 1;
212 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
213 	   &forward_wakeup_enabled, 0,
214 	   "Forwarding of wakeup to idle CPUs");
215 
216 static int forward_wakeups_requested = 0;
217 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
218 	   &forward_wakeups_requested, 0,
219 	   "Requests for Forwarding of wakeup to idle CPUs");
220 
221 static int forward_wakeups_delivered = 0;
222 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
223 	   &forward_wakeups_delivered, 0,
224 	   "Completed Forwarding of wakeup to idle CPUs");
225 
226 static int forward_wakeup_use_mask = 1;
227 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
228 	   &forward_wakeup_use_mask, 0,
229 	   "Use the mask of idle cpus");
230 
231 static int forward_wakeup_use_loop = 0;
232 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
233 	   &forward_wakeup_use_loop, 0,
234 	   "Use a loop to find idle cpus");
235 
236 static int forward_wakeup_use_single = 0;
237 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW,
238 	   &forward_wakeup_use_single, 0,
239 	   "Only signal one idle cpu");
240 
241 static int forward_wakeup_use_htt = 0;
242 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW,
243 	   &forward_wakeup_use_htt, 0,
244 	   "account for htt");
245 
246 #endif
247 #if 0
248 static int sched_followon = 0;
249 SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
250 	   &sched_followon, 0,
251 	   "allow threads to share a quantum");
252 #endif
253 
254 static __inline void
255 sched_load_add(void)
256 {
257 
258 	sched_tdcnt++;
259 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
260 }
261 
262 static __inline void
263 sched_load_rem(void)
264 {
265 
266 	sched_tdcnt--;
267 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
268 }
269 /*
270  * Arrange to reschedule if necessary, taking the priorities and
271  * schedulers into account.
272  */
273 static void
274 maybe_resched(struct thread *td)
275 {
276 
277 	THREAD_LOCK_ASSERT(td, MA_OWNED);
278 	if (td->td_priority < curthread->td_priority)
279 		curthread->td_flags |= TDF_NEEDRESCHED;
280 }
281 
282 /*
283  * This function is called when a thread is about to be put on a run queue
284  * because it has been made runnable or its priority has been adjusted.  It
285  * determines whether we should immediately preempt to the new thread.  If
286  * so, it switches to it and eventually returns true.  If not, it returns
287  * false so that the caller may place the thread on an appropriate run queue.
288  */
289 int
290 maybe_preempt(struct thread *td)
291 {
292 #ifdef PREEMPTION
293 	struct thread *ctd;
294 	int cpri, pri;
295 
296 	/*
297 	 * The new thread should not preempt the current thread if any of the
298 	 * following conditions are true:
299 	 *
300 	 *  - The kernel is in the throes of crashing (panicstr).
301 	 *  - The current thread has a higher (numerically lower) or
302 	 *    equivalent priority.  Note that this prevents curthread from
303 	 *    trying to preempt to itself.
304 	 *  - It is too early in the boot for context switches (cold is set).
305 	 *  - The current thread has an inhibitor set or is in the process of
306 	 *    exiting.  In this case, the current thread is about to switch
307 	 *    out anyway, so there's no point in preempting.  If we did,
308 	 *    the current thread would not be properly resumed as well, so
309 	 *    just avoid that whole landmine.
310 	 *  - The new thread's priority is not a realtime priority, the
311 	 *    current thread's priority is not an idle priority, and
312 	 *    FULL_PREEMPTION is disabled.
313 	 *
314 	 * If all of these conditions are false, but the current thread is in
315 	 * a nested critical section, then we have to defer the preemption
316 	 * until we exit the critical section.  Otherwise, switch immediately
317 	 * to the new thread.
318 	 */
319 	ctd = curthread;
320 	THREAD_LOCK_ASSERT(td, MA_OWNED);
321 	KASSERT((td->td_inhibitors == 0),
322 			("maybe_preempt: trying to run inhibited thread"));
323 	pri = td->td_priority;
324 	cpri = ctd->td_priority;
325 	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
326 	    TD_IS_INHIBITED(ctd))
327 		return (0);
328 #ifndef FULL_PREEMPTION
329 	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
330 		return (0);
331 #endif
332 
333 	if (ctd->td_critnest > 1) {
334 		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
335 		    ctd->td_critnest);
336 		ctd->td_owepreempt = 1;
337 		return (0);
338 	}
339 	/*
340 	 * Thread is runnable but not yet put on system run queue.
341 	 */
342 	MPASS(ctd->td_lock == td->td_lock);
343 	MPASS(TD_ON_RUNQ(td));
344 	TD_SET_RUNNING(td);
345 	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
346 	    td->td_proc->p_pid, td->td_name);
347 	mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, td);
348 	/*
349 	 * td's lock pointer may have changed.  We have to return with it
350 	 * locked.
351 	 */
352 	spinlock_enter();
353 	thread_unlock(ctd);
354 	thread_lock(td);
355 	spinlock_exit();
356 	return (1);
357 #else
358 	return (0);
359 #endif
360 }
361 
362 /*
363  * Constants for digital decay and forget:
364  *	90% of (td_estcpu) usage in 5 * loadav time
365  *	95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
366  *          Note that, as ps(1) mentions, this can let percentages
367  *          total over 100% (I've seen 137.9% for 3 processes).
368  *
369  * Note that schedclock() updates td_estcpu and p_cpticks asynchronously.
370  *
371  * We wish to decay away 90% of td_estcpu in (5 * loadavg) seconds.
372  * That is, the system wants to compute a value of decay such
373  * that the following for loop:
374  * 	for (i = 0; i < (5 * loadavg); i++)
375  * 		td_estcpu *= decay;
376  * will compute
377  * 	td_estcpu *= 0.1;
378  * for all values of loadavg:
379  *
380  * Mathematically this loop can be expressed by saying:
381  * 	decay ** (5 * loadavg) ~= .1
382  *
383  * The system computes decay as:
384  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
385  *
386  * We wish to prove that the system's computation of decay
387  * will always fulfill the equation:
388  * 	decay ** (5 * loadavg) ~= .1
389  *
390  * If we compute b as:
391  * 	b = 2 * loadavg
392  * then
393  * 	decay = b / (b + 1)
394  *
395  * We now need to prove two things:
396  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
397  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
398  *
399  * Facts:
400  *         For x close to zero, exp(x) =~ 1 + x, since
401  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
402  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
403  *         For x close to zero, ln(1+x) =~ x, since
404  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
405  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
406  *         ln(.1) =~ -2.30
407  *
408  * Proof of (1):
409  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
410  *	solving for factor,
411  *      ln(factor) =~ (-2.30/5*loadav), or
412  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
413  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
414  *
415  * Proof of (2):
416  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
417  *	solving for power,
418  *      power*ln(b/(b+1)) =~ -2.30, or
419  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
420  *
421  * Actual power values for the implemented algorithm are as follows:
422  *      loadav: 1       2       3       4
423  *      power:  5.68    10.32   14.94   19.55
424  */
425 
426 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
427 #define	loadfactor(loadav)	(2 * (loadav))
428 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
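/*
 * Worked example: with a load average of 1.0 (ldavg[0] == FSCALE),
 * loadfactor() yields 2 * FSCALE and decay_cpu() scales estcpu by
 * 2 * FSCALE / (2 * FSCALE + FSCALE) = 2/3 per schedcpu() pass, which is
 * the decay = b / (b + 1) factor derived above with b = 2.
 */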
429 
430 /* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
431 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
432 SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
433 
434 /*
435  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
436  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
437  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
438  *
439  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
440  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
441  *
442  * If you don't want to bother with the faster/more-accurate formula, you
443  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
444  * (more general) method of calculating the %age of CPU used by a process.
445  */
446 #define	CCPU_SHIFT	11
447 
448 /*
449  * Recompute process priorities, every hz ticks.
450  * MP-safe, called without the Giant mutex.
451  */
452 /* ARGSUSED */
453 static void
454 schedcpu(void)
455 {
456 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
457 	struct thread *td;
458 	struct proc *p;
459 	struct td_sched *ts;
460 	int awake, realstathz;
461 
462 	realstathz = stathz ? stathz : hz;
463 	sx_slock(&allproc_lock);
464 	FOREACH_PROC_IN_SYSTEM(p) {
465 		PROC_LOCK(p);
466 		if (p->p_state == PRS_NEW) {
467 			PROC_UNLOCK(p);
468 			continue;
469 		}
470 		FOREACH_THREAD_IN_PROC(p, td) {
471 			awake = 0;
472 			thread_lock(td);
473 			ts = td->td_sched;
474 			/*
475 			 * Increment sleep time (if sleeping).  We
476 			 * ignore overflow, as above.
477 			 */
478 			/*
479 			 * The td_sched slptimes are not touched in wakeup
480 			 * because the thread may not HAVE everything in
481 			 * memory? XXX I think this is out of date.
482 			 */
483 			if (TD_ON_RUNQ(td)) {
484 				awake = 1;
485 				td->td_flags &= ~TDF_DIDRUN;
486 			} else if (TD_IS_RUNNING(td)) {
487 				awake = 1;
488 				/* Do not clear TDF_DIDRUN */
489 			} else if (td->td_flags & TDF_DIDRUN) {
490 				awake = 1;
491 				td->td_flags &= ~TDF_DIDRUN;
492 			}
493 
494 			/*
495 			 * ts_pctcpu is only for ps and ttyinfo().
496 			 */
497 			ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
498 			/*
499 			 * If the td_sched has been idle the entire second,
500 			 * stop recalculating its priority until
501 			 * it wakes up.
502 			 */
503 			if (ts->ts_cpticks != 0) {
504 #if	(FSHIFT >= CCPU_SHIFT)
505 				ts->ts_pctcpu += (realstathz == 100)
506 				    ? ((fixpt_t) ts->ts_cpticks) <<
507 				    (FSHIFT - CCPU_SHIFT) :
508 				    100 * (((fixpt_t) ts->ts_cpticks)
509 				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
510 #else
511 				ts->ts_pctcpu += ((FSCALE - ccpu) *
512 				    (ts->ts_cpticks *
513 				    FSCALE / realstathz)) >> FSHIFT;
514 #endif
515 				ts->ts_cpticks = 0;
516 			}
517 			/*
518 			 * If there are ANY running threads in this process,
519 			 * then don't count it as sleeping.
520 			 * XXX: this is broken.
521 			 */
522 			if (awake) {
523 				if (ts->ts_slptime > 1) {
524 					/*
525 					 * In an ideal world, this should not
526 					 * happen, because whoever woke us
527 					 * up from the long sleep should have
528 					 * unwound the slptime and reset our
529 					 * priority before we run at the stale
530 					 * priority.  Should KASSERT at some
531 					 * point when all the cases are fixed.
532 					 */
533 					updatepri(td);
534 				}
535 				ts->ts_slptime = 0;
536 			} else
537 				ts->ts_slptime++;
538 			if (ts->ts_slptime > 1) {
539 				thread_unlock(td);
540 				continue;
541 			}
542 			td->td_estcpu = decay_cpu(loadfac, td->td_estcpu);
543 			resetpriority(td);
544 			resetpriority_thread(td);
545 			thread_unlock(td);
546 		}
547 		PROC_UNLOCK(p);
548 	}
549 	sx_sunlock(&allproc_lock);
550 }
551 
552 /*
553  * Main loop for a kthread that executes schedcpu once a second.
554  */
555 static void
556 schedcpu_thread(void)
557 {
558 
559 	for (;;) {
560 		schedcpu();
561 		pause("-", hz);
562 	}
563 }
564 
565 /*
566  * Recalculate the priority of a process after it has slept for a while.
567  * For all load averages >= 1 and max td_estcpu of 255, sleeping for at
568  * least six times the loadfactor will decay td_estcpu to zero.
569  */
570 static void
571 updatepri(struct thread *td)
572 {
573 	struct td_sched *ts;
574 	fixpt_t loadfac;
575 	unsigned int newcpu;
576 
577 	ts = td->td_sched;
578 	loadfac = loadfactor(averunnable.ldavg[0]);
579 	if (ts->ts_slptime > 5 * loadfac)
580 		td->td_estcpu = 0;
581 	else {
582 		newcpu = td->td_estcpu;
583 		ts->ts_slptime--;	/* was incremented in schedcpu() */
584 		while (newcpu && --ts->ts_slptime)
585 			newcpu = decay_cpu(loadfac, newcpu);
586 		td->td_estcpu = newcpu;
587 	}
588 }
589 
590 /*
591  * Compute the priority of a process when running in user mode.
592  * Arrange to reschedule if the resulting priority is better
593  * than that of the current process.
594  */
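/*
 * For example (a sketch, not normative): on UP with nice 0 and
 * td_estcpu == 80, the formula below yields PUSER + 80 / 8 + 1 * (0 - (-20))
 * = PUSER + 30, clamped into the timeshare range; each further
 * INVERSE_ESTCPU_WEIGHT of accumulated estcpu costs one more priority level.
 */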
595 static void
596 resetpriority(struct thread *td)
597 {
598 	register unsigned int newpriority;
599 
600 	if (td->td_pri_class == PRI_TIMESHARE) {
601 		newpriority = PUSER + td->td_estcpu / INVERSE_ESTCPU_WEIGHT +
602 		    NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
603 		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
604 		    PRI_MAX_TIMESHARE);
605 		sched_user_prio(td, newpriority);
606 	}
607 }
608 
609 /*
610  * Update the thread's priority when the associated process's user
611  * priority changes.
612  */
613 static void
614 resetpriority_thread(struct thread *td)
615 {
616 
617 	/* Only change threads with a time sharing user priority. */
618 	if (td->td_priority < PRI_MIN_TIMESHARE ||
619 	    td->td_priority > PRI_MAX_TIMESHARE)
620 		return;
621 
622 	/* XXX the whole needresched thing is broken, but not silly. */
623 	maybe_resched(td);
624 
625 	sched_prio(td, td->td_user_pri);
626 }
627 
628 /* ARGSUSED */
629 static void
630 sched_setup(void *dummy)
631 {
632 	setup_runqs();
633 
634 	if (sched_quantum == 0)
635 		sched_quantum = SCHED_QUANTUM;
636 	hogticks = 2 * sched_quantum;
637 
638 	/* Account for thread0. */
639 	sched_load_add();
640 }
641 
642 /* External interfaces start here */
643 
644 /*
645  * Very early in the boot some setup of scheduler-specific
646  * parts of proc0 and of some scheduler resources needs to be done.
647  * Called from:
648  *  proc0_init()
649  */
650 void
651 schedinit(void)
652 {
653 	/*
654 	 * Set up the scheduler specific parts of proc0.
655 	 */
656 	proc0.p_sched = NULL; /* XXX */
657 	thread0.td_sched = &td_sched0;
658 	thread0.td_lock = &sched_lock;
659 	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
660 }
661 
662 int
663 sched_runnable(void)
664 {
665 #ifdef SMP
666 	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
667 #else
668 	return runq_check(&runq);
669 #endif
670 }
671 
672 int
673 sched_rr_interval(void)
674 {
675 	if (sched_quantum == 0)
676 		sched_quantum = SCHED_QUANTUM;
677 	return (sched_quantum);
678 }
679 
680 /*
681  * We adjust the priority of the current process.  The priority of
682  * a process gets worse as it accumulates CPU time.  The cpu usage
683  * estimator (td_estcpu) is increased here.  resetpriority() will
684  * compute a different priority each time td_estcpu increases by
685  * INVERSE_ESTCPU_WEIGHT
686  * (until MAXPRI is reached).  The cpu usage estimator ramps up
687  * quite quickly when the process is running (linearly), and decays
688  * away exponentially, at a rate which is proportionally slower when
689  * the system is busy.  The basic principle is that the system will
690  * 90% forget that the process used a lot of CPU time in 5 * loadav
691  * seconds.  This causes the system to favor processes which haven't
692  * run much recently, and to round-robin among other processes.
693  */
694 void
695 sched_clock(struct thread *td)
696 {
697 	struct pcpuidlestat *stat;
698 	struct td_sched *ts;
699 
700 	THREAD_LOCK_ASSERT(td, MA_OWNED);
701 	ts = td->td_sched;
702 
703 	ts->ts_cpticks++;
704 	td->td_estcpu = ESTCPULIM(td->td_estcpu + 1);
705 	if ((td->td_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
706 		resetpriority(td);
707 		resetpriority_thread(td);
708 	}
709 
710 	/*
711 	 * Force a context switch if the current thread has used up a full
712 	 * quantum (default quantum is 100ms).
713 	 */
714 	if (!TD_IS_IDLETHREAD(td) &&
715 	    ticks - PCPU_GET(switchticks) >= sched_quantum)
716 		td->td_flags |= TDF_NEEDRESCHED;
717 
718 	stat = DPCPU_PTR(idlestat);
719 	stat->oldidlecalls = stat->idlecalls;
720 	stat->idlecalls = 0;
721 }
722 
723 /*
724  * Charge child's scheduling CPU usage to parent.
725  */
726 void
727 sched_exit(struct proc *p, struct thread *td)
728 {
729 
730 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit",
731 	    "prio:td", td->td_priority);
732 
733 	PROC_LOCK_ASSERT(p, MA_OWNED);
734 	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
735 }
736 
737 void
738 sched_exit_thread(struct thread *td, struct thread *child)
739 {
740 
741 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit",
742 	    "prio:td", child->td_priority);
743 	thread_lock(td);
744 	td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu);
745 	thread_unlock(td);
746 	thread_lock(child);
747 	if ((child->td_flags & TDF_NOLOAD) == 0)
748 		sched_load_rem();
749 	thread_unlock(child);
750 }
751 
752 void
753 sched_fork(struct thread *td, struct thread *childtd)
754 {
755 	sched_fork_thread(td, childtd);
756 }
757 
758 void
759 sched_fork_thread(struct thread *td, struct thread *childtd)
760 {
761 	struct td_sched *ts;
762 
763 	childtd->td_estcpu = td->td_estcpu;
764 	childtd->td_lock = &sched_lock;
765 	childtd->td_cpuset = cpuset_ref(td->td_cpuset);
766 	childtd->td_priority = childtd->td_base_pri;
767 	ts = childtd->td_sched;
768 	bzero(ts, sizeof(*ts));
769 	ts->ts_flags |= (td->td_sched->ts_flags & TSF_AFFINITY);
770 }
771 
772 void
773 sched_nice(struct proc *p, int nice)
774 {
775 	struct thread *td;
776 
777 	PROC_LOCK_ASSERT(p, MA_OWNED);
778 	p->p_nice = nice;
779 	FOREACH_THREAD_IN_PROC(p, td) {
780 		thread_lock(td);
781 		resetpriority(td);
782 		resetpriority_thread(td);
783 		thread_unlock(td);
784 	}
785 }
786 
787 void
788 sched_class(struct thread *td, int class)
789 {
790 	THREAD_LOCK_ASSERT(td, MA_OWNED);
791 	td->td_pri_class = class;
792 }
793 
794 /*
795  * Adjust the priority of a thread.
796  */
797 static void
798 sched_priority(struct thread *td, u_char prio)
799 {
800 
801 
802 	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change",
803 	    "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED,
804 	    sched_tdname(curthread));
805 	if (td != curthread && prio > td->td_priority) {
806 		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
807 		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
808 		    prio, KTR_ATTR_LINKED, sched_tdname(td));
809 	}
810 	THREAD_LOCK_ASSERT(td, MA_OWNED);
811 	if (td->td_priority == prio)
812 		return;
813 	td->td_priority = prio;
814 	if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) {
815 		sched_rem(td);
816 		sched_add(td, SRQ_BORING);
817 	}
818 }
819 
820 /*
821  * Update a thread's priority when it is lent another thread's
822  * priority.
823  */
824 void
825 sched_lend_prio(struct thread *td, u_char prio)
826 {
827 
828 	td->td_flags |= TDF_BORROWING;
829 	sched_priority(td, prio);
830 }
831 
832 /*
833  * Restore a thread's priority when priority propagation is
834  * over.  The prio argument is the minimum priority the thread
835  * needs to have to satisfy other possible priority lending
836  * requests.  If the thread's regular priority is less
837  * important than prio, the thread will keep a priority boost
838  * of prio.
839  */
840 void
841 sched_unlend_prio(struct thread *td, u_char prio)
842 {
843 	u_char base_pri;
844 
845 	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
846 	    td->td_base_pri <= PRI_MAX_TIMESHARE)
847 		base_pri = td->td_user_pri;
848 	else
849 		base_pri = td->td_base_pri;
850 	if (prio >= base_pri) {
851 		td->td_flags &= ~TDF_BORROWING;
852 		sched_prio(td, base_pri);
853 	} else
854 		sched_lend_prio(td, prio);
855 }
856 
857 void
858 sched_prio(struct thread *td, u_char prio)
859 {
860 	u_char oldprio;
861 
862 	/* First, update the base priority. */
863 	td->td_base_pri = prio;
864 
865 	/*
866 	 * If the thread is borrowing another thread's priority, don't ever
867 	 * lower the priority.
868 	 */
869 	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
870 		return;
871 
872 	/* Change the real priority. */
873 	oldprio = td->td_priority;
874 	sched_priority(td, prio);
875 
876 	/*
877 	 * If the thread is on a turnstile, then let the turnstile update
878 	 * its state.
879 	 */
880 	if (TD_ON_LOCK(td) && oldprio != prio)
881 		turnstile_adjust(td, oldprio);
882 }
883 
884 void
885 sched_user_prio(struct thread *td, u_char prio)
886 {
887 
888 	THREAD_LOCK_ASSERT(td, MA_OWNED);
889 	td->td_base_user_pri = prio;
890 	if (td->td_lend_user_pri <= prio)
891 		return;
892 	td->td_user_pri = prio;
893 }
894 
895 void
896 sched_lend_user_prio(struct thread *td, u_char prio)
897 {
898 
899 	THREAD_LOCK_ASSERT(td, MA_OWNED);
900 	td->td_lend_user_pri = prio;
901 	td->td_user_pri = min(prio, td->td_base_user_pri);
902 	if (td->td_priority > td->td_user_pri)
903 		sched_prio(td, td->td_user_pri);
904 	else if (td->td_priority != td->td_user_pri)
905 		td->td_flags |= TDF_NEEDRESCHED;
906 }
907 
908 void
909 sched_sleep(struct thread *td, int pri)
910 {
911 
912 	THREAD_LOCK_ASSERT(td, MA_OWNED);
913 	td->td_slptick = ticks;
914 	td->td_sched->ts_slptime = 0;
915 	if (pri != 0 && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
916 		sched_prio(td, pri);
917 	if (TD_IS_SUSPENDED(td) || pri >= PSOCK)
918 		td->td_flags |= TDF_CANSWAP;
919 }
920 
921 void
922 sched_switch(struct thread *td, struct thread *newtd, int flags)
923 {
924 	struct mtx *tmtx;
925 	struct td_sched *ts;
926 	struct proc *p;
927 
928 	tmtx = NULL;
929 	ts = td->td_sched;
930 	p = td->td_proc;
931 
932 	THREAD_LOCK_ASSERT(td, MA_OWNED);
933 
934 	/*
935 	 * Switch to the sched lock to fix things up and pick
936 	 * a new thread.
937 	 * Block the td_lock in order to avoid breaking the critical path.
938 	 */
939 	if (td->td_lock != &sched_lock) {
940 		mtx_lock_spin(&sched_lock);
941 		tmtx = thread_lock_block(td);
942 	}
943 
944 	if ((td->td_flags & TDF_NOLOAD) == 0)
945 		sched_load_rem();
946 
947 	td->td_lastcpu = td->td_oncpu;
948 	if (!(flags & SW_PREEMPT))
949 		td->td_flags &= ~TDF_NEEDRESCHED;
950 	td->td_owepreempt = 0;
951 	td->td_oncpu = NOCPU;
952 
953 	/*
954 	 * At the last moment, if this thread is still marked RUNNING,
955 	 * then put it back on the run queue as it has not been suspended
956 	 * or stopped or anything else similar.  We never put the idle
957 	 * threads on the run queue, however.
958 	 */
959 	if (td->td_flags & TDF_IDLETD) {
960 		TD_SET_CAN_RUN(td);
961 #ifdef SMP
962 		idle_cpus_mask &= ~PCPU_GET(cpumask);
963 #endif
964 	} else {
965 		if (TD_IS_RUNNING(td)) {
966 			/* Put us back on the run queue. */
967 			sched_add(td, (flags & SW_PREEMPT) ?
968 			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
969 			    SRQ_OURSELF|SRQ_YIELDING);
970 		}
971 	}
972 	if (newtd) {
973 		/*
974 		 * The thread we are about to run needs to be counted
975 		 * as if it had been added to the run queue and selected.
976 		 * It came from:
977 		 * * A preemption
978 		 * * An upcall
979 		 * * A followon
980 		 */
981 		KASSERT((newtd->td_inhibitors == 0),
982 			("trying to run inhibited thread"));
983 		newtd->td_flags |= TDF_DIDRUN;
984 		TD_SET_RUNNING(newtd);
985 		if ((newtd->td_flags & TDF_NOLOAD) == 0)
986 			sched_load_add();
987 	} else {
988 		newtd = choosethread();
989 		MPASS(newtd->td_lock == &sched_lock);
990 	}
991 
992 	if (td != newtd) {
993 #ifdef	HWPMC_HOOKS
994 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
995 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
996 #endif
997                 /* I feel sleepy */
998 		lock_profile_release_lock(&sched_lock.lock_object);
999 #ifdef KDTRACE_HOOKS
1000 		/*
1001 		 * If DTrace has set the active vtime enum to anything
1002 		 * other than INACTIVE (0), then it should have set the
1003 		 * function to call.
1004 		 */
1005 		if (dtrace_vtime_active)
1006 			(*dtrace_vtime_switch_func)(newtd);
1007 #endif
1008 
1009 		cpu_switch(td, newtd, tmtx != NULL ? tmtx : td->td_lock);
1010 		lock_profile_obtain_lock_success(&sched_lock.lock_object,
1011 		    0, 0, __FILE__, __LINE__);
1012 		/*
1013 		 * Where am I?  What year is it?
1014 		 * We are in the same thread that went to sleep above,
1015 		 * but any amount of time may have passed. All our context
1016 		 * will still be available as will local variables.
1017 		 * PCPU values however may have changed as we may have
1018 		 * changed CPU so don't trust cached values of them.
1019 		 * New threads will go to fork_exit() instead of here
1020 		 * so if you change things here you may need to change
1021 		 * things there too.
1022 		 *
1023 		 * If the thread above was exiting it will never wake
1024 		 * up again here, so either it has saved everything it
1025 		 * needed to, or the thread_wait() or wait() will
1026 		 * need to reap it.
1027 		 */
1028 #ifdef	HWPMC_HOOKS
1029 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1030 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1031 #endif
1032 	}
1033 
1034 #ifdef SMP
1035 	if (td->td_flags & TDF_IDLETD)
1036 		idle_cpus_mask |= PCPU_GET(cpumask);
1037 #endif
1038 	sched_lock.mtx_lock = (uintptr_t)td;
1039 	td->td_oncpu = PCPU_GET(cpuid);
1040 	MPASS(td->td_lock == &sched_lock);
1041 }
1042 
1043 void
1044 sched_wakeup(struct thread *td)
1045 {
1046 	struct td_sched *ts;
1047 
1048 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1049 	ts = td->td_sched;
1050 	td->td_flags &= ~TDF_CANSWAP;
1051 	if (ts->ts_slptime > 1) {
1052 		updatepri(td);
1053 		resetpriority(td);
1054 	}
1055 	td->td_slptick = 0;
1056 	ts->ts_slptime = 0;
1057 	sched_add(td, SRQ_BORING);
1058 }
1059 
1060 #ifdef SMP
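/*
 * Try to get an idle CPU (optionally restricted to cpunum) to pick up newly
 * runnable work: depending on the knobs above, build a mask of idle CPUs
 * from idle_cpus_mask and/or a scan of the pcpu list, wake one or more of
 * them (or fall back to IPI_AST), and return 1 if anything was notified.
 */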
1061 static int
1062 forward_wakeup(int cpunum)
1063 {
1064 	struct pcpu *pc;
1065 	cpumask_t dontuse, id, map, map2, map3, me;
1066 
1067 	mtx_assert(&sched_lock, MA_OWNED);
1068 
1069 	CTR0(KTR_RUNQ, "forward_wakeup()");
1070 
1071 	if ((!forward_wakeup_enabled) ||
1072 	     (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
1073 		return (0);
1074 	if (!smp_started || cold || panicstr)
1075 		return (0);
1076 
1077 	forward_wakeups_requested++;
1078 
1079 	/*
1080 	 * Check the idle mask we received against what we calculated
1081 	 * before in the old version.
1082 	 */
1083 	me = PCPU_GET(cpumask);
1084 
1085 	/* Don't bother if we should be doing it ourselves. */
1086 	if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum)))
1087 		return (0);
1088 
1089 	dontuse = me | stopped_cpus | hlt_cpus_mask;
1090 	map3 = 0;
1091 	if (forward_wakeup_use_loop) {
1092 		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
1093 			id = pc->pc_cpumask;
1094 			if ((id & dontuse) == 0 &&
1095 			    pc->pc_curthread == pc->pc_idlethread) {
1096 				map3 |= id;
1097 			}
1098 		}
1099 	}
1100 
1101 	if (forward_wakeup_use_mask) {
1102 		map = 0;
1103 		map = idle_cpus_mask & ~dontuse;
1104 
1105 		/* If they are both on, compare and use loop if different. */
1106 		if (forward_wakeup_use_loop) {
1107 			if (map != map3) {
1108 				printf("map (%02X) != map3 (%02X)\n", map,
1109 				    map3);
1110 				map = map3;
1111 			}
1112 		}
1113 	} else {
1114 		map = map3;
1115 	}
1116 
1117 	/* If we only allow a specific CPU, then mask off all the others. */
1118 	if (cpunum != NOCPU) {
1119 		KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
1120 		map &= (1 << cpunum);
1121 	} else {
1122 		/* Try to choose an idle die. */
1123 		if (forward_wakeup_use_htt) {
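			/*
			 * Assumes HTT siblings have adjacent CPU ids: a bit
			 * survives in map2 only when both logical CPUs of a
			 * pair are idle, so fully idle cores are preferred.
			 */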
1124 			map2 =  (map & (map >> 1)) & 0x5555;
1125 			if (map2) {
1126 				map = map2;
1127 			}
1128 		}
1129 
1130 		/* Set only one bit. */
1131 		if (forward_wakeup_use_single) {
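			/* map & (~map + 1) == map & -map: keep only the lowest set bit. */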
1132 			map = map & ((~map) + 1);
1133 		}
1134 	}
1135 	if (map) {
1136 		forward_wakeups_delivered++;
1137 		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
1138 			id = pc->pc_cpumask;
1139 			if ((map & id) == 0)
1140 				continue;
1141 			if (cpu_idle_wakeup(pc->pc_cpuid))
1142 				map &= ~id;
1143 		}
1144 		if (map)
1145 			ipi_selected(map, IPI_AST);
1146 		return (1);
1147 	}
1148 	if (cpunum == NOCPU)
1149 		printf("forward_wakeup: Idle processor not found\n");
1150 	return (0);
1151 }
1152 
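/*
 * Poke CPU 'cpuid' on behalf of a thread of priority 'pri' that was just
 * queued for it: wake it if it is idle, send a preemption IPI when that is
 * compiled in and the priority warrants it, and otherwise mark its current
 * thread for rescheduling and deliver an AST.
 */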
1153 static void
1154 kick_other_cpu(int pri, int cpuid)
1155 {
1156 	struct pcpu *pcpu;
1157 	int cpri;
1158 
1159 	pcpu = pcpu_find(cpuid);
1160 	if (idle_cpus_mask & pcpu->pc_cpumask) {
1161 		forward_wakeups_delivered++;
1162 		if (!cpu_idle_wakeup(cpuid))
1163 			ipi_cpu(cpuid, IPI_AST);
1164 		return;
1165 	}
1166 
1167 	cpri = pcpu->pc_curthread->td_priority;
1168 	if (pri >= cpri)
1169 		return;
1170 
1171 #if defined(IPI_PREEMPTION) && defined(PREEMPTION)
1172 #if !defined(FULL_PREEMPTION)
1173 	if (pri <= PRI_MAX_ITHD)
1174 #endif /* ! FULL_PREEMPTION */
1175 	{
1176 		ipi_cpu(cpuid, IPI_PREEMPT);
1177 		return;
1178 	}
1179 #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
1180 
1181 	pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
1182 	ipi_cpu(cpuid, IPI_AST);
1183 	return;
1184 }
1185 #endif /* SMP */
1186 
1187 #ifdef SMP
1188 static int
1189 sched_pickcpu(struct thread *td)
1190 {
1191 	int best, cpu;
1192 
1193 	mtx_assert(&sched_lock, MA_OWNED);
1194 
1195 	if (THREAD_CAN_SCHED(td, td->td_lastcpu))
1196 		best = td->td_lastcpu;
1197 	else
1198 		best = NOCPU;
1199 	CPU_FOREACH(cpu) {
1200 		if (!THREAD_CAN_SCHED(td, cpu))
1201 			continue;
1202 
1203 		if (best == NOCPU)
1204 			best = cpu;
1205 		else if (runq_length[cpu] < runq_length[best])
1206 			best = cpu;
1207 	}
1208 	KASSERT(best != NOCPU, ("no valid CPUs"));
1209 
1210 	return (best);
1211 }
1212 #endif
1213 
1214 void
1215 sched_add(struct thread *td, int flags)
1216 #ifdef SMP
1217 {
1218 	struct td_sched *ts;
1219 	int forwarded = 0;
1220 	int cpu;
1221 	int single_cpu = 0;
1222 
1223 	ts = td->td_sched;
1224 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1225 	KASSERT((td->td_inhibitors == 0),
1226 	    ("sched_add: trying to run inhibited thread"));
1227 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
1228 	    ("sched_add: bad thread state"));
1229 	KASSERT(td->td_flags & TDF_INMEM,
1230 	    ("sched_add: thread swapped out"));
1231 
1232 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
1233 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
1234 	    sched_tdname(curthread));
1235 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
1236 	    KTR_ATTR_LINKED, sched_tdname(td));
1237 
1238 
1239 	/*
1240 	 * Now that the thread is moving to the run-queue, set the lock
1241 	 * to the scheduler's lock.
1242 	 */
1243 	if (td->td_lock != &sched_lock) {
1244 		mtx_lock_spin(&sched_lock);
1245 		thread_lock_set(td, &sched_lock);
1246 	}
1247 	TD_SET_RUNQ(td);
1248 
1249 	if (td->td_pinned != 0) {
1250 		cpu = td->td_lastcpu;
1251 		ts->ts_runq = &runq_pcpu[cpu];
1252 		single_cpu = 1;
1253 		CTR3(KTR_RUNQ,
1254 		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
1255 		    cpu);
1256 	} else if (td->td_flags & TDF_BOUND) {
1257 		/* Find CPU from bound runq. */
1258 		KASSERT(SKE_RUNQ_PCPU(ts),
1259 		    ("sched_add: bound td_sched not on cpu runq"));
1260 		cpu = ts->ts_runq - &runq_pcpu[0];
1261 		single_cpu = 1;
1262 		CTR3(KTR_RUNQ,
1263 		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
1264 		    cpu);
1265 	} else if (ts->ts_flags & TSF_AFFINITY) {
1266 		/* Find a valid CPU for our cpuset */
1267 		cpu = sched_pickcpu(td);
1268 		ts->ts_runq = &runq_pcpu[cpu];
1269 		single_cpu = 1;
1270 		CTR3(KTR_RUNQ,
1271 		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
1272 		    cpu);
1273 	} else {
1274 		CTR2(KTR_RUNQ,
1275 		    "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts,
1276 		    td);
1277 		cpu = NOCPU;
1278 		ts->ts_runq = &runq;
1279 	}
1280 
1281 	if (single_cpu && (cpu != PCPU_GET(cpuid))) {
1282 		kick_other_cpu(td->td_priority, cpu);
1283 	} else {
1284 		if (!single_cpu) {
1285 			cpumask_t me = PCPU_GET(cpumask);
1286 			cpumask_t idle = idle_cpus_mask & me;
1287 
1288 			if (!idle && ((flags & SRQ_INTR) == 0) &&
1289 			    (idle_cpus_mask & ~(hlt_cpus_mask | me)))
1290 				forwarded = forward_wakeup(cpu);
1291 		}
1292 
1293 		if (!forwarded) {
1294 			if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td))
1295 				return;
1296 			else
1297 				maybe_resched(td);
1298 		}
1299 	}
1300 
1301 	if ((td->td_flags & TDF_NOLOAD) == 0)
1302 		sched_load_add();
1303 	runq_add(ts->ts_runq, td, flags);
1304 	if (cpu != NOCPU)
1305 		runq_length[cpu]++;
1306 }
1307 #else /* SMP */
1308 {
1309 	struct td_sched *ts;
1310 
1311 	ts = td->td_sched;
1312 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1313 	KASSERT((td->td_inhibitors == 0),
1314 	    ("sched_add: trying to run inhibited thread"));
1315 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
1316 	    ("sched_add: bad thread state"));
1317 	KASSERT(td->td_flags & TDF_INMEM,
1318 	    ("sched_add: thread swapped out"));
1319 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
1320 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
1321 	    sched_tdname(curthread));
1322 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
1323 	    KTR_ATTR_LINKED, sched_tdname(td));
1324 
1325 	/*
1326 	 * Now that the thread is moving to the run-queue, set the lock
1327 	 * to the scheduler's lock.
1328 	 */
1329 	if (td->td_lock != &sched_lock) {
1330 		mtx_lock_spin(&sched_lock);
1331 		thread_lock_set(td, &sched_lock);
1332 	}
1333 	TD_SET_RUNQ(td);
1334 	CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
1335 	ts->ts_runq = &runq;
1336 
1337 	/*
1338 	 * If we are yielding (on the way out anyhow) or the thread
1339 	 * being saved is US, then don't try to be smart about preemption
1340 	 * or kicking off another CPU as it won't help and may hinder.
1341 	 * In the YIELDING case, we are about to run whoever is being
1342 	 * put in the queue anyhow, and in the OURSELF case, we are
1343 	 * putting ourselves on the run queue which also only happens
1344 	 * when we are about to yield.
1345 	 */
1346 	if ((flags & SRQ_YIELDING) == 0) {
1347 		if (maybe_preempt(td))
1348 			return;
1349 	}
1350 	if ((td->td_flags & TDF_NOLOAD) == 0)
1351 		sched_load_add();
1352 	runq_add(ts->ts_runq, td, flags);
1353 	maybe_resched(td);
1354 }
1355 #endif /* SMP */
1356 
1357 void
1358 sched_rem(struct thread *td)
1359 {
1360 	struct td_sched *ts;
1361 
1362 	ts = td->td_sched;
1363 	KASSERT(td->td_flags & TDF_INMEM,
1364 	    ("sched_rem: thread swapped out"));
1365 	KASSERT(TD_ON_RUNQ(td),
1366 	    ("sched_rem: thread not on run queue"));
1367 	mtx_assert(&sched_lock, MA_OWNED);
1368 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
1369 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
1370 	    sched_tdname(curthread));
1371 
1372 	if ((td->td_flags & TDF_NOLOAD) == 0)
1373 		sched_load_rem();
1374 #ifdef SMP
1375 	if (ts->ts_runq != &runq)
1376 		runq_length[ts->ts_runq - runq_pcpu]--;
1377 #endif
1378 	runq_remove(ts->ts_runq, td);
1379 	TD_SET_CAN_RUN(td);
1380 }
1381 
1382 /*
1383  * Select threads to run.  Note that running threads still consume a
1384  * slot.
1385  */
1386 struct thread *
1387 sched_choose(void)
1388 {
1389 	struct thread *td;
1390 	struct runq *rq;
1391 
1392 	mtx_assert(&sched_lock,  MA_OWNED);
1393 #ifdef SMP
1394 	struct thread *tdcpu;
1395 
1396 	rq = &runq;
1397 	td = runq_choose_fuzz(&runq, runq_fuzz);
1398 	tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
1399 
1400 	if (td == NULL ||
1401 	    (tdcpu != NULL &&
1402 	     tdcpu->td_priority < td->td_priority)) {
1403 		CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu,
1404 		     PCPU_GET(cpuid));
1405 		td = tdcpu;
1406 		rq = &runq_pcpu[PCPU_GET(cpuid)];
1407 	} else {
1408 		CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td);
1409 	}
1410 
1411 #else
1412 	rq = &runq;
1413 	td = runq_choose(&runq);
1414 #endif
1415 
1416 	if (td) {
1417 #ifdef SMP
1418 		if (td == tdcpu)
1419 			runq_length[PCPU_GET(cpuid)]--;
1420 #endif
1421 		runq_remove(rq, td);
1422 		td->td_flags |= TDF_DIDRUN;
1423 
1424 		KASSERT(td->td_flags & TDF_INMEM,
1425 		    ("sched_choose: thread swapped out"));
1426 		return (td);
1427 	}
1428 	return (PCPU_GET(idlethread));
1429 }
1430 
1431 void
1432 sched_preempt(struct thread *td)
1433 {
1434 	thread_lock(td);
1435 	if (td->td_critnest > 1)
1436 		td->td_owepreempt = 1;
1437 	else
1438 		mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL);
1439 	thread_unlock(td);
1440 }
1441 
1442 void
1443 sched_userret(struct thread *td)
1444 {
1445 	/*
1446 	 * XXX we cheat slightly on the locking here to avoid locking in
1447 	 * the usual case.  Setting td_priority here is essentially an
1448 	 * incomplete workaround for not setting it properly elsewhere.
1449 	 * Now that some interrupt handlers are threads, not setting it
1450 	 * properly elsewhere can clobber it in the window between setting
1451 	 * it here and returning to user mode, so don't waste time setting
1452 	 * it perfectly here.
1453 	 */
1454 	KASSERT((td->td_flags & TDF_BORROWING) == 0,
1455 	    ("thread with borrowed priority returning to userland"));
1456 	if (td->td_priority != td->td_user_pri) {
1457 		thread_lock(td);
1458 		td->td_priority = td->td_user_pri;
1459 		td->td_base_pri = td->td_user_pri;
1460 		thread_unlock(td);
1461 	}
1462 }
1463 
1464 void
1465 sched_bind(struct thread *td, int cpu)
1466 {
1467 	struct td_sched *ts;
1468 
1469 	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
1470 	KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
1471 
1472 	ts = td->td_sched;
1473 
1474 	td->td_flags |= TDF_BOUND;
1475 #ifdef SMP
1476 	ts->ts_runq = &runq_pcpu[cpu];
1477 	if (PCPU_GET(cpuid) == cpu)
1478 		return;
1479 
1480 	mi_switch(SW_VOL, NULL);
1481 #endif
1482 }
1483 
1484 void
1485 sched_unbind(struct thread* td)
1486 {
1487 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1488 	KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
1489 	td->td_flags &= ~TDF_BOUND;
1490 }
1491 
1492 int
1493 sched_is_bound(struct thread *td)
1494 {
1495 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1496 	return (td->td_flags & TDF_BOUND);
1497 }
1498 
1499 void
1500 sched_relinquish(struct thread *td)
1501 {
1502 	thread_lock(td);
1503 	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
1504 	thread_unlock(td);
1505 }
1506 
1507 int
1508 sched_load(void)
1509 {
1510 	return (sched_tdcnt);
1511 }
1512 
1513 int
1514 sched_sizeof_proc(void)
1515 {
1516 	return (sizeof(struct proc));
1517 }
1518 
1519 int
1520 sched_sizeof_thread(void)
1521 {
1522 	return (sizeof(struct thread) + sizeof(struct td_sched));
1523 }
1524 
1525 fixpt_t
1526 sched_pctcpu(struct thread *td)
1527 {
1528 	struct td_sched *ts;
1529 
1530 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1531 	ts = td->td_sched;
1532 	return (ts->ts_pctcpu);
1533 }
1534 
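/*
 * Nothing to do here: the 4BSD scheduler does its per-tick accounting in
 * sched_clock() and its once-a-second decay in schedcpu().
 */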
1535 void
1536 sched_tick(int cnt)
1537 {
1538 }
1539 
1540 /*
1541  * The actual idle process.
1542  */
1543 void
1544 sched_idletd(void *dummy)
1545 {
1546 	struct pcpuidlestat *stat;
1547 
1548 	stat = DPCPU_PTR(idlestat);
1549 	for (;;) {
1550 		mtx_assert(&Giant, MA_NOTOWNED);
1551 
1552 		while (sched_runnable() == 0) {
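			/*
			 * Report the CPU as "busy" to cpu_idle() once this
			 * loop has iterated more than 64 times over the
			 * current and previous sched_clock() periods, so the
			 * MD idle code can prefer a lower-latency idle method
			 * while wakeups are frequent (a heuristic, not a
			 * contract with the MD code).
			 */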
1553 			cpu_idle(stat->idlecalls + stat->oldidlecalls > 64);
1554 			stat->idlecalls++;
1555 		}
1556 
1557 		mtx_lock_spin(&sched_lock);
1558 		mi_switch(SW_VOL | SWT_IDLE, NULL);
1559 		mtx_unlock_spin(&sched_lock);
1560 	}
1561 }
1562 
1563 /*
1564  * A CPU is entering for the first time or a thread is exiting.
1565  */
1566 void
1567 sched_throw(struct thread *td)
1568 {
1569 	/*
1570 	 * Correct spinlock nesting.  The idle thread context that we are
1571 	 * borrowing was created so that it would start out with a single
1572 	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
1573 	 * explicitly acquired locks in this function, the nesting count
1574 	 * is now 2 rather than 1.  Since we are nested, calling
1575 	 * spinlock_exit() will simply adjust the counts without allowing
1576 	 * spin lock using code to interrupt us.
1577 	 */
1578 	if (td == NULL) {
1579 		mtx_lock_spin(&sched_lock);
1580 		spinlock_exit();
1581 	} else {
1582 		lock_profile_release_lock(&sched_lock.lock_object);
1583 		MPASS(td->td_lock == &sched_lock);
1584 	}
1585 	mtx_assert(&sched_lock, MA_OWNED);
1586 	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
1587 	PCPU_SET(switchtime, cpu_ticks());
1588 	PCPU_SET(switchticks, ticks);
1589 	cpu_throw(td, choosethread());	/* doesn't return */
1590 }
1591 
1592 void
1593 sched_fork_exit(struct thread *td)
1594 {
1595 
1596 	/*
1597 	 * Finish setting up thread glue so that it begins execution in a
1598 	 * non-nested critical section with sched_lock held but not recursed.
1599 	 */
1600 	td->td_oncpu = PCPU_GET(cpuid);
1601 	sched_lock.mtx_lock = (uintptr_t)td;
1602 	lock_profile_obtain_lock_success(&sched_lock.lock_object,
1603 	    0, 0, __FILE__, __LINE__);
1604 	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
1605 }
1606 
1607 char *
1608 sched_tdname(struct thread *td)
1609 {
1610 #ifdef KTR
1611 	struct td_sched *ts;
1612 
1613 	ts = td->td_sched;
1614 	if (ts->ts_name[0] == '\0')
1615 		snprintf(ts->ts_name, sizeof(ts->ts_name),
1616 		    "%s tid %d", td->td_name, td->td_tid);
1617 	return (ts->ts_name);
1618 #else
1619 	return (td->td_name);
1620 #endif
1621 }
1622 
1623 void
1624 sched_affinity(struct thread *td)
1625 {
1626 #ifdef SMP
1627 	struct td_sched *ts;
1628 	int cpu;
1629 
1630 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1631 
1632 	/*
1633 	 * Set the TSF_AFFINITY flag if there is at least one CPU this
1634 	 * thread can't run on.
1635 	 */
1636 	ts = td->td_sched;
1637 	ts->ts_flags &= ~TSF_AFFINITY;
1638 	CPU_FOREACH(cpu) {
1639 		if (!THREAD_CAN_SCHED(td, cpu)) {
1640 			ts->ts_flags |= TSF_AFFINITY;
1641 			break;
1642 		}
1643 	}
1644 
1645 	/*
1646 	 * If this thread can run on all CPUs, nothing else to do.
1647 	 */
1648 	if (!(ts->ts_flags & TSF_AFFINITY))
1649 		return;
1650 
1651 	/* Pinned threads and bound threads should be left alone. */
1652 	if (td->td_pinned != 0 || td->td_flags & TDF_BOUND)
1653 		return;
1654 
1655 	switch (td->td_state) {
1656 	case TDS_RUNQ:
1657 		/*
1658 		 * If we are on a per-CPU runqueue that is in the set,
1659 		 * then nothing needs to be done.
1660 		 */
1661 		if (ts->ts_runq != &runq &&
1662 		    THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu))
1663 			return;
1664 
1665 		/* Put this thread on a valid per-CPU runqueue. */
1666 		sched_rem(td);
1667 		sched_add(td, SRQ_BORING);
1668 		break;
1669 	case TDS_RUNNING:
1670 		/*
1671 		 * See if our current CPU is in the set.  If not, force a
1672 		 * context switch.
1673 		 */
1674 		if (THREAD_CAN_SCHED(td, td->td_oncpu))
1675 			return;
1676 
1677 		td->td_flags |= TDF_NEEDRESCHED;
1678 		if (td != curthread)
1679 			ipi_cpu(td->td_oncpu, IPI_AST);
1680 		break;
1681 	default:
1682 		break;
1683 	}
1684 #endif
1685 }
1686