xref: /freebsd/sys/kern/sched_ule.c (revision 9a93305a2ee11e1b10eed54a8bec207b58097265)
1 /*-
2  * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice unmodified, this list of conditions, and the following
10  *    disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include "opt_hwpmc_hooks.h"
31 #include "opt_sched.h"
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kdb.h>
36 #include <sys/kernel.h>
37 #include <sys/ktr.h>
38 #include <sys/lock.h>
39 #include <sys/mutex.h>
40 #include <sys/proc.h>
41 #include <sys/resource.h>
42 #include <sys/resourcevar.h>
43 #include <sys/sched.h>
44 #include <sys/smp.h>
45 #include <sys/sx.h>
46 #include <sys/sysctl.h>
47 #include <sys/sysproto.h>
48 #include <sys/turnstile.h>
49 #include <sys/umtx.h>
50 #include <sys/vmmeter.h>
51 #ifdef KTRACE
52 #include <sys/uio.h>
53 #include <sys/ktrace.h>
54 #endif
55 
56 #ifdef HWPMC_HOOKS
57 #include <sys/pmckern.h>
58 #endif
59 
60 #include <machine/cpu.h>
61 #include <machine/smp.h>
62 
63 #ifndef PREEMPTION
64 #error	"SCHED_ULE requires options PREEMPTION"
65 #endif
66 
67 /*
68  * TODO:
69  *	Pick idle from affinity group or self group first.
70  *	Implement pick_score.
71  */
72 
73 /*
74  * Thread scheduler specific section.
75  */
76 struct td_sched {
77 	TAILQ_ENTRY(td_sched) ts_procq;	/* (j/z) Run queue. */
78 	int		ts_flags;	/* (j) TSF_* flags. */
79 	struct thread	*ts_thread;	/* (*) Active associated thread. */
80 	u_char		ts_rqindex;	/* (j) Run queue index. */
81 	int		ts_slptime;
82 	int		ts_slice;
83 	struct runq	*ts_runq;
84 	u_char		ts_cpu;		/* CPU that we have affinity for. */
85 	/* The following variables are only used for pctcpu calculation */
86 	int		ts_ltick;	/* Last tick that we were running on */
87 	int		ts_ftick;	/* First tick that we were running on */
88 	int		ts_ticks;	/* Tick count */
89 #ifdef SMP
90 	int		ts_rltick;	/* Real last tick, for affinity. */
91 #endif
92 
93 	/* originally from kg_sched */
94 	u_int	skg_slptime;		/* Number of ticks we vol. slept */
95 	u_int	skg_runtime;		/* Number of ticks we were running */
96 };
97 /* flags kept in ts_flags */
98 #define	TSF_BOUND	0x0001		/* Thread can not migrate. */
99 #define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */
100 #define	TSF_DIDRUN	0x2000		/* Thread actually ran. */
101 
102 static struct td_sched td_sched0;
103 
104 /*
105  * Cpu percentage computation macros and defines.
106  *
107  * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
108  * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
109  * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
110  * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
111  * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
112  * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
113  */
114 #define	SCHED_TICK_SECS		10
115 #define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
116 #define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
117 #define	SCHED_TICK_SHIFT	10
118 #define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
119 #define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))
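
/*
 * Illustrative example (assuming hz = 1000): sched_tick() adds
 * 1 << SCHED_TICK_SHIFT to ts_ticks for each hz tick a thread runs, so a
 * thread that ran for 2500 of the last 10000 ticks has
 * SCHED_TICK_HZ(ts) == 2500 and SCHED_TICK_TOTAL(ts) of roughly 10000,
 * i.e. 25% cpu over the ten second window.  Once the window
 * (ts_ltick - ts_ftick) grows past SCHED_TICK_MAX, sched_pctcpu_update()
 * rescales the counters back toward SCHED_TICK_TARG.
 */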
120 
121 /*
122  * These macros determine priorities for non-interactive threads.  They are
123  * assigned a priority based on their recent cpu utilization as expressed
124  * by the ratio of ticks to the tick total.  NHALF priorities at the start
125  * and end of the MIN to MAX timeshare range are only reachable with negative
126  * or positive nice respectively.
127  *
128  * PRI_RANGE:	Priority range for utilization dependent priorities.
129  * PRI_NRESV:	Number of nice values.
130  * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
131  * PRI_NICE:	Determines the part of the priority inherited from nice.
132  */
133 #define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
134 #define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
135 #define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
136 #define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
137 #define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
138 #define	SCHED_PRI_TICKS(ts)						\
139     (SCHED_TICK_HZ((ts)) /						\
140     (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
141 #define	SCHED_PRI_NICE(nice)	(nice)
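
/*
 * Worked example: with the stock PRIO_MIN/PRIO_MAX of -20/20,
 * SCHED_PRI_NRESV is 40 and SCHED_PRI_NHALF is 20.  SCHED_PRI_TICKS() is
 * roughly (SCHED_TICK_HZ / SCHED_TICK_TOTAL) * SCHED_PRI_RANGE, so a thread
 * using about 25% of the cpu lands a quarter of the way above SCHED_PRI_MIN
 * before its nice value (-20..20) is added, which is how nice reaches the
 * NHALF slots reserved at each end of the timeshare range.
 */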
142 
143 /*
144  * These determine the interactivity of a process.  Interactivity differs from
145  * cpu utilization in that it compares voluntary sleep time with run time,
146  * whereas cpu utilization counts all time not running.  This more accurately
147  * models the intent of the thread.
148  *
149  * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
150  *		before throttling back.
151  * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
152  * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
153  * INTERACT_THRESH:	Threshold for placement on the current runq.
154  */
155 #define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
156 #define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
157 #define	SCHED_INTERACT_MAX	(100)
158 #define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
159 #define	SCHED_INTERACT_THRESH	(30)
160 
161 /*
162  * tickincr:		Converts a stathz tick into a hz domain scaled by
163  *			the shift factor.  Without the shift the error rate
164  *			due to rounding would be unacceptably high.
165  * realstathz:		stathz is sometimes 0; in that case we run off of hz.
166  * sched_slice:		Runtime of each thread before rescheduling.
167  */
168 static int sched_interact = SCHED_INTERACT_THRESH;
169 static int realstathz;
170 static int tickincr;
171 static int sched_slice;
172 
173 /*
174  * tdq - per processor runqs and statistics.
175  */
176 struct tdq {
177 	struct runq	tdq_idle;		/* Queue of IDLE threads. */
178 	struct runq	tdq_timeshare;		/* timeshare run queue. */
179 	struct runq	tdq_realtime;		/* real-time run queue. */
180 	int		tdq_idx;		/* Current insert index. */
181 	int		tdq_ridx;		/* Current removal index. */
182 	int		tdq_load;		/* Aggregate load. */
183 	int		tdq_flags;		/* Thread queue flags */
184 #ifdef SMP
185 	int		tdq_transferable;
186 	LIST_ENTRY(tdq)	tdq_siblings;		/* Next in tdq group. */
187 	struct tdq_group *tdq_group;		/* Our processor group. */
188 #else
189 	int		tdq_sysload;		/* For loadavg, !ITHD load. */
190 #endif
191 };
192 
193 #define	TDQF_BUSY	0x0001			/* Queue is marked as busy */
194 
195 #ifdef SMP
196 /*
197  * tdq groups are groups of processors which can cheaply share threads.  When
198  * one processor in the group goes idle it will check the runqs of the other
199  * processors in its group prior to halting and waiting for an interrupt.
200  * These groups are suitable for SMT (Simultaneous Multi-Threading) and not NUMA.
201  * In a NUMA environment we'd want an idle bitmap per group and a two-tiered
202  * load balancer.
203  */
204 struct tdq_group {
205 	int	tdg_cpus;		/* Count of CPUs in this tdq group. */
206 	cpumask_t tdg_cpumask;		/* Mask of cpus in this group. */
207 	cpumask_t tdg_idlemask;		/* Idle cpus in this group. */
208 	cpumask_t tdg_mask;		/* Bit mask for first cpu. */
209 	int	tdg_load;		/* Total load of this group. */
210 	int	tdg_transferable;	/* Transferable load of this group. */
211 	LIST_HEAD(, tdq) tdg_members;	/* Linked list of all members. */
212 };
213 
214 #define	SCHED_AFFINITY_DEFAULT	(hz / 100)
215 #define	SCHED_AFFINITY(ts)	((ts)->ts_rltick > ticks - affinity)
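
/*
 * Example (assuming hz = 1000): affinity defaults to hz / 100 = 10 ticks,
 * so SCHED_AFFINITY() considers a thread to still have cache affinity for
 * ts_cpu if it last ran there within roughly the last 10ms; tdq_pickpri()
 * uses this to prefer the previous cpu when choosing where to run.
 */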
216 
217 /*
218  * Run-time tunables.
219  */
220 static int rebalance = 0;
221 static int pick_pri = 1;
222 static int affinity;
223 static int tryself = 1;
224 static int tryselfidle = 1;
225 static int ipi_ast = 0;
226 static int ipi_preempt = 1;
227 static int ipi_thresh = PRI_MIN_KERN;
228 static int steal_htt = 1;
229 static int steal_busy = 1;
230 static int busy_thresh = 4;
231 
232 /*
233  * One thread queue per processor.
234  */
235 static volatile cpumask_t tdq_idle;
236 static volatile cpumask_t tdq_busy;
237 static int tdg_maxid;
238 static struct tdq	tdq_cpu[MAXCPU];
239 static struct tdq_group tdq_groups[MAXCPU];
240 static int bal_tick;
241 static int gbal_tick;
242 static int balance_groups;
243 
244 #define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
245 #define	TDQ_CPU(x)	(&tdq_cpu[(x)])
246 #define	TDQ_ID(x)	((x) - tdq_cpu)
247 #define	TDQ_GROUP(x)	(&tdq_groups[(x)])
248 #else	/* !SMP */
249 static struct tdq	tdq_cpu;
250 
251 #define	TDQ_SELF()	(&tdq_cpu)
252 #define	TDQ_CPU(x)	(&tdq_cpu)
253 #endif
254 
255 static void sched_priority(struct thread *);
256 static void sched_thread_priority(struct thread *, u_char);
257 static int sched_interact_score(struct thread *);
258 static void sched_interact_update(struct thread *);
259 static void sched_interact_fork(struct thread *);
260 static void sched_pctcpu_update(struct td_sched *);
261 static inline void sched_pin_td(struct thread *td);
262 static inline void sched_unpin_td(struct thread *td);
263 
264 /* Operations on per processor queues */
265 static struct td_sched * tdq_choose(struct tdq *);
266 static void tdq_setup(struct tdq *);
267 static void tdq_load_add(struct tdq *, struct td_sched *);
268 static void tdq_load_rem(struct tdq *, struct td_sched *);
269 static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
270 static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
271 void tdq_print(int cpu);
272 static void runq_print(struct runq *rq);
273 #ifdef SMP
274 static int tdq_pickidle(struct tdq *, struct td_sched *);
275 static int tdq_pickpri(struct tdq *, struct td_sched *, int);
276 static struct td_sched *runq_steal(struct runq *);
277 static void sched_balance(void);
278 static void sched_balance_groups(void);
279 static void sched_balance_group(struct tdq_group *);
280 static void sched_balance_pair(struct tdq *, struct tdq *);
281 static void sched_smp_tick(struct thread *);
282 static void tdq_move(struct tdq *, int);
283 static int tdq_idled(struct tdq *);
284 static void tdq_notify(struct td_sched *);
285 static struct td_sched *tdq_steal(struct tdq *, int);
286 
287 #define	THREAD_CAN_MIGRATE(td)	 ((td)->td_pinned == 0)
288 #endif
289 
290 static void sched_setup(void *dummy);
291 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
292 
293 static void sched_initticks(void *dummy);
294 SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
295 
296 static inline void
297 sched_pin_td(struct thread *td)
298 {
299 	td->td_pinned++;
300 }
301 
302 static inline void
303 sched_unpin_td(struct thread *td)
304 {
305 	td->td_pinned--;
306 }
307 
308 static void
309 runq_print(struct runq *rq)
310 {
311 	struct rqhead *rqh;
312 	struct td_sched *ts;
313 	int pri;
314 	int j;
315 	int i;
316 
317 	for (i = 0; i < RQB_LEN; i++) {
318 		printf("\t\trunq bits %d 0x%zx\n",
319 		    i, rq->rq_status.rqb_bits[i]);
320 		for (j = 0; j < RQB_BPW; j++)
321 			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
322 				pri = j + (i << RQB_L2BPW);
323 				rqh = &rq->rq_queues[pri];
324 				TAILQ_FOREACH(ts, rqh, ts_procq) {
325 					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
326 					    ts->ts_thread, ts->ts_thread->td_proc->p_comm, ts->ts_thread->td_priority, ts->ts_rqindex, pri);
327 				}
328 			}
329 	}
330 }
331 
332 void
333 tdq_print(int cpu)
334 {
335 	struct tdq *tdq;
336 
337 	tdq = TDQ_CPU(cpu);
338 
339 	printf("tdq:\n");
340 	printf("\tload:           %d\n", tdq->tdq_load);
341 	printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
342 	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
343 	printf("\trealtime runq:\n");
344 	runq_print(&tdq->tdq_realtime);
345 	printf("\ttimeshare runq:\n");
346 	runq_print(&tdq->tdq_timeshare);
347 	printf("\tidle runq:\n");
348 	runq_print(&tdq->tdq_idle);
349 #ifdef SMP
350 	printf("\tload transferable: %d\n", tdq->tdq_transferable);
351 #endif
352 }
353 
354 static __inline void
355 tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
356 {
357 #ifdef SMP
358 	if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
359 		tdq->tdq_transferable++;
360 		tdq->tdq_group->tdg_transferable++;
361 		ts->ts_flags |= TSF_XFERABLE;
362 		if (tdq->tdq_transferable >= busy_thresh &&
363 		    (tdq->tdq_flags & TDQF_BUSY) == 0) {
364 			tdq->tdq_flags |= TDQF_BUSY;
365 			atomic_set_int(&tdq_busy, 1 << TDQ_ID(tdq));
366 		}
367 	}
368 #endif
369 	if (ts->ts_runq == &tdq->tdq_timeshare) {
370 		int pri;
371 
372 		pri = ts->ts_thread->td_priority;
373 		KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
374 			("Invalid priority %d on timeshare runq", pri));
375 		/*
376 		 * This queue contains only priorities between MIN and MAX
377 		 * timeshare.  Use the whole queue to represent these values.
378 		 */
379 #define	TS_RQ_PPQ	(((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
380 		if ((flags & SRQ_BORROWING) == 0) {
381 			pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
382 			pri = (pri + tdq->tdq_idx) % RQ_NQS;
383 			/*
384 			 * This effectively shortens the queue by one so we
385 			 * can have a one slot difference between idx and
386 			 * ridx while we wait for threads to drain.
387 			 */
388 			if (tdq->tdq_ridx != tdq->tdq_idx &&
389 			    pri == tdq->tdq_ridx)
390 				pri = (pri - 1) % RQ_NQS;
391 		} else
392 			pri = tdq->tdq_ridx;
393 		runq_add_pri(ts->ts_runq, ts, pri, flags);
394 	} else
395 		runq_add(ts->ts_runq, ts, flags);
396 }
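
/*
 * Illustrative sketch of the rotation above (assuming RQ_NQS is 64 and the
 * timeshare range spans 64 priorities, so TS_RQ_PPQ is 1): each timeshare
 * priority maps to its own bucket, offset from tdq_idx.  The best timeshare
 * priority is inserted right at the current insert index while the worst is
 * placed nearly a full rotation ahead of tdq_ridx, so as sched_clock()
 * advances the indices, better priorities drain first but every thread is
 * still reached within one rotation.
 */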
397 
398 static __inline void
399 tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
400 {
401 #ifdef SMP
402 	if (ts->ts_flags & TSF_XFERABLE) {
403 		tdq->tdq_transferable--;
404 		tdq->tdq_group->tdg_transferable--;
405 		ts->ts_flags &= ~TSF_XFERABLE;
406 		if (tdq->tdq_transferable < busy_thresh &&
407 		    (tdq->tdq_flags & TDQF_BUSY)) {
408 			atomic_clear_int(&tdq_busy, 1 << TDQ_ID(tdq));
409 			tdq->tdq_flags &= ~TDQF_BUSY;
410 		}
411 	}
412 #endif
413 	if (ts->ts_runq == &tdq->tdq_timeshare) {
414 		if (tdq->tdq_idx != tdq->tdq_ridx)
415 			runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
416 		else
417 			runq_remove_idx(ts->ts_runq, ts, NULL);
418 		/*
419 		 * For timeshare threads we update the priority here so
420 		 * the priority reflects the time we've been sleeping.
421 		 */
422 		ts->ts_ltick = ticks;
423 		sched_pctcpu_update(ts);
424 		sched_priority(ts->ts_thread);
425 	} else
426 		runq_remove(ts->ts_runq, ts);
427 }
428 
429 static void
430 tdq_load_add(struct tdq *tdq, struct td_sched *ts)
431 {
432 	int class;
433 	mtx_assert(&sched_lock, MA_OWNED);
434 	class = PRI_BASE(ts->ts_thread->td_pri_class);
435 	tdq->tdq_load++;
436 	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
437 	if (class != PRI_ITHD &&
438 	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
439 #ifdef SMP
440 		tdq->tdq_group->tdg_load++;
441 #else
442 		tdq->tdq_sysload++;
443 #endif
444 }
445 
446 static void
447 tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
448 {
449 	int class;
450 	mtx_assert(&sched_lock, MA_OWNED);
451 	class = PRI_BASE(ts->ts_thread->td_pri_class);
452 	if (class != PRI_ITHD &&
453 	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
454 #ifdef SMP
455 		tdq->tdq_group->tdg_load--;
456 #else
457 		tdq->tdq_sysload--;
458 #endif
459 	tdq->tdq_load--;
460 	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
461 	ts->ts_runq = NULL;
462 }
463 
464 #ifdef SMP
465 static void
466 sched_smp_tick(struct thread *td)
467 {
468 	struct tdq *tdq;
469 
470 	tdq = TDQ_SELF();
471 	if (rebalance) {
472 		if (ticks >= bal_tick)
473 			sched_balance();
474 		if (ticks >= gbal_tick && balance_groups)
475 			sched_balance_groups();
476 	}
477 	td->td_sched->ts_rltick = ticks;
478 }
479 
480 /*
481  * sched_balance is a simple CPU load balancing algorithm.  It operates by
482  * finding the least loaded and most loaded cpu and equalizing their load
483  * by migrating some processes.
484  *
485  * Dealing only with two CPUs at a time has two advantages.  Firstly, most
486  * installations will only have 2 cpus.  Secondly, load balancing too much at
487  * once can have an unpleasant effect on the system.  The scheduler rarely has
488  * enough information to make perfect decisions.  So this algorithm favors
489  * simplicity and more gradual effects on load in larger systems.
490  *
491  * It could be improved by considering the priorities and slices assigned to
492  * each task prior to balancing them.  There are many pathological cases with
493  * any approach and so the semi random algorithm below may work as well as any.
494  *
495  */
496 static void
497 sched_balance(void)
498 {
499 	struct tdq_group *high;
500 	struct tdq_group *low;
501 	struct tdq_group *tdg;
502 	int cnt;
503 	int i;
504 
505 	bal_tick = ticks + (random() % (hz * 2));
506 	if (smp_started == 0)
507 		return;
508 	low = high = NULL;
509 	i = random() % (tdg_maxid + 1);
510 	for (cnt = 0; cnt <= tdg_maxid; cnt++) {
511 		tdg = TDQ_GROUP(i);
512 		/*
513 		 * Find the CPU with the highest load that has some
514 		 * threads to transfer.
515 		 */
516 		if ((high == NULL || tdg->tdg_load > high->tdg_load)
517 		    && tdg->tdg_transferable)
518 			high = tdg;
519 		if (low == NULL || tdg->tdg_load < low->tdg_load)
520 			low = tdg;
521 		if (++i > tdg_maxid)
522 			i = 0;
523 	}
524 	if (low != NULL && high != NULL && high != low)
525 		sched_balance_pair(LIST_FIRST(&high->tdg_members),
526 		    LIST_FIRST(&low->tdg_members));
527 }
528 
529 static void
530 sched_balance_groups(void)
531 {
532 	int i;
533 
534 	gbal_tick = ticks + (random() % (hz * 2));
535 	mtx_assert(&sched_lock, MA_OWNED);
536 	if (smp_started)
537 		for (i = 0; i <= tdg_maxid; i++)
538 			sched_balance_group(TDQ_GROUP(i));
539 }
540 
541 static void
542 sched_balance_group(struct tdq_group *tdg)
543 {
544 	struct tdq *tdq;
545 	struct tdq *high;
546 	struct tdq *low;
547 	int load;
548 
549 	if (tdg->tdg_transferable == 0)
550 		return;
551 	low = NULL;
552 	high = NULL;
553 	LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
554 		load = tdq->tdq_load;
555 		if (high == NULL || load > high->tdq_load)
556 			high = tdq;
557 		if (low == NULL || load < low->tdq_load)
558 			low = tdq;
559 	}
560 	if (high != NULL && low != NULL && high != low)
561 		sched_balance_pair(high, low);
562 }
563 
564 static void
565 sched_balance_pair(struct tdq *high, struct tdq *low)
566 {
567 	int transferable;
568 	int high_load;
569 	int low_load;
570 	int move;
571 	int diff;
572 	int i;
573 
574 	/*
575 	 * If we're transferring within a group we have to use this specific
576 	 * tdq's transferable count, otherwise we can steal from other members
577 	 * of the group.
578 	 */
579 	if (high->tdq_group == low->tdq_group) {
580 		transferable = high->tdq_transferable;
581 		high_load = high->tdq_load;
582 		low_load = low->tdq_load;
583 	} else {
584 		transferable = high->tdq_group->tdg_transferable;
585 		high_load = high->tdq_group->tdg_load;
586 		low_load = low->tdq_group->tdg_load;
587 	}
588 	if (transferable == 0)
589 		return;
590 	/*
591 	 * Determine what the imbalance is and then adjust that to how many
592 	 * threads we actually have to give up (transferable).
593 	 */
594 	diff = high_load - low_load;
595 	move = diff / 2;
596 	if (diff & 0x1)
597 		move++;
598 	move = min(move, transferable);
599 	for (i = 0; i < move; i++)
600 		tdq_move(high, TDQ_ID(low));
601 	return;
602 }
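
/*
 * Example: if the busy side reports a load of 5 and the idle side a load of
 * 2, diff is 3 and move rounds up to 2, so up to two transferable threads
 * are pulled toward the low-load cpu by tdq_move().
 */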
603 
604 static void
605 tdq_move(struct tdq *from, int cpu)
606 {
607 	struct tdq *tdq;
608 	struct tdq *to;
609 	struct td_sched *ts;
610 
611 	tdq = from;
612 	to = TDQ_CPU(cpu);
613 	ts = tdq_steal(tdq, 1);
614 	if (ts == NULL) {
615 		struct tdq_group *tdg;
616 
617 		tdg = tdq->tdq_group;
618 		LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
619 			if (tdq == from || tdq->tdq_transferable == 0)
620 				continue;
621 			ts = tdq_steal(tdq, 1);
622 			break;
623 		}
624 		if (ts == NULL)
625 			panic("tdq_move: No threads available with a "
626 			    "transferable count of %d\n",
627 			    tdg->tdg_transferable);
628 	}
629 	if (tdq == to)
630 		return;
631 	sched_rem(ts->ts_thread);
632 	ts->ts_cpu = cpu;
633 	sched_pin_td(ts->ts_thread);
634 	sched_add(ts->ts_thread, SRQ_YIELDING);
635 	sched_unpin_td(ts->ts_thread);
636 }
637 
638 static int
639 tdq_idled(struct tdq *tdq)
640 {
641 	struct tdq_group *tdg;
642 	struct tdq *steal;
643 	struct td_sched *ts;
644 
645 	tdg = tdq->tdq_group;
646 	/*
647 	 * If we're in a cpu group, try to steal threads from another cpu in
648 	 * the group before idling.
649 	 */
650 	if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
651 		LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
652 			if (steal == tdq || steal->tdq_transferable == 0)
653 				continue;
654 			ts = tdq_steal(steal, 0);
655 			if (ts)
656 				goto steal;
657 		}
658 	}
659 	if (steal_busy) {
660 		while (tdq_busy) {
661 			int cpu;
662 
663 			cpu = ffs(tdq_busy);
664 			if (cpu == 0)
665 				break;
666 			cpu--;
667 			steal = TDQ_CPU(cpu);
668 			if (steal->tdq_transferable == 0)
669 				continue;
670 			ts = tdq_steal(steal, 1);
671 			if (ts == NULL)
672 				continue;
673 			CTR5(KTR_SCHED,
674 			    "tdq_idled: stealing td %p(%s) pri %d from %d busy 0x%X",
675 			    ts->ts_thread, ts->ts_thread->td_proc->p_comm,
676 			    ts->ts_thread->td_priority, cpu, tdq_busy);
677 			goto steal;
678 		}
679 	}
680 	/*
681 	 * We only set the idled bit when all of the cpus in the group are
682 	 * idle.  Otherwise we could get into a situation where a thread bounces
683 	 * back and forth between two idle cores on separate physical CPUs.
684 	 */
685 	tdg->tdg_idlemask |= PCPU_GET(cpumask);
686 	if (tdg->tdg_idlemask == tdg->tdg_cpumask)
687 		atomic_set_int(&tdq_idle, tdg->tdg_mask);
688 	return (1);
689 steal:
690 	sched_rem(ts->ts_thread);
691 	ts->ts_cpu = PCPU_GET(cpuid);
692 	sched_pin_td(ts->ts_thread);
693 	sched_add(ts->ts_thread, SRQ_YIELDING);
694 	sched_unpin_td(ts->ts_thread);
695 
696 	return (0);
697 }
698 
699 static void
700 tdq_notify(struct td_sched *ts)
701 {
702 	struct thread *td;
703 	struct pcpu *pcpu;
704 	int prio;
705 	int cpu;
706 
707 	prio = ts->ts_thread->td_priority;
708 	cpu = ts->ts_cpu;
709 	pcpu = pcpu_find(cpu);
710 	td = pcpu->pc_curthread;
711 
712 	/*
713 	 * If our priority is not better than the current priority there is
714 	 * nothing to do.
715 	 */
716 	if (prio > td->td_priority)
717 		return;
718 	/* Always set NEEDRESCHED. */
719 	td->td_flags |= TDF_NEEDRESCHED;
720 	/*
721 	 * IPI if we exceed the threshold or if the target cpu is running an
722 	 * idle thread.
723 	 */
724 	if (prio > ipi_thresh && td->td_priority < PRI_MIN_IDLE)
725 		return;
726 	if (ipi_ast)
727 		ipi_selected(1 << cpu, IPI_AST);
728 	else if (ipi_preempt)
729 		ipi_selected(1 << cpu, IPI_PREEMPT);
730 }
731 
732 static struct td_sched *
733 runq_steal(struct runq *rq)
734 {
735 	struct rqhead *rqh;
736 	struct rqbits *rqb;
737 	struct td_sched *ts;
738 	int word;
739 	int bit;
740 
741 	mtx_assert(&sched_lock, MA_OWNED);
742 	rqb = &rq->rq_status;
743 	for (word = 0; word < RQB_LEN; word++) {
744 		if (rqb->rqb_bits[word] == 0)
745 			continue;
746 		for (bit = 0; bit < RQB_BPW; bit++) {
747 			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
748 				continue;
749 			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
750 			TAILQ_FOREACH(ts, rqh, ts_procq) {
751 				if (THREAD_CAN_MIGRATE(ts->ts_thread))
752 					return (ts);
753 			}
754 		}
755 	}
756 	return (NULL);
757 }
758 
759 static struct td_sched *
760 tdq_steal(struct tdq *tdq, int stealidle)
761 {
762 	struct td_sched *ts;
763 
764 	/*
765 	 * Steal from next first to try to get a non-interactive task that
766 	 * may not have run for a while.
767 	 * XXX Need to effect steal order for timeshare threads.
768 	 */
769 	if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL)
770 		return (ts);
771 	if ((ts = runq_steal(&tdq->tdq_timeshare)) != NULL)
772 		return (ts);
773 	if (stealidle)
774 		return (runq_steal(&tdq->tdq_idle));
775 	return (NULL);
776 }
777 
778 int
779 tdq_pickidle(struct tdq *tdq, struct td_sched *ts)
780 {
781 	struct tdq_group *tdg;
782 	int self;
783 	int cpu;
784 
785 	self = PCPU_GET(cpuid);
786 	if (smp_started == 0)
787 		return (self);
788 	/*
789 	 * If the current CPU has idled, just run it here.
790 	 */
791 	if ((tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0)
792 		return (self);
793 	/*
794 	 * Try the last group we ran on.
795 	 */
796 	tdg = TDQ_CPU(ts->ts_cpu)->tdq_group;
797 	cpu = ffs(tdg->tdg_idlemask);
798 	if (cpu)
799 		return (cpu - 1);
800 	/*
801 	 * Search for an idle group.
802 	 */
803 	cpu = ffs(tdq_idle);
804 	if (cpu)
805 		return (cpu - 1);
806 	/*
807 	 * XXX If there are no idle groups, check for an idle core.
808 	 */
809 	/*
810 	 * No idle CPUs?
811 	 */
812 	return (self);
813 }
814 
815 static int
816 tdq_pickpri(struct tdq *tdq, struct td_sched *ts, int flags)
817 {
818 	struct pcpu *pcpu;
819 	int lowpri;
820 	int lowcpu;
821 	int lowload;
822 	int load;
823 	int self;
824 	int pri;
825 	int cpu;
826 
827 	self = PCPU_GET(cpuid);
828 	if (smp_started == 0)
829 		return (self);
830 
831 	pri = ts->ts_thread->td_priority;
832 	/*
833 	 * Regardless of affinity, if the last cpu is idle send it there.
834 	 */
835 	pcpu = pcpu_find(ts->ts_cpu);
836 	if (pcpu->pc_curthread->td_priority > PRI_MIN_IDLE) {
837 		CTR5(KTR_SCHED,
838 		    "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d",
839 		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
840 		    pcpu->pc_curthread->td_priority);
841 		return (ts->ts_cpu);
842 	}
843 	/*
844 	 * If we have affinity, try to place it on the cpu we last ran on.
845 	 */
846 	if (SCHED_AFFINITY(ts) && pcpu->pc_curthread->td_priority > pri) {
847 		CTR5(KTR_SCHED,
848 		    "affinity for %d, ltick %d ticks %d pri %d curthread %d",
849 		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
850 		    pcpu->pc_curthread->td_priority);
851 		return (ts->ts_cpu);
852 	}
853 	/*
854 	 * Try ourselves first; if we're running something of lower priority,
855 	 * the new thread may have some locality with the waking thread and
856 	 * execute faster here.
857 	 */
858 	if (tryself) {
859 		/*
860 		 * If we're being awoken by an interrupt thread or the waker
861 		 * is going right to sleep, run here as well.
862 		 */
863 		if ((TDQ_SELF()->tdq_load == 1) && (flags & SRQ_YIELDING ||
864 		    curthread->td_pri_class == PRI_ITHD)) {
865 			CTR2(KTR_SCHED, "tryself load %d flags %d",
866 			    TDQ_SELF()->tdq_load, flags);
867 			return (self);
868 		}
869 	}
870 	/*
871 	 * Look for an idle group.
872 	 */
873 	CTR1(KTR_SCHED, "tdq_idle %X", tdq_idle);
874 	cpu = ffs(tdq_idle);
875 	if (cpu)
876 		return (cpu - 1);
877 	if (tryselfidle && pri < curthread->td_priority) {
878 		CTR1(KTR_SCHED, "tryself %d",
879 		    curthread->td_priority);
880 		return (self);
881 	}
882 	/*
883 	 * Now search for the cpu running the lowest priority thread with
884 	 * the least load.
885 	 */
886 	lowload = 0;
887 	lowpri = lowcpu = 0;
888 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
889 		if (CPU_ABSENT(cpu))
890 			continue;
891 		pcpu = pcpu_find(cpu);
892 		pri = pcpu->pc_curthread->td_priority;
893 		CTR4(KTR_SCHED,
894 		    "cpu %d pri %d lowcpu %d lowpri %d",
895 		    cpu, pri, lowcpu, lowpri);
896 		if (pri < lowpri)
897 			continue;
898 		load = TDQ_CPU(cpu)->tdq_load;
899 		if (lowpri && lowpri == pri && load > lowload)
900 			continue;
901 		lowpri = pri;
902 		lowcpu = cpu;
903 		lowload = load;
904 	}
905 
906 	return (lowcpu);
907 }
908 
909 #endif	/* SMP */
910 
911 /*
912  * Pick the highest priority task we have and return it.
913  */
914 
915 static struct td_sched *
916 tdq_choose(struct tdq *tdq)
917 {
918 	struct td_sched *ts;
919 
920 	mtx_assert(&sched_lock, MA_OWNED);
921 
922 	ts = runq_choose(&tdq->tdq_realtime);
923 	if (ts != NULL) {
924 		KASSERT(ts->ts_thread->td_priority <= PRI_MAX_REALTIME,
925 		    ("tdq_choose: Invalid priority on realtime queue %d",
926 		    ts->ts_thread->td_priority));
927 		return (ts);
928 	}
929 	ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
930 	if (ts != NULL) {
931 		KASSERT(ts->ts_thread->td_priority <= PRI_MAX_TIMESHARE &&
932 		    ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
933 		    ("tdq_choose: Invalid priority on timeshare queue %d",
934 		    ts->ts_thread->td_priority));
935 		return (ts);
936 	}
937 
938 	ts = runq_choose(&tdq->tdq_idle);
939 	if (ts != NULL) {
940 		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
941 		    ("tdq_choose: Invalid priority on idle queue %d",
942 		    ts->ts_thread->td_priority));
943 		return (ts);
944 	}
945 
946 	return (NULL);
947 }
948 
949 static void
950 tdq_setup(struct tdq *tdq)
951 {
952 	runq_init(&tdq->tdq_realtime);
953 	runq_init(&tdq->tdq_timeshare);
954 	runq_init(&tdq->tdq_idle);
955 	tdq->tdq_load = 0;
956 }
957 
958 static void
959 sched_setup(void *dummy)
960 {
961 #ifdef SMP
962 	int i;
963 #endif
964 
965 	/*
966 	 * To avoid divide-by-zero, we set realstathz to a dummy value in
967 	 * case sched_clock() is called before sched_initticks().
968 	 */
969 	realstathz = hz;
970 	sched_slice = (realstathz/7);	/* 140ms */
971 	tickincr = 1 << SCHED_TICK_SHIFT;
972 
973 #ifdef SMP
974 	balance_groups = 0;
975 	/*
976 	 * Initialize the tdqs.
977 	 */
978 	for (i = 0; i < MAXCPU; i++) {
979 		struct tdq *tdq;
980 
981 		tdq = &tdq_cpu[i];
982 		tdq_setup(&tdq_cpu[i]);
983 	}
984 	if (smp_topology == NULL) {
985 		struct tdq_group *tdg;
986 		struct tdq *tdq;
987 		int cpus;
988 
989 		for (cpus = 0, i = 0; i < MAXCPU; i++) {
990 			if (CPU_ABSENT(i))
991 				continue;
992 			tdq = &tdq_cpu[i];
993 			tdg = &tdq_groups[cpus];
994 			/*
995 			 * Set up a tdq group with one member.
996 			 */
997 			tdq->tdq_transferable = 0;
998 			tdq->tdq_group = tdg;
999 			tdg->tdg_cpus = 1;
1000 			tdg->tdg_idlemask = 0;
1001 			tdg->tdg_cpumask = tdg->tdg_mask = 1 << i;
1002 			tdg->tdg_load = 0;
1003 			tdg->tdg_transferable = 0;
1004 			LIST_INIT(&tdg->tdg_members);
1005 			LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings);
1006 			cpus++;
1007 		}
1008 		tdg_maxid = cpus - 1;
1009 	} else {
1010 		struct tdq_group *tdg;
1011 		struct cpu_group *cg;
1012 		int j;
1013 
1014 		for (i = 0; i < smp_topology->ct_count; i++) {
1015 			cg = &smp_topology->ct_group[i];
1016 			tdg = &tdq_groups[i];
1017 			/*
1018 			 * Initialize the group.
1019 			 */
1020 			tdg->tdg_idlemask = 0;
1021 			tdg->tdg_load = 0;
1022 			tdg->tdg_transferable = 0;
1023 			tdg->tdg_cpus = cg->cg_count;
1024 			tdg->tdg_cpumask = cg->cg_mask;
1025 			LIST_INIT(&tdg->tdg_members);
1026 			/*
1027 			 * Find all of the group members and add them.
1028 			 */
1029 			for (j = 0; j < MAXCPU; j++) {
1030 				if ((cg->cg_mask & (1 << j)) != 0) {
1031 					if (tdg->tdg_mask == 0)
1032 						tdg->tdg_mask = 1 << j;
1033 					tdq_cpu[j].tdq_transferable = 0;
1034 					tdq_cpu[j].tdq_group = tdg;
1035 					LIST_INSERT_HEAD(&tdg->tdg_members,
1036 					    &tdq_cpu[j], tdq_siblings);
1037 				}
1038 			}
1039 			if (tdg->tdg_cpus > 1)
1040 				balance_groups = 1;
1041 		}
1042 		tdg_maxid = smp_topology->ct_count - 1;
1043 	}
1044 	/*
1045 	 * Stagger the group and global load balancer so they do not
1046 	 * interfere with each other.
1047 	 */
1048 	bal_tick = ticks + hz;
1049 	if (balance_groups)
1050 		gbal_tick = ticks + (hz / 2);
1051 #else
1052 	tdq_setup(TDQ_SELF());
1053 #endif
1054 	mtx_lock_spin(&sched_lock);
1055 	tdq_load_add(TDQ_SELF(), &td_sched0);
1056 	mtx_unlock_spin(&sched_lock);
1057 }
1058 
1059 /* ARGSUSED */
1060 static void
1061 sched_initticks(void *dummy)
1062 {
1063 	mtx_lock_spin(&sched_lock);
1064 	realstathz = stathz ? stathz : hz;
1065 	sched_slice = (realstathz/7);	/* ~140ms */
1066 
1067 	/*
1068 	 * tickincr is shifted out by 10 to avoid rounding errors due to
1069 	 * hz not being evenly divisible by stathz on all platforms.
1070 	 */
1071 	tickincr = (hz << SCHED_TICK_SHIFT) / realstathz;
1072 	/*
1073 	 * This does not work for values of stathz that are greater than
1074 	 * hz << SCHED_TICK_SHIFT.  In practice this does not happen.
1075 	 */
1076 	if (tickincr == 0)
1077 		tickincr = 1;
1078 #ifdef SMP
1079 	affinity = SCHED_AFFINITY_DEFAULT;
1080 #endif
1081 	mtx_unlock_spin(&sched_lock);
1082 }
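
/*
 * Example (assuming hz = 1000 and stathz = 128): realstathz becomes 128,
 * sched_slice becomes 128 / 7 = 18 stathz ticks (~140ms), and tickincr
 * becomes (1000 << 10) / 128 = 8000, so each stathz tick charged by
 * sched_clock() adds 8000 to the shifted history, i.e. about 7.8 hz ticks
 * worth of run time.
 */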
1083 
1084 
1085 /*
1086  * Scale the scheduling priority according to the "interactivity" of this
1087  * process.
1088  */
1089 static void
1090 sched_priority(struct thread *td)
1091 {
1092 	int score;
1093 	int pri;
1094 
1095 	if (td->td_pri_class != PRI_TIMESHARE)
1096 		return;
1097 	/*
1098 	 * If the score is interactive we place the thread in the realtime
1099 	 * queue with a priority that is less than kernel and interrupt
1100 	 * priorities.  These threads are not subject to nice restrictions.
1101 	 *
1102 	 * Scores greater than this are placed on the normal timeshare queue
1103 	 * where the priority is partially decided by the most recent cpu
1104 	 * utilization and the rest is decided by nice value.
1105 	 */
1106 	score = sched_interact_score(td);
1107 	if (score < sched_interact) {
1108 		pri = PRI_MIN_REALTIME;
1109 		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
1110 		    * score;
1111 		KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
1112 		    ("sched_priority: invalid interactive priority %d score %d",
1113 		    pri, score));
1114 	} else {
1115 		pri = SCHED_PRI_MIN;
1116 		if (td->td_sched->ts_ticks)
1117 			pri += SCHED_PRI_TICKS(td->td_sched);
1118 		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
1119 		if (!(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE)) {
1120 			static int once = 1;
1121 			if (once) {
1122 				printf("sched_priority: invalid priority %d: ",
1123 				    pri);
1124 				printf("nice %d, ticks %d ftick %d ltick %d tick pri %d\n",
1125 				    td->td_proc->p_nice,
1126 				    td->td_sched->ts_ticks,
1127 				    td->td_sched->ts_ftick,
1128 				    td->td_sched->ts_ltick,
1129 				    SCHED_PRI_TICKS(td->td_sched));
1130 				once = 0;
1131 			}
1132 			pri = min(max(pri, PRI_MIN_TIMESHARE),
1133 			    PRI_MAX_TIMESHARE);
1134 		}
1135 	}
1136 	sched_user_prio(td, pri);
1137 
1138 	return;
1139 }
1140 
1141 /*
1142  * This routine enforces a maximum limit on the amount of scheduling history
1143  * kept.  It is called after either the slptime or runtime is adjusted.
1144  */
1145 static void
1146 sched_interact_update(struct thread *td)
1147 {
1148 	struct td_sched *ts;
1149 	u_int sum;
1150 
1151 	ts = td->td_sched;
1152 	sum = ts->skg_runtime + ts->skg_slptime;
1153 	if (sum < SCHED_SLP_RUN_MAX)
1154 		return;
1155 	/*
1156 	 * This only happens from two places:
1157 	 * 1) We have added an unusual amount of run time from fork_exit.
1158 	 * 2) We have added an unusual amount of sleep time from sched_sleep().
1159 	 */
1160 	if (sum > SCHED_SLP_RUN_MAX * 2) {
1161 		if (ts->skg_runtime > ts->skg_slptime) {
1162 			ts->skg_runtime = SCHED_SLP_RUN_MAX;
1163 			ts->skg_slptime = 1;
1164 		} else {
1165 			ts->skg_slptime = SCHED_SLP_RUN_MAX;
1166 			ts->skg_runtime = 1;
1167 		}
1168 		return;
1169 	}
1170 	/*
1171 	 * If we have exceeded by more than 1/5th then the algorithm below
1172 	 * will not bring us back into range.  Dividing by two here forces
1173 	 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX].
1174 	 */
1175 	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1176 		ts->skg_runtime /= 2;
1177 		ts->skg_slptime /= 2;
1178 		return;
1179 	}
1180 	ts->skg_runtime = (ts->skg_runtime / 5) * 4;
1181 	ts->skg_slptime = (ts->skg_slptime / 5) * 4;
1182 }
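
/*
 * Example (assuming hz = 1000): SCHED_SLP_RUN_MAX is 5000 << SCHED_TICK_SHIFT,
 * about five seconds of history.  A sum just over the limit is scaled by 4/5,
 * a sum above 6/5 of the limit is halved, and a sum above twice the limit is
 * reset so that whichever of runtime or slptime dominated keeps nearly all
 * of the weight.
 */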
1183 
1184 static void
1185 sched_interact_fork(struct thread *td)
1186 {
1187 	int ratio;
1188 	int sum;
1189 
1190 	sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime;
1191 	if (sum > SCHED_SLP_RUN_FORK) {
1192 		ratio = sum / SCHED_SLP_RUN_FORK;
1193 		td->td_sched->skg_runtime /= ratio;
1194 		td->td_sched->skg_slptime /= ratio;
1195 	}
1196 }
1197 
1198 static int
1199 sched_interact_score(struct thread *td)
1200 {
1201 	int div;
1202 
1203 	if (td->td_sched->skg_runtime > td->td_sched->skg_slptime) {
1204 		div = max(1, td->td_sched->skg_runtime / SCHED_INTERACT_HALF);
1205 		return (SCHED_INTERACT_HALF +
1206 		    (SCHED_INTERACT_HALF - (td->td_sched->skg_slptime / div)));
1207 	} else if (td->td_sched->skg_slptime > td->td_sched->skg_runtime) {
1208 		div = max(1, td->td_sched->skg_slptime / SCHED_INTERACT_HALF);
1209 		return (td->td_sched->skg_runtime / div);
1210 	}
1211 
1212 	/*
1213 	 * This can happen if slptime and runtime are 0.
1214 	 */
1215 	return (0);
1216 
1217 }
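
/*
 * Worked example: a thread that sleeps four seconds for every second it runs
 * has skg_slptime = 4 * skg_runtime, so div = skg_slptime / SCHED_INTERACT_HALF
 * and the score is roughly 50 / 4 = 12, comfortably below
 * SCHED_INTERACT_THRESH (30) and therefore interactive.  Invert the ratio
 * (four seconds of runtime per second of sleep) and the score becomes about
 * 50 + (50 - 50 / 4) = 87, well into the non-interactive range.
 */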
1218 
1219 /*
1220  * Called from proc0_init() to bootstrap the scheduler.
1221  */
1222 void
1223 schedinit(void)
1224 {
1225 
1226 	/*
1227 	 * Set up the scheduler specific parts of proc0.
1228 	 */
1229 	proc0.p_sched = NULL; /* XXX */
1230 	thread0.td_sched = &td_sched0;
1231 	td_sched0.ts_ltick = ticks;
1232 	td_sched0.ts_ftick = ticks;
1233 	td_sched0.ts_thread = &thread0;
1234 }
1235 
1236 /*
1237  * This is only somewhat accurate: given many processes of the same
1238  * priority, they will switch when their slices run out, which will be
1239  * at most sched_slice stathz ticks.
1240  */
1241 int
1242 sched_rr_interval(void)
1243 {
1244 
1245 	/* Convert sched_slice to hz */
1246 	return (hz/(realstathz/sched_slice));
1247 }
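
/*
 * Example (assuming hz = 1000 and realstathz = 128): sched_slice is 18, so
 * realstathz / sched_slice is 7 and the reported round-robin interval is
 * 1000 / 7 = 142 hz ticks, i.e. roughly the ~140ms slice.
 */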
1248 
1249 static void
1250 sched_pctcpu_update(struct td_sched *ts)
1251 {
1252 
1253 	if (ts->ts_ticks == 0)
1254 		return;
1255 	if (ticks - (hz / 10) < ts->ts_ltick &&
1256 	    SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
1257 		return;
1258 	/*
1259 	 * Adjust counters and watermark for pctcpu calc.
1260 	 */
1261 	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
1262 		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
1263 			    SCHED_TICK_TARG;
1264 	else
1265 		ts->ts_ticks = 0;
1266 	ts->ts_ltick = ticks;
1267 	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
1268 }
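
/*
 * Example (assuming hz = 1000): a thread that accumulated 3000 ticks of run
 * time (ts_ticks = 3000 << 10) over a 12000 tick window is rescaled to
 * (3000 << 10) / 12000 * SCHED_TICK_TARG = 2500 << 10 over a 10000 tick
 * window, preserving the 25% utilization while keeping the history bounded.
 */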
1269 
1270 static void
1271 sched_thread_priority(struct thread *td, u_char prio)
1272 {
1273 	struct td_sched *ts;
1274 
1275 	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
1276 	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
1277 	    curthread->td_proc->p_comm);
1278 	ts = td->td_sched;
1279 	mtx_assert(&sched_lock, MA_OWNED);
1280 	if (td->td_priority == prio)
1281 		return;
1282 
1283 	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
1284 		/*
1285 		 * If the priority has been elevated due to priority
1286 		 * propagation, we may have to move ourselves to a new
1287 		 * queue.  This could be optimized to not re-add in some
1288 		 * cases.
1289 		 */
1290 		sched_rem(td);
1291 		td->td_priority = prio;
1292 		sched_add(td, SRQ_BORROWING);
1293 	} else
1294 		td->td_priority = prio;
1295 }
1296 
1297 /*
1298  * Update a thread's priority when it is lent another thread's
1299  * priority.
1300  */
1301 void
1302 sched_lend_prio(struct thread *td, u_char prio)
1303 {
1304 
1305 	td->td_flags |= TDF_BORROWING;
1306 	sched_thread_priority(td, prio);
1307 }
1308 
1309 /*
1310  * Restore a thread's priority when priority propagation is
1311  * over.  The prio argument is the minimum priority the thread
1312  * needs to have to satisfy other possible priority lending
1313  * requests.  If the thread's regular priority is less
1314  * important than prio, the thread will keep a priority boost
1315  * of prio.
1316  */
1317 void
1318 sched_unlend_prio(struct thread *td, u_char prio)
1319 {
1320 	u_char base_pri;
1321 
1322 	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1323 	    td->td_base_pri <= PRI_MAX_TIMESHARE)
1324 		base_pri = td->td_user_pri;
1325 	else
1326 		base_pri = td->td_base_pri;
1327 	if (prio >= base_pri) {
1328 		td->td_flags &= ~TDF_BORROWING;
1329 		sched_thread_priority(td, base_pri);
1330 	} else
1331 		sched_lend_prio(td, prio);
1332 }
1333 
1334 void
1335 sched_prio(struct thread *td, u_char prio)
1336 {
1337 	u_char oldprio;
1338 
1339 	/* First, update the base priority. */
1340 	td->td_base_pri = prio;
1341 
1342 	/*
1343 	 * If the thread is borrowing another thread's priority, don't
1344 	 * ever lower the priority.
1345 	 */
1346 	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1347 		return;
1348 
1349 	/* Change the real priority. */
1350 	oldprio = td->td_priority;
1351 	sched_thread_priority(td, prio);
1352 
1353 	/*
1354 	 * If the thread is on a turnstile, then let the turnstile update
1355 	 * its state.
1356 	 */
1357 	if (TD_ON_LOCK(td) && oldprio != prio)
1358 		turnstile_adjust(td, oldprio);
1359 }
1360 
1361 void
1362 sched_user_prio(struct thread *td, u_char prio)
1363 {
1364 	u_char oldprio;
1365 
1366 	td->td_base_user_pri = prio;
1367 	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
1368 		return;
1369 	oldprio = td->td_user_pri;
1370 	td->td_user_pri = prio;
1371 
1372 	if (TD_ON_UPILOCK(td) && oldprio != prio)
1373 		umtx_pi_adjust(td, oldprio);
1374 }
1375 
1376 void
1377 sched_lend_user_prio(struct thread *td, u_char prio)
1378 {
1379 	u_char oldprio;
1380 
1381 	td->td_flags |= TDF_UBORROWING;
1382 
1383 	oldprio = td->td_user_pri;
1384 	td->td_user_pri = prio;
1385 
1386 	if (TD_ON_UPILOCK(td) && oldprio != prio)
1387 		umtx_pi_adjust(td, oldprio);
1388 }
1389 
1390 void
1391 sched_unlend_user_prio(struct thread *td, u_char prio)
1392 {
1393 	u_char base_pri;
1394 
1395 	base_pri = td->td_base_user_pri;
1396 	if (prio >= base_pri) {
1397 		td->td_flags &= ~TDF_UBORROWING;
1398 		sched_user_prio(td, base_pri);
1399 	} else
1400 		sched_lend_user_prio(td, prio);
1401 }
1402 
1403 void
1404 sched_switch(struct thread *td, struct thread *newtd, int flags)
1405 {
1406 	struct tdq *tdq;
1407 	struct td_sched *ts;
1408 	int preempt;
1409 
1410 	mtx_assert(&sched_lock, MA_OWNED);
1411 
1412 	preempt = flags & SW_PREEMPT;
1413 	tdq = TDQ_SELF();
1414 	ts = td->td_sched;
1415 	td->td_lastcpu = td->td_oncpu;
1416 	td->td_oncpu = NOCPU;
1417 	td->td_flags &= ~TDF_NEEDRESCHED;
1418 	td->td_owepreempt = 0;
1419 	/*
1420 	 * If the thread has been assigned it may be in the process of switching
1421 	 * to the new cpu.  This is the case in sched_bind().
1422 	 */
1423 	if (td == PCPU_GET(idlethread)) {
1424 		TD_SET_CAN_RUN(td);
1425 	} else {
1426 		tdq_load_rem(tdq, ts);
1427 		if (TD_IS_RUNNING(td)) {
1428 			/*
1429 			 * Don't allow the thread to migrate
1430 			 * from a preemption.
1431 			 */
1432 			if (preempt)
1433 				sched_pin_td(td);
1434 			sched_add(td, preempt ?
1435 			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1436 			    SRQ_OURSELF|SRQ_YIELDING);
1437 			if (preempt)
1438 				sched_unpin_td(td);
1439 		}
1440 	}
1441 	if (newtd != NULL) {
1442 		/*
1443 		 * If we bring in a thread, account for it as if it had been
1444 		 * added to the run queue and then chosen.
1445 		 */
1446 		newtd->td_sched->ts_flags |= TSF_DIDRUN;
1447 		TD_SET_RUNNING(newtd);
1448 		tdq_load_add(TDQ_SELF(), newtd->td_sched);
1449 	} else
1450 		newtd = choosethread();
1451 	if (td != newtd) {
1452 #ifdef	HWPMC_HOOKS
1453 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1454 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1455 #endif
1456 
1457 		cpu_switch(td, newtd);
1458 #ifdef	HWPMC_HOOKS
1459 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1460 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1461 #endif
1462 	}
1463 	sched_lock.mtx_lock = (uintptr_t)td;
1464 	td->td_oncpu = PCPU_GET(cpuid);
1465 }
1466 
1467 void
1468 sched_nice(struct proc *p, int nice)
1469 {
1470 	struct thread *td;
1471 
1472 	PROC_LOCK_ASSERT(p, MA_OWNED);
1473 	mtx_assert(&sched_lock, MA_OWNED);
1474 
1475 	p->p_nice = nice;
1476 	FOREACH_THREAD_IN_PROC(p, td) {
1477 		sched_priority(td);
1478 		sched_prio(td, td->td_base_user_pri);
1479 	}
1480 }
1481 
1482 void
1483 sched_sleep(struct thread *td)
1484 {
1485 
1486 	mtx_assert(&sched_lock, MA_OWNED);
1487 
1488 	td->td_sched->ts_slptime = ticks;
1489 }
1490 
1491 void
1492 sched_wakeup(struct thread *td)
1493 {
1494 	int slptime;
1495 
1496 	mtx_assert(&sched_lock, MA_OWNED);
1497 
1498 	/*
1499 	 * If we slept for more than a tick, update our interactivity and
1500 	 * priority.
1501 	 */
1502 	slptime = td->td_sched->ts_slptime;
1503 	td->td_sched->ts_slptime = 0;
1504 	if (slptime && slptime != ticks) {
1505 		u_int hzticks;
1506 
1507 		hzticks = (ticks - slptime) << SCHED_TICK_SHIFT;
1508 		td->td_sched->skg_slptime += hzticks;
1509 		sched_interact_update(td);
1510 		sched_pctcpu_update(td->td_sched);
1511 		sched_priority(td);
1512 	}
1513 	sched_add(td, SRQ_BORING);
1514 }
1515 
1516 /*
1517  * Penalize the parent for creating a new child and initialize the child's
1518  * priority.
1519  */
1520 void
1521 sched_fork(struct thread *td, struct thread *child)
1522 {
1523 	mtx_assert(&sched_lock, MA_OWNED);
1524 	sched_fork_thread(td, child);
1525 	/*
1526 	 * Penalize the parent and child for forking.
1527 	 */
1528 	sched_interact_fork(child);
1529 	sched_priority(child);
1530 	td->td_sched->skg_runtime += tickincr;
1531 	sched_interact_update(td);
1532 	sched_priority(td);
1533 }
1534 
1535 void
1536 sched_fork_thread(struct thread *td, struct thread *child)
1537 {
1538 	struct td_sched *ts;
1539 	struct td_sched *ts2;
1540 
1541 	/*
1542 	 * Initialize child.
1543 	 */
1544 	sched_newthread(child);
1545 	ts = td->td_sched;
1546 	ts2 = child->td_sched;
1547 	ts2->ts_cpu = ts->ts_cpu;
1548 	ts2->ts_runq = NULL;
1549 	/*
1550 	 * Grab our parent's cpu estimation information and priority.
1551 	 */
1552 	ts2->ts_ticks = ts->ts_ticks;
1553 	ts2->ts_ltick = ts->ts_ltick;
1554 	ts2->ts_ftick = ts->ts_ftick;
1555 	child->td_user_pri = td->td_user_pri;
1556 	child->td_base_user_pri = td->td_base_user_pri;
1557 	/*
1558 	 * And update interactivity score.
1559 	 */
1560 	ts2->skg_slptime = ts->skg_slptime;
1561 	ts2->skg_runtime = ts->skg_runtime;
1562 	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
1563 }
1564 
1565 void
1566 sched_class(struct thread *td, int class)
1567 {
1568 
1569 	mtx_assert(&sched_lock, MA_OWNED);
1570 	if (td->td_pri_class == class)
1571 		return;
1572 
1573 #ifdef SMP
1574 	/*
1575 	 * On SMP if we're on the RUNQ we must adjust the transferable
1576 	 * count because we could be changing to or from an interrupt
1577 	 * class.
1578 	 */
1579 	if (TD_ON_RUNQ(td)) {
1580 		struct tdq *tdq;
1581 
1582 		tdq = TDQ_CPU(td->td_sched->ts_cpu);
1583 		if (THREAD_CAN_MIGRATE(td)) {
1584 			tdq->tdq_transferable--;
1585 			tdq->tdq_group->tdg_transferable--;
1586 		}
1587 		td->td_pri_class = class;
1588 		if (THREAD_CAN_MIGRATE(td)) {
1589 			tdq->tdq_transferable++;
1590 			tdq->tdq_group->tdg_transferable++;
1591 		}
1592 	}
1593 #endif
1594 	td->td_pri_class = class;
1595 }
1596 
1597 /*
1598  * Return some of the child's priority and interactivity to the parent.
1599  */
1600 void
1601 sched_exit(struct proc *p, struct thread *child)
1602 {
1603 	struct thread *td;
1604 
1605 	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
1606 	    child, child->td_proc->p_comm, child->td_priority);
1607 
1608 	td = FIRST_THREAD_IN_PROC(p);
1609 	sched_exit_thread(td, child);
1610 }
1611 
1612 void
1613 sched_exit_thread(struct thread *td, struct thread *child)
1614 {
1615 
1616 	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
1617 	    child, child->td_proc->p_comm, child->td_priority);
1618 
1619 	tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched);
1620 #ifdef KSE
1621 	/*
1622 	 * KSE forks and exits so often that this penalty causes short-lived
1623 	 * threads to always be non-interactive.  This causes mozilla to
1624 	 * crawl under load.
1625 	 */
1626 	if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc)
1627 		return;
1628 #endif
1629 	/*
1630 	 * Give the child's runtime to the parent without returning the
1631 	 * sleep time as a penalty to the parent.  This causes shells that
1632 	 * launch expensive things to mark their children as expensive.
1633 	 */
1634 	td->td_sched->skg_runtime += child->td_sched->skg_runtime;
1635 	sched_interact_update(td);
1636 	sched_priority(td);
1637 }
1638 
1639 void
1640 sched_userret(struct thread *td)
1641 {
1642 	/*
1643 	 * XXX we cheat slightly on the locking here to avoid locking in
1644 	 * the usual case.  Setting td_priority here is essentially an
1645 	 * incomplete workaround for not setting it properly elsewhere.
1646 	 * Now that some interrupt handlers are threads, not setting it
1647 	 * properly elsewhere can clobber it in the window between setting
1648 	 * it here and returning to user mode, so don't waste time setting
1649 	 * it perfectly here.
1650 	 */
1651 	KASSERT((td->td_flags & TDF_BORROWING) == 0,
1652 	    ("thread with borrowed priority returning to userland"));
1653 	if (td->td_priority != td->td_user_pri) {
1654 		mtx_lock_spin(&sched_lock);
1655 		td->td_priority = td->td_user_pri;
1656 		td->td_base_pri = td->td_user_pri;
1657 		mtx_unlock_spin(&sched_lock);
1658 	}
1659 }
1660 
1661 void
1662 sched_clock(struct thread *td)
1663 {
1664 	struct tdq *tdq;
1665 	struct td_sched *ts;
1666 
1667 	mtx_assert(&sched_lock, MA_OWNED);
1668 #ifdef SMP
1669 	sched_smp_tick(td);
1670 #endif
1671 	tdq = TDQ_SELF();
1672 	/*
1673 	 * Advance the insert index once for each tick to ensure that all
1674 	 * threads get a chance to run.
1675 	 */
1676 	if (tdq->tdq_idx == tdq->tdq_ridx) {
1677 		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
1678 		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
1679 			tdq->tdq_ridx = tdq->tdq_idx;
1680 	}
1681 	ts = td->td_sched;
1682 	/*
1683 	 * We only do slicing code for TIMESHARE threads.
1684 	 */
1685 	if (td->td_pri_class != PRI_TIMESHARE)
1686 		return;
1687 	/*
1688 	 * We used a tick; charge it to the thread so that we can compute our
1689 	 * interactivity.
1690 	 */
1691 	td->td_sched->skg_runtime += tickincr;
1692 	sched_interact_update(td);
1693 	/*
1694 	 * We used up one time slice.
1695 	 */
1696 	if (--ts->ts_slice > 0)
1697 		return;
1698 	/*
1699 	 * We're out of time, recompute priorities and requeue.
1700 	 */
1701 	sched_priority(td);
1702 	td->td_flags |= TDF_NEEDRESCHED;
1703 }
1704 
1705 int
1706 sched_runnable(void)
1707 {
1708 	struct tdq *tdq;
1709 	int load;
1710 
1711 	load = 1;
1712 
1713 	tdq = TDQ_SELF();
1714 #ifdef SMP
1715 	if (tdq_busy)
1716 		goto out;
1717 #endif
1718 	if ((curthread->td_flags & TDF_IDLETD) != 0) {
1719 		if (tdq->tdq_load > 0)
1720 			goto out;
1721 	} else
1722 		if (tdq->tdq_load - 1 > 0)
1723 			goto out;
1724 	load = 0;
1725 out:
1726 	return (load);
1727 }
1728 
1729 struct thread *
1730 sched_choose(void)
1731 {
1732 	struct tdq *tdq;
1733 	struct td_sched *ts;
1734 
1735 	mtx_assert(&sched_lock, MA_OWNED);
1736 	tdq = TDQ_SELF();
1737 #ifdef SMP
1738 restart:
1739 #endif
1740 	ts = tdq_choose(tdq);
1741 	if (ts) {
1742 #ifdef SMP
1743 		if (ts->ts_thread->td_priority > PRI_MIN_IDLE)
1744 			if (tdq_idled(tdq) == 0)
1745 				goto restart;
1746 #endif
1747 		tdq_runq_rem(tdq, ts);
1748 		return (ts->ts_thread);
1749 	}
1750 #ifdef SMP
1751 	if (tdq_idled(tdq) == 0)
1752 		goto restart;
1753 #endif
1754 	return (PCPU_GET(idlethread));
1755 }
1756 
1757 static int
1758 sched_preempt(struct thread *td)
1759 {
1760 	struct thread *ctd;
1761 	int cpri;
1762 	int pri;
1763 
1764 	ctd = curthread;
1765 	pri = td->td_priority;
1766 	cpri = ctd->td_priority;
1767 	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
1768 		return (0);
1769 	/*
1770 	 * Always preempt IDLE threads.  Otherwise only if the preempting
1771 	 * thread is an ithread.
1772 	 */
1773 	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
1774 		return (0);
1775 	if (ctd->td_critnest > 1) {
1776 		CTR1(KTR_PROC, "sched_preempt: in critical section %d",
1777 		    ctd->td_critnest);
1778 		ctd->td_owepreempt = 1;
1779 		return (0);
1780 	}
1781 	/*
1782 	 * Thread is runnable but not yet put on system run queue.
1783 	 */
1784 	MPASS(TD_ON_RUNQ(td));
1785 	TD_SET_RUNNING(td);
1786 	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
1787 	    td->td_proc->p_pid, td->td_proc->p_comm);
1788 	mi_switch(SW_INVOL|SW_PREEMPT, td);
1789 	return (1);
1790 }
1791 
1792 void
1793 sched_add(struct thread *td, int flags)
1794 {
1795 	struct tdq *tdq;
1796 	struct td_sched *ts;
1797 	int preemptive;
1798 	int class;
1799 #ifdef SMP
1800 	int cpuid;
1801 	int cpumask;
1802 #endif
1803 	ts = td->td_sched;
1804 
1805 	mtx_assert(&sched_lock, MA_OWNED);
1806 	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
1807 	    td, td->td_proc->p_comm, td->td_priority, curthread,
1808 	    curthread->td_proc->p_comm);
1809 	KASSERT((td->td_inhibitors == 0),
1810 	    ("sched_add: trying to run inhibited thread"));
1811 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
1812 	    ("sched_add: bad thread state"));
1813 	KASSERT(td->td_proc->p_sflag & PS_INMEM,
1814 	    ("sched_add: process swapped out"));
1815 	KASSERT(ts->ts_runq == NULL,
1816 	    ("sched_add: thread %p is still assigned to a run queue", td));
1817 	TD_SET_RUNQ(td);
1818 	tdq = TDQ_SELF();
1819 	class = PRI_BASE(td->td_pri_class);
1820 	preemptive = !(flags & SRQ_YIELDING);
1821 	/*
1822 	 * Recalculate the priority before we select the target cpu or
1823 	 * run-queue.
1824 	 */
1825 	if (class == PRI_TIMESHARE)
1826 		sched_priority(td);
1827 	if (ts->ts_slice == 0)
1828 		ts->ts_slice = sched_slice;
1829 #ifdef SMP
1830 	cpuid = PCPU_GET(cpuid);
1831 	/*
1832 	 * Pick the destination cpu and if it isn't ours transfer to the
1833 	 * target cpu.
1834 	 */
1835 	if (THREAD_CAN_MIGRATE(td)) {
1836 		if (td->td_priority <= PRI_MAX_ITHD) {
1837 			CTR2(KTR_SCHED, "ithd %d < %d", td->td_priority, PRI_MAX_ITHD);
1838 			ts->ts_cpu = cpuid;
1839 		}
1840 		if (pick_pri)
1841 			ts->ts_cpu = tdq_pickpri(tdq, ts, flags);
1842 		else
1843 			ts->ts_cpu = tdq_pickidle(tdq, ts);
1844 	} else
1845 		CTR1(KTR_SCHED, "pinned %d", td->td_pinned);
1846 	if (ts->ts_cpu != cpuid)
1847 		preemptive = 0;
1848 	tdq = TDQ_CPU(ts->ts_cpu);
1849 	cpumask = 1 << ts->ts_cpu;
1850 	/*
1851 	 * If we had been idle, clear our bit in the group and potentially
1852 	 * the global bitmap.
1853 	 */
1854 	if ((class != PRI_IDLE && class != PRI_ITHD) &&
1855 	    (tdq->tdq_group->tdg_idlemask & cpumask) != 0) {
1856 		/*
1857 		 * Check to see if our group is unidling, and if so, remove it
1858 		 * from the global idle mask.
1859 		 */
1860 		if (tdq->tdq_group->tdg_idlemask ==
1861 		    tdq->tdq_group->tdg_cpumask)
1862 			atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
1863 		/*
1864 		 * Now remove ourselves from the group specific idle mask.
1865 		 */
1866 		tdq->tdq_group->tdg_idlemask &= ~cpumask;
1867 	}
1868 #endif
1869 	/*
1870 	 * Pick the run queue based on priority.
1871 	 */
1872 	if (td->td_priority <= PRI_MAX_REALTIME)
1873 		ts->ts_runq = &tdq->tdq_realtime;
1874 	else if (td->td_priority <= PRI_MAX_TIMESHARE)
1875 		ts->ts_runq = &tdq->tdq_timeshare;
1876 	else
1877 		ts->ts_runq = &tdq->tdq_idle;
1878 	if (preemptive && sched_preempt(td))
1879 		return;
1880 	tdq_runq_add(tdq, ts, flags);
1881 	tdq_load_add(tdq, ts);
1882 #ifdef SMP
1883 	if (ts->ts_cpu != cpuid) {
1884 		tdq_notify(ts);
1885 		return;
1886 	}
1887 #endif
1888 	if (td->td_priority < curthread->td_priority)
1889 		curthread->td_flags |= TDF_NEEDRESCHED;
1890 }
1891 
1892 void
1893 sched_rem(struct thread *td)
1894 {
1895 	struct tdq *tdq;
1896 	struct td_sched *ts;
1897 
1898 	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
1899 	    td, td->td_proc->p_comm, td->td_priority, curthread,
1900 	    curthread->td_proc->p_comm);
1901 	mtx_assert(&sched_lock, MA_OWNED);
1902 	ts = td->td_sched;
1903 	KASSERT(TD_ON_RUNQ(td),
1904 	    ("sched_rem: thread not on run queue"));
1905 
1906 	tdq = TDQ_CPU(ts->ts_cpu);
1907 	tdq_runq_rem(tdq, ts);
1908 	tdq_load_rem(tdq, ts);
1909 	TD_SET_CAN_RUN(td);
1910 }
1911 
1912 fixpt_t
1913 sched_pctcpu(struct thread *td)
1914 {
1915 	fixpt_t pctcpu;
1916 	struct td_sched *ts;
1917 
1918 	pctcpu = 0;
1919 	ts = td->td_sched;
1920 	if (ts == NULL)
1921 		return (0);
1922 
1923 	mtx_lock_spin(&sched_lock);
1924 	if (ts->ts_ticks) {
1925 		int rtick;
1926 
1927 		sched_pctcpu_update(ts);
1928 		/* How many rtick per second ? */
1929 		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
1930 		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
1931 	}
1932 	td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick;
1933 	mtx_unlock_spin(&sched_lock);
1934 
1935 	return (pctcpu);
1936 }
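
/*
 * Example (assuming hz = 1000): a thread with SCHED_TICK_HZ(ts) == 2500
 * (25% cpu over the ten second window) yields rtick = 2500 / 10 = 250, and
 * pctcpu works out to FSCALE / 4, i.e. roughly 25% cpu.
 */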
1937 
1938 void
1939 sched_bind(struct thread *td, int cpu)
1940 {
1941 	struct td_sched *ts;
1942 
1943 	mtx_assert(&sched_lock, MA_OWNED);
1944 	ts = td->td_sched;
1945 	if (ts->ts_flags & TSF_BOUND)
1946 		sched_unbind(td);
1947 	ts->ts_flags |= TSF_BOUND;
1948 #ifdef SMP
1949 	sched_pin();
1950 	if (PCPU_GET(cpuid) == cpu)
1951 		return;
1952 	ts->ts_cpu = cpu;
1953 	/* When we return from mi_switch we'll be on the correct cpu. */
1954 	mi_switch(SW_VOL, NULL);
1955 #endif
1956 }
1957 
1958 void
1959 sched_unbind(struct thread *td)
1960 {
1961 	struct td_sched *ts;
1962 
1963 	mtx_assert(&sched_lock, MA_OWNED);
1964 	ts = td->td_sched;
1965 	if ((ts->ts_flags & TSF_BOUND) == 0)
1966 		return;
1967 	ts->ts_flags &= ~TSF_BOUND;
1968 #ifdef SMP
1969 	sched_unpin();
1970 #endif
1971 }
1972 
1973 int
1974 sched_is_bound(struct thread *td)
1975 {
1976 	mtx_assert(&sched_lock, MA_OWNED);
1977 	return (td->td_sched->ts_flags & TSF_BOUND);
1978 }
1979 
1980 void
1981 sched_relinquish(struct thread *td)
1982 {
1983 	mtx_lock_spin(&sched_lock);
1984 	if (td->td_pri_class == PRI_TIMESHARE)
1985 		sched_prio(td, PRI_MAX_TIMESHARE);
1986 	mi_switch(SW_VOL, NULL);
1987 	mtx_unlock_spin(&sched_lock);
1988 }
1989 
1990 int
1991 sched_load(void)
1992 {
1993 #ifdef SMP
1994 	int total;
1995 	int i;
1996 
1997 	total = 0;
1998 	for (i = 0; i <= tdg_maxid; i++)
1999 		total += TDQ_GROUP(i)->tdg_load;
2000 	return (total);
2001 #else
2002 	return (TDQ_SELF()->tdq_sysload);
2003 #endif
2004 }
2005 
2006 int
2007 sched_sizeof_proc(void)
2008 {
2009 	return (sizeof(struct proc));
2010 }
2011 
2012 int
2013 sched_sizeof_thread(void)
2014 {
2015 	return (sizeof(struct thread) + sizeof(struct td_sched));
2016 }
2017 
2018 void
2019 sched_tick(void)
2020 {
2021 	struct td_sched *ts;
2022 
2023 	ts = curthread->td_sched;
2024 	/* Adjust ticks for pctcpu */
2025 	ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
2026 	ts->ts_ltick = ticks;
2027 	/*
2028 	 * Update if we've exceeded our desired tick threshold by over one
2029 	 * second.
2030 	 */
2031 	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
2032 		sched_pctcpu_update(ts);
2033 }
2034 
2035 /*
2036  * The actual idle process.
2037  */
2038 void
2039 sched_idletd(void *dummy)
2040 {
2041 	struct proc *p;
2042 	struct thread *td;
2043 
2044 	td = curthread;
2045 	p = td->td_proc;
2046 	mtx_assert(&Giant, MA_NOTOWNED);
2047 	/* ULE relies on preemption for idle interruption. */
2048 	for (;;)
2049 		cpu_idle();
2050 }
2051 
2052 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
2053 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
2054     "Scheduler name");
2055 SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "");
2056 SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "");
2057 SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, "");
2058 SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, "");
2059 #ifdef SMP
2060 SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0, "");
2061 SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_affinity, CTLFLAG_RW,
2062     &affinity, 0, "");
2063 SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryself, CTLFLAG_RW,
2064     &tryself, 0, "");
2065 SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryselfidle, CTLFLAG_RW,
2066     &tryselfidle, 0, "");
2067 SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "");
2068 SYSCTL_INT(_kern_sched, OID_AUTO, ipi_preempt, CTLFLAG_RW, &ipi_preempt, 0, "");
2069 SYSCTL_INT(_kern_sched, OID_AUTO, ipi_ast, CTLFLAG_RW, &ipi_ast, 0, "");
2070 SYSCTL_INT(_kern_sched, OID_AUTO, ipi_thresh, CTLFLAG_RW, &ipi_thresh, 0, "");
2071 SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, "");
2072 SYSCTL_INT(_kern_sched, OID_AUTO, steal_busy, CTLFLAG_RW, &steal_busy, 0, "");
2073 SYSCTL_INT(_kern_sched, OID_AUTO, busy_thresh, CTLFLAG_RW, &busy_thresh, 0, "");
2074 #endif
2075 
2076 /* ps compat */
2077 static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
2078 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
2079 
2080 
2081 #define KERN_SWITCH_INCLUDE 1
2082 #include "kern/kern_switch.c"
2083