xref: /freebsd/sys/kern/sched_ule.c (revision 08c9a16c4f983367165a141eafa60e27fcd0e76b)
1 /*-
2  * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice unmodified, this list of conditions, and the following
10  *    disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 /*
28  * This file implements the ULE scheduler.  ULE supports independent CPU
29  * run queues and fine-grained locking.  It has superior interactive
30  * performance under load even on uni-processor systems.
31  *
32  * etymology:
33  *   ULE is the last three letters in schedule.  It owes its name to a
34  * generic user created for a scheduling system by Paul Mikesell at
35  * Isilon Systems and a general lack of creativity on the part of the author.
36  */
37 
38 #include <sys/cdefs.h>
39 __FBSDID("$FreeBSD$");
40 
41 #include "opt_hwpmc_hooks.h"
42 #include "opt_sched.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kdb.h>
47 #include <sys/kernel.h>
48 #include <sys/ktr.h>
49 #include <sys/lock.h>
50 #include <sys/mutex.h>
51 #include <sys/proc.h>
52 #include <sys/resource.h>
53 #include <sys/resourcevar.h>
54 #include <sys/sched.h>
55 #include <sys/smp.h>
56 #include <sys/sx.h>
57 #include <sys/sysctl.h>
58 #include <sys/sysproto.h>
59 #include <sys/turnstile.h>
60 #include <sys/umtx.h>
61 #include <sys/vmmeter.h>
62 #ifdef KTRACE
63 #include <sys/uio.h>
64 #include <sys/ktrace.h>
65 #endif
66 
67 #ifdef HWPMC_HOOKS
68 #include <sys/pmckern.h>
69 #endif
70 
71 #include <machine/cpu.h>
72 #include <machine/smp.h>
73 
74 #ifndef PREEMPTION
75 #error	"SCHED_ULE requires options PREEMPTION"
76 #endif
77 
78 #define	KTR_ULE	0
79 
80 /*
81  * Thread scheduler specific section.  All fields are protected
82  * by the thread lock.
83  */
84 struct td_sched {
85 	TAILQ_ENTRY(td_sched) ts_procq;	/* Run queue. */
86 	struct thread	*ts_thread;	/* Active associated thread. */
87 	struct runq	*ts_runq;	/* Run-queue we're queued on. */
88 	short		ts_flags;	/* TSF_* flags. */
89 	u_char		ts_rqindex;	/* Run queue index. */
90 	u_char		ts_cpu;		/* CPU that we have affinity for. */
91 	int		ts_slptick;	/* Tick when we went to sleep. */
92 	int		ts_slice;	/* Ticks of slice remaining. */
93 	u_int		ts_slptime;	/* Number of ticks we voluntarily slept */
94 	u_int		ts_runtime;	/* Number of ticks we were running */
95 	/* The following variables are only used for pctcpu calculation */
96 	int		ts_ltick;	/* Last tick that we were running on */
97 	int		ts_ftick;	/* First tick that we were running on */
98 	int		ts_ticks;	/* Tick count */
99 #ifdef SMP
100 	int		ts_rltick;	/* Real last tick, for affinity. */
101 #endif
102 };
103 /* flags kept in ts_flags */
104 #define	TSF_BOUND	0x0001		/* Thread can not migrate. */
105 #define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */
106 
107 static struct td_sched td_sched0;
108 
109 /*
110  * Cpu percentage computation macros and defines.
111  *
112  * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
113  * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
114  * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
115  * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
116  * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
117  * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
118  */
119 #define	SCHED_TICK_SECS		10
120 #define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
121 #define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
122 #define	SCHED_TICK_SHIFT	10
123 #define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
124 #define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))
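
/*
 * Editorial example (not part of the original source), assuming hz = 1000:
 * SCHED_TICK_TARG is 10000 ticks (a ten second window) and SCHED_TICK_MAX
 * is 11000 ticks before sched_pctcpu_update() scales the history back.
 * Because ts_ticks is accumulated pre-scaled by 2^SCHED_TICK_SHIFT (see
 * tickincr below), a thread on cpu for the entire window has
 * SCHED_TICK_HZ(ts) ~= 10000 and SCHED_TICK_TOTAL(ts) ~= 10000, i.e. a
 * ticks/total ratio near 1; a thread on cpu half the time yields ~1/2.
 */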
125 
126 /*
127  * These macros determine priorities for non-interactive threads.  They are
128  * assigned a priority based on their recent cpu utilization as expressed
129  * by the ratio of ticks to the tick total.  NHALF priorities at the start
130  * and end of the MIN to MAX timeshare range are only reachable with negative
131  * or positive nice respectively.
132  *
133  * PRI_RANGE:	Priority range for utilization dependent priorities.
134  * PRI_NRESV:	Number of nice values.
135  * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
136  * PRI_NICE:	Determines the part of the priority inherited from nice.
137  */
138 #define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
139 #define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
140 #define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
141 #define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
142 #define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN)
143 #define	SCHED_PRI_TICKS(ts)						\
144     (SCHED_TICK_HZ((ts)) /						\
145     (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
146 #define	SCHED_PRI_NICE(nice)	(nice)
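
/*
 * Editorial example: with PRIO_MIN = -20 and PRIO_MAX = 20 the macros
 * above give SCHED_PRI_NRESV = 40 and SCHED_PRI_NHALF = 20, so twenty
 * slots at each end of the timeshare range are reserved for nice offsets.
 * SCHED_PRI_TICKS() maps the ticks/total ratio onto SCHED_PRI_RANGE; a
 * thread on cpu for roughly half of its tracked window is penalized by
 * about SCHED_PRI_RANGE / 2, and SCHED_PRI_NICE(nice) is added verbatim.
 */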
147 
148 /*
149  * These determine the interactivity of a process.  Interactivity differs from
150  * cpu utilization in that it expresses the voluntary time slept vs time ran
151  * while cpu utilization includes all time not running.  This more accurately
152  * models the intent of the thread.
153  *
154  * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
155  *		before throttling back.
156  * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
157  * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
158  * INTERACT_THRESH:	Threshold for placement on the current runq.
159  */
160 #define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
161 #define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
162 #define	SCHED_INTERACT_MAX	(100)
163 #define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
164 #define	SCHED_INTERACT_THRESH	(30)
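
/*
 * Editorial example, assuming hz = 1000: SCHED_SLP_RUN_MAX caps the
 * combined sleep + run history at five seconds (stored scaled up by
 * 2^SCHED_TICK_SHIFT) and SCHED_SLP_RUN_FORK lets a child inherit at most
 * half a second of it.  A score below SCHED_INTERACT_THRESH (30 on the
 * 0..100 scale, where smaller is better) marks a thread as interactive.
 */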
165 
166 /*
167  * tickincr:		Converts a stathz tick into a hz domain scaled by
168  *			the shift factor.  Without the shift the error rate
169  *			due to rounding would be unacceptably high.
170  * realstathz:		stathz is sometimes 0, so we run off of hz instead.
171  * sched_slice:		Runtime of each thread before rescheduling.
172  * preempt_thresh:	Priority threshold for preemption and remote IPIs.
173  */
174 static int sched_interact = SCHED_INTERACT_THRESH;
175 static int realstathz;
176 static int tickincr;
177 static int sched_slice;
178 static int preempt_thresh = PRI_MIN_KERN;
179 
180 #define	SCHED_BAL_SECS	2	/* How often we run the rebalance algorithm. */
181 
182 /*
183  * tdq - per processor runqs and statistics.  All fields are protected by the
184  * tdq_lock.  The load and lowpri may be accessed without the lock to avoid
185  * excess locking in sched_pickcpu().
186  */
187 struct tdq {
188 	struct mtx	tdq_lock;		/* Protects all fields below. */
189 	struct runq	tdq_realtime;		/* real-time run queue. */
190 	struct runq	tdq_timeshare;		/* timeshare run queue. */
191 	struct runq	tdq_idle;		/* Queue of IDLE threads. */
192 	int		tdq_load;		/* Aggregate load. */
193 	u_char		tdq_idx;		/* Current insert index. */
194 	u_char		tdq_ridx;		/* Current removal index. */
195 #ifdef SMP
196 	u_char		tdq_lowpri;		/* Lowest priority thread. */
197 	int		tdq_transferable;	/* Transferable thread count. */
198 	LIST_ENTRY(tdq)	tdq_siblings;		/* Next in tdq group. */
199 	struct tdq_group *tdq_group;		/* Our processor group. */
200 #else
201 	int		tdq_sysload;		/* For loadavg, !ITHD load. */
202 #endif
203 	char		tdq_name[16];		/* lock name. */
204 } __aligned(64);
205 
206 
207 #ifdef SMP
208 /*
209  * tdq groups are groups of processors which can cheaply share threads.  When
210  * one processor in the group goes idle it will check the runqs of the other
211  * processors in its group prior to halting and waiting for an interrupt.
212  * These groups are suitable for SMT (Simultaneous Multi-Threading) and not NUMA.
213  * In a NUMA environment we'd want an idle bitmap per group and a two-tiered
214  * load balancer.
215  */
216 struct tdq_group {
217 	int	tdg_cpus;		/* Count of CPUs in this tdq group. */
218 	cpumask_t tdg_cpumask;		/* Mask of cpus in this group. */
219 	cpumask_t tdg_idlemask;		/* Idle cpus in this group. */
220 	cpumask_t tdg_mask;		/* Bit mask for first cpu. */
221 	int	tdg_load;		/* Total load of this group. */
222 	int	tdg_transferable;	/* Transferable load of this group. */
223 	LIST_HEAD(, tdq) tdg_members;	/* Linked list of all members. */
224 } __aligned(64);
225 
226 #define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 300))
227 #define	SCHED_AFFINITY(ts)	((ts)->ts_rltick > ticks - affinity)
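
/*
 * Editorial example, assuming hz = 1000: the default affinity window is
 * max(1, 1000 / 300) = 3 ticks, so SCHED_AFFINITY(ts) holds only if the
 * thread last ran within the previous 3 ticks; after that its cache is
 * assumed cold and sched_pickcpu() is free to look elsewhere.
 */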
228 
229 /*
230  * Run-time tunables.
231  */
232 static int rebalance = 0;
233 static int pick_pri = 0;
234 static int pick_zero = 0;
235 static int affinity;
236 static int tryself = 1;
237 static int tryselfidle = 1;
238 static int steal_htt = 0;
239 static int steal_idle = 0;
240 static int topology = 0;
241 
242 /*
243  * One thread queue per processor.
244  */
245 static volatile cpumask_t tdq_idle;
246 static int tdg_maxid;
247 static struct tdq	tdq_cpu[MAXCPU];
248 static struct tdq_group tdq_groups[MAXCPU];
249 static struct callout balco;
250 static struct callout gbalco;
251 
252 #define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
253 #define	TDQ_CPU(x)	(&tdq_cpu[(x)])
254 #define	TDQ_ID(x)	((x) - tdq_cpu)
255 #define	TDQ_GROUP(x)	(&tdq_groups[(x)])
256 #else	/* !SMP */
257 static struct tdq	tdq_cpu;
258 
259 #define	TDQ_ID(x)	(0)
260 #define	TDQ_SELF()	(&tdq_cpu)
261 #define	TDQ_CPU(x)	(&tdq_cpu)
262 #endif
263 
264 #define	TDQ_LOCK_ASSERT(t, type)	mtx_assert(TDQ_LOCKPTR((t)), (type))
265 #define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
266 #define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
267 #define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
268 #define	TDQ_LOCKPTR(t)		(&(t)->tdq_lock)
269 
270 static void sched_priority(struct thread *);
271 static void sched_thread_priority(struct thread *, u_char);
272 static int sched_interact_score(struct thread *);
273 static void sched_interact_update(struct thread *);
274 static void sched_interact_fork(struct thread *);
275 static void sched_pctcpu_update(struct td_sched *);
276 
277 /* Operations on per processor queues */
278 static struct td_sched * tdq_choose(struct tdq *);
279 static void tdq_setup(struct tdq *);
280 static void tdq_load_add(struct tdq *, struct td_sched *);
281 static void tdq_load_rem(struct tdq *, struct td_sched *);
282 static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
283 static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
284 void tdq_print(int cpu);
285 static void runq_print(struct runq *rq);
286 static void tdq_add(struct tdq *, struct thread *, int);
287 #ifdef SMP
288 static void tdq_move(struct tdq *, struct tdq *);
289 static int tdq_idled(struct tdq *);
290 static void tdq_notify(struct td_sched *);
291 static struct td_sched *tdq_steal(struct tdq *, int);
292 static struct td_sched *runq_steal(struct runq *);
293 static int sched_pickcpu(struct td_sched *, int);
294 static void sched_balance(void *);
295 static void sched_balance_groups(void *);
296 static void sched_balance_group(struct tdq_group *);
297 static void sched_balance_pair(struct tdq *, struct tdq *);
298 static inline struct tdq *sched_setcpu(struct td_sched *, int, int);
299 static inline struct mtx *thread_block_switch(struct thread *);
300 static inline void thread_unblock_switch(struct thread *, struct mtx *);
301 
302 #define	THREAD_CAN_MIGRATE(td)	 ((td)->td_pinned == 0)
303 #endif
304 
305 static void sched_setup(void *dummy);
306 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
307 
308 static void sched_initticks(void *dummy);
309 SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
310 
311 /*
312  * Print the threads waiting on a run-queue.
313  */
314 static void
315 runq_print(struct runq *rq)
316 {
317 	struct rqhead *rqh;
318 	struct td_sched *ts;
319 	int pri;
320 	int j;
321 	int i;
322 
323 	for (i = 0; i < RQB_LEN; i++) {
324 		printf("\t\trunq bits %d 0x%zx\n",
325 		    i, rq->rq_status.rqb_bits[i]);
326 		for (j = 0; j < RQB_BPW; j++)
327 			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
328 				pri = j + (i << RQB_L2BPW);
329 				rqh = &rq->rq_queues[pri];
330 				TAILQ_FOREACH(ts, rqh, ts_procq) {
331 					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
332 					    ts->ts_thread, ts->ts_thread->td_proc->p_comm, ts->ts_thread->td_priority, ts->ts_rqindex, pri);
333 				}
334 			}
335 	}
336 }
337 
338 /*
339  * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
340  */
341 void
342 tdq_print(int cpu)
343 {
344 	struct tdq *tdq;
345 
346 	tdq = TDQ_CPU(cpu);
347 
348 	printf("tdq:\n");
349 	printf("\tlockptr         %p\n", TDQ_LOCKPTR(tdq));
350 	printf("\tlock name       %s\n", tdq->tdq_name);
351 	printf("\tload:           %d\n", tdq->tdq_load);
352 	printf("\ttimeshare idx:  %d\n", tdq->tdq_idx);
353 	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
354 	printf("\trealtime runq:\n");
355 	runq_print(&tdq->tdq_realtime);
356 	printf("\ttimeshare runq:\n");
357 	runq_print(&tdq->tdq_timeshare);
358 	printf("\tidle runq:\n");
359 	runq_print(&tdq->tdq_idle);
360 #ifdef SMP
361 	printf("\tload transferable: %d\n", tdq->tdq_transferable);
362 	printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
363 #endif
364 }
365 
366 #define	TS_RQ_PPQ	(((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
367 /*
368  * Add a thread to the actual run-queue.  Keeps transferable counts up to
369  * date with what is actually on the run-queue.  Selects the correct
370  * queue position for timeshare threads.
371  */
372 static __inline void
373 tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
374 {
375 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
376 	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
377 #ifdef SMP
378 	if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
379 		tdq->tdq_transferable++;
380 		tdq->tdq_group->tdg_transferable++;
381 		ts->ts_flags |= TSF_XFERABLE;
382 	}
383 #endif
384 	if (ts->ts_runq == &tdq->tdq_timeshare) {
385 		u_char pri;
386 
387 		pri = ts->ts_thread->td_priority;
388 		KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
389 			("Invalid priority %d on timeshare runq", pri));
390 		/*
391 		 * This queue contains only priorities between MIN and MAX
392 		 * timeshare.  Use the whole queue to represent these values.
393 		 */
394 		if ((flags & SRQ_BORROWING) == 0) {
395 			pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
396 			pri = (pri + tdq->tdq_idx) % RQ_NQS;
397 			/*
398 			 * This effectively shortens the queue by one so we
399 			 * can have a one slot difference between idx and
400 			 * ridx while we wait for threads to drain.
401 			 */
402 			if (tdq->tdq_ridx != tdq->tdq_idx &&
403 			    pri == tdq->tdq_ridx)
404 				pri = (unsigned char)(pri - 1) % RQ_NQS;
405 		} else
406 			pri = tdq->tdq_ridx;
407 		runq_add_pri(ts->ts_runq, ts, pri, flags);
408 	} else
409 		runq_add(ts->ts_runq, ts, flags);
410 }
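
/*
 * Editorial example of the circular insert above, with RQ_NQS = 64: a
 * timeshare priority is first scaled down by TS_RQ_PPQ into a bucket and
 * then rotated by tdq_idx, so a bucket of 5 with tdq_idx = 10 lands in
 * queue (5 + 10) % 64 = 15.  If that would collide with tdq_ridx while
 * the old head is still draining, the thread is instead placed one slot
 * earlier, at the far end of the circular order, preserving the one slot
 * gap between idx and ridx.
 */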
411 
412 /*
413  * Remove a thread from a run-queue.  This typically happens when a thread
414  * is selected to run.  Running threads are not on the queue and the
415  * transferable count does not reflect them.
416  */
417 static __inline void
418 tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
419 {
420 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
421 	KASSERT(ts->ts_runq != NULL,
422 	    ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread));
423 #ifdef SMP
424 	if (ts->ts_flags & TSF_XFERABLE) {
425 		tdq->tdq_transferable--;
426 		tdq->tdq_group->tdg_transferable--;
427 		ts->ts_flags &= ~TSF_XFERABLE;
428 	}
429 #endif
430 	if (ts->ts_runq == &tdq->tdq_timeshare) {
431 		if (tdq->tdq_idx != tdq->tdq_ridx)
432 			runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
433 		else
434 			runq_remove_idx(ts->ts_runq, ts, NULL);
435 		/*
436 		 * For timeshare threads we update the priority here so
437 		 * the priority reflects the time we've been sleeping.
438 		 */
439 		ts->ts_ltick = ticks;
440 		sched_pctcpu_update(ts);
441 		sched_priority(ts->ts_thread);
442 	} else
443 		runq_remove(ts->ts_runq, ts);
444 }
445 
446 /*
447  * Load is maintained for all threads RUNNING and ON_RUNQ.  Add the load
448  * for this thread to the referenced thread queue.
449  */
450 static void
451 tdq_load_add(struct tdq *tdq, struct td_sched *ts)
452 {
453 	int class;
454 
455 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
456 	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
457 	class = PRI_BASE(ts->ts_thread->td_pri_class);
458 	tdq->tdq_load++;
459 	CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load);
460 	if (class != PRI_ITHD &&
461 	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
462 #ifdef SMP
463 		tdq->tdq_group->tdg_load++;
464 #else
465 		tdq->tdq_sysload++;
466 #endif
467 }
468 
469 /*
470  * Remove the load from a thread that is transitioning to a sleep state or
471  * exiting.
472  */
473 static void
474 tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
475 {
476 	int class;
477 
478 	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
479 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
480 	class = PRI_BASE(ts->ts_thread->td_pri_class);
481 	if (class != PRI_ITHD &&
482 	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
483 #ifdef SMP
484 		tdq->tdq_group->tdg_load--;
485 #else
486 		tdq->tdq_sysload--;
487 #endif
488 	KASSERT(tdq->tdq_load != 0,
489 	    ("tdq_load_rem: Removing with 0 load on queue %d", (int)TDQ_ID(tdq)));
490 	tdq->tdq_load--;
491 	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
492 	ts->ts_runq = NULL;
493 }
494 
495 #ifdef SMP
496 /*
497  * sched_balance is a simple CPU load balancing algorithm.  It operates by
498  * finding the least loaded and most loaded cpu and equalizing their load
499  * by migrating some processes.
500  *
501  * Dealing only with two CPUs at a time has two advantages.  Firstly, most
502  * installations will only have 2 cpus.  Secondly, load balancing too much at
503  * once can have an unpleasant effect on the system.  The scheduler rarely has
504  * enough information to make perfect decisions.  So this algorithm chooses
505  * simplicity and more gradual effects on load in larger systems.
506  *
507  */
508 static void
509 sched_balance(void *arg)
510 {
511 	struct tdq_group *high;
512 	struct tdq_group *low;
513 	struct tdq_group *tdg;
514 	int cnt;
515 	int i;
516 
517 	callout_reset(&balco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)),
518 	    sched_balance, NULL);
519 	if (smp_started == 0 || rebalance == 0)
520 		return;
521 	low = high = NULL;
522 	i = random() % (tdg_maxid + 1);
523 	for (cnt = 0; cnt <= tdg_maxid; cnt++) {
524 		tdg = TDQ_GROUP(i);
525 		/*
526 		 * Find the CPU with the highest load that has some
527 		 * threads to transfer.
528 		 */
529 		if ((high == NULL || tdg->tdg_load > high->tdg_load)
530 		    && tdg->tdg_transferable)
531 			high = tdg;
532 		if (low == NULL || tdg->tdg_load < low->tdg_load)
533 			low = tdg;
534 		if (++i > tdg_maxid)
535 			i = 0;
536 	}
537 	if (low != NULL && high != NULL && high != low)
538 		sched_balance_pair(LIST_FIRST(&high->tdg_members),
539 		    LIST_FIRST(&low->tdg_members));
540 }
541 
542 /*
543  * Balance load between CPUs in a group.  Will only migrate within the group.
544  */
545 static void
546 sched_balance_groups(void *arg)
547 {
548 	int i;
549 
550 	callout_reset(&gbalco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)),
551 	    sched_balance_groups, NULL);
552 	if (smp_started == 0 || rebalance == 0)
553 		return;
554 	for (i = 0; i <= tdg_maxid; i++)
555 		sched_balance_group(TDQ_GROUP(i));
556 }
557 
558 /*
559  * Finds the greatest imbalance between two tdqs in a group.
560  */
561 static void
562 sched_balance_group(struct tdq_group *tdg)
563 {
564 	struct tdq *tdq;
565 	struct tdq *high;
566 	struct tdq *low;
567 	int load;
568 
569 	if (tdg->tdg_transferable == 0)
570 		return;
571 	low = NULL;
572 	high = NULL;
573 	LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
574 		load = tdq->tdq_load;
575 		if (high == NULL || load > high->tdq_load)
576 			high = tdq;
577 		if (low == NULL || load < low->tdq_load)
578 			low = tdq;
579 	}
580 	if (high != NULL && low != NULL && high != low)
581 		sched_balance_pair(high, low);
582 }
583 
584 /*
585  * Lock two thread queues using their address to maintain lock order.
586  */
587 static void
588 tdq_lock_pair(struct tdq *one, struct tdq *two)
589 {
590 	if (one < two) {
591 		TDQ_LOCK(one);
592 		TDQ_LOCK_FLAGS(two, MTX_DUPOK);
593 	} else {
594 		TDQ_LOCK(two);
595 		TDQ_LOCK_FLAGS(one, MTX_DUPOK);
596 	}
597 }
598 
599 /*
600  * Transfer load between two imbalanced thread queues.
601  */
602 static void
603 sched_balance_pair(struct tdq *high, struct tdq *low)
604 {
605 	int transferable;
606 	int high_load;
607 	int low_load;
608 	int move;
609 	int diff;
610 	int i;
611 
612 	tdq_lock_pair(high, low);
613 	/*
614 	 * If we're transferring within a group we have to use this specific
615 	 * tdq's transferable count, otherwise we can steal from other members
616 	 * of the group.
617 	 */
618 	if (high->tdq_group == low->tdq_group) {
619 		transferable = high->tdq_transferable;
620 		high_load = high->tdq_load;
621 		low_load = low->tdq_load;
622 	} else {
623 		transferable = high->tdq_group->tdg_transferable;
624 		high_load = high->tdq_group->tdg_load;
625 		low_load = low->tdq_group->tdg_load;
626 	}
627 	/*
628 	 * Determine what the imbalance is and then adjust that to how many
629 	 * threads we actually have to give up (transferable).
630 	 */
631 	if (transferable != 0) {
632 		diff = high_load - low_load;
633 		move = diff / 2;
634 		if (diff & 0x1)
635 			move++;
636 		move = min(move, transferable);
637 		for (i = 0; i < move; i++)
638 			tdq_move(high, low);
639 	}
640 	TDQ_UNLOCK(high);
641 	TDQ_UNLOCK(low);
642 	return;
643 }
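
/*
 * Editorial example of the transfer calculation above: if the busy side
 * reports a load of 6 and the idle side a load of 2, diff is 4 and move
 * is 2, so two threads are migrated.  An odd diff rounds up, and move is
 * always clipped to the number of transferable threads.
 */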
644 
645 /*
646  * Move a thread from one thread queue to another.
647  */
648 static void
649 tdq_move(struct tdq *from, struct tdq *to)
650 {
651 	struct td_sched *ts;
652 	struct thread *td;
653 	struct tdq *tdq;
654 	int cpu;
655 
656 	tdq = from;
657 	cpu = TDQ_ID(to);
658 	ts = tdq_steal(tdq, 1);
659 	if (ts == NULL) {
660 		struct tdq_group *tdg;
661 
662 		tdg = tdq->tdq_group;
663 		LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
664 			if (tdq == from || tdq->tdq_transferable == 0)
665 				continue;
666 			ts = tdq_steal(tdq, 1);
667 			break;
668 		}
669 		if (ts == NULL)
670 			return;
671 	}
672 	if (tdq == to)
673 		return;
674 	td = ts->ts_thread;
675 	/*
676 	 * Although the run queue is locked the thread may be blocked.  Lock
677 	 * it to clear this.
678 	 */
679 	thread_lock(td);
680 	/* Drop recursive lock on from. */
681 	TDQ_UNLOCK(from);
682 	sched_rem(td);
683 	ts->ts_cpu = cpu;
684 	td->td_lock = TDQ_LOCKPTR(to);
685 	tdq_add(to, td, SRQ_YIELDING);
686 	tdq_notify(ts);
687 }
688 
689 /*
690  * This tdq has idled.  Try to steal a thread from another cpu and switch
691  * to it.
692  */
693 static int
694 tdq_idled(struct tdq *tdq)
695 {
696 	struct tdq_group *tdg;
697 	struct tdq *steal;
698 	struct td_sched *ts;
699 	struct thread *td;
700 	int highload;
701 	int highcpu;
702 	int load;
703 	int cpu;
704 
705 	/* We don't want to be preempted while we're iterating over tdqs */
706 	spinlock_enter();
707 	tdg = tdq->tdq_group;
708 	/*
709 	 * If we're in a cpu group, try and steal threads from another cpu in
710 	 * the group before idling.
711 	 */
712 	if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
713 		LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
714 			if (steal == tdq || steal->tdq_transferable == 0)
715 				continue;
716 			TDQ_LOCK(steal);
717 			ts = tdq_steal(steal, 0);
718 			if (ts)
719 				goto steal;
720 			TDQ_UNLOCK(steal);
721 		}
722 	}
723 	for (;;) {
724 		if (steal_idle == 0)
725 			break;
726 		highcpu = 0;
727 		highload = 0;
728 		for (cpu = 0; cpu <= mp_maxid; cpu++) {
729 			if (CPU_ABSENT(cpu))
730 				continue;
731 			steal = TDQ_CPU(cpu);
732 			load = TDQ_CPU(cpu)->tdq_transferable;
733 			if (load < highload)
734 				continue;
735 			highload = load;
736 			highcpu = cpu;
737 		}
738 		if (highload < 2)
739 			break;
740 		steal = TDQ_CPU(highcpu);
741 		TDQ_LOCK(steal);
742 		if (steal->tdq_transferable > 1 &&
743 		    (ts = tdq_steal(steal, 1)) != NULL)
744 			goto steal;
745 		TDQ_UNLOCK(steal);
746 		break;
747 	}
748 	spinlock_exit();
749 	return (1);
750 steal:
751 	td = ts->ts_thread;
752 	thread_lock(td);
753 	spinlock_exit();
754 	MPASS(td->td_lock == TDQ_LOCKPTR(steal));
755 	TDQ_UNLOCK(steal);
756 	sched_rem(td);
757 	sched_setcpu(ts, PCPU_GET(cpuid), SRQ_YIELDING);
758 	tdq_add(tdq, td, SRQ_YIELDING);
759 	MPASS(td->td_lock == curthread->td_lock);
760 	mi_switch(SW_VOL, NULL);
761 	thread_unlock(curthread);
762 
763 	return (0);
764 }
765 
766 /*
767  * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
768  */
769 static void
770 tdq_notify(struct td_sched *ts)
771 {
772 	struct thread *ctd;
773 	struct pcpu *pcpu;
774 	int cpri;
775 	int pri;
776 	int cpu;
777 
778 	cpu = ts->ts_cpu;
779 	pri = ts->ts_thread->td_priority;
780 	pcpu = pcpu_find(cpu);
781 	ctd = pcpu->pc_curthread;
782 	cpri = ctd->td_priority;
783 
784 	/*
785 	 * If our priority is not better than the current priority there is
786 	 * nothing to do.
787 	 */
788 	if (pri > cpri)
789 		return;
790 	/*
791 	 * Always IPI idle.
792 	 */
793 	if (cpri > PRI_MIN_IDLE)
794 		goto sendipi;
795 	/*
796 	 * If we're realtime or better and there is timeshare or worse running
797 	 * send an IPI.
798 	 */
799 	if (pri < PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
800 		goto sendipi;
801 	/*
802 	 * Otherwise only IPI if we exceed the threshold.
803 	 */
804 	if (pri > preempt_thresh)
805 		return;
806 sendipi:
807 	ctd->td_flags |= TDF_NEEDRESCHED;
808 	ipi_selected(1 << cpu, IPI_PREEMPT);
809 }
810 
811 /*
812  * Steals load from a timeshare queue.  Honors the rotating queue head
813  * index.
814  */
815 static struct td_sched *
816 runq_steal_from(struct runq *rq, u_char start)
817 {
818 	struct td_sched *ts;
819 	struct rqbits *rqb;
820 	struct rqhead *rqh;
821 	int first;
822 	int bit;
823 	int pri;
824 	int i;
825 
826 	rqb = &rq->rq_status;
827 	bit = start & (RQB_BPW - 1);
828 	pri = 0;
829 	first = 0;
830 again:
831 	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
832 		if (rqb->rqb_bits[i] == 0)
833 			continue;
834 		if (bit != 0) {
835 			for (pri = bit; pri < RQB_BPW; pri++)
836 				if (rqb->rqb_bits[i] & (1ul << pri))
837 					break;
838 			if (pri >= RQB_BPW)
839 				continue;
840 		} else
841 			pri = RQB_FFS(rqb->rqb_bits[i]);
842 		pri += (i << RQB_L2BPW);
843 		rqh = &rq->rq_queues[pri];
844 		TAILQ_FOREACH(ts, rqh, ts_procq) {
845 			if (first && THREAD_CAN_MIGRATE(ts->ts_thread))
846 				return (ts);
847 			first = 1;
848 		}
849 	}
850 	if (start != 0) {
851 		start = 0;
852 		goto again;
853 	}
854 
855 	return (NULL);
856 }
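
/*
 * Editorial note on the loop above: the 'first' flag means the very first
 * thread examined is never returned, leaving the head candidate for the
 * owning cpu; only a later thread that can migrate is handed to the
 * stealer.  The second pass with start = 0 handles wrap-around of the
 * rotating head index.
 */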
857 
858 /*
859  * Steals load from a standard linear queue.
860  */
861 static struct td_sched *
862 runq_steal(struct runq *rq)
863 {
864 	struct rqhead *rqh;
865 	struct rqbits *rqb;
866 	struct td_sched *ts;
867 	int first;
868 	int word;
869 	int bit;
870 
871 	first = 0;
872 	rqb = &rq->rq_status;
873 	for (word = 0; word < RQB_LEN; word++) {
874 		if (rqb->rqb_bits[word] == 0)
875 			continue;
876 		for (bit = 0; bit < RQB_BPW; bit++) {
877 			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
878 				continue;
879 			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
880 			TAILQ_FOREACH(ts, rqh, ts_procq) {
881 				if (first && THREAD_CAN_MIGRATE(ts->ts_thread))
882 					return (ts);
883 				first = 1;
884 			}
885 		}
886 	}
887 	return (NULL);
888 }
889 
890 /*
891  * Attempt to steal a thread in priority order from a thread queue.
892  */
893 static struct td_sched *
894 tdq_steal(struct tdq *tdq, int stealidle)
895 {
896 	struct td_sched *ts;
897 
898 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
899 	if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL)
900 		return (ts);
901 	if ((ts = runq_steal_from(&tdq->tdq_timeshare, tdq->tdq_ridx)) != NULL)
902 		return (ts);
903 	if (stealidle)
904 		return (runq_steal(&tdq->tdq_idle));
905 	return (NULL);
906 }
907 
908 /*
909  * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
910  * current lock and returns with the assigned queue locked.  If this is
911  * via sched_switch() we leave the thread in a blocked state as an
912  * optimization.
913  */
914 static inline struct tdq *
915 sched_setcpu(struct td_sched *ts, int cpu, int flags)
916 {
917 	struct thread *td;
918 	struct tdq *tdq;
919 
920 	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
921 
922 	tdq = TDQ_CPU(cpu);
923 	td = ts->ts_thread;
924 	ts->ts_cpu = cpu;
925 	if (td->td_lock == TDQ_LOCKPTR(tdq))
926 		return (tdq);
927 #ifdef notyet
928 	/*
929 	 * If the thread isn't running, its lockptr is a
930 	 * turnstile or a sleepqueue.  We can just lock_set without
931 	 * blocking.
932 	 */
933 	if (TD_CAN_RUN(td)) {
934 		TDQ_LOCK(tdq);
935 		thread_lock_set(td, TDQ_LOCKPTR(tdq));
936 		return (tdq);
937 	}
938 #endif
939 	/*
940 	 * The hard case, migration, we need to block the thread first to
941 	 * prevent order reversals with other cpus locks.
942 	 */
943 	thread_lock_block(td);
944 	TDQ_LOCK(tdq);
945 	/* Return to sched_switch() with the lock still blocked */
946 	if ((flags & SRQ_OURSELF) == 0)
947 		thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
948 	return (tdq);
949 }
950 
951 /*
952  * Find the thread queue running the lowest priority thread.
953  */
954 static int
955 tdq_lowestpri(void)
956 {
957 	struct tdq *tdq;
958 	int lowpri;
959 	int lowcpu;
960 	int lowload;
961 	int load;
962 	int cpu;
963 	int pri;
964 
965 	lowload = 0;
966 	lowpri = lowcpu = 0;
967 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
968 		if (CPU_ABSENT(cpu))
969 			continue;
970 		tdq = TDQ_CPU(cpu);
971 		pri = tdq->tdq_lowpri;
972 		load = TDQ_CPU(cpu)->tdq_load;
973 		CTR4(KTR_ULE,
974 		    "cpu %d pri %d lowcpu %d lowpri %d",
975 		    cpu, pri, lowcpu, lowpri);
976 		if (pri < lowpri)
977 			continue;
978 		if (lowpri && lowpri == pri && load > lowload)
979 			continue;
980 		lowpri = pri;
981 		lowcpu = cpu;
982 		lowload = load;
983 	}
984 
985 	return (lowcpu);
986 }
987 
988 /*
989  * Find the thread queue with the least load.
990  */
991 static int
992 tdq_lowestload(void)
993 {
994 	struct tdq *tdq;
995 	int lowload;
996 	int lowpri;
997 	int lowcpu;
998 	int load;
999 	int cpu;
1000 	int pri;
1001 
1002 	lowcpu = 0;
1003 	lowload = TDQ_CPU(0)->tdq_load;
1004 	lowpri = TDQ_CPU(0)->tdq_lowpri;
1005 	for (cpu = 1; cpu <= mp_maxid; cpu++) {
1006 		if (CPU_ABSENT(cpu))
1007 			continue;
1008 		tdq = TDQ_CPU(cpu);
1009 		load = tdq->tdq_load;
1010 		pri = tdq->tdq_lowpri;
1011 		CTR4(KTR_ULE, "cpu %d load %d lowcpu %d lowload %d",
1012 		    cpu, load, lowcpu, lowload);
1013 		if (load > lowload)
1014 			continue;
1015 		if (load == lowload && pri < lowpri)
1016 			continue;
1017 		lowcpu = cpu;
1018 		lowload = load;
1019 		lowpri = pri;
1020 	}
1021 
1022 	return (lowcpu);
1023 }
1024 
1025 /*
1026  * Pick the destination cpu for sched_add().  Respects affinity and makes
1027  * a determination based on load or priority of available processors.
1028  */
1029 static int
1030 sched_pickcpu(struct td_sched *ts, int flags)
1031 {
1032 	struct tdq *tdq;
1033 	int self;
1034 	int pri;
1035 	int cpu;
1036 
1037 	cpu = self = PCPU_GET(cpuid);
1038 	if (smp_started == 0)
1039 		return (self);
1040 	pri = ts->ts_thread->td_priority;
1041 	cpu = ts->ts_cpu;
1042 	/*
1043 	 * Regardless of affinity, if the last cpu is idle send it there.
1044 	 */
1045 	tdq = TDQ_CPU(cpu);
1046 	if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
1047 		CTR5(KTR_ULE,
1048 		    "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d",
1049 		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
1050 		    tdq->tdq_lowpri);
1051 		return (ts->ts_cpu);
1052 	}
1053 	/*
1054 	 * If we have affinity, try to place it on the cpu we last ran on.
1055 	 */
1056 	if (SCHED_AFFINITY(ts) && tdq->tdq_lowpri > pri) {
1057 		CTR5(KTR_ULE,
1058 		    "affinity for %d, ltick %d ticks %d pri %d curthread %d",
1059 		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
1060 		    tdq->tdq_lowpri);
1061 		return (ts->ts_cpu);
1062 	}
1063 	/*
1064 	 * Try ourselves first; if we're running something of lower priority,
1065 	 * the woken thread may have some locality with the waker and execute
1066 	 * faster here.
1067 	 */
1068 	if (tryself) {
1069 		/*
1070 		 * If we're being awoken by an interrupt thread or the waker
1071 		 * is going right to sleep run here as well.
1072 		 */
1073 		if ((TDQ_SELF()->tdq_load <= 1) && (flags & (SRQ_YIELDING) ||
1074 		    curthread->td_pri_class == PRI_ITHD)) {
1075 			CTR2(KTR_ULE, "tryself load %d flags %d",
1076 			    TDQ_SELF()->tdq_load, flags);
1077 			return (self);
1078 		}
1079 	}
1080 	/*
1081 	 * Look for an idle group.
1082 	 */
1083 	CTR1(KTR_ULE, "tdq_idle %X", tdq_idle);
1084 	cpu = ffs(tdq_idle);
1085 	if (cpu)
1086 		return (--cpu);
1087 	if (tryselfidle && pri < curthread->td_priority) {
1088 		CTR1(KTR_ULE, "tryselfidle %d",
1089 		    curthread->td_priority);
1090 		return (self);
1091 	}
1092 	/*
1093 	 * XXX Under heavy load mysql performs way better if you
1094 	 * serialize the non-running threads on one cpu.  This is
1095 	 * a horrible hack.
1096 	 */
1097 	if (pick_zero)
1098 		return (0);
1099 	/*
1100  	 * Now search for the cpu running the lowest priority thread with
1101 	 * the least load.
1102 	 */
1103 	if (pick_pri)
1104 		cpu = tdq_lowestpri();
1105 	else
1106 		cpu = tdq_lowestload();
1107 	return (cpu);
1108 }
1109 
1110 #endif	/* SMP */
1111 
1112 /*
1113  * Pick the highest priority task we have and return it.
1114  */
1115 static struct td_sched *
1116 tdq_choose(struct tdq *tdq)
1117 {
1118 	struct td_sched *ts;
1119 
1120 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1121 	ts = runq_choose(&tdq->tdq_realtime);
1122 	if (ts != NULL)
1123 		return (ts);
1124 	ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
1125 	if (ts != NULL) {
1126 		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
1127 		    ("tdq_choose: Invalid priority on timeshare queue %d",
1128 		    ts->ts_thread->td_priority));
1129 		return (ts);
1130 	}
1131 
1132 	ts = runq_choose(&tdq->tdq_idle);
1133 	if (ts != NULL) {
1134 		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
1135 		    ("tdq_choose: Invalid priority on idle queue %d",
1136 		    ts->ts_thread->td_priority));
1137 		return (ts);
1138 	}
1139 
1140 	return (NULL);
1141 }
1142 
1143 /*
1144  * Initialize a thread queue.
1145  */
1146 static void
1147 tdq_setup(struct tdq *tdq)
1148 {
1149 
1150 	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
1151 	    "sched lock %d", (int)TDQ_ID(tdq));
1152 	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
1153 	    MTX_SPIN | MTX_RECURSE);
1154 	runq_init(&tdq->tdq_realtime);
1155 	runq_init(&tdq->tdq_timeshare);
1156 	runq_init(&tdq->tdq_idle);
1157 	tdq->tdq_load = 0;
1158 }
1159 
1160 /*
1161  * Set up the thread queues and initialize the topology based on MD
1162  * information.
1163  */
1164 static void
1165 sched_setup(void *dummy)
1166 {
1167 	struct tdq *tdq;
1168 #ifdef SMP
1169 	int balance_groups;
1170 	int i;
1171 
1172 	balance_groups = 0;
1173 	/*
1174 	 * Initialize the tdqs.
1175 	 */
1176 	for (i = 0; i < MAXCPU; i++) {
1177 		tdq = &tdq_cpu[i];
1178 		tdq_setup(&tdq_cpu[i]);
1179 	}
1180 	if (smp_topology == NULL) {
1181 		struct tdq_group *tdg;
1182 		int cpus;
1183 
1184 		for (cpus = 0, i = 0; i < MAXCPU; i++) {
1185 			if (CPU_ABSENT(i))
1186 				continue;
1187 			tdq = &tdq_cpu[i];
1188 			tdg = &tdq_groups[cpus];
1189 			/*
1190 			 * Setup a tdq group with one member.
1191 			 */
1192 			tdq->tdq_transferable = 0;
1193 			tdq->tdq_group = tdg;
1194 			tdg->tdg_cpus = 1;
1195 			tdg->tdg_idlemask = 0;
1196 			tdg->tdg_cpumask = tdg->tdg_mask = 1 << i;
1197 			tdg->tdg_load = 0;
1198 			tdg->tdg_transferable = 0;
1199 			LIST_INIT(&tdg->tdg_members);
1200 			LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings);
1201 			cpus++;
1202 		}
1203 		tdg_maxid = cpus - 1;
1204 	} else {
1205 		struct tdq_group *tdg;
1206 		struct cpu_group *cg;
1207 		int j;
1208 
1209 		topology = 1;
1210 		for (i = 0; i < smp_topology->ct_count; i++) {
1211 			cg = &smp_topology->ct_group[i];
1212 			tdg = &tdq_groups[i];
1213 			/*
1214 			 * Initialize the group.
1215 			 */
1216 			tdg->tdg_idlemask = 0;
1217 			tdg->tdg_load = 0;
1218 			tdg->tdg_transferable = 0;
1219 			tdg->tdg_cpus = cg->cg_count;
1220 			tdg->tdg_cpumask = cg->cg_mask;
1221 			LIST_INIT(&tdg->tdg_members);
1222 			/*
1223 			 * Find all of the group members and add them.
1224 			 */
1225 			for (j = 0; j < MAXCPU; j++) {
1226 				if ((cg->cg_mask & (1 << j)) != 0) {
1227 					if (tdg->tdg_mask == 0)
1228 						tdg->tdg_mask = 1 << j;
1229 					tdq_cpu[j].tdq_transferable = 0;
1230 					tdq_cpu[j].tdq_group = tdg;
1231 					LIST_INSERT_HEAD(&tdg->tdg_members,
1232 					    &tdq_cpu[j], tdq_siblings);
1233 				}
1234 			}
1235 			if (tdg->tdg_cpus > 1)
1236 				balance_groups = 1;
1237 		}
1238 		tdg_maxid = smp_topology->ct_count - 1;
1239 	}
1240 	/*
1241 	 * Initialize long-term cpu balancing algorithm.
1242 	 */
1243 	callout_init(&balco, CALLOUT_MPSAFE);
1244 	callout_init(&gbalco, CALLOUT_MPSAFE);
1245 	sched_balance(NULL);
1246 	if (balance_groups)
1247 		sched_balance_groups(NULL);
1248 
1249 #else
1250 	tdq_setup(TDQ_SELF());
1251 #endif
1252 	/*
1253 	 * To avoid divide-by-zero, we set realstathz to a dummy value in case
1254 	 * sched_clock() is called before sched_initticks().
1255 	 */
1256 	realstathz = hz;
1257 	sched_slice = (realstathz/10);	/* ~100ms */
1258 	tickincr = 1 << SCHED_TICK_SHIFT;
1259 
1260 	/* Add thread0's load since it's running. */
1261 	tdq = TDQ_SELF();
1262 	TDQ_LOCK(tdq);
1263 	tdq_load_add(tdq, &td_sched0);
1264 	TDQ_UNLOCK(tdq);
1265 }
1266 
1267 /*
1268  * This routine determines the tickincr after stathz and hz are setup.
1269  */
1270 /* ARGSUSED */
1271 static void
1272 sched_initticks(void *dummy)
1273 {
1274 	int incr;
1275 
1276 	realstathz = stathz ? stathz : hz;
1277 	sched_slice = (realstathz/10);	/* ~100ms */
1278 
1279 	/*
1280 	 * tickincr is shifted out by 10 to avoid rounding errors due to
1281 	 * hz not being evenly divisible by stathz on all platforms.
1282 	 */
1283 	incr = (hz << SCHED_TICK_SHIFT) / realstathz;
1284 	/*
1285 	 * This does not work for values of stathz that are more than
1286 	 * 1 << SCHED_TICK_SHIFT * hz.  In practice this does not happen.
1287 	 */
1288 	if (incr == 0)
1289 		incr = 1;
1290 	tickincr = incr;
1291 #ifdef SMP
1292 	affinity = SCHED_AFFINITY_DEFAULT;
1293 #endif
1294 }
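
/*
 * Editorial example, assuming hz = 1000 and stathz = 128: sched_slice
 * becomes 12 stathz ticks (about 94ms, near the ~100ms target) and
 * tickincr = (1000 << 10) / 128 = 8000, so each stathz tick accounts for
 * 8000 / 1024 ~= 7.8 hz ticks worth of scaled cpu history.
 */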
1295 
1296 
1297 /*
1298  * This is the core of the interactivity algorithm.  Determines a score based
1299  * on past behavior.  It is the ratio of sleep time to run time scaled to
1300  * a [0, 100] integer.  This is the voluntary sleep time of a process, which
1301  * differs from the cpu usage because it does not account for time spent
1302  * waiting on a run-queue.  Would be prettier if we had floating point.
1303  */
1304 static int
1305 sched_interact_score(struct thread *td)
1306 {
1307 	struct td_sched *ts;
1308 	int div;
1309 
1310 	ts = td->td_sched;
1311 	/*
1312 	 * The score is only needed if this is likely to be an interactive
1313 	 * task.  Don't go through the expense of computing it if there's
1314 	 * no chance.
1315 	 */
1316 	if (sched_interact <= SCHED_INTERACT_HALF &&
1317 		ts->ts_runtime >= ts->ts_slptime)
1318 			return (SCHED_INTERACT_HALF);
1319 
1320 	if (ts->ts_runtime > ts->ts_slptime) {
1321 		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
1322 		return (SCHED_INTERACT_HALF +
1323 		    (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
1324 	}
1325 	if (ts->ts_slptime > ts->ts_runtime) {
1326 		div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
1327 		return (ts->ts_runtime / div);
1328 	}
1329 	/* runtime == slptime */
1330 	if (ts->ts_runtime)
1331 		return (SCHED_INTERACT_HALF);
1332 
1333 	/*
1334 	 * This can happen if slptime and runtime are 0.
1335 	 */
1336 	return (0);
1337 
1338 }
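
/*
 * Editorial example of the score above: a thread with 3000 scaled ticks
 * of voluntary sleep and 1000 of run time takes the slptime > runtime
 * branch, div = max(1, 3000 / 50) = 60 and the score is 1000 / 60 = 16,
 * under the default threshold of 30 and therefore interactive.  With the
 * ratio reversed the first branch gives 50 + (50 - 1000 / 60) = 84,
 * solidly batch.
 */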
1339 
1340 /*
1341  * Scale the scheduling priority according to the "interactivity" of this
1342  * process.
1343  */
1344 static void
1345 sched_priority(struct thread *td)
1346 {
1347 	int score;
1348 	int pri;
1349 
1350 	if (td->td_pri_class != PRI_TIMESHARE)
1351 		return;
1352 	/*
1353 	 * If the score is interactive we place the thread in the realtime
1354 	 * queue with a priority that is less than kernel and interrupt
1355 	 * priorities.  These threads are not subject to nice restrictions.
1356 	 *
1357 	 * Scores greater than this are placed on the normal timeshare queue
1358 	 * where the priority is partially decided by the most recent cpu
1359 	 * utilization and the rest is decided by nice value.
1360 	 */
1361 	score = sched_interact_score(td);
1362 	if (score < sched_interact) {
1363 		pri = PRI_MIN_REALTIME;
1364 		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
1365 		    * score;
1366 		KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
1367 		    ("sched_priority: invalid interactive priority %d score %d",
1368 		    pri, score));
1369 	} else {
1370 		pri = SCHED_PRI_MIN;
1371 		if (td->td_sched->ts_ticks)
1372 			pri += SCHED_PRI_TICKS(td->td_sched);
1373 		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
1374 		KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
1375 		    ("sched_priority: invalid priority %d: nice %d, "
1376 		    "ticks %d ftick %d ltick %d tick pri %d",
1377 		    pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
1378 		    td->td_sched->ts_ftick, td->td_sched->ts_ltick,
1379 		    SCHED_PRI_TICKS(td->td_sched)));
1380 	}
1381 	sched_user_prio(td, pri);
1382 
1383 	return;
1384 }
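
/*
 * Editorial example: with the default sched_interact of 30, a score of 15
 * yields pri = PRI_MIN_REALTIME + ((PRI_MAX_REALTIME - PRI_MIN_REALTIME)
 * / 30) * 15, roughly the middle of the realtime range.  A non-interactive
 * thread instead starts at SCHED_PRI_MIN and is pushed down by up to
 * SCHED_PRI_RANGE for recent cpu use plus its nice value.
 */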
1385 
1386 /*
1387  * This routine enforces a maximum limit on the amount of scheduling history
1388  * kept.  It is called after either the slptime or runtime is adjusted.  This
1389  * function is ugly due to integer math.
1390  */
1391 static void
1392 sched_interact_update(struct thread *td)
1393 {
1394 	struct td_sched *ts;
1395 	u_int sum;
1396 
1397 	ts = td->td_sched;
1398 	sum = ts->ts_runtime + ts->ts_slptime;
1399 	if (sum < SCHED_SLP_RUN_MAX)
1400 		return;
1401 	/*
1402 	 * This only happens from two places:
1403 	 * 1) We have added an unusual amount of run time from fork_exit.
1404 	 * 2) We have added an unusual amount of sleep time from sched_sleep().
1405 	 */
1406 	if (sum > SCHED_SLP_RUN_MAX * 2) {
1407 		if (ts->ts_runtime > ts->ts_slptime) {
1408 			ts->ts_runtime = SCHED_SLP_RUN_MAX;
1409 			ts->ts_slptime = 1;
1410 		} else {
1411 			ts->ts_slptime = SCHED_SLP_RUN_MAX;
1412 			ts->ts_runtime = 1;
1413 		}
1414 		return;
1415 	}
1416 	/*
1417 	 * If we have exceeded the cap by more than 1/5th then the 4/5 scaling
1418 	 * below will not bring us back into range.  Dividing by two here
1419 	 * forces the sum back to at most SCHED_SLP_RUN_MAX.
1420 	 */
1421 	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1422 		ts->ts_runtime /= 2;
1423 		ts->ts_slptime /= 2;
1424 		return;
1425 	}
1426 	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
1427 	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
1428 }
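
/*
 * Editorial example of the clamping above: a sum just past the cap (say
 * 1.1 * SCHED_SLP_RUN_MAX) is scaled by 4/5 to about 0.88 of the cap; a
 * sum between 1.2 and 2 times the cap is simply halved; and a sum more
 * than twice the cap, e.g. after fork_exit or a very long sleep, resets
 * the dominant term to the cap and the other to 1, so no single burst
 * dominates the history for long.
 */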
1429 
1430 /*
1431  * Scale back the interactivity history when a child thread is created.  The
1432  * history is inherited from the parent but the thread may behave totally
1433  * differently.  For example, a shell spawning a compiler process.  We want
1434  * to learn that the compiler is behaving badly very quickly.
1435  */
1436 static void
1437 sched_interact_fork(struct thread *td)
1438 {
1439 	int ratio;
1440 	int sum;
1441 
1442 	sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
1443 	if (sum > SCHED_SLP_RUN_FORK) {
1444 		ratio = sum / SCHED_SLP_RUN_FORK;
1445 		td->td_sched->ts_runtime /= ratio;
1446 		td->td_sched->ts_slptime /= ratio;
1447 	}
1448 }
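
/*
 * Editorial example: a child inheriting 1.5 seconds worth of scaled
 * history (three times SCHED_SLP_RUN_FORK at hz = 1000) gets ratio = 3,
 * so it keeps only a third of the inherited runtime and slptime and
 * quickly earns a score of its own.
 */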
1449 
1450 /*
1451  * Called from proc0_init() to setup the scheduler fields.
1452  */
1453 void
1454 schedinit(void)
1455 {
1456 
1457 	/*
1458 	 * Set up the scheduler specific parts of proc0.
1459 	 */
1460 	proc0.p_sched = NULL; /* XXX */
1461 	thread0.td_sched = &td_sched0;
1462 	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
1463 	td_sched0.ts_ltick = ticks;
1464 	td_sched0.ts_ftick = ticks;
1465 	td_sched0.ts_thread = &thread0;
1466 }
1467 
1468 /*
1469  * This is only somewhat accurate since, given many processes of the same
1470  * priority, they will switch when their slices run out, which will be
1471  * at most sched_slice stathz ticks.
1472  */
1473 int
1474 sched_rr_interval(void)
1475 {
1476 
1477 	/* Convert sched_slice to hz */
1478 	return (hz/(realstathz/sched_slice));
1479 }
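
/*
 * Editorial example, assuming hz = 1000, realstathz = 128 and thus
 * sched_slice = 12: the interval reported is 1000 / (128 / 12) =
 * 1000 / 10 = 100 hz ticks, i.e. roughly 100ms per round-robin slice.
 */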
1480 
1481 /*
1482  * Update the percent cpu tracking information when it is requested or
1483  * the total history exceeds the maximum.  We keep a sliding history of
1484  * tick counts that slowly decays.  This is less precise than the 4BSD
1485  * mechanism since it happens with less regular and frequent events.
1486  */
1487 static void
1488 sched_pctcpu_update(struct td_sched *ts)
1489 {
1490 
1491 	if (ts->ts_ticks == 0)
1492 		return;
1493 	if (ticks - (hz / 10) < ts->ts_ltick &&
1494 	    SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
1495 		return;
1496 	/*
1497 	 * Adjust counters and watermark for pctcpu calc.
1498 	 */
1499 	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
1500 		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
1501 			    SCHED_TICK_TARG;
1502 	else
1503 		ts->ts_ticks = 0;
1504 	ts->ts_ltick = ticks;
1505 	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
1506 }
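
/*
 * Editorial example of the decay above, assuming hz = 1000 (a 10000 tick
 * target window): if 12000 ticks have passed since ts_ftick and the
 * thread was on cpu for half of them, ts_ticks is rescaled to express
 * that same 50% rate over a 10000 tick window and ts_ftick slides forward
 * so the window ends at the current tick.  A thread whose last run is
 * older than the whole window has its tick count zeroed instead.
 */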
1507 
1508 /*
1509  * Adjust the priority of a thread.  Move it to the appropriate run-queue
1510  * if necessary.  This is the back-end for several priority related
1511  * functions.
1512  */
1513 static void
1514 sched_thread_priority(struct thread *td, u_char prio)
1515 {
1516 	struct td_sched *ts;
1517 
1518 	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
1519 	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
1520 	    curthread->td_proc->p_comm);
1521 	ts = td->td_sched;
1522 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1523 	if (td->td_priority == prio)
1524 		return;
1525 
1526 	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
1527 		/*
1528 		 * If the priority has been elevated due to priority
1529 		 * propagation, we may have to move ourselves to a new
1530 		 * queue.  This could be optimized to not re-add in some
1531 		 * cases.
1532 		 */
1533 		sched_rem(td);
1534 		td->td_priority = prio;
1535 		sched_add(td, SRQ_BORROWING);
1536 	} else {
1537 #ifdef SMP
1538 		struct tdq *tdq;
1539 
1540 		tdq = TDQ_CPU(ts->ts_cpu);
1541 		if (prio < tdq->tdq_lowpri)
1542 			tdq->tdq_lowpri = prio;
1543 #endif
1544 		td->td_priority = prio;
1545 	}
1546 }
1547 
1548 /*
1549  * Update a thread's priority when it is lent another thread's
1550  * priority.
1551  */
1552 void
1553 sched_lend_prio(struct thread *td, u_char prio)
1554 {
1555 
1556 	td->td_flags |= TDF_BORROWING;
1557 	sched_thread_priority(td, prio);
1558 }
1559 
1560 /*
1561  * Restore a thread's priority when priority propagation is
1562  * over.  The prio argument is the minimum priority the thread
1563  * needs to have to satisfy other possible priority lending
1564  * requests.  If the thread's regular priority is less
1565  * important than prio, the thread will keep a priority boost
1566  * of prio.
1567  */
1568 void
1569 sched_unlend_prio(struct thread *td, u_char prio)
1570 {
1571 	u_char base_pri;
1572 
1573 	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1574 	    td->td_base_pri <= PRI_MAX_TIMESHARE)
1575 		base_pri = td->td_user_pri;
1576 	else
1577 		base_pri = td->td_base_pri;
1578 	if (prio >= base_pri) {
1579 		td->td_flags &= ~TDF_BORROWING;
1580 		sched_thread_priority(td, base_pri);
1581 	} else
1582 		sched_lend_prio(td, prio);
1583 }
1584 
1585 /*
1586  * Standard entry for setting the priority to an absolute value.
1587  */
1588 void
1589 sched_prio(struct thread *td, u_char prio)
1590 {
1591 	u_char oldprio;
1592 
1593 	/* First, update the base priority. */
1594 	td->td_base_pri = prio;
1595 
1596 	/*
1597 	 * If the thread is borrowing another thread's priority, don't
1598 	 * ever lower the priority.
1599 	 */
1600 	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1601 		return;
1602 
1603 	/* Change the real priority. */
1604 	oldprio = td->td_priority;
1605 	sched_thread_priority(td, prio);
1606 
1607 	/*
1608 	 * If the thread is on a turnstile, then let the turnstile update
1609 	 * its state.
1610 	 */
1611 	if (TD_ON_LOCK(td) && oldprio != prio)
1612 		turnstile_adjust(td, oldprio);
1613 }
1614 
1615 /*
1616  * Set the base user priority; this does not affect the current running priority.
1617  */
1618 void
1619 sched_user_prio(struct thread *td, u_char prio)
1620 {
1621 	u_char oldprio;
1622 
1623 	td->td_base_user_pri = prio;
1624 	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
1625                 return;
1626 	oldprio = td->td_user_pri;
1627 	td->td_user_pri = prio;
1628 
1629 	if (TD_ON_UPILOCK(td) && oldprio != prio)
1630 		umtx_pi_adjust(td, oldprio);
1631 }
1632 
1633 void
1634 sched_lend_user_prio(struct thread *td, u_char prio)
1635 {
1636 	u_char oldprio;
1637 
1638 	td->td_flags |= TDF_UBORROWING;
1639 
1640 	oldprio = td->td_user_pri;
1641 	td->td_user_pri = prio;
1642 
1643 	if (TD_ON_UPILOCK(td) && oldprio != prio)
1644 		umtx_pi_adjust(td, oldprio);
1645 }
1646 
1647 void
1648 sched_unlend_user_prio(struct thread *td, u_char prio)
1649 {
1650 	u_char base_pri;
1651 
1652 	base_pri = td->td_base_user_pri;
1653 	if (prio >= base_pri) {
1654 		td->td_flags &= ~TDF_UBORROWING;
1655 		sched_user_prio(td, base_pri);
1656 	} else
1657 		sched_lend_user_prio(td, prio);
1658 }
1659 
1660 /*
1661  * Add the thread passed as 'newtd' to the run queue before selecting
1662  * the next thread to run.  This is only used for KSE.
1663  */
1664 static void
1665 sched_switchin(struct tdq *tdq, struct thread *td)
1666 {
1667 #ifdef SMP
1668 	spinlock_enter();
1669 	TDQ_UNLOCK(tdq);
1670 	thread_lock(td);
1671 	spinlock_exit();
1672 	sched_setcpu(td->td_sched, TDQ_ID(tdq), SRQ_YIELDING);
1673 #else
1674 	td->td_lock = TDQ_LOCKPTR(tdq);
1675 #endif
1676 	tdq_add(tdq, td, SRQ_YIELDING);
1677 	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1678 }
1679 
1680 /*
1681  * Block a thread for switching.  Similar to thread_block() but does not
1682  * bump the spin count.
1683  */
1684 static inline struct mtx *
1685 thread_block_switch(struct thread *td)
1686 {
1687 	struct mtx *lock;
1688 
1689 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1690 	lock = td->td_lock;
1691 	td->td_lock = &blocked_lock;
1692 	mtx_unlock_spin(lock);
1693 
1694 	return (lock);
1695 }
1696 
1697 /*
1698  * Release a thread that was blocked with thread_block_switch().
1699  */
1700 static inline void
1701 thread_unblock_switch(struct thread *td, struct mtx *mtx)
1702 {
1703 	atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
1704 	    (uintptr_t)mtx);
1705 }
1706 
1707 /*
1708  * Switch threads.  This function has to handle threads coming in while
1709  * blocked for some reason, running, or idle.  It also must deal with
1710  * migrating a thread from one queue to another as running threads may
1711  * be assigned elsewhere via binding.
1712  */
1713 void
1714 sched_switch(struct thread *td, struct thread *newtd, int flags)
1715 {
1716 	struct tdq *tdq;
1717 	struct td_sched *ts;
1718 	struct mtx *mtx;
1719 	int cpuid;
1720 
1721 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1722 
1723 	cpuid = PCPU_GET(cpuid);
1724 	tdq = TDQ_CPU(cpuid);
1725 	ts = td->td_sched;
1726 	mtx = TDQ_LOCKPTR(tdq);
1727 #ifdef SMP
1728 	ts->ts_rltick = ticks;
1729 	if (newtd && newtd->td_priority < tdq->tdq_lowpri)
1730 		tdq->tdq_lowpri = newtd->td_priority;
1731 #endif
1732 	td->td_lastcpu = td->td_oncpu;
1733 	td->td_oncpu = NOCPU;
1734 	td->td_flags &= ~TDF_NEEDRESCHED;
1735 	td->td_owepreempt = 0;
1736 	/*
1737 	 * The lock pointer in an idle thread should never change.  Reset the
1738 	 * thread state to CAN_RUN as well.
1739 	 */
1740 	if (TD_IS_IDLETHREAD(td)) {
1741 		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1742 		TD_SET_CAN_RUN(td);
1743 	} else if (TD_IS_RUNNING(td)) {
1744 		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1745 		/* Remove our load so the selection algorithm is not biased. */
1746 		tdq_load_rem(tdq, ts);
1747 		sched_add(td, (flags & SW_PREEMPT) ?
1748 		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1749 		    SRQ_OURSELF|SRQ_YIELDING);
1750 		/*
1751 		 * When migrating we return from sched_add with an extra
1752 		 * spinlock nesting, the tdq locked, and a blocked thread.
1753 		 * This is to optimize out an extra block/unblock cycle here.
1754 		 */
1755 		if (ts->ts_cpu != cpuid) {
1756 			mtx = TDQ_LOCKPTR(TDQ_CPU(ts->ts_cpu));
1757 			mtx_unlock_spin(mtx);
1758 			TDQ_LOCK(tdq);
1759 			spinlock_exit();
1760 		}
1761 	} else {
1762 		/* This thread must be going to sleep. */
1763 		TDQ_LOCK(tdq);
1764 		mtx = thread_block_switch(td);
1765 		tdq_load_rem(tdq, ts);
1766 	}
1767 	/*
1768 	 * We enter here with the thread blocked and assigned to the
1769 	 * appropriate cpu run-queue or sleep-queue and with the current
1770 	 * thread-queue locked.
1771 	 */
1772 	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
1773 	/*
1774 	 * If KSE assigned a new thread just add it here and let choosethread
1775 	 * select the best one.
1776 	 */
1777 	if (newtd != NULL)
1778 		sched_switchin(tdq, newtd);
1779 	newtd = choosethread();
1780 	/*
1781 	 * Call the MD code to switch contexts if necessary.
1782 	 */
1783 	if (td != newtd) {
1784 #ifdef	HWPMC_HOOKS
1785 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1786 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1787 #endif
1788 		cpu_switch(td, newtd, mtx);
1789 		/*
1790 		 * We may return from cpu_switch on a different cpu.  However,
1791 		 * we always return with td_lock pointing to the current cpu's
1792 		 * run queue lock.
1793 		 */
1794 		cpuid = PCPU_GET(cpuid);
1795 		tdq = TDQ_CPU(cpuid);
1796 		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)td;
1797 #ifdef	HWPMC_HOOKS
1798 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1799 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1800 #endif
1801 	} else
1802 		thread_unblock_switch(td, mtx);
1803 	/*
1804 	 * Assert that all went well and return.
1805 	 */
1806 #ifdef SMP
1807 	/* We should always get here with the lowest priority td possible */
1808 	tdq->tdq_lowpri = td->td_priority;
1809 #endif
1810 	TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
1811 	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1812 	td->td_oncpu = cpuid;
1813 }
1814 
1815 /*
1816  * Adjust thread priorities as a result of a nice request.
1817  */
1818 void
1819 sched_nice(struct proc *p, int nice)
1820 {
1821 	struct thread *td;
1822 
1823 	PROC_LOCK_ASSERT(p, MA_OWNED);
1824 	PROC_SLOCK_ASSERT(p, MA_OWNED);
1825 
1826 	p->p_nice = nice;
1827 	FOREACH_THREAD_IN_PROC(p, td) {
1828 		thread_lock(td);
1829 		sched_priority(td);
1830 		sched_prio(td, td->td_base_user_pri);
1831 		thread_unlock(td);
1832 	}
1833 }
1834 
1835 /*
1836  * Record the sleep time for the interactivity scorer.
1837  */
1838 void
1839 sched_sleep(struct thread *td)
1840 {
1841 
1842 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1843 
1844 	td->td_sched->ts_slptick = ticks;
1845 }
1846 
1847 /*
1848  * Schedule a thread to resume execution and record how long it voluntarily
1849  * slept.  We also update the pctcpu, interactivity, and priority.
1850  */
1851 void
1852 sched_wakeup(struct thread *td)
1853 {
1854 	struct td_sched *ts;
1855 	int slptick;
1856 
1857 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1858 	ts = td->td_sched;
1859 	/*
1860 	 * If we slept for more than a tick update our interactivity and
1861 	 * priority.
1862 	 */
1863 	slptick = ts->ts_slptick;
1864 	ts->ts_slptick = 0;
1865 	if (slptick && slptick != ticks) {
1866 		u_int hzticks;
1867 
1868 		hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
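		/*
		 * e.g. with SCHED_TICK_SHIFT == 10 (assumed; defined earlier
		 * in this file), sleeping for 500 hz ticks credits 500 << 10
		 * to ts_slptime, pushing the interactivity score toward the
		 * interactive end before the priority is recomputed below.
		 */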
1869 		ts->ts_slptime += hzticks;
1870 		sched_interact_update(td);
1871 		sched_pctcpu_update(ts);
1872 		sched_priority(td);
1873 	}
1874 	/* Reset the slice value after we sleep. */
1875 	ts->ts_slice = sched_slice;
1876 	sched_add(td, SRQ_BORING);
1877 }
1878 
1879 /*
1880  * Penalize the parent for creating a new child and initialize the child's
1881  * priority.
1882  */
1883 void
1884 sched_fork(struct thread *td, struct thread *child)
1885 {
1886 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1887 	sched_fork_thread(td, child);
1888 	/*
1889 	 * Penalize the parent and child for forking.
1890 	 */
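	/*
	 * Charging the parent a tick of runtime below, with no matching
	 * sleep time, nudges its interactivity score toward cpu-bound, so
	 * fork-heavy processes gradually lose interactive standing.
	 */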
1891 	sched_interact_fork(child);
1892 	sched_priority(child);
1893 	td->td_sched->ts_runtime += tickincr;
1894 	sched_interact_update(td);
1895 	sched_priority(td);
1896 }
1897 
1898 /*
1899  * Fork a new thread, which may be within the same process.
1900  */
1901 void
1902 sched_fork_thread(struct thread *td, struct thread *child)
1903 {
1904 	struct td_sched *ts;
1905 	struct td_sched *ts2;
1906 
1907 	/*
1908 	 * Initialize child.
1909 	 */
1910 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1911 	sched_newthread(child);
1912 	child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
1913 	ts = td->td_sched;
1914 	ts2 = child->td_sched;
1915 	ts2->ts_cpu = ts->ts_cpu;
1916 	ts2->ts_runq = NULL;
1917 	/*
1918 	 * Grab our parent's cpu estimation information and priority.
1919 	 */
1920 	ts2->ts_ticks = ts->ts_ticks;
1921 	ts2->ts_ltick = ts->ts_ltick;
1922 	ts2->ts_ftick = ts->ts_ftick;
1923 	child->td_user_pri = td->td_user_pri;
1924 	child->td_base_user_pri = td->td_base_user_pri;
1925 	/*
1926 	 * And update interactivity score.
1927 	 */
1928 	ts2->ts_slptime = ts->ts_slptime;
1929 	ts2->ts_runtime = ts->ts_runtime;
1930 	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
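	/*
	 * A single-tick slice forces the child through the slice-expiry path
	 * in sched_clock() almost immediately, so its priority is soon
	 * recomputed from its own behaviour rather than the values inherited
	 * above.
	 */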
1931 }
1932 
1933 /*
1934  * Adjust the priority class of a thread.
1935  */
1936 void
1937 sched_class(struct thread *td, int class)
1938 {
1939 
1940 	THREAD_LOCK_ASSERT(td, MA_OWNED);
1941 	if (td->td_pri_class == class)
1942 		return;
1943 
1944 #ifdef SMP
1945 	/*
1946 	 * On SMP, if the thread is on a run queue we must adjust the
1947 	 * transferable counts, because changing to or from an interrupt
1948 	 * class can change whether the thread may migrate.
1949 	 */
1950 	if (TD_ON_RUNQ(td)) {
1951 		struct tdq *tdq;
1952 
1953 		tdq = TDQ_CPU(td->td_sched->ts_cpu);
1954 		if (THREAD_CAN_MIGRATE(td)) {
1955 			tdq->tdq_transferable--;
1956 			tdq->tdq_group->tdg_transferable--;
1957 		}
1958 		td->td_pri_class = class;
1959 		if (THREAD_CAN_MIGRATE(td)) {
1960 			tdq->tdq_transferable++;
1961 			tdq->tdq_group->tdg_transferable++;
1962 		}
1963 	}
1964 #endif
1965 	td->td_pri_class = class;
1966 }
1967 
1968 /*
1969  * Return some of the child's priority and interactivity to the parent.
1970  */
1971 void
1972 sched_exit(struct proc *p, struct thread *child)
1973 {
1974 	struct thread *td;
1975 
1976 	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
1977 	    child, child->td_proc->p_comm, child->td_priority);
1978 
1979 	PROC_SLOCK_ASSERT(p, MA_OWNED);
1980 	td = FIRST_THREAD_IN_PROC(p);
1981 	sched_exit_thread(td, child);
1982 }
1983 
1984 /*
1985  * Penalize another thread for the time spent on this one.  This helps to
1986  * worsen the priority and interactivity of processes which schedule batch
1987  * jobs such as make.  This has little effect on the make process itself but
1988  * causes new processes spawned by it to receive worse scores immediately.
1989  */
1990 void
1991 sched_exit_thread(struct thread *td, struct thread *child)
1992 {
1993 
1994 	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
1995 	    child, child->td_proc->p_comm, child->td_priority);
1996 
1997 #ifdef KSE
1998 	/*
1999 	 * KSE forks and exits so often that this penalty causes short-lived
2000 	 * threads to always be non-interactive.  This causes mozilla to
2001 	 * crawl under load.
2002 	 */
2003 	if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc)
2004 		return;
2005 #endif
2006 	/*
2007 	 * Give the child's runtime to the parent without returning the
2008 	 * sleep time as a penalty to the parent.  This causes shells that
2009 	 * launch expensive things to mark their children as expensive.
2010 	 */
2011 	thread_lock(td);
2012 	td->td_sched->ts_runtime += child->td_sched->ts_runtime;
2013 	sched_interact_update(td);
2014 	sched_priority(td);
2015 	thread_unlock(td);
2016 }
2017 
2018 /*
2019  * Fix priorities on return to user-space.  Priorities may be elevated due
2020  * to static priorities in msleep() or similar.
2021  */
2022 void
2023 sched_userret(struct thread *td)
2024 {
2025 	/*
2026 	 * XXX we cheat slightly on the locking here to avoid locking in
2027 	 * the usual case.  Setting td_priority here is essentially an
2028 	 * incomplete workaround for not setting it properly elsewhere.
2029 	 * Now that some interrupt handlers are threads, not setting it
2030 	 * properly elsewhere can clobber it in the window between setting
2031 	 * it here and returning to user mode, so don't waste time setting
2032 	 * it perfectly here.
2033 	 */
2034 	KASSERT((td->td_flags & TDF_BORROWING) == 0,
2035 	    ("thread with borrowed priority returning to userland"));
2036 	if (td->td_priority != td->td_user_pri) {
2037 		thread_lock(td);
2038 		td->td_priority = td->td_user_pri;
2039 		td->td_base_pri = td->td_user_pri;
2040 		thread_unlock(td);
2041 	}
2042 }
2043 
2044 /*
2045  * Handle a stathz tick.  This is really only relevant for timeshare
2046  * threads.
2047  */
2048 void
2049 sched_clock(struct thread *td)
2050 {
2051 	struct tdq *tdq;
2052 	struct td_sched *ts;
2053 
2054 	THREAD_LOCK_ASSERT(td, MA_OWNED);
2055 	tdq = TDQ_SELF();
2056 	/*
2057 	 * Advance the insert index once for each tick to ensure that all
2058 	 * threads get a chance to run.
2059 	 */
2060 	if (tdq->tdq_idx == tdq->tdq_ridx) {
2061 		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
2062 		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
2063 			tdq->tdq_ridx = tdq->tdq_idx;
2064 	}
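	/*
	 * e.g. with RQ_NQS == 64 (assumed from runq.h) the insert index
	 * walks the timeshare queues one slot per stathz tick, while
	 * tdq_ridx only follows once its current queue drains; this bounds
	 * how long a queued timeshare thread can be passed over.
	 */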
2065 	ts = td->td_sched;
2066 	/*
2067 	 * We only run the slice accounting for TIMESHARE threads.
2068 	 */
2069 	if (td->td_pri_class != PRI_TIMESHARE)
2070 		return;
2071 	/*
2072 	 * We used a tick; charge it to the thread so that we can compute our
2073 	 * interactivity.
2074 	 */
2075 	td->td_sched->ts_runtime += tickincr;
2076 	sched_interact_update(td);
2077 	/*
2078 	 * We used up one time slice.
2079 	 */
2080 	if (--ts->ts_slice > 0)
2081 		return;
2082 	/*
2083 	 * We're out of time, recompute priorities and requeue.
2084 	 */
2085 	sched_priority(td);
2086 	td->td_flags |= TDF_NEEDRESCHED;
2087 }
2088 
2089 /*
2090  * Called once per hz tick.  Used for cpu utilization information.  This
2091  * is easier than trying to scale based on stathz.
2092  */
2093 void
2094 sched_tick(void)
2095 {
2096 	struct td_sched *ts;
2097 
2098 	ts = curthread->td_sched;
2099 	/* Adjust ticks for pctcpu */
2100 	ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
2101 	ts->ts_ltick = ticks;
2102 	/*
2103 	 * Update if we've exceeded our desired tick threshold by over one
2104 	 * second.
2105 	 */
2106 	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
2107 		sched_pctcpu_update(ts);
2108 }
2109 
2110 /*
2111  * Return whether the current CPU has runnable tasks.  Used for in-kernel
2112  * cooperative idle threads.
2113  */
2114 int
2115 sched_runnable(void)
2116 {
2117 	struct tdq *tdq;
2118 	int load;
2119 
2120 	load = 1;
2121 
2122 	tdq = TDQ_SELF();
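	/*
	 * A non-idle curthread is itself counted in tdq_load, so discount it
	 * below; the idle thread adds no load, so the raw count can be used.
	 */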
2123 	if ((curthread->td_flags & TDF_IDLETD) != 0) {
2124 		if (tdq->tdq_load > 0)
2125 			goto out;
2126 	} else
2127 		if (tdq->tdq_load - 1 > 0)
2128 			goto out;
2129 	load = 0;
2130 out:
2131 	return (load);
2132 }
2133 
2134 /*
2135  * Choose the highest priority thread to run.  The thread is removed from
2136  * the run-queue while running; however, the load remains.  For SMP we mark
2137  * the tdq idle in the group and global bitmasks if it idles here.
2138  */
2139 struct thread *
2140 sched_choose(void)
2141 {
2142 #ifdef SMP
2143 	struct tdq_group *tdg;
2144 #endif
2145 	struct td_sched *ts;
2146 	struct tdq *tdq;
2147 
2148 	tdq = TDQ_SELF();
2149 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2150 	ts = tdq_choose(tdq);
2151 	if (ts) {
2152 		tdq_runq_rem(tdq, ts);
2153 		return (ts->ts_thread);
2154 	}
2155 #ifdef SMP
2156 	/*
2157 	 * We only set the idled bit when all of the cpus in the group are
2158 	 * idle.  Otherwise we could get into a situation where a thread bounces
2159 	 * back and forth between two idle cores on separate physical CPUs.
2160 	 */
2161 	tdg = tdq->tdq_group;
2162 	tdg->tdg_idlemask |= PCPU_GET(cpumask);
2163 	if (tdg->tdg_idlemask == tdg->tdg_cpumask)
2164 		atomic_set_int(&tdq_idle, tdg->tdg_mask);
2165 	tdq->tdq_lowpri = PRI_MAX_IDLE;
2166 #endif
2167 	return (PCPU_GET(idlethread));
2168 }
2169 
2170 /*
2171  * Set owepreempt if necessary.  Preemption never happens directly in ULE;
2172  * it is always requested here and taken when the critical section exits.
2173  */
2174 static inline void
2175 sched_setpreempt(struct thread *td)
2176 {
2177 	struct thread *ctd;
2178 	int cpri;
2179 	int pri;
2180 
2181 	ctd = curthread;
2182 	pri = td->td_priority;
2183 	cpri = ctd->td_priority;
2184 	if (pri < cpri)
2185 		ctd->td_flags |= TDF_NEEDRESCHED;
2186 	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
2187 		return;
2188 	/*
2189 	 * Always preempt IDLE-class threads.  Otherwise preempt only when
2190 	 * the incoming thread's priority is at or below preempt_thresh,
2191 	 * which with the default threshold essentially means ithreads.
2192 	 */
2192 	if (pri > preempt_thresh && cpri < PRI_MIN_IDLE)
2193 		return;
2194 	ctd->td_owepreempt = 1;
2195 	return;
2196 }
2197 
2198 /*
2199  * Add a thread to a thread queue.  Refreshes the slice if needed, selects
2200  * the runq, and places the thread on it.  This is the internal function called
2201  * when the tdq is predetermined.
2202  */
2203 void
2204 tdq_add(struct tdq *tdq, struct thread *td, int flags)
2205 {
2206 	struct td_sched *ts;
2207 	int class;
2208 #ifdef SMP
2209 	int cpumask;
2210 #endif
2211 
2212 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2213 	KASSERT((td->td_inhibitors == 0),
2214 	    ("sched_add: trying to run inhibited thread"));
2215 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
2216 	    ("sched_add: bad thread state"));
2217 	KASSERT(td->td_proc->p_sflag & PS_INMEM,
2218 	    ("sched_add: process swapped out"));
2219 
2220 	ts = td->td_sched;
2221 	class = PRI_BASE(td->td_pri_class);
2222 	TD_SET_RUNQ(td);
2223 	if (ts->ts_slice == 0)
2224 		ts->ts_slice = sched_slice;
2225 	/*
2226 	 * Pick the run queue based on priority.
2227 	 */
2228 	if (td->td_priority <= PRI_MAX_REALTIME)
2229 		ts->ts_runq = &tdq->tdq_realtime;
2230 	else if (td->td_priority <= PRI_MAX_TIMESHARE)
2231 		ts->ts_runq = &tdq->tdq_timeshare;
2232 	else
2233 		ts->ts_runq = &tdq->tdq_idle;
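	/*
	 * That is: sufficiently high (numerically low) priorities go to the
	 * strict-priority realtime queue, timeshare-range priorities go to
	 * the rotating calendar queue driven by tdq_idx/tdq_ridx in
	 * sched_clock(), and the rest fall through to the idle queue.
	 */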
2234 #ifdef SMP
2235 	cpumask = 1 << ts->ts_cpu;
2236 	/*
2237 	 * If we had been idle, clear our bit in the group and potentially
2238 	 * the global bitmap.
2239 	 */
2240 	if ((class != PRI_IDLE && class != PRI_ITHD) &&
2241 	    (tdq->tdq_group->tdg_idlemask & cpumask) != 0) {
2242 		/*
2243 		 * Check to see if our group is unidling, and if so, remove it
2244 		 * from the global idle mask.
2245 		 */
2246 		if (tdq->tdq_group->tdg_idlemask ==
2247 		    tdq->tdq_group->tdg_cpumask)
2248 			atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
2249 		/*
2250 		 * Now remove ourselves from the group specific idle mask.
2251 		 */
2252 		tdq->tdq_group->tdg_idlemask &= ~cpumask;
2253 	}
2254 	if (td->td_priority < tdq->tdq_lowpri)
2255 		tdq->tdq_lowpri = td->td_priority;
2256 #endif
2257 	tdq_runq_add(tdq, ts, flags);
2258 	tdq_load_add(tdq, ts);
2259 }
2260 
2261 /*
2262  * Select the target thread queue and add a thread to it.  Request
2263  * preemption or IPI a remote processor if required.
2264  */
2265 void
2266 sched_add(struct thread *td, int flags)
2267 {
2268 	struct td_sched *ts;
2269 	struct tdq *tdq;
2270 #ifdef SMP
2271 	int cpuid;
2272 	int cpu;
2273 #endif
2274 	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
2275 	    td, td->td_proc->p_comm, td->td_priority, curthread,
2276 	    curthread->td_proc->p_comm);
2277 	THREAD_LOCK_ASSERT(td, MA_OWNED);
2278 	ts = td->td_sched;
2279 	/*
2280 	 * Recalculate the priority before we select the target cpu or
2281 	 * run-queue.
2282 	 */
2283 	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
2284 		sched_priority(td);
2285 #ifdef SMP
2286 	cpuid = PCPU_GET(cpuid);
2287 	/*
2288 	 * Pick the destination cpu and, if it isn't ours, hand the thread
2289 	 * off to the target cpu's queue.
2290 	 */
2291 	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_MIGRATE(td))
2292 		cpu = cpuid;
2293 	else if (!THREAD_CAN_MIGRATE(td))
2294 		cpu = ts->ts_cpu;
2295 	else
2296 		cpu = sched_pickcpu(ts, flags);
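	/*
	 * In short: migratable interrupt-priority threads stay on the
	 * current cpu, threads that cannot migrate keep their assigned cpu,
	 * and everything else is placed by sched_pickcpu().
	 */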
2297 	tdq = sched_setcpu(ts, cpu, flags);
2298 	tdq_add(tdq, td, flags);
2299 	if (cpu != cpuid) {
2300 		tdq_notify(ts);
2301 		return;
2302 	}
2303 #else
2304 	tdq = TDQ_SELF();
2305 	TDQ_LOCK(tdq);
2306 	/*
2307 	 * Now that the thread is moving to the run-queue, set the lock
2308 	 * to the scheduler's lock.
2309 	 */
2310 	thread_lock_set(td, TDQ_LOCKPTR(tdq));
2311 	tdq_add(tdq, td, flags);
2312 #endif
2313 	if (!(flags & SRQ_YIELDING))
2314 		sched_setpreempt(td);
2315 }
2316 
2317 /*
2318  * Remove a thread from a run-queue without running it.  This is used
2319  * when we're stealing a thread from a remote queue.  Otherwise all threads
2320  * exit by calling sched_exit_thread() and sched_throw() themselves.
2321  */
2322 void
2323 sched_rem(struct thread *td)
2324 {
2325 	struct tdq *tdq;
2326 	struct td_sched *ts;
2327 
2328 	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
2329 	    td, td->td_proc->p_comm, td->td_priority, curthread,
2330 	    curthread->td_proc->p_comm);
2331 	ts = td->td_sched;
2332 	tdq = TDQ_CPU(ts->ts_cpu);
2333 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2334 	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2335 	KASSERT(TD_ON_RUNQ(td),
2336 	    ("sched_rem: thread not on run queue"));
2337 	tdq_runq_rem(tdq, ts);
2338 	tdq_load_rem(tdq, ts);
2339 	TD_SET_CAN_RUN(td);
2340 }
2341 
2342 /*
2343  * Fetch cpu utilization information.  Updates on demand.
2344  */
2345 fixpt_t
2346 sched_pctcpu(struct thread *td)
2347 {
2348 	fixpt_t pctcpu;
2349 	struct td_sched *ts;
2350 
2351 	pctcpu = 0;
2352 	ts = td->td_sched;
2353 	if (ts == NULL)
2354 		return (0);
2355 
2356 	thread_lock(td);
2357 	if (ts->ts_ticks) {
2358 		int rtick;
2359 
2360 		sched_pctcpu_update(ts);
2361 		/* How many rticks per second? */
2362 		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
2363 		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
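		/*
		 * e.g. a thread averaging hz/4 run ticks per second of wall
		 * time yields rtick ~= hz/4 and a pctcpu of roughly FSCALE/4,
		 * which ps(1) reports as about 25%.
		 */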
2364 	}
2365 	td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick;
2366 	thread_unlock(td);
2367 
2368 	return (pctcpu);
2369 }
2370 
2371 /*
2372  * Bind a thread to a target cpu.
2373  */
2374 void
2375 sched_bind(struct thread *td, int cpu)
2376 {
2377 	struct td_sched *ts;
2378 
2379 	THREAD_LOCK_ASSERT(td, MA_OWNED);
2380 	ts = td->td_sched;
2381 	if (ts->ts_flags & TSF_BOUND)
2382 		sched_unbind(td);
2383 	ts->ts_flags |= TSF_BOUND;
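	/*
	 * Note that sched_pin() and mi_switch() below operate on curthread,
	 * so binding is only meaningful when td is the current thread.
	 */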
2384 #ifdef SMP
2385 	sched_pin();
2386 	if (PCPU_GET(cpuid) == cpu)
2387 		return;
2388 	ts->ts_cpu = cpu;
2389 	/* When we return from mi_switch we'll be on the correct cpu. */
2390 	mi_switch(SW_VOL, NULL);
2391 #endif
2392 }
2393 
2394 /*
2395  * Release a bound thread.
2396  */
2397 void
2398 sched_unbind(struct thread *td)
2399 {
2400 	struct td_sched *ts;
2401 
2402 	THREAD_LOCK_ASSERT(td, MA_OWNED);
2403 	ts = td->td_sched;
2404 	if ((ts->ts_flags & TSF_BOUND) == 0)
2405 		return;
2406 	ts->ts_flags &= ~TSF_BOUND;
2407 #ifdef SMP
2408 	sched_unpin();
2409 #endif
2410 }
2411 
2412 int
2413 sched_is_bound(struct thread *td)
2414 {
2415 	THREAD_LOCK_ASSERT(td, MA_OWNED);
2416 	return (td->td_sched->ts_flags & TSF_BOUND);
2417 }
2418 
2419 /*
2420  * Basic yield call.
2421  */
2422 void
2423 sched_relinquish(struct thread *td)
2424 {
2425 	thread_lock(td);
2426 	if (td->td_pri_class == PRI_TIMESHARE)
2427 		sched_prio(td, PRI_MAX_TIMESHARE);
2428 	SCHED_STAT_INC(switch_relinquish);
2429 	mi_switch(SW_VOL, NULL);
2430 	thread_unlock(td);
2431 }
2432 
2433 /*
2434  * Return the total system load.
2435  */
2436 int
2437 sched_load(void)
2438 {
2439 #ifdef SMP
2440 	int total;
2441 	int i;
2442 
2443 	total = 0;
2444 	for (i = 0; i <= tdg_maxid; i++)
2445 		total += TDQ_GROUP(i)->tdg_load;
2446 	return (total);
2447 #else
2448 	return (TDQ_SELF()->tdq_sysload);
2449 #endif
2450 }
2451 
2452 int
2453 sched_sizeof_proc(void)
2454 {
2455 	return (sizeof(struct proc));
2456 }
2457 
2458 int
2459 sched_sizeof_thread(void)
2460 {
2461 	return (sizeof(struct thread) + sizeof(struct td_sched));
2462 }
2463 
2464 /*
2465  * The actual idle thread loop.
2466  */
2467 void
2468 sched_idletd(void *dummy)
2469 {
2470 	struct thread *td;
2471 	struct tdq *tdq;
2472 
2473 	td = curthread;
2474 	tdq = TDQ_SELF();
2475 	mtx_assert(&Giant, MA_NOTOWNED);
2476 	/* ULE relies on preemption for idle interruption. */
2477 	for (;;) {
2478 #ifdef SMP
2479 		if (tdq_idled(tdq))
2480 			cpu_idle();
2481 #else
2482 		cpu_idle();
2483 #endif
2484 	}
2485 }
2486 
2487 /*
2488  * A CPU is entering the scheduler for the first time or a thread is exiting.
2489  */
2490 void
2491 sched_throw(struct thread *td)
2492 {
2493 	struct tdq *tdq;
2494 
2495 	tdq = TDQ_SELF();
2496 	if (td == NULL) {
2497 		/* Correct spinlock nesting and acquire the correct lock. */
2498 		TDQ_LOCK(tdq);
2499 		spinlock_exit();
2500 	} else {
2501 		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2502 		tdq_load_rem(tdq, td->td_sched);
2503 	}
2504 	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
2505 	PCPU_SET(switchtime, cpu_ticks());
2506 	PCPU_SET(switchticks, ticks);
2507 	cpu_throw(td, choosethread());	/* doesn't return */
2508 }
2509 
2510 /*
2511  * This is called from fork_exit().  Just acquire the correct locks and
2512  * let fork do the rest of the work.
2513  */
2514 void
2515 sched_fork_exit(struct thread *td)
2516 {
2517 	struct td_sched *ts;
2518 	struct tdq *tdq;
2519 	int cpuid;
2520 
2521 	/*
2522 	 * Finish setting up thread glue so that it begins execution in a
2523 	 * non-nested critical section with the scheduler lock held.
2524 	 */
2525 	cpuid = PCPU_GET(cpuid);
2526 	tdq = TDQ_CPU(cpuid);
2527 	ts = td->td_sched;
2528 	if (TD_IS_IDLETHREAD(td))
2529 		td->td_lock = TDQ_LOCKPTR(tdq);
2530 	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2531 	td->td_oncpu = cpuid;
2532 	TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)td;
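	/*
	 * The run-queue spinlock was left held for us by the thread that
	 * switched us in; recording ourselves as its owner completes that
	 * hand-off without a separate release/acquire.
	 */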
2533 	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
2534 }
2535 
2536 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0,
2537     "Scheduler");
2538 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
2539     "Scheduler name");
2540 SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
2541     "Slice size for timeshare threads");
2542 SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
2543      "Interactivity score threshold");
2544 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
2545      0, "Min priority for preemption; lower priorities have greater precedence");
2546 #ifdef SMP
2547 SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0,
2548     "Pick the target cpu based on priority rather than load.");
2549 SYSCTL_INT(_kern_sched, OID_AUTO, pick_zero, CTLFLAG_RW, &pick_zero, 0,
2550     "If there are no idle cpus pick cpu0");
2551 SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
2552     "Number of hz ticks to keep thread affinity for");
2553 SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, "");
2554 SYSCTL_INT(_kern_sched, OID_AUTO, tryselfidle, CTLFLAG_RW,
2555     &tryselfidle, 0, "");
2556 SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
2557     "Enables the long-term load balancer");
2558 SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
2559     "Steals work from another hyper-threaded core on idle");
2560 SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
2561     "Attempts to steal work from other cores before idling");
2562 SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0,
2563     "True when a topology has been specified by the MD code.");
2564 #endif
2565 
2566 /* ps compat */
2567 static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
2568 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
2569 
2570 
2571 #define KERN_SWITCH_INCLUDE 1
2572 #include "kern/kern_switch.c"
2573