xref: /illumos-gate/usr/src/uts/common/disp/disp.c (revision 7247f8883be6bcac5fe4735b6f87f873387dbbef)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/signal.h>
36 #include <sys/user.h>
37 #include <sys/systm.h>
38 #include <sys/sysinfo.h>
39 #include <sys/var.h>
40 #include <sys/errno.h>
41 #include <sys/cmn_err.h>
42 #include <sys/debug.h>
43 #include <sys/inline.h>
44 #include <sys/disp.h>
45 #include <sys/class.h>
46 #include <sys/bitmap.h>
47 #include <sys/kmem.h>
48 #include <sys/cpuvar.h>
49 #include <sys/vtrace.h>
50 #include <sys/tnf.h>
51 #include <sys/cpupart.h>
52 #include <sys/lgrp.h>
53 #include <sys/pg.h>
54 #include <sys/cmt.h>
55 #include <sys/bitset.h>
56 #include <sys/schedctl.h>
57 #include <sys/atomic.h>
58 #include <sys/dtrace.h>
59 #include <sys/sdt.h>
60 
61 #include <vm/as.h>
62 
63 #define	BOUND_CPU	0x1
64 #define	BOUND_PARTITION	0x2
65 #define	BOUND_INTR	0x4
66 
/*
 * Dispatch queue allocation structure and functions.
 *
 * A disp_queue_info describes one resize of a disp_t's queue array and
 * active-queue bitmap.  disp_dq_alloc() fills in dp and the new* fields,
 * disp_dq_assign() captures the old* fields from the disp_t and installs
 * the new arrays, and disp_dq_free() releases whatever the old* fields
 * reference.
 */
struct disp_queue_info {
	disp_t	*dp;		/* dispatch queue being resized */
	dispq_t *olddispq;	/* queue array in use before the resize */
	dispq_t *newdispq;	/* replacement queue array */
	ulong_t	*olddqactmap;	/* active-queue bitmap before the resize */
	ulong_t	*newdqactmap;	/* replacement active-queue bitmap */
	int	oldnglobpris;	/* number of priorities in the old arrays */
};
static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void	disp_dq_free(struct disp_queue_info *dptr);
80 
81 /* platform-specific routine to call when processor is idle */
82 static void	generic_idle_cpu();
83 void		(*idle_cpu)() = generic_idle_cpu;
84 
85 /* routines invoked when a CPU enters/exits the idle loop */
86 static void	idle_enter();
87 static void	idle_exit();
88 
89 /* platform-specific routine to call when thread is enqueued */
90 static void	generic_enq_thread(cpu_t *, int);
91 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
92 
93 pri_t	kpreemptpri;		/* priority where kernel preemption applies */
94 pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
95 pri_t	intr_pri;		/* interrupt thread priority base level */
96 
97 #define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
98 pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
99 disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
100 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
101 int	nswapped;		/* total number of swapped threads */
102 void	disp_swapped_enq(kthread_t *tp);
103 static void	disp_swapped_setrun(kthread_t *tp);
104 static void	cpu_resched(cpu_t *cp, pri_t tpri);
105 
106 /*
107  * If this is set, only interrupt threads will cause kernel preemptions.
108  * This is done by changing the value of kpreemptpri.  kpreemptpri
109  * will either be the max sysclass pri + 1 or the min interrupt pri.
110  */
111 int	only_intr_kpreempt;
112 
113 extern void set_idle_cpu(int cpun);
114 extern void unset_idle_cpu(int cpun);
115 static void setkpdq(kthread_t *tp, int borf);
116 #define	SETKP_BACK	0
117 #define	SETKP_FRONT	1
118 /*
119  * Parameter that determines how recently a thread must have run
120  * on the CPU to be considered loosely-bound to that CPU to reduce
121  * cold cache effects.  The interval is in hertz.
122  */
123 #define	RECHOOSE_INTERVAL 3
124 int	rechoose_interval = RECHOOSE_INTERVAL;
125 static cpu_t	*cpu_choose(kthread_t *, pri_t);
126 
/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * be sitting on a run queue before it can be stolen by another CPU;
 * this reduces migrations.
 *
 * nosteal_nsec should be set by platform code to an appropriate value.
 * Setting it to 0 effectively disables the nosteal 'protection'.
 */
135 hrtime_t nosteal_nsec = -1;
136 
137 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
138 
139 disp_lock_t	transition_lock;	/* lock on transitioning threads */
140 disp_lock_t	stop_lock;		/* lock on stopped threads */
141 
142 static void	cpu_dispqalloc(int numpris);
143 
144 /*
145  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
146  * a thread because it was sitting on its run queue for a very short
147  * period of time.
148  */
149 #define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
150 
151 static kthread_t	*disp_getwork(cpu_t *to);
152 static kthread_t	*disp_getbest(disp_t *from);
153 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
154 
155 void	swtch_to(kthread_t *);
156 
157 /*
158  * dispatcher and scheduler initialization
159  */
160 
161 /*
162  * disp_setup - Common code to calculate and allocate dispatcher
163  *		variables and structures based on the maximum priority.
164  */
165 static void
166 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
167 {
168 	pri_t	newnglobpris;
169 
170 	ASSERT(MUTEX_HELD(&cpu_lock));
171 
172 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
173 
174 	if (newnglobpris > oldnglobpris) {
175 		/*
176 		 * Allocate new kp queues for each CPU partition.
177 		 */
178 		cpupart_kpqalloc(newnglobpris);
179 
180 		/*
181 		 * Allocate new dispatch queues for each CPU.
182 		 */
183 		cpu_dispqalloc(newnglobpris);
184 
185 		/*
186 		 * compute new interrupt thread base priority
187 		 */
188 		intr_pri = maxglobpri;
189 		if (only_intr_kpreempt) {
190 			kpreemptpri = intr_pri + 1;
191 			if (kpqpri == KPQPRI)
192 				kpqpri = kpreemptpri;
193 		}
194 		v.v_nglobpris = newnglobpris;
195 	}
196 }
197 
198 /*
199  * dispinit - Called to initialize all loaded classes and the
200  *	      dispatcher framework.
201  */
202 void
203 dispinit(void)
204 {
205 	id_t	cid;
206 	pri_t	maxglobpri;
207 	pri_t	cl_maxglobpri;
208 
209 	maxglobpri = -1;
210 
211 	/*
212 	 * Initialize transition lock, which will always be set.
213 	 */
214 	DISP_LOCK_INIT(&transition_lock);
215 	disp_lock_enter_high(&transition_lock);
216 	DISP_LOCK_INIT(&stop_lock);
217 
218 	mutex_enter(&cpu_lock);
219 	CPU->cpu_disp->disp_maxrunpri = -1;
220 	CPU->cpu_disp->disp_max_unbound_pri = -1;
221 
222 	/*
223 	 * Initialize the default CPU partition.
224 	 */
225 	cpupart_initialize_default();
226 	/*
227 	 * Call the class specific initialization functions for
228 	 * all pre-installed schedulers.
229 	 *
230 	 * We pass the size of a class specific parameter
231 	 * buffer to each of the initialization functions
232 	 * to try to catch problems with backward compatibility
233 	 * of class modules.
234 	 *
235 	 * For example a new class module running on an old system
236 	 * which didn't provide sufficiently large parameter buffers
237 	 * would be bad news. Class initialization modules can check for
238 	 * this and take action if they detect a problem.
239 	 */
240 
241 	for (cid = 0; cid < nclass; cid++) {
242 		sclass_t	*sc;
243 
244 		sc = &sclass[cid];
245 		if (SCHED_INSTALLED(sc)) {
246 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
247 			    &sc->cl_funcs);
248 			if (cl_maxglobpri > maxglobpri)
249 				maxglobpri = cl_maxglobpri;
250 		}
251 	}
252 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
253 	if (kpqpri == KPQPRI)
254 		kpqpri = kpreemptpri;
255 
256 	ASSERT(maxglobpri >= 0);
257 	disp_setup(maxglobpri, 0);
258 
259 	mutex_exit(&cpu_lock);
260 
261 	/*
262 	 * Get the default class ID; this may be later modified via
263 	 * dispadmin(1M).  This will load the class (normally TS) and that will
264 	 * call disp_add(), which is why we had to drop cpu_lock first.
265 	 */
266 	if (getcid(defaultclass, &defaultcid) != 0) {
267 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
268 		    defaultclass);
269 	}
270 }
271 
272 /*
273  * disp_add - Called with class pointer to initialize the dispatcher
274  *	      for a newly loaded class.
275  */
276 void
277 disp_add(sclass_t *clp)
278 {
279 	pri_t	maxglobpri;
280 	pri_t	cl_maxglobpri;
281 
282 	mutex_enter(&cpu_lock);
283 	/*
284 	 * Initialize the scheduler class.
285 	 */
286 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
287 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
288 	if (cl_maxglobpri > maxglobpri)
289 		maxglobpri = cl_maxglobpri;
290 
291 	/*
292 	 * Save old queue information.  Since we're initializing a
293 	 * new scheduling class which has just been loaded, then
294 	 * the size of the dispq may have changed.  We need to handle
295 	 * that here.
296 	 */
297 	disp_setup(maxglobpri, v.v_nglobpris);
298 
299 	mutex_exit(&cpu_lock);
300 }
301 
302 
303 /*
304  * For each CPU, allocate new dispatch queues
305  * with the stated number of priorities.
306  */
307 static void
308 cpu_dispqalloc(int numpris)
309 {
310 	cpu_t	*cpup;
311 	struct disp_queue_info	*disp_mem;
312 	int i, num;
313 
314 	ASSERT(MUTEX_HELD(&cpu_lock));
315 
316 	disp_mem = kmem_zalloc(NCPU *
317 	    sizeof (struct disp_queue_info), KM_SLEEP);
318 
319 	/*
320 	 * This routine must allocate all of the memory before stopping
321 	 * the cpus because it must not sleep in kmem_alloc while the
322 	 * CPUs are stopped.  Locks they hold will not be freed until they
323 	 * are restarted.
324 	 */
325 	i = 0;
326 	cpup = cpu_list;
327 	do {
328 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
329 		i++;
330 		cpup = cpup->cpu_next;
331 	} while (cpup != cpu_list);
332 	num = i;
333 
334 	pause_cpus(NULL);
335 	for (i = 0; i < num; i++)
336 		disp_dq_assign(&disp_mem[i], numpris);
337 	start_cpus();
338 
339 	/*
340 	 * I must free all of the memory after starting the cpus because
341 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
342 	 */
343 	for (i = 0; i < num; i++)
344 		disp_dq_free(&disp_mem[i]);
345 
346 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
347 }
348 
349 static void
350 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
351 {
352 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
353 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
354 	    sizeof (long), KM_SLEEP);
355 	dptr->dp = dp;
356 }
357 
358 static void
359 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
360 {
361 	disp_t	*dp;
362 
363 	dp = dptr->dp;
364 	dptr->olddispq = dp->disp_q;
365 	dptr->olddqactmap = dp->disp_qactmap;
366 	dptr->oldnglobpris = dp->disp_npri;
367 
368 	ASSERT(dptr->oldnglobpris < numpris);
369 
370 	if (dptr->olddispq != NULL) {
371 		/*
372 		 * Use kcopy because bcopy is platform-specific
373 		 * and could block while we might have paused the cpus.
374 		 */
375 		(void) kcopy(dptr->olddispq, dptr->newdispq,
376 		    dptr->oldnglobpris * sizeof (dispq_t));
377 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
378 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
379 		    sizeof (long));
380 	}
381 	dp->disp_q = dptr->newdispq;
382 	dp->disp_qactmap = dptr->newdqactmap;
383 	dp->disp_q_limit = &dptr->newdispq[numpris];
384 	dp->disp_npri = numpris;
385 }
386 
387 static void
388 disp_dq_free(struct disp_queue_info *dptr)
389 {
390 	if (dptr->olddispq != NULL)
391 		kmem_free(dptr->olddispq,
392 		    dptr->oldnglobpris * sizeof (dispq_t));
393 	if (dptr->olddqactmap != NULL)
394 		kmem_free(dptr->olddqactmap,
395 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
396 }
397 
398 /*
399  * For a newly created CPU, initialize the dispatch queue.
400  * This is called before the CPU is known through cpu[] or on any lists.
401  */
402 void
403 disp_cpu_init(cpu_t *cp)
404 {
405 	disp_t	*dp;
406 	dispq_t	*newdispq;
407 	ulong_t	*newdqactmap;
408 
409 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
410 
411 	if (cp == cpu0_disp.disp_cpu)
412 		dp = &cpu0_disp;
413 	else
414 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
415 	bzero(dp, sizeof (disp_t));
416 	cp->cpu_disp = dp;
417 	dp->disp_cpu = cp;
418 	dp->disp_maxrunpri = -1;
419 	dp->disp_max_unbound_pri = -1;
420 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
421 	/*
422 	 * Allocate memory for the dispatcher queue headers
423 	 * and the active queue bitmap.
424 	 */
425 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
426 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
427 	    sizeof (long), KM_SLEEP);
428 	dp->disp_q = newdispq;
429 	dp->disp_qactmap = newdqactmap;
430 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
431 	dp->disp_npri = v.v_nglobpris;
432 }
433 
434 void
435 disp_cpu_fini(cpu_t *cp)
436 {
437 	ASSERT(MUTEX_HELD(&cpu_lock));
438 
439 	disp_kp_free(cp->cpu_disp);
440 	if (cp->cpu_disp != &cpu0_disp)
441 		kmem_free(cp->cpu_disp, sizeof (disp_t));
442 }
443 
444 /*
445  * Allocate new, larger kpreempt dispatch queue to replace the old one.
446  */
447 void
448 disp_kp_alloc(disp_t *dq, pri_t npri)
449 {
450 	struct disp_queue_info	mem_info;
451 
452 	if (npri > dq->disp_npri) {
453 		/*
454 		 * Allocate memory for the new array.
455 		 */
456 		disp_dq_alloc(&mem_info, npri, dq);
457 
458 		/*
459 		 * We need to copy the old structures to the new
460 		 * and free the old.
461 		 */
462 		disp_dq_assign(&mem_info, npri);
463 		disp_dq_free(&mem_info);
464 	}
465 }
466 
467 /*
468  * Free dispatch queue.
469  * Used for the kpreempt queues for a removed CPU partition and
470  * for the per-CPU queues of deleted CPUs.
471  */
472 void
473 disp_kp_free(disp_t *dq)
474 {
475 	struct disp_queue_info	mem_info;
476 
477 	mem_info.olddispq = dq->disp_q;
478 	mem_info.olddqactmap = dq->disp_qactmap;
479 	mem_info.oldnglobpris = dq->disp_npri;
480 	disp_dq_free(&mem_info);
481 }
482 
483 /*
484  * End dispatcher and scheduler initialization.
485  */
486 
487 /*
488  * See if there's anything to do other than remain idle.
489  * Return non-zero if there is.
490  *
491  * This function must be called with high spl, or with
492  * kernel preemption disabled to prevent the partition's
493  * active cpu list from changing while being traversed.
494  *
495  */
496 int
497 disp_anywork(void)
498 {
499 	cpu_t   *cp = CPU;
500 	cpu_t   *ocp;
501 
502 	if (cp->cpu_disp->disp_nrunnable != 0)
503 		return (1);
504 
505 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
506 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
507 			return (1);
508 
509 		/*
510 		 * Work can be taken from another CPU if:
511 		 *	- There is unbound work on the run queue
512 		 *	- That work isn't a thread undergoing a
513 		 *	- context switch on an otherwise empty queue.
514 		 *	- The CPU isn't running the idle loop.
515 		 */
516 		for (ocp = cp->cpu_next_part; ocp != cp;
517 		    ocp = ocp->cpu_next_part) {
518 			ASSERT(CPU_ACTIVE(ocp));
519 
520 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
521 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
522 			    ocp->cpu_disp->disp_nrunnable == 1) &&
523 			    ocp->cpu_dispatch_pri != -1)
524 				return (1);
525 		}
526 	}
527 	return (0);
528 }
529 
530 /*
531  * Called when CPU enters the idle loop
532  */
533 static void
534 idle_enter()
535 {
536 	cpu_t		*cp = CPU;
537 
538 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
539 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
540 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
541 }
542 
543 /*
544  * Called when CPU exits the idle loop
545  */
546 static void
547 idle_exit()
548 {
549 	cpu_t		*cp = CPU;
550 
551 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
552 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
553 }
554 
/*
 * Idle loop.
 *
 * While ncpus == 1 a simplified loop is used: call (*idle_cpu)() until
 * this CPU's dispatch queue holds a runnable thread, then swtch().
 * Once ncpus is no longer 1, the multiprocessor loop takes over: it waits
 * while the CPU is CPU_QUIESCED, switches to local work when there is
 * any, and otherwise (unless the CPU is CPU_OFFLINE) tries to obtain a
 * thread through disp_getwork().  Every switch away is bracketed by
 * idle_exit() before and idle_enter() after.  The multiprocessor loop has
 * no exit, so this function does not return.
 */
void
idle()
{
	struct cpu	*cp = CPU;		/* pointer to this CPU */
	kthread_t	*t;			/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			/* Local work is available; let swtch() pick it. */
			idle_exit();
			swtch();
		} else {
			/* An offline CPU does not look for work elsewhere. */
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				/*
				 * Nothing to take.  If cpu_chosen_level is
				 * still set and the partition's kp queue is
				 * empty, reset it to -1 before idling.
				 */
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/*
			 * If there was a thread but we couldn't steal
			 * it, then keep trying.
			 */
			if (t == T_DONTSTEAL)
				continue;
			/* Run the thread obtained from disp_getwork(). */
			idle_exit();
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}
628 
629 
/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues. If panicking, turn
 * preemption off.
 *
 * Returns immediately when panicstr is set.  Otherwise the current
 * thread is locked; if it is still TS_ONPROC on this CPU's dispatch
 * queue it is put into transition, handed to its class via CL_PREEMPT()
 * and swtch() is called.  If not, only cpu_kprunrun is cleared.
 */
void
preempt()
{
	kthread_t 	*t = curthread;
	klwp_t 		*lwp = ttolwp(curthread);

	/* No preemption once the system is panicking. */
	if (panicstr)
		return;

	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

	thread_lock(t);

	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
		/*
		 * this thread has already been chosen to be run on
		 * another CPU. Clear kprunrun on this CPU since we're
		 * already headed for swtch().
		 */
		CPU->cpu_kprunrun = 0;
		thread_unlock_nopreempt(t);
		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
	} else {
		/* Account for an involuntary context switch. */
		if (lwp != NULL)
			lwp->lwp_ru.nivcsw++;
		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
		/* Let the scheduling class decide where the thread goes. */
		THREAD_TRANSITION(t);
		CL_PREEMPT(t);
		DTRACE_SCHED(preempt);
		thread_unlock_nopreempt(t);

		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

		swtch();		/* clears CPU->cpu_runrun via disp() */
	}
}
672 
673 extern kthread_t *thread_unpin();
674 
/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 *
 * Selection order, as implemented below:
 *   1. While the partition's kp queue has a thread whose priority is at
 *      least this CPU's disp_maxrunpri (and the CPU is not offline), try
 *      to take it with disp_getbest() and return it if disp_ratify()
 *      accepts it.
 *   2. If the local queue is empty, try disp_getwork() (online CPUs only)
 *      and fall back to the CPU's idle thread.
 *   3. Otherwise dequeue the first thread at disp_maxrunpri from the local
 *      queue, fix up the active bitmap and max priorities, and return it
 *      if disp_ratify() accepts it; if not, start over.
 *
 * Callers in this file note that disp() returns with spl high.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL ||
			    tp == T_DONTSTEAL) {
				/* Nothing could be taken: run idle. */
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			/* Offline CPU with an empty queue: run idle. */
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		return (tp);
	}

	/* Head of the highest-priority non-empty local queue. */
	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.   If nrunnable != 0 just took the
		 * last runnable thread off the
		 * highest queue, so recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			/* Never leave max_unbound_pri above maxrunpri. */
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	/* If the choice is not ratified, pick again from the top. */
	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	return (tp);
}
816 
/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 *
 *	A thread with t_intr set takes a separate path: the interrupted
 *	thread is obtained with thread_unpin() and resumed via
 *	resume_from_intr().  Otherwise disp() selects the next thread; if
 *	it differs from the current one, resume() switches to it, and if it
 *	is the current one, the function simply lowers spl and returns.
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Setup and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		/*
		 * Sanity check: a thread that is still TS_ONPROC on this
		 * CPU with preemption enabled should not be calling
		 * swtch().  The condition is re-evaluated under the thread
		 * lock before asserting.
		 */
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			/*
			 * Adjust the CPU's PG run count when switching
			 * away from or to the idle thread.
			 */
			if (t == cp->cpu_idle_thread) {
				PG_NRUN_UPDATE(cp, 1);
			} else if (next == cp->cpu_idle_thread) {
				PG_NRUN_UPDATE(cp, -1);
			}

			/*
			 * If t was previously in the TS_ONPROC state,
			 * setfrontdq and setbackdq won't have set its t_waitrq.
			 * Since we now finally know that we're switching away
			 * from this thread, set its t_waitrq if it is on a run
			 * queue.
			 */
			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
				t->t_waitrq = gethrtime_unscaled();
			}

			/*
			 * restore mstate of thread that we are switching to
			 */
			restore_mstate(next);

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			/* Record when this CPU and thread last switched. */
			cp->cpu_last_swtch = t->t_disp_time = lbolt;
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			/* disp() chose the current thread; keep running. */
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}
915 
916 /*
917  * swtch_from_zombie()
918  *	Special case of swtch(), which allows checks for TS_ZOMB to be
919  *	eliminated from normal resume.
920  *	Find best runnable thread and run it.
921  *	Called with the current thread zombied.
922  *	Zombies cannot migrate, so CPU references are safe.
923  */
924 void
925 swtch_from_zombie()
926 {
927 	kthread_t	*next;
928 	cpu_t		*cpu = CPU;
929 
930 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
931 
932 	ASSERT(curthread->t_state == TS_ZOMB);
933 
934 	next = disp();			/* returns with spl high */
935 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
936 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
937 	ASSERT(next != curthread);
938 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
939 
940 	if (next == cpu->cpu_idle_thread)
941 		PG_NRUN_UPDATE(cpu, -1);
942 
943 	restore_mstate(next);
944 
945 	if (dtrace_vtime_active)
946 		dtrace_vtime_switch(next);
947 
948 	resume_from_zombie(next);
949 	/*
950 	 * The TR_RESUME_END and TR_SWTCH_END trace points
951 	 * appear at the end of resume(), because we certainly will not
952 	 * return here
953 	 */
954 }
955 
#if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
/*
 * thread_on_queue - Debug-only scan for a thread on the dispatch queues.
 *
 * Walks the cpu_next_onln ring, starting with the CPU after the current
 * one and finishing with the current CPU, and then examines the kp queue
 * of the current CPU's partition.  Each queue is scanned with its
 * disp_lock held.
 *
 * Returns 1 if tp is linked on any of the scanned queues, 0 otherwise.
 */
static int
thread_on_queue(kthread_t *tp)
{
	cpu_t	*cp;
	cpu_t	*self;
	disp_t	*dp;

	self = CPU;
	cp = self->cpu_next_onln;
	dp = cp->cpu_disp;
	for (;;) {
		dispq_t		*dq;
		dispq_t		*eq;

		disp_lock_enter_high(&dp->disp_lock);
		for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
			kthread_t	*rp;

			ASSERT(dq->dq_last == NULL ||
			    dq->dq_last->t_link == NULL);
			for (rp = dq->dq_first; rp; rp = rp->t_link)
				if (tp == rp) {
					disp_lock_exit_high(&dp->disp_lock);
					return (1);
				}
		}
		disp_lock_exit_high(&dp->disp_lock);

		/* cp == NULL marks that the kp queue was just scanned. */
		if (cp == NULL)
			break;
		if (cp == self) {
			/*
			 * Every per-CPU queue has been scanned; finish
			 * with the partition's kp queue.  The queue must
			 * be looked up through self before cp is cleared,
			 * otherwise a NULL pointer would be dereferenced.
			 */
			dp = &self->cpu_part->cp_kp_queue;
			cp = NULL;
		} else {
			cp = cp->cpu_next_onln;
			dp = cp->cpu_disp;
		}
	}
	return (0);
}	/* end of thread_on_queue */
#else

#define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */

#endif  /* DEBUG */
1001 
1002 /*
1003  * like swtch(), but switch to a specified thread taken from another CPU.
1004  *	called with spl high..
1005  */
1006 void
1007 swtch_to(kthread_t *next)
1008 {
1009 	cpu_t			*cp = CPU;
1010 
1011 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1012 
1013 	/*
1014 	 * Update context switch statistics.
1015 	 */
1016 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1017 
1018 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1019 
1020 	if (curthread == cp->cpu_idle_thread)
1021 		PG_NRUN_UPDATE(cp, 1);
1022 
1023 	/* OK to steal anything left on run queue */
1024 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1025 
1026 	/* record last execution time */
1027 	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;
1028 
1029 	/*
1030 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1031 	 * won't have set its t_waitrq.  Since we now finally know that we're
1032 	 * switching away from this thread, set its t_waitrq if it is on a run
1033 	 * queue.
1034 	 */
1035 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1036 		curthread->t_waitrq = gethrtime_unscaled();
1037 	}
1038 
1039 	/* restore next thread to previously running microstate */
1040 	restore_mstate(next);
1041 
1042 	if (dtrace_vtime_active)
1043 		dtrace_vtime_switch(next);
1044 
1045 	resume(next);
1046 	/*
1047 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1048 	 * appear at the end of resume(), because we may not
1049 	 * return here
1050 	 */
1051 }
1052 
1053 
1054 
/* A dispatch priority of -1 denotes a CPU that is idling. */
#define	CPU_IDLING(pri)	((pri) == -1)

/*
 * cpu_resched - Flag cp for rescheduling on behalf of a thread of
 * priority tpri.
 *
 * Nothing is flagged if cp is idling or is already dispatching at a
 * priority of at least tpri.  Otherwise:
 *   - if tpri >= upreemptpri and cpu_runrun is clear, cpu_runrun is set
 *     and aston() is applied to the thread cp is dispatching;
 *   - if tpri >= kpreemptpri and cpu_kprunrun is clear, cpu_kprunrun is
 *     set.
 * When a flag was newly set on a CPU other than the current one, that CPU
 * is poked after a memory barrier has been issued.
 */
static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
	int	call_poke_cpu = 0;
	pri_t   cpupri = cp->cpu_dispatch_pri;

	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
			cp->cpu_runrun = 1;
			aston(cp->cpu_dispthread);
			/*
			 * For tpri >= kpreemptpri the poke decision is
			 * made by the kernel-preemption case below.
			 */
			if (tpri < kpreemptpri && cp != CPU)
				call_poke_cpu = 1;
		}
		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
			cp->cpu_kprunrun = 1;
			if (cp != CPU)
				call_poke_cpu = 1;
		}
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	if (call_poke_cpu)
		poke_cpu(cp->cpu_id);
}
1087 
/*
 * Perform multi-level CMT load balancing of running threads.
 *
 * tp is the thread being enqueued; its thread lock must be held.
 * cp is the hint CPU (chosen by cpu_choose()).
 *
 * The hint CPU's CMT PG lineage (cp->cpu_pg->cmt_pgs) is walked starting
 * at index 0.  At each level one sibling PG that has CPU resources in the
 * thread's partition is selected, beginning the search at the PG's
 * cmt_hint, and its load is compared with that of the hint CPU's own PG.
 * The walk stops at the first level where the sibling is underloaded.
 *
 * Returns an idle CPU (cpu_dispatch_pri == -1) of the chosen sibling PG
 * that belongs to the thread's partition, or cp unchanged when no level
 * yields an underloaded sibling or that sibling has no such idle CPU.
 */
static cpu_t *
cmt_balance(kthread_t *tp, cpu_t *cp)
{
	int		hint, i, cpu, nsiblings;
	int		self = 0;
	group_t		*cmt_pgs, *siblings;
	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
	int		pg_nrun, tpg_nrun;
	int		level = 0;
	cpu_t		*newcp;

	ASSERT(THREAD_LOCK_HELD(tp));

	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	if (GROUP_SIZE(cmt_pgs) == 0)
		return (cp);	/* nothing to do */

	if (tp == curthread)
		self = 1;

	/*
	 * Balance across siblings in the CPUs CMT lineage
	 *
	 * Note that "continue" inside this do/while jumps to the loop
	 * condition, so the level is still advanced.  tpg is NULL on entry
	 * to every iteration: it is reset at the bottom of the loop
	 * whenever the candidate is rejected.
	 */
	do {
		pg = GROUP_ACCESS(cmt_pgs, level);

		siblings = pg->cmt_siblings;
		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */
		if (nsiblings == 1)
			continue;	/* nobody to balance against */

		pg_nrun = pg->cmt_nrunning;
		if (self &&
		    bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid))
			pg_nrun--;	/* Ignore curthread's effect */

		hint = pg->cmt_hint;
		/*
		 * Check for validity of the hint
		 * It should reference a valid sibling
		 *
		 * An out-of-range hint is reset to 0; otherwise the stored
		 * hint is advanced so that the next search through this PG
		 * starts at the following sibling.
		 */
		if (hint >= nsiblings)
			hint = pg->cmt_hint = 0;
		else
			pg->cmt_hint++;

		/*
		 * Find a balancing candidate from among our siblings
		 * "hint" is a hint for where to start looking
		 *
		 * The search wraps around the sibling group and ends once
		 * it is back at the starting index.
		 */
		i = hint;
		do {
			ASSERT(i < nsiblings);
			pg_tmp = GROUP_ACCESS(siblings, i);

			/*
			 * The candidate must not be us, and must
			 * have some CPU resources in the thread's
			 * partition
			 */
			if (pg_tmp != pg &&
			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
			    ((pg_t *)pg_tmp)->pg_id)) {
				tpg = pg_tmp;
				break;
			}

			if (++i >= nsiblings)
				i = 0;
		} while (i != hint);

		if (!tpg)
			continue;	/* no candidates at this level */

		/*
		 * Check if the balancing target is underloaded
		 * Decide to balance if the target is running fewer
		 * threads, or if it's running the same number of threads
		 * with more online CPUs
		 */
		tpg_nrun = tpg->cmt_nrunning;
		if (pg_nrun > tpg_nrun ||
		    (pg_nrun == tpg_nrun &&
		    (GROUP_SIZE(&tpg->cmt_cpus_actv) >
		    GROUP_SIZE(&pg->cmt_cpus_actv)))) {
			break;
		}
		/* Candidate rejected; try the next level of the lineage. */
		tpg = NULL;
	} while (++level < GROUP_SIZE(cmt_pgs));


	if (tpg) {
		/*
		 * Select an idle CPU from the target PG
		 *
		 * If none of its active CPUs is both idle and in the
		 * thread's partition, the hint CPU is kept.
		 */
		for (cpu = 0; cpu < GROUP_SIZE(&tpg->cmt_cpus_actv); cpu++) {
			newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
			if (newcp->cpu_part == tp->t_cpupart &&
			    newcp->cpu_dispatch_pri == -1) {
				cp = newcp;
				break;
			}
		}
	}

	return (cp);
}
1201 
/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
 * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
 * try to keep runqs perfectly balanced regardless of the thread priority.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)

/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 *
 * Two cases are handed off to other routines and return early:
 *   - a thread that is not loaded, or is on the swap queue, goes to
 *     disp_swapped_setrun();
 *   - on a multi-CPU system, an unbound thread whose priority is at or
 *     above kpqpri goes to setkpdq().
 * Otherwise a target CPU is selected, the thread is appended to that
 * CPU's dispatch queue for its priority, and (*disp_enq_thread)() is
 * called for the target CPU.
 */
void
setbackdq(kthread_t *tp)
{
	dispq_t	*dq;
	disp_t		*dp;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	/* Either a strong or a weak binding counts as bound. */
	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_BACK);
			return;
		}
		/*
		 * Let cpu_choose suggest a CPU.
		 */
		cp = cpu_choose(tp, tpri);

		if (tp->t_cpupart == cp->cpu_part) {
			int	qlen;

			/*
			 * Perform any CMT load balancing
			 */
			cp = cmt_balance(tp, cp);

			/*
			 * Balance across the run queues
			 *
			 * qlen is the chosen queue's length, reduced by
			 * the tolerated difference RUNQ_MAX_DIFF unless
			 * the lengths are required to match (priority
			 * below RUNQ_MATCH_PRI, or TS_RUNQMATCH set).
			 */
			qlen = RUNQ_LEN(cp, tpri);
			if (tpri >= RUNQ_MATCH_PRI &&
			    !(tp->t_schedflag & TS_RUNQMATCH))
				qlen -= RUNQ_MAX_DIFF;
			if (qlen > 0) {
				cpu_t *newcp;

				/*
				 * Compare against the next CPU in the lpl,
				 * or the next CPU in the partition when the
				 * thread's lpl is the root lgroup or cp has
				 * no other CPU in its lpl list.
				 */
				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
					newcp = cp->cpu_next_part;
				} else if ((newcp = cp->cpu_next_lpl) == cp) {
					newcp = cp->cpu_next_part;
				}

				if (RUNQ_LEN(newcp, tpri) < qlen) {
					DTRACE_PROBE3(runq__balance,
					    kthread_t *, tp,
					    cpu_t *, cp, cpu_t *, newcp);
					cp = newcp;
				}
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}
	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((tp != curthread) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
	    tpri, cp, tp);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	/*
	 * A newly queued unbound thread clears the queue's disp_steal
	 * value; disp_getwork() treats a disp_steal of 0 as "no steal
	 * delay in effect".
	 */
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		/* Queue already populated: append at the tail. */
		ASSERT(dq->dq_first != NULL);
		dq->dq_last->t_link = tp;
		dq->dq_last = tp;
	} else {
		/*
		 * First thread at this priority: mark the priority level
		 * active and, if it is a new maximum for the queue, ask
		 * the CPU to reschedule.
		 */
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}
1386 
/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 *
 * Two cases are handed off to other routines and return early:
 *   - a thread that is not loaded, or is on the swap queue, goes to
 *     disp_swapped_setrun();
 *   - on a multi-CPU system, an unbound thread whose priority is at or
 *     above kpqpri goes to setkpdq().
 * Otherwise a target CPU is selected (preferring the thread's current
 * t_cpu), the thread is inserted at the head of that CPU's dispatch queue
 * for its priority, and (*disp_enq_thread)() is called for the target CPU.
 */
void
setfrontdq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	/* Either a strong or a weak binding counts as bound. */
	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_FRONT);
			return;
		}
		cp = tp->t_cpu;
		if (tp->t_cpupart == cp->cpu_part) {
			/*
			 * If we are of higher or equal priority than
			 * the highest priority runnable thread of
			 * the current CPU, just pick this CPU.  Otherwise
			 * Let cpu_choose() select the CPU.  If this cpu
			 * is the target of an offline request then do not
			 * pick it - a thread_nomigrate() on the in motion
			 * cpu relies on this when it forces a preempt.
			 */
			if (tpri < cp->cpu_disp->disp_maxrunpri ||
			    cp == cpu_inmotion)
				cp = cpu_choose(tp, tpri);
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}

	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((tp != curthread) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
	tp->t_disp_queue = dp;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	/*
	 * A newly queued unbound thread clears the queue's disp_steal
	 * value; disp_getwork() treats a disp_steal of 0 as "no steal
	 * delay in effect".
	 */
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		/* Queue already populated: insert at the head. */
		ASSERT(dq->dq_last != NULL);
		tp->t_link = dq->dq_first;
		dq->dq_first = tp;
	} else {
		/*
		 * First thread at this priority: mark the priority level
		 * active and, if it is a new maximum for the queue, ask
		 * the CPU to reschedule.
		 */
		ASSERT(dq->dq_last == NULL);
		ASSERT(dq->dq_first == NULL);
		tp->t_link = NULL;
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}
1538 
1539 /*
1540  * Put a high-priority unbound thread on the kp queue
1541  */
1542 static void
1543 setkpdq(kthread_t *tp, int borf)
1544 {
1545 	dispq_t	*dq;
1546 	disp_t	*dp;
1547 	cpu_t	*cp;
1548 	pri_t	tpri;
1549 
1550 	tpri = DISP_PRIO(tp);
1551 
1552 	dp = &tp->t_cpupart->cp_kp_queue;
1553 	disp_lock_enter_high(&dp->disp_lock);
1554 
1555 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1556 
1557 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1558 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1559 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1560 	tp->t_disp_queue = dp;
1561 	dp->disp_nrunnable++;
1562 	dq = &dp->disp_q[tpri];
1563 
1564 	if (dq->dq_sruncnt++ != 0) {
1565 		if (borf == SETKP_BACK) {
1566 			ASSERT(dq->dq_first != NULL);
1567 			tp->t_link = NULL;
1568 			dq->dq_last->t_link = tp;
1569 			dq->dq_last = tp;
1570 		} else {
1571 			ASSERT(dq->dq_last != NULL);
1572 			tp->t_link = dq->dq_first;
1573 			dq->dq_first = tp;
1574 		}
1575 	} else {
1576 		if (borf == SETKP_BACK) {
1577 			ASSERT(dq->dq_first == NULL);
1578 			ASSERT(dq->dq_last == NULL);
1579 			dq->dq_first = dq->dq_last = tp;
1580 		} else {
1581 			ASSERT(dq->dq_last == NULL);
1582 			ASSERT(dq->dq_first == NULL);
1583 			tp->t_link = NULL;
1584 			dq->dq_first = dq->dq_last = tp;
1585 		}
1586 		BT_SET(dp->disp_qactmap, tpri);
1587 		if (tpri > dp->disp_max_unbound_pri)
1588 			dp->disp_max_unbound_pri = tpri;
1589 		if (tpri > dp->disp_maxrunpri) {
1590 			dp->disp_maxrunpri = tpri;
1591 			membar_enter();
1592 		}
1593 	}
1594 
1595 	cp = tp->t_cpu;
1596 	if (tp->t_cpupart != cp->cpu_part) {
1597 		/* migrate to a cpu in the new partition */
1598 		cp = tp->t_cpupart->cp_cpulist;
1599 	}
1600 	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1601 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1602 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1603 
1604 #ifndef NPROBE
1605 	/* Kernel probe */
1606 	if (tnf_tracing_active)
1607 		tnf_thread_queue(tp, cp, tpri);
1608 #endif /* NPROBE */
1609 
1610 	if (cp->cpu_chosen_level < tpri)
1611 		cp->cpu_chosen_level = tpri;
1612 	cpu_resched(cp, tpri);
1613 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1614 	(*disp_enq_thread)(cp, 0);
1615 }
1616 
1617 /*
1618  * Remove a thread from the dispatcher queue if it is on it.
1619  * It is not an error if it is not found but we return whether
1620  * or not it was found in case the caller wants to check.
1621  */
1622 int
1623 dispdeq(kthread_t *tp)
1624 {
1625 	disp_t		*dp;
1626 	dispq_t		*dq;
1627 	kthread_t	*rp;
1628 	kthread_t	*trp;
1629 	kthread_t	**ptp;
1630 	int		tpri;
1631 
1632 	ASSERT(THREAD_LOCK_HELD(tp));
1633 
1634 	if (tp->t_state != TS_RUN)
1635 		return (0);
1636 
1637 	/*
1638 	 * The thread is "swapped" or is on the swap queue and
1639 	 * hence no longer on the run queue, so return true.
1640 	 */
1641 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1642 		return (1);
1643 
1644 	tpri = DISP_PRIO(tp);
1645 	dp = tp->t_disp_queue;
1646 	ASSERT(tpri < dp->disp_npri);
1647 	dq = &dp->disp_q[tpri];
1648 	ptp = &dq->dq_first;
1649 	rp = *ptp;
1650 	trp = NULL;
1651 
1652 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1653 
1654 	/*
1655 	 * Search for thread in queue.
1656 	 * Double links would simplify this at the expense of disp/setrun.
1657 	 */
1658 	while (rp != tp && rp != NULL) {
1659 		trp = rp;
1660 		ptp = &trp->t_link;
1661 		rp = trp->t_link;
1662 	}
1663 
1664 	if (rp == NULL) {
1665 		panic("dispdeq: thread not on queue");
1666 	}
1667 
1668 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1669 
1670 	/*
1671 	 * Found it so remove it from queue.
1672 	 */
1673 	if ((*ptp = rp->t_link) == NULL)
1674 		dq->dq_last = trp;
1675 
1676 	dp->disp_nrunnable--;
1677 	if (--dq->dq_sruncnt == 0) {
1678 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1679 		if (dp->disp_nrunnable == 0) {
1680 			dp->disp_max_unbound_pri = -1;
1681 			dp->disp_maxrunpri = -1;
1682 		} else if (tpri == dp->disp_maxrunpri) {
1683 			int ipri;
1684 
1685 			ipri = bt_gethighbit(dp->disp_qactmap,
1686 			    dp->disp_maxrunpri >> BT_ULSHIFT);
1687 			if (ipri < dp->disp_max_unbound_pri)
1688 				dp->disp_max_unbound_pri = ipri;
1689 			dp->disp_maxrunpri = ipri;
1690 		}
1691 	}
1692 	tp->t_link = NULL;
1693 	THREAD_TRANSITION(tp);		/* put in intermediate state */
1694 	return (1);
1695 }
1696 
1697 
1698 /*
1699  * dq_sruninc and dq_srundec are public functions for
1700  * incrementing/decrementing the sruncnts when a thread on
1701  * a dispatcher queue is made schedulable/unschedulable by
1702  * resetting the TS_LOAD flag.
1703  *
1704  * The caller MUST have the thread lock and therefore the dispatcher
1705  * queue lock so that the operation which changes
1706  * the flag, the operation that checks the status of the thread to
1707  * determine if it's on a disp queue AND the call to this function
1708  * are one atomic operation with respect to interrupts.
1709  */
1710 
/*
 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 *
 * The thread must be in TS_RUN state with TS_LOAD set.  It is moved to
 * the transition state and then queued with setfrontdq().
 */
void
dq_sruninc(kthread_t *t)
{
	ASSERT(t->t_state == TS_RUN);
	ASSERT(t->t_schedflag & TS_LOAD);

	THREAD_TRANSITION(t);
	setfrontdq(t);
}
1723 
/*
 * See comment on calling conventions above.
 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 *
 * The thread is removed from its dispatcher queue if it is on one (the
 * result of dispdeq() is deliberately ignored) and is then handed to
 * disp_swapped_enq().
 */
void
dq_srundec(kthread_t *t)
{
	ASSERT(t->t_schedflag & TS_LOAD);

	(void) dispdeq(t);
	disp_swapped_enq(t);
}
1736 
1737 /*
1738  * Change the dispatcher lock of thread to the "swapped_lock"
1739  * and return with thread lock still held.
1740  *
1741  * Called with thread_lock held, in transition state, and at high spl.
1742  */
1743 void
1744 disp_swapped_enq(kthread_t *tp)
1745 {
1746 	ASSERT(THREAD_LOCK_HELD(tp));
1747 	ASSERT(tp->t_schedflag & TS_LOAD);
1748 
1749 	switch (tp->t_state) {
1750 	case TS_RUN:
1751 		disp_lock_enter_high(&swapped_lock);
1752 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1753 		break;
1754 	case TS_ONPROC:
1755 		disp_lock_enter_high(&swapped_lock);
1756 		THREAD_TRANSITION(tp);
1757 		wake_sched_sec = 1;		/* tell clock to wake sched */
1758 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1759 		break;
1760 	default:
1761 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1762 	}
1763 }
1764 
1765 /*
1766  * This routine is called by setbackdq/setfrontdq if the thread is
1767  * not loaded or loaded and on the swap queue.
1768  *
1769  * Thread state TS_SLEEP implies that a swapped thread
1770  * has been woken up and needs to be swapped in by the swapper.
1771  *
1772  * Thread state TS_RUN, it implies that the priority of a swapped
1773  * thread is being increased by scheduling class (e.g. ts_update).
1774  */
1775 static void
1776 disp_swapped_setrun(kthread_t *tp)
1777 {
1778 	ASSERT(THREAD_LOCK_HELD(tp));
1779 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1780 
1781 	switch (tp->t_state) {
1782 	case TS_SLEEP:
1783 		disp_lock_enter_high(&swapped_lock);
1784 		/*
1785 		 * Wakeup sched immediately (i.e., next tick) if the
1786 		 * thread priority is above maxclsyspri.
1787 		 */
1788 		if (DISP_PRIO(tp) > maxclsyspri)
1789 			wake_sched = 1;
1790 		else
1791 			wake_sched_sec = 1;
1792 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1793 		break;
1794 	case TS_RUN:				/* called from ts_update */
1795 		break;
1796 	default:
1797 		panic("disp_swapped_setrun: tp: %p bad t_state", tp);
1798 	}
1799 }
1800 
1801 
1802 /*
1803  *	Make a thread give up its processor.  Find the processor on
1804  *	which this thread is executing, and have that processor
1805  *	preempt.
1806  */
1807 void
1808 cpu_surrender(kthread_t *tp)
1809 {
1810 	cpu_t	*cpup;
1811 	int	max_pri;
1812 	int	max_run_pri;
1813 	klwp_t	*lwp;
1814 
1815 	ASSERT(THREAD_LOCK_HELD(tp));
1816 
1817 	if (tp->t_state != TS_ONPROC)
1818 		return;
1819 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
1820 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1821 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1822 	if (max_pri < max_run_pri)
1823 		max_pri = max_run_pri;
1824 
1825 	cpup->cpu_runrun = 1;
1826 	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1827 		cpup->cpu_kprunrun = 1;
1828 	}
1829 
1830 	/*
1831 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1832 	 */
1833 	membar_enter();
1834 
1835 	DTRACE_SCHED1(surrender, kthread_t *, tp);
1836 
1837 	/*
1838 	 * Make the target thread take an excursion through trap()
1839 	 * to do preempt() (unless we're already in trap or post_syscall,
1840 	 * calling cpu_surrender via CL_TRAPRET).
1841 	 */
1842 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1843 	    lwp->lwp_state != LWP_USER) {
1844 		aston(tp);
1845 		if (cpup != CPU)
1846 			poke_cpu(cpup->cpu_id);
1847 	}
1848 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1849 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
1850 }
1851 
1852 
1853 /*
1854  * Commit to and ratify a scheduling decision
1855  */
1856 /*ARGSUSED*/
1857 static kthread_t *
1858 disp_ratify(kthread_t *tp, disp_t *kpq)
1859 {
1860 	pri_t	tpri, maxpri;
1861 	pri_t	maxkpri;
1862 	cpu_t	*cpup;
1863 
1864 	ASSERT(tp != NULL);
1865 	/*
1866 	 * Commit to, then ratify scheduling decision
1867 	 */
1868 	cpup = CPU;
1869 	if (cpup->cpu_runrun != 0)
1870 		cpup->cpu_runrun = 0;
1871 	if (cpup->cpu_kprunrun != 0)
1872 		cpup->cpu_kprunrun = 0;
1873 	if (cpup->cpu_chosen_level != -1)
1874 		cpup->cpu_chosen_level = -1;
1875 	membar_enter();
1876 	tpri = DISP_PRIO(tp);
1877 	maxpri = cpup->cpu_disp->disp_maxrunpri;
1878 	maxkpri = kpq->disp_maxrunpri;
1879 	if (maxpri < maxkpri)
1880 		maxpri = maxkpri;
1881 	if (tpri < maxpri) {
1882 		/*
1883 		 * should have done better
1884 		 * put this one back and indicate to try again
1885 		 */
1886 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
1887 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1888 		thread_lock_high(tp);
1889 		THREAD_TRANSITION(tp);
1890 		setfrontdq(tp);
1891 		thread_unlock_nopreempt(tp);
1892 
1893 		tp = NULL;
1894 	}
1895 	return (tp);
1896 }
1897 
/*
 * See if there is any work on the dispatcher queue for other CPUs.
 * If there is, dequeue the best thread and return.
 *
 * cp is the CPU looking for work.  The kp queue of cp's partition is
 * tried first; after that the run queues of other CPUs are scanned for
 * the highest priority unbound thread.
 *
 * Returns:
 *   - the result of disp_ratify() for a thread that was taken (which may
 *     itself be NULL if the choice was not ratified);
 *   - NULL if nothing was found, or if work appeared on cp's own queue
 *     during the scan;
 *   - T_DONTSTEAL if the only candidates seen were ones whose steal time
 *     has not yet been reached, or if disp_getbest() reported that.
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	kthread_t	*retval = NULL;
	pri_t		maxpri;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		hint, leafidx;
	hrtime_t	stealtime;

	maxpri = -1;
	tcp = NULL;

	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	kpreempt_disable();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU. This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	hint = leafidx = 0;

	/*
	 * This loop traverses the lpl hierarchy. Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			/*
			 * In our own leaf start with the CPU after us;
			 * in any other leaf start at its first CPU.
			 */
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around this lpl if:
				 *
				 * - Something became runnable on the local
				 *   queue...which also ends our stroll around
				 *   the partition.
				 *
				 * - We happen across another idle CPU.
				 *   Since it is patrolling the next portion
				 *   of the lpl's list (assuming it's not
				 *   halted), move to the next higher level
				 *   of locality.
				 *
				 * Note that "continue" in this do/while jumps
				 * to the loop condition (advancing ocp), and
				 * "break" leaves only this innermost per-leaf
				 * CPU loop.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0) {
					kpreempt_enable();
					return (NULL);
				}
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED)
						continue;
					else
						break;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
				    CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					/*
					 * Don't steal threads that we attempted
					 * to steal recently until they're ready
					 * to be stolen again.
					 *
					 * A disp_steal of 0 means no delay is
					 * in effect; otherwise stealing is
					 * allowed once gethrtime() has reached
					 * the recorded time.
					 */
					stealtime = ocp->cpu_disp->disp_steal;
					if (stealtime == 0 ||
					    stealtime - gethrtime() <= 0) {
						maxpri = pri;
						tcp = ocp;
					} else {
						/*
						 * Don't update tcp, just set
						 * the retval to T_DONTSTEAL, so
						 * that if no acceptable CPUs
						 * are found the return value
						 * will be T_DONTSTEAL rather
						 * than NULL.
						 */
						retval = T_DONTSTEAL;
					}
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			/*
			 * Advance to the next leaf of this lpl's resource
			 * set, wrapping to index 0 at the NULL terminator.
			 */
			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != hint);

		/*
		 * Move up one level; the leaf walk there starts at the
		 * index given by the current lpl's lpl_hint.
		 */
		hint = leafidx = lpl->lpl_hint;
		if ((lpl = lpl->lpl_parent) != NULL)
			lpl_leaf = lpl->lpl_rset[hint];
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = disp_getbest(tcp->cpu_disp);
		/* NULL and T_DONTSTEAL are passed back without ratifying. */
		if (tp == NULL || tp == T_DONTSTEAL)
			return (tp);
		return (disp_ratify(tp, kpq));
	}
	return (retval);
}
2051 
2052 
2053 /*
2054  * disp_fix_unbound_pri()
2055  *	Determines the maximum priority of unbound threads on the queue.
2056  *	The priority is kept for the queue, but is only increased, never
2057  *	reduced unless some CPU is looking for something on that queue.
2058  *
2059  *	The priority argument is the known upper limit.
2060  *
2061  *	Perhaps this should be kept accurately, but that probably means
2062  *	separate bitmaps for bound and unbound threads.  Since only idled
2063  *	CPUs will have to do this recalculation, it seems better this way.
2064  */
2065 static void
2066 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2067 {
2068 	kthread_t	*tp;
2069 	dispq_t		*dq;
2070 	ulong_t		*dqactmap = dp->disp_qactmap;
2071 	ulong_t		mapword;
2072 	int		wx;
2073 
2074 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2075 
2076 	ASSERT(pri >= 0);			/* checked by caller */
2077 
2078 	/*
2079 	 * Start the search at the next lowest priority below the supplied
2080 	 * priority.  This depends on the bitmap implementation.
2081 	 */
2082 	do {
2083 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
2084 
2085 		/*
2086 		 * Form mask for all lower priorities in the word.
2087 		 */
2088 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2089 
2090 		/*
2091 		 * Get next lower active priority.
2092 		 */
2093 		if (mapword != 0) {
2094 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2095 		} else if (wx > 0) {
2096 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2097 			if (pri < 0)
2098 				break;
2099 		} else {
2100 			pri = -1;
2101 			break;
2102 		}
2103 
2104 		/*
2105 		 * Search the queue for unbound, runnable threads.
2106 		 */
2107 		dq = &dp->disp_q[pri];
2108 		tp = dq->dq_first;
2109 
2110 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2111 			tp = tp->t_link;
2112 		}
2113 
2114 		/*
2115 		 * If a thread was found, set the priority and return.
2116 		 */
2117 	} while (tp == NULL);
2118 
2119 	/*
2120 	 * pri holds the maximum unbound thread priority or -1.
2121 	 */
2122 	if (dp->disp_max_unbound_pri != pri)
2123 		dp->disp_max_unbound_pri = pri;
2124 }
2125 
2126 /*
2127  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2128  * 	check if the CPU to which is was previously bound should have
2129  * 	its disp_max_unbound_pri increased.
2130  */
2131 void
2132 disp_adjust_unbound_pri(kthread_t *tp)
2133 {
2134 	disp_t *dp;
2135 	pri_t tpri;
2136 
2137 	ASSERT(THREAD_LOCK_HELD(tp));
2138 
2139 	/*
2140 	 * Don't do anything if the thread is not bound, or
2141 	 * currently not runnable or swapped out.
2142 	 */
2143 	if (tp->t_bound_cpu == NULL ||
2144 	    tp->t_state != TS_RUN ||
2145 	    tp->t_schedflag & TS_ON_SWAPQ)
2146 		return;
2147 
2148 	tpri = DISP_PRIO(tp);
2149 	dp = tp->t_bound_cpu->cpu_disp;
2150 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2151 	if (tpri > dp->disp_max_unbound_pri)
2152 		dp->disp_max_unbound_pri = tpri;
2153 }
2154 
2155 /*
2156  * disp_getbest()
2157  *   De-queue the highest priority unbound runnable thread.
2158  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2159  *   Returns NULL if nothing found.
2160  *   Returns T_DONTSTEAL if the thread was not stealable.
2161  *   so that the caller will try again later.
2162  *
2163  *   Passed a pointer to a dispatch queue not associated with this CPU, and
2164  *   its type.
2165  */
2166 static kthread_t *
2167 disp_getbest(disp_t *dp)
2168 {
2169 	kthread_t	*tp;
2170 	dispq_t		*dq;
2171 	pri_t		pri;
2172 	cpu_t		*cp, *tcp;
2173 	boolean_t	allbound;
2174 
2175 	disp_lock_enter(&dp->disp_lock);
2176 
2177 	/*
2178 	 * If there is nothing to run, or the CPU is in the middle of a
2179 	 * context switch of the only thread, return NULL.
2180 	 */
2181 	tcp = dp->disp_cpu;
2182 	cp = CPU;
2183 	pri = dp->disp_max_unbound_pri;
2184 	if (pri == -1 ||
2185 	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2186 	    tcp->cpu_disp->disp_nrunnable == 1)) {
2187 		disp_lock_exit_nopreempt(&dp->disp_lock);
2188 		return (NULL);
2189 	}
2190 
2191 	dq = &dp->disp_q[pri];
2192 
2193 
2194 	/*
2195 	 * Assume that all threads are bound on this queue, and change it
2196 	 * later when we find out that it is not the case.
2197 	 */
2198 	allbound = B_TRUE;
2199 	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2200 		hrtime_t now, nosteal, rqtime;
2201 
2202 		/*
2203 		 * Skip over bound threads which could be here even
2204 		 * though disp_max_unbound_pri indicated this level.
2205 		 */
2206 		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2207 			continue;
2208 
2209 		/*
2210 		 * We've got some unbound threads on this queue, so turn
2211 		 * the allbound flag off now.
2212 		 */
2213 		allbound = B_FALSE;
2214 
2215 		/*
2216 		 * The thread is a candidate for stealing from its run queue. We
2217 		 * don't want to steal threads that became runnable just a
2218 		 * moment ago. This improves CPU affinity for threads that get
2219 		 * preempted for short periods of time and go back on the run
2220 		 * queue.
2221 		 *
2222 		 * We want to let it stay on its run queue if it was only placed
2223 		 * there recently and it was running on the same CPU before that
2224 		 * to preserve its cache investment. For the thread to remain on
2225 		 * its run queue, ALL of the following conditions must be
2226 		 * satisfied:
2227 		 *
2228 		 * - the disp queue should not be the kernel preemption queue
2229 		 * - delayed idle stealing should not be disabled
2230 		 * - nosteal_nsec should be non-zero
2231 		 * - it should run with user priority
2232 		 * - it should be on the run queue of the CPU where it was
2233 		 *   running before being placed on the run queue
2234 		 * - it should be the only thread on the run queue (to prevent
2235 		 *   extra scheduling latency for other threads)
2236 		 * - it should sit on the run queue for less than per-chip
2237 		 *   nosteal interval or global nosteal interval
2238 		 * - in case of CPUs with shared cache it should sit in a run
2239 		 *   queue of a CPU from a different chip
2240 		 *
2241 		 * The checks are arranged so that the ones that are faster are
2242 		 * placed earlier.
2243 		 */
2244 		if (tcp == NULL ||
2245 		    pri >= minclsyspri ||
2246 		    tp->t_cpu != tcp)
2247 			break;
2248 
2249 		/*
2250 		 * Steal immediately if, due to CMT processor architecture
2251 		 * migraiton between cp and tcp would incur no performance
2252 		 * penalty.
2253 		 */
2254 		if (pg_cmt_can_migrate(cp, tcp))
2255 			break;
2256 
2257 		nosteal = nosteal_nsec;
2258 		if (nosteal == 0)
2259 			break;
2260 
2261 		/*
2262 		 * Calculate time spent sitting on run queue
2263 		 */
2264 		now = gethrtime_unscaled();
2265 		rqtime = now - tp->t_waitrq;
2266 		scalehrtime(&rqtime);
2267 
2268 		/*
2269 		 * Steal immediately if the time spent on this run queue is more
2270 		 * than allowed nosteal delay.
2271 		 *
2272 		 * Negative rqtime check is needed here to avoid infinite
2273 		 * stealing delays caused by unlikely but not impossible
2274 		 * drifts between CPU times on different CPUs.
2275 		 */
2276 		if (rqtime > nosteal || rqtime < 0)
2277 			break;
2278 
2279 		DTRACE_PROBE4(nosteal, kthread_t *, tp,
2280 		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2281 		scalehrtime(&now);
2282 		/*
2283 		 * Calculate when this thread becomes stealable
2284 		 */
2285 		now += (nosteal - rqtime);
2286 
2287 		/*
2288 		 * Calculate time when some thread becomes stealable
2289 		 */
2290 		if (now < dp->disp_steal)
2291 			dp->disp_steal = now;
2292 	}
2293 
2294 	/*
2295 	 * If there were no unbound threads on this queue, find the queue
2296 	 * where they are and then return later. The value of
2297 	 * disp_max_unbound_pri is not always accurate because it isn't
2298 	 * reduced until another idle CPU looks for work.
2299 	 */
2300 	if (allbound)
2301 		disp_fix_unbound_pri(dp, pri);
2302 
2303 	/*
2304 	 * If we reached the end of the queue and found no unbound threads
2305 	 * then return NULL so that other CPUs will be considered.  If there
2306 	 * are unbound threads but they cannot yet be stolen, then
2307 	 * return T_DONTSTEAL and try again later.
2308 	 */
2309 	if (tp == NULL) {
2310 		disp_lock_exit_nopreempt(&dp->disp_lock);
2311 		return (allbound ? NULL : T_DONTSTEAL);
2312 	}
2313 
2314 	/*
2315 	 * Found a runnable, unbound thread, so remove it from queue.
2316 	 * dispdeq() requires that we have the thread locked, and we do,
2317 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
2318 	 * put the thread in transition state, thereby dropping the dispq
2319 	 * lock.
2320 	 */
2321 
2322 #ifdef DEBUG
2323 	{
2324 		int	thread_was_on_queue;
2325 
2326 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
2327 		ASSERT(thread_was_on_queue);
2328 	}
2329 
2330 #else /* DEBUG */
2331 	(void) dispdeq(tp);			/* drops disp_lock */
2332 #endif /* DEBUG */
2333 
2334 	/*
2335 	 * Reset the disp_queue steal time - we do not know what is the smallest
2336 	 * value across the queue is.
2337 	 */
2338 	dp->disp_steal = 0;
2339 
2340 	tp->t_schedflag |= TS_DONT_SWAP;
2341 
2342 	/*
2343 	 * Setup thread to run on the current CPU.
2344 	 */
2345 	tp->t_disp_queue = cp->cpu_disp;
2346 
2347 	cp->cpu_dispthread = tp;		/* protected by spl only */
2348 	cp->cpu_dispatch_pri = pri;
2349 	ASSERT(pri == DISP_PRIO(tp));
2350 
2351 	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2352 
2353 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
2354 
2355 	/*
2356 	 * Return with spl high so that swtch() won't need to raise it.
2357 	 * The disp_lock was dropped by dispdeq().
2358 	 */
2359 
2360 	return (tp);
2361 }
2362 
2363 /*
2364  * disp_bound_common() - common routine for higher level functions
2365  *	that check for bound threads under certain conditions.
2366  *	If 'threadlistsafe' is set then there is no need to acquire
2367  *	pidlock to stop the thread list from changing (eg, if
2368  *	disp_bound_* is called with cpus paused).
2369  */
2370 static int
2371 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2372 {
2373 	int		found = 0;
2374 	kthread_t	*tp;
2375 
2376 	ASSERT(flag);
2377 
2378 	if (!threadlistsafe)
2379 		mutex_enter(&pidlock);
2380 	tp = curthread;		/* faster than allthreads */
2381 	do {
2382 		if (tp->t_state != TS_FREE) {
2383 			/*
2384 			 * If an interrupt thread is busy, but the
2385 			 * caller doesn't care (i.e. BOUND_INTR is off),
2386 			 * then just ignore it and continue through.
2387 			 */
2388 			if ((tp->t_flag & T_INTR_THREAD) &&
2389 			    !(flag & BOUND_INTR))
2390 				continue;
2391 
2392 			/*
2393 			 * Skip the idle thread for the CPU
2394 			 * we're about to set offline.
2395 			 */
2396 			if (tp == cp->cpu_idle_thread)
2397 				continue;
2398 
2399 			/*
2400 			 * Skip the pause thread for the CPU
2401 			 * we're about to set offline.
2402 			 */
2403 			if (tp == cp->cpu_pause_thread)
2404 				continue;
2405 
2406 			if ((flag & BOUND_CPU) &&
2407 			    (tp->t_bound_cpu == cp ||
2408 			    tp->t_bind_cpu == cp->cpu_id ||
2409 			    tp->t_weakbound_cpu == cp)) {
2410 				found = 1;
2411 				break;
2412 			}
2413 
2414 			if ((flag & BOUND_PARTITION) &&
2415 			    (tp->t_cpupart == cp->cpu_part)) {
2416 				found = 1;
2417 				break;
2418 			}
2419 		}
2420 	} while ((tp = tp->t_next) != curthread && found == 0);
2421 	if (!threadlistsafe)
2422 		mutex_exit(&pidlock);
2423 	return (found);
2424 }
2425 
2426 /*
2427  * disp_bound_threads - return nonzero if threads are bound to the processor.
2428  *	Called infrequently.  Keep this simple.
2429  *	Includes threads that are asleep or stopped but not onproc.
2430  */
2431 int
2432 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2433 {
2434 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2435 }
2436 
2437 /*
2438  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2439  * to the given processor, including interrupt threads.
2440  */
2441 int
2442 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2443 {
2444 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2445 }
2446 
2447 /*
2448  * disp_bound_partition - return nonzero if threads are bound to the same
2449  * partition as the processor.
2450  *	Called infrequently.  Keep this simple.
2451  *	Includes threads that are asleep or stopped but not onproc.
2452  */
2453 int
2454 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2455 {
2456 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2457 }
2458 
2459 /*
2460  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2461  * threads to other CPUs.
2462  */
2463 void
2464 disp_cpu_inactive(cpu_t *cp)
2465 {
2466 	kthread_t	*tp;
2467 	disp_t		*dp = cp->cpu_disp;
2468 	dispq_t		*dq;
2469 	pri_t		pri;
2470 	int		wasonq;
2471 
2472 	disp_lock_enter(&dp->disp_lock);
2473 	while ((pri = dp->disp_max_unbound_pri) != -1) {
2474 		dq = &dp->disp_q[pri];
2475 		tp = dq->dq_first;
2476 
2477 		/*
2478 		 * Skip over bound threads.
2479 		 */
2480 		while (tp != NULL && tp->t_bound_cpu != NULL) {
2481 			tp = tp->t_link;
2482 		}
2483 
2484 		if (tp == NULL) {
2485 			/* disp_max_unbound_pri must be inaccurate, so fix it */
2486 			disp_fix_unbound_pri(dp, pri);
2487 			continue;
2488 		}
2489 
2490 		wasonq = dispdeq(tp);		/* drops disp_lock */
2491 		ASSERT(wasonq);
2492 		ASSERT(tp->t_weakbound_cpu == NULL);
2493 
2494 		setbackdq(tp);
2495 		/*
2496 		 * Called from cpu_offline:
2497 		 *
2498 		 * cp has already been removed from the list of active cpus
2499 		 * and tp->t_cpu has been changed so there is no risk of
2500 		 * tp ending up back on cp.
2501 		 *
2502 		 * Called from cpupart_move_cpu:
2503 		 *
2504 		 * The cpu has moved to a new cpupart.  Any threads that
2505 		 * were on it's dispatch queues before the move remain
2506 		 * in the old partition and can't run in the new partition.
2507 		 */
2508 		ASSERT(tp->t_cpu != cp);
2509 		thread_unlock(tp);
2510 
2511 		disp_lock_enter(&dp->disp_lock);
2512 	}
2513 	disp_lock_exit(&dp->disp_lock);
2514 }
2515 
2516 /*
2517  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2518  *	The hint passed in is used as a starting point so we don't favor
2519  *	CPU 0 or any other CPU.  The caller should pass in the most recently
2520  *	used CPU for the thread.
2521  *
2522  *	The lgroup and priority are used to determine the best CPU to run on
2523  *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
2524  *	the thread priority will indicate whether the thread will actually run
2525  *	there.  To pick the best CPU, the CPUs inside and outside of the given
2526  *	lgroup which are running the lowest priority threads are found.  The
2527  *	remote CPU is chosen only if the thread will not run locally on a CPU
2528  *	within the lgroup, but will run on the remote CPU. If the thread
2529  *	cannot immediately run on any CPU, the best local CPU will be chosen.
2530  *
2531  *	The lpl specified also identifies the cpu partition from which
2532  *	disp_lowpri_cpu should select a CPU.
2533  *
2534  *	curcpu is used to indicate that disp_lowpri_cpu is being called on
2535  *      behalf of the current thread. (curthread is looking for a new cpu)
2536  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2537  *      ignored.
2538  *
2539  *      If a cpu is the target of an offline request then try to avoid it.
2540  *
2541  *	This function must be called at either high SPL, or with preemption
2542  *	disabled, so that the "hint" CPU cannot be removed from the online
2543  *	CPU list while we are traversing it.
2544  */
2545 cpu_t *
2546 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2547 {
2548 	cpu_t	*bestcpu;
2549 	cpu_t	*besthomecpu;
2550 	cpu_t   *cp, *cpstart;
2551 
2552 	pri_t   bestpri;
2553 	pri_t   cpupri;
2554 
2555 	klgrpset_t	done;
2556 	klgrpset_t	cur_set;
2557 
2558 	lpl_t		*lpl_iter, *lpl_leaf;
2559 	int		i;
2560 
2561 	/*
2562 	 * Scan for a CPU currently running the lowest priority thread.
2563 	 * Cannot get cpu_lock here because it is adaptive.
2564 	 * We do not require lock on CPU list.
2565 	 */
2566 	ASSERT(hint != NULL);
2567 	ASSERT(lpl != NULL);
2568 	ASSERT(lpl->lpl_ncpu > 0);
2569 
2570 	/*
2571 	 * First examine local CPUs. Note that it's possible the hint CPU
2572 	 * passed in in remote to the specified home lgroup. If our priority
2573 	 * isn't sufficient enough such that we can run immediately at home,
2574 	 * then examine CPUs remote to our home lgroup.
2575 	 * We would like to give preference to CPUs closest to "home".
2576 	 * If we can't find a CPU where we'll run at a given level
2577 	 * of locality, we expand our search to include the next level.
2578 	 */
2579 	bestcpu = besthomecpu = NULL;
2580 	klgrpset_clear(done);
2581 	/* start with lpl we were passed */
2582 
2583 	lpl_iter = lpl;
2584 
2585 	do {
2586 
2587 		bestpri = SHRT_MAX;
2588 		klgrpset_clear(cur_set);
2589 
2590 		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2591 			lpl_leaf = lpl_iter->lpl_rset[i];
2592 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2593 				continue;
2594 
2595 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2596 
2597 			if (hint->cpu_lpl == lpl_leaf)
2598 				cp = cpstart = hint;
2599 			else
2600 				cp = cpstart = lpl_leaf->lpl_cpus;
2601 
2602 			do {
2603 				if (cp == curcpu)
2604 					cpupri = -1;
2605 				else if (cp == cpu_inmotion)
2606 					cpupri = SHRT_MAX;
2607 				else
2608 					cpupri = cp->cpu_dispatch_pri;
2609 				if (cp->cpu_disp->disp_maxrunpri > cpupri)
2610 					cpupri = cp->cpu_disp->disp_maxrunpri;
2611 				if (cp->cpu_chosen_level > cpupri)
2612 					cpupri = cp->cpu_chosen_level;
2613 				if (cpupri < bestpri) {
2614 					if (CPU_IDLING(cpupri)) {
2615 						ASSERT((cp->cpu_flags &
2616 						    CPU_QUIESCED) == 0);
2617 						return (cp);
2618 					}
2619 					bestcpu = cp;
2620 					bestpri = cpupri;
2621 				}
2622 			} while ((cp = cp->cpu_next_lpl) != cpstart);
2623 		}
2624 
2625 		if (bestcpu && (tpri > bestpri)) {
2626 			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2627 			return (bestcpu);
2628 		}
2629 		if (besthomecpu == NULL)
2630 			besthomecpu = bestcpu;
2631 		/*
2632 		 * Add the lgrps we just considered to the "done" set
2633 		 */
2634 		klgrpset_or(done, cur_set);
2635 
2636 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2637 
2638 	/*
2639 	 * The specified priority isn't high enough to run immediately
2640 	 * anywhere, so just return the best CPU from the home lgroup.
2641 	 */
2642 	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2643 	return (besthomecpu);
2644 }
2645 
/*
 * Generic idle cpu function for all processors; it does nothing.
 *
 * A processor that has specific code to execute when idle (say, to stop
 * the pipeline and save power) should define that routine in its
 * processor specific code (module_xx.c) and set the global variable
 * idle_cpu to it.
 */
static void
generic_idle_cpu(void)
{
	/* Deliberately empty: the generic case has no idle-time work. */
}
2657 
2658 /*ARGSUSED*/
2659 static void
2660 generic_enq_thread(cpu_t *cpu, int bound)
2661 {
2662 }
2663 
2664 /*
2665  * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
2666  *	- t->t_cpu is not in this thread's assigned lgrp
2667  *	- the time since the thread last came off t->t_cpu exceeds the
2668  *	  rechoose time for this cpu (ignore this if t is curthread in
2669  *	  which case it's on CPU and t->t_disp_time is inaccurate)
2670  *	- t->t_cpu is presently the target of an offline or partition move
2671  *	  request
2672  */
2673 static cpu_t *
2674 cpu_choose(kthread_t *t, pri_t tpri)
2675 {
2676 	ASSERT(tpri < kpqpri);
2677 
2678 	if ((((lbolt - t->t_disp_time) > rechoose_interval) &&
2679 	    t != curthread) || t->t_cpu == cpu_inmotion) {
2680 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
2681 	}
2682 
2683 	/*
2684 	 * Take a trip through disp_lowpri_cpu() if the thread was
2685 	 * running outside it's home lgroup
2686 	 */
2687 	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
2688 	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
2689 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
2690 		    (t == curthread) ? t->t_cpu : NULL));
2691 	}
2692 	return (t->t_cpu);
2693 }
2694