xref: /titanic_52/usr/src/uts/common/disp/disp.c (revision 9a7670889e9c36ec355371e6b02f2d9084f040dc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/signal.h>
36 #include <sys/user.h>
37 #include <sys/systm.h>
38 #include <sys/sysinfo.h>
39 #include <sys/var.h>
40 #include <sys/errno.h>
41 #include <sys/cmn_err.h>
42 #include <sys/debug.h>
43 #include <sys/inline.h>
44 #include <sys/disp.h>
45 #include <sys/class.h>
46 #include <sys/bitmap.h>
47 #include <sys/kmem.h>
48 #include <sys/cpuvar.h>
49 #include <sys/vtrace.h>
50 #include <sys/tnf.h>
51 #include <sys/cpupart.h>
52 #include <sys/lgrp.h>
53 #include <sys/chip.h>
54 #include <sys/schedctl.h>
55 #include <sys/atomic.h>
56 #include <sys/dtrace.h>
57 #include <sys/sdt.h>
58 
59 #include <vm/as.h>
60 
61 #define	BOUND_CPU	0x1
62 #define	BOUND_PARTITION	0x2
63 #define	BOUND_INTR	0x4
64 
65 /* Dispatch queue allocation structure and functions */
66 struct disp_queue_info {
67 	disp_t	*dp;
68 	dispq_t *olddispq;
69 	dispq_t *newdispq;
70 	ulong_t	*olddqactmap;
71 	ulong_t	*newdqactmap;
72 	int	oldnglobpris;
73 };
74 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
75     disp_t *dp);
76 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
77 static void	disp_dq_free(struct disp_queue_info *dptr);
78 
79 /* platform-specific routine to call when processor is idle */
80 static void	generic_idle_cpu();
81 void		(*idle_cpu)() = generic_idle_cpu;
82 
83 /* routines invoked when a CPU enters/exits the idle loop */
84 static void	idle_enter();
85 static void	idle_exit();
86 
87 /* platform-specific routine to call when thread is enqueued */
88 static void	generic_enq_thread(cpu_t *, int);
89 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
90 
91 pri_t	kpreemptpri;		/* priority where kernel preemption applies */
92 pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
93 pri_t	intr_pri;		/* interrupt thread priority base level */
94 
95 #define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
96 pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
97 disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
98 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
99 int	nswapped;		/* total number of swapped threads */
100 void	disp_swapped_enq(kthread_t *tp);
101 static void	disp_swapped_setrun(kthread_t *tp);
102 static void	cpu_resched(cpu_t *cp, pri_t tpri);
103 
104 /*
105  * If this is set, only interrupt threads will cause kernel preemptions.
106  * This is done by changing the value of kpreemptpri.  kpreemptpri
107  * will either be the max sysclass pri + 1 or the min interrupt pri.
108  */
109 int	only_intr_kpreempt;
110 
111 extern void set_idle_cpu(int cpun);
112 extern void unset_idle_cpu(int cpun);
113 static void setkpdq(kthread_t *tp, int borf);
114 #define	SETKP_BACK	0
115 #define	SETKP_FRONT	1
116 /*
117  * Parameter that determines how recently a thread must have run
118  * on the CPU to be considered loosely-bound to that CPU to reduce
119  * cold cache effects.  The interval is in hertz.
120  *
121  * The platform may define a per physical processor adjustment of
122  * this parameter. For efficiency, the effective rechoose interval
123  * (rechoose_interval + per chip adjustment) is maintained in the
124  * cpu structures. See cpu_choose()
125  */
126 int	rechoose_interval = RECHOOSE_INTERVAL;
127 static cpu_t	*cpu_choose(kthread_t *, pri_t);
128 
129 /*
130  * Parameter that determines how long (in nanoseconds) a thread must
131  * be sitting on a run queue before it can be stolen by another CPU
132  * to reduce migrations.  The interval is in nanoseconds.
133  *
134  * The nosteal_nsec should be set by a platform code to an appropriate value.
135  *
136  */
137 hrtime_t nosteal_nsec = 0;
138 
139 /*
140  * Value of nosteal_nsec meaning that nosteal optimization should be disabled
141  */
142 #define	NOSTEAL_DISABLED 1
143 
144 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
145 
146 disp_lock_t	transition_lock;	/* lock on transitioning threads */
147 disp_lock_t	stop_lock;		/* lock on stopped threads */
148 
149 static void	cpu_dispqalloc(int numpris);
150 
151 /*
152  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
153  * a thread because it was sitting on its run queue for a very short
154  * period of time.
155  */
156 #define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
157 
158 static kthread_t	*disp_getwork(cpu_t *to);
159 static kthread_t	*disp_getbest(disp_t *from);
160 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
161 
162 void	swtch_to(kthread_t *);
163 
164 /*
165  * dispatcher and scheduler initialization
166  */
167 
168 /*
169  * disp_setup - Common code to calculate and allocate dispatcher
170  *		variables and structures based on the maximum priority.
171  */
172 static void
173 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
174 {
175 	pri_t	newnglobpris;
176 
177 	ASSERT(MUTEX_HELD(&cpu_lock));
178 
179 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
180 
181 	if (newnglobpris > oldnglobpris) {
182 		/*
183 		 * Allocate new kp queues for each CPU partition.
184 		 */
185 		cpupart_kpqalloc(newnglobpris);
186 
187 		/*
188 		 * Allocate new dispatch queues for each CPU.
189 		 */
190 		cpu_dispqalloc(newnglobpris);
191 
192 		/*
193 		 * compute new interrupt thread base priority
194 		 */
195 		intr_pri = maxglobpri;
196 		if (only_intr_kpreempt) {
197 			kpreemptpri = intr_pri + 1;
198 			if (kpqpri == KPQPRI)
199 				kpqpri = kpreemptpri;
200 		}
201 		v.v_nglobpris = newnglobpris;
202 	}
203 }
204 
205 /*
206  * dispinit - Called to initialize all loaded classes and the
207  *	      dispatcher framework.
208  */
209 void
210 dispinit(void)
211 {
212 	id_t	cid;
213 	pri_t	maxglobpri;
214 	pri_t	cl_maxglobpri;
215 
216 	maxglobpri = -1;
217 
218 	/*
219 	 * Initialize transition lock, which will always be set.
220 	 */
221 	DISP_LOCK_INIT(&transition_lock);
222 	disp_lock_enter_high(&transition_lock);
223 	DISP_LOCK_INIT(&stop_lock);
224 
225 	mutex_enter(&cpu_lock);
226 	CPU->cpu_disp->disp_maxrunpri = -1;
227 	CPU->cpu_disp->disp_max_unbound_pri = -1;
228 	/*
229 	 * Initialize the default CPU partition.
230 	 */
231 	cpupart_initialize_default();
232 	/*
233 	 * Call the class specific initialization functions for
234 	 * all pre-installed schedulers.
235 	 *
236 	 * We pass the size of a class specific parameter
237 	 * buffer to each of the initialization functions
238 	 * to try to catch problems with backward compatibility
239 	 * of class modules.
240 	 *
241 	 * For example a new class module running on an old system
242 	 * which didn't provide sufficiently large parameter buffers
243 	 * would be bad news. Class initialization modules can check for
244 	 * this and take action if they detect a problem.
245 	 */
246 
247 	for (cid = 0; cid < nclass; cid++) {
248 		sclass_t	*sc;
249 
250 		sc = &sclass[cid];
251 		if (SCHED_INSTALLED(sc)) {
252 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
253 			    &sc->cl_funcs);
254 			if (cl_maxglobpri > maxglobpri)
255 				maxglobpri = cl_maxglobpri;
256 		}
257 	}
258 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
259 	if (kpqpri == KPQPRI)
260 		kpqpri = kpreemptpri;
261 
262 	ASSERT(maxglobpri >= 0);
263 	disp_setup(maxglobpri, 0);
264 
265 	mutex_exit(&cpu_lock);
266 
267 	/*
268 	 * Get the default class ID; this may be later modified via
269 	 * dispadmin(1M).  This will load the class (normally TS) and that will
270 	 * call disp_add(), which is why we had to drop cpu_lock first.
271 	 */
272 	if (getcid(defaultclass, &defaultcid) != 0) {
273 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
274 		    defaultclass);
275 	}
276 }
277 
278 /*
279  * disp_add - Called with class pointer to initialize the dispatcher
280  *	      for a newly loaded class.
281  */
282 void
283 disp_add(sclass_t *clp)
284 {
285 	pri_t	maxglobpri;
286 	pri_t	cl_maxglobpri;
287 
288 	mutex_enter(&cpu_lock);
289 	/*
290 	 * Initialize the scheduler class.
291 	 */
292 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
293 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
294 	if (cl_maxglobpri > maxglobpri)
295 		maxglobpri = cl_maxglobpri;
296 
297 	/*
298 	 * Save old queue information.  Since we're initializing a
299 	 * new scheduling class which has just been loaded, then
300 	 * the size of the dispq may have changed.  We need to handle
301 	 * that here.
302 	 */
303 	disp_setup(maxglobpri, v.v_nglobpris);
304 
305 	mutex_exit(&cpu_lock);
306 }
307 
308 
309 /*
310  * For each CPU, allocate new dispatch queues
311  * with the stated number of priorities.
312  */
313 static void
314 cpu_dispqalloc(int numpris)
315 {
316 	cpu_t	*cpup;
317 	struct disp_queue_info	*disp_mem;
318 	int i, num;
319 
320 	ASSERT(MUTEX_HELD(&cpu_lock));
321 
322 	disp_mem = kmem_zalloc(NCPU *
323 	    sizeof (struct disp_queue_info), KM_SLEEP);
324 
325 	/*
326 	 * This routine must allocate all of the memory before stopping
327 	 * the cpus because it must not sleep in kmem_alloc while the
328 	 * CPUs are stopped.  Locks they hold will not be freed until they
329 	 * are restarted.
330 	 */
331 	i = 0;
332 	cpup = cpu_list;
333 	do {
334 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
335 		i++;
336 		cpup = cpup->cpu_next;
337 	} while (cpup != cpu_list);
338 	num = i;
339 
340 	pause_cpus(NULL);
341 	for (i = 0; i < num; i++)
342 		disp_dq_assign(&disp_mem[i], numpris);
343 	start_cpus();
344 
345 	/*
346 	 * I must free all of the memory after starting the cpus because
347 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
348 	 */
349 	for (i = 0; i < num; i++)
350 		disp_dq_free(&disp_mem[i]);
351 
352 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
353 }
354 
355 static void
356 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
357 {
358 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
359 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
360 	    sizeof (long), KM_SLEEP);
361 	dptr->dp = dp;
362 }
363 
364 static void
365 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
366 {
367 	disp_t	*dp;
368 
369 	dp = dptr->dp;
370 	dptr->olddispq = dp->disp_q;
371 	dptr->olddqactmap = dp->disp_qactmap;
372 	dptr->oldnglobpris = dp->disp_npri;
373 
374 	ASSERT(dptr->oldnglobpris < numpris);
375 
376 	if (dptr->olddispq != NULL) {
377 		/*
378 		 * Use kcopy because bcopy is platform-specific
379 		 * and could block while we might have paused the cpus.
380 		 */
381 		(void) kcopy(dptr->olddispq, dptr->newdispq,
382 		    dptr->oldnglobpris * sizeof (dispq_t));
383 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
384 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
385 		    sizeof (long));
386 	}
387 	dp->disp_q = dptr->newdispq;
388 	dp->disp_qactmap = dptr->newdqactmap;
389 	dp->disp_q_limit = &dptr->newdispq[numpris];
390 	dp->disp_npri = numpris;
391 }
392 
393 static void
394 disp_dq_free(struct disp_queue_info *dptr)
395 {
396 	if (dptr->olddispq != NULL)
397 		kmem_free(dptr->olddispq,
398 		    dptr->oldnglobpris * sizeof (dispq_t));
399 	if (dptr->olddqactmap != NULL)
400 		kmem_free(dptr->olddqactmap,
401 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
402 }
403 
404 /*
405  * For a newly created CPU, initialize the dispatch queue.
406  * This is called before the CPU is known through cpu[] or on any lists.
407  */
408 void
409 disp_cpu_init(cpu_t *cp)
410 {
411 	disp_t	*dp;
412 	dispq_t	*newdispq;
413 	ulong_t	*newdqactmap;
414 
415 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
416 
417 	if (cp == cpu0_disp.disp_cpu)
418 		dp = &cpu0_disp;
419 	else
420 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
421 	bzero(dp, sizeof (disp_t));
422 	cp->cpu_disp = dp;
423 	dp->disp_cpu = cp;
424 	dp->disp_maxrunpri = -1;
425 	dp->disp_max_unbound_pri = -1;
426 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
427 	/*
428 	 * Allocate memory for the dispatcher queue headers
429 	 * and the active queue bitmap.
430 	 */
431 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
432 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
433 	    sizeof (long), KM_SLEEP);
434 	dp->disp_q = newdispq;
435 	dp->disp_qactmap = newdqactmap;
436 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
437 	dp->disp_npri = v.v_nglobpris;
438 }
439 
440 void
441 disp_cpu_fini(cpu_t *cp)
442 {
443 	ASSERT(MUTEX_HELD(&cpu_lock));
444 
445 	disp_kp_free(cp->cpu_disp);
446 	if (cp->cpu_disp != &cpu0_disp)
447 		kmem_free(cp->cpu_disp, sizeof (disp_t));
448 }
449 
450 /*
451  * Allocate new, larger kpreempt dispatch queue to replace the old one.
452  */
453 void
454 disp_kp_alloc(disp_t *dq, pri_t npri)
455 {
456 	struct disp_queue_info	mem_info;
457 
458 	if (npri > dq->disp_npri) {
459 		/*
460 		 * Allocate memory for the new array.
461 		 */
462 		disp_dq_alloc(&mem_info, npri, dq);
463 
464 		/*
465 		 * We need to copy the old structures to the new
466 		 * and free the old.
467 		 */
468 		disp_dq_assign(&mem_info, npri);
469 		disp_dq_free(&mem_info);
470 	}
471 }
472 
473 /*
474  * Free dispatch queue.
475  * Used for the kpreempt queues for a removed CPU partition and
476  * for the per-CPU queues of deleted CPUs.
477  */
478 void
479 disp_kp_free(disp_t *dq)
480 {
481 	struct disp_queue_info	mem_info;
482 
483 	mem_info.olddispq = dq->disp_q;
484 	mem_info.olddqactmap = dq->disp_qactmap;
485 	mem_info.oldnglobpris = dq->disp_npri;
486 	disp_dq_free(&mem_info);
487 }
488 
489 /*
490  * End dispatcher and scheduler initialization.
491  */
492 
493 /*
494  * See if there's anything to do other than remain idle.
495  * Return non-zero if there is.
496  *
497  * This function must be called with high spl, or with
498  * kernel preemption disabled to prevent the partition's
499  * active cpu list from changing while being traversed.
500  *
501  */
502 int
503 disp_anywork(void)
504 {
505 	cpu_t   *cp = CPU;
506 	cpu_t   *ocp;
507 
508 	if (cp->cpu_disp->disp_nrunnable != 0)
509 		return (1);
510 
511 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
512 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
513 			return (1);
514 
515 		/*
516 		 * Work can be taken from another CPU if:
517 		 *	- There is unbound work on the run queue
518 		 *	- That work isn't a thread undergoing a
519 		 *	- context switch on an otherwise empty queue.
520 		 *	- The CPU isn't running the idle loop.
521 		 */
522 		for (ocp = cp->cpu_next_part; ocp != cp;
523 		    ocp = ocp->cpu_next_part) {
524 			ASSERT(CPU_ACTIVE(ocp));
525 
526 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
527 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
528 			    ocp->cpu_disp->disp_nrunnable == 1) &&
529 			    ocp->cpu_dispatch_pri != -1)
530 				return (1);
531 		}
532 	}
533 	return (0);
534 }
535 
536 /*
537  * Called when CPU enters the idle loop
538  */
539 static void
540 idle_enter()
541 {
542 	cpu_t		*cp = CPU;
543 
544 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
545 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
546 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
547 }
548 
549 /*
550  * Called when CPU exits the idle loop
551  */
552 static void
553 idle_exit()
554 {
555 	cpu_t		*cp = CPU;
556 
557 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
558 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
559 }
560 
561 /*
562  * Idle loop.
563  */
564 void
565 idle()
566 {
567 	struct cpu	*cp = CPU;		/* pointer to this CPU */
568 	kthread_t	*t;			/* taken thread */
569 
570 	idle_enter();
571 
572 	/*
573 	 * Uniprocessor version of idle loop.
574 	 * Do this until notified that we're on an actual multiprocessor.
575 	 */
576 	while (ncpus == 1) {
577 		if (cp->cpu_disp->disp_nrunnable == 0) {
578 			(*idle_cpu)();
579 			continue;
580 		}
581 		idle_exit();
582 		swtch();
583 
584 		idle_enter(); /* returned from swtch */
585 	}
586 
587 	/*
588 	 * Multiprocessor idle loop.
589 	 */
590 	for (;;) {
591 		/*
592 		 * If CPU is completely quiesced by p_online(2), just wait
593 		 * here with minimal bus traffic until put online.
594 		 */
595 		while (cp->cpu_flags & CPU_QUIESCED)
596 			(*idle_cpu)();
597 
598 		if (cp->cpu_disp->disp_nrunnable != 0) {
599 			idle_exit();
600 			swtch();
601 		} else {
602 			if (cp->cpu_flags & CPU_OFFLINE)
603 				continue;
604 			if ((t = disp_getwork(cp)) == NULL) {
605 				if (cp->cpu_chosen_level != -1) {
606 					disp_t *dp = cp->cpu_disp;
607 					disp_t *kpq;
608 
609 					disp_lock_enter(&dp->disp_lock);
610 					/*
611 					 * Set kpq under lock to prevent
612 					 * migration between partitions.
613 					 */
614 					kpq = &cp->cpu_part->cp_kp_queue;
615 					if (kpq->disp_maxrunpri == -1)
616 						cp->cpu_chosen_level = -1;
617 					disp_lock_exit(&dp->disp_lock);
618 				}
619 				(*idle_cpu)();
620 				continue;
621 			}
622 			/*
623 			 * If there was a thread but we couldn't steal
624 			 * it, then keep trying.
625 			 */
626 			if (t == T_DONTSTEAL)
627 				continue;
628 			idle_exit();
629 			swtch_to(t);
630 		}
631 		idle_enter(); /* returned from swtch/swtch_to */
632 	}
633 }
634 
635 
636 /*
637  * Preempt the currently running thread in favor of the highest
638  * priority thread.  The class of the current thread controls
639  * where it goes on the dispatcher queues. If panicking, turn
640  * preemption off.
641  */
642 void
643 preempt()
644 {
645 	kthread_t 	*t = curthread;
646 	klwp_t 		*lwp = ttolwp(curthread);
647 
648 	if (panicstr)
649 		return;
650 
651 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
652 
653 	thread_lock(t);
654 
655 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
656 		/*
657 		 * this thread has already been chosen to be run on
658 		 * another CPU. Clear kprunrun on this CPU since we're
659 		 * already headed for swtch().
660 		 */
661 		CPU->cpu_kprunrun = 0;
662 		thread_unlock_nopreempt(t);
663 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
664 	} else {
665 		if (lwp != NULL)
666 			lwp->lwp_ru.nivcsw++;
667 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
668 		THREAD_TRANSITION(t);
669 		CL_PREEMPT(t);
670 		DTRACE_SCHED(preempt);
671 		thread_unlock_nopreempt(t);
672 
673 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
674 
675 		swtch();		/* clears CPU->cpu_runrun via disp() */
676 	}
677 }
678 
679 extern kthread_t *thread_unpin();
680 
681 /*
682  * disp() - find the highest priority thread for this processor to run, and
683  * set it in TS_ONPROC state so that resume() can be called to run it.
684  */
685 static kthread_t *
686 disp()
687 {
688 	cpu_t		*cpup;
689 	disp_t		*dp;
690 	kthread_t	*tp;
691 	dispq_t		*dq;
692 	int		maxrunword;
693 	pri_t		pri;
694 	disp_t		*kpq;
695 
696 	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
697 
698 	cpup = CPU;
699 	/*
700 	 * Find the highest priority loaded, runnable thread.
701 	 */
702 	dp = cpup->cpu_disp;
703 
704 reschedule:
705 	/*
706 	 * If there is more important work on the global queue with a better
707 	 * priority than the maximum on this CPU, take it now.
708 	 */
709 	kpq = &cpup->cpu_part->cp_kp_queue;
710 	while ((pri = kpq->disp_maxrunpri) >= 0 &&
711 	    pri >= dp->disp_maxrunpri &&
712 	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
713 	    (tp = disp_getbest(kpq)) != NULL) {
714 		if (disp_ratify(tp, kpq) != NULL) {
715 			TRACE_1(TR_FAC_DISP, TR_DISP_END,
716 			    "disp_end:tid %p", tp);
717 			return (tp);
718 		}
719 	}
720 
721 	disp_lock_enter(&dp->disp_lock);
722 	pri = dp->disp_maxrunpri;
723 
724 	/*
725 	 * If there is nothing to run, look at what's runnable on other queues.
726 	 * Choose the idle thread if the CPU is quiesced.
727 	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
728 	 * interrupt threads, which will be the only threads on the CPU's own
729 	 * queue, but cannot run threads from other queues.
730 	 */
731 	if (pri == -1) {
732 		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
733 			disp_lock_exit(&dp->disp_lock);
734 			if ((tp = disp_getwork(cpup)) == NULL ||
735 			    tp == T_DONTSTEAL) {
736 				tp = cpup->cpu_idle_thread;
737 				(void) splhigh();
738 				THREAD_ONPROC(tp, cpup);
739 				cpup->cpu_dispthread = tp;
740 				cpup->cpu_dispatch_pri = -1;
741 				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
742 				cpup->cpu_chosen_level = -1;
743 			}
744 		} else {
745 			disp_lock_exit_high(&dp->disp_lock);
746 			tp = cpup->cpu_idle_thread;
747 			THREAD_ONPROC(tp, cpup);
748 			cpup->cpu_dispthread = tp;
749 			cpup->cpu_dispatch_pri = -1;
750 			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
751 			cpup->cpu_chosen_level = -1;
752 		}
753 		TRACE_1(TR_FAC_DISP, TR_DISP_END,
754 			"disp_end:tid %p", tp);
755 		return (tp);
756 	}
757 
758 	dq = &dp->disp_q[pri];
759 	tp = dq->dq_first;
760 
761 	ASSERT(tp != NULL);
762 	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */
763 
764 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
765 
766 	/*
767 	 * Found it so remove it from queue.
768 	 */
769 	dp->disp_nrunnable--;
770 	dq->dq_sruncnt--;
771 	if ((dq->dq_first = tp->t_link) == NULL) {
772 		ulong_t	*dqactmap = dp->disp_qactmap;
773 
774 		ASSERT(dq->dq_sruncnt == 0);
775 		dq->dq_last = NULL;
776 
777 		/*
778 		 * The queue is empty, so the corresponding bit needs to be
779 		 * turned off in dqactmap.   If nrunnable != 0 just took the
780 		 * last runnable thread off the
781 		 * highest queue, so recompute disp_maxrunpri.
782 		 */
783 		maxrunword = pri >> BT_ULSHIFT;
784 		dqactmap[maxrunword] &= ~BT_BIW(pri);
785 
786 		if (dp->disp_nrunnable == 0) {
787 			dp->disp_max_unbound_pri = -1;
788 			dp->disp_maxrunpri = -1;
789 		} else {
790 			int ipri;
791 
792 			ipri = bt_gethighbit(dqactmap, maxrunword);
793 			dp->disp_maxrunpri = ipri;
794 			if (ipri < dp->disp_max_unbound_pri)
795 				dp->disp_max_unbound_pri = ipri;
796 		}
797 	} else {
798 		tp->t_link = NULL;
799 	}
800 
801 	/*
802 	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
803 	 * out this thread before we have a chance to run it.
804 	 * While running, it is protected against swapping by t_lock.
805 	 */
806 	tp->t_schedflag |= TS_DONT_SWAP;
807 	cpup->cpu_dispthread = tp;		/* protected by spl only */
808 	cpup->cpu_dispatch_pri = pri;
809 	ASSERT(pri == DISP_PRIO(tp));
810 	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
811 	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */
812 
813 	ASSERT(tp != NULL);
814 	TRACE_1(TR_FAC_DISP, TR_DISP_END,
815 		"disp_end:tid %p", tp);
816 
817 	if (disp_ratify(tp, kpq) == NULL)
818 		goto reschedule;
819 
820 	return (tp);
821 }
822 
823 /*
824  * swtch()
825  *	Find best runnable thread and run it.
826  *	Called with the current thread already switched to a new state,
827  *	on a sleep queue, run queue, stopped, and not zombied.
828  *	May be called at any spl level less than or equal to LOCK_LEVEL.
829  *	Always drops spl to the base level (spl0()).
830  */
831 void
832 swtch()
833 {
834 	kthread_t	*t = curthread;
835 	kthread_t	*next;
836 	cpu_t		*cp;
837 
838 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
839 
840 	if (t->t_flag & T_INTR_THREAD)
841 		cpu_intr_swtch_enter(t);
842 
843 	if (t->t_intr != NULL) {
844 		/*
845 		 * We are an interrupt thread.  Setup and return
846 		 * the interrupted thread to be resumed.
847 		 */
848 		(void) splhigh();	/* block other scheduler action */
849 		cp = CPU;		/* now protected against migration */
850 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
851 		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
852 		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
853 		next = thread_unpin();
854 		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
855 		resume_from_intr(next);
856 	} else {
857 #ifdef	DEBUG
858 		if (t->t_state == TS_ONPROC &&
859 		    t->t_disp_queue->disp_cpu == CPU &&
860 		    t->t_preempt == 0) {
861 			thread_lock(t);
862 			ASSERT(t->t_state != TS_ONPROC ||
863 			    t->t_disp_queue->disp_cpu != CPU ||
864 			    t->t_preempt != 0);	/* cannot migrate */
865 			thread_unlock_nopreempt(t);
866 		}
867 #endif	/* DEBUG */
868 		cp = CPU;
869 		next = disp();		/* returns with spl high */
870 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
871 
872 		/* OK to steal anything left on run queue */
873 		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
874 
875 		if (next != t) {
876 			if (t == cp->cpu_idle_thread) {
877 				CHIP_NRUNNING(cp->cpu_chip, 1);
878 			} else if (next == cp->cpu_idle_thread) {
879 				CHIP_NRUNNING(cp->cpu_chip, -1);
880 			}
881 
882 			/*
883 			 * If t was previously in the TS_ONPROC state,
884 			 * setfrontdq and setbackdq won't have set its t_waitrq.
885 			 * Since we now finally know that we're switching away
886 			 * from this thread, set its t_waitrq if it is on a run
887 			 * queue.
888 			 */
889 			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
890 				t->t_waitrq = gethrtime_unscaled();
891 			}
892 
893 			/*
894 			 * restore mstate of thread that we are switching to
895 			 */
896 			restore_mstate(next);
897 
898 			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
899 			cp->cpu_last_swtch = t->t_disp_time = lbolt;
900 			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
901 
902 			if (dtrace_vtime_active)
903 				dtrace_vtime_switch(next);
904 
905 			resume(next);
906 			/*
907 			 * The TR_RESUME_END and TR_SWTCH_END trace points
908 			 * appear at the end of resume(), because we may not
909 			 * return here
910 			 */
911 		} else {
912 			if (t->t_flag & T_INTR_THREAD)
913 				cpu_intr_swtch_exit(t);
914 
915 			DTRACE_SCHED(remain__cpu);
916 			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
917 			(void) spl0();
918 		}
919 	}
920 }
921 
922 /*
923  * swtch_from_zombie()
924  *	Special case of swtch(), which allows checks for TS_ZOMB to be
925  *	eliminated from normal resume.
926  *	Find best runnable thread and run it.
927  *	Called with the current thread zombied.
928  *	Zombies cannot migrate, so CPU references are safe.
929  */
930 void
931 swtch_from_zombie()
932 {
933 	kthread_t	*next;
934 	cpu_t		*cpu = CPU;
935 
936 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
937 
938 	ASSERT(curthread->t_state == TS_ZOMB);
939 
940 	next = disp();			/* returns with spl high */
941 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
942 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
943 	ASSERT(next != curthread);
944 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
945 
946 	if (next == cpu->cpu_idle_thread)
947 		CHIP_NRUNNING(cpu->cpu_chip, -1);
948 
949 	restore_mstate(next);
950 
951 	if (dtrace_vtime_active)
952 		dtrace_vtime_switch(next);
953 
954 	resume_from_zombie(next);
955 	/*
956 	 * The TR_RESUME_END and TR_SWTCH_END trace points
957 	 * appear at the end of resume(), because we certainly will not
958 	 * return here
959 	 */
960 }
961 
962 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
963 static int
964 thread_on_queue(kthread_t *tp)
965 {
966 	cpu_t	*cp;
967 	cpu_t	*self;
968 	disp_t	*dp;
969 
970 	self = CPU;
971 	cp = self->cpu_next_onln;
972 	dp = cp->cpu_disp;
973 	for (;;) {
974 		dispq_t		*dq;
975 		dispq_t		*eq;
976 
977 		disp_lock_enter_high(&dp->disp_lock);
978 		for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
979 			kthread_t	*rp;
980 
981 			ASSERT(dq->dq_last == NULL ||
982 				dq->dq_last->t_link == NULL);
983 			for (rp = dq->dq_first; rp; rp = rp->t_link)
984 				if (tp == rp) {
985 					disp_lock_exit_high(&dp->disp_lock);
986 					return (1);
987 				}
988 		}
989 		disp_lock_exit_high(&dp->disp_lock);
990 		if (cp == NULL)
991 			break;
992 		if (cp == self) {
993 			cp = NULL;
994 			dp = &cp->cpu_part->cp_kp_queue;
995 		} else {
996 			cp = cp->cpu_next_onln;
997 			dp = cp->cpu_disp;
998 		}
999 	}
1000 	return (0);
1001 }	/* end of thread_on_queue */
1002 #else
1003 
1004 #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
1005 
1006 #endif  /* DEBUG */
1007 
1008 /*
1009  * like swtch(), but switch to a specified thread taken from another CPU.
1010  *	called with spl high..
1011  */
1012 void
1013 swtch_to(kthread_t *next)
1014 {
1015 	cpu_t			*cp = CPU;
1016 
1017 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1018 
1019 	/*
1020 	 * Update context switch statistics.
1021 	 */
1022 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1023 
1024 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1025 
1026 	if (curthread == cp->cpu_idle_thread)
1027 		CHIP_NRUNNING(cp->cpu_chip, 1);
1028 
1029 	/* OK to steal anything left on run queue */
1030 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1031 
1032 	/* record last execution time */
1033 	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;
1034 
1035 	/*
1036 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1037 	 * won't have set its t_waitrq.  Since we now finally know that we're
1038 	 * switching away from this thread, set its t_waitrq if it is on a run
1039 	 * queue.
1040 	 */
1041 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1042 		curthread->t_waitrq = gethrtime_unscaled();
1043 	}
1044 
1045 	/* restore next thread to previously running microstate */
1046 	restore_mstate(next);
1047 
1048 	if (dtrace_vtime_active)
1049 		dtrace_vtime_switch(next);
1050 
1051 	resume(next);
1052 	/*
1053 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1054 	 * appear at the end of resume(), because we may not
1055 	 * return here
1056 	 */
1057 }
1058 
1059 
1060 
1061 #define	CPU_IDLING(pri)	((pri) == -1)
1062 
1063 static void
1064 cpu_resched(cpu_t *cp, pri_t tpri)
1065 {
1066 	int	call_poke_cpu = 0;
1067 	pri_t   cpupri = cp->cpu_dispatch_pri;
1068 
1069 	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1070 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1071 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1072 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1073 			cp->cpu_runrun = 1;
1074 			aston(cp->cpu_dispthread);
1075 			if (tpri < kpreemptpri && cp != CPU)
1076 				call_poke_cpu = 1;
1077 		}
1078 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1079 			cp->cpu_kprunrun = 1;
1080 			if (cp != CPU)
1081 				call_poke_cpu = 1;
1082 		}
1083 	}
1084 
1085 	/*
1086 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1087 	 */
1088 	membar_enter();
1089 
1090 	if (call_poke_cpu)
1091 		poke_cpu(cp->cpu_id);
1092 }
1093 
1094 /*
1095  * Routine used by setbackdq() to balance load across the physical
1096  * processors. Returns a CPU of a lesser loaded chip in the lgroup
1097  * if balancing is necessary, or the "hint" CPU if it's not.
1098  *
1099  * - tp is the thread being enqueued
1100  * - cp is a hint CPU (chosen by cpu_choose()).
1101  * - curchip (if not NULL) is the chip on which the current thread
1102  *   is running.
1103  *
1104  * The thread lock for "tp" must be held while calling this routine.
1105  */
1106 static cpu_t *
1107 chip_balance(kthread_t *tp, cpu_t *cp, chip_t *curchip)
1108 {
1109 	int	chp_nrun, ochp_nrun;
1110 	chip_t	*chp, *nchp;
1111 
1112 	chp = cp->cpu_chip;
1113 	chp_nrun = chp->chip_nrunning;
1114 
1115 	if (chp == curchip)
1116 		chp_nrun--;	/* Ignore curthread */
1117 
1118 	/*
1119 	 * If this chip isn't at all idle, then let
1120 	 * run queue balancing do the work.
1121 	 */
1122 	if (chp_nrun == chp->chip_ncpu)
1123 		return (cp);
1124 
1125 	nchp = chp->chip_balance;
1126 	do {
1127 		if (nchp == chp ||
1128 		    !CHIP_IN_CPUPART(nchp, tp->t_cpupart))
1129 			continue;
1130 
1131 		ochp_nrun = nchp->chip_nrunning;
1132 
1133 		/*
1134 		 * If the other chip is running less threads,
1135 		 * or if it's running the same number of threads, but
1136 		 * has more online logical CPUs, then choose to balance.
1137 		 */
1138 		if (chp_nrun > ochp_nrun ||
1139 		    (chp_nrun == ochp_nrun &&
1140 		    nchp->chip_ncpu > chp->chip_ncpu)) {
1141 			cp = nchp->chip_cpus;
1142 			nchp->chip_cpus = cp->cpu_next_chip;
1143 
1144 			/*
1145 			 * Find a CPU on the chip in the correct
1146 			 * partition. We know at least one exists
1147 			 * because of the CHIP_IN_CPUPART() check above.
1148 			 */
1149 			while (cp->cpu_part != tp->t_cpupart)
1150 				cp = cp->cpu_next_chip;
1151 		}
1152 		chp->chip_balance = nchp->chip_next_lgrp;
1153 		break;
1154 	} while ((nchp = nchp->chip_next_lgrp) != chp->chip_balance);
1155 
1156 	ASSERT(CHIP_IN_CPUPART(cp->cpu_chip, tp->t_cpupart));
1157 	return (cp);
1158 }
1159 
1160 /*
1161  * setbackdq() keeps runqs balanced such that the difference in length
1162  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1163  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1164  * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1165  * try to keep runqs perfectly balanced regardless of the thread priority.
1166  */
1167 #define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
1168 #define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
1169 #define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1170 
1171 /*
1172  * Put the specified thread on the back of the dispatcher
1173  * queue corresponding to its current priority.
1174  *
1175  * Called with the thread in transition, onproc or stopped state
1176  * and locked (transition implies locked) and at high spl.
1177  * Returns with the thread in TS_RUN state and still locked.
1178  */
1179 void
1180 setbackdq(kthread_t *tp)
1181 {
1182 	dispq_t	*dq;
1183 	disp_t		*dp;
1184 	chip_t		*curchip = NULL;
1185 	cpu_t		*cp;
1186 	pri_t		tpri;
1187 	int		bound;
1188 
1189 	ASSERT(THREAD_LOCK_HELD(tp));
1190 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1191 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1192 
1193 	/*
1194 	 * If thread is "swapped" or on the swap queue don't
1195 	 * queue it, but wake sched.
1196 	 */
1197 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1198 		disp_swapped_setrun(tp);
1199 		return;
1200 	}
1201 
1202 	tpri = DISP_PRIO(tp);
1203 	if (tp == curthread) {
1204 		curchip = CPU->cpu_chip;
1205 	}
1206 
1207 	if (ncpus == 1)
1208 		cp = tp->t_cpu;
1209 	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
1210 		if (tpri >= kpqpri) {
1211 			setkpdq(tp, SETKP_BACK);
1212 			return;
1213 		}
1214 		/*
1215 		 * Let cpu_choose suggest a CPU.
1216 		 */
1217 		cp = cpu_choose(tp, tpri);
1218 
1219 		if (tp->t_cpupart == cp->cpu_part) {
1220 			int	qlen;
1221 
1222 			/*
1223 			 * Select another CPU if we need
1224 			 * to do some load balancing across the
1225 			 * physical processors.
1226 			 */
1227 			if (CHIP_SHOULD_BALANCE(cp->cpu_chip))
1228 				cp = chip_balance(tp, cp, curchip);
1229 
1230 			/*
1231 			 * Balance across the run queues
1232 			 */
1233 			qlen = RUNQ_LEN(cp, tpri);
1234 			if (tpri >= RUNQ_MATCH_PRI &&
1235 			    !(tp->t_schedflag & TS_RUNQMATCH))
1236 				qlen -= RUNQ_MAX_DIFF;
1237 			if (qlen > 0) {
1238 				cpu_t *newcp;
1239 
1240 				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1241 					newcp = cp->cpu_next_part;
1242 				} else if ((newcp = cp->cpu_next_lpl) == cp) {
1243 					newcp = cp->cpu_next_part;
1244 				}
1245 
1246 				if (RUNQ_LEN(newcp, tpri) < qlen) {
1247 					DTRACE_PROBE3(runq__balance,
1248 					    kthread_t *, tp,
1249 					    cpu_t *, cp, cpu_t *, newcp);
1250 					cp = newcp;
1251 				}
1252 			}
1253 		} else {
1254 			/*
1255 			 * Migrate to a cpu in the new partition.
1256 			 */
1257 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1258 			    tp->t_lpl, tp->t_pri, NULL);
1259 		}
1260 		bound = 0;
1261 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1262 	} else {
1263 		/*
1264 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1265 		 * a short time until weak binding that existed when the
1266 		 * strong binding was established has dropped) so we must
1267 		 * favour weak binding over strong.
1268 		 */
1269 		cp = tp->t_weakbound_cpu ?
1270 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1271 		bound = 1;
1272 	}
1273 	/*
1274 	 * A thread that is ONPROC may be temporarily placed on the run queue
1275 	 * but then chosen to run again by disp.  If the thread we're placing on
1276 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1277 	 * replacement process is actually scheduled in swtch().  In this
1278 	 * situation, curthread is the only thread that could be in the ONPROC
1279 	 * state.
1280 	 */
1281 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1282 		hrtime_t curtime;
1283 
1284 		curtime = gethrtime_unscaled();
1285 		(void) cpu_update_pct(tp, curtime);
1286 		tp->t_waitrq = curtime;
1287 	} else {
1288 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1289 	}
1290 
1291 	dp = cp->cpu_disp;
1292 	disp_lock_enter_high(&dp->disp_lock);
1293 
1294 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1295 	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1296 		tpri, cp, tp);
1297 
1298 #ifndef NPROBE
1299 	/* Kernel probe */
1300 	if (tnf_tracing_active)
1301 		tnf_thread_queue(tp, cp, tpri);
1302 #endif /* NPROBE */
1303 
1304 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1305 
1306 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1307 	tp->t_disp_queue = dp;
1308 	tp->t_link = NULL;
1309 
1310 	dq = &dp->disp_q[tpri];
1311 	dp->disp_nrunnable++;
1312 	if (!bound)
1313 		dp->disp_steal = 0;
1314 	membar_enter();
1315 
1316 	if (dq->dq_sruncnt++ != 0) {
1317 		ASSERT(dq->dq_first != NULL);
1318 		dq->dq_last->t_link = tp;
1319 		dq->dq_last = tp;
1320 	} else {
1321 		ASSERT(dq->dq_first == NULL);
1322 		ASSERT(dq->dq_last == NULL);
1323 		dq->dq_first = dq->dq_last = tp;
1324 		BT_SET(dp->disp_qactmap, tpri);
1325 		if (tpri > dp->disp_maxrunpri) {
1326 			dp->disp_maxrunpri = tpri;
1327 			membar_enter();
1328 			cpu_resched(cp, tpri);
1329 		}
1330 	}
1331 
1332 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1333 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1334 		    cp == CPU) {
1335 			/*
1336 			 * If there are no other unbound threads on the
1337 			 * run queue, don't allow other CPUs to steal
1338 			 * this thread while we are in the middle of a
1339 			 * context switch. We may just switch to it
1340 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1341 			 * in swtch and swtch_to.
1342 			 */
1343 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1344 		}
1345 		dp->disp_max_unbound_pri = tpri;
1346 	}
1347 	(*disp_enq_thread)(cp, bound);
1348 }
1349 
1350 /*
1351  * Put the specified thread on the front of the dispatcher
1352  * queue corresponding to its current priority.
1353  *
1354  * Called with the thread in transition, onproc or stopped state
1355  * and locked (transition implies locked) and at high spl.
1356  * Returns with the thread in TS_RUN state and still locked.
1357  */
1358 void
1359 setfrontdq(kthread_t *tp)
1360 {
1361 	disp_t		*dp;
1362 	dispq_t		*dq;
1363 	cpu_t		*cp;
1364 	pri_t		tpri;
1365 	int		bound;
1366 
1367 	ASSERT(THREAD_LOCK_HELD(tp));
1368 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1369 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1370 
1371 	/*
1372 	 * If thread is "swapped" or on the swap queue don't
1373 	 * queue it, but wake sched.
1374 	 */
1375 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1376 		disp_swapped_setrun(tp);
1377 		return;
1378 	}
1379 
1380 	tpri = DISP_PRIO(tp);
1381 	if (ncpus == 1)
1382 		cp = tp->t_cpu;
1383 	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
1384 		if (tpri >= kpqpri) {
1385 			setkpdq(tp, SETKP_FRONT);
1386 			return;
1387 		}
1388 		cp = tp->t_cpu;
1389 		if (tp->t_cpupart == cp->cpu_part) {
1390 			/*
1391 			 * If we are of higher or equal priority than
1392 			 * the highest priority runnable thread of
1393 			 * the current CPU, just pick this CPU.  Otherwise
1394 			 * Let cpu_choose() select the CPU.  If this cpu
1395 			 * is the target of an offline request then do not
1396 			 * pick it - a thread_nomigrate() on the in motion
1397 			 * cpu relies on this when it forces a preempt.
1398 			 */
1399 			if (tpri < cp->cpu_disp->disp_maxrunpri ||
1400 			    cp == cpu_inmotion)
1401 				cp = cpu_choose(tp, tpri);
1402 		} else {
1403 			/*
1404 			 * Migrate to a cpu in the new partition.
1405 			 */
1406 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1407 			    tp->t_lpl, tp->t_pri, NULL);
1408 		}
1409 		bound = 0;
1410 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1411 	} else {
1412 		/*
1413 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1414 		 * a short time until weak binding that existed when the
1415 		 * strong binding was established has dropped) so we must
1416 		 * favour weak binding over strong.
1417 		 */
1418 		cp = tp->t_weakbound_cpu ?
1419 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1420 		bound = 1;
1421 	}
1422 
1423 	/*
1424 	 * A thread that is ONPROC may be temporarily placed on the run queue
1425 	 * but then chosen to run again by disp.  If the thread we're placing on
1426 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1427 	 * replacement process is actually scheduled in swtch().  In this
1428 	 * situation, curthread is the only thread that could be in the ONPROC
1429 	 * state.
1430 	 */
1431 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1432 		hrtime_t curtime;
1433 
1434 		curtime = gethrtime_unscaled();
1435 		(void) cpu_update_pct(tp, curtime);
1436 		tp->t_waitrq = curtime;
1437 	} else {
1438 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1439 	}
1440 
1441 	dp = cp->cpu_disp;
1442 	disp_lock_enter_high(&dp->disp_lock);
1443 
1444 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1445 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1446 
1447 #ifndef NPROBE
1448 	/* Kernel probe */
1449 	if (tnf_tracing_active)
1450 		tnf_thread_queue(tp, cp, tpri);
1451 #endif /* NPROBE */
1452 
1453 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1454 
1455 	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
1456 	tp->t_disp_queue = dp;
1457 
1458 	dq = &dp->disp_q[tpri];
1459 	dp->disp_nrunnable++;
1460 	if (!bound)
1461 		dp->disp_steal = 0;
1462 	membar_enter();
1463 
1464 	if (dq->dq_sruncnt++ != 0) {
1465 		ASSERT(dq->dq_last != NULL);
1466 		tp->t_link = dq->dq_first;
1467 		dq->dq_first = tp;
1468 	} else {
1469 		ASSERT(dq->dq_last == NULL);
1470 		ASSERT(dq->dq_first == NULL);
1471 		tp->t_link = NULL;
1472 		dq->dq_first = dq->dq_last = tp;
1473 		BT_SET(dp->disp_qactmap, tpri);
1474 		if (tpri > dp->disp_maxrunpri) {
1475 			dp->disp_maxrunpri = tpri;
1476 			membar_enter();
1477 			cpu_resched(cp, tpri);
1478 		}
1479 	}
1480 
1481 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1482 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1483 		    cp == CPU) {
1484 			/*
1485 			 * If there are no other unbound threads on the
1486 			 * run queue, don't allow other CPUs to steal
1487 			 * this thread while we are in the middle of a
1488 			 * context switch. We may just switch to it
1489 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1490 			 * in swtch and swtch_to.
1491 			 */
1492 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1493 		}
1494 		dp->disp_max_unbound_pri = tpri;
1495 	}
1496 	(*disp_enq_thread)(cp, bound);
1497 }
1498 
1499 /*
1500  * Put a high-priority unbound thread on the kp queue
1501  */
1502 static void
1503 setkpdq(kthread_t *tp, int borf)
1504 {
1505 	dispq_t	*dq;
1506 	disp_t	*dp;
1507 	cpu_t	*cp;
1508 	pri_t	tpri;
1509 
1510 	tpri = DISP_PRIO(tp);
1511 
1512 	dp = &tp->t_cpupart->cp_kp_queue;
1513 	disp_lock_enter_high(&dp->disp_lock);
1514 
1515 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1516 
1517 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1518 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1519 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1520 	tp->t_disp_queue = dp;
1521 	dp->disp_nrunnable++;
1522 	dq = &dp->disp_q[tpri];
1523 
1524 	if (dq->dq_sruncnt++ != 0) {
1525 		if (borf == SETKP_BACK) {
1526 			ASSERT(dq->dq_first != NULL);
1527 			tp->t_link = NULL;
1528 			dq->dq_last->t_link = tp;
1529 			dq->dq_last = tp;
1530 		} else {
1531 			ASSERT(dq->dq_last != NULL);
1532 			tp->t_link = dq->dq_first;
1533 			dq->dq_first = tp;
1534 		}
1535 	} else {
1536 		if (borf == SETKP_BACK) {
1537 			ASSERT(dq->dq_first == NULL);
1538 			ASSERT(dq->dq_last == NULL);
1539 			dq->dq_first = dq->dq_last = tp;
1540 		} else {
1541 			ASSERT(dq->dq_last == NULL);
1542 			ASSERT(dq->dq_first == NULL);
1543 			tp->t_link = NULL;
1544 			dq->dq_first = dq->dq_last = tp;
1545 		}
1546 		BT_SET(dp->disp_qactmap, tpri);
1547 		if (tpri > dp->disp_max_unbound_pri)
1548 			dp->disp_max_unbound_pri = tpri;
1549 		if (tpri > dp->disp_maxrunpri) {
1550 			dp->disp_maxrunpri = tpri;
1551 			membar_enter();
1552 		}
1553 	}
1554 
1555 	cp = tp->t_cpu;
1556 	if (tp->t_cpupart != cp->cpu_part) {
1557 		/* migrate to a cpu in the new partition */
1558 		cp = tp->t_cpupart->cp_cpulist;
1559 	}
1560 	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1561 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1562 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1563 
1564 #ifndef NPROBE
1565 	/* Kernel probe */
1566 	if (tnf_tracing_active)
1567 		tnf_thread_queue(tp, cp, tpri);
1568 #endif /* NPROBE */
1569 
1570 	if (cp->cpu_chosen_level < tpri)
1571 		cp->cpu_chosen_level = tpri;
1572 	cpu_resched(cp, tpri);
1573 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1574 	(*disp_enq_thread)(cp, 0);
1575 }
1576 
1577 /*
1578  * Remove a thread from the dispatcher queue if it is on it.
1579  * It is not an error if it is not found but we return whether
1580  * or not it was found in case the caller wants to check.
1581  */
1582 int
1583 dispdeq(kthread_t *tp)
1584 {
1585 	disp_t		*dp;
1586 	dispq_t		*dq;
1587 	kthread_t	*rp;
1588 	kthread_t	*trp;
1589 	kthread_t	**ptp;
1590 	int		tpri;
1591 
1592 	ASSERT(THREAD_LOCK_HELD(tp));
1593 
1594 	if (tp->t_state != TS_RUN)
1595 		return (0);
1596 
1597 	/*
1598 	 * The thread is "swapped" or is on the swap queue and
1599 	 * hence no longer on the run queue, so return true.
1600 	 */
1601 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1602 		return (1);
1603 
1604 	tpri = DISP_PRIO(tp);
1605 	dp = tp->t_disp_queue;
1606 	ASSERT(tpri < dp->disp_npri);
1607 	dq = &dp->disp_q[tpri];
1608 	ptp = &dq->dq_first;
1609 	rp = *ptp;
1610 	trp = NULL;
1611 
1612 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1613 
1614 	/*
1615 	 * Search for thread in queue.
1616 	 * Double links would simplify this at the expense of disp/setrun.
1617 	 */
1618 	while (rp != tp && rp != NULL) {
1619 		trp = rp;
1620 		ptp = &trp->t_link;
1621 		rp = trp->t_link;
1622 	}
1623 
1624 	if (rp == NULL) {
1625 		panic("dispdeq: thread not on queue");
1626 	}
1627 
1628 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1629 
1630 	/*
1631 	 * Found it so remove it from queue.
1632 	 */
1633 	if ((*ptp = rp->t_link) == NULL)
1634 		dq->dq_last = trp;
1635 
1636 	dp->disp_nrunnable--;
1637 	if (--dq->dq_sruncnt == 0) {
1638 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1639 		if (dp->disp_nrunnable == 0) {
1640 			dp->disp_max_unbound_pri = -1;
1641 			dp->disp_maxrunpri = -1;
1642 		} else if (tpri == dp->disp_maxrunpri) {
1643 			int ipri;
1644 
1645 			ipri = bt_gethighbit(dp->disp_qactmap,
1646 			    dp->disp_maxrunpri >> BT_ULSHIFT);
1647 			if (ipri < dp->disp_max_unbound_pri)
1648 				dp->disp_max_unbound_pri = ipri;
1649 			dp->disp_maxrunpri = ipri;
1650 		}
1651 	}
1652 	tp->t_link = NULL;
1653 	THREAD_TRANSITION(tp);		/* put in intermediate state */
1654 	return (1);
1655 }
1656 
1657 
1658 /*
1659  * dq_sruninc and dq_srundec are public functions for
1660  * incrementing/decrementing the sruncnts when a thread on
1661  * a dispatcher queue is made schedulable/unschedulable by
1662  * resetting the TS_LOAD flag.
1663  *
1664  * The caller MUST have the thread lock and therefore the dispatcher
1665  * queue lock so that the operation which changes
1666  * the flag, the operation that checks the status of the thread to
1667  * determine if it's on a disp queue AND the call to this function
1668  * are one atomic operation with respect to interrupts.
1669  */
1670 
1671 /*
1672  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1673  */
1674 void
1675 dq_sruninc(kthread_t *t)
1676 {
1677 	ASSERT(t->t_state == TS_RUN);
1678 	ASSERT(t->t_schedflag & TS_LOAD);
1679 
1680 	THREAD_TRANSITION(t);
1681 	setfrontdq(t);
1682 }
1683 
1684 /*
1685  * See comment on calling conventions above.
1686  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1687  */
1688 void
1689 dq_srundec(kthread_t *t)
1690 {
1691 	ASSERT(t->t_schedflag & TS_LOAD);
1692 
1693 	(void) dispdeq(t);
1694 	disp_swapped_enq(t);
1695 }
1696 
1697 /*
1698  * Change the dispatcher lock of thread to the "swapped_lock"
1699  * and return with thread lock still held.
1700  *
1701  * Called with thread_lock held, in transition state, and at high spl.
1702  */
1703 void
1704 disp_swapped_enq(kthread_t *tp)
1705 {
1706 	ASSERT(THREAD_LOCK_HELD(tp));
1707 	ASSERT(tp->t_schedflag & TS_LOAD);
1708 
1709 	switch (tp->t_state) {
1710 	case TS_RUN:
1711 		disp_lock_enter_high(&swapped_lock);
1712 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1713 		break;
1714 	case TS_ONPROC:
1715 		disp_lock_enter_high(&swapped_lock);
1716 		THREAD_TRANSITION(tp);
1717 		wake_sched_sec = 1;		/* tell clock to wake sched */
1718 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1719 		break;
1720 	default:
1721 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1722 	}
1723 }
1724 
1725 /*
1726  * This routine is called by setbackdq/setfrontdq if the thread is
1727  * not loaded or loaded and on the swap queue.
1728  *
1729  * Thread state TS_SLEEP implies that a swapped thread
1730  * has been woken up and needs to be swapped in by the swapper.
1731  *
1732  * Thread state TS_RUN, it implies that the priority of a swapped
1733  * thread is being increased by scheduling class (e.g. ts_update).
1734  */
1735 static void
1736 disp_swapped_setrun(kthread_t *tp)
1737 {
1738 	ASSERT(THREAD_LOCK_HELD(tp));
1739 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1740 
1741 	switch (tp->t_state) {
1742 	case TS_SLEEP:
1743 		disp_lock_enter_high(&swapped_lock);
1744 		/*
1745 		 * Wakeup sched immediately (i.e., next tick) if the
1746 		 * thread priority is above maxclsyspri.
1747 		 */
1748 		if (DISP_PRIO(tp) > maxclsyspri)
1749 			wake_sched = 1;
1750 		else
1751 			wake_sched_sec = 1;
1752 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1753 		break;
1754 	case TS_RUN:				/* called from ts_update */
1755 		break;
1756 	default:
1757 		panic("disp_swapped_setrun: tp: %p bad t_state", tp);
1758 	}
1759 }
1760 
1761 
1762 /*
1763  *	Make a thread give up its processor.  Find the processor on
1764  *	which this thread is executing, and have that processor
1765  *	preempt.
1766  */
1767 void
1768 cpu_surrender(kthread_t *tp)
1769 {
1770 	cpu_t	*cpup;
1771 	int	max_pri;
1772 	int	max_run_pri;
1773 	klwp_t	*lwp;
1774 
1775 	ASSERT(THREAD_LOCK_HELD(tp));
1776 
1777 	if (tp->t_state != TS_ONPROC)
1778 		return;
1779 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
1780 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1781 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1782 	if (max_pri < max_run_pri)
1783 		max_pri = max_run_pri;
1784 
1785 	cpup->cpu_runrun = 1;
1786 	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1787 		cpup->cpu_kprunrun = 1;
1788 	}
1789 
1790 	/*
1791 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1792 	 */
1793 	membar_enter();
1794 
1795 	DTRACE_SCHED1(surrender, kthread_t *, tp);
1796 
1797 	/*
1798 	 * Make the target thread take an excursion through trap()
1799 	 * to do preempt() (unless we're already in trap or post_syscall,
1800 	 * calling cpu_surrender via CL_TRAPRET).
1801 	 */
1802 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1803 	    lwp->lwp_state != LWP_USER) {
1804 		aston(tp);
1805 		if (cpup != CPU)
1806 			poke_cpu(cpup->cpu_id);
1807 	}
1808 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1809 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
1810 }
1811 
1812 
1813 /*
1814  * Commit to and ratify a scheduling decision
1815  */
1816 /*ARGSUSED*/
1817 static kthread_t *
1818 disp_ratify(kthread_t *tp, disp_t *kpq)
1819 {
1820 	pri_t	tpri, maxpri;
1821 	pri_t	maxkpri;
1822 	cpu_t	*cpup;
1823 
1824 	ASSERT(tp != NULL);
1825 	/*
1826 	 * Commit to, then ratify scheduling decision
1827 	 */
1828 	cpup = CPU;
1829 	if (cpup->cpu_runrun != 0)
1830 		cpup->cpu_runrun = 0;
1831 	if (cpup->cpu_kprunrun != 0)
1832 		cpup->cpu_kprunrun = 0;
1833 	if (cpup->cpu_chosen_level != -1)
1834 		cpup->cpu_chosen_level = -1;
1835 	membar_enter();
1836 	tpri = DISP_PRIO(tp);
1837 	maxpri = cpup->cpu_disp->disp_maxrunpri;
1838 	maxkpri = kpq->disp_maxrunpri;
1839 	if (maxpri < maxkpri)
1840 		maxpri = maxkpri;
1841 	if (tpri < maxpri) {
1842 		/*
1843 		 * should have done better
1844 		 * put this one back and indicate to try again
1845 		 */
1846 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
1847 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1848 		thread_lock_high(tp);
1849 		THREAD_TRANSITION(tp);
1850 		setfrontdq(tp);
1851 		thread_unlock_nopreempt(tp);
1852 
1853 		tp = NULL;
1854 	}
1855 	return (tp);
1856 }
1857 
1858 /*
1859  * See if there is any work on the dispatcher queue for other CPUs.
1860  * If there is, dequeue the best thread and return.
1861  */
1862 static kthread_t *
1863 disp_getwork(cpu_t *cp)
1864 {
1865 	cpu_t		*ocp;		/* other CPU */
1866 	cpu_t		*ocp_start;
1867 	cpu_t		*tcp;		/* target local CPU */
1868 	kthread_t	*tp;
1869 	kthread_t	*retval = NULL;
1870 	pri_t		maxpri;
1871 	disp_t		*kpq;		/* kp queue for this partition */
1872 	lpl_t		*lpl, *lpl_leaf;
1873 	int		hint, leafidx;
1874 	hrtime_t	stealtime;
1875 
1876 	maxpri = -1;
1877 	tcp = NULL;
1878 
1879 	kpq = &cp->cpu_part->cp_kp_queue;
1880 	while (kpq->disp_maxrunpri >= 0) {
1881 		/*
1882 		 * Try to take a thread from the kp_queue.
1883 		 */
1884 		tp = (disp_getbest(kpq));
1885 		if (tp)
1886 			return (disp_ratify(tp, kpq));
1887 	}
1888 
1889 	kpreempt_disable();		/* protect the cpu_active list */
1890 
1891 	/*
1892 	 * Try to find something to do on another CPU's run queue.
1893 	 * Loop through all other CPUs looking for the one with the highest
1894 	 * priority unbound thread.
1895 	 *
1896 	 * On NUMA machines, the partition's CPUs are consulted in order of
1897 	 * distance from the current CPU. This way, the first available
1898 	 * work found is also the closest, and will suffer the least
1899 	 * from being migrated.
1900 	 */
1901 	lpl = lpl_leaf = cp->cpu_lpl;
1902 	hint = leafidx = 0;
1903 
1904 	/*
1905 	 * This loop traverses the lpl hierarchy. Higher level lpls represent
1906 	 * broader levels of locality
1907 	 */
1908 	do {
1909 		/* This loop iterates over the lpl's leaves */
1910 		do {
1911 			if (lpl_leaf != cp->cpu_lpl)
1912 				ocp = lpl_leaf->lpl_cpus;
1913 			else
1914 				ocp = cp->cpu_next_lpl;
1915 
1916 			/* This loop iterates over the CPUs in the leaf */
1917 			ocp_start = ocp;
1918 			do {
1919 				pri_t pri;
1920 
1921 				ASSERT(CPU_ACTIVE(ocp));
1922 
1923 				/*
1924 				 * End our stroll around the partition if:
1925 				 *
1926 				 * - Something became runnable on the local
1927 				 *	queue
1928 				 *
1929 				 * - We're at the broadest level of locality and
1930 				 *   we happen across another idle CPU. At the
1931 				 *   highest level of locality, all CPUs will
1932 				 *   walk the partition's CPUs in the same
1933 				 *   order, so we can end our stroll taking
1934 				 *   comfort in knowing the other idle CPU is
1935 				 *   already covering the next portion of the
1936 				 *   list.
1937 				 */
1938 				if (cp->cpu_disp->disp_nrunnable != 0)
1939 					break;
1940 				if (ocp->cpu_dispatch_pri == -1) {
1941 					if (ocp->cpu_disp_flags &
1942 					    CPU_DISP_HALTED)
1943 						continue;
1944 					else if (lpl->lpl_parent == NULL)
1945 						break;
1946 				}
1947 
1948 				/*
1949 				 * If there's only one thread and the CPU
1950 				 * is in the middle of a context switch,
1951 				 * or it's currently running the idle thread,
1952 				 * don't steal it.
1953 				 */
1954 				if ((ocp->cpu_disp_flags &
1955 					CPU_DISP_DONTSTEAL) &&
1956 				    ocp->cpu_disp->disp_nrunnable == 1)
1957 					continue;
1958 
1959 				pri = ocp->cpu_disp->disp_max_unbound_pri;
1960 				if (pri > maxpri) {
1961 					/*
1962 					 * Don't steal threads that we attempted
1963 					 * to be stolen very recently until
1964 					 * they're ready to be stolen again.
1965 					 */
1966 					stealtime = ocp->cpu_disp->disp_steal;
1967 					if (stealtime == 0 ||
1968 					    stealtime - gethrtime() <= 0) {
1969 						maxpri = pri;
1970 						tcp = ocp;
1971 					} else {
1972 						/*
1973 						 * Don't update tcp, just set
1974 						 * the retval to T_DONTSTEAL, so
1975 						 * that if no acceptable CPUs
1976 						 * are found the return value
1977 						 * will be T_DONTSTEAL rather
1978 						 * then NULL.
1979 						 */
1980 						retval = T_DONTSTEAL;
1981 					}
1982 				}
1983 			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
1984 
1985 			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
1986 				leafidx = 0;
1987 				lpl_leaf = lpl->lpl_rset[leafidx];
1988 			}
1989 		} while (leafidx != hint);
1990 
1991 		hint = leafidx = lpl->lpl_hint;
1992 		if ((lpl = lpl->lpl_parent) != NULL)
1993 			lpl_leaf = lpl->lpl_rset[hint];
1994 	} while (!tcp && lpl);
1995 
1996 	kpreempt_enable();
1997 
1998 	/*
1999 	 * If another queue looks good, and there is still nothing on
2000 	 * the local queue, try to transfer one or more threads
2001 	 * from it to our queue.
2002 	 */
2003 	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2004 		tp = disp_getbest(tcp->cpu_disp);
2005 		if (tp == NULL || tp == T_DONTSTEAL)
2006 			return (tp);
2007 		return (disp_ratify(tp, kpq));
2008 	}
2009 	return (retval);
2010 }
2011 
2012 
2013 /*
2014  * disp_fix_unbound_pri()
2015  *	Determines the maximum priority of unbound threads on the queue.
2016  *	The priority is kept for the queue, but is only increased, never
2017  *	reduced unless some CPU is looking for something on that queue.
2018  *
2019  *	The priority argument is the known upper limit.
2020  *
2021  *	Perhaps this should be kept accurately, but that probably means
2022  *	separate bitmaps for bound and unbound threads.  Since only idled
2023  *	CPUs will have to do this recalculation, it seems better this way.
2024  */
2025 static void
2026 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2027 {
2028 	kthread_t	*tp;
2029 	dispq_t		*dq;
2030 	ulong_t		*dqactmap = dp->disp_qactmap;
2031 	ulong_t		mapword;
2032 	int		wx;
2033 
2034 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2035 
2036 	ASSERT(pri >= 0);			/* checked by caller */
2037 
2038 	/*
2039 	 * Start the search at the next lowest priority below the supplied
2040 	 * priority.  This depends on the bitmap implementation.
2041 	 */
2042 	do {
2043 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
2044 
2045 		/*
2046 		 * Form mask for all lower priorities in the word.
2047 		 */
2048 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2049 
2050 		/*
2051 		 * Get next lower active priority.
2052 		 */
2053 		if (mapword != 0) {
2054 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2055 		} else if (wx > 0) {
2056 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2057 			if (pri < 0)
2058 				break;
2059 		} else {
2060 			pri = -1;
2061 			break;
2062 		}
2063 
2064 		/*
2065 		 * Search the queue for unbound, runnable threads.
2066 		 */
2067 		dq = &dp->disp_q[pri];
2068 		tp = dq->dq_first;
2069 
2070 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2071 			tp = tp->t_link;
2072 		}
2073 
2074 		/*
2075 		 * If a thread was found, set the priority and return.
2076 		 */
2077 	} while (tp == NULL);
2078 
2079 	/*
2080 	 * pri holds the maximum unbound thread priority or -1.
2081 	 */
2082 	if (dp->disp_max_unbound_pri != pri)
2083 		dp->disp_max_unbound_pri = pri;
2084 }
2085 
2086 /*
2087  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2088  * 	check if the CPU to which is was previously bound should have
2089  * 	its disp_max_unbound_pri increased.
2090  */
2091 void
2092 disp_adjust_unbound_pri(kthread_t *tp)
2093 {
2094 	disp_t *dp;
2095 	pri_t tpri;
2096 
2097 	ASSERT(THREAD_LOCK_HELD(tp));
2098 
2099 	/*
2100 	 * Don't do anything if the thread is not bound, or
2101 	 * currently not runnable or swapped out.
2102 	 */
2103 	if (tp->t_bound_cpu == NULL ||
2104 	    tp->t_state != TS_RUN ||
2105 	    tp->t_schedflag & TS_ON_SWAPQ)
2106 		return;
2107 
2108 	tpri = DISP_PRIO(tp);
2109 	dp = tp->t_bound_cpu->cpu_disp;
2110 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2111 	if (tpri > dp->disp_max_unbound_pri)
2112 		dp->disp_max_unbound_pri = tpri;
2113 }
2114 
2115 /*
2116  * disp_getbest()
2117  *   De-queue the highest priority unbound runnable thread.
2118  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2119  *   Returns NULL if nothing found.
2120  *   Returns T_DONTSTEAL if the thread was not stealable.
2121  *   so that the caller will try again later.
2122  *
2123  *   Passed a pointer to a dispatch queue not associated with this CPU, and
2124  *   its type.
2125  */
2126 static kthread_t *
2127 disp_getbest(disp_t *dp)
2128 {
2129 	kthread_t	*tp;
2130 	dispq_t		*dq;
2131 	pri_t		pri;
2132 	cpu_t		*cp, *tcp;
2133 	boolean_t	allbound;
2134 
2135 	disp_lock_enter(&dp->disp_lock);
2136 
2137 	/*
2138 	 * If there is nothing to run, or the CPU is in the middle of a
2139 	 * context switch of the only thread, return NULL.
2140 	 */
2141 	tcp = dp->disp_cpu;
2142 	cp = CPU;
2143 	pri = dp->disp_max_unbound_pri;
2144 	if (pri == -1 ||
2145 	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2146 	    tcp->cpu_disp->disp_nrunnable == 1)) {
2147 		disp_lock_exit_nopreempt(&dp->disp_lock);
2148 		return (NULL);
2149 	}
2150 
2151 	dq = &dp->disp_q[pri];
2152 
2153 
2154 	/*
2155 	 * Assume that all threads are bound on this queue, and change it
2156 	 * later when we find out that it is not the case.
2157 	 */
2158 	allbound = B_TRUE;
2159 	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2160 		hrtime_t now, nosteal, rqtime;
2161 		chip_type_t chtype;
2162 		chip_t *chip;
2163 
2164 		/*
2165 		 * Skip over bound threads which could be here even
2166 		 * though disp_max_unbound_pri indicated this level.
2167 		 */
2168 		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2169 			continue;
2170 
2171 		/*
2172 		 * We've got some unbound threads on this queue, so turn
2173 		 * the allbound flag off now.
2174 		 */
2175 		allbound = B_FALSE;
2176 
2177 		/*
2178 		 * The thread is a candidate for stealing from its run queue. We
2179 		 * don't want to steal threads that became runnable just a
2180 		 * moment ago. This improves CPU affinity for threads that get
2181 		 * preempted for short periods of time and go back on the run
2182 		 * queue.
2183 		 *
2184 		 * We want to let it stay on its run queue if it was only placed
2185 		 * there recently and it was running on the same CPU before that
2186 		 * to preserve its cache investment. For the thread to remain on
2187 		 * its run queue, ALL of the following conditions must be
2188 		 * satisfied:
2189 		 *
2190 		 * - the disp queue should not be the kernel preemption queue
2191 		 * - delayed idle stealing should not be disabled
2192 		 * - nosteal_nsec should be non-zero
2193 		 * - it should run with user priority
2194 		 * - it should be on the run queue of the CPU where it was
2195 		 *   running before being placed on the run queue
2196 		 * - it should be the only thread on the run queue (to prevent
2197 		 *   extra scheduling latency for other threads)
2198 		 * - it should sit on the run queue for less than per-chip
2199 		 *   nosteal interval or global nosteal interval
2200 		 * - in case of CPUs with shared cache it should sit in a run
2201 		 *   queue of a CPU from a different chip
2202 		 *
2203 		 * The checks are arranged so that the ones that are faster are
2204 		 * placed earlier.
2205 		 */
2206 		if (tcp == NULL ||
2207 		    pri >= minclsyspri ||
2208 		    tp->t_cpu != tcp)
2209 			break;
2210 
2211 		/*
2212 		 * Steal immediately if the chip has shared cache and we are
2213 		 * sharing the chip with the target thread's CPU.
2214 		 */
2215 		chip = tcp->cpu_chip;
2216 		chtype = chip->chip_type;
2217 		if ((chtype == CHIP_SMT || chtype == CHIP_CMP_SHARED_CACHE) &&
2218 		    chip == cp->cpu_chip)
2219 			break;
2220 
2221 		/*
2222 		 * Get the value of nosteal interval either from nosteal_nsec
2223 		 * global variable or from a value specified by a chip
2224 		 */
2225 		nosteal = nosteal_nsec ? nosteal_nsec : chip->chip_nosteal;
2226 		if (nosteal == 0 || nosteal == NOSTEAL_DISABLED)
2227 			break;
2228 
2229 		/*
2230 		 * Calculate time spent sitting on run queue
2231 		 */
2232 		now = gethrtime_unscaled();
2233 		rqtime = now - tp->t_waitrq;
2234 		scalehrtime(&rqtime);
2235 
2236 		/*
2237 		 * Steal immediately if the time spent on this run queue is more
2238 		 * than allowed nosteal delay.
2239 		 *
2240 		 * Negative rqtime check is needed here to avoid infinite
2241 		 * stealing delays caused by unlikely but not impossible
2242 		 * drifts between CPU times on different CPUs.
2243 		 */
2244 		if (rqtime > nosteal || rqtime < 0)
2245 			break;
2246 
2247 		DTRACE_PROBE4(nosteal, kthread_t *, tp,
2248 		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2249 		scalehrtime(&now);
2250 		/*
2251 		 * Calculate when this thread becomes stealable
2252 		 */
2253 		now += (nosteal - rqtime);
2254 
2255 		/*
2256 		 * Calculate time when some thread becomes stealable
2257 		 */
2258 		if (now < dp->disp_steal)
2259 			dp->disp_steal = now;
2260 	}
2261 
2262 	/*
2263 	 * If there were no unbound threads on this queue, find the queue
2264 	 * where they are and then return later. The value of
2265 	 * disp_max_unbound_pri is not always accurate because it isn't
2266 	 * reduced until another idle CPU looks for work.
2267 	 */
2268 	if (allbound)
2269 		disp_fix_unbound_pri(dp, pri);
2270 
2271 	/*
2272 	 * If we reached the end of the queue and found no unbound threads
2273 	 * then return NULL so that other CPUs will be considered.  If there
2274 	 * are unbound threads but they cannot yet be stolen, then
2275 	 * return T_DONTSTEAL and try again later.
2276 	 */
2277 	if (tp == NULL) {
2278 		disp_lock_exit_nopreempt(&dp->disp_lock);
2279 		return (allbound ? NULL : T_DONTSTEAL);
2280 	}
2281 
2282 	/*
2283 	 * Found a runnable, unbound thread, so remove it from queue.
2284 	 * dispdeq() requires that we have the thread locked, and we do,
2285 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
2286 	 * put the thread in transition state, thereby dropping the dispq
2287 	 * lock.
2288 	 */
2289 
2290 #ifdef DEBUG
2291 	{
2292 		int	thread_was_on_queue;
2293 
2294 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
2295 		ASSERT(thread_was_on_queue);
2296 	}
2297 
2298 #else /* DEBUG */
2299 	(void) dispdeq(tp);			/* drops disp_lock */
2300 #endif /* DEBUG */
2301 
2302 	/*
2303 	 * Reset the disp_queue steal time - we do not know what is the smallest
2304 	 * value across the queue is.
2305 	 */
2306 	dp->disp_steal = 0;
2307 
2308 	tp->t_schedflag |= TS_DONT_SWAP;
2309 
2310 	/*
2311 	 * Setup thread to run on the current CPU.
2312 	 */
2313 	tp->t_disp_queue = cp->cpu_disp;
2314 
2315 	cp->cpu_dispthread = tp;		/* protected by spl only */
2316 	cp->cpu_dispatch_pri = pri;
2317 	ASSERT(pri == DISP_PRIO(tp));
2318 
2319 	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2320 
2321 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
2322 
2323 	/*
2324 	 * Return with spl high so that swtch() won't need to raise it.
2325 	 * The disp_lock was dropped by dispdeq().
2326 	 */
2327 
2328 	return (tp);
2329 }
2330 
2331 /*
2332  * disp_bound_common() - common routine for higher level functions
2333  *	that check for bound threads under certain conditions.
2334  *	If 'threadlistsafe' is set then there is no need to acquire
2335  *	pidlock to stop the thread list from changing (eg, if
2336  *	disp_bound_* is called with cpus paused).
2337  */
2338 static int
2339 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2340 {
2341 	int		found = 0;
2342 	kthread_t	*tp;
2343 
2344 	ASSERT(flag);
2345 
2346 	if (!threadlistsafe)
2347 		mutex_enter(&pidlock);
2348 	tp = curthread;		/* faster than allthreads */
2349 	do {
2350 		if (tp->t_state != TS_FREE) {
2351 			/*
2352 			 * If an interrupt thread is busy, but the
2353 			 * caller doesn't care (i.e. BOUND_INTR is off),
2354 			 * then just ignore it and continue through.
2355 			 */
2356 			if ((tp->t_flag & T_INTR_THREAD) &&
2357 			    !(flag & BOUND_INTR))
2358 				continue;
2359 
2360 			/*
2361 			 * Skip the idle thread for the CPU
2362 			 * we're about to set offline.
2363 			 */
2364 			if (tp == cp->cpu_idle_thread)
2365 				continue;
2366 
2367 			/*
2368 			 * Skip the pause thread for the CPU
2369 			 * we're about to set offline.
2370 			 */
2371 			if (tp == cp->cpu_pause_thread)
2372 				continue;
2373 
2374 			if ((flag & BOUND_CPU) &&
2375 			    (tp->t_bound_cpu == cp ||
2376 			    tp->t_bind_cpu == cp->cpu_id ||
2377 			    tp->t_weakbound_cpu == cp)) {
2378 				found = 1;
2379 				break;
2380 			}
2381 
2382 			if ((flag & BOUND_PARTITION) &&
2383 			    (tp->t_cpupart == cp->cpu_part)) {
2384 				found = 1;
2385 				break;
2386 			}
2387 		}
2388 	} while ((tp = tp->t_next) != curthread && found == 0);
2389 	if (!threadlistsafe)
2390 		mutex_exit(&pidlock);
2391 	return (found);
2392 }
2393 
2394 /*
2395  * disp_bound_threads - return nonzero if threads are bound to the processor.
2396  *	Called infrequently.  Keep this simple.
2397  *	Includes threads that are asleep or stopped but not onproc.
2398  */
2399 int
2400 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2401 {
2402 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2403 }
2404 
2405 /*
2406  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2407  * to the given processor, including interrupt threads.
2408  */
2409 int
2410 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2411 {
2412 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2413 }
2414 
2415 /*
2416  * disp_bound_partition - return nonzero if threads are bound to the same
2417  * partition as the processor.
2418  *	Called infrequently.  Keep this simple.
2419  *	Includes threads that are asleep or stopped but not onproc.
2420  */
2421 int
2422 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2423 {
2424 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2425 }
2426 
2427 /*
2428  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2429  * threads to other CPUs.
2430  */
2431 void
2432 disp_cpu_inactive(cpu_t *cp)
2433 {
2434 	kthread_t	*tp;
2435 	disp_t		*dp = cp->cpu_disp;
2436 	dispq_t		*dq;
2437 	pri_t		pri;
2438 	int		wasonq;
2439 
2440 	disp_lock_enter(&dp->disp_lock);
2441 	while ((pri = dp->disp_max_unbound_pri) != -1) {
2442 		dq = &dp->disp_q[pri];
2443 		tp = dq->dq_first;
2444 
2445 		/*
2446 		 * Skip over bound threads.
2447 		 */
2448 		while (tp != NULL && tp->t_bound_cpu != NULL) {
2449 			tp = tp->t_link;
2450 		}
2451 
2452 		if (tp == NULL) {
2453 			/* disp_max_unbound_pri must be inaccurate, so fix it */
2454 			disp_fix_unbound_pri(dp, pri);
2455 			continue;
2456 		}
2457 
2458 		wasonq = dispdeq(tp);		/* drops disp_lock */
2459 		ASSERT(wasonq);
2460 		ASSERT(tp->t_weakbound_cpu == NULL);
2461 
2462 		setbackdq(tp);
2463 		/*
2464 		 * Called from cpu_offline:
2465 		 *
2466 		 * cp has already been removed from the list of active cpus
2467 		 * and tp->t_cpu has been changed so there is no risk of
2468 		 * tp ending up back on cp.
2469 		 *
2470 		 * Called from cpupart_move_cpu:
2471 		 *
2472 		 * The cpu has moved to a new cpupart.  Any threads that
2473 		 * were on it's dispatch queues before the move remain
2474 		 * in the old partition and can't run in the new partition.
2475 		 */
2476 		ASSERT(tp->t_cpu != cp);
2477 		thread_unlock(tp);
2478 
2479 		disp_lock_enter(&dp->disp_lock);
2480 	}
2481 	disp_lock_exit(&dp->disp_lock);
2482 }
2483 
2484 /*
2485  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2486  *	The hint passed in is used as a starting point so we don't favor
2487  *	CPU 0 or any other CPU.  The caller should pass in the most recently
2488  *	used CPU for the thread.
2489  *
2490  *	The lgroup and priority are used to determine the best CPU to run on
2491  *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
2492  *	the thread priority will indicate whether the thread will actually run
2493  *	there.  To pick the best CPU, the CPUs inside and outside of the given
2494  *	lgroup which are running the lowest priority threads are found.  The
2495  *	remote CPU is chosen only if the thread will not run locally on a CPU
2496  *	within the lgroup, but will run on the remote CPU. If the thread
2497  *	cannot immediately run on any CPU, the best local CPU will be chosen.
2498  *
2499  *	The lpl specified also identifies the cpu partition from which
2500  *	disp_lowpri_cpu should select a CPU.
2501  *
2502  *	curcpu is used to indicate that disp_lowpri_cpu is being called on
2503  *      behalf of the current thread. (curthread is looking for a new cpu)
2504  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2505  *      ignored.
2506  *
2507  *      If a cpu is the target of an offline request then try to avoid it.
2508  *
2509  *	This function must be called at either high SPL, or with preemption
2510  *	disabled, so that the "hint" CPU cannot be removed from the online
2511  *	CPU list while we are traversing it.
2512  */
2513 cpu_t *
2514 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2515 {
2516 	cpu_t	*bestcpu;
2517 	cpu_t	*besthomecpu;
2518 	cpu_t   *cp, *cpstart;
2519 
2520 	pri_t   bestpri;
2521 	pri_t   cpupri;
2522 
2523 	klgrpset_t	done;
2524 	klgrpset_t	cur_set;
2525 
2526 	lpl_t		*lpl_iter, *lpl_leaf;
2527 	int		i;
2528 
2529 	/*
2530 	 * Scan for a CPU currently running the lowest priority thread.
2531 	 * Cannot get cpu_lock here because it is adaptive.
2532 	 * We do not require lock on CPU list.
2533 	 */
2534 	ASSERT(hint != NULL);
2535 	ASSERT(lpl != NULL);
2536 	ASSERT(lpl->lpl_ncpu > 0);
2537 
2538 	/*
2539 	 * First examine local CPUs. Note that it's possible the hint CPU
2540 	 * passed in in remote to the specified home lgroup. If our priority
2541 	 * isn't sufficient enough such that we can run immediately at home,
2542 	 * then examine CPUs remote to our home lgroup.
2543 	 * We would like to give preference to CPUs closest to "home".
2544 	 * If we can't find a CPU where we'll run at a given level
2545 	 * of locality, we expand our search to include the next level.
2546 	 */
2547 	bestcpu = besthomecpu = NULL;
2548 	klgrpset_clear(done);
2549 	/* start with lpl we were passed */
2550 
2551 	lpl_iter = lpl;
2552 
2553 	do {
2554 
2555 		bestpri = SHRT_MAX;
2556 		klgrpset_clear(cur_set);
2557 
2558 		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2559 			lpl_leaf = lpl_iter->lpl_rset[i];
2560 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2561 				continue;
2562 
2563 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2564 
2565 			if (hint->cpu_lpl == lpl_leaf)
2566 				cp = cpstart = hint;
2567 			else
2568 				cp = cpstart = lpl_leaf->lpl_cpus;
2569 
2570 			do {
2571 				if (cp == curcpu)
2572 					cpupri = -1;
2573 				else if (cp == cpu_inmotion)
2574 					cpupri = SHRT_MAX;
2575 				else
2576 					cpupri = cp->cpu_dispatch_pri;
2577 				if (cp->cpu_disp->disp_maxrunpri > cpupri)
2578 					cpupri = cp->cpu_disp->disp_maxrunpri;
2579 				if (cp->cpu_chosen_level > cpupri)
2580 					cpupri = cp->cpu_chosen_level;
2581 				if (cpupri < bestpri) {
2582 					if (CPU_IDLING(cpupri)) {
2583 						ASSERT((cp->cpu_flags &
2584 						    CPU_QUIESCED) == 0);
2585 						return (cp);
2586 					}
2587 					bestcpu = cp;
2588 					bestpri = cpupri;
2589 				}
2590 			} while ((cp = cp->cpu_next_lpl) != cpstart);
2591 		}
2592 
2593 		if (bestcpu && (tpri > bestpri)) {
2594 			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2595 			return (bestcpu);
2596 		}
2597 		if (besthomecpu == NULL)
2598 			besthomecpu = bestcpu;
2599 		/*
2600 		 * Add the lgrps we just considered to the "done" set
2601 		 */
2602 		klgrpset_or(done, cur_set);
2603 
2604 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2605 
2606 	/*
2607 	 * The specified priority isn't high enough to run immediately
2608 	 * anywhere, so just return the best CPU from the home lgroup.
2609 	 */
2610 	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2611 	return (besthomecpu);
2612 }
2613 
2614 /*
2615  * This routine provides the generic idle cpu function for all processors.
2616  * If a processor has some specific code to execute when idle (say, to stop
2617  * the pipeline and save power) then that routine should be defined in the
2618  * processors specific code (module_xx.c) and the global variable idle_cpu
2619  * set to that function.
2620  */
2621 static void
2622 generic_idle_cpu(void)
2623 {
2624 }
2625 
2626 /*ARGSUSED*/
2627 static void
2628 generic_enq_thread(cpu_t *cpu, int bound)
2629 {
2630 }
2631 
2632 /*
2633  * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
2634  *	- t->t_cpu is not in this thread's assigned lgrp
2635  *	- the time since the thread last came off t->t_cpu exceeds the
2636  *	  rechoose time for this cpu (ignore this if t is curthread in
2637  *	  which case it's on CPU and t->t_disp_time is inaccurate)
2638  *	- t->t_cpu is presently the target of an offline or partition move
2639  *	  request
2640  */
2641 static cpu_t *
2642 cpu_choose(kthread_t *t, pri_t tpri)
2643 {
2644 	ASSERT(tpri < kpqpri);
2645 
2646 	if ((((lbolt - t->t_disp_time) > t->t_cpu->cpu_rechoose) &&
2647 	    t != curthread) || t->t_cpu == cpu_inmotion) {
2648 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
2649 	}
2650 
2651 	/*
2652 	 * Take a trip through disp_lowpri_cpu() if the thread was
2653 	 * running outside it's home lgroup
2654 	 */
2655 	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
2656 	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
2657 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
2658 		    (t == curthread) ? t->t_cpu : NULL));
2659 	}
2660 	return (t->t_cpu);
2661 }
2662