xref: /titanic_50/usr/src/uts/common/disp/disp.c (revision 80b80bf0416a7eb4be4215b2e192cafd03ca80b7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/signal.h>
36 #include <sys/user.h>
37 #include <sys/systm.h>
38 #include <sys/sysinfo.h>
39 #include <sys/var.h>
40 #include <sys/errno.h>
41 #include <sys/cmn_err.h>
42 #include <sys/debug.h>
43 #include <sys/inline.h>
44 #include <sys/disp.h>
45 #include <sys/class.h>
46 #include <sys/bitmap.h>
47 #include <sys/kmem.h>
48 #include <sys/cpuvar.h>
49 #include <sys/vtrace.h>
50 #include <sys/tnf.h>
51 #include <sys/cpupart.h>
52 #include <sys/lgrp.h>
53 #include <sys/pg.h>
54 #include <sys/cmt.h>
55 #include <sys/bitset.h>
56 #include <sys/schedctl.h>
57 #include <sys/atomic.h>
58 #include <sys/dtrace.h>
59 #include <sys/sdt.h>
60 
61 #include <vm/as.h>
62 
63 #define	BOUND_CPU	0x1
64 #define	BOUND_PARTITION	0x2
65 #define	BOUND_INTR	0x4
66 
/*
 * Dispatch queue allocation structure and functions.
 *
 * A disp_queue_info tracks the old and new queue arrays of one disp_t
 * while its dispatch queues are being resized: disp_dq_alloc() fills in
 * the new arrays and dp, disp_dq_assign() records the old arrays and
 * installs the new ones, and disp_dq_free() releases the old arrays.
 */
struct disp_queue_info {
	disp_t	*dp;		/* dispatcher whose queues are replaced */
	dispq_t *olddispq;	/* queue array in use before the swap */
	dispq_t *newdispq;	/* replacement queue array */
	ulong_t	*olddqactmap;	/* active-queue bitmap before the swap */
	ulong_t	*newdqactmap;	/* replacement active-queue bitmap */
	int	oldnglobpris;	/* number of priorities in the old arrays */
};
76 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
77     disp_t *dp);
78 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
79 static void	disp_dq_free(struct disp_queue_info *dptr);
80 
81 /* platform-specific routine to call when processor is idle */
82 static void	generic_idle_cpu();
83 void		(*idle_cpu)() = generic_idle_cpu;
84 
85 /* routines invoked when a CPU enters/exits the idle loop */
86 static void	idle_enter();
87 static void	idle_exit();
88 
89 /* platform-specific routine to call when thread is enqueued */
90 static void	generic_enq_thread(cpu_t *, int);
91 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
92 
93 pri_t	kpreemptpri;		/* priority where kernel preemption applies */
94 pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
95 pri_t	intr_pri;		/* interrupt thread priority base level */
96 
97 #define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
98 pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
99 disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
100 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
101 int	nswapped;		/* total number of swapped threads */
102 void	disp_swapped_enq(kthread_t *tp);
103 static void	disp_swapped_setrun(kthread_t *tp);
104 static void	cpu_resched(cpu_t *cp, pri_t tpri);
105 
106 /*
107  * If this is set, only interrupt threads will cause kernel preemptions.
108  * This is done by changing the value of kpreemptpri.  kpreemptpri
109  * will either be the max sysclass pri + 1 or the min interrupt pri.
110  */
111 int	only_intr_kpreempt;
112 
113 extern void set_idle_cpu(int cpun);
114 extern void unset_idle_cpu(int cpun);
115 static void setkpdq(kthread_t *tp, int borf);
116 #define	SETKP_BACK	0
117 #define	SETKP_FRONT	1
118 /*
119  * Parameter that determines how recently a thread must have run
120  * on the CPU to be considered loosely-bound to that CPU to reduce
121  * cold cache effects.  The interval is in hertz.
122  */
123 #define	RECHOOSE_INTERVAL 3
124 int	rechoose_interval = RECHOOSE_INTERVAL;
125 static cpu_t	*cpu_choose(kthread_t *, pri_t);
126 
127 /*
128  * Parameter that determines how long (in nanoseconds) a thread must
129  * be sitting on a run queue before it can be stolen by another CPU
130  * to reduce migrations.  The interval is in nanoseconds.
131  *
132  * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
133  * to an appropriate value.  nosteal_nsec is set to NOSTEAL_UNINITIALIZED
 * here indicating it is uninitialized.
135  * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
136  *
137  */
138 #define	NOSTEAL_UNINITIALIZED	(-1)
139 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
140 extern void cmp_set_nosteal_interval(void);
141 
142 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
143 
144 disp_lock_t	transition_lock;	/* lock on transitioning threads */
145 disp_lock_t	stop_lock;		/* lock on stopped threads */
146 
147 static void	cpu_dispqalloc(int numpris);
148 
149 /*
150  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
151  * a thread because it was sitting on its run queue for a very short
152  * period of time.
153  */
154 #define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
155 
156 static kthread_t	*disp_getwork(cpu_t *to);
157 static kthread_t	*disp_getbest(disp_t *from);
158 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
159 
160 void	swtch_to(kthread_t *);
161 
162 /*
163  * dispatcher and scheduler initialization
164  */
165 
166 /*
167  * disp_setup - Common code to calculate and allocate dispatcher
168  *		variables and structures based on the maximum priority.
169  */
170 static void
171 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
172 {
173 	pri_t	newnglobpris;
174 
175 	ASSERT(MUTEX_HELD(&cpu_lock));
176 
177 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
178 
179 	if (newnglobpris > oldnglobpris) {
180 		/*
181 		 * Allocate new kp queues for each CPU partition.
182 		 */
183 		cpupart_kpqalloc(newnglobpris);
184 
185 		/*
186 		 * Allocate new dispatch queues for each CPU.
187 		 */
188 		cpu_dispqalloc(newnglobpris);
189 
190 		/*
191 		 * compute new interrupt thread base priority
192 		 */
193 		intr_pri = maxglobpri;
194 		if (only_intr_kpreempt) {
195 			kpreemptpri = intr_pri + 1;
196 			if (kpqpri == KPQPRI)
197 				kpqpri = kpreemptpri;
198 		}
199 		v.v_nglobpris = newnglobpris;
200 	}
201 }
202 
203 /*
204  * dispinit - Called to initialize all loaded classes and the
205  *	      dispatcher framework.
206  */
207 void
208 dispinit(void)
209 {
210 	id_t	cid;
211 	pri_t	maxglobpri;
212 	pri_t	cl_maxglobpri;
213 
214 	maxglobpri = -1;
215 
216 	/*
217 	 * Initialize transition lock, which will always be set.
218 	 */
219 	DISP_LOCK_INIT(&transition_lock);
220 	disp_lock_enter_high(&transition_lock);
221 	DISP_LOCK_INIT(&stop_lock);
222 
223 	mutex_enter(&cpu_lock);
224 	CPU->cpu_disp->disp_maxrunpri = -1;
225 	CPU->cpu_disp->disp_max_unbound_pri = -1;
226 
227 	/*
228 	 * Initialize the default CPU partition.
229 	 */
230 	cpupart_initialize_default();
231 	/*
232 	 * Call the class specific initialization functions for
233 	 * all pre-installed schedulers.
234 	 *
235 	 * We pass the size of a class specific parameter
236 	 * buffer to each of the initialization functions
237 	 * to try to catch problems with backward compatibility
238 	 * of class modules.
239 	 *
240 	 * For example a new class module running on an old system
241 	 * which didn't provide sufficiently large parameter buffers
242 	 * would be bad news. Class initialization modules can check for
243 	 * this and take action if they detect a problem.
244 	 */
245 
246 	for (cid = 0; cid < nclass; cid++) {
247 		sclass_t	*sc;
248 
249 		sc = &sclass[cid];
250 		if (SCHED_INSTALLED(sc)) {
251 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
252 			    &sc->cl_funcs);
253 			if (cl_maxglobpri > maxglobpri)
254 				maxglobpri = cl_maxglobpri;
255 		}
256 	}
257 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
258 	if (kpqpri == KPQPRI)
259 		kpqpri = kpreemptpri;
260 
261 	ASSERT(maxglobpri >= 0);
262 	disp_setup(maxglobpri, 0);
263 
264 	mutex_exit(&cpu_lock);
265 
266 	/*
267 	 * Platform specific sticky scheduler setup.
268 	 */
269 	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
270 		cmp_set_nosteal_interval();
271 
272 	/*
273 	 * Get the default class ID; this may be later modified via
274 	 * dispadmin(1M).  This will load the class (normally TS) and that will
275 	 * call disp_add(), which is why we had to drop cpu_lock first.
276 	 */
277 	if (getcid(defaultclass, &defaultcid) != 0) {
278 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
279 		    defaultclass);
280 	}
281 }
282 
283 /*
284  * disp_add - Called with class pointer to initialize the dispatcher
285  *	      for a newly loaded class.
286  */
287 void
288 disp_add(sclass_t *clp)
289 {
290 	pri_t	maxglobpri;
291 	pri_t	cl_maxglobpri;
292 
293 	mutex_enter(&cpu_lock);
294 	/*
295 	 * Initialize the scheduler class.
296 	 */
297 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
298 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
299 	if (cl_maxglobpri > maxglobpri)
300 		maxglobpri = cl_maxglobpri;
301 
302 	/*
303 	 * Save old queue information.  Since we're initializing a
304 	 * new scheduling class which has just been loaded, then
305 	 * the size of the dispq may have changed.  We need to handle
306 	 * that here.
307 	 */
308 	disp_setup(maxglobpri, v.v_nglobpris);
309 
310 	mutex_exit(&cpu_lock);
311 }
312 
313 
314 /*
315  * For each CPU, allocate new dispatch queues
316  * with the stated number of priorities.
317  */
318 static void
319 cpu_dispqalloc(int numpris)
320 {
321 	cpu_t	*cpup;
322 	struct disp_queue_info	*disp_mem;
323 	int i, num;
324 
325 	ASSERT(MUTEX_HELD(&cpu_lock));
326 
327 	disp_mem = kmem_zalloc(NCPU *
328 	    sizeof (struct disp_queue_info), KM_SLEEP);
329 
330 	/*
331 	 * This routine must allocate all of the memory before stopping
332 	 * the cpus because it must not sleep in kmem_alloc while the
333 	 * CPUs are stopped.  Locks they hold will not be freed until they
334 	 * are restarted.
335 	 */
336 	i = 0;
337 	cpup = cpu_list;
338 	do {
339 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
340 		i++;
341 		cpup = cpup->cpu_next;
342 	} while (cpup != cpu_list);
343 	num = i;
344 
345 	pause_cpus(NULL);
346 	for (i = 0; i < num; i++)
347 		disp_dq_assign(&disp_mem[i], numpris);
348 	start_cpus();
349 
350 	/*
351 	 * I must free all of the memory after starting the cpus because
352 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
353 	 */
354 	for (i = 0; i < num; i++)
355 		disp_dq_free(&disp_mem[i]);
356 
357 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
358 }
359 
360 static void
361 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
362 {
363 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
364 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
365 	    sizeof (long), KM_SLEEP);
366 	dptr->dp = dp;
367 }
368 
369 static void
370 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
371 {
372 	disp_t	*dp;
373 
374 	dp = dptr->dp;
375 	dptr->olddispq = dp->disp_q;
376 	dptr->olddqactmap = dp->disp_qactmap;
377 	dptr->oldnglobpris = dp->disp_npri;
378 
379 	ASSERT(dptr->oldnglobpris < numpris);
380 
381 	if (dptr->olddispq != NULL) {
382 		/*
383 		 * Use kcopy because bcopy is platform-specific
384 		 * and could block while we might have paused the cpus.
385 		 */
386 		(void) kcopy(dptr->olddispq, dptr->newdispq,
387 		    dptr->oldnglobpris * sizeof (dispq_t));
388 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
389 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
390 		    sizeof (long));
391 	}
392 	dp->disp_q = dptr->newdispq;
393 	dp->disp_qactmap = dptr->newdqactmap;
394 	dp->disp_q_limit = &dptr->newdispq[numpris];
395 	dp->disp_npri = numpris;
396 }
397 
398 static void
399 disp_dq_free(struct disp_queue_info *dptr)
400 {
401 	if (dptr->olddispq != NULL)
402 		kmem_free(dptr->olddispq,
403 		    dptr->oldnglobpris * sizeof (dispq_t));
404 	if (dptr->olddqactmap != NULL)
405 		kmem_free(dptr->olddqactmap,
406 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
407 }
408 
409 /*
410  * For a newly created CPU, initialize the dispatch queue.
411  * This is called before the CPU is known through cpu[] or on any lists.
412  */
413 void
414 disp_cpu_init(cpu_t *cp)
415 {
416 	disp_t	*dp;
417 	dispq_t	*newdispq;
418 	ulong_t	*newdqactmap;
419 
420 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
421 
422 	if (cp == cpu0_disp.disp_cpu)
423 		dp = &cpu0_disp;
424 	else
425 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
426 	bzero(dp, sizeof (disp_t));
427 	cp->cpu_disp = dp;
428 	dp->disp_cpu = cp;
429 	dp->disp_maxrunpri = -1;
430 	dp->disp_max_unbound_pri = -1;
431 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
432 	/*
433 	 * Allocate memory for the dispatcher queue headers
434 	 * and the active queue bitmap.
435 	 */
436 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
437 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
438 	    sizeof (long), KM_SLEEP);
439 	dp->disp_q = newdispq;
440 	dp->disp_qactmap = newdqactmap;
441 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
442 	dp->disp_npri = v.v_nglobpris;
443 }
444 
445 void
446 disp_cpu_fini(cpu_t *cp)
447 {
448 	ASSERT(MUTEX_HELD(&cpu_lock));
449 
450 	disp_kp_free(cp->cpu_disp);
451 	if (cp->cpu_disp != &cpu0_disp)
452 		kmem_free(cp->cpu_disp, sizeof (disp_t));
453 }
454 
455 /*
456  * Allocate new, larger kpreempt dispatch queue to replace the old one.
457  */
458 void
459 disp_kp_alloc(disp_t *dq, pri_t npri)
460 {
461 	struct disp_queue_info	mem_info;
462 
463 	if (npri > dq->disp_npri) {
464 		/*
465 		 * Allocate memory for the new array.
466 		 */
467 		disp_dq_alloc(&mem_info, npri, dq);
468 
469 		/*
470 		 * We need to copy the old structures to the new
471 		 * and free the old.
472 		 */
473 		disp_dq_assign(&mem_info, npri);
474 		disp_dq_free(&mem_info);
475 	}
476 }
477 
478 /*
479  * Free dispatch queue.
480  * Used for the kpreempt queues for a removed CPU partition and
481  * for the per-CPU queues of deleted CPUs.
482  */
483 void
484 disp_kp_free(disp_t *dq)
485 {
486 	struct disp_queue_info	mem_info;
487 
488 	mem_info.olddispq = dq->disp_q;
489 	mem_info.olddqactmap = dq->disp_qactmap;
490 	mem_info.oldnglobpris = dq->disp_npri;
491 	disp_dq_free(&mem_info);
492 }
493 
494 /*
495  * End dispatcher and scheduler initialization.
496  */
497 
498 /*
499  * See if there's anything to do other than remain idle.
500  * Return non-zero if there is.
501  *
502  * This function must be called with high spl, or with
503  * kernel preemption disabled to prevent the partition's
504  * active cpu list from changing while being traversed.
505  *
506  */
507 int
508 disp_anywork(void)
509 {
510 	cpu_t   *cp = CPU;
511 	cpu_t   *ocp;
512 
513 	if (cp->cpu_disp->disp_nrunnable != 0)
514 		return (1);
515 
516 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
517 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
518 			return (1);
519 
520 		/*
521 		 * Work can be taken from another CPU if:
522 		 *	- There is unbound work on the run queue
523 		 *	- That work isn't a thread undergoing a
524 		 *	- context switch on an otherwise empty queue.
525 		 *	- The CPU isn't running the idle loop.
526 		 */
527 		for (ocp = cp->cpu_next_part; ocp != cp;
528 		    ocp = ocp->cpu_next_part) {
529 			ASSERT(CPU_ACTIVE(ocp));
530 
531 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
532 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
533 			    ocp->cpu_disp->disp_nrunnable == 1) &&
534 			    ocp->cpu_dispatch_pri != -1)
535 				return (1);
536 		}
537 	}
538 	return (0);
539 }
540 
541 /*
542  * Called when CPU enters the idle loop
543  */
544 static void
545 idle_enter()
546 {
547 	cpu_t		*cp = CPU;
548 
549 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
550 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
551 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
552 }
553 
554 /*
555  * Called when CPU exits the idle loop
556  */
557 static void
558 idle_exit()
559 {
560 	cpu_t		*cp = CPU;
561 
562 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
563 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
564 }
565 
/*
 * Idle loop.
 *
 * Loops forever.  While nothing is runnable the platform idle routine
 * (*idle_cpu)() is called; once this CPU's dispatch queue has runnable
 * threads the loop switches away with swtch(), and on a multiprocessor
 * a thread obtained from disp_getwork() is run with swtch_to().  Each
 * idling period is bracketed by idle_enter() and idle_exit().
 */
void
idle()
{
	struct cpu	*cp = CPU;		/* pointer to this CPU */
	kthread_t	*t;			/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			/* Our own queue has work: let swtch() pick it. */
			idle_exit();
			swtch();
		} else {
			/* An offline CPU does not look for work elsewhere. */
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				/*
				 * Nothing to take.  Reset cpu_chosen_level
				 * if the partition's kp queue is empty too,
				 * then idle.
				 */
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/*
			 * If there was a thread but we couldn't steal
			 * it, then keep trying.
			 */
			if (t == T_DONTSTEAL)
				continue;
			idle_exit();
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}
639 
640 
641 /*
642  * Preempt the currently running thread in favor of the highest
643  * priority thread.  The class of the current thread controls
644  * where it goes on the dispatcher queues. If panicking, turn
645  * preemption off.
646  */
647 void
648 preempt()
649 {
650 	kthread_t 	*t = curthread;
651 	klwp_t 		*lwp = ttolwp(curthread);
652 
653 	if (panicstr)
654 		return;
655 
656 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
657 
658 	thread_lock(t);
659 
660 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
661 		/*
662 		 * this thread has already been chosen to be run on
663 		 * another CPU. Clear kprunrun on this CPU since we're
664 		 * already headed for swtch().
665 		 */
666 		CPU->cpu_kprunrun = 0;
667 		thread_unlock_nopreempt(t);
668 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
669 	} else {
670 		if (lwp != NULL)
671 			lwp->lwp_ru.nivcsw++;
672 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
673 		THREAD_TRANSITION(t);
674 		CL_PREEMPT(t);
675 		DTRACE_SCHED(preempt);
676 		thread_unlock_nopreempt(t);
677 
678 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
679 
680 		swtch();		/* clears CPU->cpu_runrun via disp() */
681 	}
682 }
683 
684 extern kthread_t *thread_unpin();
685 
/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 *
 * The partition's kp queue is looked at first: while it holds work at a
 * priority at least as high as the best on this CPU's own queue (and the
 * CPU is not offline), a thread is taken from it with disp_getbest().
 * Otherwise the first thread on this CPU's highest priority non-empty
 * queue is dequeued.  If this CPU's queue is empty, disp_getwork() is
 * tried unless the CPU is offline, and failing that the CPU's idle
 * thread is returned.
 *
 * A thread chosen from this CPU's own queue is passed to disp_ratify();
 * if that returns NULL the selection starts over.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL ||
			    tp == T_DONTSTEAL) {
				/*
				 * Nothing could be taken: run the idle
				 * thread.
				 * NOTE(review): splhigh() is presumably
				 * needed here because the lock was dropped
				 * with disp_lock_exit() rather than
				 * disp_lock_exit_high() - confirm.
				 */
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			/* Offline with an empty queue: idle thread only. */
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		return (tp);
	}

	/* Take the thread at the head of the highest priority queue. */
	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.  If nrunnable != 0, we just took
		 * the last runnable thread off the highest queue, so
		 * recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			/* max_unbound_pri can never exceed maxrunpri */
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	/* Start over if disp_ratify() rejects the choice. */
	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	return (tp);
}
827 
/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 *
 *	A thread with t_intr set does not go through disp(); it resumes
 *	the thread returned by thread_unpin() instead.  Otherwise disp()
 *	picks the next thread, and if that is the current thread no
 *	switch takes place.
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Setup and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		/*
		 * Sanity check: the caller should already have moved the
		 * thread out of the "on this CPU and preemptible" state.
		 * The condition is re-tested under the thread lock.
		 */
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			/*
			 * Adjust the PG running count when the CPU leaves
			 * or enters the idle thread.
			 */
			if (t == cp->cpu_idle_thread) {
				PG_NRUN_UPDATE(cp, 1);
			} else if (next == cp->cpu_idle_thread) {
				PG_NRUN_UPDATE(cp, -1);
			}

			/*
			 * If t was previously in the TS_ONPROC state,
			 * setfrontdq and setbackdq won't have set its t_waitrq.
			 * Since we now finally know that we're switching away
			 * from this thread, set its t_waitrq if it is on a run
			 * queue.
			 */
			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
				t->t_waitrq = gethrtime_unscaled();
			}

			/*
			 * restore mstate of thread that we are switching to
			 */
			restore_mstate(next);

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			/* record when this CPU and thread last switched */
			cp->cpu_last_swtch = t->t_disp_time = lbolt;
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			/* disp() chose the current thread: keep running. */
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}
926 
927 /*
928  * swtch_from_zombie()
929  *	Special case of swtch(), which allows checks for TS_ZOMB to be
930  *	eliminated from normal resume.
931  *	Find best runnable thread and run it.
932  *	Called with the current thread zombied.
933  *	Zombies cannot migrate, so CPU references are safe.
934  */
935 void
936 swtch_from_zombie()
937 {
938 	kthread_t	*next;
939 	cpu_t		*cpu = CPU;
940 
941 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
942 
943 	ASSERT(curthread->t_state == TS_ZOMB);
944 
945 	next = disp();			/* returns with spl high */
946 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
947 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
948 	ASSERT(next != curthread);
949 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
950 
951 	if (next == cpu->cpu_idle_thread)
952 		PG_NRUN_UPDATE(cpu, -1);
953 
954 	restore_mstate(next);
955 
956 	if (dtrace_vtime_active)
957 		dtrace_vtime_switch(next);
958 
959 	resume_from_zombie(next);
960 	/*
961 	 * The TR_RESUME_END and TR_SWTCH_END trace points
962 	 * appear at the end of resume(), because we certainly will not
963 	 * return here
964 	 */
965 }
966 
967 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
968 static int
969 thread_on_queue(kthread_t *tp)
970 {
971 	cpu_t	*cp;
972 	cpu_t	*self;
973 	disp_t	*dp;
974 
975 	self = CPU;
976 	cp = self->cpu_next_onln;
977 	dp = cp->cpu_disp;
978 	for (;;) {
979 		dispq_t		*dq;
980 		dispq_t		*eq;
981 
982 		disp_lock_enter_high(&dp->disp_lock);
983 		for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
984 			kthread_t	*rp;
985 
986 			ASSERT(dq->dq_last == NULL ||
987 			    dq->dq_last->t_link == NULL);
988 			for (rp = dq->dq_first; rp; rp = rp->t_link)
989 				if (tp == rp) {
990 					disp_lock_exit_high(&dp->disp_lock);
991 					return (1);
992 				}
993 		}
994 		disp_lock_exit_high(&dp->disp_lock);
995 		if (cp == NULL)
996 			break;
997 		if (cp == self) {
998 			cp = NULL;
999 			dp = &cp->cpu_part->cp_kp_queue;
1000 		} else {
1001 			cp = cp->cpu_next_onln;
1002 			dp = cp->cpu_disp;
1003 		}
1004 	}
1005 	return (0);
1006 }	/* end of thread_on_queue */
1007 #else
1008 
1009 #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
1010 
1011 #endif  /* DEBUG */
1012 
1013 /*
1014  * like swtch(), but switch to a specified thread taken from another CPU.
1015  *	called with spl high..
1016  */
1017 void
1018 swtch_to(kthread_t *next)
1019 {
1020 	cpu_t			*cp = CPU;
1021 
1022 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1023 
1024 	/*
1025 	 * Update context switch statistics.
1026 	 */
1027 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1028 
1029 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1030 
1031 	if (curthread == cp->cpu_idle_thread)
1032 		PG_NRUN_UPDATE(cp, 1);
1033 
1034 	/* OK to steal anything left on run queue */
1035 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1036 
1037 	/* record last execution time */
1038 	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;
1039 
1040 	/*
1041 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1042 	 * won't have set its t_waitrq.  Since we now finally know that we're
1043 	 * switching away from this thread, set its t_waitrq if it is on a run
1044 	 * queue.
1045 	 */
1046 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1047 		curthread->t_waitrq = gethrtime_unscaled();
1048 	}
1049 
1050 	/* restore next thread to previously running microstate */
1051 	restore_mstate(next);
1052 
1053 	if (dtrace_vtime_active)
1054 		dtrace_vtime_switch(next);
1055 
1056 	resume(next);
1057 	/*
1058 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1059 	 * appear at the end of resume(), because we may not
1060 	 * return here
1061 	 */
1062 }
1063 
1064 
1065 
1066 #define	CPU_IDLING(pri)	((pri) == -1)
1067 
1068 static void
1069 cpu_resched(cpu_t *cp, pri_t tpri)
1070 {
1071 	int	call_poke_cpu = 0;
1072 	pri_t   cpupri = cp->cpu_dispatch_pri;
1073 
1074 	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1075 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1076 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1077 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1078 			cp->cpu_runrun = 1;
1079 			aston(cp->cpu_dispthread);
1080 			if (tpri < kpreemptpri && cp != CPU)
1081 				call_poke_cpu = 1;
1082 		}
1083 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1084 			cp->cpu_kprunrun = 1;
1085 			if (cp != CPU)
1086 				call_poke_cpu = 1;
1087 		}
1088 	}
1089 
1090 	/*
1091 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1092 	 */
1093 	membar_enter();
1094 
1095 	if (call_poke_cpu)
1096 		poke_cpu(cp->cpu_id);
1097 }
1098 
1099 /*
1100  * Perform multi-level CMT load balancing of running threads.
1101  * tp is the thread being enqueued
1102  * cp is the hint CPU (chosen by cpu_choose()).
1103  */
1104 static cpu_t *
1105 cmt_balance(kthread_t *tp, cpu_t *cp)
1106 {
1107 	int		hint, i, cpu, nsiblings;
1108 	int		self = 0;
1109 	group_t		*cmt_pgs, *siblings;
1110 	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
1111 	int		pg_nrun, tpg_nrun;
1112 	int		level = 0;
1113 	cpu_t		*newcp;
1114 
1115 	ASSERT(THREAD_LOCK_HELD(tp));
1116 
1117 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
1118 
1119 	if (GROUP_SIZE(cmt_pgs) == 0)
1120 		return (cp);	/* nothing to do */
1121 
1122 	if (tp == curthread)
1123 		self = 1;
1124 
1125 	/*
1126 	 * Balance across siblings in the CPUs CMT lineage
1127 	 */
1128 	do {
1129 		pg = GROUP_ACCESS(cmt_pgs, level);
1130 
1131 		siblings = pg->cmt_siblings;
1132 		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */
1133 		if (nsiblings == 1)
1134 			continue;	/* nobody to balance against */
1135 
1136 		pg_nrun = pg->cmt_nrunning;
1137 		if (self &&
1138 		    bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid))
1139 			pg_nrun--;	/* Ignore curthread's effect */
1140 
1141 		hint = pg->cmt_hint;
1142 		/*
1143 		 * Check for validity of the hint
1144 		 * It should reference a valid sibling
1145 		 */
1146 		if (hint >= nsiblings)
1147 			hint = pg->cmt_hint = 0;
1148 		else
1149 			pg->cmt_hint++;
1150 
1151 		/*
1152 		 * Find a balancing candidate from among our siblings
1153 		 * "hint" is a hint for where to start looking
1154 		 */
1155 		i = hint;
1156 		do {
1157 			ASSERT(i < nsiblings);
1158 			pg_tmp = GROUP_ACCESS(siblings, i);
1159 
1160 			/*
1161 			 * The candidate must not be us, and must
1162 			 * have some CPU resources in the thread's
1163 			 * partition
1164 			 */
1165 			if (pg_tmp != pg &&
1166 			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
1167 			    ((pg_t *)pg_tmp)->pg_id)) {
1168 				tpg = pg_tmp;
1169 				break;
1170 			}
1171 
1172 			if (++i >= nsiblings)
1173 				i = 0;
1174 		} while (i != hint);
1175 
1176 		if (!tpg)
1177 			continue;	/* no candidates at this level */
1178 
1179 		/*
1180 		 * Check if the balancing target is underloaded
1181 		 * Decide to balance if the target is running fewer
1182 		 * threads, or if it's running the same number of threads
1183 		 * with more online CPUs
1184 		 */
1185 		tpg_nrun = tpg->cmt_nrunning;
1186 		if (pg_nrun > tpg_nrun ||
1187 		    (pg_nrun == tpg_nrun &&
1188 		    (GROUP_SIZE(&tpg->cmt_cpus_actv) >
1189 		    GROUP_SIZE(&pg->cmt_cpus_actv)))) {
1190 			break;
1191 		}
1192 		tpg = NULL;
1193 	} while (++level < GROUP_SIZE(cmt_pgs));
1194 
1195 
1196 	if (tpg) {
1197 		/*
1198 		 * Select an idle CPU from the target PG
1199 		 */
1200 		for (cpu = 0; cpu < GROUP_SIZE(&tpg->cmt_cpus_actv); cpu++) {
1201 			newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
1202 			if (newcp->cpu_part == tp->t_cpupart &&
1203 			    newcp->cpu_dispatch_pri == -1) {
1204 				cp = newcp;
1205 				break;
1206 			}
1207 		}
1208 	}
1209 
1210 	return (cp);
1211 }
1212 
1213 /*
1214  * setbackdq() keeps runqs balanced such that the difference in length
1215  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1216  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1217  * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1218  * try to keep runqs perfectly balanced regardless of the thread priority.
1219  */
1220 #define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
1221 #define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
1222 #define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1223 
1224 /*
1225  * Put the specified thread on the back of the dispatcher
1226  * queue corresponding to its current priority.
1227  *
1228  * Called with the thread in transition, onproc or stopped state
1229  * and locked (transition implies locked) and at high spl.
1230  * Returns with the thread in TS_RUN state and still locked.
1231  */
1232 void
1233 setbackdq(kthread_t *tp)
1234 {
1235 	dispq_t	*dq;
1236 	disp_t		*dp;
1237 	cpu_t		*cp;
1238 	pri_t		tpri;
1239 	int		bound;
1240 
1241 	ASSERT(THREAD_LOCK_HELD(tp));
1242 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1243 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1244 
1245 	/*
1246 	 * If thread is "swapped" or on the swap queue don't
1247 	 * queue it, but wake sched.
1248 	 */
1249 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1250 		disp_swapped_setrun(tp);
1251 		return;
1252 	}
1253 
1254 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1255 		bound = 1;
1256 	else
1257 		bound = 0;
1258 
1259 	tpri = DISP_PRIO(tp);
1260 	if (ncpus == 1)
1261 		cp = tp->t_cpu;
1262 	else if (!bound) {
1263 		if (tpri >= kpqpri) {
1264 			setkpdq(tp, SETKP_BACK);
1265 			return;
1266 		}
1267 		/*
1268 		 * Let cpu_choose suggest a CPU.
1269 		 */
1270 		cp = cpu_choose(tp, tpri);
1271 
1272 		if (tp->t_cpupart == cp->cpu_part) {
1273 			int	qlen;
1274 
1275 			/*
1276 			 * Perform any CMT load balancing
1277 			 */
1278 			cp = cmt_balance(tp, cp);
1279 
1280 			/*
1281 			 * Balance across the run queues
1282 			 */
1283 			qlen = RUNQ_LEN(cp, tpri);
1284 			if (tpri >= RUNQ_MATCH_PRI &&
1285 			    !(tp->t_schedflag & TS_RUNQMATCH))
1286 				qlen -= RUNQ_MAX_DIFF;
1287 			if (qlen > 0) {
1288 				cpu_t *newcp;
1289 
1290 				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1291 					newcp = cp->cpu_next_part;
1292 				} else if ((newcp = cp->cpu_next_lpl) == cp) {
1293 					newcp = cp->cpu_next_part;
1294 				}
1295 
1296 				if (RUNQ_LEN(newcp, tpri) < qlen) {
1297 					DTRACE_PROBE3(runq__balance,
1298 					    kthread_t *, tp,
1299 					    cpu_t *, cp, cpu_t *, newcp);
1300 					cp = newcp;
1301 				}
1302 			}
1303 		} else {
1304 			/*
1305 			 * Migrate to a cpu in the new partition.
1306 			 */
1307 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1308 			    tp->t_lpl, tp->t_pri, NULL);
1309 		}
1310 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1311 	} else {
1312 		/*
1313 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1314 		 * a short time until weak binding that existed when the
1315 		 * strong binding was established has dropped) so we must
1316 		 * favour weak binding over strong.
1317 		 */
1318 		cp = tp->t_weakbound_cpu ?
1319 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1320 	}
1321 	/*
1322 	 * A thread that is ONPROC may be temporarily placed on the run queue
1323 	 * but then chosen to run again by disp.  If the thread we're placing on
1324 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1325 	 * replacement process is actually scheduled in swtch().  In this
1326 	 * situation, curthread is the only thread that could be in the ONPROC
1327 	 * state.
1328 	 */
1329 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1330 		hrtime_t curtime;
1331 
1332 		curtime = gethrtime_unscaled();
1333 		(void) cpu_update_pct(tp, curtime);
1334 		tp->t_waitrq = curtime;
1335 	} else {
1336 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1337 	}
1338 
1339 	dp = cp->cpu_disp;
1340 	disp_lock_enter_high(&dp->disp_lock);
1341 
1342 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1343 	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1344 	    tpri, cp, tp);
1345 
1346 #ifndef NPROBE
1347 	/* Kernel probe */
1348 	if (tnf_tracing_active)
1349 		tnf_thread_queue(tp, cp, tpri);
1350 #endif /* NPROBE */
1351 
1352 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1353 
1354 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1355 	tp->t_disp_queue = dp;
1356 	tp->t_link = NULL;
1357 
1358 	dq = &dp->disp_q[tpri];
1359 	dp->disp_nrunnable++;
1360 	if (!bound)
1361 		dp->disp_steal = 0;
1362 	membar_enter();
1363 
1364 	if (dq->dq_sruncnt++ != 0) {
1365 		ASSERT(dq->dq_first != NULL);
1366 		dq->dq_last->t_link = tp;
1367 		dq->dq_last = tp;
1368 	} else {
1369 		ASSERT(dq->dq_first == NULL);
1370 		ASSERT(dq->dq_last == NULL);
1371 		dq->dq_first = dq->dq_last = tp;
1372 		BT_SET(dp->disp_qactmap, tpri);
1373 		if (tpri > dp->disp_maxrunpri) {
1374 			dp->disp_maxrunpri = tpri;
1375 			membar_enter();
1376 			cpu_resched(cp, tpri);
1377 		}
1378 	}
1379 
1380 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1381 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1382 		    cp == CPU) {
1383 			/*
1384 			 * If there are no other unbound threads on the
1385 			 * run queue, don't allow other CPUs to steal
1386 			 * this thread while we are in the middle of a
1387 			 * context switch. We may just switch to it
1388 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1389 			 * in swtch and swtch_to.
1390 			 */
1391 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1392 		}
1393 		dp->disp_max_unbound_pri = tpri;
1394 	}
1395 	(*disp_enq_thread)(cp, bound);
1396 }
1397 
1398 /*
1399  * Put the specified thread on the front of the dispatcher
1400  * queue corresponding to its current priority.
1401  *
1402  * Called with the thread in transition, onproc or stopped state
1403  * and locked (transition implies locked) and at high spl.
1404  * Returns with the thread in TS_RUN state and still locked.
1405  */
1406 void
1407 setfrontdq(kthread_t *tp)
1408 {
1409 	disp_t		*dp;
1410 	dispq_t		*dq;
1411 	cpu_t		*cp;
1412 	pri_t		tpri;
1413 	int		bound;
1414 
1415 	ASSERT(THREAD_LOCK_HELD(tp));
1416 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1417 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1418 
1419 	/*
1420 	 * If thread is "swapped" or on the swap queue don't
1421 	 * queue it, but wake sched.
1422 	 */
1423 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1424 		disp_swapped_setrun(tp);
1425 		return;
1426 	}
1427 
1428 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1429 		bound = 1;
1430 	else
1431 		bound = 0;
1432 
1433 	tpri = DISP_PRIO(tp);
1434 	if (ncpus == 1)
1435 		cp = tp->t_cpu;
1436 	else if (!bound) {
1437 		if (tpri >= kpqpri) {
1438 			setkpdq(tp, SETKP_FRONT);
1439 			return;
1440 		}
1441 		cp = tp->t_cpu;
1442 		if (tp->t_cpupart == cp->cpu_part) {
1443 			/*
1444 			 * If we are of higher or equal priority than
1445 			 * the highest priority runnable thread of
1446 			 * the current CPU, just pick this CPU.  Otherwise
1447 			 * Let cpu_choose() select the CPU.  If this cpu
1448 			 * is the target of an offline request then do not
1449 			 * pick it - a thread_nomigrate() on the in motion
1450 			 * cpu relies on this when it forces a preempt.
1451 			 */
1452 			if (tpri < cp->cpu_disp->disp_maxrunpri ||
1453 			    cp == cpu_inmotion)
1454 				cp = cpu_choose(tp, tpri);
1455 		} else {
1456 			/*
1457 			 * Migrate to a cpu in the new partition.
1458 			 */
1459 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1460 			    tp->t_lpl, tp->t_pri, NULL);
1461 		}
1462 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1463 	} else {
1464 		/*
1465 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1466 		 * a short time until weak binding that existed when the
1467 		 * strong binding was established has dropped) so we must
1468 		 * favour weak binding over strong.
1469 		 */
1470 		cp = tp->t_weakbound_cpu ?
1471 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1472 	}
1473 
1474 	/*
1475 	 * A thread that is ONPROC may be temporarily placed on the run queue
1476 	 * but then chosen to run again by disp.  If the thread we're placing on
1477 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1478 	 * replacement process is actually scheduled in swtch().  In this
1479 	 * situation, curthread is the only thread that could be in the ONPROC
1480 	 * state.
1481 	 */
1482 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1483 		hrtime_t curtime;
1484 
1485 		curtime = gethrtime_unscaled();
1486 		(void) cpu_update_pct(tp, curtime);
1487 		tp->t_waitrq = curtime;
1488 	} else {
1489 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1490 	}
1491 
1492 	dp = cp->cpu_disp;
1493 	disp_lock_enter_high(&dp->disp_lock);
1494 
1495 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1496 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1497 
1498 #ifndef NPROBE
1499 	/* Kernel probe */
1500 	if (tnf_tracing_active)
1501 		tnf_thread_queue(tp, cp, tpri);
1502 #endif /* NPROBE */
1503 
1504 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1505 
1506 	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
1507 	tp->t_disp_queue = dp;
1508 
1509 	dq = &dp->disp_q[tpri];
1510 	dp->disp_nrunnable++;
1511 	if (!bound)
1512 		dp->disp_steal = 0;
1513 	membar_enter();
1514 
1515 	if (dq->dq_sruncnt++ != 0) {
1516 		ASSERT(dq->dq_last != NULL);
1517 		tp->t_link = dq->dq_first;
1518 		dq->dq_first = tp;
1519 	} else {
1520 		ASSERT(dq->dq_last == NULL);
1521 		ASSERT(dq->dq_first == NULL);
1522 		tp->t_link = NULL;
1523 		dq->dq_first = dq->dq_last = tp;
1524 		BT_SET(dp->disp_qactmap, tpri);
1525 		if (tpri > dp->disp_maxrunpri) {
1526 			dp->disp_maxrunpri = tpri;
1527 			membar_enter();
1528 			cpu_resched(cp, tpri);
1529 		}
1530 	}
1531 
1532 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1533 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1534 		    cp == CPU) {
1535 			/*
1536 			 * If there are no other unbound threads on the
1537 			 * run queue, don't allow other CPUs to steal
1538 			 * this thread while we are in the middle of a
1539 			 * context switch. We may just switch to it
1540 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1541 			 * in swtch and swtch_to.
1542 			 */
1543 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1544 		}
1545 		dp->disp_max_unbound_pri = tpri;
1546 	}
1547 	(*disp_enq_thread)(cp, bound);
1548 }
1549 
1550 /*
1551  * Put a high-priority unbound thread on the kp queue
1552  */
1553 static void
1554 setkpdq(kthread_t *tp, int borf)
1555 {
1556 	dispq_t	*dq;
1557 	disp_t	*dp;
1558 	cpu_t	*cp;
1559 	pri_t	tpri;
1560 
1561 	tpri = DISP_PRIO(tp);
1562 
1563 	dp = &tp->t_cpupart->cp_kp_queue;
1564 	disp_lock_enter_high(&dp->disp_lock);
1565 
1566 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1567 
1568 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1569 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1570 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1571 	tp->t_disp_queue = dp;
1572 	dp->disp_nrunnable++;
1573 	dq = &dp->disp_q[tpri];
1574 
1575 	if (dq->dq_sruncnt++ != 0) {
1576 		if (borf == SETKP_BACK) {
1577 			ASSERT(dq->dq_first != NULL);
1578 			tp->t_link = NULL;
1579 			dq->dq_last->t_link = tp;
1580 			dq->dq_last = tp;
1581 		} else {
1582 			ASSERT(dq->dq_last != NULL);
1583 			tp->t_link = dq->dq_first;
1584 			dq->dq_first = tp;
1585 		}
1586 	} else {
1587 		if (borf == SETKP_BACK) {
1588 			ASSERT(dq->dq_first == NULL);
1589 			ASSERT(dq->dq_last == NULL);
1590 			dq->dq_first = dq->dq_last = tp;
1591 		} else {
1592 			ASSERT(dq->dq_last == NULL);
1593 			ASSERT(dq->dq_first == NULL);
1594 			tp->t_link = NULL;
1595 			dq->dq_first = dq->dq_last = tp;
1596 		}
1597 		BT_SET(dp->disp_qactmap, tpri);
1598 		if (tpri > dp->disp_max_unbound_pri)
1599 			dp->disp_max_unbound_pri = tpri;
1600 		if (tpri > dp->disp_maxrunpri) {
1601 			dp->disp_maxrunpri = tpri;
1602 			membar_enter();
1603 		}
1604 	}
1605 
1606 	cp = tp->t_cpu;
1607 	if (tp->t_cpupart != cp->cpu_part) {
1608 		/* migrate to a cpu in the new partition */
1609 		cp = tp->t_cpupart->cp_cpulist;
1610 	}
1611 	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1612 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1613 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1614 
1615 #ifndef NPROBE
1616 	/* Kernel probe */
1617 	if (tnf_tracing_active)
1618 		tnf_thread_queue(tp, cp, tpri);
1619 #endif /* NPROBE */
1620 
1621 	if (cp->cpu_chosen_level < tpri)
1622 		cp->cpu_chosen_level = tpri;
1623 	cpu_resched(cp, tpri);
1624 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1625 	(*disp_enq_thread)(cp, 0);
1626 }
1627 
1628 /*
1629  * Remove a thread from the dispatcher queue if it is on it.
1630  * It is not an error if it is not found but we return whether
1631  * or not it was found in case the caller wants to check.
1632  */
1633 int
1634 dispdeq(kthread_t *tp)
1635 {
1636 	disp_t		*dp;
1637 	dispq_t		*dq;
1638 	kthread_t	*rp;
1639 	kthread_t	*trp;
1640 	kthread_t	**ptp;
1641 	int		tpri;
1642 
1643 	ASSERT(THREAD_LOCK_HELD(tp));
1644 
1645 	if (tp->t_state != TS_RUN)
1646 		return (0);
1647 
1648 	/*
1649 	 * The thread is "swapped" or is on the swap queue and
1650 	 * hence no longer on the run queue, so return true.
1651 	 */
1652 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1653 		return (1);
1654 
1655 	tpri = DISP_PRIO(tp);
1656 	dp = tp->t_disp_queue;
1657 	ASSERT(tpri < dp->disp_npri);
1658 	dq = &dp->disp_q[tpri];
1659 	ptp = &dq->dq_first;
1660 	rp = *ptp;
1661 	trp = NULL;
1662 
1663 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1664 
1665 	/*
1666 	 * Search for thread in queue.
1667 	 * Double links would simplify this at the expense of disp/setrun.
1668 	 */
1669 	while (rp != tp && rp != NULL) {
1670 		trp = rp;
1671 		ptp = &trp->t_link;
1672 		rp = trp->t_link;
1673 	}
1674 
1675 	if (rp == NULL) {
1676 		panic("dispdeq: thread not on queue");
1677 	}
1678 
1679 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1680 
1681 	/*
1682 	 * Found it so remove it from queue.
1683 	 */
1684 	if ((*ptp = rp->t_link) == NULL)
1685 		dq->dq_last = trp;
1686 
1687 	dp->disp_nrunnable--;
1688 	if (--dq->dq_sruncnt == 0) {
1689 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1690 		if (dp->disp_nrunnable == 0) {
1691 			dp->disp_max_unbound_pri = -1;
1692 			dp->disp_maxrunpri = -1;
1693 		} else if (tpri == dp->disp_maxrunpri) {
1694 			int ipri;
1695 
1696 			ipri = bt_gethighbit(dp->disp_qactmap,
1697 			    dp->disp_maxrunpri >> BT_ULSHIFT);
1698 			if (ipri < dp->disp_max_unbound_pri)
1699 				dp->disp_max_unbound_pri = ipri;
1700 			dp->disp_maxrunpri = ipri;
1701 		}
1702 	}
1703 	tp->t_link = NULL;
1704 	THREAD_TRANSITION(tp);		/* put in intermediate state */
1705 	return (1);
1706 }
1707 
1708 
1709 /*
1710  * dq_sruninc and dq_srundec are public functions for
1711  * incrementing/decrementing the sruncnts when a thread on
1712  * a dispatcher queue is made schedulable/unschedulable by
1713  * resetting the TS_LOAD flag.
1714  *
1715  * The caller MUST have the thread lock and therefore the dispatcher
1716  * queue lock so that the operation which changes
1717  * the flag, the operation that checks the status of the thread to
1718  * determine if it's on a disp queue AND the call to this function
1719  * are one atomic operation with respect to interrupts.
1720  */
1721 
1722 /*
1723  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1724  */
1725 void
1726 dq_sruninc(kthread_t *t)
1727 {
1728 	ASSERT(t->t_state == TS_RUN);
1729 	ASSERT(t->t_schedflag & TS_LOAD);
1730 
1731 	THREAD_TRANSITION(t);
1732 	setfrontdq(t);
1733 }
1734 
1735 /*
1736  * See comment on calling conventions above.
1737  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1738  */
1739 void
1740 dq_srundec(kthread_t *t)
1741 {
1742 	ASSERT(t->t_schedflag & TS_LOAD);
1743 
1744 	(void) dispdeq(t);
1745 	disp_swapped_enq(t);
1746 }
1747 
1748 /*
1749  * Change the dispatcher lock of thread to the "swapped_lock"
1750  * and return with thread lock still held.
1751  *
1752  * Called with thread_lock held, in transition state, and at high spl.
1753  */
1754 void
1755 disp_swapped_enq(kthread_t *tp)
1756 {
1757 	ASSERT(THREAD_LOCK_HELD(tp));
1758 	ASSERT(tp->t_schedflag & TS_LOAD);
1759 
1760 	switch (tp->t_state) {
1761 	case TS_RUN:
1762 		disp_lock_enter_high(&swapped_lock);
1763 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1764 		break;
1765 	case TS_ONPROC:
1766 		disp_lock_enter_high(&swapped_lock);
1767 		THREAD_TRANSITION(tp);
1768 		wake_sched_sec = 1;		/* tell clock to wake sched */
1769 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1770 		break;
1771 	default:
1772 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1773 	}
1774 }
1775 
1776 /*
1777  * This routine is called by setbackdq/setfrontdq if the thread is
1778  * not loaded or loaded and on the swap queue.
1779  *
1780  * Thread state TS_SLEEP implies that a swapped thread
1781  * has been woken up and needs to be swapped in by the swapper.
1782  *
1783  * Thread state TS_RUN, it implies that the priority of a swapped
1784  * thread is being increased by scheduling class (e.g. ts_update).
1785  */
1786 static void
1787 disp_swapped_setrun(kthread_t *tp)
1788 {
1789 	ASSERT(THREAD_LOCK_HELD(tp));
1790 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1791 
1792 	switch (tp->t_state) {
1793 	case TS_SLEEP:
1794 		disp_lock_enter_high(&swapped_lock);
1795 		/*
1796 		 * Wakeup sched immediately (i.e., next tick) if the
1797 		 * thread priority is above maxclsyspri.
1798 		 */
1799 		if (DISP_PRIO(tp) > maxclsyspri)
1800 			wake_sched = 1;
1801 		else
1802 			wake_sched_sec = 1;
1803 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1804 		break;
1805 	case TS_RUN:				/* called from ts_update */
1806 		break;
1807 	default:
1808 		panic("disp_swapped_setrun: tp: %p bad t_state", tp);
1809 	}
1810 }
1811 
1812 
1813 /*
1814  *	Make a thread give up its processor.  Find the processor on
1815  *	which this thread is executing, and have that processor
1816  *	preempt.
1817  */
1818 void
1819 cpu_surrender(kthread_t *tp)
1820 {
1821 	cpu_t	*cpup;
1822 	int	max_pri;
1823 	int	max_run_pri;
1824 	klwp_t	*lwp;
1825 
1826 	ASSERT(THREAD_LOCK_HELD(tp));
1827 
1828 	if (tp->t_state != TS_ONPROC)
1829 		return;
1830 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
1831 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1832 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1833 	if (max_pri < max_run_pri)
1834 		max_pri = max_run_pri;
1835 
1836 	cpup->cpu_runrun = 1;
1837 	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1838 		cpup->cpu_kprunrun = 1;
1839 	}
1840 
1841 	/*
1842 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1843 	 */
1844 	membar_enter();
1845 
1846 	DTRACE_SCHED1(surrender, kthread_t *, tp);
1847 
1848 	/*
1849 	 * Make the target thread take an excursion through trap()
1850 	 * to do preempt() (unless we're already in trap or post_syscall,
1851 	 * calling cpu_surrender via CL_TRAPRET).
1852 	 */
1853 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1854 	    lwp->lwp_state != LWP_USER) {
1855 		aston(tp);
1856 		if (cpup != CPU)
1857 			poke_cpu(cpup->cpu_id);
1858 	}
1859 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1860 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
1861 }
1862 
1863 
1864 /*
1865  * Commit to and ratify a scheduling decision
1866  */
1867 /*ARGSUSED*/
1868 static kthread_t *
1869 disp_ratify(kthread_t *tp, disp_t *kpq)
1870 {
1871 	pri_t	tpri, maxpri;
1872 	pri_t	maxkpri;
1873 	cpu_t	*cpup;
1874 
1875 	ASSERT(tp != NULL);
1876 	/*
1877 	 * Commit to, then ratify scheduling decision
1878 	 */
1879 	cpup = CPU;
1880 	if (cpup->cpu_runrun != 0)
1881 		cpup->cpu_runrun = 0;
1882 	if (cpup->cpu_kprunrun != 0)
1883 		cpup->cpu_kprunrun = 0;
1884 	if (cpup->cpu_chosen_level != -1)
1885 		cpup->cpu_chosen_level = -1;
1886 	membar_enter();
1887 	tpri = DISP_PRIO(tp);
1888 	maxpri = cpup->cpu_disp->disp_maxrunpri;
1889 	maxkpri = kpq->disp_maxrunpri;
1890 	if (maxpri < maxkpri)
1891 		maxpri = maxkpri;
1892 	if (tpri < maxpri) {
1893 		/*
1894 		 * should have done better
1895 		 * put this one back and indicate to try again
1896 		 */
1897 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
1898 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1899 		thread_lock_high(tp);
1900 		THREAD_TRANSITION(tp);
1901 		setfrontdq(tp);
1902 		thread_unlock_nopreempt(tp);
1903 
1904 		tp = NULL;
1905 	}
1906 	return (tp);
1907 }
1908 
1909 /*
1910  * See if there is any work on the dispatcher queue for other CPUs.
1911  * If there is, dequeue the best thread and return.
1912  */
1913 static kthread_t *
1914 disp_getwork(cpu_t *cp)
1915 {
1916 	cpu_t		*ocp;		/* other CPU */
1917 	cpu_t		*ocp_start;
1918 	cpu_t		*tcp;		/* target local CPU */
1919 	kthread_t	*tp;
1920 	kthread_t	*retval = NULL;
1921 	pri_t		maxpri;
1922 	disp_t		*kpq;		/* kp queue for this partition */
1923 	lpl_t		*lpl, *lpl_leaf;
1924 	int		hint, leafidx;
1925 	hrtime_t	stealtime;
1926 
1927 	maxpri = -1;
1928 	tcp = NULL;
1929 
1930 	kpq = &cp->cpu_part->cp_kp_queue;
1931 	while (kpq->disp_maxrunpri >= 0) {
1932 		/*
1933 		 * Try to take a thread from the kp_queue.
1934 		 */
1935 		tp = (disp_getbest(kpq));
1936 		if (tp)
1937 			return (disp_ratify(tp, kpq));
1938 	}
1939 
1940 	kpreempt_disable();		/* protect the cpu_active list */
1941 
1942 	/*
1943 	 * Try to find something to do on another CPU's run queue.
1944 	 * Loop through all other CPUs looking for the one with the highest
1945 	 * priority unbound thread.
1946 	 *
1947 	 * On NUMA machines, the partition's CPUs are consulted in order of
1948 	 * distance from the current CPU. This way, the first available
1949 	 * work found is also the closest, and will suffer the least
1950 	 * from being migrated.
1951 	 */
1952 	lpl = lpl_leaf = cp->cpu_lpl;
1953 	hint = leafidx = 0;
1954 
1955 	/*
1956 	 * This loop traverses the lpl hierarchy. Higher level lpls represent
1957 	 * broader levels of locality
1958 	 */
1959 	do {
1960 		/* This loop iterates over the lpl's leaves */
1961 		do {
1962 			if (lpl_leaf != cp->cpu_lpl)
1963 				ocp = lpl_leaf->lpl_cpus;
1964 			else
1965 				ocp = cp->cpu_next_lpl;
1966 
1967 			/* This loop iterates over the CPUs in the leaf */
1968 			ocp_start = ocp;
1969 			do {
1970 				pri_t pri;
1971 
1972 				ASSERT(CPU_ACTIVE(ocp));
1973 
1974 				/*
1975 				 * End our stroll around this lpl if:
1976 				 *
1977 				 * - Something became runnable on the local
1978 				 *   queue...which also ends our stroll around
1979 				 *   the partition.
1980 				 *
1981 				 * - We happen across another idle CPU.
1982 				 *   Since it is patrolling the next portion
1983 				 *   of the lpl's list (assuming it's not
1984 				 *   halted), move to the next higher level
1985 				 *   of locality.
1986 				 */
1987 				if (cp->cpu_disp->disp_nrunnable != 0) {
1988 					kpreempt_enable();
1989 					return (NULL);
1990 				}
1991 				if (ocp->cpu_dispatch_pri == -1) {
1992 					if (ocp->cpu_disp_flags &
1993 					    CPU_DISP_HALTED)
1994 						continue;
1995 					else
1996 						break;
1997 				}
1998 
1999 				/*
2000 				 * If there's only one thread and the CPU
2001 				 * is in the middle of a context switch,
2002 				 * or it's currently running the idle thread,
2003 				 * don't steal it.
2004 				 */
2005 				if ((ocp->cpu_disp_flags &
2006 				    CPU_DISP_DONTSTEAL) &&
2007 				    ocp->cpu_disp->disp_nrunnable == 1)
2008 					continue;
2009 
2010 				pri = ocp->cpu_disp->disp_max_unbound_pri;
2011 				if (pri > maxpri) {
2012 					/*
2013 					 * Don't steal threads that we attempted
2014 					 * to steal recently until they're ready
2015 					 * to be stolen again.
2016 					 */
2017 					stealtime = ocp->cpu_disp->disp_steal;
2018 					if (stealtime == 0 ||
2019 					    stealtime - gethrtime() <= 0) {
2020 						maxpri = pri;
2021 						tcp = ocp;
2022 					} else {
2023 						/*
2024 						 * Don't update tcp, just set
2025 						 * the retval to T_DONTSTEAL, so
2026 						 * that if no acceptable CPUs
2027 						 * are found the return value
2028 						 * will be T_DONTSTEAL rather
2029 						 * then NULL.
2030 						 */
2031 						retval = T_DONTSTEAL;
2032 					}
2033 				}
2034 			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2035 
2036 			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2037 				leafidx = 0;
2038 				lpl_leaf = lpl->lpl_rset[leafidx];
2039 			}
2040 		} while (leafidx != hint);
2041 
2042 		hint = leafidx = lpl->lpl_hint;
2043 		if ((lpl = lpl->lpl_parent) != NULL)
2044 			lpl_leaf = lpl->lpl_rset[hint];
2045 	} while (!tcp && lpl);
2046 
2047 	kpreempt_enable();
2048 
2049 	/*
2050 	 * If another queue looks good, and there is still nothing on
2051 	 * the local queue, try to transfer one or more threads
2052 	 * from it to our queue.
2053 	 */
2054 	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2055 		tp = disp_getbest(tcp->cpu_disp);
2056 		if (tp == NULL || tp == T_DONTSTEAL)
2057 			return (tp);
2058 		return (disp_ratify(tp, kpq));
2059 	}
2060 	return (retval);
2061 }
2062 
2063 
2064 /*
2065  * disp_fix_unbound_pri()
2066  *	Determines the maximum priority of unbound threads on the queue.
2067  *	The priority is kept for the queue, but is only increased, never
2068  *	reduced unless some CPU is looking for something on that queue.
2069  *
2070  *	The priority argument is the known upper limit.
2071  *
2072  *	Perhaps this should be kept accurately, but that probably means
2073  *	separate bitmaps for bound and unbound threads.  Since only idled
2074  *	CPUs will have to do this recalculation, it seems better this way.
2075  */
2076 static void
2077 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2078 {
2079 	kthread_t	*tp;
2080 	dispq_t		*dq;
2081 	ulong_t		*dqactmap = dp->disp_qactmap;
2082 	ulong_t		mapword;
2083 	int		wx;
2084 
2085 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2086 
2087 	ASSERT(pri >= 0);			/* checked by caller */
2088 
2089 	/*
2090 	 * Start the search at the next lowest priority below the supplied
2091 	 * priority.  This depends on the bitmap implementation.
2092 	 */
2093 	do {
2094 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
2095 
2096 		/*
2097 		 * Form mask for all lower priorities in the word.
2098 		 */
2099 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2100 
2101 		/*
2102 		 * Get next lower active priority.
2103 		 */
2104 		if (mapword != 0) {
2105 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2106 		} else if (wx > 0) {
2107 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2108 			if (pri < 0)
2109 				break;
2110 		} else {
2111 			pri = -1;
2112 			break;
2113 		}
2114 
2115 		/*
2116 		 * Search the queue for unbound, runnable threads.
2117 		 */
2118 		dq = &dp->disp_q[pri];
2119 		tp = dq->dq_first;
2120 
2121 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2122 			tp = tp->t_link;
2123 		}
2124 
2125 		/*
2126 		 * If a thread was found, set the priority and return.
2127 		 */
2128 	} while (tp == NULL);
2129 
2130 	/*
2131 	 * pri holds the maximum unbound thread priority or -1.
2132 	 */
2133 	if (dp->disp_max_unbound_pri != pri)
2134 		dp->disp_max_unbound_pri = pri;
2135 }
2136 
2137 /*
2138  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2139  * 	check if the CPU to which is was previously bound should have
2140  * 	its disp_max_unbound_pri increased.
2141  */
2142 void
2143 disp_adjust_unbound_pri(kthread_t *tp)
2144 {
2145 	disp_t *dp;
2146 	pri_t tpri;
2147 
2148 	ASSERT(THREAD_LOCK_HELD(tp));
2149 
2150 	/*
2151 	 * Don't do anything if the thread is not bound, or
2152 	 * currently not runnable or swapped out.
2153 	 */
2154 	if (tp->t_bound_cpu == NULL ||
2155 	    tp->t_state != TS_RUN ||
2156 	    tp->t_schedflag & TS_ON_SWAPQ)
2157 		return;
2158 
2159 	tpri = DISP_PRIO(tp);
2160 	dp = tp->t_bound_cpu->cpu_disp;
2161 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2162 	if (tpri > dp->disp_max_unbound_pri)
2163 		dp->disp_max_unbound_pri = tpri;
2164 }
2165 
2166 /*
2167  * disp_getbest()
2168  *   De-queue the highest priority unbound runnable thread.
2169  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2170  *   Returns NULL if nothing found.
2171  *   Returns T_DONTSTEAL if the thread was not stealable.
2172  *   so that the caller will try again later.
2173  *
2174  *   Passed a pointer to a dispatch queue not associated with this CPU, and
2175  *   its type.
2176  */
2177 static kthread_t *
2178 disp_getbest(disp_t *dp)
2179 {
2180 	kthread_t	*tp;
2181 	dispq_t		*dq;
2182 	pri_t		pri;
2183 	cpu_t		*cp, *tcp;
2184 	boolean_t	allbound;
2185 
2186 	disp_lock_enter(&dp->disp_lock);
2187 
2188 	/*
2189 	 * If there is nothing to run, or the CPU is in the middle of a
2190 	 * context switch of the only thread, return NULL.
2191 	 */
2192 	tcp = dp->disp_cpu;
2193 	cp = CPU;
2194 	pri = dp->disp_max_unbound_pri;
2195 	if (pri == -1 ||
2196 	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2197 	    tcp->cpu_disp->disp_nrunnable == 1)) {
2198 		disp_lock_exit_nopreempt(&dp->disp_lock);
2199 		return (NULL);
2200 	}
2201 
2202 	dq = &dp->disp_q[pri];
2203 
2204 
2205 	/*
2206 	 * Assume that all threads are bound on this queue, and change it
2207 	 * later when we find out that it is not the case.
2208 	 */
2209 	allbound = B_TRUE;
2210 	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2211 		hrtime_t now, nosteal, rqtime;
2212 
2213 		/*
2214 		 * Skip over bound threads which could be here even
2215 		 * though disp_max_unbound_pri indicated this level.
2216 		 */
2217 		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2218 			continue;
2219 
2220 		/*
2221 		 * We've got some unbound threads on this queue, so turn
2222 		 * the allbound flag off now.
2223 		 */
2224 		allbound = B_FALSE;
2225 
2226 		/*
2227 		 * The thread is a candidate for stealing from its run queue. We
2228 		 * don't want to steal threads that became runnable just a
2229 		 * moment ago. This improves CPU affinity for threads that get
2230 		 * preempted for short periods of time and go back on the run
2231 		 * queue.
2232 		 *
2233 		 * We want to let it stay on its run queue if it was only placed
2234 		 * there recently and it was running on the same CPU before that
2235 		 * to preserve its cache investment. For the thread to remain on
2236 		 * its run queue, ALL of the following conditions must be
2237 		 * satisfied:
2238 		 *
2239 		 * - the disp queue should not be the kernel preemption queue
2240 		 * - delayed idle stealing should not be disabled
2241 		 * - nosteal_nsec should be non-zero
2242 		 * - it should run with user priority
2243 		 * - it should be on the run queue of the CPU where it was
2244 		 *   running before being placed on the run queue
2245 		 * - it should be the only thread on the run queue (to prevent
2246 		 *   extra scheduling latency for other threads)
2247 		 * - it should sit on the run queue for less than per-chip
2248 		 *   nosteal interval or global nosteal interval
2249 		 * - in case of CPUs with shared cache it should sit in a run
2250 		 *   queue of a CPU from a different chip
2251 		 *
2252 		 * The checks are arranged so that the ones that are faster are
2253 		 * placed earlier.
2254 		 */
2255 		if (tcp == NULL ||
2256 		    pri >= minclsyspri ||
2257 		    tp->t_cpu != tcp)
2258 			break;
2259 
2260 		/*
2261 		 * Steal immediately if, due to CMT processor architecture
2262 		 * migraiton between cp and tcp would incur no performance
2263 		 * penalty.
2264 		 */
2265 		if (pg_cmt_can_migrate(cp, tcp))
2266 			break;
2267 
2268 		nosteal = nosteal_nsec;
2269 		if (nosteal == 0)
2270 			break;
2271 
2272 		/*
2273 		 * Calculate time spent sitting on run queue
2274 		 */
2275 		now = gethrtime_unscaled();
2276 		rqtime = now - tp->t_waitrq;
2277 		scalehrtime(&rqtime);
2278 
2279 		/*
2280 		 * Steal immediately if the time spent on this run queue is more
2281 		 * than allowed nosteal delay.
2282 		 *
2283 		 * Negative rqtime check is needed here to avoid infinite
2284 		 * stealing delays caused by unlikely but not impossible
2285 		 * drifts between CPU times on different CPUs.
2286 		 */
2287 		if (rqtime > nosteal || rqtime < 0)
2288 			break;
2289 
2290 		DTRACE_PROBE4(nosteal, kthread_t *, tp,
2291 		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2292 		scalehrtime(&now);
2293 		/*
2294 		 * Calculate when this thread becomes stealable
2295 		 */
2296 		now += (nosteal - rqtime);
2297 
2298 		/*
2299 		 * Calculate time when some thread becomes stealable
2300 		 */
2301 		if (now < dp->disp_steal)
2302 			dp->disp_steal = now;
2303 	}
2304 
2305 	/*
2306 	 * If there were no unbound threads on this queue, find the queue
2307 	 * where they are and then return later. The value of
2308 	 * disp_max_unbound_pri is not always accurate because it isn't
2309 	 * reduced until another idle CPU looks for work.
2310 	 */
2311 	if (allbound)
2312 		disp_fix_unbound_pri(dp, pri);
2313 
2314 	/*
2315 	 * If we reached the end of the queue and found no unbound threads
2316 	 * then return NULL so that other CPUs will be considered.  If there
2317 	 * are unbound threads but they cannot yet be stolen, then
2318 	 * return T_DONTSTEAL and try again later.
2319 	 */
2320 	if (tp == NULL) {
2321 		disp_lock_exit_nopreempt(&dp->disp_lock);
2322 		return (allbound ? NULL : T_DONTSTEAL);
2323 	}
2324 
2325 	/*
2326 	 * Found a runnable, unbound thread, so remove it from queue.
2327 	 * dispdeq() requires that we have the thread locked, and we do,
2328 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
2329 	 * put the thread in transition state, thereby dropping the dispq
2330 	 * lock.
2331 	 */
2332 
2333 #ifdef DEBUG
2334 	{
2335 		int	thread_was_on_queue;
2336 
2337 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
2338 		ASSERT(thread_was_on_queue);
2339 	}
2340 
2341 #else /* DEBUG */
2342 	(void) dispdeq(tp);			/* drops disp_lock */
2343 #endif /* DEBUG */
2344 
2345 	/*
2346 	 * Reset the disp_queue steal time - we do not know what is the smallest
2347 	 * value across the queue is.
2348 	 */
2349 	dp->disp_steal = 0;
2350 
2351 	tp->t_schedflag |= TS_DONT_SWAP;
2352 
2353 	/*
2354 	 * Setup thread to run on the current CPU.
2355 	 */
2356 	tp->t_disp_queue = cp->cpu_disp;
2357 
2358 	cp->cpu_dispthread = tp;		/* protected by spl only */
2359 	cp->cpu_dispatch_pri = pri;
2360 	ASSERT(pri == DISP_PRIO(tp));
2361 
2362 	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2363 
2364 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
2365 
2366 	/*
2367 	 * Return with spl high so that swtch() won't need to raise it.
2368 	 * The disp_lock was dropped by dispdeq().
2369 	 */
2370 
2371 	return (tp);
2372 }
2373 
2374 /*
2375  * disp_bound_common() - common routine for higher level functions
2376  *	that check for bound threads under certain conditions.
2377  *	If 'threadlistsafe' is set then there is no need to acquire
2378  *	pidlock to stop the thread list from changing (eg, if
2379  *	disp_bound_* is called with cpus paused).
2380  */
2381 static int
2382 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2383 {
2384 	int		found = 0;
2385 	kthread_t	*tp;
2386 
2387 	ASSERT(flag);
2388 
2389 	if (!threadlistsafe)
2390 		mutex_enter(&pidlock);
2391 	tp = curthread;		/* faster than allthreads */
2392 	do {
2393 		if (tp->t_state != TS_FREE) {
2394 			/*
2395 			 * If an interrupt thread is busy, but the
2396 			 * caller doesn't care (i.e. BOUND_INTR is off),
2397 			 * then just ignore it and continue through.
2398 			 */
2399 			if ((tp->t_flag & T_INTR_THREAD) &&
2400 			    !(flag & BOUND_INTR))
2401 				continue;
2402 
2403 			/*
2404 			 * Skip the idle thread for the CPU
2405 			 * we're about to set offline.
2406 			 */
2407 			if (tp == cp->cpu_idle_thread)
2408 				continue;
2409 
2410 			/*
2411 			 * Skip the pause thread for the CPU
2412 			 * we're about to set offline.
2413 			 */
2414 			if (tp == cp->cpu_pause_thread)
2415 				continue;
2416 
2417 			if ((flag & BOUND_CPU) &&
2418 			    (tp->t_bound_cpu == cp ||
2419 			    tp->t_bind_cpu == cp->cpu_id ||
2420 			    tp->t_weakbound_cpu == cp)) {
2421 				found = 1;
2422 				break;
2423 			}
2424 
2425 			if ((flag & BOUND_PARTITION) &&
2426 			    (tp->t_cpupart == cp->cpu_part)) {
2427 				found = 1;
2428 				break;
2429 			}
2430 		}
2431 	} while ((tp = tp->t_next) != curthread && found == 0);
2432 	if (!threadlistsafe)
2433 		mutex_exit(&pidlock);
2434 	return (found);
2435 }
2436 
2437 /*
2438  * disp_bound_threads - return nonzero if threads are bound to the processor.
2439  *	Called infrequently.  Keep this simple.
2440  *	Includes threads that are asleep or stopped but not onproc.
2441  */
2442 int
2443 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2444 {
2445 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2446 }
2447 
2448 /*
2449  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2450  * to the given processor, including interrupt threads.
2451  */
2452 int
2453 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2454 {
2455 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2456 }
2457 
2458 /*
2459  * disp_bound_partition - return nonzero if threads are bound to the same
2460  * partition as the processor.
2461  *	Called infrequently.  Keep this simple.
2462  *	Includes threads that are asleep or stopped but not onproc.
2463  */
2464 int
2465 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2466 {
2467 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2468 }
2469 
2470 /*
2471  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2472  * threads to other CPUs.
2473  */
2474 void
2475 disp_cpu_inactive(cpu_t *cp)
2476 {
2477 	kthread_t	*tp;
2478 	disp_t		*dp = cp->cpu_disp;
2479 	dispq_t		*dq;
2480 	pri_t		pri;
2481 	int		wasonq;
2482 
2483 	disp_lock_enter(&dp->disp_lock);
2484 	while ((pri = dp->disp_max_unbound_pri) != -1) {
2485 		dq = &dp->disp_q[pri];
2486 		tp = dq->dq_first;
2487 
2488 		/*
2489 		 * Skip over bound threads.
2490 		 */
2491 		while (tp != NULL && tp->t_bound_cpu != NULL) {
2492 			tp = tp->t_link;
2493 		}
2494 
2495 		if (tp == NULL) {
2496 			/* disp_max_unbound_pri must be inaccurate, so fix it */
2497 			disp_fix_unbound_pri(dp, pri);
2498 			continue;
2499 		}
2500 
2501 		wasonq = dispdeq(tp);		/* drops disp_lock */
2502 		ASSERT(wasonq);
2503 		ASSERT(tp->t_weakbound_cpu == NULL);
2504 
2505 		setbackdq(tp);
2506 		/*
2507 		 * Called from cpu_offline:
2508 		 *
2509 		 * cp has already been removed from the list of active cpus
2510 		 * and tp->t_cpu has been changed so there is no risk of
2511 		 * tp ending up back on cp.
2512 		 *
2513 		 * Called from cpupart_move_cpu:
2514 		 *
2515 		 * The cpu has moved to a new cpupart.  Any threads that
2516 		 * were on it's dispatch queues before the move remain
2517 		 * in the old partition and can't run in the new partition.
2518 		 */
2519 		ASSERT(tp->t_cpu != cp);
2520 		thread_unlock(tp);
2521 
2522 		disp_lock_enter(&dp->disp_lock);
2523 	}
2524 	disp_lock_exit(&dp->disp_lock);
2525 }
2526 
2527 /*
2528  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2529  *	The hint passed in is used as a starting point so we don't favor
2530  *	CPU 0 or any other CPU.  The caller should pass in the most recently
2531  *	used CPU for the thread.
2532  *
2533  *	The lgroup and priority are used to determine the best CPU to run on
2534  *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
2535  *	the thread priority will indicate whether the thread will actually run
2536  *	there.  To pick the best CPU, the CPUs inside and outside of the given
2537  *	lgroup which are running the lowest priority threads are found.  The
2538  *	remote CPU is chosen only if the thread will not run locally on a CPU
2539  *	within the lgroup, but will run on the remote CPU. If the thread
2540  *	cannot immediately run on any CPU, the best local CPU will be chosen.
2541  *
2542  *	The lpl specified also identifies the cpu partition from which
2543  *	disp_lowpri_cpu should select a CPU.
2544  *
2545  *	curcpu is used to indicate that disp_lowpri_cpu is being called on
2546  *      behalf of the current thread. (curthread is looking for a new cpu)
2547  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2548  *      ignored.
2549  *
2550  *      If a cpu is the target of an offline request then try to avoid it.
2551  *
2552  *	This function must be called at either high SPL, or with preemption
2553  *	disabled, so that the "hint" CPU cannot be removed from the online
2554  *	CPU list while we are traversing it.
2555  */
2556 cpu_t *
2557 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2558 {
2559 	cpu_t	*bestcpu;
2560 	cpu_t	*besthomecpu;
2561 	cpu_t   *cp, *cpstart;
2562 
2563 	pri_t   bestpri;
2564 	pri_t   cpupri;
2565 
2566 	klgrpset_t	done;
2567 	klgrpset_t	cur_set;
2568 
2569 	lpl_t		*lpl_iter, *lpl_leaf;
2570 	int		i;
2571 
2572 	/*
2573 	 * Scan for a CPU currently running the lowest priority thread.
2574 	 * Cannot get cpu_lock here because it is adaptive.
2575 	 * We do not require lock on CPU list.
2576 	 */
2577 	ASSERT(hint != NULL);
2578 	ASSERT(lpl != NULL);
2579 	ASSERT(lpl->lpl_ncpu > 0);
2580 
2581 	/*
2582 	 * First examine local CPUs. Note that it's possible the hint CPU
2583 	 * passed in in remote to the specified home lgroup. If our priority
2584 	 * isn't sufficient enough such that we can run immediately at home,
2585 	 * then examine CPUs remote to our home lgroup.
2586 	 * We would like to give preference to CPUs closest to "home".
2587 	 * If we can't find a CPU where we'll run at a given level
2588 	 * of locality, we expand our search to include the next level.
2589 	 */
2590 	bestcpu = besthomecpu = NULL;
2591 	klgrpset_clear(done);
2592 	/* start with lpl we were passed */
2593 
2594 	lpl_iter = lpl;
2595 
2596 	do {
2597 
2598 		bestpri = SHRT_MAX;
2599 		klgrpset_clear(cur_set);
2600 
2601 		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2602 			lpl_leaf = lpl_iter->lpl_rset[i];
2603 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2604 				continue;
2605 
2606 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2607 
2608 			if (hint->cpu_lpl == lpl_leaf)
2609 				cp = cpstart = hint;
2610 			else
2611 				cp = cpstart = lpl_leaf->lpl_cpus;
2612 
2613 			do {
2614 				if (cp == curcpu)
2615 					cpupri = -1;
2616 				else if (cp == cpu_inmotion)
2617 					cpupri = SHRT_MAX;
2618 				else
2619 					cpupri = cp->cpu_dispatch_pri;
2620 				if (cp->cpu_disp->disp_maxrunpri > cpupri)
2621 					cpupri = cp->cpu_disp->disp_maxrunpri;
2622 				if (cp->cpu_chosen_level > cpupri)
2623 					cpupri = cp->cpu_chosen_level;
2624 				if (cpupri < bestpri) {
2625 					if (CPU_IDLING(cpupri)) {
2626 						ASSERT((cp->cpu_flags &
2627 						    CPU_QUIESCED) == 0);
2628 						return (cp);
2629 					}
2630 					bestcpu = cp;
2631 					bestpri = cpupri;
2632 				}
2633 			} while ((cp = cp->cpu_next_lpl) != cpstart);
2634 		}
2635 
2636 		if (bestcpu && (tpri > bestpri)) {
2637 			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2638 			return (bestcpu);
2639 		}
2640 		if (besthomecpu == NULL)
2641 			besthomecpu = bestcpu;
2642 		/*
2643 		 * Add the lgrps we just considered to the "done" set
2644 		 */
2645 		klgrpset_or(done, cur_set);
2646 
2647 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2648 
2649 	/*
2650 	 * The specified priority isn't high enough to run immediately
2651 	 * anywhere, so just return the best CPU from the home lgroup.
2652 	 */
2653 	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2654 	return (besthomecpu);
2655 }
2656 
/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor's specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
	/* The generic version has nothing to do. */
}
2668 
2669 /*ARGSUSED*/
2670 static void
2671 generic_enq_thread(cpu_t *cpu, int bound)
2672 {
2673 }
2674 
2675 /*
2676  * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
2677  *	- t->t_cpu is not in this thread's assigned lgrp
2678  *	- the time since the thread last came off t->t_cpu exceeds the
2679  *	  rechoose time for this cpu (ignore this if t is curthread in
2680  *	  which case it's on CPU and t->t_disp_time is inaccurate)
2681  *	- t->t_cpu is presently the target of an offline or partition move
2682  *	  request
2683  */
2684 static cpu_t *
2685 cpu_choose(kthread_t *t, pri_t tpri)
2686 {
2687 	ASSERT(tpri < kpqpri);
2688 
2689 	if ((((lbolt - t->t_disp_time) > rechoose_interval) &&
2690 	    t != curthread) || t->t_cpu == cpu_inmotion) {
2691 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
2692 	}
2693 
2694 	/*
2695 	 * Take a trip through disp_lowpri_cpu() if the thread was
2696 	 * running outside it's home lgroup
2697 	 */
2698 	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
2699 	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
2700 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
2701 		    (t == curthread) ? t->t_cpu : NULL));
2702 	}
2703 	return (t->t_cpu);
2704 }
2705