xref: /titanic_50/usr/src/uts/common/disp/disp.c (revision b86efd96f8acd85ddaa930a2f0c1d664237e4aaf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/signal.h>
36 #include <sys/user.h>
37 #include <sys/systm.h>
38 #include <sys/sysinfo.h>
39 #include <sys/var.h>
40 #include <sys/errno.h>
41 #include <sys/cmn_err.h>
42 #include <sys/debug.h>
43 #include <sys/inline.h>
44 #include <sys/disp.h>
45 #include <sys/class.h>
46 #include <sys/bitmap.h>
47 #include <sys/kmem.h>
48 #include <sys/cpuvar.h>
49 #include <sys/vtrace.h>
50 #include <sys/tnf.h>
51 #include <sys/cpupart.h>
52 #include <sys/lgrp.h>
53 #include <sys/chip.h>
54 #include <sys/schedctl.h>
55 #include <sys/atomic.h>
56 #include <sys/dtrace.h>
57 #include <sys/sdt.h>
58 
59 #include <vm/as.h>
60 
61 #define	BOUND_CPU	0x1
62 #define	BOUND_PARTITION	0x2
63 #define	BOUND_INTR	0x4
64 
65 /* Dispatch queue allocation structure and functions */
66 struct disp_queue_info {
67 	disp_t	*dp;
68 	dispq_t *olddispq;
69 	dispq_t *newdispq;
70 	ulong_t	*olddqactmap;
71 	ulong_t	*newdqactmap;
72 	int	oldnglobpris;
73 };
74 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
75     disp_t *dp);
76 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
77 static void	disp_dq_free(struct disp_queue_info *dptr);
78 
79 /* platform-specific routine to call when processor is idle */
80 static void	generic_idle_cpu();
81 void		(*idle_cpu)() = generic_idle_cpu;
82 
83 /* routines invoked when a CPU enters/exits the idle loop */
84 static void	idle_enter();
85 static void	idle_exit();
86 
87 /* platform-specific routine to call when thread is enqueued */
88 static void	generic_enq_thread(cpu_t *, int);
89 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
90 
91 pri_t	kpreemptpri;		/* priority where kernel preemption applies */
92 pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
93 pri_t	intr_pri;		/* interrupt thread priority base level */
94 
95 #define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
96 pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
97 disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
98 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
99 int	nswapped;		/* total number of swapped threads */
100 void	disp_swapped_enq(kthread_t *tp);
101 static void	disp_swapped_setrun(kthread_t *tp);
102 static void	cpu_resched(cpu_t *cp, pri_t tpri);
103 
104 /*
105  * If this is set, only interrupt threads will cause kernel preemptions.
106  * This is done by changing the value of kpreemptpri.  kpreemptpri
107  * will either be the max sysclass pri + 1 or the min interrupt pri.
108  */
109 int	only_intr_kpreempt;
110 
111 extern void set_idle_cpu(int cpun);
112 extern void unset_idle_cpu(int cpun);
113 static void setkpdq(kthread_t *tp, int borf);
114 #define	SETKP_BACK	0
115 #define	SETKP_FRONT	1
116 /*
117  * Parameter that determines how recently a thread must have run
118  * on the CPU to be considered loosely-bound to that CPU to reduce
119  * cold cache effects.  The interval is in hertz.
120  *
121  * The platform may define a per physical processor adjustment of
122  * this parameter. For efficiency, the effective rechoose interval
123  * (rechoose_interval + per chip adjustment) is maintained in the
124  * cpu structures. See cpu_choose()
125  */
126 int	rechoose_interval = RECHOOSE_INTERVAL;
127 static cpu_t	*cpu_choose(kthread_t *, pri_t);
128 
129 /*
130  * Parameter that determines how long (in nanoseconds) a thread must
131  * be sitting on a run queue before it can be stolen by another CPU
132  * to reduce migrations.  The interval is in nanoseconds.
133  *
134  * The nosteal_nsec should be set by a platform code to an appropriate value.
135  *
136  */
137 hrtime_t nosteal_nsec = 0;
138 
139 /*
140  * Value of nosteal_nsec meaning that nosteal optimization should be disabled
141  */
142 #define	NOSTEAL_DISABLED 1
143 
144 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
145 
146 disp_lock_t	transition_lock;	/* lock on transitioning threads */
147 disp_lock_t	stop_lock;		/* lock on stopped threads */
148 
149 static void	cpu_dispqalloc(int numpris);
150 
151 /*
152  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
153  * a thread because it was sitting on its run queue for a very short
154  * period of time.
155  */
156 #define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
157 
158 static kthread_t	*disp_getwork(cpu_t *to);
159 static kthread_t	*disp_getbest(disp_t *from);
160 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
161 
162 void	swtch_to(kthread_t *);
163 
164 /*
165  * dispatcher and scheduler initialization
166  */
167 
168 /*
169  * disp_setup - Common code to calculate and allocate dispatcher
170  *		variables and structures based on the maximum priority.
171  */
172 static void
173 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
174 {
175 	pri_t	newnglobpris;
176 
177 	ASSERT(MUTEX_HELD(&cpu_lock));
178 
179 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
180 
181 	if (newnglobpris > oldnglobpris) {
182 		/*
183 		 * Allocate new kp queues for each CPU partition.
184 		 */
185 		cpupart_kpqalloc(newnglobpris);
186 
187 		/*
188 		 * Allocate new dispatch queues for each CPU.
189 		 */
190 		cpu_dispqalloc(newnglobpris);
191 
192 		/*
193 		 * compute new interrupt thread base priority
194 		 */
195 		intr_pri = maxglobpri;
196 		if (only_intr_kpreempt) {
197 			kpreemptpri = intr_pri + 1;
198 			if (kpqpri == KPQPRI)
199 				kpqpri = kpreemptpri;
200 		}
201 		v.v_nglobpris = newnglobpris;
202 	}
203 }
204 
205 /*
206  * dispinit - Called to initialize all loaded classes and the
207  *	      dispatcher framework.
208  */
209 void
210 dispinit(void)
211 {
212 	id_t	cid;
213 	pri_t	maxglobpri;
214 	pri_t	cl_maxglobpri;
215 
216 	maxglobpri = -1;
217 
218 	/*
219 	 * Initialize transition lock, which will always be set.
220 	 */
221 	DISP_LOCK_INIT(&transition_lock);
222 	disp_lock_enter_high(&transition_lock);
223 	DISP_LOCK_INIT(&stop_lock);
224 
225 	mutex_enter(&cpu_lock);
226 	CPU->cpu_disp->disp_maxrunpri = -1;
227 	CPU->cpu_disp->disp_max_unbound_pri = -1;
228 	/*
229 	 * Initialize the default CPU partition.
230 	 */
231 	cpupart_initialize_default();
232 	/*
233 	 * Call the class specific initialization functions for
234 	 * all pre-installed schedulers.
235 	 *
236 	 * We pass the size of a class specific parameter
237 	 * buffer to each of the initialization functions
238 	 * to try to catch problems with backward compatibility
239 	 * of class modules.
240 	 *
241 	 * For example a new class module running on an old system
242 	 * which didn't provide sufficiently large parameter buffers
243 	 * would be bad news. Class initialization modules can check for
244 	 * this and take action if they detect a problem.
245 	 */
246 
247 	for (cid = 0; cid < nclass; cid++) {
248 		sclass_t	*sc;
249 
250 		sc = &sclass[cid];
251 		if (SCHED_INSTALLED(sc)) {
252 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
253 			    &sc->cl_funcs);
254 			if (cl_maxglobpri > maxglobpri)
255 				maxglobpri = cl_maxglobpri;
256 		}
257 	}
258 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
259 	if (kpqpri == KPQPRI)
260 		kpqpri = kpreemptpri;
261 
262 	ASSERT(maxglobpri >= 0);
263 	disp_setup(maxglobpri, 0);
264 
265 	mutex_exit(&cpu_lock);
266 
267 	/*
268 	 * Get the default class ID; this may be later modified via
269 	 * dispadmin(1M).  This will load the class (normally TS) and that will
270 	 * call disp_add(), which is why we had to drop cpu_lock first.
271 	 */
272 	if (getcid(defaultclass, &defaultcid) != 0) {
273 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
274 		    defaultclass);
275 	}
276 }
277 
278 /*
279  * disp_add - Called with class pointer to initialize the dispatcher
280  *	      for a newly loaded class.
281  */
282 void
283 disp_add(sclass_t *clp)
284 {
285 	pri_t	maxglobpri;
286 	pri_t	cl_maxglobpri;
287 
288 	mutex_enter(&cpu_lock);
289 	/*
290 	 * Initialize the scheduler class.
291 	 */
292 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
293 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
294 	if (cl_maxglobpri > maxglobpri)
295 		maxglobpri = cl_maxglobpri;
296 
297 	/*
298 	 * Save old queue information.  Since we're initializing a
299 	 * new scheduling class which has just been loaded, then
300 	 * the size of the dispq may have changed.  We need to handle
301 	 * that here.
302 	 */
303 	disp_setup(maxglobpri, v.v_nglobpris);
304 
305 	mutex_exit(&cpu_lock);
306 }
307 
308 
309 /*
310  * For each CPU, allocate new dispatch queues
311  * with the stated number of priorities.
312  */
313 static void
314 cpu_dispqalloc(int numpris)
315 {
316 	cpu_t	*cpup;
317 	struct disp_queue_info	*disp_mem;
318 	int i, num;
319 
320 	ASSERT(MUTEX_HELD(&cpu_lock));
321 
322 	disp_mem = kmem_zalloc(NCPU *
323 	    sizeof (struct disp_queue_info), KM_SLEEP);
324 
325 	/*
326 	 * This routine must allocate all of the memory before stopping
327 	 * the cpus because it must not sleep in kmem_alloc while the
328 	 * CPUs are stopped.  Locks they hold will not be freed until they
329 	 * are restarted.
330 	 */
331 	i = 0;
332 	cpup = cpu_list;
333 	do {
334 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
335 		i++;
336 		cpup = cpup->cpu_next;
337 	} while (cpup != cpu_list);
338 	num = i;
339 
340 	pause_cpus(NULL);
341 	for (i = 0; i < num; i++)
342 		disp_dq_assign(&disp_mem[i], numpris);
343 	start_cpus();
344 
345 	/*
346 	 * I must free all of the memory after starting the cpus because
347 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
348 	 */
349 	for (i = 0; i < num; i++)
350 		disp_dq_free(&disp_mem[i]);
351 
352 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
353 }
354 
355 static void
356 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
357 {
358 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
359 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
360 	    sizeof (long), KM_SLEEP);
361 	dptr->dp = dp;
362 }
363 
/*
 * Install the pre-allocated queues from *dptr into their disp_t,
 * preserving the contents of the old (smaller) queues.  Runs while the
 * CPUs are paused, so nothing here may block or fault.  The displaced
 * queue pointers are saved in *dptr for disp_dq_free().
 */
static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
	disp_t	*dp;

	dp = dptr->dp;
	/* Stash the old storage so it can be freed after start_cpus(). */
	dptr->olddispq = dp->disp_q;
	dptr->olddqactmap = dp->disp_qactmap;
	dptr->oldnglobpris = dp->disp_npri;

	/* Queues only ever grow. */
	ASSERT(dptr->oldnglobpris < numpris);

	if (dptr->olddispq != NULL) {
		/*
		 * Use kcopy because bcopy is platform-specific
		 * and could block while we might have paused the cpus.
		 */
		(void) kcopy(dptr->olddispq, dptr->newdispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
		    sizeof (long));
	}
	dp->disp_q = dptr->newdispq;
	dp->disp_qactmap = dptr->newdqactmap;
	dp->disp_q_limit = &dptr->newdispq[numpris];
	dp->disp_npri = numpris;
}
392 
393 static void
394 disp_dq_free(struct disp_queue_info *dptr)
395 {
396 	if (dptr->olddispq != NULL)
397 		kmem_free(dptr->olddispq,
398 		    dptr->oldnglobpris * sizeof (dispq_t));
399 	if (dptr->olddqactmap != NULL)
400 		kmem_free(dptr->olddqactmap,
401 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
402 }
403 
404 /*
405  * For a newly created CPU, initialize the dispatch queue.
406  * This is called before the CPU is known through cpu[] or on any lists.
407  */
408 void
409 disp_cpu_init(cpu_t *cp)
410 {
411 	disp_t	*dp;
412 	dispq_t	*newdispq;
413 	ulong_t	*newdqactmap;
414 
415 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
416 
417 	if (cp == cpu0_disp.disp_cpu)
418 		dp = &cpu0_disp;
419 	else
420 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
421 	bzero(dp, sizeof (disp_t));
422 	cp->cpu_disp = dp;
423 	dp->disp_cpu = cp;
424 	dp->disp_maxrunpri = -1;
425 	dp->disp_max_unbound_pri = -1;
426 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
427 	/*
428 	 * Allocate memory for the dispatcher queue headers
429 	 * and the active queue bitmap.
430 	 */
431 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
432 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
433 	    sizeof (long), KM_SLEEP);
434 	dp->disp_q = newdispq;
435 	dp->disp_qactmap = newdqactmap;
436 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
437 	dp->disp_npri = v.v_nglobpris;
438 }
439 
440 void
441 disp_cpu_fini(cpu_t *cp)
442 {
443 	ASSERT(MUTEX_HELD(&cpu_lock));
444 
445 	disp_kp_free(cp->cpu_disp);
446 	if (cp->cpu_disp != &cpu0_disp)
447 		kmem_free(cp->cpu_disp, sizeof (disp_t));
448 }
449 
450 /*
451  * Allocate new, larger kpreempt dispatch queue to replace the old one.
452  */
453 void
454 disp_kp_alloc(disp_t *dq, pri_t npri)
455 {
456 	struct disp_queue_info	mem_info;
457 
458 	if (npri > dq->disp_npri) {
459 		/*
460 		 * Allocate memory for the new array.
461 		 */
462 		disp_dq_alloc(&mem_info, npri, dq);
463 
464 		/*
465 		 * We need to copy the old structures to the new
466 		 * and free the old.
467 		 */
468 		disp_dq_assign(&mem_info, npri);
469 		disp_dq_free(&mem_info);
470 	}
471 }
472 
473 /*
474  * Free dispatch queue.
475  * Used for the kpreempt queues for a removed CPU partition and
476  * for the per-CPU queues of deleted CPUs.
477  */
478 void
479 disp_kp_free(disp_t *dq)
480 {
481 	struct disp_queue_info	mem_info;
482 
483 	mem_info.olddispq = dq->disp_q;
484 	mem_info.olddqactmap = dq->disp_qactmap;
485 	mem_info.oldnglobpris = dq->disp_npri;
486 	disp_dq_free(&mem_info);
487 }
488 
489 /*
490  * End dispatcher and scheduler initialization.
491  */
492 
493 /*
494  * See if there's anything to do other than remain idle.
495  * Return non-zero if there is.
496  *
497  * This function must be called with high spl, or with
498  * kernel preemption disabled to prevent the partition's
499  * active cpu list from changing while being traversed.
500  *
501  */
502 int
503 disp_anywork(void)
504 {
505 	cpu_t   *cp = CPU;
506 	cpu_t   *ocp;
507 
508 	if (cp->cpu_disp->disp_nrunnable != 0)
509 		return (1);
510 
511 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
512 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
513 			return (1);
514 
515 		/*
516 		 * Work can be taken from another CPU if:
517 		 *	- There is unbound work on the run queue
518 		 *	- That work isn't a thread undergoing a
519 		 *	- context switch on an otherwise empty queue.
520 		 *	- The CPU isn't running the idle loop.
521 		 */
522 		for (ocp = cp->cpu_next_part; ocp != cp;
523 		    ocp = ocp->cpu_next_part) {
524 			ASSERT(CPU_ACTIVE(ocp));
525 
526 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
527 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
528 			    ocp->cpu_disp->disp_nrunnable == 1) &&
529 			    ocp->cpu_dispatch_pri != -1)
530 				return (1);
531 		}
532 	}
533 	return (0);
534 }
535 
536 /*
537  * Called when CPU enters the idle loop
538  */
539 static void
540 idle_enter()
541 {
542 	cpu_t		*cp = CPU;
543 
544 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
545 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
546 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
547 }
548 
549 /*
550  * Called when CPU exits the idle loop
551  */
552 static void
553 idle_exit()
554 {
555 	cpu_t		*cp = CPU;
556 
557 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
558 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
559 }
560 
561 /*
562  * Idle loop.
563  */
564 void
565 idle()
566 {
567 	struct cpu	*cp = CPU;		/* pointer to this CPU */
568 	kthread_t	*t;			/* taken thread */
569 
570 	idle_enter();
571 
572 	/*
573 	 * Uniprocessor version of idle loop.
574 	 * Do this until notified that we're on an actual multiprocessor.
575 	 */
576 	while (ncpus == 1) {
577 		if (cp->cpu_disp->disp_nrunnable == 0) {
578 			(*idle_cpu)();
579 			continue;
580 		}
581 		idle_exit();
582 		swtch();
583 
584 		idle_enter(); /* returned from swtch */
585 	}
586 
587 	/*
588 	 * Multiprocessor idle loop.
589 	 */
590 	for (;;) {
591 		/*
592 		 * If CPU is completely quiesced by p_online(2), just wait
593 		 * here with minimal bus traffic until put online.
594 		 */
595 		while (cp->cpu_flags & CPU_QUIESCED)
596 			(*idle_cpu)();
597 
598 		if (cp->cpu_disp->disp_nrunnable != 0) {
599 			idle_exit();
600 			swtch();
601 		} else {
602 			if (cp->cpu_flags & CPU_OFFLINE)
603 				continue;
604 			if ((t = disp_getwork(cp)) == NULL) {
605 				if (cp->cpu_chosen_level != -1) {
606 					disp_t *dp = cp->cpu_disp;
607 					disp_t *kpq;
608 
609 					disp_lock_enter(&dp->disp_lock);
610 					/*
611 					 * Set kpq under lock to prevent
612 					 * migration between partitions.
613 					 */
614 					kpq = &cp->cpu_part->cp_kp_queue;
615 					if (kpq->disp_maxrunpri == -1)
616 						cp->cpu_chosen_level = -1;
617 					disp_lock_exit(&dp->disp_lock);
618 				}
619 				(*idle_cpu)();
620 				continue;
621 			}
622 			/*
623 			 * If there was a thread but we couldn't steal
624 			 * it, then keep trying.
625 			 */
626 			if (t == T_DONTSTEAL)
627 				continue;
628 			idle_exit();
629 			restore_mstate(t);
630 			swtch_to(t);
631 		}
632 		idle_enter(); /* returned from swtch/swtch_to */
633 	}
634 }
635 
636 
637 /*
638  * Preempt the currently running thread in favor of the highest
639  * priority thread.  The class of the current thread controls
640  * where it goes on the dispatcher queues. If panicking, turn
641  * preemption off.
642  */
643 void
644 preempt()
645 {
646 	kthread_t 	*t = curthread;
647 	klwp_t 		*lwp = ttolwp(curthread);
648 
649 	if (panicstr)
650 		return;
651 
652 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
653 
654 	thread_lock(t);
655 
656 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
657 		/*
658 		 * this thread has already been chosen to be run on
659 		 * another CPU. Clear kprunrun on this CPU since we're
660 		 * already headed for swtch().
661 		 */
662 		CPU->cpu_kprunrun = 0;
663 		thread_unlock_nopreempt(t);
664 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
665 	} else {
666 		if (lwp != NULL)
667 			lwp->lwp_ru.nivcsw++;
668 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
669 		THREAD_TRANSITION(t);
670 		CL_PREEMPT(t);
671 		DTRACE_SCHED(preempt);
672 		thread_unlock_nopreempt(t);
673 
674 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
675 
676 		swtch();		/* clears CPU->cpu_runrun via disp() */
677 	}
678 }
679 
680 extern kthread_t *thread_unpin();
681 
682 /*
683  * disp() - find the highest priority thread for this processor to run, and
684  * set it in TS_ONPROC state so that resume() can be called to run it.
685  */
686 static kthread_t *
687 disp()
688 {
689 	cpu_t		*cpup;
690 	disp_t		*dp;
691 	kthread_t	*tp;
692 	dispq_t		*dq;
693 	int		maxrunword;
694 	pri_t		pri;
695 	disp_t		*kpq;
696 
697 	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
698 
699 	cpup = CPU;
700 	/*
701 	 * Find the highest priority loaded, runnable thread.
702 	 */
703 	dp = cpup->cpu_disp;
704 
705 reschedule:
706 	/*
707 	 * If there is more important work on the global queue with a better
708 	 * priority than the maximum on this CPU, take it now.
709 	 */
710 	kpq = &cpup->cpu_part->cp_kp_queue;
711 	while ((pri = kpq->disp_maxrunpri) >= 0 &&
712 	    pri >= dp->disp_maxrunpri &&
713 	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
714 	    (tp = disp_getbest(kpq)) != NULL) {
715 		if (disp_ratify(tp, kpq) != NULL) {
716 			TRACE_1(TR_FAC_DISP, TR_DISP_END,
717 			    "disp_end:tid %p", tp);
718 			restore_mstate(tp);
719 			return (tp);
720 		}
721 	}
722 
723 	disp_lock_enter(&dp->disp_lock);
724 	pri = dp->disp_maxrunpri;
725 
726 	/*
727 	 * If there is nothing to run, look at what's runnable on other queues.
728 	 * Choose the idle thread if the CPU is quiesced.
729 	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
730 	 * interrupt threads, which will be the only threads on the CPU's own
731 	 * queue, but cannot run threads from other queues.
732 	 */
733 	if (pri == -1) {
734 		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
735 			disp_lock_exit(&dp->disp_lock);
736 			if ((tp = disp_getwork(cpup)) == NULL ||
737 			    tp == T_DONTSTEAL) {
738 				tp = cpup->cpu_idle_thread;
739 				(void) splhigh();
740 				THREAD_ONPROC(tp, cpup);
741 				cpup->cpu_dispthread = tp;
742 				cpup->cpu_dispatch_pri = -1;
743 				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
744 				cpup->cpu_chosen_level = -1;
745 			}
746 		} else {
747 			disp_lock_exit_high(&dp->disp_lock);
748 			tp = cpup->cpu_idle_thread;
749 			THREAD_ONPROC(tp, cpup);
750 			cpup->cpu_dispthread = tp;
751 			cpup->cpu_dispatch_pri = -1;
752 			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
753 			cpup->cpu_chosen_level = -1;
754 		}
755 		TRACE_1(TR_FAC_DISP, TR_DISP_END,
756 			"disp_end:tid %p", tp);
757 		restore_mstate(tp);
758 		return (tp);
759 	}
760 
761 	dq = &dp->disp_q[pri];
762 	tp = dq->dq_first;
763 
764 	ASSERT(tp != NULL);
765 	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */
766 
767 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
768 
769 	/*
770 	 * Found it so remove it from queue.
771 	 */
772 	dp->disp_nrunnable--;
773 	dq->dq_sruncnt--;
774 	if ((dq->dq_first = tp->t_link) == NULL) {
775 		ulong_t	*dqactmap = dp->disp_qactmap;
776 
777 		ASSERT(dq->dq_sruncnt == 0);
778 		dq->dq_last = NULL;
779 
780 		/*
781 		 * The queue is empty, so the corresponding bit needs to be
782 		 * turned off in dqactmap.   If nrunnable != 0 just took the
783 		 * last runnable thread off the
784 		 * highest queue, so recompute disp_maxrunpri.
785 		 */
786 		maxrunword = pri >> BT_ULSHIFT;
787 		dqactmap[maxrunword] &= ~BT_BIW(pri);
788 
789 		if (dp->disp_nrunnable == 0) {
790 			dp->disp_max_unbound_pri = -1;
791 			dp->disp_maxrunpri = -1;
792 		} else {
793 			int ipri;
794 
795 			ipri = bt_gethighbit(dqactmap, maxrunword);
796 			dp->disp_maxrunpri = ipri;
797 			if (ipri < dp->disp_max_unbound_pri)
798 				dp->disp_max_unbound_pri = ipri;
799 		}
800 	} else {
801 		tp->t_link = NULL;
802 	}
803 
804 	/*
805 	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
806 	 * out this thread before we have a chance to run it.
807 	 * While running, it is protected against swapping by t_lock.
808 	 */
809 	tp->t_schedflag |= TS_DONT_SWAP;
810 	cpup->cpu_dispthread = tp;		/* protected by spl only */
811 	cpup->cpu_dispatch_pri = pri;
812 	ASSERT(pri == DISP_PRIO(tp));
813 	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
814 	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */
815 
816 	ASSERT(tp != NULL);
817 	TRACE_1(TR_FAC_DISP, TR_DISP_END,
818 		"disp_end:tid %p", tp);
819 
820 	if (disp_ratify(tp, kpq) == NULL)
821 		goto reschedule;
822 
823 	restore_mstate(tp);
824 	return (tp);
825 }
826 
827 /*
828  * swtch()
829  *	Find best runnable thread and run it.
830  *	Called with the current thread already switched to a new state,
831  *	on a sleep queue, run queue, stopped, and not zombied.
832  *	May be called at any spl level less than or equal to LOCK_LEVEL.
833  *	Always drops spl to the base level (spl0()).
834  */
835 void
836 swtch()
837 {
838 	kthread_t	*t = curthread;
839 	kthread_t	*next;
840 	cpu_t		*cp;
841 
842 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
843 
844 	if (t->t_flag & T_INTR_THREAD)
845 		cpu_intr_swtch_enter(t);
846 
847 	if (t->t_intr != NULL) {
848 		/*
849 		 * We are an interrupt thread.  Setup and return
850 		 * the interrupted thread to be resumed.
851 		 */
852 		(void) splhigh();	/* block other scheduler action */
853 		cp = CPU;		/* now protected against migration */
854 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
855 		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
856 		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
857 		next = thread_unpin();
858 		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
859 		resume_from_intr(next);
860 	} else {
861 #ifdef	DEBUG
862 		if (t->t_state == TS_ONPROC &&
863 		    t->t_disp_queue->disp_cpu == CPU &&
864 		    t->t_preempt == 0) {
865 			thread_lock(t);
866 			ASSERT(t->t_state != TS_ONPROC ||
867 			    t->t_disp_queue->disp_cpu != CPU ||
868 			    t->t_preempt != 0);	/* cannot migrate */
869 			thread_unlock_nopreempt(t);
870 		}
871 #endif	/* DEBUG */
872 		cp = CPU;
873 		next = disp();		/* returns with spl high */
874 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
875 
876 		/* OK to steal anything left on run queue */
877 		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
878 
879 		if (next != t) {
880 			if (t == cp->cpu_idle_thread) {
881 				CHIP_NRUNNING(cp->cpu_chip, 1);
882 			} else if (next == cp->cpu_idle_thread) {
883 				CHIP_NRUNNING(cp->cpu_chip, -1);
884 			}
885 
886 			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
887 			cp->cpu_last_swtch = t->t_disp_time = lbolt;
888 			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
889 
890 			if (dtrace_vtime_active)
891 				dtrace_vtime_switch(next);
892 
893 			resume(next);
894 			/*
895 			 * The TR_RESUME_END and TR_SWTCH_END trace points
896 			 * appear at the end of resume(), because we may not
897 			 * return here
898 			 */
899 		} else {
900 			if (t->t_flag & T_INTR_THREAD)
901 				cpu_intr_swtch_exit(t);
902 
903 			DTRACE_SCHED(remain__cpu);
904 			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
905 			(void) spl0();
906 		}
907 	}
908 }
909 
910 /*
911  * swtch_from_zombie()
912  *	Special case of swtch(), which allows checks for TS_ZOMB to be
913  *	eliminated from normal resume.
914  *	Find best runnable thread and run it.
915  *	Called with the current thread zombied.
916  *	Zombies cannot migrate, so CPU references are safe.
917  */
918 void
919 swtch_from_zombie()
920 {
921 	kthread_t	*next;
922 	cpu_t		*cpu = CPU;
923 
924 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
925 
926 	ASSERT(curthread->t_state == TS_ZOMB);
927 
928 	next = disp();			/* returns with spl high */
929 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
930 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
931 	ASSERT(next != curthread);
932 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
933 
934 	if (next == cpu->cpu_idle_thread)
935 		CHIP_NRUNNING(cpu->cpu_chip, -1);
936 
937 	if (dtrace_vtime_active)
938 		dtrace_vtime_switch(next);
939 
940 	resume_from_zombie(next);
941 	/*
942 	 * The TR_RESUME_END and TR_SWTCH_END trace points
943 	 * appear at the end of resume(), because we certainly will not
944 	 * return here
945 	 */
946 }
947 
948 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
949 static int
950 thread_on_queue(kthread_t *tp)
951 {
952 	cpu_t	*cp;
953 	cpu_t	*self;
954 	disp_t	*dp;
955 
956 	self = CPU;
957 	cp = self->cpu_next_onln;
958 	dp = cp->cpu_disp;
959 	for (;;) {
960 		dispq_t		*dq;
961 		dispq_t		*eq;
962 
963 		disp_lock_enter_high(&dp->disp_lock);
964 		for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
965 			kthread_t	*rp;
966 
967 			ASSERT(dq->dq_last == NULL ||
968 				dq->dq_last->t_link == NULL);
969 			for (rp = dq->dq_first; rp; rp = rp->t_link)
970 				if (tp == rp) {
971 					disp_lock_exit_high(&dp->disp_lock);
972 					return (1);
973 				}
974 		}
975 		disp_lock_exit_high(&dp->disp_lock);
976 		if (cp == NULL)
977 			break;
978 		if (cp == self) {
979 			cp = NULL;
980 			dp = &cp->cpu_part->cp_kp_queue;
981 		} else {
982 			cp = cp->cpu_next_onln;
983 			dp = cp->cpu_disp;
984 		}
985 	}
986 	return (0);
987 }	/* end of thread_on_queue */
988 #else
989 
990 #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
991 
992 #endif  /* DEBUG */
993 
994 /*
995  * like swtch(), but switch to a specified thread taken from another CPU.
996  *	called with spl high..
997  */
998 void
999 swtch_to(kthread_t *next)
1000 {
1001 	cpu_t			*cp = CPU;
1002 
1003 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1004 
1005 	/*
1006 	 * Update context switch statistics.
1007 	 */
1008 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1009 
1010 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1011 
1012 	if (curthread == cp->cpu_idle_thread)
1013 		CHIP_NRUNNING(cp->cpu_chip, 1);
1014 
1015 	/* OK to steal anything left on run queue */
1016 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1017 
1018 	/* record last execution time */
1019 	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;
1020 
1021 	if (dtrace_vtime_active)
1022 		dtrace_vtime_switch(next);
1023 
1024 	resume(next);
1025 	/*
1026 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1027 	 * appear at the end of resume(), because we may not
1028 	 * return here
1029 	 */
1030 }
1031 
1032 
1033 
1034 #define	CPU_IDLING(pri)	((pri) == -1)
1035 
/*
 * Ask CPU "cp" to reschedule if the newly runnable priority "tpri"
 * beats what it is currently running: set cpu_runrun (user-level
 * preemption) and/or cpu_kprunrun (kernel preemption) and poke the
 * target CPU when it needs to notice immediately.
 */
static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
	int	call_poke_cpu = 0;
	pri_t   cpupri = cp->cpu_dispatch_pri;

	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
			cp->cpu_runrun = 1;
			aston(cp->cpu_dispthread);
			/* Below kpreemptpri: only cross-call remote CPUs. */
			if (tpri < kpreemptpri && cp != CPU)
				call_poke_cpu = 1;
		}
		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
			cp->cpu_kprunrun = 1;
			if (cp != CPU)
				call_poke_cpu = 1;
		}
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	if (call_poke_cpu)
		poke_cpu(cp->cpu_id);
}
1066 
1067 /*
1068  * Routine used by setbackdq() to balance load across the physical
1069  * processors. Returns a CPU of a lesser loaded chip in the lgroup
1070  * if balancing is necessary, or the "hint" CPU if it's not.
1071  *
1072  * - tp is the thread being enqueued
1073  * - cp is a hint CPU (chosen by cpu_choose()).
1074  * - curchip (if not NULL) is the chip on which the current thread
1075  *   is running.
1076  *
1077  * The thread lock for "tp" must be held while calling this routine.
1078  */
1079 static cpu_t *
1080 chip_balance(kthread_t *tp, cpu_t *cp, chip_t *curchip)
1081 {
1082 	int	chp_nrun, ochp_nrun;
1083 	chip_t	*chp, *nchp;
1084 
1085 	chp = cp->cpu_chip;
1086 	chp_nrun = chp->chip_nrunning;
1087 
1088 	if (chp == curchip)
1089 		chp_nrun--;	/* Ignore curthread */
1090 
1091 	/*
1092 	 * If this chip isn't at all idle, then let
1093 	 * run queue balancing do the work.
1094 	 */
1095 	if (chp_nrun == chp->chip_ncpu)
1096 		return (cp);
1097 
1098 	nchp = chp->chip_balance;
1099 	do {
1100 		if (nchp == chp ||
1101 		    !CHIP_IN_CPUPART(nchp, tp->t_cpupart))
1102 			continue;
1103 
1104 		ochp_nrun = nchp->chip_nrunning;
1105 
1106 		/*
1107 		 * If the other chip is running less threads,
1108 		 * or if it's running the same number of threads, but
1109 		 * has more online logical CPUs, then choose to balance.
1110 		 */
1111 		if (chp_nrun > ochp_nrun ||
1112 		    (chp_nrun == ochp_nrun &&
1113 		    nchp->chip_ncpu > chp->chip_ncpu)) {
1114 			cp = nchp->chip_cpus;
1115 			nchp->chip_cpus = cp->cpu_next_chip;
1116 
1117 			/*
1118 			 * Find a CPU on the chip in the correct
1119 			 * partition. We know at least one exists
1120 			 * because of the CHIP_IN_CPUPART() check above.
1121 			 */
1122 			while (cp->cpu_part != tp->t_cpupart)
1123 				cp = cp->cpu_next_chip;
1124 		}
1125 		chp->chip_balance = nchp->chip_next_lgrp;
1126 		break;
1127 	} while ((nchp = nchp->chip_next_lgrp) != chp->chip_balance);
1128 
1129 	ASSERT(CHIP_IN_CPUPART(cp->cpu_chip, tp->t_cpupart));
1130 	return (cp);
1131 }
1132 
1133 /*
1134  * setbackdq() keeps runqs balanced such that the difference in length
1135  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1136  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1137  * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1138  * try to keep runqs perfectly balanced regardless of the thread priority.
1139  */
1140 #define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
1141 #define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
1142 #define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1143 
1144 /*
1145  * Put the specified thread on the back of the dispatcher
1146  * queue corresponding to its current priority.
1147  *
1148  * Called with the thread in transition, onproc or stopped state
1149  * and locked (transition implies locked) and at high spl.
1150  * Returns with the thread in TS_RUN state and still locked.
1151  */
void
setbackdq(kthread_t *tp)
{
	dispq_t	*dq;
	disp_t		*dp;
	chip_t		*curchip = NULL;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);

	/*
	 * Stamp t_waitrq with the time the thread became runnable (if not
	 * already stamped) and update its CPU-usage percentage.  The stamp
	 * is later used to decide how long the thread has sat on a run
	 * queue (see the disp_steal handling below).
	 */
	if (tp->t_waitrq == 0) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	tpri = DISP_PRIO(tp);
	if (tp == curthread) {
		/* remember our chip so chip_balance() can discount us */
		curchip = CPU->cpu_chip;
	}

	if (ncpus == 1)
		cp = tp->t_cpu;	/* uniprocessor: no placement decision */
	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
		if (tpri >= kpqpri) {
			/* high-priority unbound threads use the kp queue */
			setkpdq(tp, SETKP_BACK);
			return;
		}
		/*
		 * Let cpu_choose suggest a CPU.
		 */
		cp = cpu_choose(tp, tpri);

		if (tp->t_cpupart == cp->cpu_part) {
			int	qlen;

			/*
			 * Select another CPU if we need
			 * to do some load balancing across the
			 * physical processors.
			 */
			if (CHIP_SHOULD_BALANCE(cp->cpu_chip))
				cp = chip_balance(tp, cp, curchip);

			/*
			 * Balance across the run queues
			 */
			qlen = RUNQ_LEN(cp, tpri);
			/*
			 * For priorities >= RUNQ_MATCH_PRI (and no
			 * TS_RUNQMATCH), tolerate up to RUNQ_MAX_DIFF
			 * extra entries before looking elsewhere.
			 */
			if (tpri >= RUNQ_MATCH_PRI &&
			    !(tp->t_schedflag & TS_RUNQMATCH))
				qlen -= RUNQ_MAX_DIFF;
			if (qlen > 0) {
				cpu_t *newcp;

				/* pick an alternative candidate CPU nearby */
				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
					newcp = cp->cpu_next_part;
				} else if ((newcp = cp->cpu_next_lpl) == cp) {
					newcp = cp->cpu_next_part;
				}

				if (RUNQ_LEN(newcp, tpri) < qlen) {
					DTRACE_PROBE3(runq__balance,
					    kthread_t *, tp,
					    cpu_t *, cp, cpu_t *, newcp);
					cp = newcp;
				}
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		bound = 0;
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
		bound = 1;
	}
	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
		tpri, cp, tp);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	/* an unbound arrival makes this queue immediately stealable again */
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		/* level already occupied: append to the tail */
		ASSERT(dq->dq_first != NULL);
		dq->dq_last->t_link = tp;
		dq->dq_last = tp;
	} else {
		/* first thread at this priority: set the bitmap bit too */
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}
1315 
1316 /*
1317  * Put the specified thread on the front of the dispatcher
1318  * queue corresponding to its current priority.
1319  *
1320  * Called with the thread in transition, onproc or stopped state
1321  * and locked (transition implies locked) and at high spl.
1322  * Returns with the thread in TS_RUN state and still locked.
1323  */
void
setfrontdq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);

	/*
	 * Stamp t_waitrq with the time the thread became runnable (if not
	 * already stamped) and update its CPU-usage percentage.
	 */
	if (tp->t_waitrq == 0) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;	/* uniprocessor: no placement decision */
	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
		if (tpri >= kpqpri) {
			/* high-priority unbound threads use the kp queue */
			setkpdq(tp, SETKP_FRONT);
			return;
		}
		cp = tp->t_cpu;
		if (tp->t_cpupart == cp->cpu_part) {
			/*
			 * If we are of higher or equal priority than
			 * the highest priority runnable thread of
			 * the current CPU, just pick this CPU.  Otherwise
			 * Let cpu_choose() select the CPU.  If this cpu
			 * is the target of an offline request then do not
			 * pick it - a thread_nomigrate() on the in motion
			 * cpu relies on this when it forces a preempt.
			 */
			if (tpri < cp->cpu_disp->disp_maxrunpri ||
			    cp == cpu_inmotion)
				cp = cpu_choose(tp, tpri);
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		bound = 0;
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
		bound = 1;
	}
	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
	tp->t_disp_queue = dp;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	/* an unbound arrival makes this queue immediately stealable again */
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		/* level already occupied: push onto the head */
		ASSERT(dq->dq_last != NULL);
		tp->t_link = dq->dq_first;
		dq->dq_first = tp;
	} else {
		/* first thread at this priority: set the bitmap bit too */
		ASSERT(dq->dq_last == NULL);
		ASSERT(dq->dq_first == NULL);
		tp->t_link = NULL;
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}
1456 
1457 /*
1458  * Put a high-priority unbound thread on the kp queue
1459  */
static void
setkpdq(kthread_t *tp, int borf)
{
	dispq_t	*dq;
	disp_t	*dp;
	cpu_t	*cp;
	pri_t	tpri;

	tpri = DISP_PRIO(tp);

	/* enqueue on the partition-wide kernel preemption (kp) queue */
	dp = &tp->t_cpupart->cp_kp_queue;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	dp->disp_nrunnable++;
	dq = &dp->disp_q[tpri];

	if (dq->dq_sruncnt++ != 0) {
		/* level already occupied: link at back or front as asked */
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first != NULL);
			tp->t_link = NULL;
			dq->dq_last->t_link = tp;
			dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last != NULL);
			tp->t_link = dq->dq_first;
			dq->dq_first = tp;
		}
	} else {
		/* first thread at this priority: update bitmap and maxima */
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first == NULL);
			ASSERT(dq->dq_last == NULL);
			dq->dq_first = dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last == NULL);
			ASSERT(dq->dq_first == NULL);
			tp->t_link = NULL;
			dq->dq_first = dq->dq_last = tp;
		}
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_max_unbound_pri)
			dp->disp_max_unbound_pri = tpri;
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
		}
	}

	/*
	 * Pick a low-priority CPU in the thread's partition and prod it
	 * so that someone comes along to service the kp queue.
	 */
	cp = tp->t_cpu;
	if (tp->t_cpupart != cp->cpu_part) {
		/* migrate to a cpu in the new partition */
		cp = tp->t_cpupart->cp_cpulist;
	}
	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	if (cp->cpu_chosen_level < tpri)
		cp->cpu_chosen_level = tpri;
	cpu_resched(cp, tpri);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
	(*disp_enq_thread)(cp, 0);
}
1534 
1535 /*
1536  * Remove a thread from the dispatcher queue if it is on it.
1537  * It is not an error if it is not found but we return whether
1538  * or not it was found in case the caller wants to check.
1539  */
int
dispdeq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	kthread_t	*rp;
	kthread_t	*trp;
	kthread_t	**ptp;
	int		tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_RUN)
		return (0);

	/*
	 * The thread is "swapped" or is on the swap queue and
	 * hence no longer on the run queue, so return true.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
		return (1);

	tpri = DISP_PRIO(tp);
	dp = tp->t_disp_queue;
	ASSERT(tpri < dp->disp_npri);
	dq = &dp->disp_q[tpri];
	ptp = &dq->dq_first;	/* link cell pointing at the current node */
	rp = *ptp;
	trp = NULL;		/* trails one node behind rp */

	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

	/*
	 * Search for thread in queue.
	 * Double links would simplify this at the expense of disp/setrun.
	 */
	while (rp != tp && rp != NULL) {
		trp = rp;
		ptp = &trp->t_link;
		rp = trp->t_link;
	}

	if (rp == NULL) {
		panic("dispdeq: thread not on queue");
	}

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	if ((*ptp = rp->t_link) == NULL)
		dq->dq_last = trp;	/* removed the tail node */

	dp->disp_nrunnable--;
	if (--dq->dq_sruncnt == 0) {
		/* priority level now empty: clear its bitmap bit */
		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else if (tpri == dp->disp_maxrunpri) {
			int ipri;

			/* recompute maxrunpri from the next set bit down */
			ipri = bt_gethighbit(dp->disp_qactmap,
			    dp->disp_maxrunpri >> BT_ULSHIFT);
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
			dp->disp_maxrunpri = ipri;
		}
	}
	tp->t_link = NULL;
	THREAD_TRANSITION(tp);		/* put in intermediate state */
	return (1);
}
1614 
1615 
1616 /*
1617  * dq_sruninc and dq_srundec are public functions for
1618  * incrementing/decrementing the sruncnts when a thread on
1619  * a dispatcher queue is made schedulable/unschedulable by
1620  * resetting the TS_LOAD flag.
1621  *
1622  * The caller MUST have the thread lock and therefore the dispatcher
1623  * queue lock so that the operation which changes
1624  * the flag, the operation that checks the status of the thread to
1625  * determine if it's on a disp queue AND the call to this function
1626  * are one atomic operation with respect to interrupts.
1627  */
1628 
1629 /*
1630  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1631  */
void
dq_sruninc(kthread_t *t)
{
	ASSERT(t->t_state == TS_RUN);
	ASSERT(t->t_schedflag & TS_LOAD);

	/*
	 * Pull the thread through the transition state and requeue it at
	 * the front of its dispatch queue so it runs promptly after
	 * being swapped back in.
	 */
	THREAD_TRANSITION(t);
	setfrontdq(t);
}
1641 
1642 /*
1643  * See comment on calling conventions above.
1644  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1645  */
void
dq_srundec(kthread_t *t)
{
	ASSERT(t->t_schedflag & TS_LOAD);

	/*
	 * Remove the thread from its dispatch queue (the return value is
	 * deliberately ignored; it is not an error if absent) and move it
	 * onto the swap queue under swapped_lock.
	 */
	(void) dispdeq(t);
	disp_swapped_enq(t);
}
1654 
1655 /*
1656  * Change the dispatcher lock of thread to the "swapped_lock"
1657  * and return with thread lock still held.
1658  *
1659  * Called with thread_lock held, in transition state, and at high spl.
1660  */
void
disp_swapped_enq(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT(tp->t_schedflag & TS_LOAD);

	/* only runnable or on-processor threads can be swap-enqueued */
	switch (tp->t_state) {
	case TS_RUN:
		disp_lock_enter_high(&swapped_lock);
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	case TS_ONPROC:
		disp_lock_enter_high(&swapped_lock);
		THREAD_TRANSITION(tp);
		wake_sched_sec = 1;		/* tell clock to wake sched */
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	default:
		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
	}
}
1682 
1683 /*
1684  * This routine is called by setbackdq/setfrontdq if the thread is
1685  * not loaded or loaded and on the swap queue.
1686  *
1687  * Thread state TS_SLEEP implies that a swapped thread
1688  * has been woken up and needs to be swapped in by the swapper.
1689  *
1690  * Thread state TS_RUN, it implies that the priority of a swapped
1691  * thread is being increased by scheduling class (e.g. ts_update).
1692  */
1693 static void
1694 disp_swapped_setrun(kthread_t *tp)
1695 {
1696 	ASSERT(THREAD_LOCK_HELD(tp));
1697 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1698 
1699 	switch (tp->t_state) {
1700 	case TS_SLEEP:
1701 		disp_lock_enter_high(&swapped_lock);
1702 		/*
1703 		 * Wakeup sched immediately (i.e., next tick) if the
1704 		 * thread priority is above maxclsyspri.
1705 		 */
1706 		if (DISP_PRIO(tp) > maxclsyspri)
1707 			wake_sched = 1;
1708 		else
1709 			wake_sched_sec = 1;
1710 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1711 		break;
1712 	case TS_RUN:				/* called from ts_update */
1713 		break;
1714 	default:
1715 		panic("disp_swapped_setrun: tp: %p bad t_state", tp);
1716 	}
1717 }
1718 
1719 
1720 /*
1721  *	Make a thread give up its processor.  Find the processor on
1722  *	which this thread is executing, and have that processor
1723  *	preempt.
1724  */
void
cpu_surrender(kthread_t *tp)
{
	cpu_t	*cpup;
	int	max_pri;
	int	max_run_pri;
	klwp_t	*lwp;

	ASSERT(THREAD_LOCK_HELD(tp));

	/* nothing to do unless the thread is actually on a processor */
	if (tp->t_state != TS_ONPROC)
		return;
	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
	if (max_pri < max_run_pri)
		max_pri = max_run_pri;

	/* request user preemption; escalate to kernel preemption if needed */
	cpup->cpu_runrun = 1;
	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
		cpup->cpu_kprunrun = 1;
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	DTRACE_SCHED1(surrender, kthread_t *, tp);

	/*
	 * Make the target thread take an excursion through trap()
	 * to do preempt() (unless we're already in trap or post_syscall,
	 * calling cpu_surrender via CL_TRAPRET).
	 */
	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
	    lwp->lwp_state != LWP_USER) {
		aston(tp);
		if (cpup != CPU)
			poke_cpu(cpup->cpu_id);
	}
	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
	    "cpu_surrender:tid %p cpu %p", tp, cpup);
}
1769 
1770 
1771 /*
1772  * Commit to and ratify a scheduling decision
1773  */
1774 /*ARGSUSED*/
static kthread_t *
disp_ratify(kthread_t *tp, disp_t *kpq)
{
	pri_t	tpri, maxpri;
	pri_t	maxkpri;
	cpu_t	*cpup;

	ASSERT(tp != NULL);
	/*
	 * Commit to, then ratify scheduling decision
	 */
	cpup = CPU;
	/* clear preemption requests; we are about to dispatch tp */
	if (cpup->cpu_runrun != 0)
		cpup->cpu_runrun = 0;
	if (cpup->cpu_kprunrun != 0)
		cpup->cpu_kprunrun = 0;
	if (cpup->cpu_chosen_level != -1)
		cpup->cpu_chosen_level = -1;
	membar_enter();
	/*
	 * Re-check both the local queue and the kp queue: if something
	 * better appeared since we chose tp, put tp back and retry.
	 */
	tpri = DISP_PRIO(tp);
	maxpri = cpup->cpu_disp->disp_maxrunpri;
	maxkpri = kpq->disp_maxrunpri;
	if (maxpri < maxkpri)
		maxpri = maxkpri;
	if (tpri < maxpri) {
		/*
		 * should have done better
		 * put this one back and indicate to try again
		 */
		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
		thread_lock_high(tp);
		THREAD_TRANSITION(tp);
		setfrontdq(tp);
		thread_unlock_nopreempt(tp);

		tp = NULL;	/* NULL tells the caller to reselect */
	}
	return (tp);
}
1815 
1816 /*
1817  * See if there is any work on the dispatcher queue for other CPUs.
1818  * If there is, dequeue the best thread and return.
1819  */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	kthread_t	*retval = NULL;
	pri_t		maxpri;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		hint, leafidx;
	hrtime_t	stealtime;

	maxpri = -1;
	tcp = NULL;

	/*
	 * The kp queue takes precedence over stealing from other CPUs'
	 * run queues.  Keep trying while it appears non-empty, since
	 * disp_getbest() can return NULL after losing a race.
	 */
	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	kpreempt_disable();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU. This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	hint = leafidx = 0;

	/*
	 * This loop traverses the lpl hierarchy. Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around the partition if:
				 *
				 * - Something became runnable on the local
				 *	queue
				 *
				 * - We're at the broadest level of locality and
				 *   we happen across another idle CPU. At the
				 *   highest level of locality, all CPUs will
				 *   walk the partition's CPUs in the same
				 *   order, so we can end our stroll taking
				 *   comfort in knowing the other idle CPU is
				 *   already covering the next portion of the
				 *   list.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0)
					break;
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED)
						continue;
					else if (lpl->lpl_parent == NULL)
						break;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
					CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					/*
					 * Don't steal threads that we attempted
					 * to be stolen very recently until
					 * they're ready to be stolen again.
					 */
					stealtime = ocp->cpu_disp->disp_steal;
					if (stealtime == 0 ||
					    stealtime - gethrtime() <= 0) {
						maxpri = pri;
						tcp = ocp;
					} else {
						/*
						 * Don't update tcp, just set
						 * the retval to T_DONTSTEAL, so
						 * that if no acceptable CPUs
						 * are found the return value
						 * will be T_DONTSTEAL rather
						 * then NULL.
						 */
						retval = T_DONTSTEAL;
					}
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			/* advance to the next leaf, wrapping at the end */
			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != hint);

		/* move up to a broader level of locality */
		hint = leafidx = lpl->lpl_hint;
		if ((lpl = lpl->lpl_parent) != NULL)
			lpl_leaf = lpl->lpl_rset[hint];
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = disp_getbest(tcp->cpu_disp);
		if (tp == NULL || tp == T_DONTSTEAL)
			return (tp);
		return (disp_ratify(tp, kpq));
	}
	return (retval);
}
1969 
1970 
1971 /*
1972  * disp_fix_unbound_pri()
1973  *	Determines the maximum priority of unbound threads on the queue.
1974  *	The priority is kept for the queue, but is only increased, never
1975  *	reduced unless some CPU is looking for something on that queue.
1976  *
1977  *	The priority argument is the known upper limit.
1978  *
1979  *	Perhaps this should be kept accurately, but that probably means
1980  *	separate bitmaps for bound and unbound threads.  Since only idled
1981  *	CPUs will have to do this recalculation, it seems better this way.
1982  */
static void
disp_fix_unbound_pri(disp_t *dp, pri_t pri)
{
	kthread_t	*tp;
	dispq_t		*dq;
	ulong_t		*dqactmap = dp->disp_qactmap;
	ulong_t		mapword;
	int		wx;

	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));

	ASSERT(pri >= 0);			/* checked by caller */

	/*
	 * Start the search at the next lowest priority below the supplied
	 * priority.  This depends on the bitmap implementation.
	 */
	do {
		wx = pri >> BT_ULSHIFT;		/* index of word in map */

		/*
		 * Form mask for all lower priorities in the word.
		 */
		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);

		/*
		 * Get next lower active priority.
		 */
		if (mapword != 0) {
			/* a lower bit exists in this same word */
			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
		} else if (wx > 0) {
			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
			if (pri < 0)
				break;
		} else {
			/* no active priority remains below the start */
			pri = -1;
			break;
		}

		/*
		 * Search the queue for unbound, runnable threads.
		 */
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
			tp = tp->t_link;
		}

		/*
		 * If a thread was found, set the priority and return.
		 */
	} while (tp == NULL);

	/*
	 * pri holds the maximum unbound thread priority or -1.
	 */
	if (dp->disp_max_unbound_pri != pri)
		dp->disp_max_unbound_pri = pri;
}
2043 
2044 /*
2045  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 * 	check if the CPU to which it was previously bound should have
2047  * 	its disp_max_unbound_pri increased.
2048  */
2049 void
2050 disp_adjust_unbound_pri(kthread_t *tp)
2051 {
2052 	disp_t *dp;
2053 	pri_t tpri;
2054 
2055 	ASSERT(THREAD_LOCK_HELD(tp));
2056 
2057 	/*
2058 	 * Don't do anything if the thread is not bound, or
2059 	 * currently not runnable or swapped out.
2060 	 */
2061 	if (tp->t_bound_cpu == NULL ||
2062 	    tp->t_state != TS_RUN ||
2063 	    tp->t_schedflag & TS_ON_SWAPQ)
2064 		return;
2065 
2066 	tpri = DISP_PRIO(tp);
2067 	dp = tp->t_bound_cpu->cpu_disp;
2068 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2069 	if (tpri > dp->disp_max_unbound_pri)
2070 		dp->disp_max_unbound_pri = tpri;
2071 }
2072 
2073 /*
2074  * disp_getbest()
2075  *   De-queue the highest priority unbound runnable thread.
2076  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2077  *   Returns NULL if nothing found.
2078  *   Returns T_DONTSTEAL if the thread was not stealable.
2079  *   so that the caller will try again later.
2080  *
2081  *   Passed a pointer to a dispatch queue not associated with this CPU, and
2082  *   its type.
2083  */
2084 static kthread_t *
2085 disp_getbest(disp_t *dp)
2086 {
2087 	kthread_t	*tp;
2088 	dispq_t		*dq;
2089 	pri_t		pri;
2090 	cpu_t		*cp, *tcp;
2091 	boolean_t	allbound;
2092 
2093 	disp_lock_enter(&dp->disp_lock);
2094 
2095 	/*
2096 	 * If there is nothing to run, or the CPU is in the middle of a
2097 	 * context switch of the only thread, return NULL.
2098 	 */
2099 	tcp = dp->disp_cpu;
2100 	cp = CPU;
2101 	pri = dp->disp_max_unbound_pri;
2102 	if (pri == -1 ||
2103 	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2104 	    tcp->cpu_disp->disp_nrunnable == 1)) {
2105 		disp_lock_exit_nopreempt(&dp->disp_lock);
2106 		return (NULL);
2107 	}
2108 
2109 	dq = &dp->disp_q[pri];
2110 
2111 
2112 	/*
2113 	 * Assume that all threads are bound on this queue, and change it
2114 	 * later when we find out that it is not the case.
2115 	 */
2116 	allbound = B_TRUE;
2117 	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2118 		hrtime_t now, nosteal, rqtime;
2119 		chip_type_t chtype;
2120 		chip_t *chip;
2121 
2122 		/*
2123 		 * Skip over bound threads which could be here even
2124 		 * though disp_max_unbound_pri indicated this level.
2125 		 */
2126 		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2127 			continue;
2128 
2129 		/*
2130 		 * We've got some unbound threads on this queue, so turn
2131 		 * the allbound flag off now.
2132 		 */
2133 		allbound = B_FALSE;
2134 
2135 		/*
2136 		 * The thread is a candidate for stealing from its run queue. We
2137 		 * don't want to steal threads that became runnable just a
2138 		 * moment ago. This improves CPU affinity for threads that get
2139 		 * preempted for short periods of time and go back on the run
2140 		 * queue.
2141 		 *
2142 		 * We want to let it stay on its run queue if it was only placed
2143 		 * there recently and it was running on the same CPU before that
2144 		 * to preserve its cache investment. For the thread to remain on
2145 		 * its run queue, ALL of the following conditions must be
2146 		 * satisfied:
2147 		 *
2148 		 * - the disp queue should not be the kernel preemption queue
2149 		 * - delayed idle stealing should not be disabled
2150 		 * - nosteal_nsec should be non-zero
2151 		 * - it should run with user priority
2152 		 * - it should be on the run queue of the CPU where it was
2153 		 *   running before being placed on the run queue
2154 		 * - it should be the only thread on the run queue (to prevent
2155 		 *   extra scheduling latency for other threads)
2156 		 * - it should sit on the run queue for less than per-chip
2157 		 *   nosteal interval or global nosteal interval
2158 		 * - in case of CPUs with shared cache it should sit in a run
2159 		 *   queue of a CPU from a different chip
2160 		 *
2161 		 * The checks are arranged so that the ones that are faster are
2162 		 * placed earlier.
2163 		 */
2164 		if (tcp == NULL ||
2165 		    pri >= minclsyspri ||
2166 		    tp->t_cpu != tcp)
2167 			break;
2168 
2169 		/*
2170 		 * Steal immediately if the chip has shared cache and we are
2171 		 * sharing the chip with the target thread's CPU.
2172 		 */
2173 		chip = tcp->cpu_chip;
2174 		chtype = chip->chip_type;
2175 		if ((chtype == CHIP_SMT || chtype == CHIP_CMP_SHARED_CACHE) &&
2176 		    chip == cp->cpu_chip)
2177 			break;
2178 
2179 		/*
2180 		 * Get the value of nosteal interval either from nosteal_nsec
2181 		 * global variable or from a value specified by a chip
2182 		 */
2183 		nosteal = nosteal_nsec ? nosteal_nsec : chip->chip_nosteal;
2184 		if (nosteal == 0 || nosteal == NOSTEAL_DISABLED)
2185 			break;
2186 
2187 		/*
2188 		 * Calculate time spent sitting on run queue
2189 		 */
2190 		now = gethrtime_unscaled();
2191 		rqtime = now - tp->t_waitrq;
2192 		scalehrtime(&rqtime);
2193 
2194 		/*
2195 		 * Steal immediately if the time spent on this run queue is more
2196 		 * than allowed nosteal delay.
2197 		 *
2198 		 * Negative rqtime check is needed here to avoid infinite
2199 		 * stealing delays caused by unlikely but not impossible
2200 		 * drifts between CPU times on different CPUs.
2201 		 */
2202 		if (rqtime > nosteal || rqtime < 0)
2203 			break;
2204 
2205 		DTRACE_PROBE4(nosteal, kthread_t *, tp,
2206 		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2207 		scalehrtime(&now);
2208 		/*
2209 		 * Calculate when this thread becomes stealable
2210 		 */
2211 		now += (nosteal - rqtime);
2212 
2213 		/*
2214 		 * Calculate time when some thread becomes stealable
2215 		 */
2216 		if (now < dp->disp_steal)
2217 			dp->disp_steal = now;
2218 	}
2219 
2220 	/*
2221 	 * If there were no unbound threads on this queue, find the queue
2222 	 * where they are and then return later. The value of
2223 	 * disp_max_unbound_pri is not always accurate because it isn't
2224 	 * reduced until another idle CPU looks for work.
2225 	 */
2226 	if (allbound)
2227 		disp_fix_unbound_pri(dp, pri);
2228 
2229 	/*
2230 	 * If we reached the end of the queue and found no unbound threads
2231 	 * then return NULL so that other CPUs will be considered.  If there
2232 	 * are unbound threads but they cannot yet be stolen, then
2233 	 * return T_DONTSTEAL and try again later.
2234 	 */
2235 	if (tp == NULL) {
2236 		disp_lock_exit_nopreempt(&dp->disp_lock);
2237 		return (allbound ? NULL : T_DONTSTEAL);
2238 	}
2239 
2240 	/*
2241 	 * Found a runnable, unbound thread, so remove it from queue.
2242 	 * dispdeq() requires that we have the thread locked, and we do,
2243 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
2244 	 * put the thread in transition state, thereby dropping the dispq
2245 	 * lock.
2246 	 */
2247 
2248 #ifdef DEBUG
2249 	{
2250 		int	thread_was_on_queue;
2251 
2252 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
2253 		ASSERT(thread_was_on_queue);
2254 	}
2255 
2256 #else /* DEBUG */
2257 	(void) dispdeq(tp);			/* drops disp_lock */
2258 #endif /* DEBUG */
2259 
	/*
	 * Reset the disp_queue steal time - we do not know what the
	 * smallest value across the queue is.
	 */
2264 	dp->disp_steal = 0;
2265 
2266 	tp->t_schedflag |= TS_DONT_SWAP;
2267 
2268 	/*
2269 	 * Setup thread to run on the current CPU.
2270 	 */
2271 	tp->t_disp_queue = cp->cpu_disp;
2272 
2273 	cp->cpu_dispthread = tp;		/* protected by spl only */
2274 	cp->cpu_dispatch_pri = pri;
2275 	ASSERT(pri == DISP_PRIO(tp));
2276 
2277 	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2278 
2279 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
2280 
2281 	/*
2282 	 * Return with spl high so that swtch() won't need to raise it.
2283 	 * The disp_lock was dropped by dispdeq().
2284 	 */
2285 
2286 	return (tp);
2287 }
2288 
2289 /*
2290  * disp_bound_common() - common routine for higher level functions
2291  *	that check for bound threads under certain conditions.
2292  *	If 'threadlistsafe' is set then there is no need to acquire
2293  *	pidlock to stop the thread list from changing (eg, if
2294  *	disp_bound_* is called with cpus paused).
2295  */
2296 static int
2297 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2298 {
2299 	int		found = 0;
2300 	kthread_t	*tp;
2301 
2302 	ASSERT(flag);
2303 
2304 	if (!threadlistsafe)
2305 		mutex_enter(&pidlock);
2306 	tp = curthread;		/* faster than allthreads */
2307 	do {
2308 		if (tp->t_state != TS_FREE) {
2309 			/*
2310 			 * If an interrupt thread is busy, but the
2311 			 * caller doesn't care (i.e. BOUND_INTR is off),
2312 			 * then just ignore it and continue through.
2313 			 */
2314 			if ((tp->t_flag & T_INTR_THREAD) &&
2315 			    !(flag & BOUND_INTR))
2316 				continue;
2317 
2318 			/*
2319 			 * Skip the idle thread for the CPU
2320 			 * we're about to set offline.
2321 			 */
2322 			if (tp == cp->cpu_idle_thread)
2323 				continue;
2324 
2325 			/*
2326 			 * Skip the pause thread for the CPU
2327 			 * we're about to set offline.
2328 			 */
2329 			if (tp == cp->cpu_pause_thread)
2330 				continue;
2331 
2332 			if ((flag & BOUND_CPU) &&
2333 			    (tp->t_bound_cpu == cp ||
2334 			    tp->t_bind_cpu == cp->cpu_id ||
2335 			    tp->t_weakbound_cpu == cp)) {
2336 				found = 1;
2337 				break;
2338 			}
2339 
2340 			if ((flag & BOUND_PARTITION) &&
2341 			    (tp->t_cpupart == cp->cpu_part)) {
2342 				found = 1;
2343 				break;
2344 			}
2345 		}
2346 	} while ((tp = tp->t_next) != curthread && found == 0);
2347 	if (!threadlistsafe)
2348 		mutex_exit(&pidlock);
2349 	return (found);
2350 }
2351 
2352 /*
2353  * disp_bound_threads - return nonzero if threads are bound to the processor.
2354  *	Called infrequently.  Keep this simple.
2355  *	Includes threads that are asleep or stopped but not onproc.
2356  */
2357 int
2358 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2359 {
2360 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2361 }
2362 
2363 /*
2364  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2365  * to the given processor, including interrupt threads.
2366  */
2367 int
2368 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2369 {
2370 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2371 }
2372 
2373 /*
2374  * disp_bound_partition - return nonzero if threads are bound to the same
2375  * partition as the processor.
2376  *	Called infrequently.  Keep this simple.
2377  *	Includes threads that are asleep or stopped but not onproc.
2378  */
2379 int
2380 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2381 {
2382 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2383 }
2384 
2385 /*
2386  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2387  * threads to other CPUs.
2388  */
void
disp_cpu_inactive(cpu_t *cp)
{
	kthread_t	*tp;
	disp_t		*dp = cp->cpu_disp;
	dispq_t		*dq;
	pri_t		pri;
	int		wasonq;

	disp_lock_enter(&dp->disp_lock);
	/*
	 * Drain every unbound thread off cp's dispatch queues, highest
	 * priority first.  disp_max_unbound_pri is -1 once no unbound
	 * threads remain.
	 */
	while ((pri = dp->disp_max_unbound_pri) != -1) {
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		/*
		 * Skip over bound threads.
		 */
		while (tp != NULL && tp->t_bound_cpu != NULL) {
			tp = tp->t_link;
		}

		if (tp == NULL) {
			/* disp_max_unbound_pri must be inaccurate, so fix it */
			disp_fix_unbound_pri(dp, pri);
			continue;
		}

		wasonq = dispdeq(tp);		/* drops disp_lock */
		ASSERT(wasonq);
		ASSERT(tp->t_weakbound_cpu == NULL);

		/*
		 * Requeue the thread; the ASSERT below relies on it
		 * having been placed on some other CPU's queue.
		 */
		setbackdq(tp);
		/*
		 * Called from cpu_offline:
		 *
		 * cp has already been removed from the list of active cpus
		 * and tp->t_cpu has been changed so there is no risk of
		 * tp ending up back on cp.
		 *
		 * Called from cpupart_move_cpu:
		 *
		 * The cpu has moved to a new cpupart.  Any threads that
		 * were on its dispatch queues before the move remain
		 * in the old partition and can't run in the new partition.
		 */
		ASSERT(tp->t_cpu != cp);
		thread_unlock(tp);

		/*
		 * dispdeq() dropped disp_lock; reacquire it before
		 * re-reading disp_max_unbound_pri at the loop top.
		 */
		disp_lock_enter(&dp->disp_lock);
	}
	disp_lock_exit(&dp->disp_lock);
}
2441 
2442 /*
2443  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2444  *	The hint passed in is used as a starting point so we don't favor
2445  *	CPU 0 or any other CPU.  The caller should pass in the most recently
2446  *	used CPU for the thread.
2447  *
2448  *	The lgroup and priority are used to determine the best CPU to run on
2449  *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
2450  *	the thread priority will indicate whether the thread will actually run
2451  *	there.  To pick the best CPU, the CPUs inside and outside of the given
2452  *	lgroup which are running the lowest priority threads are found.  The
2453  *	remote CPU is chosen only if the thread will not run locally on a CPU
2454  *	within the lgroup, but will run on the remote CPU. If the thread
2455  *	cannot immediately run on any CPU, the best local CPU will be chosen.
2456  *
2457  *	The lpl specified also identifies the cpu partition from which
2458  *	disp_lowpri_cpu should select a CPU.
2459  *
2460  *	curcpu is used to indicate that disp_lowpri_cpu is being called on
2461  *      behalf of the current thread. (curthread is looking for a new cpu)
2462  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2463  *      ignored.
2464  *
2465  *      If a cpu is the target of an offline request then try to avoid it.
2466  *
2467  *	This function must be called at either high SPL, or with preemption
2468  *	disabled, so that the "hint" CPU cannot be removed from the online
2469  *	CPU list while we are traversing it.
2470  */
cpu_t *
disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
{
	cpu_t	*bestcpu;
	cpu_t	*besthomecpu;
	cpu_t   *cp, *cpstart;

	pri_t   bestpri;
	pri_t   cpupri;

	klgrpset_t	done;
	klgrpset_t	cur_set;

	lpl_t		*lpl_iter, *lpl_leaf;
	int		i;

	/*
	 * Scan for a CPU currently running the lowest priority thread.
	 * Cannot get cpu_lock here because it is adaptive.
	 * We do not require lock on CPU list.
	 */
	ASSERT(hint != NULL);
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_ncpu > 0);

	/*
	 * First examine local CPUs. Note that it's possible the hint CPU
	 * passed in is remote to the specified home lgroup. If our priority
	 * isn't high enough such that we can run immediately at home,
	 * then examine CPUs remote to our home lgroup.
	 * We would like to give preference to CPUs closest to "home".
	 * If we can't find a CPU where we'll run at a given level
	 * of locality, we expand our search to include the next level.
	 */
	bestcpu = besthomecpu = NULL;
	klgrpset_clear(done);
	/* start with lpl we were passed */

	lpl_iter = lpl;

	do {

		bestpri = SHRT_MAX;
		klgrpset_clear(cur_set);

		/*
		 * Scan each leaf lgroup resource set of the current lpl
		 * that hasn't already been covered at a lower level.
		 */
		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
			lpl_leaf = lpl_iter->lpl_rset[i];
			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
				continue;

			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);

			/*
			 * Start the per-leaf scan at the hint CPU when it
			 * belongs to this leaf, so we don't favor CPU 0.
			 */
			if (hint->cpu_lpl == lpl_leaf)
				cp = cpstart = hint;
			else
				cp = cpstart = lpl_leaf->lpl_cpus;

			do {
				/*
				 * Effective priority of cp is the highest
				 * of: the running thread (ignored when cp
				 * is curthread's own CPU, penalized when cp
				 * is the target of an offline/move request),
				 * the best queued thread, and any level
				 * already chosen for cp by another chooser.
				 */
				if (cp == curcpu)
					cpupri = -1;
				else if (cp == cpu_inmotion)
					cpupri = SHRT_MAX;
				else
					cpupri = cp->cpu_dispatch_pri;
				if (cp->cpu_disp->disp_maxrunpri > cpupri)
					cpupri = cp->cpu_disp->disp_maxrunpri;
				if (cp->cpu_chosen_level > cpupri)
					cpupri = cp->cpu_chosen_level;
				if (cpupri < bestpri) {
					/*
					 * An idling CPU can't be beaten;
					 * take it immediately.
					 */
					if (CPU_IDLING(cpupri)) {
						ASSERT((cp->cpu_flags &
						    CPU_QUIESCED) == 0);
						return (cp);
					}
					bestcpu = cp;
					bestpri = cpupri;
				}
			} while ((cp = cp->cpu_next_lpl) != cpstart);
		}

		/*
		 * If the thread would preempt the best CPU found at this
		 * level of locality, use it rather than searching farther.
		 */
		if (bestcpu && (tpri > bestpri)) {
			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
			return (bestcpu);
		}
		if (besthomecpu == NULL)
			besthomecpu = bestcpu;
		/*
		 * Add the lgrps we just considered to the "done" set
		 */
		klgrpset_or(done, cur_set);

	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);

	/*
	 * The specified priority isn't high enough to run immediately
	 * anywhere, so just return the best CPU from the home lgroup.
	 */
	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
	return (besthomecpu);
}
2571 
2572 /*
2573  * This routine provides the generic idle cpu function for all processors.
2574  * If a processor has some specific code to execute when idle (say, to stop
2575  * the pipeline and save power) then that routine should be defined in the
2576  * processors specific code (module_xx.c) and the global variable idle_cpu
2577  * set to that function.
2578  */
static void
generic_idle_cpu(void)
{
	/* Default idle action: intentionally a no-op. */
}
2583 
/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
	/* Default enqueue-notification hook: intentionally a no-op. */
}
2589 
2590 /*
2591  * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
2592  *	- t->t_cpu is not in this thread's assigned lgrp
2593  *	- the time since the thread last came off t->t_cpu exceeds the
2594  *	  rechoose time for this cpu (ignore this if t is curthread in
2595  *	  which case it's on CPU and t->t_disp_time is inaccurate)
2596  *	- t->t_cpu is presently the target of an offline or partition move
2597  *	  request
2598  */
2599 static cpu_t *
2600 cpu_choose(kthread_t *t, pri_t tpri)
2601 {
2602 	ASSERT(tpri < kpqpri);
2603 
2604 	if ((((lbolt - t->t_disp_time) > t->t_cpu->cpu_rechoose) &&
2605 	    t != curthread) || t->t_cpu == cpu_inmotion) {
2606 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
2607 	}
2608 
2609 	/*
2610 	 * Take a trip through disp_lowpri_cpu() if the thread was
2611 	 * running outside it's home lgroup
2612 	 */
2613 	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
2614 	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
2615 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
2616 		    (t == curthread) ? t->t_cpu : NULL));
2617 	}
2618 	return (t->t_cpu);
2619 }
2620