xref: /titanic_50/usr/src/uts/common/disp/disp.c (revision cf170fc06cee7b670cc5ccf1fe83dce33fb2592b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/signal.h>
36 #include <sys/user.h>
37 #include <sys/systm.h>
38 #include <sys/sysinfo.h>
39 #include <sys/var.h>
40 #include <sys/errno.h>
41 #include <sys/cmn_err.h>
42 #include <sys/debug.h>
43 #include <sys/inline.h>
44 #include <sys/disp.h>
45 #include <sys/class.h>
46 #include <sys/bitmap.h>
47 #include <sys/kmem.h>
48 #include <sys/cpuvar.h>
49 #include <sys/vtrace.h>
50 #include <sys/tnf.h>
51 #include <sys/cpupart.h>
52 #include <sys/lgrp.h>
53 #include <sys/pg.h>
54 #include <sys/cmt.h>
55 #include <sys/bitset.h>
56 #include <sys/schedctl.h>
57 #include <sys/atomic.h>
58 #include <sys/dtrace.h>
59 #include <sys/sdt.h>
60 #include <sys/archsystm.h>
61 
62 #include <vm/as.h>
63 
/*
 * NOTE(review): presumably thread-binding flag bits; nothing in this part
 * of the file references them -- confirm against their users.
 */
#define	BOUND_CPU	0x1
#define	BOUND_PARTITION	0x2
#define	BOUND_INTR	0x4

/*
 * Dispatch queue allocation structure and functions.
 *
 * A disp_queue_info carries one dispatch queue through a resize:
 * disp_dq_alloc() fills in the new arrays and the target disp_t,
 * disp_dq_assign() records the old arrays while switching the disp_t
 * over to the new ones, and disp_dq_free() releases the old arrays.
 */
struct disp_queue_info {
	disp_t	*dp;		/* dispatch queue being resized */
	dispq_t *olddispq;	/* dp->disp_q before the switch */
	dispq_t *newdispq;	/* replacement queue-header array */
	ulong_t	*olddqactmap;	/* dp->disp_qactmap before the switch */
	ulong_t	*newdqactmap;	/* replacement active-queue bitmap */
	int	oldnglobpris;	/* dp->disp_npri before the switch */
};
static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void	disp_dq_free(struct disp_queue_info *dptr);
/*
 * Platform-specific routine to call when the processor is idle.
 * idle() invokes it through this pointer; it defaults to the generic
 * implementation.
 */
static void	generic_idle_cpu();
void		(*idle_cpu)() = generic_idle_cpu;

/* routines invoked when a CPU enters/exits the idle loop */
static void	idle_enter();
static void	idle_exit();

/*
 * Platform-specific routine to call when a thread is enqueued;
 * defaults to the generic implementation.
 */
static void	generic_enq_thread(cpu_t *, int);
void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;

pri_t	kpreemptpri;		/* priority where kernel preemption applies */
pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
pri_t	intr_pri;		/* interrupt thread priority base level */

/*
 * KPQPRI doubles as the "not tuned" marker for kpqpri: dispinit() and
 * disp_setup() replace it with kpreemptpri when kpqpri still holds it.
 */
#define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
int	nswapped;		/* total number of swapped threads */
void	disp_swapped_enq(kthread_t *tp);
static void	disp_swapped_setrun(kthread_t *tp);
static void	cpu_resched(cpu_t *cp, pri_t tpri);

/*
 * If this is set, only interrupt threads will cause kernel preemptions.
 * This is done by changing the value of kpreemptpri.  kpreemptpri
 * will either be the max sysclass pri + 1 (set in dispinit()) or
 * intr_pri + 1 (set in disp_setup() when this flag is non-zero).
 */
int	only_intr_kpreempt;

extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
static void setkpdq(kthread_t *tp, int borf);
/* NOTE(review): presumably the values accepted by setkpdq()'s borf argument */
#define	SETKP_BACK	0
#define	SETKP_FRONT	1
/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.
 * NOTE(review): the original comment gives the unit as "hertz"; that
 * presumably means clock ticks -- confirm against cpu_choose().
 */
#define	RECHOOSE_INTERVAL 3
int	rechoose_interval = RECHOOSE_INTERVAL;
static cpu_t	*cpu_choose(kthread_t *, pri_t);
/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * be sitting on a run queue before it can be stolen by another CPU,
 * to reduce migrations.
 *
 * nosteal_nsec should be set by the platform routine
 * cmp_set_nosteal_interval() to an appropriate value.  It starts out as
 * NOSTEAL_UNINITIALIZED, indicating that it is uninitialized; dispinit()
 * calls cmp_set_nosteal_interval() only if that marker is still in place,
 * so a value set beforehand is left alone.
 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 */
#define	NOSTEAL_UNINITIALIZED	(-1)
hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
extern void cmp_set_nosteal_interval(void);

id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */

/* dispinit() acquires transition_lock once and never releases it */
disp_lock_t	transition_lock;	/* lock on transitioning threads */
disp_lock_t	stop_lock;		/* lock on stopped threads */

static void	cpu_dispqalloc(int numpris);
149 
/*
 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 * a thread because it was sitting on its run queue for a very short
 * period of time.
 *
 * It is a sentinel that callers only compare against; it must never be
 * dereferenced.  The whole expansion is parenthesized (as
 * NOSTEAL_UNINITIALIZED is) so the macro behaves as a single operand
 * next to sizeof or postfix operators.
 */
#define	T_DONTSTEAL	((kthread_t *)(-1)) /* from disp_getwork/getbest */
156 
/*
 * Thread selection helpers used by idle() and disp().  Their definitions
 * are not in this part of the file.
 */
static kthread_t	*disp_getwork(cpu_t *to);
static kthread_t	*disp_getbest(disp_t *from);
static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);

/* like swtch(), but switches to a specific, already chosen thread */
void	swtch_to(kthread_t *);
162 
163 /*
164  * dispatcher and scheduler initialization
165  */
166 
167 /*
168  * disp_setup - Common code to calculate and allocate dispatcher
169  *		variables and structures based on the maximum priority.
170  */
171 static void
172 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
173 {
174 	pri_t	newnglobpris;
175 
176 	ASSERT(MUTEX_HELD(&cpu_lock));
177 
178 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
179 
180 	if (newnglobpris > oldnglobpris) {
181 		/*
182 		 * Allocate new kp queues for each CPU partition.
183 		 */
184 		cpupart_kpqalloc(newnglobpris);
185 
186 		/*
187 		 * Allocate new dispatch queues for each CPU.
188 		 */
189 		cpu_dispqalloc(newnglobpris);
190 
191 		/*
192 		 * compute new interrupt thread base priority
193 		 */
194 		intr_pri = maxglobpri;
195 		if (only_intr_kpreempt) {
196 			kpreemptpri = intr_pri + 1;
197 			if (kpqpri == KPQPRI)
198 				kpqpri = kpreemptpri;
199 		}
200 		v.v_nglobpris = newnglobpris;
201 	}
202 }
203 
204 /*
205  * dispinit - Called to initialize all loaded classes and the
206  *	      dispatcher framework.
207  */
208 void
209 dispinit(void)
210 {
211 	id_t	cid;
212 	pri_t	maxglobpri;
213 	pri_t	cl_maxglobpri;
214 
215 	maxglobpri = -1;
216 
217 	/*
218 	 * Initialize transition lock, which will always be set.
219 	 */
220 	DISP_LOCK_INIT(&transition_lock);
221 	disp_lock_enter_high(&transition_lock);
222 	DISP_LOCK_INIT(&stop_lock);
223 
224 	mutex_enter(&cpu_lock);
225 	CPU->cpu_disp->disp_maxrunpri = -1;
226 	CPU->cpu_disp->disp_max_unbound_pri = -1;
227 
228 	/*
229 	 * Initialize the default CPU partition.
230 	 */
231 	cpupart_initialize_default();
232 	/*
233 	 * Call the class specific initialization functions for
234 	 * all pre-installed schedulers.
235 	 *
236 	 * We pass the size of a class specific parameter
237 	 * buffer to each of the initialization functions
238 	 * to try to catch problems with backward compatibility
239 	 * of class modules.
240 	 *
241 	 * For example a new class module running on an old system
242 	 * which didn't provide sufficiently large parameter buffers
243 	 * would be bad news. Class initialization modules can check for
244 	 * this and take action if they detect a problem.
245 	 */
246 
247 	for (cid = 0; cid < nclass; cid++) {
248 		sclass_t	*sc;
249 
250 		sc = &sclass[cid];
251 		if (SCHED_INSTALLED(sc)) {
252 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
253 			    &sc->cl_funcs);
254 			if (cl_maxglobpri > maxglobpri)
255 				maxglobpri = cl_maxglobpri;
256 		}
257 	}
258 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
259 	if (kpqpri == KPQPRI)
260 		kpqpri = kpreemptpri;
261 
262 	ASSERT(maxglobpri >= 0);
263 	disp_setup(maxglobpri, 0);
264 
265 	mutex_exit(&cpu_lock);
266 
267 	/*
268 	 * Platform specific sticky scheduler setup.
269 	 */
270 	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
271 		cmp_set_nosteal_interval();
272 
273 	/*
274 	 * Get the default class ID; this may be later modified via
275 	 * dispadmin(1M).  This will load the class (normally TS) and that will
276 	 * call disp_add(), which is why we had to drop cpu_lock first.
277 	 */
278 	if (getcid(defaultclass, &defaultcid) != 0) {
279 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
280 		    defaultclass);
281 	}
282 }
283 
284 /*
285  * disp_add - Called with class pointer to initialize the dispatcher
286  *	      for a newly loaded class.
287  */
288 void
289 disp_add(sclass_t *clp)
290 {
291 	pri_t	maxglobpri;
292 	pri_t	cl_maxglobpri;
293 
294 	mutex_enter(&cpu_lock);
295 	/*
296 	 * Initialize the scheduler class.
297 	 */
298 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
299 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
300 	if (cl_maxglobpri > maxglobpri)
301 		maxglobpri = cl_maxglobpri;
302 
303 	/*
304 	 * Save old queue information.  Since we're initializing a
305 	 * new scheduling class which has just been loaded, then
306 	 * the size of the dispq may have changed.  We need to handle
307 	 * that here.
308 	 */
309 	disp_setup(maxglobpri, v.v_nglobpris);
310 
311 	mutex_exit(&cpu_lock);
312 }
313 
314 
315 /*
316  * For each CPU, allocate new dispatch queues
317  * with the stated number of priorities.
318  */
319 static void
320 cpu_dispqalloc(int numpris)
321 {
322 	cpu_t	*cpup;
323 	struct disp_queue_info	*disp_mem;
324 	int i, num;
325 
326 	ASSERT(MUTEX_HELD(&cpu_lock));
327 
328 	disp_mem = kmem_zalloc(NCPU *
329 	    sizeof (struct disp_queue_info), KM_SLEEP);
330 
331 	/*
332 	 * This routine must allocate all of the memory before stopping
333 	 * the cpus because it must not sleep in kmem_alloc while the
334 	 * CPUs are stopped.  Locks they hold will not be freed until they
335 	 * are restarted.
336 	 */
337 	i = 0;
338 	cpup = cpu_list;
339 	do {
340 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
341 		i++;
342 		cpup = cpup->cpu_next;
343 	} while (cpup != cpu_list);
344 	num = i;
345 
346 	pause_cpus(NULL);
347 	for (i = 0; i < num; i++)
348 		disp_dq_assign(&disp_mem[i], numpris);
349 	start_cpus();
350 
351 	/*
352 	 * I must free all of the memory after starting the cpus because
353 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
354 	 */
355 	for (i = 0; i < num; i++)
356 		disp_dq_free(&disp_mem[i]);
357 
358 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
359 }
360 
361 static void
362 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
363 {
364 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
365 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
366 	    sizeof (long), KM_SLEEP);
367 	dptr->dp = dp;
368 }
369 
370 static void
371 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
372 {
373 	disp_t	*dp;
374 
375 	dp = dptr->dp;
376 	dptr->olddispq = dp->disp_q;
377 	dptr->olddqactmap = dp->disp_qactmap;
378 	dptr->oldnglobpris = dp->disp_npri;
379 
380 	ASSERT(dptr->oldnglobpris < numpris);
381 
382 	if (dptr->olddispq != NULL) {
383 		/*
384 		 * Use kcopy because bcopy is platform-specific
385 		 * and could block while we might have paused the cpus.
386 		 */
387 		(void) kcopy(dptr->olddispq, dptr->newdispq,
388 		    dptr->oldnglobpris * sizeof (dispq_t));
389 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
390 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
391 		    sizeof (long));
392 	}
393 	dp->disp_q = dptr->newdispq;
394 	dp->disp_qactmap = dptr->newdqactmap;
395 	dp->disp_q_limit = &dptr->newdispq[numpris];
396 	dp->disp_npri = numpris;
397 }
398 
399 static void
400 disp_dq_free(struct disp_queue_info *dptr)
401 {
402 	if (dptr->olddispq != NULL)
403 		kmem_free(dptr->olddispq,
404 		    dptr->oldnglobpris * sizeof (dispq_t));
405 	if (dptr->olddqactmap != NULL)
406 		kmem_free(dptr->olddqactmap,
407 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
408 }
409 
410 /*
411  * For a newly created CPU, initialize the dispatch queue.
412  * This is called before the CPU is known through cpu[] or on any lists.
413  */
414 void
415 disp_cpu_init(cpu_t *cp)
416 {
417 	disp_t	*dp;
418 	dispq_t	*newdispq;
419 	ulong_t	*newdqactmap;
420 
421 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
422 
423 	if (cp == cpu0_disp.disp_cpu)
424 		dp = &cpu0_disp;
425 	else
426 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
427 	bzero(dp, sizeof (disp_t));
428 	cp->cpu_disp = dp;
429 	dp->disp_cpu = cp;
430 	dp->disp_maxrunpri = -1;
431 	dp->disp_max_unbound_pri = -1;
432 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
433 	/*
434 	 * Allocate memory for the dispatcher queue headers
435 	 * and the active queue bitmap.
436 	 */
437 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
438 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
439 	    sizeof (long), KM_SLEEP);
440 	dp->disp_q = newdispq;
441 	dp->disp_qactmap = newdqactmap;
442 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
443 	dp->disp_npri = v.v_nglobpris;
444 }
445 
446 void
447 disp_cpu_fini(cpu_t *cp)
448 {
449 	ASSERT(MUTEX_HELD(&cpu_lock));
450 
451 	disp_kp_free(cp->cpu_disp);
452 	if (cp->cpu_disp != &cpu0_disp)
453 		kmem_free(cp->cpu_disp, sizeof (disp_t));
454 }
455 
456 /*
457  * Allocate new, larger kpreempt dispatch queue to replace the old one.
458  */
459 void
460 disp_kp_alloc(disp_t *dq, pri_t npri)
461 {
462 	struct disp_queue_info	mem_info;
463 
464 	if (npri > dq->disp_npri) {
465 		/*
466 		 * Allocate memory for the new array.
467 		 */
468 		disp_dq_alloc(&mem_info, npri, dq);
469 
470 		/*
471 		 * We need to copy the old structures to the new
472 		 * and free the old.
473 		 */
474 		disp_dq_assign(&mem_info, npri);
475 		disp_dq_free(&mem_info);
476 	}
477 }
478 
479 /*
480  * Free dispatch queue.
481  * Used for the kpreempt queues for a removed CPU partition and
482  * for the per-CPU queues of deleted CPUs.
483  */
484 void
485 disp_kp_free(disp_t *dq)
486 {
487 	struct disp_queue_info	mem_info;
488 
489 	mem_info.olddispq = dq->disp_q;
490 	mem_info.olddqactmap = dq->disp_qactmap;
491 	mem_info.oldnglobpris = dq->disp_npri;
492 	disp_dq_free(&mem_info);
493 }
494 
495 /*
496  * End dispatcher and scheduler initialization.
497  */
498 
499 /*
500  * See if there's anything to do other than remain idle.
501  * Return non-zero if there is.
502  *
503  * This function must be called with high spl, or with
504  * kernel preemption disabled to prevent the partition's
505  * active cpu list from changing while being traversed.
506  *
507  */
508 int
509 disp_anywork(void)
510 {
511 	cpu_t   *cp = CPU;
512 	cpu_t   *ocp;
513 
514 	if (cp->cpu_disp->disp_nrunnable != 0)
515 		return (1);
516 
517 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
518 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
519 			return (1);
520 
521 		/*
522 		 * Work can be taken from another CPU if:
523 		 *	- There is unbound work on the run queue
524 		 *	- That work isn't a thread undergoing a
525 		 *	- context switch on an otherwise empty queue.
526 		 *	- The CPU isn't running the idle loop.
527 		 */
528 		for (ocp = cp->cpu_next_part; ocp != cp;
529 		    ocp = ocp->cpu_next_part) {
530 			ASSERT(CPU_ACTIVE(ocp));
531 
532 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
533 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
534 			    ocp->cpu_disp->disp_nrunnable == 1) &&
535 			    ocp->cpu_dispatch_pri != -1)
536 				return (1);
537 		}
538 	}
539 	return (0);
540 }
541 
542 /*
543  * Called when CPU enters the idle loop
544  */
545 static void
546 idle_enter()
547 {
548 	cpu_t		*cp = CPU;
549 
550 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
551 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
552 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
553 }
554 
555 /*
556  * Called when CPU exits the idle loop
557  */
558 static void
559 idle_exit()
560 {
561 	cpu_t		*cp = CPU;
562 
563 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
564 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
565 }
566 
/*
 * Idle loop.
 *
 * Body of the idle thread; it never returns.  Every pass either waits in
 * the platform idle routine (*idle_cpu)() or switches away to a runnable
 * thread.  idle_exit() is called just before each switch and idle_enter()
 * again once the idle thread gets the CPU back.  The "continue"
 * statements in the multiprocessor loop deliberately bypass the
 * idle_enter() at the bottom, because no idle_exit() was done on those
 * paths.
 */
void
idle()
{
	struct cpu	*cp = CPU;		/* pointer to this CPU */
	kthread_t	*t;			/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 * Only the local run queue is examined here.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			/* local work: let swtch() choose it via disp() */
			idle_exit();
			swtch();
		} else {
			/* an offline CPU does not look for work elsewhere */
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				/*
				 * Nothing to take.  Reset cpu_chosen_level
				 * if the partition's kp queue is empty too,
				 * then wait in the platform idle routine.
				 */
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/*
			 * If there was a thread but we couldn't steal
			 * it, then keep trying.
			 */
			if (t == T_DONTSTEAL)
				continue;
			/* run the thread that disp_getwork() handed us */
			idle_exit();
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}
640 
641 
642 /*
643  * Preempt the currently running thread in favor of the highest
644  * priority thread.  The class of the current thread controls
645  * where it goes on the dispatcher queues. If panicking, turn
646  * preemption off.
647  */
648 void
649 preempt()
650 {
651 	kthread_t 	*t = curthread;
652 	klwp_t 		*lwp = ttolwp(curthread);
653 
654 	if (panicstr)
655 		return;
656 
657 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
658 
659 	thread_lock(t);
660 
661 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
662 		/*
663 		 * this thread has already been chosen to be run on
664 		 * another CPU. Clear kprunrun on this CPU since we're
665 		 * already headed for swtch().
666 		 */
667 		CPU->cpu_kprunrun = 0;
668 		thread_unlock_nopreempt(t);
669 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
670 	} else {
671 		if (lwp != NULL)
672 			lwp->lwp_ru.nivcsw++;
673 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
674 		THREAD_TRANSITION(t);
675 		CL_PREEMPT(t);
676 		DTRACE_SCHED(preempt);
677 		thread_unlock_nopreempt(t);
678 
679 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
680 
681 		swtch();		/* clears CPU->cpu_runrun via disp() */
682 	}
683 }
684 
extern kthread_t *thread_unpin();

/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 *
 * Candidates are considered in this order:
 *   1. the partition's kp queue, while its best priority is at least that
 *      of the local queue and the CPU is not offline;
 *   2. the head of the highest-priority local queue;
 *   3. if the local queue is empty, whatever disp_getwork() can find on
 *      other queues (skipped when the CPU is offline);
 *   4. the idle thread.
 *
 * A thread taken in step 1 or 2 must pass disp_ratify(); if it does not,
 * the selection is redone.  A thread obtained in step 3 is returned as
 * handed back by disp_getwork().
 *
 * Returns the thread to run, never NULL.  The callers in this file note
 * that disp() returns with spl high.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL ||
			    tp == T_DONTSTEAL) {
				/* nothing could be taken: run the idle thread */
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			/* offline: go straight to the idle thread */
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		return (tp);
	}

	/* head of the highest-priority non-empty local queue */
	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.  If other threads are still
		 * runnable, we just took the last one off the highest
		 * queue, so recompute disp_maxrunpri (lowering
		 * disp_max_unbound_pri with it if necessary); otherwise
		 * reset both to -1.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		/* more threads remain at this priority; just unlink tp */
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	/* note: this end-of-disp trace point fires before ratification */
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	/* choice not ratified: start the selection over */
	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	return (tp);
}
828 
/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 *
 *	Two cases are handled.  If the current thread has an interrupted
 *	thread underneath it (t_intr != NULL), that thread is unpinned and
 *	resumed.  Otherwise disp() picks the next thread; if it picks the
 *	current thread again, no context switch takes place.
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Setup and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		/*
		 * Sanity check: a thread calling swtch() should not still
		 * be TS_ONPROC on this CPU with t_preempt == 0.  The state
		 * is examined again under the thread lock before asserting.
		 */
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			/*
			 * Adjust the running-thread count kept by
			 * PG_NRUN_UPDATE when the CPU leaves or enters
			 * the idle thread.
			 */
			if (t == cp->cpu_idle_thread) {
				PG_NRUN_UPDATE(cp, 1);
			} else if (next == cp->cpu_idle_thread) {
				PG_NRUN_UPDATE(cp, -1);
			}

			/*
			 * If t was previously in the TS_ONPROC state,
			 * setfrontdq and setbackdq won't have set its t_waitrq.
			 * Since we now finally know that we're switching away
			 * from this thread, set its t_waitrq if it is on a run
			 * queue.
			 */
			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
				t->t_waitrq = gethrtime_unscaled();
			}

			/*
			 * restore mstate of thread that we are switching to
			 */
			restore_mstate(next);

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			/* record when this CPU last switched / t last ran */
			cp->cpu_last_swtch = t->t_disp_time = lbolt;
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			/* disp() chose the current thread again: keep running */
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}
927 
928 /*
929  * swtch_from_zombie()
930  *	Special case of swtch(), which allows checks for TS_ZOMB to be
931  *	eliminated from normal resume.
932  *	Find best runnable thread and run it.
933  *	Called with the current thread zombied.
934  *	Zombies cannot migrate, so CPU references are safe.
935  */
936 void
937 swtch_from_zombie()
938 {
939 	kthread_t	*next;
940 	cpu_t		*cpu = CPU;
941 
942 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
943 
944 	ASSERT(curthread->t_state == TS_ZOMB);
945 
946 	next = disp();			/* returns with spl high */
947 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
948 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
949 	ASSERT(next != curthread);
950 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
951 
952 	if (next == cpu->cpu_idle_thread)
953 		PG_NRUN_UPDATE(cpu, -1);
954 
955 	restore_mstate(next);
956 
957 	if (dtrace_vtime_active)
958 		dtrace_vtime_switch(next);
959 
960 	resume_from_zombie(next);
961 	/*
962 	 * The TR_RESUME_END and TR_SWTCH_END trace points
963 	 * appear at the end of resume(), because we certainly will not
964 	 * return here
965 	 */
966 }
967 
968 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
969 
970 /*
971  * search_disp_queues()
972  *	Search the given dispatch queues for thread tp.
973  *	Return 1 if tp is found, otherwise return 0.
974  */
975 static int
976 search_disp_queues(disp_t *dp, kthread_t *tp)
977 {
978 	dispq_t		*dq;
979 	dispq_t		*eq;
980 
981 	disp_lock_enter_high(&dp->disp_lock);
982 
983 	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
984 		kthread_t	*rp;
985 
986 		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
987 
988 		for (rp = dq->dq_first; rp; rp = rp->t_link)
989 			if (tp == rp) {
990 				disp_lock_exit_high(&dp->disp_lock);
991 				return (1);
992 			}
993 	}
994 	disp_lock_exit_high(&dp->disp_lock);
995 
996 	return (0);
997 }
998 
999 /*
1000  * thread_on_queue()
1001  *	Search all per-CPU dispatch queues and all partition-wide kpreempt
1002  *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
1003  */
1004 static int
1005 thread_on_queue(kthread_t *tp)
1006 {
1007 	cpu_t		*cp;
1008 	struct cpupart	*part;
1009 
1010 	ASSERT(getpil() >= DISP_LEVEL);
1011 
1012 	/*
1013 	 * Search the per-CPU dispatch queues for tp.
1014 	 */
1015 	cp = CPU;
1016 	do {
1017 		if (search_disp_queues(cp->cpu_disp, tp))
1018 			return (1);
1019 	} while ((cp = cp->cpu_next_onln) != CPU);
1020 
1021 	/*
1022 	 * Search the partition-wide kpreempt queues for tp.
1023 	 */
1024 	part = CPU->cpu_part;
1025 	do {
1026 		if (search_disp_queues(&part->cp_kp_queue, tp))
1027 			return (1);
1028 	} while ((part = part->cp_next) != CPU->cpu_part);
1029 
1030 	return (0);
1031 }
1032 
#else

/*
 * Without the debug search routines, thread_on_queue() is the constant 0,
 * so it is only meaningful in the form ASSERT(!thread_on_queue(tp)).
 */
#define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */

#endif  /* DEBUG && (DISP_DEBUG || lint) */
1038 
1039 /*
1040  * like swtch(), but switch to a specified thread taken from another CPU.
1041  *	called with spl high..
1042  */
1043 void
1044 swtch_to(kthread_t *next)
1045 {
1046 	cpu_t			*cp = CPU;
1047 
1048 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1049 
1050 	/*
1051 	 * Update context switch statistics.
1052 	 */
1053 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1054 
1055 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1056 
1057 	if (curthread == cp->cpu_idle_thread)
1058 		PG_NRUN_UPDATE(cp, 1);
1059 
1060 	/* OK to steal anything left on run queue */
1061 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1062 
1063 	/* record last execution time */
1064 	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;
1065 
1066 	/*
1067 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1068 	 * won't have set its t_waitrq.  Since we now finally know that we're
1069 	 * switching away from this thread, set its t_waitrq if it is on a run
1070 	 * queue.
1071 	 */
1072 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1073 		curthread->t_waitrq = gethrtime_unscaled();
1074 	}
1075 
1076 	/* restore next thread to previously running microstate */
1077 	restore_mstate(next);
1078 
1079 	if (dtrace_vtime_active)
1080 		dtrace_vtime_switch(next);
1081 
1082 	resume(next);
1083 	/*
1084 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1085 	 * appear at the end of resume(), because we may not
1086 	 * return here
1087 	 */
1088 }
1089 
1090 
1091 
1092 #define	CPU_IDLING(pri)	((pri) == -1)
1093 
1094 static void
1095 cpu_resched(cpu_t *cp, pri_t tpri)
1096 {
1097 	int	call_poke_cpu = 0;
1098 	pri_t   cpupri = cp->cpu_dispatch_pri;
1099 
1100 	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1101 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1102 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1103 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1104 			cp->cpu_runrun = 1;
1105 			aston(cp->cpu_dispthread);
1106 			if (tpri < kpreemptpri && cp != CPU)
1107 				call_poke_cpu = 1;
1108 		}
1109 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1110 			cp->cpu_kprunrun = 1;
1111 			if (cp != CPU)
1112 				call_poke_cpu = 1;
1113 		}
1114 	}
1115 
1116 	/*
1117 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1118 	 */
1119 	membar_enter();
1120 
1121 	if (call_poke_cpu)
1122 		poke_cpu(cp->cpu_id);
1123 }
1124 
1125 /*
1126  * Perform multi-level CMT load balancing of running threads.
1127  * tp is the thread being enqueued
1128  * cp is the hint CPU (chosen by cpu_choose()).
1129  */
1130 static cpu_t *
1131 cmt_balance(kthread_t *tp, cpu_t *cp)
1132 {
1133 	int		hint, i, cpu, nsiblings;
1134 	int		self = 0;
1135 	group_t		*cmt_pgs, *siblings;
1136 	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
1137 	int		pg_nrun, tpg_nrun;
1138 	int		level = 0;
1139 	cpu_t		*newcp;
1140 
1141 	ASSERT(THREAD_LOCK_HELD(tp));
1142 
1143 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
1144 
1145 	if (GROUP_SIZE(cmt_pgs) == 0)
1146 		return (cp);	/* nothing to do */
1147 
1148 	if (tp == curthread)
1149 		self = 1;
1150 
1151 	/*
1152 	 * Balance across siblings in the CPUs CMT lineage
1153 	 */
1154 	do {
1155 		pg = GROUP_ACCESS(cmt_pgs, level);
1156 
1157 		siblings = pg->cmt_siblings;
1158 		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */
1159 		if (nsiblings == 1)
1160 			continue;	/* nobody to balance against */
1161 
1162 		pg_nrun = pg->cmt_nrunning;
1163 		if (self &&
1164 		    bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid))
1165 			pg_nrun--;	/* Ignore curthread's effect */
1166 
1167 		hint = pg->cmt_hint;
1168 		/*
1169 		 * Check for validity of the hint
1170 		 * It should reference a valid sibling
1171 		 */
1172 		if (hint >= nsiblings)
1173 			hint = pg->cmt_hint = 0;
1174 		else
1175 			pg->cmt_hint++;
1176 
1177 		/*
1178 		 * Find a balancing candidate from among our siblings
1179 		 * "hint" is a hint for where to start looking
1180 		 */
1181 		i = hint;
1182 		do {
1183 			ASSERT(i < nsiblings);
1184 			pg_tmp = GROUP_ACCESS(siblings, i);
1185 
1186 			/*
1187 			 * The candidate must not be us, and must
1188 			 * have some CPU resources in the thread's
1189 			 * partition
1190 			 */
1191 			if (pg_tmp != pg &&
1192 			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
1193 			    ((pg_t *)pg_tmp)->pg_id)) {
1194 				tpg = pg_tmp;
1195 				break;
1196 			}
1197 
1198 			if (++i >= nsiblings)
1199 				i = 0;
1200 		} while (i != hint);
1201 
1202 		if (!tpg)
1203 			continue;	/* no candidates at this level */
1204 
1205 		/*
1206 		 * Check if the balancing target is underloaded
1207 		 * Decide to balance if the target is running fewer
1208 		 * threads, or if it's running the same number of threads
1209 		 * with more online CPUs
1210 		 */
1211 		tpg_nrun = tpg->cmt_nrunning;
1212 		if (pg_nrun > tpg_nrun ||
1213 		    (pg_nrun == tpg_nrun &&
1214 		    (GROUP_SIZE(&tpg->cmt_cpus_actv) >
1215 		    GROUP_SIZE(&pg->cmt_cpus_actv)))) {
1216 			break;
1217 		}
1218 		tpg = NULL;
1219 	} while (++level < GROUP_SIZE(cmt_pgs));
1220 
1221 
1222 	if (tpg) {
1223 		/*
1224 		 * Select an idle CPU from the target PG
1225 		 */
1226 		for (cpu = 0; cpu < GROUP_SIZE(&tpg->cmt_cpus_actv); cpu++) {
1227 			newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
1228 			if (newcp->cpu_part == tp->t_cpupart &&
1229 			    newcp->cpu_dispatch_pri == -1) {
1230 				cp = newcp;
1231 				break;
1232 			}
1233 		}
1234 	}
1235 
1236 	return (cp);
1237 }
1238 
1239 /*
1240  * setbackdq() keeps runqs balanced such that the difference in length
1241  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1242  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1243  * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1244  * try to keep runqs perfectly balanced regardless of the thread priority.
1245  */
1246 #define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
1247 #define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
1248 #define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1249 
1250 /*
1251  * Put the specified thread on the back of the dispatcher
1252  * queue corresponding to its current priority.
1253  *
1254  * Called with the thread in transition, onproc or stopped state
1255  * and locked (transition implies locked) and at high spl.
1256  * Returns with the thread in TS_RUN state and still locked.
1257  */
1258 void
1259 setbackdq(kthread_t *tp)
1260 {
1261 	dispq_t	*dq;
1262 	disp_t		*dp;
1263 	cpu_t		*cp;
1264 	pri_t		tpri;
1265 	int		bound;
1266 
1267 	ASSERT(THREAD_LOCK_HELD(tp));
1268 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1269 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1270 
1271 	/*
1272 	 * If thread is "swapped" or on the swap queue don't
1273 	 * queue it, but wake sched.
1274 	 */
1275 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1276 		disp_swapped_setrun(tp);
1277 		return;
1278 	}
1279 
1280 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1281 		bound = 1;
1282 	else
1283 		bound = 0;
1284 
1285 	tpri = DISP_PRIO(tp);
1286 	if (ncpus == 1)
1287 		cp = tp->t_cpu;
1288 	else if (!bound) {
1289 		if (tpri >= kpqpri) {
1290 			setkpdq(tp, SETKP_BACK);
1291 			return;
1292 		}
1293 		/*
1294 		 * Let cpu_choose suggest a CPU.
1295 		 */
1296 		cp = cpu_choose(tp, tpri);
1297 
1298 		if (tp->t_cpupart == cp->cpu_part) {
1299 			int	qlen;
1300 
1301 			/*
1302 			 * Perform any CMT load balancing
1303 			 */
1304 			cp = cmt_balance(tp, cp);
1305 
1306 			/*
1307 			 * Balance across the run queues
1308 			 */
1309 			qlen = RUNQ_LEN(cp, tpri);
1310 			if (tpri >= RUNQ_MATCH_PRI &&
1311 			    !(tp->t_schedflag & TS_RUNQMATCH))
1312 				qlen -= RUNQ_MAX_DIFF;
1313 			if (qlen > 0) {
1314 				cpu_t *newcp;
1315 
1316 				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1317 					newcp = cp->cpu_next_part;
1318 				} else if ((newcp = cp->cpu_next_lpl) == cp) {
1319 					newcp = cp->cpu_next_part;
1320 				}
1321 
1322 				if (RUNQ_LEN(newcp, tpri) < qlen) {
1323 					DTRACE_PROBE3(runq__balance,
1324 					    kthread_t *, tp,
1325 					    cpu_t *, cp, cpu_t *, newcp);
1326 					cp = newcp;
1327 				}
1328 			}
1329 		} else {
1330 			/*
1331 			 * Migrate to a cpu in the new partition.
1332 			 */
1333 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1334 			    tp->t_lpl, tp->t_pri, NULL);
1335 		}
1336 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1337 	} else {
1338 		/*
1339 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1340 		 * a short time until weak binding that existed when the
1341 		 * strong binding was established has dropped) so we must
1342 		 * favour weak binding over strong.
1343 		 */
1344 		cp = tp->t_weakbound_cpu ?
1345 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1346 	}
1347 	/*
1348 	 * A thread that is ONPROC may be temporarily placed on the run queue
1349 	 * but then chosen to run again by disp.  If the thread we're placing on
1350 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1351 	 * replacement process is actually scheduled in swtch().  In this
1352 	 * situation, curthread is the only thread that could be in the ONPROC
1353 	 * state.
1354 	 */
1355 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1356 		hrtime_t curtime;
1357 
1358 		curtime = gethrtime_unscaled();
1359 		(void) cpu_update_pct(tp, curtime);
1360 		tp->t_waitrq = curtime;
1361 	} else {
1362 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1363 	}
1364 
1365 	dp = cp->cpu_disp;
1366 	disp_lock_enter_high(&dp->disp_lock);
1367 
1368 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1369 	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1370 	    tpri, cp, tp);
1371 
1372 #ifndef NPROBE
1373 	/* Kernel probe */
1374 	if (tnf_tracing_active)
1375 		tnf_thread_queue(tp, cp, tpri);
1376 #endif /* NPROBE */
1377 
1378 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1379 
1380 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1381 	tp->t_disp_queue = dp;
1382 	tp->t_link = NULL;
1383 
1384 	dq = &dp->disp_q[tpri];
1385 	dp->disp_nrunnable++;
1386 	if (!bound)
1387 		dp->disp_steal = 0;
1388 	membar_enter();
1389 
1390 	if (dq->dq_sruncnt++ != 0) {
1391 		ASSERT(dq->dq_first != NULL);
1392 		dq->dq_last->t_link = tp;
1393 		dq->dq_last = tp;
1394 	} else {
1395 		ASSERT(dq->dq_first == NULL);
1396 		ASSERT(dq->dq_last == NULL);
1397 		dq->dq_first = dq->dq_last = tp;
1398 		BT_SET(dp->disp_qactmap, tpri);
1399 		if (tpri > dp->disp_maxrunpri) {
1400 			dp->disp_maxrunpri = tpri;
1401 			membar_enter();
1402 			cpu_resched(cp, tpri);
1403 		}
1404 	}
1405 
1406 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1407 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1408 		    cp == CPU) {
1409 			/*
1410 			 * If there are no other unbound threads on the
1411 			 * run queue, don't allow other CPUs to steal
1412 			 * this thread while we are in the middle of a
1413 			 * context switch. We may just switch to it
1414 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1415 			 * in swtch and swtch_to.
1416 			 */
1417 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1418 		}
1419 		dp->disp_max_unbound_pri = tpri;
1420 	}
1421 	(*disp_enq_thread)(cp, bound);
1422 }
1423 
1424 /*
1425  * Put the specified thread on the front of the dispatcher
1426  * queue corresponding to its current priority.
1427  *
1428  * Called with the thread in transition, onproc or stopped state
1429  * and locked (transition implies locked) and at high spl.
1430  * Returns with the thread in TS_RUN state and still locked.
1431  */
1432 void
1433 setfrontdq(kthread_t *tp)
1434 {
1435 	disp_t		*dp;
1436 	dispq_t		*dq;
1437 	cpu_t		*cp;
1438 	pri_t		tpri;
1439 	int		bound;
1440 
1441 	ASSERT(THREAD_LOCK_HELD(tp));
1442 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1443 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1444 
1445 	/*
1446 	 * If thread is "swapped" or on the swap queue don't
1447 	 * queue it, but wake sched.
1448 	 */
1449 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1450 		disp_swapped_setrun(tp);
1451 		return;
1452 	}
1453 
1454 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1455 		bound = 1;
1456 	else
1457 		bound = 0;
1458 
1459 	tpri = DISP_PRIO(tp);
1460 	if (ncpus == 1)
1461 		cp = tp->t_cpu;
1462 	else if (!bound) {
1463 		if (tpri >= kpqpri) {
1464 			setkpdq(tp, SETKP_FRONT);
1465 			return;
1466 		}
1467 		cp = tp->t_cpu;
1468 		if (tp->t_cpupart == cp->cpu_part) {
1469 			/*
1470 			 * If we are of higher or equal priority than
1471 			 * the highest priority runnable thread of
1472 			 * the current CPU, just pick this CPU.  Otherwise
1473 			 * Let cpu_choose() select the CPU.  If this cpu
1474 			 * is the target of an offline request then do not
1475 			 * pick it - a thread_nomigrate() on the in motion
1476 			 * cpu relies on this when it forces a preempt.
1477 			 */
1478 			if (tpri < cp->cpu_disp->disp_maxrunpri ||
1479 			    cp == cpu_inmotion)
1480 				cp = cpu_choose(tp, tpri);
1481 		} else {
1482 			/*
1483 			 * Migrate to a cpu in the new partition.
1484 			 */
1485 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1486 			    tp->t_lpl, tp->t_pri, NULL);
1487 		}
1488 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1489 	} else {
1490 		/*
1491 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1492 		 * a short time until weak binding that existed when the
1493 		 * strong binding was established has dropped) so we must
1494 		 * favour weak binding over strong.
1495 		 */
1496 		cp = tp->t_weakbound_cpu ?
1497 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1498 	}
1499 
1500 	/*
1501 	 * A thread that is ONPROC may be temporarily placed on the run queue
1502 	 * but then chosen to run again by disp.  If the thread we're placing on
1503 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1504 	 * replacement process is actually scheduled in swtch().  In this
1505 	 * situation, curthread is the only thread that could be in the ONPROC
1506 	 * state.
1507 	 */
1508 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1509 		hrtime_t curtime;
1510 
1511 		curtime = gethrtime_unscaled();
1512 		(void) cpu_update_pct(tp, curtime);
1513 		tp->t_waitrq = curtime;
1514 	} else {
1515 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1516 	}
1517 
1518 	dp = cp->cpu_disp;
1519 	disp_lock_enter_high(&dp->disp_lock);
1520 
1521 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1522 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1523 
1524 #ifndef NPROBE
1525 	/* Kernel probe */
1526 	if (tnf_tracing_active)
1527 		tnf_thread_queue(tp, cp, tpri);
1528 #endif /* NPROBE */
1529 
1530 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1531 
1532 	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
1533 	tp->t_disp_queue = dp;
1534 
1535 	dq = &dp->disp_q[tpri];
1536 	dp->disp_nrunnable++;
1537 	if (!bound)
1538 		dp->disp_steal = 0;
1539 	membar_enter();
1540 
1541 	if (dq->dq_sruncnt++ != 0) {
1542 		ASSERT(dq->dq_last != NULL);
1543 		tp->t_link = dq->dq_first;
1544 		dq->dq_first = tp;
1545 	} else {
1546 		ASSERT(dq->dq_last == NULL);
1547 		ASSERT(dq->dq_first == NULL);
1548 		tp->t_link = NULL;
1549 		dq->dq_first = dq->dq_last = tp;
1550 		BT_SET(dp->disp_qactmap, tpri);
1551 		if (tpri > dp->disp_maxrunpri) {
1552 			dp->disp_maxrunpri = tpri;
1553 			membar_enter();
1554 			cpu_resched(cp, tpri);
1555 		}
1556 	}
1557 
1558 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1559 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1560 		    cp == CPU) {
1561 			/*
1562 			 * If there are no other unbound threads on the
1563 			 * run queue, don't allow other CPUs to steal
1564 			 * this thread while we are in the middle of a
1565 			 * context switch. We may just switch to it
1566 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1567 			 * in swtch and swtch_to.
1568 			 */
1569 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1570 		}
1571 		dp->disp_max_unbound_pri = tpri;
1572 	}
1573 	(*disp_enq_thread)(cp, bound);
1574 }
1575 
1576 /*
1577  * Put a high-priority unbound thread on the kp queue
1578  */
1579 static void
1580 setkpdq(kthread_t *tp, int borf)
1581 {
1582 	dispq_t	*dq;
1583 	disp_t	*dp;
1584 	cpu_t	*cp;
1585 	pri_t	tpri;
1586 
1587 	tpri = DISP_PRIO(tp);
1588 
1589 	dp = &tp->t_cpupart->cp_kp_queue;
1590 	disp_lock_enter_high(&dp->disp_lock);
1591 
1592 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1593 
1594 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1595 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1596 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1597 	tp->t_disp_queue = dp;
1598 	dp->disp_nrunnable++;
1599 	dq = &dp->disp_q[tpri];
1600 
1601 	if (dq->dq_sruncnt++ != 0) {
1602 		if (borf == SETKP_BACK) {
1603 			ASSERT(dq->dq_first != NULL);
1604 			tp->t_link = NULL;
1605 			dq->dq_last->t_link = tp;
1606 			dq->dq_last = tp;
1607 		} else {
1608 			ASSERT(dq->dq_last != NULL);
1609 			tp->t_link = dq->dq_first;
1610 			dq->dq_first = tp;
1611 		}
1612 	} else {
1613 		if (borf == SETKP_BACK) {
1614 			ASSERT(dq->dq_first == NULL);
1615 			ASSERT(dq->dq_last == NULL);
1616 			dq->dq_first = dq->dq_last = tp;
1617 		} else {
1618 			ASSERT(dq->dq_last == NULL);
1619 			ASSERT(dq->dq_first == NULL);
1620 			tp->t_link = NULL;
1621 			dq->dq_first = dq->dq_last = tp;
1622 		}
1623 		BT_SET(dp->disp_qactmap, tpri);
1624 		if (tpri > dp->disp_max_unbound_pri)
1625 			dp->disp_max_unbound_pri = tpri;
1626 		if (tpri > dp->disp_maxrunpri) {
1627 			dp->disp_maxrunpri = tpri;
1628 			membar_enter();
1629 		}
1630 	}
1631 
1632 	cp = tp->t_cpu;
1633 	if (tp->t_cpupart != cp->cpu_part) {
1634 		/* migrate to a cpu in the new partition */
1635 		cp = tp->t_cpupart->cp_cpulist;
1636 	}
1637 	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1638 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1639 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1640 
1641 #ifndef NPROBE
1642 	/* Kernel probe */
1643 	if (tnf_tracing_active)
1644 		tnf_thread_queue(tp, cp, tpri);
1645 #endif /* NPROBE */
1646 
1647 	if (cp->cpu_chosen_level < tpri)
1648 		cp->cpu_chosen_level = tpri;
1649 	cpu_resched(cp, tpri);
1650 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1651 	(*disp_enq_thread)(cp, 0);
1652 }
1653 
1654 /*
1655  * Remove a thread from the dispatcher queue if it is on it.
1656  * It is not an error if it is not found but we return whether
1657  * or not it was found in case the caller wants to check.
1658  */
1659 int
1660 dispdeq(kthread_t *tp)
1661 {
1662 	disp_t		*dp;
1663 	dispq_t		*dq;
1664 	kthread_t	*rp;
1665 	kthread_t	*trp;
1666 	kthread_t	**ptp;
1667 	int		tpri;
1668 
1669 	ASSERT(THREAD_LOCK_HELD(tp));
1670 
1671 	if (tp->t_state != TS_RUN)
1672 		return (0);
1673 
1674 	/*
1675 	 * The thread is "swapped" or is on the swap queue and
1676 	 * hence no longer on the run queue, so return true.
1677 	 */
1678 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1679 		return (1);
1680 
1681 	tpri = DISP_PRIO(tp);
1682 	dp = tp->t_disp_queue;
1683 	ASSERT(tpri < dp->disp_npri);
1684 	dq = &dp->disp_q[tpri];
1685 	ptp = &dq->dq_first;
1686 	rp = *ptp;
1687 	trp = NULL;
1688 
1689 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1690 
1691 	/*
1692 	 * Search for thread in queue.
1693 	 * Double links would simplify this at the expense of disp/setrun.
1694 	 */
1695 	while (rp != tp && rp != NULL) {
1696 		trp = rp;
1697 		ptp = &trp->t_link;
1698 		rp = trp->t_link;
1699 	}
1700 
1701 	if (rp == NULL) {
1702 		panic("dispdeq: thread not on queue");
1703 	}
1704 
1705 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1706 
1707 	/*
1708 	 * Found it so remove it from queue.
1709 	 */
1710 	if ((*ptp = rp->t_link) == NULL)
1711 		dq->dq_last = trp;
1712 
1713 	dp->disp_nrunnable--;
1714 	if (--dq->dq_sruncnt == 0) {
1715 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1716 		if (dp->disp_nrunnable == 0) {
1717 			dp->disp_max_unbound_pri = -1;
1718 			dp->disp_maxrunpri = -1;
1719 		} else if (tpri == dp->disp_maxrunpri) {
1720 			int ipri;
1721 
1722 			ipri = bt_gethighbit(dp->disp_qactmap,
1723 			    dp->disp_maxrunpri >> BT_ULSHIFT);
1724 			if (ipri < dp->disp_max_unbound_pri)
1725 				dp->disp_max_unbound_pri = ipri;
1726 			dp->disp_maxrunpri = ipri;
1727 		}
1728 	}
1729 	tp->t_link = NULL;
1730 	THREAD_TRANSITION(tp);		/* put in intermediate state */
1731 	return (1);
1732 }
1733 
1734 
1735 /*
1736  * dq_sruninc and dq_srundec are public functions for
1737  * incrementing/decrementing the sruncnts when a thread on
1738  * a dispatcher queue is made schedulable/unschedulable by
1739  * resetting the TS_LOAD flag.
1740  *
1741  * The caller MUST have the thread lock and therefore the dispatcher
1742  * queue lock so that the operation which changes
1743  * the flag, the operation that checks the status of the thread to
1744  * determine if it's on a disp queue AND the call to this function
1745  * are one atomic operation with respect to interrupts.
1746  */
1747 
1748 /*
1749  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1750  */
1751 void
1752 dq_sruninc(kthread_t *t)
1753 {
1754 	ASSERT(t->t_state == TS_RUN);
1755 	ASSERT(t->t_schedflag & TS_LOAD);
1756 
1757 	THREAD_TRANSITION(t);
1758 	setfrontdq(t);
1759 }
1760 
1761 /*
1762  * See comment on calling conventions above.
1763  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1764  */
1765 void
1766 dq_srundec(kthread_t *t)
1767 {
1768 	ASSERT(t->t_schedflag & TS_LOAD);
1769 
1770 	(void) dispdeq(t);
1771 	disp_swapped_enq(t);
1772 }
1773 
1774 /*
1775  * Change the dispatcher lock of thread to the "swapped_lock"
1776  * and return with thread lock still held.
1777  *
1778  * Called with thread_lock held, in transition state, and at high spl.
1779  */
1780 void
1781 disp_swapped_enq(kthread_t *tp)
1782 {
1783 	ASSERT(THREAD_LOCK_HELD(tp));
1784 	ASSERT(tp->t_schedflag & TS_LOAD);
1785 
1786 	switch (tp->t_state) {
1787 	case TS_RUN:
1788 		disp_lock_enter_high(&swapped_lock);
1789 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1790 		break;
1791 	case TS_ONPROC:
1792 		disp_lock_enter_high(&swapped_lock);
1793 		THREAD_TRANSITION(tp);
1794 		wake_sched_sec = 1;		/* tell clock to wake sched */
1795 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1796 		break;
1797 	default:
1798 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1799 	}
1800 }
1801 
1802 /*
1803  * This routine is called by setbackdq/setfrontdq if the thread is
1804  * not loaded or loaded and on the swap queue.
1805  *
1806  * Thread state TS_SLEEP implies that a swapped thread
1807  * has been woken up and needs to be swapped in by the swapper.
1808  *
1809  * Thread state TS_RUN, it implies that the priority of a swapped
1810  * thread is being increased by scheduling class (e.g. ts_update).
1811  */
1812 static void
1813 disp_swapped_setrun(kthread_t *tp)
1814 {
1815 	ASSERT(THREAD_LOCK_HELD(tp));
1816 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1817 
1818 	switch (tp->t_state) {
1819 	case TS_SLEEP:
1820 		disp_lock_enter_high(&swapped_lock);
1821 		/*
1822 		 * Wakeup sched immediately (i.e., next tick) if the
1823 		 * thread priority is above maxclsyspri.
1824 		 */
1825 		if (DISP_PRIO(tp) > maxclsyspri)
1826 			wake_sched = 1;
1827 		else
1828 			wake_sched_sec = 1;
1829 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1830 		break;
1831 	case TS_RUN:				/* called from ts_update */
1832 		break;
1833 	default:
1834 		panic("disp_swapped_setrun: tp: %p bad t_state", tp);
1835 	}
1836 }
1837 
1838 
1839 /*
1840  *	Make a thread give up its processor.  Find the processor on
1841  *	which this thread is executing, and have that processor
1842  *	preempt.
1843  */
1844 void
1845 cpu_surrender(kthread_t *tp)
1846 {
1847 	cpu_t	*cpup;
1848 	int	max_pri;
1849 	int	max_run_pri;
1850 	klwp_t	*lwp;
1851 
1852 	ASSERT(THREAD_LOCK_HELD(tp));
1853 
1854 	if (tp->t_state != TS_ONPROC)
1855 		return;
1856 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
1857 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1858 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1859 	if (max_pri < max_run_pri)
1860 		max_pri = max_run_pri;
1861 
1862 	cpup->cpu_runrun = 1;
1863 	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1864 		cpup->cpu_kprunrun = 1;
1865 	}
1866 
1867 	/*
1868 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1869 	 */
1870 	membar_enter();
1871 
1872 	DTRACE_SCHED1(surrender, kthread_t *, tp);
1873 
1874 	/*
1875 	 * Make the target thread take an excursion through trap()
1876 	 * to do preempt() (unless we're already in trap or post_syscall,
1877 	 * calling cpu_surrender via CL_TRAPRET).
1878 	 */
1879 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1880 	    lwp->lwp_state != LWP_USER) {
1881 		aston(tp);
1882 		if (cpup != CPU)
1883 			poke_cpu(cpup->cpu_id);
1884 	}
1885 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1886 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
1887 }
1888 
1889 
1890 /*
1891  * Commit to and ratify a scheduling decision
1892  */
1893 /*ARGSUSED*/
1894 static kthread_t *
1895 disp_ratify(kthread_t *tp, disp_t *kpq)
1896 {
1897 	pri_t	tpri, maxpri;
1898 	pri_t	maxkpri;
1899 	cpu_t	*cpup;
1900 
1901 	ASSERT(tp != NULL);
1902 	/*
1903 	 * Commit to, then ratify scheduling decision
1904 	 */
1905 	cpup = CPU;
1906 	if (cpup->cpu_runrun != 0)
1907 		cpup->cpu_runrun = 0;
1908 	if (cpup->cpu_kprunrun != 0)
1909 		cpup->cpu_kprunrun = 0;
1910 	if (cpup->cpu_chosen_level != -1)
1911 		cpup->cpu_chosen_level = -1;
1912 	membar_enter();
1913 	tpri = DISP_PRIO(tp);
1914 	maxpri = cpup->cpu_disp->disp_maxrunpri;
1915 	maxkpri = kpq->disp_maxrunpri;
1916 	if (maxpri < maxkpri)
1917 		maxpri = maxkpri;
1918 	if (tpri < maxpri) {
1919 		/*
1920 		 * should have done better
1921 		 * put this one back and indicate to try again
1922 		 */
1923 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
1924 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1925 		thread_lock_high(tp);
1926 		THREAD_TRANSITION(tp);
1927 		setfrontdq(tp);
1928 		thread_unlock_nopreempt(tp);
1929 
1930 		tp = NULL;
1931 	}
1932 	return (tp);
1933 }
1934 
1935 /*
1936  * See if there is any work on the dispatcher queue for other CPUs.
1937  * If there is, dequeue the best thread and return.
1938  */
1939 static kthread_t *
1940 disp_getwork(cpu_t *cp)
1941 {
1942 	cpu_t		*ocp;		/* other CPU */
1943 	cpu_t		*ocp_start;
1944 	cpu_t		*tcp;		/* target local CPU */
1945 	kthread_t	*tp;
1946 	kthread_t	*retval = NULL;
1947 	pri_t		maxpri;
1948 	disp_t		*kpq;		/* kp queue for this partition */
1949 	lpl_t		*lpl, *lpl_leaf;
1950 	int		hint, leafidx;
1951 	hrtime_t	stealtime;
1952 
1953 	maxpri = -1;
1954 	tcp = NULL;
1955 
1956 	kpq = &cp->cpu_part->cp_kp_queue;
1957 	while (kpq->disp_maxrunpri >= 0) {
1958 		/*
1959 		 * Try to take a thread from the kp_queue.
1960 		 */
1961 		tp = (disp_getbest(kpq));
1962 		if (tp)
1963 			return (disp_ratify(tp, kpq));
1964 	}
1965 
1966 	kpreempt_disable();		/* protect the cpu_active list */
1967 
1968 	/*
1969 	 * Try to find something to do on another CPU's run queue.
1970 	 * Loop through all other CPUs looking for the one with the highest
1971 	 * priority unbound thread.
1972 	 *
1973 	 * On NUMA machines, the partition's CPUs are consulted in order of
1974 	 * distance from the current CPU. This way, the first available
1975 	 * work found is also the closest, and will suffer the least
1976 	 * from being migrated.
1977 	 */
1978 	lpl = lpl_leaf = cp->cpu_lpl;
1979 	hint = leafidx = 0;
1980 
1981 	/*
1982 	 * This loop traverses the lpl hierarchy. Higher level lpls represent
1983 	 * broader levels of locality
1984 	 */
1985 	do {
1986 		/* This loop iterates over the lpl's leaves */
1987 		do {
1988 			if (lpl_leaf != cp->cpu_lpl)
1989 				ocp = lpl_leaf->lpl_cpus;
1990 			else
1991 				ocp = cp->cpu_next_lpl;
1992 
1993 			/* This loop iterates over the CPUs in the leaf */
1994 			ocp_start = ocp;
1995 			do {
1996 				pri_t pri;
1997 
1998 				ASSERT(CPU_ACTIVE(ocp));
1999 
2000 				/*
2001 				 * End our stroll around this lpl if:
2002 				 *
2003 				 * - Something became runnable on the local
2004 				 *   queue...which also ends our stroll around
2005 				 *   the partition.
2006 				 *
2007 				 * - We happen across another idle CPU.
2008 				 *   Since it is patrolling the next portion
2009 				 *   of the lpl's list (assuming it's not
2010 				 *   halted), move to the next higher level
2011 				 *   of locality.
2012 				 */
2013 				if (cp->cpu_disp->disp_nrunnable != 0) {
2014 					kpreempt_enable();
2015 					return (NULL);
2016 				}
2017 				if (ocp->cpu_dispatch_pri == -1) {
2018 					if (ocp->cpu_disp_flags &
2019 					    CPU_DISP_HALTED)
2020 						continue;
2021 					else
2022 						break;
2023 				}
2024 
2025 				/*
2026 				 * If there's only one thread and the CPU
2027 				 * is in the middle of a context switch,
2028 				 * or it's currently running the idle thread,
2029 				 * don't steal it.
2030 				 */
2031 				if ((ocp->cpu_disp_flags &
2032 				    CPU_DISP_DONTSTEAL) &&
2033 				    ocp->cpu_disp->disp_nrunnable == 1)
2034 					continue;
2035 
2036 				pri = ocp->cpu_disp->disp_max_unbound_pri;
2037 				if (pri > maxpri) {
2038 					/*
2039 					 * Don't steal threads that we attempted
2040 					 * to steal recently until they're ready
2041 					 * to be stolen again.
2042 					 */
2043 					stealtime = ocp->cpu_disp->disp_steal;
2044 					if (stealtime == 0 ||
2045 					    stealtime - gethrtime() <= 0) {
2046 						maxpri = pri;
2047 						tcp = ocp;
2048 					} else {
2049 						/*
2050 						 * Don't update tcp, just set
2051 						 * the retval to T_DONTSTEAL, so
2052 						 * that if no acceptable CPUs
2053 						 * are found the return value
2054 						 * will be T_DONTSTEAL rather
2055 						 * then NULL.
2056 						 */
2057 						retval = T_DONTSTEAL;
2058 					}
2059 				}
2060 			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2061 
2062 			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2063 				leafidx = 0;
2064 				lpl_leaf = lpl->lpl_rset[leafidx];
2065 			}
2066 		} while (leafidx != hint);
2067 
2068 		hint = leafidx = lpl->lpl_hint;
2069 		if ((lpl = lpl->lpl_parent) != NULL)
2070 			lpl_leaf = lpl->lpl_rset[hint];
2071 	} while (!tcp && lpl);
2072 
2073 	kpreempt_enable();
2074 
2075 	/*
2076 	 * If another queue looks good, and there is still nothing on
2077 	 * the local queue, try to transfer one or more threads
2078 	 * from it to our queue.
2079 	 */
2080 	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2081 		tp = disp_getbest(tcp->cpu_disp);
2082 		if (tp == NULL || tp == T_DONTSTEAL)
2083 			return (tp);
2084 		return (disp_ratify(tp, kpq));
2085 	}
2086 	return (retval);
2087 }
2088 
2089 
2090 /*
2091  * disp_fix_unbound_pri()
2092  *	Determines the maximum priority of unbound threads on the queue.
2093  *	The priority is kept for the queue, but is only increased, never
2094  *	reduced unless some CPU is looking for something on that queue.
2095  *
2096  *	The priority argument is the known upper limit.
2097  *
2098  *	Perhaps this should be kept accurately, but that probably means
2099  *	separate bitmaps for bound and unbound threads.  Since only idled
2100  *	CPUs will have to do this recalculation, it seems better this way.
2101  */
2102 static void
2103 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2104 {
2105 	kthread_t	*tp;
2106 	dispq_t		*dq;
2107 	ulong_t		*dqactmap = dp->disp_qactmap;
2108 	ulong_t		mapword;
2109 	int		wx;
2110 
2111 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2112 
2113 	ASSERT(pri >= 0);			/* checked by caller */
2114 
2115 	/*
2116 	 * Start the search at the next lowest priority below the supplied
2117 	 * priority.  This depends on the bitmap implementation.
2118 	 */
2119 	do {
2120 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
2121 
2122 		/*
2123 		 * Form mask for all lower priorities in the word.
2124 		 */
2125 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2126 
2127 		/*
2128 		 * Get next lower active priority.
2129 		 */
2130 		if (mapword != 0) {
2131 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2132 		} else if (wx > 0) {
2133 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2134 			if (pri < 0)
2135 				break;
2136 		} else {
2137 			pri = -1;
2138 			break;
2139 		}
2140 
2141 		/*
2142 		 * Search the queue for unbound, runnable threads.
2143 		 */
2144 		dq = &dp->disp_q[pri];
2145 		tp = dq->dq_first;
2146 
2147 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2148 			tp = tp->t_link;
2149 		}
2150 
2151 		/*
2152 		 * If a thread was found, set the priority and return.
2153 		 */
2154 	} while (tp == NULL);
2155 
2156 	/*
2157 	 * pri holds the maximum unbound thread priority or -1.
2158 	 */
2159 	if (dp->disp_max_unbound_pri != pri)
2160 		dp->disp_max_unbound_pri = pri;
2161 }
2162 
2163 /*
2164  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2165  * 	check if the CPU to which is was previously bound should have
2166  * 	its disp_max_unbound_pri increased.
2167  */
2168 void
2169 disp_adjust_unbound_pri(kthread_t *tp)
2170 {
2171 	disp_t *dp;
2172 	pri_t tpri;
2173 
2174 	ASSERT(THREAD_LOCK_HELD(tp));
2175 
2176 	/*
2177 	 * Don't do anything if the thread is not bound, or
2178 	 * currently not runnable or swapped out.
2179 	 */
2180 	if (tp->t_bound_cpu == NULL ||
2181 	    tp->t_state != TS_RUN ||
2182 	    tp->t_schedflag & TS_ON_SWAPQ)
2183 		return;
2184 
2185 	tpri = DISP_PRIO(tp);
2186 	dp = tp->t_bound_cpu->cpu_disp;
2187 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2188 	if (tpri > dp->disp_max_unbound_pri)
2189 		dp->disp_max_unbound_pri = tpri;
2190 }
2191 
2192 /*
2193  * disp_getbest()
2194  *   De-queue the highest priority unbound runnable thread.
2195  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2196  *   Returns NULL if nothing found.
2197  *   Returns T_DONTSTEAL if the thread was not stealable.
2198  *   so that the caller will try again later.
2199  *
2200  *   Passed a pointer to a dispatch queue not associated with this CPU, and
2201  *   its type.
2202  */
2203 static kthread_t *
2204 disp_getbest(disp_t *dp)
2205 {
2206 	kthread_t	*tp;
2207 	dispq_t		*dq;
2208 	pri_t		pri;
2209 	cpu_t		*cp, *tcp;
2210 	boolean_t	allbound;
2211 
2212 	disp_lock_enter(&dp->disp_lock);
2213 
2214 	/*
2215 	 * If there is nothing to run, or the CPU is in the middle of a
2216 	 * context switch of the only thread, return NULL.
2217 	 */
2218 	tcp = dp->disp_cpu;
2219 	cp = CPU;
2220 	pri = dp->disp_max_unbound_pri;
2221 	if (pri == -1 ||
2222 	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2223 	    tcp->cpu_disp->disp_nrunnable == 1)) {
2224 		disp_lock_exit_nopreempt(&dp->disp_lock);
2225 		return (NULL);
2226 	}
2227 
2228 	dq = &dp->disp_q[pri];
2229 
2230 
2231 	/*
2232 	 * Assume that all threads are bound on this queue, and change it
2233 	 * later when we find out that it is not the case.
2234 	 */
2235 	allbound = B_TRUE;
2236 	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2237 		hrtime_t now, nosteal, rqtime;
2238 
2239 		/*
2240 		 * Skip over bound threads which could be here even
2241 		 * though disp_max_unbound_pri indicated this level.
2242 		 */
2243 		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2244 			continue;
2245 
2246 		/*
2247 		 * We've got some unbound threads on this queue, so turn
2248 		 * the allbound flag off now.
2249 		 */
2250 		allbound = B_FALSE;
2251 
2252 		/*
2253 		 * The thread is a candidate for stealing from its run queue. We
2254 		 * don't want to steal threads that became runnable just a
2255 		 * moment ago. This improves CPU affinity for threads that get
2256 		 * preempted for short periods of time and go back on the run
2257 		 * queue.
2258 		 *
2259 		 * We want to let it stay on its run queue if it was only placed
2260 		 * there recently and it was running on the same CPU before that
2261 		 * to preserve its cache investment. For the thread to remain on
2262 		 * its run queue, ALL of the following conditions must be
2263 		 * satisfied:
2264 		 *
2265 		 * - the disp queue should not be the kernel preemption queue
2266 		 * - delayed idle stealing should not be disabled
2267 		 * - nosteal_nsec should be non-zero
2268 		 * - it should run with user priority
2269 		 * - it should be on the run queue of the CPU where it was
2270 		 *   running before being placed on the run queue
2271 		 * - it should be the only thread on the run queue (to prevent
2272 		 *   extra scheduling latency for other threads)
2273 		 * - it should sit on the run queue for less than per-chip
2274 		 *   nosteal interval or global nosteal interval
2275 		 * - in case of CPUs with shared cache it should sit in a run
2276 		 *   queue of a CPU from a different chip
2277 		 *
2278 		 * The checks are arranged so that the ones that are faster are
2279 		 * placed earlier.
2280 		 */
2281 		if (tcp == NULL ||
2282 		    pri >= minclsyspri ||
2283 		    tp->t_cpu != tcp)
2284 			break;
2285 
2286 		/*
2287 		 * Steal immediately if, due to CMT processor architecture
2288 		 * migraiton between cp and tcp would incur no performance
2289 		 * penalty.
2290 		 */
2291 		if (pg_cmt_can_migrate(cp, tcp))
2292 			break;
2293 
2294 		nosteal = nosteal_nsec;
2295 		if (nosteal == 0)
2296 			break;
2297 
2298 		/*
2299 		 * Calculate time spent sitting on run queue
2300 		 */
2301 		now = gethrtime_unscaled();
2302 		rqtime = now - tp->t_waitrq;
2303 		scalehrtime(&rqtime);
2304 
2305 		/*
2306 		 * Steal immediately if the time spent on this run queue is more
2307 		 * than allowed nosteal delay.
2308 		 *
2309 		 * Negative rqtime check is needed here to avoid infinite
2310 		 * stealing delays caused by unlikely but not impossible
2311 		 * drifts between CPU times on different CPUs.
2312 		 */
2313 		if (rqtime > nosteal || rqtime < 0)
2314 			break;
2315 
2316 		DTRACE_PROBE4(nosteal, kthread_t *, tp,
2317 		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2318 		scalehrtime(&now);
2319 		/*
2320 		 * Calculate when this thread becomes stealable
2321 		 */
2322 		now += (nosteal - rqtime);
2323 
2324 		/*
2325 		 * Calculate time when some thread becomes stealable
2326 		 */
2327 		if (now < dp->disp_steal)
2328 			dp->disp_steal = now;
2329 	}
2330 
2331 	/*
2332 	 * If there were no unbound threads on this queue, find the queue
2333 	 * where they are and then return later. The value of
2334 	 * disp_max_unbound_pri is not always accurate because it isn't
2335 	 * reduced until another idle CPU looks for work.
2336 	 */
2337 	if (allbound)
2338 		disp_fix_unbound_pri(dp, pri);
2339 
2340 	/*
2341 	 * If we reached the end of the queue and found no unbound threads
2342 	 * then return NULL so that other CPUs will be considered.  If there
2343 	 * are unbound threads but they cannot yet be stolen, then
2344 	 * return T_DONTSTEAL and try again later.
2345 	 */
2346 	if (tp == NULL) {
2347 		disp_lock_exit_nopreempt(&dp->disp_lock);
2348 		return (allbound ? NULL : T_DONTSTEAL);
2349 	}
2350 
2351 	/*
2352 	 * Found a runnable, unbound thread, so remove it from queue.
2353 	 * dispdeq() requires that we have the thread locked, and we do,
2354 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
2355 	 * put the thread in transition state, thereby dropping the dispq
2356 	 * lock.
2357 	 */
2358 
2359 #ifdef DEBUG
2360 	{
2361 		int	thread_was_on_queue;
2362 
2363 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
2364 		ASSERT(thread_was_on_queue);
2365 	}
2366 
2367 #else /* DEBUG */
2368 	(void) dispdeq(tp);			/* drops disp_lock */
2369 #endif /* DEBUG */
2370 
2371 	/*
2372 	 * Reset the disp_queue steal time - we do not know what is the smallest
2373 	 * value across the queue is.
2374 	 */
2375 	dp->disp_steal = 0;
2376 
2377 	tp->t_schedflag |= TS_DONT_SWAP;
2378 
2379 	/*
2380 	 * Setup thread to run on the current CPU.
2381 	 */
2382 	tp->t_disp_queue = cp->cpu_disp;
2383 
2384 	cp->cpu_dispthread = tp;		/* protected by spl only */
2385 	cp->cpu_dispatch_pri = pri;
2386 
2387 	/*
2388 	 * There can be a memory synchronization race between disp_getbest()
2389 	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2390 	 * to preempt the current thread to run the enqueued thread while
2391 	 * disp_getbest() and disp_ratify() are changing the current thread
2392 	 * to the stolen thread. This may lead to a situation where
2393 	 * cpu_resched() tries to preempt the wrong thread and the
2394 	 * stolen thread continues to run on the CPU which has been tagged
2395 	 * for preemption.
2396 	 * Later the clock thread gets enqueued but doesn't get to run on the
2397 	 * CPU causing the system to hang.
2398 	 *
2399 	 * To avoid this, grabbing and dropping the disp_lock (which does
2400 	 * a memory barrier) is needed to synchronize the execution of
2401 	 * cpu_resched() with disp_getbest() and disp_ratify() and
2402 	 * synchronize the memory read and written by cpu_resched(),
2403 	 * disp_getbest(), and disp_ratify() with each other.
2404 	 *  (see CR#6482861 for more details).
2405 	 */
2406 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2407 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2408 
2409 	ASSERT(pri == DISP_PRIO(tp));
2410 
2411 	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2412 
2413 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
2414 
2415 	/*
2416 	 * Return with spl high so that swtch() won't need to raise it.
2417 	 * The disp_lock was dropped by dispdeq().
2418 	 */
2419 
2420 	return (tp);
2421 }
2422 
2423 /*
2424  * disp_bound_common() - common routine for higher level functions
2425  *	that check for bound threads under certain conditions.
2426  *	If 'threadlistsafe' is set then there is no need to acquire
2427  *	pidlock to stop the thread list from changing (eg, if
2428  *	disp_bound_* is called with cpus paused).
2429  */
2430 static int
2431 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2432 {
2433 	int		found = 0;
2434 	kthread_t	*tp;
2435 
2436 	ASSERT(flag);
2437 
2438 	if (!threadlistsafe)
2439 		mutex_enter(&pidlock);
2440 	tp = curthread;		/* faster than allthreads */
2441 	do {
2442 		if (tp->t_state != TS_FREE) {
2443 			/*
2444 			 * If an interrupt thread is busy, but the
2445 			 * caller doesn't care (i.e. BOUND_INTR is off),
2446 			 * then just ignore it and continue through.
2447 			 */
2448 			if ((tp->t_flag & T_INTR_THREAD) &&
2449 			    !(flag & BOUND_INTR))
2450 				continue;
2451 
2452 			/*
2453 			 * Skip the idle thread for the CPU
2454 			 * we're about to set offline.
2455 			 */
2456 			if (tp == cp->cpu_idle_thread)
2457 				continue;
2458 
2459 			/*
2460 			 * Skip the pause thread for the CPU
2461 			 * we're about to set offline.
2462 			 */
2463 			if (tp == cp->cpu_pause_thread)
2464 				continue;
2465 
2466 			if ((flag & BOUND_CPU) &&
2467 			    (tp->t_bound_cpu == cp ||
2468 			    tp->t_bind_cpu == cp->cpu_id ||
2469 			    tp->t_weakbound_cpu == cp)) {
2470 				found = 1;
2471 				break;
2472 			}
2473 
2474 			if ((flag & BOUND_PARTITION) &&
2475 			    (tp->t_cpupart == cp->cpu_part)) {
2476 				found = 1;
2477 				break;
2478 			}
2479 		}
2480 	} while ((tp = tp->t_next) != curthread && found == 0);
2481 	if (!threadlistsafe)
2482 		mutex_exit(&pidlock);
2483 	return (found);
2484 }
2485 
2486 /*
2487  * disp_bound_threads - return nonzero if threads are bound to the processor.
2488  *	Called infrequently.  Keep this simple.
2489  *	Includes threads that are asleep or stopped but not onproc.
2490  */
2491 int
2492 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2493 {
2494 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2495 }
2496 
2497 /*
2498  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2499  * to the given processor, including interrupt threads.
2500  */
2501 int
2502 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2503 {
2504 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2505 }
2506 
2507 /*
2508  * disp_bound_partition - return nonzero if threads are bound to the same
2509  * partition as the processor.
2510  *	Called infrequently.  Keep this simple.
2511  *	Includes threads that are asleep or stopped but not onproc.
2512  */
2513 int
2514 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2515 {
2516 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2517 }
2518 
2519 /*
2520  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2521  * threads to other CPUs.
2522  */
2523 void
2524 disp_cpu_inactive(cpu_t *cp)
2525 {
2526 	kthread_t	*tp;
2527 	disp_t		*dp = cp->cpu_disp;
2528 	dispq_t		*dq;
2529 	pri_t		pri;
2530 	int		wasonq;
2531 
2532 	disp_lock_enter(&dp->disp_lock);
2533 	while ((pri = dp->disp_max_unbound_pri) != -1) {
2534 		dq = &dp->disp_q[pri];
2535 		tp = dq->dq_first;
2536 
2537 		/*
2538 		 * Skip over bound threads.
2539 		 */
2540 		while (tp != NULL && tp->t_bound_cpu != NULL) {
2541 			tp = tp->t_link;
2542 		}
2543 
2544 		if (tp == NULL) {
2545 			/* disp_max_unbound_pri must be inaccurate, so fix it */
2546 			disp_fix_unbound_pri(dp, pri);
2547 			continue;
2548 		}
2549 
2550 		wasonq = dispdeq(tp);		/* drops disp_lock */
2551 		ASSERT(wasonq);
2552 		ASSERT(tp->t_weakbound_cpu == NULL);
2553 
2554 		setbackdq(tp);
2555 		/*
2556 		 * Called from cpu_offline:
2557 		 *
2558 		 * cp has already been removed from the list of active cpus
2559 		 * and tp->t_cpu has been changed so there is no risk of
2560 		 * tp ending up back on cp.
2561 		 *
2562 		 * Called from cpupart_move_cpu:
2563 		 *
2564 		 * The cpu has moved to a new cpupart.  Any threads that
2565 		 * were on it's dispatch queues before the move remain
2566 		 * in the old partition and can't run in the new partition.
2567 		 */
2568 		ASSERT(tp->t_cpu != cp);
2569 		thread_unlock(tp);
2570 
2571 		disp_lock_enter(&dp->disp_lock);
2572 	}
2573 	disp_lock_exit(&dp->disp_lock);
2574 }
2575 
2576 /*
2577  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2578  *	The hint passed in is used as a starting point so we don't favor
2579  *	CPU 0 or any other CPU.  The caller should pass in the most recently
2580  *	used CPU for the thread.
2581  *
2582  *	The lgroup and priority are used to determine the best CPU to run on
2583  *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
2584  *	the thread priority will indicate whether the thread will actually run
2585  *	there.  To pick the best CPU, the CPUs inside and outside of the given
2586  *	lgroup which are running the lowest priority threads are found.  The
2587  *	remote CPU is chosen only if the thread will not run locally on a CPU
2588  *	within the lgroup, but will run on the remote CPU. If the thread
2589  *	cannot immediately run on any CPU, the best local CPU will be chosen.
2590  *
2591  *	The lpl specified also identifies the cpu partition from which
2592  *	disp_lowpri_cpu should select a CPU.
2593  *
2594  *	curcpu is used to indicate that disp_lowpri_cpu is being called on
2595  *      behalf of the current thread. (curthread is looking for a new cpu)
2596  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2597  *      ignored.
2598  *
2599  *      If a cpu is the target of an offline request then try to avoid it.
2600  *
2601  *	This function must be called at either high SPL, or with preemption
2602  *	disabled, so that the "hint" CPU cannot be removed from the online
2603  *	CPU list while we are traversing it.
2604  */
2605 cpu_t *
2606 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2607 {
2608 	cpu_t	*bestcpu;
2609 	cpu_t	*besthomecpu;
2610 	cpu_t   *cp, *cpstart;
2611 
2612 	pri_t   bestpri;
2613 	pri_t   cpupri;
2614 
2615 	klgrpset_t	done;
2616 	klgrpset_t	cur_set;
2617 
2618 	lpl_t		*lpl_iter, *lpl_leaf;
2619 	int		i;
2620 
2621 	/*
2622 	 * Scan for a CPU currently running the lowest priority thread.
2623 	 * Cannot get cpu_lock here because it is adaptive.
2624 	 * We do not require lock on CPU list.
2625 	 */
2626 	ASSERT(hint != NULL);
2627 	ASSERT(lpl != NULL);
2628 	ASSERT(lpl->lpl_ncpu > 0);
2629 
2630 	/*
2631 	 * First examine local CPUs. Note that it's possible the hint CPU
2632 	 * passed in in remote to the specified home lgroup. If our priority
2633 	 * isn't sufficient enough such that we can run immediately at home,
2634 	 * then examine CPUs remote to our home lgroup.
2635 	 * We would like to give preference to CPUs closest to "home".
2636 	 * If we can't find a CPU where we'll run at a given level
2637 	 * of locality, we expand our search to include the next level.
2638 	 */
2639 	bestcpu = besthomecpu = NULL;
2640 	klgrpset_clear(done);
2641 	/* start with lpl we were passed */
2642 
2643 	lpl_iter = lpl;
2644 
2645 	do {
2646 
2647 		bestpri = SHRT_MAX;
2648 		klgrpset_clear(cur_set);
2649 
2650 		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2651 			lpl_leaf = lpl_iter->lpl_rset[i];
2652 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2653 				continue;
2654 
2655 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2656 
2657 			if (hint->cpu_lpl == lpl_leaf)
2658 				cp = cpstart = hint;
2659 			else
2660 				cp = cpstart = lpl_leaf->lpl_cpus;
2661 
2662 			do {
2663 				if (cp == curcpu)
2664 					cpupri = -1;
2665 				else if (cp == cpu_inmotion)
2666 					cpupri = SHRT_MAX;
2667 				else
2668 					cpupri = cp->cpu_dispatch_pri;
2669 				if (cp->cpu_disp->disp_maxrunpri > cpupri)
2670 					cpupri = cp->cpu_disp->disp_maxrunpri;
2671 				if (cp->cpu_chosen_level > cpupri)
2672 					cpupri = cp->cpu_chosen_level;
2673 				if (cpupri < bestpri) {
2674 					if (CPU_IDLING(cpupri)) {
2675 						ASSERT((cp->cpu_flags &
2676 						    CPU_QUIESCED) == 0);
2677 						return (cp);
2678 					}
2679 					bestcpu = cp;
2680 					bestpri = cpupri;
2681 				}
2682 			} while ((cp = cp->cpu_next_lpl) != cpstart);
2683 		}
2684 
2685 		if (bestcpu && (tpri > bestpri)) {
2686 			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2687 			return (bestcpu);
2688 		}
2689 		if (besthomecpu == NULL)
2690 			besthomecpu = bestcpu;
2691 		/*
2692 		 * Add the lgrps we just considered to the "done" set
2693 		 */
2694 		klgrpset_or(done, cur_set);
2695 
2696 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2697 
2698 	/*
2699 	 * The specified priority isn't high enough to run immediately
2700 	 * anywhere, so just return the best CPU from the home lgroup.
2701 	 */
2702 	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2703 	return (besthomecpu);
2704 }
2705 
/*
 * Generic idle cpu function for all processors.
 *
 * If a processor has some specific code to execute when idle (say, to
 * stop the pipeline and save power) then that routine should be defined
 * in the processor-specific code (module_xx.c) and the global variable
 * idle_cpu set to that function.
 */
static void
generic_idle_cpu(void)
{
	/* intentionally empty: the generic version does nothing */
}
2717 
2718 /*ARGSUSED*/
2719 static void
2720 generic_enq_thread(cpu_t *cpu, int bound)
2721 {
2722 }
2723 
2724 /*
2725  * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
2726  *	- t->t_cpu is not in this thread's assigned lgrp
2727  *	- the time since the thread last came off t->t_cpu exceeds the
2728  *	  rechoose time for this cpu (ignore this if t is curthread in
2729  *	  which case it's on CPU and t->t_disp_time is inaccurate)
2730  *	- t->t_cpu is presently the target of an offline or partition move
2731  *	  request
2732  */
2733 static cpu_t *
2734 cpu_choose(kthread_t *t, pri_t tpri)
2735 {
2736 	ASSERT(tpri < kpqpri);
2737 
2738 	if ((((lbolt - t->t_disp_time) > rechoose_interval) &&
2739 	    t != curthread) || t->t_cpu == cpu_inmotion) {
2740 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
2741 	}
2742 
2743 	/*
2744 	 * Take a trip through disp_lowpri_cpu() if the thread was
2745 	 * running outside it's home lgroup
2746 	 */
2747 	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
2748 	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
2749 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
2750 		    (t == curthread) ? t->t_cpu : NULL));
2751 	}
2752 	return (t->t_cpu);
2753 }
2754