xref: /titanic_50/usr/src/uts/common/disp/disp.c (revision e1d9f4e6832768425b218c917c09c9afaed8ae36)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */
32 
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/sysmacros.h>
36 #include <sys/signal.h>
37 #include <sys/user.h>
38 #include <sys/systm.h>
39 #include <sys/sysinfo.h>
40 #include <sys/var.h>
41 #include <sys/errno.h>
42 #include <sys/cmn_err.h>
43 #include <sys/debug.h>
44 #include <sys/inline.h>
45 #include <sys/disp.h>
46 #include <sys/class.h>
47 #include <sys/bitmap.h>
48 #include <sys/kmem.h>
49 #include <sys/cpuvar.h>
50 #include <sys/vtrace.h>
51 #include <sys/tnf.h>
52 #include <sys/cpupart.h>
53 #include <sys/lgrp.h>
54 #include <sys/chip.h>
55 #include <sys/schedctl.h>
56 #include <sys/atomic.h>
57 #include <sys/dtrace.h>
58 #include <sys/sdt.h>
59 
60 #include <vm/as.h>
61 
62 #define	BOUND_CPU	0x1
63 #define	BOUND_PARTITION	0x2
64 #define	BOUND_INTR	0x4
65 
66 /* Dispatch queue allocation structure and functions */
67 struct disp_queue_info {
68 	disp_t	*dp;
69 	dispq_t *olddispq;
70 	dispq_t *newdispq;
71 	ulong_t	*olddqactmap;
72 	ulong_t	*newdqactmap;
73 	int	oldnglobpris;
74 };
75 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
76     disp_t *dp);
77 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
78 static void	disp_dq_free(struct disp_queue_info *dptr);
79 
80 /* platform-specific routine to call when processor is idle */
81 static void	generic_idle_cpu();
82 void		(*idle_cpu)() = generic_idle_cpu;
83 
84 /* routines invoked when a CPU enters/exits the idle loop */
85 static void	idle_enter();
86 static void	idle_exit();
87 
88 /* platform-specific routine to call when thread is enqueued */
89 static void	generic_enq_thread(cpu_t *, int);
90 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
91 
92 pri_t	kpreemptpri;	/* priority where kernel preemption applies */
93 pri_t	upreemptpri = 0; /* priority where normal preemption applies */
94 pri_t	intr_pri;	/* interrupt thread priority base level */
95 
96 #define	KPQPRI	-1 /* priority where cpu affinity is dropped for kp queue */
97 pri_t	kpqpri = KPQPRI; /* can be set in /etc/system */
98 disp_t	cpu0_disp;	/* boot CPU's dispatch queue */
99 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
100 int	nswapped;	/* total number of swapped threads */
101 void	disp_swapped_enq(kthread_t *tp);
102 static void	disp_swapped_setrun(kthread_t *tp);
103 static void	cpu_resched(cpu_t *cp, pri_t tpri);
104 
105 /*
106  * If this is set, only interrupt threads will cause kernel preemptions.
107  * This is done by changing the value of kpreemptpri.  kpreemptpri
108  * will either be the max sysclass pri + 1 or the min interrupt pri.
109  */
110 int	only_intr_kpreempt;
111 
112 extern void set_idle_cpu(int cpun);
113 extern void unset_idle_cpu(int cpun);
114 static void setkpdq(kthread_t *tp, int borf);
115 #define	SETKP_BACK	0
116 #define	SETKP_FRONT	1
117 /*
118  * Parameter that determines how recently a thread must have run
119  * on the CPU to be considered loosely-bound to that CPU to reduce
120  * cold cache effects.  The interval is in hertz.
121  *
122  * The platform may define a per physical processor adjustment of
123  * this parameter. For efficiency, the effective rechoose interval
124  * (rechoose_interval + per chip adjustment) is maintained in the
125  * cpu structures. See cpu_choose()
126  */
127 int	rechoose_interval = RECHOOSE_INTERVAL;
128 
129 static cpu_t	*cpu_choose(kthread_t *, pri_t);
130 
131 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
132 
133 disp_lock_t	transition_lock;	/* lock on transitioning threads */
134 disp_lock_t	stop_lock;		/* lock on stopped threads */
135 
136 static void		cpu_dispqalloc(int numpris);
137 
138 static kthread_t	*disp_getwork(cpu_t *to);
139 static kthread_t	*disp_getbest(disp_t *from);
140 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
141 
142 void	swtch_to(kthread_t *);
143 
144 /*
145  * dispatcher and scheduler initialization
146  */
147 
148 /*
149  * disp_setup - Common code to calculate and allocate dispatcher
150  *		variables and structures based on the maximum priority.
151  */
152 static void
153 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
154 {
155 	pri_t	newnglobpris;
156 
157 	ASSERT(MUTEX_HELD(&cpu_lock));
158 
159 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
160 
161 	if (newnglobpris > oldnglobpris) {
162 		/*
163 		 * Allocate new kp queues for each CPU partition.
164 		 */
165 		cpupart_kpqalloc(newnglobpris);
166 
167 		/*
168 		 * Allocate new dispatch queues for each CPU.
169 		 */
170 		cpu_dispqalloc(newnglobpris);
171 
172 		/*
173 		 * compute new interrupt thread base priority
174 		 */
175 		intr_pri = maxglobpri;
176 		if (only_intr_kpreempt) {
177 			kpreemptpri = intr_pri + 1;
178 			if (kpqpri == KPQPRI)
179 				kpqpri = kpreemptpri;
180 		}
181 		v.v_nglobpris = newnglobpris;
182 	}
183 }
184 
185 /*
186  * dispinit - Called to initialize all loaded classes and the
187  *	      dispatcher framework.
188  */
189 void
190 dispinit(void)
191 {
192 	id_t	cid;
193 	pri_t	maxglobpri;
194 	pri_t	cl_maxglobpri;
195 
196 	maxglobpri = -1;
197 
198 	/*
199 	 * Initialize transition lock, which will always be set.
200 	 */
201 	DISP_LOCK_INIT(&transition_lock);
202 	disp_lock_enter_high(&transition_lock);
203 	DISP_LOCK_INIT(&stop_lock);
204 
205 	mutex_enter(&cpu_lock);
206 	CPU->cpu_disp->disp_maxrunpri = -1;
207 	CPU->cpu_disp->disp_max_unbound_pri = -1;
208 	/*
209 	 * Initialize the default CPU partition.
210 	 */
211 	cpupart_initialize_default();
212 	/*
213 	 * Call the class specific initialization functions for
214 	 * all pre-installed schedulers.
215 	 *
216 	 * We pass the size of a class specific parameter
217 	 * buffer to each of the initialization functions
218 	 * to try to catch problems with backward compatibility
219 	 * of class modules.
220 	 *
221 	 * For example a new class module running on an old system
222 	 * which didn't provide sufficiently large parameter buffers
223 	 * would be bad news. Class initialization modules can check for
224 	 * this and take action if they detect a problem.
225 	 */
226 
227 	for (cid = 0; cid < nclass; cid++) {
228 		sclass_t	*sc;
229 
230 		sc = &sclass[cid];
231 		if (SCHED_INSTALLED(sc)) {
232 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
233 			    &sc->cl_funcs);
234 			if (cl_maxglobpri > maxglobpri)
235 				maxglobpri = cl_maxglobpri;
236 		}
237 	}
238 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
239 	if (kpqpri == KPQPRI)
240 		kpqpri = kpreemptpri;
241 
242 	ASSERT(maxglobpri >= 0);
243 	disp_setup(maxglobpri, 0);
244 
245 	mutex_exit(&cpu_lock);
246 
247 	/*
248 	 * Get the default class ID; this may be later modified via
249 	 * dispadmin(1M).  This will load the class (normally TS) and that will
250 	 * call disp_add(), which is why we had to drop cpu_lock first.
251 	 */
252 	if (getcid(defaultclass, &defaultcid) != 0) {
253 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
254 		    defaultclass);
255 	}
256 }
257 
258 /*
259  * disp_add - Called with class pointer to initialize the dispatcher
260  *	      for a newly loaded class.
261  */
262 void
263 disp_add(sclass_t *clp)
264 {
265 	pri_t	maxglobpri;
266 	pri_t	cl_maxglobpri;
267 
268 	mutex_enter(&cpu_lock);
269 	/*
270 	 * Initialize the scheduler class.
271 	 */
272 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
273 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
274 	if (cl_maxglobpri > maxglobpri)
275 		maxglobpri = cl_maxglobpri;
276 
277 	/*
278 	 * Save old queue information.  Since we're initializing a
279 	 * new scheduling class which has just been loaded, then
280 	 * the size of the dispq may have changed.  We need to handle
281 	 * that here.
282 	 */
283 	disp_setup(maxglobpri, v.v_nglobpris);
284 
285 	mutex_exit(&cpu_lock);
286 }
287 
288 
289 /*
290  * For each CPU, allocate new dispatch queues
291  * with the stated number of priorities.
292  */
293 static void
294 cpu_dispqalloc(int numpris)
295 {
296 	cpu_t	*cpup;
297 	struct disp_queue_info	*disp_mem;
298 	int i, num;
299 
300 	ASSERT(MUTEX_HELD(&cpu_lock));
301 
302 	disp_mem = kmem_zalloc(NCPU *
303 	    sizeof (struct disp_queue_info), KM_SLEEP);
304 
305 	/*
306 	 * This routine must allocate all of the memory before stopping
307 	 * the cpus because it must not sleep in kmem_alloc while the
308 	 * CPUs are stopped.  Locks they hold will not be freed until they
309 	 * are restarted.
310 	 */
311 	i = 0;
312 	cpup = cpu_list;
313 	do {
314 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
315 		i++;
316 		cpup = cpup->cpu_next;
317 	} while (cpup != cpu_list);
318 	num = i;
319 
320 	pause_cpus(NULL);
321 	for (i = 0; i < num; i++)
322 		disp_dq_assign(&disp_mem[i], numpris);
323 	start_cpus();
324 
325 	/*
326 	 * I must free all of the memory after starting the cpus because
327 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
328 	 */
329 	for (i = 0; i < num; i++)
330 		disp_dq_free(&disp_mem[i]);
331 
332 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
333 }
334 
335 static void
336 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
337 {
338 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
339 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
340 	    sizeof (long), KM_SLEEP);
341 	dptr->dp = dp;
342 }
343 
344 static void
345 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
346 {
347 	disp_t	*dp;
348 
349 	dp = dptr->dp;
350 	dptr->olddispq = dp->disp_q;
351 	dptr->olddqactmap = dp->disp_qactmap;
352 	dptr->oldnglobpris = dp->disp_npri;
353 
354 	ASSERT(dptr->oldnglobpris < numpris);
355 
356 	if (dptr->olddispq != NULL) {
357 		/*
358 		 * Use kcopy because bcopy is platform-specific
359 		 * and could block while we might have paused the cpus.
360 		 */
361 		(void) kcopy(dptr->olddispq, dptr->newdispq,
362 		    dptr->oldnglobpris * sizeof (dispq_t));
363 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
364 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
365 		    sizeof (long));
366 	}
367 	dp->disp_q = dptr->newdispq;
368 	dp->disp_qactmap = dptr->newdqactmap;
369 	dp->disp_q_limit = &dptr->newdispq[numpris];
370 	dp->disp_npri = numpris;
371 }
372 
373 static void
374 disp_dq_free(struct disp_queue_info *dptr)
375 {
376 	if (dptr->olddispq != NULL)
377 		kmem_free(dptr->olddispq,
378 		    dptr->oldnglobpris * sizeof (dispq_t));
379 	if (dptr->olddqactmap != NULL)
380 		kmem_free(dptr->olddqactmap,
381 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
382 }
383 
384 /*
385  * For a newly created CPU, initialize the dispatch queue.
386  * This is called before the CPU is known through cpu[] or on any lists.
387  */
388 void
389 disp_cpu_init(cpu_t *cp)
390 {
391 	disp_t	*dp;
392 	dispq_t	*newdispq;
393 	ulong_t	*newdqactmap;
394 
395 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
396 
397 	if (cp == cpu0_disp.disp_cpu)
398 		dp = &cpu0_disp;
399 	else
400 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
401 	bzero(dp, sizeof (disp_t));
402 	cp->cpu_disp = dp;
403 	dp->disp_cpu = cp;
404 	dp->disp_maxrunpri = -1;
405 	dp->disp_max_unbound_pri = -1;
406 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
407 	/*
408 	 * Allocate memory for the dispatcher queue headers
409 	 * and the active queue bitmap.
410 	 */
411 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
412 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
413 	    sizeof (long), KM_SLEEP);
414 	dp->disp_q = newdispq;
415 	dp->disp_qactmap = newdqactmap;
416 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
417 	dp->disp_npri = v.v_nglobpris;
418 }
419 
420 void
421 disp_cpu_fini(cpu_t *cp)
422 {
423 	ASSERT(MUTEX_HELD(&cpu_lock));
424 
425 	disp_kp_free(cp->cpu_disp);
426 	if (cp->cpu_disp != &cpu0_disp)
427 		kmem_free(cp->cpu_disp, sizeof (disp_t));
428 }
429 
430 /*
431  * Allocate new, larger kpreempt dispatch queue to replace the old one.
432  */
433 void
434 disp_kp_alloc(disp_t *dq, pri_t npri)
435 {
436 	struct disp_queue_info	mem_info;
437 
438 	if (npri > dq->disp_npri) {
439 		/*
440 		 * Allocate memory for the new array.
441 		 */
442 		disp_dq_alloc(&mem_info, npri, dq);
443 
444 		/*
445 		 * We need to copy the old structures to the new
446 		 * and free the old.
447 		 */
448 		disp_dq_assign(&mem_info, npri);
449 		disp_dq_free(&mem_info);
450 	}
451 }
452 
453 /*
454  * Free dispatch queue.
455  * Used for the kpreempt queues for a removed CPU partition and
456  * for the per-CPU queues of deleted CPUs.
457  */
458 void
459 disp_kp_free(disp_t *dq)
460 {
461 	struct disp_queue_info	mem_info;
462 
463 	mem_info.olddispq = dq->disp_q;
464 	mem_info.olddqactmap = dq->disp_qactmap;
465 	mem_info.oldnglobpris = dq->disp_npri;
466 	disp_dq_free(&mem_info);
467 }
468 
469 /*
470  * End dispatcher and scheduler initialization.
471  */
472 
473 /*
474  * See if there's anything to do other than remain idle.
475  * Return non-zero if there is.
476  *
477  * This function must be called with high spl, or with
478  * kernel preemption disabled to prevent the partition's
479  * active cpu list from changing while being traversed.
480  *
481  */
482 int
483 disp_anywork(void)
484 {
485 	cpu_t   *cp = CPU;
486 	cpu_t   *ocp;
487 
488 	if (cp->cpu_disp->disp_nrunnable != 0)
489 		return (1);
490 
491 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
492 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
493 			return (1);
494 
495 		/*
496 		 * Work can be taken from another CPU if:
497 		 *	- There is unbound work on the run queue
498 		 *	- That work isn't a thread undergoing a
499 		 *	- context switch on an otherwise empty queue.
500 		 *	- The CPU isn't running the idle loop.
501 		 */
502 		for (ocp = cp->cpu_next_part; ocp != cp;
503 		    ocp = ocp->cpu_next_part) {
504 			ASSERT(CPU_ACTIVE(ocp));
505 
506 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
507 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
508 			    ocp->cpu_disp->disp_nrunnable == 1) &&
509 			    ocp->cpu_dispatch_pri != -1)
510 				return (1);
511 		}
512 	}
513 	return (0);
514 }
515 
516 /*
517  * Called when CPU enters the idle loop
518  */
519 static void
520 idle_enter()
521 {
522 	cpu_t		*cp = CPU;
523 
524 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
525 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
526 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
527 }
528 
529 /*
530  * Called when CPU exits the idle loop
531  */
532 static void
533 idle_exit()
534 {
535 	cpu_t		*cp = CPU;
536 
537 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
538 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
539 }
540 
541 /*
542  * Idle loop.
543  */
544 void
545 idle()
546 {
547 	struct cpu	*cp = CPU;		/* pointer to this CPU */
548 	kthread_t	*t;			/* taken thread */
549 
550 	idle_enter();
551 
552 	/*
553 	 * Uniprocessor version of idle loop.
554 	 * Do this until notified that we're on an actual multiprocessor.
555 	 */
556 	while (ncpus == 1) {
557 		if (cp->cpu_disp->disp_nrunnable == 0) {
558 			(*idle_cpu)();
559 			continue;
560 		}
561 		idle_exit();
562 		swtch();
563 
564 		idle_enter(); /* returned from swtch */
565 	}
566 
567 	/*
568 	 * Multiprocessor idle loop.
569 	 */
570 	for (;;) {
571 		/*
572 		 * If CPU is completely quiesced by p_online(2), just wait
573 		 * here with minimal bus traffic until put online.
574 		 */
575 		while (cp->cpu_flags & CPU_QUIESCED)
576 			(*idle_cpu)();
577 
578 		if (cp->cpu_disp->disp_nrunnable != 0) {
579 			idle_exit();
580 			swtch();
581 		} else {
582 			if (cp->cpu_flags & CPU_OFFLINE)
583 				continue;
584 			if ((t = disp_getwork(cp)) == NULL) {
585 				if (cp->cpu_chosen_level != -1) {
586 					disp_t *dp = cp->cpu_disp;
587 					disp_t *kpq;
588 
589 					disp_lock_enter(&dp->disp_lock);
590 					/*
591 					 * Set kpq under lock to prevent
592 					 * migration between partitions.
593 					 */
594 					kpq = &cp->cpu_part->cp_kp_queue;
595 					if (kpq->disp_maxrunpri == -1)
596 						cp->cpu_chosen_level = -1;
597 					disp_lock_exit(&dp->disp_lock);
598 				}
599 				(*idle_cpu)();
600 				continue;
601 			}
602 			idle_exit();
603 			restore_mstate(t);
604 			swtch_to(t);
605 		}
606 		idle_enter(); /* returned from swtch/swtch_to */
607 	}
608 }
609 
610 
611 /*
612  * Preempt the currently running thread in favor of the highest
613  * priority thread.  The class of the current thread controls
614  * where it goes on the dispatcher queues. If panicking, turn
615  * preemption off.
616  */
617 void
618 preempt()
619 {
620 	kthread_t 	*t = curthread;
621 	klwp_t 		*lwp = ttolwp(curthread);
622 
623 	if (panicstr)
624 		return;
625 
626 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
627 
628 	thread_lock(t);
629 
630 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
631 		/*
632 		 * this thread has already been chosen to be run on
633 		 * another CPU. Clear kprunrun on this CPU since we're
634 		 * already headed for swtch().
635 		 */
636 		CPU->cpu_kprunrun = 0;
637 		thread_unlock_nopreempt(t);
638 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
639 	} else {
640 		if (lwp != NULL)
641 			lwp->lwp_ru.nivcsw++;
642 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
643 		THREAD_TRANSITION(t);
644 		CL_PREEMPT(t);
645 		DTRACE_SCHED(preempt);
646 		thread_unlock_nopreempt(t);
647 
648 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
649 
650 		swtch();		/* clears CPU->cpu_runrun via disp() */
651 	}
652 }
653 
654 extern kthread_t *thread_unpin();
655 
656 /*
657  * disp() - find the highest priority thread for this processor to run, and
658  * set it in TS_ONPROC state so that resume() can be called to run it.
659  */
660 static kthread_t *
661 disp()
662 {
663 	cpu_t		*cpup;
664 	disp_t		*dp;
665 	kthread_t	*tp;
666 	dispq_t		*dq;
667 	int		maxrunword;
668 	pri_t		pri;
669 	disp_t		*kpq;
670 
671 	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
672 
673 	cpup = CPU;
674 	/*
675 	 * Find the highest priority loaded, runnable thread.
676 	 */
677 	dp = cpup->cpu_disp;
678 
679 reschedule:
680 	/*
681 	 * If there is more important work on the global queue with a better
682 	 * priority than the maximum on this CPU, take it now.
683 	 */
684 	kpq = &cpup->cpu_part->cp_kp_queue;
685 	while ((pri = kpq->disp_maxrunpri) >= 0 &&
686 	    pri >= dp->disp_maxrunpri &&
687 	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
688 	    (tp = disp_getbest(kpq)) != NULL) {
689 		if (disp_ratify(tp, kpq) != NULL) {
690 			TRACE_1(TR_FAC_DISP, TR_DISP_END,
691 			    "disp_end:tid %p", tp);
692 			restore_mstate(tp);
693 			return (tp);
694 		}
695 	}
696 
697 	disp_lock_enter(&dp->disp_lock);
698 	pri = dp->disp_maxrunpri;
699 
700 	/*
701 	 * If there is nothing to run, look at what's runnable on other queues.
702 	 * Choose the idle thread if the CPU is quiesced.
703 	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
704 	 * interrupt threads, which will be the only threads on the CPU's own
705 	 * queue, but cannot run threads from other queues.
706 	 */
707 	if (pri == -1) {
708 		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
709 			disp_lock_exit(&dp->disp_lock);
710 			if ((tp = disp_getwork(cpup)) == NULL) {
711 				tp = cpup->cpu_idle_thread;
712 				(void) splhigh();
713 				THREAD_ONPROC(tp, cpup);
714 				cpup->cpu_dispthread = tp;
715 				cpup->cpu_dispatch_pri = -1;
716 				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
717 				cpup->cpu_chosen_level = -1;
718 			}
719 		} else {
720 			disp_lock_exit_high(&dp->disp_lock);
721 			tp = cpup->cpu_idle_thread;
722 			THREAD_ONPROC(tp, cpup);
723 			cpup->cpu_dispthread = tp;
724 			cpup->cpu_dispatch_pri = -1;
725 			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
726 			cpup->cpu_chosen_level = -1;
727 		}
728 		TRACE_1(TR_FAC_DISP, TR_DISP_END,
729 			"disp_end:tid %p", tp);
730 		restore_mstate(tp);
731 		return (tp);
732 	}
733 
734 	dq = &dp->disp_q[pri];
735 	tp = dq->dq_first;
736 
737 	ASSERT(tp != NULL);
738 	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */
739 
740 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
741 
742 	/*
743 	 * Found it so remove it from queue.
744 	 */
745 	dp->disp_nrunnable--;
746 	dq->dq_sruncnt--;
747 	if ((dq->dq_first = tp->t_link) == NULL) {
748 		ulong_t	*dqactmap = dp->disp_qactmap;
749 
750 		ASSERT(dq->dq_sruncnt == 0);
751 		dq->dq_last = NULL;
752 
753 		/*
754 		 * The queue is empty, so the corresponding bit needs to be
755 		 * turned off in dqactmap.   If nrunnable != 0 just took the
756 		 * last runnable thread off the
757 		 * highest queue, so recompute disp_maxrunpri.
758 		 */
759 		maxrunword = pri >> BT_ULSHIFT;
760 		dqactmap[maxrunword] &= ~BT_BIW(pri);
761 
762 		if (dp->disp_nrunnable == 0) {
763 			dp->disp_max_unbound_pri = -1;
764 			dp->disp_maxrunpri = -1;
765 		} else {
766 			int ipri;
767 
768 			ipri = bt_gethighbit(dqactmap, maxrunword);
769 			dp->disp_maxrunpri = ipri;
770 			if (ipri < dp->disp_max_unbound_pri)
771 				dp->disp_max_unbound_pri = ipri;
772 		}
773 	} else {
774 		tp->t_link = NULL;
775 	}
776 
777 	/*
778 	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
779 	 * out this thread before we have a chance to run it.
780 	 * While running, it is protected against swapping by t_lock.
781 	 */
782 	tp->t_schedflag |= TS_DONT_SWAP;
783 	cpup->cpu_dispthread = tp;		/* protected by spl only */
784 	cpup->cpu_dispatch_pri = pri;
785 	ASSERT(pri == DISP_PRIO(tp));
786 	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
787 	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */
788 
789 	ASSERT(tp != NULL);
790 	TRACE_1(TR_FAC_DISP, TR_DISP_END,
791 		"disp_end:tid %p", tp);
792 
793 	if (disp_ratify(tp, kpq) == NULL)
794 		goto reschedule;
795 
796 	restore_mstate(tp);
797 	return (tp);
798 }
799 
800 /*
801  * swtch()
802  *	Find best runnable thread and run it.
803  *	Called with the current thread already switched to a new state,
804  *	on a sleep queue, run queue, stopped, and not zombied.
805  *	May be called at any spl level less than or equal to LOCK_LEVEL.
806  *	Always drops spl to the base level (spl0()).
807  */
808 void
809 swtch()
810 {
811 	kthread_t	*t = curthread;
812 	kthread_t	*next;
813 	cpu_t		*cp;
814 
815 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
816 
817 	if (t->t_flag & T_INTR_THREAD)
818 		cpu_intr_swtch_enter(t);
819 
820 	if (t->t_intr != NULL) {
821 		/*
822 		 * We are an interrupt thread.  Setup and return
823 		 * the interrupted thread to be resumed.
824 		 */
825 		(void) splhigh();	/* block other scheduler action */
826 		cp = CPU;		/* now protected against migration */
827 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
828 		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
829 		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
830 		next = thread_unpin();
831 		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
832 		resume_from_intr(next);
833 	} else {
834 #ifdef	DEBUG
835 		if (t->t_state == TS_ONPROC &&
836 		    t->t_disp_queue->disp_cpu == CPU &&
837 		    t->t_preempt == 0) {
838 			thread_lock(t);
839 			ASSERT(t->t_state != TS_ONPROC ||
840 			    t->t_disp_queue->disp_cpu != CPU ||
841 			    t->t_preempt != 0);	/* cannot migrate */
842 			thread_unlock_nopreempt(t);
843 		}
844 #endif	/* DEBUG */
845 		cp = CPU;
846 		next = disp();		/* returns with spl high */
847 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
848 
849 		/* OK to steal anything left on run queue */
850 		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
851 
852 		if (next != t) {
853 			if (t == cp->cpu_idle_thread) {
854 				CHIP_NRUNNING(cp->cpu_chip, 1);
855 			} else if (next == cp->cpu_idle_thread) {
856 				CHIP_NRUNNING(cp->cpu_chip, -1);
857 			}
858 
859 			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
860 			cp->cpu_last_swtch = t->t_disp_time = lbolt;
861 			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
862 
863 			if (dtrace_vtime_active)
864 				dtrace_vtime_switch(next);
865 
866 			resume(next);
867 			/*
868 			 * The TR_RESUME_END and TR_SWTCH_END trace points
869 			 * appear at the end of resume(), because we may not
870 			 * return here
871 			 */
872 		} else {
873 			if (t->t_flag & T_INTR_THREAD)
874 				cpu_intr_swtch_exit(t);
875 
876 			DTRACE_SCHED(remain__cpu);
877 			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
878 			(void) spl0();
879 		}
880 	}
881 }
882 
883 /*
884  * swtch_from_zombie()
885  *	Special case of swtch(), which allows checks for TS_ZOMB to be
886  *	eliminated from normal resume.
887  *	Find best runnable thread and run it.
888  *	Called with the current thread zombied.
889  *	Zombies cannot migrate, so CPU references are safe.
890  */
891 void
892 swtch_from_zombie()
893 {
894 	kthread_t	*next;
895 	cpu_t		*cpu = CPU;
896 
897 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
898 
899 	ASSERT(curthread->t_state == TS_ZOMB);
900 
901 	next = disp();			/* returns with spl high */
902 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
903 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
904 	ASSERT(next != curthread);
905 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
906 
907 	if (next == cpu->cpu_idle_thread)
908 		CHIP_NRUNNING(cpu->cpu_chip, -1);
909 
910 	if (dtrace_vtime_active)
911 		dtrace_vtime_switch(next);
912 
913 	resume_from_zombie(next);
914 	/*
915 	 * The TR_RESUME_END and TR_SWTCH_END trace points
916 	 * appear at the end of resume(), because we certainly will not
917 	 * return here
918 	 */
919 }
920 
#if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
/*
 * Debug-only consistency check: return 1 if tp is linked on the dispatch
 * queue of any CPU reachable through cpu_next_onln, or on the kp queue of
 * the current CPU's partition; return 0 otherwise.
 *
 * The walk starts with the CPU after the current one, wraps around to the
 * current CPU and finally examines the partition kp queue.  Each queue is
 * searched with its disp_lock held.
 */
static int
thread_on_queue(kthread_t *tp)
{
	cpu_t	*cp;
	cpu_t	*self;
	disp_t	*dp;

	self = CPU;
	cp = self->cpu_next_onln;
	dp = cp->cpu_disp;
	for (;;) {
		dispq_t		*dq;
		dispq_t		*eq;

		disp_lock_enter_high(&dp->disp_lock);
		for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
			kthread_t	*rp;

			ASSERT(dq->dq_last == NULL ||
			    dq->dq_last->t_link == NULL);
			for (rp = dq->dq_first; rp; rp = rp->t_link)
				if (tp == rp) {
					disp_lock_exit_high(&dp->disp_lock);
					return (1);
				}
		}
		disp_lock_exit_high(&dp->disp_lock);

		/* cp == NULL marks that the kp queue was just searched. */
		if (cp == NULL)
			break;
		if (cp == self) {
			/*
			 * All per-CPU queues have been searched; finish
			 * with the partition kp queue.  Its address must
			 * be taken before cp is cleared, otherwise the
			 * lookup would go through a NULL pointer.
			 */
			dp = &cp->cpu_part->cp_kp_queue;
			cp = NULL;
		} else {
			cp = cp->cpu_next_onln;
			dp = cp->cpu_disp;
		}
	}
	return (0);
}	/* end of thread_on_queue */
#else

#define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */

#endif  /* DEBUG */
966 
967 /*
968  * like swtch(), but switch to a specified thread taken from another CPU.
969  *	called with spl high..
970  */
971 void
972 swtch_to(kthread_t *next)
973 {
974 	cpu_t			*cp = CPU;
975 
976 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
977 
978 	/*
979 	 * Update context switch statistics.
980 	 */
981 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
982 
983 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
984 
985 	if (curthread == cp->cpu_idle_thread)
986 		CHIP_NRUNNING(cp->cpu_chip, 1);
987 
988 	/* OK to steal anything left on run queue */
989 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
990 
991 	/* record last execution time */
992 	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;
993 
994 	if (dtrace_vtime_active)
995 		dtrace_vtime_switch(next);
996 
997 	resume(next);
998 	/*
999 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1000 	 * appear at the end of resume(), because we may not
1001 	 * return here
1002 	 */
1003 }
1004 
1005 
1006 
1007 #define	CPU_IDLING(pri)	((pri) == -1)
1008 
1009 static void
1010 cpu_resched(cpu_t *cp, pri_t tpri)
1011 {
1012 	int	call_poke_cpu = 0;
1013 	pri_t   cpupri = cp->cpu_dispatch_pri;
1014 
1015 	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1016 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1017 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1018 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1019 			cp->cpu_runrun = 1;
1020 			aston(cp->cpu_dispthread);
1021 			if (tpri < kpreemptpri && cp != CPU)
1022 				call_poke_cpu = 1;
1023 		}
1024 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1025 			cp->cpu_kprunrun = 1;
1026 			if (cp != CPU)
1027 				call_poke_cpu = 1;
1028 		}
1029 	}
1030 
1031 	/*
1032 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1033 	 */
1034 	membar_enter();
1035 
1036 	if (call_poke_cpu)
1037 		poke_cpu(cp->cpu_id);
1038 }
1039 
1040 /*
1041  * Routine used by setbackdq() to balance load across the physical
1042  * processors. Returns a CPU of a lesser loaded chip in the lgroup
1043  * if balancing is necessary, or the "hint" CPU if it's not.
1044  *
1045  * - tp is the thread being enqueued
1046  * - cp is a hint CPU (chosen by cpu_choose()).
1047  * - curchip (if not NULL) is the chip on which the current thread
1048  *   is running.
1049  *
1050  * The thread lock for "tp" must be held while calling this routine.
1051  */
1052 static cpu_t *
1053 chip_balance(kthread_t *tp, cpu_t *cp, chip_t *curchip)
1054 {
1055 	int	chp_nrun, ochp_nrun;
1056 	chip_t	*chp, *nchp;
1057 
1058 	chp = cp->cpu_chip;
1059 	chp_nrun = chp->chip_nrunning;
1060 
1061 	if (chp == curchip)
1062 		chp_nrun--;	/* Ignore curthread */
1063 
1064 	/*
1065 	 * If this chip isn't at all idle, then let
1066 	 * run queue balancing do the work.
1067 	 */
1068 	if (chp_nrun == chp->chip_ncpu)
1069 		return (cp);
1070 
1071 	nchp = chp->chip_balance;
1072 	do {
1073 		if (nchp == chp ||
1074 		    !CHIP_IN_CPUPART(nchp, tp->t_cpupart))
1075 			continue;
1076 
1077 		ochp_nrun = nchp->chip_nrunning;
1078 
1079 		/*
1080 		 * If the other chip is running less threads,
1081 		 * or if it's running the same number of threads, but
1082 		 * has more online logical CPUs, then choose to balance.
1083 		 */
1084 		if (chp_nrun > ochp_nrun ||
1085 		    (chp_nrun == ochp_nrun &&
1086 		    nchp->chip_ncpu > chp->chip_ncpu)) {
1087 			cp = nchp->chip_cpus;
1088 			nchp->chip_cpus = cp->cpu_next_chip;
1089 
1090 			/*
1091 			 * Find a CPU on the chip in the correct
1092 			 * partition. We know at least one exists
1093 			 * because of the CHIP_IN_CPUPART() check above.
1094 			 */
1095 			while (cp->cpu_part != tp->t_cpupart)
1096 				cp = cp->cpu_next_chip;
1097 		}
1098 		chp->chip_balance = nchp->chip_next_lgrp;
1099 		break;
1100 	} while ((nchp = nchp->chip_next_lgrp) != chp->chip_balance);
1101 
1102 	ASSERT(CHIP_IN_CPUPART(cp->cpu_chip, tp->t_cpupart));
1103 	return (cp);
1104 }
1105 
/*
 * Run queue balancing parameters for setbackdq().  The length of the
 * chosen runq may exceed that of the next one by at most RUNQ_MAX_DIFF.
 * For thread priorities below RUNQ_MATCH_PRI the runq lengths must
 * match.  With the per-thread TS_RUNQMATCH flag set, setbackdq() tries
 * to keep the runqs perfectly balanced whatever the thread priority.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1116 
1117 /*
1118  * Put the specified thread on the back of the dispatcher
1119  * queue corresponding to its current priority.
1120  *
1121  * Called with the thread in transition, onproc or stopped state
1122  * and locked (transition implies locked) and at high spl.
1123  * Returns with the thread in TS_RUN state and still locked.
1124  */
1125 void
1126 setbackdq(kthread_t *tp)
1127 {
1128 	dispq_t	*dq;
1129 	disp_t		*dp;
1130 	chip_t		*curchip = NULL;
1131 	cpu_t		*cp;
1132 	pri_t		tpri;
1133 	int		bound;
1134 
1135 	ASSERT(THREAD_LOCK_HELD(tp));
1136 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1137 
1138 	if (tp->t_waitrq == 0) {
1139 		hrtime_t curtime;
1140 
1141 		curtime = gethrtime_unscaled();
1142 		(void) cpu_update_pct(tp, curtime);
1143 		tp->t_waitrq = curtime;
1144 	} else {
1145 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1146 	}
1147 
1148 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1149 
1150 	/*
1151 	 * If thread is "swapped" or on the swap queue don't
1152 	 * queue it, but wake sched.
1153 	 */
1154 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1155 		disp_swapped_setrun(tp);
1156 		return;
1157 	}
1158 
1159 	tpri = DISP_PRIO(tp);
1160 	if (tp == curthread) {
1161 		curchip = CPU->cpu_chip;
1162 	}
1163 
1164 	if (ncpus == 1)
1165 		cp = tp->t_cpu;
1166 	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
1167 		if (tpri >= kpqpri) {
1168 			setkpdq(tp, SETKP_BACK);
1169 			return;
1170 		}
1171 		/*
1172 		 * Let cpu_choose suggest a CPU.
1173 		 */
1174 		cp = cpu_choose(tp, tpri);
1175 
1176 		if (tp->t_cpupart == cp->cpu_part) {
1177 			int	qlen;
1178 
1179 			/*
1180 			 * Select another CPU if we need
1181 			 * to do some load balancing across the
1182 			 * physical processors.
1183 			 */
1184 			if (CHIP_SHOULD_BALANCE(cp->cpu_chip))
1185 				cp = chip_balance(tp, cp, curchip);
1186 
1187 			/*
1188 			 * Balance across the run queues
1189 			 */
1190 			qlen = RUNQ_LEN(cp, tpri);
1191 			if (tpri >= RUNQ_MATCH_PRI &&
1192 			    !(tp->t_schedflag & TS_RUNQMATCH))
1193 				qlen -= RUNQ_MAX_DIFF;
1194 			if (qlen > 0) {
1195 				cpu_t	*np;
1196 
1197 				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
1198 					np = cp->cpu_next_part;
1199 				else {
1200 					if ((np = cp->cpu_next_lpl) == cp)
1201 						np = cp->cpu_next_part;
1202 				}
1203 				if (RUNQ_LEN(np, tpri) < qlen)
1204 					cp = np;
1205 			}
1206 		} else {
1207 			/*
1208 			 * Migrate to a cpu in the new partition.
1209 			 */
1210 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1211 			    tp->t_lpl, tp->t_pri, NULL);
1212 		}
1213 		bound = 0;
1214 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1215 	} else {
1216 		/*
1217 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1218 		 * a short time until weak binding that existed when the
1219 		 * strong binding was established has dropped) so we must
1220 		 * favour weak binding over strong.
1221 		 */
1222 		cp = tp->t_weakbound_cpu ?
1223 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1224 		bound = 1;
1225 	}
1226 	dp = cp->cpu_disp;
1227 	disp_lock_enter_high(&dp->disp_lock);
1228 
1229 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1230 	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1231 		tpri, cp, tp);
1232 
1233 #ifndef NPROBE
1234 	/* Kernel probe */
1235 	if (tnf_tracing_active)
1236 		tnf_thread_queue(tp, cp, tpri);
1237 #endif /* NPROBE */
1238 
1239 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1240 
1241 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1242 	tp->t_disp_queue = dp;
1243 	tp->t_link = NULL;
1244 
1245 	dq = &dp->disp_q[tpri];
1246 	dp->disp_nrunnable++;
1247 	membar_enter();
1248 
1249 	if (dq->dq_sruncnt++ != 0) {
1250 		ASSERT(dq->dq_first != NULL);
1251 		dq->dq_last->t_link = tp;
1252 		dq->dq_last = tp;
1253 	} else {
1254 		ASSERT(dq->dq_first == NULL);
1255 		ASSERT(dq->dq_last == NULL);
1256 		dq->dq_first = dq->dq_last = tp;
1257 		BT_SET(dp->disp_qactmap, tpri);
1258 		if (tpri > dp->disp_maxrunpri) {
1259 			dp->disp_maxrunpri = tpri;
1260 			membar_enter();
1261 			cpu_resched(cp, tpri);
1262 		}
1263 	}
1264 
1265 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1266 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1267 		    cp == CPU) {
1268 			/*
1269 			 * If there are no other unbound threads on the
1270 			 * run queue, don't allow other CPUs to steal
1271 			 * this thread while we are in the middle of a
1272 			 * context switch. We may just switch to it
1273 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1274 			 * in swtch and swtch_to.
1275 			 */
1276 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1277 		}
1278 		dp->disp_max_unbound_pri = tpri;
1279 	}
1280 	(*disp_enq_thread)(cp, bound);
1281 }
1282 
1283 /*
1284  * Put the specified thread on the front of the dispatcher
1285  * queue corresponding to its current priority.
1286  *
1287  * Called with the thread in transition, onproc or stopped state
1288  * and locked (transition implies locked) and at high spl.
1289  * Returns with the thread in TS_RUN state and still locked.
1290  */
1291 void
1292 setfrontdq(kthread_t *tp)
1293 {
1294 	disp_t		*dp;
1295 	dispq_t		*dq;
1296 	cpu_t		*cp;
1297 	pri_t		tpri;
1298 	int		bound;
1299 
1300 	ASSERT(THREAD_LOCK_HELD(tp));
1301 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1302 
1303 	if (tp->t_waitrq == 0) {
1304 		hrtime_t curtime;
1305 
1306 		curtime = gethrtime_unscaled();
1307 		(void) cpu_update_pct(tp, curtime);
1308 		tp->t_waitrq = curtime;
1309 	} else {
1310 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1311 	}
1312 
1313 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1314 
1315 	/*
1316 	 * If thread is "swapped" or on the swap queue don't
1317 	 * queue it, but wake sched.
1318 	 */
1319 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1320 		disp_swapped_setrun(tp);
1321 		return;
1322 	}
1323 
1324 	tpri = DISP_PRIO(tp);
1325 	if (ncpus == 1)
1326 		cp = tp->t_cpu;
1327 	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
1328 		if (tpri >= kpqpri) {
1329 			setkpdq(tp, SETKP_FRONT);
1330 			return;
1331 		}
1332 		cp = tp->t_cpu;
1333 		if (tp->t_cpupart == cp->cpu_part) {
1334 			/*
1335 			 * If we are of higher or equal priority than
1336 			 * the highest priority runnable thread of
1337 			 * the current CPU, just pick this CPU.  Otherwise
1338 			 * Let cpu_choose() select the CPU.  If this cpu
1339 			 * is the target of an offline request then do not
1340 			 * pick it - a thread_nomigrate() on the in motion
1341 			 * cpu relies on this when it forces a preempt.
1342 			 */
1343 			if (tpri < cp->cpu_disp->disp_maxrunpri ||
1344 			    cp == cpu_inmotion)
1345 				cp = cpu_choose(tp, tpri);
1346 		} else {
1347 			/*
1348 			 * Migrate to a cpu in the new partition.
1349 			 */
1350 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1351 			    tp->t_lpl, tp->t_pri, NULL);
1352 		}
1353 		bound = 0;
1354 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1355 	} else {
1356 		/*
1357 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1358 		 * a short time until weak binding that existed when the
1359 		 * strong binding was established has dropped) so we must
1360 		 * favour weak binding over strong.
1361 		 */
1362 		cp = tp->t_weakbound_cpu ?
1363 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1364 		bound = 1;
1365 	}
1366 	dp = cp->cpu_disp;
1367 	disp_lock_enter_high(&dp->disp_lock);
1368 
1369 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1370 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1371 
1372 #ifndef NPROBE
1373 	/* Kernel probe */
1374 	if (tnf_tracing_active)
1375 		tnf_thread_queue(tp, cp, tpri);
1376 #endif /* NPROBE */
1377 
1378 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1379 
1380 	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
1381 	tp->t_disp_queue = dp;
1382 
1383 	dq = &dp->disp_q[tpri];
1384 	dp->disp_nrunnable++;
1385 	membar_enter();
1386 
1387 	if (dq->dq_sruncnt++ != 0) {
1388 		ASSERT(dq->dq_last != NULL);
1389 		tp->t_link = dq->dq_first;
1390 		dq->dq_first = tp;
1391 	} else {
1392 		ASSERT(dq->dq_last == NULL);
1393 		ASSERT(dq->dq_first == NULL);
1394 		tp->t_link = NULL;
1395 		dq->dq_first = dq->dq_last = tp;
1396 		BT_SET(dp->disp_qactmap, tpri);
1397 		if (tpri > dp->disp_maxrunpri) {
1398 			dp->disp_maxrunpri = tpri;
1399 			membar_enter();
1400 			cpu_resched(cp, tpri);
1401 		}
1402 	}
1403 
1404 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1405 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1406 		    cp == CPU) {
1407 			/*
1408 			 * If there are no other unbound threads on the
1409 			 * run queue, don't allow other CPUs to steal
1410 			 * this thread while we are in the middle of a
1411 			 * context switch. We may just switch to it
1412 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1413 			 * in swtch and swtch_to.
1414 			 */
1415 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1416 		}
1417 		dp->disp_max_unbound_pri = tpri;
1418 	}
1419 	(*disp_enq_thread)(cp, bound);
1420 }
1421 
1422 /*
1423  * Put a high-priority unbound thread on the kp queue
1424  */
1425 static void
1426 setkpdq(kthread_t *tp, int borf)
1427 {
1428 	dispq_t	*dq;
1429 	disp_t	*dp;
1430 	cpu_t	*cp;
1431 	pri_t	tpri;
1432 
1433 	tpri = DISP_PRIO(tp);
1434 
1435 	dp = &tp->t_cpupart->cp_kp_queue;
1436 	disp_lock_enter_high(&dp->disp_lock);
1437 
1438 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1439 
1440 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1441 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1442 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1443 	tp->t_disp_queue = dp;
1444 	dp->disp_nrunnable++;
1445 	dq = &dp->disp_q[tpri];
1446 
1447 	if (dq->dq_sruncnt++ != 0) {
1448 		if (borf == SETKP_BACK) {
1449 			ASSERT(dq->dq_first != NULL);
1450 			tp->t_link = NULL;
1451 			dq->dq_last->t_link = tp;
1452 			dq->dq_last = tp;
1453 		} else {
1454 			ASSERT(dq->dq_last != NULL);
1455 			tp->t_link = dq->dq_first;
1456 			dq->dq_first = tp;
1457 		}
1458 	} else {
1459 		if (borf == SETKP_BACK) {
1460 			ASSERT(dq->dq_first == NULL);
1461 			ASSERT(dq->dq_last == NULL);
1462 			dq->dq_first = dq->dq_last = tp;
1463 		} else {
1464 			ASSERT(dq->dq_last == NULL);
1465 			ASSERT(dq->dq_first == NULL);
1466 			tp->t_link = NULL;
1467 			dq->dq_first = dq->dq_last = tp;
1468 		}
1469 		BT_SET(dp->disp_qactmap, tpri);
1470 		if (tpri > dp->disp_max_unbound_pri)
1471 			dp->disp_max_unbound_pri = tpri;
1472 		if (tpri > dp->disp_maxrunpri) {
1473 			dp->disp_maxrunpri = tpri;
1474 			membar_enter();
1475 		}
1476 	}
1477 
1478 	cp = tp->t_cpu;
1479 	if (tp->t_cpupart != cp->cpu_part) {
1480 		/* migrate to a cpu in the new partition */
1481 		cp = tp->t_cpupart->cp_cpulist;
1482 	}
1483 	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1484 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1485 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1486 
1487 #ifndef NPROBE
1488 	/* Kernel probe */
1489 	if (tnf_tracing_active)
1490 		tnf_thread_queue(tp, cp, tpri);
1491 #endif /* NPROBE */
1492 
1493 	if (cp->cpu_chosen_level < tpri)
1494 		cp->cpu_chosen_level = tpri;
1495 	cpu_resched(cp, tpri);
1496 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1497 	(*disp_enq_thread)(cp, 0);
1498 }
1499 
1500 /*
1501  * Remove a thread from the dispatcher queue if it is on it.
1502  * It is not an error if it is not found but we return whether
1503  * or not it was found in case the caller wants to check.
1504  */
1505 int
1506 dispdeq(kthread_t *tp)
1507 {
1508 	disp_t		*dp;
1509 	dispq_t		*dq;
1510 	kthread_t	*rp;
1511 	kthread_t	*trp;
1512 	kthread_t	**ptp;
1513 	int		tpri;
1514 
1515 	ASSERT(THREAD_LOCK_HELD(tp));
1516 
1517 	if (tp->t_state != TS_RUN)
1518 		return (0);
1519 
1520 	/*
1521 	 * The thread is "swapped" or is on the swap queue and
1522 	 * hence no longer on the run queue, so return true.
1523 	 */
1524 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1525 		return (1);
1526 
1527 	tpri = DISP_PRIO(tp);
1528 	dp = tp->t_disp_queue;
1529 	ASSERT(tpri < dp->disp_npri);
1530 	dq = &dp->disp_q[tpri];
1531 	ptp = &dq->dq_first;
1532 	rp = *ptp;
1533 	trp = NULL;
1534 
1535 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1536 
1537 	/*
1538 	 * Search for thread in queue.
1539 	 * Double links would simplify this at the expense of disp/setrun.
1540 	 */
1541 	while (rp != tp && rp != NULL) {
1542 		trp = rp;
1543 		ptp = &trp->t_link;
1544 		rp = trp->t_link;
1545 	}
1546 
1547 	if (rp == NULL) {
1548 		panic("dispdeq: thread not on queue");
1549 	}
1550 
1551 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1552 
1553 	/*
1554 	 * Found it so remove it from queue.
1555 	 */
1556 	if ((*ptp = rp->t_link) == NULL)
1557 		dq->dq_last = trp;
1558 
1559 	dp->disp_nrunnable--;
1560 	if (--dq->dq_sruncnt == 0) {
1561 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1562 		if (dp->disp_nrunnable == 0) {
1563 			dp->disp_max_unbound_pri = -1;
1564 			dp->disp_maxrunpri = -1;
1565 		} else if (tpri == dp->disp_maxrunpri) {
1566 			int ipri;
1567 
1568 			ipri = bt_gethighbit(dp->disp_qactmap,
1569 			    dp->disp_maxrunpri >> BT_ULSHIFT);
1570 			if (ipri < dp->disp_max_unbound_pri)
1571 				dp->disp_max_unbound_pri = ipri;
1572 			dp->disp_maxrunpri = ipri;
1573 		}
1574 	}
1575 	tp->t_link = NULL;
1576 	THREAD_TRANSITION(tp);		/* put in intermediate state */
1577 	return (1);
1578 }
1579 
1580 
1581 /*
1582  * dq_sruninc and dq_srundec are public functions for
1583  * incrementing/decrementing the sruncnts when a thread on
1584  * a dispatcher queue is made schedulable/unschedulable by
1585  * resetting the TS_LOAD flag.
1586  *
1587  * The caller MUST have the thread lock and therefore the dispatcher
1588  * queue lock so that the operation which changes
1589  * the flag, the operation that checks the status of the thread to
1590  * determine if it's on a disp queue AND the call to this function
1591  * are one atomic operation with respect to interrupts.
1592  */
1593 
1594 /*
1595  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1596  */
1597 void
1598 dq_sruninc(kthread_t *t)
1599 {
1600 	ASSERT(t->t_state == TS_RUN);
1601 	ASSERT(t->t_schedflag & TS_LOAD);
1602 
1603 	THREAD_TRANSITION(t);
1604 	setfrontdq(t);
1605 }
1606 
1607 /*
1608  * See comment on calling conventions above.
1609  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1610  */
1611 void
1612 dq_srundec(kthread_t *t)
1613 {
1614 	ASSERT(t->t_schedflag & TS_LOAD);
1615 
1616 	(void) dispdeq(t);
1617 	disp_swapped_enq(t);
1618 }
1619 
1620 /*
1621  * Change the dispatcher lock of thread to the "swapped_lock"
1622  * and return with thread lock still held.
1623  *
1624  * Called with thread_lock held, in transition state, and at high spl.
1625  */
1626 void
1627 disp_swapped_enq(kthread_t *tp)
1628 {
1629 	ASSERT(THREAD_LOCK_HELD(tp));
1630 	ASSERT(tp->t_schedflag & TS_LOAD);
1631 
1632 	switch (tp->t_state) {
1633 	case TS_RUN:
1634 		disp_lock_enter_high(&swapped_lock);
1635 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1636 		break;
1637 	case TS_ONPROC:
1638 		disp_lock_enter_high(&swapped_lock);
1639 		THREAD_TRANSITION(tp);
1640 		wake_sched_sec = 1;		/* tell clock to wake sched */
1641 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1642 		break;
1643 	default:
1644 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1645 	}
1646 }
1647 
1648 /*
1649  * This routine is called by setbackdq/setfrontdq if the thread is
1650  * not loaded or loaded and on the swap queue.
1651  *
1652  * Thread state TS_SLEEP implies that a swapped thread
1653  * has been woken up and needs to be swapped in by the swapper.
1654  *
1655  * Thread state TS_RUN, it implies that the priority of a swapped
1656  * thread is being increased by scheduling class (e.g. ts_update).
1657  */
1658 static void
1659 disp_swapped_setrun(kthread_t *tp)
1660 {
1661 	ASSERT(THREAD_LOCK_HELD(tp));
1662 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1663 
1664 	switch (tp->t_state) {
1665 	case TS_SLEEP:
1666 		disp_lock_enter_high(&swapped_lock);
1667 		/*
1668 		 * Wakeup sched immediately (i.e., next tick) if the
1669 		 * thread priority is above maxclsyspri.
1670 		 */
1671 		if (DISP_PRIO(tp) > maxclsyspri)
1672 			wake_sched = 1;
1673 		else
1674 			wake_sched_sec = 1;
1675 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1676 		break;
1677 	case TS_RUN:				/* called from ts_update */
1678 		break;
1679 	default:
1680 		panic("disp_swapped_setrun: tp: %p bad t_state", tp);
1681 	}
1682 }
1683 
1684 
1685 /*
1686  *	Make a thread give up its processor.  Find the processor on
1687  *	which this thread is executing, and have that processor
1688  *	preempt.
1689  */
1690 void
1691 cpu_surrender(kthread_t *tp)
1692 {
1693 	cpu_t	*cpup;
1694 	int	max_pri;
1695 	int	max_run_pri;
1696 	klwp_t	*lwp;
1697 
1698 	ASSERT(THREAD_LOCK_HELD(tp));
1699 
1700 	if (tp->t_state != TS_ONPROC)
1701 		return;
1702 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
1703 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1704 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1705 	if (max_pri < max_run_pri)
1706 		max_pri = max_run_pri;
1707 
1708 	cpup->cpu_runrun = 1;
1709 	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1710 		cpup->cpu_kprunrun = 1;
1711 	}
1712 
1713 	/*
1714 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1715 	 */
1716 	membar_enter();
1717 
1718 	DTRACE_SCHED1(surrender, kthread_t *, tp);
1719 
1720 	/*
1721 	 * Make the target thread take an excursion through trap()
1722 	 * to do preempt() (unless we're already in trap or post_syscall,
1723 	 * calling cpu_surrender via CL_TRAPRET).
1724 	 */
1725 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1726 	    lwp->lwp_state != LWP_USER) {
1727 		aston(tp);
1728 		if (cpup != CPU)
1729 			poke_cpu(cpup->cpu_id);
1730 	}
1731 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1732 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
1733 }
1734 
1735 
1736 /*
1737  * Commit to and ratify a scheduling decision
1738  */
1739 /*ARGSUSED*/
1740 static kthread_t *
1741 disp_ratify(kthread_t *tp, disp_t *kpq)
1742 {
1743 	pri_t	tpri, maxpri;
1744 	pri_t	maxkpri;
1745 	cpu_t	*cpup;
1746 
1747 	ASSERT(tp != NULL);
1748 	/*
1749 	 * Commit to, then ratify scheduling decision
1750 	 */
1751 	cpup = CPU;
1752 	if (cpup->cpu_runrun != 0)
1753 		cpup->cpu_runrun = 0;
1754 	if (cpup->cpu_kprunrun != 0)
1755 		cpup->cpu_kprunrun = 0;
1756 	if (cpup->cpu_chosen_level != -1)
1757 		cpup->cpu_chosen_level = -1;
1758 	membar_enter();
1759 	tpri = DISP_PRIO(tp);
1760 	maxpri = cpup->cpu_disp->disp_maxrunpri;
1761 	maxkpri = kpq->disp_maxrunpri;
1762 	if (maxpri < maxkpri)
1763 		maxpri = maxkpri;
1764 	if (tpri < maxpri) {
1765 		/*
1766 		 * should have done better
1767 		 * put this one back and indicate to try again
1768 		 */
1769 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
1770 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1771 		thread_lock_high(tp);
1772 		THREAD_TRANSITION(tp);
1773 		setfrontdq(tp);
1774 		thread_unlock_nopreempt(tp);
1775 
1776 		tp = NULL;
1777 	}
1778 	return (tp);
1779 }
1780 
1781 /*
1782  * See if there is any work on the dispatcher queue for other CPUs.
1783  * If there is, dequeue the best thread and return.
1784  */
1785 static kthread_t *
1786 disp_getwork(cpu_t *cp)
1787 {
1788 	cpu_t		*ocp;		/* other CPU */
1789 	cpu_t		*ocp_start;
1790 	cpu_t		*tcp;		/* target local CPU */
1791 	kthread_t	*tp;
1792 	pri_t		maxpri;
1793 	int		s;
1794 	disp_t		*kpq;		/* kp queue for this partition */
1795 	lpl_t		*lpl, *lpl_leaf;
1796 	int		hint, leafidx;
1797 
1798 	maxpri = -1;
1799 	tcp = NULL;
1800 
1801 	kpq = &cp->cpu_part->cp_kp_queue;
1802 	while (kpq->disp_maxrunpri >= 0) {
1803 		/*
1804 		 * Try to take a thread from the kp_queue.
1805 		 */
1806 		tp = (disp_getbest(kpq));
1807 		if (tp)
1808 			return (disp_ratify(tp, kpq));
1809 	}
1810 
1811 	s = splhigh();		/* protect the cpu_active list */
1812 
1813 	/*
1814 	 * Try to find something to do on another CPU's run queue.
1815 	 * Loop through all other CPUs looking for the one with the highest
1816 	 * priority unbound thread.
1817 	 *
1818 	 * On NUMA machines, the partition's CPUs are consulted in order of
1819 	 * distance from the current CPU. This way, the first available
1820 	 * work found is also the closest, and will suffer the least
1821 	 * from being migrated.
1822 	 */
1823 	lpl = lpl_leaf = cp->cpu_lpl;
1824 	hint = leafidx = 0;
1825 
1826 	/*
1827 	 * This loop traverses the lpl hierarchy. Higher level lpls represent
1828 	 * broader levels of locality
1829 	 */
1830 	do {
1831 		/* This loop iterates over the lpl's leaves */
1832 		do {
1833 			if (lpl_leaf != cp->cpu_lpl)
1834 				ocp = lpl_leaf->lpl_cpus;
1835 			else
1836 				ocp = cp->cpu_next_lpl;
1837 
1838 			/* This loop iterates over the CPUs in the leaf */
1839 			ocp_start = ocp;
1840 			do {
1841 				pri_t pri;
1842 
1843 				ASSERT(CPU_ACTIVE(ocp));
1844 
1845 				/*
1846 				 * End our stroll around the partition if:
1847 				 *
1848 				 * - Something became runnable on the local
1849 				 *	queue
1850 				 *
1851 				 * - We're at the broadest level of locality and
1852 				 *   we happen across another idle CPU. At the
1853 				 *   highest level of locality, all CPUs will
1854 				 *   walk the partition's CPUs in the same
1855 				 *   order, so we can end our stroll taking
1856 				 *   comfort in knowing the other idle CPU is
1857 				 *   already covering the next portion of the
1858 				 *   list.
1859 				 */
1860 				if (cp->cpu_disp->disp_nrunnable != 0)
1861 					break;
1862 				if (ocp->cpu_dispatch_pri == -1) {
1863 					if (ocp->cpu_disp_flags &
1864 					    CPU_DISP_HALTED)
1865 						continue;
1866 					else if (lpl->lpl_parent == NULL)
1867 						break;
1868 				}
1869 
1870 				/*
1871 				 * If there's only one thread and the CPU
1872 				 * is in the middle of a context switch,
1873 				 * or it's currently running the idle thread,
1874 				 * don't steal it.
1875 				 */
1876 				if ((ocp->cpu_disp_flags &
1877 					CPU_DISP_DONTSTEAL) &&
1878 				    ocp->cpu_disp->disp_nrunnable == 1)
1879 					continue;
1880 
1881 				pri = ocp->cpu_disp->disp_max_unbound_pri;
1882 				if (pri > maxpri) {
1883 					maxpri = pri;
1884 					tcp = ocp;
1885 				}
1886 			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
1887 
1888 			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
1889 				leafidx = 0;
1890 				lpl_leaf = lpl->lpl_rset[leafidx];
1891 			}
1892 		} while (leafidx != hint);
1893 
1894 		hint = leafidx = lpl->lpl_hint;
1895 		if ((lpl = lpl->lpl_parent) != NULL)
1896 			lpl_leaf = lpl->lpl_rset[hint];
1897 	} while (!tcp && lpl);
1898 
1899 	splx(s);
1900 
1901 	/*
1902 	 * If another queue looks good, and there is still nothing on
1903 	 * the local queue, try to transfer one or more threads
1904 	 * from it to our queue.
1905 	 */
1906 	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
1907 		tp = (disp_getbest(tcp->cpu_disp));
1908 		if (tp)
1909 			return (disp_ratify(tp, kpq));
1910 	}
1911 	return (NULL);
1912 }
1913 
1914 
1915 /*
1916  * disp_fix_unbound_pri()
1917  *	Determines the maximum priority of unbound threads on the queue.
1918  *	The priority is kept for the queue, but is only increased, never
1919  *	reduced unless some CPU is looking for something on that queue.
1920  *
1921  *	The priority argument is the known upper limit.
1922  *
1923  *	Perhaps this should be kept accurately, but that probably means
1924  *	separate bitmaps for bound and unbound threads.  Since only idled
1925  *	CPUs will have to do this recalculation, it seems better this way.
1926  */
1927 static void
1928 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
1929 {
1930 	kthread_t	*tp;
1931 	dispq_t		*dq;
1932 	ulong_t		*dqactmap = dp->disp_qactmap;
1933 	ulong_t		mapword;
1934 	int		wx;
1935 
1936 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
1937 
1938 	ASSERT(pri >= 0);			/* checked by caller */
1939 
1940 	/*
1941 	 * Start the search at the next lowest priority below the supplied
1942 	 * priority.  This depends on the bitmap implementation.
1943 	 */
1944 	do {
1945 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
1946 
1947 		/*
1948 		 * Form mask for all lower priorities in the word.
1949 		 */
1950 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
1951 
1952 		/*
1953 		 * Get next lower active priority.
1954 		 */
1955 		if (mapword != 0) {
1956 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
1957 		} else if (wx > 0) {
1958 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
1959 			if (pri < 0)
1960 				break;
1961 		} else {
1962 			pri = -1;
1963 			break;
1964 		}
1965 
1966 		/*
1967 		 * Search the queue for unbound, runnable threads.
1968 		 */
1969 		dq = &dp->disp_q[pri];
1970 		tp = dq->dq_first;
1971 
1972 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
1973 			tp = tp->t_link;
1974 		}
1975 
1976 		/*
1977 		 * If a thread was found, set the priority and return.
1978 		 */
1979 	} while (tp == NULL);
1980 
1981 	/*
1982 	 * pri holds the maximum unbound thread priority or -1.
1983 	 */
1984 	if (dp->disp_max_unbound_pri != pri)
1985 		dp->disp_max_unbound_pri = pri;
1986 }
1987 
1988 /*
1989  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
1990  * 	check if the CPU to which is was previously bound should have
1991  * 	its disp_max_unbound_pri increased.
1992  */
1993 void
1994 disp_adjust_unbound_pri(kthread_t *tp)
1995 {
1996 	disp_t *dp;
1997 	pri_t tpri;
1998 
1999 	ASSERT(THREAD_LOCK_HELD(tp));
2000 
2001 	/*
2002 	 * Don't do anything if the thread is not bound, or
2003 	 * currently not runnable or swapped out.
2004 	 */
2005 	if (tp->t_bound_cpu == NULL ||
2006 	    tp->t_state != TS_RUN ||
2007 	    tp->t_schedflag & TS_ON_SWAPQ)
2008 		return;
2009 
2010 	tpri = DISP_PRIO(tp);
2011 	dp = tp->t_bound_cpu->cpu_disp;
2012 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2013 	if (tpri > dp->disp_max_unbound_pri)
2014 		dp->disp_max_unbound_pri = tpri;
2015 }
2016 
2017 /*
2018  * disp_getbest() - de-queue the highest priority unbound runnable thread.
2019  *	returns with the thread unlocked and onproc
2020  *	but at splhigh (like disp()).
2021  *	returns NULL if nothing found.
2022  *
2023  *	Passed a pointer to a dispatch queue not associated with this CPU.
2024  */
2025 static kthread_t *
2026 disp_getbest(disp_t *dp)
2027 {
2028 	kthread_t	*tp;
2029 	dispq_t		*dq;
2030 	pri_t		pri;
2031 	cpu_t		*cp;
2032 
2033 	disp_lock_enter(&dp->disp_lock);
2034 
2035 	/*
2036 	 * If there is nothing to run, or the CPU is in the middle of a
2037 	 * context switch of the only thread, return NULL.
2038 	 */
2039 	pri = dp->disp_max_unbound_pri;
2040 	if (pri == -1 ||
2041 		(dp->disp_cpu != NULL &&
2042 		    (dp->disp_cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2043 		dp->disp_cpu->cpu_disp->disp_nrunnable == 1)) {
2044 		disp_lock_exit_nopreempt(&dp->disp_lock);
2045 		return (NULL);
2046 	}
2047 
2048 	dq = &dp->disp_q[pri];
2049 	tp = dq->dq_first;
2050 
2051 	/*
2052 	 * Skip over bound threads.
2053 	 * Bound threads can be here even though disp_max_unbound_pri
2054 	 * indicated this level.  Besides, it not always accurate because it
2055 	 * isn't reduced until another CPU looks for work.
2056 	 * Note that tp could be NULL right away due to this.
2057 	 */
2058 	while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2059 		tp = tp->t_link;
2060 	}
2061 
2062 	/*
2063 	 * If there were no unbound threads on this queue, find the queue
2064 	 * where they are and then return NULL so that other CPUs will be
2065 	 * considered.
2066 	 */
2067 	if (tp == NULL) {
2068 		disp_fix_unbound_pri(dp, pri);
2069 		disp_lock_exit_nopreempt(&dp->disp_lock);
2070 		return (NULL);
2071 	}
2072 
2073 	/*
2074 	 * Found a runnable, unbound thread, so remove it from queue.
2075 	 * dispdeq() requires that we have the thread locked, and we do,
2076 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
2077 	 * put the thread in transition state, thereby dropping the dispq
2078 	 * lock.
2079 	 */
2080 #ifdef DEBUG
2081 	{
2082 		int	thread_was_on_queue;
2083 
2084 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
2085 		ASSERT(thread_was_on_queue);
2086 	}
2087 #else /* DEBUG */
2088 	(void) dispdeq(tp);			/* drops disp_lock */
2089 #endif /* DEBUG */
2090 
2091 	tp->t_schedflag |= TS_DONT_SWAP;
2092 
2093 	/*
2094 	 * Setup thread to run on the current CPU.
2095 	 */
2096 	cp = CPU;
2097 
2098 	tp->t_disp_queue = cp->cpu_disp;
2099 
2100 	cp->cpu_dispthread = tp;		/* protected by spl only */
2101 	cp->cpu_dispatch_pri = pri;
2102 	ASSERT(pri == DISP_PRIO(tp));
2103 
2104 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
2105 
2106 	/*
2107 	 * Return with spl high so that swtch() won't need to raise it.
2108 	 * The disp_lock was dropped by dispdeq().
2109 	 */
2110 
2111 	return (tp);
2112 }
2113 
2114 /*
2115  * disp_bound_common() - common routine for higher level functions
2116  *	that check for bound threads under certain conditions.
2117  *	If 'threadlistsafe' is set then there is no need to acquire
2118  *	pidlock to stop the thread list from changing (eg, if
2119  *	disp_bound_* is called with cpus paused).
2120  */
2121 static int
2122 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2123 {
2124 	int		found = 0;
2125 	kthread_t	*tp;
2126 
2127 	ASSERT(flag);
2128 
2129 	if (!threadlistsafe)
2130 		mutex_enter(&pidlock);
2131 	tp = curthread;		/* faster than allthreads */
2132 	do {
2133 		if (tp->t_state != TS_FREE) {
2134 			/*
2135 			 * If an interrupt thread is busy, but the
2136 			 * caller doesn't care (i.e. BOUND_INTR is off),
2137 			 * then just ignore it and continue through.
2138 			 */
2139 			if ((tp->t_flag & T_INTR_THREAD) &&
2140 			    !(flag & BOUND_INTR))
2141 				continue;
2142 
2143 			/*
2144 			 * Skip the idle thread for the CPU
2145 			 * we're about to set offline.
2146 			 */
2147 			if (tp == cp->cpu_idle_thread)
2148 				continue;
2149 
2150 			/*
2151 			 * Skip the pause thread for the CPU
2152 			 * we're about to set offline.
2153 			 */
2154 			if (tp == cp->cpu_pause_thread)
2155 				continue;
2156 
2157 			if ((flag & BOUND_CPU) &&
2158 			    (tp->t_bound_cpu == cp ||
2159 			    tp->t_bind_cpu == cp->cpu_id ||
2160 			    tp->t_weakbound_cpu == cp)) {
2161 				found = 1;
2162 				break;
2163 			}
2164 
2165 			if ((flag & BOUND_PARTITION) &&
2166 			    (tp->t_cpupart == cp->cpu_part)) {
2167 				found = 1;
2168 				break;
2169 			}
2170 		}
2171 	} while ((tp = tp->t_next) != curthread && found == 0);
2172 	if (!threadlistsafe)
2173 		mutex_exit(&pidlock);
2174 	return (found);
2175 }
2176 
2177 /*
2178  * disp_bound_threads - return nonzero if threads are bound to the processor.
2179  *	Called infrequently.  Keep this simple.
2180  *	Includes threads that are asleep or stopped but not onproc.
2181  */
2182 int
2183 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2184 {
2185 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2186 }
2187 
2188 /*
2189  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2190  * to the given processor, including interrupt threads.
2191  */
2192 int
2193 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2194 {
2195 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2196 }
2197 
2198 /*
2199  * disp_bound_partition - return nonzero if threads are bound to the same
2200  * partition as the processor.
2201  *	Called infrequently.  Keep this simple.
2202  *	Includes threads that are asleep or stopped but not onproc.
2203  */
2204 int
2205 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2206 {
2207 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2208 }
2209 
2210 /*
2211  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2212  * threads to other CPUs.
2213  */
2214 void
2215 disp_cpu_inactive(cpu_t *cp)
2216 {
2217 	kthread_t	*tp;
2218 	disp_t		*dp = cp->cpu_disp;
2219 	dispq_t		*dq;
2220 	pri_t		pri;
2221 	int		wasonq;
2222 
2223 	disp_lock_enter(&dp->disp_lock);
2224 	while ((pri = dp->disp_max_unbound_pri) != -1) {
2225 		dq = &dp->disp_q[pri];
2226 		tp = dq->dq_first;
2227 
2228 		/*
2229 		 * Skip over bound threads.
2230 		 */
2231 		while (tp != NULL && tp->t_bound_cpu != NULL) {
2232 			tp = tp->t_link;
2233 		}
2234 
2235 		if (tp == NULL) {
2236 			/* disp_max_unbound_pri must be inaccurate, so fix it */
2237 			disp_fix_unbound_pri(dp, pri);
2238 			continue;
2239 		}
2240 
2241 		wasonq = dispdeq(tp);		/* drops disp_lock */
2242 		ASSERT(wasonq);
2243 		ASSERT(tp->t_weakbound_cpu == NULL);
2244 
2245 		setbackdq(tp);
2246 		/*
2247 		 * Called from cpu_offline:
2248 		 *
2249 		 * cp has already been removed from the list of active cpus
2250 		 * and tp->t_cpu has been changed so there is no risk of
2251 		 * tp ending up back on cp.
2252 		 *
2253 		 * Called from cpupart_move_cpu:
2254 		 *
2255 		 * The cpu has moved to a new cpupart.  Any threads that
2256 		 * were on it's dispatch queues before the move remain
2257 		 * in the old partition and can't run in the new partition.
2258 		 */
2259 		ASSERT(tp->t_cpu != cp);
2260 		thread_unlock(tp);
2261 
2262 		disp_lock_enter(&dp->disp_lock);
2263 	}
2264 	disp_lock_exit(&dp->disp_lock);
2265 }
2266 
2267 /*
2268  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2269  *	The hint passed in is used as a starting point so we don't favor
2270  *	CPU 0 or any other CPU.  The caller should pass in the most recently
2271  *	used CPU for the thread.
2272  *
2273  *	The lgroup and priority are used to determine the best CPU to run on
2274  *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
2275  *	the thread priority will indicate whether the thread will actually run
2276  *	there.  To pick the best CPU, the CPUs inside and outside of the given
2277  *	lgroup which are running the lowest priority threads are found.  The
2278  *	remote CPU is chosen only if the thread will not run locally on a CPU
2279  *	within the lgroup, but will run on the remote CPU. If the thread
2280  *	cannot immediately run on any CPU, the best local CPU will be chosen.
2281  *
2282  *	The lpl specified also identifies the cpu partition from which
2283  *	disp_lowpri_cpu should select a CPU.
2284  *
2285  *	curcpu is used to indicate that disp_lowpri_cpu is being called on
2286  *      behalf of the current thread. (curthread is looking for a new cpu)
2287  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2288  *      ignored.
2289  *
2290  *      If a cpu is the target of an offline request then try to avoid it.
2291  *
2292  *	This function must be called at either high SPL, or with preemption
2293  *	disabled, so that the "hint" CPU cannot be removed from the online
2294  *	CPU list while we are traversing it.
2295  */
2296 cpu_t *
2297 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2298 {
2299 	cpu_t	*bestcpu;
2300 	cpu_t	*besthomecpu;
2301 	cpu_t   *cp, *cpstart;
2302 
2303 	pri_t   bestpri;
2304 	pri_t   cpupri;
2305 
2306 	klgrpset_t	done;
2307 	klgrpset_t	cur_set;
2308 
2309 	lpl_t		*lpl_iter, *lpl_leaf;
2310 	int		i;
2311 
2312 	/*
2313 	 * Scan for a CPU currently running the lowest priority thread.
2314 	 * Cannot get cpu_lock here because it is adaptive.
2315 	 * We do not require lock on CPU list.
2316 	 */
2317 	ASSERT(hint != NULL);
2318 	ASSERT(lpl != NULL);
2319 	ASSERT(lpl->lpl_ncpu > 0);
2320 
2321 	/*
2322 	 * First examine local CPUs. Note that it's possible the hint CPU
2323 	 * passed in in remote to the specified home lgroup. If our priority
2324 	 * isn't sufficient enough such that we can run immediately at home,
2325 	 * then examine CPUs remote to our home lgroup.
2326 	 * We would like to give preference to CPUs closest to "home".
2327 	 * If we can't find a CPU where we'll run at a given level
2328 	 * of locality, we expand our search to include the next level.
2329 	 */
2330 	bestcpu = besthomecpu = NULL;
2331 	klgrpset_clear(done);
2332 	/* start with lpl we were passed */
2333 
2334 	lpl_iter = lpl;
2335 
2336 	do {
2337 
2338 		bestpri = SHRT_MAX;
2339 		klgrpset_clear(cur_set);
2340 
2341 		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2342 			lpl_leaf = lpl_iter->lpl_rset[i];
2343 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2344 				continue;
2345 
2346 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2347 
2348 			if (hint->cpu_lpl == lpl_leaf)
2349 				cp = cpstart = hint;
2350 			else
2351 				cp = cpstart = lpl_leaf->lpl_cpus;
2352 
2353 			do {
2354 
2355 				if (cp == curcpu)
2356 					cpupri = -1;
2357 				else if (cp == cpu_inmotion)
2358 					cpupri = SHRT_MAX;
2359 				else
2360 					cpupri = cp->cpu_dispatch_pri;
2361 
2362 				if (cp->cpu_disp->disp_maxrunpri > cpupri)
2363 					cpupri = cp->cpu_disp->disp_maxrunpri;
2364 				if (cp->cpu_chosen_level > cpupri)
2365 					cpupri = cp->cpu_chosen_level;
2366 				if (cpupri < bestpri) {
2367 					if (CPU_IDLING(cpupri)) {
2368 						ASSERT((cp->cpu_flags &
2369 						    CPU_QUIESCED) == 0);
2370 						return (cp);
2371 					}
2372 					bestcpu = cp;
2373 					bestpri = cpupri;
2374 				}
2375 			} while ((cp = cp->cpu_next_lpl) != cpstart);
2376 		}
2377 
2378 		if (bestcpu && (tpri > bestpri)) {
2379 			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2380 			return (bestcpu);
2381 		}
2382 		if (besthomecpu == NULL)
2383 			besthomecpu = bestcpu;
2384 		/*
2385 		 * Add the lgrps we just considered to the "done" set
2386 		 */
2387 		klgrpset_or(done, cur_set);
2388 
2389 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2390 
2391 	/*
2392 	 * The specified priority isn't high enough to run immediately
2393 	 * anywhere, so just return the best CPU from the home lgroup.
2394 	 */
2395 	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2396 	return (besthomecpu);
2397 }
2398 
/*
 * Generic idle cpu function for all processors.
 *
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor-specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
	/* Nothing to do in the generic case. */
}
2410 
2411 /*ARGSUSED*/
2412 static void
2413 generic_enq_thread(cpu_t *cpu, int bound)
2414 {
2415 }
2416 
2417 /*
2418  * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
2419  *	- t->t_cpu is not in this thread's assigned lgrp
2420  *	- the time since the thread last came off t->t_cpu exceeds the
2421  *	  rechoose time for this cpu (ignore this if t is curthread in
2422  *	  which case it's on CPU and t->t_disp_time is inaccurate)
2423  *	- t->t_cpu is presently the target of an offline or partition move
2424  *	  request
2425  */
2426 static cpu_t *
2427 cpu_choose(kthread_t *t, pri_t tpri)
2428 {
2429 	ASSERT(tpri < kpqpri);
2430 
2431 	if ((((lbolt - t->t_disp_time) > t->t_cpu->cpu_rechoose) &&
2432 	    t != curthread) || t->t_cpu == cpu_inmotion) {
2433 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
2434 	}
2435 
2436 	/*
2437 	 * Take a trip through disp_lowpri_cpu() if the thread was
2438 	 * running outside it's home lgroup
2439 	 */
2440 	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
2441 	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
2442 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
2443 		    (t == curthread) ? t->t_cpu : NULL));
2444 	}
2445 	return (t->t_cpu);
2446 }
2447