xref: /illumos-gate/usr/src/uts/common/disp/disp.c (revision bcfd778bfadc1d551fbc83dfc8d417b2d3772f75)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/signal.h>
36 #include <sys/user.h>
37 #include <sys/systm.h>
38 #include <sys/sysinfo.h>
39 #include <sys/var.h>
40 #include <sys/errno.h>
41 #include <sys/cmn_err.h>
42 #include <sys/debug.h>
43 #include <sys/inline.h>
44 #include <sys/disp.h>
45 #include <sys/class.h>
46 #include <sys/bitmap.h>
47 #include <sys/kmem.h>
48 #include <sys/cpuvar.h>
49 #include <sys/vtrace.h>
50 #include <sys/tnf.h>
51 #include <sys/cpupart.h>
52 #include <sys/lgrp.h>
53 #include <sys/chip.h>
54 #include <sys/schedctl.h>
55 #include <sys/atomic.h>
56 #include <sys/dtrace.h>
57 #include <sys/sdt.h>
58 
59 #include <vm/as.h>
60 
61 #define	BOUND_CPU	0x1
62 #define	BOUND_PARTITION	0x2
63 #define	BOUND_INTR	0x4
64 
65 /* Dispatch queue allocation structure and functions */
/*
 * Bookkeeping for one dispatch queue while its arrays are being replaced.
 * disp_dq_alloc() fills in dp and the new arrays, disp_dq_assign() installs
 * the new arrays in dp and records the old ones here, and disp_dq_free()
 * releases the old ones.
 */
struct disp_queue_info {
	disp_t	*dp;		/* dispatch queue being resized */
	dispq_t *olddispq;	/* previous queue array (freed afterwards) */
	dispq_t *newdispq;	/* replacement queue array */
	ulong_t	*olddqactmap;	/* previous active-queue bitmap */
	ulong_t	*newdqactmap;	/* replacement active-queue bitmap */
	int	oldnglobpris;	/* number of priorities in the old arrays */
};
/* Helpers that allocate, install and free the arrays of a dispatch queue. */
static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void	disp_dq_free(struct disp_queue_info *dptr);

/*
 * platform-specific routine to call when processor is idle;
 * defaults to generic_idle_cpu()
 */
static void	generic_idle_cpu();
void		(*idle_cpu)() = generic_idle_cpu;

/* routines invoked when a CPU enters/exits the idle loop */
static void	idle_enter();
static void	idle_exit();

/*
 * platform-specific routine to call when thread is enqueued;
 * defaults to generic_enq_thread()
 */
static void	generic_enq_thread(cpu_t *, int);
void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;

pri_t	kpreemptpri;	/* priority where kernel preemption applies */
pri_t	upreemptpri = 0; /* priority where normal preemption applies */
pri_t	intr_pri;	/* interrupt thread priority base level */

/*
 * KPQPRI doubles as a "not tuned" marker: while kpqpri still equals
 * KPQPRI, dispinit() and disp_setup() replace it with kpreemptpri.
 */
#define	KPQPRI	-1 /* priority where cpu affinity is dropped for kp queue */
pri_t	kpqpri = KPQPRI; /* can be set in /etc/system */
disp_t	cpu0_disp;	/* boot CPU's dispatch queue */
disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
int	nswapped;	/* total number of swapped threads */
void	disp_swapped_enq(kthread_t *tp);
static void	disp_swapped_setrun(kthread_t *tp);
static void	cpu_resched(cpu_t *cp, pri_t tpri);

/*
 * If this is set, only interrupt threads will cause kernel preemptions.
 * This is done by changing the value of kpreemptpri.  kpreemptpri
 * will either be the max sysclass pri + 1 or the min interrupt pri.
 */
int	only_intr_kpreempt;

extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
static void setkpdq(kthread_t *tp, int borf);
/* values for setkpdq()'s "borf" argument (presumably back-or-front) */
#define	SETKP_BACK	0
#define	SETKP_FRONT	1
/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.  The original text gives the unit as "hertz";
 * NOTE(review): swtch() stamps t_disp_time from lbolt, so the interval
 * is presumably a count of clock ticks -- confirm against cpu_choose().
 *
 * The platform may define a per physical processor adjustment of
 * this parameter. For efficiency, the effective rechoose interval
 * (rechoose_interval + per chip adjustment) is maintained in the
 * cpu structures. See cpu_choose()
 */
int	rechoose_interval = RECHOOSE_INTERVAL;

static cpu_t	*cpu_choose(kthread_t *, pri_t);

id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */

/* transition_lock is entered once in dispinit() and stays held. */
disp_lock_t	transition_lock;	/* lock on transitioning threads */
disp_lock_t	stop_lock;		/* lock on stopped threads */

static void		cpu_dispqalloc(int numpris);

static kthread_t	*disp_getwork(cpu_t *to);
static kthread_t	*disp_getbest(disp_t *from);
static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);

void	swtch_to(kthread_t *);
142 
143 /*
144  * dispatcher and scheduler initialization
145  */
146 
147 /*
148  * disp_setup - Common code to calculate and allocate dispatcher
149  *		variables and structures based on the maximum priority.
150  */
151 static void
152 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
153 {
154 	pri_t	newnglobpris;
155 
156 	ASSERT(MUTEX_HELD(&cpu_lock));
157 
158 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
159 
160 	if (newnglobpris > oldnglobpris) {
161 		/*
162 		 * Allocate new kp queues for each CPU partition.
163 		 */
164 		cpupart_kpqalloc(newnglobpris);
165 
166 		/*
167 		 * Allocate new dispatch queues for each CPU.
168 		 */
169 		cpu_dispqalloc(newnglobpris);
170 
171 		/*
172 		 * compute new interrupt thread base priority
173 		 */
174 		intr_pri = maxglobpri;
175 		if (only_intr_kpreempt) {
176 			kpreemptpri = intr_pri + 1;
177 			if (kpqpri == KPQPRI)
178 				kpqpri = kpreemptpri;
179 		}
180 		v.v_nglobpris = newnglobpris;
181 	}
182 }
183 
184 /*
185  * dispinit - Called to initialize all loaded classes and the
186  *	      dispatcher framework.
187  */
188 void
189 dispinit(void)
190 {
191 	id_t	cid;
192 	pri_t	maxglobpri;
193 	pri_t	cl_maxglobpri;
194 
195 	maxglobpri = -1;
196 
197 	/*
198 	 * Initialize transition lock, which will always be set.
199 	 */
200 	DISP_LOCK_INIT(&transition_lock);
201 	disp_lock_enter_high(&transition_lock);
202 	DISP_LOCK_INIT(&stop_lock);
203 
204 	mutex_enter(&cpu_lock);
205 	CPU->cpu_disp->disp_maxrunpri = -1;
206 	CPU->cpu_disp->disp_max_unbound_pri = -1;
207 	/*
208 	 * Initialize the default CPU partition.
209 	 */
210 	cpupart_initialize_default();
211 	/*
212 	 * Call the class specific initialization functions for
213 	 * all pre-installed schedulers.
214 	 *
215 	 * We pass the size of a class specific parameter
216 	 * buffer to each of the initialization functions
217 	 * to try to catch problems with backward compatibility
218 	 * of class modules.
219 	 *
220 	 * For example a new class module running on an old system
221 	 * which didn't provide sufficiently large parameter buffers
222 	 * would be bad news. Class initialization modules can check for
223 	 * this and take action if they detect a problem.
224 	 */
225 
226 	for (cid = 0; cid < nclass; cid++) {
227 		sclass_t	*sc;
228 
229 		sc = &sclass[cid];
230 		if (SCHED_INSTALLED(sc)) {
231 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
232 			    &sc->cl_funcs);
233 			if (cl_maxglobpri > maxglobpri)
234 				maxglobpri = cl_maxglobpri;
235 		}
236 	}
237 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
238 	if (kpqpri == KPQPRI)
239 		kpqpri = kpreemptpri;
240 
241 	ASSERT(maxglobpri >= 0);
242 	disp_setup(maxglobpri, 0);
243 
244 	mutex_exit(&cpu_lock);
245 
246 	/*
247 	 * Get the default class ID; this may be later modified via
248 	 * dispadmin(1M).  This will load the class (normally TS) and that will
249 	 * call disp_add(), which is why we had to drop cpu_lock first.
250 	 */
251 	if (getcid(defaultclass, &defaultcid) != 0) {
252 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
253 		    defaultclass);
254 	}
255 }
256 
257 /*
258  * disp_add - Called with class pointer to initialize the dispatcher
259  *	      for a newly loaded class.
260  */
261 void
262 disp_add(sclass_t *clp)
263 {
264 	pri_t	maxglobpri;
265 	pri_t	cl_maxglobpri;
266 
267 	mutex_enter(&cpu_lock);
268 	/*
269 	 * Initialize the scheduler class.
270 	 */
271 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
272 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
273 	if (cl_maxglobpri > maxglobpri)
274 		maxglobpri = cl_maxglobpri;
275 
276 	/*
277 	 * Save old queue information.  Since we're initializing a
278 	 * new scheduling class which has just been loaded, then
279 	 * the size of the dispq may have changed.  We need to handle
280 	 * that here.
281 	 */
282 	disp_setup(maxglobpri, v.v_nglobpris);
283 
284 	mutex_exit(&cpu_lock);
285 }
286 
287 
288 /*
289  * For each CPU, allocate new dispatch queues
290  * with the stated number of priorities.
291  */
292 static void
293 cpu_dispqalloc(int numpris)
294 {
295 	cpu_t	*cpup;
296 	struct disp_queue_info	*disp_mem;
297 	int i, num;
298 
299 	ASSERT(MUTEX_HELD(&cpu_lock));
300 
301 	disp_mem = kmem_zalloc(NCPU *
302 	    sizeof (struct disp_queue_info), KM_SLEEP);
303 
304 	/*
305 	 * This routine must allocate all of the memory before stopping
306 	 * the cpus because it must not sleep in kmem_alloc while the
307 	 * CPUs are stopped.  Locks they hold will not be freed until they
308 	 * are restarted.
309 	 */
310 	i = 0;
311 	cpup = cpu_list;
312 	do {
313 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
314 		i++;
315 		cpup = cpup->cpu_next;
316 	} while (cpup != cpu_list);
317 	num = i;
318 
319 	pause_cpus(NULL);
320 	for (i = 0; i < num; i++)
321 		disp_dq_assign(&disp_mem[i], numpris);
322 	start_cpus();
323 
324 	/*
325 	 * I must free all of the memory after starting the cpus because
326 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
327 	 */
328 	for (i = 0; i < num; i++)
329 		disp_dq_free(&disp_mem[i]);
330 
331 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
332 }
333 
334 static void
335 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
336 {
337 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
338 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
339 	    sizeof (long), KM_SLEEP);
340 	dptr->dp = dp;
341 }
342 
343 static void
344 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
345 {
346 	disp_t	*dp;
347 
348 	dp = dptr->dp;
349 	dptr->olddispq = dp->disp_q;
350 	dptr->olddqactmap = dp->disp_qactmap;
351 	dptr->oldnglobpris = dp->disp_npri;
352 
353 	ASSERT(dptr->oldnglobpris < numpris);
354 
355 	if (dptr->olddispq != NULL) {
356 		/*
357 		 * Use kcopy because bcopy is platform-specific
358 		 * and could block while we might have paused the cpus.
359 		 */
360 		(void) kcopy(dptr->olddispq, dptr->newdispq,
361 		    dptr->oldnglobpris * sizeof (dispq_t));
362 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
363 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
364 		    sizeof (long));
365 	}
366 	dp->disp_q = dptr->newdispq;
367 	dp->disp_qactmap = dptr->newdqactmap;
368 	dp->disp_q_limit = &dptr->newdispq[numpris];
369 	dp->disp_npri = numpris;
370 }
371 
372 static void
373 disp_dq_free(struct disp_queue_info *dptr)
374 {
375 	if (dptr->olddispq != NULL)
376 		kmem_free(dptr->olddispq,
377 		    dptr->oldnglobpris * sizeof (dispq_t));
378 	if (dptr->olddqactmap != NULL)
379 		kmem_free(dptr->olddqactmap,
380 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
381 }
382 
383 /*
384  * For a newly created CPU, initialize the dispatch queue.
385  * This is called before the CPU is known through cpu[] or on any lists.
386  */
387 void
388 disp_cpu_init(cpu_t *cp)
389 {
390 	disp_t	*dp;
391 	dispq_t	*newdispq;
392 	ulong_t	*newdqactmap;
393 
394 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
395 
396 	if (cp == cpu0_disp.disp_cpu)
397 		dp = &cpu0_disp;
398 	else
399 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
400 	bzero(dp, sizeof (disp_t));
401 	cp->cpu_disp = dp;
402 	dp->disp_cpu = cp;
403 	dp->disp_maxrunpri = -1;
404 	dp->disp_max_unbound_pri = -1;
405 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
406 	/*
407 	 * Allocate memory for the dispatcher queue headers
408 	 * and the active queue bitmap.
409 	 */
410 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
411 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
412 	    sizeof (long), KM_SLEEP);
413 	dp->disp_q = newdispq;
414 	dp->disp_qactmap = newdqactmap;
415 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
416 	dp->disp_npri = v.v_nglobpris;
417 }
418 
419 void
420 disp_cpu_fini(cpu_t *cp)
421 {
422 	ASSERT(MUTEX_HELD(&cpu_lock));
423 
424 	disp_kp_free(cp->cpu_disp);
425 	if (cp->cpu_disp != &cpu0_disp)
426 		kmem_free(cp->cpu_disp, sizeof (disp_t));
427 }
428 
429 /*
430  * Allocate new, larger kpreempt dispatch queue to replace the old one.
431  */
432 void
433 disp_kp_alloc(disp_t *dq, pri_t npri)
434 {
435 	struct disp_queue_info	mem_info;
436 
437 	if (npri > dq->disp_npri) {
438 		/*
439 		 * Allocate memory for the new array.
440 		 */
441 		disp_dq_alloc(&mem_info, npri, dq);
442 
443 		/*
444 		 * We need to copy the old structures to the new
445 		 * and free the old.
446 		 */
447 		disp_dq_assign(&mem_info, npri);
448 		disp_dq_free(&mem_info);
449 	}
450 }
451 
452 /*
453  * Free dispatch queue.
454  * Used for the kpreempt queues for a removed CPU partition and
455  * for the per-CPU queues of deleted CPUs.
456  */
457 void
458 disp_kp_free(disp_t *dq)
459 {
460 	struct disp_queue_info	mem_info;
461 
462 	mem_info.olddispq = dq->disp_q;
463 	mem_info.olddqactmap = dq->disp_qactmap;
464 	mem_info.oldnglobpris = dq->disp_npri;
465 	disp_dq_free(&mem_info);
466 }
467 
468 /*
469  * End dispatcher and scheduler initialization.
470  */
471 
472 /*
473  * See if there's anything to do other than remain idle.
474  * Return non-zero if there is.
475  *
476  * This function must be called with high spl, or with
477  * kernel preemption disabled to prevent the partition's
478  * active cpu list from changing while being traversed.
479  *
480  */
481 int
482 disp_anywork(void)
483 {
484 	cpu_t   *cp = CPU;
485 	cpu_t   *ocp;
486 
487 	if (cp->cpu_disp->disp_nrunnable != 0)
488 		return (1);
489 
490 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
491 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
492 			return (1);
493 
494 		/*
495 		 * Work can be taken from another CPU if:
496 		 *	- There is unbound work on the run queue
497 		 *	- That work isn't a thread undergoing a
498 		 *	- context switch on an otherwise empty queue.
499 		 *	- The CPU isn't running the idle loop.
500 		 */
501 		for (ocp = cp->cpu_next_part; ocp != cp;
502 		    ocp = ocp->cpu_next_part) {
503 			ASSERT(CPU_ACTIVE(ocp));
504 
505 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
506 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
507 			    ocp->cpu_disp->disp_nrunnable == 1) &&
508 			    ocp->cpu_dispatch_pri != -1)
509 				return (1);
510 		}
511 	}
512 	return (0);
513 }
514 
515 /*
516  * Called when CPU enters the idle loop
517  */
518 static void
519 idle_enter()
520 {
521 	cpu_t		*cp = CPU;
522 
523 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
524 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
525 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
526 }
527 
528 /*
529  * Called when CPU exits the idle loop
530  */
531 static void
532 idle_exit()
533 {
534 	cpu_t		*cp = CPU;
535 
536 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
537 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
538 }
539 
/*
 * Idle loop.
 *
 * Runs forever.  While there is nothing runnable the platform idle
 * routine (*idle_cpu) is called repeatedly; when work shows up the loop
 * leaves the idle state (idle_exit()), switches to the work via swtch()
 * or swtch_to(), and re-enters the idle state (idle_enter()) once control
 * comes back here.
 */
void
idle()
{
	struct cpu	*cp = CPU;		/* pointer to this CPU */
	kthread_t	*t;			/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			/* Something is on our own queue: go run it. */
			idle_exit();
			swtch();
		} else {
			/* An offline CPU does not look for work elsewhere. */
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				/*
				 * Nothing could be taken.  Reset
				 * cpu_chosen_level if the partition's kp
				 * queue is empty, then idle.
				 */
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/* Run the thread obtained by disp_getwork(). */
			idle_exit();
			restore_mstate(t);
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}
608 
609 
610 /*
611  * Preempt the currently running thread in favor of the highest
612  * priority thread.  The class of the current thread controls
613  * where it goes on the dispatcher queues. If panicking, turn
614  * preemption off.
615  */
616 void
617 preempt()
618 {
619 	kthread_t 	*t = curthread;
620 	klwp_t 		*lwp = ttolwp(curthread);
621 
622 	if (panicstr)
623 		return;
624 
625 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
626 
627 	thread_lock(t);
628 
629 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
630 		/*
631 		 * this thread has already been chosen to be run on
632 		 * another CPU. Clear kprunrun on this CPU since we're
633 		 * already headed for swtch().
634 		 */
635 		CPU->cpu_kprunrun = 0;
636 		thread_unlock_nopreempt(t);
637 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
638 	} else {
639 		if (lwp != NULL)
640 			lwp->lwp_ru.nivcsw++;
641 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
642 		THREAD_TRANSITION(t);
643 		CL_PREEMPT(t);
644 		DTRACE_SCHED(preempt);
645 		thread_unlock_nopreempt(t);
646 
647 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
648 
649 		swtch();		/* clears CPU->cpu_runrun via disp() */
650 	}
651 }
652 
653 extern kthread_t *thread_unpin();
654 
/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 *
 * Candidates are considered in this order: the partition's kp queue (when
 * its best priority is at least as good as this CPU's own queue), this
 * CPU's own queue, work obtained through disp_getwork(), and finally the
 * CPU's idle thread.  A choice rejected by disp_ratify() causes the search
 * to start over.  restore_mstate() is called on the thread returned.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			restore_mstate(tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL) {
				/* No work anywhere: run the idle thread. */
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			/* Offline CPU with an empty queue: idle thread. */
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
			"disp_end:tid %p", tp);
		restore_mstate(tp);
		return (tp);
	}

	/* Take the first thread on our highest-priority non-empty queue. */
	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is now empty, so turn off its bit in dqactmap.
		 * If other threads remain runnable (nrunnable != 0), we
		 * just took the last thread off the highest queue, so
		 * recompute disp_maxrunpri from the bitmap.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			/* max_unbound_pri can never exceed maxrunpri */
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
		"disp_end:tid %p", tp);

	/* If the choice is not ratified, start the search over. */
	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	restore_mstate(tp);
	return (tp);
}
798 
/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state
 *	(on a sleep queue, on a run queue, or stopped) and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Setup and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		/*
		 * Debug check: a thread still TS_ONPROC on this CPU must
		 * have preemption disabled.  The unlocked test is repeated
		 * under the thread lock before asserting.
		 */
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			/*
			 * Keep the chip's running-thread count current
			 * when switching away from or to the idle thread.
			 */
			if (t == cp->cpu_idle_thread) {
				CHIP_NRUNNING(cp->cpu_chip, 1);
			} else if (next == cp->cpu_idle_thread) {
				CHIP_NRUNNING(cp->cpu_chip, -1);
			}

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			/* record when the outgoing thread last ran */
			cp->cpu_last_swtch = t->t_disp_time = lbolt;
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			/* disp() picked the current thread: keep running. */
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}
881 
882 /*
883  * swtch_from_zombie()
884  *	Special case of swtch(), which allows checks for TS_ZOMB to be
885  *	eliminated from normal resume.
886  *	Find best runnable thread and run it.
887  *	Called with the current thread zombied.
888  *	Zombies cannot migrate, so CPU references are safe.
889  */
890 void
891 swtch_from_zombie()
892 {
893 	kthread_t	*next;
894 	cpu_t		*cpu = CPU;
895 
896 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
897 
898 	ASSERT(curthread->t_state == TS_ZOMB);
899 
900 	next = disp();			/* returns with spl high */
901 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
902 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
903 	ASSERT(next != curthread);
904 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
905 
906 	if (next == cpu->cpu_idle_thread)
907 		CHIP_NRUNNING(cpu->cpu_chip, -1);
908 
909 	if (dtrace_vtime_active)
910 		dtrace_vtime_switch(next);
911 
912 	resume_from_zombie(next);
913 	/*
914 	 * The TR_RESUME_END and TR_SWTCH_END trace points
915 	 * appear at the end of resume(), because we certainly will not
916 	 * return here
917 	 */
918 }
919 
920 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
921 static int
922 thread_on_queue(kthread_t *tp)
923 {
924 	cpu_t	*cp;
925 	cpu_t	*self;
926 	disp_t	*dp;
927 
928 	self = CPU;
929 	cp = self->cpu_next_onln;
930 	dp = cp->cpu_disp;
931 	for (;;) {
932 		dispq_t		*dq;
933 		dispq_t		*eq;
934 
935 		disp_lock_enter_high(&dp->disp_lock);
936 		for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
937 			kthread_t	*rp;
938 
939 			ASSERT(dq->dq_last == NULL ||
940 				dq->dq_last->t_link == NULL);
941 			for (rp = dq->dq_first; rp; rp = rp->t_link)
942 				if (tp == rp) {
943 					disp_lock_exit_high(&dp->disp_lock);
944 					return (1);
945 				}
946 		}
947 		disp_lock_exit_high(&dp->disp_lock);
948 		if (cp == NULL)
949 			break;
950 		if (cp == self) {
951 			cp = NULL;
952 			dp = &cp->cpu_part->cp_kp_queue;
953 		} else {
954 			cp = cp->cpu_next_onln;
955 			dp = cp->cpu_disp;
956 		}
957 	}
958 	return (0);
959 }	/* end of thread_on_queue */
960 #else
961 
962 #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
963 
964 #endif  /* DEBUG */
965 
966 /*
967  * like swtch(), but switch to a specified thread taken from another CPU.
968  *	called with spl high..
969  */
970 void
971 swtch_to(kthread_t *next)
972 {
973 	cpu_t			*cp = CPU;
974 
975 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
976 
977 	/*
978 	 * Update context switch statistics.
979 	 */
980 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
981 
982 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
983 
984 	if (curthread == cp->cpu_idle_thread)
985 		CHIP_NRUNNING(cp->cpu_chip, 1);
986 
987 	/* OK to steal anything left on run queue */
988 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
989 
990 	/* record last execution time */
991 	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;
992 
993 	if (dtrace_vtime_active)
994 		dtrace_vtime_switch(next);
995 
996 	resume(next);
997 	/*
998 	 * The TR_RESUME_END and TR_SWTCH_END trace points
999 	 * appear at the end of resume(), because we may not
1000 	 * return here
1001 	 */
1002 }
1003 
1004 
1005 
1006 #define	CPU_IDLING(pri)	((pri) == -1)
1007 
1008 static void
1009 cpu_resched(cpu_t *cp, pri_t tpri)
1010 {
1011 	int	call_poke_cpu = 0;
1012 	pri_t   cpupri = cp->cpu_dispatch_pri;
1013 
1014 	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1015 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1016 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1017 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1018 			cp->cpu_runrun = 1;
1019 			aston(cp->cpu_dispthread);
1020 			if (tpri < kpreemptpri && cp != CPU)
1021 				call_poke_cpu = 1;
1022 		}
1023 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1024 			cp->cpu_kprunrun = 1;
1025 			if (cp != CPU)
1026 				call_poke_cpu = 1;
1027 		}
1028 	}
1029 
1030 	/*
1031 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1032 	 */
1033 	membar_enter();
1034 
1035 	if (call_poke_cpu)
1036 		poke_cpu(cp->cpu_id);
1037 }
1038 
1039 /*
1040  * Routine used by setbackdq() to balance load across the physical
1041  * processors. Returns a CPU of a lesser loaded chip in the lgroup
1042  * if balancing is necessary, or the "hint" CPU if it's not.
1043  *
1044  * - tp is the thread being enqueued
1045  * - cp is a hint CPU (chosen by cpu_choose()).
1046  * - curchip (if not NULL) is the chip on which the current thread
1047  *   is running.
1048  *
1049  * The thread lock for "tp" must be held while calling this routine.
1050  */
1051 static cpu_t *
1052 chip_balance(kthread_t *tp, cpu_t *cp, chip_t *curchip)
1053 {
1054 	int	chp_nrun, ochp_nrun;
1055 	chip_t	*chp, *nchp;
1056 
1057 	chp = cp->cpu_chip;
1058 	chp_nrun = chp->chip_nrunning;
1059 
1060 	if (chp == curchip)
1061 		chp_nrun--;	/* Ignore curthread */
1062 
1063 	/*
1064 	 * If this chip isn't at all idle, then let
1065 	 * run queue balancing do the work.
1066 	 */
1067 	if (chp_nrun == chp->chip_ncpu)
1068 		return (cp);
1069 
1070 	nchp = chp->chip_balance;
1071 	do {
1072 		if (nchp == chp ||
1073 		    !CHIP_IN_CPUPART(nchp, tp->t_cpupart))
1074 			continue;
1075 
1076 		ochp_nrun = nchp->chip_nrunning;
1077 
1078 		/*
1079 		 * If the other chip is running less threads,
1080 		 * or if it's running the same number of threads, but
1081 		 * has more online logical CPUs, then choose to balance.
1082 		 */
1083 		if (chp_nrun > ochp_nrun ||
1084 		    (chp_nrun == ochp_nrun &&
1085 		    nchp->chip_ncpu > chp->chip_ncpu)) {
1086 			cp = nchp->chip_cpus;
1087 			nchp->chip_cpus = cp->cpu_next_chip;
1088 
1089 			/*
1090 			 * Find a CPU on the chip in the correct
1091 			 * partition. We know at least one exists
1092 			 * because of the CHIP_IN_CPUPART() check above.
1093 			 */
1094 			while (cp->cpu_part != tp->t_cpupart)
1095 				cp = cp->cpu_next_chip;
1096 		}
1097 		chp->chip_balance = nchp->chip_next_lgrp;
1098 		break;
1099 	} while ((nchp = nchp->chip_next_lgrp) != chp->chip_balance);
1100 
1101 	ASSERT(CHIP_IN_CPUPART(cp->cpu_chip, tp->t_cpupart));
1102 	return (cp);
1103 }
1104 
1105 /*
1106  * setbackdq() keeps runqs balanced such that the difference in length
1107  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1108  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1109  * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1110  * try to keep runqs perfectly balanced regardless of the thread priority.
1111  */
1112 #define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
1113 #define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
1114 #define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1115 
/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 *
 * Threads that are not loaded, or that are on the swap queue, are
 * passed to disp_swapped_setrun() and are not queued here.  When there
 * is more than one CPU, unbound threads whose priority is at or above
 * kpqpri are passed to setkpdq() instead.
 */
void
setbackdq(kthread_t *tp)
{
	dispq_t	*dq;
	disp_t		*dp;
	chip_t		*curchip = NULL;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);

	/*
	 * Call cpu_update_pct() with the current unscaled hrtime; if
	 * t_waitrq has not been set yet, record that same time in it.
	 */
	if (tp->t_waitrq == 0) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	tpri = DISP_PRIO(tp);
	/*
	 * When enqueueing ourselves, remember our chip so that
	 * chip_balance() can discount the current thread.
	 */
	if (tp == curthread) {
		curchip = CPU->cpu_chip;
	}

	/*
	 * Pick the target CPU: the thread's own CPU on a single-CPU
	 * system, a balanced choice for unbound threads, or the CPU
	 * the thread is bound to.
	 */
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_BACK);
			return;
		}
		/*
		 * Let cpu_choose suggest a CPU.
		 */
		cp = cpu_choose(tp, tpri);

		if (tp->t_cpupart == cp->cpu_part) {
			int	qlen;

			/*
			 * Select another CPU if we need
			 * to do some load balancing across the
			 * physical processors.
			 */
			if (CHIP_SHOULD_BALANCE(cp->cpu_chip))
				cp = chip_balance(tp, cp, curchip);

			/*
			 * Balance across the run queues
			 *
			 * For priorities at or above RUNQ_MATCH_PRI (and
			 * without TS_RUNQMATCH) a difference of up to
			 * RUNQ_MAX_DIFF is tolerated before the neighbouring
			 * CPU is considered.
			 */
			qlen = RUNQ_LEN(cp, tpri);
			if (tpri >= RUNQ_MATCH_PRI &&
			    !(tp->t_schedflag & TS_RUNQMATCH))
				qlen -= RUNQ_MAX_DIFF;
			if (qlen > 0) {
				cpu_t	*np;

				/*
				 * Compare against the next CPU in the lpl,
				 * or the next CPU in the partition when the
				 * thread's lpl is the root or the lpl holds
				 * only this CPU.
				 */
				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
					np = cp->cpu_next_part;
				else {
					if ((np = cp->cpu_next_lpl) == cp)
						np = cp->cpu_next_part;
				}
				if (RUNQ_LEN(np, tpri) < qlen)
					cp = np;
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		bound = 0;
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
		bound = 1;
	}
	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
		tpri, cp, tp);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;		/* tp becomes the tail of the list */

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		/* Queue already populated: append after the current tail. */
		ASSERT(dq->dq_first != NULL);
		dq->dq_last->t_link = tp;
		dq->dq_last = tp;
	} else {
		/*
		 * First thread at this priority: mark the level active
		 * and, if it is now the highest runnable priority on this
		 * queue, call cpu_resched() for the CPU.
		 */
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}
1281 
/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 *
 * Threads that are not loaded, or that are on the swap queue, are
 * passed to disp_swapped_setrun() and are not queued here.  When there
 * is more than one CPU, unbound threads whose priority is at or above
 * kpqpri are passed to setkpdq() instead.
 */
void
setfrontdq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);

	/*
	 * Call cpu_update_pct() with the current unscaled hrtime; if
	 * t_waitrq has not been set yet, record that same time in it.
	 */
	if (tp->t_waitrq == 0) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	tpri = DISP_PRIO(tp);
	/*
	 * Pick the target CPU: the thread's own CPU on a single-CPU
	 * system, the thread's CPU or a cpu_choose() pick for unbound
	 * threads, or the CPU the thread is bound to.
	 */
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_FRONT);
			return;
		}
		cp = tp->t_cpu;
		if (tp->t_cpupart == cp->cpu_part) {
			/*
			 * If we are of higher or equal priority than
			 * the highest priority runnable thread of
			 * the current CPU, just pick this CPU.  Otherwise
			 * Let cpu_choose() select the CPU.  If this cpu
			 * is the target of an offline request then do not
			 * pick it - a thread_nomigrate() on the in motion
			 * cpu relies on this when it forces a preempt.
			 */
			if (tpri < cp->cpu_disp->disp_maxrunpri ||
			    cp == cpu_inmotion)
				cp = cpu_choose(tp, tpri);
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		bound = 0;
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
		bound = 1;
	}
	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
	tp->t_disp_queue = dp;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		/* Queue already populated: link in ahead of the old head. */
		ASSERT(dq->dq_last != NULL);
		tp->t_link = dq->dq_first;
		dq->dq_first = tp;
	} else {
		/*
		 * First thread at this priority: mark the level active
		 * and, if it is now the highest runnable priority on this
		 * queue, call cpu_resched() for the CPU.
		 */
		ASSERT(dq->dq_last == NULL);
		ASSERT(dq->dq_first == NULL);
		tp->t_link = NULL;
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}
1420 
1421 /*
1422  * Put a high-priority unbound thread on the kp queue
1423  */
1424 static void
1425 setkpdq(kthread_t *tp, int borf)
1426 {
1427 	dispq_t	*dq;
1428 	disp_t	*dp;
1429 	cpu_t	*cp;
1430 	pri_t	tpri;
1431 
1432 	tpri = DISP_PRIO(tp);
1433 
1434 	dp = &tp->t_cpupart->cp_kp_queue;
1435 	disp_lock_enter_high(&dp->disp_lock);
1436 
1437 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1438 
1439 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1440 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1441 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1442 	tp->t_disp_queue = dp;
1443 	dp->disp_nrunnable++;
1444 	dq = &dp->disp_q[tpri];
1445 
1446 	if (dq->dq_sruncnt++ != 0) {
1447 		if (borf == SETKP_BACK) {
1448 			ASSERT(dq->dq_first != NULL);
1449 			tp->t_link = NULL;
1450 			dq->dq_last->t_link = tp;
1451 			dq->dq_last = tp;
1452 		} else {
1453 			ASSERT(dq->dq_last != NULL);
1454 			tp->t_link = dq->dq_first;
1455 			dq->dq_first = tp;
1456 		}
1457 	} else {
1458 		if (borf == SETKP_BACK) {
1459 			ASSERT(dq->dq_first == NULL);
1460 			ASSERT(dq->dq_last == NULL);
1461 			dq->dq_first = dq->dq_last = tp;
1462 		} else {
1463 			ASSERT(dq->dq_last == NULL);
1464 			ASSERT(dq->dq_first == NULL);
1465 			tp->t_link = NULL;
1466 			dq->dq_first = dq->dq_last = tp;
1467 		}
1468 		BT_SET(dp->disp_qactmap, tpri);
1469 		if (tpri > dp->disp_max_unbound_pri)
1470 			dp->disp_max_unbound_pri = tpri;
1471 		if (tpri > dp->disp_maxrunpri) {
1472 			dp->disp_maxrunpri = tpri;
1473 			membar_enter();
1474 		}
1475 	}
1476 
1477 	cp = tp->t_cpu;
1478 	if (tp->t_cpupart != cp->cpu_part) {
1479 		/* migrate to a cpu in the new partition */
1480 		cp = tp->t_cpupart->cp_cpulist;
1481 	}
1482 	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1483 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1484 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1485 
1486 #ifndef NPROBE
1487 	/* Kernel probe */
1488 	if (tnf_tracing_active)
1489 		tnf_thread_queue(tp, cp, tpri);
1490 #endif /* NPROBE */
1491 
1492 	if (cp->cpu_chosen_level < tpri)
1493 		cp->cpu_chosen_level = tpri;
1494 	cpu_resched(cp, tpri);
1495 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1496 	(*disp_enq_thread)(cp, 0);
1497 }
1498 
1499 /*
1500  * Remove a thread from the dispatcher queue if it is on it.
1501  * It is not an error if it is not found but we return whether
1502  * or not it was found in case the caller wants to check.
1503  */
1504 int
1505 dispdeq(kthread_t *tp)
1506 {
1507 	disp_t		*dp;
1508 	dispq_t		*dq;
1509 	kthread_t	*rp;
1510 	kthread_t	*trp;
1511 	kthread_t	**ptp;
1512 	int		tpri;
1513 
1514 	ASSERT(THREAD_LOCK_HELD(tp));
1515 
1516 	if (tp->t_state != TS_RUN)
1517 		return (0);
1518 
1519 	/*
1520 	 * The thread is "swapped" or is on the swap queue and
1521 	 * hence no longer on the run queue, so return true.
1522 	 */
1523 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1524 		return (1);
1525 
1526 	tpri = DISP_PRIO(tp);
1527 	dp = tp->t_disp_queue;
1528 	ASSERT(tpri < dp->disp_npri);
1529 	dq = &dp->disp_q[tpri];
1530 	ptp = &dq->dq_first;
1531 	rp = *ptp;
1532 	trp = NULL;
1533 
1534 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1535 
1536 	/*
1537 	 * Search for thread in queue.
1538 	 * Double links would simplify this at the expense of disp/setrun.
1539 	 */
1540 	while (rp != tp && rp != NULL) {
1541 		trp = rp;
1542 		ptp = &trp->t_link;
1543 		rp = trp->t_link;
1544 	}
1545 
1546 	if (rp == NULL) {
1547 		panic("dispdeq: thread not on queue");
1548 	}
1549 
1550 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1551 
1552 	/*
1553 	 * Found it so remove it from queue.
1554 	 */
1555 	if ((*ptp = rp->t_link) == NULL)
1556 		dq->dq_last = trp;
1557 
1558 	dp->disp_nrunnable--;
1559 	if (--dq->dq_sruncnt == 0) {
1560 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1561 		if (dp->disp_nrunnable == 0) {
1562 			dp->disp_max_unbound_pri = -1;
1563 			dp->disp_maxrunpri = -1;
1564 		} else if (tpri == dp->disp_maxrunpri) {
1565 			int ipri;
1566 
1567 			ipri = bt_gethighbit(dp->disp_qactmap,
1568 			    dp->disp_maxrunpri >> BT_ULSHIFT);
1569 			if (ipri < dp->disp_max_unbound_pri)
1570 				dp->disp_max_unbound_pri = ipri;
1571 			dp->disp_maxrunpri = ipri;
1572 		}
1573 	}
1574 	tp->t_link = NULL;
1575 	THREAD_TRANSITION(tp);		/* put in intermediate state */
1576 	return (1);
1577 }
1578 
1579 
1580 /*
1581  * dq_sruninc and dq_srundec are public functions for
1582  * incrementing/decrementing the sruncnts when a thread on
1583  * a dispatcher queue is made schedulable/unschedulable by
1584  * resetting the TS_LOAD flag.
1585  *
1586  * The caller MUST have the thread lock and therefore the dispatcher
1587  * queue lock so that the operation which changes
1588  * the flag, the operation that checks the status of the thread to
1589  * determine if it's on a disp queue AND the call to this function
1590  * are one atomic operation with respect to interrupts.
1591  */
1592 
/*
 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 *
 * The thread must be in TS_RUN state with TS_LOAD set.  It is put into
 * transition state and then placed at the front of a dispatcher queue
 * by setfrontdq().
 */
void
dq_sruninc(kthread_t *t)
{
	ASSERT(t->t_state == TS_RUN);
	ASSERT(t->t_schedflag & TS_LOAD);

	THREAD_TRANSITION(t);
	setfrontdq(t);
}
1605 
/*
 * See comment on calling conventions above.
 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 *
 * Removes the thread from its dispatcher queue with dispdeq() (the
 * return value is deliberately ignored) and then hands it to
 * disp_swapped_enq().
 */
void
dq_srundec(kthread_t *t)
{
	ASSERT(t->t_schedflag & TS_LOAD);

	(void) dispdeq(t);
	disp_swapped_enq(t);
}
1618 
1619 /*
1620  * Change the dispatcher lock of thread to the "swapped_lock"
1621  * and return with thread lock still held.
1622  *
1623  * Called with thread_lock held, in transition state, and at high spl.
1624  */
1625 void
1626 disp_swapped_enq(kthread_t *tp)
1627 {
1628 	ASSERT(THREAD_LOCK_HELD(tp));
1629 	ASSERT(tp->t_schedflag & TS_LOAD);
1630 
1631 	switch (tp->t_state) {
1632 	case TS_RUN:
1633 		disp_lock_enter_high(&swapped_lock);
1634 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1635 		break;
1636 	case TS_ONPROC:
1637 		disp_lock_enter_high(&swapped_lock);
1638 		THREAD_TRANSITION(tp);
1639 		wake_sched_sec = 1;		/* tell clock to wake sched */
1640 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1641 		break;
1642 	default:
1643 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1644 	}
1645 }
1646 
1647 /*
1648  * This routine is called by setbackdq/setfrontdq if the thread is
1649  * not loaded or loaded and on the swap queue.
1650  *
1651  * Thread state TS_SLEEP implies that a swapped thread
1652  * has been woken up and needs to be swapped in by the swapper.
1653  *
1654  * Thread state TS_RUN, it implies that the priority of a swapped
1655  * thread is being increased by scheduling class (e.g. ts_update).
1656  */
1657 static void
1658 disp_swapped_setrun(kthread_t *tp)
1659 {
1660 	ASSERT(THREAD_LOCK_HELD(tp));
1661 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1662 
1663 	switch (tp->t_state) {
1664 	case TS_SLEEP:
1665 		disp_lock_enter_high(&swapped_lock);
1666 		/*
1667 		 * Wakeup sched immediately (i.e., next tick) if the
1668 		 * thread priority is above maxclsyspri.
1669 		 */
1670 		if (DISP_PRIO(tp) > maxclsyspri)
1671 			wake_sched = 1;
1672 		else
1673 			wake_sched_sec = 1;
1674 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1675 		break;
1676 	case TS_RUN:				/* called from ts_update */
1677 		break;
1678 	default:
1679 		panic("disp_swapped_setrun: tp: %p bad t_state", tp);
1680 	}
1681 }
1682 
1683 
1684 /*
1685  *	Make a thread give up its processor.  Find the processor on
1686  *	which this thread is executing, and have that processor
1687  *	preempt.
1688  */
1689 void
1690 cpu_surrender(kthread_t *tp)
1691 {
1692 	cpu_t	*cpup;
1693 	int	max_pri;
1694 	int	max_run_pri;
1695 	klwp_t	*lwp;
1696 
1697 	ASSERT(THREAD_LOCK_HELD(tp));
1698 
1699 	if (tp->t_state != TS_ONPROC)
1700 		return;
1701 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
1702 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1703 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1704 	if (max_pri < max_run_pri)
1705 		max_pri = max_run_pri;
1706 
1707 	cpup->cpu_runrun = 1;
1708 	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1709 		cpup->cpu_kprunrun = 1;
1710 	}
1711 
1712 	/*
1713 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1714 	 */
1715 	membar_enter();
1716 
1717 	DTRACE_SCHED1(surrender, kthread_t *, tp);
1718 
1719 	/*
1720 	 * Make the target thread take an excursion through trap()
1721 	 * to do preempt() (unless we're already in trap or post_syscall,
1722 	 * calling cpu_surrender via CL_TRAPRET).
1723 	 */
1724 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1725 	    lwp->lwp_state != LWP_USER) {
1726 		aston(tp);
1727 		if (cpup != CPU)
1728 			poke_cpu(cpup->cpu_id);
1729 	}
1730 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1731 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
1732 }
1733 
1734 
1735 /*
1736  * Commit to and ratify a scheduling decision
1737  */
1738 /*ARGSUSED*/
1739 static kthread_t *
1740 disp_ratify(kthread_t *tp, disp_t *kpq)
1741 {
1742 	pri_t	tpri, maxpri;
1743 	pri_t	maxkpri;
1744 	cpu_t	*cpup;
1745 
1746 	ASSERT(tp != NULL);
1747 	/*
1748 	 * Commit to, then ratify scheduling decision
1749 	 */
1750 	cpup = CPU;
1751 	if (cpup->cpu_runrun != 0)
1752 		cpup->cpu_runrun = 0;
1753 	if (cpup->cpu_kprunrun != 0)
1754 		cpup->cpu_kprunrun = 0;
1755 	if (cpup->cpu_chosen_level != -1)
1756 		cpup->cpu_chosen_level = -1;
1757 	membar_enter();
1758 	tpri = DISP_PRIO(tp);
1759 	maxpri = cpup->cpu_disp->disp_maxrunpri;
1760 	maxkpri = kpq->disp_maxrunpri;
1761 	if (maxpri < maxkpri)
1762 		maxpri = maxkpri;
1763 	if (tpri < maxpri) {
1764 		/*
1765 		 * should have done better
1766 		 * put this one back and indicate to try again
1767 		 */
1768 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
1769 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1770 		thread_lock_high(tp);
1771 		THREAD_TRANSITION(tp);
1772 		setfrontdq(tp);
1773 		thread_unlock_nopreempt(tp);
1774 
1775 		tp = NULL;
1776 	}
1777 	return (tp);
1778 }
1779 
/*
 * See if there is any work on the dispatcher queue for other CPUs.
 * If there is, dequeue the best thread and return.
 *
 * - cp is the CPU looking for work.
 *
 * The partition's kp queue is tried first.  Failing that, the run
 * queues of the other CPUs are scanned for the one advertising the
 * highest disp_max_unbound_pri and a thread is taken from it with
 * disp_getbest().  Any thread found is passed through disp_ratify(),
 * so the return value is either a thread or NULL.
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	pri_t		maxpri;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		hint, leafidx;

	/* No candidate CPU yet; any priority >= 0 beats maxpri. */
	maxpri = -1;
	tcp = NULL;

	/*
	 * Keep trying the kp queue for as long as it advertises a
	 * runnable priority; disp_getbest() may return NULL.
	 */
	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	kpreempt_disable();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU. This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	hint = leafidx = 0;

	/*
	 * This loop traverses the lpl hierarchy. Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			/*
			 * In our own leaf start with the CPU after us;
			 * in any other leaf start with its first CPU.
			 */
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around the partition if:
				 *
				 * - Something became runnable on the local
				 *	queue
				 *
				 * - We're at the broadest level of locality and
				 *   we happen across another idle CPU. At the
				 *   highest level of locality, all CPUs will
				 *   walk the partition's CPUs in the same
				 *   order, so we can end our stroll taking
				 *   comfort in knowing the other idle CPU is
				 *   already covering the next portion of the
				 *   list.
				 *
				 * An idle CPU that has CPU_DISP_HALTED set
				 * is simply skipped.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0)
					break;
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED)
						continue;
					else if (lpl->lpl_parent == NULL)
						break;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
					CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				/* Remember the best candidate seen so far. */
				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					maxpri = pri;
					tcp = ocp;
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			/* Advance to the next leaf, wrapping at the end. */
			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != hint);

		/*
		 * Move up one level; the parent's leaves are walked
		 * starting at the index given by this lpl's lpl_hint.
		 */
		hint = leafidx = lpl->lpl_hint;
		if ((lpl = lpl->lpl_parent) != NULL)
			lpl_leaf = lpl->lpl_rset[hint];
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = (disp_getbest(tcp->cpu_disp));
		if (tp)
			return (disp_ratify(tp, kpq));
	}
	return (NULL);
}
1911 
1912 
1913 /*
1914  * disp_fix_unbound_pri()
1915  *	Determines the maximum priority of unbound threads on the queue.
1916  *	The priority is kept for the queue, but is only increased, never
1917  *	reduced unless some CPU is looking for something on that queue.
1918  *
1919  *	The priority argument is the known upper limit.
1920  *
1921  *	Perhaps this should be kept accurately, but that probably means
1922  *	separate bitmaps for bound and unbound threads.  Since only idled
1923  *	CPUs will have to do this recalculation, it seems better this way.
1924  */
1925 static void
1926 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
1927 {
1928 	kthread_t	*tp;
1929 	dispq_t		*dq;
1930 	ulong_t		*dqactmap = dp->disp_qactmap;
1931 	ulong_t		mapword;
1932 	int		wx;
1933 
1934 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
1935 
1936 	ASSERT(pri >= 0);			/* checked by caller */
1937 
1938 	/*
1939 	 * Start the search at the next lowest priority below the supplied
1940 	 * priority.  This depends on the bitmap implementation.
1941 	 */
1942 	do {
1943 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
1944 
1945 		/*
1946 		 * Form mask for all lower priorities in the word.
1947 		 */
1948 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
1949 
1950 		/*
1951 		 * Get next lower active priority.
1952 		 */
1953 		if (mapword != 0) {
1954 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
1955 		} else if (wx > 0) {
1956 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
1957 			if (pri < 0)
1958 				break;
1959 		} else {
1960 			pri = -1;
1961 			break;
1962 		}
1963 
1964 		/*
1965 		 * Search the queue for unbound, runnable threads.
1966 		 */
1967 		dq = &dp->disp_q[pri];
1968 		tp = dq->dq_first;
1969 
1970 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
1971 			tp = tp->t_link;
1972 		}
1973 
1974 		/*
1975 		 * If a thread was found, set the priority and return.
1976 		 */
1977 	} while (tp == NULL);
1978 
1979 	/*
1980 	 * pri holds the maximum unbound thread priority or -1.
1981 	 */
1982 	if (dp->disp_max_unbound_pri != pri)
1983 		dp->disp_max_unbound_pri = pri;
1984 }
1985 
1986 /*
1987  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
1988  * 	check if the CPU to which is was previously bound should have
1989  * 	its disp_max_unbound_pri increased.
1990  */
1991 void
1992 disp_adjust_unbound_pri(kthread_t *tp)
1993 {
1994 	disp_t *dp;
1995 	pri_t tpri;
1996 
1997 	ASSERT(THREAD_LOCK_HELD(tp));
1998 
1999 	/*
2000 	 * Don't do anything if the thread is not bound, or
2001 	 * currently not runnable or swapped out.
2002 	 */
2003 	if (tp->t_bound_cpu == NULL ||
2004 	    tp->t_state != TS_RUN ||
2005 	    tp->t_schedflag & TS_ON_SWAPQ)
2006 		return;
2007 
2008 	tpri = DISP_PRIO(tp);
2009 	dp = tp->t_bound_cpu->cpu_disp;
2010 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2011 	if (tpri > dp->disp_max_unbound_pri)
2012 		dp->disp_max_unbound_pri = tpri;
2013 }
2014 
/*
 * disp_getbest() - de-queue the highest priority unbound runnable thread.
 *	returns with the thread unlocked and onproc
 *	but at splhigh (like disp()).
 *	returns NULL if nothing found.
 *
 *	Passed a pointer to a dispatch queue not associated with this CPU.
 *
 *	NULL is also returned, after correcting disp_max_unbound_pri via
 *	disp_fix_unbound_pri(), when the advertised priority level turns
 *	out to hold only bound threads.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
	kthread_t	*tp;
	dispq_t		*dq;
	pri_t		pri;
	cpu_t		*cp;

	disp_lock_enter(&dp->disp_lock);

	/*
	 * If there is nothing to run, or the CPU is in the middle of a
	 * context switch of the only thread, return NULL.
	 * (disp_cpu is checked for NULL before it is dereferenced.)
	 */
	pri = dp->disp_max_unbound_pri;
	if (pri == -1 ||
		(dp->disp_cpu != NULL &&
		    (dp->disp_cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
		dp->disp_cpu->cpu_disp->disp_nrunnable == 1)) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	/*
	 * Skip over bound threads.
	 * Bound threads can be here even though disp_max_unbound_pri
	 * indicated this level.  Besides, it is not always accurate because
	 * it isn't reduced until another CPU looks for work.
	 * Note that tp could be NULL right away due to this.
	 */
	while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
		tp = tp->t_link;
	}

	/*
	 * If there were no unbound threads on this queue, find the queue
	 * where they are and then return NULL so that other CPUs will be
	 * considered.
	 */
	if (tp == NULL) {
		disp_fix_unbound_pri(dp, pri);
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	/*
	 * Found a runnable, unbound thread, so remove it from queue.
	 * dispdeq() requires that we have the thread locked, and we do,
	 * by virtue of holding the dispatch queue lock.  dispdeq() will
	 * put the thread in transition state, thereby dropping the dispq
	 * lock.
	 */
#ifdef DEBUG
	{
		int	thread_was_on_queue;

		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
		ASSERT(thread_was_on_queue);
	}
#else /* DEBUG */
	(void) dispdeq(tp);			/* drops disp_lock */
#endif /* DEBUG */

	tp->t_schedflag |= TS_DONT_SWAP;

	/*
	 * Setup thread to run on the current CPU.
	 */
	cp = CPU;

	tp->t_disp_queue = cp->cpu_disp;

	cp->cpu_dispthread = tp;		/* protected by spl only */
	cp->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));

	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */

	/*
	 * Return with spl high so that swtch() won't need to raise it.
	 * The disp_lock was dropped by dispdeq().
	 */

	return (tp);
}
2111 
2112 /*
2113  * disp_bound_common() - common routine for higher level functions
2114  *	that check for bound threads under certain conditions.
2115  *	If 'threadlistsafe' is set then there is no need to acquire
2116  *	pidlock to stop the thread list from changing (eg, if
2117  *	disp_bound_* is called with cpus paused).
2118  */
2119 static int
2120 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2121 {
2122 	int		found = 0;
2123 	kthread_t	*tp;
2124 
2125 	ASSERT(flag);
2126 
2127 	if (!threadlistsafe)
2128 		mutex_enter(&pidlock);
2129 	tp = curthread;		/* faster than allthreads */
2130 	do {
2131 		if (tp->t_state != TS_FREE) {
2132 			/*
2133 			 * If an interrupt thread is busy, but the
2134 			 * caller doesn't care (i.e. BOUND_INTR is off),
2135 			 * then just ignore it and continue through.
2136 			 */
2137 			if ((tp->t_flag & T_INTR_THREAD) &&
2138 			    !(flag & BOUND_INTR))
2139 				continue;
2140 
2141 			/*
2142 			 * Skip the idle thread for the CPU
2143 			 * we're about to set offline.
2144 			 */
2145 			if (tp == cp->cpu_idle_thread)
2146 				continue;
2147 
2148 			/*
2149 			 * Skip the pause thread for the CPU
2150 			 * we're about to set offline.
2151 			 */
2152 			if (tp == cp->cpu_pause_thread)
2153 				continue;
2154 
2155 			if ((flag & BOUND_CPU) &&
2156 			    (tp->t_bound_cpu == cp ||
2157 			    tp->t_bind_cpu == cp->cpu_id ||
2158 			    tp->t_weakbound_cpu == cp)) {
2159 				found = 1;
2160 				break;
2161 			}
2162 
2163 			if ((flag & BOUND_PARTITION) &&
2164 			    (tp->t_cpupart == cp->cpu_part)) {
2165 				found = 1;
2166 				break;
2167 			}
2168 		}
2169 	} while ((tp = tp->t_next) != curthread && found == 0);
2170 	if (!threadlistsafe)
2171 		mutex_exit(&pidlock);
2172 	return (found);
2173 }
2174 
/*
 * disp_bound_threads - return nonzero if threads are bound to the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 *
 *	Thin wrapper around disp_bound_common() with BOUND_CPU only;
 *	"threadlistsafe" is passed through unchanged.
 */
int
disp_bound_threads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
}
2185 
/*
 * disp_bound_anythreads - return nonzero if _any_ threads are bound
 * to the given processor, including interrupt threads.
 *
 * Thin wrapper around disp_bound_common() with BOUND_CPU | BOUND_INTR;
 * "threadlistsafe" is passed through unchanged.
 */
int
disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
}
2195 
2196 /*
2197  * disp_bound_partition - return nonzero if threads are bound to the same
2198  * partition as the processor.
2199  *	Called infrequently.  Keep this simple.
2200  *	Includes threads that are asleep or stopped but not onproc.
2201  */
2202 int
2203 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2204 {
2205 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2206 }
2207 
2208 /*
2209  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2210  * threads to other CPUs.
2211  */
2212 void
2213 disp_cpu_inactive(cpu_t *cp)
2214 {
2215 	kthread_t	*tp;
2216 	disp_t		*dp = cp->cpu_disp;
2217 	dispq_t		*dq;
2218 	pri_t		pri;
2219 	int		wasonq;
2220 
2221 	disp_lock_enter(&dp->disp_lock);
2222 	while ((pri = dp->disp_max_unbound_pri) != -1) {
2223 		dq = &dp->disp_q[pri];
2224 		tp = dq->dq_first;
2225 
2226 		/*
2227 		 * Skip over bound threads.
2228 		 */
2229 		while (tp != NULL && tp->t_bound_cpu != NULL) {
2230 			tp = tp->t_link;
2231 		}
2232 
2233 		if (tp == NULL) {
2234 			/* disp_max_unbound_pri must be inaccurate, so fix it */
2235 			disp_fix_unbound_pri(dp, pri);
2236 			continue;
2237 		}
2238 
2239 		wasonq = dispdeq(tp);		/* drops disp_lock */
2240 		ASSERT(wasonq);
2241 		ASSERT(tp->t_weakbound_cpu == NULL);
2242 
2243 		setbackdq(tp);
2244 		/*
2245 		 * Called from cpu_offline:
2246 		 *
2247 		 * cp has already been removed from the list of active cpus
2248 		 * and tp->t_cpu has been changed so there is no risk of
2249 		 * tp ending up back on cp.
2250 		 *
2251 		 * Called from cpupart_move_cpu:
2252 		 *
2253 		 * The cpu has moved to a new cpupart.  Any threads that
2254 		 * were on it's dispatch queues before the move remain
2255 		 * in the old partition and can't run in the new partition.
2256 		 */
2257 		ASSERT(tp->t_cpu != cp);
2258 		thread_unlock(tp);
2259 
2260 		disp_lock_enter(&dp->disp_lock);
2261 	}
2262 	disp_lock_exit(&dp->disp_lock);
2263 }
2264 
2265 /*
2266  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2267  *	The hint passed in is used as a starting point so we don't favor
2268  *	CPU 0 or any other CPU.  The caller should pass in the most recently
2269  *	used CPU for the thread.
2270  *
2271  *	The lgroup and priority are used to determine the best CPU to run on
2272  *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
2273  *	the thread priority will indicate whether the thread will actually run
2274  *	there.  To pick the best CPU, the CPUs inside and outside of the given
2275  *	lgroup which are running the lowest priority threads are found.  The
2276  *	remote CPU is chosen only if the thread will not run locally on a CPU
2277  *	within the lgroup, but will run on the remote CPU. If the thread
2278  *	cannot immediately run on any CPU, the best local CPU will be chosen.
2279  *
2280  *	The lpl specified also identifies the cpu partition from which
2281  *	disp_lowpri_cpu should select a CPU.
2282  *
2283  *	curcpu is used to indicate that disp_lowpri_cpu is being called on
2284  *      behalf of the current thread. (curthread is looking for a new cpu)
2285  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2286  *      ignored.
2287  *
2288  *      If a cpu is the target of an offline request then try to avoid it.
2289  *
2290  *	This function must be called at either high SPL, or with preemption
2291  *	disabled, so that the "hint" CPU cannot be removed from the online
2292  *	CPU list while we are traversing it.
2293  */
2294 cpu_t *
2295 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2296 {
2297 	cpu_t	*bestcpu;
2298 	cpu_t	*besthomecpu;
2299 	cpu_t   *cp, *cpstart;
2300 
2301 	pri_t   bestpri;
2302 	pri_t   cpupri;
2303 
2304 	klgrpset_t	done;
2305 	klgrpset_t	cur_set;
2306 
2307 	lpl_t		*lpl_iter, *lpl_leaf;
2308 	int		i;
2309 
2310 	/*
2311 	 * Scan for a CPU currently running the lowest priority thread.
2312 	 * Cannot get cpu_lock here because it is adaptive.
2313 	 * We do not require lock on CPU list.
2314 	 */
2315 	ASSERT(hint != NULL);
2316 	ASSERT(lpl != NULL);
2317 	ASSERT(lpl->lpl_ncpu > 0);
2318 
2319 	/*
2320 	 * First examine local CPUs. Note that it's possible the hint CPU
2321 	 * passed in in remote to the specified home lgroup. If our priority
2322 	 * isn't sufficient enough such that we can run immediately at home,
2323 	 * then examine CPUs remote to our home lgroup.
2324 	 * We would like to give preference to CPUs closest to "home".
2325 	 * If we can't find a CPU where we'll run at a given level
2326 	 * of locality, we expand our search to include the next level.
2327 	 */
2328 	bestcpu = besthomecpu = NULL;
2329 	klgrpset_clear(done);
2330 	/* start with lpl we were passed */
2331 
2332 	lpl_iter = lpl;
2333 
2334 	do {
2335 
2336 		bestpri = SHRT_MAX;
2337 		klgrpset_clear(cur_set);
2338 
2339 		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2340 			lpl_leaf = lpl_iter->lpl_rset[i];
2341 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2342 				continue;
2343 
2344 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2345 
2346 			if (hint->cpu_lpl == lpl_leaf)
2347 				cp = cpstart = hint;
2348 			else
2349 				cp = cpstart = lpl_leaf->lpl_cpus;
2350 
2351 			do {
2352 
2353 				if (cp == curcpu)
2354 					cpupri = -1;
2355 				else if (cp == cpu_inmotion)
2356 					cpupri = SHRT_MAX;
2357 				else
2358 					cpupri = cp->cpu_dispatch_pri;
2359 
2360 				if (cp->cpu_disp->disp_maxrunpri > cpupri)
2361 					cpupri = cp->cpu_disp->disp_maxrunpri;
2362 				if (cp->cpu_chosen_level > cpupri)
2363 					cpupri = cp->cpu_chosen_level;
2364 				if (cpupri < bestpri) {
2365 					if (CPU_IDLING(cpupri)) {
2366 						ASSERT((cp->cpu_flags &
2367 						    CPU_QUIESCED) == 0);
2368 						return (cp);
2369 					}
2370 					bestcpu = cp;
2371 					bestpri = cpupri;
2372 				}
2373 			} while ((cp = cp->cpu_next_lpl) != cpstart);
2374 		}
2375 
2376 		if (bestcpu && (tpri > bestpri)) {
2377 			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2378 			return (bestcpu);
2379 		}
2380 		if (besthomecpu == NULL)
2381 			besthomecpu = bestcpu;
2382 		/*
2383 		 * Add the lgrps we just considered to the "done" set
2384 		 */
2385 		klgrpset_or(done, cur_set);
2386 
2387 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2388 
2389 	/*
2390 	 * The specified priority isn't high enough to run immediately
2391 	 * anywhere, so just return the best CPU from the home lgroup.
2392 	 */
2393 	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2394 	return (besthomecpu);
2395 }
2396 
/*
 * Generic idle cpu function, shared by all processors: it does nothing.
 *
 * A processor that has specific work to do when idle (say, to stop the
 * pipeline and save power) should define its own routine in the
 * processor specific code (module_xx.c) and set the global variable
 * idle_cpu to that function.
 */
static void
generic_idle_cpu(void)
{
	/* Intentionally empty. */
}
2408 
2409 /*ARGSUSED*/
2410 static void
2411 generic_enq_thread(cpu_t *cpu, int bound)
2412 {
2413 }
2414 
2415 /*
2416  * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
2417  *	- t->t_cpu is not in this thread's assigned lgrp
2418  *	- the time since the thread last came off t->t_cpu exceeds the
2419  *	  rechoose time for this cpu (ignore this if t is curthread in
2420  *	  which case it's on CPU and t->t_disp_time is inaccurate)
2421  *	- t->t_cpu is presently the target of an offline or partition move
2422  *	  request
2423  */
2424 static cpu_t *
2425 cpu_choose(kthread_t *t, pri_t tpri)
2426 {
2427 	ASSERT(tpri < kpqpri);
2428 
2429 	if ((((lbolt - t->t_disp_time) > t->t_cpu->cpu_rechoose) &&
2430 	    t != curthread) || t->t_cpu == cpu_inmotion) {
2431 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
2432 	}
2433 
2434 	/*
2435 	 * Take a trip through disp_lowpri_cpu() if the thread was
2436 	 * running outside it's home lgroup
2437 	 */
2438 	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
2439 	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
2440 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
2441 		    (t == curthread) ? t->t_cpu : NULL));
2442 	}
2443 	return (t->t_cpu);
2444 }
2445