xref: /titanic_51/usr/src/uts/common/disp/disp.c (revision 1a7c1b724419d3cb5fa6eea75123c6b2060ba31b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */
32 
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/sysmacros.h>
36 #include <sys/signal.h>
37 #include <sys/user.h>
38 #include <sys/systm.h>
39 #include <sys/sysinfo.h>
40 #include <sys/var.h>
41 #include <sys/errno.h>
42 #include <sys/cmn_err.h>
43 #include <sys/debug.h>
44 #include <sys/inline.h>
45 #include <sys/disp.h>
46 #include <sys/class.h>
47 #include <sys/bitmap.h>
48 #include <sys/kmem.h>
49 #include <sys/cpuvar.h>
50 #include <sys/vtrace.h>
51 #include <sys/tnf.h>
52 #include <sys/cpupart.h>
53 #include <sys/lgrp.h>
54 #include <sys/chip.h>
55 #include <sys/schedctl.h>
56 #include <sys/atomic.h>
57 #include <sys/dtrace.h>
58 #include <sys/sdt.h>
59 
60 #include <vm/as.h>
61 
62 #define	BOUND_CPU	0x1
63 #define	BOUND_PARTITION	0x2
64 #define	BOUND_INTR	0x4
65 
66 /* Dispatch queue allocation structure and functions */
67 struct disp_queue_info {
68 	disp_t	*dp;		/* dispatch queue being resized */
69 	dispq_t *olddispq;	/* previous queue array (to be freed) */
70 	dispq_t *newdispq;	/* replacement queue array */
71 	ulong_t	*olddqactmap;	/* previous active-queue bitmap (to be freed) */
72 	ulong_t	*newdqactmap;	/* replacement active-queue bitmap */
73 	int	oldnglobpris;	/* number of priorities in the old arrays */
74 };
75 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
76     disp_t *dp);
77 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
78 static void	disp_dq_free(struct disp_queue_info *dptr);
79 
80 /* platform-specific routine to call when processor is idle */
81 static void	generic_idle_cpu();
82 void		(*idle_cpu)() = generic_idle_cpu;
83 
84 /* routines invoked when a CPU enters/exits the idle loop */
85 static void	idle_enter();
86 static void	idle_exit();
87 
88 /* platform-specific routine to call when thread is enqueued */
89 static void	generic_enq_thread(cpu_t *, int);
90 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
91 
92 pri_t	kpreemptpri;	/* priority where kernel preemption applies */
93 pri_t	upreemptpri = 0; /* priority where normal preemption applies */
94 pri_t	intr_pri;	/* interrupt thread priority base level */
95 
96 #define	KPQPRI	-1 /* priority where cpu affinity is dropped for kp queue */
97 pri_t	kpqpri = KPQPRI; /* can be set in /etc/system */
98 disp_t	cpu0_disp;	/* boot CPU's dispatch queue */
99 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
100 int	nswapped;	/* total number of swapped threads */
101 void	disp_swapped_enq(kthread_t *tp);
102 static void	disp_swapped_setrun(kthread_t *tp);
103 static void	cpu_resched(cpu_t *cp, pri_t tpri);
104 
105 /*
106  * If this is set, only interrupt threads will cause kernel preemptions.
107  * This is done by changing the value of kpreemptpri.  kpreemptpri
108  * will either be the max sysclass pri + 1 or the min interrupt pri.
109  */
110 int	only_intr_kpreempt;
111 
112 extern void set_idle_cpu(int cpun);
113 extern void unset_idle_cpu(int cpun);
114 static void setkpdq(kthread_t *tp, int borf);
115 #define	SETKP_BACK	0
116 #define	SETKP_FRONT	1
117 /*
118  * Parameter that determines how recently a thread must have run
119  * on the CPU to be considered loosely-bound to that CPU to reduce
120  * cold cache effects.  The interval is in hertz.
121  *
122  * The platform may define a per physical processor adjustment of
123  * this parameter. For efficiency, the effective rechoose interval
124  * (rechoose_interval + per chip adjustment) is maintained in the
125  * cpu structures. See cpu_choose()
126  */
127 int	rechoose_interval = RECHOOSE_INTERVAL;
128 
129 static cpu_t	*cpu_choose(kthread_t *, pri_t);
130 
131 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
132 
133 disp_lock_t	transition_lock;	/* lock on transitioning threads */
134 disp_lock_t	stop_lock;		/* lock on stopped threads */
135 disp_lock_t	shuttle_lock;		/* lock on shuttle objects */
136 
137 static void		cpu_dispqalloc(int numpris);
138 
139 static kthread_t	*disp_getwork(cpu_t *to);
140 static kthread_t	*disp_getbest(disp_t *from);
141 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
142 
143 void	swtch_to(kthread_t *);
144 
145 /*
146  * dispatcher and scheduler initialization
147  */
148 
149 /*
150  * disp_setup - Common code to calculate and allocate dispatcher
151  *		variables and structures based on the maximum priority.
152  */
153 static void
154 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
155 {
156 	pri_t	newnglobpris;
157 
158 	ASSERT(MUTEX_HELD(&cpu_lock));
159 
160 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
161 
162 	if (newnglobpris > oldnglobpris) {
163 		/*
164 		 * Allocate new kp queues for each CPU partition.
165 		 */
166 		cpupart_kpqalloc(newnglobpris);
167 
168 		/*
169 		 * Allocate new dispatch queues for each CPU.
170 		 */
171 		cpu_dispqalloc(newnglobpris);
172 
173 		/*
174 		 * compute new interrupt thread base priority
175 		 */
176 		intr_pri = maxglobpri;
177 		if (only_intr_kpreempt) {
178 			kpreemptpri = intr_pri + 1;
179 			if (kpqpri == KPQPRI)
180 				kpqpri = kpreemptpri;
181 		}
182 		v.v_nglobpris = newnglobpris;
183 	}
184 }
185 
186 /*
187  * dispinit - Called to initialize all loaded classes and the
188  *	      dispatcher framework.
189  */
190 void
191 dispinit(void)
192 {
193 	id_t	cid;
194 	pri_t	maxglobpri;
195 	pri_t	cl_maxglobpri;
196 
197 	maxglobpri = -1;
198 
199 	/*
200 	 * Initialize transition lock, which will always be set.
201 	 */
202 	DISP_LOCK_INIT(&transition_lock);
203 	disp_lock_enter_high(&transition_lock);
204 	DISP_LOCK_INIT(&stop_lock);
205 	DISP_LOCK_INIT(&shuttle_lock);
206 
207 	mutex_enter(&cpu_lock);
208 	CPU->cpu_disp->disp_maxrunpri = -1;
209 	CPU->cpu_disp->disp_max_unbound_pri = -1;
210 	/*
211 	 * Initialize the default CPU partition.
212 	 */
213 	cpupart_initialize_default();
214 	/*
215 	 * Call the class specific initialization functions for
216 	 * all pre-installed schedulers.
217 	 *
218 	 * We pass the size of a class specific parameter
219 	 * buffer to each of the initialization functions
220 	 * to try to catch problems with backward compatibility
221 	 * of class modules.
222 	 *
223 	 * For example a new class module running on an old system
224 	 * which didn't provide sufficiently large parameter buffers
225 	 * would be bad news. Class initialization modules can check for
226 	 * this and take action if they detect a problem.
227 	 */
228 
229 	for (cid = 0; cid < nclass; cid++) {
230 		sclass_t	*sc;
231 
232 		sc = &sclass[cid];
233 		if (SCHED_INSTALLED(sc)) {
234 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
235 			    &sc->cl_funcs);
236 			if (cl_maxglobpri > maxglobpri)
237 				maxglobpri = cl_maxglobpri;
238 		}
239 	}
240 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
241 	if (kpqpri == KPQPRI)
242 		kpqpri = kpreemptpri;
243 
244 	ASSERT(maxglobpri >= 0);
245 	disp_setup(maxglobpri, 0);
246 
247 	mutex_exit(&cpu_lock);
248 
249 	/*
250 	 * Get the default class ID; this may be later modified via
251 	 * dispadmin(1M).  This will load the class (normally TS) and that will
252 	 * call disp_add(), which is why we had to drop cpu_lock first.
253 	 */
254 	if (getcid(defaultclass, &defaultcid) != 0) {
255 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
256 		    defaultclass);
257 	}
258 }
259 
260 /*
261  * disp_add - Called with class pointer to initialize the dispatcher
262  *	      for a newly loaded class.
263  */
264 void
265 disp_add(sclass_t *clp)
266 {
267 	pri_t	maxglobpri;
268 	pri_t	cl_maxglobpri;
269 
270 	mutex_enter(&cpu_lock);
271 	/*
272 	 * Initialize the scheduler class.
273 	 */
274 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
275 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
276 	if (cl_maxglobpri > maxglobpri)
277 		maxglobpri = cl_maxglobpri;
278 
279 	/*
280 	 * Save old queue information.  Since we're initializing a
281 	 * new scheduling class which has just been loaded, then
282 	 * the size of the dispq may have changed.  We need to handle
283 	 * that here.
284 	 */
285 	disp_setup(maxglobpri, v.v_nglobpris);
286 
287 	mutex_exit(&cpu_lock);
288 }
289 
290 
291 /*
292  * For each CPU, allocate new dispatch queues
293  * with the stated number of priorities.
294  */
295 static void
296 cpu_dispqalloc(int numpris)
297 {
298 	cpu_t	*cpup;
299 	struct disp_queue_info	*disp_mem;
300 	int i, num;
301 
302 	ASSERT(MUTEX_HELD(&cpu_lock));
303 
304 	disp_mem = kmem_zalloc(NCPU *
305 	    sizeof (struct disp_queue_info), KM_SLEEP);
306 
307 	/*
308 	 * This routine must allocate all of the memory before stopping
309 	 * the cpus because it must not sleep in kmem_alloc while the
310 	 * CPUs are stopped.  Locks they hold will not be freed until they
311 	 * are restarted.
312 	 */
313 	i = 0;
314 	cpup = cpu_list;
315 	do {
316 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
317 		i++;
318 		cpup = cpup->cpu_next;
319 	} while (cpup != cpu_list);
320 	num = i;
321 
322 	pause_cpus(NULL);
323 	for (i = 0; i < num; i++)
324 		disp_dq_assign(&disp_mem[i], numpris);
325 	start_cpus();
326 
327 	/*
328 	 * I must free all of the memory after starting the cpus because
329 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
330 	 */
331 	for (i = 0; i < num; i++)
332 		disp_dq_free(&disp_mem[i]);
333 
334 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
335 }
336 
337 static void
338 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
339 {
340 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
341 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
342 	    sizeof (long), KM_SLEEP);
343 	dptr->dp = dp;
344 }
345 
346 static void
347 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
348 {
349 	disp_t	*dp;
350 
351 	dp = dptr->dp;
352 	dptr->olddispq = dp->disp_q;
353 	dptr->olddqactmap = dp->disp_qactmap;
354 	dptr->oldnglobpris = dp->disp_npri;
355 
356 	ASSERT(dptr->oldnglobpris < numpris);
357 
358 	if (dptr->olddispq != NULL) {
359 		/*
360 		 * Use kcopy because bcopy is platform-specific
361 		 * and could block while we might have paused the cpus.
362 		 */
363 		(void) kcopy(dptr->olddispq, dptr->newdispq,
364 		    dptr->oldnglobpris * sizeof (dispq_t));
365 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
366 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
367 		    sizeof (long));
368 	}
369 	dp->disp_q = dptr->newdispq;
370 	dp->disp_qactmap = dptr->newdqactmap;
371 	dp->disp_q_limit = &dptr->newdispq[numpris];
372 	dp->disp_npri = numpris;
373 }
374 
375 static void
376 disp_dq_free(struct disp_queue_info *dptr)
377 {
378 	if (dptr->olddispq != NULL)
379 		kmem_free(dptr->olddispq,
380 		    dptr->oldnglobpris * sizeof (dispq_t));
381 	if (dptr->olddqactmap != NULL)
382 		kmem_free(dptr->olddqactmap,
383 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
384 }
385 
386 /*
387  * For a newly created CPU, initialize the dispatch queue.
388  * This is called before the CPU is known through cpu[] or on any lists.
389  */
390 void
391 disp_cpu_init(cpu_t *cp)
392 {
393 	disp_t	*dp;
394 	dispq_t	*newdispq;
395 	ulong_t	*newdqactmap;
396 
397 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
398 
399 	if (cp == cpu0_disp.disp_cpu)
400 		dp = &cpu0_disp;
401 	else
402 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
403 	bzero(dp, sizeof (disp_t));
404 	cp->cpu_disp = dp;
405 	dp->disp_cpu = cp;
406 	dp->disp_maxrunpri = -1;
407 	dp->disp_max_unbound_pri = -1;
408 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
409 	/*
410 	 * Allocate memory for the dispatcher queue headers
411 	 * and the active queue bitmap.
412 	 */
413 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
414 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
415 	    sizeof (long), KM_SLEEP);
416 	dp->disp_q = newdispq;
417 	dp->disp_qactmap = newdqactmap;
418 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
419 	dp->disp_npri = v.v_nglobpris;
420 }
421 
422 void
423 disp_cpu_fini(cpu_t *cp)
424 {
425 	ASSERT(MUTEX_HELD(&cpu_lock));
426 
427 	disp_kp_free(cp->cpu_disp);
428 	if (cp->cpu_disp != &cpu0_disp)
429 		kmem_free(cp->cpu_disp, sizeof (disp_t));
430 }
431 
432 /*
433  * Allocate new, larger kpreempt dispatch queue to replace the old one.
434  */
435 void
436 disp_kp_alloc(disp_t *dq, pri_t npri)
437 {
438 	struct disp_queue_info	mem_info;
439 
440 	if (npri > dq->disp_npri) {
441 		/*
442 		 * Allocate memory for the new array.
443 		 */
444 		disp_dq_alloc(&mem_info, npri, dq);
445 
446 		/*
447 		 * We need to copy the old structures to the new
448 		 * and free the old.
449 		 */
450 		disp_dq_assign(&mem_info, npri);
451 		disp_dq_free(&mem_info);
452 	}
453 }
454 
455 /*
456  * Free dispatch queue.
457  * Used for the kpreempt queues for a removed CPU partition and
458  * for the per-CPU queues of deleted CPUs.
459  */
460 void
461 disp_kp_free(disp_t *dq)
462 {
463 	struct disp_queue_info	mem_info;
464 
465 	mem_info.olddispq = dq->disp_q;
466 	mem_info.olddqactmap = dq->disp_qactmap;
467 	mem_info.oldnglobpris = dq->disp_npri;
468 	disp_dq_free(&mem_info);
469 }
470 
471 /*
472  * End dispatcher and scheduler initialization.
473  */
474 
475 /*
476  * See if there's anything to do other than remain idle.
477  * Return non-zero if there is.
478  *
479  * This function must be called with high spl, or with
480  * kernel preemption disabled to prevent the partition's
481  * active cpu list from changing while being traversed.
482  *
483  */
484 int
485 disp_anywork(void)
486 {
487 	cpu_t   *cp = CPU;
488 	cpu_t   *ocp;
489 
490 	if (cp->cpu_disp->disp_nrunnable != 0)
491 		return (1);
492 
493 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
494 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
495 			return (1);
496 
497 		/*
498 		 * Work can be taken from another CPU if:
499 		 *	- There is unbound work on the run queue
500 		 *	- That work isn't a thread undergoing a
501 		 *	- context switch on an otherwise empty queue.
502 		 *	- The CPU isn't running the idle loop.
503 		 */
504 		for (ocp = cp->cpu_next_part; ocp != cp;
505 		    ocp = ocp->cpu_next_part) {
506 			ASSERT(CPU_ACTIVE(ocp));
507 
508 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
509 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
510 			    ocp->cpu_disp->disp_nrunnable == 1) &&
511 			    ocp->cpu_dispatch_pri != -1)
512 				return (1);
513 		}
514 	}
515 	return (0);
516 }
517 
518 /*
519  * Called when CPU enters the idle loop
520  */
521 static void
522 idle_enter()
523 {
524 	cpu_t		*cp = CPU;
525 
526 	new_cpu_mstate(cp, CMS_IDLE);
527 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
528 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
529 }
530 
531 /*
532  * Called when CPU exits the idle loop
533  */
534 static void
535 idle_exit()
536 {
537 	cpu_t		*cp = CPU;
538 
539 	new_cpu_mstate(cp, CMS_SYSTEM);
540 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
541 }
542 
543 /*
544  * Idle loop.
 *
 * Never returns.  While ncpus == 1 only this CPU's own dispatch queue is
 * consulted; after that the multiprocessor loop also asks disp_getwork()
 * for a thread to run.  Whenever there is nothing to do the platform
 * hook (*idle_cpu)() is called.  Each stretch of idling is bracketed by
 * idle_enter() and idle_exit().
545  */
546 void
547 idle()
548 {
549 	struct cpu	*cp = CPU;		/* pointer to this CPU */
550 	kthread_t	*t;			/* taken thread */
551 
552 	idle_enter();
553 
554 	/*
555 	 * Uniprocessor version of idle loop.
556 	 * Do this until notified that we're on an actual multiprocessor.
557 	 */
558 	while (ncpus == 1) {
559 		if (cp->cpu_disp->disp_nrunnable == 0) {
560 			(*idle_cpu)();
561 			continue;
562 		}
		/* Something is runnable on our queue: go run it. */
563 		idle_exit();
564 		swtch();
565 
566 		idle_enter(); /* returned from swtch */
567 	}
568 
569 	/*
570 	 * Multiprocessor idle loop.
571 	 */
572 	for (;;) {
573 		/*
574 		 * If CPU is completely quiesced by p_online(2), just wait
575 		 * here with minimal bus traffic until put online.
576 		 */
577 		while (cp->cpu_flags & CPU_QUIESCED)
578 			(*idle_cpu)();
579 
580 		if (cp->cpu_disp->disp_nrunnable != 0) {
			/* Work on our own queue takes precedence. */
581 			idle_exit();
582 			swtch();
583 		} else {
			/*
			 * An offline CPU does not go looking for work
			 * elsewhere; it just re-checks its own state.
			 */
584 			if (cp->cpu_flags & CPU_OFFLINE)
585 				continue;
586 			if ((t = disp_getwork(cp)) == NULL) {
				/*
				 * Nothing to take.  If the partition's kp
				 * queue is empty as well, reset
				 * cpu_chosen_level before idling.
				 */
587 				if (cp->cpu_chosen_level != -1) {
588 					disp_t *dp = cp->cpu_disp;
589 					disp_t *kpq;
590 
591 					disp_lock_enter(&dp->disp_lock);
592 					/*
593 					 * Set kpq under lock to prevent
594 					 * migration between partitions.
595 					 */
596 					kpq = &cp->cpu_part->cp_kp_queue;
597 					if (kpq->disp_maxrunpri == -1)
598 						cp->cpu_chosen_level = -1;
599 					disp_lock_exit(&dp->disp_lock);
600 				}
601 				(*idle_cpu)();
602 				continue;
603 			}
			/* Switch directly to the thread disp_getwork() gave us. */
604 			idle_exit();
605 			restore_mstate(t);
606 			swtch_to(t);
607 		}
608 		idle_enter(); /* returned from swtch/swtch_to */
609 	}
610 }
611 
612 
613 /*
614  * Preempt the currently running thread in favor of the highest
615  * priority thread.  The class of the current thread controls
616  * where it goes on the dispatcher queues (via CL_PREEMPT()).  If the
617  * system is panicking (panicstr is set), return without preempting.
618  */
619 void
620 preempt()
621 {
622 	kthread_t 	*t = curthread;
623 	klwp_t 		*lwp = ttolwp(curthread);
624 
625 	if (panicstr)
626 		return;
627 
628 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
629 
630 	thread_lock(t);
631 
632 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
633 		/*
634 		 * this thread has already been chosen to be run on
635 		 * another CPU. Clear kprunrun on this CPU since we're
636 		 * already headed for swtch().
637 		 */
638 		CPU->cpu_kprunrun = 0;
639 		thread_unlock_nopreempt(t);
640 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
641 	} else {
		/*
		 * Count the involuntary context switch (per-lwp, when
		 * there is an lwp, and per-CPU), put the thread into
		 * transition and hand it to its scheduling class before
		 * switching away.
		 */
642 		if (lwp != NULL)
643 			lwp->lwp_ru.nivcsw++;
644 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
645 		THREAD_TRANSITION(t);
646 		CL_PREEMPT(t);
647 		DTRACE_SCHED(preempt);
648 		thread_unlock_nopreempt(t);
649 
650 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
651 
652 		swtch();		/* clears CPU->cpu_runrun via disp() */
653 	}
654 }
655 
656 extern kthread_t *thread_unpin();
657 
658 /*
659  * disp() - find the highest priority thread for this processor to run, and
660  * set it in TS_ONPROC state so that resume() can be called to run it.
 *
 * The candidates, in the order they are considered, are: the partition's
 * kp queue (when its best priority is at least as good as this CPU's),
 * this CPU's own dispatch queue, work obtained through disp_getwork(),
 * and finally the CPU's idle thread.  Returns the chosen thread; callers
 * in this file rely on it returning with spl high.
661  */
662 static kthread_t *
663 disp()
664 {
665 	cpu_t		*cpup;
666 	disp_t		*dp;
667 	kthread_t	*tp;
668 	dispq_t		*dq;
669 	int		maxrunword;
670 	pri_t		pri;
671 	disp_t		*kpq;
672 
673 	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
674 
675 	cpup = CPU;
676 	/*
677 	 * Find the highest priority loaded, runnable thread.
678 	 */
679 	dp = cpup->cpu_disp;
680 
681 reschedule:
682 	/*
683 	 * If there is more important work on the global queue with a better
684 	 * priority than the maximum on this CPU, take it now.
685 	 */
686 	kpq = &cpup->cpu_part->cp_kp_queue;
687 	while ((pri = kpq->disp_maxrunpri) >= 0 &&
688 	    pri >= dp->disp_maxrunpri &&
689 	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
690 	    (tp = disp_getbest(kpq)) != NULL) {
691 		if (disp_ratify(tp, kpq) != NULL) {
692 			TRACE_1(TR_FAC_DISP, TR_DISP_END,
693 			    "disp_end:tid %p", tp);
694 			restore_mstate(tp);
695 			return (tp);
696 		}
697 	}
698 
699 	disp_lock_enter(&dp->disp_lock);
700 	pri = dp->disp_maxrunpri;
701 
702 	/*
703 	 * If there is nothing to run, look at what's runnable on other queues.
704 	 * Choose the idle thread if the CPU is quiesced.
705 	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
706 	 * interrupt threads, which will be the only threads on the CPU's own
707 	 * queue, but cannot run threads from other queues.
708 	 */
709 	if (pri == -1) {
710 		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
711 			disp_lock_exit(&dp->disp_lock);
712 			if ((tp = disp_getwork(cpup)) == NULL) {
				/* Nothing anywhere: fall back to idle. */
713 				tp = cpup->cpu_idle_thread;
714 				(void) splhigh();
715 				THREAD_ONPROC(tp, cpup);
716 				cpup->cpu_dispthread = tp;
717 				cpup->cpu_dispatch_pri = -1;
718 				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
719 				cpup->cpu_chosen_level = -1;
720 			}
721 		} else {
			/* Offline CPU with an empty queue: run idle. */
722 			disp_lock_exit_high(&dp->disp_lock);
723 			tp = cpup->cpu_idle_thread;
724 			THREAD_ONPROC(tp, cpup);
725 			cpup->cpu_dispthread = tp;
726 			cpup->cpu_dispatch_pri = -1;
727 			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
728 			cpup->cpu_chosen_level = -1;
729 		}
730 		TRACE_1(TR_FAC_DISP, TR_DISP_END,
731 			"disp_end:tid %p", tp);
732 		restore_mstate(tp);
733 		return (tp);
734 	}
735 
	/* Take the first thread of the highest-priority non-empty queue. */
736 	dq = &dp->disp_q[pri];
737 	tp = dq->dq_first;
738 
739 	ASSERT(tp != NULL);
740 	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */
741 
742 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
743 
744 	/*
745 	 * Found it so remove it from queue.
746 	 */
747 	dp->disp_nrunnable--;
748 	dq->dq_sruncnt--;
749 	if ((dq->dq_first = tp->t_link) == NULL) {
750 		ulong_t	*dqactmap = dp->disp_qactmap;
751 
752 		ASSERT(dq->dq_sruncnt == 0);
753 		dq->dq_last = NULL;
754 
755 		/*
756 		 * The queue is empty, so the corresponding bit needs to be
757 		 * turned off in dqactmap.  If nrunnable != 0, we just took
758 		 * the last runnable thread off the highest queue while
759 		 * lower queues still hold threads, so recompute
		 * disp_maxrunpri.
760 		 */
761 		maxrunword = pri >> BT_ULSHIFT;
762 		dqactmap[maxrunword] &= ~BT_BIW(pri);
763 
764 		if (dp->disp_nrunnable == 0) {
765 			dp->disp_max_unbound_pri = -1;
766 			dp->disp_maxrunpri = -1;
767 		} else {
768 			int ipri;
769 
770 			ipri = bt_gethighbit(dqactmap, maxrunword);
771 			dp->disp_maxrunpri = ipri;
			/* max_unbound_pri can never exceed maxrunpri */
772 			if (ipri < dp->disp_max_unbound_pri)
773 				dp->disp_max_unbound_pri = ipri;
774 		}
775 	} else {
776 		tp->t_link = NULL;
777 	}
778 
779 	/*
780 	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
781 	 * out this thread before we have a chance to run it.
782 	 * While running, it is protected against swapping by t_lock.
783 	 */
784 	tp->t_schedflag |= TS_DONT_SWAP;
785 	cpup->cpu_dispthread = tp;		/* protected by spl only */
786 	cpup->cpu_dispatch_pri = pri;
787 	ASSERT(pri == DISP_PRIO(tp));
788 	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
789 	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */
790 
791 	ASSERT(tp != NULL);
792 	TRACE_1(TR_FAC_DISP, TR_DISP_END,
793 		"disp_end:tid %p", tp);
794 
	/* If disp_ratify() does not confirm the choice, start over. */
795 	if (disp_ratify(tp, kpq) == NULL)
796 		goto reschedule;
797 
798 	restore_mstate(tp);
799 	return (tp);
800 }
801 
802 /*
803  * swtch()
804  *	Find best runnable thread and run it.
805  *	Called with the current thread already switched to a new state,
806  *	on a sleep queue, run queue, stopped, and not zombied.
807  *	May be called at any spl level less than or equal to LOCK_LEVEL.
808  *	Always drops spl to the base level (spl0()).
 *
 *	A thread with a pinned thread beneath it (t_intr != NULL) does not
 *	go through disp(); it resumes the thread it interrupted instead.
809  */
810 void
811 swtch()
812 {
813 	kthread_t	*t = curthread;
814 	kthread_t	*next;
815 	cpu_t		*cp;
816 
817 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
818 
819 	if (t->t_flag & T_INTR_THREAD)
820 		cpu_intr_swtch_enter(t);
821 
822 	if (t->t_intr != NULL) {
823 		/*
824 		 * We are an interrupt thread.  Setup and return
825 		 * the interrupted thread to be resumed.
826 		 */
827 		(void) splhigh();	/* block other scheduler action */
828 		cp = CPU;		/* now protected against migration */
829 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
830 		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
831 		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
832 		next = thread_unpin();
833 		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
834 		resume_from_intr(next);
835 	} else {
836 #ifdef	DEBUG
		/*
		 * Sanity check: re-test the same condition under the
		 * thread lock and assert that it no longer holds.
		 */
837 		if (t->t_state == TS_ONPROC &&
838 		    t->t_disp_queue->disp_cpu == CPU &&
839 		    t->t_preempt == 0) {
840 			thread_lock(t);
841 			ASSERT(t->t_state != TS_ONPROC ||
842 			    t->t_disp_queue->disp_cpu != CPU ||
843 			    t->t_preempt != 0);	/* cannot migrate */
844 			thread_unlock_nopreempt(t);
845 		}
846 #endif	/* DEBUG */
847 		cp = CPU;
848 		next = disp();		/* returns with spl high */
849 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
850 
851 		/* OK to steal anything left on run queue */
852 		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
853 
854 		if (next != t) {
			/*
			 * Adjust the chip's running count: +1 when
			 * leaving the idle thread, -1 when switching
			 * to it.
			 */
855 			if (t == cp->cpu_idle_thread) {
856 				CHIP_NRUNNING(cp->cpu_chip, 1);
857 			} else if (next == cp->cpu_idle_thread) {
858 				CHIP_NRUNNING(cp->cpu_chip, -1);
859 			}
860 
861 			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			/* record when the outgoing thread last ran */
862 			cp->cpu_last_swtch = t->t_disp_time = lbolt;
863 			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
864 
865 			if (dtrace_vtime_active)
866 				dtrace_vtime_switch(next);
867 
868 			resume(next);
869 			/*
870 			 * The TR_RESUME_END and TR_SWTCH_END trace points
871 			 * appear at the end of resume(), because we may not
872 			 * return here
873 			 */
874 		} else {
			/* disp() picked the current thread: keep running. */
875 			if (t->t_flag & T_INTR_THREAD)
876 				cpu_intr_swtch_exit(t);
877 
878 			DTRACE_SCHED(remain__cpu);
879 			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
880 			(void) spl0();
881 		}
882 	}
883 }
884 
885 /*
886  * swtch_from_zombie()
887  *	Special case of swtch(), which allows checks for TS_ZOMB to be
888  *	eliminated from normal resume.
889  *	Find best runnable thread and run it.
890  *	Called with the current thread zombied.
891  *	Zombies cannot migrate, so CPU references are safe.
892  */
893 void
894 swtch_from_zombie()
895 {
896 	kthread_t	*next;
897 	cpu_t		*cpu = CPU;
898 
899 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
900 
901 	ASSERT(curthread->t_state == TS_ZOMB);
902 
903 	next = disp();			/* returns with spl high */
904 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
905 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
906 	ASSERT(next != curthread);
907 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
908 
909 	if (next == cpu->cpu_idle_thread)
910 		CHIP_NRUNNING(cpu->cpu_chip, -1);
911 
912 	if (dtrace_vtime_active)
913 		dtrace_vtime_switch(next);
914 
915 	resume_from_zombie(next);
916 	/*
917 	 * The TR_RESUME_END and TR_SWTCH_END trace points
918 	 * appear at the end of resume(), because we certainly will not
919 	 * return here
920 	 */
921 }
922 
923 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
924 static int
925 thread_on_queue(kthread_t *tp)
926 {
927 	cpu_t	*cp;
928 	cpu_t	*self;
929 	disp_t	*dp;
930 
931 	self = CPU;
932 	cp = self->cpu_next_onln;
933 	dp = cp->cpu_disp;
934 	for (;;) {
935 		dispq_t		*dq;
936 		dispq_t		*eq;
937 
938 		disp_lock_enter_high(&dp->disp_lock);
939 		for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
940 			kthread_t	*rp;
941 
942 			ASSERT(dq->dq_last == NULL ||
943 				dq->dq_last->t_link == NULL);
944 			for (rp = dq->dq_first; rp; rp = rp->t_link)
945 				if (tp == rp) {
946 					disp_lock_exit_high(&dp->disp_lock);
947 					return (1);
948 				}
949 		}
950 		disp_lock_exit_high(&dp->disp_lock);
951 		if (cp == NULL)
952 			break;
953 		if (cp == self) {
954 			cp = NULL;
955 			dp = &cp->cpu_part->cp_kp_queue;
956 		} else {
957 			cp = cp->cpu_next_onln;
958 			dp = cp->cpu_disp;
959 		}
960 	}
961 	return (0);
962 }	/* end of thread_on_queue */
963 #else
964 
965 #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
966 
967 #endif  /* DEBUG */
968 
969 /*
970  * like swtch(), but switch to a specified thread taken from another CPU.
971  *	called with spl high..
972  */
973 void
974 swtch_to(kthread_t *next)
975 {
976 	cpu_t			*cp = CPU;
977 
978 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
979 
980 	/*
981 	 * Update context switch statistics.
982 	 */
983 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
984 
985 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
986 
987 	if (curthread == cp->cpu_idle_thread)
988 		CHIP_NRUNNING(cp->cpu_chip, 1);
989 
990 	/* OK to steal anything left on run queue */
991 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
992 
993 	/* record last execution time */
994 	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;
995 
996 	if (dtrace_vtime_active)
997 		dtrace_vtime_switch(next);
998 
999 	resume(next);
1000 	/*
1001 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1002 	 * appear at the end of resume(), because we may not
1003 	 * return here
1004 	 */
1005 }
1006 
1007 
1008 
1009 #define	CPU_IDLING(pri)	((pri) == -1)
1010 
1011 static void
1012 cpu_resched(cpu_t *cp, pri_t tpri)
1013 {
1014 	int	call_poke_cpu = 0;
1015 	pri_t   cpupri = cp->cpu_dispatch_pri;
1016 
1017 	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1018 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1019 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1020 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1021 			cp->cpu_runrun = 1;
1022 			aston(cp->cpu_dispthread);
1023 			if (tpri < kpreemptpri && cp != CPU)
1024 				call_poke_cpu = 1;
1025 		}
1026 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1027 			cp->cpu_kprunrun = 1;
1028 			if (cp != CPU)
1029 				call_poke_cpu = 1;
1030 		}
1031 	}
1032 
1033 	/*
1034 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1035 	 */
1036 	membar_enter();
1037 
1038 	if (call_poke_cpu)
1039 		poke_cpu(cp->cpu_id);
1040 }
1041 
1042 /*
1043  * Routine used by setbackdq() to balance load across the physical
1044  * processors. Returns a CPU of a lesser loaded chip in the lgroup
1045  * if balancing is necessary, or the "hint" CPU if it's not.
1046  *
1047  * - tp is the thread being enqueued
1048  * - cp is a hint CPU (chosen by cpu_choose()).
1049  * - curchip (if not NULL) is the chip on which the current thread
1050  *   is running.
1051  *
1052  * The thread lock for "tp" must be held while calling this routine.
 *
 * Side effects: may advance chp->chip_balance and, when another chip is
 * chosen, that chip's chip_cpus list head.
1053  */
1054 static cpu_t *
1055 chip_balance(kthread_t *tp, cpu_t *cp, chip_t *curchip)
1056 {
1057 	int	chp_nrun, ochp_nrun;
1058 	chip_t	*chp, *nchp;
1059 
1060 	chp = cp->cpu_chip;
1061 	chp_nrun = chp->chip_nrunning;
1062 
1063 	if (chp == curchip)
1064 		chp_nrun--;	/* Ignore curthread */
1065 
1066 	/*
1067 	 * If this chip isn't at all idle, then let
1068 	 * run queue balancing do the work.
1069 	 */
1070 	if (chp_nrun == chp->chip_ncpu)
1071 		return (cp);
1072 
	/*
	 * Walk the chips starting at chp->chip_balance.  Note the loop
	 * shape: "continue" in a do/while jumps to the loop condition, so
	 * chips that are chp itself or outside tp's partition are skipped,
	 * while the unconditional "break" at the bottom means only the
	 * first eligible chip is ever compared against chp.
	 */
1073 	nchp = chp->chip_balance;
1074 	do {
1075 		if (nchp == chp ||
1076 		    !CHIP_IN_CPUPART(nchp, tp->t_cpupart))
1077 			continue;
1078 
1079 		ochp_nrun = nchp->chip_nrunning;
1080 
1081 		/*
1082 		 * If the other chip is running less threads,
1083 		 * or if it's running the same number of threads, but
1084 		 * has more online logical CPUs, then choose to balance.
1085 		 */
1086 		if (chp_nrun > ochp_nrun ||
1087 		    (chp_nrun == ochp_nrun &&
1088 		    nchp->chip_ncpu > chp->chip_ncpu)) {
			/*
			 * Take the chip's current head CPU and move the
			 * head on to the next CPU of that chip.
			 */
1089 			cp = nchp->chip_cpus;
1090 			nchp->chip_cpus = cp->cpu_next_chip;
1091 
1092 			/*
1093 			 * Find a CPU on the chip in the correct
1094 			 * partition. We know at least one exists
1095 			 * because of the CHIP_IN_CPUPART() check above.
1096 			 */
1097 			while (cp->cpu_part != tp->t_cpupart)
1098 				cp = cp->cpu_next_chip;
1099 		}
		/* Next call starts with the chip after this one. */
1100 		chp->chip_balance = nchp->chip_next_lgrp;
1101 		break;
1102 	} while ((nchp = nchp->chip_next_lgrp) != chp->chip_balance);
1103 
1104 	ASSERT(CHIP_IN_CPUPART(cp->cpu_chip, tp->t_cpupart));
1105 	return (cp);
1106 }
1107 
/*
 * Run queue balancing parameters used by setbackdq().
 *
 * setbackdq() keeps run queues balanced so that the chosen queue is never
 * more than RUNQ_MAX_DIFF entries longer than the next one.  For threads
 * whose priority is below RUNQ_MATCH_PRI the queue lengths must match.
 * When the per-thread TS_RUNQMATCH flag is set, setbackdq() tries to keep
 * the queues perfectly balanced regardless of the thread's priority.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */

/* Number of threads queued at priority "pri" on CPU "cp". */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[(pri)].dq_sruncnt)
1118 
1119 /*
1120  * Put the specified thread on the back of the dispatcher
1121  * queue corresponding to its current priority.
1122  *
1123  * Called with the thread in transition, onproc or stopped state
1124  * and locked (transition implies locked) and at high spl.
1125  * Returns with the thread in TS_RUN state and still locked.
1126  */
1127 void
1128 setbackdq(kthread_t *tp)
1129 {
1130 	dispq_t	*dq;
1131 	disp_t		*dp;
1132 	chip_t		*curchip = NULL;
1133 	cpu_t		*cp;
1134 	pri_t		tpri;
1135 	int		bound;
1136 
1137 	ASSERT(THREAD_LOCK_HELD(tp));
1138 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1139 
1140 	if (tp->t_waitrq == 0) {
1141 		hrtime_t curtime;
1142 
1143 		curtime = gethrtime_unscaled();
1144 		(void) cpu_update_pct(tp, curtime);
1145 		tp->t_waitrq = curtime;
1146 	} else {
1147 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1148 	}
1149 
1150 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1151 
1152 	/*
1153 	 * If thread is "swapped" or on the swap queue don't
1154 	 * queue it, but wake sched.
1155 	 */
1156 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1157 		disp_swapped_setrun(tp);
1158 		return;
1159 	}
1160 
1161 	tpri = DISP_PRIO(tp);
1162 	if (tp == curthread) {
1163 		curchip = CPU->cpu_chip;
1164 	}
1165 
1166 	if (ncpus == 1)
1167 		cp = tp->t_cpu;
1168 	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
1169 		if (tpri >= kpqpri) {
1170 			setkpdq(tp, SETKP_BACK);
1171 			return;
1172 		}
1173 		/*
1174 		 * Let cpu_choose suggest a CPU.
1175 		 */
1176 		cp = cpu_choose(tp, tpri);
1177 
1178 		if (tp->t_cpupart == cp->cpu_part) {
1179 			int	qlen;
1180 
1181 			/*
1182 			 * Select another CPU if we need
1183 			 * to do some load balancing across the
1184 			 * physical processors.
1185 			 */
1186 			if (CHIP_SHOULD_BALANCE(cp->cpu_chip))
1187 				cp = chip_balance(tp, cp, curchip);
1188 
1189 			/*
1190 			 * Balance across the run queues
1191 			 */
1192 			qlen = RUNQ_LEN(cp, tpri);
1193 			if (tpri >= RUNQ_MATCH_PRI &&
1194 			    !(tp->t_schedflag & TS_RUNQMATCH))
1195 				qlen -= RUNQ_MAX_DIFF;
1196 			if (qlen > 0) {
1197 				cpu_t	*np;
1198 
1199 				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
1200 					np = cp->cpu_next_part;
1201 				else {
1202 					if ((np = cp->cpu_next_lpl) == cp)
1203 						np = cp->cpu_next_part;
1204 				}
1205 				if (RUNQ_LEN(np, tpri) < qlen)
1206 					cp = np;
1207 			}
1208 		} else {
1209 			/*
1210 			 * Migrate to a cpu in the new partition.
1211 			 */
1212 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1213 			    tp->t_lpl, tp->t_pri, NULL);
1214 		}
1215 		bound = 0;
1216 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1217 	} else {
1218 		/*
1219 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1220 		 * a short time until weak binding that existed when the
1221 		 * strong binding was established has dropped) so we must
1222 		 * favour weak binding over strong.
1223 		 */
1224 		cp = tp->t_weakbound_cpu ?
1225 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1226 		bound = 1;
1227 	}
1228 	dp = cp->cpu_disp;
1229 	disp_lock_enter_high(&dp->disp_lock);
1230 
1231 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1232 	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1233 		tpri, cp, tp);
1234 
1235 #ifndef NPROBE
1236 	/* Kernel probe */
1237 	if (tnf_tracing_active)
1238 		tnf_thread_queue(tp, cp, tpri);
1239 #endif /* NPROBE */
1240 
1241 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1242 
1243 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1244 	tp->t_disp_queue = dp;
1245 	tp->t_link = NULL;
1246 
1247 	dq = &dp->disp_q[tpri];
1248 	dp->disp_nrunnable++;
1249 	membar_enter();
1250 
1251 	if (dq->dq_sruncnt++ != 0) {
1252 		ASSERT(dq->dq_first != NULL);
1253 		dq->dq_last->t_link = tp;
1254 		dq->dq_last = tp;
1255 	} else {
1256 		ASSERT(dq->dq_first == NULL);
1257 		ASSERT(dq->dq_last == NULL);
1258 		dq->dq_first = dq->dq_last = tp;
1259 		BT_SET(dp->disp_qactmap, tpri);
1260 		if (tpri > dp->disp_maxrunpri) {
1261 			dp->disp_maxrunpri = tpri;
1262 			membar_enter();
1263 			cpu_resched(cp, tpri);
1264 		}
1265 	}
1266 
1267 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1268 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1269 		    cp == CPU) {
1270 			/*
1271 			 * If there are no other unbound threads on the
1272 			 * run queue, don't allow other CPUs to steal
1273 			 * this thread while we are in the middle of a
1274 			 * context switch. We may just switch to it
1275 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1276 			 * in swtch and swtch_to.
1277 			 */
1278 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1279 		}
1280 		dp->disp_max_unbound_pri = tpri;
1281 	}
1282 	(*disp_enq_thread)(cp, bound);
1283 }
1284 
1285 /*
1286  * Put the specified thread on the front of the dispatcher
1287  * queue corresponding to its current priority.
1288  *
1289  * Called with the thread in transition, onproc or stopped state
1290  * and locked (transition implies locked) and at high spl.
1291  * Returns with the thread in TS_RUN state and still locked.
1292  */
1293 void
1294 setfrontdq(kthread_t *tp)
1295 {
1296 	disp_t		*dp;
1297 	dispq_t		*dq;
1298 	cpu_t		*cp;
1299 	pri_t		tpri;
1300 	int		bound;
1301 
1302 	ASSERT(THREAD_LOCK_HELD(tp));
1303 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1304 
1305 	if (tp->t_waitrq == 0) {
1306 		hrtime_t curtime;
1307 
1308 		curtime = gethrtime_unscaled();
1309 		(void) cpu_update_pct(tp, curtime);
1310 		tp->t_waitrq = curtime;
1311 	} else {
1312 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1313 	}
1314 
1315 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1316 
1317 	/*
1318 	 * If thread is "swapped" or on the swap queue don't
1319 	 * queue it, but wake sched.
1320 	 */
1321 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1322 		disp_swapped_setrun(tp);
1323 		return;
1324 	}
1325 
1326 	tpri = DISP_PRIO(tp);
1327 	if (ncpus == 1)
1328 		cp = tp->t_cpu;
1329 	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
1330 		if (tpri >= kpqpri) {
1331 			setkpdq(tp, SETKP_FRONT);
1332 			return;
1333 		}
1334 		cp = tp->t_cpu;
1335 		if (tp->t_cpupart == cp->cpu_part) {
1336 			/*
1337 			 * If we are of higher or equal priority than
1338 			 * the highest priority runnable thread of
1339 			 * the current CPU, just pick this CPU.  Otherwise
1340 			 * Let cpu_choose() select the CPU.  If this cpu
1341 			 * is the target of an offline request then do not
1342 			 * pick it - a thread_nomigrate() on the in motion
1343 			 * cpu relies on this when it forces a preempt.
1344 			 */
1345 			if (tpri < cp->cpu_disp->disp_maxrunpri ||
1346 			    cp == cpu_inmotion)
1347 				cp = cpu_choose(tp, tpri);
1348 		} else {
1349 			/*
1350 			 * Migrate to a cpu in the new partition.
1351 			 */
1352 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1353 			    tp->t_lpl, tp->t_pri, NULL);
1354 		}
1355 		bound = 0;
1356 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1357 	} else {
1358 		/*
1359 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1360 		 * a short time until weak binding that existed when the
1361 		 * strong binding was established has dropped) so we must
1362 		 * favour weak binding over strong.
1363 		 */
1364 		cp = tp->t_weakbound_cpu ?
1365 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1366 		bound = 1;
1367 	}
1368 	dp = cp->cpu_disp;
1369 	disp_lock_enter_high(&dp->disp_lock);
1370 
1371 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1372 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1373 
1374 #ifndef NPROBE
1375 	/* Kernel probe */
1376 	if (tnf_tracing_active)
1377 		tnf_thread_queue(tp, cp, tpri);
1378 #endif /* NPROBE */
1379 
1380 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1381 
1382 	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
1383 	tp->t_disp_queue = dp;
1384 
1385 	dq = &dp->disp_q[tpri];
1386 	dp->disp_nrunnable++;
1387 	membar_enter();
1388 
1389 	if (dq->dq_sruncnt++ != 0) {
1390 		ASSERT(dq->dq_last != NULL);
1391 		tp->t_link = dq->dq_first;
1392 		dq->dq_first = tp;
1393 	} else {
1394 		ASSERT(dq->dq_last == NULL);
1395 		ASSERT(dq->dq_first == NULL);
1396 		tp->t_link = NULL;
1397 		dq->dq_first = dq->dq_last = tp;
1398 		BT_SET(dp->disp_qactmap, tpri);
1399 		if (tpri > dp->disp_maxrunpri) {
1400 			dp->disp_maxrunpri = tpri;
1401 			membar_enter();
1402 			cpu_resched(cp, tpri);
1403 		}
1404 	}
1405 
1406 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1407 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1408 		    cp == CPU) {
1409 			/*
1410 			 * If there are no other unbound threads on the
1411 			 * run queue, don't allow other CPUs to steal
1412 			 * this thread while we are in the middle of a
1413 			 * context switch. We may just switch to it
1414 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1415 			 * in swtch and swtch_to.
1416 			 */
1417 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1418 		}
1419 		dp->disp_max_unbound_pri = tpri;
1420 	}
1421 	(*disp_enq_thread)(cp, bound);
1422 }
1423 
1424 /*
1425  * Put a high-priority unbound thread on the kp queue
1426  */
1427 static void
1428 setkpdq(kthread_t *tp, int borf)
1429 {
1430 	dispq_t	*dq;
1431 	disp_t	*dp;
1432 	cpu_t	*cp;
1433 	pri_t	tpri;
1434 
1435 	tpri = DISP_PRIO(tp);
1436 
1437 	dp = &tp->t_cpupart->cp_kp_queue;
1438 	disp_lock_enter_high(&dp->disp_lock);
1439 
1440 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1441 
1442 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1443 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1444 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1445 	tp->t_disp_queue = dp;
1446 	dp->disp_nrunnable++;
1447 	dq = &dp->disp_q[tpri];
1448 
1449 	if (dq->dq_sruncnt++ != 0) {
1450 		if (borf == SETKP_BACK) {
1451 			ASSERT(dq->dq_first != NULL);
1452 			tp->t_link = NULL;
1453 			dq->dq_last->t_link = tp;
1454 			dq->dq_last = tp;
1455 		} else {
1456 			ASSERT(dq->dq_last != NULL);
1457 			tp->t_link = dq->dq_first;
1458 			dq->dq_first = tp;
1459 		}
1460 	} else {
1461 		if (borf == SETKP_BACK) {
1462 			ASSERT(dq->dq_first == NULL);
1463 			ASSERT(dq->dq_last == NULL);
1464 			dq->dq_first = dq->dq_last = tp;
1465 		} else {
1466 			ASSERT(dq->dq_last == NULL);
1467 			ASSERT(dq->dq_first == NULL);
1468 			tp->t_link = NULL;
1469 			dq->dq_first = dq->dq_last = tp;
1470 		}
1471 		BT_SET(dp->disp_qactmap, tpri);
1472 		if (tpri > dp->disp_max_unbound_pri)
1473 			dp->disp_max_unbound_pri = tpri;
1474 		if (tpri > dp->disp_maxrunpri) {
1475 			dp->disp_maxrunpri = tpri;
1476 			membar_enter();
1477 		}
1478 	}
1479 
1480 	cp = tp->t_cpu;
1481 	if (tp->t_cpupart != cp->cpu_part) {
1482 		/* migrate to a cpu in the new partition */
1483 		cp = tp->t_cpupart->cp_cpulist;
1484 	}
1485 	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1486 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1487 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1488 
1489 #ifndef NPROBE
1490 	/* Kernel probe */
1491 	if (tnf_tracing_active)
1492 		tnf_thread_queue(tp, cp, tpri);
1493 #endif /* NPROBE */
1494 
1495 	if (cp->cpu_chosen_level < tpri)
1496 		cp->cpu_chosen_level = tpri;
1497 	cpu_resched(cp, tpri);
1498 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1499 	(*disp_enq_thread)(cp, 0);
1500 }
1501 
1502 /*
1503  * Remove a thread from the dispatcher queue if it is on it.
1504  * It is not an error if it is not found but we return whether
1505  * or not it was found in case the caller wants to check.
1506  */
1507 int
1508 dispdeq(kthread_t *tp)
1509 {
1510 	disp_t		*dp;
1511 	dispq_t		*dq;
1512 	kthread_t	*rp;
1513 	kthread_t	*trp;
1514 	kthread_t	**ptp;
1515 	int		tpri;
1516 
1517 	ASSERT(THREAD_LOCK_HELD(tp));
1518 
1519 	if (tp->t_state != TS_RUN)
1520 		return (0);
1521 
1522 	/*
1523 	 * The thread is "swapped" or is on the swap queue and
1524 	 * hence no longer on the run queue, so return true.
1525 	 */
1526 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1527 		return (1);
1528 
1529 	tpri = DISP_PRIO(tp);
1530 	dp = tp->t_disp_queue;
1531 	ASSERT(tpri < dp->disp_npri);
1532 	dq = &dp->disp_q[tpri];
1533 	ptp = &dq->dq_first;
1534 	rp = *ptp;
1535 	trp = NULL;
1536 
1537 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1538 
1539 	/*
1540 	 * Search for thread in queue.
1541 	 * Double links would simplify this at the expense of disp/setrun.
1542 	 */
1543 	while (rp != tp && rp != NULL) {
1544 		trp = rp;
1545 		ptp = &trp->t_link;
1546 		rp = trp->t_link;
1547 	}
1548 
1549 	if (rp == NULL) {
1550 		panic("dispdeq: thread not on queue");
1551 	}
1552 
1553 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1554 
1555 	/*
1556 	 * Found it so remove it from queue.
1557 	 */
1558 	if ((*ptp = rp->t_link) == NULL)
1559 		dq->dq_last = trp;
1560 
1561 	dp->disp_nrunnable--;
1562 	if (--dq->dq_sruncnt == 0) {
1563 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1564 		if (dp->disp_nrunnable == 0) {
1565 			dp->disp_max_unbound_pri = -1;
1566 			dp->disp_maxrunpri = -1;
1567 		} else if (tpri == dp->disp_maxrunpri) {
1568 			int ipri;
1569 
1570 			ipri = bt_gethighbit(dp->disp_qactmap,
1571 			    dp->disp_maxrunpri >> BT_ULSHIFT);
1572 			if (ipri < dp->disp_max_unbound_pri)
1573 				dp->disp_max_unbound_pri = ipri;
1574 			dp->disp_maxrunpri = ipri;
1575 		}
1576 	}
1577 	tp->t_link = NULL;
1578 	THREAD_TRANSITION(tp);		/* put in intermediate state */
1579 	return (1);
1580 }
1581 
1582 
1583 /*
1584  * dq_sruninc and dq_srundec are public functions for
1585  * incrementing/decrementing the sruncnts when a thread on
1586  * a dispatcher queue is made schedulable/unschedulable by
1587  * resetting the TS_LOAD flag.
1588  *
1589  * The caller MUST have the thread lock and therefore the dispatcher
1590  * queue lock so that the operation which changes
1591  * the flag, the operation that checks the status of the thread to
1592  * determine if it's on a disp queue AND the call to this function
1593  * are one atomic operation with respect to interrupts.
1594  */
1595 
1596 /*
1597  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1598  */
1599 void
1600 dq_sruninc(kthread_t *t)
1601 {
1602 	ASSERT(t->t_state == TS_RUN);
1603 	ASSERT(t->t_schedflag & TS_LOAD);
1604 
1605 	THREAD_TRANSITION(t);
1606 	setfrontdq(t);
1607 }
1608 
1609 /*
1610  * See comment on calling conventions above.
1611  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1612  */
1613 void
1614 dq_srundec(kthread_t *t)
1615 {
1616 	ASSERT(t->t_schedflag & TS_LOAD);
1617 
1618 	(void) dispdeq(t);
1619 	disp_swapped_enq(t);
1620 }
1621 
1622 /*
1623  * Change the dispatcher lock of thread to the "swapped_lock"
1624  * and return with thread lock still held.
1625  *
1626  * Called with thread_lock held, in transition state, and at high spl.
1627  */
1628 void
1629 disp_swapped_enq(kthread_t *tp)
1630 {
1631 	ASSERT(THREAD_LOCK_HELD(tp));
1632 	ASSERT(tp->t_schedflag & TS_LOAD);
1633 
1634 	switch (tp->t_state) {
1635 	case TS_RUN:
1636 		disp_lock_enter_high(&swapped_lock);
1637 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1638 		break;
1639 	case TS_ONPROC:
1640 		disp_lock_enter_high(&swapped_lock);
1641 		THREAD_TRANSITION(tp);
1642 		wake_sched_sec = 1;		/* tell clock to wake sched */
1643 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1644 		break;
1645 	default:
1646 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1647 	}
1648 }
1649 
1650 /*
1651  * This routine is called by setbackdq/setfrontdq if the thread is
1652  * not loaded or loaded and on the swap queue.
1653  *
1654  * Thread state TS_SLEEP implies that a swapped thread
1655  * has been woken up and needs to be swapped in by the swapper.
1656  *
1657  * Thread state TS_RUN, it implies that the priority of a swapped
1658  * thread is being increased by scheduling class (e.g. ts_update).
1659  */
1660 static void
1661 disp_swapped_setrun(kthread_t *tp)
1662 {
1663 	ASSERT(THREAD_LOCK_HELD(tp));
1664 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1665 
1666 	switch (tp->t_state) {
1667 	case TS_SLEEP:
1668 		disp_lock_enter_high(&swapped_lock);
1669 		/*
1670 		 * Wakeup sched immediately (i.e., next tick) if the
1671 		 * thread priority is above maxclsyspri.
1672 		 */
1673 		if (DISP_PRIO(tp) > maxclsyspri)
1674 			wake_sched = 1;
1675 		else
1676 			wake_sched_sec = 1;
1677 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1678 		break;
1679 	case TS_RUN:				/* called from ts_update */
1680 		break;
1681 	default:
1682 		panic("disp_swapped_setrun: tp: %p bad t_state", tp);
1683 	}
1684 }
1685 
1686 
1687 /*
1688  *	Make a thread give up its processor.  Find the processor on
1689  *	which this thread is executing, and have that processor
1690  *	preempt.
1691  */
1692 void
1693 cpu_surrender(kthread_t *tp)
1694 {
1695 	cpu_t	*cpup;
1696 	int	max_pri;
1697 	int	max_run_pri;
1698 	klwp_t	*lwp;
1699 
1700 	ASSERT(THREAD_LOCK_HELD(tp));
1701 
1702 	if (tp->t_state != TS_ONPROC)
1703 		return;
1704 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
1705 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1706 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1707 	if (max_pri < max_run_pri)
1708 		max_pri = max_run_pri;
1709 
1710 	cpup->cpu_runrun = 1;
1711 	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1712 		cpup->cpu_kprunrun = 1;
1713 	}
1714 
1715 	/*
1716 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1717 	 */
1718 	membar_enter();
1719 
1720 	DTRACE_SCHED1(surrender, kthread_t *, tp);
1721 
1722 	/*
1723 	 * Make the target thread take an excursion through trap()
1724 	 * to do preempt() (unless we're already in trap or post_syscall,
1725 	 * calling cpu_surrender via CL_TRAPRET).
1726 	 */
1727 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1728 	    lwp->lwp_state != LWP_USER) {
1729 		aston(tp);
1730 		if (cpup != CPU)
1731 			poke_cpu(cpup->cpu_id);
1732 	}
1733 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1734 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
1735 }
1736 
1737 
1738 /*
1739  * Commit to and ratify a scheduling decision
1740  */
1741 /*ARGSUSED*/
1742 static kthread_t *
1743 disp_ratify(kthread_t *tp, disp_t *kpq)
1744 {
1745 	pri_t	tpri, maxpri;
1746 	pri_t	maxkpri;
1747 	cpu_t	*cpup;
1748 
1749 	ASSERT(tp != NULL);
1750 	/*
1751 	 * Commit to, then ratify scheduling decision
1752 	 */
1753 	cpup = CPU;
1754 	if (cpup->cpu_runrun != 0)
1755 		cpup->cpu_runrun = 0;
1756 	if (cpup->cpu_kprunrun != 0)
1757 		cpup->cpu_kprunrun = 0;
1758 	if (cpup->cpu_chosen_level != -1)
1759 		cpup->cpu_chosen_level = -1;
1760 	membar_enter();
1761 	tpri = DISP_PRIO(tp);
1762 	maxpri = cpup->cpu_disp->disp_maxrunpri;
1763 	maxkpri = kpq->disp_maxrunpri;
1764 	if (maxpri < maxkpri)
1765 		maxpri = maxkpri;
1766 	if (tpri < maxpri) {
1767 		/*
1768 		 * should have done better
1769 		 * put this one back and indicate to try again
1770 		 */
1771 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
1772 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1773 		thread_lock_high(tp);
1774 		THREAD_TRANSITION(tp);
1775 		setfrontdq(tp);
1776 		thread_unlock_nopreempt(tp);
1777 
1778 		tp = NULL;
1779 	}
1780 	return (tp);
1781 }
1782 
1783 /*
1784  * See if there is any work on the dispatcher queue for other CPUs.
1785  * If there is, dequeue the best thread and return.
1786  */
1787 static kthread_t *
1788 disp_getwork(cpu_t *cp)
1789 {
1790 	cpu_t		*ocp;		/* other CPU */
1791 	cpu_t		*ocp_start;
1792 	cpu_t		*tcp;		/* target local CPU */
1793 	kthread_t	*tp;
1794 	pri_t		maxpri;
1795 	int		s;
1796 	disp_t		*kpq;		/* kp queue for this partition */
1797 	lpl_t		*lpl, *lpl_leaf;
1798 	int		hint, leafidx;
1799 
1800 	maxpri = -1;
1801 	tcp = NULL;
1802 
1803 	kpq = &cp->cpu_part->cp_kp_queue;
1804 	while (kpq->disp_maxrunpri >= 0) {
1805 		/*
1806 		 * Try to take a thread from the kp_queue.
1807 		 */
1808 		tp = (disp_getbest(kpq));
1809 		if (tp)
1810 			return (disp_ratify(tp, kpq));
1811 	}
1812 
1813 	s = splhigh();		/* protect the cpu_active list */
1814 
1815 	/*
1816 	 * Try to find something to do on another CPU's run queue.
1817 	 * Loop through all other CPUs looking for the one with the highest
1818 	 * priority unbound thread.
1819 	 *
1820 	 * On NUMA machines, the partition's CPUs are consulted in order of
1821 	 * distance from the current CPU. This way, the first available
1822 	 * work found is also the closest, and will suffer the least
1823 	 * from being migrated.
1824 	 */
1825 	lpl = lpl_leaf = cp->cpu_lpl;
1826 	hint = leafidx = 0;
1827 
1828 	/*
1829 	 * This loop traverses the lpl hierarchy. Higher level lpls represent
1830 	 * broader levels of locality
1831 	 */
1832 	do {
1833 		/* This loop iterates over the lpl's leaves */
1834 		do {
1835 			if (lpl_leaf != cp->cpu_lpl)
1836 				ocp = lpl_leaf->lpl_cpus;
1837 			else
1838 				ocp = cp->cpu_next_lpl;
1839 
1840 			/* This loop iterates over the CPUs in the leaf */
1841 			ocp_start = ocp;
1842 			do {
1843 				pri_t pri;
1844 
1845 				ASSERT(CPU_ACTIVE(ocp));
1846 
1847 				/*
1848 				 * End our stroll around the partition if:
1849 				 *
1850 				 * - Something became runnable on the local
1851 				 *	queue
1852 				 *
1853 				 * - We're at the broadest level of locality and
1854 				 *   we happen across another idle CPU. At the
1855 				 *   highest level of locality, all CPUs will
1856 				 *   walk the partition's CPUs in the same
1857 				 *   order, so we can end our stroll taking
1858 				 *   comfort in knowing the other idle CPU is
1859 				 *   already covering the next portion of the
1860 				 *   list.
1861 				 */
1862 				if (cp->cpu_disp->disp_nrunnable != 0)
1863 					break;
1864 				if (ocp->cpu_dispatch_pri == -1) {
1865 					if (ocp->cpu_disp_flags &
1866 					    CPU_DISP_HALTED)
1867 						continue;
1868 					else if (lpl->lpl_parent == NULL)
1869 						break;
1870 				}
1871 
1872 				/*
1873 				 * If there's only one thread and the CPU
1874 				 * is in the middle of a context switch,
1875 				 * or it's currently running the idle thread,
1876 				 * don't steal it.
1877 				 */
1878 				if ((ocp->cpu_disp_flags &
1879 					CPU_DISP_DONTSTEAL) &&
1880 				    ocp->cpu_disp->disp_nrunnable == 1)
1881 					continue;
1882 
1883 				pri = ocp->cpu_disp->disp_max_unbound_pri;
1884 				if (pri > maxpri) {
1885 					maxpri = pri;
1886 					tcp = ocp;
1887 				}
1888 			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
1889 
1890 			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
1891 				leafidx = 0;
1892 				lpl_leaf = lpl->lpl_rset[leafidx];
1893 			}
1894 		} while (leafidx != hint);
1895 
1896 		hint = leafidx = lpl->lpl_hint;
1897 		if ((lpl = lpl->lpl_parent) != NULL)
1898 			lpl_leaf = lpl->lpl_rset[hint];
1899 	} while (!tcp && lpl);
1900 
1901 	splx(s);
1902 
1903 	/*
1904 	 * If another queue looks good, and there is still nothing on
1905 	 * the local queue, try to transfer one or more threads
1906 	 * from it to our queue.
1907 	 */
1908 	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
1909 		tp = (disp_getbest(tcp->cpu_disp));
1910 		if (tp)
1911 			return (disp_ratify(tp, kpq));
1912 	}
1913 	return (NULL);
1914 }
1915 
1916 
1917 /*
1918  * disp_fix_unbound_pri()
1919  *	Determines the maximum priority of unbound threads on the queue.
1920  *	The priority is kept for the queue, but is only increased, never
1921  *	reduced unless some CPU is looking for something on that queue.
1922  *
1923  *	The priority argument is the known upper limit.
1924  *
1925  *	Perhaps this should be kept accurately, but that probably means
1926  *	separate bitmaps for bound and unbound threads.  Since only idled
1927  *	CPUs will have to do this recalculation, it seems better this way.
1928  */
1929 static void
1930 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
1931 {
1932 	kthread_t	*tp;
1933 	dispq_t		*dq;
1934 	ulong_t		*dqactmap = dp->disp_qactmap;
1935 	ulong_t		mapword;
1936 	int		wx;
1937 
1938 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
1939 
1940 	ASSERT(pri >= 0);			/* checked by caller */
1941 
1942 	/*
1943 	 * Start the search at the next lowest priority below the supplied
1944 	 * priority.  This depends on the bitmap implementation.
1945 	 */
1946 	do {
1947 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
1948 
1949 		/*
1950 		 * Form mask for all lower priorities in the word.
1951 		 */
1952 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
1953 
1954 		/*
1955 		 * Get next lower active priority.
1956 		 */
1957 		if (mapword != 0) {
1958 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
1959 		} else if (wx > 0) {
1960 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
1961 			if (pri < 0)
1962 				break;
1963 		} else {
1964 			pri = -1;
1965 			break;
1966 		}
1967 
1968 		/*
1969 		 * Search the queue for unbound, runnable threads.
1970 		 */
1971 		dq = &dp->disp_q[pri];
1972 		tp = dq->dq_first;
1973 
1974 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
1975 			tp = tp->t_link;
1976 		}
1977 
1978 		/*
1979 		 * If a thread was found, set the priority and return.
1980 		 */
1981 	} while (tp == NULL);
1982 
1983 	/*
1984 	 * pri holds the maximum unbound thread priority or -1.
1985 	 */
1986 	if (dp->disp_max_unbound_pri != pri)
1987 		dp->disp_max_unbound_pri = pri;
1988 }
1989 
1990 /*
1991  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
1992  * 	check if the CPU to which is was previously bound should have
1993  * 	its disp_max_unbound_pri increased.
1994  */
1995 void
1996 disp_adjust_unbound_pri(kthread_t *tp)
1997 {
1998 	disp_t *dp;
1999 	pri_t tpri;
2000 
2001 	ASSERT(THREAD_LOCK_HELD(tp));
2002 
2003 	/*
2004 	 * Don't do anything if the thread is not bound, or
2005 	 * currently not runnable or swapped out.
2006 	 */
2007 	if (tp->t_bound_cpu == NULL ||
2008 	    tp->t_state != TS_RUN ||
2009 	    tp->t_schedflag & TS_ON_SWAPQ)
2010 		return;
2011 
2012 	tpri = DISP_PRIO(tp);
2013 	dp = tp->t_bound_cpu->cpu_disp;
2014 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2015 	if (tpri > dp->disp_max_unbound_pri)
2016 		dp->disp_max_unbound_pri = tpri;
2017 }
2018 
2019 /*
2020  * disp_getbest() - de-queue the highest priority unbound runnable thread.
2021  *	returns with the thread unlocked and onproc
2022  *	but at splhigh (like disp()).
2023  *	returns NULL if nothing found.
2024  *
2025  *	Passed a pointer to a dispatch queue not associated with this CPU.
2026  */
2027 static kthread_t *
2028 disp_getbest(disp_t *dp)
2029 {
2030 	kthread_t	*tp;
2031 	dispq_t		*dq;
2032 	pri_t		pri;
2033 	cpu_t		*cp;
2034 
2035 	disp_lock_enter(&dp->disp_lock);
2036 
2037 	/*
2038 	 * If there is nothing to run, or the CPU is in the middle of a
2039 	 * context switch of the only thread, return NULL.
2040 	 */
2041 	pri = dp->disp_max_unbound_pri;
2042 	if (pri == -1 ||
2043 		(dp->disp_cpu != NULL &&
2044 		    (dp->disp_cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2045 		dp->disp_cpu->cpu_disp->disp_nrunnable == 1)) {
2046 		disp_lock_exit_nopreempt(&dp->disp_lock);
2047 		return (NULL);
2048 	}
2049 
2050 	dq = &dp->disp_q[pri];
2051 	tp = dq->dq_first;
2052 
2053 	/*
2054 	 * Skip over bound threads.
2055 	 * Bound threads can be here even though disp_max_unbound_pri
2056 	 * indicated this level.  Besides, it not always accurate because it
2057 	 * isn't reduced until another CPU looks for work.
2058 	 * Note that tp could be NULL right away due to this.
2059 	 */
2060 	while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2061 		tp = tp->t_link;
2062 	}
2063 
2064 	/*
2065 	 * If there were no unbound threads on this queue, find the queue
2066 	 * where they are and then return NULL so that other CPUs will be
2067 	 * considered.
2068 	 */
2069 	if (tp == NULL) {
2070 		disp_fix_unbound_pri(dp, pri);
2071 		disp_lock_exit_nopreempt(&dp->disp_lock);
2072 		return (NULL);
2073 	}
2074 
2075 	/*
2076 	 * Found a runnable, unbound thread, so remove it from queue.
2077 	 * dispdeq() requires that we have the thread locked, and we do,
2078 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
2079 	 * put the thread in transition state, thereby dropping the dispq
2080 	 * lock.
2081 	 */
2082 #ifdef DEBUG
2083 	{
2084 		int	thread_was_on_queue;
2085 
2086 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
2087 		ASSERT(thread_was_on_queue);
2088 	}
2089 #else /* DEBUG */
2090 	(void) dispdeq(tp);			/* drops disp_lock */
2091 #endif /* DEBUG */
2092 
2093 	tp->t_schedflag |= TS_DONT_SWAP;
2094 
2095 	/*
2096 	 * Setup thread to run on the current CPU.
2097 	 */
2098 	cp = CPU;
2099 
2100 	tp->t_disp_queue = cp->cpu_disp;
2101 
2102 	cp->cpu_dispthread = tp;		/* protected by spl only */
2103 	cp->cpu_dispatch_pri = pri;
2104 	ASSERT(pri == DISP_PRIO(tp));
2105 
2106 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
2107 
2108 	/*
2109 	 * Return with spl high so that swtch() won't need to raise it.
2110 	 * The disp_lock was dropped by dispdeq().
2111 	 */
2112 
2113 	return (tp);
2114 }
2115 
2116 /*
2117  * disp_bound_common() - common routine for higher level functions
2118  *	that check for bound threads under certain conditions.
2119  *	If 'threadlistsafe' is set then there is no need to acquire
2120  *	pidlock to stop the thread list from changing (eg, if
2121  *	disp_bound_* is called with cpus paused).
2122  */
2123 static int
2124 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2125 {
2126 	int		found = 0;
2127 	kthread_t	*tp;
2128 
2129 	ASSERT(flag);
2130 
2131 	if (!threadlistsafe)
2132 		mutex_enter(&pidlock);
2133 	tp = curthread;		/* faster than allthreads */
2134 	do {
2135 		if (tp->t_state != TS_FREE) {
2136 			/*
2137 			 * If an interrupt thread is busy, but the
2138 			 * caller doesn't care (i.e. BOUND_INTR is off),
2139 			 * then just ignore it and continue through.
2140 			 */
2141 			if ((tp->t_flag & T_INTR_THREAD) &&
2142 			    !(flag & BOUND_INTR))
2143 				continue;
2144 
2145 			/*
2146 			 * Skip the idle thread for the CPU
2147 			 * we're about to set offline.
2148 			 */
2149 			if (tp == cp->cpu_idle_thread)
2150 				continue;
2151 
2152 			/*
2153 			 * Skip the pause thread for the CPU
2154 			 * we're about to set offline.
2155 			 */
2156 			if (tp == cp->cpu_pause_thread)
2157 				continue;
2158 
2159 			if ((flag & BOUND_CPU) &&
2160 			    (tp->t_bound_cpu == cp ||
2161 			    tp->t_bind_cpu == cp->cpu_id ||
2162 			    tp->t_weakbound_cpu == cp)) {
2163 				found = 1;
2164 				break;
2165 			}
2166 
2167 			if ((flag & BOUND_PARTITION) &&
2168 			    (tp->t_cpupart == cp->cpu_part)) {
2169 				found = 1;
2170 				break;
2171 			}
2172 		}
2173 	} while ((tp = tp->t_next) != curthread && found == 0);
2174 	if (!threadlistsafe)
2175 		mutex_exit(&pidlock);
2176 	return (found);
2177 }
2178 
2179 /*
2180  * disp_bound_threads - return nonzero if threads are bound to the processor.
2181  *	Called infrequently.  Keep this simple.
2182  *	Includes threads that are asleep or stopped but not onproc.
2183  */
2184 int
2185 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2186 {
2187 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2188 }
2189 
2190 /*
2191  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2192  * to the given processor, including interrupt threads.
2193  */
2194 int
2195 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2196 {
2197 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2198 }
2199 
2200 /*
2201  * disp_bound_partition - return nonzero if threads are bound to the same
2202  * partition as the processor.
2203  *	Called infrequently.  Keep this simple.
2204  *	Includes threads that are asleep or stopped but not onproc.
2205  */
2206 int
2207 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2208 {
2209 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2210 }
2211 
2212 /*
2213  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2214  * threads to other CPUs.
2215  */
2216 void
2217 disp_cpu_inactive(cpu_t *cp)
2218 {
2219 	kthread_t	*tp;
2220 	disp_t		*dp = cp->cpu_disp;
2221 	dispq_t		*dq;
2222 	pri_t		pri;
2223 	int		wasonq;
2224 
2225 	disp_lock_enter(&dp->disp_lock);
2226 	while ((pri = dp->disp_max_unbound_pri) != -1) {
2227 		dq = &dp->disp_q[pri];
2228 		tp = dq->dq_first;
2229 
2230 		/*
2231 		 * Skip over bound threads.
2232 		 */
2233 		while (tp != NULL && tp->t_bound_cpu != NULL) {
2234 			tp = tp->t_link;
2235 		}
2236 
2237 		if (tp == NULL) {
2238 			/* disp_max_unbound_pri must be inaccurate, so fix it */
2239 			disp_fix_unbound_pri(dp, pri);
2240 			continue;
2241 		}
2242 
2243 		wasonq = dispdeq(tp);		/* drops disp_lock */
2244 		ASSERT(wasonq);
2245 		ASSERT(tp->t_weakbound_cpu == NULL);
2246 
2247 		setbackdq(tp);
2248 		/*
2249 		 * Called from cpu_offline:
2250 		 *
2251 		 * cp has already been removed from the list of active cpus
2252 		 * and tp->t_cpu has been changed so there is no risk of
2253 		 * tp ending up back on cp.
2254 		 *
2255 		 * Called from cpupart_move_cpu:
2256 		 *
2257 		 * The cpu has moved to a new cpupart.  Any threads that
2258 		 * were on it's dispatch queues before the move remain
2259 		 * in the old partition and can't run in the new partition.
2260 		 */
2261 		ASSERT(tp->t_cpu != cp);
2262 		thread_unlock(tp);
2263 
2264 		disp_lock_enter(&dp->disp_lock);
2265 	}
2266 	disp_lock_exit(&dp->disp_lock);
2267 }
2268 
2269 /*
2270  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2271  *	The hint passed in is used as a starting point so we don't favor
2272  *	CPU 0 or any other CPU.  The caller should pass in the most recently
2273  *	used CPU for the thread.
2274  *
2275  *	The lgroup and priority are used to determine the best CPU to run on
2276  *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
2277  *	the thread priority will indicate whether the thread will actually run
2278  *	there.  To pick the best CPU, the CPUs inside and outside of the given
2279  *	lgroup which are running the lowest priority threads are found.  The
2280  *	remote CPU is chosen only if the thread will not run locally on a CPU
2281  *	within the lgroup, but will run on the remote CPU. If the thread
2282  *	cannot immediately run on any CPU, the best local CPU will be chosen.
2283  *
2284  *	The lpl specified also identifies the cpu partition from which
2285  *	disp_lowpri_cpu should select a CPU.
2286  *
2287  *	curcpu is used to indicate that disp_lowpri_cpu is being called on
2288  *      behalf of the current thread. (curthread is looking for a new cpu)
2289  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2290  *      ignored.
2291  *
2292  *      If a cpu is the target of an offline request then try to avoid it.
2293  *
2294  *	This function must be called at either high SPL, or with preemption
2295  *	disabled, so that the "hint" CPU cannot be removed from the online
2296  *	CPU list while we are traversing it.
2297  */
2298 cpu_t *
2299 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2300 {
2301 	cpu_t	*bestcpu;
2302 	cpu_t	*besthomecpu;
2303 	cpu_t   *cp, *cpstart;
2304 
2305 	pri_t   bestpri;
2306 	pri_t   cpupri;
2307 
2308 	klgrpset_t	done;
2309 	klgrpset_t	cur_set;
2310 
2311 	lpl_t		*lpl_iter, *lpl_leaf;
2312 	int		i;
2313 
2314 	/*
2315 	 * Scan for a CPU currently running the lowest priority thread.
2316 	 * Cannot get cpu_lock here because it is adaptive.
2317 	 * We do not require lock on CPU list.
2318 	 */
2319 	ASSERT(hint != NULL);
2320 	ASSERT(lpl != NULL);
2321 	ASSERT(lpl->lpl_ncpu > 0);
2322 
2323 	/*
2324 	 * First examine local CPUs. Note that it's possible the hint CPU
2325 	 * passed in in remote to the specified home lgroup. If our priority
2326 	 * isn't sufficient enough such that we can run immediately at home,
2327 	 * then examine CPUs remote to our home lgroup.
2328 	 * We would like to give preference to CPUs closest to "home".
2329 	 * If we can't find a CPU where we'll run at a given level
2330 	 * of locality, we expand our search to include the next level.
2331 	 */
2332 	bestcpu = besthomecpu = NULL;
2333 	klgrpset_clear(done);
2334 	/* start with lpl we were passed */
2335 
2336 	lpl_iter = lpl;
2337 
2338 	do {
2339 
2340 		bestpri = SHRT_MAX;
2341 		klgrpset_clear(cur_set);
2342 
2343 		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2344 			lpl_leaf = lpl_iter->lpl_rset[i];
2345 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2346 				continue;
2347 
2348 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2349 
2350 			if (hint->cpu_lpl == lpl_leaf)
2351 				cp = cpstart = hint;
2352 			else
2353 				cp = cpstart = lpl_leaf->lpl_cpus;
2354 
2355 			do {
2356 
2357 				if (cp == curcpu)
2358 					cpupri = -1;
2359 				else if (cp == cpu_inmotion)
2360 					cpupri = SHRT_MAX;
2361 				else
2362 					cpupri = cp->cpu_dispatch_pri;
2363 
2364 				if (cp->cpu_disp->disp_maxrunpri > cpupri)
2365 					cpupri = cp->cpu_disp->disp_maxrunpri;
2366 				if (cp->cpu_chosen_level > cpupri)
2367 					cpupri = cp->cpu_chosen_level;
2368 				if (cpupri < bestpri) {
2369 					if (CPU_IDLING(cpupri)) {
2370 						ASSERT((cp->cpu_flags &
2371 						    CPU_QUIESCED) == 0);
2372 						return (cp);
2373 					}
2374 					bestcpu = cp;
2375 					bestpri = cpupri;
2376 				}
2377 			} while ((cp = cp->cpu_next_lpl) != cpstart);
2378 		}
2379 
2380 		if (bestcpu && (tpri > bestpri)) {
2381 			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2382 			return (bestcpu);
2383 		}
2384 		if (besthomecpu == NULL)
2385 			besthomecpu = bestcpu;
2386 		/*
2387 		 * Add the lgrps we just considered to the "done" set
2388 		 */
2389 		klgrpset_or(done, cur_set);
2390 
2391 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2392 
2393 	/*
2394 	 * The specified priority isn't high enough to run immediately
2395 	 * anywhere, so just return the best CPU from the home lgroup.
2396 	 */
2397 	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2398 	return (besthomecpu);
2399 }
2400 
/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor-specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 *
 * Takes no arguments, returns nothing.
 */
static void
generic_idle_cpu(void)
{
	/* Intentionally empty: the generic version does no work. */
}
2412 
/*
 * Generic no-op version of the enqueue-thread routine.  It accepts a CPU
 * and a 'bound' flag and ignores both (hence the ARGSUSED lint directive).
 * NOTE(review): presumably a default that processor-specific code can
 * replace, in the same way as generic_idle_cpu() -- confirm where it is
 * installed.
 */
/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
	/* Intentionally empty. */
}
2418 
2419 /*
2420  * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
2421  *	- t->t_cpu is not in this thread's assigned lgrp
2422  *	- the time since the thread last came off t->t_cpu exceeds the
2423  *	  rechoose time for this cpu (ignore this if t is curthread in
2424  *	  which case it's on CPU and t->t_disp_time is inaccurate)
2425  *	- t->t_cpu is presently the target of an offline or partition move
2426  *	  request
2427  */
2428 static cpu_t *
2429 cpu_choose(kthread_t *t, pri_t tpri)
2430 {
2431 	ASSERT(tpri < kpqpri);
2432 
2433 	if ((((lbolt - t->t_disp_time) > t->t_cpu->cpu_rechoose) &&
2434 	    t != curthread) || t->t_cpu == cpu_inmotion) {
2435 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
2436 	}
2437 
2438 	/*
2439 	 * Take a trip through disp_lowpri_cpu() if the thread was
2440 	 * running outside it's home lgroup
2441 	 */
2442 	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
2443 	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
2444 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
2445 		    (t == curthread) ? t->t_cpu : NULL));
2446 	}
2447 	return (t->t_cpu);
2448 }
2449