xref: /titanic_51/usr/src/uts/common/disp/disp.c (revision 193974072f41a843678abf5f61979c748687e66b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/signal.h>
34 #include <sys/user.h>
35 #include <sys/systm.h>
36 #include <sys/sysinfo.h>
37 #include <sys/var.h>
38 #include <sys/errno.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/inline.h>
42 #include <sys/disp.h>
43 #include <sys/class.h>
44 #include <sys/bitmap.h>
45 #include <sys/kmem.h>
46 #include <sys/cpuvar.h>
47 #include <sys/vtrace.h>
48 #include <sys/tnf.h>
49 #include <sys/cpupart.h>
50 #include <sys/lgrp.h>
51 #include <sys/pg.h>
52 #include <sys/cmt.h>
53 #include <sys/bitset.h>
54 #include <sys/schedctl.h>
55 #include <sys/atomic.h>
56 #include <sys/dtrace.h>
57 #include <sys/sdt.h>
58 #include <sys/archsystm.h>
59 
60 #include <vm/as.h>
61 
/*
 * Bit flags naming the ways a thread can be bound: to a CPU, to a CPU
 * partition, or as an interrupt thread (going by the names only; they are
 * not referenced in this part of the file -- NOTE(review): confirm against
 * the code that consumes them).
 */
62 #define	BOUND_CPU	0x1
63 #define	BOUND_PARTITION	0x2
64 #define	BOUND_INTR	0x4
65 
66 /* Dispatch queue allocation structure and functions */
/*
 * Scratch record used while a dispatch queue array is replaced by a larger
 * one.  disp_dq_alloc() fills in dp and the new* fields, disp_dq_assign()
 * saves the queue's current arrays in the old* fields and installs the new
 * ones, and disp_dq_free() releases whatever the old* fields reference.
 */
67 struct disp_queue_info {
68 	disp_t	*dp;		/* dispatch queue being resized */
69 	dispq_t *olddispq;	/* previous queue array, freed afterwards */
70 	dispq_t *newdispq;	/* replacement queue array */
71 	ulong_t	*olddqactmap;	/* previous active-queue bitmap */
72 	ulong_t	*newdqactmap;	/* replacement active-queue bitmap */
73 	int	oldnglobpris;	/* priority count that sized the old arrays */
74 };
75 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
76     disp_t *dp);
77 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
78 static void	disp_dq_free(struct disp_queue_info *dptr);
79 
80 /* platform-specific routine to call when processor is idle */
/* idle() calls through this pointer; it defaults to generic_idle_cpu. */
81 static void	generic_idle_cpu();
82 void		(*idle_cpu)() = generic_idle_cpu;
83 
84 /* routines invoked when a CPU enters/exits the idle loop */
85 static void	idle_enter();
86 static void	idle_exit();
87 
88 /* platform-specific routine to call when thread is enqueued */
/* Defaults to generic_enq_thread; the callers are outside this section. */
89 static void	generic_enq_thread(cpu_t *, int);
90 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
91 
/*
 * Preemption thresholds.  cpu_resched() compares the priority of a newly
 * runnable thread against upreemptpri and kpreemptpri to decide whether to
 * set cpu_runrun and/or cpu_kprunrun on the target CPU.  kpreemptpri is
 * computed in dispinit() and may be recomputed in disp_setup().
 */
92 pri_t	kpreemptpri;		/* priority where kernel preemption applies */
93 pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
94 pri_t	intr_pri;		/* interrupt thread priority base level */
95 
/*
 * KPQPRI is the "not tuned" sentinel for kpqpri: while kpqpri still equals
 * KPQPRI, dispinit() and disp_setup() overwrite it with kpreemptpri.
 */
96 #define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
97 pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
98 disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
99 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
100 int	nswapped;		/* total number of swapped threads */
101 void	disp_swapped_enq(kthread_t *tp);
102 static void	disp_swapped_setrun(kthread_t *tp);
103 static void	cpu_resched(cpu_t *cp, pri_t tpri);
104 
105 /*
106  * If this is set, only interrupt threads will cause kernel preemptions.
107  * This is done by changing the value of kpreemptpri.  kpreemptpri
108  * will either be the max sysclass pri + 1 or the min interrupt pri.
 *
 * dispinit() always starts with v.v_maxsyspri + 1; disp_setup() switches
 * to intr_pri + 1 when this flag is non-zero.
109  */
110 int	only_intr_kpreempt;
111 
/* Architecture hooks called from idle_enter() and idle_exit(). */
112 extern void set_idle_cpu(int cpun);
113 extern void unset_idle_cpu(int cpun);
/*
 * SETKP_BACK/SETKP_FRONT are presumably the values accepted by setkpdq()'s
 * borf ("back or front") argument -- NOTE(review): setkpdq() is defined
 * outside this section; confirm.
 */
114 static void setkpdq(kthread_t *tp, int borf);
115 #define	SETKP_BACK	0
116 #define	SETKP_FRONT	1
117 /*
118  * Parameter that determines how recently a thread must have run
119  * on the CPU to be considered loosely-bound to that CPU to reduce
120  * cold cache effects.  The interval is in hertz.
 *
 * NOTE(review): "hertz" presumably means clock ticks, since swtch() and
 * swtch_to() stamp t_disp_time with lbolt; confirm against cpu_choose().
121  */
122 #define	RECHOOSE_INTERVAL 3
123 int	rechoose_interval = RECHOOSE_INTERVAL;
124 static cpu_t	*cpu_choose(kthread_t *, pri_t);
125 
126 /*
127  * Parameter that determines how long a thread must be sitting on a
128  * run queue before it can be stolen by another CPU, to reduce
129  * migrations.  The interval is in nanoseconds.
130  *
131  * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
132  * to an appropriate value.  nosteal_nsec is set to NOSTEAL_UNINITIALIZED
133  * here indicating it is uninitialized; dispinit() calls
 * cmp_set_nosteal_interval() only while that sentinel is still in place.
134  * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
135  *
136  */
137 #define	NOSTEAL_UNINITIALIZED	(-1)
138 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
139 extern void cmp_set_nosteal_interval(void);
140 
/* Filled in by dispinit() through getcid(defaultclass, &defaultcid). */
141 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
142 
/*
 * dispinit() initializes both locks; it also enters transition_lock and
 * does not release it there ("will always be set").
 */
143 disp_lock_t	transition_lock;	/* lock on transitioning threads */
144 disp_lock_t	stop_lock;		/* lock on stopped threads */
145 
146 static void	cpu_dispqalloc(int numpris);
147 
148 /*
149  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
150  * a thread because it was sitting on its run queue for a very short
151  * period of time.
 *
 * It is a sentinel, not a real thread pointer: callers such as idle() and
 * disp() compare against it and must never dereference it.
152  */
153 #define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
154 
155 static kthread_t	*disp_getwork(cpu_t *to);
156 static kthread_t	*disp_getbest(disp_t *from);
157 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
158 
159 void	swtch_to(kthread_t *);
160 
161 /*
162  * dispatcher and scheduler initialization
163  */
164 
165 /*
166  * disp_setup - Common code to calculate and allocate dispatcher
167  *		variables and structures based on the maximum priority.
168  */
169 static void
170 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
171 {
172 	pri_t	newnglobpris;
173 
174 	ASSERT(MUTEX_HELD(&cpu_lock));
175 
176 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
177 
178 	if (newnglobpris > oldnglobpris) {
179 		/*
180 		 * Allocate new kp queues for each CPU partition.
181 		 */
182 		cpupart_kpqalloc(newnglobpris);
183 
184 		/*
185 		 * Allocate new dispatch queues for each CPU.
186 		 */
187 		cpu_dispqalloc(newnglobpris);
188 
189 		/*
190 		 * compute new interrupt thread base priority
191 		 */
192 		intr_pri = maxglobpri;
193 		if (only_intr_kpreempt) {
194 			kpreemptpri = intr_pri + 1;
195 			if (kpqpri == KPQPRI)
196 				kpqpri = kpreemptpri;
197 		}
198 		v.v_nglobpris = newnglobpris;
199 	}
200 }
201 
202 /*
203  * dispinit - Called to initialize all loaded classes and the
204  *	      dispatcher framework.
205  */
206 void
207 dispinit(void)
208 {
209 	id_t	cid;
210 	pri_t	maxglobpri;
211 	pri_t	cl_maxglobpri;
212 
213 	maxglobpri = -1;
214 
215 	/*
216 	 * Initialize transition lock, which will always be set.
217 	 */
218 	DISP_LOCK_INIT(&transition_lock);
219 	disp_lock_enter_high(&transition_lock);
220 	DISP_LOCK_INIT(&stop_lock);
221 
222 	mutex_enter(&cpu_lock);
223 	CPU->cpu_disp->disp_maxrunpri = -1;
224 	CPU->cpu_disp->disp_max_unbound_pri = -1;
225 
226 	/*
227 	 * Initialize the default CPU partition.
228 	 */
229 	cpupart_initialize_default();
230 	/*
231 	 * Call the class specific initialization functions for
232 	 * all pre-installed schedulers.
233 	 *
234 	 * We pass the size of a class specific parameter
235 	 * buffer to each of the initialization functions
236 	 * to try to catch problems with backward compatibility
237 	 * of class modules.
238 	 *
239 	 * For example a new class module running on an old system
240 	 * which didn't provide sufficiently large parameter buffers
241 	 * would be bad news. Class initialization modules can check for
242 	 * this and take action if they detect a problem.
243 	 */
244 
245 	for (cid = 0; cid < nclass; cid++) {
246 		sclass_t	*sc;
247 
248 		sc = &sclass[cid];
249 		if (SCHED_INSTALLED(sc)) {
250 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
251 			    &sc->cl_funcs);
252 			if (cl_maxglobpri > maxglobpri)
253 				maxglobpri = cl_maxglobpri;
254 		}
255 	}
256 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
257 	if (kpqpri == KPQPRI)
258 		kpqpri = kpreemptpri;
259 
260 	ASSERT(maxglobpri >= 0);
261 	disp_setup(maxglobpri, 0);
262 
263 	mutex_exit(&cpu_lock);
264 
265 	/*
266 	 * Platform specific sticky scheduler setup.
267 	 */
268 	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
269 		cmp_set_nosteal_interval();
270 
271 	/*
272 	 * Get the default class ID; this may be later modified via
273 	 * dispadmin(1M).  This will load the class (normally TS) and that will
274 	 * call disp_add(), which is why we had to drop cpu_lock first.
275 	 */
276 	if (getcid(defaultclass, &defaultcid) != 0) {
277 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
278 		    defaultclass);
279 	}
280 }
281 
282 /*
283  * disp_add - Called with class pointer to initialize the dispatcher
284  *	      for a newly loaded class.
285  */
286 void
287 disp_add(sclass_t *clp)
288 {
289 	pri_t	maxglobpri;
290 	pri_t	cl_maxglobpri;
291 
292 	mutex_enter(&cpu_lock);
293 	/*
294 	 * Initialize the scheduler class.
295 	 */
296 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
297 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
298 	if (cl_maxglobpri > maxglobpri)
299 		maxglobpri = cl_maxglobpri;
300 
301 	/*
302 	 * Save old queue information.  Since we're initializing a
303 	 * new scheduling class which has just been loaded, then
304 	 * the size of the dispq may have changed.  We need to handle
305 	 * that here.
306 	 */
307 	disp_setup(maxglobpri, v.v_nglobpris);
308 
309 	mutex_exit(&cpu_lock);
310 }
311 
312 
313 /*
314  * For each CPU, allocate new dispatch queues
315  * with the stated number of priorities.
316  */
317 static void
318 cpu_dispqalloc(int numpris)
319 {
320 	cpu_t	*cpup;
321 	struct disp_queue_info	*disp_mem;
322 	int i, num;
323 
324 	ASSERT(MUTEX_HELD(&cpu_lock));
325 
326 	disp_mem = kmem_zalloc(NCPU *
327 	    sizeof (struct disp_queue_info), KM_SLEEP);
328 
329 	/*
330 	 * This routine must allocate all of the memory before stopping
331 	 * the cpus because it must not sleep in kmem_alloc while the
332 	 * CPUs are stopped.  Locks they hold will not be freed until they
333 	 * are restarted.
334 	 */
335 	i = 0;
336 	cpup = cpu_list;
337 	do {
338 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
339 		i++;
340 		cpup = cpup->cpu_next;
341 	} while (cpup != cpu_list);
342 	num = i;
343 
344 	pause_cpus(NULL);
345 	for (i = 0; i < num; i++)
346 		disp_dq_assign(&disp_mem[i], numpris);
347 	start_cpus();
348 
349 	/*
350 	 * I must free all of the memory after starting the cpus because
351 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
352 	 */
353 	for (i = 0; i < num; i++)
354 		disp_dq_free(&disp_mem[i]);
355 
356 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
357 }
358 
359 static void
360 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
361 {
362 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
363 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
364 	    sizeof (long), KM_SLEEP);
365 	dptr->dp = dp;
366 }
367 
368 static void
369 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
370 {
371 	disp_t	*dp;
372 
373 	dp = dptr->dp;
374 	dptr->olddispq = dp->disp_q;
375 	dptr->olddqactmap = dp->disp_qactmap;
376 	dptr->oldnglobpris = dp->disp_npri;
377 
378 	ASSERT(dptr->oldnglobpris < numpris);
379 
380 	if (dptr->olddispq != NULL) {
381 		/*
382 		 * Use kcopy because bcopy is platform-specific
383 		 * and could block while we might have paused the cpus.
384 		 */
385 		(void) kcopy(dptr->olddispq, dptr->newdispq,
386 		    dptr->oldnglobpris * sizeof (dispq_t));
387 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
388 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
389 		    sizeof (long));
390 	}
391 	dp->disp_q = dptr->newdispq;
392 	dp->disp_qactmap = dptr->newdqactmap;
393 	dp->disp_q_limit = &dptr->newdispq[numpris];
394 	dp->disp_npri = numpris;
395 }
396 
397 static void
398 disp_dq_free(struct disp_queue_info *dptr)
399 {
400 	if (dptr->olddispq != NULL)
401 		kmem_free(dptr->olddispq,
402 		    dptr->oldnglobpris * sizeof (dispq_t));
403 	if (dptr->olddqactmap != NULL)
404 		kmem_free(dptr->olddqactmap,
405 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
406 }
407 
408 /*
409  * For a newly created CPU, initialize the dispatch queue.
410  * This is called before the CPU is known through cpu[] or on any lists.
411  */
412 void
413 disp_cpu_init(cpu_t *cp)
414 {
415 	disp_t	*dp;
416 	dispq_t	*newdispq;
417 	ulong_t	*newdqactmap;
418 
419 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
420 
421 	if (cp == cpu0_disp.disp_cpu)
422 		dp = &cpu0_disp;
423 	else
424 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
425 	bzero(dp, sizeof (disp_t));
426 	cp->cpu_disp = dp;
427 	dp->disp_cpu = cp;
428 	dp->disp_maxrunpri = -1;
429 	dp->disp_max_unbound_pri = -1;
430 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
431 	/*
432 	 * Allocate memory for the dispatcher queue headers
433 	 * and the active queue bitmap.
434 	 */
435 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
436 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
437 	    sizeof (long), KM_SLEEP);
438 	dp->disp_q = newdispq;
439 	dp->disp_qactmap = newdqactmap;
440 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
441 	dp->disp_npri = v.v_nglobpris;
442 }
443 
444 void
445 disp_cpu_fini(cpu_t *cp)
446 {
447 	ASSERT(MUTEX_HELD(&cpu_lock));
448 
449 	disp_kp_free(cp->cpu_disp);
450 	if (cp->cpu_disp != &cpu0_disp)
451 		kmem_free(cp->cpu_disp, sizeof (disp_t));
452 }
453 
454 /*
455  * Allocate new, larger kpreempt dispatch queue to replace the old one.
456  */
457 void
458 disp_kp_alloc(disp_t *dq, pri_t npri)
459 {
460 	struct disp_queue_info	mem_info;
461 
462 	if (npri > dq->disp_npri) {
463 		/*
464 		 * Allocate memory for the new array.
465 		 */
466 		disp_dq_alloc(&mem_info, npri, dq);
467 
468 		/*
469 		 * We need to copy the old structures to the new
470 		 * and free the old.
471 		 */
472 		disp_dq_assign(&mem_info, npri);
473 		disp_dq_free(&mem_info);
474 	}
475 }
476 
477 /*
478  * Free dispatch queue.
479  * Used for the kpreempt queues for a removed CPU partition and
480  * for the per-CPU queues of deleted CPUs.
481  */
482 void
483 disp_kp_free(disp_t *dq)
484 {
485 	struct disp_queue_info	mem_info;
486 
487 	mem_info.olddispq = dq->disp_q;
488 	mem_info.olddqactmap = dq->disp_qactmap;
489 	mem_info.oldnglobpris = dq->disp_npri;
490 	disp_dq_free(&mem_info);
491 }
492 
493 /*
494  * End dispatcher and scheduler initialization.
495  */
496 
497 /*
498  * See if there's anything to do other than remain idle.
499  * Return non-zero if there is.
500  *
501  * This function must be called with high spl, or with
502  * kernel preemption disabled to prevent the partition's
503  * active cpu list from changing while being traversed.
504  *
505  */
506 int
507 disp_anywork(void)
508 {
509 	cpu_t   *cp = CPU;
510 	cpu_t   *ocp;
511 
512 	if (cp->cpu_disp->disp_nrunnable != 0)
513 		return (1);
514 
515 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
516 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
517 			return (1);
518 
519 		/*
520 		 * Work can be taken from another CPU if:
521 		 *	- There is unbound work on the run queue
522 		 *	- That work isn't a thread undergoing a
523 		 *	- context switch on an otherwise empty queue.
524 		 *	- The CPU isn't running the idle loop.
525 		 */
526 		for (ocp = cp->cpu_next_part; ocp != cp;
527 		    ocp = ocp->cpu_next_part) {
528 			ASSERT(CPU_ACTIVE(ocp));
529 
530 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
531 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
532 			    ocp->cpu_disp->disp_nrunnable == 1) &&
533 			    ocp->cpu_dispatch_pri != -1)
534 				return (1);
535 		}
536 	}
537 	return (0);
538 }
539 
540 /*
541  * Called when CPU enters the idle loop
542  */
543 static void
544 idle_enter()
545 {
546 	cpu_t		*cp = CPU;
547 
548 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
549 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
550 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
551 }
552 
553 /*
554  * Called when CPU exits the idle loop
555  */
556 static void
557 idle_exit()
558 {
559 	cpu_t		*cp = CPU;
560 
561 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
562 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
563 }
564 
565 /*
566  * Idle loop.
 *
 * Never returns.  While ncpus == 1 only this CPU's own dispatch queue is
 * watched; afterwards the loop also asks disp_getwork() for a thread to
 * take.  Whenever something runnable is found the idle state is left with
 * idle_exit(), the CPU switches away via swtch() or swtch_to(), and
 * idle_enter() is called again once control comes back here.  Every
 * "continue" below skips that trailing idle_enter() on purpose, because
 * the CPU never left the idle state on those paths.
567  */
568 void
569 idle()
570 {
571 	struct cpu	*cp = CPU;		/* pointer to this CPU */
572 	kthread_t	*t;			/* taken thread */
573 
574 	idle_enter();
575 
576 	/*
577 	 * Uniprocessor version of idle loop.
578 	 * Do this until notified that we're on an actual multiprocessor.
579 	 */
580 	while (ncpus == 1) {
581 		if (cp->cpu_disp->disp_nrunnable == 0) {
582 			(*idle_cpu)();
583 			continue;
584 		}
585 		idle_exit();
586 		swtch();
587 
588 		idle_enter(); /* returned from swtch */
589 	}
590 
591 	/*
592 	 * Multiprocessor idle loop.
593 	 */
594 	for (;;) {
595 		/*
596 		 * If CPU is completely quiesced by p_online(2), just wait
597 		 * here with minimal bus traffic until put online.
598 		 */
599 		while (cp->cpu_flags & CPU_QUIESCED)
600 			(*idle_cpu)();
601 
602 		if (cp->cpu_disp->disp_nrunnable != 0) {
			/* Something is queued locally: let disp() pick it. */
603 			idle_exit();
604 			swtch();
605 		} else {
			/* An offline CPU does not look for work elsewhere. */
606 			if (cp->cpu_flags & CPU_OFFLINE)
607 				continue;
608 			if ((t = disp_getwork(cp)) == NULL) {
				/*
				 * Nothing to take.  Reset cpu_chosen_level
				 * once the partition's kp queue is seen to
				 * be empty, then idle some more.
				 */
609 				if (cp->cpu_chosen_level != -1) {
610 					disp_t *dp = cp->cpu_disp;
611 					disp_t *kpq;
612 
613 					disp_lock_enter(&dp->disp_lock);
614 					/*
615 					 * Set kpq under lock to prevent
616 					 * migration between partitions.
617 					 */
618 					kpq = &cp->cpu_part->cp_kp_queue;
619 					if (kpq->disp_maxrunpri == -1)
620 						cp->cpu_chosen_level = -1;
621 					disp_lock_exit(&dp->disp_lock);
622 				}
623 				(*idle_cpu)();
624 				continue;
625 			}
626 			/*
627 			 * If there was a thread but we couldn't steal
628 			 * it, then keep trying.
629 			 */
630 			if (t == T_DONTSTEAL)
631 				continue;
			/* t is a real thread taken by disp_getwork(). */
632 			idle_exit();
633 			swtch_to(t);
634 		}
635 		idle_enter(); /* returned from swtch/swtch_to */
636 	}
637 }
638 
639 
640 /*
641  * Preempt the currently running thread in favor of the highest
642  * priority thread.  The class of the current thread controls
643  * where it goes on the dispatcher queues. If panicking, turn
644  * preemption off.
645  */
646 void
647 preempt()
648 {
649 	kthread_t 	*t = curthread;
650 	klwp_t 		*lwp = ttolwp(curthread);
651 
652 	if (panicstr)
653 		return;
654 
655 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
656 
657 	thread_lock(t);
658 
659 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
660 		/*
661 		 * this thread has already been chosen to be run on
662 		 * another CPU. Clear kprunrun on this CPU since we're
663 		 * already headed for swtch().
664 		 */
665 		CPU->cpu_kprunrun = 0;
666 		thread_unlock_nopreempt(t);
667 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
668 	} else {
669 		if (lwp != NULL)
670 			lwp->lwp_ru.nivcsw++;
671 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
672 		THREAD_TRANSITION(t);
673 		CL_PREEMPT(t);
674 		DTRACE_SCHED(preempt);
675 		thread_unlock_nopreempt(t);
676 
677 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
678 
679 		swtch();		/* clears CPU->cpu_runrun via disp() */
680 	}
681 }
682 
/* Used by swtch() to obtain the thread that an interrupt thread pinned. */
683 extern kthread_t *thread_unpin();
684 
685 /*
686  * disp() - find the highest priority thread for this processor to run, and
687  * set it in TS_ONPROC state so that resume() can be called to run it.
 *
 * Selection order:
 *   1. The partition's kp queue, while its best priority is at least as
 *      high as this CPU's own best priority and the CPU is not offline.
 *   2. If the local queue is empty: a thread obtained from disp_getwork()
 *      (not tried on an offline CPU), otherwise the CPU's idle thread.
 *   3. The first thread on the highest-priority non-empty local queue.
 * A thread chosen in steps 1 or 3 is passed through disp_ratify(); when
 * that returns NULL the selection starts over.
 *
 * Returns the chosen thread; never NULL and never T_DONTSTEAL.  The
 * callers in this file note that it returns with spl high.
688  */
689 static kthread_t *
690 disp()
691 {
692 	cpu_t		*cpup;
693 	disp_t		*dp;
694 	kthread_t	*tp;
695 	dispq_t		*dq;
696 	int		maxrunword;
697 	pri_t		pri;
698 	disp_t		*kpq;
699 
700 	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
701 
702 	cpup = CPU;
703 	/*
704 	 * Find the highest priority loaded, runnable thread.
705 	 */
706 	dp = cpup->cpu_disp;
707 
708 reschedule:
709 	/*
710 	 * If there is more important work on the global queue with a better
711 	 * priority than the maximum on this CPU, take it now.
712 	 */
713 	kpq = &cpup->cpu_part->cp_kp_queue;
714 	while ((pri = kpq->disp_maxrunpri) >= 0 &&
715 	    pri >= dp->disp_maxrunpri &&
716 	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
717 	    (tp = disp_getbest(kpq)) != NULL) {
718 		if (disp_ratify(tp, kpq) != NULL) {
719 			TRACE_1(TR_FAC_DISP, TR_DISP_END,
720 			    "disp_end:tid %p", tp);
721 			return (tp);
722 		}
723 	}
724 
725 	disp_lock_enter(&dp->disp_lock);
726 	pri = dp->disp_maxrunpri;
727 
728 	/*
729 	 * If there is nothing to run, look at what's runnable on other queues.
730 	 * Choose the idle thread if the CPU is quiesced.
731 	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
732 	 * interrupt threads, which will be the only threads on the CPU's own
733 	 * queue, but cannot run threads from other queues.
734 	 */
735 	if (pri == -1) {
736 		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
737 			disp_lock_exit(&dp->disp_lock);
738 			if ((tp = disp_getwork(cpup)) == NULL ||
739 			    tp == T_DONTSTEAL) {
				/* Nothing could be taken: run the idle thread. */
740 				tp = cpup->cpu_idle_thread;
741 				(void) splhigh();
742 				THREAD_ONPROC(tp, cpup);
743 				cpup->cpu_dispthread = tp;
744 				cpup->cpu_dispatch_pri = -1;
745 				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
746 				cpup->cpu_chosen_level = -1;
747 			}
748 		} else {
			/* Offline CPU with an empty queue: idle thread. */
749 			disp_lock_exit_high(&dp->disp_lock);
750 			tp = cpup->cpu_idle_thread;
751 			THREAD_ONPROC(tp, cpup);
752 			cpup->cpu_dispthread = tp;
753 			cpup->cpu_dispatch_pri = -1;
754 			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
755 			cpup->cpu_chosen_level = -1;
756 		}
757 		TRACE_1(TR_FAC_DISP, TR_DISP_END,
758 		    "disp_end:tid %p", tp);
759 		return (tp);
760 	}
761 
762 	dq = &dp->disp_q[pri];
763 	tp = dq->dq_first;
764 
765 	ASSERT(tp != NULL);
766 	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */
767 
768 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
769 
770 	/*
771 	 * Found it so remove it from queue.
772 	 */
773 	dp->disp_nrunnable--;
774 	dq->dq_sruncnt--;
775 	if ((dq->dq_first = tp->t_link) == NULL) {
776 		ulong_t	*dqactmap = dp->disp_qactmap;
777 
778 		ASSERT(dq->dq_sruncnt == 0);
779 		dq->dq_last = NULL;
780 
781 		/*
782 		 * The queue is empty, so the corresponding bit needs to be
783 		 * turned off in dqactmap.  If nrunnable != 0, we just took
784 		 * the last runnable thread off the highest queue, so
785 		 * recompute disp_maxrunpri from the bitmap.
786 		 */
787 		maxrunword = pri >> BT_ULSHIFT;
788 		dqactmap[maxrunword] &= ~BT_BIW(pri);
789 
790 		if (dp->disp_nrunnable == 0) {
791 			dp->disp_max_unbound_pri = -1;
792 			dp->disp_maxrunpri = -1;
793 		} else {
794 			int ipri;
795 
796 			ipri = bt_gethighbit(dqactmap, maxrunword);
797 			dp->disp_maxrunpri = ipri;
			/* The unbound maximum can never exceed the maximum. */
798 			if (ipri < dp->disp_max_unbound_pri)
799 				dp->disp_max_unbound_pri = ipri;
800 		}
801 	} else {
		/* Others remain on this queue; just unlink tp. */
802 		tp->t_link = NULL;
803 	}
804 
805 	/*
806 	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
807 	 * out this thread before we have a chance to run it.
808 	 * While running, it is protected against swapping by t_lock.
809 	 */
810 	tp->t_schedflag |= TS_DONT_SWAP;
811 	cpup->cpu_dispthread = tp;		/* protected by spl only */
812 	cpup->cpu_dispatch_pri = pri;
813 	ASSERT(pri == DISP_PRIO(tp));
814 	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
815 	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */
816 
817 	ASSERT(tp != NULL);
	/* Note: the end trace point is emitted before disp_ratify() runs. */
818 	TRACE_1(TR_FAC_DISP, TR_DISP_END,
819 	    "disp_end:tid %p", tp);
820 
821 	if (disp_ratify(tp, kpq) == NULL)
822 		goto reschedule;
823 
824 	return (tp);
825 }
826 
827 /*
828  * swtch()
829  *	Find best runnable thread and run it.
830  *	Called with the current thread already switched to a new state,
831  *	on a sleep queue, run queue, stopped, and not zombied.
832  *	May be called at any spl level less than or equal to LOCK_LEVEL.
833  *	Always drops spl to the base level (spl0()).
 *
 *	Two paths exist.  A thread with t_intr set (an interrupt thread
 *	that pinned another thread) hands the CPU back to the thread
 *	returned by thread_unpin().  Any other thread asks disp() for the
 *	next thread and resumes it, or simply keeps running when disp()
 *	picks the current thread again.
834  */
835 void
836 swtch()
837 {
838 	kthread_t	*t = curthread;
839 	kthread_t	*next;
840 	cpu_t		*cp;
841 
842 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
843 
844 	if (t->t_flag & T_INTR_THREAD)
845 		cpu_intr_swtch_enter(t);
846 
847 	if (t->t_intr != NULL) {
848 		/*
849 		 * We are an interrupt thread.  Setup and return
850 		 * the interrupted thread to be resumed.
851 		 */
852 		(void) splhigh();	/* block other scheduler action */
853 		cp = CPU;		/* now protected against migration */
854 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
855 		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
856 		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
857 		next = thread_unpin();
858 		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
859 		resume_from_intr(next);
860 	} else {
861 #ifdef	DEBUG
		/*
		 * Sanity check: a thread that is still on-proc on this
		 * CPU with preemption enabled should not be calling
		 * swtch(); re-test under the thread lock and assert.
		 */
862 		if (t->t_state == TS_ONPROC &&
863 		    t->t_disp_queue->disp_cpu == CPU &&
864 		    t->t_preempt == 0) {
865 			thread_lock(t);
866 			ASSERT(t->t_state != TS_ONPROC ||
867 			    t->t_disp_queue->disp_cpu != CPU ||
868 			    t->t_preempt != 0);	/* cannot migrate */
869 			thread_unlock_nopreempt(t);
870 		}
871 #endif	/* DEBUG */
872 		cp = CPU;
873 		next = disp();		/* returns with spl high */
874 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
875 
876 		/* OK to steal anything left on run queue */
877 		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
878 
879 		if (next != t) {
			/*
			 * Adjust the PG run count only when the CPU moves
			 * between its idle thread and a non-idle thread.
			 */
880 			if (t == cp->cpu_idle_thread) {
881 				PG_NRUN_UPDATE(cp, 1);
882 			} else if (next == cp->cpu_idle_thread) {
883 				PG_NRUN_UPDATE(cp, -1);
884 			}
885 
886 			/*
887 			 * If t was previously in the TS_ONPROC state,
888 			 * setfrontdq and setbackdq won't have set its t_waitrq.
889 			 * Since we now finally know that we're switching away
890 			 * from this thread, set its t_waitrq if it is on a run
891 			 * queue.
892 			 */
893 			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
894 				t->t_waitrq = gethrtime_unscaled();
895 			}
896 
897 			/*
898 			 * restore mstate of thread that we are switching to
899 			 */
900 			restore_mstate(next);
901 
902 			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			/* record when this CPU and the outgoing thread last ran */
903 			cp->cpu_last_swtch = t->t_disp_time = lbolt;
904 			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
905 
906 			if (dtrace_vtime_active)
907 				dtrace_vtime_switch(next);
908 
909 			resume(next);
910 			/*
911 			 * The TR_RESUME_END and TR_SWTCH_END trace points
912 			 * appear at the end of resume(), because we may not
913 			 * return here
914 			 */
915 		} else {
			/* disp() chose the current thread again: keep running. */
916 			if (t->t_flag & T_INTR_THREAD)
917 				cpu_intr_swtch_exit(t);
918 
919 			DTRACE_SCHED(remain__cpu);
920 			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
921 			(void) spl0();
922 		}
923 	}
924 }
925 
926 /*
927  * swtch_from_zombie()
928  *	Special case of swtch(), which allows checks for TS_ZOMB to be
929  *	eliminated from normal resume.
930  *	Find best runnable thread and run it.
931  *	Called with the current thread zombied.
932  *	Zombies cannot migrate, so CPU references are safe.
933  */
934 void
935 swtch_from_zombie()
936 {
937 	kthread_t	*next;
938 	cpu_t		*cpu = CPU;
939 
940 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
941 
942 	ASSERT(curthread->t_state == TS_ZOMB);
943 
944 	next = disp();			/* returns with spl high */
945 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
946 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
947 	ASSERT(next != curthread);
948 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
949 
950 	if (next == cpu->cpu_idle_thread)
951 		PG_NRUN_UPDATE(cpu, -1);
952 
953 	restore_mstate(next);
954 
955 	if (dtrace_vtime_active)
956 		dtrace_vtime_switch(next);
957 
958 	resume_from_zombie(next);
959 	/*
960 	 * The TR_RESUME_END and TR_SWTCH_END trace points
961 	 * appear at the end of resume(), because we certainly will not
962 	 * return here
963 	 */
964 }
965 
966 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
967 
968 /*
969  * search_disp_queues()
970  *	Search the given dispatch queues for thread tp.
971  *	Return 1 if tp is found, otherwise return 0.
972  */
973 static int
974 search_disp_queues(disp_t *dp, kthread_t *tp)
975 {
976 	dispq_t		*dq;
977 	dispq_t		*eq;
978 
979 	disp_lock_enter_high(&dp->disp_lock);
980 
981 	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
982 		kthread_t	*rp;
983 
984 		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
985 
986 		for (rp = dq->dq_first; rp; rp = rp->t_link)
987 			if (tp == rp) {
988 				disp_lock_exit_high(&dp->disp_lock);
989 				return (1);
990 			}
991 	}
992 	disp_lock_exit_high(&dp->disp_lock);
993 
994 	return (0);
995 }
996 
997 /*
998  * thread_on_queue()
999  *	Search all per-CPU dispatch queues and all partition-wide kpreempt
1000  *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
1001  */
1002 static int
1003 thread_on_queue(kthread_t *tp)
1004 {
1005 	cpu_t		*cp;
1006 	struct cpupart	*part;
1007 
1008 	ASSERT(getpil() >= DISP_LEVEL);
1009 
1010 	/*
1011 	 * Search the per-CPU dispatch queues for tp.
1012 	 */
1013 	cp = CPU;
1014 	do {
1015 		if (search_disp_queues(cp->cpu_disp, tp))
1016 			return (1);
1017 	} while ((cp = cp->cpu_next_onln) != CPU);
1018 
1019 	/*
1020 	 * Search the partition-wide kpreempt queues for tp.
1021 	 */
1022 	part = CPU->cpu_part;
1023 	do {
1024 		if (search_disp_queues(&part->cp_kp_queue, tp))
1025 			return (1);
1026 	} while ((part = part->cp_next) != CPU->cpu_part);
1027 
1028 	return (0);
1029 }
1030 
1031 #else
1032 
/*
 * Stub used when the queue-searching code is compiled out (that code needs
 * DEBUG together with DISP_DEBUG or lint).  It always evaluates to 0, so an
 * ASSERT(!thread_on_queue(tp)) is trivially true.
 */
1033 #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
1034 
1035 #endif  /* DEBUG */
1036 
1037 /*
1038  * like swtch(), but switch to a specified thread taken from another CPU.
1039  *	called with spl high..
1040  */
1041 void
1042 swtch_to(kthread_t *next)
1043 {
1044 	cpu_t			*cp = CPU;
1045 
1046 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1047 
1048 	/*
1049 	 * Update context switch statistics.
1050 	 */
1051 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1052 
1053 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1054 
1055 	if (curthread == cp->cpu_idle_thread)
1056 		PG_NRUN_UPDATE(cp, 1);
1057 
1058 	/* OK to steal anything left on run queue */
1059 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1060 
1061 	/* record last execution time */
1062 	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;
1063 
1064 	/*
1065 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1066 	 * won't have set its t_waitrq.  Since we now finally know that we're
1067 	 * switching away from this thread, set its t_waitrq if it is on a run
1068 	 * queue.
1069 	 */
1070 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1071 		curthread->t_waitrq = gethrtime_unscaled();
1072 	}
1073 
1074 	/* restore next thread to previously running microstate */
1075 	restore_mstate(next);
1076 
1077 	if (dtrace_vtime_active)
1078 		dtrace_vtime_switch(next);
1079 
1080 	resume(next);
1081 	/*
1082 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1083 	 * appear at the end of resume(), because we may not
1084 	 * return here
1085 	 */
1086 }
1087 
1088 
1089 
1090 #define	CPU_IDLING(pri)	((pri) == -1)
1091 
1092 static void
1093 cpu_resched(cpu_t *cp, pri_t tpri)
1094 {
1095 	int	call_poke_cpu = 0;
1096 	pri_t   cpupri = cp->cpu_dispatch_pri;
1097 
1098 	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1099 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1100 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1101 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1102 			cp->cpu_runrun = 1;
1103 			aston(cp->cpu_dispthread);
1104 			if (tpri < kpreemptpri && cp != CPU)
1105 				call_poke_cpu = 1;
1106 		}
1107 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1108 			cp->cpu_kprunrun = 1;
1109 			if (cp != CPU)
1110 				call_poke_cpu = 1;
1111 		}
1112 	}
1113 
1114 	/*
1115 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1116 	 */
1117 	membar_enter();
1118 
1119 	if (call_poke_cpu)
1120 		poke_cpu(cp->cpu_id);
1121 }
1122 
1123 /*
1124  * Perform multi-level CMT load balancing of running threads.
1125  * tp is the thread being enqueued
1126  * cp is the hint CPU (chosen by cpu_choose()).
1127  */
1128 static cpu_t *
1129 cmt_balance(kthread_t *tp, cpu_t *cp)
1130 {
1131 	int		hint, i, cpu, nsiblings;
1132 	int		self = 0;
1133 	group_t		*cmt_pgs, *siblings;
1134 	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
1135 	int		pg_nrun, tpg_nrun;
1136 	int		level = 0;
1137 	cpu_t		*newcp;
1138 
1139 	ASSERT(THREAD_LOCK_HELD(tp));
1140 
1141 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
1142 
1143 	if (GROUP_SIZE(cmt_pgs) == 0)
1144 		return (cp);	/* nothing to do */
1145 
1146 	if (tp == curthread)
1147 		self = 1;
1148 
1149 	/*
1150 	 * Balance across siblings in the CPUs CMT lineage
1151 	 */
1152 	do {
1153 		pg = GROUP_ACCESS(cmt_pgs, level);
1154 
1155 		siblings = pg->cmt_siblings;
1156 		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */
1157 		if (nsiblings == 1)
1158 			continue;	/* nobody to balance against */
1159 
1160 		pg_nrun = pg->cmt_nrunning;
1161 		if (self &&
1162 		    bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid))
1163 			pg_nrun--;	/* Ignore curthread's effect */
1164 
1165 		hint = pg->cmt_hint;
1166 		/*
1167 		 * Check for validity of the hint
1168 		 * It should reference a valid sibling
1169 		 */
1170 		if (hint >= nsiblings)
1171 			hint = pg->cmt_hint = 0;
1172 		else
1173 			pg->cmt_hint++;
1174 
1175 		/*
1176 		 * Find a balancing candidate from among our siblings
1177 		 * "hint" is a hint for where to start looking
1178 		 */
1179 		i = hint;
1180 		do {
1181 			ASSERT(i < nsiblings);
1182 			pg_tmp = GROUP_ACCESS(siblings, i);
1183 
1184 			/*
1185 			 * The candidate must not be us, and must
1186 			 * have some CPU resources in the thread's
1187 			 * partition
1188 			 */
1189 			if (pg_tmp != pg &&
1190 			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
1191 			    ((pg_t *)pg_tmp)->pg_id)) {
1192 				tpg = pg_tmp;
1193 				break;
1194 			}
1195 
1196 			if (++i >= nsiblings)
1197 				i = 0;
1198 		} while (i != hint);
1199 
1200 		if (!tpg)
1201 			continue;	/* no candidates at this level */
1202 
1203 		/*
1204 		 * Check if the balancing target is underloaded
1205 		 * Decide to balance if the target is running fewer
1206 		 * threads, or if it's running the same number of threads
1207 		 * with more online CPUs
1208 		 */
1209 		tpg_nrun = tpg->cmt_nrunning;
1210 		if (pg_nrun > tpg_nrun ||
1211 		    (pg_nrun == tpg_nrun &&
1212 		    (GROUP_SIZE(&tpg->cmt_cpus_actv) >
1213 		    GROUP_SIZE(&pg->cmt_cpus_actv)))) {
1214 			break;
1215 		}
1216 		tpg = NULL;
1217 	} while (++level < GROUP_SIZE(cmt_pgs));
1218 
1219 
1220 	if (tpg) {
1221 		/*
1222 		 * Select an idle CPU from the target PG
1223 		 */
1224 		for (cpu = 0; cpu < GROUP_SIZE(&tpg->cmt_cpus_actv); cpu++) {
1225 			newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
1226 			if (newcp->cpu_part == tp->t_cpupart &&
1227 			    newcp->cpu_dispatch_pri == -1) {
1228 				cp = newcp;
1229 				break;
1230 			}
1231 		}
1232 	}
1233 
1234 	return (cp);
1235 }
1236 
1237 /*
1238  * setbackdq() keeps runqs balanced such that the difference in length
1239  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1240  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1241  * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1242  * try to keep runqs perfectly balanced regardless of the thread priority.
1243  */
1244 #define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
1245 #define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
1246 #define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1247 
1248 /*
1249  * Put the specified thread on the back of the dispatcher
1250  * queue corresponding to its current priority.
1251  *
1252  * Called with the thread in transition, onproc or stopped state
1253  * and locked (transition implies locked) and at high spl.
1254  * Returns with the thread in TS_RUN state and still locked.
1255  */
1256 void
1257 setbackdq(kthread_t *tp)
1258 {
1259 	dispq_t	*dq;
1260 	disp_t		*dp;
1261 	cpu_t		*cp;
1262 	pri_t		tpri;
1263 	int		bound;
1264 
1265 	ASSERT(THREAD_LOCK_HELD(tp));
1266 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1267 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1268 
1269 	/*
1270 	 * If thread is "swapped" or on the swap queue don't
1271 	 * queue it, but wake sched.
1272 	 */
1273 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1274 		disp_swapped_setrun(tp);
1275 		return;
1276 	}
1277 
1278 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1279 		bound = 1;
1280 	else
1281 		bound = 0;
1282 
1283 	tpri = DISP_PRIO(tp);
1284 	if (ncpus == 1)
1285 		cp = tp->t_cpu;
1286 	else if (!bound) {
1287 		if (tpri >= kpqpri) {
1288 			setkpdq(tp, SETKP_BACK);
1289 			return;
1290 		}
1291 		/*
1292 		 * Let cpu_choose suggest a CPU.
1293 		 */
1294 		cp = cpu_choose(tp, tpri);
1295 
1296 		if (tp->t_cpupart == cp->cpu_part) {
1297 			int	qlen;
1298 
1299 			/*
1300 			 * Perform any CMT load balancing
1301 			 */
1302 			cp = cmt_balance(tp, cp);
1303 
1304 			/*
1305 			 * Balance across the run queues
1306 			 */
1307 			qlen = RUNQ_LEN(cp, tpri);
1308 			if (tpri >= RUNQ_MATCH_PRI &&
1309 			    !(tp->t_schedflag & TS_RUNQMATCH))
1310 				qlen -= RUNQ_MAX_DIFF;
1311 			if (qlen > 0) {
1312 				cpu_t *newcp;
1313 
1314 				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1315 					newcp = cp->cpu_next_part;
1316 				} else if ((newcp = cp->cpu_next_lpl) == cp) {
1317 					newcp = cp->cpu_next_part;
1318 				}
1319 
1320 				if (RUNQ_LEN(newcp, tpri) < qlen) {
1321 					DTRACE_PROBE3(runq__balance,
1322 					    kthread_t *, tp,
1323 					    cpu_t *, cp, cpu_t *, newcp);
1324 					cp = newcp;
1325 				}
1326 			}
1327 		} else {
1328 			/*
1329 			 * Migrate to a cpu in the new partition.
1330 			 */
1331 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1332 			    tp->t_lpl, tp->t_pri, NULL);
1333 		}
1334 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1335 	} else {
1336 		/*
1337 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1338 		 * a short time until weak binding that existed when the
1339 		 * strong binding was established has dropped) so we must
1340 		 * favour weak binding over strong.
1341 		 */
1342 		cp = tp->t_weakbound_cpu ?
1343 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1344 	}
1345 	/*
1346 	 * A thread that is ONPROC may be temporarily placed on the run queue
1347 	 * but then chosen to run again by disp.  If the thread we're placing on
1348 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1349 	 * replacement process is actually scheduled in swtch().  In this
1350 	 * situation, curthread is the only thread that could be in the ONPROC
1351 	 * state.
1352 	 */
1353 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1354 		hrtime_t curtime;
1355 
1356 		curtime = gethrtime_unscaled();
1357 		(void) cpu_update_pct(tp, curtime);
1358 		tp->t_waitrq = curtime;
1359 	} else {
1360 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1361 	}
1362 
1363 	dp = cp->cpu_disp;
1364 	disp_lock_enter_high(&dp->disp_lock);
1365 
1366 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1367 	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1368 	    tpri, cp, tp);
1369 
1370 #ifndef NPROBE
1371 	/* Kernel probe */
1372 	if (tnf_tracing_active)
1373 		tnf_thread_queue(tp, cp, tpri);
1374 #endif /* NPROBE */
1375 
1376 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1377 
1378 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1379 	tp->t_disp_queue = dp;
1380 	tp->t_link = NULL;
1381 
1382 	dq = &dp->disp_q[tpri];
1383 	dp->disp_nrunnable++;
1384 	if (!bound)
1385 		dp->disp_steal = 0;
1386 	membar_enter();
1387 
1388 	if (dq->dq_sruncnt++ != 0) {
1389 		ASSERT(dq->dq_first != NULL);
1390 		dq->dq_last->t_link = tp;
1391 		dq->dq_last = tp;
1392 	} else {
1393 		ASSERT(dq->dq_first == NULL);
1394 		ASSERT(dq->dq_last == NULL);
1395 		dq->dq_first = dq->dq_last = tp;
1396 		BT_SET(dp->disp_qactmap, tpri);
1397 		if (tpri > dp->disp_maxrunpri) {
1398 			dp->disp_maxrunpri = tpri;
1399 			membar_enter();
1400 			cpu_resched(cp, tpri);
1401 		}
1402 	}
1403 
1404 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1405 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1406 		    cp == CPU) {
1407 			/*
1408 			 * If there are no other unbound threads on the
1409 			 * run queue, don't allow other CPUs to steal
1410 			 * this thread while we are in the middle of a
1411 			 * context switch. We may just switch to it
1412 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1413 			 * in swtch and swtch_to.
1414 			 */
1415 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1416 		}
1417 		dp->disp_max_unbound_pri = tpri;
1418 	}
1419 	(*disp_enq_thread)(cp, bound);
1420 }
1421 
1422 /*
1423  * Put the specified thread on the front of the dispatcher
1424  * queue corresponding to its current priority.
1425  *
1426  * Called with the thread in transition, onproc or stopped state
1427  * and locked (transition implies locked) and at high spl.
1428  * Returns with the thread in TS_RUN state and still locked.
1429  */
1430 void
1431 setfrontdq(kthread_t *tp)
1432 {
1433 	disp_t		*dp;
1434 	dispq_t		*dq;
1435 	cpu_t		*cp;
1436 	pri_t		tpri;
1437 	int		bound;
1438 
1439 	ASSERT(THREAD_LOCK_HELD(tp));
1440 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1441 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1442 
1443 	/*
1444 	 * If thread is "swapped" or on the swap queue don't
1445 	 * queue it, but wake sched.
1446 	 */
1447 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1448 		disp_swapped_setrun(tp);
1449 		return;
1450 	}
1451 
1452 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1453 		bound = 1;
1454 	else
1455 		bound = 0;
1456 
1457 	tpri = DISP_PRIO(tp);
1458 	if (ncpus == 1)
1459 		cp = tp->t_cpu;
1460 	else if (!bound) {
1461 		if (tpri >= kpqpri) {
1462 			setkpdq(tp, SETKP_FRONT);
1463 			return;
1464 		}
1465 		cp = tp->t_cpu;
1466 		if (tp->t_cpupart == cp->cpu_part) {
1467 			/*
1468 			 * If we are of higher or equal priority than
1469 			 * the highest priority runnable thread of
1470 			 * the current CPU, just pick this CPU.  Otherwise
1471 			 * Let cpu_choose() select the CPU.  If this cpu
1472 			 * is the target of an offline request then do not
1473 			 * pick it - a thread_nomigrate() on the in motion
1474 			 * cpu relies on this when it forces a preempt.
1475 			 */
1476 			if (tpri < cp->cpu_disp->disp_maxrunpri ||
1477 			    cp == cpu_inmotion)
1478 				cp = cpu_choose(tp, tpri);
1479 		} else {
1480 			/*
1481 			 * Migrate to a cpu in the new partition.
1482 			 */
1483 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1484 			    tp->t_lpl, tp->t_pri, NULL);
1485 		}
1486 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1487 	} else {
1488 		/*
1489 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1490 		 * a short time until weak binding that existed when the
1491 		 * strong binding was established has dropped) so we must
1492 		 * favour weak binding over strong.
1493 		 */
1494 		cp = tp->t_weakbound_cpu ?
1495 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1496 	}
1497 
1498 	/*
1499 	 * A thread that is ONPROC may be temporarily placed on the run queue
1500 	 * but then chosen to run again by disp.  If the thread we're placing on
1501 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1502 	 * replacement process is actually scheduled in swtch().  In this
1503 	 * situation, curthread is the only thread that could be in the ONPROC
1504 	 * state.
1505 	 */
1506 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1507 		hrtime_t curtime;
1508 
1509 		curtime = gethrtime_unscaled();
1510 		(void) cpu_update_pct(tp, curtime);
1511 		tp->t_waitrq = curtime;
1512 	} else {
1513 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1514 	}
1515 
1516 	dp = cp->cpu_disp;
1517 	disp_lock_enter_high(&dp->disp_lock);
1518 
1519 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1520 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1521 
1522 #ifndef NPROBE
1523 	/* Kernel probe */
1524 	if (tnf_tracing_active)
1525 		tnf_thread_queue(tp, cp, tpri);
1526 #endif /* NPROBE */
1527 
1528 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1529 
1530 	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
1531 	tp->t_disp_queue = dp;
1532 
1533 	dq = &dp->disp_q[tpri];
1534 	dp->disp_nrunnable++;
1535 	if (!bound)
1536 		dp->disp_steal = 0;
1537 	membar_enter();
1538 
1539 	if (dq->dq_sruncnt++ != 0) {
1540 		ASSERT(dq->dq_last != NULL);
1541 		tp->t_link = dq->dq_first;
1542 		dq->dq_first = tp;
1543 	} else {
1544 		ASSERT(dq->dq_last == NULL);
1545 		ASSERT(dq->dq_first == NULL);
1546 		tp->t_link = NULL;
1547 		dq->dq_first = dq->dq_last = tp;
1548 		BT_SET(dp->disp_qactmap, tpri);
1549 		if (tpri > dp->disp_maxrunpri) {
1550 			dp->disp_maxrunpri = tpri;
1551 			membar_enter();
1552 			cpu_resched(cp, tpri);
1553 		}
1554 	}
1555 
1556 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1557 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1558 		    cp == CPU) {
1559 			/*
1560 			 * If there are no other unbound threads on the
1561 			 * run queue, don't allow other CPUs to steal
1562 			 * this thread while we are in the middle of a
1563 			 * context switch. We may just switch to it
1564 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1565 			 * in swtch and swtch_to.
1566 			 */
1567 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1568 		}
1569 		dp->disp_max_unbound_pri = tpri;
1570 	}
1571 	(*disp_enq_thread)(cp, bound);
1572 }
1573 
1574 /*
1575  * Put a high-priority unbound thread on the kp queue
1576  */
1577 static void
1578 setkpdq(kthread_t *tp, int borf)
1579 {
1580 	dispq_t	*dq;
1581 	disp_t	*dp;
1582 	cpu_t	*cp;
1583 	pri_t	tpri;
1584 
1585 	tpri = DISP_PRIO(tp);
1586 
1587 	dp = &tp->t_cpupart->cp_kp_queue;
1588 	disp_lock_enter_high(&dp->disp_lock);
1589 
1590 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1591 
1592 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1593 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1594 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1595 	tp->t_disp_queue = dp;
1596 	dp->disp_nrunnable++;
1597 	dq = &dp->disp_q[tpri];
1598 
1599 	if (dq->dq_sruncnt++ != 0) {
1600 		if (borf == SETKP_BACK) {
1601 			ASSERT(dq->dq_first != NULL);
1602 			tp->t_link = NULL;
1603 			dq->dq_last->t_link = tp;
1604 			dq->dq_last = tp;
1605 		} else {
1606 			ASSERT(dq->dq_last != NULL);
1607 			tp->t_link = dq->dq_first;
1608 			dq->dq_first = tp;
1609 		}
1610 	} else {
1611 		if (borf == SETKP_BACK) {
1612 			ASSERT(dq->dq_first == NULL);
1613 			ASSERT(dq->dq_last == NULL);
1614 			dq->dq_first = dq->dq_last = tp;
1615 		} else {
1616 			ASSERT(dq->dq_last == NULL);
1617 			ASSERT(dq->dq_first == NULL);
1618 			tp->t_link = NULL;
1619 			dq->dq_first = dq->dq_last = tp;
1620 		}
1621 		BT_SET(dp->disp_qactmap, tpri);
1622 		if (tpri > dp->disp_max_unbound_pri)
1623 			dp->disp_max_unbound_pri = tpri;
1624 		if (tpri > dp->disp_maxrunpri) {
1625 			dp->disp_maxrunpri = tpri;
1626 			membar_enter();
1627 		}
1628 	}
1629 
1630 	cp = tp->t_cpu;
1631 	if (tp->t_cpupart != cp->cpu_part) {
1632 		/* migrate to a cpu in the new partition */
1633 		cp = tp->t_cpupart->cp_cpulist;
1634 	}
1635 	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1636 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1637 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1638 
1639 #ifndef NPROBE
1640 	/* Kernel probe */
1641 	if (tnf_tracing_active)
1642 		tnf_thread_queue(tp, cp, tpri);
1643 #endif /* NPROBE */
1644 
1645 	if (cp->cpu_chosen_level < tpri)
1646 		cp->cpu_chosen_level = tpri;
1647 	cpu_resched(cp, tpri);
1648 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1649 	(*disp_enq_thread)(cp, 0);
1650 }
1651 
1652 /*
1653  * Remove a thread from the dispatcher queue if it is on it.
1654  * It is not an error if it is not found but we return whether
1655  * or not it was found in case the caller wants to check.
1656  */
1657 int
1658 dispdeq(kthread_t *tp)
1659 {
1660 	disp_t		*dp;
1661 	dispq_t		*dq;
1662 	kthread_t	*rp;
1663 	kthread_t	*trp;
1664 	kthread_t	**ptp;
1665 	int		tpri;
1666 
1667 	ASSERT(THREAD_LOCK_HELD(tp));
1668 
1669 	if (tp->t_state != TS_RUN)
1670 		return (0);
1671 
1672 	/*
1673 	 * The thread is "swapped" or is on the swap queue and
1674 	 * hence no longer on the run queue, so return true.
1675 	 */
1676 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1677 		return (1);
1678 
1679 	tpri = DISP_PRIO(tp);
1680 	dp = tp->t_disp_queue;
1681 	ASSERT(tpri < dp->disp_npri);
1682 	dq = &dp->disp_q[tpri];
1683 	ptp = &dq->dq_first;
1684 	rp = *ptp;
1685 	trp = NULL;
1686 
1687 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1688 
1689 	/*
1690 	 * Search for thread in queue.
1691 	 * Double links would simplify this at the expense of disp/setrun.
1692 	 */
1693 	while (rp != tp && rp != NULL) {
1694 		trp = rp;
1695 		ptp = &trp->t_link;
1696 		rp = trp->t_link;
1697 	}
1698 
1699 	if (rp == NULL) {
1700 		panic("dispdeq: thread not on queue");
1701 	}
1702 
1703 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1704 
1705 	/*
1706 	 * Found it so remove it from queue.
1707 	 */
1708 	if ((*ptp = rp->t_link) == NULL)
1709 		dq->dq_last = trp;
1710 
1711 	dp->disp_nrunnable--;
1712 	if (--dq->dq_sruncnt == 0) {
1713 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1714 		if (dp->disp_nrunnable == 0) {
1715 			dp->disp_max_unbound_pri = -1;
1716 			dp->disp_maxrunpri = -1;
1717 		} else if (tpri == dp->disp_maxrunpri) {
1718 			int ipri;
1719 
1720 			ipri = bt_gethighbit(dp->disp_qactmap,
1721 			    dp->disp_maxrunpri >> BT_ULSHIFT);
1722 			if (ipri < dp->disp_max_unbound_pri)
1723 				dp->disp_max_unbound_pri = ipri;
1724 			dp->disp_maxrunpri = ipri;
1725 		}
1726 	}
1727 	tp->t_link = NULL;
1728 	THREAD_TRANSITION(tp);		/* put in intermediate state */
1729 	return (1);
1730 }
1731 
1732 
1733 /*
1734  * dq_sruninc and dq_srundec are public functions for
1735  * incrementing/decrementing the sruncnts when a thread on
1736  * a dispatcher queue is made schedulable/unschedulable by
1737  * resetting the TS_LOAD flag.
1738  *
1739  * The caller MUST have the thread lock and therefore the dispatcher
1740  * queue lock so that the operation which changes
1741  * the flag, the operation that checks the status of the thread to
1742  * determine if it's on a disp queue AND the call to this function
1743  * are one atomic operation with respect to interrupts.
1744  */
1745 
1746 /*
1747  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1748  */
1749 void
1750 dq_sruninc(kthread_t *t)
1751 {
1752 	ASSERT(t->t_state == TS_RUN);
1753 	ASSERT(t->t_schedflag & TS_LOAD);
1754 
1755 	THREAD_TRANSITION(t);
1756 	setfrontdq(t);
1757 }
1758 
1759 /*
1760  * See comment on calling conventions above.
1761  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1762  */
1763 void
1764 dq_srundec(kthread_t *t)
1765 {
1766 	ASSERT(t->t_schedflag & TS_LOAD);
1767 
1768 	(void) dispdeq(t);
1769 	disp_swapped_enq(t);
1770 }
1771 
1772 /*
1773  * Change the dispatcher lock of thread to the "swapped_lock"
1774  * and return with thread lock still held.
1775  *
1776  * Called with thread_lock held, in transition state, and at high spl.
1777  */
1778 void
1779 disp_swapped_enq(kthread_t *tp)
1780 {
1781 	ASSERT(THREAD_LOCK_HELD(tp));
1782 	ASSERT(tp->t_schedflag & TS_LOAD);
1783 
1784 	switch (tp->t_state) {
1785 	case TS_RUN:
1786 		disp_lock_enter_high(&swapped_lock);
1787 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1788 		break;
1789 	case TS_ONPROC:
1790 		disp_lock_enter_high(&swapped_lock);
1791 		THREAD_TRANSITION(tp);
1792 		wake_sched_sec = 1;		/* tell clock to wake sched */
1793 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1794 		break;
1795 	default:
1796 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1797 	}
1798 }
1799 
1800 /*
1801  * This routine is called by setbackdq/setfrontdq if the thread is
1802  * not loaded or loaded and on the swap queue.
1803  *
1804  * Thread state TS_SLEEP implies that a swapped thread
1805  * has been woken up and needs to be swapped in by the swapper.
1806  *
1807  * Thread state TS_RUN, it implies that the priority of a swapped
1808  * thread is being increased by scheduling class (e.g. ts_update).
1809  */
1810 static void
1811 disp_swapped_setrun(kthread_t *tp)
1812 {
1813 	ASSERT(THREAD_LOCK_HELD(tp));
1814 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1815 
1816 	switch (tp->t_state) {
1817 	case TS_SLEEP:
1818 		disp_lock_enter_high(&swapped_lock);
1819 		/*
1820 		 * Wakeup sched immediately (i.e., next tick) if the
1821 		 * thread priority is above maxclsyspri.
1822 		 */
1823 		if (DISP_PRIO(tp) > maxclsyspri)
1824 			wake_sched = 1;
1825 		else
1826 			wake_sched_sec = 1;
1827 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1828 		break;
1829 	case TS_RUN:				/* called from ts_update */
1830 		break;
1831 	default:
1832 		panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1833 	}
1834 }
1835 
1836 
1837 /*
1838  *	Make a thread give up its processor.  Find the processor on
1839  *	which this thread is executing, and have that processor
1840  *	preempt.
1841  */
1842 void
1843 cpu_surrender(kthread_t *tp)
1844 {
1845 	cpu_t	*cpup;
1846 	int	max_pri;
1847 	int	max_run_pri;
1848 	klwp_t	*lwp;
1849 
1850 	ASSERT(THREAD_LOCK_HELD(tp));
1851 
1852 	if (tp->t_state != TS_ONPROC)
1853 		return;
1854 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
1855 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1856 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1857 	if (max_pri < max_run_pri)
1858 		max_pri = max_run_pri;
1859 
1860 	cpup->cpu_runrun = 1;
1861 	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1862 		cpup->cpu_kprunrun = 1;
1863 	}
1864 
1865 	/*
1866 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1867 	 */
1868 	membar_enter();
1869 
1870 	DTRACE_SCHED1(surrender, kthread_t *, tp);
1871 
1872 	/*
1873 	 * Make the target thread take an excursion through trap()
1874 	 * to do preempt() (unless we're already in trap or post_syscall,
1875 	 * calling cpu_surrender via CL_TRAPRET).
1876 	 */
1877 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1878 	    lwp->lwp_state != LWP_USER) {
1879 		aston(tp);
1880 		if (cpup != CPU)
1881 			poke_cpu(cpup->cpu_id);
1882 	}
1883 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1884 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
1885 }
1886 
1887 
1888 /*
1889  * Commit to and ratify a scheduling decision
1890  */
1891 /*ARGSUSED*/
1892 static kthread_t *
1893 disp_ratify(kthread_t *tp, disp_t *kpq)
1894 {
1895 	pri_t	tpri, maxpri;
1896 	pri_t	maxkpri;
1897 	cpu_t	*cpup;
1898 
1899 	ASSERT(tp != NULL);
1900 	/*
1901 	 * Commit to, then ratify scheduling decision
1902 	 */
1903 	cpup = CPU;
1904 	if (cpup->cpu_runrun != 0)
1905 		cpup->cpu_runrun = 0;
1906 	if (cpup->cpu_kprunrun != 0)
1907 		cpup->cpu_kprunrun = 0;
1908 	if (cpup->cpu_chosen_level != -1)
1909 		cpup->cpu_chosen_level = -1;
1910 	membar_enter();
1911 	tpri = DISP_PRIO(tp);
1912 	maxpri = cpup->cpu_disp->disp_maxrunpri;
1913 	maxkpri = kpq->disp_maxrunpri;
1914 	if (maxpri < maxkpri)
1915 		maxpri = maxkpri;
1916 	if (tpri < maxpri) {
1917 		/*
1918 		 * should have done better
1919 		 * put this one back and indicate to try again
1920 		 */
1921 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
1922 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1923 		thread_lock_high(tp);
1924 		THREAD_TRANSITION(tp);
1925 		setfrontdq(tp);
1926 		thread_unlock_nopreempt(tp);
1927 
1928 		tp = NULL;
1929 	}
1930 	return (tp);
1931 }
1932 
1933 /*
1934  * See if there is any work on the dispatcher queue for other CPUs.
1935  * If there is, dequeue the best thread and return.
1936  */
1937 static kthread_t *
1938 disp_getwork(cpu_t *cp)
1939 {
1940 	cpu_t		*ocp;		/* other CPU */
1941 	cpu_t		*ocp_start;
1942 	cpu_t		*tcp;		/* target local CPU */
1943 	kthread_t	*tp;
1944 	kthread_t	*retval = NULL;
1945 	pri_t		maxpri;
1946 	disp_t		*kpq;		/* kp queue for this partition */
1947 	lpl_t		*lpl, *lpl_leaf;
1948 	int		hint, leafidx;
1949 	hrtime_t	stealtime;
1950 
1951 	maxpri = -1;
1952 	tcp = NULL;
1953 
1954 	kpq = &cp->cpu_part->cp_kp_queue;
1955 	while (kpq->disp_maxrunpri >= 0) {
1956 		/*
1957 		 * Try to take a thread from the kp_queue.
1958 		 */
1959 		tp = (disp_getbest(kpq));
1960 		if (tp)
1961 			return (disp_ratify(tp, kpq));
1962 	}
1963 
1964 	kpreempt_disable();		/* protect the cpu_active list */
1965 
1966 	/*
1967 	 * Try to find something to do on another CPU's run queue.
1968 	 * Loop through all other CPUs looking for the one with the highest
1969 	 * priority unbound thread.
1970 	 *
1971 	 * On NUMA machines, the partition's CPUs are consulted in order of
1972 	 * distance from the current CPU. This way, the first available
1973 	 * work found is also the closest, and will suffer the least
1974 	 * from being migrated.
1975 	 */
1976 	lpl = lpl_leaf = cp->cpu_lpl;
1977 	hint = leafidx = 0;
1978 
1979 	/*
1980 	 * This loop traverses the lpl hierarchy. Higher level lpls represent
1981 	 * broader levels of locality
1982 	 */
1983 	do {
1984 		/* This loop iterates over the lpl's leaves */
1985 		do {
1986 			if (lpl_leaf != cp->cpu_lpl)
1987 				ocp = lpl_leaf->lpl_cpus;
1988 			else
1989 				ocp = cp->cpu_next_lpl;
1990 
1991 			/* This loop iterates over the CPUs in the leaf */
1992 			ocp_start = ocp;
1993 			do {
1994 				pri_t pri;
1995 
1996 				ASSERT(CPU_ACTIVE(ocp));
1997 
1998 				/*
1999 				 * End our stroll around this lpl if:
2000 				 *
2001 				 * - Something became runnable on the local
2002 				 *   queue...which also ends our stroll around
2003 				 *   the partition.
2004 				 *
2005 				 * - We happen across another idle CPU.
2006 				 *   Since it is patrolling the next portion
2007 				 *   of the lpl's list (assuming it's not
2008 				 *   halted), move to the next higher level
2009 				 *   of locality.
2010 				 */
2011 				if (cp->cpu_disp->disp_nrunnable != 0) {
2012 					kpreempt_enable();
2013 					return (NULL);
2014 				}
2015 				if (ocp->cpu_dispatch_pri == -1) {
2016 					if (ocp->cpu_disp_flags &
2017 					    CPU_DISP_HALTED)
2018 						continue;
2019 					else
2020 						break;
2021 				}
2022 
2023 				/*
2024 				 * If there's only one thread and the CPU
2025 				 * is in the middle of a context switch,
2026 				 * or it's currently running the idle thread,
2027 				 * don't steal it.
2028 				 */
2029 				if ((ocp->cpu_disp_flags &
2030 				    CPU_DISP_DONTSTEAL) &&
2031 				    ocp->cpu_disp->disp_nrunnable == 1)
2032 					continue;
2033 
2034 				pri = ocp->cpu_disp->disp_max_unbound_pri;
2035 				if (pri > maxpri) {
2036 					/*
2037 					 * Don't steal threads that we attempted
2038 					 * to steal recently until they're ready
2039 					 * to be stolen again.
2040 					 */
2041 					stealtime = ocp->cpu_disp->disp_steal;
2042 					if (stealtime == 0 ||
2043 					    stealtime - gethrtime() <= 0) {
2044 						maxpri = pri;
2045 						tcp = ocp;
2046 					} else {
2047 						/*
2048 						 * Don't update tcp, just set
2049 						 * the retval to T_DONTSTEAL, so
2050 						 * that if no acceptable CPUs
2051 						 * are found the return value
2052 						 * will be T_DONTSTEAL rather
2053 						 * then NULL.
2054 						 */
2055 						retval = T_DONTSTEAL;
2056 					}
2057 				}
2058 			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2059 
2060 			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2061 				leafidx = 0;
2062 				lpl_leaf = lpl->lpl_rset[leafidx];
2063 			}
2064 		} while (leafidx != hint);
2065 
2066 		hint = leafidx = lpl->lpl_hint;
2067 		if ((lpl = lpl->lpl_parent) != NULL)
2068 			lpl_leaf = lpl->lpl_rset[hint];
2069 	} while (!tcp && lpl);
2070 
2071 	kpreempt_enable();
2072 
2073 	/*
2074 	 * If another queue looks good, and there is still nothing on
2075 	 * the local queue, try to transfer one or more threads
2076 	 * from it to our queue.
2077 	 */
2078 	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2079 		tp = disp_getbest(tcp->cpu_disp);
2080 		if (tp == NULL || tp == T_DONTSTEAL)
2081 			return (tp);
2082 		return (disp_ratify(tp, kpq));
2083 	}
2084 	return (retval);
2085 }
2086 
2087 
2088 /*
2089  * disp_fix_unbound_pri()
2090  *	Determines the maximum priority of unbound threads on the queue.
2091  *	The priority is kept for the queue, but is only increased, never
2092  *	reduced unless some CPU is looking for something on that queue.
2093  *
2094  *	The priority argument is the known upper limit.
2095  *
2096  *	Perhaps this should be kept accurately, but that probably means
2097  *	separate bitmaps for bound and unbound threads.  Since only idled
2098  *	CPUs will have to do this recalculation, it seems better this way.
2099  */
2100 static void
2101 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2102 {
2103 	kthread_t	*tp;
2104 	dispq_t		*dq;
2105 	ulong_t		*dqactmap = dp->disp_qactmap;
2106 	ulong_t		mapword;
2107 	int		wx;
2108 
2109 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2110 
2111 	ASSERT(pri >= 0);			/* checked by caller */
2112 
2113 	/*
2114 	 * Start the search at the next lowest priority below the supplied
2115 	 * priority.  This depends on the bitmap implementation.
2116 	 */
2117 	do {
2118 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
2119 
2120 		/*
2121 		 * Form mask for all lower priorities in the word.
2122 		 */
2123 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2124 
2125 		/*
2126 		 * Get next lower active priority.
2127 		 */
2128 		if (mapword != 0) {
2129 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2130 		} else if (wx > 0) {
2131 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2132 			if (pri < 0)
2133 				break;
2134 		} else {
2135 			pri = -1;
2136 			break;
2137 		}
2138 
2139 		/*
2140 		 * Search the queue for unbound, runnable threads.
2141 		 */
2142 		dq = &dp->disp_q[pri];
2143 		tp = dq->dq_first;
2144 
2145 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2146 			tp = tp->t_link;
2147 		}
2148 
2149 		/*
2150 		 * If a thread was found, set the priority and return.
2151 		 */
2152 	} while (tp == NULL);
2153 
2154 	/*
2155 	 * pri holds the maximum unbound thread priority or -1.
2156 	 */
2157 	if (dp->disp_max_unbound_pri != pri)
2158 		dp->disp_max_unbound_pri = pri;
2159 }
2160 
2161 /*
2162  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2163  * 	check if the CPU to which is was previously bound should have
2164  * 	its disp_max_unbound_pri increased.
2165  */
2166 void
2167 disp_adjust_unbound_pri(kthread_t *tp)
2168 {
2169 	disp_t *dp;
2170 	pri_t tpri;
2171 
2172 	ASSERT(THREAD_LOCK_HELD(tp));
2173 
2174 	/*
2175 	 * Don't do anything if the thread is not bound, or
2176 	 * currently not runnable or swapped out.
2177 	 */
2178 	if (tp->t_bound_cpu == NULL ||
2179 	    tp->t_state != TS_RUN ||
2180 	    tp->t_schedflag & TS_ON_SWAPQ)
2181 		return;
2182 
2183 	tpri = DISP_PRIO(tp);
2184 	dp = tp->t_bound_cpu->cpu_disp;
2185 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2186 	if (tpri > dp->disp_max_unbound_pri)
2187 		dp->disp_max_unbound_pri = tpri;
2188 }
2189 
2190 /*
2191  * disp_getbest()
2192  *   De-queue the highest priority unbound runnable thread.
2193  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2194  *   Returns NULL if nothing found.
2195  *   Returns T_DONTSTEAL if the thread was not stealable.
2196  *   so that the caller will try again later.
2197  *
2198  *   Passed a pointer to a dispatch queue not associated with this CPU, and
2199  *   its type.
2200  */
2201 static kthread_t *
2202 disp_getbest(disp_t *dp)
2203 {
2204 	kthread_t	*tp;
2205 	dispq_t		*dq;
2206 	pri_t		pri;
2207 	cpu_t		*cp, *tcp;
2208 	boolean_t	allbound;
2209 
2210 	disp_lock_enter(&dp->disp_lock);
2211 
2212 	/*
2213 	 * If there is nothing to run, or the CPU is in the middle of a
2214 	 * context switch of the only thread, return NULL.
2215 	 */
2216 	tcp = dp->disp_cpu;
2217 	cp = CPU;
2218 	pri = dp->disp_max_unbound_pri;
2219 	if (pri == -1 ||
2220 	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2221 	    tcp->cpu_disp->disp_nrunnable == 1)) {
2222 		disp_lock_exit_nopreempt(&dp->disp_lock);
2223 		return (NULL);
2224 	}
2225 
2226 	dq = &dp->disp_q[pri];
2227 
2228 
2229 	/*
2230 	 * Assume that all threads are bound on this queue, and change it
2231 	 * later when we find out that it is not the case.
2232 	 */
2233 	allbound = B_TRUE;
2234 	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2235 		hrtime_t now, nosteal, rqtime;
2236 
2237 		/*
2238 		 * Skip over bound threads which could be here even
2239 		 * though disp_max_unbound_pri indicated this level.
2240 		 */
2241 		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2242 			continue;
2243 
2244 		/*
2245 		 * We've got some unbound threads on this queue, so turn
2246 		 * the allbound flag off now.
2247 		 */
2248 		allbound = B_FALSE;
2249 
2250 		/*
2251 		 * The thread is a candidate for stealing from its run queue. We
2252 		 * don't want to steal threads that became runnable just a
2253 		 * moment ago. This improves CPU affinity for threads that get
2254 		 * preempted for short periods of time and go back on the run
2255 		 * queue.
2256 		 *
2257 		 * We want to let it stay on its run queue if it was only placed
2258 		 * there recently and it was running on the same CPU before that
2259 		 * to preserve its cache investment. For the thread to remain on
2260 		 * its run queue, ALL of the following conditions must be
2261 		 * satisfied:
2262 		 *
2263 		 * - the disp queue should not be the kernel preemption queue
2264 		 * - delayed idle stealing should not be disabled
2265 		 * - nosteal_nsec should be non-zero
2266 		 * - it should run with user priority
2267 		 * - it should be on the run queue of the CPU where it was
2268 		 *   running before being placed on the run queue
2269 		 * - it should be the only thread on the run queue (to prevent
2270 		 *   extra scheduling latency for other threads)
2271 		 * - it should sit on the run queue for less than per-chip
2272 		 *   nosteal interval or global nosteal interval
2273 		 * - in case of CPUs with shared cache it should sit in a run
2274 		 *   queue of a CPU from a different chip
2275 		 *
2276 		 * The checks are arranged so that the ones that are faster are
2277 		 * placed earlier.
2278 		 */
2279 		if (tcp == NULL ||
2280 		    pri >= minclsyspri ||
2281 		    tp->t_cpu != tcp)
2282 			break;
2283 
2284 		/*
2285 		 * Steal immediately if, due to CMT processor architecture
2286 		 * migraiton between cp and tcp would incur no performance
2287 		 * penalty.
2288 		 */
2289 		if (pg_cmt_can_migrate(cp, tcp))
2290 			break;
2291 
2292 		nosteal = nosteal_nsec;
2293 		if (nosteal == 0)
2294 			break;
2295 
2296 		/*
2297 		 * Calculate time spent sitting on run queue
2298 		 */
2299 		now = gethrtime_unscaled();
2300 		rqtime = now - tp->t_waitrq;
2301 		scalehrtime(&rqtime);
2302 
2303 		/*
2304 		 * Steal immediately if the time spent on this run queue is more
2305 		 * than allowed nosteal delay.
2306 		 *
2307 		 * Negative rqtime check is needed here to avoid infinite
2308 		 * stealing delays caused by unlikely but not impossible
2309 		 * drifts between CPU times on different CPUs.
2310 		 */
2311 		if (rqtime > nosteal || rqtime < 0)
2312 			break;
2313 
2314 		DTRACE_PROBE4(nosteal, kthread_t *, tp,
2315 		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2316 		scalehrtime(&now);
2317 		/*
2318 		 * Calculate when this thread becomes stealable
2319 		 */
2320 		now += (nosteal - rqtime);
2321 
2322 		/*
2323 		 * Calculate time when some thread becomes stealable
2324 		 */
2325 		if (now < dp->disp_steal)
2326 			dp->disp_steal = now;
2327 	}
2328 
2329 	/*
2330 	 * If there were no unbound threads on this queue, find the queue
2331 	 * where they are and then return later. The value of
2332 	 * disp_max_unbound_pri is not always accurate because it isn't
2333 	 * reduced until another idle CPU looks for work.
2334 	 */
2335 	if (allbound)
2336 		disp_fix_unbound_pri(dp, pri);
2337 
2338 	/*
2339 	 * If we reached the end of the queue and found no unbound threads
2340 	 * then return NULL so that other CPUs will be considered.  If there
2341 	 * are unbound threads but they cannot yet be stolen, then
2342 	 * return T_DONTSTEAL and try again later.
2343 	 */
2344 	if (tp == NULL) {
2345 		disp_lock_exit_nopreempt(&dp->disp_lock);
2346 		return (allbound ? NULL : T_DONTSTEAL);
2347 	}
2348 
2349 	/*
2350 	 * Found a runnable, unbound thread, so remove it from queue.
2351 	 * dispdeq() requires that we have the thread locked, and we do,
2352 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
2353 	 * put the thread in transition state, thereby dropping the dispq
2354 	 * lock.
2355 	 */
2356 
2357 #ifdef DEBUG
2358 	{
2359 		int	thread_was_on_queue;
2360 
2361 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
2362 		ASSERT(thread_was_on_queue);
2363 	}
2364 
2365 #else /* DEBUG */
2366 	(void) dispdeq(tp);			/* drops disp_lock */
2367 #endif /* DEBUG */
2368 
2369 	/*
2370 	 * Reset the disp_queue steal time - we do not know what is the smallest
2371 	 * value across the queue is.
2372 	 */
2373 	dp->disp_steal = 0;
2374 
2375 	tp->t_schedflag |= TS_DONT_SWAP;
2376 
2377 	/*
2378 	 * Setup thread to run on the current CPU.
2379 	 */
2380 	tp->t_disp_queue = cp->cpu_disp;
2381 
2382 	cp->cpu_dispthread = tp;		/* protected by spl only */
2383 	cp->cpu_dispatch_pri = pri;
2384 
2385 	/*
2386 	 * There can be a memory synchronization race between disp_getbest()
2387 	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2388 	 * to preempt the current thread to run the enqueued thread while
2389 	 * disp_getbest() and disp_ratify() are changing the current thread
2390 	 * to the stolen thread. This may lead to a situation where
2391 	 * cpu_resched() tries to preempt the wrong thread and the
2392 	 * stolen thread continues to run on the CPU which has been tagged
2393 	 * for preemption.
2394 	 * Later the clock thread gets enqueued but doesn't get to run on the
2395 	 * CPU causing the system to hang.
2396 	 *
2397 	 * To avoid this, grabbing and dropping the disp_lock (which does
2398 	 * a memory barrier) is needed to synchronize the execution of
2399 	 * cpu_resched() with disp_getbest() and disp_ratify() and
2400 	 * synchronize the memory read and written by cpu_resched(),
2401 	 * disp_getbest(), and disp_ratify() with each other.
2402 	 *  (see CR#6482861 for more details).
2403 	 */
2404 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2405 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2406 
2407 	ASSERT(pri == DISP_PRIO(tp));
2408 
2409 	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2410 
2411 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
2412 
2413 	/*
2414 	 * Return with spl high so that swtch() won't need to raise it.
2415 	 * The disp_lock was dropped by dispdeq().
2416 	 */
2417 
2418 	return (tp);
2419 }
2420 
2421 /*
2422  * disp_bound_common() - common routine for higher level functions
2423  *	that check for bound threads under certain conditions.
2424  *	If 'threadlistsafe' is set then there is no need to acquire
2425  *	pidlock to stop the thread list from changing (eg, if
2426  *	disp_bound_* is called with cpus paused).
2427  */
2428 static int
2429 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2430 {
2431 	int		found = 0;
2432 	kthread_t	*tp;
2433 
2434 	ASSERT(flag);
2435 
2436 	if (!threadlistsafe)
2437 		mutex_enter(&pidlock);
2438 	tp = curthread;		/* faster than allthreads */
2439 	do {
2440 		if (tp->t_state != TS_FREE) {
2441 			/*
2442 			 * If an interrupt thread is busy, but the
2443 			 * caller doesn't care (i.e. BOUND_INTR is off),
2444 			 * then just ignore it and continue through.
2445 			 */
2446 			if ((tp->t_flag & T_INTR_THREAD) &&
2447 			    !(flag & BOUND_INTR))
2448 				continue;
2449 
2450 			/*
2451 			 * Skip the idle thread for the CPU
2452 			 * we're about to set offline.
2453 			 */
2454 			if (tp == cp->cpu_idle_thread)
2455 				continue;
2456 
2457 			/*
2458 			 * Skip the pause thread for the CPU
2459 			 * we're about to set offline.
2460 			 */
2461 			if (tp == cp->cpu_pause_thread)
2462 				continue;
2463 
2464 			if ((flag & BOUND_CPU) &&
2465 			    (tp->t_bound_cpu == cp ||
2466 			    tp->t_bind_cpu == cp->cpu_id ||
2467 			    tp->t_weakbound_cpu == cp)) {
2468 				found = 1;
2469 				break;
2470 			}
2471 
2472 			if ((flag & BOUND_PARTITION) &&
2473 			    (tp->t_cpupart == cp->cpu_part)) {
2474 				found = 1;
2475 				break;
2476 			}
2477 		}
2478 	} while ((tp = tp->t_next) != curthread && found == 0);
2479 	if (!threadlistsafe)
2480 		mutex_exit(&pidlock);
2481 	return (found);
2482 }
2483 
2484 /*
2485  * disp_bound_threads - return nonzero if threads are bound to the processor.
2486  *	Called infrequently.  Keep this simple.
2487  *	Includes threads that are asleep or stopped but not onproc.
2488  */
2489 int
2490 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2491 {
2492 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2493 }
2494 
2495 /*
2496  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2497  * to the given processor, including interrupt threads.
2498  */
2499 int
2500 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2501 {
2502 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2503 }
2504 
2505 /*
2506  * disp_bound_partition - return nonzero if threads are bound to the same
2507  * partition as the processor.
2508  *	Called infrequently.  Keep this simple.
2509  *	Includes threads that are asleep or stopped but not onproc.
2510  */
2511 int
2512 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2513 {
2514 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2515 }
2516 
2517 /*
2518  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2519  * threads to other CPUs.
2520  */
2521 void
2522 disp_cpu_inactive(cpu_t *cp)
2523 {
2524 	kthread_t	*tp;
2525 	disp_t		*dp = cp->cpu_disp;
2526 	dispq_t		*dq;
2527 	pri_t		pri;
2528 	int		wasonq;
2529 
2530 	disp_lock_enter(&dp->disp_lock);
2531 	while ((pri = dp->disp_max_unbound_pri) != -1) {
2532 		dq = &dp->disp_q[pri];
2533 		tp = dq->dq_first;
2534 
2535 		/*
2536 		 * Skip over bound threads.
2537 		 */
2538 		while (tp != NULL && tp->t_bound_cpu != NULL) {
2539 			tp = tp->t_link;
2540 		}
2541 
2542 		if (tp == NULL) {
2543 			/* disp_max_unbound_pri must be inaccurate, so fix it */
2544 			disp_fix_unbound_pri(dp, pri);
2545 			continue;
2546 		}
2547 
2548 		wasonq = dispdeq(tp);		/* drops disp_lock */
2549 		ASSERT(wasonq);
2550 		ASSERT(tp->t_weakbound_cpu == NULL);
2551 
2552 		setbackdq(tp);
2553 		/*
2554 		 * Called from cpu_offline:
2555 		 *
2556 		 * cp has already been removed from the list of active cpus
2557 		 * and tp->t_cpu has been changed so there is no risk of
2558 		 * tp ending up back on cp.
2559 		 *
2560 		 * Called from cpupart_move_cpu:
2561 		 *
2562 		 * The cpu has moved to a new cpupart.  Any threads that
2563 		 * were on it's dispatch queues before the move remain
2564 		 * in the old partition and can't run in the new partition.
2565 		 */
2566 		ASSERT(tp->t_cpu != cp);
2567 		thread_unlock(tp);
2568 
2569 		disp_lock_enter(&dp->disp_lock);
2570 	}
2571 	disp_lock_exit(&dp->disp_lock);
2572 }
2573 
2574 /*
2575  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2576  *	The hint passed in is used as a starting point so we don't favor
2577  *	CPU 0 or any other CPU.  The caller should pass in the most recently
2578  *	used CPU for the thread.
2579  *
2580  *	The lgroup and priority are used to determine the best CPU to run on
2581  *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
2582  *	the thread priority will indicate whether the thread will actually run
2583  *	there.  To pick the best CPU, the CPUs inside and outside of the given
2584  *	lgroup which are running the lowest priority threads are found.  The
2585  *	remote CPU is chosen only if the thread will not run locally on a CPU
2586  *	within the lgroup, but will run on the remote CPU. If the thread
2587  *	cannot immediately run on any CPU, the best local CPU will be chosen.
2588  *
2589  *	The lpl specified also identifies the cpu partition from which
2590  *	disp_lowpri_cpu should select a CPU.
2591  *
2592  *	curcpu is used to indicate that disp_lowpri_cpu is being called on
2593  *      behalf of the current thread. (curthread is looking for a new cpu)
2594  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2595  *      ignored.
2596  *
2597  *      If a cpu is the target of an offline request then try to avoid it.
2598  *
2599  *	This function must be called at either high SPL, or with preemption
2600  *	disabled, so that the "hint" CPU cannot be removed from the online
2601  *	CPU list while we are traversing it.
2602  */
2603 cpu_t *
2604 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2605 {
2606 	cpu_t	*bestcpu;
2607 	cpu_t	*besthomecpu;
2608 	cpu_t   *cp, *cpstart;
2609 
2610 	pri_t   bestpri;
2611 	pri_t   cpupri;
2612 
2613 	klgrpset_t	done;
2614 	klgrpset_t	cur_set;
2615 
2616 	lpl_t		*lpl_iter, *lpl_leaf;
2617 	int		i;
2618 
2619 	/*
2620 	 * Scan for a CPU currently running the lowest priority thread.
2621 	 * Cannot get cpu_lock here because it is adaptive.
2622 	 * We do not require lock on CPU list.
2623 	 */
2624 	ASSERT(hint != NULL);
2625 	ASSERT(lpl != NULL);
2626 	ASSERT(lpl->lpl_ncpu > 0);
2627 
2628 	/*
2629 	 * First examine local CPUs. Note that it's possible the hint CPU
2630 	 * passed in in remote to the specified home lgroup. If our priority
2631 	 * isn't sufficient enough such that we can run immediately at home,
2632 	 * then examine CPUs remote to our home lgroup.
2633 	 * We would like to give preference to CPUs closest to "home".
2634 	 * If we can't find a CPU where we'll run at a given level
2635 	 * of locality, we expand our search to include the next level.
2636 	 */
2637 	bestcpu = besthomecpu = NULL;
2638 	klgrpset_clear(done);
2639 	/* start with lpl we were passed */
2640 
2641 	lpl_iter = lpl;
2642 
2643 	do {
2644 
2645 		bestpri = SHRT_MAX;
2646 		klgrpset_clear(cur_set);
2647 
2648 		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2649 			lpl_leaf = lpl_iter->lpl_rset[i];
2650 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2651 				continue;
2652 
2653 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2654 
2655 			if (hint->cpu_lpl == lpl_leaf)
2656 				cp = cpstart = hint;
2657 			else
2658 				cp = cpstart = lpl_leaf->lpl_cpus;
2659 
2660 			do {
2661 				if (cp == curcpu)
2662 					cpupri = -1;
2663 				else if (cp == cpu_inmotion)
2664 					cpupri = SHRT_MAX;
2665 				else
2666 					cpupri = cp->cpu_dispatch_pri;
2667 				if (cp->cpu_disp->disp_maxrunpri > cpupri)
2668 					cpupri = cp->cpu_disp->disp_maxrunpri;
2669 				if (cp->cpu_chosen_level > cpupri)
2670 					cpupri = cp->cpu_chosen_level;
2671 				if (cpupri < bestpri) {
2672 					if (CPU_IDLING(cpupri)) {
2673 						ASSERT((cp->cpu_flags &
2674 						    CPU_QUIESCED) == 0);
2675 						return (cp);
2676 					}
2677 					bestcpu = cp;
2678 					bestpri = cpupri;
2679 				}
2680 			} while ((cp = cp->cpu_next_lpl) != cpstart);
2681 		}
2682 
2683 		if (bestcpu && (tpri > bestpri)) {
2684 			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2685 			return (bestcpu);
2686 		}
2687 		if (besthomecpu == NULL)
2688 			besthomecpu = bestcpu;
2689 		/*
2690 		 * Add the lgrps we just considered to the "done" set
2691 		 */
2692 		klgrpset_or(done, cur_set);
2693 
2694 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2695 
2696 	/*
2697 	 * The specified priority isn't high enough to run immediately
2698 	 * anywhere, so just return the best CPU from the home lgroup.
2699 	 */
2700 	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2701 	return (besthomecpu);
2702 }
2703 
/*
 * Generic idle cpu function, usable by all processors.  It intentionally
 * does nothing.
 *
 * A processor that has specific work to do when idle (say, stopping the
 * pipeline to save power) should define its own routine in the
 * processor-specific code (module_xx.c) and set the global variable
 * idle_cpu to that routine.
 */
static void
generic_idle_cpu(void)
{
}
2715 
/*
 * Generic enqueue hook that intentionally does nothing; both arguments
 * are ignored, hence the ARGSUSED lint directive.
 * NOTE(review): presumably the default counterpart of generic_idle_cpu()
 * for a platform-overridable "thread enqueued" hook -- confirm against
 * the place where this function is installed.
 */
/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}
2721 
2722 /*
2723  * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
2724  *	- t->t_cpu is not in this thread's assigned lgrp
2725  *	- the time since the thread last came off t->t_cpu exceeds the
2726  *	  rechoose time for this cpu (ignore this if t is curthread in
2727  *	  which case it's on CPU and t->t_disp_time is inaccurate)
2728  *	- t->t_cpu is presently the target of an offline or partition move
2729  *	  request
2730  */
2731 static cpu_t *
2732 cpu_choose(kthread_t *t, pri_t tpri)
2733 {
2734 	ASSERT(tpri < kpqpri);
2735 
2736 	if ((((lbolt - t->t_disp_time) > rechoose_interval) &&
2737 	    t != curthread) || t->t_cpu == cpu_inmotion) {
2738 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
2739 	}
2740 
2741 	/*
2742 	 * Take a trip through disp_lowpri_cpu() if the thread was
2743 	 * running outside it's home lgroup
2744 	 */
2745 	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
2746 	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
2747 		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
2748 		    (t == curthread) ? t->t_cpu : NULL));
2749 	}
2750 	return (t->t_cpu);
2751 }
2752