xref: /titanic_44/usr/src/uts/common/disp/disp.c (revision 73a8c195d0dfb19a2b8814d9df1ae6459c88d5a6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/signal.h>
34 #include <sys/user.h>
35 #include <sys/systm.h>
36 #include <sys/sysinfo.h>
37 #include <sys/var.h>
38 #include <sys/errno.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/inline.h>
42 #include <sys/disp.h>
43 #include <sys/class.h>
44 #include <sys/bitmap.h>
45 #include <sys/kmem.h>
46 #include <sys/cpuvar.h>
47 #include <sys/vtrace.h>
48 #include <sys/tnf.h>
49 #include <sys/cpupart.h>
50 #include <sys/lgrp.h>
51 #include <sys/pg.h>
52 #include <sys/cmt.h>
53 #include <sys/bitset.h>
54 #include <sys/schedctl.h>
55 #include <sys/atomic.h>
56 #include <sys/dtrace.h>
57 #include <sys/sdt.h>
58 #include <sys/archsystm.h>
59 
60 #include <vm/as.h>
61 
62 #define	BOUND_CPU	0x1
63 #define	BOUND_PARTITION	0x2
64 #define	BOUND_INTR	0x4
65 
66 /* Dispatch queue allocation structure and functions */
67 struct disp_queue_info {
68 	disp_t	*dp;
69 	dispq_t *olddispq;
70 	dispq_t *newdispq;
71 	ulong_t	*olddqactmap;
72 	ulong_t	*newdqactmap;
73 	int	oldnglobpris;
74 };
75 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
76     disp_t *dp);
77 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
78 static void	disp_dq_free(struct disp_queue_info *dptr);
79 
80 /* platform-specific routine to call when processor is idle */
81 static void	generic_idle_cpu();
82 void		(*idle_cpu)() = generic_idle_cpu;
83 
84 /* routines invoked when a CPU enters/exits the idle loop */
85 static void	idle_enter();
86 static void	idle_exit();
87 
88 /* platform-specific routine to call when thread is enqueued */
89 static void	generic_enq_thread(cpu_t *, int);
90 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
91 
92 pri_t	kpreemptpri;		/* priority where kernel preemption applies */
93 pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
94 pri_t	intr_pri;		/* interrupt thread priority base level */
95 
96 #define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
97 pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
98 disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
99 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
100 int	nswapped;		/* total number of swapped threads */
101 void	disp_swapped_enq(kthread_t *tp);
102 static void	disp_swapped_setrun(kthread_t *tp);
103 static void	cpu_resched(cpu_t *cp, pri_t tpri);
104 
105 /*
106  * If this is set, only interrupt threads will cause kernel preemptions.
107  * This is done by changing the value of kpreemptpri.  kpreemptpri
108  * will either be the max sysclass pri + 1 or the min interrupt pri.
109  */
110 int	only_intr_kpreempt;
111 
112 extern void set_idle_cpu(int cpun);
113 extern void unset_idle_cpu(int cpun);
114 static void setkpdq(kthread_t *tp, int borf);
115 #define	SETKP_BACK	0
116 #define	SETKP_FRONT	1
117 /*
118  * Parameter that determines how recently a thread must have run
119  * on the CPU to be considered loosely-bound to that CPU to reduce
120  * cold cache effects.  The interval is in hertz.
121  */
122 #define	RECHOOSE_INTERVAL 3
123 int	rechoose_interval = RECHOOSE_INTERVAL;
124 
/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * be sitting on a run queue before it can be stolen by another CPU,
 * in order to reduce migrations.
 *
 * nosteal_nsec should be set to an appropriate value by the platform code
 * in cmp_set_nosteal_interval().  It is set to NOSTEAL_UNINITIALIZED
 * here to indicate that it has not been initialized yet.
 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 */
136 #define	NOSTEAL_UNINITIALIZED	(-1)
137 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
138 extern void cmp_set_nosteal_interval(void);
139 
140 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
141 
142 disp_lock_t	transition_lock;	/* lock on transitioning threads */
143 disp_lock_t	stop_lock;		/* lock on stopped threads */
144 
145 static void	cpu_dispqalloc(int numpris);
146 
147 /*
148  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
149  * a thread because it was sitting on its run queue for a very short
150  * period of time.
151  */
152 #define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
153 
154 static kthread_t	*disp_getwork(cpu_t *to);
155 static kthread_t	*disp_getbest(disp_t *from);
156 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
157 
158 void	swtch_to(kthread_t *);
159 
160 /*
161  * dispatcher and scheduler initialization
162  */
163 
164 /*
165  * disp_setup - Common code to calculate and allocate dispatcher
166  *		variables and structures based on the maximum priority.
167  */
168 static void
169 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
170 {
171 	pri_t	newnglobpris;
172 
173 	ASSERT(MUTEX_HELD(&cpu_lock));
174 
175 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
176 
177 	if (newnglobpris > oldnglobpris) {
178 		/*
179 		 * Allocate new kp queues for each CPU partition.
180 		 */
181 		cpupart_kpqalloc(newnglobpris);
182 
183 		/*
184 		 * Allocate new dispatch queues for each CPU.
185 		 */
186 		cpu_dispqalloc(newnglobpris);
187 
188 		/*
189 		 * compute new interrupt thread base priority
190 		 */
191 		intr_pri = maxglobpri;
192 		if (only_intr_kpreempt) {
193 			kpreemptpri = intr_pri + 1;
194 			if (kpqpri == KPQPRI)
195 				kpqpri = kpreemptpri;
196 		}
197 		v.v_nglobpris = newnglobpris;
198 	}
199 }
200 
201 /*
202  * dispinit - Called to initialize all loaded classes and the
203  *	      dispatcher framework.
204  */
205 void
206 dispinit(void)
207 {
208 	id_t	cid;
209 	pri_t	maxglobpri;
210 	pri_t	cl_maxglobpri;
211 
212 	maxglobpri = -1;
213 
214 	/*
215 	 * Initialize transition lock, which will always be set.
216 	 */
217 	DISP_LOCK_INIT(&transition_lock);
218 	disp_lock_enter_high(&transition_lock);
219 	DISP_LOCK_INIT(&stop_lock);
220 
221 	mutex_enter(&cpu_lock);
222 	CPU->cpu_disp->disp_maxrunpri = -1;
223 	CPU->cpu_disp->disp_max_unbound_pri = -1;
224 
225 	/*
226 	 * Initialize the default CPU partition.
227 	 */
228 	cpupart_initialize_default();
229 	/*
230 	 * Call the class specific initialization functions for
231 	 * all pre-installed schedulers.
232 	 *
233 	 * We pass the size of a class specific parameter
234 	 * buffer to each of the initialization functions
235 	 * to try to catch problems with backward compatibility
236 	 * of class modules.
237 	 *
238 	 * For example a new class module running on an old system
239 	 * which didn't provide sufficiently large parameter buffers
240 	 * would be bad news. Class initialization modules can check for
241 	 * this and take action if they detect a problem.
242 	 */
243 
244 	for (cid = 0; cid < nclass; cid++) {
245 		sclass_t	*sc;
246 
247 		sc = &sclass[cid];
248 		if (SCHED_INSTALLED(sc)) {
249 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
250 			    &sc->cl_funcs);
251 			if (cl_maxglobpri > maxglobpri)
252 				maxglobpri = cl_maxglobpri;
253 		}
254 	}
255 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
256 	if (kpqpri == KPQPRI)
257 		kpqpri = kpreemptpri;
258 
259 	ASSERT(maxglobpri >= 0);
260 	disp_setup(maxglobpri, 0);
261 
262 	mutex_exit(&cpu_lock);
263 
264 	/*
265 	 * Platform specific sticky scheduler setup.
266 	 */
267 	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
268 		cmp_set_nosteal_interval();
269 
270 	/*
271 	 * Get the default class ID; this may be later modified via
272 	 * dispadmin(1M).  This will load the class (normally TS) and that will
273 	 * call disp_add(), which is why we had to drop cpu_lock first.
274 	 */
275 	if (getcid(defaultclass, &defaultcid) != 0) {
276 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
277 		    defaultclass);
278 	}
279 }
280 
281 /*
282  * disp_add - Called with class pointer to initialize the dispatcher
283  *	      for a newly loaded class.
284  */
285 void
286 disp_add(sclass_t *clp)
287 {
288 	pri_t	maxglobpri;
289 	pri_t	cl_maxglobpri;
290 
291 	mutex_enter(&cpu_lock);
292 	/*
293 	 * Initialize the scheduler class.
294 	 */
295 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
296 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
297 	if (cl_maxglobpri > maxglobpri)
298 		maxglobpri = cl_maxglobpri;
299 
300 	/*
301 	 * Save old queue information.  Since we're initializing a
302 	 * new scheduling class which has just been loaded, then
303 	 * the size of the dispq may have changed.  We need to handle
304 	 * that here.
305 	 */
306 	disp_setup(maxglobpri, v.v_nglobpris);
307 
308 	mutex_exit(&cpu_lock);
309 }
310 
311 
312 /*
313  * For each CPU, allocate new dispatch queues
314  * with the stated number of priorities.
315  */
316 static void
317 cpu_dispqalloc(int numpris)
318 {
319 	cpu_t	*cpup;
320 	struct disp_queue_info	*disp_mem;
321 	int i, num;
322 
323 	ASSERT(MUTEX_HELD(&cpu_lock));
324 
325 	disp_mem = kmem_zalloc(NCPU *
326 	    sizeof (struct disp_queue_info), KM_SLEEP);
327 
328 	/*
329 	 * This routine must allocate all of the memory before stopping
330 	 * the cpus because it must not sleep in kmem_alloc while the
331 	 * CPUs are stopped.  Locks they hold will not be freed until they
332 	 * are restarted.
333 	 */
334 	i = 0;
335 	cpup = cpu_list;
336 	do {
337 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
338 		i++;
339 		cpup = cpup->cpu_next;
340 	} while (cpup != cpu_list);
341 	num = i;
342 
343 	pause_cpus(NULL);
344 	for (i = 0; i < num; i++)
345 		disp_dq_assign(&disp_mem[i], numpris);
346 	start_cpus();
347 
348 	/*
349 	 * I must free all of the memory after starting the cpus because
350 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
351 	 */
352 	for (i = 0; i < num; i++)
353 		disp_dq_free(&disp_mem[i]);
354 
355 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
356 }
357 
358 static void
359 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
360 {
361 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
362 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
363 	    sizeof (long), KM_SLEEP);
364 	dptr->dp = dp;
365 }
366 
367 static void
368 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
369 {
370 	disp_t	*dp;
371 
372 	dp = dptr->dp;
373 	dptr->olddispq = dp->disp_q;
374 	dptr->olddqactmap = dp->disp_qactmap;
375 	dptr->oldnglobpris = dp->disp_npri;
376 
377 	ASSERT(dptr->oldnglobpris < numpris);
378 
379 	if (dptr->olddispq != NULL) {
380 		/*
381 		 * Use kcopy because bcopy is platform-specific
382 		 * and could block while we might have paused the cpus.
383 		 */
384 		(void) kcopy(dptr->olddispq, dptr->newdispq,
385 		    dptr->oldnglobpris * sizeof (dispq_t));
386 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
387 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
388 		    sizeof (long));
389 	}
390 	dp->disp_q = dptr->newdispq;
391 	dp->disp_qactmap = dptr->newdqactmap;
392 	dp->disp_q_limit = &dptr->newdispq[numpris];
393 	dp->disp_npri = numpris;
394 }
395 
396 static void
397 disp_dq_free(struct disp_queue_info *dptr)
398 {
399 	if (dptr->olddispq != NULL)
400 		kmem_free(dptr->olddispq,
401 		    dptr->oldnglobpris * sizeof (dispq_t));
402 	if (dptr->olddqactmap != NULL)
403 		kmem_free(dptr->olddqactmap,
404 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
405 }
406 
407 /*
408  * For a newly created CPU, initialize the dispatch queue.
409  * This is called before the CPU is known through cpu[] or on any lists.
410  */
411 void
412 disp_cpu_init(cpu_t *cp)
413 {
414 	disp_t	*dp;
415 	dispq_t	*newdispq;
416 	ulong_t	*newdqactmap;
417 
418 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
419 
420 	if (cp == cpu0_disp.disp_cpu)
421 		dp = &cpu0_disp;
422 	else
423 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
424 	bzero(dp, sizeof (disp_t));
425 	cp->cpu_disp = dp;
426 	dp->disp_cpu = cp;
427 	dp->disp_maxrunpri = -1;
428 	dp->disp_max_unbound_pri = -1;
429 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
430 	/*
431 	 * Allocate memory for the dispatcher queue headers
432 	 * and the active queue bitmap.
433 	 */
434 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
435 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
436 	    sizeof (long), KM_SLEEP);
437 	dp->disp_q = newdispq;
438 	dp->disp_qactmap = newdqactmap;
439 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
440 	dp->disp_npri = v.v_nglobpris;
441 }
442 
443 void
444 disp_cpu_fini(cpu_t *cp)
445 {
446 	ASSERT(MUTEX_HELD(&cpu_lock));
447 
448 	disp_kp_free(cp->cpu_disp);
449 	if (cp->cpu_disp != &cpu0_disp)
450 		kmem_free(cp->cpu_disp, sizeof (disp_t));
451 }
452 
453 /*
454  * Allocate new, larger kpreempt dispatch queue to replace the old one.
455  */
456 void
457 disp_kp_alloc(disp_t *dq, pri_t npri)
458 {
459 	struct disp_queue_info	mem_info;
460 
461 	if (npri > dq->disp_npri) {
462 		/*
463 		 * Allocate memory for the new array.
464 		 */
465 		disp_dq_alloc(&mem_info, npri, dq);
466 
467 		/*
468 		 * We need to copy the old structures to the new
469 		 * and free the old.
470 		 */
471 		disp_dq_assign(&mem_info, npri);
472 		disp_dq_free(&mem_info);
473 	}
474 }
475 
476 /*
477  * Free dispatch queue.
478  * Used for the kpreempt queues for a removed CPU partition and
479  * for the per-CPU queues of deleted CPUs.
480  */
481 void
482 disp_kp_free(disp_t *dq)
483 {
484 	struct disp_queue_info	mem_info;
485 
486 	mem_info.olddispq = dq->disp_q;
487 	mem_info.olddqactmap = dq->disp_qactmap;
488 	mem_info.oldnglobpris = dq->disp_npri;
489 	disp_dq_free(&mem_info);
490 }
491 
492 /*
493  * End dispatcher and scheduler initialization.
494  */
495 
496 /*
497  * See if there's anything to do other than remain idle.
498  * Return non-zero if there is.
499  *
500  * This function must be called with high spl, or with
501  * kernel preemption disabled to prevent the partition's
502  * active cpu list from changing while being traversed.
503  *
504  * This is essentially a simpler version of disp_getwork()
505  * to be called by CPUs preparing to "halt".
506  */
507 int
508 disp_anywork(void)
509 {
510 	cpu_t		*cp = CPU;
511 	cpu_t		*ocp;
512 	volatile int	*local_nrunnable = &cp->cpu_disp->disp_nrunnable;
513 
514 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
515 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
516 			return (1);
517 
518 		for (ocp = cp->cpu_next_part; ocp != cp;
519 		    ocp = ocp->cpu_next_part) {
520 			ASSERT(CPU_ACTIVE(ocp));
521 
522 			/*
523 			 * Something has appeared on the local run queue.
524 			 */
525 			if (*local_nrunnable > 0)
526 				return (1);
527 			/*
528 			 * If we encounter another idle CPU that will
529 			 * soon be trolling around through disp_anywork()
530 			 * terminate our walk here and let this other CPU
531 			 * patrol the next part of the list.
532 			 */
533 			if (ocp->cpu_dispatch_pri == -1 &&
534 			    (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
535 				return (0);
536 			/*
537 			 * Work can be taken from another CPU if:
538 			 *	- There is unbound work on the run queue
539 			 *	- That work isn't a thread undergoing a
540 			 *	- context switch on an otherwise empty queue.
541 			 *	- The CPU isn't running the idle loop.
542 			 */
543 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
544 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
545 			    ocp->cpu_disp->disp_nrunnable == 1) &&
546 			    ocp->cpu_dispatch_pri != -1)
547 				return (1);
548 		}
549 	}
550 	return (0);
551 }
552 
553 /*
554  * Called when CPU enters the idle loop
555  */
556 static void
557 idle_enter()
558 {
559 	cpu_t		*cp = CPU;
560 
561 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
562 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
563 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
564 }
565 
566 /*
567  * Called when CPU exits the idle loop
568  */
569 static void
570 idle_exit()
571 {
572 	cpu_t		*cp = CPU;
573 
574 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
575 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
576 }
577 
578 /*
579  * Idle loop.
580  */
581 void
582 idle()
583 {
584 	struct cpu	*cp = CPU;		/* pointer to this CPU */
585 	kthread_t	*t;			/* taken thread */
586 
587 	idle_enter();
588 
589 	/*
590 	 * Uniprocessor version of idle loop.
591 	 * Do this until notified that we're on an actual multiprocessor.
592 	 */
593 	while (ncpus == 1) {
594 		if (cp->cpu_disp->disp_nrunnable == 0) {
595 			(*idle_cpu)();
596 			continue;
597 		}
598 		idle_exit();
599 		swtch();
600 
601 		idle_enter(); /* returned from swtch */
602 	}
603 
604 	/*
605 	 * Multiprocessor idle loop.
606 	 */
607 	for (;;) {
608 		/*
609 		 * If CPU is completely quiesced by p_online(2), just wait
610 		 * here with minimal bus traffic until put online.
611 		 */
612 		while (cp->cpu_flags & CPU_QUIESCED)
613 			(*idle_cpu)();
614 
615 		if (cp->cpu_disp->disp_nrunnable != 0) {
616 			idle_exit();
617 			swtch();
618 		} else {
619 			if (cp->cpu_flags & CPU_OFFLINE)
620 				continue;
621 			if ((t = disp_getwork(cp)) == NULL) {
622 				if (cp->cpu_chosen_level != -1) {
623 					disp_t *dp = cp->cpu_disp;
624 					disp_t *kpq;
625 
626 					disp_lock_enter(&dp->disp_lock);
627 					/*
628 					 * Set kpq under lock to prevent
629 					 * migration between partitions.
630 					 */
631 					kpq = &cp->cpu_part->cp_kp_queue;
632 					if (kpq->disp_maxrunpri == -1)
633 						cp->cpu_chosen_level = -1;
634 					disp_lock_exit(&dp->disp_lock);
635 				}
636 				(*idle_cpu)();
637 				continue;
638 			}
639 			/*
640 			 * If there was a thread but we couldn't steal
641 			 * it, then keep trying.
642 			 */
643 			if (t == T_DONTSTEAL)
644 				continue;
645 			idle_exit();
646 			swtch_to(t);
647 		}
648 		idle_enter(); /* returned from swtch/swtch_to */
649 	}
650 }
651 
652 
653 /*
654  * Preempt the currently running thread in favor of the highest
655  * priority thread.  The class of the current thread controls
656  * where it goes on the dispatcher queues. If panicking, turn
657  * preemption off.
658  */
659 void
660 preempt()
661 {
662 	kthread_t 	*t = curthread;
663 	klwp_t 		*lwp = ttolwp(curthread);
664 
665 	if (panicstr)
666 		return;
667 
668 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
669 
670 	thread_lock(t);
671 
672 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
673 		/*
674 		 * this thread has already been chosen to be run on
675 		 * another CPU. Clear kprunrun on this CPU since we're
676 		 * already headed for swtch().
677 		 */
678 		CPU->cpu_kprunrun = 0;
679 		thread_unlock_nopreempt(t);
680 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
681 	} else {
682 		if (lwp != NULL)
683 			lwp->lwp_ru.nivcsw++;
684 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
685 		THREAD_TRANSITION(t);
686 		CL_PREEMPT(t);
687 		DTRACE_SCHED(preempt);
688 		thread_unlock_nopreempt(t);
689 
690 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
691 
692 		swtch();		/* clears CPU->cpu_runrun via disp() */
693 	}
694 }
695 
696 extern kthread_t *thread_unpin();
697 
698 /*
699  * disp() - find the highest priority thread for this processor to run, and
700  * set it in TS_ONPROC state so that resume() can be called to run it.
701  */
702 static kthread_t *
703 disp()
704 {
705 	cpu_t		*cpup;
706 	disp_t		*dp;
707 	kthread_t	*tp;
708 	dispq_t		*dq;
709 	int		maxrunword;
710 	pri_t		pri;
711 	disp_t		*kpq;
712 
713 	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
714 
715 	cpup = CPU;
716 	/*
717 	 * Find the highest priority loaded, runnable thread.
718 	 */
719 	dp = cpup->cpu_disp;
720 
721 reschedule:
722 	/*
723 	 * If there is more important work on the global queue with a better
724 	 * priority than the maximum on this CPU, take it now.
725 	 */
726 	kpq = &cpup->cpu_part->cp_kp_queue;
727 	while ((pri = kpq->disp_maxrunpri) >= 0 &&
728 	    pri >= dp->disp_maxrunpri &&
729 	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
730 	    (tp = disp_getbest(kpq)) != NULL) {
731 		if (disp_ratify(tp, kpq) != NULL) {
732 			TRACE_1(TR_FAC_DISP, TR_DISP_END,
733 			    "disp_end:tid %p", tp);
734 			return (tp);
735 		}
736 	}
737 
738 	disp_lock_enter(&dp->disp_lock);
739 	pri = dp->disp_maxrunpri;
740 
741 	/*
742 	 * If there is nothing to run, look at what's runnable on other queues.
743 	 * Choose the idle thread if the CPU is quiesced.
744 	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
745 	 * interrupt threads, which will be the only threads on the CPU's own
746 	 * queue, but cannot run threads from other queues.
747 	 */
748 	if (pri == -1) {
749 		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
750 			disp_lock_exit(&dp->disp_lock);
751 			if ((tp = disp_getwork(cpup)) == NULL ||
752 			    tp == T_DONTSTEAL) {
753 				tp = cpup->cpu_idle_thread;
754 				(void) splhigh();
755 				THREAD_ONPROC(tp, cpup);
756 				cpup->cpu_dispthread = tp;
757 				cpup->cpu_dispatch_pri = -1;
758 				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
759 				cpup->cpu_chosen_level = -1;
760 			}
761 		} else {
762 			disp_lock_exit_high(&dp->disp_lock);
763 			tp = cpup->cpu_idle_thread;
764 			THREAD_ONPROC(tp, cpup);
765 			cpup->cpu_dispthread = tp;
766 			cpup->cpu_dispatch_pri = -1;
767 			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
768 			cpup->cpu_chosen_level = -1;
769 		}
770 		TRACE_1(TR_FAC_DISP, TR_DISP_END,
771 		    "disp_end:tid %p", tp);
772 		return (tp);
773 	}
774 
775 	dq = &dp->disp_q[pri];
776 	tp = dq->dq_first;
777 
778 	ASSERT(tp != NULL);
779 	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */
780 
781 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
782 
783 	/*
784 	 * Found it so remove it from queue.
785 	 */
786 	dp->disp_nrunnable--;
787 	dq->dq_sruncnt--;
788 	if ((dq->dq_first = tp->t_link) == NULL) {
789 		ulong_t	*dqactmap = dp->disp_qactmap;
790 
791 		ASSERT(dq->dq_sruncnt == 0);
792 		dq->dq_last = NULL;
793 
794 		/*
795 		 * The queue is empty, so the corresponding bit needs to be
796 		 * turned off in dqactmap.   If nrunnable != 0 just took the
797 		 * last runnable thread off the
798 		 * highest queue, so recompute disp_maxrunpri.
799 		 */
800 		maxrunword = pri >> BT_ULSHIFT;
801 		dqactmap[maxrunword] &= ~BT_BIW(pri);
802 
803 		if (dp->disp_nrunnable == 0) {
804 			dp->disp_max_unbound_pri = -1;
805 			dp->disp_maxrunpri = -1;
806 		} else {
807 			int ipri;
808 
809 			ipri = bt_gethighbit(dqactmap, maxrunword);
810 			dp->disp_maxrunpri = ipri;
811 			if (ipri < dp->disp_max_unbound_pri)
812 				dp->disp_max_unbound_pri = ipri;
813 		}
814 	} else {
815 		tp->t_link = NULL;
816 	}
817 
818 	/*
819 	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
820 	 * out this thread before we have a chance to run it.
821 	 * While running, it is protected against swapping by t_lock.
822 	 */
823 	tp->t_schedflag |= TS_DONT_SWAP;
824 	cpup->cpu_dispthread = tp;		/* protected by spl only */
825 	cpup->cpu_dispatch_pri = pri;
826 	ASSERT(pri == DISP_PRIO(tp));
827 	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
828 	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */
829 
830 	ASSERT(tp != NULL);
831 	TRACE_1(TR_FAC_DISP, TR_DISP_END,
832 	    "disp_end:tid %p", tp);
833 
834 	if (disp_ratify(tp, kpq) == NULL)
835 		goto reschedule;
836 
837 	return (tp);
838 }
839 
840 /*
841  * swtch()
842  *	Find best runnable thread and run it.
843  *	Called with the current thread already switched to a new state,
844  *	on a sleep queue, run queue, stopped, and not zombied.
845  *	May be called at any spl level less than or equal to LOCK_LEVEL.
846  *	Always drops spl to the base level (spl0()).
847  */
848 void
849 swtch()
850 {
851 	kthread_t	*t = curthread;
852 	kthread_t	*next;
853 	cpu_t		*cp;
854 
855 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
856 
857 	if (t->t_flag & T_INTR_THREAD)
858 		cpu_intr_swtch_enter(t);
859 
860 	if (t->t_intr != NULL) {
861 		/*
862 		 * We are an interrupt thread.  Setup and return
863 		 * the interrupted thread to be resumed.
864 		 */
865 		(void) splhigh();	/* block other scheduler action */
866 		cp = CPU;		/* now protected against migration */
867 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
868 		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
869 		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
870 		next = thread_unpin();
871 		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
872 		resume_from_intr(next);
873 	} else {
874 #ifdef	DEBUG
875 		if (t->t_state == TS_ONPROC &&
876 		    t->t_disp_queue->disp_cpu == CPU &&
877 		    t->t_preempt == 0) {
878 			thread_lock(t);
879 			ASSERT(t->t_state != TS_ONPROC ||
880 			    t->t_disp_queue->disp_cpu != CPU ||
881 			    t->t_preempt != 0);	/* cannot migrate */
882 			thread_unlock_nopreempt(t);
883 		}
884 #endif	/* DEBUG */
885 		cp = CPU;
886 		next = disp();		/* returns with spl high */
887 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
888 
889 		/* OK to steal anything left on run queue */
890 		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
891 
892 		if (next != t) {
893 			hrtime_t now;
894 
895 			now = gethrtime_unscaled();
896 			pg_ev_thread_swtch(cp, now, t, next);
897 
898 			/*
899 			 * If t was previously in the TS_ONPROC state,
900 			 * setfrontdq and setbackdq won't have set its t_waitrq.
901 			 * Since we now finally know that we're switching away
902 			 * from this thread, set its t_waitrq if it is on a run
903 			 * queue.
904 			 */
905 			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
906 				t->t_waitrq = now;
907 			}
908 
909 			/*
910 			 * restore mstate of thread that we are switching to
911 			 */
912 			restore_mstate(next);
913 
914 			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
915 			cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
916 			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
917 
918 			if (dtrace_vtime_active)
919 				dtrace_vtime_switch(next);
920 
921 			resume(next);
922 			/*
923 			 * The TR_RESUME_END and TR_SWTCH_END trace points
924 			 * appear at the end of resume(), because we may not
925 			 * return here
926 			 */
927 		} else {
928 			if (t->t_flag & T_INTR_THREAD)
929 				cpu_intr_swtch_exit(t);
930 			/*
931 			 * Threads that enqueue themselves on a run queue defer
932 			 * setting t_waitrq. It is then either set in swtch()
933 			 * when the CPU is actually yielded, or not at all if it
934 			 * is remaining on the CPU.
935 			 * There is however a window between where the thread
936 			 * placed itself on a run queue, and where it selects
937 			 * itself in disp(), where a third party (eg. clock()
938 			 * doing tick processing) may have re-enqueued this
939 			 * thread, setting t_waitrq in the process. We detect
940 			 * this race by noticing that despite switching to
941 			 * ourself, our t_waitrq has been set, and should be
942 			 * cleared.
943 			 */
944 			if (t->t_waitrq != 0)
945 				t->t_waitrq = 0;
946 
947 			pg_ev_thread_remain(cp, t);
948 
949 			DTRACE_SCHED(remain__cpu);
950 			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
951 			(void) spl0();
952 		}
953 	}
954 }
955 
956 /*
957  * swtch_from_zombie()
958  *	Special case of swtch(), which allows checks for TS_ZOMB to be
959  *	eliminated from normal resume.
960  *	Find best runnable thread and run it.
961  *	Called with the current thread zombied.
962  *	Zombies cannot migrate, so CPU references are safe.
963  */
964 void
965 swtch_from_zombie()
966 {
967 	kthread_t	*next;
968 	cpu_t		*cpu = CPU;
969 
970 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
971 
972 	ASSERT(curthread->t_state == TS_ZOMB);
973 
974 	next = disp();			/* returns with spl high */
975 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
976 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
977 	ASSERT(next != curthread);
978 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
979 
980 	pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
981 
982 	restore_mstate(next);
983 
984 	if (dtrace_vtime_active)
985 		dtrace_vtime_switch(next);
986 
987 	resume_from_zombie(next);
988 	/*
989 	 * The TR_RESUME_END and TR_SWTCH_END trace points
990 	 * appear at the end of resume(), because we certainly will not
991 	 * return here
992 	 */
993 }
994 
995 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
996 
997 /*
998  * search_disp_queues()
999  *	Search the given dispatch queues for thread tp.
1000  *	Return 1 if tp is found, otherwise return 0.
1001  */
1002 static int
1003 search_disp_queues(disp_t *dp, kthread_t *tp)
1004 {
1005 	dispq_t		*dq;
1006 	dispq_t		*eq;
1007 
1008 	disp_lock_enter_high(&dp->disp_lock);
1009 
1010 	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1011 		kthread_t	*rp;
1012 
1013 		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1014 
1015 		for (rp = dq->dq_first; rp; rp = rp->t_link)
1016 			if (tp == rp) {
1017 				disp_lock_exit_high(&dp->disp_lock);
1018 				return (1);
1019 			}
1020 	}
1021 	disp_lock_exit_high(&dp->disp_lock);
1022 
1023 	return (0);
1024 }
1025 
1026 /*
1027  * thread_on_queue()
1028  *	Search all per-CPU dispatch queues and all partition-wide kpreempt
1029  *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
1030  */
1031 static int
1032 thread_on_queue(kthread_t *tp)
1033 {
1034 	cpu_t		*cp;
1035 	struct cpupart	*part;
1036 
1037 	ASSERT(getpil() >= DISP_LEVEL);
1038 
1039 	/*
1040 	 * Search the per-CPU dispatch queues for tp.
1041 	 */
1042 	cp = CPU;
1043 	do {
1044 		if (search_disp_queues(cp->cpu_disp, tp))
1045 			return (1);
1046 	} while ((cp = cp->cpu_next_onln) != CPU);
1047 
1048 	/*
1049 	 * Search the partition-wide kpreempt queues for tp.
1050 	 */
1051 	part = CPU->cpu_part;
1052 	do {
1053 		if (search_disp_queues(&part->cp_kp_queue, tp))
1054 			return (1);
1055 	} while ((part = part->cp_next) != CPU->cpu_part);
1056 
1057 	return (0);
1058 }
1059 
1060 #else
1061 
1062 #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
1063 
1064 #endif  /* DEBUG */
1065 
1066 /*
1067  * like swtch(), but switch to a specified thread taken from another CPU.
1068  *	called with spl high..
1069  */
1070 void
1071 swtch_to(kthread_t *next)
1072 {
1073 	cpu_t			*cp = CPU;
1074 	hrtime_t		now;
1075 
1076 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1077 
1078 	/*
1079 	 * Update context switch statistics.
1080 	 */
1081 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1082 
1083 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1084 
1085 	now = gethrtime_unscaled();
1086 	pg_ev_thread_swtch(cp, now, curthread, next);
1087 
1088 	/* OK to steal anything left on run queue */
1089 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1090 
1091 	/* record last execution time */
1092 	cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1093 
1094 	/*
1095 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1096 	 * won't have set its t_waitrq.  Since we now finally know that we're
1097 	 * switching away from this thread, set its t_waitrq if it is on a run
1098 	 * queue.
1099 	 */
1100 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1101 		curthread->t_waitrq = now;
1102 	}
1103 
1104 	/* restore next thread to previously running microstate */
1105 	restore_mstate(next);
1106 
1107 	if (dtrace_vtime_active)
1108 		dtrace_vtime_switch(next);
1109 
1110 	resume(next);
1111 	/*
1112 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1113 	 * appear at the end of resume(), because we may not
1114 	 * return here
1115 	 */
1116 }
1117 
1118 #define	CPU_IDLING(pri)	((pri) == -1)
1119 
1120 static void
1121 cpu_resched(cpu_t *cp, pri_t tpri)
1122 {
1123 	int	call_poke_cpu = 0;
1124 	pri_t   cpupri = cp->cpu_dispatch_pri;
1125 
1126 	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1127 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1128 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1129 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1130 			cp->cpu_runrun = 1;
1131 			aston(cp->cpu_dispthread);
1132 			if (tpri < kpreemptpri && cp != CPU)
1133 				call_poke_cpu = 1;
1134 		}
1135 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1136 			cp->cpu_kprunrun = 1;
1137 			if (cp != CPU)
1138 				call_poke_cpu = 1;
1139 		}
1140 	}
1141 
1142 	/*
1143 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1144 	 */
1145 	membar_enter();
1146 
1147 	if (call_poke_cpu)
1148 		poke_cpu(cp->cpu_id);
1149 }
1150 
1151 /*
1152  * setbackdq() keeps runqs balanced such that the difference in length
1153  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1154  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1155  * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1156  * try to keep runqs perfectly balanced regardless of the thread priority.
1157  */
1158 #define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
1159 #define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
1160 #define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1161 
1162 /*
1163  * Macro that evaluates to true if it is likely that the thread has cache
1164  * warmth. This is based on the amount of time that has elapsed since the
1165  * thread last ran. If that amount of time is less than "rechoose_interval"
1166  * ticks, then we decide that the thread has enough cache warmth to warrant
1167  * some affinity for t->t_cpu.
1168  */
1169 #define	THREAD_HAS_CACHE_WARMTH(thread)	\
1170 	((thread == curthread) ||	\
1171 	((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1172 /*
1173  * Put the specified thread on the back of the dispatcher
1174  * queue corresponding to its current priority.
1175  *
1176  * Called with the thread in transition, onproc or stopped state
1177  * and locked (transition implies locked) and at high spl.
1178  * Returns with the thread in TS_RUN state and still locked.
1179  */
1180 void
1181 setbackdq(kthread_t *tp)
1182 {
1183 	dispq_t	*dq;
1184 	disp_t		*dp;
1185 	cpu_t		*cp;
1186 	pri_t		tpri;
1187 	int		bound;
1188 	boolean_t	self;
1189 
1190 	ASSERT(THREAD_LOCK_HELD(tp));
1191 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1192 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1193 
1194 	/*
1195 	 * If thread is "swapped" or on the swap queue don't
1196 	 * queue it, but wake sched.
1197 	 */
1198 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1199 		disp_swapped_setrun(tp);
1200 		return;
1201 	}
1202 
1203 	self = (tp == curthread);
1204 
1205 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1206 		bound = 1;
1207 	else
1208 		bound = 0;
1209 
1210 	tpri = DISP_PRIO(tp);
1211 	if (ncpus == 1)
1212 		cp = tp->t_cpu;
1213 	else if (!bound) {
1214 		if (tpri >= kpqpri) {
1215 			setkpdq(tp, SETKP_BACK);
1216 			return;
1217 		}
1218 
1219 		/*
1220 		 * We'll generally let this thread continue to run where
1221 		 * it last ran...but will consider migration if:
1222 		 * - We thread probably doesn't have much cache warmth.
1223 		 * - The CPU where it last ran is the target of an offline
1224 		 *   request.
1225 		 * - The thread last ran outside it's home lgroup.
1226 		 */
1227 		if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1228 		    (tp->t_cpu == cpu_inmotion)) {
1229 			cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1230 		} else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1231 			cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1232 			    self ? tp->t_cpu : NULL);
1233 		} else {
1234 			cp = tp->t_cpu;
1235 		}
1236 
1237 		if (tp->t_cpupart == cp->cpu_part) {
1238 			int	qlen;
1239 
1240 			/*
1241 			 * Perform any CMT load balancing
1242 			 */
1243 			cp = cmt_balance(tp, cp);
1244 
1245 			/*
1246 			 * Balance across the run queues
1247 			 */
1248 			qlen = RUNQ_LEN(cp, tpri);
1249 			if (tpri >= RUNQ_MATCH_PRI &&
1250 			    !(tp->t_schedflag & TS_RUNQMATCH))
1251 				qlen -= RUNQ_MAX_DIFF;
1252 			if (qlen > 0) {
1253 				cpu_t *newcp;
1254 
1255 				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1256 					newcp = cp->cpu_next_part;
1257 				} else if ((newcp = cp->cpu_next_lpl) == cp) {
1258 					newcp = cp->cpu_next_part;
1259 				}
1260 
1261 				if (RUNQ_LEN(newcp, tpri) < qlen) {
1262 					DTRACE_PROBE3(runq__balance,
1263 					    kthread_t *, tp,
1264 					    cpu_t *, cp, cpu_t *, newcp);
1265 					cp = newcp;
1266 				}
1267 			}
1268 		} else {
1269 			/*
1270 			 * Migrate to a cpu in the new partition.
1271 			 */
1272 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1273 			    tp->t_lpl, tp->t_pri, NULL);
1274 		}
1275 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1276 	} else {
1277 		/*
1278 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1279 		 * a short time until weak binding that existed when the
1280 		 * strong binding was established has dropped) so we must
1281 		 * favour weak binding over strong.
1282 		 */
1283 		cp = tp->t_weakbound_cpu ?
1284 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1285 	}
1286 	/*
1287 	 * A thread that is ONPROC may be temporarily placed on the run queue
1288 	 * but then chosen to run again by disp.  If the thread we're placing on
1289 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1290 	 * replacement process is actually scheduled in swtch().  In this
1291 	 * situation, curthread is the only thread that could be in the ONPROC
1292 	 * state.
1293 	 */
1294 	if ((!self) && (tp->t_waitrq == 0)) {
1295 		hrtime_t curtime;
1296 
1297 		curtime = gethrtime_unscaled();
1298 		(void) cpu_update_pct(tp, curtime);
1299 		tp->t_waitrq = curtime;
1300 	} else {
1301 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1302 	}
1303 
1304 	dp = cp->cpu_disp;
1305 	disp_lock_enter_high(&dp->disp_lock);
1306 
1307 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1308 	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1309 	    tpri, cp, tp);
1310 
1311 #ifndef NPROBE
1312 	/* Kernel probe */
1313 	if (tnf_tracing_active)
1314 		tnf_thread_queue(tp, cp, tpri);
1315 #endif /* NPROBE */
1316 
1317 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1318 
1319 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1320 	tp->t_disp_queue = dp;
1321 	tp->t_link = NULL;
1322 
1323 	dq = &dp->disp_q[tpri];
1324 	dp->disp_nrunnable++;
1325 	if (!bound)
1326 		dp->disp_steal = 0;
1327 	membar_enter();
1328 
1329 	if (dq->dq_sruncnt++ != 0) {
1330 		ASSERT(dq->dq_first != NULL);
1331 		dq->dq_last->t_link = tp;
1332 		dq->dq_last = tp;
1333 	} else {
1334 		ASSERT(dq->dq_first == NULL);
1335 		ASSERT(dq->dq_last == NULL);
1336 		dq->dq_first = dq->dq_last = tp;
1337 		BT_SET(dp->disp_qactmap, tpri);
1338 		if (tpri > dp->disp_maxrunpri) {
1339 			dp->disp_maxrunpri = tpri;
1340 			membar_enter();
1341 			cpu_resched(cp, tpri);
1342 		}
1343 	}
1344 
1345 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1346 		if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1347 			/*
1348 			 * If there are no other unbound threads on the
1349 			 * run queue, don't allow other CPUs to steal
1350 			 * this thread while we are in the middle of a
1351 			 * context switch. We may just switch to it
1352 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1353 			 * in swtch and swtch_to.
1354 			 */
1355 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1356 		}
1357 		dp->disp_max_unbound_pri = tpri;
1358 	}
1359 	(*disp_enq_thread)(cp, bound);
1360 }
1361 
1362 /*
1363  * Put the specified thread on the front of the dispatcher
1364  * queue corresponding to its current priority.
1365  *
1366  * Called with the thread in transition, onproc or stopped state
1367  * and locked (transition implies locked) and at high spl.
1368  * Returns with the thread in TS_RUN state and still locked.
1369  */
1370 void
1371 setfrontdq(kthread_t *tp)
1372 {
1373 	disp_t		*dp;
1374 	dispq_t		*dq;
1375 	cpu_t		*cp;
1376 	pri_t		tpri;
1377 	int		bound;
1378 
1379 	ASSERT(THREAD_LOCK_HELD(tp));
1380 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1381 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1382 
1383 	/*
1384 	 * If thread is "swapped" or on the swap queue don't
1385 	 * queue it, but wake sched.
1386 	 */
1387 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1388 		disp_swapped_setrun(tp);
1389 		return;
1390 	}
1391 
1392 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1393 		bound = 1;
1394 	else
1395 		bound = 0;
1396 
1397 	tpri = DISP_PRIO(tp);
1398 	if (ncpus == 1)
1399 		cp = tp->t_cpu;
1400 	else if (!bound) {
1401 		if (tpri >= kpqpri) {
1402 			setkpdq(tp, SETKP_FRONT);
1403 			return;
1404 		}
1405 		cp = tp->t_cpu;
1406 		if (tp->t_cpupart == cp->cpu_part) {
1407 			/*
1408 			 * We'll generally let this thread continue to run
1409 			 * where it last ran, but will consider migration if:
1410 			 * - The thread last ran outside it's home lgroup.
1411 			 * - The CPU where it last ran is the target of an
1412 			 *   offline request (a thread_nomigrate() on the in
1413 			 *   motion CPU relies on this when forcing a preempt).
1414 			 * - The thread isn't the highest priority thread where
1415 			 *   it last ran, and it is considered not likely to
1416 			 *   have significant cache warmth.
1417 			 */
1418 			if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1419 			    (cp == cpu_inmotion)) {
1420 				cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1421 				    (tp == curthread) ? cp : NULL);
1422 			} else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1423 			    (!THREAD_HAS_CACHE_WARMTH(tp))) {
1424 				cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1425 				    NULL);
1426 			}
1427 		} else {
1428 			/*
1429 			 * Migrate to a cpu in the new partition.
1430 			 */
1431 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1432 			    tp->t_lpl, tp->t_pri, NULL);
1433 		}
1434 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1435 	} else {
1436 		/*
1437 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1438 		 * a short time until weak binding that existed when the
1439 		 * strong binding was established has dropped) so we must
1440 		 * favour weak binding over strong.
1441 		 */
1442 		cp = tp->t_weakbound_cpu ?
1443 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1444 	}
1445 
1446 	/*
1447 	 * A thread that is ONPROC may be temporarily placed on the run queue
1448 	 * but then chosen to run again by disp.  If the thread we're placing on
1449 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1450 	 * replacement process is actually scheduled in swtch().  In this
1451 	 * situation, curthread is the only thread that could be in the ONPROC
1452 	 * state.
1453 	 */
1454 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1455 		hrtime_t curtime;
1456 
1457 		curtime = gethrtime_unscaled();
1458 		(void) cpu_update_pct(tp, curtime);
1459 		tp->t_waitrq = curtime;
1460 	} else {
1461 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1462 	}
1463 
1464 	dp = cp->cpu_disp;
1465 	disp_lock_enter_high(&dp->disp_lock);
1466 
1467 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1468 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1469 
1470 #ifndef NPROBE
1471 	/* Kernel probe */
1472 	if (tnf_tracing_active)
1473 		tnf_thread_queue(tp, cp, tpri);
1474 #endif /* NPROBE */
1475 
1476 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1477 
1478 	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
1479 	tp->t_disp_queue = dp;
1480 
1481 	dq = &dp->disp_q[tpri];
1482 	dp->disp_nrunnable++;
1483 	if (!bound)
1484 		dp->disp_steal = 0;
1485 	membar_enter();
1486 
1487 	if (dq->dq_sruncnt++ != 0) {
1488 		ASSERT(dq->dq_last != NULL);
1489 		tp->t_link = dq->dq_first;
1490 		dq->dq_first = tp;
1491 	} else {
1492 		ASSERT(dq->dq_last == NULL);
1493 		ASSERT(dq->dq_first == NULL);
1494 		tp->t_link = NULL;
1495 		dq->dq_first = dq->dq_last = tp;
1496 		BT_SET(dp->disp_qactmap, tpri);
1497 		if (tpri > dp->disp_maxrunpri) {
1498 			dp->disp_maxrunpri = tpri;
1499 			membar_enter();
1500 			cpu_resched(cp, tpri);
1501 		}
1502 	}
1503 
1504 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1505 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1506 		    cp == CPU) {
1507 			/*
1508 			 * If there are no other unbound threads on the
1509 			 * run queue, don't allow other CPUs to steal
1510 			 * this thread while we are in the middle of a
1511 			 * context switch. We may just switch to it
1512 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1513 			 * in swtch and swtch_to.
1514 			 */
1515 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1516 		}
1517 		dp->disp_max_unbound_pri = tpri;
1518 	}
1519 	(*disp_enq_thread)(cp, bound);
1520 }
1521 
1522 /*
1523  * Put a high-priority unbound thread on the kp queue
1524  */
1525 static void
1526 setkpdq(kthread_t *tp, int borf)
1527 {
1528 	dispq_t	*dq;
1529 	disp_t	*dp;
1530 	cpu_t	*cp;
1531 	pri_t	tpri;
1532 
1533 	tpri = DISP_PRIO(tp);
1534 
1535 	dp = &tp->t_cpupart->cp_kp_queue;
1536 	disp_lock_enter_high(&dp->disp_lock);
1537 
1538 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1539 
1540 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1541 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1542 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1543 	tp->t_disp_queue = dp;
1544 	dp->disp_nrunnable++;
1545 	dq = &dp->disp_q[tpri];
1546 
1547 	if (dq->dq_sruncnt++ != 0) {
1548 		if (borf == SETKP_BACK) {
1549 			ASSERT(dq->dq_first != NULL);
1550 			tp->t_link = NULL;
1551 			dq->dq_last->t_link = tp;
1552 			dq->dq_last = tp;
1553 		} else {
1554 			ASSERT(dq->dq_last != NULL);
1555 			tp->t_link = dq->dq_first;
1556 			dq->dq_first = tp;
1557 		}
1558 	} else {
1559 		if (borf == SETKP_BACK) {
1560 			ASSERT(dq->dq_first == NULL);
1561 			ASSERT(dq->dq_last == NULL);
1562 			dq->dq_first = dq->dq_last = tp;
1563 		} else {
1564 			ASSERT(dq->dq_last == NULL);
1565 			ASSERT(dq->dq_first == NULL);
1566 			tp->t_link = NULL;
1567 			dq->dq_first = dq->dq_last = tp;
1568 		}
1569 		BT_SET(dp->disp_qactmap, tpri);
1570 		if (tpri > dp->disp_max_unbound_pri)
1571 			dp->disp_max_unbound_pri = tpri;
1572 		if (tpri > dp->disp_maxrunpri) {
1573 			dp->disp_maxrunpri = tpri;
1574 			membar_enter();
1575 		}
1576 	}
1577 
1578 	cp = tp->t_cpu;
1579 	if (tp->t_cpupart != cp->cpu_part) {
1580 		/* migrate to a cpu in the new partition */
1581 		cp = tp->t_cpupart->cp_cpulist;
1582 	}
1583 	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1584 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1585 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1586 
1587 #ifndef NPROBE
1588 	/* Kernel probe */
1589 	if (tnf_tracing_active)
1590 		tnf_thread_queue(tp, cp, tpri);
1591 #endif /* NPROBE */
1592 
1593 	if (cp->cpu_chosen_level < tpri)
1594 		cp->cpu_chosen_level = tpri;
1595 	cpu_resched(cp, tpri);
1596 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1597 	(*disp_enq_thread)(cp, 0);
1598 }
1599 
1600 /*
1601  * Remove a thread from the dispatcher queue if it is on it.
1602  * It is not an error if it is not found but we return whether
1603  * or not it was found in case the caller wants to check.
1604  */
1605 int
1606 dispdeq(kthread_t *tp)
1607 {
1608 	disp_t		*dp;
1609 	dispq_t		*dq;
1610 	kthread_t	*rp;
1611 	kthread_t	*trp;
1612 	kthread_t	**ptp;
1613 	int		tpri;
1614 
1615 	ASSERT(THREAD_LOCK_HELD(tp));
1616 
1617 	if (tp->t_state != TS_RUN)
1618 		return (0);
1619 
1620 	/*
1621 	 * The thread is "swapped" or is on the swap queue and
1622 	 * hence no longer on the run queue, so return true.
1623 	 */
1624 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1625 		return (1);
1626 
1627 	tpri = DISP_PRIO(tp);
1628 	dp = tp->t_disp_queue;
1629 	ASSERT(tpri < dp->disp_npri);
1630 	dq = &dp->disp_q[tpri];
1631 	ptp = &dq->dq_first;
1632 	rp = *ptp;
1633 	trp = NULL;
1634 
1635 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1636 
1637 	/*
1638 	 * Search for thread in queue.
1639 	 * Double links would simplify this at the expense of disp/setrun.
1640 	 */
1641 	while (rp != tp && rp != NULL) {
1642 		trp = rp;
1643 		ptp = &trp->t_link;
1644 		rp = trp->t_link;
1645 	}
1646 
1647 	if (rp == NULL) {
1648 		panic("dispdeq: thread not on queue");
1649 	}
1650 
1651 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1652 
1653 	/*
1654 	 * Found it so remove it from queue.
1655 	 */
1656 	if ((*ptp = rp->t_link) == NULL)
1657 		dq->dq_last = trp;
1658 
1659 	dp->disp_nrunnable--;
1660 	if (--dq->dq_sruncnt == 0) {
1661 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1662 		if (dp->disp_nrunnable == 0) {
1663 			dp->disp_max_unbound_pri = -1;
1664 			dp->disp_maxrunpri = -1;
1665 		} else if (tpri == dp->disp_maxrunpri) {
1666 			int ipri;
1667 
1668 			ipri = bt_gethighbit(dp->disp_qactmap,
1669 			    dp->disp_maxrunpri >> BT_ULSHIFT);
1670 			if (ipri < dp->disp_max_unbound_pri)
1671 				dp->disp_max_unbound_pri = ipri;
1672 			dp->disp_maxrunpri = ipri;
1673 		}
1674 	}
1675 	tp->t_link = NULL;
1676 	THREAD_TRANSITION(tp);		/* put in intermediate state */
1677 	return (1);
1678 }
1679 
1680 
1681 /*
1682  * dq_sruninc and dq_srundec are public functions for
1683  * incrementing/decrementing the sruncnts when a thread on
1684  * a dispatcher queue is made schedulable/unschedulable by
1685  * resetting the TS_LOAD flag.
1686  *
1687  * The caller MUST have the thread lock and therefore the dispatcher
1688  * queue lock so that the operation which changes
1689  * the flag, the operation that checks the status of the thread to
1690  * determine if it's on a disp queue AND the call to this function
1691  * are one atomic operation with respect to interrupts.
1692  */
1693 
/*
 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 *
 * The thread must be in TS_RUN state with TS_LOAD set.  It is moved to
 * the transition state and then placed at the front of a dispatch queue
 * by setfrontdq().
 */
void
dq_sruninc(kthread_t *t)
{
	ASSERT(t->t_state == TS_RUN);
	ASSERT(t->t_schedflag & TS_LOAD);

	THREAD_TRANSITION(t);
	setfrontdq(t);
}
1706 
/*
 * See comment on calling conventions above.
 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 *
 * The thread is removed from its dispatch queue with dispdeq() (whose
 * result is deliberately ignored) and then handed to disp_swapped_enq(),
 * which moves it under the swapped_lock.
 */
void
dq_srundec(kthread_t *t)
{
	ASSERT(t->t_schedflag & TS_LOAD);

	(void) dispdeq(t);
	disp_swapped_enq(t);
}
1719 
1720 /*
1721  * Change the dispatcher lock of thread to the "swapped_lock"
1722  * and return with thread lock still held.
1723  *
1724  * Called with thread_lock held, in transition state, and at high spl.
1725  */
1726 void
1727 disp_swapped_enq(kthread_t *tp)
1728 {
1729 	ASSERT(THREAD_LOCK_HELD(tp));
1730 	ASSERT(tp->t_schedflag & TS_LOAD);
1731 
1732 	switch (tp->t_state) {
1733 	case TS_RUN:
1734 		disp_lock_enter_high(&swapped_lock);
1735 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1736 		break;
1737 	case TS_ONPROC:
1738 		disp_lock_enter_high(&swapped_lock);
1739 		THREAD_TRANSITION(tp);
1740 		wake_sched_sec = 1;		/* tell clock to wake sched */
1741 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1742 		break;
1743 	default:
1744 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1745 	}
1746 }
1747 
1748 /*
1749  * This routine is called by setbackdq/setfrontdq if the thread is
1750  * not loaded or loaded and on the swap queue.
1751  *
1752  * Thread state TS_SLEEP implies that a swapped thread
1753  * has been woken up and needs to be swapped in by the swapper.
1754  *
1755  * Thread state TS_RUN, it implies that the priority of a swapped
1756  * thread is being increased by scheduling class (e.g. ts_update).
1757  */
1758 static void
1759 disp_swapped_setrun(kthread_t *tp)
1760 {
1761 	ASSERT(THREAD_LOCK_HELD(tp));
1762 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1763 
1764 	switch (tp->t_state) {
1765 	case TS_SLEEP:
1766 		disp_lock_enter_high(&swapped_lock);
1767 		/*
1768 		 * Wakeup sched immediately (i.e., next tick) if the
1769 		 * thread priority is above maxclsyspri.
1770 		 */
1771 		if (DISP_PRIO(tp) > maxclsyspri)
1772 			wake_sched = 1;
1773 		else
1774 			wake_sched_sec = 1;
1775 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1776 		break;
1777 	case TS_RUN:				/* called from ts_update */
1778 		break;
1779 	default:
1780 		panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1781 	}
1782 }
1783 
1784 /*
1785  *	Make a thread give up its processor.  Find the processor on
1786  *	which this thread is executing, and have that processor
1787  *	preempt.
1788  *
1789  *	We allow System Duty Cycle (SDC) threads to be preempted even if
1790  *	they are running at kernel priorities.  To implement this, we always
1791  *	set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
1792  *	calls cpu_surrender() very often, we only preempt if there is anyone
1793  *	competing with us.
1794  */
1795 void
1796 cpu_surrender(kthread_t *tp)
1797 {
1798 	cpu_t	*cpup;
1799 	int	max_pri;
1800 	int	max_run_pri;
1801 	klwp_t	*lwp;
1802 
1803 	ASSERT(THREAD_LOCK_HELD(tp));
1804 
1805 	if (tp->t_state != TS_ONPROC)
1806 		return;
1807 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
1808 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1809 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1810 	if (max_pri < max_run_pri)
1811 		max_pri = max_run_pri;
1812 
1813 	if (tp->t_cid == sysdccid) {
1814 		uint_t t_pri = DISP_PRIO(tp);
1815 		if (t_pri > max_pri)
1816 			return;		/* we are not competing w/ anyone */
1817 		cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1818 	} else {
1819 		cpup->cpu_runrun = 1;
1820 		if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1821 			cpup->cpu_kprunrun = 1;
1822 		}
1823 	}
1824 
1825 	/*
1826 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1827 	 */
1828 	membar_enter();
1829 
1830 	DTRACE_SCHED1(surrender, kthread_t *, tp);
1831 
1832 	/*
1833 	 * Make the target thread take an excursion through trap()
1834 	 * to do preempt() (unless we're already in trap or post_syscall,
1835 	 * calling cpu_surrender via CL_TRAPRET).
1836 	 */
1837 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1838 	    lwp->lwp_state != LWP_USER) {
1839 		aston(tp);
1840 		if (cpup != CPU)
1841 			poke_cpu(cpup->cpu_id);
1842 	}
1843 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1844 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
1845 }
1846 
1847 /*
1848  * Commit to and ratify a scheduling decision
1849  */
1850 /*ARGSUSED*/
1851 static kthread_t *
1852 disp_ratify(kthread_t *tp, disp_t *kpq)
1853 {
1854 	pri_t	tpri, maxpri;
1855 	pri_t	maxkpri;
1856 	cpu_t	*cpup;
1857 
1858 	ASSERT(tp != NULL);
1859 	/*
1860 	 * Commit to, then ratify scheduling decision
1861 	 */
1862 	cpup = CPU;
1863 	if (cpup->cpu_runrun != 0)
1864 		cpup->cpu_runrun = 0;
1865 	if (cpup->cpu_kprunrun != 0)
1866 		cpup->cpu_kprunrun = 0;
1867 	if (cpup->cpu_chosen_level != -1)
1868 		cpup->cpu_chosen_level = -1;
1869 	membar_enter();
1870 	tpri = DISP_PRIO(tp);
1871 	maxpri = cpup->cpu_disp->disp_maxrunpri;
1872 	maxkpri = kpq->disp_maxrunpri;
1873 	if (maxpri < maxkpri)
1874 		maxpri = maxkpri;
1875 	if (tpri < maxpri) {
1876 		/*
1877 		 * should have done better
1878 		 * put this one back and indicate to try again
1879 		 */
1880 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
1881 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1882 		thread_lock_high(tp);
1883 		THREAD_TRANSITION(tp);
1884 		setfrontdq(tp);
1885 		thread_unlock_nopreempt(tp);
1886 
1887 		tp = NULL;
1888 	}
1889 	return (tp);
1890 }
1891 
1892 /*
1893  * See if there is any work on the dispatcher queue for other CPUs.
1894  * If there is, dequeue the best thread and return.
1895  */
1896 static kthread_t *
1897 disp_getwork(cpu_t *cp)
1898 {
1899 	cpu_t		*ocp;		/* other CPU */
1900 	cpu_t		*ocp_start;
1901 	cpu_t		*tcp;		/* target local CPU */
1902 	kthread_t	*tp;
1903 	kthread_t	*retval = NULL;
1904 	pri_t		maxpri;
1905 	disp_t		*kpq;		/* kp queue for this partition */
1906 	lpl_t		*lpl, *lpl_leaf;
1907 	int		leafidx, startidx;
1908 	hrtime_t	stealtime;
1909 	lgrp_id_t	local_id;
1910 
1911 	maxpri = -1;
1912 	tcp = NULL;
1913 
1914 	kpq = &cp->cpu_part->cp_kp_queue;
1915 	while (kpq->disp_maxrunpri >= 0) {
1916 		/*
1917 		 * Try to take a thread from the kp_queue.
1918 		 */
1919 		tp = (disp_getbest(kpq));
1920 		if (tp)
1921 			return (disp_ratify(tp, kpq));
1922 	}
1923 
1924 	kpreempt_disable();		/* protect the cpu_active list */
1925 
1926 	/*
1927 	 * Try to find something to do on another CPU's run queue.
1928 	 * Loop through all other CPUs looking for the one with the highest
1929 	 * priority unbound thread.
1930 	 *
1931 	 * On NUMA machines, the partition's CPUs are consulted in order of
1932 	 * distance from the current CPU. This way, the first available
1933 	 * work found is also the closest, and will suffer the least
1934 	 * from being migrated.
1935 	 */
1936 	lpl = lpl_leaf = cp->cpu_lpl;
1937 	local_id = lpl_leaf->lpl_lgrpid;
1938 	leafidx = startidx = 0;
1939 
1940 	/*
1941 	 * This loop traverses the lpl hierarchy. Higher level lpls represent
1942 	 * broader levels of locality
1943 	 */
1944 	do {
1945 		/* This loop iterates over the lpl's leaves */
1946 		do {
1947 			if (lpl_leaf != cp->cpu_lpl)
1948 				ocp = lpl_leaf->lpl_cpus;
1949 			else
1950 				ocp = cp->cpu_next_lpl;
1951 
1952 			/* This loop iterates over the CPUs in the leaf */
1953 			ocp_start = ocp;
1954 			do {
1955 				pri_t pri;
1956 
1957 				ASSERT(CPU_ACTIVE(ocp));
1958 
1959 				/*
1960 				 * End our stroll around this lpl if:
1961 				 *
1962 				 * - Something became runnable on the local
1963 				 *   queue...which also ends our stroll around
1964 				 *   the partition.
1965 				 *
1966 				 * - We happen across another idle CPU.
1967 				 *   Since it is patrolling the next portion
1968 				 *   of the lpl's list (assuming it's not
1969 				 *   halted, or busy servicing an interrupt),
1970 				 *   move to the next higher level of locality.
1971 				 */
1972 				if (cp->cpu_disp->disp_nrunnable != 0) {
1973 					kpreempt_enable();
1974 					return (NULL);
1975 				}
1976 				if (ocp->cpu_dispatch_pri == -1) {
1977 					if (ocp->cpu_disp_flags &
1978 					    CPU_DISP_HALTED ||
1979 					    ocp->cpu_intr_actv != 0)
1980 						continue;
1981 					else
1982 						goto next_level;
1983 				}
1984 
1985 				/*
1986 				 * If there's only one thread and the CPU
1987 				 * is in the middle of a context switch,
1988 				 * or it's currently running the idle thread,
1989 				 * don't steal it.
1990 				 */
1991 				if ((ocp->cpu_disp_flags &
1992 				    CPU_DISP_DONTSTEAL) &&
1993 				    ocp->cpu_disp->disp_nrunnable == 1)
1994 					continue;
1995 
1996 				pri = ocp->cpu_disp->disp_max_unbound_pri;
1997 				if (pri > maxpri) {
1998 					/*
1999 					 * Don't steal threads that we attempted
2000 					 * to steal recently until they're ready
2001 					 * to be stolen again.
2002 					 */
2003 					stealtime = ocp->cpu_disp->disp_steal;
2004 					if (stealtime == 0 ||
2005 					    stealtime - gethrtime() <= 0) {
2006 						maxpri = pri;
2007 						tcp = ocp;
2008 					} else {
2009 						/*
2010 						 * Don't update tcp, just set
2011 						 * the retval to T_DONTSTEAL, so
2012 						 * that if no acceptable CPUs
2013 						 * are found the return value
2014 						 * will be T_DONTSTEAL rather
2015 						 * then NULL.
2016 						 */
2017 						retval = T_DONTSTEAL;
2018 					}
2019 				}
2020 			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2021 
2022 			/*
2023 			 * Iterate to the next leaf lpl in the resource set
2024 			 * at this level of locality. If we hit the end of
2025 			 * the set, wrap back around to the beginning.
2026 			 *
2027 			 * Note: This iteration is NULL terminated for a reason
2028 			 * see lpl_topo_bootstrap() in lgrp.c for details.
2029 			 */
2030 			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2031 				leafidx = 0;
2032 				lpl_leaf = lpl->lpl_rset[leafidx];
2033 			}
2034 		} while (leafidx != startidx);
2035 
2036 next_level:
2037 		/*
2038 		 * Expand the search to include farther away CPUs (next
2039 		 * locality level). The closer CPUs that have already been
2040 		 * checked will be checked again. In doing so, idle CPUs
2041 		 * will tend to be more aggresive about stealing from CPUs
2042 		 * that are closer (since the closer CPUs will be considered
2043 		 * more often).
2044 		 * Begin at this level with the CPUs local leaf lpl.
2045 		 */
2046 		if ((lpl = lpl->lpl_parent) != NULL) {
2047 			leafidx = startidx = lpl->lpl_id2rset[local_id];
2048 			lpl_leaf = lpl->lpl_rset[leafidx];
2049 		}
2050 	} while (!tcp && lpl);
2051 
2052 	kpreempt_enable();
2053 
2054 	/*
2055 	 * If another queue looks good, and there is still nothing on
2056 	 * the local queue, try to transfer one or more threads
2057 	 * from it to our queue.
2058 	 */
2059 	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2060 		tp = disp_getbest(tcp->cpu_disp);
2061 		if (tp == NULL || tp == T_DONTSTEAL)
2062 			return (tp);
2063 		return (disp_ratify(tp, kpq));
2064 	}
2065 	return (retval);
2066 }
2067 
2068 
2069 /*
2070  * disp_fix_unbound_pri()
2071  *	Determines the maximum priority of unbound threads on the queue.
2072  *	The priority is kept for the queue, but is only increased, never
2073  *	reduced unless some CPU is looking for something on that queue.
2074  *
2075  *	The priority argument is the known upper limit.
2076  *
2077  *	Perhaps this should be kept accurately, but that probably means
2078  *	separate bitmaps for bound and unbound threads.  Since only idled
2079  *	CPUs will have to do this recalculation, it seems better this way.
2080  */
2081 static void
2082 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2083 {
2084 	kthread_t	*tp;
2085 	dispq_t		*dq;
2086 	ulong_t		*dqactmap = dp->disp_qactmap;
2087 	ulong_t		mapword;
2088 	int		wx;
2089 
2090 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2091 
2092 	ASSERT(pri >= 0);			/* checked by caller */
2093 
2094 	/*
2095 	 * Start the search at the next lowest priority below the supplied
2096 	 * priority.  This depends on the bitmap implementation.
2097 	 */
2098 	do {
2099 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
2100 
2101 		/*
2102 		 * Form mask for all lower priorities in the word.
2103 		 */
2104 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2105 
2106 		/*
2107 		 * Get next lower active priority.
2108 		 */
2109 		if (mapword != 0) {
2110 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2111 		} else if (wx > 0) {
2112 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2113 			if (pri < 0)
2114 				break;
2115 		} else {
2116 			pri = -1;
2117 			break;
2118 		}
2119 
2120 		/*
2121 		 * Search the queue for unbound, runnable threads.
2122 		 */
2123 		dq = &dp->disp_q[pri];
2124 		tp = dq->dq_first;
2125 
2126 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2127 			tp = tp->t_link;
2128 		}
2129 
2130 		/*
2131 		 * If a thread was found, set the priority and return.
2132 		 */
2133 	} while (tp == NULL);
2134 
2135 	/*
2136 	 * pri holds the maximum unbound thread priority or -1.
2137 	 */
2138 	if (dp->disp_max_unbound_pri != pri)
2139 		dp->disp_max_unbound_pri = pri;
2140 }
2141 
2142 /*
2143  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2144  * 	check if the CPU to which is was previously bound should have
2145  * 	its disp_max_unbound_pri increased.
2146  */
2147 void
2148 disp_adjust_unbound_pri(kthread_t *tp)
2149 {
2150 	disp_t *dp;
2151 	pri_t tpri;
2152 
2153 	ASSERT(THREAD_LOCK_HELD(tp));
2154 
2155 	/*
2156 	 * Don't do anything if the thread is not bound, or
2157 	 * currently not runnable or swapped out.
2158 	 */
2159 	if (tp->t_bound_cpu == NULL ||
2160 	    tp->t_state != TS_RUN ||
2161 	    tp->t_schedflag & TS_ON_SWAPQ)
2162 		return;
2163 
2164 	tpri = DISP_PRIO(tp);
2165 	dp = tp->t_bound_cpu->cpu_disp;
2166 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2167 	if (tpri > dp->disp_max_unbound_pri)
2168 		dp->disp_max_unbound_pri = tpri;
2169 }
2170 
2171 /*
2172  * disp_getbest()
2173  *   De-queue the highest priority unbound runnable thread.
2174  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2175  *   Returns NULL if nothing found.
2176  *   Returns T_DONTSTEAL if the thread was not stealable.
2177  *   so that the caller will try again later.
2178  *
2179  *   Passed a pointer to a dispatch queue not associated with this CPU, and
2180  *   its type.
2181  */
2182 static kthread_t *
2183 disp_getbest(disp_t *dp)
2184 {
2185 	kthread_t	*tp;
2186 	dispq_t		*dq;
2187 	pri_t		pri;
2188 	cpu_t		*cp, *tcp;
2189 	boolean_t	allbound;
2190 
2191 	disp_lock_enter(&dp->disp_lock);
2192 
2193 	/*
2194 	 * If there is nothing to run, or the CPU is in the middle of a
2195 	 * context switch of the only thread, return NULL.
2196 	 */
2197 	tcp = dp->disp_cpu;
2198 	cp = CPU;
2199 	pri = dp->disp_max_unbound_pri;
2200 	if (pri == -1 ||
2201 	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2202 	    tcp->cpu_disp->disp_nrunnable == 1)) {
2203 		disp_lock_exit_nopreempt(&dp->disp_lock);
2204 		return (NULL);
2205 	}
2206 
2207 	dq = &dp->disp_q[pri];
2208 
2209 
2210 	/*
2211 	 * Assume that all threads are bound on this queue, and change it
2212 	 * later when we find out that it is not the case.
2213 	 */
2214 	allbound = B_TRUE;
2215 	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2216 		hrtime_t now, nosteal, rqtime;
2217 
2218 		/*
2219 		 * Skip over bound threads which could be here even
2220 		 * though disp_max_unbound_pri indicated this level.
2221 		 */
2222 		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2223 			continue;
2224 
2225 		/*
2226 		 * We've got some unbound threads on this queue, so turn
2227 		 * the allbound flag off now.
2228 		 */
2229 		allbound = B_FALSE;
2230 
2231 		/*
2232 		 * The thread is a candidate for stealing from its run queue. We
2233 		 * don't want to steal threads that became runnable just a
2234 		 * moment ago. This improves CPU affinity for threads that get
2235 		 * preempted for short periods of time and go back on the run
2236 		 * queue.
2237 		 *
2238 		 * We want to let it stay on its run queue if it was only placed
2239 		 * there recently and it was running on the same CPU before that
2240 		 * to preserve its cache investment. For the thread to remain on
2241 		 * its run queue, ALL of the following conditions must be
2242 		 * satisfied:
2243 		 *
2244 		 * - the disp queue should not be the kernel preemption queue
2245 		 * - delayed idle stealing should not be disabled
2246 		 * - nosteal_nsec should be non-zero
2247 		 * - it should run with user priority
2248 		 * - it should be on the run queue of the CPU where it was
2249 		 *   running before being placed on the run queue
2250 		 * - it should be the only thread on the run queue (to prevent
2251 		 *   extra scheduling latency for other threads)
2252 		 * - it should sit on the run queue for less than per-chip
2253 		 *   nosteal interval or global nosteal interval
2254 		 * - in case of CPUs with shared cache it should sit in a run
2255 		 *   queue of a CPU from a different chip
2256 		 *
2257 		 * The checks are arranged so that the ones that are faster are
2258 		 * placed earlier.
2259 		 */
2260 		if (tcp == NULL ||
2261 		    pri >= minclsyspri ||
2262 		    tp->t_cpu != tcp)
2263 			break;
2264 
2265 		/*
2266 		 * Steal immediately if, due to CMT processor architecture
2267 		 * migraiton between cp and tcp would incur no performance
2268 		 * penalty.
2269 		 */
2270 		if (pg_cmt_can_migrate(cp, tcp))
2271 			break;
2272 
2273 		nosteal = nosteal_nsec;
2274 		if (nosteal == 0)
2275 			break;
2276 
2277 		/*
2278 		 * Calculate time spent sitting on run queue
2279 		 */
2280 		now = gethrtime_unscaled();
2281 		rqtime = now - tp->t_waitrq;
2282 		scalehrtime(&rqtime);
2283 
2284 		/*
2285 		 * Steal immediately if the time spent on this run queue is more
2286 		 * than allowed nosteal delay.
2287 		 *
2288 		 * Negative rqtime check is needed here to avoid infinite
2289 		 * stealing delays caused by unlikely but not impossible
2290 		 * drifts between CPU times on different CPUs.
2291 		 */
2292 		if (rqtime > nosteal || rqtime < 0)
2293 			break;
2294 
2295 		DTRACE_PROBE4(nosteal, kthread_t *, tp,
2296 		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2297 		scalehrtime(&now);
2298 		/*
2299 		 * Calculate when this thread becomes stealable
2300 		 */
2301 		now += (nosteal - rqtime);
2302 
2303 		/*
2304 		 * Calculate time when some thread becomes stealable
2305 		 */
2306 		if (now < dp->disp_steal)
2307 			dp->disp_steal = now;
2308 	}
2309 
2310 	/*
2311 	 * If there were no unbound threads on this queue, find the queue
2312 	 * where they are and then return later. The value of
2313 	 * disp_max_unbound_pri is not always accurate because it isn't
2314 	 * reduced until another idle CPU looks for work.
2315 	 */
2316 	if (allbound)
2317 		disp_fix_unbound_pri(dp, pri);
2318 
2319 	/*
2320 	 * If we reached the end of the queue and found no unbound threads
2321 	 * then return NULL so that other CPUs will be considered.  If there
2322 	 * are unbound threads but they cannot yet be stolen, then
2323 	 * return T_DONTSTEAL and try again later.
2324 	 */
2325 	if (tp == NULL) {
2326 		disp_lock_exit_nopreempt(&dp->disp_lock);
2327 		return (allbound ? NULL : T_DONTSTEAL);
2328 	}
2329 
2330 	/*
2331 	 * Found a runnable, unbound thread, so remove it from queue.
2332 	 * dispdeq() requires that we have the thread locked, and we do,
2333 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
2334 	 * put the thread in transition state, thereby dropping the dispq
2335 	 * lock.
2336 	 */
2337 
2338 #ifdef DEBUG
2339 	{
2340 		int	thread_was_on_queue;
2341 
2342 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
2343 		ASSERT(thread_was_on_queue);
2344 	}
2345 
2346 #else /* DEBUG */
2347 	(void) dispdeq(tp);			/* drops disp_lock */
2348 #endif /* DEBUG */
2349 
2350 	/*
2351 	 * Reset the disp_queue steal time - we do not know what is the smallest
2352 	 * value across the queue is.
2353 	 */
2354 	dp->disp_steal = 0;
2355 
2356 	tp->t_schedflag |= TS_DONT_SWAP;
2357 
2358 	/*
2359 	 * Setup thread to run on the current CPU.
2360 	 */
2361 	tp->t_disp_queue = cp->cpu_disp;
2362 
2363 	cp->cpu_dispthread = tp;		/* protected by spl only */
2364 	cp->cpu_dispatch_pri = pri;
2365 
2366 	/*
2367 	 * There can be a memory synchronization race between disp_getbest()
2368 	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2369 	 * to preempt the current thread to run the enqueued thread while
2370 	 * disp_getbest() and disp_ratify() are changing the current thread
2371 	 * to the stolen thread. This may lead to a situation where
2372 	 * cpu_resched() tries to preempt the wrong thread and the
2373 	 * stolen thread continues to run on the CPU which has been tagged
2374 	 * for preemption.
2375 	 * Later the clock thread gets enqueued but doesn't get to run on the
2376 	 * CPU causing the system to hang.
2377 	 *
2378 	 * To avoid this, grabbing and dropping the disp_lock (which does
2379 	 * a memory barrier) is needed to synchronize the execution of
2380 	 * cpu_resched() with disp_getbest() and disp_ratify() and
2381 	 * synchronize the memory read and written by cpu_resched(),
2382 	 * disp_getbest(), and disp_ratify() with each other.
2383 	 *  (see CR#6482861 for more details).
2384 	 */
2385 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2386 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2387 
2388 	ASSERT(pri == DISP_PRIO(tp));
2389 
2390 	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2391 
2392 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
2393 
2394 	/*
2395 	 * Return with spl high so that swtch() won't need to raise it.
2396 	 * The disp_lock was dropped by dispdeq().
2397 	 */
2398 
2399 	return (tp);
2400 }
2401 
2402 /*
2403  * disp_bound_common() - common routine for higher level functions
2404  *	that check for bound threads under certain conditions.
2405  *	If 'threadlistsafe' is set then there is no need to acquire
2406  *	pidlock to stop the thread list from changing (eg, if
2407  *	disp_bound_* is called with cpus paused).
2408  */
2409 static int
2410 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2411 {
2412 	int		found = 0;
2413 	kthread_t	*tp;
2414 
2415 	ASSERT(flag);
2416 
2417 	if (!threadlistsafe)
2418 		mutex_enter(&pidlock);
2419 	tp = curthread;		/* faster than allthreads */
2420 	do {
2421 		if (tp->t_state != TS_FREE) {
2422 			/*
2423 			 * If an interrupt thread is busy, but the
2424 			 * caller doesn't care (i.e. BOUND_INTR is off),
2425 			 * then just ignore it and continue through.
2426 			 */
2427 			if ((tp->t_flag & T_INTR_THREAD) &&
2428 			    !(flag & BOUND_INTR))
2429 				continue;
2430 
2431 			/*
2432 			 * Skip the idle thread for the CPU
2433 			 * we're about to set offline.
2434 			 */
2435 			if (tp == cp->cpu_idle_thread)
2436 				continue;
2437 
2438 			/*
2439 			 * Skip the pause thread for the CPU
2440 			 * we're about to set offline.
2441 			 */
2442 			if (tp == cp->cpu_pause_thread)
2443 				continue;
2444 
2445 			if ((flag & BOUND_CPU) &&
2446 			    (tp->t_bound_cpu == cp ||
2447 			    tp->t_bind_cpu == cp->cpu_id ||
2448 			    tp->t_weakbound_cpu == cp)) {
2449 				found = 1;
2450 				break;
2451 			}
2452 
2453 			if ((flag & BOUND_PARTITION) &&
2454 			    (tp->t_cpupart == cp->cpu_part)) {
2455 				found = 1;
2456 				break;
2457 			}
2458 		}
2459 	} while ((tp = tp->t_next) != curthread && found == 0);
2460 	if (!threadlistsafe)
2461 		mutex_exit(&pidlock);
2462 	return (found);
2463 }
2464 
2465 /*
2466  * disp_bound_threads - return nonzero if threads are bound to the processor.
2467  *	Called infrequently.  Keep this simple.
2468  *	Includes threads that are asleep or stopped but not onproc.
2469  */
2470 int
2471 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2472 {
2473 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2474 }
2475 
2476 /*
2477  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2478  * to the given processor, including interrupt threads.
2479  */
2480 int
2481 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2482 {
2483 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2484 }
2485 
2486 /*
2487  * disp_bound_partition - return nonzero if threads are bound to the same
2488  * partition as the processor.
2489  *	Called infrequently.  Keep this simple.
2490  *	Includes threads that are asleep or stopped but not onproc.
2491  */
2492 int
2493 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2494 {
2495 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2496 }
2497 
2498 /*
2499  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2500  * threads to other CPUs.
2501  */
2502 void
2503 disp_cpu_inactive(cpu_t *cp)
2504 {
2505 	kthread_t	*tp;
2506 	disp_t		*dp = cp->cpu_disp;
2507 	dispq_t		*dq;
2508 	pri_t		pri;
2509 	int		wasonq;
2510 
2511 	disp_lock_enter(&dp->disp_lock);
2512 	while ((pri = dp->disp_max_unbound_pri) != -1) {
2513 		dq = &dp->disp_q[pri];
2514 		tp = dq->dq_first;
2515 
2516 		/*
2517 		 * Skip over bound threads.
2518 		 */
2519 		while (tp != NULL && tp->t_bound_cpu != NULL) {
2520 			tp = tp->t_link;
2521 		}
2522 
2523 		if (tp == NULL) {
2524 			/* disp_max_unbound_pri must be inaccurate, so fix it */
2525 			disp_fix_unbound_pri(dp, pri);
2526 			continue;
2527 		}
2528 
2529 		wasonq = dispdeq(tp);		/* drops disp_lock */
2530 		ASSERT(wasonq);
2531 		ASSERT(tp->t_weakbound_cpu == NULL);
2532 
2533 		setbackdq(tp);
2534 		/*
2535 		 * Called from cpu_offline:
2536 		 *
2537 		 * cp has already been removed from the list of active cpus
2538 		 * and tp->t_cpu has been changed so there is no risk of
2539 		 * tp ending up back on cp.
2540 		 *
2541 		 * Called from cpupart_move_cpu:
2542 		 *
2543 		 * The cpu has moved to a new cpupart.  Any threads that
2544 		 * were on it's dispatch queues before the move remain
2545 		 * in the old partition and can't run in the new partition.
2546 		 */
2547 		ASSERT(tp->t_cpu != cp);
2548 		thread_unlock(tp);
2549 
2550 		disp_lock_enter(&dp->disp_lock);
2551 	}
2552 	disp_lock_exit(&dp->disp_lock);
2553 }
2554 
2555 /*
2556  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2557  *	The hint passed in is used as a starting point so we don't favor
2558  *	CPU 0 or any other CPU.  The caller should pass in the most recently
2559  *	used CPU for the thread.
2560  *
2561  *	The lgroup and priority are used to determine the best CPU to run on
2562  *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
2563  *	the thread priority will indicate whether the thread will actually run
2564  *	there.  To pick the best CPU, the CPUs inside and outside of the given
2565  *	lgroup which are running the lowest priority threads are found.  The
2566  *	remote CPU is chosen only if the thread will not run locally on a CPU
2567  *	within the lgroup, but will run on the remote CPU. If the thread
2568  *	cannot immediately run on any CPU, the best local CPU will be chosen.
2569  *
2570  *	The lpl specified also identifies the cpu partition from which
2571  *	disp_lowpri_cpu should select a CPU.
2572  *
2573  *	curcpu is used to indicate that disp_lowpri_cpu is being called on
2574  *      behalf of the current thread. (curthread is looking for a new cpu)
2575  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2576  *      ignored.
2577  *
2578  *      If a cpu is the target of an offline request then try to avoid it.
2579  *
2580  *	This function must be called at either high SPL, or with preemption
2581  *	disabled, so that the "hint" CPU cannot be removed from the online
2582  *	CPU list while we are traversing it.
2583  */
2584 cpu_t *
2585 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2586 {
2587 	cpu_t	*bestcpu;
2588 	cpu_t	*besthomecpu;
2589 	cpu_t   *cp, *cpstart;
2590 
2591 	pri_t   bestpri;
2592 	pri_t   cpupri;
2593 
2594 	klgrpset_t	done;
2595 	klgrpset_t	cur_set;
2596 
2597 	lpl_t		*lpl_iter, *lpl_leaf;
2598 	int		i;
2599 
2600 	/*
2601 	 * Scan for a CPU currently running the lowest priority thread.
2602 	 * Cannot get cpu_lock here because it is adaptive.
2603 	 * We do not require lock on CPU list.
2604 	 */
2605 	ASSERT(hint != NULL);
2606 	ASSERT(lpl != NULL);
2607 	ASSERT(lpl->lpl_ncpu > 0);
2608 
2609 	/*
2610 	 * First examine local CPUs. Note that it's possible the hint CPU
2611 	 * passed in in remote to the specified home lgroup. If our priority
2612 	 * isn't sufficient enough such that we can run immediately at home,
2613 	 * then examine CPUs remote to our home lgroup.
2614 	 * We would like to give preference to CPUs closest to "home".
2615 	 * If we can't find a CPU where we'll run at a given level
2616 	 * of locality, we expand our search to include the next level.
2617 	 */
2618 	bestcpu = besthomecpu = NULL;
2619 	klgrpset_clear(done);
2620 	/* start with lpl we were passed */
2621 
2622 	lpl_iter = lpl;
2623 
2624 	do {
2625 
2626 		bestpri = SHRT_MAX;
2627 		klgrpset_clear(cur_set);
2628 
2629 		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2630 			lpl_leaf = lpl_iter->lpl_rset[i];
2631 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2632 				continue;
2633 
2634 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2635 
2636 			if (hint->cpu_lpl == lpl_leaf)
2637 				cp = cpstart = hint;
2638 			else
2639 				cp = cpstart = lpl_leaf->lpl_cpus;
2640 
2641 			do {
2642 				if (cp == curcpu)
2643 					cpupri = -1;
2644 				else if (cp == cpu_inmotion)
2645 					cpupri = SHRT_MAX;
2646 				else
2647 					cpupri = cp->cpu_dispatch_pri;
2648 				if (cp->cpu_disp->disp_maxrunpri > cpupri)
2649 					cpupri = cp->cpu_disp->disp_maxrunpri;
2650 				if (cp->cpu_chosen_level > cpupri)
2651 					cpupri = cp->cpu_chosen_level;
2652 				if (cpupri < bestpri) {
2653 					if (CPU_IDLING(cpupri)) {
2654 						ASSERT((cp->cpu_flags &
2655 						    CPU_QUIESCED) == 0);
2656 						return (cp);
2657 					}
2658 					bestcpu = cp;
2659 					bestpri = cpupri;
2660 				}
2661 			} while ((cp = cp->cpu_next_lpl) != cpstart);
2662 		}
2663 
2664 		if (bestcpu && (tpri > bestpri)) {
2665 			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2666 			return (bestcpu);
2667 		}
2668 		if (besthomecpu == NULL)
2669 			besthomecpu = bestcpu;
2670 		/*
2671 		 * Add the lgrps we just considered to the "done" set
2672 		 */
2673 		klgrpset_or(done, cur_set);
2674 
2675 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2676 
2677 	/*
2678 	 * The specified priority isn't high enough to run immediately
2679 	 * anywhere, so just return the best CPU from the home lgroup.
2680 	 */
2681 	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2682 	return (besthomecpu);
2683 }
2684 
/*
 * Generic idle cpu function, usable by all processors; it does nothing.
 *
 * A processor that has specific code to execute when idle (say, to stop
 * the pipeline and save power) should define that routine in its
 * processor-specific code (module_xx.c) and set the global variable
 * idle_cpu to that function.
 */
static void
generic_idle_cpu(void)
{
	/* intentionally empty */
}
2696 
2697 /*ARGSUSED*/
2698 static void
2699 generic_enq_thread(cpu_t *cpu, int bound)
2700 {
2701 }
2702