xref: /illumos-gate/usr/src/uts/common/disp/disp.c (revision 28ab0ca48b3e331cbbb231b1c8325f9f24f9af95)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2019 Joyent, Inc.
28  */
29 
30 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
31 /*	  All Rights Reserved  	*/
32 
33 
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/sysmacros.h>
37 #include <sys/signal.h>
38 #include <sys/user.h>
39 #include <sys/systm.h>
40 #include <sys/sysinfo.h>
41 #include <sys/var.h>
42 #include <sys/errno.h>
43 #include <sys/cmn_err.h>
44 #include <sys/debug.h>
45 #include <sys/inline.h>
46 #include <sys/disp.h>
47 #include <sys/class.h>
48 #include <sys/bitmap.h>
49 #include <sys/kmem.h>
50 #include <sys/cpuvar.h>
51 #include <sys/vtrace.h>
52 #include <sys/tnf.h>
53 #include <sys/cpupart.h>
54 #include <sys/lgrp.h>
55 #include <sys/pg.h>
56 #include <sys/cmt.h>
57 #include <sys/bitset.h>
58 #include <sys/schedctl.h>
59 #include <sys/atomic.h>
60 #include <sys/dtrace.h>
61 #include <sys/sdt.h>
62 #include <sys/archsystm.h>
63 #include <sys/smt.h>
64 
65 #include <vm/as.h>
66 
67 #define	BOUND_CPU	0x1
68 #define	BOUND_PARTITION	0x2
69 #define	BOUND_INTR	0x4
70 
71 /*
    * Dispatch queue allocation structure and functions.
    *
    * A disp_queue_info carries the state needed to swap a dispatch queue's
    * priority arrays for larger ones in three separate steps: allocate
    * (disp_dq_alloc), install (disp_dq_assign) and release (disp_dq_free).
    */
72 struct disp_queue_info {
73 	disp_t	*dp;		/* dispatch queue whose arrays are replaced */
74 	dispq_t *olddispq;	/* previous dp->disp_q, freed by disp_dq_free */
75 	dispq_t *newdispq;	/* replacement array from disp_dq_alloc */
76 	ulong_t	*olddqactmap;	/* previous dp->disp_qactmap, freed later */
77 	ulong_t	*newdqactmap;	/* replacement active-queue bitmap */
78 	int	oldnglobpris;	/* previous dp->disp_npri; sizes the frees */
79 };
80 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
81     disp_t *dp);
82 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
83 static void	disp_dq_free(struct disp_queue_info *dptr);
84 
85 /*
    * platform-specific routine to call when processor is idle; idle()
    * invokes it through this pointer, which defaults to generic_idle_cpu.
    */
86 static void	generic_idle_cpu();
87 void		(*idle_cpu)() = generic_idle_cpu;
88 
89 /* routines invoked when a CPU enters/exits the idle loop */
90 static void	idle_enter();
91 static void	idle_exit();
92 
93 /*
    * platform-specific routine to call when thread is enqueued; defaults
    * to generic_enq_thread.
    */
94 static void	generic_enq_thread(cpu_t *, int);
95 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
96 
97 pri_t	kpreemptpri;		/* priority where kernel preemption applies */
98 pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
99 pri_t	intr_pri;		/* interrupt thread priority base level */
100 
   /*
    * KPQPRI doubles as the "not tuned" marker for kpqpri: dispinit() and
    * disp_setup() replace kpqpri with kpreemptpri only while it still
    * equals KPQPRI.
    */
101 #define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
102 pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
103 disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
104 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
105 int	nswapped;		/* total number of swapped threads */
106 void	disp_swapped_enq(kthread_t *tp);
107 static void	disp_swapped_setrun(kthread_t *tp);
108 static void	cpu_resched(cpu_t *cp, pri_t tpri);
109 
110 /*
111  * If this is set, only interrupt threads will cause kernel preemptions.
112  * This is done by changing the value of kpreemptpri.  kpreemptpri
113  * will either be the max sysclass pri or the min interrupt pri.
     * (disp_setup() applies this by setting kpreemptpri to intr_pri + 1.)
114  */
115 int	only_intr_kpreempt;
116 
117 extern void set_idle_cpu(int cpun);
118 extern void unset_idle_cpu(int cpun);
119 static void setkpdq(kthread_t *tp, int borf);
    /* presumably the values for setkpdq()'s borf argument -- TODO confirm */
120 #define	SETKP_BACK	0
121 #define	SETKP_FRONT	1
122 /*
123  * Parameter that determines how recently a thread must have run
124  * on the CPU to be considered loosely-bound to that CPU to reduce
125  * cold cache effects.  The interval is in hertz.
     *
     * NOTE(review): "hertz" presumably means clock ticks here; confirm
     * against the code that compares against rechoose_interval.
126  */
127 #define	RECHOOSE_INTERVAL 3
128 int	rechoose_interval = RECHOOSE_INTERVAL;
129 
130 /*
131  * Parameter that determines how long (in nanoseconds) a thread must
132  * be sitting on a run queue before it can be stolen by another CPU
133  * to reduce migrations.  The interval is in nanoseconds.
134  *
135  * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
136  * to an appropriate value.  nosteal_nsec is set to NOSTEAL_UNINITIALIZED
137  * here indicating it is uninitialized; dispinit() calls
     * cmp_set_nosteal_interval() only while it still holds that value.
138  * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
139  *
140  */
141 #define	NOSTEAL_UNINITIALIZED	(-1)
142 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
143 extern void cmp_set_nosteal_interval(void);
144 
145 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
146 
147 disp_lock_t	transition_lock;	/* lock on transitioning threads */
148 disp_lock_t	stop_lock;		/* lock on stopped threads */
149 
150 static void	cpu_dispqalloc(int numpris);
151 
152 /*
153  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
154  * a thread because it was sitting on its run queue for a very short
155  * period of time.
     *
     * It is a sentinel pointer value, not a real thread: callers must
     * compare against it (as idle() and disp() do) before using the result.
156  */
157 #define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
158 
159 static kthread_t	*disp_getwork(cpu_t *to);
160 static kthread_t	*disp_getbest(disp_t *from);
161 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
162 
163 void	swtch_to(kthread_t *);
164 
165 /*
166  * dispatcher and scheduler initialization
167  */
168 
169 /*
170  * disp_setup - Common code to calculate and allocate dispatcher
171  *		variables and structures based on the maximum priority.
172  */
173 static void
174 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
175 {
176 	pri_t	newnglobpris;
177 
178 	ASSERT(MUTEX_HELD(&cpu_lock));
179 
180 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
181 
182 	if (newnglobpris > oldnglobpris) {
183 		/*
184 		 * Allocate new kp queues for each CPU partition.
185 		 */
186 		cpupart_kpqalloc(newnglobpris);
187 
188 		/*
189 		 * Allocate new dispatch queues for each CPU.
190 		 */
191 		cpu_dispqalloc(newnglobpris);
192 
193 		/*
194 		 * compute new interrupt thread base priority
195 		 */
196 		intr_pri = maxglobpri;
197 		if (only_intr_kpreempt) {
198 			kpreemptpri = intr_pri + 1;
199 			if (kpqpri == KPQPRI)
200 				kpqpri = kpreemptpri;
201 		}
202 		v.v_nglobpris = newnglobpris;
203 	}
204 }
205 
206 /*
207  * dispinit - Called to initialize all loaded classes and the
208  *	      dispatcher framework.
209  */
210 void
211 dispinit(void)
212 {
213 	id_t	cid;
214 	pri_t	maxglobpri;
215 	pri_t	cl_maxglobpri;
216 
217 	maxglobpri = -1;
218 
219 	/*
220 	 * Initialize transition lock, which will always be set.
221 	 */
222 	DISP_LOCK_INIT(&transition_lock);
223 	disp_lock_enter_high(&transition_lock);
224 	DISP_LOCK_INIT(&stop_lock);
225 
226 	mutex_enter(&cpu_lock);
227 	CPU->cpu_disp->disp_maxrunpri = -1;
228 	CPU->cpu_disp->disp_max_unbound_pri = -1;
229 
230 	/*
231 	 * Initialize the default CPU partition.
232 	 */
233 	cpupart_initialize_default();
234 	/*
235 	 * Call the class specific initialization functions for
236 	 * all pre-installed schedulers.
237 	 *
238 	 * We pass the size of a class specific parameter
239 	 * buffer to each of the initialization functions
240 	 * to try to catch problems with backward compatibility
241 	 * of class modules.
242 	 *
243 	 * For example a new class module running on an old system
244 	 * which didn't provide sufficiently large parameter buffers
245 	 * would be bad news. Class initialization modules can check for
246 	 * this and take action if they detect a problem.
247 	 */
248 
249 	for (cid = 0; cid < nclass; cid++) {
250 		sclass_t	*sc;
251 
252 		sc = &sclass[cid];
253 		if (SCHED_INSTALLED(sc)) {
254 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
255 			    &sc->cl_funcs);
256 			if (cl_maxglobpri > maxglobpri)
257 				maxglobpri = cl_maxglobpri;
258 		}
259 	}
260 
261 	/*
262 	 * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is
263 	 * to say, maxclsyspri + 1.  However, over time, the system has used
264 	 * more and more asynchronous kernel threads, with an increasing number
265 	 * of these doing work on direct behalf of higher-level software (e.g.,
266 	 * network processing).  This has led to potential priority inversions:
267 	 * threads doing low-priority lengthy kernel work can effectively
268 	 * delay kernel-level processing of higher-priority data. To minimize
269 	 * such inversions, we set kpreemptpri to be v_maxsyspri; anything in
270 	 * the kernel that runs at maxclsyspri will therefore induce kernel
271 	 * preemption, and this priority should be used if/when an asynchronous
272 	 * thread (or, as is often the case, task queue) is performing a task
273 	 * on behalf of higher-level software (or any task that is otherwise
274 	 * latency-sensitve).
275 	 */
276 	kpreemptpri = (pri_t)v.v_maxsyspri;
277 	if (kpqpri == KPQPRI)
278 		kpqpri = kpreemptpri;
279 
280 	ASSERT(maxglobpri >= 0);
281 	disp_setup(maxglobpri, 0);
282 
283 	mutex_exit(&cpu_lock);
284 
285 	/*
286 	 * Platform specific sticky scheduler setup.
287 	 */
288 	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
289 		cmp_set_nosteal_interval();
290 
291 	/*
292 	 * Get the default class ID; this may be later modified via
293 	 * dispadmin(1M).  This will load the class (normally TS) and that will
294 	 * call disp_add(), which is why we had to drop cpu_lock first.
295 	 */
296 	if (getcid(defaultclass, &defaultcid) != 0) {
297 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
298 		    defaultclass);
299 	}
300 }
301 
302 /*
303  * disp_add - Called with class pointer to initialize the dispatcher
304  *	      for a newly loaded class.
305  */
306 void
307 disp_add(sclass_t *clp)
308 {
309 	pri_t	maxglobpri;
310 	pri_t	cl_maxglobpri;
311 
312 	mutex_enter(&cpu_lock);
313 	/*
314 	 * Initialize the scheduler class.
315 	 */
316 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
317 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
318 	if (cl_maxglobpri > maxglobpri)
319 		maxglobpri = cl_maxglobpri;
320 
321 	/*
322 	 * Save old queue information.  Since we're initializing a
323 	 * new scheduling class which has just been loaded, then
324 	 * the size of the dispq may have changed.  We need to handle
325 	 * that here.
326 	 */
327 	disp_setup(maxglobpri, v.v_nglobpris);
328 
329 	mutex_exit(&cpu_lock);
330 }
331 
332 
333 /*
334  * For each CPU, allocate new dispatch queues
335  * with the stated number of priorities.
336  */
337 static void
338 cpu_dispqalloc(int numpris)
339 {
340 	cpu_t	*cpup;
341 	struct disp_queue_info	*disp_mem;
342 	int i, num;
343 
344 	ASSERT(MUTEX_HELD(&cpu_lock));
345 
346 	disp_mem = kmem_zalloc(NCPU *
347 	    sizeof (struct disp_queue_info), KM_SLEEP);
348 
349 	/*
350 	 * This routine must allocate all of the memory before stopping
351 	 * the cpus because it must not sleep in kmem_alloc while the
352 	 * CPUs are stopped.  Locks they hold will not be freed until they
353 	 * are restarted.
354 	 */
355 	i = 0;
356 	cpup = cpu_list;
357 	do {
358 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
359 		i++;
360 		cpup = cpup->cpu_next;
361 	} while (cpup != cpu_list);
362 	num = i;
363 
364 	pause_cpus(NULL, NULL);
365 	for (i = 0; i < num; i++)
366 		disp_dq_assign(&disp_mem[i], numpris);
367 	start_cpus();
368 
369 	/*
370 	 * I must free all of the memory after starting the cpus because
371 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
372 	 */
373 	for (i = 0; i < num; i++)
374 		disp_dq_free(&disp_mem[i]);
375 
376 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
377 }
378 
379 static void
380 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
381 {
382 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
383 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
384 	    sizeof (long), KM_SLEEP);
385 	dptr->dp = dp;
386 }
387 
388 static void
389 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
390 {
391 	disp_t	*dp;
392 
393 	dp = dptr->dp;
394 	dptr->olddispq = dp->disp_q;
395 	dptr->olddqactmap = dp->disp_qactmap;
396 	dptr->oldnglobpris = dp->disp_npri;
397 
398 	ASSERT(dptr->oldnglobpris < numpris);
399 
400 	if (dptr->olddispq != NULL) {
401 		/*
402 		 * Use kcopy because bcopy is platform-specific
403 		 * and could block while we might have paused the cpus.
404 		 */
405 		(void) kcopy(dptr->olddispq, dptr->newdispq,
406 		    dptr->oldnglobpris * sizeof (dispq_t));
407 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
408 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
409 		    sizeof (long));
410 	}
411 	dp->disp_q = dptr->newdispq;
412 	dp->disp_qactmap = dptr->newdqactmap;
413 	dp->disp_q_limit = &dptr->newdispq[numpris];
414 	dp->disp_npri = numpris;
415 }
416 
417 static void
418 disp_dq_free(struct disp_queue_info *dptr)
419 {
420 	if (dptr->olddispq != NULL)
421 		kmem_free(dptr->olddispq,
422 		    dptr->oldnglobpris * sizeof (dispq_t));
423 	if (dptr->olddqactmap != NULL)
424 		kmem_free(dptr->olddqactmap,
425 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
426 }
427 
428 /*
429  * For a newly created CPU, initialize the dispatch queue.
430  * This is called before the CPU is known through cpu[] or on any lists.
431  */
432 void
433 disp_cpu_init(cpu_t *cp)
434 {
435 	disp_t	*dp;
436 	dispq_t	*newdispq;
437 	ulong_t	*newdqactmap;
438 
439 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
440 
441 	if (cp == cpu0_disp.disp_cpu)
442 		dp = &cpu0_disp;
443 	else
444 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
445 	bzero(dp, sizeof (disp_t));
446 	cp->cpu_disp = dp;
447 	dp->disp_cpu = cp;
448 	dp->disp_maxrunpri = -1;
449 	dp->disp_max_unbound_pri = -1;
450 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
451 	/*
452 	 * Allocate memory for the dispatcher queue headers
453 	 * and the active queue bitmap.
454 	 */
455 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
456 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
457 	    sizeof (long), KM_SLEEP);
458 	dp->disp_q = newdispq;
459 	dp->disp_qactmap = newdqactmap;
460 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
461 	dp->disp_npri = v.v_nglobpris;
462 }
463 
464 void
465 disp_cpu_fini(cpu_t *cp)
466 {
467 	ASSERT(MUTEX_HELD(&cpu_lock));
468 
469 	disp_kp_free(cp->cpu_disp);
470 	if (cp->cpu_disp != &cpu0_disp)
471 		kmem_free(cp->cpu_disp, sizeof (disp_t));
472 }
473 
474 /*
475  * Allocate new, larger kpreempt dispatch queue to replace the old one.
476  */
477 void
478 disp_kp_alloc(disp_t *dq, pri_t npri)
479 {
480 	struct disp_queue_info	mem_info;
481 
482 	if (npri > dq->disp_npri) {
483 		/*
484 		 * Allocate memory for the new array.
485 		 */
486 		disp_dq_alloc(&mem_info, npri, dq);
487 
488 		/*
489 		 * We need to copy the old structures to the new
490 		 * and free the old.
491 		 */
492 		disp_dq_assign(&mem_info, npri);
493 		disp_dq_free(&mem_info);
494 	}
495 }
496 
497 /*
498  * Free dispatch queue.
499  * Used for the kpreempt queues for a removed CPU partition and
500  * for the per-CPU queues of deleted CPUs.
501  */
502 void
503 disp_kp_free(disp_t *dq)
504 {
505 	struct disp_queue_info	mem_info;
506 
507 	mem_info.olddispq = dq->disp_q;
508 	mem_info.olddqactmap = dq->disp_qactmap;
509 	mem_info.oldnglobpris = dq->disp_npri;
510 	disp_dq_free(&mem_info);
511 }
512 
513 /*
514  * End dispatcher and scheduler initialization.
515  */
516 
517 /*
518  * See if there's anything to do other than remain idle.
519  * Return non-zero if there is.
520  *
521  * This function must be called with high spl, or with
522  * kernel preemption disabled to prevent the partition's
523  * active cpu list from changing while being traversed.
524  *
525  * This is essentially a simpler version of disp_getwork()
526  * to be called by CPUs preparing to "halt".
527  */
528 int
529 disp_anywork(void)
530 {
531 	cpu_t		*cp = CPU;
532 	cpu_t		*ocp;
533 	volatile int	*local_nrunnable = &cp->cpu_disp->disp_nrunnable;
534 
535 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
536 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
537 			return (1);
538 
539 		for (ocp = cp->cpu_next_part; ocp != cp;
540 		    ocp = ocp->cpu_next_part) {
541 			ASSERT(CPU_ACTIVE(ocp));
542 
543 			/*
544 			 * Something has appeared on the local run queue.
545 			 */
546 			if (*local_nrunnable > 0)
547 				return (1);
548 			/*
549 			 * If we encounter another idle CPU that will
550 			 * soon be trolling around through disp_anywork()
551 			 * terminate our walk here and let this other CPU
552 			 * patrol the next part of the list.
553 			 */
554 			if (ocp->cpu_dispatch_pri == -1 &&
555 			    (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
556 				return (0);
557 			/*
558 			 * Work can be taken from another CPU if:
559 			 *	- There is unbound work on the run queue
560 			 *	- That work isn't a thread undergoing a
561 			 *	- context switch on an otherwise empty queue.
562 			 *	- The CPU isn't running the idle loop.
563 			 */
564 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
565 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
566 			    ocp->cpu_disp->disp_nrunnable == 1) &&
567 			    ocp->cpu_dispatch_pri != -1)
568 				return (1);
569 		}
570 	}
571 	return (0);
572 }
573 
574 /*
575  * Called when CPU enters the idle loop
576  */
577 static void
578 idle_enter()
579 {
580 	cpu_t		*cp = CPU;
581 
582 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
583 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
584 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
585 }
586 
587 /*
588  * Called when CPU exits the idle loop
589  */
590 static void
591 idle_exit()
592 {
593 	cpu_t		*cp = CPU;
594 
595 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
596 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
597 }
598 
599 /*
600  * Idle loop.
     *
     * Body of a CPU's idle thread; it never returns.  While there is
     * nothing to run it calls the platform routine behind idle_cpu.  When
     * work exists on the local queue it switches away with swtch(); when
     * disp_getwork() hands back a thread from elsewhere it switches
     * directly to that thread with swtch_to().  Every departure from the
     * loop is bracketed by idle_exit()/idle_enter().
601  */
602 void
603 idle()
604 {
605 	struct cpu	*cp = CPU;		/* pointer to this CPU */
606 	kthread_t	*t;			/* taken thread */
607 
608 	idle_enter();
609 
610 	/*
611 	 * Uniprocessor version of idle loop.
612 	 * Do this until notified that we're on an actual multiprocessor.
613 	 */
614 	while (ncpus == 1) {
615 		if (cp->cpu_disp->disp_nrunnable == 0) {
616 			(*idle_cpu)();
617 			continue;
618 		}
619 		idle_exit();
620 		swtch();
621 
622 		idle_enter(); /* returned from swtch */
623 	}
624 
625 	/*
626 	 * Multiprocessor idle loop.
627 	 */
628 	for (;;) {
629 		/*
630 		 * If CPU is completely quiesced by p_online(2), just wait
631 		 * here with minimal bus traffic until put online.
632 		 */
633 		while (cp->cpu_flags & CPU_QUIESCED)
634 			(*idle_cpu)();
635 
636 		if (cp->cpu_disp->disp_nrunnable != 0) {
637 			idle_exit();
638 			swtch();
639 		} else {
    			/*
    			 * Nothing queued locally.  An offline CPU does not
    			 * go looking for work on other queues.
    			 */
640 			if (cp->cpu_flags & CPU_OFFLINE)
641 				continue;
642 			if ((t = disp_getwork(cp)) == NULL) {
    				/*
    				 * No work anywhere.  Reset cpu_chosen_level
    				 * once the partition's kp queue is seen to
    				 * be empty, then idle.
    				 */
643 				if (cp->cpu_chosen_level != -1) {
644 					disp_t *dp = cp->cpu_disp;
645 					disp_t *kpq;
646 
647 					disp_lock_enter(&dp->disp_lock);
648 					/*
649 					 * Set kpq under lock to prevent
650 					 * migration between partitions.
651 					 */
652 					kpq = &cp->cpu_part->cp_kp_queue;
653 					if (kpq->disp_maxrunpri == -1)
654 						cp->cpu_chosen_level = -1;
655 					disp_lock_exit(&dp->disp_lock);
656 				}
657 				(*idle_cpu)();
658 				continue;
659 			}
660 			/*
661 			 * If there was a thread but we couldn't steal
662 			 * it, then keep trying.
663 			 */
664 			if (t == T_DONTSTEAL)
665 				continue;
666 			idle_exit();
667 			swtch_to(t);
668 		}
669 		idle_enter(); /* returned from swtch/swtch_to */
670 	}
671 }
672 
673 
674 /*
675  * Preempt the currently running thread in favor of the highest
676  * priority thread.  The class of the current thread controls
677  * where it goes on the dispatcher queues. If panicking, turn
678  * preemption off.
679  */
680 void
681 preempt()
682 {
683 	kthread_t 	*t = curthread;
684 	klwp_t 		*lwp = ttolwp(curthread);
685 
686 	if (panicstr)
687 		return;
688 
689 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
690 
691 	thread_lock(t);
692 
693 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
694 		/*
695 		 * this thread has already been chosen to be run on
696 		 * another CPU. Clear kprunrun on this CPU since we're
697 		 * already headed for swtch().
698 		 */
699 		CPU->cpu_kprunrun = 0;
700 		thread_unlock_nopreempt(t);
701 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
702 	} else {
703 		if (lwp != NULL)
704 			lwp->lwp_ru.nivcsw++;
705 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
706 		THREAD_TRANSITION(t);
707 		CL_PREEMPT(t);
708 		DTRACE_SCHED(preempt);
709 		thread_unlock_nopreempt(t);
710 
711 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
712 
713 		swtch();		/* clears CPU->cpu_runrun via disp() */
714 	}
715 }
716 
    /* Used by swtch() to obtain the thread an interrupt thread pinned. */
717 extern kthread_t *thread_unpin();
718 
719 /*
720  * disp() - find the highest priority thread for this processor to run, and
721  * set it in TS_ONPROC state so that resume() can be called to run it.
     *
     * Candidates are considered in this order:
     *   1. the partition's kp queue, while its best priority is at least
     *      this CPU's best and the CPU is not offline;
     *   2. the head of this CPU's highest-priority non-empty queue;
     *   3. if the local queue is empty and the CPU is not offline, whatever
     *      disp_getwork() finds;
     *   4. otherwise this CPU's idle thread.
     * A thread taken in step 1 or 2 must also pass disp_ratify(); when a
     * locally chosen thread fails it, selection restarts at "reschedule".
     *
     * Callers (swtch(), swtch_from_zombie()) note that this returns with
     * spl high.
722  */
723 static kthread_t *
724 disp()
725 {
726 	cpu_t		*cpup;
727 	disp_t		*dp;
728 	kthread_t	*tp;
729 	dispq_t		*dq;
730 	int		maxrunword;
731 	pri_t		pri;
732 	disp_t		*kpq;
733 
734 	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
735 
736 	cpup = CPU;
737 	/*
738 	 * Find the highest priority loaded, runnable thread.
739 	 */
740 	dp = cpup->cpu_disp;
741 
742 reschedule:
743 	/*
744 	 * If there is more important work on the global queue, with a
745 	 * priority at least as high as the maximum on this CPU, take it now.
746 	 */
747 	kpq = &cpup->cpu_part->cp_kp_queue;
748 	while ((pri = kpq->disp_maxrunpri) >= 0 &&
749 	    pri >= dp->disp_maxrunpri &&
750 	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
751 	    (tp = disp_getbest(kpq)) != NULL) {
752 		if (disp_ratify(tp, kpq) != NULL) {
753 			TRACE_1(TR_FAC_DISP, TR_DISP_END,
754 			    "disp_end:tid %p", tp);
755 			return (tp);
756 		}
757 	}
758 
759 	disp_lock_enter(&dp->disp_lock);
760 	pri = dp->disp_maxrunpri;
761 
762 	/*
763 	 * If there is nothing to run, look at what's runnable on other queues.
764 	 * Choose the idle thread if the CPU is quiesced.
765 	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
766 	 * interrupt threads, which will be the only threads on the CPU's own
767 	 * queue, but cannot run threads from other queues.
768 	 */
769 	if (pri == -1) {
770 		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
771 			disp_lock_exit(&dp->disp_lock);
772 			if ((tp = disp_getwork(cpup)) == NULL ||
773 			    tp == T_DONTSTEAL) {
    				/*
    				 * Nothing could be taken from elsewhere:
    				 * run the idle thread.  spl is raised here
    				 * because the lock was dropped with
    				 * disp_lock_exit() above.
    				 */
774 				tp = cpup->cpu_idle_thread;
775 				(void) splhigh();
776 				THREAD_ONPROC(tp, cpup);
777 				cpup->cpu_dispthread = tp;
778 				cpup->cpu_dispatch_pri = -1;
779 				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
780 				cpup->cpu_chosen_level = -1;
781 			}
782 		} else {
    			/* Offline with an empty queue: idle thread it is. */
783 			disp_lock_exit_high(&dp->disp_lock);
784 			tp = cpup->cpu_idle_thread;
785 			THREAD_ONPROC(tp, cpup);
786 			cpup->cpu_dispthread = tp;
787 			cpup->cpu_dispatch_pri = -1;
788 			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
789 			cpup->cpu_chosen_level = -1;
790 		}
791 		TRACE_1(TR_FAC_DISP, TR_DISP_END,
792 		    "disp_end:tid %p", tp);
793 		return (tp);
794 	}
795 
    	/* Take the thread at the head of the highest-priority queue. */
796 	dq = &dp->disp_q[pri];
797 	tp = dq->dq_first;
798 
799 	ASSERT(tp != NULL);
800 	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */
801 
802 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
803 
804 	/*
805 	 * Found it so remove it from queue.
806 	 */
807 	dp->disp_nrunnable--;
808 	dq->dq_sruncnt--;
809 	if ((dq->dq_first = tp->t_link) == NULL) {
810 		ulong_t	*dqactmap = dp->disp_qactmap;
811 
812 		ASSERT(dq->dq_sruncnt == 0);
813 		dq->dq_last = NULL;
814 
815 		/*
816 		 * The queue is empty, so the corresponding bit needs to be
817 		 * turned off in dqactmap.  If nrunnable != 0, we just took
818 		 * the last runnable thread off the highest queue, so
819 		 * recompute disp_maxrunpri.
820 		 */
821 		maxrunword = pri >> BT_ULSHIFT;
822 		dqactmap[maxrunword] &= ~BT_BIW(pri);
823 
824 		if (dp->disp_nrunnable == 0) {
825 			dp->disp_max_unbound_pri = -1;
826 			dp->disp_maxrunpri = -1;
827 		} else {
828 			int ipri;
829 
830 			ipri = bt_gethighbit(dqactmap, maxrunword);
831 			dp->disp_maxrunpri = ipri;
    			/* max_unbound_pri may not exceed the new maximum */
832 			if (ipri < dp->disp_max_unbound_pri)
833 				dp->disp_max_unbound_pri = ipri;
834 		}
835 	} else {
836 		tp->t_link = NULL;
837 	}
838 
839 	/*
840 	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
841 	 * out this thread before we have a chance to run it.
842 	 * While running, it is protected against swapping by t_lock.
843 	 */
844 	tp->t_schedflag |= TS_DONT_SWAP;
845 	cpup->cpu_dispthread = tp;		/* protected by spl only */
846 	cpup->cpu_dispatch_pri = pri;
847 	ASSERT(pri == DISP_PRIO(tp));
848 	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
849 	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */
850 
851 	ASSERT(tp != NULL);
852 	TRACE_1(TR_FAC_DISP, TR_DISP_END,
853 	    "disp_end:tid %p", tp);
854 
    	/* If the choice is not ratified, start the selection over. */
855 	if (disp_ratify(tp, kpq) == NULL)
856 		goto reschedule;
857 
858 	return (tp);
859 }
860 
861 /*
862  * swtch()
863  *	Find best runnable thread and run it.
864  *	Called with the current thread already switched to a new state,
865  *	on a sleep queue, run queue, stopped, and not zombied.
866  *	May be called at any spl level less than or equal to LOCK_LEVEL.
867  *	Always drops spl to the base level (spl0()).
     *
     *	Three outcomes are possible:
     *	  - the caller has t_intr set: the pinned thread obtained from
     *	    thread_unpin() is resumed via resume_from_intr();
     *	  - disp() picks a different thread: switch to it with resume();
     *	  - disp() picks the caller again: stay on the CPU and drop spl.
868  */
869 void
870 swtch()
871 {
872 	kthread_t	*t = curthread;
873 	kthread_t	*next;
874 	cpu_t		*cp;
875 
876 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
877 
878 	if (t->t_flag & T_INTR_THREAD)
879 		cpu_intr_swtch_enter(t);
880 
881 	if (t->t_intr != NULL) {
882 		/*
883 		 * We are an interrupt thread.  Setup and return
884 		 * the interrupted thread to be resumed.
885 		 */
886 		(void) splhigh();	/* block other scheduler action */
887 		cp = CPU;		/* now protected against migration */
888 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
889 		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
890 		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
891 		next = thread_unpin();
892 		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
893 		resume_from_intr(next);
894 	} else {
895 #ifdef	DEBUG
    		/*
    		 * Sanity check: the state is sampled without the thread
    		 * lock first and re-checked with it held, so the ASSERT
    		 * fires only if the thread really is still TS_ONPROC on
    		 * this CPU with t_preempt == 0.
    		 */
896 		if (t->t_state == TS_ONPROC &&
897 		    t->t_disp_queue->disp_cpu == CPU &&
898 		    t->t_preempt == 0) {
899 			thread_lock(t);
900 			ASSERT(t->t_state != TS_ONPROC ||
901 			    t->t_disp_queue->disp_cpu != CPU ||
902 			    t->t_preempt != 0);	/* cannot migrate */
903 			thread_unlock_nopreempt(t);
904 		}
905 #endif	/* DEBUG */
906 		cp = CPU;
907 		next = disp();		/* returns with spl high */
908 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
909 
910 		/* OK to steal anything left on run queue */
911 		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
912 
913 		if (next != t) {
914 			hrtime_t now;
915 
916 			now = gethrtime_unscaled();
917 			pg_ev_thread_swtch(cp, now, t, next);
918 
919 			/*
920 			 * If t was previously in the TS_ONPROC state,
921 			 * setfrontdq and setbackdq won't have set its t_waitrq.
922 			 * Since we now finally know that we're switching away
923 			 * from this thread, set its t_waitrq if it is on a run
924 			 * queue.
925 			 */
926 			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
927 				t->t_waitrq = now;
928 			}
929 
930 			/*
931 			 * restore mstate of thread that we are switching to
932 			 */
933 			restore_mstate(next);
934 
935 			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
    			/* record when this CPU last switched threads */
936 			cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
937 			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
938 
939 			if (dtrace_vtime_active)
940 				dtrace_vtime_switch(next);
941 
942 			resume(next);
943 			/*
944 			 * The TR_RESUME_END and TR_SWTCH_END trace points
945 			 * appear at the end of resume(), because we may not
946 			 * return here
947 			 */
948 		} else {
949 			if (t->t_flag & T_INTR_THREAD)
950 				cpu_intr_swtch_exit(t);
951 			/*
952 			 * Threads that enqueue themselves on a run queue defer
953 			 * setting t_waitrq. It is then either set in swtch()
954 			 * when the CPU is actually yielded, or not at all if it
955 			 * is remaining on the CPU.
956 			 * There is however a window between where the thread
957 			 * placed itself on a run queue, and where it selects
958 			 * itself in disp(), where a third party (eg. clock()
959 			 * doing tick processing) may have re-enqueued this
960 			 * thread, setting t_waitrq in the process. We detect
961 			 * this race by noticing that despite switching to
962 			 * ourself, our t_waitrq has been set, and should be
963 			 * cleared.
964 			 */
965 			if (t->t_waitrq != 0)
966 				t->t_waitrq = 0;
967 
968 			pg_ev_thread_remain(cp, t);
969 
970 			DTRACE_SCHED(remain__cpu);
971 			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
972 			(void) spl0();
973 		}
974 	}
975 }
976 
977 /*
978  * swtch_from_zombie()
979  *	Special case of swtch(), which allows checks for TS_ZOMB to be
980  *	eliminated from normal resume.
981  *	Find best runnable thread and run it.
982  *	Called with the current thread zombied.
983  *	Zombies cannot migrate, so CPU references are safe.
     *
     *	Unlike swtch(), disp() must not pick the caller again (this is
     *	asserted), and the switch is made with resume_from_zombie().
984  */
985 void
986 swtch_from_zombie()
987 {
988 	kthread_t	*next;
989 	cpu_t		*cpu = CPU;
990 
991 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
992 
993 	ASSERT(curthread->t_state == TS_ZOMB);
994 
995 	next = disp();			/* returns with spl high */
996 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
997 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
998 	ASSERT(next != curthread);
999 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1000 
1001 	pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
1002 
     	/* restore next thread to its previously running microstate */
1003 	restore_mstate(next);
1004 
1005 	if (dtrace_vtime_active)
1006 		dtrace_vtime_switch(next);
1007 
1008 	resume_from_zombie(next);
1009 	/*
1010 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1011 	 * appear at the end of resume(), because we certainly will not
1012 	 * return here
1013 	 */
1014 }
1015 
1016 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
1017 
1018 /*
1019  * search_disp_queues()
1020  *	Search the given dispatch queues for thread tp.
1021  *	Return 1 if tp is found, otherwise return 0.
1022  */
1023 static int
1024 search_disp_queues(disp_t *dp, kthread_t *tp)
1025 {
1026 	dispq_t		*dq;
1027 	dispq_t		*eq;
1028 
1029 	disp_lock_enter_high(&dp->disp_lock);
1030 
1031 	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1032 		kthread_t	*rp;
1033 
1034 		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1035 
1036 		for (rp = dq->dq_first; rp; rp = rp->t_link)
1037 			if (tp == rp) {
1038 				disp_lock_exit_high(&dp->disp_lock);
1039 				return (1);
1040 			}
1041 	}
1042 	disp_lock_exit_high(&dp->disp_lock);
1043 
1044 	return (0);
1045 }
1046 
1047 /*
1048  * thread_on_queue()
1049  *	Search all per-CPU dispatch queues and all partition-wide kpreempt
1050  *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
1051  */
1052 static int
1053 thread_on_queue(kthread_t *tp)
1054 {
1055 	cpu_t		*cp;
1056 	struct cpupart	*part;
1057 
1058 	ASSERT(getpil() >= DISP_LEVEL);
1059 
1060 	/*
1061 	 * Search the per-CPU dispatch queues for tp.
1062 	 */
1063 	cp = CPU;
1064 	do {
1065 		if (search_disp_queues(cp->cpu_disp, tp))
1066 			return (1);
1067 	} while ((cp = cp->cpu_next_onln) != CPU);
1068 
1069 	/*
1070 	 * Search the partition-wide kpreempt queues for tp.
1071 	 */
1072 	part = CPU->cpu_part;
1073 	do {
1074 		if (search_disp_queues(&part->cp_kp_queue, tp))
1075 			return (1);
1076 	} while ((part = part->cp_next) != CPU->cpu_part);
1077 
1078 	return (0);
1079 }
1080 
1081 #else
1082 
1083 #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
1084 
1085 #endif  /* DEBUG */
1086 
1087 /*
1088  * like swtch(), but switch to a specified thread taken from another CPU.
1089  *	called with spl high..
1090  */
1091 void
1092 swtch_to(kthread_t *next)
1093 {
1094 	cpu_t			*cp = CPU;
1095 	hrtime_t		now;
1096 
1097 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1098 
1099 	/*
1100 	 * Update context switch statistics.
1101 	 */
1102 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1103 
1104 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1105 
1106 	now = gethrtime_unscaled();
1107 	pg_ev_thread_swtch(cp, now, curthread, next);
1108 
1109 	/* OK to steal anything left on run queue */
1110 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1111 
1112 	/* record last execution time */
1113 	cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1114 
1115 	/*
1116 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1117 	 * won't have set its t_waitrq.  Since we now finally know that we're
1118 	 * switching away from this thread, set its t_waitrq if it is on a run
1119 	 * queue.
1120 	 */
1121 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1122 		curthread->t_waitrq = now;
1123 	}
1124 
1125 	/* restore next thread to previously running microstate */
1126 	restore_mstate(next);
1127 
1128 	if (dtrace_vtime_active)
1129 		dtrace_vtime_switch(next);
1130 
1131 	resume(next);
1132 	/*
1133 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1134 	 * appear at the end of resume(), because we may not
1135 	 * return here
1136 	 */
1137 }
1138 
1139 static void
1140 cpu_resched(cpu_t *cp, pri_t tpri)
1141 {
1142 	int	call_poke_cpu = 0;
1143 	pri_t   cpupri = cp->cpu_dispatch_pri;
1144 
1145 	if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
1146 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1147 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1148 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1149 			cp->cpu_runrun = 1;
1150 			aston(cp->cpu_dispthread);
1151 			if (tpri < kpreemptpri && cp != CPU)
1152 				call_poke_cpu = 1;
1153 		}
1154 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1155 			cp->cpu_kprunrun = 1;
1156 			if (cp != CPU)
1157 				call_poke_cpu = 1;
1158 		}
1159 	}
1160 
1161 	/*
1162 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1163 	 */
1164 	membar_enter();
1165 
1166 	if (call_poke_cpu)
1167 		poke_cpu(cp->cpu_id);
1168 }
1169 
/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
 * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
 * try to keep runqs perfectly balanced regardless of the thread priority.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)

/*
 * Macro that evaluates to true if it is likely that the thread has cache
 * warmth. This is based on the amount of time that has elapsed since the
 * thread last ran. If that amount of time is less than or equal to
 * "rechoose_interval" ticks, then we decide that the thread has enough
 * cache warmth to warrant some affinity for t->t_cpu.  The current thread
 * is always considered warm.
 *
 * The argument is parenthesized at every use so that any pointer-valued
 * expression may be passed safely.  Note that it is evaluated twice.
 */
#define	THREAD_HAS_CACHE_WARMTH(thread)	\
	(((thread) == curthread) ||	\
	((ddi_get_lbolt() - (thread)->t_disp_time) <= rechoose_interval))
1191 /*
1192  * Put the specified thread on the back of the dispatcher
1193  * queue corresponding to its current priority.
1194  *
1195  * Called with the thread in transition, onproc or stopped state
1196  * and locked (transition implies locked) and at high spl.
1197  * Returns with the thread in TS_RUN state and still locked.
1198  */
1199 void
1200 setbackdq(kthread_t *tp)
1201 {
1202 	dispq_t	*dq;
1203 	disp_t		*dp;
1204 	cpu_t		*cp;
1205 	pri_t		tpri;
1206 	int		bound;
1207 	boolean_t	self;
1208 
1209 	ASSERT(THREAD_LOCK_HELD(tp));
1210 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1211 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1212 
1213 	/*
1214 	 * If thread is "swapped" or on the swap queue don't
1215 	 * queue it, but wake sched.
1216 	 */
1217 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1218 		disp_swapped_setrun(tp);
1219 		return;
1220 	}
1221 
1222 	self = (tp == curthread);
1223 
1224 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1225 		bound = 1;
1226 	else
1227 		bound = 0;
1228 
1229 	tpri = DISP_PRIO(tp);
1230 	if (ncpus == 1)
1231 		cp = tp->t_cpu;
1232 	else if (!bound) {
1233 		if (tpri >= kpqpri) {
1234 			setkpdq(tp, SETKP_BACK);
1235 			return;
1236 		}
1237 
1238 		/*
1239 		 * We'll generally let this thread continue to run where
1240 		 * it last ran...but will consider migration if:
1241 		 * - The thread probably doesn't have much cache warmth.
1242 		 * - SMT exclusion would prefer us to run elsewhere
1243 		 * - The CPU where it last ran is the target of an offline
1244 		 *   request.
1245 		 * - The thread last ran outside its home lgroup.
1246 		 */
1247 		if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1248 		    !smt_should_run(tp, tp->t_cpu) ||
1249 		    (tp->t_cpu == cpu_inmotion) ||
1250 		    !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1251 			cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1252 		} else {
1253 			cp = tp->t_cpu;
1254 		}
1255 
1256 		if (tp->t_cpupart == cp->cpu_part) {
1257 			int	qlen;
1258 
1259 			/*
1260 			 * Perform any CMT load balancing
1261 			 */
1262 			cp = cmt_balance(tp, cp);
1263 
1264 			/*
1265 			 * Balance across the run queues
1266 			 */
1267 			qlen = RUNQ_LEN(cp, tpri);
1268 			if (tpri >= RUNQ_MATCH_PRI &&
1269 			    !(tp->t_schedflag & TS_RUNQMATCH))
1270 				qlen -= RUNQ_MAX_DIFF;
1271 			if (qlen > 0) {
1272 				cpu_t *newcp;
1273 
1274 				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1275 					newcp = cp->cpu_next_part;
1276 				} else if ((newcp = cp->cpu_next_lpl) == cp) {
1277 					newcp = cp->cpu_next_part;
1278 				}
1279 
1280 				if (smt_should_run(tp, newcp) &&
1281 				    RUNQ_LEN(newcp, tpri) < qlen) {
1282 					DTRACE_PROBE3(runq__balance,
1283 					    kthread_t *, tp,
1284 					    cpu_t *, cp, cpu_t *, newcp);
1285 					cp = newcp;
1286 				}
1287 			}
1288 		} else {
1289 			/*
1290 			 * Migrate to a cpu in the new partition.
1291 			 */
1292 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
1293 			    tp->t_pri);
1294 		}
1295 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1296 	} else {
1297 		/*
1298 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1299 		 * a short time until weak binding that existed when the
1300 		 * strong binding was established has dropped) so we must
1301 		 * favour weak binding over strong.
1302 		 */
1303 		cp = tp->t_weakbound_cpu ?
1304 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1305 	}
1306 	/*
1307 	 * A thread that is ONPROC may be temporarily placed on the run queue
1308 	 * but then chosen to run again by disp.  If the thread we're placing on
1309 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1310 	 * replacement process is actually scheduled in swtch().  In this
1311 	 * situation, curthread is the only thread that could be in the ONPROC
1312 	 * state.
1313 	 */
1314 	if ((!self) && (tp->t_waitrq == 0)) {
1315 		hrtime_t curtime;
1316 
1317 		curtime = gethrtime_unscaled();
1318 		(void) cpu_update_pct(tp, curtime);
1319 		tp->t_waitrq = curtime;
1320 	} else {
1321 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1322 	}
1323 
1324 	dp = cp->cpu_disp;
1325 	disp_lock_enter_high(&dp->disp_lock);
1326 
1327 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1328 	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1329 	    tpri, cp, tp);
1330 
1331 #ifndef NPROBE
1332 	/* Kernel probe */
1333 	if (tnf_tracing_active)
1334 		tnf_thread_queue(tp, cp, tpri);
1335 #endif /* NPROBE */
1336 
1337 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1338 
1339 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1340 	tp->t_disp_queue = dp;
1341 	tp->t_link = NULL;
1342 
1343 	dq = &dp->disp_q[tpri];
1344 	dp->disp_nrunnable++;
1345 	if (!bound)
1346 		dp->disp_steal = 0;
1347 	membar_enter();
1348 
1349 	if (dq->dq_sruncnt++ != 0) {
1350 		ASSERT(dq->dq_first != NULL);
1351 		dq->dq_last->t_link = tp;
1352 		dq->dq_last = tp;
1353 	} else {
1354 		ASSERT(dq->dq_first == NULL);
1355 		ASSERT(dq->dq_last == NULL);
1356 		dq->dq_first = dq->dq_last = tp;
1357 		BT_SET(dp->disp_qactmap, tpri);
1358 		if (tpri > dp->disp_maxrunpri) {
1359 			dp->disp_maxrunpri = tpri;
1360 			membar_enter();
1361 			cpu_resched(cp, tpri);
1362 		}
1363 	}
1364 
1365 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1366 		if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1367 			/*
1368 			 * If there are no other unbound threads on the
1369 			 * run queue, don't allow other CPUs to steal
1370 			 * this thread while we are in the middle of a
1371 			 * context switch. We may just switch to it
1372 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1373 			 * in swtch and swtch_to.
1374 			 */
1375 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1376 		}
1377 		dp->disp_max_unbound_pri = tpri;
1378 	}
1379 	(*disp_enq_thread)(cp, bound);
1380 }
1381 
1382 /*
1383  * Put the specified thread on the front of the dispatcher
1384  * queue corresponding to its current priority.
1385  *
1386  * Called with the thread in transition, onproc or stopped state
1387  * and locked (transition implies locked) and at high spl.
1388  * Returns with the thread in TS_RUN state and still locked.
1389  */
1390 void
1391 setfrontdq(kthread_t *tp)
1392 {
1393 	disp_t		*dp;
1394 	dispq_t		*dq;
1395 	cpu_t		*cp;
1396 	pri_t		tpri;
1397 	int		bound;
1398 
1399 	ASSERT(THREAD_LOCK_HELD(tp));
1400 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1401 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1402 
1403 	/*
1404 	 * If thread is "swapped" or on the swap queue don't
1405 	 * queue it, but wake sched.
1406 	 */
1407 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1408 		disp_swapped_setrun(tp);
1409 		return;
1410 	}
1411 
1412 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1413 		bound = 1;
1414 	else
1415 		bound = 0;
1416 
1417 	tpri = DISP_PRIO(tp);
1418 	if (ncpus == 1)
1419 		cp = tp->t_cpu;
1420 	else if (!bound) {
1421 		if (tpri >= kpqpri) {
1422 			setkpdq(tp, SETKP_FRONT);
1423 			return;
1424 		}
1425 		cp = tp->t_cpu;
1426 		if (tp->t_cpupart == cp->cpu_part) {
1427 			/*
1428 			 * We'll generally let this thread continue to run
1429 			 * where it last ran, but will consider migration if:
1430 			 * - The thread last ran outside its home lgroup.
1431 			 * - The CPU where it last ran is the target of an
1432 			 *   offline request (a thread_nomigrate() on the in
1433 			 *   motion CPU relies on this when forcing a preempt).
1434 			 * - The thread isn't the highest priority thread where
1435 			 *   it last ran, and it is considered not likely to
1436 			 *   have significant cache warmth.
1437 			 */
1438 			if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
1439 			    cp == cpu_inmotion ||
1440 			    (tpri < cp->cpu_disp->disp_maxrunpri &&
1441 			    !THREAD_HAS_CACHE_WARMTH(tp))) {
1442 				cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1443 			}
1444 		} else {
1445 			/*
1446 			 * Migrate to a cpu in the new partition.
1447 			 */
1448 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1449 			    tp, tp->t_pri);
1450 		}
1451 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1452 	} else {
1453 		/*
1454 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1455 		 * a short time until weak binding that existed when the
1456 		 * strong binding was established has dropped) so we must
1457 		 * favour weak binding over strong.
1458 		 */
1459 		cp = tp->t_weakbound_cpu ?
1460 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1461 	}
1462 
1463 	/*
1464 	 * A thread that is ONPROC may be temporarily placed on the run queue
1465 	 * but then chosen to run again by disp.  If the thread we're placing on
1466 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1467 	 * replacement process is actually scheduled in swtch().  In this
1468 	 * situation, curthread is the only thread that could be in the ONPROC
1469 	 * state.
1470 	 */
1471 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1472 		hrtime_t curtime;
1473 
1474 		curtime = gethrtime_unscaled();
1475 		(void) cpu_update_pct(tp, curtime);
1476 		tp->t_waitrq = curtime;
1477 	} else {
1478 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1479 	}
1480 
1481 	dp = cp->cpu_disp;
1482 	disp_lock_enter_high(&dp->disp_lock);
1483 
1484 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1485 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1486 
1487 #ifndef NPROBE
1488 	/* Kernel probe */
1489 	if (tnf_tracing_active)
1490 		tnf_thread_queue(tp, cp, tpri);
1491 #endif /* NPROBE */
1492 
1493 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1494 
1495 	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
1496 	tp->t_disp_queue = dp;
1497 
1498 	dq = &dp->disp_q[tpri];
1499 	dp->disp_nrunnable++;
1500 	if (!bound)
1501 		dp->disp_steal = 0;
1502 	membar_enter();
1503 
1504 	if (dq->dq_sruncnt++ != 0) {
1505 		ASSERT(dq->dq_last != NULL);
1506 		tp->t_link = dq->dq_first;
1507 		dq->dq_first = tp;
1508 	} else {
1509 		ASSERT(dq->dq_last == NULL);
1510 		ASSERT(dq->dq_first == NULL);
1511 		tp->t_link = NULL;
1512 		dq->dq_first = dq->dq_last = tp;
1513 		BT_SET(dp->disp_qactmap, tpri);
1514 		if (tpri > dp->disp_maxrunpri) {
1515 			dp->disp_maxrunpri = tpri;
1516 			membar_enter();
1517 			cpu_resched(cp, tpri);
1518 		}
1519 	}
1520 
1521 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1522 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1523 		    cp == CPU) {
1524 			/*
1525 			 * If there are no other unbound threads on the
1526 			 * run queue, don't allow other CPUs to steal
1527 			 * this thread while we are in the middle of a
1528 			 * context switch. We may just switch to it
1529 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1530 			 * in swtch and swtch_to.
1531 			 */
1532 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1533 		}
1534 		dp->disp_max_unbound_pri = tpri;
1535 	}
1536 	(*disp_enq_thread)(cp, bound);
1537 }
1538 
1539 /*
1540  * Put a high-priority unbound thread on the kp queue
1541  */
1542 static void
1543 setkpdq(kthread_t *tp, int borf)
1544 {
1545 	dispq_t	*dq;
1546 	disp_t	*dp;
1547 	cpu_t	*cp;
1548 	pri_t	tpri;
1549 
1550 	tpri = DISP_PRIO(tp);
1551 
1552 	dp = &tp->t_cpupart->cp_kp_queue;
1553 	disp_lock_enter_high(&dp->disp_lock);
1554 
1555 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1556 
1557 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1558 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1559 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1560 	tp->t_disp_queue = dp;
1561 	dp->disp_nrunnable++;
1562 	dq = &dp->disp_q[tpri];
1563 
1564 	if (dq->dq_sruncnt++ != 0) {
1565 		if (borf == SETKP_BACK) {
1566 			ASSERT(dq->dq_first != NULL);
1567 			tp->t_link = NULL;
1568 			dq->dq_last->t_link = tp;
1569 			dq->dq_last = tp;
1570 		} else {
1571 			ASSERT(dq->dq_last != NULL);
1572 			tp->t_link = dq->dq_first;
1573 			dq->dq_first = tp;
1574 		}
1575 	} else {
1576 		if (borf == SETKP_BACK) {
1577 			ASSERT(dq->dq_first == NULL);
1578 			ASSERT(dq->dq_last == NULL);
1579 			dq->dq_first = dq->dq_last = tp;
1580 		} else {
1581 			ASSERT(dq->dq_last == NULL);
1582 			ASSERT(dq->dq_first == NULL);
1583 			tp->t_link = NULL;
1584 			dq->dq_first = dq->dq_last = tp;
1585 		}
1586 		BT_SET(dp->disp_qactmap, tpri);
1587 		if (tpri > dp->disp_max_unbound_pri)
1588 			dp->disp_max_unbound_pri = tpri;
1589 		if (tpri > dp->disp_maxrunpri) {
1590 			dp->disp_maxrunpri = tpri;
1591 			membar_enter();
1592 		}
1593 	}
1594 
1595 	cp = tp->t_cpu;
1596 	if (tp->t_cpupart != cp->cpu_part) {
1597 		/* migrate to a cpu in the new partition */
1598 		cp = tp->t_cpupart->cp_cpulist;
1599 	}
1600 	cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
1601 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1602 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1603 
1604 #ifndef NPROBE
1605 	/* Kernel probe */
1606 	if (tnf_tracing_active)
1607 		tnf_thread_queue(tp, cp, tpri);
1608 #endif /* NPROBE */
1609 
1610 	if (cp->cpu_chosen_level < tpri)
1611 		cp->cpu_chosen_level = tpri;
1612 	cpu_resched(cp, tpri);
1613 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1614 	(*disp_enq_thread)(cp, 0);
1615 }
1616 
1617 /*
1618  * Remove a thread from the dispatcher queue if it is on it.
1619  * It is not an error if it is not found but we return whether
1620  * or not it was found in case the caller wants to check.
1621  */
1622 int
1623 dispdeq(kthread_t *tp)
1624 {
1625 	disp_t		*dp;
1626 	dispq_t		*dq;
1627 	kthread_t	*rp;
1628 	kthread_t	*trp;
1629 	kthread_t	**ptp;
1630 	int		tpri;
1631 
1632 	ASSERT(THREAD_LOCK_HELD(tp));
1633 
1634 	if (tp->t_state != TS_RUN)
1635 		return (0);
1636 
1637 	/*
1638 	 * The thread is "swapped" or is on the swap queue and
1639 	 * hence no longer on the run queue, so return true.
1640 	 */
1641 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1642 		return (1);
1643 
1644 	tpri = DISP_PRIO(tp);
1645 	dp = tp->t_disp_queue;
1646 	ASSERT(tpri < dp->disp_npri);
1647 	dq = &dp->disp_q[tpri];
1648 	ptp = &dq->dq_first;
1649 	rp = *ptp;
1650 	trp = NULL;
1651 
1652 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1653 
1654 	/*
1655 	 * Search for thread in queue.
1656 	 * Double links would simplify this at the expense of disp/setrun.
1657 	 */
1658 	while (rp != tp && rp != NULL) {
1659 		trp = rp;
1660 		ptp = &trp->t_link;
1661 		rp = trp->t_link;
1662 	}
1663 
1664 	if (rp == NULL) {
1665 		panic("dispdeq: thread not on queue");
1666 	}
1667 
1668 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1669 
1670 	/*
1671 	 * Found it so remove it from queue.
1672 	 */
1673 	if ((*ptp = rp->t_link) == NULL)
1674 		dq->dq_last = trp;
1675 
1676 	dp->disp_nrunnable--;
1677 	if (--dq->dq_sruncnt == 0) {
1678 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1679 		if (dp->disp_nrunnable == 0) {
1680 			dp->disp_max_unbound_pri = -1;
1681 			dp->disp_maxrunpri = -1;
1682 		} else if (tpri == dp->disp_maxrunpri) {
1683 			int ipri;
1684 
1685 			ipri = bt_gethighbit(dp->disp_qactmap,
1686 			    dp->disp_maxrunpri >> BT_ULSHIFT);
1687 			if (ipri < dp->disp_max_unbound_pri)
1688 				dp->disp_max_unbound_pri = ipri;
1689 			dp->disp_maxrunpri = ipri;
1690 		}
1691 	}
1692 	tp->t_link = NULL;
1693 	THREAD_TRANSITION(tp);		/* put in intermediate state */
1694 	return (1);
1695 }
1696 
1697 
1698 /*
1699  * dq_sruninc and dq_srundec are public functions for
1700  * incrementing/decrementing the sruncnts when a thread on
1701  * a dispatcher queue is made schedulable/unschedulable by
1702  * resetting the TS_LOAD flag.
1703  *
1704  * The caller MUST have the thread lock and therefore the dispatcher
1705  * queue lock so that the operation which changes
1706  * the flag, the operation that checks the status of the thread to
1707  * determine if it's on a disp queue AND the call to this function
1708  * are one atomic operation with respect to interrupts.
1709  */
1710 
1711 /*
1712  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1713  */
1714 void
1715 dq_sruninc(kthread_t *t)
1716 {
1717 	ASSERT(t->t_state == TS_RUN);
1718 	ASSERT(t->t_schedflag & TS_LOAD);
1719 
1720 	THREAD_TRANSITION(t);
1721 	setfrontdq(t);
1722 }
1723 
1724 /*
1725  * See comment on calling conventions above.
1726  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1727  */
1728 void
1729 dq_srundec(kthread_t *t)
1730 {
1731 	ASSERT(t->t_schedflag & TS_LOAD);
1732 
1733 	(void) dispdeq(t);
1734 	disp_swapped_enq(t);
1735 }
1736 
1737 /*
1738  * Change the dispatcher lock of thread to the "swapped_lock"
1739  * and return with thread lock still held.
1740  *
1741  * Called with thread_lock held, in transition state, and at high spl.
1742  */
1743 void
1744 disp_swapped_enq(kthread_t *tp)
1745 {
1746 	ASSERT(THREAD_LOCK_HELD(tp));
1747 	ASSERT(tp->t_schedflag & TS_LOAD);
1748 
1749 	switch (tp->t_state) {
1750 	case TS_RUN:
1751 		disp_lock_enter_high(&swapped_lock);
1752 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1753 		break;
1754 	case TS_ONPROC:
1755 		disp_lock_enter_high(&swapped_lock);
1756 		THREAD_TRANSITION(tp);
1757 		wake_sched_sec = 1;		/* tell clock to wake sched */
1758 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1759 		break;
1760 	default:
1761 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1762 	}
1763 }
1764 
1765 /*
1766  * This routine is called by setbackdq/setfrontdq if the thread is
1767  * not loaded or loaded and on the swap queue.
1768  *
1769  * Thread state TS_SLEEP implies that a swapped thread
1770  * has been woken up and needs to be swapped in by the swapper.
1771  *
1772  * Thread state TS_RUN, it implies that the priority of a swapped
1773  * thread is being increased by scheduling class (e.g. ts_update).
1774  */
1775 static void
1776 disp_swapped_setrun(kthread_t *tp)
1777 {
1778 	ASSERT(THREAD_LOCK_HELD(tp));
1779 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1780 
1781 	switch (tp->t_state) {
1782 	case TS_SLEEP:
1783 		disp_lock_enter_high(&swapped_lock);
1784 		/*
1785 		 * Wakeup sched immediately (i.e., next tick) if the
1786 		 * thread priority is above maxclsyspri.
1787 		 */
1788 		if (DISP_PRIO(tp) > maxclsyspri)
1789 			wake_sched = 1;
1790 		else
1791 			wake_sched_sec = 1;
1792 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1793 		break;
1794 	case TS_RUN:				/* called from ts_update */
1795 		break;
1796 	default:
1797 		panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1798 	}
1799 }
1800 
1801 /*
1802  *	Make a thread give up its processor.  Find the processor on
1803  *	which this thread is executing, and have that processor
1804  *	preempt.
1805  *
1806  *	We allow System Duty Cycle (SDC) threads to be preempted even if
1807  *	they are running at kernel priorities.  To implement this, we always
1808  *	set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
1809  *	calls cpu_surrender() very often, we only preempt if there is anyone
1810  *	competing with us.
1811  */
1812 void
1813 cpu_surrender(kthread_t *tp)
1814 {
1815 	cpu_t	*cpup;
1816 	int	max_pri;
1817 	int	max_run_pri;
1818 	klwp_t	*lwp;
1819 
1820 	ASSERT(THREAD_LOCK_HELD(tp));
1821 
1822 	if (tp->t_state != TS_ONPROC)
1823 		return;
1824 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
1825 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1826 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1827 	if (max_pri < max_run_pri)
1828 		max_pri = max_run_pri;
1829 
1830 	if (tp->t_cid == sysdccid) {
1831 		uint_t t_pri = DISP_PRIO(tp);
1832 		if (t_pri > max_pri)
1833 			return;		/* we are not competing w/ anyone */
1834 		cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1835 	} else {
1836 		cpup->cpu_runrun = 1;
1837 		if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1838 			cpup->cpu_kprunrun = 1;
1839 		}
1840 	}
1841 
1842 	/*
1843 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1844 	 */
1845 	membar_enter();
1846 
1847 	DTRACE_SCHED1(surrender, kthread_t *, tp);
1848 
1849 	/*
1850 	 * Make the target thread take an excursion through trap()
1851 	 * to do preempt() (unless we're already in trap or post_syscall,
1852 	 * calling cpu_surrender via CL_TRAPRET).
1853 	 */
1854 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1855 	    lwp->lwp_state != LWP_USER) {
1856 		aston(tp);
1857 		if (cpup != CPU)
1858 			poke_cpu(cpup->cpu_id);
1859 	}
1860 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1861 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
1862 }
1863 
1864 /*
1865  * Commit to and ratify a scheduling decision
1866  */
1867 /*ARGSUSED*/
1868 static kthread_t *
1869 disp_ratify(kthread_t *tp, disp_t *kpq)
1870 {
1871 	pri_t	tpri, maxpri;
1872 	pri_t	maxkpri;
1873 	cpu_t	*cpup;
1874 
1875 	ASSERT(tp != NULL);
1876 	/*
1877 	 * Commit to, then ratify scheduling decision
1878 	 */
1879 	cpup = CPU;
1880 	if (cpup->cpu_runrun != 0)
1881 		cpup->cpu_runrun = 0;
1882 	if (cpup->cpu_kprunrun != 0)
1883 		cpup->cpu_kprunrun = 0;
1884 	if (cpup->cpu_chosen_level != -1)
1885 		cpup->cpu_chosen_level = -1;
1886 	membar_enter();
1887 	tpri = DISP_PRIO(tp);
1888 	maxpri = cpup->cpu_disp->disp_maxrunpri;
1889 	maxkpri = kpq->disp_maxrunpri;
1890 	if (maxpri < maxkpri)
1891 		maxpri = maxkpri;
1892 	if (tpri < maxpri) {
1893 		/*
1894 		 * should have done better
1895 		 * put this one back and indicate to try again
1896 		 */
1897 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
1898 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1899 		thread_lock_high(tp);
1900 		THREAD_TRANSITION(tp);
1901 		setfrontdq(tp);
1902 		thread_unlock_nopreempt(tp);
1903 
1904 		tp = NULL;
1905 	}
1906 	return (tp);
1907 }
1908 
1909 /*
1910  * See if there is any work on the dispatcher queue for other CPUs.
1911  * If there is, dequeue the best thread and return.
1912  */
1913 static kthread_t *
1914 disp_getwork(cpu_t *cp)
1915 {
1916 	cpu_t		*ocp;		/* other CPU */
1917 	cpu_t		*ocp_start;
1918 	cpu_t		*tcp;		/* target local CPU */
1919 	kthread_t	*tp;
1920 	kthread_t	*retval = NULL;
1921 	pri_t		maxpri;
1922 	disp_t		*kpq;		/* kp queue for this partition */
1923 	lpl_t		*lpl, *lpl_leaf;
1924 	int		leafidx, startidx;
1925 	hrtime_t	stealtime;
1926 	lgrp_id_t	local_id;
1927 
1928 	maxpri = -1;
1929 	tcp = NULL;
1930 
1931 	kpq = &cp->cpu_part->cp_kp_queue;
1932 	while (kpq->disp_maxrunpri >= 0) {
1933 		/*
1934 		 * Try to take a thread from the kp_queue.
1935 		 */
1936 		tp = (disp_getbest(kpq));
1937 		if (tp)
1938 			return (disp_ratify(tp, kpq));
1939 	}
1940 
1941 	kpreempt_disable();		/* protect the cpu_active list */
1942 
1943 	/*
1944 	 * Try to find something to do on another CPU's run queue.
1945 	 * Loop through all other CPUs looking for the one with the highest
1946 	 * priority unbound thread.
1947 	 *
1948 	 * On NUMA machines, the partition's CPUs are consulted in order of
1949 	 * distance from the current CPU. This way, the first available
1950 	 * work found is also the closest, and will suffer the least
1951 	 * from being migrated.
1952 	 */
1953 	lpl = lpl_leaf = cp->cpu_lpl;
1954 	local_id = lpl_leaf->lpl_lgrpid;
1955 	leafidx = startidx = 0;
1956 
1957 	/*
1958 	 * This loop traverses the lpl hierarchy. Higher level lpls represent
1959 	 * broader levels of locality
1960 	 */
1961 	do {
1962 		/* This loop iterates over the lpl's leaves */
1963 		do {
1964 			if (lpl_leaf != cp->cpu_lpl)
1965 				ocp = lpl_leaf->lpl_cpus;
1966 			else
1967 				ocp = cp->cpu_next_lpl;
1968 
1969 			/* This loop iterates over the CPUs in the leaf */
1970 			ocp_start = ocp;
1971 			do {
1972 				pri_t pri;
1973 
1974 				ASSERT(CPU_ACTIVE(ocp));
1975 
1976 				/*
1977 				 * End our stroll around this lpl if:
1978 				 *
1979 				 * - Something became runnable on the local
1980 				 *   queue...which also ends our stroll around
1981 				 *   the partition.
1982 				 *
1983 				 * - We happen across another idle CPU.
1984 				 *   Since it is patrolling the next portion
1985 				 *   of the lpl's list (assuming it's not
1986 				 *   halted, or busy servicing an interrupt),
1987 				 *   move to the next higher level of locality.
1988 				 */
1989 				if (cp->cpu_disp->disp_nrunnable != 0) {
1990 					kpreempt_enable();
1991 					return (NULL);
1992 				}
1993 				if (ocp->cpu_dispatch_pri == -1) {
1994 					if (ocp->cpu_disp_flags &
1995 					    CPU_DISP_HALTED ||
1996 					    ocp->cpu_intr_actv != 0)
1997 						continue;
1998 					else
1999 						goto next_level;
2000 				}
2001 
2002 				/*
2003 				 * If there's only one thread and the CPU
2004 				 * is in the middle of a context switch,
2005 				 * or it's currently running the idle thread,
2006 				 * don't steal it.
2007 				 */
2008 				if ((ocp->cpu_disp_flags &
2009 				    CPU_DISP_DONTSTEAL) &&
2010 				    ocp->cpu_disp->disp_nrunnable == 1)
2011 					continue;
2012 
2013 				pri = ocp->cpu_disp->disp_max_unbound_pri;
2014 				if (pri > maxpri) {
2015 					/*
2016 					 * Don't steal threads that we attempted
2017 					 * to steal recently until they're ready
2018 					 * to be stolen again.
2019 					 */
2020 					stealtime = ocp->cpu_disp->disp_steal;
2021 					if (stealtime == 0 ||
2022 					    stealtime - gethrtime() <= 0) {
2023 						maxpri = pri;
2024 						tcp = ocp;
2025 					} else {
2026 						/*
2027 						 * Don't update tcp, just set
2028 						 * the retval to T_DONTSTEAL, so
2029 						 * that if no acceptable CPUs
2030 						 * are found the return value
2031 						 * will be T_DONTSTEAL rather
2032 						 * then NULL.
2033 						 */
2034 						retval = T_DONTSTEAL;
2035 					}
2036 				}
2037 			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2038 
2039 			/*
2040 			 * Iterate to the next leaf lpl in the resource set
2041 			 * at this level of locality. If we hit the end of
2042 			 * the set, wrap back around to the beginning.
2043 			 *
2044 			 * Note: This iteration is NULL terminated for a reason
2045 			 * see lpl_topo_bootstrap() in lgrp.c for details.
2046 			 */
2047 			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2048 				leafidx = 0;
2049 				lpl_leaf = lpl->lpl_rset[leafidx];
2050 			}
2051 		} while (leafidx != startidx);
2052 
2053 next_level:
2054 		/*
2055 		 * Expand the search to include farther away CPUs (next
2056 		 * locality level). The closer CPUs that have already been
2057 		 * checked will be checked again. In doing so, idle CPUs
2058 		 * will tend to be more aggresive about stealing from CPUs
2059 		 * that are closer (since the closer CPUs will be considered
2060 		 * more often).
2061 		 * Begin at this level with the CPUs local leaf lpl.
2062 		 */
2063 		if ((lpl = lpl->lpl_parent) != NULL) {
2064 			leafidx = startidx = lpl->lpl_id2rset[local_id];
2065 			lpl_leaf = lpl->lpl_rset[leafidx];
2066 		}
2067 	} while (!tcp && lpl);
2068 
2069 	kpreempt_enable();
2070 
2071 	/*
2072 	 * If another queue looks good, and there is still nothing on
2073 	 * the local queue, try to transfer one or more threads
2074 	 * from it to our queue.
2075 	 */
2076 	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2077 		tp = disp_getbest(tcp->cpu_disp);
2078 		if (tp == NULL || tp == T_DONTSTEAL)
2079 			return (tp);
2080 		return (disp_ratify(tp, kpq));
2081 	}
2082 	return (retval);
2083 }
2084 
2085 
2086 /*
2087  * disp_fix_unbound_pri()
2088  *	Determines the maximum priority of unbound threads on the queue.
2089  *	The priority is kept for the queue, but is only increased, never
2090  *	reduced unless some CPU is looking for something on that queue.
2091  *
2092  *	The priority argument is the known upper limit.
2093  *
2094  *	Perhaps this should be kept accurately, but that probably means
2095  *	separate bitmaps for bound and unbound threads.  Since only idled
2096  *	CPUs will have to do this recalculation, it seems better this way.
2097  */
2098 static void
2099 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2100 {
2101 	kthread_t	*tp;
2102 	dispq_t		*dq;
2103 	ulong_t		*dqactmap = dp->disp_qactmap;
2104 	ulong_t		mapword;
2105 	int		wx;
2106 
2107 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2108 
2109 	ASSERT(pri >= 0);			/* checked by caller */
2110 
2111 	/*
2112 	 * Start the search at the next lowest priority below the supplied
2113 	 * priority.  This depends on the bitmap implementation.
2114 	 */
2115 	do {
2116 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
2117 
2118 		/*
2119 		 * Form mask for all lower priorities in the word.
2120 		 */
2121 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2122 
2123 		/*
2124 		 * Get next lower active priority.
2125 		 */
2126 		if (mapword != 0) {
2127 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2128 		} else if (wx > 0) {
2129 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2130 			if (pri < 0)
2131 				break;
2132 		} else {
2133 			pri = -1;
2134 			break;
2135 		}
2136 
2137 		/*
2138 		 * Search the queue for unbound, runnable threads.
2139 		 */
2140 		dq = &dp->disp_q[pri];
2141 		tp = dq->dq_first;
2142 
2143 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2144 			tp = tp->t_link;
2145 		}
2146 
2147 		/*
2148 		 * If a thread was found, set the priority and return.
2149 		 */
2150 	} while (tp == NULL);
2151 
2152 	/*
2153 	 * pri holds the maximum unbound thread priority or -1.
2154 	 */
2155 	if (dp->disp_max_unbound_pri != pri)
2156 		dp->disp_max_unbound_pri = pri;
2157 }
2158 
2159 /*
2160  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2161  * 	check if the CPU to which is was previously bound should have
2162  * 	its disp_max_unbound_pri increased.
2163  */
2164 void
2165 disp_adjust_unbound_pri(kthread_t *tp)
2166 {
2167 	disp_t *dp;
2168 	pri_t tpri;
2169 
2170 	ASSERT(THREAD_LOCK_HELD(tp));
2171 
2172 	/*
2173 	 * Don't do anything if the thread is not bound, or
2174 	 * currently not runnable or swapped out.
2175 	 */
2176 	if (tp->t_bound_cpu == NULL ||
2177 	    tp->t_state != TS_RUN ||
2178 	    tp->t_schedflag & TS_ON_SWAPQ)
2179 		return;
2180 
2181 	tpri = DISP_PRIO(tp);
2182 	dp = tp->t_bound_cpu->cpu_disp;
2183 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2184 	if (tpri > dp->disp_max_unbound_pri)
2185 		dp->disp_max_unbound_pri = tpri;
2186 }
2187 
/*
 * disp_getbest()
 *   De-queue the highest priority unbound runnable thread.
 *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
 *   Returns NULL if nothing found.
 *   Returns T_DONTSTEAL if unbound threads were found at the examined
 *   priority level but none of them may be stolen yet, so that the caller
 *   will try again later.
 *
 *   Passed a pointer to a dispatch queue not associated with this CPU.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
	kthread_t	*tp;
	dispq_t		*dq;
	pri_t		pri;
	cpu_t		*cp, *tcp;
	boolean_t	allbound;

	disp_lock_enter(&dp->disp_lock);

	/*
	 * If there is nothing to run, or the CPU is in the middle of a
	 * context switch of the only thread, return NULL.
	 *
	 * tcp is the CPU recorded as owning the victim queue (it may be
	 * NULL, see the checks below); cp is the CPU doing the stealing.
	 */
	tcp = dp->disp_cpu;
	cp = CPU;
	pri = dp->disp_max_unbound_pri;
	if (pri == -1 ||
	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
	    tcp->cpu_disp->disp_nrunnable == 1)) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	dq = &dp->disp_q[pri];


	/*
	 * Assume that all threads are bound on this queue, and change it
	 * later when we find out that it is not the case.
	 *
	 * The loop ends in one of two ways: a break leaves tp pointing at
	 * the thread to steal, while running off the end of the list leaves
	 * tp NULL (every thread was either bound or not yet stealable).
	 */
	allbound = B_TRUE;
	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
		hrtime_t now, nosteal, rqtime;

		/*
		 * Skip over bound threads which could be here even
		 * though disp_max_unbound_pri indicated this level.
		 */
		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
			continue;

		/*
		 * We've got some unbound threads on this queue, so turn
		 * the allbound flag off now.
		 */
		allbound = B_FALSE;

		/*
		 * The thread is a candidate for stealing from its run queue. We
		 * don't want to steal threads that became runnable just a
		 * moment ago. This improves CPU affinity for threads that get
		 * preempted for short periods of time and go back on the run
		 * queue.
		 *
		 * We want to let it stay on its run queue if it was only placed
		 * there recently and it was running on the same CPU before that
		 * to preserve its cache investment. For the thread to remain on
		 * its run queue, ALL of the following conditions must be
		 * satisfied:
		 *
		 * - the disp queue should not be the kernel preemption queue
		 * - delayed idle stealing should not be disabled
		 * - nosteal_nsec should be non-zero
		 * - it should run with user priority
		 * - it should be on the run queue of the CPU where it was
		 *   running before being placed on the run queue
		 * - it should be the only thread on the run queue (to prevent
		 *   extra scheduling latency for other threads)
		 * - it should sit on the run queue for less than per-chip
		 *   nosteal interval or global nosteal interval
		 * - in case of CPUs with shared cache it should sit in a run
		 *   queue of a CPU from a different chip
		 *
		 * The checks are arranged so that the ones that are faster are
		 * placed earlier.
		 */
		if (tcp == NULL ||
		    pri >= minclsyspri ||
		    tp->t_cpu != tcp)
			break;

		/*
		 * Steal immediately if, due to CMT processor architecture,
		 * migration between cp and tcp would incur no performance
		 * penalty.
		 */
		if (pg_cmt_can_migrate(cp, tcp))
			break;

		/* A zero nosteal interval means: steal without delay. */
		nosteal = nosteal_nsec;
		if (nosteal == 0)
			break;

		/*
		 * Calculate time spent sitting on run queue
		 */
		now = gethrtime_unscaled();
		rqtime = now - tp->t_waitrq;
		scalehrtime(&rqtime);

		/*
		 * Steal immediately if the time spent on this run queue is more
		 * than allowed nosteal delay.
		 *
		 * Negative rqtime check is needed here to avoid infinite
		 * stealing delays caused by unlikely but not impossible
		 * drifts between CPU times on different CPUs.
		 */
		if (rqtime > nosteal || rqtime < 0)
			break;

		/*
		 * The thread stays put for now; fire the nosteal probe and
		 * move on to the next thread on this queue.
		 */
		DTRACE_PROBE4(nosteal, kthread_t *, tp,
		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
		scalehrtime(&now);
		/*
		 * Calculate when this thread becomes stealable
		 */
		now += (nosteal - rqtime);

		/*
		 * Calculate time when some thread becomes stealable
		 */
		if (now < dp->disp_steal)
			dp->disp_steal = now;
	}

	/*
	 * If there were no unbound threads on this queue, find the queue
	 * where they are and then return later. The value of
	 * disp_max_unbound_pri is not always accurate because it isn't
	 * reduced until another idle CPU looks for work.
	 */
	if (allbound)
		disp_fix_unbound_pri(dp, pri);

	/*
	 * If we reached the end of the queue and found no unbound threads
	 * then return NULL so that other CPUs will be considered.  If there
	 * are unbound threads but they cannot yet be stolen, then
	 * return T_DONTSTEAL and try again later.
	 */
	if (tp == NULL) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (allbound ? NULL : T_DONTSTEAL);
	}

	/*
	 * Found a runnable, unbound thread, so remove it from queue.
	 * dispdeq() requires that we have the thread locked, and we do,
	 * by virtue of holding the dispatch queue lock.  dispdeq() will
	 * put the thread in transition state, thereby dropping the dispq
	 * lock.
	 */

#ifdef DEBUG
	{
		int	thread_was_on_queue;

		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
		ASSERT(thread_was_on_queue);
	}

#else /* DEBUG */
	(void) dispdeq(tp);			/* drops disp_lock */
#endif /* DEBUG */

	/*
	 * Reset the disp_queue steal time - we do not know what the smallest
	 * value across the queue is.
	 */
	dp->disp_steal = 0;

	tp->t_schedflag |= TS_DONT_SWAP;

	/*
	 * Setup thread to run on the current CPU.
	 */
	tp->t_disp_queue = cp->cpu_disp;

	cp->cpu_dispthread = tp;		/* protected by spl only */
	cp->cpu_dispatch_pri = pri;

	/*
	 * There can be a memory synchronization race between disp_getbest()
	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
	 * to preempt the current thread to run the enqueued thread while
	 * disp_getbest() and disp_ratify() are changing the current thread
	 * to the stolen thread. This may lead to a situation where
	 * cpu_resched() tries to preempt the wrong thread and the
	 * stolen thread continues to run on the CPU which has been tagged
	 * for preemption.
	 * Later the clock thread gets enqueued but doesn't get to run on the
	 * CPU causing the system to hang.
	 *
	 * To avoid this, grabbing and dropping the disp_lock (which does
	 * a memory barrier) is needed to synchronize the execution of
	 * cpu_resched() with disp_getbest() and disp_ratify() and
	 * synchronize the memory read and written by cpu_resched(),
	 * disp_getbest(), and disp_ratify() with each other.
	 *  (see CR#6482861 for more details).
	 */
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);

	ASSERT(pri == DISP_PRIO(tp));

	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);

	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */

	/*
	 * Return with spl high so that swtch() won't need to raise it.
	 * The disp_lock was dropped by dispdeq().
	 */

	return (tp);
}
2418 
2419 /*
2420  * disp_bound_common() - common routine for higher level functions
2421  *	that check for bound threads under certain conditions.
2422  *	If 'threadlistsafe' is set then there is no need to acquire
2423  *	pidlock to stop the thread list from changing (eg, if
2424  *	disp_bound_* is called with cpus paused).
2425  */
2426 static int
2427 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2428 {
2429 	int		found = 0;
2430 	kthread_t	*tp;
2431 
2432 	ASSERT(flag);
2433 
2434 	if (!threadlistsafe)
2435 		mutex_enter(&pidlock);
2436 	tp = curthread;		/* faster than allthreads */
2437 	do {
2438 		if (tp->t_state != TS_FREE) {
2439 			/*
2440 			 * If an interrupt thread is busy, but the
2441 			 * caller doesn't care (i.e. BOUND_INTR is off),
2442 			 * then just ignore it and continue through.
2443 			 */
2444 			if ((tp->t_flag & T_INTR_THREAD) &&
2445 			    !(flag & BOUND_INTR))
2446 				continue;
2447 
2448 			/*
2449 			 * Skip the idle thread for the CPU
2450 			 * we're about to set offline.
2451 			 */
2452 			if (tp == cp->cpu_idle_thread)
2453 				continue;
2454 
2455 			/*
2456 			 * Skip the pause thread for the CPU
2457 			 * we're about to set offline.
2458 			 */
2459 			if (tp == cp->cpu_pause_thread)
2460 				continue;
2461 
2462 			if ((flag & BOUND_CPU) &&
2463 			    (tp->t_bound_cpu == cp ||
2464 			    tp->t_bind_cpu == cp->cpu_id ||
2465 			    tp->t_weakbound_cpu == cp)) {
2466 				found = 1;
2467 				break;
2468 			}
2469 
2470 			if ((flag & BOUND_PARTITION) &&
2471 			    (tp->t_cpupart == cp->cpu_part)) {
2472 				found = 1;
2473 				break;
2474 			}
2475 		}
2476 	} while ((tp = tp->t_next) != curthread && found == 0);
2477 	if (!threadlistsafe)
2478 		mutex_exit(&pidlock);
2479 	return (found);
2480 }
2481 
2482 /*
2483  * disp_bound_threads - return nonzero if threads are bound to the processor.
2484  *	Called infrequently.  Keep this simple.
2485  *	Includes threads that are asleep or stopped but not onproc.
2486  */
2487 int
2488 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2489 {
2490 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2491 }
2492 
2493 /*
2494  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2495  * to the given processor, including interrupt threads.
2496  */
2497 int
2498 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2499 {
2500 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2501 }
2502 
2503 /*
2504  * disp_bound_partition - return nonzero if threads are bound to the same
2505  * partition as the processor.
2506  *	Called infrequently.  Keep this simple.
2507  *	Includes threads that are asleep or stopped but not onproc.
2508  */
2509 int
2510 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2511 {
2512 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2513 }
2514 
2515 /*
2516  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2517  * threads to other CPUs.
2518  */
2519 void
2520 disp_cpu_inactive(cpu_t *cp)
2521 {
2522 	kthread_t	*tp;
2523 	disp_t		*dp = cp->cpu_disp;
2524 	dispq_t		*dq;
2525 	pri_t		pri;
2526 	int		wasonq;
2527 
2528 	disp_lock_enter(&dp->disp_lock);
2529 	while ((pri = dp->disp_max_unbound_pri) != -1) {
2530 		dq = &dp->disp_q[pri];
2531 		tp = dq->dq_first;
2532 
2533 		/*
2534 		 * Skip over bound threads.
2535 		 */
2536 		while (tp != NULL && tp->t_bound_cpu != NULL) {
2537 			tp = tp->t_link;
2538 		}
2539 
2540 		if (tp == NULL) {
2541 			/* disp_max_unbound_pri must be inaccurate, so fix it */
2542 			disp_fix_unbound_pri(dp, pri);
2543 			continue;
2544 		}
2545 
2546 		wasonq = dispdeq(tp);		/* drops disp_lock */
2547 		ASSERT(wasonq);
2548 		ASSERT(tp->t_weakbound_cpu == NULL);
2549 
2550 		setbackdq(tp);
2551 		/*
2552 		 * Called from cpu_offline:
2553 		 *
2554 		 * cp has already been removed from the list of active cpus
2555 		 * and tp->t_cpu has been changed so there is no risk of
2556 		 * tp ending up back on cp.
2557 		 *
2558 		 * Called from cpupart_move_cpu:
2559 		 *
2560 		 * The cpu has moved to a new cpupart.  Any threads that
2561 		 * were on it's dispatch queues before the move remain
2562 		 * in the old partition and can't run in the new partition.
2563 		 */
2564 		ASSERT(tp->t_cpu != cp);
2565 		thread_unlock(tp);
2566 
2567 		disp_lock_enter(&dp->disp_lock);
2568 	}
2569 	disp_lock_exit(&dp->disp_lock);
2570 }
2571 
2572 /*
2573  * Return a score rating this CPU for running this thread: lower is better.
2574  *
2575  * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
2576  * curcpu (as that's our own priority).
2577  *
2578  * If a cpu is the target of an offline request, then try to avoid it.
2579  *
2580  * Otherwise we'll use double the effective dispatcher priority for the CPU.
2581  *
2582  * We do this so smt_adjust_cpu_score() can increment the score if needed,
2583  * without ending up over-riding a dispatcher priority.
2584  */
2585 static pri_t
2586 cpu_score(cpu_t *cp, kthread_t *tp)
2587 {
2588 	pri_t score;
2589 
2590 	if (tp == curthread && cp == curthread->t_cpu)
2591 		score = 2 * CPU_IDLE_PRI;
2592 	else if (cp == cpu_inmotion)
2593 		score = SHRT_MAX;
2594 	else
2595 		score = 2 * cp->cpu_dispatch_pri;
2596 
2597 	if (2 * cp->cpu_disp->disp_maxrunpri > score)
2598 		score = 2 * cp->cpu_disp->disp_maxrunpri;
2599 	if (2 * cp->cpu_chosen_level > score)
2600 		score = 2 * cp->cpu_chosen_level;
2601 
2602 	return (smt_adjust_cpu_score(tp, cp, score));
2603 }
2604 
2605 /*
2606  * disp_lowpri_cpu - find a suitable CPU to run the given thread.
2607  *
2608  * We are looking for a CPU with an effective dispatch priority lower than the
2609  * thread's, so that the thread will run immediately rather than be enqueued.
2610  * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
2611  * If we don't find an available CPU there, we will expand our search to include
2612  * wider locality levels. (Note these groups are already divided by CPU
2613  * partition.)
2614  *
2615  * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
2616  * the best home CPU we found.
2617  *
2618  * The hint passed in is used as a starting point so we don't favor CPU 0 or any
2619  * other CPU.  The caller should pass in the most recently used CPU for the
2620  * thread; it's of course possible that this CPU isn't in the home lgroup.
2621  *
2622  * This function must be called at either high SPL, or with preemption disabled,
2623  * so that the "hint" CPU cannot be removed from the online CPU list while we
2624  * are traversing it.
2625  */
2626 cpu_t *
2627 disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
2628 {
2629 	cpu_t	*bestcpu;
2630 	cpu_t	*besthomecpu;
2631 	cpu_t   *cp, *cpstart;
2632 
2633 	klgrpset_t	done;
2634 
2635 	lpl_t		*lpl_iter, *lpl_leaf;
2636 
2637 	ASSERT(hint != NULL);
2638 	ASSERT(tp->t_lpl->lpl_ncpu > 0);
2639 
2640 	bestcpu = besthomecpu = NULL;
2641 	klgrpset_clear(done);
2642 
2643 	lpl_iter = tp->t_lpl;
2644 
2645 	do {
2646 		pri_t best = SHRT_MAX;
2647 		klgrpset_t cur_set;
2648 
2649 		klgrpset_clear(cur_set);
2650 
2651 		for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
2652 			lpl_leaf = lpl_iter->lpl_rset[i];
2653 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2654 				continue;
2655 
2656 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2657 
2658 			if (hint->cpu_lpl == lpl_leaf)
2659 				cp = cpstart = hint;
2660 			else
2661 				cp = cpstart = lpl_leaf->lpl_cpus;
2662 
2663 			do {
2664 				pri_t score = cpu_score(cp, tp);
2665 
2666 				if (score < best) {
2667 					best = score;
2668 					bestcpu = cp;
2669 
2670 					/* An idle CPU: we're done. */
2671 					if (score / 2 == CPU_IDLE_PRI)
2672 						goto out;
2673 				}
2674 			} while ((cp = cp->cpu_next_lpl) != cpstart);
2675 		}
2676 
2677 		if (bestcpu != NULL && tpri > (best / 2))
2678 			goto out;
2679 
2680 		if (besthomecpu == NULL)
2681 			besthomecpu = bestcpu;
2682 
2683 		/*
2684 		 * Add the lgrps we just considered to the "done" set
2685 		 */
2686 		klgrpset_or(done, cur_set);
2687 
2688 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2689 
2690 	/*
2691 	 * The specified priority isn't high enough to run immediately
2692 	 * anywhere, so just return the best CPU from the home lgroup.
2693 	 */
2694 	bestcpu = besthomecpu;
2695 
2696 out:
2697 	ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2698 	return (bestcpu);
2699 }
2700 
/*
 * This routine provides the generic idle cpu function for all processors.
 * It intentionally does nothing.
 *
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor-specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
}
2712 
/*
 * Generic thread-enqueue hook: an intentionally empty function that
 * ignores both of its arguments.
 */
/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}
2718 
2719 cpu_t *
2720 disp_choose_best_cpu(void)
2721 {
2722 	kthread_t *t = curthread;
2723 	cpu_t *curcpu = CPU;
2724 
2725 	ASSERT(t->t_preempt > 0);
2726 	ASSERT(t->t_state == TS_ONPROC);
2727 	ASSERT(t->t_schedflag & TS_VCPU);
2728 
2729 	if (smt_should_run(t, curcpu))
2730 		return (curcpu);
2731 
2732 	return (disp_lowpri_cpu(curcpu, t, t->t_pri));
2733 }
2734