xref: /illumos-gate/usr/src/uts/common/disp/disp.c (revision 12042ab213b3af68474f48555504db816a449211)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2019 Joyent, Inc.
28  */
29 
30 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
31 /*	  All Rights Reserved  	*/
32 
33 
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/sysmacros.h>
37 #include <sys/signal.h>
38 #include <sys/user.h>
39 #include <sys/systm.h>
40 #include <sys/sysinfo.h>
41 #include <sys/var.h>
42 #include <sys/errno.h>
43 #include <sys/cmn_err.h>
44 #include <sys/debug.h>
45 #include <sys/inline.h>
46 #include <sys/disp.h>
47 #include <sys/class.h>
48 #include <sys/bitmap.h>
49 #include <sys/kmem.h>
50 #include <sys/cpuvar.h>
51 #include <sys/vtrace.h>
52 #include <sys/tnf.h>
53 #include <sys/cpupart.h>
54 #include <sys/lgrp.h>
55 #include <sys/pg.h>
56 #include <sys/cmt.h>
57 #include <sys/bitset.h>
58 #include <sys/schedctl.h>
59 #include <sys/atomic.h>
60 #include <sys/dtrace.h>
61 #include <sys/sdt.h>
62 #include <sys/archsystm.h>
63 #include <sys/smt.h>
64 
65 #include <vm/as.h>
66 
67 #define	BOUND_CPU	0x1
68 #define	BOUND_PARTITION	0x2
69 #define	BOUND_INTR	0x4
70 
71 /* Dispatch queue allocation structure and functions */
/*
 * Scratch record used while resizing one dispatch queue.  disp_dq_alloc()
 * fills in dp and the new arrays, disp_dq_assign() installs the new arrays
 * in *dp and stashes the previous ones here, and disp_dq_free() releases
 * the previous ones.  disp_kp_free() also fills in only the "old" fields
 * to free a queue outright.
 */
struct disp_queue_info {
	disp_t	*dp;		/* dispatch queue being resized */
	dispq_t *olddispq;	/* previous queue array; NULL if none */
	dispq_t *newdispq;	/* replacement queue array */
	ulong_t	*olddqactmap;	/* previous active-queue bitmap */
	ulong_t	*newdqactmap;	/* replacement active-queue bitmap */
	int	oldnglobpris;	/* priority count the old arrays were sized for */
};
80 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
81     disp_t *dp);
82 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
83 static void	disp_dq_free(struct disp_queue_info *dptr);
84 
85 /* platform-specific routine to call when processor is idle */
86 static void	generic_idle_cpu();
87 void		(*idle_cpu)() = generic_idle_cpu;
88 
89 /* routines invoked when a CPU enters/exits the idle loop */
90 static void	idle_enter();
91 static void	idle_exit();
92 
93 /* platform-specific routine to call when thread is enqueued */
94 static void	generic_enq_thread(cpu_t *, int);
95 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
96 
97 pri_t	kpreemptpri;		/* priority where kernel preemption applies */
98 pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
99 pri_t	intr_pri;		/* interrupt thread priority base level */
100 
101 #define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
102 pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
103 disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
104 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
105 int	nswapped;		/* total number of swapped threads */
106 void	disp_swapped_enq(kthread_t *tp);
107 static void	disp_swapped_setrun(kthread_t *tp);
108 static void	cpu_resched(cpu_t *cp, pri_t tpri);
109 
110 /*
111  * If this is set, only interrupt threads will cause kernel preemptions.
112  * This is done by changing the value of kpreemptpri.  kpreemptpri
113  * will either be the max sysclass pri + 1 or the min interrupt pri.
114  */
115 int	only_intr_kpreempt;
116 
117 extern void set_idle_cpu(int cpun);
118 extern void unset_idle_cpu(int cpun);
119 static void setkpdq(kthread_t *tp, int borf);
120 #define	SETKP_BACK	0
121 #define	SETKP_FRONT	1
122 /*
123  * Parameter that determines how recently a thread must have run
124  * on the CPU to be considered loosely-bound to that CPU to reduce
125  * cold cache effects.  The interval is in hertz.
126  */
127 #define	RECHOOSE_INTERVAL 3
128 int	rechoose_interval = RECHOOSE_INTERVAL;
129 
130 /*
131  * Parameter that determines how long (in nanoseconds) a thread must
132  * be sitting on a run queue before it can be stolen by another CPU
133  * to reduce migrations.  The interval is in nanoseconds.
134  *
135  * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
136  * to an appropriate value.  nosteal_nsec is set to NOSTEAL_UNINITIALIZED
 * here indicating it is uninitialized.
138  * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
139  *
140  */
141 #define	NOSTEAL_UNINITIALIZED	(-1)
142 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
143 extern void cmp_set_nosteal_interval(void);
144 
145 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
146 
147 disp_lock_t	transition_lock;	/* lock on transitioning threads */
148 disp_lock_t	stop_lock;		/* lock on stopped threads */
149 
150 static void	cpu_dispqalloc(int numpris);
151 
152 /*
153  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
154  * a thread because it was sitting on its run queue for a very short
155  * period of time.
156  */
157 #define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
158 
159 static kthread_t	*disp_getwork(cpu_t *to);
160 static kthread_t	*disp_getbest(disp_t *from);
161 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
162 
163 void	swtch_to(kthread_t *);
164 
165 /*
166  * dispatcher and scheduler initialization
167  */
168 
169 /*
170  * disp_setup - Common code to calculate and allocate dispatcher
171  *		variables and structures based on the maximum priority.
172  */
173 static void
174 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
175 {
176 	pri_t	newnglobpris;
177 
178 	ASSERT(MUTEX_HELD(&cpu_lock));
179 
180 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
181 
182 	if (newnglobpris > oldnglobpris) {
183 		/*
184 		 * Allocate new kp queues for each CPU partition.
185 		 */
186 		cpupart_kpqalloc(newnglobpris);
187 
188 		/*
189 		 * Allocate new dispatch queues for each CPU.
190 		 */
191 		cpu_dispqalloc(newnglobpris);
192 
193 		/*
194 		 * compute new interrupt thread base priority
195 		 */
196 		intr_pri = maxglobpri;
197 		if (only_intr_kpreempt) {
198 			kpreemptpri = intr_pri + 1;
199 			if (kpqpri == KPQPRI)
200 				kpqpri = kpreemptpri;
201 		}
202 		v.v_nglobpris = newnglobpris;
203 	}
204 }
205 
206 /*
207  * dispinit - Called to initialize all loaded classes and the
208  *	      dispatcher framework.
209  */
210 void
211 dispinit(void)
212 {
213 	id_t	cid;
214 	pri_t	maxglobpri;
215 	pri_t	cl_maxglobpri;
216 
217 	maxglobpri = -1;
218 
219 	/*
220 	 * Initialize transition lock, which will always be set.
221 	 */
222 	DISP_LOCK_INIT(&transition_lock);
223 	disp_lock_enter_high(&transition_lock);
224 	DISP_LOCK_INIT(&stop_lock);
225 
226 	mutex_enter(&cpu_lock);
227 	CPU->cpu_disp->disp_maxrunpri = -1;
228 	CPU->cpu_disp->disp_max_unbound_pri = -1;
229 
230 	/*
231 	 * Initialize the default CPU partition.
232 	 */
233 	cpupart_initialize_default();
234 	/*
235 	 * Call the class specific initialization functions for
236 	 * all pre-installed schedulers.
237 	 *
238 	 * We pass the size of a class specific parameter
239 	 * buffer to each of the initialization functions
240 	 * to try to catch problems with backward compatibility
241 	 * of class modules.
242 	 *
243 	 * For example a new class module running on an old system
244 	 * which didn't provide sufficiently large parameter buffers
245 	 * would be bad news. Class initialization modules can check for
246 	 * this and take action if they detect a problem.
247 	 */
248 
249 	for (cid = 0; cid < nclass; cid++) {
250 		sclass_t	*sc;
251 
252 		sc = &sclass[cid];
253 		if (SCHED_INSTALLED(sc)) {
254 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
255 			    &sc->cl_funcs);
256 			if (cl_maxglobpri > maxglobpri)
257 				maxglobpri = cl_maxglobpri;
258 		}
259 	}
260 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
261 	if (kpqpri == KPQPRI)
262 		kpqpri = kpreemptpri;
263 
264 	ASSERT(maxglobpri >= 0);
265 	disp_setup(maxglobpri, 0);
266 
267 	mutex_exit(&cpu_lock);
268 
269 	/*
270 	 * Platform specific sticky scheduler setup.
271 	 */
272 	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
273 		cmp_set_nosteal_interval();
274 
275 	/*
276 	 * Get the default class ID; this may be later modified via
277 	 * dispadmin(1M).  This will load the class (normally TS) and that will
278 	 * call disp_add(), which is why we had to drop cpu_lock first.
279 	 */
280 	if (getcid(defaultclass, &defaultcid) != 0) {
281 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
282 		    defaultclass);
283 	}
284 }
285 
286 /*
287  * disp_add - Called with class pointer to initialize the dispatcher
288  *	      for a newly loaded class.
289  */
290 void
291 disp_add(sclass_t *clp)
292 {
293 	pri_t	maxglobpri;
294 	pri_t	cl_maxglobpri;
295 
296 	mutex_enter(&cpu_lock);
297 	/*
298 	 * Initialize the scheduler class.
299 	 */
300 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
301 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
302 	if (cl_maxglobpri > maxglobpri)
303 		maxglobpri = cl_maxglobpri;
304 
305 	/*
306 	 * Save old queue information.  Since we're initializing a
307 	 * new scheduling class which has just been loaded, then
308 	 * the size of the dispq may have changed.  We need to handle
309 	 * that here.
310 	 */
311 	disp_setup(maxglobpri, v.v_nglobpris);
312 
313 	mutex_exit(&cpu_lock);
314 }
315 
316 
317 /*
318  * For each CPU, allocate new dispatch queues
319  * with the stated number of priorities.
320  */
321 static void
322 cpu_dispqalloc(int numpris)
323 {
324 	cpu_t	*cpup;
325 	struct disp_queue_info	*disp_mem;
326 	int i, num;
327 
328 	ASSERT(MUTEX_HELD(&cpu_lock));
329 
330 	disp_mem = kmem_zalloc(NCPU *
331 	    sizeof (struct disp_queue_info), KM_SLEEP);
332 
333 	/*
334 	 * This routine must allocate all of the memory before stopping
335 	 * the cpus because it must not sleep in kmem_alloc while the
336 	 * CPUs are stopped.  Locks they hold will not be freed until they
337 	 * are restarted.
338 	 */
339 	i = 0;
340 	cpup = cpu_list;
341 	do {
342 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
343 		i++;
344 		cpup = cpup->cpu_next;
345 	} while (cpup != cpu_list);
346 	num = i;
347 
348 	pause_cpus(NULL, NULL);
349 	for (i = 0; i < num; i++)
350 		disp_dq_assign(&disp_mem[i], numpris);
351 	start_cpus();
352 
353 	/*
354 	 * I must free all of the memory after starting the cpus because
355 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
356 	 */
357 	for (i = 0; i < num; i++)
358 		disp_dq_free(&disp_mem[i]);
359 
360 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
361 }
362 
363 static void
364 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
365 {
366 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
367 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
368 	    sizeof (long), KM_SLEEP);
369 	dptr->dp = dp;
370 }
371 
372 static void
373 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
374 {
375 	disp_t	*dp;
376 
377 	dp = dptr->dp;
378 	dptr->olddispq = dp->disp_q;
379 	dptr->olddqactmap = dp->disp_qactmap;
380 	dptr->oldnglobpris = dp->disp_npri;
381 
382 	ASSERT(dptr->oldnglobpris < numpris);
383 
384 	if (dptr->olddispq != NULL) {
385 		/*
386 		 * Use kcopy because bcopy is platform-specific
387 		 * and could block while we might have paused the cpus.
388 		 */
389 		(void) kcopy(dptr->olddispq, dptr->newdispq,
390 		    dptr->oldnglobpris * sizeof (dispq_t));
391 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
392 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
393 		    sizeof (long));
394 	}
395 	dp->disp_q = dptr->newdispq;
396 	dp->disp_qactmap = dptr->newdqactmap;
397 	dp->disp_q_limit = &dptr->newdispq[numpris];
398 	dp->disp_npri = numpris;
399 }
400 
401 static void
402 disp_dq_free(struct disp_queue_info *dptr)
403 {
404 	if (dptr->olddispq != NULL)
405 		kmem_free(dptr->olddispq,
406 		    dptr->oldnglobpris * sizeof (dispq_t));
407 	if (dptr->olddqactmap != NULL)
408 		kmem_free(dptr->olddqactmap,
409 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
410 }
411 
412 /*
413  * For a newly created CPU, initialize the dispatch queue.
414  * This is called before the CPU is known through cpu[] or on any lists.
415  */
416 void
417 disp_cpu_init(cpu_t *cp)
418 {
419 	disp_t	*dp;
420 	dispq_t	*newdispq;
421 	ulong_t	*newdqactmap;
422 
423 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
424 
425 	if (cp == cpu0_disp.disp_cpu)
426 		dp = &cpu0_disp;
427 	else
428 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
429 	bzero(dp, sizeof (disp_t));
430 	cp->cpu_disp = dp;
431 	dp->disp_cpu = cp;
432 	dp->disp_maxrunpri = -1;
433 	dp->disp_max_unbound_pri = -1;
434 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
435 	/*
436 	 * Allocate memory for the dispatcher queue headers
437 	 * and the active queue bitmap.
438 	 */
439 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
440 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
441 	    sizeof (long), KM_SLEEP);
442 	dp->disp_q = newdispq;
443 	dp->disp_qactmap = newdqactmap;
444 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
445 	dp->disp_npri = v.v_nglobpris;
446 }
447 
448 void
449 disp_cpu_fini(cpu_t *cp)
450 {
451 	ASSERT(MUTEX_HELD(&cpu_lock));
452 
453 	disp_kp_free(cp->cpu_disp);
454 	if (cp->cpu_disp != &cpu0_disp)
455 		kmem_free(cp->cpu_disp, sizeof (disp_t));
456 }
457 
458 /*
459  * Allocate new, larger kpreempt dispatch queue to replace the old one.
460  */
461 void
462 disp_kp_alloc(disp_t *dq, pri_t npri)
463 {
464 	struct disp_queue_info	mem_info;
465 
466 	if (npri > dq->disp_npri) {
467 		/*
468 		 * Allocate memory for the new array.
469 		 */
470 		disp_dq_alloc(&mem_info, npri, dq);
471 
472 		/*
473 		 * We need to copy the old structures to the new
474 		 * and free the old.
475 		 */
476 		disp_dq_assign(&mem_info, npri);
477 		disp_dq_free(&mem_info);
478 	}
479 }
480 
481 /*
482  * Free dispatch queue.
483  * Used for the kpreempt queues for a removed CPU partition and
484  * for the per-CPU queues of deleted CPUs.
485  */
486 void
487 disp_kp_free(disp_t *dq)
488 {
489 	struct disp_queue_info	mem_info;
490 
491 	mem_info.olddispq = dq->disp_q;
492 	mem_info.olddqactmap = dq->disp_qactmap;
493 	mem_info.oldnglobpris = dq->disp_npri;
494 	disp_dq_free(&mem_info);
495 }
496 
497 /*
498  * End dispatcher and scheduler initialization.
499  */
500 
501 /*
502  * See if there's anything to do other than remain idle.
503  * Return non-zero if there is.
504  *
505  * This function must be called with high spl, or with
506  * kernel preemption disabled to prevent the partition's
507  * active cpu list from changing while being traversed.
508  *
509  * This is essentially a simpler version of disp_getwork()
510  * to be called by CPUs preparing to "halt".
511  */
512 int
513 disp_anywork(void)
514 {
515 	cpu_t		*cp = CPU;
516 	cpu_t		*ocp;
517 	volatile int	*local_nrunnable = &cp->cpu_disp->disp_nrunnable;
518 
519 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
520 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
521 			return (1);
522 
523 		for (ocp = cp->cpu_next_part; ocp != cp;
524 		    ocp = ocp->cpu_next_part) {
525 			ASSERT(CPU_ACTIVE(ocp));
526 
527 			/*
528 			 * Something has appeared on the local run queue.
529 			 */
530 			if (*local_nrunnable > 0)
531 				return (1);
532 			/*
533 			 * If we encounter another idle CPU that will
534 			 * soon be trolling around through disp_anywork()
535 			 * terminate our walk here and let this other CPU
536 			 * patrol the next part of the list.
537 			 */
538 			if (ocp->cpu_dispatch_pri == -1 &&
539 			    (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
540 				return (0);
541 			/*
542 			 * Work can be taken from another CPU if:
543 			 *	- There is unbound work on the run queue
544 			 *	- That work isn't a thread undergoing a
545 			 *	- context switch on an otherwise empty queue.
546 			 *	- The CPU isn't running the idle loop.
547 			 */
548 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
549 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
550 			    ocp->cpu_disp->disp_nrunnable == 1) &&
551 			    ocp->cpu_dispatch_pri != -1)
552 				return (1);
553 		}
554 	}
555 	return (0);
556 }
557 
558 /*
559  * Called when CPU enters the idle loop
560  */
561 static void
562 idle_enter()
563 {
564 	cpu_t		*cp = CPU;
565 
566 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
567 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
568 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
569 }
570 
571 /*
572  * Called when CPU exits the idle loop
573  */
574 static void
575 idle_exit()
576 {
577 	cpu_t		*cp = CPU;
578 
579 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
580 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
581 }
582 
583 /*
584  * Idle loop.
585  */
586 void
587 idle()
588 {
589 	struct cpu	*cp = CPU;		/* pointer to this CPU */
590 	kthread_t	*t;			/* taken thread */
591 
592 	idle_enter();
593 
594 	/*
595 	 * Uniprocessor version of idle loop.
596 	 * Do this until notified that we're on an actual multiprocessor.
597 	 */
598 	while (ncpus == 1) {
599 		if (cp->cpu_disp->disp_nrunnable == 0) {
600 			(*idle_cpu)();
601 			continue;
602 		}
603 		idle_exit();
604 		swtch();
605 
606 		idle_enter(); /* returned from swtch */
607 	}
608 
609 	/*
610 	 * Multiprocessor idle loop.
611 	 */
612 	for (;;) {
613 		/*
614 		 * If CPU is completely quiesced by p_online(2), just wait
615 		 * here with minimal bus traffic until put online.
616 		 */
617 		while (cp->cpu_flags & CPU_QUIESCED)
618 			(*idle_cpu)();
619 
620 		if (cp->cpu_disp->disp_nrunnable != 0) {
621 			idle_exit();
622 			swtch();
623 		} else {
624 			if (cp->cpu_flags & CPU_OFFLINE)
625 				continue;
626 			if ((t = disp_getwork(cp)) == NULL) {
627 				if (cp->cpu_chosen_level != -1) {
628 					disp_t *dp = cp->cpu_disp;
629 					disp_t *kpq;
630 
631 					disp_lock_enter(&dp->disp_lock);
632 					/*
633 					 * Set kpq under lock to prevent
634 					 * migration between partitions.
635 					 */
636 					kpq = &cp->cpu_part->cp_kp_queue;
637 					if (kpq->disp_maxrunpri == -1)
638 						cp->cpu_chosen_level = -1;
639 					disp_lock_exit(&dp->disp_lock);
640 				}
641 				(*idle_cpu)();
642 				continue;
643 			}
644 			/*
645 			 * If there was a thread but we couldn't steal
646 			 * it, then keep trying.
647 			 */
648 			if (t == T_DONTSTEAL)
649 				continue;
650 			idle_exit();
651 			swtch_to(t);
652 		}
653 		idle_enter(); /* returned from swtch/swtch_to */
654 	}
655 }
656 
657 
658 /*
659  * Preempt the currently running thread in favor of the highest
660  * priority thread.  The class of the current thread controls
661  * where it goes on the dispatcher queues. If panicking, turn
662  * preemption off.
663  */
664 void
665 preempt()
666 {
667 	kthread_t 	*t = curthread;
668 	klwp_t 		*lwp = ttolwp(curthread);
669 
670 	if (panicstr)
671 		return;
672 
673 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
674 
675 	thread_lock(t);
676 
677 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
678 		/*
679 		 * this thread has already been chosen to be run on
680 		 * another CPU. Clear kprunrun on this CPU since we're
681 		 * already headed for swtch().
682 		 */
683 		CPU->cpu_kprunrun = 0;
684 		thread_unlock_nopreempt(t);
685 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
686 	} else {
687 		if (lwp != NULL)
688 			lwp->lwp_ru.nivcsw++;
689 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
690 		THREAD_TRANSITION(t);
691 		CL_PREEMPT(t);
692 		DTRACE_SCHED(preempt);
693 		thread_unlock_nopreempt(t);
694 
695 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
696 
697 		swtch();		/* clears CPU->cpu_runrun via disp() */
698 	}
699 }
700 
701 extern kthread_t *thread_unpin();
702 
703 /*
704  * disp() - find the highest priority thread for this processor to run, and
705  * set it in TS_ONPROC state so that resume() can be called to run it.
706  */
707 static kthread_t *
708 disp()
709 {
710 	cpu_t		*cpup;
711 	disp_t		*dp;
712 	kthread_t	*tp;
713 	dispq_t		*dq;
714 	int		maxrunword;
715 	pri_t		pri;
716 	disp_t		*kpq;
717 
718 	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
719 
720 	cpup = CPU;
721 	/*
722 	 * Find the highest priority loaded, runnable thread.
723 	 */
724 	dp = cpup->cpu_disp;
725 
726 reschedule:
727 	/*
728 	 * If there is more important work on the global queue with a better
729 	 * priority than the maximum on this CPU, take it now.
730 	 */
731 	kpq = &cpup->cpu_part->cp_kp_queue;
732 	while ((pri = kpq->disp_maxrunpri) >= 0 &&
733 	    pri >= dp->disp_maxrunpri &&
734 	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
735 	    (tp = disp_getbest(kpq)) != NULL) {
736 		if (disp_ratify(tp, kpq) != NULL) {
737 			TRACE_1(TR_FAC_DISP, TR_DISP_END,
738 			    "disp_end:tid %p", tp);
739 			return (tp);
740 		}
741 	}
742 
743 	disp_lock_enter(&dp->disp_lock);
744 	pri = dp->disp_maxrunpri;
745 
746 	/*
747 	 * If there is nothing to run, look at what's runnable on other queues.
748 	 * Choose the idle thread if the CPU is quiesced.
749 	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
750 	 * interrupt threads, which will be the only threads on the CPU's own
751 	 * queue, but cannot run threads from other queues.
752 	 */
753 	if (pri == -1) {
754 		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
755 			disp_lock_exit(&dp->disp_lock);
756 			if ((tp = disp_getwork(cpup)) == NULL ||
757 			    tp == T_DONTSTEAL) {
758 				tp = cpup->cpu_idle_thread;
759 				(void) splhigh();
760 				THREAD_ONPROC(tp, cpup);
761 				cpup->cpu_dispthread = tp;
762 				cpup->cpu_dispatch_pri = -1;
763 				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
764 				cpup->cpu_chosen_level = -1;
765 			}
766 		} else {
767 			disp_lock_exit_high(&dp->disp_lock);
768 			tp = cpup->cpu_idle_thread;
769 			THREAD_ONPROC(tp, cpup);
770 			cpup->cpu_dispthread = tp;
771 			cpup->cpu_dispatch_pri = -1;
772 			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
773 			cpup->cpu_chosen_level = -1;
774 		}
775 		TRACE_1(TR_FAC_DISP, TR_DISP_END,
776 		    "disp_end:tid %p", tp);
777 		return (tp);
778 	}
779 
780 	dq = &dp->disp_q[pri];
781 	tp = dq->dq_first;
782 
783 	ASSERT(tp != NULL);
784 	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */
785 
786 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
787 
788 	/*
789 	 * Found it so remove it from queue.
790 	 */
791 	dp->disp_nrunnable--;
792 	dq->dq_sruncnt--;
793 	if ((dq->dq_first = tp->t_link) == NULL) {
794 		ulong_t	*dqactmap = dp->disp_qactmap;
795 
796 		ASSERT(dq->dq_sruncnt == 0);
797 		dq->dq_last = NULL;
798 
799 		/*
800 		 * The queue is empty, so the corresponding bit needs to be
801 		 * turned off in dqactmap.   If nrunnable != 0 just took the
802 		 * last runnable thread off the
803 		 * highest queue, so recompute disp_maxrunpri.
804 		 */
805 		maxrunword = pri >> BT_ULSHIFT;
806 		dqactmap[maxrunword] &= ~BT_BIW(pri);
807 
808 		if (dp->disp_nrunnable == 0) {
809 			dp->disp_max_unbound_pri = -1;
810 			dp->disp_maxrunpri = -1;
811 		} else {
812 			int ipri;
813 
814 			ipri = bt_gethighbit(dqactmap, maxrunword);
815 			dp->disp_maxrunpri = ipri;
816 			if (ipri < dp->disp_max_unbound_pri)
817 				dp->disp_max_unbound_pri = ipri;
818 		}
819 	} else {
820 		tp->t_link = NULL;
821 	}
822 
823 	/*
824 	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
825 	 * out this thread before we have a chance to run it.
826 	 * While running, it is protected against swapping by t_lock.
827 	 */
828 	tp->t_schedflag |= TS_DONT_SWAP;
829 	cpup->cpu_dispthread = tp;		/* protected by spl only */
830 	cpup->cpu_dispatch_pri = pri;
831 	ASSERT(pri == DISP_PRIO(tp));
832 	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
833 	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */
834 
835 	ASSERT(tp != NULL);
836 	TRACE_1(TR_FAC_DISP, TR_DISP_END,
837 	    "disp_end:tid %p", tp);
838 
839 	if (disp_ratify(tp, kpq) == NULL)
840 		goto reschedule;
841 
842 	return (tp);
843 }
844 
845 /*
846  * swtch()
847  *	Find best runnable thread and run it.
848  *	Called with the current thread already switched to a new state,
849  *	on a sleep queue, run queue, stopped, and not zombied.
850  *	May be called at any spl level less than or equal to LOCK_LEVEL.
851  *	Always drops spl to the base level (spl0()).
852  */
853 void
854 swtch()
855 {
856 	kthread_t	*t = curthread;
857 	kthread_t	*next;
858 	cpu_t		*cp;
859 
860 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
861 
862 	if (t->t_flag & T_INTR_THREAD)
863 		cpu_intr_swtch_enter(t);
864 
865 	if (t->t_intr != NULL) {
866 		/*
867 		 * We are an interrupt thread.  Setup and return
868 		 * the interrupted thread to be resumed.
869 		 */
870 		(void) splhigh();	/* block other scheduler action */
871 		cp = CPU;		/* now protected against migration */
872 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
873 		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
874 		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
875 		next = thread_unpin();
876 		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
877 		resume_from_intr(next);
878 	} else {
879 #ifdef	DEBUG
880 		if (t->t_state == TS_ONPROC &&
881 		    t->t_disp_queue->disp_cpu == CPU &&
882 		    t->t_preempt == 0) {
883 			thread_lock(t);
884 			ASSERT(t->t_state != TS_ONPROC ||
885 			    t->t_disp_queue->disp_cpu != CPU ||
886 			    t->t_preempt != 0);	/* cannot migrate */
887 			thread_unlock_nopreempt(t);
888 		}
889 #endif	/* DEBUG */
890 		cp = CPU;
891 		next = disp();		/* returns with spl high */
892 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
893 
894 		/* OK to steal anything left on run queue */
895 		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
896 
897 		if (next != t) {
898 			hrtime_t now;
899 
900 			now = gethrtime_unscaled();
901 			pg_ev_thread_swtch(cp, now, t, next);
902 
903 			/*
904 			 * If t was previously in the TS_ONPROC state,
905 			 * setfrontdq and setbackdq won't have set its t_waitrq.
906 			 * Since we now finally know that we're switching away
907 			 * from this thread, set its t_waitrq if it is on a run
908 			 * queue.
909 			 */
910 			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
911 				t->t_waitrq = now;
912 			}
913 
914 			/*
915 			 * restore mstate of thread that we are switching to
916 			 */
917 			restore_mstate(next);
918 
919 			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
920 			cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
921 			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
922 
923 			if (dtrace_vtime_active)
924 				dtrace_vtime_switch(next);
925 
926 			resume(next);
927 			/*
928 			 * The TR_RESUME_END and TR_SWTCH_END trace points
929 			 * appear at the end of resume(), because we may not
930 			 * return here
931 			 */
932 		} else {
933 			if (t->t_flag & T_INTR_THREAD)
934 				cpu_intr_swtch_exit(t);
935 			/*
936 			 * Threads that enqueue themselves on a run queue defer
937 			 * setting t_waitrq. It is then either set in swtch()
938 			 * when the CPU is actually yielded, or not at all if it
939 			 * is remaining on the CPU.
940 			 * There is however a window between where the thread
941 			 * placed itself on a run queue, and where it selects
942 			 * itself in disp(), where a third party (eg. clock()
943 			 * doing tick processing) may have re-enqueued this
944 			 * thread, setting t_waitrq in the process. We detect
945 			 * this race by noticing that despite switching to
946 			 * ourself, our t_waitrq has been set, and should be
947 			 * cleared.
948 			 */
949 			if (t->t_waitrq != 0)
950 				t->t_waitrq = 0;
951 
952 			pg_ev_thread_remain(cp, t);
953 
954 			DTRACE_SCHED(remain__cpu);
955 			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
956 			(void) spl0();
957 		}
958 	}
959 }
960 
961 /*
962  * swtch_from_zombie()
963  *	Special case of swtch(), which allows checks for TS_ZOMB to be
964  *	eliminated from normal resume.
965  *	Find best runnable thread and run it.
966  *	Called with the current thread zombied.
967  *	Zombies cannot migrate, so CPU references are safe.
968  */
969 void
970 swtch_from_zombie()
971 {
972 	kthread_t	*next;
973 	cpu_t		*cpu = CPU;
974 
975 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
976 
977 	ASSERT(curthread->t_state == TS_ZOMB);
978 
979 	next = disp();			/* returns with spl high */
980 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
981 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
982 	ASSERT(next != curthread);
983 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
984 
985 	pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
986 
987 	restore_mstate(next);
988 
989 	if (dtrace_vtime_active)
990 		dtrace_vtime_switch(next);
991 
992 	resume_from_zombie(next);
993 	/*
994 	 * The TR_RESUME_END and TR_SWTCH_END trace points
995 	 * appear at the end of resume(), because we certainly will not
996 	 * return here
997 	 */
998 }
999 
1000 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
1001 
1002 /*
1003  * search_disp_queues()
1004  *	Search the given dispatch queues for thread tp.
1005  *	Return 1 if tp is found, otherwise return 0.
1006  */
1007 static int
1008 search_disp_queues(disp_t *dp, kthread_t *tp)
1009 {
1010 	dispq_t		*dq;
1011 	dispq_t		*eq;
1012 
1013 	disp_lock_enter_high(&dp->disp_lock);
1014 
1015 	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1016 		kthread_t	*rp;
1017 
1018 		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1019 
1020 		for (rp = dq->dq_first; rp; rp = rp->t_link)
1021 			if (tp == rp) {
1022 				disp_lock_exit_high(&dp->disp_lock);
1023 				return (1);
1024 			}
1025 	}
1026 	disp_lock_exit_high(&dp->disp_lock);
1027 
1028 	return (0);
1029 }
1030 
1031 /*
1032  * thread_on_queue()
1033  *	Search all per-CPU dispatch queues and all partition-wide kpreempt
1034  *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
1035  */
1036 static int
1037 thread_on_queue(kthread_t *tp)
1038 {
1039 	cpu_t		*cp;
1040 	struct cpupart	*part;
1041 
1042 	ASSERT(getpil() >= DISP_LEVEL);
1043 
1044 	/*
1045 	 * Search the per-CPU dispatch queues for tp.
1046 	 */
1047 	cp = CPU;
1048 	do {
1049 		if (search_disp_queues(cp->cpu_disp, tp))
1050 			return (1);
1051 	} while ((cp = cp->cpu_next_onln) != CPU);
1052 
1053 	/*
1054 	 * Search the partition-wide kpreempt queues for tp.
1055 	 */
1056 	part = CPU->cpu_part;
1057 	do {
1058 		if (search_disp_queues(&part->cp_kp_queue, tp))
1059 			return (1);
1060 	} while ((part = part->cp_next) != CPU->cpu_part);
1061 
1062 	return (0);
1063 }
1064 
1065 #else
1066 
1067 #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
1068 
1069 #endif  /* DEBUG */
1070 
1071 /*
1072  * like swtch(), but switch to a specified thread taken from another CPU.
1073  *	called with spl high..
1074  */
1075 void
1076 swtch_to(kthread_t *next)
1077 {
1078 	cpu_t			*cp = CPU;
1079 	hrtime_t		now;
1080 
1081 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1082 
1083 	/*
1084 	 * Update context switch statistics.
1085 	 */
1086 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1087 
1088 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1089 
1090 	now = gethrtime_unscaled();
1091 	pg_ev_thread_swtch(cp, now, curthread, next);
1092 
1093 	/* OK to steal anything left on run queue */
1094 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1095 
1096 	/* record last execution time */
1097 	cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1098 
1099 	/*
1100 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1101 	 * won't have set its t_waitrq.  Since we now finally know that we're
1102 	 * switching away from this thread, set its t_waitrq if it is on a run
1103 	 * queue.
1104 	 */
1105 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1106 		curthread->t_waitrq = now;
1107 	}
1108 
1109 	/* restore next thread to previously running microstate */
1110 	restore_mstate(next);
1111 
1112 	if (dtrace_vtime_active)
1113 		dtrace_vtime_switch(next);
1114 
1115 	resume(next);
1116 	/*
1117 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1118 	 * appear at the end of resume(), because we may not
1119 	 * return here
1120 	 */
1121 }
1122 
1123 static void
1124 cpu_resched(cpu_t *cp, pri_t tpri)
1125 {
1126 	int	call_poke_cpu = 0;
1127 	pri_t   cpupri = cp->cpu_dispatch_pri;
1128 
1129 	if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
1130 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1131 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1132 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1133 			cp->cpu_runrun = 1;
1134 			aston(cp->cpu_dispthread);
1135 			if (tpri < kpreemptpri && cp != CPU)
1136 				call_poke_cpu = 1;
1137 		}
1138 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1139 			cp->cpu_kprunrun = 1;
1140 			if (cp != CPU)
1141 				call_poke_cpu = 1;
1142 		}
1143 	}
1144 
1145 	/*
1146 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1147 	 */
1148 	membar_enter();
1149 
1150 	if (call_poke_cpu)
1151 		poke_cpu(cp->cpu_id);
1152 }
1153 
/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
 * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
 * try to keep runqs perfectly balanced regardless of the thread priority.
 *
 * RUNQ_LEN(cp, pri) evaluates to the dq_sruncnt counter of the priority-pri
 * queue on cp's dispatch queue.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1164 
/*
 * Macro that evaluates to true if it is likely that the thread has cache
 * warmth. This is based on the amount of time that has elapsed since the
 * thread last ran. If that amount of time is less than or equal to
 * "rechoose_interval" ticks, then we decide that the thread has enough
 * cache warmth to warrant some affinity for t->t_cpu.  The current thread
 * is always considered warm.
 *
 * The argument is parenthesized so that any pointer expression (e.g. "&t"
 * or a conditional expression) may be passed safely.  Note that it may be
 * evaluated twice, so it must not have side effects.
 */
#define	THREAD_HAS_CACHE_WARMTH(thread)	\
	(((thread) == curthread) ||	\
	((ddi_get_lbolt() - (thread)->t_disp_time) <= rechoose_interval))
/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 *
 * Target queue selection:
 * - A thread that is not loaded, or is on the swap queue, is not queued
 *   here at all; it is passed to disp_swapped_setrun() instead.
 * - With a single CPU, the thread goes to tp->t_cpu.
 * - An unbound thread at or above kpqpri is handed to setkpdq().
 * - Any other unbound thread is placed on a CPU chosen by the affinity,
 *   partition and run queue balancing logic below.
 * - A bound thread goes to its weak-bound CPU if it has one, otherwise
 *   to its strong-bound CPU.
 */
void
setbackdq(kthread_t *tp)
{
	dispq_t	*dq;
	disp_t		*dp;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;
	boolean_t	self;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	self = (tp == curthread);

	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		/* High-priority unbound threads live on the kp queue. */
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_BACK);
			return;
		}

		/*
		 * We'll generally let this thread continue to run where
		 * it last ran...but will consider migration if:
		 * - The thread probably doesn't have much cache warmth.
		 * - SMT exclusion would prefer us to run elsewhere
		 * - The CPU where it last ran is the target of an offline
		 *   request.
		 * - The thread last ran outside its home lgroup.
		 */
		if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
		    !smt_should_run(tp, tp->t_cpu) ||
		    (tp->t_cpu == cpu_inmotion) ||
		    !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
			cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
		} else {
			cp = tp->t_cpu;
		}

		if (tp->t_cpupart == cp->cpu_part) {
			int	qlen;

			/*
			 * Perform any CMT load balancing
			 */
			cp = cmt_balance(tp, cp);

			/*
			 * Balance across the run queues
			 *
			 * qlen is the chosen queue's length, reduced by
			 * RUNQ_MAX_DIFF of slack unless lengths must match
			 * exactly (priority below RUNQ_MATCH_PRI, or
			 * TS_RUNQMATCH set).  A neighbouring CPU is preferred
			 * only if its queue is shorter than that.
			 */
			qlen = RUNQ_LEN(cp, tpri);
			if (tpri >= RUNQ_MATCH_PRI &&
			    !(tp->t_schedflag & TS_RUNQMATCH))
				qlen -= RUNQ_MAX_DIFF;
			if (qlen > 0) {
				cpu_t *newcp;

				/*
				 * Candidate is the next CPU in the thread's
				 * lpl, falling back to the next CPU in the
				 * partition when the thread's lpl is the root
				 * or cp is alone on its lpl list.
				 */
				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
					newcp = cp->cpu_next_part;
				} else if ((newcp = cp->cpu_next_lpl) == cp) {
					newcp = cp->cpu_next_part;
				}

				if (smt_should_run(tp, newcp) &&
				    RUNQ_LEN(newcp, tpri) < qlen) {
					DTRACE_PROBE3(runq__balance,
					    kthread_t *, tp,
					    cpu_t *, cp, cpu_t *, newcp);
					cp = newcp;
				}
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
			    tp->t_pri);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}
	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((!self) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
	    tpri, cp, tp);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;		/* tp becomes the tail of its queue */

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	/*
	 * An unbound thread has arrived, so clear disp_steal;
	 * disp_getwork() treats a zero disp_steal as "may steal now".
	 */
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		/* Queue already populated: append after the current tail. */
		ASSERT(dq->dq_first != NULL);
		dq->dq_last->t_link = tp;
		dq->dq_last = tp;
	} else {
		/*
		 * First thread at this priority: mark the priority active
		 * and, if it is a new maximum for this queue, ask the CPU
		 * to reschedule.
		 */
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	/* Invoke the disp_enq_thread hook for the chosen CPU. */
	(*disp_enq_thread)(cp, bound);
}
1365 
/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 *
 * Target queue selection:
 * - A thread that is not loaded, or is on the swap queue, is not queued
 *   here at all; it is passed to disp_swapped_setrun() instead.
 * - With a single CPU, the thread goes to tp->t_cpu.
 * - An unbound thread at or above kpqpri is handed to setkpdq().
 * - Any other unbound thread stays on tp->t_cpu unless one of the
 *   migration conditions described below applies.
 * - A bound thread goes to its weak-bound CPU if it has one, otherwise
 *   to its strong-bound CPU.
 */
void
setfrontdq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		/* High-priority unbound threads live on the kp queue. */
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_FRONT);
			return;
		}
		cp = tp->t_cpu;
		if (tp->t_cpupart == cp->cpu_part) {
			/*
			 * We'll generally let this thread continue to run
			 * where it last ran, but will consider migration if:
			 * - The thread last ran outside its home lgroup.
			 * - The CPU where it last ran is the target of an
			 *   offline request (a thread_nomigrate() on the in
			 *   motion CPU relies on this when forcing a preempt).
			 * - The thread isn't the highest priority thread where
			 *   it last ran, and it is considered not likely to
			 *   have significant cache warmth.
			 */
			if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
			    cp == cpu_inmotion ||
			    (tpri < cp->cpu_disp->disp_maxrunpri &&
			    !THREAD_HAS_CACHE_WARMTH(tp))) {
				cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp, tp->t_pri);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}

	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((tp != curthread) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
	tp->t_disp_queue = dp;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	/*
	 * An unbound thread has arrived, so clear disp_steal;
	 * disp_getwork() treats a zero disp_steal as "may steal now".
	 */
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		/* Queue already populated: link tp in ahead of the head. */
		ASSERT(dq->dq_last != NULL);
		tp->t_link = dq->dq_first;
		dq->dq_first = tp;
	} else {
		/*
		 * First thread at this priority: mark the priority active
		 * and, if it is a new maximum for this queue, ask the CPU
		 * to reschedule.
		 */
		ASSERT(dq->dq_last == NULL);
		ASSERT(dq->dq_first == NULL);
		tp->t_link = NULL;
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	/* Invoke the disp_enq_thread hook for the chosen CPU. */
	(*disp_enq_thread)(cp, bound);
}
1522 
1523 /*
1524  * Put a high-priority unbound thread on the kp queue
1525  */
1526 static void
1527 setkpdq(kthread_t *tp, int borf)
1528 {
1529 	dispq_t	*dq;
1530 	disp_t	*dp;
1531 	cpu_t	*cp;
1532 	pri_t	tpri;
1533 
1534 	tpri = DISP_PRIO(tp);
1535 
1536 	dp = &tp->t_cpupart->cp_kp_queue;
1537 	disp_lock_enter_high(&dp->disp_lock);
1538 
1539 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1540 
1541 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1542 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1543 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1544 	tp->t_disp_queue = dp;
1545 	dp->disp_nrunnable++;
1546 	dq = &dp->disp_q[tpri];
1547 
1548 	if (dq->dq_sruncnt++ != 0) {
1549 		if (borf == SETKP_BACK) {
1550 			ASSERT(dq->dq_first != NULL);
1551 			tp->t_link = NULL;
1552 			dq->dq_last->t_link = tp;
1553 			dq->dq_last = tp;
1554 		} else {
1555 			ASSERT(dq->dq_last != NULL);
1556 			tp->t_link = dq->dq_first;
1557 			dq->dq_first = tp;
1558 		}
1559 	} else {
1560 		if (borf == SETKP_BACK) {
1561 			ASSERT(dq->dq_first == NULL);
1562 			ASSERT(dq->dq_last == NULL);
1563 			dq->dq_first = dq->dq_last = tp;
1564 		} else {
1565 			ASSERT(dq->dq_last == NULL);
1566 			ASSERT(dq->dq_first == NULL);
1567 			tp->t_link = NULL;
1568 			dq->dq_first = dq->dq_last = tp;
1569 		}
1570 		BT_SET(dp->disp_qactmap, tpri);
1571 		if (tpri > dp->disp_max_unbound_pri)
1572 			dp->disp_max_unbound_pri = tpri;
1573 		if (tpri > dp->disp_maxrunpri) {
1574 			dp->disp_maxrunpri = tpri;
1575 			membar_enter();
1576 		}
1577 	}
1578 
1579 	cp = tp->t_cpu;
1580 	if (tp->t_cpupart != cp->cpu_part) {
1581 		/* migrate to a cpu in the new partition */
1582 		cp = tp->t_cpupart->cp_cpulist;
1583 	}
1584 	cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
1585 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1586 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1587 
1588 #ifndef NPROBE
1589 	/* Kernel probe */
1590 	if (tnf_tracing_active)
1591 		tnf_thread_queue(tp, cp, tpri);
1592 #endif /* NPROBE */
1593 
1594 	if (cp->cpu_chosen_level < tpri)
1595 		cp->cpu_chosen_level = tpri;
1596 	cpu_resched(cp, tpri);
1597 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1598 	(*disp_enq_thread)(cp, 0);
1599 }
1600 
1601 /*
1602  * Remove a thread from the dispatcher queue if it is on it.
1603  * It is not an error if it is not found but we return whether
1604  * or not it was found in case the caller wants to check.
1605  */
1606 int
1607 dispdeq(kthread_t *tp)
1608 {
1609 	disp_t		*dp;
1610 	dispq_t		*dq;
1611 	kthread_t	*rp;
1612 	kthread_t	*trp;
1613 	kthread_t	**ptp;
1614 	int		tpri;
1615 
1616 	ASSERT(THREAD_LOCK_HELD(tp));
1617 
1618 	if (tp->t_state != TS_RUN)
1619 		return (0);
1620 
1621 	/*
1622 	 * The thread is "swapped" or is on the swap queue and
1623 	 * hence no longer on the run queue, so return true.
1624 	 */
1625 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1626 		return (1);
1627 
1628 	tpri = DISP_PRIO(tp);
1629 	dp = tp->t_disp_queue;
1630 	ASSERT(tpri < dp->disp_npri);
1631 	dq = &dp->disp_q[tpri];
1632 	ptp = &dq->dq_first;
1633 	rp = *ptp;
1634 	trp = NULL;
1635 
1636 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1637 
1638 	/*
1639 	 * Search for thread in queue.
1640 	 * Double links would simplify this at the expense of disp/setrun.
1641 	 */
1642 	while (rp != tp && rp != NULL) {
1643 		trp = rp;
1644 		ptp = &trp->t_link;
1645 		rp = trp->t_link;
1646 	}
1647 
1648 	if (rp == NULL) {
1649 		panic("dispdeq: thread not on queue");
1650 	}
1651 
1652 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1653 
1654 	/*
1655 	 * Found it so remove it from queue.
1656 	 */
1657 	if ((*ptp = rp->t_link) == NULL)
1658 		dq->dq_last = trp;
1659 
1660 	dp->disp_nrunnable--;
1661 	if (--dq->dq_sruncnt == 0) {
1662 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1663 		if (dp->disp_nrunnable == 0) {
1664 			dp->disp_max_unbound_pri = -1;
1665 			dp->disp_maxrunpri = -1;
1666 		} else if (tpri == dp->disp_maxrunpri) {
1667 			int ipri;
1668 
1669 			ipri = bt_gethighbit(dp->disp_qactmap,
1670 			    dp->disp_maxrunpri >> BT_ULSHIFT);
1671 			if (ipri < dp->disp_max_unbound_pri)
1672 				dp->disp_max_unbound_pri = ipri;
1673 			dp->disp_maxrunpri = ipri;
1674 		}
1675 	}
1676 	tp->t_link = NULL;
1677 	THREAD_TRANSITION(tp);		/* put in intermediate state */
1678 	return (1);
1679 }
1680 
1681 
1682 /*
1683  * dq_sruninc and dq_srundec are public functions for
1684  * incrementing/decrementing the sruncnts when a thread on
1685  * a dispatcher queue is made schedulable/unschedulable by
1686  * resetting the TS_LOAD flag.
1687  *
1688  * The caller MUST have the thread lock and therefore the dispatcher
1689  * queue lock so that the operation which changes
1690  * the flag, the operation that checks the status of the thread to
1691  * determine if it's on a disp queue AND the call to this function
1692  * are one atomic operation with respect to interrupts.
1693  */
1694 
/*
 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 *
 * Expects t to be in TS_RUN state with TS_LOAD set.  Moves the thread to
 * the transition state and then places it at the front of a dispatch
 * queue via setfrontdq().
 */
void
dq_sruninc(kthread_t *t)
{
	ASSERT(t->t_state == TS_RUN);
	ASSERT(t->t_schedflag & TS_LOAD);

	THREAD_TRANSITION(t);
	setfrontdq(t);
}
1707 
/*
 * See comment on calling conventions above.
 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 *
 * Expects TS_LOAD to still be set.  Removes t from its dispatch queue
 * (the dispdeq() result is deliberately ignored) and then switches the
 * thread over to the swapped lock via disp_swapped_enq().
 */
void
dq_srundec(kthread_t *t)
{
	ASSERT(t->t_schedflag & TS_LOAD);

	(void) dispdeq(t);
	disp_swapped_enq(t);
}
1720 
1721 /*
1722  * Change the dispatcher lock of thread to the "swapped_lock"
1723  * and return with thread lock still held.
1724  *
1725  * Called with thread_lock held, in transition state, and at high spl.
1726  */
1727 void
1728 disp_swapped_enq(kthread_t *tp)
1729 {
1730 	ASSERT(THREAD_LOCK_HELD(tp));
1731 	ASSERT(tp->t_schedflag & TS_LOAD);
1732 
1733 	switch (tp->t_state) {
1734 	case TS_RUN:
1735 		disp_lock_enter_high(&swapped_lock);
1736 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1737 		break;
1738 	case TS_ONPROC:
1739 		disp_lock_enter_high(&swapped_lock);
1740 		THREAD_TRANSITION(tp);
1741 		wake_sched_sec = 1;		/* tell clock to wake sched */
1742 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1743 		break;
1744 	default:
1745 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1746 	}
1747 }
1748 
1749 /*
1750  * This routine is called by setbackdq/setfrontdq if the thread is
1751  * not loaded or loaded and on the swap queue.
1752  *
1753  * Thread state TS_SLEEP implies that a swapped thread
1754  * has been woken up and needs to be swapped in by the swapper.
1755  *
1756  * Thread state TS_RUN, it implies that the priority of a swapped
1757  * thread is being increased by scheduling class (e.g. ts_update).
1758  */
1759 static void
1760 disp_swapped_setrun(kthread_t *tp)
1761 {
1762 	ASSERT(THREAD_LOCK_HELD(tp));
1763 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1764 
1765 	switch (tp->t_state) {
1766 	case TS_SLEEP:
1767 		disp_lock_enter_high(&swapped_lock);
1768 		/*
1769 		 * Wakeup sched immediately (i.e., next tick) if the
1770 		 * thread priority is above maxclsyspri.
1771 		 */
1772 		if (DISP_PRIO(tp) > maxclsyspri)
1773 			wake_sched = 1;
1774 		else
1775 			wake_sched_sec = 1;
1776 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1777 		break;
1778 	case TS_RUN:				/* called from ts_update */
1779 		break;
1780 	default:
1781 		panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1782 	}
1783 }
1784 
1785 /*
1786  *	Make a thread give up its processor.  Find the processor on
1787  *	which this thread is executing, and have that processor
1788  *	preempt.
1789  *
1790  *	We allow System Duty Cycle (SDC) threads to be preempted even if
1791  *	they are running at kernel priorities.  To implement this, we always
1792  *	set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
1793  *	calls cpu_surrender() very often, we only preempt if there is anyone
1794  *	competing with us.
1795  */
1796 void
1797 cpu_surrender(kthread_t *tp)
1798 {
1799 	cpu_t	*cpup;
1800 	int	max_pri;
1801 	int	max_run_pri;
1802 	klwp_t	*lwp;
1803 
1804 	ASSERT(THREAD_LOCK_HELD(tp));
1805 
1806 	if (tp->t_state != TS_ONPROC)
1807 		return;
1808 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
1809 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1810 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1811 	if (max_pri < max_run_pri)
1812 		max_pri = max_run_pri;
1813 
1814 	if (tp->t_cid == sysdccid) {
1815 		uint_t t_pri = DISP_PRIO(tp);
1816 		if (t_pri > max_pri)
1817 			return;		/* we are not competing w/ anyone */
1818 		cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1819 	} else {
1820 		cpup->cpu_runrun = 1;
1821 		if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1822 			cpup->cpu_kprunrun = 1;
1823 		}
1824 	}
1825 
1826 	/*
1827 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1828 	 */
1829 	membar_enter();
1830 
1831 	DTRACE_SCHED1(surrender, kthread_t *, tp);
1832 
1833 	/*
1834 	 * Make the target thread take an excursion through trap()
1835 	 * to do preempt() (unless we're already in trap or post_syscall,
1836 	 * calling cpu_surrender via CL_TRAPRET).
1837 	 */
1838 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1839 	    lwp->lwp_state != LWP_USER) {
1840 		aston(tp);
1841 		if (cpup != CPU)
1842 			poke_cpu(cpup->cpu_id);
1843 	}
1844 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1845 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
1846 }
1847 
1848 /*
1849  * Commit to and ratify a scheduling decision
1850  */
1851 /*ARGSUSED*/
1852 static kthread_t *
1853 disp_ratify(kthread_t *tp, disp_t *kpq)
1854 {
1855 	pri_t	tpri, maxpri;
1856 	pri_t	maxkpri;
1857 	cpu_t	*cpup;
1858 
1859 	ASSERT(tp != NULL);
1860 	/*
1861 	 * Commit to, then ratify scheduling decision
1862 	 */
1863 	cpup = CPU;
1864 	if (cpup->cpu_runrun != 0)
1865 		cpup->cpu_runrun = 0;
1866 	if (cpup->cpu_kprunrun != 0)
1867 		cpup->cpu_kprunrun = 0;
1868 	if (cpup->cpu_chosen_level != -1)
1869 		cpup->cpu_chosen_level = -1;
1870 	membar_enter();
1871 	tpri = DISP_PRIO(tp);
1872 	maxpri = cpup->cpu_disp->disp_maxrunpri;
1873 	maxkpri = kpq->disp_maxrunpri;
1874 	if (maxpri < maxkpri)
1875 		maxpri = maxkpri;
1876 	if (tpri < maxpri) {
1877 		/*
1878 		 * should have done better
1879 		 * put this one back and indicate to try again
1880 		 */
1881 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
1882 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1883 		thread_lock_high(tp);
1884 		THREAD_TRANSITION(tp);
1885 		setfrontdq(tp);
1886 		thread_unlock_nopreempt(tp);
1887 
1888 		tp = NULL;
1889 	}
1890 	return (tp);
1891 }
1892 
/*
 * See if there is any work on the dispatcher queue for other CPUs.
 * If there is, dequeue the best thread and return.
 *
 * The partition's kp queue is tried first.  After that, other CPUs' run
 * queues are scanned for the highest priority unbound thread.
 *
 * Returns:
 * - the result of disp_ratify() for a thread that was taken (which may
 *   itself be NULL if the choice was not ratified),
 * - NULL if nothing was found or work appeared on the local queue,
 * - T_DONTSTEAL if the only candidates seen were ones that may not be
 *   stolen yet, so the caller can try again later.
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	kthread_t	*retval = NULL;
	pri_t		maxpri;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		leafidx, startidx;
	hrtime_t	stealtime;
	lgrp_id_t	local_id;

	maxpri = -1;
	tcp = NULL;

	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	kpreempt_disable();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU. This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	local_id = lpl_leaf->lpl_lgrpid;
	leafidx = startidx = 0;

	/*
	 * This loop traverses the lpl hierarchy. Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			/*
			 * In our own leaf start with the CPU after us;
			 * in any other leaf start at the head of its list.
			 */
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around this lpl if:
				 *
				 * - Something became runnable on the local
				 *   queue...which also ends our stroll around
				 *   the partition.
				 *
				 * - We happen across another idle CPU.
				 *   Since it is patrolling the next portion
				 *   of the lpl's list (assuming it's not
				 *   halted, or busy servicing an interrupt),
				 *   move to the next higher level of locality.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0) {
					kpreempt_enable();
					return (NULL);
				}
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED ||
					    ocp->cpu_intr_actv != 0)
						continue;
					else
						goto next_level;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
				    CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					/*
					 * Don't steal threads that we attempted
					 * to steal recently until they're ready
					 * to be stolen again.
					 *
					 * A disp_steal of zero, or one that
					 * is not later than the current time,
					 * means stealing is allowed.
					 */
					stealtime = ocp->cpu_disp->disp_steal;
					if (stealtime == 0 ||
					    stealtime - gethrtime() <= 0) {
						maxpri = pri;
						tcp = ocp;
					} else {
						/*
						 * Don't update tcp, just set
						 * the retval to T_DONTSTEAL, so
						 * that if no acceptable CPUs
						 * are found the return value
						 * will be T_DONTSTEAL rather
						 * than NULL.
						 */
						retval = T_DONTSTEAL;
					}
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			/*
			 * Iterate to the next leaf lpl in the resource set
			 * at this level of locality. If we hit the end of
			 * the set, wrap back around to the beginning.
			 *
			 * Note: This iteration is NULL terminated for a reason
			 * see lpl_topo_bootstrap() in lgrp.c for details.
			 */
			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != startidx);

next_level:
		/*
		 * Expand the search to include farther away CPUs (next
		 * locality level). The closer CPUs that have already been
		 * checked will be checked again. In doing so, idle CPUs
		 * will tend to be more aggressive about stealing from CPUs
		 * that are closer (since the closer CPUs will be considered
		 * more often).
		 * Begin at this level with the CPUs local leaf lpl.
		 */
		if ((lpl = lpl->lpl_parent) != NULL) {
			leafidx = startidx = lpl->lpl_id2rset[local_id];
			lpl_leaf = lpl->lpl_rset[leafidx];
		}
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = disp_getbest(tcp->cpu_disp);
		if (tp == NULL || tp == T_DONTSTEAL)
			return (tp);
		return (disp_ratify(tp, kpq));
	}
	return (retval);
}
2068 
2069 
2070 /*
2071  * disp_fix_unbound_pri()
2072  *	Determines the maximum priority of unbound threads on the queue.
2073  *	The priority is kept for the queue, but is only increased, never
2074  *	reduced unless some CPU is looking for something on that queue.
2075  *
2076  *	The priority argument is the known upper limit.
2077  *
2078  *	Perhaps this should be kept accurately, but that probably means
2079  *	separate bitmaps for bound and unbound threads.  Since only idled
2080  *	CPUs will have to do this recalculation, it seems better this way.
2081  */
2082 static void
2083 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2084 {
2085 	kthread_t	*tp;
2086 	dispq_t		*dq;
2087 	ulong_t		*dqactmap = dp->disp_qactmap;
2088 	ulong_t		mapword;
2089 	int		wx;
2090 
2091 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2092 
2093 	ASSERT(pri >= 0);			/* checked by caller */
2094 
2095 	/*
2096 	 * Start the search at the next lowest priority below the supplied
2097 	 * priority.  This depends on the bitmap implementation.
2098 	 */
2099 	do {
2100 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
2101 
2102 		/*
2103 		 * Form mask for all lower priorities in the word.
2104 		 */
2105 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2106 
2107 		/*
2108 		 * Get next lower active priority.
2109 		 */
2110 		if (mapword != 0) {
2111 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2112 		} else if (wx > 0) {
2113 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2114 			if (pri < 0)
2115 				break;
2116 		} else {
2117 			pri = -1;
2118 			break;
2119 		}
2120 
2121 		/*
2122 		 * Search the queue for unbound, runnable threads.
2123 		 */
2124 		dq = &dp->disp_q[pri];
2125 		tp = dq->dq_first;
2126 
2127 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2128 			tp = tp->t_link;
2129 		}
2130 
2131 		/*
2132 		 * If a thread was found, set the priority and return.
2133 		 */
2134 	} while (tp == NULL);
2135 
2136 	/*
2137 	 * pri holds the maximum unbound thread priority or -1.
2138 	 */
2139 	if (dp->disp_max_unbound_pri != pri)
2140 		dp->disp_max_unbound_pri = pri;
2141 }
2142 
2143 /*
2144  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2145  * 	check if the CPU to which is was previously bound should have
2146  * 	its disp_max_unbound_pri increased.
2147  */
2148 void
2149 disp_adjust_unbound_pri(kthread_t *tp)
2150 {
2151 	disp_t *dp;
2152 	pri_t tpri;
2153 
2154 	ASSERT(THREAD_LOCK_HELD(tp));
2155 
2156 	/*
2157 	 * Don't do anything if the thread is not bound, or
2158 	 * currently not runnable or swapped out.
2159 	 */
2160 	if (tp->t_bound_cpu == NULL ||
2161 	    tp->t_state != TS_RUN ||
2162 	    tp->t_schedflag & TS_ON_SWAPQ)
2163 		return;
2164 
2165 	tpri = DISP_PRIO(tp);
2166 	dp = tp->t_bound_cpu->cpu_disp;
2167 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2168 	if (tpri > dp->disp_max_unbound_pri)
2169 		dp->disp_max_unbound_pri = tpri;
2170 }
2171 
2172 /*
2173  * disp_getbest()
2174  *   De-queue the highest priority unbound runnable thread.
2175  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2176  *   Returns NULL if nothing found.
2177  *   Returns T_DONTSTEAL if the thread was not stealable.
2178  *   so that the caller will try again later.
2179  *
2180  *   Passed a pointer to a dispatch queue not associated with this CPU, and
2181  *   its type.
2182  */
2183 static kthread_t *
2184 disp_getbest(disp_t *dp)
2185 {
2186 	kthread_t	*tp;
2187 	dispq_t		*dq;
2188 	pri_t		pri;
2189 	cpu_t		*cp, *tcp;
2190 	boolean_t	allbound;
2191 
2192 	disp_lock_enter(&dp->disp_lock);
2193 
2194 	/*
2195 	 * If there is nothing to run, or the CPU is in the middle of a
2196 	 * context switch of the only thread, return NULL.
2197 	 */
2198 	tcp = dp->disp_cpu;
2199 	cp = CPU;
2200 	pri = dp->disp_max_unbound_pri;
2201 	if (pri == -1 ||
2202 	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2203 	    tcp->cpu_disp->disp_nrunnable == 1)) {
2204 		disp_lock_exit_nopreempt(&dp->disp_lock);
2205 		return (NULL);
2206 	}
2207 
2208 	dq = &dp->disp_q[pri];
2209 
2210 
2211 	/*
2212 	 * Assume that all threads are bound on this queue, and change it
2213 	 * later when we find out that it is not the case.
2214 	 */
2215 	allbound = B_TRUE;
2216 	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2217 		hrtime_t now, nosteal, rqtime;
2218 
2219 		/*
2220 		 * Skip over bound threads which could be here even
2221 		 * though disp_max_unbound_pri indicated this level.
2222 		 */
2223 		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2224 			continue;
2225 
2226 		/*
2227 		 * We've got some unbound threads on this queue, so turn
2228 		 * the allbound flag off now.
2229 		 */
2230 		allbound = B_FALSE;
2231 
2232 		/*
2233 		 * The thread is a candidate for stealing from its run queue. We
2234 		 * don't want to steal threads that became runnable just a
2235 		 * moment ago. This improves CPU affinity for threads that get
2236 		 * preempted for short periods of time and go back on the run
2237 		 * queue.
2238 		 *
2239 		 * We want to let it stay on its run queue if it was only placed
2240 		 * there recently and it was running on the same CPU before that
2241 		 * to preserve its cache investment. For the thread to remain on
2242 		 * its run queue, ALL of the following conditions must be
2243 		 * satisfied:
2244 		 *
2245 		 * - the disp queue should not be the kernel preemption queue
2246 		 * - delayed idle stealing should not be disabled
2247 		 * - nosteal_nsec should be non-zero
2248 		 * - it should run with user priority
2249 		 * - it should be on the run queue of the CPU where it was
2250 		 *   running before being placed on the run queue
2251 		 * - it should be the only thread on the run queue (to prevent
2252 		 *   extra scheduling latency for other threads)
2253 		 * - it should sit on the run queue for less than per-chip
2254 		 *   nosteal interval or global nosteal interval
2255 		 * - in case of CPUs with shared cache it should sit in a run
2256 		 *   queue of a CPU from a different chip
2257 		 *
2258 		 * The checks are arranged so that the ones that are faster are
2259 		 * placed earlier.
2260 		 */
2261 		if (tcp == NULL ||
2262 		    pri >= minclsyspri ||
2263 		    tp->t_cpu != tcp)
2264 			break;
2265 
2266 		/*
2267 		 * Steal immediately if, due to CMT processor architecture
2268 		 * migraiton between cp and tcp would incur no performance
2269 		 * penalty.
2270 		 */
2271 		if (pg_cmt_can_migrate(cp, tcp))
2272 			break;
2273 
2274 		nosteal = nosteal_nsec;
2275 		if (nosteal == 0)
2276 			break;
2277 
2278 		/*
2279 		 * Calculate time spent sitting on run queue
2280 		 */
2281 		now = gethrtime_unscaled();
2282 		rqtime = now - tp->t_waitrq;
2283 		scalehrtime(&rqtime);
2284 
2285 		/*
2286 		 * Steal immediately if the time spent on this run queue is more
2287 		 * than allowed nosteal delay.
2288 		 *
2289 		 * Negative rqtime check is needed here to avoid infinite
2290 		 * stealing delays caused by unlikely but not impossible
2291 		 * drifts between CPU times on different CPUs.
2292 		 */
2293 		if (rqtime > nosteal || rqtime < 0)
2294 			break;
2295 
2296 		DTRACE_PROBE4(nosteal, kthread_t *, tp,
2297 		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2298 		scalehrtime(&now);
2299 		/*
2300 		 * Calculate when this thread becomes stealable
2301 		 */
2302 		now += (nosteal - rqtime);
2303 
2304 		/*
2305 		 * Calculate time when some thread becomes stealable
2306 		 */
2307 		if (now < dp->disp_steal)
2308 			dp->disp_steal = now;
2309 	}
2310 
2311 	/*
2312 	 * If there were no unbound threads on this queue, find the queue
2313 	 * where they are and then return later. The value of
2314 	 * disp_max_unbound_pri is not always accurate because it isn't
2315 	 * reduced until another idle CPU looks for work.
2316 	 */
2317 	if (allbound)
2318 		disp_fix_unbound_pri(dp, pri);
2319 
2320 	/*
2321 	 * If we reached the end of the queue and found no unbound threads
2322 	 * then return NULL so that other CPUs will be considered.  If there
2323 	 * are unbound threads but they cannot yet be stolen, then
2324 	 * return T_DONTSTEAL and try again later.
2325 	 */
2326 	if (tp == NULL) {
2327 		disp_lock_exit_nopreempt(&dp->disp_lock);
2328 		return (allbound ? NULL : T_DONTSTEAL);
2329 	}
2330 
2331 	/*
2332 	 * Found a runnable, unbound thread, so remove it from queue.
2333 	 * dispdeq() requires that we have the thread locked, and we do,
2334 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
2335 	 * put the thread in transition state, thereby dropping the dispq
2336 	 * lock.
2337 	 */
2338 
2339 #ifdef DEBUG
2340 	{
2341 		int	thread_was_on_queue;
2342 
2343 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
2344 		ASSERT(thread_was_on_queue);
2345 	}
2346 
2347 #else /* DEBUG */
2348 	(void) dispdeq(tp);			/* drops disp_lock */
2349 #endif /* DEBUG */
2350 
2351 	/*
2352 	 * Reset the disp_queue steal time - we do not know what is the smallest
2353 	 * value across the queue is.
2354 	 */
2355 	dp->disp_steal = 0;
2356 
2357 	tp->t_schedflag |= TS_DONT_SWAP;
2358 
2359 	/*
2360 	 * Setup thread to run on the current CPU.
2361 	 */
2362 	tp->t_disp_queue = cp->cpu_disp;
2363 
2364 	cp->cpu_dispthread = tp;		/* protected by spl only */
2365 	cp->cpu_dispatch_pri = pri;
2366 
2367 	/*
2368 	 * There can be a memory synchronization race between disp_getbest()
2369 	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2370 	 * to preempt the current thread to run the enqueued thread while
2371 	 * disp_getbest() and disp_ratify() are changing the current thread
2372 	 * to the stolen thread. This may lead to a situation where
2373 	 * cpu_resched() tries to preempt the wrong thread and the
2374 	 * stolen thread continues to run on the CPU which has been tagged
2375 	 * for preemption.
2376 	 * Later the clock thread gets enqueued but doesn't get to run on the
2377 	 * CPU causing the system to hang.
2378 	 *
2379 	 * To avoid this, grabbing and dropping the disp_lock (which does
2380 	 * a memory barrier) is needed to synchronize the execution of
2381 	 * cpu_resched() with disp_getbest() and disp_ratify() and
2382 	 * synchronize the memory read and written by cpu_resched(),
2383 	 * disp_getbest(), and disp_ratify() with each other.
2384 	 *  (see CR#6482861 for more details).
2385 	 */
2386 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2387 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2388 
2389 	ASSERT(pri == DISP_PRIO(tp));
2390 
2391 	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2392 
2393 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
2394 
2395 	/*
2396 	 * Return with spl high so that swtch() won't need to raise it.
2397 	 * The disp_lock was dropped by dispdeq().
2398 	 */
2399 
2400 	return (tp);
2401 }
2402 
2403 /*
2404  * disp_bound_common() - common routine for higher level functions
2405  *	that check for bound threads under certain conditions.
2406  *	If 'threadlistsafe' is set then there is no need to acquire
2407  *	pidlock to stop the thread list from changing (eg, if
2408  *	disp_bound_* is called with cpus paused).
2409  */
2410 static int
2411 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2412 {
2413 	int		found = 0;
2414 	kthread_t	*tp;
2415 
2416 	ASSERT(flag);
2417 
2418 	if (!threadlistsafe)
2419 		mutex_enter(&pidlock);
2420 	tp = curthread;		/* faster than allthreads */
2421 	do {
2422 		if (tp->t_state != TS_FREE) {
2423 			/*
2424 			 * If an interrupt thread is busy, but the
2425 			 * caller doesn't care (i.e. BOUND_INTR is off),
2426 			 * then just ignore it and continue through.
2427 			 */
2428 			if ((tp->t_flag & T_INTR_THREAD) &&
2429 			    !(flag & BOUND_INTR))
2430 				continue;
2431 
2432 			/*
2433 			 * Skip the idle thread for the CPU
2434 			 * we're about to set offline.
2435 			 */
2436 			if (tp == cp->cpu_idle_thread)
2437 				continue;
2438 
2439 			/*
2440 			 * Skip the pause thread for the CPU
2441 			 * we're about to set offline.
2442 			 */
2443 			if (tp == cp->cpu_pause_thread)
2444 				continue;
2445 
2446 			if ((flag & BOUND_CPU) &&
2447 			    (tp->t_bound_cpu == cp ||
2448 			    tp->t_bind_cpu == cp->cpu_id ||
2449 			    tp->t_weakbound_cpu == cp)) {
2450 				found = 1;
2451 				break;
2452 			}
2453 
2454 			if ((flag & BOUND_PARTITION) &&
2455 			    (tp->t_cpupart == cp->cpu_part)) {
2456 				found = 1;
2457 				break;
2458 			}
2459 		}
2460 	} while ((tp = tp->t_next) != curthread && found == 0);
2461 	if (!threadlistsafe)
2462 		mutex_exit(&pidlock);
2463 	return (found);
2464 }
2465 
2466 /*
2467  * disp_bound_threads - return nonzero if threads are bound to the processor.
2468  *	Called infrequently.  Keep this simple.
2469  *	Includes threads that are asleep or stopped but not onproc.
2470  */
2471 int
2472 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2473 {
2474 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2475 }
2476 
2477 /*
2478  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2479  * to the given processor, including interrupt threads.
2480  */
2481 int
2482 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2483 {
2484 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2485 }
2486 
2487 /*
2488  * disp_bound_partition - return nonzero if threads are bound to the same
2489  * partition as the processor.
2490  *	Called infrequently.  Keep this simple.
2491  *	Includes threads that are asleep or stopped but not onproc.
2492  */
2493 int
2494 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2495 {
2496 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2497 }
2498 
2499 /*
2500  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2501  * threads to other CPUs.
2502  */
2503 void
2504 disp_cpu_inactive(cpu_t *cp)
2505 {
2506 	kthread_t	*tp;
2507 	disp_t		*dp = cp->cpu_disp;
2508 	dispq_t		*dq;
2509 	pri_t		pri;
2510 	int		wasonq;
2511 
2512 	disp_lock_enter(&dp->disp_lock);
2513 	while ((pri = dp->disp_max_unbound_pri) != -1) {
2514 		dq = &dp->disp_q[pri];
2515 		tp = dq->dq_first;
2516 
2517 		/*
2518 		 * Skip over bound threads.
2519 		 */
2520 		while (tp != NULL && tp->t_bound_cpu != NULL) {
2521 			tp = tp->t_link;
2522 		}
2523 
2524 		if (tp == NULL) {
2525 			/* disp_max_unbound_pri must be inaccurate, so fix it */
2526 			disp_fix_unbound_pri(dp, pri);
2527 			continue;
2528 		}
2529 
2530 		wasonq = dispdeq(tp);		/* drops disp_lock */
2531 		ASSERT(wasonq);
2532 		ASSERT(tp->t_weakbound_cpu == NULL);
2533 
2534 		setbackdq(tp);
2535 		/*
2536 		 * Called from cpu_offline:
2537 		 *
2538 		 * cp has already been removed from the list of active cpus
2539 		 * and tp->t_cpu has been changed so there is no risk of
2540 		 * tp ending up back on cp.
2541 		 *
2542 		 * Called from cpupart_move_cpu:
2543 		 *
2544 		 * The cpu has moved to a new cpupart.  Any threads that
2545 		 * were on it's dispatch queues before the move remain
2546 		 * in the old partition and can't run in the new partition.
2547 		 */
2548 		ASSERT(tp->t_cpu != cp);
2549 		thread_unlock(tp);
2550 
2551 		disp_lock_enter(&dp->disp_lock);
2552 	}
2553 	disp_lock_exit(&dp->disp_lock);
2554 }
2555 
2556 /*
2557  * Return a score rating this CPU for running this thread: lower is better.
2558  *
2559  * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
2560  * curcpu (as that's our own priority).
2561  *
2562  * If a cpu is the target of an offline request, then try to avoid it.
2563  *
2564  * Otherwise we'll use double the effective dispatcher priority for the CPU.
2565  *
2566  * We do this so smt_adjust_cpu_score() can increment the score if needed,
2567  * without ending up over-riding a dispatcher priority.
2568  */
2569 static pri_t
2570 cpu_score(cpu_t *cp, kthread_t *tp)
2571 {
2572 	pri_t score;
2573 
2574 	if (tp == curthread && cp == curthread->t_cpu)
2575 		score = 2 * CPU_IDLE_PRI;
2576 	else if (cp == cpu_inmotion)
2577 		score = SHRT_MAX;
2578 	else
2579 		score = 2 * cp->cpu_dispatch_pri;
2580 
2581 	if (2 * cp->cpu_disp->disp_maxrunpri > score)
2582 		score = 2 * cp->cpu_disp->disp_maxrunpri;
2583 	if (2 * cp->cpu_chosen_level > score)
2584 		score = 2 * cp->cpu_chosen_level;
2585 
2586 	return (smt_adjust_cpu_score(tp, cp, score));
2587 }
2588 
2589 /*
2590  * disp_lowpri_cpu - find a suitable CPU to run the given thread.
2591  *
2592  * We are looking for a CPU with an effective dispatch priority lower than the
2593  * thread's, so that the thread will run immediately rather than be enqueued.
2594  * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
2595  * If we don't find an available CPU there, we will expand our search to include
2596  * wider locality levels. (Note these groups are already divided by CPU
2597  * partition.)
2598  *
2599  * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
2600  * the best home CPU we found.
2601  *
2602  * The hint passed in is used as a starting point so we don't favor CPU 0 or any
2603  * other CPU.  The caller should pass in the most recently used CPU for the
2604  * thread; it's of course possible that this CPU isn't in the home lgroup.
2605  *
2606  * This function must be called at either high SPL, or with preemption disabled,
2607  * so that the "hint" CPU cannot be removed from the online CPU list while we
2608  * are traversing it.
2609  */
2610 cpu_t *
2611 disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
2612 {
2613 	cpu_t	*bestcpu;
2614 	cpu_t	*besthomecpu;
2615 	cpu_t   *cp, *cpstart;
2616 
2617 	klgrpset_t	done;
2618 
2619 	lpl_t		*lpl_iter, *lpl_leaf;
2620 
2621 	ASSERT(hint != NULL);
2622 	ASSERT(tp->t_lpl->lpl_ncpu > 0);
2623 
2624 	bestcpu = besthomecpu = NULL;
2625 	klgrpset_clear(done);
2626 
2627 	lpl_iter = tp->t_lpl;
2628 
2629 	do {
2630 		pri_t best = SHRT_MAX;
2631 		klgrpset_t cur_set;
2632 
2633 		klgrpset_clear(cur_set);
2634 
2635 		for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
2636 			lpl_leaf = lpl_iter->lpl_rset[i];
2637 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2638 				continue;
2639 
2640 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2641 
2642 			if (hint->cpu_lpl == lpl_leaf)
2643 				cp = cpstart = hint;
2644 			else
2645 				cp = cpstart = lpl_leaf->lpl_cpus;
2646 
2647 			do {
2648 				pri_t score = cpu_score(cp, tp);
2649 
2650 				if (score < best) {
2651 					best = score;
2652 					bestcpu = cp;
2653 
2654 					/* An idle CPU: we're done. */
2655 					if (score / 2 == CPU_IDLE_PRI)
2656 						goto out;
2657 				}
2658 			} while ((cp = cp->cpu_next_lpl) != cpstart);
2659 		}
2660 
2661 		if (bestcpu != NULL && tpri > (best / 2))
2662 			goto out;
2663 
2664 		if (besthomecpu == NULL)
2665 			besthomecpu = bestcpu;
2666 
2667 		/*
2668 		 * Add the lgrps we just considered to the "done" set
2669 		 */
2670 		klgrpset_or(done, cur_set);
2671 
2672 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2673 
2674 	/*
2675 	 * The specified priority isn't high enough to run immediately
2676 	 * anywhere, so just return the best CPU from the home lgroup.
2677 	 */
2678 	bestcpu = besthomecpu;
2679 
2680 out:
2681 	ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2682 	return (bestcpu);
2683 }
2684 
/*
 * Generic idle cpu function, usable by all processors: it does nothing.
 *
 * A processor that has specific code to execute when idle (say, to stop
 * the pipeline and save power) should define that routine in its
 * processor specific code (module_xx.c) and set the global variable
 * idle_cpu to that function.
 */
static void
generic_idle_cpu(void)
{
	/* intentionally empty */
}
2696 
2697 /*ARGSUSED*/
2698 static void
2699 generic_enq_thread(cpu_t *cpu, int bound)
2700 {
2701 }
2702 
2703 cpu_t *
2704 disp_choose_best_cpu(void)
2705 {
2706 	kthread_t *t = curthread;
2707 	cpu_t *curcpu = CPU;
2708 
2709 	ASSERT(t->t_preempt > 0);
2710 	ASSERT(t->t_state == TS_ONPROC);
2711 	ASSERT(t->t_schedflag & TS_VCPU);
2712 
2713 	if (smt_should_run(t, curcpu))
2714 		return (curcpu);
2715 
2716 	return (disp_lowpri_cpu(curcpu, t, t->t_pri));
2717 }
2718