xref: /illumos-gate/usr/src/uts/common/disp/disp.c (revision 533affcbc7fc4d0c8132976ea454aaa715fe2307)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2019 Joyent, Inc.
28  */
29 
30 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
31 /*	  All Rights Reserved  	*/
32 
33 
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/sysmacros.h>
37 #include <sys/signal.h>
38 #include <sys/user.h>
39 #include <sys/systm.h>
40 #include <sys/sysinfo.h>
41 #include <sys/var.h>
42 #include <sys/errno.h>
43 #include <sys/cmn_err.h>
44 #include <sys/debug.h>
45 #include <sys/inline.h>
46 #include <sys/disp.h>
47 #include <sys/class.h>
48 #include <sys/bitmap.h>
49 #include <sys/kmem.h>
50 #include <sys/cpuvar.h>
51 #include <sys/vtrace.h>
52 #include <sys/cpupart.h>
53 #include <sys/lgrp.h>
54 #include <sys/pg.h>
55 #include <sys/cmt.h>
56 #include <sys/bitset.h>
57 #include <sys/schedctl.h>
58 #include <sys/atomic.h>
59 #include <sys/dtrace.h>
60 #include <sys/sdt.h>
61 #include <sys/archsystm.h>
62 #include <sys/smt.h>
63 
64 #include <vm/as.h>
65 
66 #define	BOUND_CPU	0x1
67 #define	BOUND_PARTITION	0x2
68 #define	BOUND_INTR	0x4
69 
70 /* Dispatch queue allocation structure and functions */
71 struct disp_queue_info {
72 	disp_t	*dp;
73 	dispq_t *olddispq;
74 	dispq_t *newdispq;
75 	ulong_t	*olddqactmap;
76 	ulong_t	*newdqactmap;
77 	int	oldnglobpris;
78 };
79 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
80     disp_t *dp);
81 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
82 static void	disp_dq_free(struct disp_queue_info *dptr);
83 
84 /* platform-specific routine to call when processor is idle */
85 static void	generic_idle_cpu();
86 void		(*idle_cpu)() = generic_idle_cpu;
87 
88 /* routines invoked when a CPU enters/exits the idle loop */
89 static void	idle_enter();
90 static void	idle_exit();
91 
92 /* platform-specific routine to call when thread is enqueued */
93 static void	generic_enq_thread(cpu_t *, int);
94 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
95 
96 pri_t	kpreemptpri;		/* priority where kernel preemption applies */
97 pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
98 pri_t	intr_pri;		/* interrupt thread priority base level */
99 
100 #define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
101 pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
102 disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
103 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
104 int	nswapped;		/* total number of swapped threads */
105 void	disp_swapped_enq(kthread_t *tp);
106 static void	disp_swapped_setrun(kthread_t *tp);
107 static void	cpu_resched(cpu_t *cp, pri_t tpri);
108 
109 /*
110  * If this is set, only interrupt threads will cause kernel preemptions.
111  * This is done by changing the value of kpreemptpri.  kpreemptpri
112  * will either be the max sysclass pri or the min interrupt pri.
113  */
114 int	only_intr_kpreempt;
115 
116 extern void set_idle_cpu(int cpun);
117 extern void unset_idle_cpu(int cpun);
118 static void setkpdq(kthread_t *tp, int borf);
119 #define	SETKP_BACK	0
120 #define	SETKP_FRONT	1
121 /*
122  * Parameter that determines how recently a thread must have run
123  * on the CPU to be considered loosely-bound to that CPU to reduce
124  * cold cache effects.  The interval is in hertz.
125  */
126 #define	RECHOOSE_INTERVAL 3
127 int	rechoose_interval = RECHOOSE_INTERVAL;
128 
129 /*
130  * Parameter that determines how long (in nanoseconds) a thread must
131  * be sitting on a run queue before it can be stolen by another CPU
132  * to reduce migrations.  The interval is in nanoseconds.
133  *
134  * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
135  * to an appropriate value.  nosteal_nsec is set to NOSTEAL_UNINITIALIZED
 * here indicating it is uninitialized.
137  * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
138  *
139  */
140 #define	NOSTEAL_UNINITIALIZED	(-1)
141 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
142 extern void cmp_set_nosteal_interval(void);
143 
144 id_t	defaultcid;	/* system "default" class; see dispadmin(8) */
145 
146 disp_lock_t	transition_lock;	/* lock on transitioning threads */
147 disp_lock_t	stop_lock;		/* lock on stopped threads */
148 
149 static void	cpu_dispqalloc(int numpris);
150 
151 /*
152  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
153  * a thread because it was sitting on its run queue for a very short
154  * period of time.
155  */
156 #define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
157 
158 static kthread_t	*disp_getwork(cpu_t *to);
159 static kthread_t	*disp_getbest(disp_t *from);
160 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
161 
162 void	swtch_to(kthread_t *);
163 
164 /*
165  * dispatcher and scheduler initialization
166  */
167 
168 /*
169  * disp_setup - Common code to calculate and allocate dispatcher
170  *		variables and structures based on the maximum priority.
171  */
172 static void
173 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
174 {
175 	pri_t	newnglobpris;
176 
177 	ASSERT(MUTEX_HELD(&cpu_lock));
178 
179 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
180 
181 	if (newnglobpris > oldnglobpris) {
182 		/*
183 		 * Allocate new kp queues for each CPU partition.
184 		 */
185 		cpupart_kpqalloc(newnglobpris);
186 
187 		/*
188 		 * Allocate new dispatch queues for each CPU.
189 		 */
190 		cpu_dispqalloc(newnglobpris);
191 
192 		/*
193 		 * compute new interrupt thread base priority
194 		 */
195 		intr_pri = maxglobpri;
196 		if (only_intr_kpreempt) {
197 			kpreemptpri = intr_pri + 1;
198 			if (kpqpri == KPQPRI)
199 				kpqpri = kpreemptpri;
200 		}
201 		v.v_nglobpris = newnglobpris;
202 	}
203 }
204 
205 /*
206  * dispinit - Called to initialize all loaded classes and the
207  *	      dispatcher framework.
208  */
209 void
210 dispinit(void)
211 {
212 	id_t	cid;
213 	pri_t	maxglobpri;
214 	pri_t	cl_maxglobpri;
215 
216 	maxglobpri = -1;
217 
218 	/*
219 	 * Initialize transition lock, which will always be set.
220 	 */
221 	DISP_LOCK_INIT(&transition_lock);
222 	disp_lock_enter_high(&transition_lock);
223 	DISP_LOCK_INIT(&stop_lock);
224 
225 	mutex_enter(&cpu_lock);
226 	CPU->cpu_disp->disp_maxrunpri = -1;
227 	CPU->cpu_disp->disp_max_unbound_pri = -1;
228 
229 	/*
230 	 * Initialize the default CPU partition.
231 	 */
232 	cpupart_initialize_default();
233 	/*
234 	 * Call the class specific initialization functions for
235 	 * all pre-installed schedulers.
236 	 *
237 	 * We pass the size of a class specific parameter
238 	 * buffer to each of the initialization functions
239 	 * to try to catch problems with backward compatibility
240 	 * of class modules.
241 	 *
242 	 * For example a new class module running on an old system
243 	 * which didn't provide sufficiently large parameter buffers
244 	 * would be bad news. Class initialization modules can check for
245 	 * this and take action if they detect a problem.
246 	 */
247 
248 	for (cid = 0; cid < nclass; cid++) {
249 		sclass_t	*sc;
250 
251 		sc = &sclass[cid];
252 		if (SCHED_INSTALLED(sc)) {
253 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
254 			    &sc->cl_funcs);
255 			if (cl_maxglobpri > maxglobpri)
256 				maxglobpri = cl_maxglobpri;
257 		}
258 	}
259 
260 	/*
261 	 * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is
262 	 * to say, maxclsyspri + 1.  However, over time, the system has used
263 	 * more and more asynchronous kernel threads, with an increasing number
264 	 * of these doing work on direct behalf of higher-level software (e.g.,
265 	 * network processing).  This has led to potential priority inversions:
266 	 * threads doing low-priority lengthy kernel work can effectively
267 	 * delay kernel-level processing of higher-priority data. To minimize
268 	 * such inversions, we set kpreemptpri to be v_maxsyspri; anything in
269 	 * the kernel that runs at maxclsyspri will therefore induce kernel
270 	 * preemption, and this priority should be used if/when an asynchronous
271 	 * thread (or, as is often the case, task queue) is performing a task
272 	 * on behalf of higher-level software (or any task that is otherwise
273 	 * latency-sensitve).
274 	 */
275 	kpreemptpri = (pri_t)v.v_maxsyspri;
276 	if (kpqpri == KPQPRI)
277 		kpqpri = kpreemptpri;
278 
279 	ASSERT(maxglobpri >= 0);
280 	disp_setup(maxglobpri, 0);
281 
282 	mutex_exit(&cpu_lock);
283 
284 	/*
285 	 * Platform specific sticky scheduler setup.
286 	 */
287 	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
288 		cmp_set_nosteal_interval();
289 
290 	/*
291 	 * Get the default class ID; this may be later modified via
292 	 * dispadmin(8).  This will load the class (normally TS) and that will
293 	 * call disp_add(), which is why we had to drop cpu_lock first.
294 	 */
295 	if (getcid(defaultclass, &defaultcid) != 0) {
296 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
297 		    defaultclass);
298 	}
299 }
300 
301 /*
302  * disp_add - Called with class pointer to initialize the dispatcher
303  *	      for a newly loaded class.
304  */
305 void
306 disp_add(sclass_t *clp)
307 {
308 	pri_t	maxglobpri;
309 	pri_t	cl_maxglobpri;
310 
311 	mutex_enter(&cpu_lock);
312 	/*
313 	 * Initialize the scheduler class.
314 	 */
315 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
316 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
317 	if (cl_maxglobpri > maxglobpri)
318 		maxglobpri = cl_maxglobpri;
319 
320 	/*
321 	 * Save old queue information.  Since we're initializing a
322 	 * new scheduling class which has just been loaded, then
323 	 * the size of the dispq may have changed.  We need to handle
324 	 * that here.
325 	 */
326 	disp_setup(maxglobpri, v.v_nglobpris);
327 
328 	mutex_exit(&cpu_lock);
329 }
330 
331 
332 /*
333  * For each CPU, allocate new dispatch queues
334  * with the stated number of priorities.
335  */
336 static void
337 cpu_dispqalloc(int numpris)
338 {
339 	cpu_t	*cpup;
340 	struct disp_queue_info	*disp_mem;
341 	int i, num;
342 
343 	ASSERT(MUTEX_HELD(&cpu_lock));
344 
345 	disp_mem = kmem_zalloc(NCPU *
346 	    sizeof (struct disp_queue_info), KM_SLEEP);
347 
348 	/*
349 	 * This routine must allocate all of the memory before stopping
350 	 * the cpus because it must not sleep in kmem_alloc while the
351 	 * CPUs are stopped.  Locks they hold will not be freed until they
352 	 * are restarted.
353 	 */
354 	i = 0;
355 	cpup = cpu_list;
356 	do {
357 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
358 		i++;
359 		cpup = cpup->cpu_next;
360 	} while (cpup != cpu_list);
361 	num = i;
362 
363 	pause_cpus(NULL, NULL);
364 	for (i = 0; i < num; i++)
365 		disp_dq_assign(&disp_mem[i], numpris);
366 	start_cpus();
367 
368 	/*
369 	 * I must free all of the memory after starting the cpus because
370 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
371 	 */
372 	for (i = 0; i < num; i++)
373 		disp_dq_free(&disp_mem[i]);
374 
375 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
376 }
377 
378 static void
379 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
380 {
381 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
382 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
383 	    sizeof (long), KM_SLEEP);
384 	dptr->dp = dp;
385 }
386 
387 static void
388 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
389 {
390 	disp_t	*dp;
391 
392 	dp = dptr->dp;
393 	dptr->olddispq = dp->disp_q;
394 	dptr->olddqactmap = dp->disp_qactmap;
395 	dptr->oldnglobpris = dp->disp_npri;
396 
397 	ASSERT(dptr->oldnglobpris < numpris);
398 
399 	if (dptr->olddispq != NULL) {
400 		/*
401 		 * Use kcopy because bcopy is platform-specific
402 		 * and could block while we might have paused the cpus.
403 		 */
404 		(void) kcopy(dptr->olddispq, dptr->newdispq,
405 		    dptr->oldnglobpris * sizeof (dispq_t));
406 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
407 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
408 		    sizeof (long));
409 	}
410 	dp->disp_q = dptr->newdispq;
411 	dp->disp_qactmap = dptr->newdqactmap;
412 	dp->disp_q_limit = &dptr->newdispq[numpris];
413 	dp->disp_npri = numpris;
414 }
415 
416 static void
417 disp_dq_free(struct disp_queue_info *dptr)
418 {
419 	if (dptr->olddispq != NULL)
420 		kmem_free(dptr->olddispq,
421 		    dptr->oldnglobpris * sizeof (dispq_t));
422 	if (dptr->olddqactmap != NULL)
423 		kmem_free(dptr->olddqactmap,
424 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
425 }
426 
427 /*
428  * For a newly created CPU, initialize the dispatch queue.
429  * This is called before the CPU is known through cpu[] or on any lists.
430  */
431 void
432 disp_cpu_init(cpu_t *cp)
433 {
434 	disp_t	*dp;
435 	dispq_t	*newdispq;
436 	ulong_t	*newdqactmap;
437 
438 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
439 
440 	if (cp == cpu0_disp.disp_cpu)
441 		dp = &cpu0_disp;
442 	else
443 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
444 	bzero(dp, sizeof (disp_t));
445 	cp->cpu_disp = dp;
446 	dp->disp_cpu = cp;
447 	dp->disp_maxrunpri = -1;
448 	dp->disp_max_unbound_pri = -1;
449 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
450 	/*
451 	 * Allocate memory for the dispatcher queue headers
452 	 * and the active queue bitmap.
453 	 */
454 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
455 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
456 	    sizeof (long), KM_SLEEP);
457 	dp->disp_q = newdispq;
458 	dp->disp_qactmap = newdqactmap;
459 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
460 	dp->disp_npri = v.v_nglobpris;
461 }
462 
463 void
464 disp_cpu_fini(cpu_t *cp)
465 {
466 	ASSERT(MUTEX_HELD(&cpu_lock));
467 
468 	disp_kp_free(cp->cpu_disp);
469 	if (cp->cpu_disp != &cpu0_disp)
470 		kmem_free(cp->cpu_disp, sizeof (disp_t));
471 }
472 
473 /*
474  * Allocate new, larger kpreempt dispatch queue to replace the old one.
475  */
476 void
477 disp_kp_alloc(disp_t *dq, pri_t npri)
478 {
479 	struct disp_queue_info	mem_info;
480 
481 	if (npri > dq->disp_npri) {
482 		/*
483 		 * Allocate memory for the new array.
484 		 */
485 		disp_dq_alloc(&mem_info, npri, dq);
486 
487 		/*
488 		 * We need to copy the old structures to the new
489 		 * and free the old.
490 		 */
491 		disp_dq_assign(&mem_info, npri);
492 		disp_dq_free(&mem_info);
493 	}
494 }
495 
496 /*
497  * Free dispatch queue.
498  * Used for the kpreempt queues for a removed CPU partition and
499  * for the per-CPU queues of deleted CPUs.
500  */
501 void
502 disp_kp_free(disp_t *dq)
503 {
504 	struct disp_queue_info	mem_info;
505 
506 	mem_info.olddispq = dq->disp_q;
507 	mem_info.olddqactmap = dq->disp_qactmap;
508 	mem_info.oldnglobpris = dq->disp_npri;
509 	disp_dq_free(&mem_info);
510 }
511 
512 /*
513  * End dispatcher and scheduler initialization.
514  */
515 
516 /*
517  * See if there's anything to do other than remain idle.
518  * Return non-zero if there is.
519  *
520  * This function must be called with high spl, or with
521  * kernel preemption disabled to prevent the partition's
522  * active cpu list from changing while being traversed.
523  *
524  * This is essentially a simpler version of disp_getwork()
525  * to be called by CPUs preparing to "halt".
526  */
527 int
528 disp_anywork(void)
529 {
530 	cpu_t		*cp = CPU;
531 	cpu_t		*ocp;
532 	volatile int	*local_nrunnable = &cp->cpu_disp->disp_nrunnable;
533 
534 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
535 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
536 			return (1);
537 
538 		for (ocp = cp->cpu_next_part; ocp != cp;
539 		    ocp = ocp->cpu_next_part) {
540 			ASSERT(CPU_ACTIVE(ocp));
541 
542 			/*
543 			 * Something has appeared on the local run queue.
544 			 */
545 			if (*local_nrunnable > 0)
546 				return (1);
547 			/*
548 			 * If we encounter another idle CPU that will
549 			 * soon be trolling around through disp_anywork()
550 			 * terminate our walk here and let this other CPU
551 			 * patrol the next part of the list.
552 			 */
553 			if (ocp->cpu_dispatch_pri == -1 &&
554 			    (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
555 				return (0);
556 			/*
557 			 * Work can be taken from another CPU if:
558 			 *	- There is unbound work on the run queue
559 			 *	- That work isn't a thread undergoing a
560 			 *	- context switch on an otherwise empty queue.
561 			 *	- The CPU isn't running the idle loop.
562 			 */
563 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
564 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
565 			    ocp->cpu_disp->disp_nrunnable == 1) &&
566 			    ocp->cpu_dispatch_pri != -1)
567 				return (1);
568 		}
569 	}
570 	return (0);
571 }
572 
573 /*
574  * Called when CPU enters the idle loop
575  */
576 static void
577 idle_enter()
578 {
579 	cpu_t		*cp = CPU;
580 
581 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
582 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
583 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
584 }
585 
586 /*
587  * Called when CPU exits the idle loop
588  */
589 static void
590 idle_exit()
591 {
592 	cpu_t		*cp = CPU;
593 
594 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
595 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
596 }
597 
598 /*
599  * Idle loop.
600  */
601 void
602 idle()
603 {
604 	struct cpu	*cp = CPU;		/* pointer to this CPU */
605 	kthread_t	*t;			/* taken thread */
606 
607 	idle_enter();
608 
609 	/*
610 	 * Uniprocessor version of idle loop.
611 	 * Do this until notified that we're on an actual multiprocessor.
612 	 */
613 	while (ncpus == 1) {
614 		if (cp->cpu_disp->disp_nrunnable == 0) {
615 			(*idle_cpu)();
616 			continue;
617 		}
618 		idle_exit();
619 		swtch();
620 
621 		idle_enter(); /* returned from swtch */
622 	}
623 
624 	/*
625 	 * Multiprocessor idle loop.
626 	 */
627 	for (;;) {
628 		/*
629 		 * If CPU is completely quiesced by p_online(2), just wait
630 		 * here with minimal bus traffic until put online.
631 		 */
632 		while (cp->cpu_flags & CPU_QUIESCED)
633 			(*idle_cpu)();
634 
635 		if (cp->cpu_disp->disp_nrunnable != 0) {
636 			idle_exit();
637 			swtch();
638 		} else {
639 			if (cp->cpu_flags & CPU_OFFLINE)
640 				continue;
641 			if ((t = disp_getwork(cp)) == NULL) {
642 				if (cp->cpu_chosen_level != -1) {
643 					disp_t *dp = cp->cpu_disp;
644 					disp_t *kpq;
645 
646 					disp_lock_enter(&dp->disp_lock);
647 					/*
648 					 * Set kpq under lock to prevent
649 					 * migration between partitions.
650 					 */
651 					kpq = &cp->cpu_part->cp_kp_queue;
652 					if (kpq->disp_maxrunpri == -1)
653 						cp->cpu_chosen_level = -1;
654 					disp_lock_exit(&dp->disp_lock);
655 				}
656 				(*idle_cpu)();
657 				continue;
658 			}
659 			/*
660 			 * If there was a thread but we couldn't steal
661 			 * it, then keep trying.
662 			 */
663 			if (t == T_DONTSTEAL)
664 				continue;
665 			idle_exit();
666 			swtch_to(t);
667 		}
668 		idle_enter(); /* returned from swtch/swtch_to */
669 	}
670 }
671 
672 
673 /*
674  * Preempt the currently running thread in favor of the highest
675  * priority thread.  The class of the current thread controls
676  * where it goes on the dispatcher queues. If panicking, turn
677  * preemption off.
678  */
679 void
680 preempt()
681 {
682 	kthread_t 	*t = curthread;
683 	klwp_t 		*lwp = ttolwp(curthread);
684 
685 	if (panicstr)
686 		return;
687 
688 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
689 
690 	thread_lock(t);
691 
692 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
693 		/*
694 		 * this thread has already been chosen to be run on
695 		 * another CPU. Clear kprunrun on this CPU since we're
696 		 * already headed for swtch().
697 		 */
698 		CPU->cpu_kprunrun = 0;
699 		thread_unlock_nopreempt(t);
700 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
701 	} else {
702 		if (lwp != NULL)
703 			lwp->lwp_ru.nivcsw++;
704 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
705 		THREAD_TRANSITION(t);
706 		CL_PREEMPT(t);
707 		DTRACE_SCHED(preempt);
708 		thread_unlock_nopreempt(t);
709 
710 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
711 
712 		swtch();		/* clears CPU->cpu_runrun via disp() */
713 	}
714 }
715 
716 extern kthread_t *thread_unpin();
717 
718 /*
719  * disp() - find the highest priority thread for this processor to run, and
720  * set it in TS_ONPROC state so that resume() can be called to run it.
721  */
722 static kthread_t *
723 disp()
724 {
725 	cpu_t		*cpup;
726 	disp_t		*dp;
727 	kthread_t	*tp;
728 	dispq_t		*dq;
729 	int		maxrunword;
730 	pri_t		pri;
731 	disp_t		*kpq;
732 
733 	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
734 
735 	cpup = CPU;
736 	/*
737 	 * Find the highest priority loaded, runnable thread.
738 	 */
739 	dp = cpup->cpu_disp;
740 
741 reschedule:
742 	/*
743 	 * If there is more important work on the global queue with a better
744 	 * priority than the maximum on this CPU, take it now.
745 	 */
746 	kpq = &cpup->cpu_part->cp_kp_queue;
747 	while ((pri = kpq->disp_maxrunpri) >= 0 &&
748 	    pri >= dp->disp_maxrunpri &&
749 	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
750 	    (tp = disp_getbest(kpq)) != NULL) {
751 		if (disp_ratify(tp, kpq) != NULL) {
752 			TRACE_1(TR_FAC_DISP, TR_DISP_END,
753 			    "disp_end:tid %p", tp);
754 			return (tp);
755 		}
756 	}
757 
758 	disp_lock_enter(&dp->disp_lock);
759 	pri = dp->disp_maxrunpri;
760 
761 	/*
762 	 * If there is nothing to run, look at what's runnable on other queues.
763 	 * Choose the idle thread if the CPU is quiesced.
764 	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
765 	 * interrupt threads, which will be the only threads on the CPU's own
766 	 * queue, but cannot run threads from other queues.
767 	 */
768 	if (pri == -1) {
769 		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
770 			disp_lock_exit(&dp->disp_lock);
771 			if ((tp = disp_getwork(cpup)) == NULL ||
772 			    tp == T_DONTSTEAL) {
773 				tp = cpup->cpu_idle_thread;
774 				(void) splhigh();
775 				THREAD_ONPROC(tp, cpup);
776 				cpup->cpu_dispthread = tp;
777 				cpup->cpu_dispatch_pri = -1;
778 				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
779 				cpup->cpu_chosen_level = -1;
780 			}
781 		} else {
782 			disp_lock_exit_high(&dp->disp_lock);
783 			tp = cpup->cpu_idle_thread;
784 			THREAD_ONPROC(tp, cpup);
785 			cpup->cpu_dispthread = tp;
786 			cpup->cpu_dispatch_pri = -1;
787 			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
788 			cpup->cpu_chosen_level = -1;
789 		}
790 		TRACE_1(TR_FAC_DISP, TR_DISP_END,
791 		    "disp_end:tid %p", tp);
792 		return (tp);
793 	}
794 
795 	dq = &dp->disp_q[pri];
796 	tp = dq->dq_first;
797 
798 	ASSERT(tp != NULL);
799 	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */
800 
801 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
802 
803 	/*
804 	 * Found it so remove it from queue.
805 	 */
806 	dp->disp_nrunnable--;
807 	dq->dq_sruncnt--;
808 	if ((dq->dq_first = tp->t_link) == NULL) {
809 		ulong_t	*dqactmap = dp->disp_qactmap;
810 
811 		ASSERT(dq->dq_sruncnt == 0);
812 		dq->dq_last = NULL;
813 
814 		/*
815 		 * The queue is empty, so the corresponding bit needs to be
816 		 * turned off in dqactmap.   If nrunnable != 0 just took the
817 		 * last runnable thread off the
818 		 * highest queue, so recompute disp_maxrunpri.
819 		 */
820 		maxrunword = pri >> BT_ULSHIFT;
821 		dqactmap[maxrunword] &= ~BT_BIW(pri);
822 
823 		if (dp->disp_nrunnable == 0) {
824 			dp->disp_max_unbound_pri = -1;
825 			dp->disp_maxrunpri = -1;
826 		} else {
827 			int ipri;
828 
829 			ipri = bt_gethighbit(dqactmap, maxrunword);
830 			dp->disp_maxrunpri = ipri;
831 			if (ipri < dp->disp_max_unbound_pri)
832 				dp->disp_max_unbound_pri = ipri;
833 		}
834 	} else {
835 		tp->t_link = NULL;
836 	}
837 
838 	/*
839 	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
840 	 * out this thread before we have a chance to run it.
841 	 * While running, it is protected against swapping by t_lock.
842 	 */
843 	tp->t_schedflag |= TS_DONT_SWAP;
844 	cpup->cpu_dispthread = tp;		/* protected by spl only */
845 	cpup->cpu_dispatch_pri = pri;
846 	ASSERT(pri == DISP_PRIO(tp));
847 	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
848 	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */
849 
850 	ASSERT(tp != NULL);
851 	TRACE_1(TR_FAC_DISP, TR_DISP_END,
852 	    "disp_end:tid %p", tp);
853 
854 	if (disp_ratify(tp, kpq) == NULL)
855 		goto reschedule;
856 
857 	return (tp);
858 }
859 
860 /*
861  * swtch()
862  *	Find best runnable thread and run it.
863  *	Called with the current thread already switched to a new state,
864  *	on a sleep queue, run queue, stopped, and not zombied.
865  *	May be called at any spl level less than or equal to LOCK_LEVEL.
866  *	Always drops spl to the base level (spl0()).
867  */
868 void
869 swtch()
870 {
871 	kthread_t	*t = curthread;
872 	kthread_t	*next;
873 	cpu_t		*cp;
874 
875 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
876 
877 	if (t->t_flag & T_INTR_THREAD)
878 		cpu_intr_swtch_enter(t);
879 
880 	if (t->t_intr != NULL) {
881 		/*
882 		 * We are an interrupt thread.  Setup and return
883 		 * the interrupted thread to be resumed.
884 		 */
885 		(void) splhigh();	/* block other scheduler action */
886 		cp = CPU;		/* now protected against migration */
887 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
888 		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
889 		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
890 		next = thread_unpin();
891 		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
892 		resume_from_intr(next);
893 	} else {
894 #ifdef	DEBUG
895 		if (t->t_state == TS_ONPROC &&
896 		    t->t_disp_queue->disp_cpu == CPU &&
897 		    t->t_preempt == 0) {
898 			thread_lock(t);
899 			ASSERT(t->t_state != TS_ONPROC ||
900 			    t->t_disp_queue->disp_cpu != CPU ||
901 			    t->t_preempt != 0);	/* cannot migrate */
902 			thread_unlock_nopreempt(t);
903 		}
904 #endif	/* DEBUG */
905 		cp = CPU;
906 		next = disp();		/* returns with spl high */
907 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
908 
909 		/* OK to steal anything left on run queue */
910 		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
911 
912 		if (next != t) {
913 			hrtime_t now;
914 
915 			now = gethrtime_unscaled();
916 			pg_ev_thread_swtch(cp, now, t, next);
917 
918 			/*
919 			 * If t was previously in the TS_ONPROC state,
920 			 * setfrontdq and setbackdq won't have set its t_waitrq.
921 			 * Since we now finally know that we're switching away
922 			 * from this thread, set its t_waitrq if it is on a run
923 			 * queue.
924 			 */
925 			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
926 				t->t_waitrq = now;
927 			}
928 
929 			/*
930 			 * restore mstate of thread that we are switching to
931 			 */
932 			restore_mstate(next);
933 
934 			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
935 			cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
936 			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
937 
938 			if (dtrace_vtime_active)
939 				dtrace_vtime_switch(next);
940 
941 			resume(next);
942 			/*
943 			 * The TR_RESUME_END and TR_SWTCH_END trace points
944 			 * appear at the end of resume(), because we may not
945 			 * return here
946 			 */
947 		} else {
948 			if (t->t_flag & T_INTR_THREAD)
949 				cpu_intr_swtch_exit(t);
950 			/*
951 			 * Threads that enqueue themselves on a run queue defer
952 			 * setting t_waitrq. It is then either set in swtch()
953 			 * when the CPU is actually yielded, or not at all if it
954 			 * is remaining on the CPU.
955 			 * There is however a window between where the thread
956 			 * placed itself on a run queue, and where it selects
957 			 * itself in disp(), where a third party (eg. clock()
958 			 * doing tick processing) may have re-enqueued this
959 			 * thread, setting t_waitrq in the process. We detect
960 			 * this race by noticing that despite switching to
961 			 * ourself, our t_waitrq has been set, and should be
962 			 * cleared.
963 			 */
964 			if (t->t_waitrq != 0)
965 				t->t_waitrq = 0;
966 
967 			pg_ev_thread_remain(cp, t);
968 
969 			DTRACE_SCHED(remain__cpu);
970 			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
971 			(void) spl0();
972 		}
973 	}
974 }
975 
976 /*
977  * swtch_from_zombie()
978  *	Special case of swtch(), which allows checks for TS_ZOMB to be
979  *	eliminated from normal resume.
980  *	Find best runnable thread and run it.
981  *	Called with the current thread zombied.
982  *	Zombies cannot migrate, so CPU references are safe.
983  */
984 void
985 swtch_from_zombie()
986 {
987 	kthread_t	*next;
988 	cpu_t		*cpu = CPU;
989 
990 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
991 
992 	ASSERT(curthread->t_state == TS_ZOMB);
993 
994 	next = disp();			/* returns with spl high */
995 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
996 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
997 	ASSERT(next != curthread);
998 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
999 
1000 	pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
1001 
1002 	restore_mstate(next);
1003 
1004 	if (dtrace_vtime_active)
1005 		dtrace_vtime_switch(next);
1006 
1007 	resume_from_zombie(next);
1008 	/*
1009 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1010 	 * appear at the end of resume(), because we certainly will not
1011 	 * return here
1012 	 */
1013 }
1014 
1015 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
1016 
1017 /*
1018  * search_disp_queues()
1019  *	Search the given dispatch queues for thread tp.
1020  *	Return 1 if tp is found, otherwise return 0.
1021  */
1022 static int
1023 search_disp_queues(disp_t *dp, kthread_t *tp)
1024 {
1025 	dispq_t		*dq;
1026 	dispq_t		*eq;
1027 
1028 	disp_lock_enter_high(&dp->disp_lock);
1029 
1030 	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1031 		kthread_t	*rp;
1032 
1033 		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1034 
1035 		for (rp = dq->dq_first; rp; rp = rp->t_link)
1036 			if (tp == rp) {
1037 				disp_lock_exit_high(&dp->disp_lock);
1038 				return (1);
1039 			}
1040 	}
1041 	disp_lock_exit_high(&dp->disp_lock);
1042 
1043 	return (0);
1044 }
1045 
1046 /*
1047  * thread_on_queue()
1048  *	Search all per-CPU dispatch queues and all partition-wide kpreempt
1049  *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
1050  */
1051 static int
1052 thread_on_queue(kthread_t *tp)
1053 {
1054 	cpu_t		*cp;
1055 	struct cpupart	*part;
1056 
1057 	ASSERT(getpil() >= DISP_LEVEL);
1058 
1059 	/*
1060 	 * Search the per-CPU dispatch queues for tp.
1061 	 */
1062 	cp = CPU;
1063 	do {
1064 		if (search_disp_queues(cp->cpu_disp, tp))
1065 			return (1);
1066 	} while ((cp = cp->cpu_next_onln) != CPU);
1067 
1068 	/*
1069 	 * Search the partition-wide kpreempt queues for tp.
1070 	 */
1071 	part = CPU->cpu_part;
1072 	do {
1073 		if (search_disp_queues(&part->cp_kp_queue, tp))
1074 			return (1);
1075 	} while ((part = part->cp_next) != CPU->cpu_part);
1076 
1077 	return (0);
1078 }
1079 
1080 #else
1081 
/*
 * Stub used when the queue-searching debug code is compiled out.  It always
 * evaluates to 0 ("not on a queue"), so it may only be used in the form
 * ASSERT(!thread_on_queue(tp)).
 */
#define	thread_on_queue(tp)	(0)
1083 
1084 #endif  /* DEBUG */
1085 
1086 /*
1087  * like swtch(), but switch to a specified thread taken from another CPU.
1088  *	called with spl high..
1089  */
1090 void
1091 swtch_to(kthread_t *next)
1092 {
1093 	cpu_t			*cp = CPU;
1094 	hrtime_t		now;
1095 
1096 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1097 
1098 	/*
1099 	 * Update context switch statistics.
1100 	 */
1101 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1102 
1103 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1104 
1105 	now = gethrtime_unscaled();
1106 	pg_ev_thread_swtch(cp, now, curthread, next);
1107 
1108 	/* OK to steal anything left on run queue */
1109 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1110 
1111 	/* record last execution time */
1112 	cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1113 
1114 	/*
1115 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1116 	 * won't have set its t_waitrq.  Since we now finally know that we're
1117 	 * switching away from this thread, set its t_waitrq if it is on a run
1118 	 * queue.
1119 	 */
1120 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1121 		curthread->t_waitrq = now;
1122 	}
1123 
1124 	/* restore next thread to previously running microstate */
1125 	restore_mstate(next);
1126 
1127 	if (dtrace_vtime_active)
1128 		dtrace_vtime_switch(next);
1129 
1130 	resume(next);
1131 	/*
1132 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1133 	 * appear at the end of resume(), because we may not
1134 	 * return here
1135 	 */
1136 }
1137 
1138 static void
1139 cpu_resched(cpu_t *cp, pri_t tpri)
1140 {
1141 	int	call_poke_cpu = 0;
1142 	pri_t   cpupri = cp->cpu_dispatch_pri;
1143 
1144 	if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
1145 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1146 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1147 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1148 			cp->cpu_runrun = 1;
1149 			aston(cp->cpu_dispthread);
1150 			if (tpri < kpreemptpri && cp != CPU)
1151 				call_poke_cpu = 1;
1152 		}
1153 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1154 			cp->cpu_kprunrun = 1;
1155 			if (cp != CPU)
1156 				call_poke_cpu = 1;
1157 		}
1158 	}
1159 
1160 	/*
1161 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1162 	 */
1163 	membar_enter();
1164 
1165 	if (call_poke_cpu)
1166 		poke_cpu(cp->cpu_id);
1167 }
1168 
/*
 * Run queue balancing parameters for setbackdq().
 *
 * setbackdq() tries to keep the length of the run queue it picks within
 * RUNQ_MAX_DIFF of the neighbouring run queue.  For thread priorities below
 * RUNQ_MATCH_PRI no slack is allowed and the lengths have to match; the same
 * strict matching is applied at any priority when the thread has the
 * TS_RUNQMATCH flag set.
 *
 * RUNQ_LEN() yields the number of threads queued at priority `prio' on the
 * dispatch queue of CPU `cpu'.
 */
#define	RUNQ_MATCH_PRI	16	/* below this pri, queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* largest tolerated runq length difference */
#define	RUNQ_LEN(cpu, prio)	((cpu)->cpu_disp->disp_q[(prio)].dq_sruncnt)
1179 
/*
 * Macro that evaluates to true if it is likely that the thread has cache
 * warmth. This is based on the amount of time that has elapsed since the
 * thread last ran. If that amount of time is no more than "rechoose_interval"
 * ticks, then we decide that the thread has enough cache warmth to warrant
 * some affinity for t->t_cpu.  The current thread is always considered warm.
 *
 * The argument is parenthesized so that any pointer-valued expression (for
 * example "*pp") may be passed; note that it is still evaluated twice, so it
 * must be free of side effects.
 */
#define	THREAD_HAS_CACHE_WARMTH(thread)	\
	(((thread) == curthread) ||	\
	((ddi_get_lbolt() - (thread)->t_disp_time) <= rechoose_interval))
1190 /*
1191  * Put the specified thread on the back of the dispatcher
1192  * queue corresponding to its current priority.
1193  *
1194  * Called with the thread in transition, onproc or stopped state
1195  * and locked (transition implies locked) and at high spl.
1196  * Returns with the thread in TS_RUN state and still locked.
1197  */
1198 void
1199 setbackdq(kthread_t *tp)
1200 {
1201 	dispq_t	*dq;
1202 	disp_t		*dp;
1203 	cpu_t		*cp;
1204 	pri_t		tpri;
1205 	int		bound;
1206 	boolean_t	self;
1207 
1208 	ASSERT(THREAD_LOCK_HELD(tp));
1209 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1210 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1211 
1212 	/*
1213 	 * If thread is "swapped" or on the swap queue don't
1214 	 * queue it, but wake sched.
1215 	 */
1216 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1217 		disp_swapped_setrun(tp);
1218 		return;
1219 	}
1220 
1221 	self = (tp == curthread);
1222 
1223 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1224 		bound = 1;
1225 	else
1226 		bound = 0;
1227 
1228 	tpri = DISP_PRIO(tp);
1229 	if (ncpus == 1)
1230 		cp = tp->t_cpu;
1231 	else if (!bound) {
1232 		if (tpri >= kpqpri) {
1233 			setkpdq(tp, SETKP_BACK);
1234 			return;
1235 		}
1236 
1237 		/*
1238 		 * We'll generally let this thread continue to run where
1239 		 * it last ran...but will consider migration if:
1240 		 * - The thread probably doesn't have much cache warmth.
1241 		 * - SMT exclusion would prefer us to run elsewhere
1242 		 * - The CPU where it last ran is the target of an offline
1243 		 *   request.
1244 		 * - The thread last ran outside its home lgroup.
1245 		 */
1246 		if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1247 		    !smt_should_run(tp, tp->t_cpu) ||
1248 		    (tp->t_cpu == cpu_inmotion) ||
1249 		    !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1250 			cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1251 		} else {
1252 			cp = tp->t_cpu;
1253 		}
1254 
1255 		if (tp->t_cpupart == cp->cpu_part) {
1256 			int	qlen;
1257 
1258 			/*
1259 			 * Perform any CMT load balancing
1260 			 */
1261 			cp = cmt_balance(tp, cp);
1262 
1263 			/*
1264 			 * Balance across the run queues
1265 			 */
1266 			qlen = RUNQ_LEN(cp, tpri);
1267 			if (tpri >= RUNQ_MATCH_PRI &&
1268 			    !(tp->t_schedflag & TS_RUNQMATCH))
1269 				qlen -= RUNQ_MAX_DIFF;
1270 			if (qlen > 0) {
1271 				cpu_t *newcp;
1272 
1273 				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1274 					newcp = cp->cpu_next_part;
1275 				} else if ((newcp = cp->cpu_next_lpl) == cp) {
1276 					newcp = cp->cpu_next_part;
1277 				}
1278 
1279 				if (smt_should_run(tp, newcp) &&
1280 				    RUNQ_LEN(newcp, tpri) < qlen) {
1281 					DTRACE_PROBE3(runq__balance,
1282 					    kthread_t *, tp,
1283 					    cpu_t *, cp, cpu_t *, newcp);
1284 					cp = newcp;
1285 				}
1286 			}
1287 		} else {
1288 			/*
1289 			 * Migrate to a cpu in the new partition.
1290 			 */
1291 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
1292 			    tp->t_pri);
1293 		}
1294 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1295 	} else {
1296 		/*
1297 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1298 		 * a short time until weak binding that existed when the
1299 		 * strong binding was established has dropped) so we must
1300 		 * favour weak binding over strong.
1301 		 */
1302 		cp = tp->t_weakbound_cpu ?
1303 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1304 	}
1305 	/*
1306 	 * A thread that is ONPROC may be temporarily placed on the run queue
1307 	 * but then chosen to run again by disp.  If the thread we're placing on
1308 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1309 	 * replacement process is actually scheduled in swtch().  In this
1310 	 * situation, curthread is the only thread that could be in the ONPROC
1311 	 * state.
1312 	 */
1313 	if ((!self) && (tp->t_waitrq == 0)) {
1314 		hrtime_t curtime;
1315 
1316 		curtime = gethrtime_unscaled();
1317 		(void) cpu_update_pct(tp, curtime);
1318 		tp->t_waitrq = curtime;
1319 	} else {
1320 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1321 	}
1322 
1323 	dp = cp->cpu_disp;
1324 	disp_lock_enter_high(&dp->disp_lock);
1325 
1326 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1327 	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1328 	    tpri, cp, tp);
1329 
1330 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1331 
1332 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1333 	tp->t_disp_queue = dp;
1334 	tp->t_link = NULL;
1335 
1336 	dq = &dp->disp_q[tpri];
1337 	dp->disp_nrunnable++;
1338 	if (!bound)
1339 		dp->disp_steal = 0;
1340 	membar_enter();
1341 
1342 	if (dq->dq_sruncnt++ != 0) {
1343 		ASSERT(dq->dq_first != NULL);
1344 		dq->dq_last->t_link = tp;
1345 		dq->dq_last = tp;
1346 	} else {
1347 		ASSERT(dq->dq_first == NULL);
1348 		ASSERT(dq->dq_last == NULL);
1349 		dq->dq_first = dq->dq_last = tp;
1350 		BT_SET(dp->disp_qactmap, tpri);
1351 		if (tpri > dp->disp_maxrunpri) {
1352 			dp->disp_maxrunpri = tpri;
1353 			membar_enter();
1354 			cpu_resched(cp, tpri);
1355 		}
1356 	}
1357 
1358 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1359 		if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1360 			/*
1361 			 * If there are no other unbound threads on the
1362 			 * run queue, don't allow other CPUs to steal
1363 			 * this thread while we are in the middle of a
1364 			 * context switch. We may just switch to it
1365 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1366 			 * in swtch and swtch_to.
1367 			 */
1368 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1369 		}
1370 		dp->disp_max_unbound_pri = tpri;
1371 	}
1372 	(*disp_enq_thread)(cp, bound);
1373 }
1374 
1375 /*
1376  * Put the specified thread on the front of the dispatcher
1377  * queue corresponding to its current priority.
1378  *
1379  * Called with the thread in transition, onproc or stopped state
1380  * and locked (transition implies locked) and at high spl.
1381  * Returns with the thread in TS_RUN state and still locked.
1382  */
1383 void
1384 setfrontdq(kthread_t *tp)
1385 {
1386 	disp_t		*dp;
1387 	dispq_t		*dq;
1388 	cpu_t		*cp;
1389 	pri_t		tpri;
1390 	int		bound;
1391 
1392 	ASSERT(THREAD_LOCK_HELD(tp));
1393 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1394 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1395 
1396 	/*
1397 	 * If thread is "swapped" or on the swap queue don't
1398 	 * queue it, but wake sched.
1399 	 */
1400 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1401 		disp_swapped_setrun(tp);
1402 		return;
1403 	}
1404 
1405 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1406 		bound = 1;
1407 	else
1408 		bound = 0;
1409 
1410 	tpri = DISP_PRIO(tp);
1411 	if (ncpus == 1)
1412 		cp = tp->t_cpu;
1413 	else if (!bound) {
1414 		if (tpri >= kpqpri) {
1415 			setkpdq(tp, SETKP_FRONT);
1416 			return;
1417 		}
1418 		cp = tp->t_cpu;
1419 		if (tp->t_cpupart == cp->cpu_part) {
1420 			/*
1421 			 * We'll generally let this thread continue to run
1422 			 * where it last ran, but will consider migration if:
1423 			 * - The thread last ran outside its home lgroup.
1424 			 * - The CPU where it last ran is the target of an
1425 			 *   offline request (a thread_nomigrate() on the in
1426 			 *   motion CPU relies on this when forcing a preempt).
1427 			 * - The thread isn't the highest priority thread where
1428 			 *   it last ran, and it is considered not likely to
1429 			 *   have significant cache warmth.
1430 			 */
1431 			if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
1432 			    cp == cpu_inmotion ||
1433 			    (tpri < cp->cpu_disp->disp_maxrunpri &&
1434 			    !THREAD_HAS_CACHE_WARMTH(tp))) {
1435 				cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1436 			}
1437 		} else {
1438 			/*
1439 			 * Migrate to a cpu in the new partition.
1440 			 */
1441 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1442 			    tp, tp->t_pri);
1443 		}
1444 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1445 	} else {
1446 		/*
1447 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1448 		 * a short time until weak binding that existed when the
1449 		 * strong binding was established has dropped) so we must
1450 		 * favour weak binding over strong.
1451 		 */
1452 		cp = tp->t_weakbound_cpu ?
1453 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1454 	}
1455 
1456 	/*
1457 	 * A thread that is ONPROC may be temporarily placed on the run queue
1458 	 * but then chosen to run again by disp.  If the thread we're placing on
1459 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1460 	 * replacement process is actually scheduled in swtch().  In this
1461 	 * situation, curthread is the only thread that could be in the ONPROC
1462 	 * state.
1463 	 */
1464 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1465 		hrtime_t curtime;
1466 
1467 		curtime = gethrtime_unscaled();
1468 		(void) cpu_update_pct(tp, curtime);
1469 		tp->t_waitrq = curtime;
1470 	} else {
1471 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1472 	}
1473 
1474 	dp = cp->cpu_disp;
1475 	disp_lock_enter_high(&dp->disp_lock);
1476 
1477 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1478 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1479 
1480 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1481 
1482 	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
1483 	tp->t_disp_queue = dp;
1484 
1485 	dq = &dp->disp_q[tpri];
1486 	dp->disp_nrunnable++;
1487 	if (!bound)
1488 		dp->disp_steal = 0;
1489 	membar_enter();
1490 
1491 	if (dq->dq_sruncnt++ != 0) {
1492 		ASSERT(dq->dq_last != NULL);
1493 		tp->t_link = dq->dq_first;
1494 		dq->dq_first = tp;
1495 	} else {
1496 		ASSERT(dq->dq_last == NULL);
1497 		ASSERT(dq->dq_first == NULL);
1498 		tp->t_link = NULL;
1499 		dq->dq_first = dq->dq_last = tp;
1500 		BT_SET(dp->disp_qactmap, tpri);
1501 		if (tpri > dp->disp_maxrunpri) {
1502 			dp->disp_maxrunpri = tpri;
1503 			membar_enter();
1504 			cpu_resched(cp, tpri);
1505 		}
1506 	}
1507 
1508 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1509 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1510 		    cp == CPU) {
1511 			/*
1512 			 * If there are no other unbound threads on the
1513 			 * run queue, don't allow other CPUs to steal
1514 			 * this thread while we are in the middle of a
1515 			 * context switch. We may just switch to it
1516 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1517 			 * in swtch and swtch_to.
1518 			 */
1519 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1520 		}
1521 		dp->disp_max_unbound_pri = tpri;
1522 	}
1523 	(*disp_enq_thread)(cp, bound);
1524 }
1525 
1526 /*
1527  * Put a high-priority unbound thread on the kp queue
1528  */
1529 static void
1530 setkpdq(kthread_t *tp, int borf)
1531 {
1532 	dispq_t	*dq;
1533 	disp_t	*dp;
1534 	cpu_t	*cp;
1535 	pri_t	tpri;
1536 
1537 	tpri = DISP_PRIO(tp);
1538 
1539 	dp = &tp->t_cpupart->cp_kp_queue;
1540 	disp_lock_enter_high(&dp->disp_lock);
1541 
1542 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1543 
1544 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1545 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1546 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1547 	tp->t_disp_queue = dp;
1548 	dp->disp_nrunnable++;
1549 	dq = &dp->disp_q[tpri];
1550 
1551 	if (dq->dq_sruncnt++ != 0) {
1552 		if (borf == SETKP_BACK) {
1553 			ASSERT(dq->dq_first != NULL);
1554 			tp->t_link = NULL;
1555 			dq->dq_last->t_link = tp;
1556 			dq->dq_last = tp;
1557 		} else {
1558 			ASSERT(dq->dq_last != NULL);
1559 			tp->t_link = dq->dq_first;
1560 			dq->dq_first = tp;
1561 		}
1562 	} else {
1563 		if (borf == SETKP_BACK) {
1564 			ASSERT(dq->dq_first == NULL);
1565 			ASSERT(dq->dq_last == NULL);
1566 			dq->dq_first = dq->dq_last = tp;
1567 		} else {
1568 			ASSERT(dq->dq_last == NULL);
1569 			ASSERT(dq->dq_first == NULL);
1570 			tp->t_link = NULL;
1571 			dq->dq_first = dq->dq_last = tp;
1572 		}
1573 		BT_SET(dp->disp_qactmap, tpri);
1574 		if (tpri > dp->disp_max_unbound_pri)
1575 			dp->disp_max_unbound_pri = tpri;
1576 		if (tpri > dp->disp_maxrunpri) {
1577 			dp->disp_maxrunpri = tpri;
1578 			membar_enter();
1579 		}
1580 	}
1581 
1582 	cp = tp->t_cpu;
1583 	if (tp->t_cpupart != cp->cpu_part) {
1584 		/* migrate to a cpu in the new partition */
1585 		cp = tp->t_cpupart->cp_cpulist;
1586 	}
1587 	cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
1588 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1589 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1590 
1591 	if (cp->cpu_chosen_level < tpri)
1592 		cp->cpu_chosen_level = tpri;
1593 	cpu_resched(cp, tpri);
1594 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1595 	(*disp_enq_thread)(cp, 0);
1596 }
1597 
1598 /*
1599  * Remove a thread from the dispatcher queue if it is on it.
1600  * It is not an error if it is not found but we return whether
1601  * or not it was found in case the caller wants to check.
1602  */
1603 int
1604 dispdeq(kthread_t *tp)
1605 {
1606 	disp_t		*dp;
1607 	dispq_t		*dq;
1608 	kthread_t	*rp;
1609 	kthread_t	*trp;
1610 	kthread_t	**ptp;
1611 	int		tpri;
1612 
1613 	ASSERT(THREAD_LOCK_HELD(tp));
1614 
1615 	if (tp->t_state != TS_RUN)
1616 		return (0);
1617 
1618 	/*
1619 	 * The thread is "swapped" or is on the swap queue and
1620 	 * hence no longer on the run queue, so return true.
1621 	 */
1622 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1623 		return (1);
1624 
1625 	tpri = DISP_PRIO(tp);
1626 	dp = tp->t_disp_queue;
1627 	ASSERT(tpri < dp->disp_npri);
1628 	dq = &dp->disp_q[tpri];
1629 	ptp = &dq->dq_first;
1630 	rp = *ptp;
1631 	trp = NULL;
1632 
1633 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1634 
1635 	/*
1636 	 * Search for thread in queue.
1637 	 * Double links would simplify this at the expense of disp/setrun.
1638 	 */
1639 	while (rp != tp && rp != NULL) {
1640 		trp = rp;
1641 		ptp = &trp->t_link;
1642 		rp = trp->t_link;
1643 	}
1644 
1645 	if (rp == NULL) {
1646 		panic("dispdeq: thread not on queue");
1647 	}
1648 
1649 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1650 
1651 	/*
1652 	 * Found it so remove it from queue.
1653 	 */
1654 	if ((*ptp = rp->t_link) == NULL)
1655 		dq->dq_last = trp;
1656 
1657 	dp->disp_nrunnable--;
1658 	if (--dq->dq_sruncnt == 0) {
1659 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1660 		if (dp->disp_nrunnable == 0) {
1661 			dp->disp_max_unbound_pri = -1;
1662 			dp->disp_maxrunpri = -1;
1663 		} else if (tpri == dp->disp_maxrunpri) {
1664 			int ipri;
1665 
1666 			ipri = bt_gethighbit(dp->disp_qactmap,
1667 			    dp->disp_maxrunpri >> BT_ULSHIFT);
1668 			if (ipri < dp->disp_max_unbound_pri)
1669 				dp->disp_max_unbound_pri = ipri;
1670 			dp->disp_maxrunpri = ipri;
1671 		}
1672 	}
1673 	tp->t_link = NULL;
1674 	THREAD_TRANSITION(tp);		/* put in intermediate state */
1675 	return (1);
1676 }
1677 
1678 
1679 /*
1680  * dq_sruninc and dq_srundec are public functions for
1681  * incrementing/decrementing the sruncnts when a thread on
1682  * a dispatcher queue is made schedulable/unschedulable by
1683  * resetting the TS_LOAD flag.
1684  *
1685  * The caller MUST have the thread lock and therefore the dispatcher
1686  * queue lock so that the operation which changes
1687  * the flag, the operation that checks the status of the thread to
1688  * determine if it's on a disp queue AND the call to this function
1689  * are one atomic operation with respect to interrupts.
1690  */
1691 
1692 /*
1693  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1694  */
1695 void
1696 dq_sruninc(kthread_t *t)
1697 {
1698 	ASSERT(t->t_state == TS_RUN);
1699 	ASSERT(t->t_schedflag & TS_LOAD);
1700 
1701 	THREAD_TRANSITION(t);
1702 	setfrontdq(t);
1703 }
1704 
1705 /*
1706  * See comment on calling conventions above.
1707  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1708  */
1709 void
1710 dq_srundec(kthread_t *t)
1711 {
1712 	ASSERT(t->t_schedflag & TS_LOAD);
1713 
1714 	(void) dispdeq(t);
1715 	disp_swapped_enq(t);
1716 }
1717 
1718 /*
1719  * Change the dispatcher lock of thread to the "swapped_lock"
1720  * and return with thread lock still held.
1721  *
1722  * Called with thread_lock held, in transition state, and at high spl.
1723  */
1724 void
1725 disp_swapped_enq(kthread_t *tp)
1726 {
1727 	ASSERT(THREAD_LOCK_HELD(tp));
1728 	ASSERT(tp->t_schedflag & TS_LOAD);
1729 
1730 	switch (tp->t_state) {
1731 	case TS_RUN:
1732 		disp_lock_enter_high(&swapped_lock);
1733 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1734 		break;
1735 	case TS_ONPROC:
1736 		disp_lock_enter_high(&swapped_lock);
1737 		THREAD_TRANSITION(tp);
1738 		wake_sched_sec = 1;		/* tell clock to wake sched */
1739 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1740 		break;
1741 	default:
1742 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1743 	}
1744 }
1745 
1746 /*
1747  * This routine is called by setbackdq/setfrontdq if the thread is
1748  * not loaded or loaded and on the swap queue.
1749  *
1750  * Thread state TS_SLEEP implies that a swapped thread
1751  * has been woken up and needs to be swapped in by the swapper.
1752  *
1753  * Thread state TS_RUN, it implies that the priority of a swapped
1754  * thread is being increased by scheduling class (e.g. ts_update).
1755  */
1756 static void
1757 disp_swapped_setrun(kthread_t *tp)
1758 {
1759 	ASSERT(THREAD_LOCK_HELD(tp));
1760 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1761 
1762 	switch (tp->t_state) {
1763 	case TS_SLEEP:
1764 		disp_lock_enter_high(&swapped_lock);
1765 		/*
1766 		 * Wakeup sched immediately (i.e., next tick) if the
1767 		 * thread priority is above maxclsyspri.
1768 		 */
1769 		if (DISP_PRIO(tp) > maxclsyspri)
1770 			wake_sched = 1;
1771 		else
1772 			wake_sched_sec = 1;
1773 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1774 		break;
1775 	case TS_RUN:				/* called from ts_update */
1776 		break;
1777 	default:
1778 		panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1779 	}
1780 }
1781 
1782 /*
1783  *	Make a thread give up its processor.  Find the processor on
1784  *	which this thread is executing, and have that processor
1785  *	preempt.
1786  *
1787  *	We allow System Duty Cycle (SDC) threads to be preempted even if
1788  *	they are running at kernel priorities.  To implement this, we always
1789  *	set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
1790  *	calls cpu_surrender() very often, we only preempt if there is anyone
1791  *	competing with us.
1792  */
1793 void
1794 cpu_surrender(kthread_t *tp)
1795 {
1796 	cpu_t	*cpup;
1797 	int	max_pri;
1798 	int	max_run_pri;
1799 	klwp_t	*lwp;
1800 
1801 	ASSERT(THREAD_LOCK_HELD(tp));
1802 
1803 	if (tp->t_state != TS_ONPROC)
1804 		return;
1805 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
1806 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1807 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1808 	if (max_pri < max_run_pri)
1809 		max_pri = max_run_pri;
1810 
1811 	if (tp->t_cid == sysdccid) {
1812 		uint_t t_pri = DISP_PRIO(tp);
1813 		if (t_pri > max_pri)
1814 			return;		/* we are not competing w/ anyone */
1815 		cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1816 	} else {
1817 		cpup->cpu_runrun = 1;
1818 		if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1819 			cpup->cpu_kprunrun = 1;
1820 		}
1821 	}
1822 
1823 	/*
1824 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1825 	 */
1826 	membar_enter();
1827 
1828 	DTRACE_SCHED1(surrender, kthread_t *, tp);
1829 
1830 	/*
1831 	 * Make the target thread take an excursion through trap()
1832 	 * to do preempt() (unless we're already in trap or post_syscall,
1833 	 * calling cpu_surrender via CL_TRAPRET).
1834 	 */
1835 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1836 	    lwp->lwp_state != LWP_USER) {
1837 		aston(tp);
1838 		if (cpup != CPU)
1839 			poke_cpu(cpup->cpu_id);
1840 	}
1841 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1842 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
1843 }
1844 
1845 /*
1846  * Commit to and ratify a scheduling decision
1847  */
1848 /*ARGSUSED*/
1849 static kthread_t *
1850 disp_ratify(kthread_t *tp, disp_t *kpq)
1851 {
1852 	pri_t	tpri, maxpri;
1853 	pri_t	maxkpri;
1854 	cpu_t	*cpup;
1855 
1856 	ASSERT(tp != NULL);
1857 	/*
1858 	 * Commit to, then ratify scheduling decision
1859 	 */
1860 	cpup = CPU;
1861 	if (cpup->cpu_runrun != 0)
1862 		cpup->cpu_runrun = 0;
1863 	if (cpup->cpu_kprunrun != 0)
1864 		cpup->cpu_kprunrun = 0;
1865 	if (cpup->cpu_chosen_level != -1)
1866 		cpup->cpu_chosen_level = -1;
1867 	membar_enter();
1868 	tpri = DISP_PRIO(tp);
1869 	maxpri = cpup->cpu_disp->disp_maxrunpri;
1870 	maxkpri = kpq->disp_maxrunpri;
1871 	if (maxpri < maxkpri)
1872 		maxpri = maxkpri;
1873 	if (tpri < maxpri) {
1874 		/*
1875 		 * should have done better
1876 		 * put this one back and indicate to try again
1877 		 */
1878 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
1879 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1880 		thread_lock_high(tp);
1881 		THREAD_TRANSITION(tp);
1882 		setfrontdq(tp);
1883 		thread_unlock_nopreempt(tp);
1884 
1885 		tp = NULL;
1886 	}
1887 	return (tp);
1888 }
1889 
1890 /*
1891  * See if there is any work on the dispatcher queue for other CPUs.
1892  * If there is, dequeue the best thread and return.
1893  */
1894 static kthread_t *
1895 disp_getwork(cpu_t *cp)
1896 {
1897 	cpu_t		*ocp;		/* other CPU */
1898 	cpu_t		*ocp_start;
1899 	cpu_t		*tcp;		/* target local CPU */
1900 	kthread_t	*tp;
1901 	kthread_t	*retval = NULL;
1902 	pri_t		maxpri;
1903 	disp_t		*kpq;		/* kp queue for this partition */
1904 	lpl_t		*lpl, *lpl_leaf;
1905 	int		leafidx, startidx;
1906 	hrtime_t	stealtime;
1907 	lgrp_id_t	local_id;
1908 
1909 	maxpri = -1;
1910 	tcp = NULL;
1911 
1912 	kpq = &cp->cpu_part->cp_kp_queue;
1913 	while (kpq->disp_maxrunpri >= 0) {
1914 		/*
1915 		 * Try to take a thread from the kp_queue.
1916 		 */
1917 		tp = (disp_getbest(kpq));
1918 		if (tp)
1919 			return (disp_ratify(tp, kpq));
1920 	}
1921 
1922 	kpreempt_disable();		/* protect the cpu_active list */
1923 
1924 	/*
1925 	 * Try to find something to do on another CPU's run queue.
1926 	 * Loop through all other CPUs looking for the one with the highest
1927 	 * priority unbound thread.
1928 	 *
1929 	 * On NUMA machines, the partition's CPUs are consulted in order of
1930 	 * distance from the current CPU. This way, the first available
1931 	 * work found is also the closest, and will suffer the least
1932 	 * from being migrated.
1933 	 */
1934 	lpl = lpl_leaf = cp->cpu_lpl;
1935 	local_id = lpl_leaf->lpl_lgrpid;
1936 	leafidx = startidx = 0;
1937 
1938 	/*
1939 	 * This loop traverses the lpl hierarchy. Higher level lpls represent
1940 	 * broader levels of locality
1941 	 */
1942 	do {
1943 		/* This loop iterates over the lpl's leaves */
1944 		do {
1945 			if (lpl_leaf != cp->cpu_lpl)
1946 				ocp = lpl_leaf->lpl_cpus;
1947 			else
1948 				ocp = cp->cpu_next_lpl;
1949 
1950 			/* This loop iterates over the CPUs in the leaf */
1951 			ocp_start = ocp;
1952 			do {
1953 				pri_t pri;
1954 
1955 				ASSERT(CPU_ACTIVE(ocp));
1956 
1957 				/*
1958 				 * End our stroll around this lpl if:
1959 				 *
1960 				 * - Something became runnable on the local
1961 				 *   queue...which also ends our stroll around
1962 				 *   the partition.
1963 				 *
1964 				 * - We happen across another idle CPU.
1965 				 *   Since it is patrolling the next portion
1966 				 *   of the lpl's list (assuming it's not
1967 				 *   halted, or busy servicing an interrupt),
1968 				 *   move to the next higher level of locality.
1969 				 */
1970 				if (cp->cpu_disp->disp_nrunnable != 0) {
1971 					kpreempt_enable();
1972 					return (NULL);
1973 				}
1974 				if (ocp->cpu_dispatch_pri == -1) {
1975 					if (ocp->cpu_disp_flags &
1976 					    CPU_DISP_HALTED ||
1977 					    ocp->cpu_intr_actv != 0)
1978 						continue;
1979 					else
1980 						goto next_level;
1981 				}
1982 
1983 				/*
1984 				 * If there's only one thread and the CPU
1985 				 * is in the middle of a context switch,
1986 				 * or it's currently running the idle thread,
1987 				 * don't steal it.
1988 				 */
1989 				if ((ocp->cpu_disp_flags &
1990 				    CPU_DISP_DONTSTEAL) &&
1991 				    ocp->cpu_disp->disp_nrunnable == 1)
1992 					continue;
1993 
1994 				pri = ocp->cpu_disp->disp_max_unbound_pri;
1995 				if (pri > maxpri) {
1996 					/*
1997 					 * Don't steal threads that we attempted
1998 					 * to steal recently until they're ready
1999 					 * to be stolen again.
2000 					 */
2001 					stealtime = ocp->cpu_disp->disp_steal;
2002 					if (stealtime == 0 ||
2003 					    stealtime - gethrtime() <= 0) {
2004 						maxpri = pri;
2005 						tcp = ocp;
2006 					} else {
2007 						/*
2008 						 * Don't update tcp, just set
2009 						 * the retval to T_DONTSTEAL, so
2010 						 * that if no acceptable CPUs
2011 						 * are found the return value
2012 						 * will be T_DONTSTEAL rather
2013 						 * then NULL.
2014 						 */
2015 						retval = T_DONTSTEAL;
2016 					}
2017 				}
2018 			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2019 
2020 			/*
2021 			 * Iterate to the next leaf lpl in the resource set
2022 			 * at this level of locality. If we hit the end of
2023 			 * the set, wrap back around to the beginning.
2024 			 *
2025 			 * Note: This iteration is NULL terminated for a reason
2026 			 * see lpl_topo_bootstrap() in lgrp.c for details.
2027 			 */
2028 			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2029 				leafidx = 0;
2030 				lpl_leaf = lpl->lpl_rset[leafidx];
2031 			}
2032 		} while (leafidx != startidx);
2033 
2034 next_level:
2035 		/*
2036 		 * Expand the search to include farther away CPUs (next
2037 		 * locality level). The closer CPUs that have already been
2038 		 * checked will be checked again. In doing so, idle CPUs
2039 		 * will tend to be more aggresive about stealing from CPUs
2040 		 * that are closer (since the closer CPUs will be considered
2041 		 * more often).
2042 		 * Begin at this level with the CPUs local leaf lpl.
2043 		 */
2044 		if ((lpl = lpl->lpl_parent) != NULL) {
2045 			leafidx = startidx = lpl->lpl_id2rset[local_id];
2046 			lpl_leaf = lpl->lpl_rset[leafidx];
2047 		}
2048 	} while (!tcp && lpl);
2049 
2050 	kpreempt_enable();
2051 
2052 	/*
2053 	 * If another queue looks good, and there is still nothing on
2054 	 * the local queue, try to transfer one or more threads
2055 	 * from it to our queue.
2056 	 */
2057 	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2058 		tp = disp_getbest(tcp->cpu_disp);
2059 		if (tp == NULL || tp == T_DONTSTEAL)
2060 			return (tp);
2061 		return (disp_ratify(tp, kpq));
2062 	}
2063 	return (retval);
2064 }
2065 
2066 
2067 /*
2068  * disp_fix_unbound_pri()
2069  *	Determines the maximum priority of unbound threads on the queue.
2070  *	The priority is kept for the queue, but is only increased, never
2071  *	reduced unless some CPU is looking for something on that queue.
2072  *
2073  *	The priority argument is the known upper limit.
2074  *
2075  *	Perhaps this should be kept accurately, but that probably means
2076  *	separate bitmaps for bound and unbound threads.  Since only idled
2077  *	CPUs will have to do this recalculation, it seems better this way.
2078  */
2079 static void
2080 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2081 {
2082 	kthread_t	*tp;
2083 	dispq_t		*dq;
2084 	ulong_t		*dqactmap = dp->disp_qactmap;
2085 	ulong_t		mapword;
2086 	int		wx;
2087 
2088 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2089 
2090 	ASSERT(pri >= 0);			/* checked by caller */
2091 
2092 	/*
2093 	 * Start the search at the next lowest priority below the supplied
2094 	 * priority.  This depends on the bitmap implementation.
2095 	 */
2096 	do {
2097 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
2098 
2099 		/*
2100 		 * Form mask for all lower priorities in the word.
2101 		 */
2102 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2103 
2104 		/*
2105 		 * Get next lower active priority.
2106 		 */
2107 		if (mapword != 0) {
2108 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2109 		} else if (wx > 0) {
2110 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2111 			if (pri < 0)
2112 				break;
2113 		} else {
2114 			pri = -1;
2115 			break;
2116 		}
2117 
2118 		/*
2119 		 * Search the queue for unbound, runnable threads.
2120 		 */
2121 		dq = &dp->disp_q[pri];
2122 		tp = dq->dq_first;
2123 
2124 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2125 			tp = tp->t_link;
2126 		}
2127 
2128 		/*
2129 		 * If a thread was found, set the priority and return.
2130 		 */
2131 	} while (tp == NULL);
2132 
2133 	/*
2134 	 * pri holds the maximum unbound thread priority or -1.
2135 	 */
2136 	if (dp->disp_max_unbound_pri != pri)
2137 		dp->disp_max_unbound_pri = pri;
2138 }
2139 
2140 /*
2141  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2142  * 	check if the CPU to which is was previously bound should have
2143  * 	its disp_max_unbound_pri increased.
2144  */
2145 void
2146 disp_adjust_unbound_pri(kthread_t *tp)
2147 {
2148 	disp_t *dp;
2149 	pri_t tpri;
2150 
2151 	ASSERT(THREAD_LOCK_HELD(tp));
2152 
2153 	/*
2154 	 * Don't do anything if the thread is not bound, or
2155 	 * currently not runnable or swapped out.
2156 	 */
2157 	if (tp->t_bound_cpu == NULL ||
2158 	    tp->t_state != TS_RUN ||
2159 	    tp->t_schedflag & TS_ON_SWAPQ)
2160 		return;
2161 
2162 	tpri = DISP_PRIO(tp);
2163 	dp = tp->t_bound_cpu->cpu_disp;
2164 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2165 	if (tpri > dp->disp_max_unbound_pri)
2166 		dp->disp_max_unbound_pri = tpri;
2167 }
2168 
2169 /*
2170  * disp_getbest()
2171  *   De-queue the highest priority unbound runnable thread.
2172  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2173  *   Returns NULL if nothing found.
2174  *   Returns T_DONTSTEAL if the thread was not stealable.
2175  *   so that the caller will try again later.
2176  *
2177  *   Passed a pointer to a dispatch queue not associated with this CPU, and
2178  *   its type.
2179  */
2180 static kthread_t *
2181 disp_getbest(disp_t *dp)
2182 {
2183 	kthread_t	*tp;
2184 	dispq_t		*dq;
2185 	pri_t		pri;
2186 	cpu_t		*cp, *tcp;
2187 	boolean_t	allbound;
2188 
2189 	disp_lock_enter(&dp->disp_lock);
2190 
2191 	/*
2192 	 * If there is nothing to run, or the CPU is in the middle of a
2193 	 * context switch of the only thread, return NULL.
2194 	 */
2195 	tcp = dp->disp_cpu;
2196 	cp = CPU;
2197 	pri = dp->disp_max_unbound_pri;
2198 	if (pri == -1 ||
2199 	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2200 	    tcp->cpu_disp->disp_nrunnable == 1)) {
2201 		disp_lock_exit_nopreempt(&dp->disp_lock);
2202 		return (NULL);
2203 	}
2204 
2205 	dq = &dp->disp_q[pri];
2206 
2207 
2208 	/*
2209 	 * Assume that all threads are bound on this queue, and change it
2210 	 * later when we find out that it is not the case.
2211 	 */
2212 	allbound = B_TRUE;
2213 	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2214 		hrtime_t now, nosteal, rqtime;
2215 
2216 		/*
2217 		 * Skip over bound threads which could be here even
2218 		 * though disp_max_unbound_pri indicated this level.
2219 		 */
2220 		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2221 			continue;
2222 
2223 		/*
2224 		 * We've got some unbound threads on this queue, so turn
2225 		 * the allbound flag off now.
2226 		 */
2227 		allbound = B_FALSE;
2228 
2229 		/*
2230 		 * The thread is a candidate for stealing from its run queue. We
2231 		 * don't want to steal threads that became runnable just a
2232 		 * moment ago. This improves CPU affinity for threads that get
2233 		 * preempted for short periods of time and go back on the run
2234 		 * queue.
2235 		 *
2236 		 * We want to let it stay on its run queue if it was only placed
2237 		 * there recently and it was running on the same CPU before that
2238 		 * to preserve its cache investment. For the thread to remain on
2239 		 * its run queue, ALL of the following conditions must be
2240 		 * satisfied:
2241 		 *
2242 		 * - the disp queue should not be the kernel preemption queue
2243 		 * - delayed idle stealing should not be disabled
2244 		 * - nosteal_nsec should be non-zero
2245 		 * - it should run with user priority
2246 		 * - it should be on the run queue of the CPU where it was
2247 		 *   running before being placed on the run queue
2248 		 * - it should be the only thread on the run queue (to prevent
2249 		 *   extra scheduling latency for other threads)
2250 		 * - it should sit on the run queue for less than per-chip
2251 		 *   nosteal interval or global nosteal interval
2252 		 * - in case of CPUs with shared cache it should sit in a run
2253 		 *   queue of a CPU from a different chip
2254 		 *
2255 		 * The checks are arranged so that the ones that are faster are
2256 		 * placed earlier.
2257 		 */
2258 		if (tcp == NULL ||
2259 		    pri >= minclsyspri ||
2260 		    tp->t_cpu != tcp)
2261 			break;
2262 
2263 		/*
2264 		 * Steal immediately if, due to CMT processor architecture
2265 		 * migraiton between cp and tcp would incur no performance
2266 		 * penalty.
2267 		 */
2268 		if (pg_cmt_can_migrate(cp, tcp))
2269 			break;
2270 
2271 		nosteal = nosteal_nsec;
2272 		if (nosteal == 0)
2273 			break;
2274 
2275 		/*
2276 		 * Calculate time spent sitting on run queue
2277 		 */
2278 		now = gethrtime_unscaled();
2279 		rqtime = now - tp->t_waitrq;
2280 		scalehrtime(&rqtime);
2281 
2282 		/*
2283 		 * Steal immediately if the time spent on this run queue is more
2284 		 * than allowed nosteal delay.
2285 		 *
2286 		 * Negative rqtime check is needed here to avoid infinite
2287 		 * stealing delays caused by unlikely but not impossible
2288 		 * drifts between CPU times on different CPUs.
2289 		 */
2290 		if (rqtime > nosteal || rqtime < 0)
2291 			break;
2292 
2293 		DTRACE_PROBE4(nosteal, kthread_t *, tp,
2294 		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2295 		scalehrtime(&now);
2296 		/*
2297 		 * Calculate when this thread becomes stealable
2298 		 */
2299 		now += (nosteal - rqtime);
2300 
2301 		/*
2302 		 * Calculate time when some thread becomes stealable
2303 		 */
2304 		if (now < dp->disp_steal)
2305 			dp->disp_steal = now;
2306 	}
2307 
2308 	/*
2309 	 * If there were no unbound threads on this queue, find the queue
2310 	 * where they are and then return later. The value of
2311 	 * disp_max_unbound_pri is not always accurate because it isn't
2312 	 * reduced until another idle CPU looks for work.
2313 	 */
2314 	if (allbound)
2315 		disp_fix_unbound_pri(dp, pri);
2316 
2317 	/*
2318 	 * If we reached the end of the queue and found no unbound threads
2319 	 * then return NULL so that other CPUs will be considered.  If there
2320 	 * are unbound threads but they cannot yet be stolen, then
2321 	 * return T_DONTSTEAL and try again later.
2322 	 */
2323 	if (tp == NULL) {
2324 		disp_lock_exit_nopreempt(&dp->disp_lock);
2325 		return (allbound ? NULL : T_DONTSTEAL);
2326 	}
2327 
2328 	/*
2329 	 * Found a runnable, unbound thread, so remove it from queue.
2330 	 * dispdeq() requires that we have the thread locked, and we do,
2331 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
2332 	 * put the thread in transition state, thereby dropping the dispq
2333 	 * lock.
2334 	 */
2335 
2336 #ifdef DEBUG
2337 	{
2338 		int	thread_was_on_queue;
2339 
2340 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
2341 		ASSERT(thread_was_on_queue);
2342 	}
2343 
2344 #else /* DEBUG */
2345 	(void) dispdeq(tp);			/* drops disp_lock */
2346 #endif /* DEBUG */
2347 
2348 	/*
2349 	 * Reset the disp_queue steal time - we do not know what is the smallest
2350 	 * value across the queue is.
2351 	 */
2352 	dp->disp_steal = 0;
2353 
2354 	tp->t_schedflag |= TS_DONT_SWAP;
2355 
2356 	/*
2357 	 * Setup thread to run on the current CPU.
2358 	 */
2359 	tp->t_disp_queue = cp->cpu_disp;
2360 
2361 	cp->cpu_dispthread = tp;		/* protected by spl only */
2362 	cp->cpu_dispatch_pri = pri;
2363 
2364 	/*
2365 	 * There can be a memory synchronization race between disp_getbest()
2366 	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2367 	 * to preempt the current thread to run the enqueued thread while
2368 	 * disp_getbest() and disp_ratify() are changing the current thread
2369 	 * to the stolen thread. This may lead to a situation where
2370 	 * cpu_resched() tries to preempt the wrong thread and the
2371 	 * stolen thread continues to run on the CPU which has been tagged
2372 	 * for preemption.
2373 	 * Later the clock thread gets enqueued but doesn't get to run on the
2374 	 * CPU causing the system to hang.
2375 	 *
2376 	 * To avoid this, grabbing and dropping the disp_lock (which does
2377 	 * a memory barrier) is needed to synchronize the execution of
2378 	 * cpu_resched() with disp_getbest() and disp_ratify() and
2379 	 * synchronize the memory read and written by cpu_resched(),
2380 	 * disp_getbest(), and disp_ratify() with each other.
2381 	 *  (see CR#6482861 for more details).
2382 	 */
2383 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2384 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2385 
2386 	ASSERT(pri == DISP_PRIO(tp));
2387 
2388 	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2389 
2390 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
2391 
2392 	/*
2393 	 * Return with spl high so that swtch() won't need to raise it.
2394 	 * The disp_lock was dropped by dispdeq().
2395 	 */
2396 
2397 	return (tp);
2398 }
2399 
2400 /*
2401  * disp_bound_common() - common routine for higher level functions
2402  *	that check for bound threads under certain conditions.
2403  *	If 'threadlistsafe' is set then there is no need to acquire
2404  *	pidlock to stop the thread list from changing (eg, if
2405  *	disp_bound_* is called with cpus paused).
2406  */
2407 static int
2408 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2409 {
2410 	int		found = 0;
2411 	kthread_t	*tp;
2412 
2413 	ASSERT(flag);
2414 
2415 	if (!threadlistsafe)
2416 		mutex_enter(&pidlock);
2417 	tp = curthread;		/* faster than allthreads */
2418 	do {
2419 		if (tp->t_state != TS_FREE) {
2420 			/*
2421 			 * If an interrupt thread is busy, but the
2422 			 * caller doesn't care (i.e. BOUND_INTR is off),
2423 			 * then just ignore it and continue through.
2424 			 */
2425 			if ((tp->t_flag & T_INTR_THREAD) &&
2426 			    !(flag & BOUND_INTR))
2427 				continue;
2428 
2429 			/*
2430 			 * Skip the idle thread for the CPU
2431 			 * we're about to set offline.
2432 			 */
2433 			if (tp == cp->cpu_idle_thread)
2434 				continue;
2435 
2436 			/*
2437 			 * Skip the pause thread for the CPU
2438 			 * we're about to set offline.
2439 			 */
2440 			if (tp == cp->cpu_pause_thread)
2441 				continue;
2442 
2443 			if ((flag & BOUND_CPU) &&
2444 			    (tp->t_bound_cpu == cp ||
2445 			    tp->t_bind_cpu == cp->cpu_id ||
2446 			    tp->t_weakbound_cpu == cp)) {
2447 				found = 1;
2448 				break;
2449 			}
2450 
2451 			if ((flag & BOUND_PARTITION) &&
2452 			    (tp->t_cpupart == cp->cpu_part)) {
2453 				found = 1;
2454 				break;
2455 			}
2456 		}
2457 	} while ((tp = tp->t_next) != curthread && found == 0);
2458 	if (!threadlistsafe)
2459 		mutex_exit(&pidlock);
2460 	return (found);
2461 }
2462 
2463 /*
2464  * disp_bound_threads - return nonzero if threads are bound to the processor.
2465  *	Called infrequently.  Keep this simple.
2466  *	Includes threads that are asleep or stopped but not onproc.
2467  */
2468 int
2469 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2470 {
2471 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2472 }
2473 
2474 /*
2475  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2476  * to the given processor, including interrupt threads.
2477  */
2478 int
2479 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2480 {
2481 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2482 }
2483 
2484 /*
2485  * disp_bound_partition - return nonzero if threads are bound to the same
2486  * partition as the processor.
2487  *	Called infrequently.  Keep this simple.
2488  *	Includes threads that are asleep or stopped but not onproc.
2489  */
2490 int
2491 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2492 {
2493 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2494 }
2495 
2496 /*
2497  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2498  * threads to other CPUs.
2499  */
2500 void
2501 disp_cpu_inactive(cpu_t *cp)
2502 {
2503 	kthread_t	*tp;
2504 	disp_t		*dp = cp->cpu_disp;
2505 	dispq_t		*dq;
2506 	pri_t		pri;
2507 	int		wasonq;
2508 
2509 	disp_lock_enter(&dp->disp_lock);
2510 	while ((pri = dp->disp_max_unbound_pri) != -1) {
2511 		dq = &dp->disp_q[pri];
2512 		tp = dq->dq_first;
2513 
2514 		/*
2515 		 * Skip over bound threads.
2516 		 */
2517 		while (tp != NULL && tp->t_bound_cpu != NULL) {
2518 			tp = tp->t_link;
2519 		}
2520 
2521 		if (tp == NULL) {
2522 			/* disp_max_unbound_pri must be inaccurate, so fix it */
2523 			disp_fix_unbound_pri(dp, pri);
2524 			continue;
2525 		}
2526 
2527 		wasonq = dispdeq(tp);		/* drops disp_lock */
2528 		ASSERT(wasonq);
2529 		ASSERT(tp->t_weakbound_cpu == NULL);
2530 
2531 		setbackdq(tp);
2532 		/*
2533 		 * Called from cpu_offline:
2534 		 *
2535 		 * cp has already been removed from the list of active cpus
2536 		 * and tp->t_cpu has been changed so there is no risk of
2537 		 * tp ending up back on cp.
2538 		 *
2539 		 * Called from cpupart_move_cpu:
2540 		 *
2541 		 * The cpu has moved to a new cpupart.  Any threads that
2542 		 * were on it's dispatch queues before the move remain
2543 		 * in the old partition and can't run in the new partition.
2544 		 */
2545 		ASSERT(tp->t_cpu != cp);
2546 		thread_unlock(tp);
2547 
2548 		disp_lock_enter(&dp->disp_lock);
2549 	}
2550 	disp_lock_exit(&dp->disp_lock);
2551 }
2552 
2553 /*
2554  * Return a score rating this CPU for running this thread: lower is better.
2555  *
2556  * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
2557  * curcpu (as that's our own priority).
2558  *
2559  * If a cpu is the target of an offline request, then try to avoid it.
2560  *
2561  * Otherwise we'll use double the effective dispatcher priority for the CPU.
2562  *
2563  * We do this so smt_adjust_cpu_score() can increment the score if needed,
2564  * without ending up over-riding a dispatcher priority.
2565  */
2566 static pri_t
2567 cpu_score(cpu_t *cp, kthread_t *tp)
2568 {
2569 	pri_t score;
2570 
2571 	if (tp == curthread && cp == curthread->t_cpu)
2572 		score = 2 * CPU_IDLE_PRI;
2573 	else if (cp == cpu_inmotion)
2574 		score = SHRT_MAX;
2575 	else
2576 		score = 2 * cp->cpu_dispatch_pri;
2577 
2578 	if (2 * cp->cpu_disp->disp_maxrunpri > score)
2579 		score = 2 * cp->cpu_disp->disp_maxrunpri;
2580 	if (2 * cp->cpu_chosen_level > score)
2581 		score = 2 * cp->cpu_chosen_level;
2582 
2583 	return (smt_adjust_cpu_score(tp, cp, score));
2584 }
2585 
2586 /*
2587  * disp_lowpri_cpu - find a suitable CPU to run the given thread.
2588  *
2589  * We are looking for a CPU with an effective dispatch priority lower than the
2590  * thread's, so that the thread will run immediately rather than be enqueued.
2591  * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
2592  * If we don't find an available CPU there, we will expand our search to include
2593  * wider locality levels. (Note these groups are already divided by CPU
2594  * partition.)
2595  *
2596  * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
2597  * the best home CPU we found.
2598  *
2599  * The hint passed in is used as a starting point so we don't favor CPU 0 or any
2600  * other CPU.  The caller should pass in the most recently used CPU for the
2601  * thread; it's of course possible that this CPU isn't in the home lgroup.
2602  *
2603  * This function must be called at either high SPL, or with preemption disabled,
2604  * so that the "hint" CPU cannot be removed from the online CPU list while we
2605  * are traversing it.
2606  */
2607 cpu_t *
2608 disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
2609 {
2610 	cpu_t	*bestcpu;
2611 	cpu_t	*besthomecpu;
2612 	cpu_t   *cp, *cpstart;
2613 
2614 	klgrpset_t	done;
2615 
2616 	lpl_t		*lpl_iter, *lpl_leaf;
2617 
2618 	ASSERT(hint != NULL);
2619 	ASSERT(tp->t_lpl->lpl_ncpu > 0);
2620 
2621 	bestcpu = besthomecpu = NULL;
2622 	klgrpset_clear(done);
2623 
2624 	lpl_iter = tp->t_lpl;
2625 
2626 	do {
2627 		pri_t best = SHRT_MAX;
2628 		klgrpset_t cur_set;
2629 
2630 		klgrpset_clear(cur_set);
2631 
2632 		for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
2633 			lpl_leaf = lpl_iter->lpl_rset[i];
2634 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2635 				continue;
2636 
2637 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2638 
2639 			if (hint->cpu_lpl == lpl_leaf)
2640 				cp = cpstart = hint;
2641 			else
2642 				cp = cpstart = lpl_leaf->lpl_cpus;
2643 
2644 			do {
2645 				pri_t score = cpu_score(cp, tp);
2646 
2647 				if (score < best) {
2648 					best = score;
2649 					bestcpu = cp;
2650 
2651 					/* An idle CPU: we're done. */
2652 					if (score / 2 == CPU_IDLE_PRI)
2653 						goto out;
2654 				}
2655 			} while ((cp = cp->cpu_next_lpl) != cpstart);
2656 		}
2657 
2658 		if (bestcpu != NULL && tpri > (best / 2))
2659 			goto out;
2660 
2661 		if (besthomecpu == NULL)
2662 			besthomecpu = bestcpu;
2663 
2664 		/*
2665 		 * Add the lgrps we just considered to the "done" set
2666 		 */
2667 		klgrpset_or(done, cur_set);
2668 
2669 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2670 
2671 	/*
2672 	 * The specified priority isn't high enough to run immediately
2673 	 * anywhere, so just return the best CPU from the home lgroup.
2674 	 */
2675 	bestcpu = besthomecpu;
2676 
2677 out:
2678 	ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2679 	return (bestcpu);
2680 }
2681 
/*
 * Generic idle cpu function shared by all processors; it does nothing.
 * A processor with specific work to do while idle (say, to stop the
 * pipeline and save power) should define its own routine in the
 * processor-specific code (module_xx.c) and set the global variable
 * idle_cpu to that function.
 */
static void
generic_idle_cpu(void)
{
	/* Intentionally empty. */
}
2693 
2694 /*ARGSUSED*/
2695 static void
2696 generic_enq_thread(cpu_t *cpu, int bound)
2697 {
2698 }
2699 
2700 cpu_t *
2701 disp_choose_best_cpu(void)
2702 {
2703 	kthread_t *t = curthread;
2704 	cpu_t *curcpu = CPU;
2705 
2706 	ASSERT(t->t_preempt > 0);
2707 	ASSERT(t->t_state == TS_ONPROC);
2708 	ASSERT(t->t_schedflag & TS_VCPU);
2709 
2710 	if (smt_should_run(t, curcpu))
2711 		return (curcpu);
2712 
2713 	return (disp_lowpri_cpu(curcpu, t, t->t_pri));
2714 }
2715