xref: /titanic_50/usr/src/uts/common/disp/disp.c (revision 880d797826457b77414b37d531cc3e1aa166ecbe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/signal.h>
34 #include <sys/user.h>
35 #include <sys/systm.h>
36 #include <sys/sysinfo.h>
37 #include <sys/var.h>
38 #include <sys/errno.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/inline.h>
42 #include <sys/disp.h>
43 #include <sys/class.h>
44 #include <sys/bitmap.h>
45 #include <sys/kmem.h>
46 #include <sys/cpuvar.h>
47 #include <sys/vtrace.h>
48 #include <sys/tnf.h>
49 #include <sys/cpupart.h>
50 #include <sys/lgrp.h>
51 #include <sys/pg.h>
52 #include <sys/cmt.h>
53 #include <sys/bitset.h>
54 #include <sys/schedctl.h>
55 #include <sys/atomic.h>
56 #include <sys/dtrace.h>
57 #include <sys/sdt.h>
58 #include <sys/archsystm.h>
59 
60 #include <vm/as.h>
61 
62 #define	BOUND_CPU	0x1
63 #define	BOUND_PARTITION	0x2
64 #define	BOUND_INTR	0x4
65 
66 /* Dispatch queue allocation structure and functions */
67 struct disp_queue_info {
68 	disp_t	*dp;
69 	dispq_t *olddispq;
70 	dispq_t *newdispq;
71 	ulong_t	*olddqactmap;
72 	ulong_t	*newdqactmap;
73 	int	oldnglobpris;
74 };
75 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
76     disp_t *dp);
77 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
78 static void	disp_dq_free(struct disp_queue_info *dptr);
79 
80 /* platform-specific routine to call when processor is idle */
81 static void	generic_idle_cpu();
82 void		(*idle_cpu)() = generic_idle_cpu;
83 
84 /* routines invoked when a CPU enters/exits the idle loop */
85 static void	idle_enter();
86 static void	idle_exit();
87 
88 /* platform-specific routine to call when thread is enqueued */
89 static void	generic_enq_thread(cpu_t *, int);
90 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
91 
92 pri_t	kpreemptpri;		/* priority where kernel preemption applies */
93 pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
94 pri_t	intr_pri;		/* interrupt thread priority base level */
95 
96 #define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
97 pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
98 disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
99 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
100 int	nswapped;		/* total number of swapped threads */
101 void	disp_swapped_enq(kthread_t *tp);
102 static void	disp_swapped_setrun(kthread_t *tp);
103 static void	cpu_resched(cpu_t *cp, pri_t tpri);
104 
105 /*
106  * If this is set, only interrupt threads will cause kernel preemptions.
107  * This is done by changing the value of kpreemptpri.  kpreemptpri
108  * will either be the max sysclass pri or the min interrupt pri.
109  */
110 int	only_intr_kpreempt;
111 
112 extern void set_idle_cpu(int cpun);
113 extern void unset_idle_cpu(int cpun);
114 static void setkpdq(kthread_t *tp, int borf);
115 #define	SETKP_BACK	0
116 #define	SETKP_FRONT	1
117 /*
118  * Parameter that determines how recently a thread must have run
119  * on the CPU to be considered loosely-bound to that CPU to reduce
120  * cold cache effects.  The interval is in hertz.
121  */
122 #define	RECHOOSE_INTERVAL 3
123 int	rechoose_interval = RECHOOSE_INTERVAL;
124 
125 /*
126  * Parameter that determines how long (in nanoseconds) a thread must
127  * be sitting on a run queue before it can be stolen by another CPU
128  * to reduce migrations.  The interval is in nanoseconds.
129  *
130  * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
131  * to an appropriate value.  nosteal_nsec is set to NOSTEAL_UNINITIALIZED
132  * here indicating it is uninitiallized.
133  * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
134  *
135  */
136 #define	NOSTEAL_UNINITIALIZED	(-1)
137 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
138 extern void cmp_set_nosteal_interval(void);
139 
140 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
141 
142 disp_lock_t	transition_lock;	/* lock on transitioning threads */
143 disp_lock_t	stop_lock;		/* lock on stopped threads */
144 
145 static void	cpu_dispqalloc(int numpris);
146 
147 /*
148  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
149  * a thread because it was sitting on its run queue for a very short
150  * period of time.
151  */
152 #define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
153 
154 static kthread_t	*disp_getwork(cpu_t *to);
155 static kthread_t	*disp_getbest(disp_t *from);
156 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
157 
158 void	swtch_to(kthread_t *);
159 
160 /*
161  * dispatcher and scheduler initialization
162  */
163 
164 /*
165  * disp_setup - Common code to calculate and allocate dispatcher
166  *		variables and structures based on the maximum priority.
167  */
168 static void
169 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
170 {
171 	pri_t	newnglobpris;
172 
173 	ASSERT(MUTEX_HELD(&cpu_lock));
174 
175 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
176 
177 	if (newnglobpris > oldnglobpris) {
178 		/*
179 		 * Allocate new kp queues for each CPU partition.
180 		 */
181 		cpupart_kpqalloc(newnglobpris);
182 
183 		/*
184 		 * Allocate new dispatch queues for each CPU.
185 		 */
186 		cpu_dispqalloc(newnglobpris);
187 
188 		/*
189 		 * compute new interrupt thread base priority
190 		 */
191 		intr_pri = maxglobpri;
192 		if (only_intr_kpreempt) {
193 			kpreemptpri = intr_pri + 1;
194 			if (kpqpri == KPQPRI)
195 				kpqpri = kpreemptpri;
196 		}
197 		v.v_nglobpris = newnglobpris;
198 	}
199 }
200 
201 /*
202  * dispinit - Called to initialize all loaded classes and the
203  *	      dispatcher framework.
204  */
205 void
206 dispinit(void)
207 {
208 	id_t	cid;
209 	pri_t	maxglobpri;
210 	pri_t	cl_maxglobpri;
211 
212 	maxglobpri = -1;
213 
214 	/*
215 	 * Initialize transition lock, which will always be set.
216 	 */
217 	DISP_LOCK_INIT(&transition_lock);
218 	disp_lock_enter_high(&transition_lock);
219 	DISP_LOCK_INIT(&stop_lock);
220 
221 	mutex_enter(&cpu_lock);
222 	CPU->cpu_disp->disp_maxrunpri = -1;
223 	CPU->cpu_disp->disp_max_unbound_pri = -1;
224 
225 	/*
226 	 * Initialize the default CPU partition.
227 	 */
228 	cpupart_initialize_default();
229 	/*
230 	 * Call the class specific initialization functions for
231 	 * all pre-installed schedulers.
232 	 *
233 	 * We pass the size of a class specific parameter
234 	 * buffer to each of the initialization functions
235 	 * to try to catch problems with backward compatibility
236 	 * of class modules.
237 	 *
238 	 * For example a new class module running on an old system
239 	 * which didn't provide sufficiently large parameter buffers
240 	 * would be bad news. Class initialization modules can check for
241 	 * this and take action if they detect a problem.
242 	 */
243 
244 	for (cid = 0; cid < nclass; cid++) {
245 		sclass_t	*sc;
246 
247 		sc = &sclass[cid];
248 		if (SCHED_INSTALLED(sc)) {
249 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
250 			    &sc->cl_funcs);
251 			if (cl_maxglobpri > maxglobpri)
252 				maxglobpri = cl_maxglobpri;
253 		}
254 	}
255 
256 	/*
257 	 * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is
258 	 * to say, maxclsyspri + 1.  However, over time, the system has used
259 	 * more and more asynchronous kernel threads, with an increasing number
260 	 * of these doing work on direct behalf of higher-level software (e.g.,
261 	 * network processing).  This has led to potential priority inversions:
262 	 * threads doing low-priority lengthy kernel work can effectively
263 	 * delay kernel-level processing of higher-priority data. To minimize
264 	 * such inversions, we set kpreemptpri to be v_maxsyspri; anything in
265 	 * the kernel that runs at maxclsyspri will therefore induce kernel
266 	 * preemption, and this priority should be used if/when an asynchronous
267 	 * thread (or, as is often the case, task queue) is performing a task
268 	 * on behalf of higher-level software (or any task that is otherwise
269 	 * latency-sensitve).
270 	 */
271 	kpreemptpri = (pri_t)v.v_maxsyspri;
272 	if (kpqpri == KPQPRI)
273 		kpqpri = kpreemptpri;
274 
275 	ASSERT(maxglobpri >= 0);
276 	disp_setup(maxglobpri, 0);
277 
278 	mutex_exit(&cpu_lock);
279 
280 	/*
281 	 * Platform specific sticky scheduler setup.
282 	 */
283 	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
284 		cmp_set_nosteal_interval();
285 
286 	/*
287 	 * Get the default class ID; this may be later modified via
288 	 * dispadmin(1M).  This will load the class (normally TS) and that will
289 	 * call disp_add(), which is why we had to drop cpu_lock first.
290 	 */
291 	if (getcid(defaultclass, &defaultcid) != 0) {
292 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
293 		    defaultclass);
294 	}
295 }
296 
297 /*
298  * disp_add - Called with class pointer to initialize the dispatcher
299  *	      for a newly loaded class.
300  */
301 void
302 disp_add(sclass_t *clp)
303 {
304 	pri_t	maxglobpri;
305 	pri_t	cl_maxglobpri;
306 
307 	mutex_enter(&cpu_lock);
308 	/*
309 	 * Initialize the scheduler class.
310 	 */
311 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
312 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
313 	if (cl_maxglobpri > maxglobpri)
314 		maxglobpri = cl_maxglobpri;
315 
316 	/*
317 	 * Save old queue information.  Since we're initializing a
318 	 * new scheduling class which has just been loaded, then
319 	 * the size of the dispq may have changed.  We need to handle
320 	 * that here.
321 	 */
322 	disp_setup(maxglobpri, v.v_nglobpris);
323 
324 	mutex_exit(&cpu_lock);
325 }
326 
327 
328 /*
329  * For each CPU, allocate new dispatch queues
330  * with the stated number of priorities.
331  */
332 static void
333 cpu_dispqalloc(int numpris)
334 {
335 	cpu_t	*cpup;
336 	struct disp_queue_info	*disp_mem;
337 	int i, num;
338 
339 	ASSERT(MUTEX_HELD(&cpu_lock));
340 
341 	disp_mem = kmem_zalloc(NCPU *
342 	    sizeof (struct disp_queue_info), KM_SLEEP);
343 
344 	/*
345 	 * This routine must allocate all of the memory before stopping
346 	 * the cpus because it must not sleep in kmem_alloc while the
347 	 * CPUs are stopped.  Locks they hold will not be freed until they
348 	 * are restarted.
349 	 */
350 	i = 0;
351 	cpup = cpu_list;
352 	do {
353 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
354 		i++;
355 		cpup = cpup->cpu_next;
356 	} while (cpup != cpu_list);
357 	num = i;
358 
359 	pause_cpus(NULL, NULL);
360 	for (i = 0; i < num; i++)
361 		disp_dq_assign(&disp_mem[i], numpris);
362 	start_cpus();
363 
364 	/*
365 	 * I must free all of the memory after starting the cpus because
366 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
367 	 */
368 	for (i = 0; i < num; i++)
369 		disp_dq_free(&disp_mem[i]);
370 
371 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
372 }
373 
374 static void
375 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
376 {
377 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
378 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
379 	    sizeof (long), KM_SLEEP);
380 	dptr->dp = dp;
381 }
382 
383 static void
384 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
385 {
386 	disp_t	*dp;
387 
388 	dp = dptr->dp;
389 	dptr->olddispq = dp->disp_q;
390 	dptr->olddqactmap = dp->disp_qactmap;
391 	dptr->oldnglobpris = dp->disp_npri;
392 
393 	ASSERT(dptr->oldnglobpris < numpris);
394 
395 	if (dptr->olddispq != NULL) {
396 		/*
397 		 * Use kcopy because bcopy is platform-specific
398 		 * and could block while we might have paused the cpus.
399 		 */
400 		(void) kcopy(dptr->olddispq, dptr->newdispq,
401 		    dptr->oldnglobpris * sizeof (dispq_t));
402 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
403 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
404 		    sizeof (long));
405 	}
406 	dp->disp_q = dptr->newdispq;
407 	dp->disp_qactmap = dptr->newdqactmap;
408 	dp->disp_q_limit = &dptr->newdispq[numpris];
409 	dp->disp_npri = numpris;
410 }
411 
412 static void
413 disp_dq_free(struct disp_queue_info *dptr)
414 {
415 	if (dptr->olddispq != NULL)
416 		kmem_free(dptr->olddispq,
417 		    dptr->oldnglobpris * sizeof (dispq_t));
418 	if (dptr->olddqactmap != NULL)
419 		kmem_free(dptr->olddqactmap,
420 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
421 }
422 
423 /*
424  * For a newly created CPU, initialize the dispatch queue.
425  * This is called before the CPU is known through cpu[] or on any lists.
426  */
427 void
428 disp_cpu_init(cpu_t *cp)
429 {
430 	disp_t	*dp;
431 	dispq_t	*newdispq;
432 	ulong_t	*newdqactmap;
433 
434 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
435 
436 	if (cp == cpu0_disp.disp_cpu)
437 		dp = &cpu0_disp;
438 	else
439 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
440 	bzero(dp, sizeof (disp_t));
441 	cp->cpu_disp = dp;
442 	dp->disp_cpu = cp;
443 	dp->disp_maxrunpri = -1;
444 	dp->disp_max_unbound_pri = -1;
445 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
446 	/*
447 	 * Allocate memory for the dispatcher queue headers
448 	 * and the active queue bitmap.
449 	 */
450 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
451 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
452 	    sizeof (long), KM_SLEEP);
453 	dp->disp_q = newdispq;
454 	dp->disp_qactmap = newdqactmap;
455 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
456 	dp->disp_npri = v.v_nglobpris;
457 }
458 
459 void
460 disp_cpu_fini(cpu_t *cp)
461 {
462 	ASSERT(MUTEX_HELD(&cpu_lock));
463 
464 	disp_kp_free(cp->cpu_disp);
465 	if (cp->cpu_disp != &cpu0_disp)
466 		kmem_free(cp->cpu_disp, sizeof (disp_t));
467 }
468 
469 /*
470  * Allocate new, larger kpreempt dispatch queue to replace the old one.
471  */
472 void
473 disp_kp_alloc(disp_t *dq, pri_t npri)
474 {
475 	struct disp_queue_info	mem_info;
476 
477 	if (npri > dq->disp_npri) {
478 		/*
479 		 * Allocate memory for the new array.
480 		 */
481 		disp_dq_alloc(&mem_info, npri, dq);
482 
483 		/*
484 		 * We need to copy the old structures to the new
485 		 * and free the old.
486 		 */
487 		disp_dq_assign(&mem_info, npri);
488 		disp_dq_free(&mem_info);
489 	}
490 }
491 
492 /*
493  * Free dispatch queue.
494  * Used for the kpreempt queues for a removed CPU partition and
495  * for the per-CPU queues of deleted CPUs.
496  */
497 void
498 disp_kp_free(disp_t *dq)
499 {
500 	struct disp_queue_info	mem_info;
501 
502 	mem_info.olddispq = dq->disp_q;
503 	mem_info.olddqactmap = dq->disp_qactmap;
504 	mem_info.oldnglobpris = dq->disp_npri;
505 	disp_dq_free(&mem_info);
506 }
507 
508 /*
509  * End dispatcher and scheduler initialization.
510  */
511 
512 /*
513  * See if there's anything to do other than remain idle.
514  * Return non-zero if there is.
515  *
516  * This function must be called with high spl, or with
517  * kernel preemption disabled to prevent the partition's
518  * active cpu list from changing while being traversed.
519  *
520  * This is essentially a simpler version of disp_getwork()
521  * to be called by CPUs preparing to "halt".
522  */
523 int
524 disp_anywork(void)
525 {
526 	cpu_t		*cp = CPU;
527 	cpu_t		*ocp;
528 	volatile int	*local_nrunnable = &cp->cpu_disp->disp_nrunnable;
529 
530 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
531 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
532 			return (1);
533 
534 		for (ocp = cp->cpu_next_part; ocp != cp;
535 		    ocp = ocp->cpu_next_part) {
536 			ASSERT(CPU_ACTIVE(ocp));
537 
538 			/*
539 			 * Something has appeared on the local run queue.
540 			 */
541 			if (*local_nrunnable > 0)
542 				return (1);
543 			/*
544 			 * If we encounter another idle CPU that will
545 			 * soon be trolling around through disp_anywork()
546 			 * terminate our walk here and let this other CPU
547 			 * patrol the next part of the list.
548 			 */
549 			if (ocp->cpu_dispatch_pri == -1 &&
550 			    (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
551 				return (0);
552 			/*
553 			 * Work can be taken from another CPU if:
554 			 *	- There is unbound work on the run queue
555 			 *	- That work isn't a thread undergoing a
556 			 *	- context switch on an otherwise empty queue.
557 			 *	- The CPU isn't running the idle loop.
558 			 */
559 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
560 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
561 			    ocp->cpu_disp->disp_nrunnable == 1) &&
562 			    ocp->cpu_dispatch_pri != -1)
563 				return (1);
564 		}
565 	}
566 	return (0);
567 }
568 
569 /*
570  * Called when CPU enters the idle loop
571  */
572 static void
573 idle_enter()
574 {
575 	cpu_t		*cp = CPU;
576 
577 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
578 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
579 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
580 }
581 
582 /*
583  * Called when CPU exits the idle loop
584  */
585 static void
586 idle_exit()
587 {
588 	cpu_t		*cp = CPU;
589 
590 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
591 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
592 }
593 
594 /*
595  * Idle loop.
596  */
597 void
598 idle()
599 {
600 	struct cpu	*cp = CPU;		/* pointer to this CPU */
601 	kthread_t	*t;			/* taken thread */
602 
603 	idle_enter();
604 
605 	/*
606 	 * Uniprocessor version of idle loop.
607 	 * Do this until notified that we're on an actual multiprocessor.
608 	 */
609 	while (ncpus == 1) {
610 		if (cp->cpu_disp->disp_nrunnable == 0) {
611 			(*idle_cpu)();
612 			continue;
613 		}
614 		idle_exit();
615 		swtch();
616 
617 		idle_enter(); /* returned from swtch */
618 	}
619 
620 	/*
621 	 * Multiprocessor idle loop.
622 	 */
623 	for (;;) {
624 		/*
625 		 * If CPU is completely quiesced by p_online(2), just wait
626 		 * here with minimal bus traffic until put online.
627 		 */
628 		while (cp->cpu_flags & CPU_QUIESCED)
629 			(*idle_cpu)();
630 
631 		if (cp->cpu_disp->disp_nrunnable != 0) {
632 			idle_exit();
633 			swtch();
634 		} else {
635 			if (cp->cpu_flags & CPU_OFFLINE)
636 				continue;
637 			if ((t = disp_getwork(cp)) == NULL) {
638 				if (cp->cpu_chosen_level != -1) {
639 					disp_t *dp = cp->cpu_disp;
640 					disp_t *kpq;
641 
642 					disp_lock_enter(&dp->disp_lock);
643 					/*
644 					 * Set kpq under lock to prevent
645 					 * migration between partitions.
646 					 */
647 					kpq = &cp->cpu_part->cp_kp_queue;
648 					if (kpq->disp_maxrunpri == -1)
649 						cp->cpu_chosen_level = -1;
650 					disp_lock_exit(&dp->disp_lock);
651 				}
652 				(*idle_cpu)();
653 				continue;
654 			}
655 			/*
656 			 * If there was a thread but we couldn't steal
657 			 * it, then keep trying.
658 			 */
659 			if (t == T_DONTSTEAL)
660 				continue;
661 			idle_exit();
662 			swtch_to(t);
663 		}
664 		idle_enter(); /* returned from swtch/swtch_to */
665 	}
666 }
667 
668 
669 /*
670  * Preempt the currently running thread in favor of the highest
671  * priority thread.  The class of the current thread controls
672  * where it goes on the dispatcher queues. If panicking, turn
673  * preemption off.
674  */
675 void
676 preempt()
677 {
678 	kthread_t 	*t = curthread;
679 	klwp_t 		*lwp = ttolwp(curthread);
680 
681 	if (panicstr)
682 		return;
683 
684 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
685 
686 	thread_lock(t);
687 
688 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
689 		/*
690 		 * this thread has already been chosen to be run on
691 		 * another CPU. Clear kprunrun on this CPU since we're
692 		 * already headed for swtch().
693 		 */
694 		CPU->cpu_kprunrun = 0;
695 		thread_unlock_nopreempt(t);
696 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
697 	} else {
698 		if (lwp != NULL)
699 			lwp->lwp_ru.nivcsw++;
700 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
701 		THREAD_TRANSITION(t);
702 		CL_PREEMPT(t);
703 		DTRACE_SCHED(preempt);
704 		thread_unlock_nopreempt(t);
705 
706 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
707 
708 		swtch();		/* clears CPU->cpu_runrun via disp() */
709 	}
710 }
711 
712 extern kthread_t *thread_unpin();
713 
714 /*
715  * disp() - find the highest priority thread for this processor to run, and
716  * set it in TS_ONPROC state so that resume() can be called to run it.
717  */
718 static kthread_t *
719 disp()
720 {
721 	cpu_t		*cpup;
722 	disp_t		*dp;
723 	kthread_t	*tp;
724 	dispq_t		*dq;
725 	int		maxrunword;
726 	pri_t		pri;
727 	disp_t		*kpq;
728 
729 	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
730 
731 	cpup = CPU;
732 	/*
733 	 * Find the highest priority loaded, runnable thread.
734 	 */
735 	dp = cpup->cpu_disp;
736 
737 reschedule:
738 	/*
739 	 * If there is more important work on the global queue with a better
740 	 * priority than the maximum on this CPU, take it now.
741 	 */
742 	kpq = &cpup->cpu_part->cp_kp_queue;
743 	while ((pri = kpq->disp_maxrunpri) >= 0 &&
744 	    pri >= dp->disp_maxrunpri &&
745 	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
746 	    (tp = disp_getbest(kpq)) != NULL) {
747 		if (disp_ratify(tp, kpq) != NULL) {
748 			TRACE_1(TR_FAC_DISP, TR_DISP_END,
749 			    "disp_end:tid %p", tp);
750 			return (tp);
751 		}
752 	}
753 
754 	disp_lock_enter(&dp->disp_lock);
755 	pri = dp->disp_maxrunpri;
756 
757 	/*
758 	 * If there is nothing to run, look at what's runnable on other queues.
759 	 * Choose the idle thread if the CPU is quiesced.
760 	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
761 	 * interrupt threads, which will be the only threads on the CPU's own
762 	 * queue, but cannot run threads from other queues.
763 	 */
764 	if (pri == -1) {
765 		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
766 			disp_lock_exit(&dp->disp_lock);
767 			if ((tp = disp_getwork(cpup)) == NULL ||
768 			    tp == T_DONTSTEAL) {
769 				tp = cpup->cpu_idle_thread;
770 				(void) splhigh();
771 				THREAD_ONPROC(tp, cpup);
772 				cpup->cpu_dispthread = tp;
773 				cpup->cpu_dispatch_pri = -1;
774 				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
775 				cpup->cpu_chosen_level = -1;
776 			}
777 		} else {
778 			disp_lock_exit_high(&dp->disp_lock);
779 			tp = cpup->cpu_idle_thread;
780 			THREAD_ONPROC(tp, cpup);
781 			cpup->cpu_dispthread = tp;
782 			cpup->cpu_dispatch_pri = -1;
783 			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
784 			cpup->cpu_chosen_level = -1;
785 		}
786 		TRACE_1(TR_FAC_DISP, TR_DISP_END,
787 		    "disp_end:tid %p", tp);
788 		return (tp);
789 	}
790 
791 	dq = &dp->disp_q[pri];
792 	tp = dq->dq_first;
793 
794 	ASSERT(tp != NULL);
795 	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */
796 
797 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
798 
799 	/*
800 	 * Found it so remove it from queue.
801 	 */
802 	dp->disp_nrunnable--;
803 	dq->dq_sruncnt--;
804 	if ((dq->dq_first = tp->t_link) == NULL) {
805 		ulong_t	*dqactmap = dp->disp_qactmap;
806 
807 		ASSERT(dq->dq_sruncnt == 0);
808 		dq->dq_last = NULL;
809 
810 		/*
811 		 * The queue is empty, so the corresponding bit needs to be
812 		 * turned off in dqactmap.   If nrunnable != 0 just took the
813 		 * last runnable thread off the
814 		 * highest queue, so recompute disp_maxrunpri.
815 		 */
816 		maxrunword = pri >> BT_ULSHIFT;
817 		dqactmap[maxrunword] &= ~BT_BIW(pri);
818 
819 		if (dp->disp_nrunnable == 0) {
820 			dp->disp_max_unbound_pri = -1;
821 			dp->disp_maxrunpri = -1;
822 		} else {
823 			int ipri;
824 
825 			ipri = bt_gethighbit(dqactmap, maxrunword);
826 			dp->disp_maxrunpri = ipri;
827 			if (ipri < dp->disp_max_unbound_pri)
828 				dp->disp_max_unbound_pri = ipri;
829 		}
830 	} else {
831 		tp->t_link = NULL;
832 	}
833 
834 	/*
835 	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
836 	 * out this thread before we have a chance to run it.
837 	 * While running, it is protected against swapping by t_lock.
838 	 */
839 	tp->t_schedflag |= TS_DONT_SWAP;
840 	cpup->cpu_dispthread = tp;		/* protected by spl only */
841 	cpup->cpu_dispatch_pri = pri;
842 	ASSERT(pri == DISP_PRIO(tp));
843 	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
844 	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */
845 
846 	ASSERT(tp != NULL);
847 	TRACE_1(TR_FAC_DISP, TR_DISP_END,
848 	    "disp_end:tid %p", tp);
849 
850 	if (disp_ratify(tp, kpq) == NULL)
851 		goto reschedule;
852 
853 	return (tp);
854 }
855 
856 /*
857  * swtch()
858  *	Find best runnable thread and run it.
859  *	Called with the current thread already switched to a new state,
860  *	on a sleep queue, run queue, stopped, and not zombied.
861  *	May be called at any spl level less than or equal to LOCK_LEVEL.
862  *	Always drops spl to the base level (spl0()).
863  */
864 void
865 swtch()
866 {
867 	kthread_t	*t = curthread;
868 	kthread_t	*next;
869 	cpu_t		*cp;
870 
871 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
872 
873 	if (t->t_flag & T_INTR_THREAD)
874 		cpu_intr_swtch_enter(t);
875 
876 	if (t->t_intr != NULL) {
877 		/*
878 		 * We are an interrupt thread.  Setup and return
879 		 * the interrupted thread to be resumed.
880 		 */
881 		(void) splhigh();	/* block other scheduler action */
882 		cp = CPU;		/* now protected against migration */
883 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
884 		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
885 		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
886 		next = thread_unpin();
887 		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
888 		resume_from_intr(next);
889 	} else {
890 #ifdef	DEBUG
891 		if (t->t_state == TS_ONPROC &&
892 		    t->t_disp_queue->disp_cpu == CPU &&
893 		    t->t_preempt == 0) {
894 			thread_lock(t);
895 			ASSERT(t->t_state != TS_ONPROC ||
896 			    t->t_disp_queue->disp_cpu != CPU ||
897 			    t->t_preempt != 0);	/* cannot migrate */
898 			thread_unlock_nopreempt(t);
899 		}
900 #endif	/* DEBUG */
901 		cp = CPU;
902 		next = disp();		/* returns with spl high */
903 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
904 
905 		/* OK to steal anything left on run queue */
906 		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
907 
908 		if (next != t) {
909 			hrtime_t now;
910 
911 			now = gethrtime_unscaled();
912 			pg_ev_thread_swtch(cp, now, t, next);
913 
914 			/*
915 			 * If t was previously in the TS_ONPROC state,
916 			 * setfrontdq and setbackdq won't have set its t_waitrq.
917 			 * Since we now finally know that we're switching away
918 			 * from this thread, set its t_waitrq if it is on a run
919 			 * queue.
920 			 */
921 			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
922 				t->t_waitrq = now;
923 			}
924 
925 			/*
926 			 * restore mstate of thread that we are switching to
927 			 */
928 			restore_mstate(next);
929 
930 			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
931 			cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
932 			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
933 
934 			if (dtrace_vtime_active)
935 				dtrace_vtime_switch(next);
936 
937 			resume(next);
938 			/*
939 			 * The TR_RESUME_END and TR_SWTCH_END trace points
940 			 * appear at the end of resume(), because we may not
941 			 * return here
942 			 */
943 		} else {
944 			if (t->t_flag & T_INTR_THREAD)
945 				cpu_intr_swtch_exit(t);
946 			/*
947 			 * Threads that enqueue themselves on a run queue defer
948 			 * setting t_waitrq. It is then either set in swtch()
949 			 * when the CPU is actually yielded, or not at all if it
950 			 * is remaining on the CPU.
951 			 * There is however a window between where the thread
952 			 * placed itself on a run queue, and where it selects
953 			 * itself in disp(), where a third party (eg. clock()
954 			 * doing tick processing) may have re-enqueued this
955 			 * thread, setting t_waitrq in the process. We detect
956 			 * this race by noticing that despite switching to
957 			 * ourself, our t_waitrq has been set, and should be
958 			 * cleared.
959 			 */
960 			if (t->t_waitrq != 0)
961 				t->t_waitrq = 0;
962 
963 			pg_ev_thread_remain(cp, t);
964 
965 			DTRACE_SCHED(remain__cpu);
966 			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
967 			(void) spl0();
968 		}
969 	}
970 }
971 
972 /*
973  * swtch_from_zombie()
974  *	Special case of swtch(), which allows checks for TS_ZOMB to be
975  *	eliminated from normal resume.
976  *	Find best runnable thread and run it.
977  *	Called with the current thread zombied.
978  *	Zombies cannot migrate, so CPU references are safe.
979  */
980 void
981 swtch_from_zombie()
982 {
983 	kthread_t	*next;
984 	cpu_t		*cpu = CPU;
985 
986 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
987 
988 	ASSERT(curthread->t_state == TS_ZOMB);
989 
990 	next = disp();			/* returns with spl high */
991 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
992 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
993 	ASSERT(next != curthread);
994 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
995 
996 	pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
997 
998 	restore_mstate(next);
999 
1000 	if (dtrace_vtime_active)
1001 		dtrace_vtime_switch(next);
1002 
1003 	resume_from_zombie(next);
1004 	/*
1005 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1006 	 * appear at the end of resume(), because we certainly will not
1007 	 * return here
1008 	 */
1009 }
1010 
1011 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
1012 
1013 /*
1014  * search_disp_queues()
1015  *	Search the given dispatch queues for thread tp.
1016  *	Return 1 if tp is found, otherwise return 0.
1017  */
1018 static int
1019 search_disp_queues(disp_t *dp, kthread_t *tp)
1020 {
1021 	dispq_t		*dq;
1022 	dispq_t		*eq;
1023 
1024 	disp_lock_enter_high(&dp->disp_lock);
1025 
1026 	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1027 		kthread_t	*rp;
1028 
1029 		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1030 
1031 		for (rp = dq->dq_first; rp; rp = rp->t_link)
1032 			if (tp == rp) {
1033 				disp_lock_exit_high(&dp->disp_lock);
1034 				return (1);
1035 			}
1036 	}
1037 	disp_lock_exit_high(&dp->disp_lock);
1038 
1039 	return (0);
1040 }
1041 
1042 /*
1043  * thread_on_queue()
1044  *	Search all per-CPU dispatch queues and all partition-wide kpreempt
1045  *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
1046  */
1047 static int
1048 thread_on_queue(kthread_t *tp)
1049 {
1050 	cpu_t		*cp;
1051 	struct cpupart	*part;
1052 
1053 	ASSERT(getpil() >= DISP_LEVEL);
1054 
1055 	/*
1056 	 * Search the per-CPU dispatch queues for tp.
1057 	 */
1058 	cp = CPU;
1059 	do {
1060 		if (search_disp_queues(cp->cpu_disp, tp))
1061 			return (1);
1062 	} while ((cp = cp->cpu_next_onln) != CPU);
1063 
1064 	/*
1065 	 * Search the partition-wide kpreempt queues for tp.
1066 	 */
1067 	part = CPU->cpu_part;
1068 	do {
1069 		if (search_disp_queues(&part->cp_kp_queue, tp))
1070 			return (1);
1071 	} while ((part = part->cp_next) != CPU->cpu_part);
1072 
1073 	return (0);
1074 }
1075 
1076 #else
1077 
1078 #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
1079 
1080 #endif  /* DEBUG */
1081 
1082 /*
1083  * like swtch(), but switch to a specified thread taken from another CPU.
1084  *	called with spl high..
1085  */
1086 void
1087 swtch_to(kthread_t *next)
1088 {
1089 	cpu_t			*cp = CPU;
1090 	hrtime_t		now;
1091 
1092 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1093 
1094 	/*
1095 	 * Update context switch statistics.
1096 	 */
1097 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1098 
1099 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1100 
1101 	now = gethrtime_unscaled();
1102 	pg_ev_thread_swtch(cp, now, curthread, next);
1103 
1104 	/* OK to steal anything left on run queue */
1105 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1106 
1107 	/* record last execution time */
1108 	cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1109 
1110 	/*
1111 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1112 	 * won't have set its t_waitrq.  Since we now finally know that we're
1113 	 * switching away from this thread, set its t_waitrq if it is on a run
1114 	 * queue.
1115 	 */
1116 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1117 		curthread->t_waitrq = now;
1118 	}
1119 
1120 	/* restore next thread to previously running microstate */
1121 	restore_mstate(next);
1122 
1123 	if (dtrace_vtime_active)
1124 		dtrace_vtime_switch(next);
1125 
1126 	resume(next);
1127 	/*
1128 	 * The TR_RESUME_END and TR_SWTCH_END trace points
1129 	 * appear at the end of resume(), because we may not
1130 	 * return here
1131 	 */
1132 }
1133 
1134 #define	CPU_IDLING(pri)	((pri) == -1)
1135 
1136 static void
1137 cpu_resched(cpu_t *cp, pri_t tpri)
1138 {
1139 	int	call_poke_cpu = 0;
1140 	pri_t   cpupri = cp->cpu_dispatch_pri;
1141 
1142 	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1143 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1144 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1145 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1146 			cp->cpu_runrun = 1;
1147 			aston(cp->cpu_dispthread);
1148 			if (tpri < kpreemptpri && cp != CPU)
1149 				call_poke_cpu = 1;
1150 		}
1151 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1152 			cp->cpu_kprunrun = 1;
1153 			if (cp != CPU)
1154 				call_poke_cpu = 1;
1155 		}
1156 	}
1157 
1158 	/*
1159 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1160 	 */
1161 	membar_enter();
1162 
1163 	if (call_poke_cpu)
1164 		poke_cpu(cp->cpu_id);
1165 }
1166 
1167 /*
1168  * setbackdq() keeps runqs balanced such that the difference in length
1169  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1170  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1171  * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1172  * try to keep runqs perfectly balanced regardless of the thread priority.
1173  */
1174 #define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
1175 #define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
1176 #define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1177 
1178 /*
1179  * Macro that evaluates to true if it is likely that the thread has cache
1180  * warmth. This is based on the amount of time that has elapsed since the
1181  * thread last ran. If that amount of time is less than "rechoose_interval"
1182  * ticks, then we decide that the thread has enough cache warmth to warrant
1183  * some affinity for t->t_cpu.
1184  */
1185 #define	THREAD_HAS_CACHE_WARMTH(thread)	\
1186 	((thread == curthread) ||	\
1187 	((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1188 /*
1189  * Put the specified thread on the back of the dispatcher
1190  * queue corresponding to its current priority.
1191  *
1192  * Called with the thread in transition, onproc or stopped state
1193  * and locked (transition implies locked) and at high spl.
1194  * Returns with the thread in TS_RUN state and still locked.
1195  */
1196 void
1197 setbackdq(kthread_t *tp)
1198 {
1199 	dispq_t	*dq;
1200 	disp_t		*dp;
1201 	cpu_t		*cp;
1202 	pri_t		tpri;
1203 	int		bound;
1204 	boolean_t	self;
1205 
1206 	ASSERT(THREAD_LOCK_HELD(tp));
1207 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1208 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1209 
1210 	/*
1211 	 * If thread is "swapped" or on the swap queue don't
1212 	 * queue it, but wake sched.
1213 	 */
1214 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1215 		disp_swapped_setrun(tp);
1216 		return;
1217 	}
1218 
1219 	self = (tp == curthread);
1220 
1221 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1222 		bound = 1;
1223 	else
1224 		bound = 0;
1225 
1226 	tpri = DISP_PRIO(tp);
1227 	if (ncpus == 1)
1228 		cp = tp->t_cpu;
1229 	else if (!bound) {
1230 		if (tpri >= kpqpri) {
1231 			setkpdq(tp, SETKP_BACK);
1232 			return;
1233 		}
1234 
1235 		/*
1236 		 * We'll generally let this thread continue to run where
1237 		 * it last ran...but will consider migration if:
1238 		 * - We thread probably doesn't have much cache warmth.
1239 		 * - The CPU where it last ran is the target of an offline
1240 		 *   request.
1241 		 * - The thread last ran outside it's home lgroup.
1242 		 */
1243 		if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1244 		    (tp->t_cpu == cpu_inmotion)) {
1245 			cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1246 		} else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1247 			cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1248 			    self ? tp->t_cpu : NULL);
1249 		} else {
1250 			cp = tp->t_cpu;
1251 		}
1252 
1253 		if (tp->t_cpupart == cp->cpu_part) {
1254 			int	qlen;
1255 
1256 			/*
1257 			 * Perform any CMT load balancing
1258 			 */
1259 			cp = cmt_balance(tp, cp);
1260 
1261 			/*
1262 			 * Balance across the run queues
1263 			 */
1264 			qlen = RUNQ_LEN(cp, tpri);
1265 			if (tpri >= RUNQ_MATCH_PRI &&
1266 			    !(tp->t_schedflag & TS_RUNQMATCH))
1267 				qlen -= RUNQ_MAX_DIFF;
1268 			if (qlen > 0) {
1269 				cpu_t *newcp;
1270 
1271 				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1272 					newcp = cp->cpu_next_part;
1273 				} else if ((newcp = cp->cpu_next_lpl) == cp) {
1274 					newcp = cp->cpu_next_part;
1275 				}
1276 
1277 				if (RUNQ_LEN(newcp, tpri) < qlen) {
1278 					DTRACE_PROBE3(runq__balance,
1279 					    kthread_t *, tp,
1280 					    cpu_t *, cp, cpu_t *, newcp);
1281 					cp = newcp;
1282 				}
1283 			}
1284 		} else {
1285 			/*
1286 			 * Migrate to a cpu in the new partition.
1287 			 */
1288 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1289 			    tp->t_lpl, tp->t_pri, NULL);
1290 		}
1291 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1292 	} else {
1293 		/*
1294 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1295 		 * a short time until weak binding that existed when the
1296 		 * strong binding was established has dropped) so we must
1297 		 * favour weak binding over strong.
1298 		 */
1299 		cp = tp->t_weakbound_cpu ?
1300 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1301 	}
1302 	/*
1303 	 * A thread that is ONPROC may be temporarily placed on the run queue
1304 	 * but then chosen to run again by disp.  If the thread we're placing on
1305 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1306 	 * replacement process is actually scheduled in swtch().  In this
1307 	 * situation, curthread is the only thread that could be in the ONPROC
1308 	 * state.
1309 	 */
1310 	if ((!self) && (tp->t_waitrq == 0)) {
1311 		hrtime_t curtime;
1312 
1313 		curtime = gethrtime_unscaled();
1314 		(void) cpu_update_pct(tp, curtime);
1315 		tp->t_waitrq = curtime;
1316 	} else {
1317 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1318 	}
1319 
1320 	dp = cp->cpu_disp;
1321 	disp_lock_enter_high(&dp->disp_lock);
1322 
1323 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1324 	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1325 	    tpri, cp, tp);
1326 
1327 #ifndef NPROBE
1328 	/* Kernel probe */
1329 	if (tnf_tracing_active)
1330 		tnf_thread_queue(tp, cp, tpri);
1331 #endif /* NPROBE */
1332 
1333 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1334 
1335 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1336 	tp->t_disp_queue = dp;
1337 	tp->t_link = NULL;
1338 
1339 	dq = &dp->disp_q[tpri];
1340 	dp->disp_nrunnable++;
1341 	if (!bound)
1342 		dp->disp_steal = 0;
1343 	membar_enter();
1344 
1345 	if (dq->dq_sruncnt++ != 0) {
1346 		ASSERT(dq->dq_first != NULL);
1347 		dq->dq_last->t_link = tp;
1348 		dq->dq_last = tp;
1349 	} else {
1350 		ASSERT(dq->dq_first == NULL);
1351 		ASSERT(dq->dq_last == NULL);
1352 		dq->dq_first = dq->dq_last = tp;
1353 		BT_SET(dp->disp_qactmap, tpri);
1354 		if (tpri > dp->disp_maxrunpri) {
1355 			dp->disp_maxrunpri = tpri;
1356 			membar_enter();
1357 			cpu_resched(cp, tpri);
1358 		}
1359 	}
1360 
1361 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1362 		if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1363 			/*
1364 			 * If there are no other unbound threads on the
1365 			 * run queue, don't allow other CPUs to steal
1366 			 * this thread while we are in the middle of a
1367 			 * context switch. We may just switch to it
1368 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1369 			 * in swtch and swtch_to.
1370 			 */
1371 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1372 		}
1373 		dp->disp_max_unbound_pri = tpri;
1374 	}
1375 	(*disp_enq_thread)(cp, bound);
1376 }
1377 
1378 /*
1379  * Put the specified thread on the front of the dispatcher
1380  * queue corresponding to its current priority.
1381  *
1382  * Called with the thread in transition, onproc or stopped state
1383  * and locked (transition implies locked) and at high spl.
1384  * Returns with the thread in TS_RUN state and still locked.
1385  */
1386 void
1387 setfrontdq(kthread_t *tp)
1388 {
1389 	disp_t		*dp;
1390 	dispq_t		*dq;
1391 	cpu_t		*cp;
1392 	pri_t		tpri;
1393 	int		bound;
1394 
1395 	ASSERT(THREAD_LOCK_HELD(tp));
1396 	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1397 	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */
1398 
1399 	/*
1400 	 * If thread is "swapped" or on the swap queue don't
1401 	 * queue it, but wake sched.
1402 	 */
1403 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1404 		disp_swapped_setrun(tp);
1405 		return;
1406 	}
1407 
1408 	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1409 		bound = 1;
1410 	else
1411 		bound = 0;
1412 
1413 	tpri = DISP_PRIO(tp);
1414 	if (ncpus == 1)
1415 		cp = tp->t_cpu;
1416 	else if (!bound) {
1417 		if (tpri >= kpqpri) {
1418 			setkpdq(tp, SETKP_FRONT);
1419 			return;
1420 		}
1421 		cp = tp->t_cpu;
1422 		if (tp->t_cpupart == cp->cpu_part) {
1423 			/*
1424 			 * We'll generally let this thread continue to run
1425 			 * where it last ran, but will consider migration if:
1426 			 * - The thread last ran outside it's home lgroup.
1427 			 * - The CPU where it last ran is the target of an
1428 			 *   offline request (a thread_nomigrate() on the in
1429 			 *   motion CPU relies on this when forcing a preempt).
1430 			 * - The thread isn't the highest priority thread where
1431 			 *   it last ran, and it is considered not likely to
1432 			 *   have significant cache warmth.
1433 			 */
1434 			if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1435 			    (cp == cpu_inmotion)) {
1436 				cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1437 				    (tp == curthread) ? cp : NULL);
1438 			} else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1439 			    (!THREAD_HAS_CACHE_WARMTH(tp))) {
1440 				cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1441 				    NULL);
1442 			}
1443 		} else {
1444 			/*
1445 			 * Migrate to a cpu in the new partition.
1446 			 */
1447 			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1448 			    tp->t_lpl, tp->t_pri, NULL);
1449 		}
1450 		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1451 	} else {
1452 		/*
1453 		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1454 		 * a short time until weak binding that existed when the
1455 		 * strong binding was established has dropped) so we must
1456 		 * favour weak binding over strong.
1457 		 */
1458 		cp = tp->t_weakbound_cpu ?
1459 		    tp->t_weakbound_cpu : tp->t_bound_cpu;
1460 	}
1461 
1462 	/*
1463 	 * A thread that is ONPROC may be temporarily placed on the run queue
1464 	 * but then chosen to run again by disp.  If the thread we're placing on
1465 	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1466 	 * replacement process is actually scheduled in swtch().  In this
1467 	 * situation, curthread is the only thread that could be in the ONPROC
1468 	 * state.
1469 	 */
1470 	if ((tp != curthread) && (tp->t_waitrq == 0)) {
1471 		hrtime_t curtime;
1472 
1473 		curtime = gethrtime_unscaled();
1474 		(void) cpu_update_pct(tp, curtime);
1475 		tp->t_waitrq = curtime;
1476 	} else {
1477 		(void) cpu_update_pct(tp, gethrtime_unscaled());
1478 	}
1479 
1480 	dp = cp->cpu_disp;
1481 	disp_lock_enter_high(&dp->disp_lock);
1482 
1483 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1484 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1485 
1486 #ifndef NPROBE
1487 	/* Kernel probe */
1488 	if (tnf_tracing_active)
1489 		tnf_thread_queue(tp, cp, tpri);
1490 #endif /* NPROBE */
1491 
1492 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1493 
1494 	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
1495 	tp->t_disp_queue = dp;
1496 
1497 	dq = &dp->disp_q[tpri];
1498 	dp->disp_nrunnable++;
1499 	if (!bound)
1500 		dp->disp_steal = 0;
1501 	membar_enter();
1502 
1503 	if (dq->dq_sruncnt++ != 0) {
1504 		ASSERT(dq->dq_last != NULL);
1505 		tp->t_link = dq->dq_first;
1506 		dq->dq_first = tp;
1507 	} else {
1508 		ASSERT(dq->dq_last == NULL);
1509 		ASSERT(dq->dq_first == NULL);
1510 		tp->t_link = NULL;
1511 		dq->dq_first = dq->dq_last = tp;
1512 		BT_SET(dp->disp_qactmap, tpri);
1513 		if (tpri > dp->disp_maxrunpri) {
1514 			dp->disp_maxrunpri = tpri;
1515 			membar_enter();
1516 			cpu_resched(cp, tpri);
1517 		}
1518 	}
1519 
1520 	if (!bound && tpri > dp->disp_max_unbound_pri) {
1521 		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1522 		    cp == CPU) {
1523 			/*
1524 			 * If there are no other unbound threads on the
1525 			 * run queue, don't allow other CPUs to steal
1526 			 * this thread while we are in the middle of a
1527 			 * context switch. We may just switch to it
1528 			 * again right away. CPU_DISP_DONTSTEAL is cleared
1529 			 * in swtch and swtch_to.
1530 			 */
1531 			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1532 		}
1533 		dp->disp_max_unbound_pri = tpri;
1534 	}
1535 	(*disp_enq_thread)(cp, bound);
1536 }
1537 
1538 /*
1539  * Put a high-priority unbound thread on the kp queue
1540  */
1541 static void
1542 setkpdq(kthread_t *tp, int borf)
1543 {
1544 	dispq_t	*dq;
1545 	disp_t	*dp;
1546 	cpu_t	*cp;
1547 	pri_t	tpri;
1548 
1549 	tpri = DISP_PRIO(tp);
1550 
1551 	dp = &tp->t_cpupart->cp_kp_queue;
1552 	disp_lock_enter_high(&dp->disp_lock);
1553 
1554 	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1555 
1556 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1557 	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1558 	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
1559 	tp->t_disp_queue = dp;
1560 	dp->disp_nrunnable++;
1561 	dq = &dp->disp_q[tpri];
1562 
1563 	if (dq->dq_sruncnt++ != 0) {
1564 		if (borf == SETKP_BACK) {
1565 			ASSERT(dq->dq_first != NULL);
1566 			tp->t_link = NULL;
1567 			dq->dq_last->t_link = tp;
1568 			dq->dq_last = tp;
1569 		} else {
1570 			ASSERT(dq->dq_last != NULL);
1571 			tp->t_link = dq->dq_first;
1572 			dq->dq_first = tp;
1573 		}
1574 	} else {
1575 		if (borf == SETKP_BACK) {
1576 			ASSERT(dq->dq_first == NULL);
1577 			ASSERT(dq->dq_last == NULL);
1578 			dq->dq_first = dq->dq_last = tp;
1579 		} else {
1580 			ASSERT(dq->dq_last == NULL);
1581 			ASSERT(dq->dq_first == NULL);
1582 			tp->t_link = NULL;
1583 			dq->dq_first = dq->dq_last = tp;
1584 		}
1585 		BT_SET(dp->disp_qactmap, tpri);
1586 		if (tpri > dp->disp_max_unbound_pri)
1587 			dp->disp_max_unbound_pri = tpri;
1588 		if (tpri > dp->disp_maxrunpri) {
1589 			dp->disp_maxrunpri = tpri;
1590 			membar_enter();
1591 		}
1592 	}
1593 
1594 	cp = tp->t_cpu;
1595 	if (tp->t_cpupart != cp->cpu_part) {
1596 		/* migrate to a cpu in the new partition */
1597 		cp = tp->t_cpupart->cp_cpulist;
1598 	}
1599 	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1600 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1601 	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1602 
1603 #ifndef NPROBE
1604 	/* Kernel probe */
1605 	if (tnf_tracing_active)
1606 		tnf_thread_queue(tp, cp, tpri);
1607 #endif /* NPROBE */
1608 
1609 	if (cp->cpu_chosen_level < tpri)
1610 		cp->cpu_chosen_level = tpri;
1611 	cpu_resched(cp, tpri);
1612 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1613 	(*disp_enq_thread)(cp, 0);
1614 }
1615 
1616 /*
1617  * Remove a thread from the dispatcher queue if it is on it.
1618  * It is not an error if it is not found but we return whether
1619  * or not it was found in case the caller wants to check.
1620  */
1621 int
1622 dispdeq(kthread_t *tp)
1623 {
1624 	disp_t		*dp;
1625 	dispq_t		*dq;
1626 	kthread_t	*rp;
1627 	kthread_t	*trp;
1628 	kthread_t	**ptp;
1629 	int		tpri;
1630 
1631 	ASSERT(THREAD_LOCK_HELD(tp));
1632 
1633 	if (tp->t_state != TS_RUN)
1634 		return (0);
1635 
1636 	/*
1637 	 * The thread is "swapped" or is on the swap queue and
1638 	 * hence no longer on the run queue, so return true.
1639 	 */
1640 	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1641 		return (1);
1642 
1643 	tpri = DISP_PRIO(tp);
1644 	dp = tp->t_disp_queue;
1645 	ASSERT(tpri < dp->disp_npri);
1646 	dq = &dp->disp_q[tpri];
1647 	ptp = &dq->dq_first;
1648 	rp = *ptp;
1649 	trp = NULL;
1650 
1651 	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1652 
1653 	/*
1654 	 * Search for thread in queue.
1655 	 * Double links would simplify this at the expense of disp/setrun.
1656 	 */
1657 	while (rp != tp && rp != NULL) {
1658 		trp = rp;
1659 		ptp = &trp->t_link;
1660 		rp = trp->t_link;
1661 	}
1662 
1663 	if (rp == NULL) {
1664 		panic("dispdeq: thread not on queue");
1665 	}
1666 
1667 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1668 
1669 	/*
1670 	 * Found it so remove it from queue.
1671 	 */
1672 	if ((*ptp = rp->t_link) == NULL)
1673 		dq->dq_last = trp;
1674 
1675 	dp->disp_nrunnable--;
1676 	if (--dq->dq_sruncnt == 0) {
1677 		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1678 		if (dp->disp_nrunnable == 0) {
1679 			dp->disp_max_unbound_pri = -1;
1680 			dp->disp_maxrunpri = -1;
1681 		} else if (tpri == dp->disp_maxrunpri) {
1682 			int ipri;
1683 
1684 			ipri = bt_gethighbit(dp->disp_qactmap,
1685 			    dp->disp_maxrunpri >> BT_ULSHIFT);
1686 			if (ipri < dp->disp_max_unbound_pri)
1687 				dp->disp_max_unbound_pri = ipri;
1688 			dp->disp_maxrunpri = ipri;
1689 		}
1690 	}
1691 	tp->t_link = NULL;
1692 	THREAD_TRANSITION(tp);		/* put in intermediate state */
1693 	return (1);
1694 }
1695 
1696 
1697 /*
1698  * dq_sruninc and dq_srundec are public functions for
1699  * incrementing/decrementing the sruncnts when a thread on
1700  * a dispatcher queue is made schedulable/unschedulable by
1701  * resetting the TS_LOAD flag.
1702  *
1703  * The caller MUST have the thread lock and therefore the dispatcher
1704  * queue lock so that the operation which changes
1705  * the flag, the operation that checks the status of the thread to
1706  * determine if it's on a disp queue AND the call to this function
1707  * are one atomic operation with respect to interrupts.
1708  */
1709 
1710 /*
1711  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1712  */
1713 void
1714 dq_sruninc(kthread_t *t)
1715 {
1716 	ASSERT(t->t_state == TS_RUN);
1717 	ASSERT(t->t_schedflag & TS_LOAD);
1718 
1719 	THREAD_TRANSITION(t);
1720 	setfrontdq(t);
1721 }
1722 
1723 /*
1724  * See comment on calling conventions above.
1725  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1726  */
1727 void
1728 dq_srundec(kthread_t *t)
1729 {
1730 	ASSERT(t->t_schedflag & TS_LOAD);
1731 
1732 	(void) dispdeq(t);
1733 	disp_swapped_enq(t);
1734 }
1735 
1736 /*
1737  * Change the dispatcher lock of thread to the "swapped_lock"
1738  * and return with thread lock still held.
1739  *
1740  * Called with thread_lock held, in transition state, and at high spl.
1741  */
1742 void
1743 disp_swapped_enq(kthread_t *tp)
1744 {
1745 	ASSERT(THREAD_LOCK_HELD(tp));
1746 	ASSERT(tp->t_schedflag & TS_LOAD);
1747 
1748 	switch (tp->t_state) {
1749 	case TS_RUN:
1750 		disp_lock_enter_high(&swapped_lock);
1751 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1752 		break;
1753 	case TS_ONPROC:
1754 		disp_lock_enter_high(&swapped_lock);
1755 		THREAD_TRANSITION(tp);
1756 		wake_sched_sec = 1;		/* tell clock to wake sched */
1757 		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
1758 		break;
1759 	default:
1760 		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1761 	}
1762 }
1763 
1764 /*
1765  * This routine is called by setbackdq/setfrontdq if the thread is
1766  * not loaded or loaded and on the swap queue.
1767  *
1768  * Thread state TS_SLEEP implies that a swapped thread
1769  * has been woken up and needs to be swapped in by the swapper.
1770  *
1771  * Thread state TS_RUN, it implies that the priority of a swapped
1772  * thread is being increased by scheduling class (e.g. ts_update).
1773  */
1774 static void
1775 disp_swapped_setrun(kthread_t *tp)
1776 {
1777 	ASSERT(THREAD_LOCK_HELD(tp));
1778 	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1779 
1780 	switch (tp->t_state) {
1781 	case TS_SLEEP:
1782 		disp_lock_enter_high(&swapped_lock);
1783 		/*
1784 		 * Wakeup sched immediately (i.e., next tick) if the
1785 		 * thread priority is above maxclsyspri.
1786 		 */
1787 		if (DISP_PRIO(tp) > maxclsyspri)
1788 			wake_sched = 1;
1789 		else
1790 			wake_sched_sec = 1;
1791 		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1792 		break;
1793 	case TS_RUN:				/* called from ts_update */
1794 		break;
1795 	default:
1796 		panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1797 	}
1798 }
1799 
1800 /*
1801  *	Make a thread give up its processor.  Find the processor on
1802  *	which this thread is executing, and have that processor
1803  *	preempt.
1804  *
1805  *	We allow System Duty Cycle (SDC) threads to be preempted even if
1806  *	they are running at kernel priorities.  To implement this, we always
1807  *	set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
1808  *	calls cpu_surrender() very often, we only preempt if there is anyone
1809  *	competing with us.
1810  */
1811 void
1812 cpu_surrender(kthread_t *tp)
1813 {
1814 	cpu_t	*cpup;
1815 	int	max_pri;
1816 	int	max_run_pri;
1817 	klwp_t	*lwp;
1818 
1819 	ASSERT(THREAD_LOCK_HELD(tp));
1820 
1821 	if (tp->t_state != TS_ONPROC)
1822 		return;
1823 	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
1824 	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1825 	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1826 	if (max_pri < max_run_pri)
1827 		max_pri = max_run_pri;
1828 
1829 	if (tp->t_cid == sysdccid) {
1830 		uint_t t_pri = DISP_PRIO(tp);
1831 		if (t_pri > max_pri)
1832 			return;		/* we are not competing w/ anyone */
1833 		cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1834 	} else {
1835 		cpup->cpu_runrun = 1;
1836 		if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1837 			cpup->cpu_kprunrun = 1;
1838 		}
1839 	}
1840 
1841 	/*
1842 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1843 	 */
1844 	membar_enter();
1845 
1846 	DTRACE_SCHED1(surrender, kthread_t *, tp);
1847 
1848 	/*
1849 	 * Make the target thread take an excursion through trap()
1850 	 * to do preempt() (unless we're already in trap or post_syscall,
1851 	 * calling cpu_surrender via CL_TRAPRET).
1852 	 */
1853 	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1854 	    lwp->lwp_state != LWP_USER) {
1855 		aston(tp);
1856 		if (cpup != CPU)
1857 			poke_cpu(cpup->cpu_id);
1858 	}
1859 	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1860 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
1861 }
1862 
1863 /*
1864  * Commit to and ratify a scheduling decision
1865  */
1866 /*ARGSUSED*/
1867 static kthread_t *
1868 disp_ratify(kthread_t *tp, disp_t *kpq)
1869 {
1870 	pri_t	tpri, maxpri;
1871 	pri_t	maxkpri;
1872 	cpu_t	*cpup;
1873 
1874 	ASSERT(tp != NULL);
1875 	/*
1876 	 * Commit to, then ratify scheduling decision
1877 	 */
1878 	cpup = CPU;
1879 	if (cpup->cpu_runrun != 0)
1880 		cpup->cpu_runrun = 0;
1881 	if (cpup->cpu_kprunrun != 0)
1882 		cpup->cpu_kprunrun = 0;
1883 	if (cpup->cpu_chosen_level != -1)
1884 		cpup->cpu_chosen_level = -1;
1885 	membar_enter();
1886 	tpri = DISP_PRIO(tp);
1887 	maxpri = cpup->cpu_disp->disp_maxrunpri;
1888 	maxkpri = kpq->disp_maxrunpri;
1889 	if (maxpri < maxkpri)
1890 		maxpri = maxkpri;
1891 	if (tpri < maxpri) {
1892 		/*
1893 		 * should have done better
1894 		 * put this one back and indicate to try again
1895 		 */
1896 		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
1897 		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1898 		thread_lock_high(tp);
1899 		THREAD_TRANSITION(tp);
1900 		setfrontdq(tp);
1901 		thread_unlock_nopreempt(tp);
1902 
1903 		tp = NULL;
1904 	}
1905 	return (tp);
1906 }
1907 
1908 /*
1909  * See if there is any work on the dispatcher queue for other CPUs.
1910  * If there is, dequeue the best thread and return.
1911  */
1912 static kthread_t *
1913 disp_getwork(cpu_t *cp)
1914 {
1915 	cpu_t		*ocp;		/* other CPU */
1916 	cpu_t		*ocp_start;
1917 	cpu_t		*tcp;		/* target local CPU */
1918 	kthread_t	*tp;
1919 	kthread_t	*retval = NULL;
1920 	pri_t		maxpri;
1921 	disp_t		*kpq;		/* kp queue for this partition */
1922 	lpl_t		*lpl, *lpl_leaf;
1923 	int		leafidx, startidx;
1924 	hrtime_t	stealtime;
1925 	lgrp_id_t	local_id;
1926 
1927 	maxpri = -1;
1928 	tcp = NULL;
1929 
1930 	kpq = &cp->cpu_part->cp_kp_queue;
1931 	while (kpq->disp_maxrunpri >= 0) {
1932 		/*
1933 		 * Try to take a thread from the kp_queue.
1934 		 */
1935 		tp = (disp_getbest(kpq));
1936 		if (tp)
1937 			return (disp_ratify(tp, kpq));
1938 	}
1939 
1940 	kpreempt_disable();		/* protect the cpu_active list */
1941 
1942 	/*
1943 	 * Try to find something to do on another CPU's run queue.
1944 	 * Loop through all other CPUs looking for the one with the highest
1945 	 * priority unbound thread.
1946 	 *
1947 	 * On NUMA machines, the partition's CPUs are consulted in order of
1948 	 * distance from the current CPU. This way, the first available
1949 	 * work found is also the closest, and will suffer the least
1950 	 * from being migrated.
1951 	 */
1952 	lpl = lpl_leaf = cp->cpu_lpl;
1953 	local_id = lpl_leaf->lpl_lgrpid;
1954 	leafidx = startidx = 0;
1955 
1956 	/*
1957 	 * This loop traverses the lpl hierarchy. Higher level lpls represent
1958 	 * broader levels of locality
1959 	 */
1960 	do {
1961 		/* This loop iterates over the lpl's leaves */
1962 		do {
1963 			if (lpl_leaf != cp->cpu_lpl)
1964 				ocp = lpl_leaf->lpl_cpus;
1965 			else
1966 				ocp = cp->cpu_next_lpl;
1967 
1968 			/* This loop iterates over the CPUs in the leaf */
1969 			ocp_start = ocp;
1970 			do {
1971 				pri_t pri;
1972 
1973 				ASSERT(CPU_ACTIVE(ocp));
1974 
1975 				/*
1976 				 * End our stroll around this lpl if:
1977 				 *
1978 				 * - Something became runnable on the local
1979 				 *   queue...which also ends our stroll around
1980 				 *   the partition.
1981 				 *
1982 				 * - We happen across another idle CPU.
1983 				 *   Since it is patrolling the next portion
1984 				 *   of the lpl's list (assuming it's not
1985 				 *   halted, or busy servicing an interrupt),
1986 				 *   move to the next higher level of locality.
1987 				 */
1988 				if (cp->cpu_disp->disp_nrunnable != 0) {
1989 					kpreempt_enable();
1990 					return (NULL);
1991 				}
1992 				if (ocp->cpu_dispatch_pri == -1) {
1993 					if (ocp->cpu_disp_flags &
1994 					    CPU_DISP_HALTED ||
1995 					    ocp->cpu_intr_actv != 0)
1996 						continue;
1997 					else
1998 						goto next_level;
1999 				}
2000 
2001 				/*
2002 				 * If there's only one thread and the CPU
2003 				 * is in the middle of a context switch,
2004 				 * or it's currently running the idle thread,
2005 				 * don't steal it.
2006 				 */
2007 				if ((ocp->cpu_disp_flags &
2008 				    CPU_DISP_DONTSTEAL) &&
2009 				    ocp->cpu_disp->disp_nrunnable == 1)
2010 					continue;
2011 
2012 				pri = ocp->cpu_disp->disp_max_unbound_pri;
2013 				if (pri > maxpri) {
2014 					/*
2015 					 * Don't steal threads that we attempted
2016 					 * to steal recently until they're ready
2017 					 * to be stolen again.
2018 					 */
2019 					stealtime = ocp->cpu_disp->disp_steal;
2020 					if (stealtime == 0 ||
2021 					    stealtime - gethrtime() <= 0) {
2022 						maxpri = pri;
2023 						tcp = ocp;
2024 					} else {
2025 						/*
2026 						 * Don't update tcp, just set
2027 						 * the retval to T_DONTSTEAL, so
2028 						 * that if no acceptable CPUs
2029 						 * are found the return value
2030 						 * will be T_DONTSTEAL rather
2031 						 * then NULL.
2032 						 */
2033 						retval = T_DONTSTEAL;
2034 					}
2035 				}
2036 			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2037 
2038 			/*
2039 			 * Iterate to the next leaf lpl in the resource set
2040 			 * at this level of locality. If we hit the end of
2041 			 * the set, wrap back around to the beginning.
2042 			 *
2043 			 * Note: This iteration is NULL terminated for a reason
2044 			 * see lpl_topo_bootstrap() in lgrp.c for details.
2045 			 */
2046 			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2047 				leafidx = 0;
2048 				lpl_leaf = lpl->lpl_rset[leafidx];
2049 			}
2050 		} while (leafidx != startidx);
2051 
2052 next_level:
2053 		/*
2054 		 * Expand the search to include farther away CPUs (next
2055 		 * locality level). The closer CPUs that have already been
2056 		 * checked will be checked again. In doing so, idle CPUs
2057 		 * will tend to be more aggresive about stealing from CPUs
2058 		 * that are closer (since the closer CPUs will be considered
2059 		 * more often).
2060 		 * Begin at this level with the CPUs local leaf lpl.
2061 		 */
2062 		if ((lpl = lpl->lpl_parent) != NULL) {
2063 			leafidx = startidx = lpl->lpl_id2rset[local_id];
2064 			lpl_leaf = lpl->lpl_rset[leafidx];
2065 		}
2066 	} while (!tcp && lpl);
2067 
2068 	kpreempt_enable();
2069 
2070 	/*
2071 	 * If another queue looks good, and there is still nothing on
2072 	 * the local queue, try to transfer one or more threads
2073 	 * from it to our queue.
2074 	 */
2075 	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2076 		tp = disp_getbest(tcp->cpu_disp);
2077 		if (tp == NULL || tp == T_DONTSTEAL)
2078 			return (tp);
2079 		return (disp_ratify(tp, kpq));
2080 	}
2081 	return (retval);
2082 }
2083 
2084 
2085 /*
2086  * disp_fix_unbound_pri()
2087  *	Determines the maximum priority of unbound threads on the queue.
2088  *	The priority is kept for the queue, but is only increased, never
2089  *	reduced unless some CPU is looking for something on that queue.
2090  *
2091  *	The priority argument is the known upper limit.
2092  *
2093  *	Perhaps this should be kept accurately, but that probably means
2094  *	separate bitmaps for bound and unbound threads.  Since only idled
2095  *	CPUs will have to do this recalculation, it seems better this way.
2096  */
2097 static void
2098 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2099 {
2100 	kthread_t	*tp;
2101 	dispq_t		*dq;
2102 	ulong_t		*dqactmap = dp->disp_qactmap;
2103 	ulong_t		mapword;
2104 	int		wx;
2105 
2106 	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2107 
2108 	ASSERT(pri >= 0);			/* checked by caller */
2109 
2110 	/*
2111 	 * Start the search at the next lowest priority below the supplied
2112 	 * priority.  This depends on the bitmap implementation.
2113 	 */
2114 	do {
2115 		wx = pri >> BT_ULSHIFT;		/* index of word in map */
2116 
2117 		/*
2118 		 * Form mask for all lower priorities in the word.
2119 		 */
2120 		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2121 
2122 		/*
2123 		 * Get next lower active priority.
2124 		 */
2125 		if (mapword != 0) {
2126 			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2127 		} else if (wx > 0) {
2128 			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2129 			if (pri < 0)
2130 				break;
2131 		} else {
2132 			pri = -1;
2133 			break;
2134 		}
2135 
2136 		/*
2137 		 * Search the queue for unbound, runnable threads.
2138 		 */
2139 		dq = &dp->disp_q[pri];
2140 		tp = dq->dq_first;
2141 
2142 		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2143 			tp = tp->t_link;
2144 		}
2145 
2146 		/*
2147 		 * If a thread was found, set the priority and return.
2148 		 */
2149 	} while (tp == NULL);
2150 
2151 	/*
2152 	 * pri holds the maximum unbound thread priority or -1.
2153 	 */
2154 	if (dp->disp_max_unbound_pri != pri)
2155 		dp->disp_max_unbound_pri = pri;
2156 }
2157 
2158 /*
2159  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2160  * 	check if the CPU to which is was previously bound should have
2161  * 	its disp_max_unbound_pri increased.
2162  */
2163 void
2164 disp_adjust_unbound_pri(kthread_t *tp)
2165 {
2166 	disp_t *dp;
2167 	pri_t tpri;
2168 
2169 	ASSERT(THREAD_LOCK_HELD(tp));
2170 
2171 	/*
2172 	 * Don't do anything if the thread is not bound, or
2173 	 * currently not runnable or swapped out.
2174 	 */
2175 	if (tp->t_bound_cpu == NULL ||
2176 	    tp->t_state != TS_RUN ||
2177 	    tp->t_schedflag & TS_ON_SWAPQ)
2178 		return;
2179 
2180 	tpri = DISP_PRIO(tp);
2181 	dp = tp->t_bound_cpu->cpu_disp;
2182 	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2183 	if (tpri > dp->disp_max_unbound_pri)
2184 		dp->disp_max_unbound_pri = tpri;
2185 }
2186 
2187 /*
2188  * disp_getbest()
2189  *   De-queue the highest priority unbound runnable thread.
2190  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2191  *   Returns NULL if nothing found.
2192  *   Returns T_DONTSTEAL if the thread was not stealable.
2193  *   so that the caller will try again later.
2194  *
2195  *   Passed a pointer to a dispatch queue not associated with this CPU, and
2196  *   its type.
2197  */
2198 static kthread_t *
2199 disp_getbest(disp_t *dp)
2200 {
2201 	kthread_t	*tp;
2202 	dispq_t		*dq;
2203 	pri_t		pri;
2204 	cpu_t		*cp, *tcp;
2205 	boolean_t	allbound;
2206 
2207 	disp_lock_enter(&dp->disp_lock);
2208 
2209 	/*
2210 	 * If there is nothing to run, or the CPU is in the middle of a
2211 	 * context switch of the only thread, return NULL.
2212 	 */
2213 	tcp = dp->disp_cpu;
2214 	cp = CPU;
2215 	pri = dp->disp_max_unbound_pri;
2216 	if (pri == -1 ||
2217 	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2218 	    tcp->cpu_disp->disp_nrunnable == 1)) {
2219 		disp_lock_exit_nopreempt(&dp->disp_lock);
2220 		return (NULL);
2221 	}
2222 
2223 	dq = &dp->disp_q[pri];
2224 
2225 
2226 	/*
2227 	 * Assume that all threads are bound on this queue, and change it
2228 	 * later when we find out that it is not the case.
2229 	 */
2230 	allbound = B_TRUE;
2231 	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2232 		hrtime_t now, nosteal, rqtime;
2233 
2234 		/*
2235 		 * Skip over bound threads which could be here even
2236 		 * though disp_max_unbound_pri indicated this level.
2237 		 */
2238 		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2239 			continue;
2240 
2241 		/*
2242 		 * We've got some unbound threads on this queue, so turn
2243 		 * the allbound flag off now.
2244 		 */
2245 		allbound = B_FALSE;
2246 
2247 		/*
2248 		 * The thread is a candidate for stealing from its run queue. We
2249 		 * don't want to steal threads that became runnable just a
2250 		 * moment ago. This improves CPU affinity for threads that get
2251 		 * preempted for short periods of time and go back on the run
2252 		 * queue.
2253 		 *
2254 		 * We want to let it stay on its run queue if it was only placed
2255 		 * there recently and it was running on the same CPU before that
2256 		 * to preserve its cache investment. For the thread to remain on
2257 		 * its run queue, ALL of the following conditions must be
2258 		 * satisfied:
2259 		 *
2260 		 * - the disp queue should not be the kernel preemption queue
2261 		 * - delayed idle stealing should not be disabled
2262 		 * - nosteal_nsec should be non-zero
2263 		 * - it should run with user priority
2264 		 * - it should be on the run queue of the CPU where it was
2265 		 *   running before being placed on the run queue
2266 		 * - it should be the only thread on the run queue (to prevent
2267 		 *   extra scheduling latency for other threads)
2268 		 * - it should sit on the run queue for less than per-chip
2269 		 *   nosteal interval or global nosteal interval
2270 		 * - in case of CPUs with shared cache it should sit in a run
2271 		 *   queue of a CPU from a different chip
2272 		 *
2273 		 * The checks are arranged so that the ones that are faster are
2274 		 * placed earlier.
2275 		 */
2276 		if (tcp == NULL ||
2277 		    pri >= minclsyspri ||
2278 		    tp->t_cpu != tcp)
2279 			break;
2280 
2281 		/*
2282 		 * Steal immediately if, due to CMT processor architecture
2283 		 * migraiton between cp and tcp would incur no performance
2284 		 * penalty.
2285 		 */
2286 		if (pg_cmt_can_migrate(cp, tcp))
2287 			break;
2288 
2289 		nosteal = nosteal_nsec;
2290 		if (nosteal == 0)
2291 			break;
2292 
2293 		/*
2294 		 * Calculate time spent sitting on run queue
2295 		 */
2296 		now = gethrtime_unscaled();
2297 		rqtime = now - tp->t_waitrq;
2298 		scalehrtime(&rqtime);
2299 
2300 		/*
2301 		 * Steal immediately if the time spent on this run queue is more
2302 		 * than allowed nosteal delay.
2303 		 *
2304 		 * Negative rqtime check is needed here to avoid infinite
2305 		 * stealing delays caused by unlikely but not impossible
2306 		 * drifts between CPU times on different CPUs.
2307 		 */
2308 		if (rqtime > nosteal || rqtime < 0)
2309 			break;
2310 
2311 		DTRACE_PROBE4(nosteal, kthread_t *, tp,
2312 		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2313 		scalehrtime(&now);
2314 		/*
2315 		 * Calculate when this thread becomes stealable
2316 		 */
2317 		now += (nosteal - rqtime);
2318 
2319 		/*
2320 		 * Calculate time when some thread becomes stealable
2321 		 */
2322 		if (now < dp->disp_steal)
2323 			dp->disp_steal = now;
2324 	}
2325 
2326 	/*
2327 	 * If there were no unbound threads on this queue, find the queue
2328 	 * where they are and then return later. The value of
2329 	 * disp_max_unbound_pri is not always accurate because it isn't
2330 	 * reduced until another idle CPU looks for work.
2331 	 */
2332 	if (allbound)
2333 		disp_fix_unbound_pri(dp, pri);
2334 
2335 	/*
2336 	 * If we reached the end of the queue and found no unbound threads
2337 	 * then return NULL so that other CPUs will be considered.  If there
2338 	 * are unbound threads but they cannot yet be stolen, then
2339 	 * return T_DONTSTEAL and try again later.
2340 	 */
2341 	if (tp == NULL) {
2342 		disp_lock_exit_nopreempt(&dp->disp_lock);
2343 		return (allbound ? NULL : T_DONTSTEAL);
2344 	}
2345 
2346 	/*
2347 	 * Found a runnable, unbound thread, so remove it from queue.
2348 	 * dispdeq() requires that we have the thread locked, and we do,
2349 	 * by virtue of holding the dispatch queue lock.  dispdeq() will
2350 	 * put the thread in transition state, thereby dropping the dispq
2351 	 * lock.
2352 	 */
2353 
2354 #ifdef DEBUG
2355 	{
2356 		int	thread_was_on_queue;
2357 
2358 		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
2359 		ASSERT(thread_was_on_queue);
2360 	}
2361 
2362 #else /* DEBUG */
2363 	(void) dispdeq(tp);			/* drops disp_lock */
2364 #endif /* DEBUG */
2365 
2366 	/*
2367 	 * Reset the disp_queue steal time - we do not know what is the smallest
2368 	 * value across the queue is.
2369 	 */
2370 	dp->disp_steal = 0;
2371 
2372 	tp->t_schedflag |= TS_DONT_SWAP;
2373 
2374 	/*
2375 	 * Setup thread to run on the current CPU.
2376 	 */
2377 	tp->t_disp_queue = cp->cpu_disp;
2378 
2379 	cp->cpu_dispthread = tp;		/* protected by spl only */
2380 	cp->cpu_dispatch_pri = pri;
2381 
2382 	/*
2383 	 * There can be a memory synchronization race between disp_getbest()
2384 	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2385 	 * to preempt the current thread to run the enqueued thread while
2386 	 * disp_getbest() and disp_ratify() are changing the current thread
2387 	 * to the stolen thread. This may lead to a situation where
2388 	 * cpu_resched() tries to preempt the wrong thread and the
2389 	 * stolen thread continues to run on the CPU which has been tagged
2390 	 * for preemption.
2391 	 * Later the clock thread gets enqueued but doesn't get to run on the
2392 	 * CPU causing the system to hang.
2393 	 *
2394 	 * To avoid this, grabbing and dropping the disp_lock (which does
2395 	 * a memory barrier) is needed to synchronize the execution of
2396 	 * cpu_resched() with disp_getbest() and disp_ratify() and
2397 	 * synchronize the memory read and written by cpu_resched(),
2398 	 * disp_getbest(), and disp_ratify() with each other.
2399 	 *  (see CR#6482861 for more details).
2400 	 */
2401 	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2402 	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2403 
2404 	ASSERT(pri == DISP_PRIO(tp));
2405 
2406 	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2407 
2408 	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */
2409 
2410 	/*
2411 	 * Return with spl high so that swtch() won't need to raise it.
2412 	 * The disp_lock was dropped by dispdeq().
2413 	 */
2414 
2415 	return (tp);
2416 }
2417 
2418 /*
2419  * disp_bound_common() - common routine for higher level functions
2420  *	that check for bound threads under certain conditions.
2421  *	If 'threadlistsafe' is set then there is no need to acquire
2422  *	pidlock to stop the thread list from changing (eg, if
2423  *	disp_bound_* is called with cpus paused).
2424  */
2425 static int
2426 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2427 {
2428 	int		found = 0;
2429 	kthread_t	*tp;
2430 
2431 	ASSERT(flag);
2432 
2433 	if (!threadlistsafe)
2434 		mutex_enter(&pidlock);
2435 	tp = curthread;		/* faster than allthreads */
2436 	do {
2437 		if (tp->t_state != TS_FREE) {
2438 			/*
2439 			 * If an interrupt thread is busy, but the
2440 			 * caller doesn't care (i.e. BOUND_INTR is off),
2441 			 * then just ignore it and continue through.
2442 			 */
2443 			if ((tp->t_flag & T_INTR_THREAD) &&
2444 			    !(flag & BOUND_INTR))
2445 				continue;
2446 
2447 			/*
2448 			 * Skip the idle thread for the CPU
2449 			 * we're about to set offline.
2450 			 */
2451 			if (tp == cp->cpu_idle_thread)
2452 				continue;
2453 
2454 			/*
2455 			 * Skip the pause thread for the CPU
2456 			 * we're about to set offline.
2457 			 */
2458 			if (tp == cp->cpu_pause_thread)
2459 				continue;
2460 
2461 			if ((flag & BOUND_CPU) &&
2462 			    (tp->t_bound_cpu == cp ||
2463 			    tp->t_bind_cpu == cp->cpu_id ||
2464 			    tp->t_weakbound_cpu == cp)) {
2465 				found = 1;
2466 				break;
2467 			}
2468 
2469 			if ((flag & BOUND_PARTITION) &&
2470 			    (tp->t_cpupart == cp->cpu_part)) {
2471 				found = 1;
2472 				break;
2473 			}
2474 		}
2475 	} while ((tp = tp->t_next) != curthread && found == 0);
2476 	if (!threadlistsafe)
2477 		mutex_exit(&pidlock);
2478 	return (found);
2479 }
2480 
2481 /*
2482  * disp_bound_threads - return nonzero if threads are bound to the processor.
2483  *	Called infrequently.  Keep this simple.
2484  *	Includes threads that are asleep or stopped but not onproc.
2485  */
2486 int
2487 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2488 {
2489 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2490 }
2491 
2492 /*
2493  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2494  * to the given processor, including interrupt threads.
2495  */
2496 int
2497 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2498 {
2499 	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2500 }
2501 
2502 /*
2503  * disp_bound_partition - return nonzero if threads are bound to the same
2504  * partition as the processor.
2505  *	Called infrequently.  Keep this simple.
2506  *	Includes threads that are asleep or stopped but not onproc.
2507  */
2508 int
2509 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2510 {
2511 	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2512 }
2513 
2514 /*
2515  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2516  * threads to other CPUs.
2517  */
2518 void
2519 disp_cpu_inactive(cpu_t *cp)
2520 {
2521 	kthread_t	*tp;
2522 	disp_t		*dp = cp->cpu_disp;
2523 	dispq_t		*dq;
2524 	pri_t		pri;
2525 	int		wasonq;
2526 
2527 	disp_lock_enter(&dp->disp_lock);
2528 	while ((pri = dp->disp_max_unbound_pri) != -1) {
2529 		dq = &dp->disp_q[pri];
2530 		tp = dq->dq_first;
2531 
2532 		/*
2533 		 * Skip over bound threads.
2534 		 */
2535 		while (tp != NULL && tp->t_bound_cpu != NULL) {
2536 			tp = tp->t_link;
2537 		}
2538 
2539 		if (tp == NULL) {
2540 			/* disp_max_unbound_pri must be inaccurate, so fix it */
2541 			disp_fix_unbound_pri(dp, pri);
2542 			continue;
2543 		}
2544 
2545 		wasonq = dispdeq(tp);		/* drops disp_lock */
2546 		ASSERT(wasonq);
2547 		ASSERT(tp->t_weakbound_cpu == NULL);
2548 
2549 		setbackdq(tp);
2550 		/*
2551 		 * Called from cpu_offline:
2552 		 *
2553 		 * cp has already been removed from the list of active cpus
2554 		 * and tp->t_cpu has been changed so there is no risk of
2555 		 * tp ending up back on cp.
2556 		 *
2557 		 * Called from cpupart_move_cpu:
2558 		 *
2559 		 * The cpu has moved to a new cpupart.  Any threads that
2560 		 * were on it's dispatch queues before the move remain
2561 		 * in the old partition and can't run in the new partition.
2562 		 */
2563 		ASSERT(tp->t_cpu != cp);
2564 		thread_unlock(tp);
2565 
2566 		disp_lock_enter(&dp->disp_lock);
2567 	}
2568 	disp_lock_exit(&dp->disp_lock);
2569 }
2570 
2571 /*
2572  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2573  *	The hint passed in is used as a starting point so we don't favor
2574  *	CPU 0 or any other CPU.  The caller should pass in the most recently
2575  *	used CPU for the thread.
2576  *
2577  *	The lgroup and priority are used to determine the best CPU to run on
2578  *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
2579  *	the thread priority will indicate whether the thread will actually run
2580  *	there.  To pick the best CPU, the CPUs inside and outside of the given
2581  *	lgroup which are running the lowest priority threads are found.  The
2582  *	remote CPU is chosen only if the thread will not run locally on a CPU
2583  *	within the lgroup, but will run on the remote CPU. If the thread
2584  *	cannot immediately run on any CPU, the best local CPU will be chosen.
2585  *
2586  *	The lpl specified also identifies the cpu partition from which
2587  *	disp_lowpri_cpu should select a CPU.
2588  *
2589  *	curcpu is used to indicate that disp_lowpri_cpu is being called on
2590  *      behalf of the current thread. (curthread is looking for a new cpu)
2591  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2592  *      ignored.
2593  *
2594  *      If a cpu is the target of an offline request then try to avoid it.
2595  *
2596  *	This function must be called at either high SPL, or with preemption
2597  *	disabled, so that the "hint" CPU cannot be removed from the online
2598  *	CPU list while we are traversing it.
2599  */
2600 cpu_t *
2601 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2602 {
2603 	cpu_t	*bestcpu;
2604 	cpu_t	*besthomecpu;
2605 	cpu_t   *cp, *cpstart;
2606 
2607 	pri_t   bestpri;
2608 	pri_t   cpupri;
2609 
2610 	klgrpset_t	done;
2611 	klgrpset_t	cur_set;
2612 
2613 	lpl_t		*lpl_iter, *lpl_leaf;
2614 	int		i;
2615 
2616 	/*
2617 	 * Scan for a CPU currently running the lowest priority thread.
2618 	 * Cannot get cpu_lock here because it is adaptive.
2619 	 * We do not require lock on CPU list.
2620 	 */
2621 	ASSERT(hint != NULL);
2622 	ASSERT(lpl != NULL);
2623 	ASSERT(lpl->lpl_ncpu > 0);
2624 
2625 	/*
2626 	 * First examine local CPUs. Note that it's possible the hint CPU
2627 	 * passed in in remote to the specified home lgroup. If our priority
2628 	 * isn't sufficient enough such that we can run immediately at home,
2629 	 * then examine CPUs remote to our home lgroup.
2630 	 * We would like to give preference to CPUs closest to "home".
2631 	 * If we can't find a CPU where we'll run at a given level
2632 	 * of locality, we expand our search to include the next level.
2633 	 */
2634 	bestcpu = besthomecpu = NULL;
2635 	klgrpset_clear(done);
2636 	/* start with lpl we were passed */
2637 
2638 	lpl_iter = lpl;
2639 
2640 	do {
2641 
2642 		bestpri = SHRT_MAX;
2643 		klgrpset_clear(cur_set);
2644 
2645 		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2646 			lpl_leaf = lpl_iter->lpl_rset[i];
2647 			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2648 				continue;
2649 
2650 			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2651 
2652 			if (hint->cpu_lpl == lpl_leaf)
2653 				cp = cpstart = hint;
2654 			else
2655 				cp = cpstart = lpl_leaf->lpl_cpus;
2656 
2657 			do {
2658 				if (cp == curcpu)
2659 					cpupri = -1;
2660 				else if (cp == cpu_inmotion)
2661 					cpupri = SHRT_MAX;
2662 				else
2663 					cpupri = cp->cpu_dispatch_pri;
2664 				if (cp->cpu_disp->disp_maxrunpri > cpupri)
2665 					cpupri = cp->cpu_disp->disp_maxrunpri;
2666 				if (cp->cpu_chosen_level > cpupri)
2667 					cpupri = cp->cpu_chosen_level;
2668 				if (cpupri < bestpri) {
2669 					if (CPU_IDLING(cpupri)) {
2670 						ASSERT((cp->cpu_flags &
2671 						    CPU_QUIESCED) == 0);
2672 						return (cp);
2673 					}
2674 					bestcpu = cp;
2675 					bestpri = cpupri;
2676 				}
2677 			} while ((cp = cp->cpu_next_lpl) != cpstart);
2678 		}
2679 
2680 		if (bestcpu && (tpri > bestpri)) {
2681 			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2682 			return (bestcpu);
2683 		}
2684 		if (besthomecpu == NULL)
2685 			besthomecpu = bestcpu;
2686 		/*
2687 		 * Add the lgrps we just considered to the "done" set
2688 		 */
2689 		klgrpset_or(done, cur_set);
2690 
2691 	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2692 
2693 	/*
2694 	 * The specified priority isn't high enough to run immediately
2695 	 * anywhere, so just return the best CPU from the home lgroup.
2696 	 */
2697 	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2698 	return (besthomecpu);
2699 }
2700 
2701 /*
2702  * This routine provides the generic idle cpu function for all processors.
2703  * If a processor has some specific code to execute when idle (say, to stop
2704  * the pipeline and save power) then that routine should be defined in the
2705  * processors specific code (module_xx.c) and the global variable idle_cpu
2706  * set to that function.
2707  */
2708 static void
2709 generic_idle_cpu(void)
2710 {
2711 }
2712 
2713 /*ARGSUSED*/
2714 static void
2715 generic_enq_thread(cpu_t *cpu, int bound)
2716 {
2717 }
2718