1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28
29
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/signal.h>
34 #include <sys/user.h>
35 #include <sys/systm.h>
36 #include <sys/sysinfo.h>
37 #include <sys/var.h>
38 #include <sys/errno.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/inline.h>
42 #include <sys/disp.h>
43 #include <sys/class.h>
44 #include <sys/bitmap.h>
45 #include <sys/kmem.h>
46 #include <sys/cpuvar.h>
47 #include <sys/vtrace.h>
48 #include <sys/tnf.h>
49 #include <sys/cpupart.h>
50 #include <sys/lgrp.h>
51 #include <sys/pg.h>
52 #include <sys/cmt.h>
53 #include <sys/bitset.h>
54 #include <sys/schedctl.h>
55 #include <sys/atomic.h>
56 #include <sys/dtrace.h>
57 #include <sys/sdt.h>
58 #include <sys/archsystm.h>
59
60 #include <vm/as.h>
61
62 #define BOUND_CPU 0x1
63 #define BOUND_PARTITION 0x2
64 #define BOUND_INTR 0x4
65
66 /* Dispatch queue allocation structure and functions */
67 struct disp_queue_info {
68 disp_t *dp;
69 dispq_t *olddispq;
70 dispq_t *newdispq;
71 ulong_t *olddqactmap;
72 ulong_t *newdqactmap;
73 int oldnglobpris;
74 };
75 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
76 disp_t *dp);
77 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
78 static void disp_dq_free(struct disp_queue_info *dptr);
79
80 /* platform-specific routine to call when processor is idle */
81 static void generic_idle_cpu();
82 void (*idle_cpu)() = generic_idle_cpu;
83
84 /* routines invoked when a CPU enters/exits the idle loop */
85 static void idle_enter();
86 static void idle_exit();
87
88 /* platform-specific routine to call when thread is enqueued */
89 static void generic_enq_thread(cpu_t *, int);
90 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
91
92 pri_t kpreemptpri; /* priority where kernel preemption applies */
93 pri_t upreemptpri = 0; /* priority where normal preemption applies */
94 pri_t intr_pri; /* interrupt thread priority base level */
95
96 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */
97 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */
98 disp_t cpu0_disp; /* boot CPU's dispatch queue */
99 disp_lock_t swapped_lock; /* lock swapped threads and swap queue */
100 int nswapped; /* total number of swapped threads */
101 void disp_swapped_enq(kthread_t *tp);
102 static void disp_swapped_setrun(kthread_t *tp);
103 static void cpu_resched(cpu_t *cp, pri_t tpri);
104
105 /*
106 * If this is set, only interrupt threads will cause kernel preemptions.
107 * This is done by changing the value of kpreemptpri. kpreemptpri
108 * will either be the max sysclass pri or the min interrupt pri.
109 */
110 int only_intr_kpreempt;
111
112 extern void set_idle_cpu(int cpun);
113 extern void unset_idle_cpu(int cpun);
114 static void setkpdq(kthread_t *tp, int borf);
115 #define SETKP_BACK 0
116 #define SETKP_FRONT 1
117 /*
118 * Parameter that determines how recently a thread must have run
119 * on the CPU to be considered loosely-bound to that CPU to reduce
120 * cold cache effects. The interval is in hertz.
121 */
122 #define RECHOOSE_INTERVAL 3
123 int rechoose_interval = RECHOOSE_INTERVAL;
124
/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * be sitting on a run queue before it can be stolen by another CPU
 * to reduce migrations.
 *
 * nosteal_nsec should be set by platform code via
 * cmp_set_nosteal_interval() to an appropriate value.  It is set to
 * NOSTEAL_UNINITIALIZED here to indicate that it is uninitialized.
 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 */
136 #define NOSTEAL_UNINITIALIZED (-1)
137 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
138 extern void cmp_set_nosteal_interval(void);
139
140 id_t defaultcid; /* system "default" class; see dispadmin(1M) */
141
142 disp_lock_t transition_lock; /* lock on transitioning threads */
143 disp_lock_t stop_lock; /* lock on stopped threads */
144
145 static void cpu_dispqalloc(int numpris);
146
147 /*
148 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
149 * a thread because it was sitting on its run queue for a very short
150 * period of time.
151 */
152 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */
153
154 static kthread_t *disp_getwork(cpu_t *to);
155 static kthread_t *disp_getbest(disp_t *from);
156 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);
157
158 void swtch_to(kthread_t *);
159
160 /*
161 * dispatcher and scheduler initialization
162 */
163
164 /*
165 * disp_setup - Common code to calculate and allocate dispatcher
166 * variables and structures based on the maximum priority.
167 */
/*
 * disp_setup - Common code to calculate and allocate dispatcher
 *	variables and structures based on the maximum priority.
 *
 * Grows (never shrinks) the global dispatcher state when the highest
 * global priority has increased: the per-partition kpreempt queues,
 * the per-CPU dispatch queues, and the derived interrupt thread base
 * priority.  The caller must hold cpu_lock so that queue sizes cannot
 * change underneath us.
 */
static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
{
	pri_t	newnglobpris;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* Global priority range includes LOCK_LEVEL interrupt levels. */
	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;

	if (newnglobpris > oldnglobpris) {
		/*
		 * Allocate new kp queues for each CPU partition.
		 */
		cpupart_kpqalloc(newnglobpris);

		/*
		 * Allocate new dispatch queues for each CPU.
		 */
		cpu_dispqalloc(newnglobpris);

		/*
		 * compute new interrupt thread base priority
		 */
		intr_pri = maxglobpri;
		if (only_intr_kpreempt) {
			/*
			 * Only interrupt threads should cause kernel
			 * preemption; set kpreemptpri just above the
			 * highest non-interrupt priority.
			 */
			kpreemptpri = intr_pri + 1;
			if (kpqpri == KPQPRI)
				kpqpri = kpreemptpri;
		}
		/* Publish the new size last, after queues exist. */
		v.v_nglobpris = newnglobpris;
	}
}
200
201 /*
202 * dispinit - Called to initialize all loaded classes and the
203 * dispatcher framework.
204 */
/*
 * dispinit - Called to initialize all loaded classes and the
 *	dispatcher framework.
 *
 * Runs once at boot: initializes dispatcher locks, the default CPU
 * partition, every pre-installed scheduling class, the kernel
 * preemption priority, and finally loads the default scheduling class.
 */
void
dispinit(void)
{
	id_t	cid;
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	maxglobpri = -1;

	/*
	 * Initialize transition lock, which will always be set.
	 */
	DISP_LOCK_INIT(&transition_lock);
	disp_lock_enter_high(&transition_lock);
	DISP_LOCK_INIT(&stop_lock);

	mutex_enter(&cpu_lock);
	/* Boot CPU's queue starts empty: no runnable threads yet. */
	CPU->cpu_disp->disp_maxrunpri = -1;
	CPU->cpu_disp->disp_max_unbound_pri = -1;

	/*
	 * Initialize the default CPU partition.
	 */
	cpupart_initialize_default();
	/*
	 * Call the class specific initialization functions for
	 * all pre-installed schedulers.
	 *
	 * We pass the size of a class specific parameter
	 * buffer to each of the initialization functions
	 * to try to catch problems with backward compatibility
	 * of class modules.
	 *
	 * For example a new class module running on an old system
	 * which didn't provide sufficiently large parameter buffers
	 * would be bad news. Class initialization modules can check for
	 * this and take action if they detect a problem.
	 */

	for (cid = 0; cid < nclass; cid++) {
		sclass_t *sc;

		sc = &sclass[cid];
		if (SCHED_INSTALLED(sc)) {
			/* cl_init returns the class's max global priority. */
			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
			    &sc->cl_funcs);
			if (cl_maxglobpri > maxglobpri)
				maxglobpri = cl_maxglobpri;
		}
	}

	/*
	 * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is
	 * to say, maxclsyspri + 1.  However, over time, the system has used
	 * more and more asynchronous kernel threads, with an increasing number
	 * of these doing work on direct behalf of higher-level software (e.g.,
	 * network processing).  This has led to potential priority inversions:
	 * threads doing low-priority lengthy kernel work can effectively
	 * delay kernel-level processing of higher-priority data.  To minimize
	 * such inversions, we set kpreemptpri to be v_maxsyspri; anything in
	 * the kernel that runs at maxclsyspri will therefore induce kernel
	 * preemption, and this priority should be used if/when an asynchronous
	 * thread (or, as is often the case, task queue) is performing a task
	 * on behalf of higher-level software (or any task that is otherwise
	 * latency-sensitive).
	 */
	kpreemptpri = (pri_t)v.v_maxsyspri;
	if (kpqpri == KPQPRI)
		kpqpri = kpreemptpri;

	/* At least one scheduling class must have been installed. */
	ASSERT(maxglobpri >= 0);
	disp_setup(maxglobpri, 0);

	mutex_exit(&cpu_lock);

	/*
	 * Platform specific sticky scheduler setup.
	 */
	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
		cmp_set_nosteal_interval();

	/*
	 * Get the default class ID; this may be later modified via
	 * dispadmin(1M).  This will load the class (normally TS) and that will
	 * call disp_add(), which is why we had to drop cpu_lock first.
	 */
	if (getcid(defaultclass, &defaultcid) != 0) {
		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
		    defaultclass);
	}
}
296
297 /*
298 * disp_add - Called with class pointer to initialize the dispatcher
299 * for a newly loaded class.
300 */
301 void
disp_add(sclass_t * clp)302 disp_add(sclass_t *clp)
303 {
304 pri_t maxglobpri;
305 pri_t cl_maxglobpri;
306
307 mutex_enter(&cpu_lock);
308 /*
309 * Initialize the scheduler class.
310 */
311 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
312 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
313 if (cl_maxglobpri > maxglobpri)
314 maxglobpri = cl_maxglobpri;
315
316 /*
317 * Save old queue information. Since we're initializing a
318 * new scheduling class which has just been loaded, then
319 * the size of the dispq may have changed. We need to handle
320 * that here.
321 */
322 disp_setup(maxglobpri, v.v_nglobpris);
323
324 mutex_exit(&cpu_lock);
325 }
326
327
328 /*
329 * For each CPU, allocate new dispatch queues
330 * with the stated number of priorities.
331 */
/*
 * For each CPU, allocate new dispatch queues
 * with the stated number of priorities.
 *
 * Three phases, and the ordering is critical: (1) allocate everything
 * while CPUs are running, because kmem_alloc may sleep; (2) pause all
 * CPUs and swap in the new queues; (3) restart CPUs and only then free
 * the old queues, since kmem_free may also sleep.
 */
static void
cpu_dispqalloc(int numpris)
{
	cpu_t	*cpup;
	struct disp_queue_info	*disp_mem;
	int i, num;

	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_mem = kmem_zalloc(NCPU *
	    sizeof (struct disp_queue_info), KM_SLEEP);

	/*
	 * This routine must allocate all of the memory before stopping
	 * the cpus because it must not sleep in kmem_alloc while the
	 * CPUs are stopped.  Locks they hold will not be freed until they
	 * are restarted.
	 */
	i = 0;
	cpup = cpu_list;
	do {
		/* Walk the circular cpu_list once, one entry per CPU. */
		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
		i++;
		cpup = cpup->cpu_next;
	} while (cpup != cpu_list);
	num = i;

	/* Swap in the new queues while no other CPU can touch them. */
	pause_cpus(NULL, NULL);
	for (i = 0; i < num; i++)
		disp_dq_assign(&disp_mem[i], numpris);
	start_cpus();

	/*
	 * I must free all of the memory after starting the cpus because
	 * I can not risk sleeping in kmem_free while the cpus are stopped.
	 */
	for (i = 0; i < num; i++)
		disp_dq_free(&disp_mem[i]);

	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
}
373
374 static void
disp_dq_alloc(struct disp_queue_info * dptr,int numpris,disp_t * dp)375 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
376 {
377 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
378 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
379 sizeof (long), KM_SLEEP);
380 dptr->dp = dp;
381 }
382
/*
 * Install the queues previously allocated by disp_dq_alloc() into the
 * disp queue recorded in dptr, copying over any existing contents and
 * saving the old queues in dptr for disp_dq_free().
 *
 * Called with CPUs paused, so it must not block (hence kcopy, not bcopy).
 */
static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
	disp_t	*dp;

	dp = dptr->dp;
	/* Stash the old queue state so disp_dq_free() can release it. */
	dptr->olddispq = dp->disp_q;
	dptr->olddqactmap = dp->disp_qactmap;
	dptr->oldnglobpris = dp->disp_npri;

	/* Queues only ever grow; see disp_setup(). */
	ASSERT(dptr->oldnglobpris < numpris);

	if (dptr->olddispq != NULL) {
		/*
		 * Use kcopy because bcopy is platform-specific
		 * and could block while we might have paused the cpus.
		 */
		(void) kcopy(dptr->olddispq, dptr->newdispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
		    sizeof (long));
	}
	dp->disp_q = dptr->newdispq;
	dp->disp_qactmap = dptr->newdqactmap;
	dp->disp_q_limit = &dptr->newdispq[numpris];
	dp->disp_npri = numpris;
}
411
412 static void
disp_dq_free(struct disp_queue_info * dptr)413 disp_dq_free(struct disp_queue_info *dptr)
414 {
415 if (dptr->olddispq != NULL)
416 kmem_free(dptr->olddispq,
417 dptr->oldnglobpris * sizeof (dispq_t));
418 if (dptr->olddqactmap != NULL)
419 kmem_free(dptr->olddqactmap,
420 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
421 }
422
423 /*
424 * For a newly created CPU, initialize the dispatch queue.
425 * This is called before the CPU is known through cpu[] or on any lists.
426 */
427 void
disp_cpu_init(cpu_t * cp)428 disp_cpu_init(cpu_t *cp)
429 {
430 disp_t *dp;
431 dispq_t *newdispq;
432 ulong_t *newdqactmap;
433
434 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */
435
436 if (cp == cpu0_disp.disp_cpu)
437 dp = &cpu0_disp;
438 else
439 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
440 bzero(dp, sizeof (disp_t));
441 cp->cpu_disp = dp;
442 dp->disp_cpu = cp;
443 dp->disp_maxrunpri = -1;
444 dp->disp_max_unbound_pri = -1;
445 DISP_LOCK_INIT(&cp->cpu_thread_lock);
446 /*
447 * Allocate memory for the dispatcher queue headers
448 * and the active queue bitmap.
449 */
450 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
451 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
452 sizeof (long), KM_SLEEP);
453 dp->disp_q = newdispq;
454 dp->disp_qactmap = newdqactmap;
455 dp->disp_q_limit = &newdispq[v.v_nglobpris];
456 dp->disp_npri = v.v_nglobpris;
457 }
458
459 void
disp_cpu_fini(cpu_t * cp)460 disp_cpu_fini(cpu_t *cp)
461 {
462 ASSERT(MUTEX_HELD(&cpu_lock));
463
464 disp_kp_free(cp->cpu_disp);
465 if (cp->cpu_disp != &cpu0_disp)
466 kmem_free(cp->cpu_disp, sizeof (disp_t));
467 }
468
469 /*
470 * Allocate new, larger kpreempt dispatch queue to replace the old one.
471 */
472 void
disp_kp_alloc(disp_t * dq,pri_t npri)473 disp_kp_alloc(disp_t *dq, pri_t npri)
474 {
475 struct disp_queue_info mem_info;
476
477 if (npri > dq->disp_npri) {
478 /*
479 * Allocate memory for the new array.
480 */
481 disp_dq_alloc(&mem_info, npri, dq);
482
483 /*
484 * We need to copy the old structures to the new
485 * and free the old.
486 */
487 disp_dq_assign(&mem_info, npri);
488 disp_dq_free(&mem_info);
489 }
490 }
491
492 /*
493 * Free dispatch queue.
494 * Used for the kpreempt queues for a removed CPU partition and
495 * for the per-CPU queues of deleted CPUs.
496 */
497 void
disp_kp_free(disp_t * dq)498 disp_kp_free(disp_t *dq)
499 {
500 struct disp_queue_info mem_info;
501
502 mem_info.olddispq = dq->disp_q;
503 mem_info.olddqactmap = dq->disp_qactmap;
504 mem_info.oldnglobpris = dq->disp_npri;
505 disp_dq_free(&mem_info);
506 }
507
508 /*
509 * End dispatcher and scheduler initialization.
510 */
511
512 /*
513 * See if there's anything to do other than remain idle.
514 * Return non-zero if there is.
515 *
516 * This function must be called with high spl, or with
517 * kernel preemption disabled to prevent the partition's
518 * active cpu list from changing while being traversed.
519 *
520 * This is essentially a simpler version of disp_getwork()
521 * to be called by CPUs preparing to "halt".
522 */
/*
 * See if there's anything to do other than remain idle.
 * Return non-zero if there is.
 *
 * This function must be called with high spl, or with
 * kernel preemption disabled to prevent the partition's
 * active cpu list from changing while being traversed.
 *
 * This is essentially a simpler version of disp_getwork()
 * to be called by CPUs preparing to "halt".
 */
int
disp_anywork(void)
{
	cpu_t		*cp = CPU;
	cpu_t		*ocp;
	/* volatile: re-read each iteration; other CPUs may enqueue work */
	volatile int	*local_nrunnable = &cp->cpu_disp->disp_nrunnable;

	if (!(cp->cpu_flags & CPU_OFFLINE)) {
		/* Any runnable thread anywhere in the partition? */
		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
			return (1);

		for (ocp = cp->cpu_next_part; ocp != cp;
		    ocp = ocp->cpu_next_part) {
			ASSERT(CPU_ACTIVE(ocp));

			/*
			 * Something has appeared on the local run queue.
			 */
			if (*local_nrunnable > 0)
				return (1);
			/*
			 * If we encounter another idle CPU that will
			 * soon be trolling around through disp_anywork()
			 * terminate our walk here and let this other CPU
			 * patrol the next part of the list.
			 */
			if (ocp->cpu_dispatch_pri == -1 &&
			    (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
				return (0);
			/*
			 * Work can be taken from another CPU if:
			 *	- There is unbound work on the run queue
			 *	- That work isn't a thread undergoing a
			 *	  context switch on an otherwise empty queue.
			 *	- The CPU isn't running the idle loop.
			 */
			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
			    ocp->cpu_disp->disp_nrunnable == 1) &&
			    ocp->cpu_dispatch_pri != -1)
				return (1);
		}
	}
	return (0);
}
568
569 /*
570 * Called when CPU enters the idle loop
571 */
572 static void
idle_enter()573 idle_enter()
574 {
575 cpu_t *cp = CPU;
576
577 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
578 CPU_STATS_ADDQ(cp, sys, idlethread, 1);
579 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */
580 }
581
582 /*
583 * Called when CPU exits the idle loop
584 */
585 static void
idle_exit()586 idle_exit()
587 {
588 cpu_t *cp = CPU;
589
590 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
591 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
592 }
593
594 /*
595 * Idle loop.
596 */
/*
 * Idle loop.
 *
 * Body of each CPU's idle thread.  Never returns: alternates between
 * waiting for work (via the platform idle_cpu hook) and switching to a
 * runnable thread found either on the local queue (swtch) or stolen
 * from another CPU (swtch_to).
 */
void
idle()
{
	struct cpu	*cp = CPU;		/* pointer to this CPU */
	kthread_t	*t;			/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			/* Local work: let disp() pick the best thread. */
			idle_exit();
			swtch();
		} else {
			/* Offline CPUs may not steal from other queues. */
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/*
			 * If there was a thread but we couldn't steal
			 * it, then keep trying.
			 */
			if (t == T_DONTSTEAL)
				continue;
			idle_exit();
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}
667
668
669 /*
670 * Preempt the currently running thread in favor of the highest
671 * priority thread. The class of the current thread controls
672 * where it goes on the dispatcher queues. If panicking, turn
673 * preemption off.
674 */
/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues.  If panicking, turn
 * preemption off.
 */
void
preempt()
{
	kthread_t 	*t = curthread;
	klwp_t 		*lwp = ttolwp(curthread);

	if (panicstr)
		return;

	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

	thread_lock(t);

	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
		/*
		 * this thread has already been chosen to be run on
		 * another CPU. Clear kprunrun on this CPU since we're
		 * already headed for swtch().
		 */
		CPU->cpu_kprunrun = 0;
		thread_unlock_nopreempt(t);
		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
	} else {
		/* Count an involuntary context switch for this LWP. */
		if (lwp != NULL)
			lwp->lwp_ru.nivcsw++;
		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
		THREAD_TRANSITION(t);
		/* Let the scheduling class re-queue the thread. */
		CL_PREEMPT(t);
		DTRACE_SCHED(preempt);
		thread_unlock_nopreempt(t);

		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

		swtch();		/* clears CPU->cpu_runrun via disp() */
	}
}
711
712 extern kthread_t *thread_unpin();
713
714 /*
715 * disp() - find the highest priority thread for this processor to run, and
716 * set it in TS_ONPROC state so that resume() can be called to run it.
717 */
/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 *
 * Checks the partition-wide kpreempt queue first, then the CPU's own
 * queue; falls back to work stealing (disp_getwork) and finally to the
 * idle thread.  Returns with spl raised to high.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		/* Confirm nothing better appeared while we dequeued. */
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL ||
			    tp == T_DONTSTEAL) {
				/* Nothing to steal either: go idle. */
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		return (tp);
	}

	/* Take the first thread off the highest-priority local queue. */
	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.   If nrunnable != 0 just took the
		 * last runnable thread off the
		 * highest queue, so recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			/* Highest remaining active queue in the bitmap. */
			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	/* If higher-priority kpq work appeared meanwhile, start over. */
	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	return (tp);
}
855
856 /*
857 * swtch()
858 * Find best runnable thread and run it.
859 * Called with the current thread already switched to a new state,
860 * on a sleep queue, run queue, stopped, and not zombied.
861 * May be called at any spl level less than or equal to LOCK_LEVEL.
862 * Always drops spl to the base level (spl0()).
863 */
/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Setup and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			hrtime_t now;

			now = gethrtime_unscaled();
			pg_ev_thread_swtch(cp, now, t, next);

			/*
			 * If t was previously in the TS_ONPROC state,
			 * setfrontdq and setbackdq won't have set its t_waitrq.
			 * Since we now finally know that we're switching away
			 * from this thread, set its t_waitrq if it is on a run
			 * queue.
			 */
			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
				t->t_waitrq = now;
			}

			/*
			 * restore mstate of thread that we are switching to
			 */
			restore_mstate(next);

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			/* disp() chose us again; no context switch occurs. */
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);
			/*
			 * Threads that enqueue themselves on a run queue defer
			 * setting t_waitrq. It is then either set in swtch()
			 * when the CPU is actually yielded, or not at all if it
			 * is remaining on the CPU.
			 * There is however a window between where the thread
			 * placed itself on a run queue, and where it selects
			 * itself in disp(), where a third party (eg. clock()
			 * doing tick processing) may have re-enqueued this
			 * thread, setting t_waitrq in the process. We detect
			 * this race by noticing that despite switching to
			 * ourself, our t_waitrq has been set, and should be
			 * cleared.
			 */
			if (t->t_waitrq != 0)
				t->t_waitrq = 0;

			pg_ev_thread_remain(cp, t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}
971
972 /*
973 * swtch_from_zombie()
974 * Special case of swtch(), which allows checks for TS_ZOMB to be
975 * eliminated from normal resume.
976 * Find best runnable thread and run it.
977 * Called with the current thread zombied.
978 * Zombies cannot migrate, so CPU references are safe.
979 */
/*
 * swtch_from_zombie()
 *	Special case of swtch(), which allows checks for TS_ZOMB to be
 *	eliminated from normal resume.
 *	Find best runnable thread and run it.
 *	Called with the current thread zombied.
 *	Zombies cannot migrate, so CPU references are safe.
 */
void
swtch_from_zombie()
{
	kthread_t	*next;
	cpu_t		*cpu = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	ASSERT(curthread->t_state == TS_ZOMB);

	next = disp();			/* returns with spl high */
	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
	/* A zombie can never be selected to run again. */
	ASSERT(next != curthread);
	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);

	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume_from_zombie(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we certainly will not
	 * return here
	 */
}
1010
1011 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
1012
1013 /*
1014 * search_disp_queues()
1015 * Search the given dispatch queues for thread tp.
1016 * Return 1 if tp is found, otherwise return 0.
1017 */
1018 static int
search_disp_queues(disp_t * dp,kthread_t * tp)1019 search_disp_queues(disp_t *dp, kthread_t *tp)
1020 {
1021 dispq_t *dq;
1022 dispq_t *eq;
1023
1024 disp_lock_enter_high(&dp->disp_lock);
1025
1026 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1027 kthread_t *rp;
1028
1029 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1030
1031 for (rp = dq->dq_first; rp; rp = rp->t_link)
1032 if (tp == rp) {
1033 disp_lock_exit_high(&dp->disp_lock);
1034 return (1);
1035 }
1036 }
1037 disp_lock_exit_high(&dp->disp_lock);
1038
1039 return (0);
1040 }
1041
1042 /*
1043 * thread_on_queue()
1044 * Search all per-CPU dispatch queues and all partition-wide kpreempt
1045 * queues for thread tp. Return 1 if tp is found, otherwise return 0.
1046 */
/*
 * thread_on_queue()
 *	Search all per-CPU dispatch queues and all partition-wide kpreempt
 *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
 *
 * Caller must be at DISP_LEVEL or above so the CPU and partition lists
 * cannot change during the walk.
 */
static int
thread_on_queue(kthread_t *tp)
{
	cpu_t		*cp;
	struct cpupart	*part;

	ASSERT(getpil() >= DISP_LEVEL);

	/*
	 * Search the per-CPU dispatch queues for tp.
	 */
	cp = CPU;
	do {
		/* cpu_next_onln forms a circular list of online CPUs. */
		if (search_disp_queues(cp->cpu_disp, tp))
			return (1);
	} while ((cp = cp->cpu_next_onln) != CPU);

	/*
	 * Search the partition-wide kpreempt queues for tp.
	 */
	part = CPU->cpu_part;
	do {
		/* cp_next forms a circular list of partitions. */
		if (search_disp_queues(&part->cp_kp_queue, tp))
			return (1);
	} while ((part = part->cp_next) != CPU->cpu_part);

	return (0);
}
1075
1076 #else
1077
1078 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */
1079
1080 #endif /* DEBUG */
1081
1082 /*
1083 * like swtch(), but switch to a specified thread taken from another CPU.
1084 * called with spl high..
1085 */
/*
 * like swtch(), but switch to a specified thread taken from another CPU.
 *	called with spl high..
 *
 * Unlike swtch(), the target thread is already chosen (typically stolen
 * by the idle loop via disp_getwork()), so disp() is not consulted.
 */
void
swtch_to(kthread_t *next)
{
	cpu_t			*cp = CPU;
	hrtime_t		now;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	/*
	 * Update context switch statistics.
	 */
	CPU_STATS_ADDQ(cp, sys, pswitch, 1);

	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	now = gethrtime_unscaled();
	pg_ev_thread_swtch(cp, now, curthread, next);

	/* OK to steal anything left on run queue */
	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

	/* record last execution time */
	cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();

	/*
	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
	 * won't have set its t_waitrq. Since we now finally know that we're
	 * switching away from this thread, set its t_waitrq if it is on a run
	 * queue.
	 */
	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
		curthread->t_waitrq = now;
	}

	/* restore next thread to previously running microstate */
	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we may not
	 * return here
	 */
}
1133
1134 #define CPU_IDLING(pri) ((pri) == -1)
1135
1136 static void
cpu_resched(cpu_t * cp,pri_t tpri)1137 cpu_resched(cpu_t *cp, pri_t tpri)
1138 {
1139 int call_poke_cpu = 0;
1140 pri_t cpupri = cp->cpu_dispatch_pri;
1141
1142 if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1143 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1144 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1145 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1146 cp->cpu_runrun = 1;
1147 aston(cp->cpu_dispthread);
1148 if (tpri < kpreemptpri && cp != CPU)
1149 call_poke_cpu = 1;
1150 }
1151 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1152 cp->cpu_kprunrun = 1;
1153 if (cp != CPU)
1154 call_poke_cpu = 1;
1155 }
1156 }
1157
1158 /*
1159 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1160 */
1161 membar_enter();
1162
1163 if (call_poke_cpu)
1164 poke_cpu(cp->cpu_id);
1165 }
1166
1167 /*
1168 * setbackdq() keeps runqs balanced such that the difference in length
1169 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1170 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1171 * must match. When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1172 * try to keep runqs perfectly balanced regardless of the thread priority.
1173 */
1174 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */
1175 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */
1176 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1177
1178 /*
1179 * Macro that evaluates to true if it is likely that the thread has cache
1180 * warmth. This is based on the amount of time that has elapsed since the
1181 * thread last ran. If that amount of time is less than "rechoose_interval"
1182 * ticks, then we decide that the thread has enough cache warmth to warrant
1183 * some affinity for t->t_cpu.
1184 */
1185 #define THREAD_HAS_CACHE_WARMTH(thread) \
1186 ((thread == curthread) || \
1187 ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setbackdq(kthread_t *tp)
{
	dispq_t	*dq;
	disp_t		*dp;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;
	boolean_t	self;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	self = (tp == curthread);

	/* Bound threads must go to their bound CPU's queue. */
	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		/* High priority unbound threads go on the kp queue. */
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_BACK);
			return;
		}

		/*
		 * We'll generally let this thread continue to run where
		 * it last ran...but will consider migration if:
		 * - The thread probably doesn't have much cache warmth.
		 * - The CPU where it last ran is the target of an offline
		 *   request.
		 * - The thread last ran outside its home lgroup.
		 */
		if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
		    (tp->t_cpu == cpu_inmotion)) {
			cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
		} else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
			cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
			    self ? tp->t_cpu : NULL);
		} else {
			cp = tp->t_cpu;
		}

		if (tp->t_cpupart == cp->cpu_part) {
			int	qlen;

			/*
			 * Perform any CMT load balancing
			 */
			cp = cmt_balance(tp, cp);

			/*
			 * Balance across the run queues: if this CPU's
			 * queue at tpri is noticeably longer than a
			 * neighbor's, enqueue on the neighbor instead.
			 */
			qlen = RUNQ_LEN(cp, tpri);
			if (tpri >= RUNQ_MATCH_PRI &&
			    !(tp->t_schedflag & TS_RUNQMATCH))
				qlen -= RUNQ_MAX_DIFF;
			if (qlen > 0) {
				cpu_t *newcp;

				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
					newcp = cp->cpu_next_part;
				} else if ((newcp = cp->cpu_next_lpl) == cp) {
					newcp = cp->cpu_next_part;
				}

				if (RUNQ_LEN(newcp, tpri) < qlen) {
					DTRACE_PROBE3(runq__balance,
					    kthread_t *, tp,
					    cpu_t *, cp, cpu_t *, newcp);
					cp = newcp;
				}
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}
	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((!self) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
	    tpri, cp, tp);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	if (!bound)
		dp->disp_steal = 0;
	/* make queue updates visible before other CPUs inspect the queue */
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		/* Queue non-empty at this priority: append to the tail. */
		ASSERT(dq->dq_first != NULL);
		dq->dq_last->t_link = tp;
		dq->dq_last = tp;
	} else {
		/* First thread at this priority: set bitmap and maxrunpri. */
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}
1377
/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setfrontdq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	/* Bound threads must go to their bound CPU's queue. */
	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		/* High priority unbound threads go on the kp queue. */
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_FRONT);
			return;
		}
		cp = tp->t_cpu;
		if (tp->t_cpupart == cp->cpu_part) {
			/*
			 * We'll generally let this thread continue to run
			 * where it last ran, but will consider migration if:
			 * - The thread last ran outside its home lgroup.
			 * - The CPU where it last ran is the target of an
			 *   offline request (a thread_nomigrate() on the in
			 *   motion CPU relies on this when forcing a preempt).
			 * - The thread isn't the highest priority thread where
			 *   it last ran, and it is considered not likely to
			 *   have significant cache warmth.
			 */
			if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
			    (cp == cpu_inmotion)) {
				cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
				    (tp == curthread) ? cp : NULL);
			} else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
			    (!THREAD_HAS_CACHE_WARMTH(tp))) {
				cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
				    NULL);
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}

	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((tp != curthread) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
	tp->t_disp_queue = dp;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	if (!bound)
		dp->disp_steal = 0;
	/* make queue updates visible before other CPUs inspect the queue */
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		/* Queue non-empty at this priority: prepend to the head. */
		ASSERT(dq->dq_last != NULL);
		tp->t_link = dq->dq_first;
		dq->dq_first = tp;
	} else {
		/* First thread at this priority: set bitmap and maxrunpri. */
		ASSERT(dq->dq_last == NULL);
		ASSERT(dq->dq_first == NULL);
		tp->t_link = NULL;
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}
1537
/*
 * Put a high-priority unbound thread on the kp queue
 *
 * borf selects the insertion point: SETKP_BACK appends to the tail of
 * the priority's queue, SETKP_FRONT prepends to the head.  After the
 * thread is queued, a low-priority CPU in the thread's partition is
 * chosen and prodded to come pick it up.
 */
static void
setkpdq(kthread_t *tp, int borf)
{
	dispq_t	*dq;
	disp_t	*dp;
	cpu_t	*cp;
	pri_t	tpri;

	tpri = DISP_PRIO(tp);

	dp = &tp->t_cpupart->cp_kp_queue;
	disp_lock_enter_high(&dp->disp_lock);

	/*
	 * NOTE(review): the TR_FRONTQ/"frontq" trace point fires for both
	 * front and back insertions here — confirm this reuse is intended.
	 */
	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	dp->disp_nrunnable++;
	dq = &dp->disp_q[tpri];

	if (dq->dq_sruncnt++ != 0) {
		/* Queue non-empty at this priority: link at tail or head. */
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first != NULL);
			tp->t_link = NULL;
			dq->dq_last->t_link = tp;
			dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last != NULL);
			tp->t_link = dq->dq_first;
			dq->dq_first = tp;
		}
	} else {
		/* First thread at this priority: update bitmap and maxima. */
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first == NULL);
			ASSERT(dq->dq_last == NULL);
			dq->dq_first = dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last == NULL);
			ASSERT(dq->dq_first == NULL);
			tp->t_link = NULL;
			dq->dq_first = dq->dq_last = tp;
		}
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_max_unbound_pri)
			dp->disp_max_unbound_pri = tpri;
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			/* publish the new maxrunpri before waking a CPU */
			membar_enter();
		}
	}

	cp = tp->t_cpu;
	if (tp->t_cpupart != cp->cpu_part) {
		/* migrate to a cpu in the new partition */
		cp = tp->t_cpupart->cp_cpulist;
	}
	/* Find the lowest-priority CPU to come service the kp queue. */
	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	/* Record that this CPU has been chosen for a thread at tpri. */
	if (cp->cpu_chosen_level < tpri)
		cp->cpu_chosen_level = tpri;
	cpu_resched(cp, tpri);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
	(*disp_enq_thread)(cp, 0);
}
1615
/*
 * Remove a thread from the dispatcher queue if it is on it.
 * It is not an error if it is not found but we return whether
 * or not it was found in case the caller wants to check.
 *
 * Returns 1 if the thread is no longer on a run queue (removed here,
 * or it was swapped/not queued), 0 if it was not in TS_RUN state.
 * On successful removal the thread is left in transition state.
 */
int
dispdeq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	kthread_t	*rp;
	kthread_t	*trp;
	kthread_t	**ptp;
	int		tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_RUN)
		return (0);

	/*
	 * The thread is "swapped" or is on the swap queue and
	 * hence no longer on the run queue, so return true.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
		return (1);

	tpri = DISP_PRIO(tp);
	dp = tp->t_disp_queue;
	ASSERT(tpri < dp->disp_npri);
	dq = &dp->disp_q[tpri];
	ptp = &dq->dq_first;
	rp = *ptp;
	trp = NULL;

	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

	/*
	 * Search for thread in queue.
	 * Double links would simplify this at the expense of disp/setrun.
	 */
	while (rp != tp && rp != NULL) {
		trp = rp;
		ptp = &trp->t_link;
		rp = trp->t_link;
	}

	if (rp == NULL) {
		panic("dispdeq: thread not on queue");
	}

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	if ((*ptp = rp->t_link) == NULL)
		dq->dq_last = trp;	/* removed the tail; back up dq_last */

	dp->disp_nrunnable--;
	if (--dq->dq_sruncnt == 0) {
		/* Priority level now empty: clear its bit in the bitmap. */
		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else if (tpri == dp->disp_maxrunpri) {
			int ipri;

			/* Recompute maxrunpri from the next set bit down. */
			ipri = bt_gethighbit(dp->disp_qactmap,
			    dp->disp_maxrunpri >> BT_ULSHIFT);
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
			dp->disp_maxrunpri = ipri;
		}
	}
	tp->t_link = NULL;
	THREAD_TRANSITION(tp);		/* put in intermediate state */
	return (1);
}
1695
1696
1697 /*
1698 * dq_sruninc and dq_srundec are public functions for
1699 * incrementing/decrementing the sruncnts when a thread on
1700 * a dispatcher queue is made schedulable/unschedulable by
1701 * resetting the TS_LOAD flag.
1702 *
1703 * The caller MUST have the thread lock and therefore the dispatcher
1704 * queue lock so that the operation which changes
1705 * the flag, the operation that checks the status of the thread to
1706 * determine if it's on a disp queue AND the call to this function
1707 * are one atomic operation with respect to interrupts.
1708 */
1709
/*
 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 *
 * Moves the now-loaded thread through the transition state and onto the
 * front of a dispatch queue, which increments the queue's dq_sruncnt.
 */
void
dq_sruninc(kthread_t *t)
{
	ASSERT(t->t_state == TS_RUN);
	ASSERT(t->t_schedflag & TS_LOAD);

	THREAD_TRANSITION(t);
	setfrontdq(t);
}
1722
/*
 * See comment on calling conventions above.
 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 *
 * Removes the thread from its dispatch queue (decrementing dq_sruncnt)
 * and moves it onto the swapped_lock as an unloaded runnable thread.
 */
void
dq_srundec(kthread_t *t)
{
	ASSERT(t->t_schedflag & TS_LOAD);

	(void) dispdeq(t);
	disp_swapped_enq(t);
}
1735
1736 /*
1737 * Change the dispatcher lock of thread to the "swapped_lock"
1738 * and return with thread lock still held.
1739 *
1740 * Called with thread_lock held, in transition state, and at high spl.
1741 */
1742 void
disp_swapped_enq(kthread_t * tp)1743 disp_swapped_enq(kthread_t *tp)
1744 {
1745 ASSERT(THREAD_LOCK_HELD(tp));
1746 ASSERT(tp->t_schedflag & TS_LOAD);
1747
1748 switch (tp->t_state) {
1749 case TS_RUN:
1750 disp_lock_enter_high(&swapped_lock);
1751 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1752 break;
1753 case TS_ONPROC:
1754 disp_lock_enter_high(&swapped_lock);
1755 THREAD_TRANSITION(tp);
1756 wake_sched_sec = 1; /* tell clock to wake sched */
1757 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1758 break;
1759 default:
1760 panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1761 }
1762 }
1763
/*
 * This routine is called by setbackdq/setfrontdq if the thread is
 * not loaded or loaded and on the swap queue.
 *
 * Thread state TS_SLEEP implies that a swapped thread
 * has been woken up and needs to be swapped in by the swapper.
 *
 * Thread state TS_RUN, it implies that the priority of a swapped
 * thread is being increased by scheduling class (e.g. ts_update).
 */
static void
disp_swapped_setrun(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);

	switch (tp->t_state) {
	case TS_SLEEP:
		disp_lock_enter_high(&swapped_lock);
		/*
		 * Wakeup sched immediately (i.e., next tick) if the
		 * thread priority is above maxclsyspri.
		 */
		if (DISP_PRIO(tp) > maxclsyspri)
			wake_sched = 1;
		else
			wake_sched_sec = 1;	/* sched wakes within a second */
		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
		break;
	case TS_RUN:				/* called from ts_update */
		break;
	default:
		panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
	}
}
1799
/*
 * Make a thread give up its processor.  Find the processor on
 * which this thread is executing, and have that processor
 * preempt.
 *
 * We allow System Duty Cycle (SDC) threads to be preempted even if
 * they are running at kernel priorities.  To implement this, we always
 * set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
 * calls cpu_surrender() very often, we only preempt if there is anyone
 * competing with us.
 */
void
cpu_surrender(kthread_t *tp)
{
	cpu_t	*cpup;
	int	max_pri;
	int	max_run_pri;
	klwp_t	*lwp;

	ASSERT(THREAD_LOCK_HELD(tp));

	/* Only a thread currently on a CPU can surrender that CPU. */
	if (tp->t_state != TS_ONPROC)
		return;
	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
	if (max_pri < max_run_pri)
		max_pri = max_run_pri;

	if (tp->t_cid == sysdccid) {
		uint_t t_pri = DISP_PRIO(tp);
		if (t_pri > max_pri)
			return;		/* we are not competing w/ anyone */
		cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
	} else {
		cpup->cpu_runrun = 1;
		if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
			cpup->cpu_kprunrun = 1;
		}
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	DTRACE_SCHED1(surrender, kthread_t *, tp);

	/*
	 * Make the target thread take an excursion through trap()
	 * to do preempt() (unless we're already in trap or post_syscall,
	 * calling cpu_surrender via CL_TRAPRET).
	 */
	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
	    lwp->lwp_state != LWP_USER) {
		aston(tp);
		if (cpup != CPU)
			poke_cpu(cpup->cpu_id);
	}
	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
	    "cpu_surrender:tid %p cpu %p", tp, cpup);
}
1862
/*
 * Commit to and ratify a scheduling decision
 *
 * Clears the preemption request flags for this CPU, then re-checks that
 * tp is still the best runnable choice against both the local dispatch
 * queue and the partition kp queue.  Returns tp if the decision stands,
 * or NULL (after requeueing tp at the front) if a better thread showed
 * up and the caller should pick again.
 */
/*ARGSUSED*/
static kthread_t *
disp_ratify(kthread_t *tp, disp_t *kpq)
{
	pri_t	tpri, maxpri;
	pri_t	maxkpri;
	cpu_t	*cpup;

	ASSERT(tp != NULL);
	/*
	 * Commit to, then ratify scheduling decision
	 */
	cpup = CPU;
	if (cpup->cpu_runrun != 0)
		cpup->cpu_runrun = 0;
	if (cpup->cpu_kprunrun != 0)
		cpup->cpu_kprunrun = 0;
	if (cpup->cpu_chosen_level != -1)
		cpup->cpu_chosen_level = -1;
	/* publish cleared flags before re-reading the queue maxima */
	membar_enter();
	tpri = DISP_PRIO(tp);
	maxpri = cpup->cpu_disp->disp_maxrunpri;
	maxkpri = kpq->disp_maxrunpri;
	if (maxpri < maxkpri)
		maxpri = maxkpri;
	if (tpri < maxpri) {
		/*
		 * should have done better
		 * put this one back and indicate to try again
		 */
		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
		thread_lock_high(tp);
		THREAD_TRANSITION(tp);
		setfrontdq(tp);
		thread_unlock_nopreempt(tp);

		tp = NULL;
	}
	return (tp);
}
1907
/*
 * See if there is any work on the dispatcher queue for other CPUs.
 * If there is, dequeue the best thread and return.
 *
 * Returns the stolen thread, NULL if nothing suitable was found, or
 * T_DONTSTEAL if candidates existed but were all within their
 * steal-delay window (caller should retry later).
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	kthread_t	*retval = NULL;
	pri_t		maxpri;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		leafidx, startidx;
	hrtime_t	stealtime;
	lgrp_id_t	local_id;

	maxpri = -1;
	tcp = NULL;

	/* The partition-wide kp queue takes precedence over stealing. */
	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	kpreempt_disable();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU. This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	local_id = lpl_leaf->lpl_lgrpid;
	leafidx = startidx = 0;

	/*
	 * This loop traverses the lpl hierarchy. Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around this lpl if:
				 *
				 * - Something became runnable on the local
				 *   queue...which also ends our stroll around
				 *   the partition.
				 *
				 * - We happen across another idle CPU.
				 *   Since it is patrolling the next portion
				 *   of the lpl's list (assuming it's not
				 *   halted, or busy servicing an interrupt),
				 *   move to the next higher level of locality.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0) {
					kpreempt_enable();
					return (NULL);
				}
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED ||
					    ocp->cpu_intr_actv != 0)
						continue;
					else
						goto next_level;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
				    CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					/*
					 * Don't steal threads that we attempted
					 * to steal recently until they're ready
					 * to be stolen again.
					 */
					stealtime = ocp->cpu_disp->disp_steal;
					if (stealtime == 0 ||
					    stealtime - gethrtime() <= 0) {
						maxpri = pri;
						tcp = ocp;
					} else {
						/*
						 * Don't update tcp, just set
						 * the retval to T_DONTSTEAL, so
						 * that if no acceptable CPUs
						 * are found the return value
						 * will be T_DONTSTEAL rather
						 * then NULL.
						 */
						retval = T_DONTSTEAL;
					}
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			/*
			 * Iterate to the next leaf lpl in the resource set
			 * at this level of locality. If we hit the end of
			 * the set, wrap back around to the beginning.
			 *
			 * Note: This iteration is NULL terminated for a reason
			 * see lpl_topo_bootstrap() in lgrp.c for details.
			 */
			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != startidx);

next_level:
		/*
		 * Expand the search to include farther away CPUs (next
		 * locality level). The closer CPUs that have already been
		 * checked will be checked again. In doing so, idle CPUs
		 * will tend to be more aggresive about stealing from CPUs
		 * that are closer (since the closer CPUs will be considered
		 * more often).
		 * Begin at this level with the CPUs local leaf lpl.
		 */
		if ((lpl = lpl->lpl_parent) != NULL) {
			leafidx = startidx = lpl->lpl_id2rset[local_id];
			lpl_leaf = lpl->lpl_rset[leafidx];
		}
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = disp_getbest(tcp->cpu_disp);
		if (tp == NULL || tp == T_DONTSTEAL)
			return (tp);
		return (disp_ratify(tp, kpq));
	}
	return (retval);
}
2083
2084
/*
 * disp_fix_unbound_pri()
 *	Determines the maximum priority of unbound threads on the queue.
 *	The priority is kept for the queue, but is only increased, never
 *	reduced unless some CPU is looking for something on that queue.
 *
 *	The priority argument is the known upper limit.
 *
 *	Perhaps this should be kept accurately, but that probably means
 *	separate bitmaps for bound and unbound threads.  Since only idled
 *	CPUs will have to do this recalculation, it seems better this way.
 */
static void
disp_fix_unbound_pri(disp_t *dp, pri_t pri)
{
	kthread_t	*tp;
	dispq_t		*dq;
	ulong_t		*dqactmap = dp->disp_qactmap;
	ulong_t		mapword;
	int		wx;

	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));

	ASSERT(pri >= 0);			/* checked by caller */

	/*
	 * Start the search at the next lowest priority below the supplied
	 * priority.  This depends on the bitmap implementation.
	 */
	do {
		wx = pri >> BT_ULSHIFT;		/* index of word in map */

		/*
		 * Form mask for all lower priorities in the word.
		 */
		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);

		/*
		 * Get next lower active priority.
		 */
		if (mapword != 0) {
			/* next active priority is within this word */
			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
		} else if (wx > 0) {
			/* scan the remaining lower words of the bitmap */
			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
			if (pri < 0)
				break;
		} else {
			/* no active priorities remain below the limit */
			pri = -1;
			break;
		}

		/*
		 * Search the queue for unbound, runnable threads.
		 */
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
			tp = tp->t_link;
		}

		/*
		 * If a thread was found, set the priority and return.
		 */
	} while (tp == NULL);

	/*
	 * pri holds the maximum unbound thread priority or -1.
	 */
	if (dp->disp_max_unbound_pri != pri)
		dp->disp_max_unbound_pri = pri;
}
2157
2158 /*
2159 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2160 * check if the CPU to which is was previously bound should have
2161 * its disp_max_unbound_pri increased.
2162 */
2163 void
disp_adjust_unbound_pri(kthread_t * tp)2164 disp_adjust_unbound_pri(kthread_t *tp)
2165 {
2166 disp_t *dp;
2167 pri_t tpri;
2168
2169 ASSERT(THREAD_LOCK_HELD(tp));
2170
2171 /*
2172 * Don't do anything if the thread is not bound, or
2173 * currently not runnable or swapped out.
2174 */
2175 if (tp->t_bound_cpu == NULL ||
2176 tp->t_state != TS_RUN ||
2177 tp->t_schedflag & TS_ON_SWAPQ)
2178 return;
2179
2180 tpri = DISP_PRIO(tp);
2181 dp = tp->t_bound_cpu->cpu_disp;
2182 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2183 if (tpri > dp->disp_max_unbound_pri)
2184 dp->disp_max_unbound_pri = tpri;
2185 }
2186
/*
 * disp_getbest()
 *   De-queue the highest priority unbound runnable thread.
 *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
 *   Returns NULL if nothing found.
 *   Returns T_DONTSTEAL if the thread was not stealable, so that the caller
 *   will try again later.
 *
 *   Passed a pointer to a dispatch queue not associated with this CPU.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
	kthread_t	*tp;
	dispq_t		*dq;
	pri_t		pri;
	cpu_t		*cp, *tcp;
	boolean_t	allbound;

	disp_lock_enter(&dp->disp_lock);

	/*
	 * If there is nothing to run, or the CPU is in the middle of a
	 * context switch of the only thread, return NULL.
	 */
	tcp = dp->disp_cpu;	/* CPU that owns this queue (may be NULL) */
	cp = CPU;		/* the stealing (current) CPU */
	pri = dp->disp_max_unbound_pri;
	if (pri == -1 ||
	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
	    tcp->cpu_disp->disp_nrunnable == 1)) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	dq = &dp->disp_q[pri];


	/*
	 * Assume that all threads are bound on this queue, and change it
	 * later when we find out that it is not the case.
	 */
	allbound = B_TRUE;
	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
		hrtime_t now, nosteal, rqtime;

		/*
		 * Skip over bound threads which could be here even
		 * though disp_max_unbound_pri indicated this level.
		 */
		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
			continue;

		/*
		 * We've got some unbound threads on this queue, so turn
		 * the allbound flag off now.
		 */
		allbound = B_FALSE;

		/*
		 * The thread is a candidate for stealing from its run queue. We
		 * don't want to steal threads that became runnable just a
		 * moment ago. This improves CPU affinity for threads that get
		 * preempted for short periods of time and go back on the run
		 * queue.
		 *
		 * We want to let it stay on its run queue if it was only placed
		 * there recently and it was running on the same CPU before that
		 * to preserve its cache investment. For the thread to remain on
		 * its run queue, ALL of the following conditions must be
		 * satisfied:
		 *
		 * - the disp queue should not be the kernel preemption queue
		 * - delayed idle stealing should not be disabled
		 * - nosteal_nsec should be non-zero
		 * - it should run with user priority
		 * - it should be on the run queue of the CPU where it was
		 *   running before being placed on the run queue
		 * - it should be the only thread on the run queue (to prevent
		 *   extra scheduling latency for other threads)
		 * - it should sit on the run queue for less than per-chip
		 *   nosteal interval or global nosteal interval
		 * - in case of CPUs with shared cache it should sit in a run
		 *   queue of a CPU from a different chip
		 *
		 * The checks are arranged so that the ones that are faster are
		 * placed earlier.
		 */
		if (tcp == NULL ||
		    pri >= minclsyspri ||
		    tp->t_cpu != tcp)
			break;

		/*
		 * Steal immediately if, due to CMT processor architecture,
		 * migration between cp and tcp would incur no performance
		 * penalty.
		 */
		if (pg_cmt_can_migrate(cp, tcp))
			break;

		/* A zero nosteal interval disables delayed stealing. */
		nosteal = nosteal_nsec;
		if (nosteal == 0)
			break;

		/*
		 * Calculate time spent sitting on run queue
		 */
		now = gethrtime_unscaled();
		rqtime = now - tp->t_waitrq;
		scalehrtime(&rqtime);

		/*
		 * Steal immediately if the time spent on this run queue is more
		 * than allowed nosteal delay.
		 *
		 * Negative rqtime check is needed here to avoid infinite
		 * stealing delays caused by unlikely but not impossible
		 * drifts between CPU times on different CPUs.
		 */
		if (rqtime > nosteal || rqtime < 0)
			break;

		DTRACE_PROBE4(nosteal, kthread_t *, tp,
		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
		scalehrtime(&now);
		/*
		 * Calculate when this thread becomes stealable
		 */
		now += (nosteal - rqtime);

		/*
		 * Calculate time when some thread becomes stealable
		 */
		if (now < dp->disp_steal)
			dp->disp_steal = now;
	}

	/*
	 * If there were no unbound threads on this queue, find the queue
	 * where they are and then return later. The value of
	 * disp_max_unbound_pri is not always accurate because it isn't
	 * reduced until another idle CPU looks for work.
	 */
	if (allbound)
		disp_fix_unbound_pri(dp, pri);

	/*
	 * If we reached the end of the queue and found no unbound threads
	 * then return NULL so that other CPUs will be considered.  If there
	 * are unbound threads but they cannot yet be stolen, then
	 * return T_DONTSTEAL and try again later.
	 */
	if (tp == NULL) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (allbound ? NULL : T_DONTSTEAL);
	}

	/*
	 * Found a runnable, unbound thread, so remove it from queue.
	 * dispdeq() requires that we have the thread locked, and we do,
	 * by virtue of holding the dispatch queue lock.  dispdeq() will
	 * put the thread in transition state, thereby dropping the dispq
	 * lock.
	 */

#ifdef DEBUG
	{
		int	thread_was_on_queue;

		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
		ASSERT(thread_was_on_queue);
	}

#else /* DEBUG */
	(void) dispdeq(tp);			/* drops disp_lock */
#endif /* DEBUG */

	/*
	 * Reset the disp_queue steal time - we do not know what is the smallest
	 * value across the queue is.
	 */
	dp->disp_steal = 0;

	/* Keep the stolen thread in memory while it's in transit. */
	tp->t_schedflag |= TS_DONT_SWAP;

	/*
	 * Setup thread to run on the current CPU.
	 */
	tp->t_disp_queue = cp->cpu_disp;

	cp->cpu_dispthread = tp;		/* protected by spl only */
	cp->cpu_dispatch_pri = pri;

	/*
	 * There can be a memory synchronization race between disp_getbest()
	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
	 * to preempt the current thread to run the enqueued thread while
	 * disp_getbest() and disp_ratify() are changing the current thread
	 * to the stolen thread. This may lead to a situation where
	 * cpu_resched() tries to preempt the wrong thread and the
	 * stolen thread continues to run on the CPU which has been tagged
	 * for preemption.
	 * Later the clock thread gets enqueued but doesn't get to run on the
	 * CPU causing the system to hang.
	 *
	 * To avoid this, grabbing and dropping the disp_lock (which does
	 * a memory barrier) is needed to synchronize the execution of
	 * cpu_resched() with disp_getbest() and disp_ratify() and
	 * synchronize the memory read and written by cpu_resched(),
	 * disp_getbest(), and disp_ratify() with each other.
	 * (see CR#6482861 for more details).
	 */
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);

	ASSERT(pri == DISP_PRIO(tp));

	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);

	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */

	/*
	 * Return with spl high so that swtch() won't need to raise it.
	 * The disp_lock was dropped by dispdeq().
	 */

	return (tp);
}
2417
2418 /*
2419 * disp_bound_common() - common routine for higher level functions
2420 * that check for bound threads under certain conditions.
2421 * If 'threadlistsafe' is set then there is no need to acquire
2422 * pidlock to stop the thread list from changing (eg, if
2423 * disp_bound_* is called with cpus paused).
2424 */
2425 static int
disp_bound_common(cpu_t * cp,int threadlistsafe,int flag)2426 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2427 {
2428 int found = 0;
2429 kthread_t *tp;
2430
2431 ASSERT(flag);
2432
2433 if (!threadlistsafe)
2434 mutex_enter(&pidlock);
2435 tp = curthread; /* faster than allthreads */
2436 do {
2437 if (tp->t_state != TS_FREE) {
2438 /*
2439 * If an interrupt thread is busy, but the
2440 * caller doesn't care (i.e. BOUND_INTR is off),
2441 * then just ignore it and continue through.
2442 */
2443 if ((tp->t_flag & T_INTR_THREAD) &&
2444 !(flag & BOUND_INTR))
2445 continue;
2446
2447 /*
2448 * Skip the idle thread for the CPU
2449 * we're about to set offline.
2450 */
2451 if (tp == cp->cpu_idle_thread)
2452 continue;
2453
2454 /*
2455 * Skip the pause thread for the CPU
2456 * we're about to set offline.
2457 */
2458 if (tp == cp->cpu_pause_thread)
2459 continue;
2460
2461 if ((flag & BOUND_CPU) &&
2462 (tp->t_bound_cpu == cp ||
2463 tp->t_bind_cpu == cp->cpu_id ||
2464 tp->t_weakbound_cpu == cp)) {
2465 found = 1;
2466 break;
2467 }
2468
2469 if ((flag & BOUND_PARTITION) &&
2470 (tp->t_cpupart == cp->cpu_part)) {
2471 found = 1;
2472 break;
2473 }
2474 }
2475 } while ((tp = tp->t_next) != curthread && found == 0);
2476 if (!threadlistsafe)
2477 mutex_exit(&pidlock);
2478 return (found);
2479 }
2480
2481 /*
2482 * disp_bound_threads - return nonzero if threads are bound to the processor.
2483 * Called infrequently. Keep this simple.
2484 * Includes threads that are asleep or stopped but not onproc.
2485 */
2486 int
disp_bound_threads(cpu_t * cp,int threadlistsafe)2487 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2488 {
2489 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2490 }
2491
2492 /*
2493 * disp_bound_anythreads - return nonzero if _any_ threads are bound
2494 * to the given processor, including interrupt threads.
2495 */
2496 int
disp_bound_anythreads(cpu_t * cp,int threadlistsafe)2497 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2498 {
2499 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2500 }
2501
2502 /*
2503 * disp_bound_partition - return nonzero if threads are bound to the same
2504 * partition as the processor.
2505 * Called infrequently. Keep this simple.
2506 * Includes threads that are asleep or stopped but not onproc.
2507 */
2508 int
disp_bound_partition(cpu_t * cp,int threadlistsafe)2509 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2510 {
2511 return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2512 }
2513
/*
 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
 * threads to other CPUs.
 *
 * Loops until cp's dispatch queues hold no unbound runnable threads,
 * re-queuing each one via setbackdq() so it lands on another CPU.
 * Note that dispdeq() drops dp->disp_lock, so the lock is re-acquired
 * at the bottom of each iteration.
 */
void
disp_cpu_inactive(cpu_t *cp)
{
	kthread_t	*tp;
	disp_t		*dp = cp->cpu_disp;
	dispq_t		*dq;
	pri_t		pri;
	int		wasonq;

	disp_lock_enter(&dp->disp_lock);
	while ((pri = dp->disp_max_unbound_pri) != -1) {
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		/*
		 * Skip over bound threads.
		 */
		while (tp != NULL && tp->t_bound_cpu != NULL) {
			tp = tp->t_link;
		}

		if (tp == NULL) {
			/* disp_max_unbound_pri must be inaccurate, so fix it */
			disp_fix_unbound_pri(dp, pri);
			continue;
		}

		wasonq = dispdeq(tp);		/* drops disp_lock */
		ASSERT(wasonq);
		ASSERT(tp->t_weakbound_cpu == NULL);

		setbackdq(tp);
		/*
		 * Called from cpu_offline:
		 *
		 * cp has already been removed from the list of active cpus
		 * and tp->t_cpu has been changed so there is no risk of
		 * tp ending up back on cp.
		 *
		 * Called from cpupart_move_cpu:
		 *
		 * The cpu has moved to a new cpupart.  Any threads that
		 * were on its dispatch queues before the move remain
		 * in the old partition and can't run in the new partition.
		 */
		ASSERT(tp->t_cpu != cp);
		thread_unlock(tp);

		/* dispdeq() dropped the lock; take it back for the next pass. */
		disp_lock_enter(&dp->disp_lock);
	}
	disp_lock_exit(&dp->disp_lock);
}
2570
/*
 * disp_lowpri_cpu - find CPU running the lowest priority thread.
 *	The hint passed in is used as a starting point so we don't favor
 *	CPU 0 or any other CPU.  The caller should pass in the most recently
 *	used CPU for the thread.
 *
 *	The lgroup and priority are used to determine the best CPU to run on
 *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
 *	the thread priority will indicate whether the thread will actually run
 *	there.  To pick the best CPU, the CPUs inside and outside of the given
 *	lgroup which are running the lowest priority threads are found.  The
 *	remote CPU is chosen only if the thread will not run locally on a CPU
 *	within the lgroup, but will run on the remote CPU.  If the thread
 *	cannot immediately run on any CPU, the best local CPU will be chosen.
 *
 *	The lpl specified also identifies the cpu partition from which
 *	disp_lowpri_cpu should select a CPU.
 *
 *	curcpu is used to indicate that disp_lowpri_cpu is being called on
 *	behalf of the current thread. (curthread is looking for a new cpu)
 *	In this case, cpu_dispatch_pri for this thread's cpu should be
 *	ignored.
 *
 *	If a cpu is the target of an offline request then try to avoid it.
 *
 *	This function must be called at either high SPL, or with preemption
 *	disabled, so that the "hint" CPU cannot be removed from the online
 *	CPU list while we are traversing it.
 */
cpu_t *
disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
{
	cpu_t	*bestcpu;
	cpu_t	*besthomecpu;
	cpu_t	*cp, *cpstart;

	pri_t	bestpri;
	pri_t	cpupri;

	klgrpset_t	done;
	klgrpset_t	cur_set;

	lpl_t		*lpl_iter, *lpl_leaf;
	int		i;

	/*
	 * Scan for a CPU currently running the lowest priority thread.
	 * Cannot get cpu_lock here because it is adaptive.
	 * We do not require lock on CPU list.
	 */
	ASSERT(hint != NULL);
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_ncpu > 0);

	/*
	 * First examine local CPUs.  Note that it's possible the hint CPU
	 * passed in is remote to the specified home lgroup.  If our priority
	 * isn't sufficient enough such that we can run immediately at home,
	 * then examine CPUs remote to our home lgroup.
	 * We would like to give preference to CPUs closest to "home".
	 * If we can't find a CPU where we'll run at a given level
	 * of locality, we expand our search to include the next level.
	 */
	bestcpu = besthomecpu = NULL;
	klgrpset_clear(done);
	/* start with lpl we were passed */

	lpl_iter = lpl;

	do {

		bestpri = SHRT_MAX;
		klgrpset_clear(cur_set);

		/* Visit each leaf lgroup in this lpl's resource set. */
		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
			lpl_leaf = lpl_iter->lpl_rset[i];
			/* skip lgroups already examined at a lower level */
			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
				continue;

			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);

			/* Start the circular CPU walk at the hint if it's here. */
			if (hint->cpu_lpl == lpl_leaf)
				cp = cpstart = hint;
			else
				cp = cpstart = lpl_leaf->lpl_cpus;

			do {
				/*
				 * Effective priority of cp is the max of
				 * what's running, what's queued, and what's
				 * already been chosen for it; curcpu's own
				 * dispatch priority is ignored.
				 */
				if (cp == curcpu)
					cpupri = -1;
				else if (cp == cpu_inmotion)
					cpupri = SHRT_MAX;
				else
					cpupri = cp->cpu_dispatch_pri;
				if (cp->cpu_disp->disp_maxrunpri > cpupri)
					cpupri = cp->cpu_disp->disp_maxrunpri;
				if (cp->cpu_chosen_level > cpupri)
					cpupri = cp->cpu_chosen_level;
				if (cpupri < bestpri) {
					/* An idling CPU is an immediate win. */
					if (CPU_IDLING(cpupri)) {
						ASSERT((cp->cpu_flags &
						    CPU_QUIESCED) == 0);
						return (cp);
					}
					bestcpu = cp;
					bestpri = cpupri;
				}
			} while ((cp = cp->cpu_next_lpl) != cpstart);
		}

		/* If we'd preempt the best CPU at this locality, take it. */
		if (bestcpu && (tpri > bestpri)) {
			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
			return (bestcpu);
		}
		/* remember the best CPU from the home (first) level */
		if (besthomecpu == NULL)
			besthomecpu = bestcpu;
		/*
		 * Add the lgrps we just considered to the "done" set
		 */
		klgrpset_or(done, cur_set);

	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);

	/*
	 * The specified priority isn't high enough to run immediately
	 * anywhere, so just return the best CPU from the home lgroup.
	 */
	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
	return (besthomecpu);
}
2700
/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor's specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
	/* Default idle action: do nothing (busy-wait in caller's idle loop). */
}
2712
/*
 * Generic hook invoked when a thread is enqueued on a CPU's dispatch
 * queue; 'bound' indicates whether the thread was bound to that CPU.
 * Default implementation does nothing; platforms may override.
 */
/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}
2718