1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright 2019 Joyent, Inc.
28 */
29
30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
31 /* All Rights Reserved */
32
33
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/sysmacros.h>
37 #include <sys/signal.h>
38 #include <sys/user.h>
39 #include <sys/systm.h>
40 #include <sys/sysinfo.h>
41 #include <sys/var.h>
42 #include <sys/errno.h>
43 #include <sys/cmn_err.h>
44 #include <sys/debug.h>
45 #include <sys/inline.h>
46 #include <sys/disp.h>
47 #include <sys/class.h>
48 #include <sys/bitmap.h>
49 #include <sys/kmem.h>
50 #include <sys/cpuvar.h>
51 #include <sys/vtrace.h>
52 #include <sys/cpupart.h>
53 #include <sys/lgrp.h>
54 #include <sys/pg.h>
55 #include <sys/cmt.h>
56 #include <sys/bitset.h>
57 #include <sys/schedctl.h>
58 #include <sys/atomic.h>
59 #include <sys/dtrace.h>
60 #include <sys/sdt.h>
61 #include <sys/archsystm.h>
62 #include <sys/smt.h>
63
64 #include <vm/as.h>
65
66 #define BOUND_CPU 0x1
67 #define BOUND_PARTITION 0x2
68 #define BOUND_INTR 0x4
69
70 /* Dispatch queue allocation structure and functions */
71 struct disp_queue_info {
72 disp_t *dp;
73 dispq_t *olddispq;
74 dispq_t *newdispq;
75 ulong_t *olddqactmap;
76 ulong_t *newdqactmap;
77 int oldnglobpris;
78 };
79 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
80 disp_t *dp);
81 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
82 static void disp_dq_free(struct disp_queue_info *dptr);
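/*
 * Resizing a dispatch queue happens in three phases: disp_dq_alloc()
 * performs the (possibly sleeping) allocations while the CPUs are still
 * running, disp_dq_assign() copies the old contents and swaps in the new
 * arrays while the CPUs are paused, and disp_dq_free() releases the old
 * arrays once the CPUs have been restarted.
 */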
83
84 /* platform-specific routine to call when processor is idle */
85 static void generic_idle_cpu();
86 void (*idle_cpu)() = generic_idle_cpu;
87
88 /* routines invoked when a CPU enters/exits the idle loop */
89 static void idle_enter();
90 static void idle_exit();
91
92 /* platform-specific routine to call when thread is enqueued */
93 static void generic_enq_thread(cpu_t *, int);
94 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
95
96 pri_t kpreemptpri; /* priority where kernel preemption applies */
97 pri_t upreemptpri = 0; /* priority where normal preemption applies */
98 pri_t intr_pri; /* interrupt thread priority base level */
99
100 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */
101 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */
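/*
 * Unbound threads whose priority is at or above kpqpri are placed on the
 * partition-wide kpreempt queue by setbackdq()/setfrontdq() (via setkpdq())
 * rather than on a per-CPU dispatch queue, so any CPU in the partition may
 * run them.  The default value KPQPRI (-1) is replaced with kpreemptpri
 * in dispinit().
 */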
102 disp_t cpu0_disp; /* boot CPU's dispatch queue */
103 disp_lock_t swapped_lock; /* lock swapped threads and swap queue */
104 int nswapped; /* total number of swapped threads */
105 void disp_swapped_enq(kthread_t *tp);
106 static void disp_swapped_setrun(kthread_t *tp);
107 static void cpu_resched(cpu_t *cp, pri_t tpri);
108
109 /*
110 * If this is set, only interrupt threads will cause kernel preemptions.
111 * This is done by changing the value of kpreemptpri. kpreemptpri
112 * will either be the max sysclass pri or the min interrupt pri.
113 */
114 int only_intr_kpreempt;
115
116 extern void set_idle_cpu(int cpun);
117 extern void unset_idle_cpu(int cpun);
118 static void setkpdq(kthread_t *tp, int borf);
119 #define SETKP_BACK 0
120 #define SETKP_FRONT 1
/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely bound to that CPU, to reduce
 * cold cache effects.  The interval is measured in clock ticks (lbolt).
 */
#define	RECHOOSE_INTERVAL	3
int	rechoose_interval = RECHOOSE_INTERVAL;
128
/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * have been sitting on a run queue before it can be stolen by another
 * CPU, to reduce migrations.
 *
 * nosteal_nsec should be set by platform code via
 * cmp_set_nosteal_interval() to an appropriate value.  It is set to
 * NOSTEAL_UNINITIALIZED here to indicate that it is uninitialized.
 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 */
#define	NOSTEAL_UNINITIALIZED	(-1)
hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
142 extern void cmp_set_nosteal_interval(void);
143
144 id_t defaultcid; /* system "default" class; see dispadmin(8) */
145
146 disp_lock_t transition_lock; /* lock on transitioning threads */
147 disp_lock_t stop_lock; /* lock on stopped threads */
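/*
 * transition_lock is acquired in dispinit() and never released; a thread
 * in transition between dispatch queues has its t_lockp pointed at it, so
 * thread_lock() callers spin until the thread settles onto its new queue
 * and the lock pointer is switched again.  stop_lock serves the same
 * purpose for stopped threads.
 */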
148
149 static void cpu_dispqalloc(int numpris);
150
151 /*
152 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
153 * a thread because it was sitting on its run queue for a very short
154 * period of time.
155 */
156 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */
157
158 static kthread_t *disp_getwork(cpu_t *to);
159 static kthread_t *disp_getbest(disp_t *from);
160 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);
161
162 void swtch_to(kthread_t *);
163
164 /*
165 * dispatcher and scheduler initialization
166 */
167
168 /*
169 * disp_setup - Common code to calculate and allocate dispatcher
170 * variables and structures based on the maximum priority.
171 */
172 static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
174 {
175 pri_t newnglobpris;
176
177 ASSERT(MUTEX_HELD(&cpu_lock));
178
179 newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
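	/*
	 * The global priority range must leave LOCK_LEVEL slots above the
	 * highest class priority for interrupt threads.  For example, on a
	 * system whose highest-priority class reports a maximum global
	 * priority of 159, this yields 170 global priorities (0-169), with
	 * the top LOCK_LEVEL of them reserved for interrupt threads.
	 */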
180
181 if (newnglobpris > oldnglobpris) {
182 /*
183 * Allocate new kp queues for each CPU partition.
184 */
185 cpupart_kpqalloc(newnglobpris);
186
187 /*
188 * Allocate new dispatch queues for each CPU.
189 */
190 cpu_dispqalloc(newnglobpris);
191
192 /*
193 * compute new interrupt thread base priority
194 */
195 intr_pri = maxglobpri;
196 if (only_intr_kpreempt) {
197 kpreemptpri = intr_pri + 1;
198 if (kpqpri == KPQPRI)
199 kpqpri = kpreemptpri;
200 }
201 v.v_nglobpris = newnglobpris;
202 }
203 }
204
205 /*
206 * dispinit - Called to initialize all loaded classes and the
207 * dispatcher framework.
208 */
209 void
dispinit(void)
211 {
212 id_t cid;
213 pri_t maxglobpri;
214 pri_t cl_maxglobpri;
215
216 maxglobpri = -1;
217
218 /*
219 * Initialize transition lock, which will always be set.
220 */
221 DISP_LOCK_INIT(&transition_lock);
222 disp_lock_enter_high(&transition_lock);
223 DISP_LOCK_INIT(&stop_lock);
224
225 mutex_enter(&cpu_lock);
226 CPU->cpu_disp->disp_maxrunpri = -1;
227 CPU->cpu_disp->disp_max_unbound_pri = -1;
228
229 /*
230 * Initialize the default CPU partition.
231 */
232 cpupart_initialize_default();
233 /*
234 * Call the class specific initialization functions for
235 * all pre-installed schedulers.
236 *
237 * We pass the size of a class specific parameter
238 * buffer to each of the initialization functions
239 * to try to catch problems with backward compatibility
240 * of class modules.
241 *
242 * For example a new class module running on an old system
243 * which didn't provide sufficiently large parameter buffers
244 * would be bad news. Class initialization modules can check for
245 * this and take action if they detect a problem.
246 */
247
248 for (cid = 0; cid < nclass; cid++) {
249 sclass_t *sc;
250
251 sc = &sclass[cid];
252 if (SCHED_INSTALLED(sc)) {
253 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
254 &sc->cl_funcs);
255 if (cl_maxglobpri > maxglobpri)
256 maxglobpri = cl_maxglobpri;
257 }
258 }
259
260 /*
261 * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is
262 * to say, maxclsyspri + 1. However, over time, the system has used
263 * more and more asynchronous kernel threads, with an increasing number
264 * of these doing work on direct behalf of higher-level software (e.g.,
265 * network processing). This has led to potential priority inversions:
266 * threads doing low-priority lengthy kernel work can effectively
267 * delay kernel-level processing of higher-priority data. To minimize
268 * such inversions, we set kpreemptpri to be v_maxsyspri; anything in
269 * the kernel that runs at maxclsyspri will therefore induce kernel
270 * preemption, and this priority should be used if/when an asynchronous
271 * thread (or, as is often the case, task queue) is performing a task
272 * on behalf of higher-level software (or any task that is otherwise
 * latency-sensitive).
274 */
275 kpreemptpri = (pri_t)v.v_maxsyspri;
276 if (kpqpri == KPQPRI)
277 kpqpri = kpreemptpri;
278
279 ASSERT(maxglobpri >= 0);
280 disp_setup(maxglobpri, 0);
281
282 mutex_exit(&cpu_lock);
283
284 /*
285 * Platform specific sticky scheduler setup.
286 */
287 if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
288 cmp_set_nosteal_interval();
289
290 /*
291 * Get the default class ID; this may be later modified via
292 * dispadmin(8). This will load the class (normally TS) and that will
293 * call disp_add(), which is why we had to drop cpu_lock first.
294 */
295 if (getcid(defaultclass, &defaultcid) != 0) {
296 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
297 defaultclass);
298 }
299 }
300
301 /*
302 * disp_add - Called with class pointer to initialize the dispatcher
303 * for a newly loaded class.
304 */
305 void
disp_add(sclass_t *clp)
307 {
308 pri_t maxglobpri;
309 pri_t cl_maxglobpri;
310
311 mutex_enter(&cpu_lock);
312 /*
313 * Initialize the scheduler class.
314 */
315 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
316 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
317 if (cl_maxglobpri > maxglobpri)
318 maxglobpri = cl_maxglobpri;
319
320 /*
321 * Save old queue information. Since we're initializing a
322 * new scheduling class which has just been loaded, then
323 * the size of the dispq may have changed. We need to handle
324 * that here.
325 */
326 disp_setup(maxglobpri, v.v_nglobpris);
327
328 mutex_exit(&cpu_lock);
329 }
330
331
332 /*
333 * For each CPU, allocate new dispatch queues
334 * with the stated number of priorities.
335 */
336 static void
cpu_dispqalloc(int numpris)
338 {
339 cpu_t *cpup;
340 struct disp_queue_info *disp_mem;
341 int i, num;
342
343 ASSERT(MUTEX_HELD(&cpu_lock));
344
345 disp_mem = kmem_zalloc(NCPU *
346 sizeof (struct disp_queue_info), KM_SLEEP);
347
	/*
	 * This routine must allocate all of the memory before stopping
	 * the CPUs because it must not sleep in kmem_alloc while the
	 * CPUs are stopped; the locks they hold will not be released
	 * until they are restarted.
	 */
354 i = 0;
355 cpup = cpu_list;
356 do {
357 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
358 i++;
359 cpup = cpup->cpu_next;
360 } while (cpup != cpu_list);
361 num = i;
362
363 pause_cpus(NULL, NULL);
364 for (i = 0; i < num; i++)
365 disp_dq_assign(&disp_mem[i], numpris);
366 start_cpus();
367
	/*
	 * All of the memory must be freed after the CPUs are restarted,
	 * because we cannot risk sleeping in kmem_free while the CPUs
	 * are stopped.
	 */
372 for (i = 0; i < num; i++)
373 disp_dq_free(&disp_mem[i]);
374
375 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
376 }
377
378 static void
disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
380 {
381 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
382 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
383 sizeof (long), KM_SLEEP);
384 dptr->dp = dp;
385 }
386
387 static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
389 {
390 disp_t *dp;
391
392 dp = dptr->dp;
393 dptr->olddispq = dp->disp_q;
394 dptr->olddqactmap = dp->disp_qactmap;
395 dptr->oldnglobpris = dp->disp_npri;
396
397 ASSERT(dptr->oldnglobpris < numpris);
398
399 if (dptr->olddispq != NULL) {
400 /*
401 * Use kcopy because bcopy is platform-specific
402 * and could block while we might have paused the cpus.
403 */
404 (void) kcopy(dptr->olddispq, dptr->newdispq,
405 dptr->oldnglobpris * sizeof (dispq_t));
406 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
407 ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
408 sizeof (long));
409 }
410 dp->disp_q = dptr->newdispq;
411 dp->disp_qactmap = dptr->newdqactmap;
412 dp->disp_q_limit = &dptr->newdispq[numpris];
413 dp->disp_npri = numpris;
414 }
415
416 static void
disp_dq_free(struct disp_queue_info *dptr)
418 {
419 if (dptr->olddispq != NULL)
420 kmem_free(dptr->olddispq,
421 dptr->oldnglobpris * sizeof (dispq_t));
422 if (dptr->olddqactmap != NULL)
423 kmem_free(dptr->olddqactmap,
424 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
425 }
426
427 /*
428 * For a newly created CPU, initialize the dispatch queue.
429 * This is called before the CPU is known through cpu[] or on any lists.
430 */
431 void
disp_cpu_init(cpu_t *cp)
433 {
434 disp_t *dp;
435 dispq_t *newdispq;
436 ulong_t *newdqactmap;
437
438 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */
439
440 if (cp == cpu0_disp.disp_cpu)
441 dp = &cpu0_disp;
442 else
443 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
444 bzero(dp, sizeof (disp_t));
445 cp->cpu_disp = dp;
446 dp->disp_cpu = cp;
447 dp->disp_maxrunpri = -1;
448 dp->disp_max_unbound_pri = -1;
449 DISP_LOCK_INIT(&cp->cpu_thread_lock);
450 /*
451 * Allocate memory for the dispatcher queue headers
452 * and the active queue bitmap.
453 */
454 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
455 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
456 sizeof (long), KM_SLEEP);
457 dp->disp_q = newdispq;
458 dp->disp_qactmap = newdqactmap;
459 dp->disp_q_limit = &newdispq[v.v_nglobpris];
460 dp->disp_npri = v.v_nglobpris;
461 }
462
463 void
disp_cpu_fini(cpu_t *cp)
465 {
466 ASSERT(MUTEX_HELD(&cpu_lock));
467
468 disp_kp_free(cp->cpu_disp);
469 if (cp->cpu_disp != &cpu0_disp)
470 kmem_free(cp->cpu_disp, sizeof (disp_t));
471 }
472
473 /*
474 * Allocate new, larger kpreempt dispatch queue to replace the old one.
475 */
476 void
disp_kp_alloc(disp_t *dq, pri_t npri)
478 {
479 struct disp_queue_info mem_info;
480
481 if (npri > dq->disp_npri) {
482 /*
483 * Allocate memory for the new array.
484 */
485 disp_dq_alloc(&mem_info, npri, dq);
486
487 /*
488 * We need to copy the old structures to the new
489 * and free the old.
490 */
491 disp_dq_assign(&mem_info, npri);
492 disp_dq_free(&mem_info);
493 }
494 }
495
496 /*
497 * Free dispatch queue.
498 * Used for the kpreempt queues for a removed CPU partition and
499 * for the per-CPU queues of deleted CPUs.
500 */
501 void
disp_kp_free(disp_t *dq)
503 {
504 struct disp_queue_info mem_info;
505
506 mem_info.olddispq = dq->disp_q;
507 mem_info.olddqactmap = dq->disp_qactmap;
508 mem_info.oldnglobpris = dq->disp_npri;
509 disp_dq_free(&mem_info);
510 }
511
512 /*
513 * End dispatcher and scheduler initialization.
514 */
515
516 /*
517 * See if there's anything to do other than remain idle.
518 * Return non-zero if there is.
519 *
520 * This function must be called with high spl, or with
521 * kernel preemption disabled to prevent the partition's
522 * active cpu list from changing while being traversed.
523 *
524 * This is essentially a simpler version of disp_getwork()
525 * to be called by CPUs preparing to "halt".
526 */
527 int
disp_anywork(void)
529 {
530 cpu_t *cp = CPU;
531 cpu_t *ocp;
532 volatile int *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
533
534 if (!(cp->cpu_flags & CPU_OFFLINE)) {
535 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
536 return (1);
537
538 for (ocp = cp->cpu_next_part; ocp != cp;
539 ocp = ocp->cpu_next_part) {
540 ASSERT(CPU_ACTIVE(ocp));
541
542 /*
543 * Something has appeared on the local run queue.
544 */
545 if (*local_nrunnable > 0)
546 return (1);
547 /*
548 * If we encounter another idle CPU that will
549 * soon be trolling around through disp_anywork()
550 * terminate our walk here and let this other CPU
551 * patrol the next part of the list.
552 */
553 if (ocp->cpu_dispatch_pri == -1 &&
554 (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
555 return (0);
			/*
			 * Work can be taken from another CPU if:
			 *	- There is unbound work on the run queue
			 *	- That work isn't a thread undergoing a
			 *	  context switch on an otherwise empty queue.
			 *	- The CPU isn't running the idle loop.
			 */
563 if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
564 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
565 ocp->cpu_disp->disp_nrunnable == 1) &&
566 ocp->cpu_dispatch_pri != -1)
567 return (1);
568 }
569 }
570 return (0);
571 }
572
573 /*
574 * Called when CPU enters the idle loop
575 */
576 static void
idle_enter()
578 {
579 cpu_t *cp = CPU;
580
581 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
582 CPU_STATS_ADDQ(cp, sys, idlethread, 1);
583 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */
584 }
585
586 /*
587 * Called when CPU exits the idle loop
588 */
589 static void
idle_exit()
591 {
592 cpu_t *cp = CPU;
593
594 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
595 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
596 }
597
598 /*
599 * Idle loop.
600 */
601 void
idle()
603 {
604 struct cpu *cp = CPU; /* pointer to this CPU */
605 kthread_t *t; /* taken thread */
606
607 idle_enter();
608
609 /*
610 * Uniprocessor version of idle loop.
611 * Do this until notified that we're on an actual multiprocessor.
612 */
613 while (ncpus == 1) {
614 if (cp->cpu_disp->disp_nrunnable == 0) {
615 (*idle_cpu)();
616 continue;
617 }
618 idle_exit();
619 swtch();
620
621 idle_enter(); /* returned from swtch */
622 }
623
624 /*
625 * Multiprocessor idle loop.
626 */
627 for (;;) {
628 /*
629 * If CPU is completely quiesced by p_online(2), just wait
630 * here with minimal bus traffic until put online.
631 */
632 while (cp->cpu_flags & CPU_QUIESCED)
633 (*idle_cpu)();
634
635 if (cp->cpu_disp->disp_nrunnable != 0) {
636 idle_exit();
637 swtch();
638 } else {
639 if (cp->cpu_flags & CPU_OFFLINE)
640 continue;
641 if ((t = disp_getwork(cp)) == NULL) {
642 if (cp->cpu_chosen_level != -1) {
643 disp_t *dp = cp->cpu_disp;
644 disp_t *kpq;
645
646 disp_lock_enter(&dp->disp_lock);
647 /*
648 * Set kpq under lock to prevent
649 * migration between partitions.
650 */
651 kpq = &cp->cpu_part->cp_kp_queue;
652 if (kpq->disp_maxrunpri == -1)
653 cp->cpu_chosen_level = -1;
654 disp_lock_exit(&dp->disp_lock);
655 }
656 (*idle_cpu)();
657 continue;
658 }
659 /*
660 * If there was a thread but we couldn't steal
661 * it, then keep trying.
662 */
663 if (t == T_DONTSTEAL)
664 continue;
665 idle_exit();
666 swtch_to(t);
667 }
668 idle_enter(); /* returned from swtch/swtch_to */
669 }
670 }
671
672
673 /*
674 * Preempt the currently running thread in favor of the highest
675 * priority thread. The class of the current thread controls
676 * where it goes on the dispatcher queues. If panicking, turn
677 * preemption off.
678 */
679 void
preempt()
681 {
682 kthread_t *t = curthread;
683 klwp_t *lwp = ttolwp(curthread);
684
685 if (panicstr)
686 return;
687
688 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
689
690 thread_lock(t);
691
692 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
693 /*
694 * this thread has already been chosen to be run on
695 * another CPU. Clear kprunrun on this CPU since we're
696 * already headed for swtch().
697 */
698 CPU->cpu_kprunrun = 0;
699 thread_unlock_nopreempt(t);
700 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
701 } else {
702 if (lwp != NULL)
703 lwp->lwp_ru.nivcsw++;
704 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
705 THREAD_TRANSITION(t);
706 CL_PREEMPT(t);
707 DTRACE_SCHED(preempt);
708 thread_unlock_nopreempt(t);
709
710 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
711
712 swtch(); /* clears CPU->cpu_runrun via disp() */
713 }
714 }
715
716 extern kthread_t *thread_unpin();
717
718 /*
719 * disp() - find the highest priority thread for this processor to run, and
720 * set it in TS_ONPROC state so that resume() can be called to run it.
721 */
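/*
 * The search order is: the partition-wide kpreempt queue (when its best
 * priority is at least that of our own queue), then this CPU's own
 * dispatch queue, then other CPUs' queues via disp_getwork(), and finally
 * the idle thread if nothing runnable was found.
 */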
722 static kthread_t *
disp()
724 {
725 cpu_t *cpup;
726 disp_t *dp;
727 kthread_t *tp;
728 dispq_t *dq;
729 int maxrunword;
730 pri_t pri;
731 disp_t *kpq;
732
733 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
734
735 cpup = CPU;
736 /*
737 * Find the highest priority loaded, runnable thread.
738 */
739 dp = cpup->cpu_disp;
740
741 reschedule:
742 /*
743 * If there is more important work on the global queue with a better
744 * priority than the maximum on this CPU, take it now.
745 */
746 kpq = &cpup->cpu_part->cp_kp_queue;
747 while ((pri = kpq->disp_maxrunpri) >= 0 &&
748 pri >= dp->disp_maxrunpri &&
749 (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
750 (tp = disp_getbest(kpq)) != NULL) {
751 if (disp_ratify(tp, kpq) != NULL) {
752 TRACE_1(TR_FAC_DISP, TR_DISP_END,
753 "disp_end:tid %p", tp);
754 return (tp);
755 }
756 }
757
758 disp_lock_enter(&dp->disp_lock);
759 pri = dp->disp_maxrunpri;
760
761 /*
762 * If there is nothing to run, look at what's runnable on other queues.
763 * Choose the idle thread if the CPU is quiesced.
764 * Note that CPUs that have the CPU_OFFLINE flag set can still run
765 * interrupt threads, which will be the only threads on the CPU's own
766 * queue, but cannot run threads from other queues.
767 */
768 if (pri == -1) {
769 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
770 disp_lock_exit(&dp->disp_lock);
771 if ((tp = disp_getwork(cpup)) == NULL ||
772 tp == T_DONTSTEAL) {
773 tp = cpup->cpu_idle_thread;
774 (void) splhigh();
775 THREAD_ONPROC(tp, cpup);
776 cpup->cpu_dispthread = tp;
777 cpup->cpu_dispatch_pri = -1;
778 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
779 cpup->cpu_chosen_level = -1;
780 }
781 } else {
782 disp_lock_exit_high(&dp->disp_lock);
783 tp = cpup->cpu_idle_thread;
784 THREAD_ONPROC(tp, cpup);
785 cpup->cpu_dispthread = tp;
786 cpup->cpu_dispatch_pri = -1;
787 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
788 cpup->cpu_chosen_level = -1;
789 }
790 TRACE_1(TR_FAC_DISP, TR_DISP_END,
791 "disp_end:tid %p", tp);
792 return (tp);
793 }
794
795 dq = &dp->disp_q[pri];
796 tp = dq->dq_first;
797
798 ASSERT(tp != NULL);
799 ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */
800
801 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
802
803 /*
804 * Found it so remove it from queue.
805 */
806 dp->disp_nrunnable--;
807 dq->dq_sruncnt--;
808 if ((dq->dq_first = tp->t_link) == NULL) {
809 ulong_t *dqactmap = dp->disp_qactmap;
810
811 ASSERT(dq->dq_sruncnt == 0);
812 dq->dq_last = NULL;
813
		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.  If nrunnable != 0, we just took
		 * the last runnable thread off the highest queue, so
		 * recompute disp_maxrunpri.
		 */
820 maxrunword = pri >> BT_ULSHIFT;
821 dqactmap[maxrunword] &= ~BT_BIW(pri);
822
823 if (dp->disp_nrunnable == 0) {
824 dp->disp_max_unbound_pri = -1;
825 dp->disp_maxrunpri = -1;
826 } else {
827 int ipri;
828
829 ipri = bt_gethighbit(dqactmap, maxrunword);
830 dp->disp_maxrunpri = ipri;
831 if (ipri < dp->disp_max_unbound_pri)
832 dp->disp_max_unbound_pri = ipri;
833 }
834 } else {
835 tp->t_link = NULL;
836 }
837
838 /*
839 * Set TS_DONT_SWAP flag to prevent another processor from swapping
840 * out this thread before we have a chance to run it.
841 * While running, it is protected against swapping by t_lock.
842 */
843 tp->t_schedflag |= TS_DONT_SWAP;
844 cpup->cpu_dispthread = tp; /* protected by spl only */
845 cpup->cpu_dispatch_pri = pri;
846 ASSERT(pri == DISP_PRIO(tp));
847 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */
848 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */
849
850 ASSERT(tp != NULL);
851 TRACE_1(TR_FAC_DISP, TR_DISP_END,
852 "disp_end:tid %p", tp);
853
854 if (disp_ratify(tp, kpq) == NULL)
855 goto reschedule;
856
857 return (tp);
858 }
859
860 /*
861 * swtch()
862 * Find best runnable thread and run it.
863 * Called with the current thread already switched to a new state,
864 * on a sleep queue, run queue, stopped, and not zombied.
865 * May be called at any spl level less than or equal to LOCK_LEVEL.
866 * Always drops spl to the base level (spl0()).
867 */
868 void
swtch()
870 {
871 kthread_t *t = curthread;
872 kthread_t *next;
873 cpu_t *cp;
874
875 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
876
877 if (t->t_flag & T_INTR_THREAD)
878 cpu_intr_swtch_enter(t);
879
880 if (t->t_intr != NULL) {
881 /*
882 * We are an interrupt thread. Setup and return
883 * the interrupted thread to be resumed.
884 */
885 (void) splhigh(); /* block other scheduler action */
886 cp = CPU; /* now protected against migration */
887 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
888 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
889 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
890 next = thread_unpin();
891 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
892 resume_from_intr(next);
893 } else {
894 #ifdef DEBUG
895 if (t->t_state == TS_ONPROC &&
896 t->t_disp_queue->disp_cpu == CPU &&
897 t->t_preempt == 0) {
898 thread_lock(t);
899 ASSERT(t->t_state != TS_ONPROC ||
900 t->t_disp_queue->disp_cpu != CPU ||
901 t->t_preempt != 0); /* cannot migrate */
902 thread_unlock_nopreempt(t);
903 }
904 #endif /* DEBUG */
905 cp = CPU;
906 next = disp(); /* returns with spl high */
907 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
908
909 /* OK to steal anything left on run queue */
910 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
911
912 if (next != t) {
913 hrtime_t now;
914
915 now = gethrtime_unscaled();
916 pg_ev_thread_swtch(cp, now, t, next);
917
918 /*
919 * If t was previously in the TS_ONPROC state,
920 * setfrontdq and setbackdq won't have set its t_waitrq.
921 * Since we now finally know that we're switching away
922 * from this thread, set its t_waitrq if it is on a run
923 * queue.
924 */
925 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
926 t->t_waitrq = now;
927 }
928
929 /*
930 * restore mstate of thread that we are switching to
931 */
932 restore_mstate(next);
933
934 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
935 cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
936 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
937
938 if (dtrace_vtime_active)
939 dtrace_vtime_switch(next);
940
941 resume(next);
942 /*
943 * The TR_RESUME_END and TR_SWTCH_END trace points
944 * appear at the end of resume(), because we may not
945 * return here
946 */
947 } else {
948 if (t->t_flag & T_INTR_THREAD)
949 cpu_intr_swtch_exit(t);
950 /*
951 * Threads that enqueue themselves on a run queue defer
952 * setting t_waitrq. It is then either set in swtch()
953 * when the CPU is actually yielded, or not at all if it
954 * is remaining on the CPU.
955 * There is however a window between where the thread
956 * placed itself on a run queue, and where it selects
			 * itself in disp(), where a third party (e.g. clock()
958 * doing tick processing) may have re-enqueued this
959 * thread, setting t_waitrq in the process. We detect
960 * this race by noticing that despite switching to
961 * ourself, our t_waitrq has been set, and should be
962 * cleared.
963 */
964 if (t->t_waitrq != 0)
965 t->t_waitrq = 0;
966
967 pg_ev_thread_remain(cp, t);
968
969 DTRACE_SCHED(remain__cpu);
970 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
971 (void) spl0();
972 }
973 }
974 }
975
976 /*
977 * swtch_from_zombie()
978 * Special case of swtch(), which allows checks for TS_ZOMB to be
979 * eliminated from normal resume.
980 * Find best runnable thread and run it.
981 * Called with the current thread zombied.
982 * Zombies cannot migrate, so CPU references are safe.
983 */
984 void
swtch_from_zombie()
986 {
987 kthread_t *next;
988 cpu_t *cpu = CPU;
989
990 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
991
992 ASSERT(curthread->t_state == TS_ZOMB);
993
994 next = disp(); /* returns with spl high */
995 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */
996 CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
997 ASSERT(next != curthread);
998 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
999
1000 pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
1001
1002 restore_mstate(next);
1003
1004 if (dtrace_vtime_active)
1005 dtrace_vtime_switch(next);
1006
1007 resume_from_zombie(next);
1008 /*
1009 * The TR_RESUME_END and TR_SWTCH_END trace points
1010 * appear at the end of resume(), because we certainly will not
1011 * return here
1012 */
1013 }
1014
1015 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
1016
1017 /*
1018 * search_disp_queues()
1019 * Search the given dispatch queues for thread tp.
1020 * Return 1 if tp is found, otherwise return 0.
1021 */
1022 static int
search_disp_queues(disp_t *dp, kthread_t *tp)
1024 {
1025 dispq_t *dq;
1026 dispq_t *eq;
1027
1028 disp_lock_enter_high(&dp->disp_lock);
1029
1030 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1031 kthread_t *rp;
1032
1033 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1034
1035 for (rp = dq->dq_first; rp; rp = rp->t_link)
1036 if (tp == rp) {
1037 disp_lock_exit_high(&dp->disp_lock);
1038 return (1);
1039 }
1040 }
1041 disp_lock_exit_high(&dp->disp_lock);
1042
1043 return (0);
1044 }
1045
1046 /*
1047 * thread_on_queue()
1048 * Search all per-CPU dispatch queues and all partition-wide kpreempt
1049 * queues for thread tp. Return 1 if tp is found, otherwise return 0.
1050 */
1051 static int
thread_on_queue(kthread_t *tp)
1053 {
1054 cpu_t *cp;
1055 struct cpupart *part;
1056
1057 ASSERT(getpil() >= DISP_LEVEL);
1058
1059 /*
1060 * Search the per-CPU dispatch queues for tp.
1061 */
1062 cp = CPU;
1063 do {
1064 if (search_disp_queues(cp->cpu_disp, tp))
1065 return (1);
1066 } while ((cp = cp->cpu_next_onln) != CPU);
1067
1068 /*
1069 * Search the partition-wide kpreempt queues for tp.
1070 */
1071 part = CPU->cpu_part;
1072 do {
1073 if (search_disp_queues(&part->cp_kp_queue, tp))
1074 return (1);
1075 } while ((part = part->cp_next) != CPU->cpu_part);
1076
1077 return (0);
1078 }
1079
1080 #else
1081
1082 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */
1083
1084 #endif /* DEBUG */
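
/*
 * When DISP_DEBUG is not built in, thread_on_queue() evaluates to 0, so
 * the ASSERT(!thread_on_queue(tp)) checks in setbackdq()/setfrontdq()
 * become trivially true.
 */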
1085
/*
 * Like swtch(), but switch to a specified thread taken from another CPU.
 * Called with spl high.
 */
1090 void
swtch_to(kthread_t *next)
1092 {
1093 cpu_t *cp = CPU;
1094 hrtime_t now;
1095
1096 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1097
1098 /*
1099 * Update context switch statistics.
1100 */
1101 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1102
1103 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1104
1105 now = gethrtime_unscaled();
1106 pg_ev_thread_swtch(cp, now, curthread, next);
1107
1108 /* OK to steal anything left on run queue */
1109 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1110
1111 /* record last execution time */
1112 cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1113
1114 /*
1115 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1116 * won't have set its t_waitrq. Since we now finally know that we're
1117 * switching away from this thread, set its t_waitrq if it is on a run
1118 * queue.
1119 */
1120 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1121 curthread->t_waitrq = now;
1122 }
1123
1124 /* restore next thread to previously running microstate */
1125 restore_mstate(next);
1126
1127 if (dtrace_vtime_active)
1128 dtrace_vtime_switch(next);
1129
1130 resume(next);
1131 /*
1132 * The TR_RESUME_END and TR_SWTCH_END trace points
1133 * appear at the end of resume(), because we may not
1134 * return here
1135 */
1136 }
1137
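/*
 * cpu_resched() is called when a thread of priority tpri has been made
 * runnable on cp's dispatch queue.  If cp is not idle and is running
 * something of lower priority, it sets cpu_runrun (user-level preemption)
 * and, for priorities at or above kpreemptpri, cpu_kprunrun (kernel
 * preemption), then pokes the target CPU with poke_cpu() if it is not the
 * current CPU so that the flags are noticed promptly.
 */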
1138 static void
cpu_resched(cpu_t *cp, pri_t tpri)
1140 {
1141 int call_poke_cpu = 0;
1142 pri_t cpupri = cp->cpu_dispatch_pri;
1143
1144 if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
1145 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1146 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1147 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1148 cp->cpu_runrun = 1;
1149 aston(cp->cpu_dispthread);
1150 if (tpri < kpreemptpri && cp != CPU)
1151 call_poke_cpu = 1;
1152 }
1153 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1154 cp->cpu_kprunrun = 1;
1155 if (cp != CPU)
1156 call_poke_cpu = 1;
1157 }
1158 }
1159
1160 /*
1161 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1162 */
1163 membar_enter();
1164
1165 if (call_poke_cpu)
1166 poke_cpu(cp->cpu_id);
1167 }
1168
1169 /*
1170 * setbackdq() keeps runqs balanced such that the difference in length
1171 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1172 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1173 * must match. When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1174 * try to keep runqs perfectly balanced regardless of the thread priority.
1175 */
1176 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */
1177 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */
1178 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
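/*
 * For example, a priority-60 thread without TS_RUNQMATCH is only moved to
 * the alternative CPU when that CPU's queue at the same priority is more
 * than RUNQ_MAX_DIFF entries shorter than the chosen CPU's, while a
 * priority-10 thread (below RUNQ_MATCH_PRI) is moved whenever the
 * alternative queue is shorter at all (subject to smt_should_run()).
 */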
1179
1180 /*
1181 * Macro that evaluates to true if it is likely that the thread has cache
1182 * warmth. This is based on the amount of time that has elapsed since the
1183 * thread last ran. If that amount of time is less than "rechoose_interval"
1184 * ticks, then we decide that the thread has enough cache warmth to warrant
1185 * some affinity for t->t_cpu.
1186 */
1187 #define THREAD_HAS_CACHE_WARMTH(thread) \
1188 ((thread == curthread) || \
1189 ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1190 /*
1191 * Put the specified thread on the back of the dispatcher
1192 * queue corresponding to its current priority.
1193 *
1194 * Called with the thread in transition, onproc or stopped state
1195 * and locked (transition implies locked) and at high spl.
1196 * Returns with the thread in TS_RUN state and still locked.
1197 */
1198 void
setbackdq(kthread_t *tp)
1200 {
1201 dispq_t *dq;
1202 disp_t *dp;
1203 cpu_t *cp;
1204 pri_t tpri;
1205 int bound;
1206 boolean_t self;
1207
1208 ASSERT(THREAD_LOCK_HELD(tp));
1209 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1210 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1211
1212 /*
1213 * If thread is "swapped" or on the swap queue don't
1214 * queue it, but wake sched.
1215 */
1216 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1217 disp_swapped_setrun(tp);
1218 return;
1219 }
1220
1221 self = (tp == curthread);
1222
1223 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1224 bound = 1;
1225 else
1226 bound = 0;
1227
1228 tpri = DISP_PRIO(tp);
1229 if (ncpus == 1)
1230 cp = tp->t_cpu;
1231 else if (!bound) {
1232 if (tpri >= kpqpri) {
1233 setkpdq(tp, SETKP_BACK);
1234 return;
1235 }
1236
1237 /*
1238 * We'll generally let this thread continue to run where
1239 * it last ran...but will consider migration if:
1240 * - The thread probably doesn't have much cache warmth.
1241 * - SMT exclusion would prefer us to run elsewhere
1242 * - The CPU where it last ran is the target of an offline
1243 * request.
1244 * - The thread last ran outside its home lgroup.
1245 */
1246 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1247 !smt_should_run(tp, tp->t_cpu) ||
1248 (tp->t_cpu == cpu_inmotion) ||
1249 !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1250 cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1251 } else {
1252 cp = tp->t_cpu;
1253 }
1254
1255 if (tp->t_cpupart == cp->cpu_part) {
1256 int qlen;
1257
1258 /*
1259 * Perform any CMT load balancing
1260 */
1261 cp = cmt_balance(tp, cp);
1262
1263 /*
1264 * Balance across the run queues
1265 */
1266 qlen = RUNQ_LEN(cp, tpri);
1267 if (tpri >= RUNQ_MATCH_PRI &&
1268 !(tp->t_schedflag & TS_RUNQMATCH))
1269 qlen -= RUNQ_MAX_DIFF;
1270 if (qlen > 0) {
1271 cpu_t *newcp;
1272
1273 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1274 newcp = cp->cpu_next_part;
1275 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1276 newcp = cp->cpu_next_part;
1277 }
1278
1279 if (smt_should_run(tp, newcp) &&
1280 RUNQ_LEN(newcp, tpri) < qlen) {
1281 DTRACE_PROBE3(runq__balance,
1282 kthread_t *, tp,
1283 cpu_t *, cp, cpu_t *, newcp);
1284 cp = newcp;
1285 }
1286 }
1287 } else {
1288 /*
1289 * Migrate to a cpu in the new partition.
1290 */
1291 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
1292 tp->t_pri);
1293 }
1294 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1295 } else {
1296 /*
1297 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1298 * a short time until weak binding that existed when the
1299 * strong binding was established has dropped) so we must
1300 * favour weak binding over strong.
1301 */
1302 cp = tp->t_weakbound_cpu ?
1303 tp->t_weakbound_cpu : tp->t_bound_cpu;
1304 }
1305 /*
1306 * A thread that is ONPROC may be temporarily placed on the run queue
1307 * but then chosen to run again by disp. If the thread we're placing on
1308 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1309 * replacement process is actually scheduled in swtch(). In this
1310 * situation, curthread is the only thread that could be in the ONPROC
1311 * state.
1312 */
1313 if ((!self) && (tp->t_waitrq == 0)) {
1314 hrtime_t curtime;
1315
1316 curtime = gethrtime_unscaled();
1317 (void) cpu_update_pct(tp, curtime);
1318 tp->t_waitrq = curtime;
1319 } else {
1320 (void) cpu_update_pct(tp, gethrtime_unscaled());
1321 }
1322
1323 dp = cp->cpu_disp;
1324 disp_lock_enter_high(&dp->disp_lock);
1325
1326 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1327 TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1328 tpri, cp, tp);
1329
1330 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1331
1332 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1333 tp->t_disp_queue = dp;
1334 tp->t_link = NULL;
1335
1336 dq = &dp->disp_q[tpri];
1337 dp->disp_nrunnable++;
1338 if (!bound)
1339 dp->disp_steal = 0;
1340 membar_enter();
1341
1342 if (dq->dq_sruncnt++ != 0) {
1343 ASSERT(dq->dq_first != NULL);
1344 dq->dq_last->t_link = tp;
1345 dq->dq_last = tp;
1346 } else {
1347 ASSERT(dq->dq_first == NULL);
1348 ASSERT(dq->dq_last == NULL);
1349 dq->dq_first = dq->dq_last = tp;
1350 BT_SET(dp->disp_qactmap, tpri);
1351 if (tpri > dp->disp_maxrunpri) {
1352 dp->disp_maxrunpri = tpri;
1353 membar_enter();
1354 cpu_resched(cp, tpri);
1355 }
1356 }
1357
1358 if (!bound && tpri > dp->disp_max_unbound_pri) {
1359 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1360 /*
1361 * If there are no other unbound threads on the
1362 * run queue, don't allow other CPUs to steal
1363 * this thread while we are in the middle of a
1364 * context switch. We may just switch to it
1365 * again right away. CPU_DISP_DONTSTEAL is cleared
1366 * in swtch and swtch_to.
1367 */
1368 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1369 }
1370 dp->disp_max_unbound_pri = tpri;
1371 }
1372 (*disp_enq_thread)(cp, bound);
1373 }
1374
1375 /*
1376 * Put the specified thread on the front of the dispatcher
1377 * queue corresponding to its current priority.
1378 *
1379 * Called with the thread in transition, onproc or stopped state
1380 * and locked (transition implies locked) and at high spl.
1381 * Returns with the thread in TS_RUN state and still locked.
1382 */
1383 void
setfrontdq(kthread_t *tp)
1385 {
1386 disp_t *dp;
1387 dispq_t *dq;
1388 cpu_t *cp;
1389 pri_t tpri;
1390 int bound;
1391
1392 ASSERT(THREAD_LOCK_HELD(tp));
1393 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1394 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1395
1396 /*
1397 * If thread is "swapped" or on the swap queue don't
1398 * queue it, but wake sched.
1399 */
1400 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1401 disp_swapped_setrun(tp);
1402 return;
1403 }
1404
1405 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1406 bound = 1;
1407 else
1408 bound = 0;
1409
1410 tpri = DISP_PRIO(tp);
1411 if (ncpus == 1)
1412 cp = tp->t_cpu;
1413 else if (!bound) {
1414 if (tpri >= kpqpri) {
1415 setkpdq(tp, SETKP_FRONT);
1416 return;
1417 }
1418 cp = tp->t_cpu;
1419 if (tp->t_cpupart == cp->cpu_part) {
1420 /*
1421 * We'll generally let this thread continue to run
1422 * where it last ran, but will consider migration if:
1423 * - The thread last ran outside its home lgroup.
1424 * - The CPU where it last ran is the target of an
1425 * offline request (a thread_nomigrate() on the in
1426 * motion CPU relies on this when forcing a preempt).
1427 * - The thread isn't the highest priority thread where
1428 * it last ran, and it is considered not likely to
1429 * have significant cache warmth.
1430 */
1431 if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
1432 cp == cpu_inmotion ||
1433 (tpri < cp->cpu_disp->disp_maxrunpri &&
1434 !THREAD_HAS_CACHE_WARMTH(tp))) {
1435 cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1436 }
1437 } else {
1438 /*
1439 * Migrate to a cpu in the new partition.
1440 */
1441 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1442 tp, tp->t_pri);
1443 }
1444 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1445 } else {
1446 /*
1447 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1448 * a short time until weak binding that existed when the
1449 * strong binding was established has dropped) so we must
1450 * favour weak binding over strong.
1451 */
1452 cp = tp->t_weakbound_cpu ?
1453 tp->t_weakbound_cpu : tp->t_bound_cpu;
1454 }
1455
1456 /*
1457 * A thread that is ONPROC may be temporarily placed on the run queue
1458 * but then chosen to run again by disp. If the thread we're placing on
1459 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1460 * replacement process is actually scheduled in swtch(). In this
1461 * situation, curthread is the only thread that could be in the ONPROC
1462 * state.
1463 */
1464 if ((tp != curthread) && (tp->t_waitrq == 0)) {
1465 hrtime_t curtime;
1466
1467 curtime = gethrtime_unscaled();
1468 (void) cpu_update_pct(tp, curtime);
1469 tp->t_waitrq = curtime;
1470 } else {
1471 (void) cpu_update_pct(tp, gethrtime_unscaled());
1472 }
1473
1474 dp = cp->cpu_disp;
1475 disp_lock_enter_high(&dp->disp_lock);
1476
1477 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1478 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1479
1480 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1481
1482 THREAD_RUN(tp, &dp->disp_lock); /* set TS_RUN state and lock */
1483 tp->t_disp_queue = dp;
1484
1485 dq = &dp->disp_q[tpri];
1486 dp->disp_nrunnable++;
1487 if (!bound)
1488 dp->disp_steal = 0;
1489 membar_enter();
1490
1491 if (dq->dq_sruncnt++ != 0) {
1492 ASSERT(dq->dq_last != NULL);
1493 tp->t_link = dq->dq_first;
1494 dq->dq_first = tp;
1495 } else {
1496 ASSERT(dq->dq_last == NULL);
1497 ASSERT(dq->dq_first == NULL);
1498 tp->t_link = NULL;
1499 dq->dq_first = dq->dq_last = tp;
1500 BT_SET(dp->disp_qactmap, tpri);
1501 if (tpri > dp->disp_maxrunpri) {
1502 dp->disp_maxrunpri = tpri;
1503 membar_enter();
1504 cpu_resched(cp, tpri);
1505 }
1506 }
1507
1508 if (!bound && tpri > dp->disp_max_unbound_pri) {
1509 if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1510 cp == CPU) {
1511 /*
1512 * If there are no other unbound threads on the
1513 * run queue, don't allow other CPUs to steal
1514 * this thread while we are in the middle of a
1515 * context switch. We may just switch to it
1516 * again right away. CPU_DISP_DONTSTEAL is cleared
1517 * in swtch and swtch_to.
1518 */
1519 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1520 }
1521 dp->disp_max_unbound_pri = tpri;
1522 }
1523 (*disp_enq_thread)(cp, bound);
1524 }
1525
1526 /*
1527 * Put a high-priority unbound thread on the kp queue
1528 */
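/*
 * Threads on the kp queue have no CPU affinity: setkpdq() enqueues the
 * thread on the partition-wide queue and then nudges the lowest-priority
 * CPU in the partition (via disp_lowpri_cpu() and cpu_resched()) so that
 * some CPU picks the thread up promptly.
 */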
1529 static void
setkpdq(kthread_t *tp, int borf)
1531 {
1532 dispq_t *dq;
1533 disp_t *dp;
1534 cpu_t *cp;
1535 pri_t tpri;
1536
1537 tpri = DISP_PRIO(tp);
1538
1539 dp = &tp->t_cpupart->cp_kp_queue;
1540 disp_lock_enter_high(&dp->disp_lock);
1541
1542 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1543
1544 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1545 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1546 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1547 tp->t_disp_queue = dp;
1548 dp->disp_nrunnable++;
1549 dq = &dp->disp_q[tpri];
1550
1551 if (dq->dq_sruncnt++ != 0) {
1552 if (borf == SETKP_BACK) {
1553 ASSERT(dq->dq_first != NULL);
1554 tp->t_link = NULL;
1555 dq->dq_last->t_link = tp;
1556 dq->dq_last = tp;
1557 } else {
1558 ASSERT(dq->dq_last != NULL);
1559 tp->t_link = dq->dq_first;
1560 dq->dq_first = tp;
1561 }
1562 } else {
1563 if (borf == SETKP_BACK) {
1564 ASSERT(dq->dq_first == NULL);
1565 ASSERT(dq->dq_last == NULL);
1566 dq->dq_first = dq->dq_last = tp;
1567 } else {
1568 ASSERT(dq->dq_last == NULL);
1569 ASSERT(dq->dq_first == NULL);
1570 tp->t_link = NULL;
1571 dq->dq_first = dq->dq_last = tp;
1572 }
1573 BT_SET(dp->disp_qactmap, tpri);
1574 if (tpri > dp->disp_max_unbound_pri)
1575 dp->disp_max_unbound_pri = tpri;
1576 if (tpri > dp->disp_maxrunpri) {
1577 dp->disp_maxrunpri = tpri;
1578 membar_enter();
1579 }
1580 }
1581
1582 cp = tp->t_cpu;
1583 if (tp->t_cpupart != cp->cpu_part) {
1584 /* migrate to a cpu in the new partition */
1585 cp = tp->t_cpupart->cp_cpulist;
1586 }
1587 cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
1588 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1589 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1590
1591 if (cp->cpu_chosen_level < tpri)
1592 cp->cpu_chosen_level = tpri;
1593 cpu_resched(cp, tpri);
1594 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1595 (*disp_enq_thread)(cp, 0);
1596 }
1597
1598 /*
1599 * Remove a thread from the dispatcher queue if it is on it.
1600 * It is not an error if it is not found but we return whether
1601 * or not it was found in case the caller wants to check.
1602 */
1603 int
dispdeq(kthread_t *tp)
1605 {
1606 disp_t *dp;
1607 dispq_t *dq;
1608 kthread_t *rp;
1609 kthread_t *trp;
1610 kthread_t **ptp;
1611 int tpri;
1612
1613 ASSERT(THREAD_LOCK_HELD(tp));
1614
1615 if (tp->t_state != TS_RUN)
1616 return (0);
1617
1618 /*
1619 * The thread is "swapped" or is on the swap queue and
1620 * hence no longer on the run queue, so return true.
1621 */
1622 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1623 return (1);
1624
1625 tpri = DISP_PRIO(tp);
1626 dp = tp->t_disp_queue;
1627 ASSERT(tpri < dp->disp_npri);
1628 dq = &dp->disp_q[tpri];
1629 ptp = &dq->dq_first;
1630 rp = *ptp;
1631 trp = NULL;
1632
1633 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1634
1635 /*
1636 * Search for thread in queue.
1637 * Double links would simplify this at the expense of disp/setrun.
1638 */
1639 while (rp != tp && rp != NULL) {
1640 trp = rp;
1641 ptp = &trp->t_link;
1642 rp = trp->t_link;
1643 }
1644
1645 if (rp == NULL) {
1646 panic("dispdeq: thread not on queue");
1647 }
1648
1649 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1650
1651 /*
1652 * Found it so remove it from queue.
1653 */
1654 if ((*ptp = rp->t_link) == NULL)
1655 dq->dq_last = trp;
1656
1657 dp->disp_nrunnable--;
1658 if (--dq->dq_sruncnt == 0) {
1659 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1660 if (dp->disp_nrunnable == 0) {
1661 dp->disp_max_unbound_pri = -1;
1662 dp->disp_maxrunpri = -1;
1663 } else if (tpri == dp->disp_maxrunpri) {
1664 int ipri;
1665
1666 ipri = bt_gethighbit(dp->disp_qactmap,
1667 dp->disp_maxrunpri >> BT_ULSHIFT);
1668 if (ipri < dp->disp_max_unbound_pri)
1669 dp->disp_max_unbound_pri = ipri;
1670 dp->disp_maxrunpri = ipri;
1671 }
1672 }
1673 tp->t_link = NULL;
1674 THREAD_TRANSITION(tp); /* put in intermediate state */
1675 return (1);
1676 }
1677
1678
1679 /*
1680 * dq_sruninc and dq_srundec are public functions for
1681 * incrementing/decrementing the sruncnts when a thread on
1682 * a dispatcher queue is made schedulable/unschedulable by
1683 * resetting the TS_LOAD flag.
1684 *
1685 * The caller MUST have the thread lock and therefore the dispatcher
1686 * queue lock so that the operation which changes
1687 * the flag, the operation that checks the status of the thread to
1688 * determine if it's on a disp queue AND the call to this function
1689 * are one atomic operation with respect to interrupts.
1690 */
1691
1692 /*
1693 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1694 */
1695 void
dq_sruninc(kthread_t *t)
1697 {
1698 ASSERT(t->t_state == TS_RUN);
1699 ASSERT(t->t_schedflag & TS_LOAD);
1700
1701 THREAD_TRANSITION(t);
1702 setfrontdq(t);
1703 }
1704
1705 /*
1706 * See comment on calling conventions above.
1707 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1708 */
1709 void
dq_srundec(kthread_t *t)
1711 {
1712 ASSERT(t->t_schedflag & TS_LOAD);
1713
1714 (void) dispdeq(t);
1715 disp_swapped_enq(t);
1716 }
1717
1718 /*
1719 * Change the dispatcher lock of thread to the "swapped_lock"
1720 * and return with thread lock still held.
1721 *
1722 * Called with thread_lock held, in transition state, and at high spl.
1723 */
1724 void
disp_swapped_enq(kthread_t *tp)
1726 {
1727 ASSERT(THREAD_LOCK_HELD(tp));
1728 ASSERT(tp->t_schedflag & TS_LOAD);
1729
1730 switch (tp->t_state) {
1731 case TS_RUN:
1732 disp_lock_enter_high(&swapped_lock);
1733 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1734 break;
1735 case TS_ONPROC:
1736 disp_lock_enter_high(&swapped_lock);
1737 THREAD_TRANSITION(tp);
1738 wake_sched_sec = 1; /* tell clock to wake sched */
1739 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1740 break;
1741 default:
1742 panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1743 }
1744 }
1745
1746 /*
 * This routine is called by setbackdq/setfrontdq if the thread is
 * either not loaded, or loaded but on the swap queue.
 *
 * Thread state TS_SLEEP implies that a swapped thread
 * has been woken up and needs to be swapped in by the swapper.
 *
 * Thread state TS_RUN implies that the priority of a swapped
 * thread is being increased by its scheduling class (e.g. ts_update).
1755 */
1756 static void
disp_swapped_setrun(kthread_t *tp)
1758 {
1759 ASSERT(THREAD_LOCK_HELD(tp));
1760 ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1761
1762 switch (tp->t_state) {
1763 case TS_SLEEP:
1764 disp_lock_enter_high(&swapped_lock);
1765 /*
1766 * Wakeup sched immediately (i.e., next tick) if the
1767 * thread priority is above maxclsyspri.
1768 */
1769 if (DISP_PRIO(tp) > maxclsyspri)
1770 wake_sched = 1;
1771 else
1772 wake_sched_sec = 1;
1773 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1774 break;
1775 case TS_RUN: /* called from ts_update */
1776 break;
1777 default:
1778 panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1779 }
1780 }
1781
1782 /*
1783 * Make a thread give up its processor. Find the processor on
1784 * which this thread is executing, and have that processor
1785 * preempt.
1786 *
1787 * We allow System Duty Cycle (SDC) threads to be preempted even if
1788 * they are running at kernel priorities. To implement this, we always
1789 * set cpu_kprunrun; this ensures preempt() will be called. Since SDC
1790 * calls cpu_surrender() very often, we only preempt if there is anyone
1791 * competing with us.
1792 */
1793 void
cpu_surrender(kthread_t *tp)
1795 {
1796 cpu_t *cpup;
1797 int max_pri;
1798 int max_run_pri;
1799 klwp_t *lwp;
1800
1801 ASSERT(THREAD_LOCK_HELD(tp));
1802
1803 if (tp->t_state != TS_ONPROC)
1804 return;
1805 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */
1806 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1807 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1808 if (max_pri < max_run_pri)
1809 max_pri = max_run_pri;
1810
1811 if (tp->t_cid == sysdccid) {
1812 uint_t t_pri = DISP_PRIO(tp);
1813 if (t_pri > max_pri)
1814 return; /* we are not competing w/ anyone */
1815 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1816 } else {
1817 cpup->cpu_runrun = 1;
1818 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1819 cpup->cpu_kprunrun = 1;
1820 }
1821 }
1822
1823 /*
1824 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1825 */
1826 membar_enter();
1827
1828 DTRACE_SCHED1(surrender, kthread_t *, tp);
1829
1830 /*
1831 * Make the target thread take an excursion through trap()
1832 * to do preempt() (unless we're already in trap or post_syscall,
1833 * calling cpu_surrender via CL_TRAPRET).
1834 */
1835 if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1836 lwp->lwp_state != LWP_USER) {
1837 aston(tp);
1838 if (cpup != CPU)
1839 poke_cpu(cpup->cpu_id);
1840 }
1841 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1842 "cpu_surrender:tid %p cpu %p", tp, cpup);
1843 }
1844
1845 /*
1846 * Commit to and ratify a scheduling decision
1847 */
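/*
 * After disp() or disp_getwork() has selected thread tp, the current CPU
 * first clears its runrun/kprunrun flags (the "commit"), then re-checks
 * both its own queue and the partition's kp queue.  If a higher-priority
 * thread has appeared in the meantime, tp is put back on the front of a
 * run queue and NULL is returned so the caller will choose again.
 */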
1848 /*ARGSUSED*/
1849 static kthread_t *
disp_ratify(kthread_t *tp, disp_t *kpq)
1851 {
1852 pri_t tpri, maxpri;
1853 pri_t maxkpri;
1854 cpu_t *cpup;
1855
1856 ASSERT(tp != NULL);
1857 /*
1858 * Commit to, then ratify scheduling decision
1859 */
1860 cpup = CPU;
1861 if (cpup->cpu_runrun != 0)
1862 cpup->cpu_runrun = 0;
1863 if (cpup->cpu_kprunrun != 0)
1864 cpup->cpu_kprunrun = 0;
1865 if (cpup->cpu_chosen_level != -1)
1866 cpup->cpu_chosen_level = -1;
1867 membar_enter();
1868 tpri = DISP_PRIO(tp);
1869 maxpri = cpup->cpu_disp->disp_maxrunpri;
1870 maxkpri = kpq->disp_maxrunpri;
1871 if (maxpri < maxkpri)
1872 maxpri = maxkpri;
1873 if (tpri < maxpri) {
1874 /*
1875 * should have done better
1876 * put this one back and indicate to try again
1877 */
1878 cpup->cpu_dispthread = curthread; /* fixup dispthread */
1879 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1880 thread_lock_high(tp);
1881 THREAD_TRANSITION(tp);
1882 setfrontdq(tp);
1883 thread_unlock_nopreempt(tp);
1884
1885 tp = NULL;
1886 }
1887 return (tp);
1888 }
1889
/*
 * See if there is any work on the dispatch queues of other CPUs.
 * If there is, dequeue the best thread and return it.
 */
1894 static kthread_t *
disp_getwork(cpu_t *cp)
1896 {
1897 cpu_t *ocp; /* other CPU */
1898 cpu_t *ocp_start;
1899 cpu_t *tcp; /* target local CPU */
1900 kthread_t *tp;
1901 kthread_t *retval = NULL;
1902 pri_t maxpri;
1903 disp_t *kpq; /* kp queue for this partition */
1904 lpl_t *lpl, *lpl_leaf;
1905 int leafidx, startidx;
1906 hrtime_t stealtime;
1907 lgrp_id_t local_id;
1908
1909 maxpri = -1;
1910 tcp = NULL;
1911
1912 kpq = &cp->cpu_part->cp_kp_queue;
1913 while (kpq->disp_maxrunpri >= 0) {
1914 /*
1915 * Try to take a thread from the kp_queue.
1916 */
1917 tp = (disp_getbest(kpq));
1918 if (tp)
1919 return (disp_ratify(tp, kpq));
1920 }
1921
1922 kpreempt_disable(); /* protect the cpu_active list */
1923
1924 /*
1925 * Try to find something to do on another CPU's run queue.
1926 * Loop through all other CPUs looking for the one with the highest
1927 * priority unbound thread.
1928 *
1929 * On NUMA machines, the partition's CPUs are consulted in order of
1930 * distance from the current CPU. This way, the first available
1931 * work found is also the closest, and will suffer the least
1932 * from being migrated.
1933 */
1934 lpl = lpl_leaf = cp->cpu_lpl;
1935 local_id = lpl_leaf->lpl_lgrpid;
1936 leafidx = startidx = 0;
1937
1938 /*
1939 * This loop traverses the lpl hierarchy. Higher level lpls represent
1940 * broader levels of locality
1941 */
1942 do {
1943 /* This loop iterates over the lpl's leaves */
1944 do {
1945 if (lpl_leaf != cp->cpu_lpl)
1946 ocp = lpl_leaf->lpl_cpus;
1947 else
1948 ocp = cp->cpu_next_lpl;
1949
1950 /* This loop iterates over the CPUs in the leaf */
1951 ocp_start = ocp;
1952 do {
1953 pri_t pri;
1954
1955 ASSERT(CPU_ACTIVE(ocp));
1956
1957 /*
1958 * End our stroll around this lpl if:
1959 *
1960 * - Something became runnable on the local
1961 * queue...which also ends our stroll around
1962 * the partition.
1963 *
1964 * - We happen across another idle CPU.
1965 * Since it is patrolling the next portion
1966 * of the lpl's list (assuming it's not
1967 * halted, or busy servicing an interrupt),
1968 * move to the next higher level of locality.
1969 */
1970 if (cp->cpu_disp->disp_nrunnable != 0) {
1971 kpreempt_enable();
1972 return (NULL);
1973 }
1974 if (ocp->cpu_dispatch_pri == -1) {
1975 if (ocp->cpu_disp_flags &
1976 CPU_DISP_HALTED ||
1977 ocp->cpu_intr_actv != 0)
1978 continue;
1979 else
1980 goto next_level;
1981 }
1982
1983 /*
1984 * If there's only one thread and the CPU
1985 * is in the middle of a context switch,
1986 * or it's currently running the idle thread,
1987 * don't steal it.
1988 */
1989 if ((ocp->cpu_disp_flags &
1990 CPU_DISP_DONTSTEAL) &&
1991 ocp->cpu_disp->disp_nrunnable == 1)
1992 continue;
1993
1994 pri = ocp->cpu_disp->disp_max_unbound_pri;
1995 if (pri > maxpri) {
1996 /*
1997 * Don't steal threads that we attempted
1998 * to steal recently until they're ready
1999 * to be stolen again.
2000 */
2001 stealtime = ocp->cpu_disp->disp_steal;
2002 if (stealtime == 0 ||
2003 stealtime - gethrtime() <= 0) {
2004 maxpri = pri;
2005 tcp = ocp;
2006 } else {
2007 /*
2008 * Don't update tcp, just set
2009 * the retval to T_DONTSTEAL, so
2010 * that if no acceptable CPUs
2011 * are found the return value
2012 * will be T_DONTSTEAL rather
2013 * than NULL.
2014 */
2015 retval = T_DONTSTEAL;
2016 }
2017 }
2018 } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2019
2020 /*
2021 * Iterate to the next leaf lpl in the resource set
2022 * at this level of locality. If we hit the end of
2023 * the set, wrap back around to the beginning.
2024 *
2025 * Note: This iteration is NULL terminated for a reason;
2026 * see lpl_topo_bootstrap() in lgrp.c for details.
2027 */
2028 if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2029 leafidx = 0;
2030 lpl_leaf = lpl->lpl_rset[leafidx];
2031 }
2032 } while (leafidx != startidx);
2033
2034 next_level:
2035 /*
2036 * Expand the search to include farther away CPUs (next
2037 * locality level). The closer CPUs that have already been
2038 * checked will be checked again. In doing so, idle CPUs
2039 * will tend to be more aggressive about stealing from CPUs
2040 * that are closer (since the closer CPUs will be considered
2041 * more often).
2042 * Begin at this level with the CPU's local leaf lpl.
2043 */
2044 if ((lpl = lpl->lpl_parent) != NULL) {
2045 leafidx = startidx = lpl->lpl_id2rset[local_id];
2046 lpl_leaf = lpl->lpl_rset[leafidx];
2047 }
2048 } while (!tcp && lpl);
2049
2050 kpreempt_enable();
2051
2052 /*
2053 * If another queue looks good, and there is still nothing on
2054 * the local queue, try to transfer one or more threads
2055 * from it to our queue.
2056 */
2057 if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2058 tp = disp_getbest(tcp->cpu_disp);
2059 if (tp == NULL || tp == T_DONTSTEAL)
2060 return (tp);
2061 return (disp_ratify(tp, kpq));
2062 }
2063 return (retval);
2064 }
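
/*
 * A simplified, user-level sketch (under #if 0, hypothetical model_* names)
 * of the search order used by disp_getwork(): visit every leaf at the
 * current locality level, wrapping around the resource set, and widen to
 * the parent level when nothing stealable is found.  The model uses an
 * explicit leaf count instead of the NULL-terminated rset, and assumes
 * every level has at least one leaf.
 */
#if 0
typedef struct model_lpl model_lpl_t;
struct model_lpl {
	model_lpl_t	**ml_rset;	/* leaves at this locality level */
	int		ml_nrset;	/* number of leaves (>= 1) */
	model_lpl_t	*ml_parent;	/* next wider locality level */
	int		ml_best_pri;	/* best stealable pri in this leaf */
};

static int
model_getwork(model_lpl_t *home, int startidx)
{
	model_lpl_t *lvl;

	for (lvl = home; lvl != NULL; lvl = lvl->ml_parent) {
		int i;

		for (i = 0; i < lvl->ml_nrset; i++) {
			model_lpl_t *leaf =
			    lvl->ml_rset[(startidx + i) % lvl->ml_nrset];

			if (leaf->ml_best_pri >= 0)
				return (leaf->ml_best_pri);
		}
	}
	return (-1);			/* nothing stealable anywhere */
}
#endif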
2065
2066
2067 /*
2068 * disp_fix_unbound_pri()
2069 * Determines the maximum priority of unbound threads on the queue.
2070 * The priority is kept for the queue, but is only increased, never
2071 * reduced unless some CPU is looking for something on that queue.
2072 *
2073 * The priority argument is the known upper limit.
2074 *
2075 * Perhaps this should be kept accurately, but that probably means
2076 * separate bitmaps for bound and unbound threads. Since only idled
2077 * CPUs will have to do this recalculation, it seems better this way.
2078 */
2079 static void
2080 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2081 {
2082 kthread_t *tp;
2083 dispq_t *dq;
2084 ulong_t *dqactmap = dp->disp_qactmap;
2085 ulong_t mapword;
2086 int wx;
2087
2088 ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2089
2090 ASSERT(pri >= 0); /* checked by caller */
2091
2092 /*
2093 * Start the search at the next lowest priority below the supplied
2094 * priority. This depends on the bitmap implementation.
2095 */
2096 do {
2097 wx = pri >> BT_ULSHIFT; /* index of word in map */
2098
2099 /*
2100 * Form mask for all lower priorities in the word.
2101 */
2102 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2103
2104 /*
2105 * Get next lower active priority.
2106 */
2107 if (mapword != 0) {
2108 pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2109 } else if (wx > 0) {
2110 pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2111 if (pri < 0)
2112 break;
2113 } else {
2114 pri = -1;
2115 break;
2116 }
2117
2118 /*
2119 * Search the queue for unbound, runnable threads.
2120 */
2121 dq = &dp->disp_q[pri];
2122 tp = dq->dq_first;
2123
2124 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2125 tp = tp->t_link;
2126 }
2127
2128 /*
2129 * If a thread was found, set the priority and return.
2130 */
2131 } while (tp == NULL);
2132
2133 /*
2134 * pri holds the maximum unbound thread priority or -1.
2135 */
2136 if (dp->disp_max_unbound_pri != pri)
2137 dp->disp_max_unbound_pri = pri;
2138 }
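
/*
 * A standalone sketch (under #if 0) of the bitmap walk above: given the
 * active-queue bitmap and a starting priority, find the next lower
 * priority whose bit is set.  MODEL_BPW and model_highbit() are simplified
 * stand-ins for the BT_ULSHIFT/BT_BIW macros and highbit(); pri is assumed
 * to be non-negative, as the caller asserts.
 */
#if 0
#include <limits.h>

#define	MODEL_BPW	((int)(sizeof (unsigned long) * CHAR_BIT))

static int
model_highbit(unsigned long w)	/* 1-based index of the highest set bit */
{
	int b = 0;

	while (w != 0) {
		b++;
		w >>= 1;
	}
	return (b);
}

static int
model_next_lower_pri(const unsigned long *map, int pri)
{
	int wx = pri / MODEL_BPW;
	unsigned long word = map[wx] & ((1UL << (pri % MODEL_BPW)) - 1);

	if (word != 0)
		return (wx * MODEL_BPW + model_highbit(word) - 1);

	while (--wx >= 0) {
		if (map[wx] != 0)
			return (wx * MODEL_BPW + model_highbit(map[wx]) - 1);
	}
	return (-1);			/* no lower active priority */
}
#endif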
2139
2140 /*
2141 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2142 * check if the CPU to which it was previously bound should have
2143 * its disp_max_unbound_pri increased.
2144 */
2145 void
2146 disp_adjust_unbound_pri(kthread_t *tp)
2147 {
2148 disp_t *dp;
2149 pri_t tpri;
2150
2151 ASSERT(THREAD_LOCK_HELD(tp));
2152
2153 /*
2154 * Don't do anything if the thread is not bound, or
2155 * currently not runnable or swapped out.
2156 */
2157 if (tp->t_bound_cpu == NULL ||
2158 tp->t_state != TS_RUN ||
2159 tp->t_schedflag & TS_ON_SWAPQ)
2160 return;
2161
2162 tpri = DISP_PRIO(tp);
2163 dp = tp->t_bound_cpu->cpu_disp;
2164 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2165 if (tpri > dp->disp_max_unbound_pri)
2166 dp->disp_max_unbound_pri = tpri;
2167 }
2168
2169 /*
2170 * disp_getbest()
2171 * De-queue the highest priority unbound runnable thread.
2172 * Returns with the thread unlocked and onproc but at splhigh (like disp()).
2173 * Returns NULL if nothing found.
2174 * Returns T_DONTSTEAL if the thread was not stealable,
2175 * so that the caller will try again later.
2176 *
2177 * Passed a pointer to a dispatch queue not associated with this
2178 * CPU.
2179 */
2180 static kthread_t *
2181 disp_getbest(disp_t *dp)
2182 {
2183 kthread_t *tp;
2184 dispq_t *dq;
2185 pri_t pri;
2186 cpu_t *cp, *tcp;
2187 boolean_t allbound;
2188
2189 disp_lock_enter(&dp->disp_lock);
2190
2191 /*
2192 * If there is nothing to run, or the CPU is in the middle of a
2193 * context switch of the only thread, return NULL.
2194 */
2195 tcp = dp->disp_cpu;
2196 cp = CPU;
2197 pri = dp->disp_max_unbound_pri;
2198 if (pri == -1 ||
2199 (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2200 tcp->cpu_disp->disp_nrunnable == 1)) {
2201 disp_lock_exit_nopreempt(&dp->disp_lock);
2202 return (NULL);
2203 }
2204
2205 dq = &dp->disp_q[pri];
2206
2207
2208 /*
2209 * Assume that all threads are bound on this queue, and change it
2210 * later when we find out that it is not the case.
2211 */
2212 allbound = B_TRUE;
2213 for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2214 hrtime_t now, nosteal, rqtime;
2215
2216 /*
2217 * Skip over bound threads which could be here even
2218 * though disp_max_unbound_pri indicated this level.
2219 */
2220 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2221 continue;
2222
2223 /*
2224 * We've got some unbound threads on this queue, so turn
2225 * the allbound flag off now.
2226 */
2227 allbound = B_FALSE;
2228
2229 /*
2230 * The thread is a candidate for stealing from its run queue. We
2231 * don't want to steal threads that became runnable just a
2232 * moment ago. This improves CPU affinity for threads that get
2233 * preempted for short periods of time and go back on the run
2234 * queue.
2235 *
2236 * We want to let it stay on its run queue if it was only placed
2237 * there recently and it was running on the same CPU before that
2238 * to preserve its cache investment. For the thread to remain on
2239 * its run queue, ALL of the following conditions must be
2240 * satisfied:
2241 *
2242 * - the disp queue should not be the kernel preemption queue
2243 * - delayed idle stealing should not be disabled
2244 * - nosteal_nsec should be non-zero
2245 * - it should run with user priority
2246 * - it should be on the run queue of the CPU where it was
2247 * running before being placed on the run queue
2248 * - it should be the only thread on the run queue (to prevent
2249 * extra scheduling latency for other threads)
2250 * - it should sit on the run queue for less than per-chip
2251 * nosteal interval or global nosteal interval
2252 * - in case of CPUs with shared cache it should sit in a run
2253 * queue of a CPU from a different chip
2254 *
2255 * The checks are arranged so that the ones that are faster are
2256 * placed earlier.
2257 */
2258 if (tcp == NULL ||
2259 pri >= minclsyspri ||
2260 tp->t_cpu != tcp)
2261 break;
2262
2263 /*
2264 * Steal immediately if, due to the CMT processor architecture,
2265 * migration between cp and tcp would incur no performance
2266 * penalty.
2267 */
2268 if (pg_cmt_can_migrate(cp, tcp))
2269 break;
2270
2271 nosteal = nosteal_nsec;
2272 if (nosteal == 0)
2273 break;
2274
2275 /*
2276 * Calculate time spent sitting on run queue
2277 */
2278 now = gethrtime_unscaled();
2279 rqtime = now - tp->t_waitrq;
2280 scalehrtime(&rqtime);
2281
2282 /*
2283 * Steal immediately if the time spent on this run queue is more
2284 * than allowed nosteal delay.
2285 *
2286 * Negative rqtime check is needed here to avoid infinite
2287 * stealing delays caused by unlikely but not impossible
2288 * drifts between CPU times on different CPUs.
2289 */
2290 if (rqtime > nosteal || rqtime < 0)
2291 break;
2292
2293 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2294 cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2295 scalehrtime(&now);
2296 /*
2297 * Calculate when this thread becomes stealable
2298 */
2299 now += (nosteal - rqtime);
2300
2301 /*
2302 * Calculate time when some thread becomes stealable
2303 */
2304 if (now < dp->disp_steal)
2305 dp->disp_steal = now;
2306 }
2307
2308 /*
2309 * If there were no unbound threads on this queue, find the queue
2310 * where they are and then return later. The value of
2311 * disp_max_unbound_pri is not always accurate because it isn't
2312 * reduced until another idle CPU looks for work.
2313 */
2314 if (allbound)
2315 disp_fix_unbound_pri(dp, pri);
2316
2317 /*
2318 * If we reached the end of the queue and found no unbound threads
2319 * then return NULL so that other CPUs will be considered. If there
2320 * are unbound threads but they cannot yet be stolen, then
2321 * return T_DONTSTEAL and try again later.
2322 */
2323 if (tp == NULL) {
2324 disp_lock_exit_nopreempt(&dp->disp_lock);
2325 return (allbound ? NULL : T_DONTSTEAL);
2326 }
2327
2328 /*
2329 * Found a runnable, unbound thread, so remove it from queue.
2330 * dispdeq() requires that we have the thread locked, and we do,
2331 * by virtue of holding the dispatch queue lock. dispdeq() will
2332 * put the thread in transition state, thereby dropping the dispq
2333 * lock.
2334 */
2335
2336 #ifdef DEBUG
2337 {
2338 int thread_was_on_queue;
2339
2340 thread_was_on_queue = dispdeq(tp); /* drops disp_lock */
2341 ASSERT(thread_was_on_queue);
2342 }
2343
2344 #else /* DEBUG */
2345 (void) dispdeq(tp); /* drops disp_lock */
2346 #endif /* DEBUG */
2347
2348 /*
2349 * Reset the disp_queue steal time - we do not know what the smallest
2350 * value across the queue is.
2351 */
2352 dp->disp_steal = 0;
2353
2354 tp->t_schedflag |= TS_DONT_SWAP;
2355
2356 /*
2357 * Setup thread to run on the current CPU.
2358 */
2359 tp->t_disp_queue = cp->cpu_disp;
2360
2361 cp->cpu_dispthread = tp; /* protected by spl only */
2362 cp->cpu_dispatch_pri = pri;
2363
2364 /*
2365 * There can be a memory synchronization race between disp_getbest()
2366 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2367 * to preempt the current thread to run the enqueued thread while
2368 * disp_getbest() and disp_ratify() are changing the current thread
2369 * to the stolen thread. This may lead to a situation where
2370 * cpu_resched() tries to preempt the wrong thread and the
2371 * stolen thread continues to run on the CPU which has been tagged
2372 * for preemption.
2373 * Later the clock thread gets enqueued but doesn't get to run on the
2374 * CPU causing the system to hang.
2375 *
2376 * To avoid this, grabbing and dropping the disp_lock (which does
2377 * a memory barrier) is needed to synchronize the execution of
2378 * cpu_resched() with disp_getbest() and disp_ratify() and
2379 * synchronize the memory read and written by cpu_resched(),
2380 * disp_getbest(), and disp_ratify() with each other.
2381 * (see CR#6482861 for more details).
2382 */
2383 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2384 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2385
2386 ASSERT(pri == DISP_PRIO(tp));
2387
2388 DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2389
2390 thread_onproc(tp, cp); /* set t_state to TS_ONPROC */
2391
2392 /*
2393 * Return with spl high so that swtch() won't need to raise it.
2394 * The disp_lock was dropped by dispdeq().
2395 */
2396
2397 return (tp);
2398 }
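
/*
 * A sketch (under #if 0, hypothetical model_* names) of the nosteal-delay
 * decision made in the loop above: a thread that has sat on its previous
 * CPU's run queue for less than the nosteal interval is left in place, and
 * the queue records the earliest time it is worth revisiting.  As in the
 * dispatcher, a zero mq_steal means no delay has been recorded.
 */
#if 0
typedef long long model_hrtime_t;	/* nanoseconds */

typedef struct model_queue {
	model_hrtime_t	mq_steal;	/* models disp_steal */
} model_queue_t;

static int	/* nonzero: steal now; zero: leave the thread in place */
model_can_steal(model_queue_t *mq, model_hrtime_t now,
    model_hrtime_t waitrq, model_hrtime_t nosteal)
{
	model_hrtime_t rqtime = now - waitrq;
	model_hrtime_t stealable_at;

	/* The negative-rqtime case guards against clock drift between CPUs. */
	if (nosteal == 0 || rqtime > nosteal || rqtime < 0)
		return (1);

	stealable_at = now + (nosteal - rqtime);
	if (stealable_at < mq->mq_steal)
		mq->mq_steal = stealable_at;
	return (0);
}
#endif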
2399
2400 /*
2401 * disp_bound_common() - common routine for higher level functions
2402 * that check for bound threads under certain conditions.
2403 * If 'threadlistsafe' is set then there is no need to acquire
2404 * pidlock to stop the thread list from changing (e.g., if
2405 * disp_bound_* is called with cpus paused).
2406 */
2407 static int
2408 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2409 {
2410 int found = 0;
2411 kthread_t *tp;
2412
2413 ASSERT(flag);
2414
2415 if (!threadlistsafe)
2416 mutex_enter(&pidlock);
2417 tp = curthread; /* faster than allthreads */
2418 do {
2419 if (tp->t_state != TS_FREE) {
2420 /*
2421 * If an interrupt thread is busy, but the
2422 * caller doesn't care (i.e. BOUND_INTR is off),
2423 * then just ignore it and continue through.
2424 */
2425 if ((tp->t_flag & T_INTR_THREAD) &&
2426 !(flag & BOUND_INTR))
2427 continue;
2428
2429 /*
2430 * Skip the idle thread for the CPU
2431 * we're about to set offline.
2432 */
2433 if (tp == cp->cpu_idle_thread)
2434 continue;
2435
2436 /*
2437 * Skip the pause thread for the CPU
2438 * we're about to set offline.
2439 */
2440 if (tp == cp->cpu_pause_thread)
2441 continue;
2442
2443 if ((flag & BOUND_CPU) &&
2444 (tp->t_bound_cpu == cp ||
2445 tp->t_bind_cpu == cp->cpu_id ||
2446 tp->t_weakbound_cpu == cp)) {
2447 found = 1;
2448 break;
2449 }
2450
2451 if ((flag & BOUND_PARTITION) &&
2452 (tp->t_cpupart == cp->cpu_part)) {
2453 found = 1;
2454 break;
2455 }
2456 }
2457 } while ((tp = tp->t_next) != curthread && found == 0);
2458 if (!threadlistsafe)
2459 mutex_exit(&pidlock);
2460 return (found);
2461 }
2462
2463 /*
2464 * disp_bound_threads - return nonzero if threads are bound to the processor.
2465 * Called infrequently. Keep this simple.
2466 * Includes threads that are asleep or stopped but not onproc.
2467 */
2468 int
2469 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2470 {
2471 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2472 }
2473
2474 /*
2475 * disp_bound_anythreads - return nonzero if _any_ threads are bound
2476 * to the given processor, including interrupt threads.
2477 */
2478 int
2479 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2480 {
2481 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2482 }
2483
2484 /*
2485 * disp_bound_partition - return nonzero if threads are bound to the same
2486 * partition as the processor.
2487 * Called infrequently. Keep this simple.
2488 * Includes threads that are asleep or stopped but not onproc.
2489 */
2490 int
2491 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2492 {
2493 return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2494 }
2495
2496 /*
2497 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2498 * threads to other CPUs.
2499 */
2500 void
2501 disp_cpu_inactive(cpu_t *cp)
2502 {
2503 kthread_t *tp;
2504 disp_t *dp = cp->cpu_disp;
2505 dispq_t *dq;
2506 pri_t pri;
2507 int wasonq;
2508
2509 disp_lock_enter(&dp->disp_lock);
2510 while ((pri = dp->disp_max_unbound_pri) != -1) {
2511 dq = &dp->disp_q[pri];
2512 tp = dq->dq_first;
2513
2514 /*
2515 * Skip over bound threads.
2516 */
2517 while (tp != NULL && tp->t_bound_cpu != NULL) {
2518 tp = tp->t_link;
2519 }
2520
2521 if (tp == NULL) {
2522 /* disp_max_unbound_pri must be inaccurate, so fix it */
2523 disp_fix_unbound_pri(dp, pri);
2524 continue;
2525 }
2526
2527 wasonq = dispdeq(tp); /* drops disp_lock */
2528 ASSERT(wasonq);
2529 ASSERT(tp->t_weakbound_cpu == NULL);
2530
2531 setbackdq(tp);
2532 /*
2533 * Called from cpu_offline:
2534 *
2535 * cp has already been removed from the list of active cpus
2536 * and tp->t_cpu has been changed so there is no risk of
2537 * tp ending up back on cp.
2538 *
2539 * Called from cpupart_move_cpu:
2540 *
2541 * The cpu has moved to a new cpupart. Any threads that
2542 * were on its dispatch queues before the move remain
2543 * in the old partition and can't run in the new partition.
2544 */
2545 ASSERT(tp->t_cpu != cp);
2546 thread_unlock(tp);
2547
2548 disp_lock_enter(&dp->disp_lock);
2549 }
2550 disp_lock_exit(&dp->disp_lock);
2551 }
2552
2553 /*
2554 * Return a score rating this CPU for running this thread: lower is better.
2555 *
2556 * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
2557 * curcpu (as that's our own priority).
2558 *
2559 * If a cpu is the target of an offline request, then try to avoid it.
2560 *
2561 * Otherwise we'll use double the effective dispatcher priority for the CPU.
2562 *
2563 * We do this so smt_adjust_cpu_score() can increment the score if needed,
2564 * without ending up overriding a dispatcher priority.
2565 */
2566 static pri_t
2567 cpu_score(cpu_t *cp, kthread_t *tp)
2568 {
2569 pri_t score;
2570
2571 if (tp == curthread && cp == curthread->t_cpu)
2572 score = 2 * CPU_IDLE_PRI;
2573 else if (cp == cpu_inmotion)
2574 score = SHRT_MAX;
2575 else
2576 score = 2 * cp->cpu_dispatch_pri;
2577
2578 if (2 * cp->cpu_disp->disp_maxrunpri > score)
2579 score = 2 * cp->cpu_disp->disp_maxrunpri;
2580 if (2 * cp->cpu_chosen_level > score)
2581 score = 2 * cp->cpu_chosen_level;
2582
2583 return (smt_adjust_cpu_score(tp, cp, score));
2584 }
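
/*
 * A trivial sketch (under #if 0) of why the score is doubled: doubling
 * leaves room for smt_adjust_cpu_score() to add a small penalty (assumed
 * here to be +1) that breaks ties between otherwise equal CPUs without
 * ever outweighing a genuine one-level difference in dispatch priority.
 * A penalized CPU at priority N scores 2N + 1, which still beats an
 * unpenalized CPU at priority N + 1, which scores 2N + 2.
 */
#if 0
static int
model_score(int dispatch_pri, int smt_penalty)
{
	return (2 * dispatch_pri + smt_penalty);
}
#endif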
2585
2586 /*
2587 * disp_lowpri_cpu - find a suitable CPU to run the given thread.
2588 *
2589 * We are looking for a CPU with an effective dispatch priority lower than the
2590 * thread's, so that the thread will run immediately rather than be enqueued.
2591 * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
2592 * If we don't find an available CPU there, we will expand our search to include
2593 * wider locality levels. (Note these groups are already divided by CPU
2594 * partition.)
2595 *
2596 * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
2597 * the best home CPU we found.
2598 *
2599 * The hint passed in is used as a starting point so we don't favor CPU 0 or any
2600 * other CPU. The caller should pass in the most recently used CPU for the
2601 * thread; it's of course possible that this CPU isn't in the home lgroup.
2602 *
2603 * This function must be called either at high SPL or with preemption disabled,
2604 * so that the "hint" CPU cannot be removed from the online CPU list while we
2605 * are traversing it.
2606 */
2607 cpu_t *
2608 disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
2609 {
2610 cpu_t *bestcpu;
2611 cpu_t *besthomecpu;
2612 cpu_t *cp, *cpstart;
2613
2614 klgrpset_t done;
2615
2616 lpl_t *lpl_iter, *lpl_leaf;
2617
2618 ASSERT(hint != NULL);
2619 ASSERT(tp->t_lpl->lpl_ncpu > 0);
2620
2621 bestcpu = besthomecpu = NULL;
2622 klgrpset_clear(done);
2623
2624 lpl_iter = tp->t_lpl;
2625
2626 do {
2627 pri_t best = SHRT_MAX;
2628 klgrpset_t cur_set;
2629
2630 klgrpset_clear(cur_set);
2631
2632 for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
2633 lpl_leaf = lpl_iter->lpl_rset[i];
2634 if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2635 continue;
2636
2637 klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2638
2639 if (hint->cpu_lpl == lpl_leaf)
2640 cp = cpstart = hint;
2641 else
2642 cp = cpstart = lpl_leaf->lpl_cpus;
2643
2644 do {
2645 pri_t score = cpu_score(cp, tp);
2646
2647 if (score < best) {
2648 best = score;
2649 bestcpu = cp;
2650
2651 /* An idle CPU: we're done. */
2652 if (score / 2 == CPU_IDLE_PRI)
2653 goto out;
2654 }
2655 } while ((cp = cp->cpu_next_lpl) != cpstart);
2656 }
2657
2658 if (bestcpu != NULL && tpri > (best / 2))
2659 goto out;
2660
2661 if (besthomecpu == NULL)
2662 besthomecpu = bestcpu;
2663
2664 /*
2665 * Add the lgrps we just considered to the "done" set
2666 */
2667 klgrpset_or(done, cur_set);
2668
2669 } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2670
2671 /*
2672 * The specified priority isn't high enough to run immediately
2673 * anywhere, so just return the best CPU from the home lgroup.
2674 */
2675 bestcpu = besthomecpu;
2676
2677 out:
2678 ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2679 return (bestcpu);
2680 }
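
/*
 * A condensed sketch (under #if 0, hypothetical model_* names) of the
 * per-level scan above: score only leaves not already covered at a
 * narrower locality level, keep the lowest score seen so far, and stop
 * widening once the thread's priority beats the best CPU's effective
 * dispatch priority (score / 2), since it would then run there
 * immediately instead of being enqueued.
 */
#if 0
#define	MODEL_NO_SCORE	0x7fff		/* mirrors the SHRT_MAX sentinel */

typedef struct model_leaf {
	int	ml_lgrpid;
	int	ml_best_score;		/* best cpu_score() among its CPUs */
} model_leaf_t;

static int	/* nonzero: stop widening, the thread runs immediately */
model_scan_level(const model_leaf_t *leaves, int nleaves,
    const unsigned char *done, int tpri, int *best_score)
{
	int i;

	for (i = 0; i < nleaves; i++) {
		if (done[leaves[i].ml_lgrpid])
			continue;	/* covered at a narrower level */
		if (leaves[i].ml_best_score < *best_score)
			*best_score = leaves[i].ml_best_score;
	}

	return (*best_score != MODEL_NO_SCORE && tpri > *best_score / 2);
}
#endif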
2681
2682 /*
2683 * This routine provides the generic idle cpu function for all processors.
2684 * If a processor has some specific code to execute when idle (say, to stop
2685 * the pipeline and save power) then that routine should be defined in the
2686 * processors specific code (module_xx.c) and the global variable idle_cpu
2687 * set to that function.
2688 */
2689 static void
2690 generic_idle_cpu(void)
2691 {
2692 }
2693
2694 /*ARGSUSED*/
2695 static void
2696 generic_enq_thread(cpu_t *cpu, int bound)
2697 {
2698 }
2699
2700 cpu_t *
2701 disp_choose_best_cpu(void)
2702 {
2703 kthread_t *t = curthread;
2704 cpu_t *curcpu = CPU;
2705
2706 ASSERT(t->t_preempt > 0);
2707 ASSERT(t->t_state == TS_ONPROC);
2708 ASSERT(t->t_schedflag & TS_VCPU);
2709
2710 if (smt_should_run(t, curcpu))
2711 return (curcpu);
2712
2713 return (disp_lowpri_cpu(curcpu, t, t->t_pri));
2714 }
2715