/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/


#pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysinfo.h>
#include <sys/var.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/inline.h>
#include <sys/disp.h>
#include <sys/class.h>
#include <sys/bitmap.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/chip.h>
#include <sys/schedctl.h>
#include <sys/atomic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>

#include <vm/as.h>

#define	BOUND_CPU	0x1
#define	BOUND_PARTITION	0x2
#define	BOUND_INTR	0x4

/* Dispatch queue allocation structure and functions */
struct disp_queue_info {
	disp_t	*dp;
	dispq_t	*olddispq;
	dispq_t	*newdispq;
	ulong_t	*olddqactmap;
	ulong_t	*newdqactmap;
	int	oldnglobpris;
};
static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void	disp_dq_free(struct disp_queue_info *dptr);

/* platform-specific routine to call when processor is idle */
static void	generic_idle_cpu();
void		(*idle_cpu)() = generic_idle_cpu;

/* routines invoked when a CPU enters/exits the idle loop */
static void	idle_enter();
static void	idle_exit();

/* platform-specific routine to call when thread is enqueued */
static void	generic_enq_thread(cpu_t *, int);
void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;

pri_t	kpreemptpri;		/* priority where kernel preemption applies */
pri_t	upreemptpri = 0;	/* priority where normal preemption applies */
pri_t	intr_pri;		/* interrupt thread priority base level */

#define	KPQPRI	-1	/* priority where cpu affinity is dropped for kp queue */
pri_t	kpqpri = KPQPRI;	/* can be set in /etc/system */
disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
int	nswapped;		/* total number of swapped threads */
void	disp_swapped_enq(kthread_t *tp);
static void	disp_swapped_setrun(kthread_t *tp);
static void	cpu_resched(cpu_t *cp, pri_t tpri);

/*
 * If this is set, only interrupt threads will cause kernel preemptions.
 * This is done by changing the value of kpreemptpri.  kpreemptpri
 * will either be the max sysclass pri + 1 or the min interrupt pri.
 */
int	only_intr_kpreempt;

extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
static void setkpdq(kthread_t *tp, int borf);
#define	SETKP_BACK	0
#define	SETKP_FRONT	1
/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.  The interval is in hertz.
 *
 * The platform may define a per physical processor adjustment of
 * this parameter.  For efficiency, the effective rechoose interval
 * (rechoose_interval + per chip adjustment) is maintained in the
 * cpu structures.  See cpu_choose()
 */
int	rechoose_interval = RECHOOSE_INTERVAL;

static cpu_t	*cpu_choose(kthread_t *, pri_t);

id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */

disp_lock_t	transition_lock;	/* lock on transitioning threads */
disp_lock_t	stop_lock;		/* lock on stopped threads */

static void	cpu_dispqalloc(int numpris);

static kthread_t	*disp_getwork(cpu_t *to);
static kthread_t	*disp_getbest(disp_t *from);
static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);

void	swtch_to(kthread_t *);

/*
 * dispatcher and scheduler initialization
 */

/*
 * disp_setup - Common code to calculate and allocate dispatcher
 *		variables and structures based on the maximum priority.
 */
static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
{
	pri_t	newnglobpris;

	ASSERT(MUTEX_HELD(&cpu_lock));

	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;

	if (newnglobpris > oldnglobpris) {
		/*
		 * Allocate new kp queues for each CPU partition.
		 */
		cpupart_kpqalloc(newnglobpris);

		/*
		 * Allocate new dispatch queues for each CPU.
		 */
		cpu_dispqalloc(newnglobpris);

		/*
		 * compute new interrupt thread base priority
		 */
		intr_pri = maxglobpri;
		if (only_intr_kpreempt) {
			kpreemptpri = intr_pri + 1;
			if (kpqpri == KPQPRI)
				kpqpri = kpreemptpri;
		}
		v.v_nglobpris = newnglobpris;
	}
}
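
/*
 * Editorial note on the global priority space computed above: the
 * scheduling classes supply priorities 0..maxglobpri, and disp_setup()
 * sizes the queues as newnglobpris = maxglobpri + 1 + LOCK_LEVEL, i.e.
 * LOCK_LEVEL extra slots are reserved above any class priority so that
 * interrupt threads can be assigned priorities above them; intr_pri
 * records the base of that range.  v_nglobpris is the array size used
 * for every dispatch queue allocated below.
 */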

/*
 * dispinit - Called to initialize all loaded classes and the
 *	      dispatcher framework.
 */
void
dispinit(void)
{
	id_t	cid;
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	maxglobpri = -1;

	/*
	 * Initialize transition lock, which will always be set.
	 */
	DISP_LOCK_INIT(&transition_lock);
	disp_lock_enter_high(&transition_lock);
	DISP_LOCK_INIT(&stop_lock);

	mutex_enter(&cpu_lock);
	CPU->cpu_disp->disp_maxrunpri = -1;
	CPU->cpu_disp->disp_max_unbound_pri = -1;
	/*
	 * Initialize the default CPU partition.
	 */
	cpupart_initialize_default();
	/*
	 * Call the class specific initialization functions for
	 * all pre-installed schedulers.
	 *
	 * We pass the size of a class specific parameter
	 * buffer to each of the initialization functions
	 * to try to catch problems with backward compatibility
	 * of class modules.
	 *
	 * For example, a new class module running on an old system
	 * which didn't provide sufficiently large parameter buffers
	 * would be bad news.  Class initialization modules can check for
	 * this and take action if they detect a problem.
	 */

	for (cid = 0; cid < nclass; cid++) {
		sclass_t	*sc;

		sc = &sclass[cid];
		if (SCHED_INSTALLED(sc)) {
			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
			    &sc->cl_funcs);
			if (cl_maxglobpri > maxglobpri)
				maxglobpri = cl_maxglobpri;
		}
	}
	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
	if (kpqpri == KPQPRI)
		kpqpri = kpreemptpri;

	ASSERT(maxglobpri >= 0);
	disp_setup(maxglobpri, 0);

	mutex_exit(&cpu_lock);

	/*
	 * Get the default class ID; this may be later modified via
	 * dispadmin(1M).  This will load the class (normally TS) and that will
	 * call disp_add(), which is why we had to drop cpu_lock first.
	 */
	if (getcid(defaultclass, &defaultcid) != 0) {
		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
		    defaultclass);
	}
}

/*
 * disp_add - Called with class pointer to initialize the dispatcher
 *	      for a newly loaded class.
 */
void
disp_add(sclass_t *clp)
{
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	mutex_enter(&cpu_lock);
	/*
	 * Initialize the scheduler class.
	 */
	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
	if (cl_maxglobpri > maxglobpri)
		maxglobpri = cl_maxglobpri;

	/*
	 * Save old queue information.  Since we're initializing a
	 * new scheduling class which has just been loaded, the size
	 * of the dispq may have changed.  We need to handle that here.
	 */
	disp_setup(maxglobpri, v.v_nglobpris);

	mutex_exit(&cpu_lock);
}
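
/*
 * Editorial note: cpu_dispqalloc() below resizes every per-CPU dispatch
 * queue in three phases: all replacement arrays are allocated up front
 * with KM_SLEEP, the CPUs are paused while disp_dq_assign() swaps the
 * queue pointers, and the old arrays are freed only after start_cpus().
 * This keeps any sleeping allocation or free out of the window in which
 * the CPUs are stopped.
 */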

/*
 * For each CPU, allocate new dispatch queues
 * with the stated number of priorities.
 */
static void
cpu_dispqalloc(int numpris)
{
	cpu_t	*cpup;
	struct disp_queue_info	*disp_mem;
	int i, num;

	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_mem = kmem_zalloc(NCPU *
	    sizeof (struct disp_queue_info), KM_SLEEP);

	/*
	 * This routine must allocate all of the memory before stopping
	 * the cpus because it must not sleep in kmem_alloc while the
	 * CPUs are stopped.  Locks they hold will not be freed until they
	 * are restarted.
	 */
	i = 0;
	cpup = cpu_list;
	do {
		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
		i++;
		cpup = cpup->cpu_next;
	} while (cpup != cpu_list);
	num = i;

	pause_cpus(NULL);
	for (i = 0; i < num; i++)
		disp_dq_assign(&disp_mem[i], numpris);
	start_cpus();

	/*
	 * I must free all of the memory after starting the cpus because
	 * I can not risk sleeping in kmem_free while the cpus are stopped.
	 */
	for (i = 0; i < num; i++)
		disp_dq_free(&disp_mem[i]);

	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
}

static void
disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
{
	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dptr->dp = dp;
}

static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
	disp_t	*dp;

	dp = dptr->dp;
	dptr->olddispq = dp->disp_q;
	dptr->olddqactmap = dp->disp_qactmap;
	dptr->oldnglobpris = dp->disp_npri;

	ASSERT(dptr->oldnglobpris < numpris);

	if (dptr->olddispq != NULL) {
		/*
		 * Use kcopy because bcopy is platform-specific
		 * and could block while we might have paused the cpus.
		 */
		(void) kcopy(dptr->olddispq, dptr->newdispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
		    sizeof (long));
	}
	dp->disp_q = dptr->newdispq;
	dp->disp_qactmap = dptr->newdqactmap;
	dp->disp_q_limit = &dptr->newdispq[numpris];
	dp->disp_npri = numpris;
}

static void
disp_dq_free(struct disp_queue_info *dptr)
{
	if (dptr->olddispq != NULL)
		kmem_free(dptr->olddispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
	if (dptr->olddqactmap != NULL)
		kmem_free(dptr->olddqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
}

/*
 * For a newly created CPU, initialize the dispatch queue.
 * This is called before the CPU is known through cpu[] or on any lists.
 */
void
disp_cpu_init(cpu_t *cp)
{
	disp_t	*dp;
	dispq_t	*newdispq;
	ulong_t	*newdqactmap;

	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */

	if (cp == cpu0_disp.disp_cpu)
		dp = &cpu0_disp;
	else
		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
	bzero(dp, sizeof (disp_t));
	cp->cpu_disp = dp;
	dp->disp_cpu = cp;
	dp->disp_maxrunpri = -1;
	dp->disp_max_unbound_pri = -1;
	DISP_LOCK_INIT(&cp->cpu_thread_lock);
	/*
	 * Allocate memory for the dispatcher queue headers
	 * and the active queue bitmap.
	 */
	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dp->disp_q = newdispq;
	dp->disp_qactmap = newdqactmap;
	dp->disp_q_limit = &newdispq[v.v_nglobpris];
	dp->disp_npri = v.v_nglobpris;
}

void
disp_cpu_fini(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_kp_free(cp->cpu_disp);
	if (cp->cpu_disp != &cpu0_disp)
		kmem_free(cp->cpu_disp, sizeof (disp_t));
}

/*
 * Allocate new, larger kpreempt dispatch queue to replace the old one.
 */
void
disp_kp_alloc(disp_t *dq, pri_t npri)
{
	struct disp_queue_info	mem_info;

	if (npri > dq->disp_npri) {
		/*
		 * Allocate memory for the new array.
		 */
		disp_dq_alloc(&mem_info, npri, dq);

		/*
		 * We need to copy the old structures to the new
		 * and free the old.
		 */
		disp_dq_assign(&mem_info, npri);
		disp_dq_free(&mem_info);
	}
}
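
/*
 * Editorial note on disp_qactmap sizing used above: the active queue
 * bitmap keeps one bit per priority level, rounded up to whole words,
 * hence the ((npri / BT_NBIPUL) + 1) * sizeof (long) allocations
 * (BT_NBIPUL is the number of bits per ulong_t, from <sys/bitmap.h>).
 * disp_maxrunpri and disp_max_unbound_pri are recomputed from this map
 * with bt_gethighbit() as queues drain; see disp() and dispdeq() below.
 */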

/*
 * Free dispatch queue.
 * Used for the kpreempt queues for a removed CPU partition and
 * for the per-CPU queues of deleted CPUs.
 */
void
disp_kp_free(disp_t *dq)
{
	struct disp_queue_info	mem_info;

	mem_info.olddispq = dq->disp_q;
	mem_info.olddqactmap = dq->disp_qactmap;
	mem_info.oldnglobpris = dq->disp_npri;
	disp_dq_free(&mem_info);
}

/*
 * End dispatcher and scheduler initialization.
 */

/*
 * See if there's anything to do other than remain idle.
 * Return non-zero if there is.
 *
 * This function must be called with high spl, or with
 * kernel preemption disabled to prevent the partition's
 * active cpu list from changing while being traversed.
 *
 */
int
disp_anywork(void)
{
	cpu_t	*cp = CPU;
	cpu_t	*ocp;

	if (cp->cpu_disp->disp_nrunnable != 0)
		return (1);

	if (!(cp->cpu_flags & CPU_OFFLINE)) {
		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
			return (1);

		/*
		 * Work can be taken from another CPU if:
		 *	- There is unbound work on the run queue
		 *	- That work isn't a thread undergoing a
		 *	  context switch on an otherwise empty queue.
		 *	- The CPU isn't running the idle loop.
		 */
		for (ocp = cp->cpu_next_part; ocp != cp;
		    ocp = ocp->cpu_next_part) {
			ASSERT(CPU_ACTIVE(ocp));

			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
			    ocp->cpu_disp->disp_nrunnable == 1) &&
			    ocp->cpu_dispatch_pri != -1)
				return (1);
		}
	}
	return (0);
}

/*
 * Called when CPU enters the idle loop
 */
static void
idle_enter()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Called when CPU exits the idle loop
 */
static void
idle_exit()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Idle loop.
 */
void
idle()
{
	struct cpu	*cp = CPU;		/* pointer to this CPU */
	kthread_t	*t;			/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			idle_exit();
			swtch();
		} else {
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			idle_exit();
			restore_mstate(t);
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}

/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues.  If panicking, turn
 * preemption off.
 */
void
preempt()
{
	kthread_t 	*t = curthread;
	klwp_t 		*lwp = ttolwp(curthread);

	if (panicstr)
		return;

	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

	thread_lock(t);

	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
		/*
		 * This thread has already been chosen to be run on
		 * another CPU.  Clear kprunrun on this CPU since we're
		 * already headed for swtch().
		 */
		CPU->cpu_kprunrun = 0;
		thread_unlock_nopreempt(t);
		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
	} else {
		if (lwp != NULL)
			lwp->lwp_ru.nivcsw++;
		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
		THREAD_TRANSITION(t);
		CL_PREEMPT(t);
		DTRACE_SCHED(preempt);
		thread_unlock_nopreempt(t);

		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

		swtch();		/* clears CPU->cpu_runrun via disp() */
	}
}
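
/*
 * Editorial note: cpu_runrun and cpu_kprunrun are the per-CPU flags that
 * request normal (user-level) and kernel preemption respectively.  They
 * are set by cpu_resched() and cpu_surrender() below when a higher
 * priority thread becomes runnable, and are cleared by disp() and
 * disp_ratify() once a new dispatch decision has been committed (see the
 * upreemptpri/kpreemptpri declarations near the top of this file).
 */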

extern kthread_t *thread_unpin();

/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			restore_mstate(tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL) {
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		restore_mstate(tp);
		return (tp);
	}

	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.  If disp_nrunnable != 0, we just
		 * took the last runnable thread off the highest-priority
		 * queue, so recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	restore_mstate(tp);
	return (tp);
}

/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Set up and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			if (t == cp->cpu_idle_thread) {
				CHIP_NRUNNING(cp->cpu_chip, 1);
			} else if (next == cp->cpu_idle_thread) {
				CHIP_NRUNNING(cp->cpu_chip, -1);
			}

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			cp->cpu_last_swtch = t->t_disp_time = lbolt;
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}

/*
 * swtch_from_zombie()
 *	Special case of swtch(), which allows checks for TS_ZOMB to be
 *	eliminated from normal resume.
 *	Find best runnable thread and run it.
 *	Called with the current thread zombied.
 *	Zombies cannot migrate, so CPU references are safe.
 */
void
swtch_from_zombie()
{
	kthread_t	*next;
	cpu_t		*cpu = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	ASSERT(curthread->t_state == TS_ZOMB);

	next = disp();			/* returns with spl high */
	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
	ASSERT(next != curthread);
	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	if (next == cpu->cpu_idle_thread)
		CHIP_NRUNNING(cpu->cpu_chip, -1);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume_from_zombie(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we certainly will not
	 * return here
	 */
}

#if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
static int
thread_on_queue(kthread_t *tp)
{
	cpu_t	*cp;
	cpu_t	*self;
	disp_t	*dp;

	self = CPU;
	cp = self->cpu_next_onln;
	dp = cp->cpu_disp;
	for (;;) {
		dispq_t		*dq;
		dispq_t		*eq;

		disp_lock_enter_high(&dp->disp_lock);
		for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
			kthread_t	*rp;

			ASSERT(dq->dq_last == NULL ||
			    dq->dq_last->t_link == NULL);
			for (rp = dq->dq_first; rp; rp = rp->t_link)
				if (tp == rp) {
					disp_lock_exit_high(&dp->disp_lock);
					return (1);
				}
		}
		disp_lock_exit_high(&dp->disp_lock);
		if (cp == NULL)
			break;
		if (cp == self) {
			/*
			 * Wrapped around the online CPU list; check the
			 * partition-wide kp queue last.  Take the queue
			 * pointer before clearing cp so that the next
			 * pass terminates the loop.
			 */
			dp = &cp->cpu_part->cp_kp_queue;
			cp = NULL;
		} else {
			cp = cp->cpu_next_onln;
			dp = cp->cpu_disp;
		}
	}
	return (0);
}	/* end of thread_on_queue */
#else

#define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */

#endif	/* DEBUG */

/*
 * like swtch(), but switch to a specified thread taken from another CPU.
 *	called with spl high.
 */
void
swtch_to(kthread_t *next)
{
	cpu_t			*cp = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	/*
	 * Update context switch statistics.
	 */
	CPU_STATS_ADDQ(cp, sys, pswitch, 1);

	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	if (curthread == cp->cpu_idle_thread)
		CHIP_NRUNNING(cp->cpu_chip, 1);

	/* OK to steal anything left on run queue */
	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

	/* record last execution time */
	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we may not
	 * return here
	 */
}
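
/*
 * Editorial note on the two preemption levels handled in cpu_resched()
 * below: when the newly runnable thread's priority is at least
 * upreemptpri, the target CPU's cpu_runrun is set and an AST is posted
 * to its dispatched thread (aston()); when it is at least kpreemptpri,
 * cpu_kprunrun is additionally set to request a kernel preemption.
 * poke_cpu() is used only when the target is a different CPU, so that
 * it notices the flags promptly.
 */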

#define	CPU_IDLING(pri)	((pri) == -1)

static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
	int	call_poke_cpu = 0;
	pri_t   cpupri = cp->cpu_dispatch_pri;

	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
			cp->cpu_runrun = 1;
			aston(cp->cpu_dispthread);
			if (tpri < kpreemptpri && cp != CPU)
				call_poke_cpu = 1;
		}
		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
			cp->cpu_kprunrun = 1;
			if (cp != CPU)
				call_poke_cpu = 1;
		}
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	if (call_poke_cpu)
		poke_cpu(cp->cpu_id);
}

/*
 * Routine used by setbackdq() to balance load across the physical
 * processors.  Returns a CPU of a lesser loaded chip in the lgroup
 * if balancing is necessary, or the "hint" CPU if it's not.
 *
 * - tp is the thread being enqueued
 * - cp is a hint CPU (chosen by cpu_choose()).
 * - curchip (if not NULL) is the chip on which the current thread
 *   is running.
 *
 * The thread lock for "tp" must be held while calling this routine.
 */
static cpu_t *
chip_balance(kthread_t *tp, cpu_t *cp, chip_t *curchip)
{
	int	chp_nrun, ochp_nrun;
	chip_t	*chp, *nchp;

	chp = cp->cpu_chip;
	chp_nrun = chp->chip_nrunning;

	if (chp == curchip)
		chp_nrun--;	/* Ignore curthread */

	/*
	 * If this chip isn't at all idle, then let
	 * run queue balancing do the work.
	 */
	if (chp_nrun == chp->chip_ncpu)
		return (cp);

	nchp = chp->chip_balance;
	do {
		if (nchp == chp ||
		    !CHIP_IN_CPUPART(nchp, tp->t_cpupart))
			continue;

		ochp_nrun = nchp->chip_nrunning;

		/*
		 * If the other chip is running fewer threads,
		 * or if it's running the same number of threads, but
		 * has more online logical CPUs, then choose to balance.
		 */
		if (chp_nrun > ochp_nrun ||
		    (chp_nrun == ochp_nrun &&
		    nchp->chip_ncpu > chp->chip_ncpu)) {
			cp = nchp->chip_cpus;
			nchp->chip_cpus = cp->cpu_next_chip;

			/*
			 * Find a CPU on the chip in the correct
			 * partition.  We know at least one exists
			 * because of the CHIP_IN_CPUPART() check above.
			 */
			while (cp->cpu_part != tp->t_cpupart)
				cp = cp->cpu_next_chip;
		}
		chp->chip_balance = nchp->chip_next_lgrp;
		break;
	} while ((nchp = nchp->chip_next_lgrp) != chp->chip_balance);

	ASSERT(CHIP_IN_CPUPART(cp->cpu_chip, tp->t_cpupart));
	return (cp);
}

/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
 * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
 * try to keep runqs perfectly balanced regardless of the thread priority.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
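
/*
 * Concretely: setbackdq() below compares dq_sruncnt at the thread's
 * priority on the chosen CPU against that of a neighboring CPU
 * (cpu_next_lpl or cpu_next_part).  A thread at or above RUNQ_MATCH_PRI
 * without TS_RUNQMATCH set is only moved when the neighbor's queue is
 * more than RUNQ_MAX_DIFF entries shorter; below RUNQ_MATCH_PRI, or
 * with TS_RUNQMATCH set, any shorter neighbor queue is preferred.
 */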

/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setbackdq(kthread_t *tp)
{
	dispq_t	*dq;
	disp_t		*dp;
	chip_t		*curchip = NULL;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);

	if (tp->t_waitrq == 0) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	tpri = DISP_PRIO(tp);
	if (tp == curthread) {
		curchip = CPU->cpu_chip;
	}

	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_BACK);
			return;
		}
		/*
		 * Let cpu_choose suggest a CPU.
		 */
		cp = cpu_choose(tp, tpri);

		if (tp->t_cpupart == cp->cpu_part) {
			int	qlen;

			/*
			 * Select another CPU if we need
			 * to do some load balancing across the
			 * physical processors.
			 */
			if (CHIP_SHOULD_BALANCE(cp->cpu_chip))
				cp = chip_balance(tp, cp, curchip);

			/*
			 * Balance across the run queues
			 */
			qlen = RUNQ_LEN(cp, tpri);
			if (tpri >= RUNQ_MATCH_PRI &&
			    !(tp->t_schedflag & TS_RUNQMATCH))
				qlen -= RUNQ_MAX_DIFF;
			if (qlen > 0) {
				cpu_t	*np;

				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
					np = cp->cpu_next_part;
				else {
					if ((np = cp->cpu_next_lpl) == cp)
						np = cp->cpu_next_part;
				}
				if (RUNQ_LEN(np, tpri) < qlen)
					cp = np;
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		bound = 0;
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
		bound = 1;
	}
	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
	    tpri, cp, tp);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_first != NULL);
		dq->dq_last->t_link = tp;
		dq->dq_last = tp;
	} else {
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch.  We may just switch to it
			 * again right away.  CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setfrontdq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);

	if (tp->t_waitrq == 0) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_FRONT);
			return;
		}
		cp = tp->t_cpu;
		if (tp->t_cpupart == cp->cpu_part) {
			/*
			 * If we are of higher or equal priority than
			 * the highest priority runnable thread of
			 * the current CPU, just pick this CPU.  Otherwise
			 * let cpu_choose() select the CPU.  If this cpu
			 * is the target of an offline request then do not
			 * pick it - a thread_nomigrate() on the in motion
			 * cpu relies on this when it forces a preempt.
			 */
			if (tpri < cp->cpu_disp->disp_maxrunpri ||
			    cp == cpu_inmotion)
				cp = cpu_choose(tp, tpri);
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		bound = 0;
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
		bound = 1;
	}
	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
	tp->t_disp_queue = dp;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_last != NULL);
		tp->t_link = dq->dq_first;
		dq->dq_first = tp;
	} else {
		ASSERT(dq->dq_last == NULL);
		ASSERT(dq->dq_first == NULL);
		tp->t_link = NULL;
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch.  We may just switch to it
			 * again right away.  CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

/*
 * Put a high-priority unbound thread on the kp queue
 */
static void
setkpdq(kthread_t *tp, int borf)
{
	dispq_t	*dq;
	disp_t	*dp;
	cpu_t	*cp;
	pri_t	tpri;

	tpri = DISP_PRIO(tp);

	dp = &tp->t_cpupart->cp_kp_queue;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	dp->disp_nrunnable++;
	dq = &dp->disp_q[tpri];

	if (dq->dq_sruncnt++ != 0) {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first != NULL);
			tp->t_link = NULL;
			dq->dq_last->t_link = tp;
			dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last != NULL);
			tp->t_link = dq->dq_first;
			dq->dq_first = tp;
		}
	} else {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first == NULL);
			ASSERT(dq->dq_last == NULL);
			dq->dq_first = dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last == NULL);
			ASSERT(dq->dq_first == NULL);
			tp->t_link = NULL;
			dq->dq_first = dq->dq_last = tp;
		}
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_max_unbound_pri)
			dp->disp_max_unbound_pri = tpri;
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
		}
	}

	cp = tp->t_cpu;
	if (tp->t_cpupart != cp->cpu_part) {
		/* migrate to a cpu in the new partition */
		cp = tp->t_cpupart->cp_cpulist;
	}
	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	if (cp->cpu_chosen_level < tpri)
		cp->cpu_chosen_level = tpri;
	cpu_resched(cp, tpri);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
	(*disp_enq_thread)(cp, 0);
}
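
/*
 * Editorial note: the kp queue filled in above belongs to the whole CPU
 * partition rather than to a single CPU; any CPU in the partition may
 * pull from it in disp().  Threads land here only when they are unbound
 * and their priority is at least kpqpri, which by default tracks
 * kpreemptpri (see dispinit()).  After enqueueing, disp_lowpri_cpu()
 * picks the partition CPU running the lowest-priority thread and
 * cpu_resched() nudges it.
 */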

/*
 * Remove a thread from the dispatcher queue if it is on it.
 * It is not an error if it is not found but we return whether
 * or not it was found in case the caller wants to check.
 */
int
dispdeq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	kthread_t	*rp;
	kthread_t	*trp;
	kthread_t	**ptp;
	int		tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_RUN)
		return (0);

	/*
	 * The thread is "swapped" or is on the swap queue and
	 * hence no longer on the run queue, so return true.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
		return (1);

	tpri = DISP_PRIO(tp);
	dp = tp->t_disp_queue;
	ASSERT(tpri < dp->disp_npri);
	dq = &dp->disp_q[tpri];
	ptp = &dq->dq_first;
	rp = *ptp;
	trp = NULL;

	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

	/*
	 * Search for thread in queue.
	 * Double links would simplify this at the expense of disp/setrun.
	 */
	while (rp != tp && rp != NULL) {
		trp = rp;
		ptp = &trp->t_link;
		rp = trp->t_link;
	}

	if (rp == NULL) {
		panic("dispdeq: thread not on queue");
	}

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	if ((*ptp = rp->t_link) == NULL)
		dq->dq_last = trp;

	dp->disp_nrunnable--;
	if (--dq->dq_sruncnt == 0) {
		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else if (tpri == dp->disp_maxrunpri) {
			int ipri;

			ipri = bt_gethighbit(dp->disp_qactmap,
			    dp->disp_maxrunpri >> BT_ULSHIFT);
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
			dp->disp_maxrunpri = ipri;
		}
	}
	tp->t_link = NULL;
	THREAD_TRANSITION(tp);		/* put in intermediate state */
	return (1);
}


/*
 * dq_sruninc and dq_srundec are public functions for
 * incrementing/decrementing the sruncnts when a thread on
 * a dispatcher queue is made schedulable/unschedulable by
 * resetting the TS_LOAD flag.
 *
 * The caller MUST have the thread lock and therefore the dispatcher
 * queue lock so that the operation which changes
 * the flag, the operation that checks the status of the thread to
 * determine if it's on a disp queue AND the call to this function
 * are one atomic operation with respect to interrupts.
 */

/*
 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 */
void
dq_sruninc(kthread_t *t)
{
	ASSERT(t->t_state == TS_RUN);
	ASSERT(t->t_schedflag & TS_LOAD);

	THREAD_TRANSITION(t);
	setfrontdq(t);
}

/*
 * See comment on calling conventions above.
 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 */
void
dq_srundec(kthread_t *t)
{
	ASSERT(t->t_schedflag & TS_LOAD);

	(void) dispdeq(t);
	disp_swapped_enq(t);
}

/*
 * Change the dispatcher lock of thread to the "swapped_lock"
 * and return with thread lock still held.
 *
 * Called with thread_lock held, in transition state, and at high spl.
 */
void
disp_swapped_enq(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT(tp->t_schedflag & TS_LOAD);

	switch (tp->t_state) {
	case TS_RUN:
		disp_lock_enter_high(&swapped_lock);
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	case TS_ONPROC:
		disp_lock_enter_high(&swapped_lock);
		THREAD_TRANSITION(tp);
		wake_sched_sec = 1;		/* tell clock to wake sched */
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	default:
		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
	}
}

/*
 * This routine is called by setbackdq/setfrontdq if the thread is
 * not loaded or loaded and on the swap queue.
 *
 * Thread state TS_SLEEP implies that a swapped thread
 * has been woken up and needs to be swapped in by the swapper.
 *
 * Thread state TS_RUN implies that the priority of a swapped
 * thread is being increased by the scheduling class (e.g. ts_update).
 */
static void
disp_swapped_setrun(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);

	switch (tp->t_state) {
	case TS_SLEEP:
		disp_lock_enter_high(&swapped_lock);
		/*
		 * Wakeup sched immediately (i.e., next tick) if the
		 * thread priority is above maxclsyspri.
		 */
		if (DISP_PRIO(tp) > maxclsyspri)
			wake_sched = 1;
		else
			wake_sched_sec = 1;
		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
		break;
	case TS_RUN:				/* called from ts_update */
		break;
	default:
		panic("disp_swapped_setrun: tp: %p bad t_state", tp);
	}
}


/*
 * Make a thread give up its processor.  Find the processor on
 * which this thread is executing, and have that processor
 * preempt.
 */
void
cpu_surrender(kthread_t *tp)
{
	cpu_t	*cpup;
	int	max_pri;
	int	max_run_pri;
	klwp_t	*lwp;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_ONPROC)
		return;
	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
	if (max_pri < max_run_pri)
		max_pri = max_run_pri;

	cpup->cpu_runrun = 1;
	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
		cpup->cpu_kprunrun = 1;
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	DTRACE_SCHED1(surrender, kthread_t *, tp);

	/*
	 * Make the target thread take an excursion through trap()
	 * to do preempt() (unless we're already in trap or post_syscall,
	 * calling cpu_surrender via CL_TRAPRET).
	 */
	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
	    lwp->lwp_state != LWP_USER) {
		aston(tp);
		if (cpup != CPU)
			poke_cpu(cpup->cpu_id);
	}
	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
	    "cpu_surrender:tid %p cpu %p", tp, cpup);
}


/*
 * Commit to and ratify a scheduling decision
 */
/*ARGSUSED*/
static kthread_t *
disp_ratify(kthread_t *tp, disp_t *kpq)
{
	pri_t	tpri, maxpri;
	pri_t	maxkpri;
	cpu_t	*cpup;

	ASSERT(tp != NULL);
	/*
	 * Commit to, then ratify scheduling decision
	 */
	cpup = CPU;
	if (cpup->cpu_runrun != 0)
		cpup->cpu_runrun = 0;
	if (cpup->cpu_kprunrun != 0)
		cpup->cpu_kprunrun = 0;
	if (cpup->cpu_chosen_level != -1)
		cpup->cpu_chosen_level = -1;
	membar_enter();
	tpri = DISP_PRIO(tp);
	maxpri = cpup->cpu_disp->disp_maxrunpri;
	maxkpri = kpq->disp_maxrunpri;
	if (maxpri < maxkpri)
		maxpri = maxkpri;
	if (tpri < maxpri) {
		/*
		 * should have done better
		 * put this one back and indicate to try again
		 */
		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
		thread_lock_high(tp);
		THREAD_TRANSITION(tp);
		setfrontdq(tp);
		thread_unlock_nopreempt(tp);

		tp = NULL;
	}
	return (tp);
}

/*
 * See if there is any work on the dispatcher queue for other CPUs.
 * If there is, dequeue the best thread and return.
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	pri_t		maxpri;
	int		s;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		hint, leafidx;

	maxpri = -1;
	tcp = NULL;

	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	s = splhigh();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU.  This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	hint = leafidx = 0;

	/*
	 * This loop traverses the lpl hierarchy.  Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around the partition if:
				 *
				 * - Something became runnable on the local
				 *   queue
				 *
				 * - We're at the broadest level of locality
				 *   and we happen across another idle CPU.
				 *   At the highest level of locality, all
				 *   CPUs will walk the partition's CPUs in
				 *   the same order, so we can end our stroll
				 *   taking comfort in knowing the other idle
				 *   CPU is already covering the next portion
				 *   of the list.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0)
					break;
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED)
						continue;
					else if (lpl->lpl_parent == NULL)
						break;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
				    CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					maxpri = pri;
					tcp = ocp;
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != hint);

		hint = leafidx = lpl->lpl_hint;
		if ((lpl = lpl->lpl_parent) != NULL)
			lpl_leaf = lpl->lpl_rset[hint];
	} while (!tcp && lpl);

	splx(s);

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = (disp_getbest(tcp->cpu_disp));
		if (tp)
			return (disp_ratify(tp, kpq));
	}
	return (NULL);
}


/*
 * disp_fix_unbound_pri()
 *	Determines the maximum priority of unbound threads on the queue.
 *	The priority is kept for the queue, but is only increased, never
 *	reduced unless some CPU is looking for something on that queue.
 *
 *	The priority argument is the known upper limit.
 *
 *	Perhaps this should be kept accurately, but that probably means
 *	separate bitmaps for bound and unbound threads.  Since only idled
 *	CPUs will have to do this recalculation, it seems better this way.
 */
static void
disp_fix_unbound_pri(disp_t *dp, pri_t pri)
{
	kthread_t	*tp;
	dispq_t		*dq;
	ulong_t		*dqactmap = dp->disp_qactmap;
	ulong_t		mapword;
	int		wx;

	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));

	ASSERT(pri >= 0);			/* checked by caller */

	/*
	 * Start the search at the next lowest priority below the supplied
	 * priority.  This depends on the bitmap implementation.
	 */
	do {
		wx = pri >> BT_ULSHIFT;		/* index of word in map */

		/*
		 * Form mask for all lower priorities in the word.
		 */
		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);

		/*
		 * Get next lower active priority.
		 */
		if (mapword != 0) {
			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
		} else if (wx > 0) {
			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
			if (pri < 0)
				break;
		} else {
			pri = -1;
			break;
		}

		/*
		 * Search the queue for unbound, runnable threads.
		 */
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
			tp = tp->t_link;
		}

		/*
		 * If a thread was found, set the priority and return.
		 */
	} while (tp == NULL);

	/*
	 * pri holds the maximum unbound thread priority or -1.
	 */
	if (dp->disp_max_unbound_pri != pri)
		dp->disp_max_unbound_pri = pri;
}

/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
	disp_t *dp;
	pri_t tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * Don't do anything if the thread is not bound, or
	 * currently not runnable or swapped out.
	 */
	if (tp->t_bound_cpu == NULL ||
	    tp->t_state != TS_RUN ||
	    tp->t_schedflag & TS_ON_SWAPQ)
		return;

	tpri = DISP_PRIO(tp);
	dp = tp->t_bound_cpu->cpu_disp;
	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	if (tpri > dp->disp_max_unbound_pri)
		dp->disp_max_unbound_pri = tpri;
}
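
/*
 * Editorial note: disp_max_unbound_pri is maintained as a hint rather
 * than an exact value.  Enqueues only ever raise it; it is lowered
 * lazily when a CPU looking for work finds nothing unbound at that
 * level and calls disp_fix_unbound_pri() (as disp_getbest() below does).
 * Readers should treat it as "no unbound thread above this priority",
 * not as a guarantee that one exists at it.
 */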

/*
 * disp_getbest() - de-queue the highest priority unbound runnable thread.
 *	Returns with the thread unlocked and onproc
 *	but at splhigh (like disp()).
 *	Returns NULL if nothing found.
 *
 *	Passed a pointer to a dispatch queue not associated with this CPU.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
	kthread_t	*tp;
	dispq_t		*dq;
	pri_t		pri;
	cpu_t		*cp;

	disp_lock_enter(&dp->disp_lock);

	/*
	 * If there is nothing to run, or the CPU is in the middle of a
	 * context switch of the only thread, return NULL.
	 */
	pri = dp->disp_max_unbound_pri;
	if (pri == -1 ||
	    (dp->disp_cpu != NULL &&
	    (dp->disp_cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
	    dp->disp_cpu->cpu_disp->disp_nrunnable == 1)) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	/*
	 * Skip over bound threads.
	 * Bound threads can be here even though disp_max_unbound_pri
	 * indicated this level.  Besides, it is not always accurate because
	 * it isn't reduced until another CPU looks for work.
	 * Note that tp could be NULL right away due to this.
	 */
	while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
		tp = tp->t_link;
	}

	/*
	 * If there were no unbound threads on this queue, find the queue
	 * where they are and then return NULL so that other CPUs will be
	 * considered.
	 */
	if (tp == NULL) {
		disp_fix_unbound_pri(dp, pri);
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	/*
	 * Found a runnable, unbound thread, so remove it from queue.
	 * dispdeq() requires that we have the thread locked, and we do,
	 * by virtue of holding the dispatch queue lock.  dispdeq() will
	 * put the thread in transition state, thereby dropping the dispq
	 * lock.
	 */
#ifdef DEBUG
	{
		int	thread_was_on_queue;

		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
		ASSERT(thread_was_on_queue);
	}
#else /* DEBUG */
	(void) dispdeq(tp);			/* drops disp_lock */
#endif /* DEBUG */

	tp->t_schedflag |= TS_DONT_SWAP;

	/*
	 * Set up the thread to run on the current CPU.
	 */
	cp = CPU;

	tp->t_disp_queue = cp->cpu_disp;

	cp->cpu_dispthread = tp;		/* protected by spl only */
	cp->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));

	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */

	/*
	 * Return with spl high so that swtch() won't need to raise it.
	 * The disp_lock was dropped by dispdeq().
	 */

	return (tp);
}

/*
 * disp_bound_common() - common routine for higher level functions
 *	that check for bound threads under certain conditions.
 *	If 'threadlistsafe' is set then there is no need to acquire
 *	pidlock to stop the thread list from changing (eg, if
 *	disp_bound_* is called with cpus paused).
 */
static int
disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
{
	int		found = 0;
	kthread_t	*tp;

	ASSERT(flag);

	if (!threadlistsafe)
		mutex_enter(&pidlock);
	tp = curthread;		/* faster than allthreads */
	do {
		if (tp->t_state != TS_FREE) {
			/*
			 * If an interrupt thread is busy, but the
			 * caller doesn't care (i.e. BOUND_INTR is off),
			 * then just ignore it and continue through.
			 */
			if ((tp->t_flag & T_INTR_THREAD) &&
			    !(flag & BOUND_INTR))
				continue;

			/*
			 * Skip the idle thread for the CPU
			 * we're about to set offline.
			 */

/*
 * disp_bound_common() - common routine for higher level functions
 *	that check for bound threads under certain conditions.
 *	If 'threadlistsafe' is set then there is no need to acquire
 *	pidlock to stop the thread list from changing (e.g., if
 *	disp_bound_* is called with cpus paused).
 */
static int
disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
{
	int		found = 0;
	kthread_t	*tp;

	ASSERT(flag);

	if (!threadlistsafe)
		mutex_enter(&pidlock);
	tp = curthread;		/* faster than allthreads */
	do {
		if (tp->t_state != TS_FREE) {
			/*
			 * If an interrupt thread is busy, but the
			 * caller doesn't care (i.e. BOUND_INTR is off),
			 * then just ignore it and continue through.
			 */
			if ((tp->t_flag & T_INTR_THREAD) &&
			    !(flag & BOUND_INTR))
				continue;

			/*
			 * Skip the idle thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_idle_thread)
				continue;

			/*
			 * Skip the pause thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_pause_thread)
				continue;

			if ((flag & BOUND_CPU) &&
			    (tp->t_bound_cpu == cp ||
			    tp->t_bind_cpu == cp->cpu_id ||
			    tp->t_weakbound_cpu == cp)) {
				found = 1;
				break;
			}

			if ((flag & BOUND_PARTITION) &&
			    (tp->t_cpupart == cp->cpu_part)) {
				found = 1;
				break;
			}
		}
	} while ((tp = tp->t_next) != curthread && found == 0);
	if (!threadlistsafe)
		mutex_exit(&pidlock);
	return (found);
}

/*
 * disp_bound_threads - return nonzero if threads are bound to the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_threads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
}

/*
 * disp_bound_anythreads - return nonzero if _any_ threads are bound
 *	to the given processor, including interrupt threads.
 */
int
disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
}

/*
 * disp_bound_partition - return nonzero if threads are bound to the same
 *	partition as the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_partition(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
}

/*
 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
 *	threads to other CPUs.
 */
void
disp_cpu_inactive(cpu_t *cp)
{
	kthread_t	*tp;
	disp_t		*dp = cp->cpu_disp;
	dispq_t		*dq;
	pri_t		pri;
	int		wasonq;

	disp_lock_enter(&dp->disp_lock);
	while ((pri = dp->disp_max_unbound_pri) != -1) {
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		/*
		 * Skip over bound threads.
		 */
		while (tp != NULL && tp->t_bound_cpu != NULL) {
			tp = tp->t_link;
		}

		if (tp == NULL) {
			/* disp_max_unbound_pri must be inaccurate, so fix it */
			disp_fix_unbound_pri(dp, pri);
			continue;
		}

		wasonq = dispdeq(tp);		/* drops disp_lock */
		ASSERT(wasonq);
		ASSERT(tp->t_weakbound_cpu == NULL);

		setbackdq(tp);
		/*
		 * Called from cpu_offline:
		 *
		 * cp has already been removed from the list of active cpus
		 * and tp->t_cpu has been changed so there is no risk of
		 * tp ending up back on cp.
		 *
		 * Called from cpupart_move_cpu:
		 *
		 * The cpu has moved to a new cpupart.  Any threads that
		 * were on its dispatch queues before the move remain
		 * in the old partition and can't run in the new partition.
		 */
		ASSERT(tp->t_cpu != cp);
		thread_unlock(tp);

		disp_lock_enter(&dp->disp_lock);
	}
	disp_lock_exit(&dp->disp_lock);
}
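
/*
 * Illustrative sketch only (hypothetical DISP_DOC_EXAMPLES guard, never
 * compiled; the helper name is ours): one plausible way the predicates
 * and the drain routine above fit together on an offline-style path.
 * The real decisions are made by callers such as cpu_offline(), which
 * are considerably more involved (pausing CPUs, weak bindings, retries,
 * and removing the CPU from the active list before the drain).
 */
#ifdef DISP_DOC_EXAMPLES
static int
disp_offline_sketch(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	/* refuse while any thread remains bound to this CPU */
	if (disp_bound_threads(cp, 0))
		return (EBUSY);

	/*
	 * Push the unbound work still queued here onto other CPUs.
	 * In the real path, cp would already have been taken out of
	 * the active CPU list at this point.
	 */
	disp_cpu_inactive(cp);
	return (0);
}
#endif	/* DISP_DOC_EXAMPLES */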

/*
 * disp_lowpri_cpu - find CPU running the lowest priority thread.
 *	The hint passed in is used as a starting point so we don't favor
 *	CPU 0 or any other CPU.  The caller should pass in the most recently
 *	used CPU for the thread.
 *
 *	The lgroup and priority are used to determine the best CPU to run on
 *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
 *	the thread priority will indicate whether the thread will actually run
 *	there.  To pick the best CPU, the CPUs inside and outside of the given
 *	lgroup which are running the lowest priority threads are found.  The
 *	remote CPU is chosen only if the thread will not run locally on a CPU
 *	within the lgroup, but will run on the remote CPU.  If the thread
 *	cannot immediately run on any CPU, the best local CPU will be chosen.
 *
 *	The lpl specified also identifies the cpu partition from which
 *	disp_lowpri_cpu should select a CPU.
 *
 *	curcpu is used to indicate that disp_lowpri_cpu is being called on
 *	behalf of the current thread (curthread is looking for a new cpu).
 *	In this case, cpu_dispatch_pri for this thread's cpu should be
 *	ignored.
 *
 *	If a cpu is the target of an offline request then try to avoid it.
 *
 *	This function must be called at either high SPL, or with preemption
 *	disabled, so that the "hint" CPU cannot be removed from the online
 *	CPU list while we are traversing it.
 */
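
/*
 * Worked example (illustrative): with a two-level lgroup hierarchy, the
 * first pass of the loop below walks only the leaf lpl that was passed in.
 * If no CPU there is running at lower priority than tpri, that leaf's
 * lgroup id is recorded in the "done" set, lpl_iter moves to the parent
 * lpl, and the second pass walks the parent's resource set while skipping
 * the lgroups already in "done".  besthomecpu remembers the best candidate
 * found on the first (home) pass so it can be returned if no CPU at any
 * level can run the thread immediately.
 */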
cpu_t *
disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
{
	cpu_t	*bestcpu;
	cpu_t	*besthomecpu;
	cpu_t	*cp, *cpstart;

	pri_t	bestpri;
	pri_t	cpupri;

	klgrpset_t	done;
	klgrpset_t	cur_set;

	lpl_t		*lpl_iter, *lpl_leaf;
	int		i;

	/*
	 * Scan for a CPU currently running the lowest priority thread.
	 * Cannot get cpu_lock here because it is adaptive.
	 * We do not require lock on CPU list.
	 */
	ASSERT(hint != NULL);
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_ncpu > 0);

	/*
	 * First examine local CPUs.  Note that it's possible the hint CPU
	 * passed in is remote to the specified home lgroup.  If our priority
	 * isn't sufficient for us to run immediately at home, then examine
	 * CPUs remote to our home lgroup.
	 * We would like to give preference to CPUs closest to "home".
	 * If we can't find a CPU where we'll run at a given level
	 * of locality, we expand our search to include the next level.
	 */
	bestcpu = besthomecpu = NULL;
	klgrpset_clear(done);
	/* start with lpl we were passed */

	lpl_iter = lpl;

	do {

		bestpri = SHRT_MAX;
		klgrpset_clear(cur_set);

		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
			lpl_leaf = lpl_iter->lpl_rset[i];
			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
				continue;

			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);

			if (hint->cpu_lpl == lpl_leaf)
				cp = cpstart = hint;
			else
				cp = cpstart = lpl_leaf->lpl_cpus;

			do {

				if (cp == curcpu)
					cpupri = -1;
				else if (cp == cpu_inmotion)
					cpupri = SHRT_MAX;
				else
					cpupri = cp->cpu_dispatch_pri;

				if (cp->cpu_disp->disp_maxrunpri > cpupri)
					cpupri = cp->cpu_disp->disp_maxrunpri;
				if (cp->cpu_chosen_level > cpupri)
					cpupri = cp->cpu_chosen_level;
				if (cpupri < bestpri) {
					if (CPU_IDLING(cpupri)) {
						ASSERT((cp->cpu_flags &
						    CPU_QUIESCED) == 0);
						return (cp);
					}
					bestcpu = cp;
					bestpri = cpupri;
				}
			} while ((cp = cp->cpu_next_lpl) != cpstart);
		}

		if (bestcpu && (tpri > bestpri)) {
			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
			return (bestcpu);
		}
		if (besthomecpu == NULL)
			besthomecpu = bestcpu;
		/*
		 * Add the lgrps we just considered to the "done" set
		 */
		klgrpset_or(done, cur_set);

	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);

	/*
	 * The specified priority isn't high enough to run immediately
	 * anywhere, so just return the best CPU from the home lgroup.
	 */
	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
	return (besthomecpu);
}

/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor-specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
}

/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}

/*
 * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
 *	- t->t_cpu is not in this thread's assigned lgrp
 *	- the time since the thread last came off t->t_cpu exceeds the
 *	  rechoose time for this cpu (ignore this if t is curthread in
 *	  which case it's on CPU and t->t_disp_time is inaccurate)
 *	- t->t_cpu is presently the target of an offline or partition move
 *	  request
 */
static cpu_t *
cpu_choose(kthread_t *t, pri_t tpri)
{
	ASSERT(tpri < kpqpri);

	if ((((lbolt - t->t_disp_time) > t->t_cpu->cpu_rechoose) &&
	    t != curthread) || t->t_cpu == cpu_inmotion) {
		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
	}

	/*
	 * Take a trip through disp_lowpri_cpu() if the thread was
	 * running outside its home lgroup
	 */
	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
		    (t == curthread) ? t->t_cpu : NULL));
	}
	return (t->t_cpu);
}
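
/*
 * Illustrative sketch only (hypothetical DISP_DOC_EXAMPLES guard, never
 * compiled; the helper name is ours): the cache-affinity test used by
 * cpu_choose() above, restated as a predicate.  t->t_cpu is kept only
 * while the thread last ran there within the per-CPU rechoose window
 * (in ticks), or when t is curthread, whose t_disp_time is not meaningful.
 */
#ifdef DISP_DOC_EXAMPLES
static int
cpu_affinity_still_warm(kthread_t *t)
{
	return (t == curthread ||
	    (lbolt - t->t_disp_time) <= t->t_cpu->cpu_rechoose);
}
#endif	/* DISP_DOC_EXAMPLES */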