1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 31 #pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.30 */ 32 33 #include <sys/types.h> 34 #include <sys/param.h> 35 #include <sys/sysmacros.h> 36 #include <sys/signal.h> 37 #include <sys/user.h> 38 #include <sys/systm.h> 39 #include <sys/sysinfo.h> 40 #include <sys/var.h> 41 #include <sys/errno.h> 42 #include <sys/cmn_err.h> 43 #include <sys/debug.h> 44 #include <sys/inline.h> 45 #include <sys/disp.h> 46 #include <sys/class.h> 47 #include <sys/bitmap.h> 48 #include <sys/kmem.h> 49 #include <sys/cpuvar.h> 50 #include <sys/vtrace.h> 51 #include <sys/tnf.h> 52 #include <sys/cpupart.h> 53 #include <sys/lgrp.h> 54 #include <sys/chip.h> 55 #include <sys/schedctl.h> 56 #include <sys/atomic.h> 57 #include <sys/dtrace.h> 58 #include <sys/sdt.h> 59 60 #include <vm/as.h> 61 62 #define BOUND_CPU 0x1 63 #define BOUND_PARTITION 0x2 64 #define BOUND_INTR 0x4 65 66 /* Dispatch queue allocation structure and functions */ 67 struct disp_queue_info { 68 disp_t *dp; 69 dispq_t *olddispq; 70 dispq_t *newdispq; 71 ulong_t *olddqactmap; 72 ulong_t *newdqactmap; 73 int oldnglobpris; 74 }; 75 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris, 76 disp_t *dp); 77 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris); 78 static void disp_dq_free(struct disp_queue_info *dptr); 79 80 /* platform-specific routine to call when processor is idle */ 81 static void generic_idle_cpu(); 82 void (*idle_cpu)() = generic_idle_cpu; 83 84 /* routines invoked when a CPU enters/exits the idle loop */ 85 static void idle_enter(); 86 static void idle_exit(); 87 88 /* platform-specific routine to call when thread is enqueued */ 89 static void generic_enq_thread(cpu_t *, int); 90 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread; 91 92 pri_t kpreemptpri; /* priority where kernel preemption applies */ 93 pri_t upreemptpri = 0; /* priority where normal preemption applies */ 94 pri_t intr_pri; /* interrupt thread priority base level */ 95 96 #define KPQPRI -1 /* priority where cpu affinity is dropped for kp queue */ 97 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */ 98 disp_t cpu0_disp; /* boot CPU's dispatch queue */ 99 disp_lock_t swapped_lock; /* lock swapped threads and swap queue */ 100 int nswapped; /* total number of swapped threads */ 101 void disp_swapped_enq(kthread_t *tp); 102 static void disp_swapped_setrun(kthread_t *tp); 103 static void cpu_resched(cpu_t *cp, pri_t tpri); 104 105 /* 106 * If this is set, only 
interrupt threads will cause kernel preemptions. 107 * This is done by changing the value of kpreemptpri. kpreemptpri 108 * will either be the max sysclass pri + 1 or the min interrupt pri. 109 */ 110 int only_intr_kpreempt; 111 112 extern void set_idle_cpu(int cpun); 113 extern void unset_idle_cpu(int cpun); 114 static void setkpdq(kthread_t *tp, int borf); 115 #define SETKP_BACK 0 116 #define SETKP_FRONT 1 117 /* 118 * Parameter that determines how recently a thread must have run 119 * on the CPU to be considered loosely-bound to that CPU to reduce 120 * cold cache effects. The interval is in hertz. 121 * 122 * The platform may define a per physical processor adjustment of 123 * this parameter. For efficiency, the effective rechoose interval 124 * (rechoose_interval + per chip adjustment) is maintained in the 125 * cpu structures. See cpu_choose() 126 */ 127 int rechoose_interval = RECHOOSE_INTERVAL; 128 129 static cpu_t *cpu_choose(kthread_t *, pri_t); 130 131 id_t defaultcid; /* system "default" class; see dispadmin(1M) */ 132 133 disp_lock_t transition_lock; /* lock on transitioning threads */ 134 disp_lock_t stop_lock; /* lock on stopped threads */ 135 disp_lock_t shuttle_lock; /* lock on shuttle objects */ 136 137 static void cpu_dispqalloc(int numpris); 138 139 static kthread_t *disp_getwork(cpu_t *to); 140 static kthread_t *disp_getbest(disp_t *from); 141 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq); 142 143 void swtch_to(kthread_t *); 144 145 /* 146 * dispatcher and scheduler initialization 147 */ 148 149 /* 150 * disp_setup - Common code to calculate and allocate dispatcher 151 * variables and structures based on the maximum priority. 152 */ 153 static void 154 disp_setup(pri_t maxglobpri, pri_t oldnglobpris) 155 { 156 pri_t newnglobpris; 157 158 ASSERT(MUTEX_HELD(&cpu_lock)); 159 160 newnglobpris = maxglobpri + 1 + LOCK_LEVEL; 161 162 if (newnglobpris > oldnglobpris) { 163 /* 164 * Allocate new kp queues for each CPU partition. 165 */ 166 cpupart_kpqalloc(newnglobpris); 167 168 /* 169 * Allocate new dispatch queues for each CPU. 170 */ 171 cpu_dispqalloc(newnglobpris); 172 173 /* 174 * compute new interrupt thread base priority 175 */ 176 intr_pri = maxglobpri; 177 if (only_intr_kpreempt) { 178 kpreemptpri = intr_pri + 1; 179 if (kpqpri == KPQPRI) 180 kpqpri = kpreemptpri; 181 } 182 v.v_nglobpris = newnglobpris; 183 } 184 } 185 186 /* 187 * dispinit - Called to initialize all loaded classes and the 188 * dispatcher framework. 189 */ 190 void 191 dispinit(void) 192 { 193 id_t cid; 194 pri_t maxglobpri; 195 pri_t cl_maxglobpri; 196 197 maxglobpri = -1; 198 199 /* 200 * Initialize transition lock, which will always be set. 201 */ 202 DISP_LOCK_INIT(&transition_lock); 203 disp_lock_enter_high(&transition_lock); 204 DISP_LOCK_INIT(&stop_lock); 205 DISP_LOCK_INIT(&shuttle_lock); 206 207 mutex_enter(&cpu_lock); 208 CPU->cpu_disp->disp_maxrunpri = -1; 209 CPU->cpu_disp->disp_max_unbound_pri = -1; 210 /* 211 * Initialize the default CPU partition. 212 */ 213 cpupart_initialize_default(); 214 /* 215 * Call the class specific initialization functions for 216 * all pre-installed schedulers. 217 * 218 * We pass the size of a class specific parameter 219 * buffer to each of the initialization functions 220 * to try to catch problems with backward compatibility 221 * of class modules. 222 * 223 * For example a new class module running on an old system 224 * which didn't provide sufficiently large parameter buffers 225 * would be bad news. 
Class initialization modules can check for 226 * this and take action if they detect a problem. 227 */ 228 229 for (cid = 0; cid < nclass; cid++) { 230 sclass_t *sc; 231 232 sc = &sclass[cid]; 233 if (SCHED_INSTALLED(sc)) { 234 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ, 235 &sc->cl_funcs); 236 if (cl_maxglobpri > maxglobpri) 237 maxglobpri = cl_maxglobpri; 238 } 239 } 240 kpreemptpri = (pri_t)v.v_maxsyspri + 1; 241 if (kpqpri == KPQPRI) 242 kpqpri = kpreemptpri; 243 244 ASSERT(maxglobpri >= 0); 245 disp_setup(maxglobpri, 0); 246 247 mutex_exit(&cpu_lock); 248 249 /* 250 * Get the default class ID; this may be later modified via 251 * dispadmin(1M). This will load the class (normally TS) and that will 252 * call disp_add(), which is why we had to drop cpu_lock first. 253 */ 254 if (getcid(defaultclass, &defaultcid) != 0) { 255 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'", 256 defaultclass); 257 } 258 } 259 260 /* 261 * disp_add - Called with class pointer to initialize the dispatcher 262 * for a newly loaded class. 263 */ 264 void 265 disp_add(sclass_t *clp) 266 { 267 pri_t maxglobpri; 268 pri_t cl_maxglobpri; 269 270 mutex_enter(&cpu_lock); 271 /* 272 * Initialize the scheduler class. 273 */ 274 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1); 275 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs); 276 if (cl_maxglobpri > maxglobpri) 277 maxglobpri = cl_maxglobpri; 278 279 /* 280 * Save old queue information. Since we're initializing a 281 * new scheduling class which has just been loaded, then 282 * the size of the dispq may have changed. We need to handle 283 * that here. 284 */ 285 disp_setup(maxglobpri, v.v_nglobpris); 286 287 mutex_exit(&cpu_lock); 288 } 289 290 291 /* 292 * For each CPU, allocate new dispatch queues 293 * with the stated number of priorities. 294 */ 295 static void 296 cpu_dispqalloc(int numpris) 297 { 298 cpu_t *cpup; 299 struct disp_queue_info *disp_mem; 300 int i, num; 301 302 ASSERT(MUTEX_HELD(&cpu_lock)); 303 304 disp_mem = kmem_zalloc(NCPU * 305 sizeof (struct disp_queue_info), KM_SLEEP); 306 307 /* 308 * This routine must allocate all of the memory before stopping 309 * the cpus because it must not sleep in kmem_alloc while the 310 * CPUs are stopped. Locks they hold will not be freed until they 311 * are restarted. 312 */ 313 i = 0; 314 cpup = cpu_list; 315 do { 316 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp); 317 i++; 318 cpup = cpup->cpu_next; 319 } while (cpup != cpu_list); 320 num = i; 321 322 pause_cpus(NULL); 323 for (i = 0; i < num; i++) 324 disp_dq_assign(&disp_mem[i], numpris); 325 start_cpus(); 326 327 /* 328 * I must free all of the memory after starting the cpus because 329 * I can not risk sleeping in kmem_free while the cpus are stopped. 
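 * The resulting order is therefore: allocate everything above, pause_cpus(),
 * switch the queue pointers in disp_dq_assign(), start_cpus(), and only
 * then free the old queues below.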
330 */ 331 for (i = 0; i < num; i++) 332 disp_dq_free(&disp_mem[i]); 333 334 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info)); 335 } 336 337 static void 338 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp) 339 { 340 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP); 341 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) * 342 sizeof (long), KM_SLEEP); 343 dptr->dp = dp; 344 } 345 346 static void 347 disp_dq_assign(struct disp_queue_info *dptr, int numpris) 348 { 349 disp_t *dp; 350 351 dp = dptr->dp; 352 dptr->olddispq = dp->disp_q; 353 dptr->olddqactmap = dp->disp_qactmap; 354 dptr->oldnglobpris = dp->disp_npri; 355 356 ASSERT(dptr->oldnglobpris < numpris); 357 358 if (dptr->olddispq != NULL) { 359 /* 360 * Use kcopy because bcopy is platform-specific 361 * and could block while we might have paused the cpus. 362 */ 363 (void) kcopy(dptr->olddispq, dptr->newdispq, 364 dptr->oldnglobpris * sizeof (dispq_t)); 365 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap, 366 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * 367 sizeof (long)); 368 } 369 dp->disp_q = dptr->newdispq; 370 dp->disp_qactmap = dptr->newdqactmap; 371 dp->disp_q_limit = &dptr->newdispq[numpris]; 372 dp->disp_npri = numpris; 373 } 374 375 static void 376 disp_dq_free(struct disp_queue_info *dptr) 377 { 378 if (dptr->olddispq != NULL) 379 kmem_free(dptr->olddispq, 380 dptr->oldnglobpris * sizeof (dispq_t)); 381 if (dptr->olddqactmap != NULL) 382 kmem_free(dptr->olddqactmap, 383 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long)); 384 } 385 386 /* 387 * For a newly created CPU, initialize the dispatch queue. 388 * This is called before the CPU is known through cpu[] or on any lists. 389 */ 390 void 391 disp_cpu_init(cpu_t *cp) 392 { 393 disp_t *dp; 394 dispq_t *newdispq; 395 ulong_t *newdqactmap; 396 397 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */ 398 399 if (cp == cpu0_disp.disp_cpu) 400 dp = &cpu0_disp; 401 else 402 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP); 403 bzero(dp, sizeof (disp_t)); 404 cp->cpu_disp = dp; 405 dp->disp_cpu = cp; 406 dp->disp_maxrunpri = -1; 407 dp->disp_max_unbound_pri = -1; 408 DISP_LOCK_INIT(&cp->cpu_thread_lock); 409 /* 410 * Allocate memory for the dispatcher queue headers 411 * and the active queue bitmap. 412 */ 413 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP); 414 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) * 415 sizeof (long), KM_SLEEP); 416 dp->disp_q = newdispq; 417 dp->disp_qactmap = newdqactmap; 418 dp->disp_q_limit = &newdispq[v.v_nglobpris]; 419 dp->disp_npri = v.v_nglobpris; 420 } 421 422 void 423 disp_cpu_fini(cpu_t *cp) 424 { 425 ASSERT(MUTEX_HELD(&cpu_lock)); 426 427 disp_kp_free(cp->cpu_disp); 428 if (cp->cpu_disp != &cpu0_disp) 429 kmem_free(cp->cpu_disp, sizeof (disp_t)); 430 } 431 432 /* 433 * Allocate new, larger kpreempt dispatch queue to replace the old one. 434 */ 435 void 436 disp_kp_alloc(disp_t *dq, pri_t npri) 437 { 438 struct disp_queue_info mem_info; 439 440 if (npri > dq->disp_npri) { 441 /* 442 * Allocate memory for the new array. 443 */ 444 disp_dq_alloc(&mem_info, npri, dq); 445 446 /* 447 * We need to copy the old structures to the new 448 * and free the old. 449 */ 450 disp_dq_assign(&mem_info, npri); 451 disp_dq_free(&mem_info); 452 } 453 } 454 455 /* 456 * Free dispatch queue. 457 * Used for the kpreempt queues for a removed CPU partition and 458 * for the per-CPU queues of deleted CPUs. 
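 * Note that only the queue array and active-queue bitmap are released
 * (via disp_dq_free()); the disp_t itself belongs to the caller, e.g.
 * disp_cpu_fini() frees cp->cpu_disp separately.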
459 */ 460 void 461 disp_kp_free(disp_t *dq) 462 { 463 struct disp_queue_info mem_info; 464 465 mem_info.olddispq = dq->disp_q; 466 mem_info.olddqactmap = dq->disp_qactmap; 467 mem_info.oldnglobpris = dq->disp_npri; 468 disp_dq_free(&mem_info); 469 } 470 471 /* 472 * End dispatcher and scheduler initialization. 473 */ 474 475 /* 476 * See if there's anything to do other than remain idle. 477 * Return non-zero if there is. 478 * 479 * This function must be called with high spl, or with 480 * kernel preemption disabled to prevent the partition's 481 * active cpu list from changing while being traversed. 482 * 483 */ 484 int 485 disp_anywork(void) 486 { 487 cpu_t *cp = CPU; 488 cpu_t *ocp; 489 490 if (cp->cpu_disp->disp_nrunnable != 0) 491 return (1); 492 493 if (!(cp->cpu_flags & CPU_OFFLINE)) { 494 if (CP_MAXRUNPRI(cp->cpu_part) >= 0) 495 return (1); 496 497 /* 498 * Work can be taken from another CPU if: 499 * - There is unbound work on the run queue 500 * - That work isn't a thread undergoing a 501 * - context switch on an otherwise empty queue. 502 * - The CPU isn't running the idle loop. 503 */ 504 for (ocp = cp->cpu_next_part; ocp != cp; 505 ocp = ocp->cpu_next_part) { 506 ASSERT(CPU_ACTIVE(ocp)); 507 508 if (ocp->cpu_disp->disp_max_unbound_pri != -1 && 509 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) && 510 ocp->cpu_disp->disp_nrunnable == 1) && 511 ocp->cpu_dispatch_pri != -1) 512 return (1); 513 } 514 } 515 return (0); 516 } 517 518 /* 519 * Called when CPU enters the idle loop 520 */ 521 static void 522 idle_enter() 523 { 524 cpu_t *cp = CPU; 525 526 new_cpu_mstate(cp, CMS_IDLE); 527 CPU_STATS_ADDQ(cp, sys, idlethread, 1); 528 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */ 529 } 530 531 /* 532 * Called when CPU exits the idle loop 533 */ 534 static void 535 idle_exit() 536 { 537 cpu_t *cp = CPU; 538 539 new_cpu_mstate(cp, CMS_SYSTEM); 540 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */ 541 } 542 543 /* 544 * Idle loop. 545 */ 546 void 547 idle() 548 { 549 struct cpu *cp = CPU; /* pointer to this CPU */ 550 kthread_t *t; /* taken thread */ 551 552 idle_enter(); 553 554 /* 555 * Uniprocessor version of idle loop. 556 * Do this until notified that we're on an actual multiprocessor. 557 */ 558 while (ncpus == 1) { 559 if (cp->cpu_disp->disp_nrunnable == 0) { 560 (*idle_cpu)(); 561 continue; 562 } 563 idle_exit(); 564 swtch(); 565 566 idle_enter(); /* returned from swtch */ 567 } 568 569 /* 570 * Multiprocessor idle loop. 571 */ 572 for (;;) { 573 /* 574 * If CPU is completely quiesced by p_online(2), just wait 575 * here with minimal bus traffic until put online. 576 */ 577 while (cp->cpu_flags & CPU_QUIESCED) 578 (*idle_cpu)(); 579 580 if (cp->cpu_disp->disp_nrunnable != 0) { 581 idle_exit(); 582 swtch(); 583 } else { 584 if (cp->cpu_flags & CPU_OFFLINE) 585 continue; 586 if ((t = disp_getwork(cp)) == NULL) { 587 if (cp->cpu_chosen_level != -1) { 588 disp_t *dp = cp->cpu_disp; 589 disp_t *kpq; 590 591 disp_lock_enter(&dp->disp_lock); 592 /* 593 * Set kpq under lock to prevent 594 * migration between partitions. 595 */ 596 kpq = &cp->cpu_part->cp_kp_queue; 597 if (kpq->disp_maxrunpri == -1) 598 cp->cpu_chosen_level = -1; 599 disp_lock_exit(&dp->disp_lock); 600 } 601 (*idle_cpu)(); 602 continue; 603 } 604 idle_exit(); 605 restore_mstate(t); 606 swtch_to(t); 607 } 608 idle_enter(); /* returned from swtch/swtch_to */ 609 } 610 } 611 612 613 /* 614 * Preempt the currently running thread in favor of the highest 615 * priority thread. 
The class of the current thread controls 616 * where it goes on the dispatcher queues. If panicking, turn 617 * preemption off. 618 */ 619 void 620 preempt() 621 { 622 kthread_t *t = curthread; 623 klwp_t *lwp = ttolwp(curthread); 624 625 if (panicstr) 626 return; 627 628 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start"); 629 630 thread_lock(t); 631 632 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) { 633 /* 634 * this thread has already been chosen to be run on 635 * another CPU. Clear kprunrun on this CPU since we're 636 * already headed for swtch(). 637 */ 638 CPU->cpu_kprunrun = 0; 639 thread_unlock_nopreempt(t); 640 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end"); 641 } else { 642 if (lwp != NULL) 643 lwp->lwp_ru.nivcsw++; 644 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1); 645 THREAD_TRANSITION(t); 646 CL_PREEMPT(t); 647 DTRACE_SCHED(preempt); 648 thread_unlock_nopreempt(t); 649 650 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end"); 651 652 swtch(); /* clears CPU->cpu_runrun via disp() */ 653 } 654 } 655 656 extern kthread_t *thread_unpin(); 657 658 /* 659 * disp() - find the highest priority thread for this processor to run, and 660 * set it in TS_ONPROC state so that resume() can be called to run it. 661 */ 662 static kthread_t * 663 disp() 664 { 665 cpu_t *cpup; 666 disp_t *dp; 667 kthread_t *tp; 668 dispq_t *dq; 669 int maxrunword; 670 pri_t pri; 671 disp_t *kpq; 672 673 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start"); 674 675 cpup = CPU; 676 /* 677 * Find the highest priority loaded, runnable thread. 678 */ 679 dp = cpup->cpu_disp; 680 681 reschedule: 682 /* 683 * If there is more important work on the global queue with a better 684 * priority than the maximum on this CPU, take it now. 685 */ 686 kpq = &cpup->cpu_part->cp_kp_queue; 687 while ((pri = kpq->disp_maxrunpri) >= 0 && 688 pri >= dp->disp_maxrunpri && 689 (cpup->cpu_flags & CPU_OFFLINE) == 0 && 690 (tp = disp_getbest(kpq)) != NULL) { 691 if (disp_ratify(tp, kpq) != NULL) { 692 TRACE_1(TR_FAC_DISP, TR_DISP_END, 693 "disp_end:tid %p", tp); 694 restore_mstate(tp); 695 return (tp); 696 } 697 } 698 699 disp_lock_enter(&dp->disp_lock); 700 pri = dp->disp_maxrunpri; 701 702 /* 703 * If there is nothing to run, look at what's runnable on other queues. 704 * Choose the idle thread if the CPU is quiesced. 705 * Note that CPUs that have the CPU_OFFLINE flag set can still run 706 * interrupt threads, which will be the only threads on the CPU's own 707 * queue, but cannot run threads from other queues. 
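 * For an online CPU we first try disp_getwork() to steal something
 * runnable; only if that fails do we fall back to the idle thread.
 * An offline CPU goes straight to its idle thread.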
708 */ 709 if (pri == -1) { 710 if (!(cpup->cpu_flags & CPU_OFFLINE)) { 711 disp_lock_exit(&dp->disp_lock); 712 if ((tp = disp_getwork(cpup)) == NULL) { 713 tp = cpup->cpu_idle_thread; 714 (void) splhigh(); 715 THREAD_ONPROC(tp, cpup); 716 cpup->cpu_dispthread = tp; 717 cpup->cpu_dispatch_pri = -1; 718 cpup->cpu_runrun = cpup->cpu_kprunrun = 0; 719 cpup->cpu_chosen_level = -1; 720 } 721 } else { 722 disp_lock_exit_high(&dp->disp_lock); 723 tp = cpup->cpu_idle_thread; 724 THREAD_ONPROC(tp, cpup); 725 cpup->cpu_dispthread = tp; 726 cpup->cpu_dispatch_pri = -1; 727 cpup->cpu_runrun = cpup->cpu_kprunrun = 0; 728 cpup->cpu_chosen_level = -1; 729 } 730 TRACE_1(TR_FAC_DISP, TR_DISP_END, 731 "disp_end:tid %p", tp); 732 restore_mstate(tp); 733 return (tp); 734 } 735 736 dq = &dp->disp_q[pri]; 737 tp = dq->dq_first; 738 739 ASSERT(tp != NULL); 740 ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */ 741 742 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp); 743 744 /* 745 * Found it so remove it from queue. 746 */ 747 dp->disp_nrunnable--; 748 dq->dq_sruncnt--; 749 if ((dq->dq_first = tp->t_link) == NULL) { 750 ulong_t *dqactmap = dp->disp_qactmap; 751 752 ASSERT(dq->dq_sruncnt == 0); 753 dq->dq_last = NULL; 754 755 /* 756 * The queue is empty, so the corresponding bit needs to be 757 * turned off in dqactmap. If nrunnable != 0 just took the 758 * last runnable thread off the 759 * highest queue, so recompute disp_maxrunpri. 760 */ 761 maxrunword = pri >> BT_ULSHIFT; 762 dqactmap[maxrunword] &= ~BT_BIW(pri); 763 764 if (dp->disp_nrunnable == 0) { 765 dp->disp_max_unbound_pri = -1; 766 dp->disp_maxrunpri = -1; 767 } else { 768 int ipri; 769 770 ipri = bt_gethighbit(dqactmap, maxrunword); 771 dp->disp_maxrunpri = ipri; 772 if (ipri < dp->disp_max_unbound_pri) 773 dp->disp_max_unbound_pri = ipri; 774 } 775 } else { 776 tp->t_link = NULL; 777 } 778 779 /* 780 * Set TS_DONT_SWAP flag to prevent another processor from swapping 781 * out this thread before we have a chance to run it. 782 * While running, it is protected against swapping by t_lock. 783 */ 784 tp->t_schedflag |= TS_DONT_SWAP; 785 cpup->cpu_dispthread = tp; /* protected by spl only */ 786 cpup->cpu_dispatch_pri = pri; 787 ASSERT(pri == DISP_PRIO(tp)); 788 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */ 789 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */ 790 791 ASSERT(tp != NULL); 792 TRACE_1(TR_FAC_DISP, TR_DISP_END, 793 "disp_end:tid %p", tp); 794 795 if (disp_ratify(tp, kpq) == NULL) 796 goto reschedule; 797 798 restore_mstate(tp); 799 return (tp); 800 } 801 802 /* 803 * swtch() 804 * Find best runnable thread and run it. 805 * Called with the current thread already switched to a new state, 806 * on a sleep queue, run queue, stopped, and not zombied. 807 * May be called at any spl level less than or equal to LOCK_LEVEL. 808 * Always drops spl to the base level (spl0()). 809 */ 810 void 811 swtch() 812 { 813 kthread_t *t = curthread; 814 kthread_t *next; 815 cpu_t *cp; 816 817 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 818 819 if (t->t_flag & T_INTR_THREAD) 820 cpu_intr_swtch_enter(t); 821 822 if (t->t_intr != NULL) { 823 /* 824 * We are an interrupt thread. Setup and return 825 * the interrupted thread to be resumed. 
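 * thread_unpin() hands back the pinned (interrupted) thread, and
 * resume_from_intr() switches to it directly without going through
 * the dispatcher queues.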
826 */ 827 (void) splhigh(); /* block other scheduler action */ 828 cp = CPU; /* now protected against migration */ 829 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */ 830 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 831 CPU_STATS_ADDQ(cp, sys, intrblk, 1); 832 next = thread_unpin(); 833 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 834 resume_from_intr(next); 835 } else { 836 #ifdef DEBUG 837 if (t->t_state == TS_ONPROC && 838 t->t_disp_queue->disp_cpu == CPU && 839 t->t_preempt == 0) { 840 thread_lock(t); 841 ASSERT(t->t_state != TS_ONPROC || 842 t->t_disp_queue->disp_cpu != CPU || 843 t->t_preempt != 0); /* cannot migrate */ 844 thread_unlock_nopreempt(t); 845 } 846 #endif /* DEBUG */ 847 cp = CPU; 848 next = disp(); /* returns with spl high */ 849 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */ 850 851 /* OK to steal anything left on run queue */ 852 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL; 853 854 if (next != t) { 855 if (t == cp->cpu_idle_thread) { 856 CHIP_NRUNNING(cp->cpu_chip, 1); 857 } else if (next == cp->cpu_idle_thread) { 858 CHIP_NRUNNING(cp->cpu_chip, -1); 859 } 860 861 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 862 cp->cpu_last_swtch = t->t_disp_time = lbolt; 863 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 864 865 if (dtrace_vtime_active) 866 dtrace_vtime_switch(next); 867 868 resume(next); 869 /* 870 * The TR_RESUME_END and TR_SWTCH_END trace points 871 * appear at the end of resume(), because we may not 872 * return here 873 */ 874 } else { 875 if (t->t_flag & T_INTR_THREAD) 876 cpu_intr_swtch_exit(t); 877 878 DTRACE_SCHED(remain__cpu); 879 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end"); 880 (void) spl0(); 881 } 882 } 883 } 884 885 /* 886 * swtch_from_zombie() 887 * Special case of swtch(), which allows checks for TS_ZOMB to be 888 * eliminated from normal resume. 889 * Find best runnable thread and run it. 890 * Called with the current thread zombied. 891 * Zombies cannot migrate, so CPU references are safe. 
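 * Since a zombie is never on a run queue, disp() cannot hand the
 * current thread back to us, hence the ASSERT(next != curthread)
 * below.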
892 */ 893 void 894 swtch_from_zombie() 895 { 896 kthread_t *next; 897 cpu_t *cpu = CPU; 898 899 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 900 901 ASSERT(curthread->t_state == TS_ZOMB); 902 903 next = disp(); /* returns with spl high */ 904 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */ 905 CPU_STATS_ADDQ(CPU, sys, pswitch, 1); 906 ASSERT(next != curthread); 907 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 908 909 if (next == cpu->cpu_idle_thread) 910 CHIP_NRUNNING(cpu->cpu_chip, -1); 911 912 if (dtrace_vtime_active) 913 dtrace_vtime_switch(next); 914 915 resume_from_zombie(next); 916 /* 917 * The TR_RESUME_END and TR_SWTCH_END trace points 918 * appear at the end of resume(), because we certainly will not 919 * return here 920 */ 921 } 922 923 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint)) 924 static int 925 thread_on_queue(kthread_t *tp) 926 { 927 cpu_t *cp; 928 cpu_t *self; 929 disp_t *dp; 930 931 self = CPU; 932 cp = self->cpu_next_onln; 933 dp = cp->cpu_disp; 934 for (;;) { 935 dispq_t *dq; 936 dispq_t *eq; 937 938 disp_lock_enter_high(&dp->disp_lock); 939 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) { 940 kthread_t *rp; 941 942 ASSERT(dq->dq_last == NULL || 943 dq->dq_last->t_link == NULL); 944 for (rp = dq->dq_first; rp; rp = rp->t_link) 945 if (tp == rp) { 946 disp_lock_exit_high(&dp->disp_lock); 947 return (1); 948 } 949 } 950 disp_lock_exit_high(&dp->disp_lock); 951 if (cp == NULL) 952 break; 953 if (cp == self) { 954 cp = NULL; 955 dp = &cp->cpu_part->cp_kp_queue; 956 } else { 957 cp = cp->cpu_next_onln; 958 dp = cp->cpu_disp; 959 } 960 } 961 return (0); 962 } /* end of thread_on_queue */ 963 #else 964 965 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */ 966 967 #endif /* DEBUG */ 968 969 /* 970 * like swtch(), but switch to a specified thread taken from another CPU. 971 * called with spl high.. 972 */ 973 void 974 swtch_to(kthread_t *next) 975 { 976 cpu_t *cp = CPU; 977 978 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 979 980 /* 981 * Update context switch statistics. 982 */ 983 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 984 985 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 986 987 if (curthread == cp->cpu_idle_thread) 988 CHIP_NRUNNING(cp->cpu_chip, 1); 989 990 /* OK to steal anything left on run queue */ 991 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL; 992 993 /* record last execution time */ 994 cp->cpu_last_swtch = curthread->t_disp_time = lbolt; 995 996 if (dtrace_vtime_active) 997 dtrace_vtime_switch(next); 998 999 resume(next); 1000 /* 1001 * The TR_RESUME_END and TR_SWTCH_END trace points 1002 * appear at the end of resume(), because we may not 1003 * return here 1004 */ 1005 } 1006 1007 1008 1009 #define CPU_IDLING(pri) ((pri) == -1) 1010 1011 static void 1012 cpu_resched(cpu_t *cp, pri_t tpri) 1013 { 1014 int call_poke_cpu = 0; 1015 pri_t cpupri = cp->cpu_dispatch_pri; 1016 1017 if (!CPU_IDLING(cpupri) && (cpupri < tpri)) { 1018 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED, 1019 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri); 1020 if (tpri >= upreemptpri && cp->cpu_runrun == 0) { 1021 cp->cpu_runrun = 1; 1022 aston(cp->cpu_dispthread); 1023 if (tpri < kpreemptpri && cp != CPU) 1024 call_poke_cpu = 1; 1025 } 1026 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) { 1027 cp->cpu_kprunrun = 1; 1028 if (cp != CPU) 1029 call_poke_cpu = 1; 1030 } 1031 } 1032 1033 /* 1034 * Propagate cpu_runrun, and cpu_kprunrun to global visibility. 
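 * The membar_enter() below must take effect before the optional
 * poke_cpu() so that the target CPU is guaranteed to observe the new
 * flag values when it handles the poke.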
 */
	membar_enter();

	if (call_poke_cpu)
		poke_cpu(cp->cpu_id);
}

/*
 * Routine used by setbackdq() to balance load across the physical
 * processors.  Returns a CPU from a less loaded chip in the lgroup
 * if balancing is necessary, or the "hint" CPU if it's not.
 *
 * - tp is the thread being enqueued
 * - cp is a hint CPU (chosen by cpu_choose()).
 * - curchip (if not NULL) is the chip on which the current thread
 *   is running.
 *
 * The thread lock for "tp" must be held while calling this routine.
 */
static cpu_t *
chip_balance(kthread_t *tp, cpu_t *cp, chip_t *curchip)
{
	int	chp_nrun, ochp_nrun;
	chip_t	*chp, *nchp;

	chp = cp->cpu_chip;
	chp_nrun = chp->chip_nrunning;

	if (chp == curchip)
		chp_nrun--;	/* Ignore curthread */

	/*
	 * If this chip isn't at all idle, then let
	 * run queue balancing do the work.
	 */
	if (chp_nrun == chp->chip_ncpu)
		return (cp);

	nchp = chp->chip_balance;
	do {
		if (nchp == chp ||
		    !CHIP_IN_CPUPART(nchp, tp->t_cpupart))
			continue;

		ochp_nrun = nchp->chip_nrunning;

		/*
		 * If the other chip is running fewer threads,
		 * or if it's running the same number of threads, but
		 * has more online logical CPUs, then choose to balance.
		 */
		if (chp_nrun > ochp_nrun ||
		    (chp_nrun == ochp_nrun &&
		    nchp->chip_ncpu > chp->chip_ncpu)) {
			cp = nchp->chip_cpus;
			nchp->chip_cpus = cp->cpu_next_chip;

			/*
			 * Find a CPU on the chip in the correct
			 * partition.  We know at least one exists
			 * because of the CHIP_IN_CPUPART() check above.
			 */
			while (cp->cpu_part != tp->t_cpupart)
				cp = cp->cpu_next_chip;
		}
		chp->chip_balance = nchp->chip_next_lgrp;
		break;
	} while ((nchp = nchp->chip_next_lgrp) != chp->chip_balance);

	ASSERT(CHIP_IN_CPUPART(cp->cpu_chip, tp->t_cpupart));
	return (cp);
}

/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq lengths
 * must match.  When the per-thread TS_RUNQMATCH flag is set, setbackdq() will
 * try to keep runqs perfectly balanced regardless of the thread priority.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)

/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
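 * ("Still locked" means the thread's lock has been switched to the
 * chosen dispatch queue's disp_lock by THREAD_RUN().)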
1126 */ 1127 void 1128 setbackdq(kthread_t *tp) 1129 { 1130 dispq_t *dq; 1131 disp_t *dp; 1132 chip_t *curchip = NULL; 1133 cpu_t *cp; 1134 pri_t tpri; 1135 int bound; 1136 1137 ASSERT(THREAD_LOCK_HELD(tp)); 1138 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0); 1139 1140 if (tp->t_waitrq == 0) { 1141 hrtime_t curtime; 1142 1143 curtime = gethrtime_unscaled(); 1144 (void) cpu_update_pct(tp, curtime); 1145 tp->t_waitrq = curtime; 1146 } else { 1147 (void) cpu_update_pct(tp, gethrtime_unscaled()); 1148 } 1149 1150 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */ 1151 1152 /* 1153 * If thread is "swapped" or on the swap queue don't 1154 * queue it, but wake sched. 1155 */ 1156 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) { 1157 disp_swapped_setrun(tp); 1158 return; 1159 } 1160 1161 tpri = DISP_PRIO(tp); 1162 if (tp == curthread) { 1163 curchip = CPU->cpu_chip; 1164 } 1165 1166 if (ncpus == 1) 1167 cp = tp->t_cpu; 1168 else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) { 1169 if (tpri >= kpqpri) { 1170 setkpdq(tp, SETKP_BACK); 1171 return; 1172 } 1173 /* 1174 * Let cpu_choose suggest a CPU. 1175 */ 1176 cp = cpu_choose(tp, tpri); 1177 1178 if (tp->t_cpupart == cp->cpu_part) { 1179 int qlen; 1180 1181 /* 1182 * Select another CPU if we need 1183 * to do some load balancing across the 1184 * physical processors. 1185 */ 1186 if (CHIP_SHOULD_BALANCE(cp->cpu_chip)) 1187 cp = chip_balance(tp, cp, curchip); 1188 1189 /* 1190 * Balance across the run queues 1191 */ 1192 qlen = RUNQ_LEN(cp, tpri); 1193 if (tpri >= RUNQ_MATCH_PRI && 1194 !(tp->t_schedflag & TS_RUNQMATCH)) 1195 qlen -= RUNQ_MAX_DIFF; 1196 if (qlen > 0) { 1197 cpu_t *np; 1198 1199 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) 1200 np = cp->cpu_next_part; 1201 else { 1202 if ((np = cp->cpu_next_lpl) == cp) 1203 np = cp->cpu_next_part; 1204 } 1205 if (RUNQ_LEN(np, tpri) < qlen) 1206 cp = np; 1207 } 1208 } else { 1209 /* 1210 * Migrate to a cpu in the new partition. 1211 */ 1212 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, 1213 tp->t_lpl, tp->t_pri, NULL); 1214 } 1215 bound = 0; 1216 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); 1217 } else { 1218 /* 1219 * It is possible that t_weakbound_cpu != t_bound_cpu (for 1220 * a short time until weak binding that existed when the 1221 * strong binding was established has dropped) so we must 1222 * favour weak binding over strong. 1223 */ 1224 cp = tp->t_weakbound_cpu ? 
1225 tp->t_weakbound_cpu : tp->t_bound_cpu; 1226 bound = 1; 1227 } 1228 dp = cp->cpu_disp; 1229 disp_lock_enter_high(&dp->disp_lock); 1230 1231 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0); 1232 TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p", 1233 tpri, cp, tp); 1234 1235 #ifndef NPROBE 1236 /* Kernel probe */ 1237 if (tnf_tracing_active) 1238 tnf_thread_queue(tp, cp, tpri); 1239 #endif /* NPROBE */ 1240 1241 ASSERT(tpri >= 0 && tpri < dp->disp_npri); 1242 1243 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */ 1244 tp->t_disp_queue = dp; 1245 tp->t_link = NULL; 1246 1247 dq = &dp->disp_q[tpri]; 1248 dp->disp_nrunnable++; 1249 membar_enter(); 1250 1251 if (dq->dq_sruncnt++ != 0) { 1252 ASSERT(dq->dq_first != NULL); 1253 dq->dq_last->t_link = tp; 1254 dq->dq_last = tp; 1255 } else { 1256 ASSERT(dq->dq_first == NULL); 1257 ASSERT(dq->dq_last == NULL); 1258 dq->dq_first = dq->dq_last = tp; 1259 BT_SET(dp->disp_qactmap, tpri); 1260 if (tpri > dp->disp_maxrunpri) { 1261 dp->disp_maxrunpri = tpri; 1262 membar_enter(); 1263 cpu_resched(cp, tpri); 1264 } 1265 } 1266 1267 if (!bound && tpri > dp->disp_max_unbound_pri) { 1268 if (tp == curthread && dp->disp_max_unbound_pri == -1 && 1269 cp == CPU) { 1270 /* 1271 * If there are no other unbound threads on the 1272 * run queue, don't allow other CPUs to steal 1273 * this thread while we are in the middle of a 1274 * context switch. We may just switch to it 1275 * again right away. CPU_DISP_DONTSTEAL is cleared 1276 * in swtch and swtch_to. 1277 */ 1278 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL; 1279 } 1280 dp->disp_max_unbound_pri = tpri; 1281 } 1282 (*disp_enq_thread)(cp, bound); 1283 } 1284 1285 /* 1286 * Put the specified thread on the front of the dispatcher 1287 * queue corresponding to its current priority. 1288 * 1289 * Called with the thread in transition, onproc or stopped state 1290 * and locked (transition implies locked) and at high spl. 1291 * Returns with the thread in TS_RUN state and still locked. 1292 */ 1293 void 1294 setfrontdq(kthread_t *tp) 1295 { 1296 disp_t *dp; 1297 dispq_t *dq; 1298 cpu_t *cp; 1299 pri_t tpri; 1300 int bound; 1301 1302 ASSERT(THREAD_LOCK_HELD(tp)); 1303 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0); 1304 1305 if (tp->t_waitrq == 0) { 1306 hrtime_t curtime; 1307 1308 curtime = gethrtime_unscaled(); 1309 (void) cpu_update_pct(tp, curtime); 1310 tp->t_waitrq = curtime; 1311 } else { 1312 (void) cpu_update_pct(tp, gethrtime_unscaled()); 1313 } 1314 1315 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */ 1316 1317 /* 1318 * If thread is "swapped" or on the swap queue don't 1319 * queue it, but wake sched. 1320 */ 1321 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) { 1322 disp_swapped_setrun(tp); 1323 return; 1324 } 1325 1326 tpri = DISP_PRIO(tp); 1327 if (ncpus == 1) 1328 cp = tp->t_cpu; 1329 else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) { 1330 if (tpri >= kpqpri) { 1331 setkpdq(tp, SETKP_FRONT); 1332 return; 1333 } 1334 cp = tp->t_cpu; 1335 if (tp->t_cpupart == cp->cpu_part) { 1336 /* 1337 * If we are of higher or equal priority than 1338 * the highest priority runnable thread of 1339 * the current CPU, just pick this CPU. Otherwise 1340 * Let cpu_choose() select the CPU. If this cpu 1341 * is the target of an offline request then do not 1342 * pick it - a thread_nomigrate() on the in motion 1343 * cpu relies on this when it forces a preempt. 
1344 */ 1345 if (tpri < cp->cpu_disp->disp_maxrunpri || 1346 cp == cpu_inmotion) 1347 cp = cpu_choose(tp, tpri); 1348 } else { 1349 /* 1350 * Migrate to a cpu in the new partition. 1351 */ 1352 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, 1353 tp->t_lpl, tp->t_pri, NULL); 1354 } 1355 bound = 0; 1356 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); 1357 } else { 1358 /* 1359 * It is possible that t_weakbound_cpu != t_bound_cpu (for 1360 * a short time until weak binding that existed when the 1361 * strong binding was established has dropped) so we must 1362 * favour weak binding over strong. 1363 */ 1364 cp = tp->t_weakbound_cpu ? 1365 tp->t_weakbound_cpu : tp->t_bound_cpu; 1366 bound = 1; 1367 } 1368 dp = cp->cpu_disp; 1369 disp_lock_enter_high(&dp->disp_lock); 1370 1371 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp); 1372 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1); 1373 1374 #ifndef NPROBE 1375 /* Kernel probe */ 1376 if (tnf_tracing_active) 1377 tnf_thread_queue(tp, cp, tpri); 1378 #endif /* NPROBE */ 1379 1380 ASSERT(tpri >= 0 && tpri < dp->disp_npri); 1381 1382 THREAD_RUN(tp, &dp->disp_lock); /* set TS_RUN state and lock */ 1383 tp->t_disp_queue = dp; 1384 1385 dq = &dp->disp_q[tpri]; 1386 dp->disp_nrunnable++; 1387 membar_enter(); 1388 1389 if (dq->dq_sruncnt++ != 0) { 1390 ASSERT(dq->dq_last != NULL); 1391 tp->t_link = dq->dq_first; 1392 dq->dq_first = tp; 1393 } else { 1394 ASSERT(dq->dq_last == NULL); 1395 ASSERT(dq->dq_first == NULL); 1396 tp->t_link = NULL; 1397 dq->dq_first = dq->dq_last = tp; 1398 BT_SET(dp->disp_qactmap, tpri); 1399 if (tpri > dp->disp_maxrunpri) { 1400 dp->disp_maxrunpri = tpri; 1401 membar_enter(); 1402 cpu_resched(cp, tpri); 1403 } 1404 } 1405 1406 if (!bound && tpri > dp->disp_max_unbound_pri) { 1407 if (tp == curthread && dp->disp_max_unbound_pri == -1 && 1408 cp == CPU) { 1409 /* 1410 * If there are no other unbound threads on the 1411 * run queue, don't allow other CPUs to steal 1412 * this thread while we are in the middle of a 1413 * context switch. We may just switch to it 1414 * again right away. CPU_DISP_DONTSTEAL is cleared 1415 * in swtch and swtch_to. 
1416 */ 1417 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL; 1418 } 1419 dp->disp_max_unbound_pri = tpri; 1420 } 1421 (*disp_enq_thread)(cp, bound); 1422 } 1423 1424 /* 1425 * Put a high-priority unbound thread on the kp queue 1426 */ 1427 static void 1428 setkpdq(kthread_t *tp, int borf) 1429 { 1430 dispq_t *dq; 1431 disp_t *dp; 1432 cpu_t *cp; 1433 pri_t tpri; 1434 1435 tpri = DISP_PRIO(tp); 1436 1437 dp = &tp->t_cpupart->cp_kp_queue; 1438 disp_lock_enter_high(&dp->disp_lock); 1439 1440 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp); 1441 1442 ASSERT(tpri >= 0 && tpri < dp->disp_npri); 1443 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf); 1444 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */ 1445 tp->t_disp_queue = dp; 1446 dp->disp_nrunnable++; 1447 dq = &dp->disp_q[tpri]; 1448 1449 if (dq->dq_sruncnt++ != 0) { 1450 if (borf == SETKP_BACK) { 1451 ASSERT(dq->dq_first != NULL); 1452 tp->t_link = NULL; 1453 dq->dq_last->t_link = tp; 1454 dq->dq_last = tp; 1455 } else { 1456 ASSERT(dq->dq_last != NULL); 1457 tp->t_link = dq->dq_first; 1458 dq->dq_first = tp; 1459 } 1460 } else { 1461 if (borf == SETKP_BACK) { 1462 ASSERT(dq->dq_first == NULL); 1463 ASSERT(dq->dq_last == NULL); 1464 dq->dq_first = dq->dq_last = tp; 1465 } else { 1466 ASSERT(dq->dq_last == NULL); 1467 ASSERT(dq->dq_first == NULL); 1468 tp->t_link = NULL; 1469 dq->dq_first = dq->dq_last = tp; 1470 } 1471 BT_SET(dp->disp_qactmap, tpri); 1472 if (tpri > dp->disp_max_unbound_pri) 1473 dp->disp_max_unbound_pri = tpri; 1474 if (tpri > dp->disp_maxrunpri) { 1475 dp->disp_maxrunpri = tpri; 1476 membar_enter(); 1477 } 1478 } 1479 1480 cp = tp->t_cpu; 1481 if (tp->t_cpupart != cp->cpu_part) { 1482 /* migrate to a cpu in the new partition */ 1483 cp = tp->t_cpupart->cp_cpulist; 1484 } 1485 cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL); 1486 disp_lock_enter_high(&cp->cpu_disp->disp_lock); 1487 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); 1488 1489 #ifndef NPROBE 1490 /* Kernel probe */ 1491 if (tnf_tracing_active) 1492 tnf_thread_queue(tp, cp, tpri); 1493 #endif /* NPROBE */ 1494 1495 if (cp->cpu_chosen_level < tpri) 1496 cp->cpu_chosen_level = tpri; 1497 cpu_resched(cp, tpri); 1498 disp_lock_exit_high(&cp->cpu_disp->disp_lock); 1499 (*disp_enq_thread)(cp, 0); 1500 } 1501 1502 /* 1503 * Remove a thread from the dispatcher queue if it is on it. 1504 * It is not an error if it is not found but we return whether 1505 * or not it was found in case the caller wants to check. 1506 */ 1507 int 1508 dispdeq(kthread_t *tp) 1509 { 1510 disp_t *dp; 1511 dispq_t *dq; 1512 kthread_t *rp; 1513 kthread_t *trp; 1514 kthread_t **ptp; 1515 int tpri; 1516 1517 ASSERT(THREAD_LOCK_HELD(tp)); 1518 1519 if (tp->t_state != TS_RUN) 1520 return (0); 1521 1522 /* 1523 * The thread is "swapped" or is on the swap queue and 1524 * hence no longer on the run queue, so return true. 1525 */ 1526 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) 1527 return (1); 1528 1529 tpri = DISP_PRIO(tp); 1530 dp = tp->t_disp_queue; 1531 ASSERT(tpri < dp->disp_npri); 1532 dq = &dp->disp_q[tpri]; 1533 ptp = &dq->dq_first; 1534 rp = *ptp; 1535 trp = NULL; 1536 1537 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL); 1538 1539 /* 1540 * Search for thread in queue. 1541 * Double links would simplify this at the expense of disp/setrun. 
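 * trp trails one element behind rp during the walk so that, once the
 * thread is found, the code below can patch the predecessor's t_link
 * (through ptp) and dq_last when the thread was at the tail.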
1542 */ 1543 while (rp != tp && rp != NULL) { 1544 trp = rp; 1545 ptp = &trp->t_link; 1546 rp = trp->t_link; 1547 } 1548 1549 if (rp == NULL) { 1550 panic("dispdeq: thread not on queue"); 1551 } 1552 1553 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp); 1554 1555 /* 1556 * Found it so remove it from queue. 1557 */ 1558 if ((*ptp = rp->t_link) == NULL) 1559 dq->dq_last = trp; 1560 1561 dp->disp_nrunnable--; 1562 if (--dq->dq_sruncnt == 0) { 1563 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri); 1564 if (dp->disp_nrunnable == 0) { 1565 dp->disp_max_unbound_pri = -1; 1566 dp->disp_maxrunpri = -1; 1567 } else if (tpri == dp->disp_maxrunpri) { 1568 int ipri; 1569 1570 ipri = bt_gethighbit(dp->disp_qactmap, 1571 dp->disp_maxrunpri >> BT_ULSHIFT); 1572 if (ipri < dp->disp_max_unbound_pri) 1573 dp->disp_max_unbound_pri = ipri; 1574 dp->disp_maxrunpri = ipri; 1575 } 1576 } 1577 tp->t_link = NULL; 1578 THREAD_TRANSITION(tp); /* put in intermediate state */ 1579 return (1); 1580 } 1581 1582 1583 /* 1584 * dq_sruninc and dq_srundec are public functions for 1585 * incrementing/decrementing the sruncnts when a thread on 1586 * a dispatcher queue is made schedulable/unschedulable by 1587 * resetting the TS_LOAD flag. 1588 * 1589 * The caller MUST have the thread lock and therefore the dispatcher 1590 * queue lock so that the operation which changes 1591 * the flag, the operation that checks the status of the thread to 1592 * determine if it's on a disp queue AND the call to this function 1593 * are one atomic operation with respect to interrupts. 1594 */ 1595 1596 /* 1597 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread. 1598 */ 1599 void 1600 dq_sruninc(kthread_t *t) 1601 { 1602 ASSERT(t->t_state == TS_RUN); 1603 ASSERT(t->t_schedflag & TS_LOAD); 1604 1605 THREAD_TRANSITION(t); 1606 setfrontdq(t); 1607 } 1608 1609 /* 1610 * See comment on calling conventions above. 1611 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread. 1612 */ 1613 void 1614 dq_srundec(kthread_t *t) 1615 { 1616 ASSERT(t->t_schedflag & TS_LOAD); 1617 1618 (void) dispdeq(t); 1619 disp_swapped_enq(t); 1620 } 1621 1622 /* 1623 * Change the dispatcher lock of thread to the "swapped_lock" 1624 * and return with thread lock still held. 1625 * 1626 * Called with thread_lock held, in transition state, and at high spl. 1627 */ 1628 void 1629 disp_swapped_enq(kthread_t *tp) 1630 { 1631 ASSERT(THREAD_LOCK_HELD(tp)); 1632 ASSERT(tp->t_schedflag & TS_LOAD); 1633 1634 switch (tp->t_state) { 1635 case TS_RUN: 1636 disp_lock_enter_high(&swapped_lock); 1637 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */ 1638 break; 1639 case TS_ONPROC: 1640 disp_lock_enter_high(&swapped_lock); 1641 THREAD_TRANSITION(tp); 1642 wake_sched_sec = 1; /* tell clock to wake sched */ 1643 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */ 1644 break; 1645 default: 1646 panic("disp_swapped: tp: %p bad t_state", (void *)tp); 1647 } 1648 } 1649 1650 /* 1651 * This routine is called by setbackdq/setfrontdq if the thread is 1652 * not loaded or loaded and on the swap queue. 1653 * 1654 * Thread state TS_SLEEP implies that a swapped thread 1655 * has been woken up and needs to be swapped in by the swapper. 1656 * 1657 * Thread state TS_RUN, it implies that the priority of a swapped 1658 * thread is being increased by scheduling class (e.g. ts_update). 
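 * In the TS_RUN case the thread is already swapped or on the swap
 * queue, so nothing more needs to happen here; it stays where it is.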
1659 */ 1660 static void 1661 disp_swapped_setrun(kthread_t *tp) 1662 { 1663 ASSERT(THREAD_LOCK_HELD(tp)); 1664 ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD); 1665 1666 switch (tp->t_state) { 1667 case TS_SLEEP: 1668 disp_lock_enter_high(&swapped_lock); 1669 /* 1670 * Wakeup sched immediately (i.e., next tick) if the 1671 * thread priority is above maxclsyspri. 1672 */ 1673 if (DISP_PRIO(tp) > maxclsyspri) 1674 wake_sched = 1; 1675 else 1676 wake_sched_sec = 1; 1677 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */ 1678 break; 1679 case TS_RUN: /* called from ts_update */ 1680 break; 1681 default: 1682 panic("disp_swapped_setrun: tp: %p bad t_state", tp); 1683 } 1684 } 1685 1686 1687 /* 1688 * Make a thread give up its processor. Find the processor on 1689 * which this thread is executing, and have that processor 1690 * preempt. 1691 */ 1692 void 1693 cpu_surrender(kthread_t *tp) 1694 { 1695 cpu_t *cpup; 1696 int max_pri; 1697 int max_run_pri; 1698 klwp_t *lwp; 1699 1700 ASSERT(THREAD_LOCK_HELD(tp)); 1701 1702 if (tp->t_state != TS_ONPROC) 1703 return; 1704 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */ 1705 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */ 1706 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part); 1707 if (max_pri < max_run_pri) 1708 max_pri = max_run_pri; 1709 1710 cpup->cpu_runrun = 1; 1711 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) { 1712 cpup->cpu_kprunrun = 1; 1713 } 1714 1715 /* 1716 * Propagate cpu_runrun, and cpu_kprunrun to global visibility. 1717 */ 1718 membar_enter(); 1719 1720 DTRACE_SCHED1(surrender, kthread_t *, tp); 1721 1722 /* 1723 * Make the target thread take an excursion through trap() 1724 * to do preempt() (unless we're already in trap or post_syscall, 1725 * calling cpu_surrender via CL_TRAPRET). 1726 */ 1727 if (tp != curthread || (lwp = tp->t_lwp) == NULL || 1728 lwp->lwp_state != LWP_USER) { 1729 aston(tp); 1730 if (cpup != CPU) 1731 poke_cpu(cpup->cpu_id); 1732 } 1733 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER, 1734 "cpu_surrender:tid %p cpu %p", tp, cpup); 1735 } 1736 1737 1738 /* 1739 * Commit to and ratify a scheduling decision 1740 */ 1741 /*ARGSUSED*/ 1742 static kthread_t * 1743 disp_ratify(kthread_t *tp, disp_t *kpq) 1744 { 1745 pri_t tpri, maxpri; 1746 pri_t maxkpri; 1747 cpu_t *cpup; 1748 1749 ASSERT(tp != NULL); 1750 /* 1751 * Commit to, then ratify scheduling decision 1752 */ 1753 cpup = CPU; 1754 if (cpup->cpu_runrun != 0) 1755 cpup->cpu_runrun = 0; 1756 if (cpup->cpu_kprunrun != 0) 1757 cpup->cpu_kprunrun = 0; 1758 if (cpup->cpu_chosen_level != -1) 1759 cpup->cpu_chosen_level = -1; 1760 membar_enter(); 1761 tpri = DISP_PRIO(tp); 1762 maxpri = cpup->cpu_disp->disp_maxrunpri; 1763 maxkpri = kpq->disp_maxrunpri; 1764 if (maxpri < maxkpri) 1765 maxpri = maxkpri; 1766 if (tpri < maxpri) { 1767 /* 1768 * should have done better 1769 * put this one back and indicate to try again 1770 */ 1771 cpup->cpu_dispthread = curthread; /* fixup dispthread */ 1772 cpup->cpu_dispatch_pri = DISP_PRIO(curthread); 1773 thread_lock_high(tp); 1774 THREAD_TRANSITION(tp); 1775 setfrontdq(tp); 1776 thread_unlock_nopreempt(tp); 1777 1778 tp = NULL; 1779 } 1780 return (tp); 1781 } 1782 1783 /* 1784 * See if there is any work on the dispatcher queue for other CPUs. 1785 * If there is, dequeue the best thread and return. 
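 * The partition's kp queue is drained first; after that the other
 * CPUs' queues are examined in order of increasing lpl distance, so
 * the work we steal is as close (cache/memory-wise) as possible.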
1786 */ 1787 static kthread_t * 1788 disp_getwork(cpu_t *cp) 1789 { 1790 cpu_t *ocp; /* other CPU */ 1791 cpu_t *ocp_start; 1792 cpu_t *tcp; /* target local CPU */ 1793 kthread_t *tp; 1794 pri_t maxpri; 1795 int s; 1796 disp_t *kpq; /* kp queue for this partition */ 1797 lpl_t *lpl, *lpl_leaf; 1798 int hint, leafidx; 1799 1800 maxpri = -1; 1801 tcp = NULL; 1802 1803 kpq = &cp->cpu_part->cp_kp_queue; 1804 while (kpq->disp_maxrunpri >= 0) { 1805 /* 1806 * Try to take a thread from the kp_queue. 1807 */ 1808 tp = (disp_getbest(kpq)); 1809 if (tp) 1810 return (disp_ratify(tp, kpq)); 1811 } 1812 1813 s = splhigh(); /* protect the cpu_active list */ 1814 1815 /* 1816 * Try to find something to do on another CPU's run queue. 1817 * Loop through all other CPUs looking for the one with the highest 1818 * priority unbound thread. 1819 * 1820 * On NUMA machines, the partition's CPUs are consulted in order of 1821 * distance from the current CPU. This way, the first available 1822 * work found is also the closest, and will suffer the least 1823 * from being migrated. 1824 */ 1825 lpl = lpl_leaf = cp->cpu_lpl; 1826 hint = leafidx = 0; 1827 1828 /* 1829 * This loop traverses the lpl hierarchy. Higher level lpls represent 1830 * broader levels of locality 1831 */ 1832 do { 1833 /* This loop iterates over the lpl's leaves */ 1834 do { 1835 if (lpl_leaf != cp->cpu_lpl) 1836 ocp = lpl_leaf->lpl_cpus; 1837 else 1838 ocp = cp->cpu_next_lpl; 1839 1840 /* This loop iterates over the CPUs in the leaf */ 1841 ocp_start = ocp; 1842 do { 1843 pri_t pri; 1844 1845 ASSERT(CPU_ACTIVE(ocp)); 1846 1847 /* 1848 * End our stroll around the partition if: 1849 * 1850 * - Something became runnable on the local 1851 * queue 1852 * 1853 * - We're at the broadest level of locality and 1854 * we happen across another idle CPU. At the 1855 * highest level of locality, all CPUs will 1856 * walk the partition's CPUs in the same 1857 * order, so we can end our stroll taking 1858 * comfort in knowing the other idle CPU is 1859 * already covering the next portion of the 1860 * list. 1861 */ 1862 if (cp->cpu_disp->disp_nrunnable != 0) 1863 break; 1864 if (ocp->cpu_dispatch_pri == -1) { 1865 if (ocp->cpu_disp_flags & 1866 CPU_DISP_HALTED) 1867 continue; 1868 else if (lpl->lpl_parent == NULL) 1869 break; 1870 } 1871 1872 /* 1873 * If there's only one thread and the CPU 1874 * is in the middle of a context switch, 1875 * or it's currently running the idle thread, 1876 * don't steal it. 1877 */ 1878 if ((ocp->cpu_disp_flags & 1879 CPU_DISP_DONTSTEAL) && 1880 ocp->cpu_disp->disp_nrunnable == 1) 1881 continue; 1882 1883 pri = ocp->cpu_disp->disp_max_unbound_pri; 1884 if (pri > maxpri) { 1885 maxpri = pri; 1886 tcp = ocp; 1887 } 1888 } while ((ocp = ocp->cpu_next_lpl) != ocp_start); 1889 1890 if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) { 1891 leafidx = 0; 1892 lpl_leaf = lpl->lpl_rset[leafidx]; 1893 } 1894 } while (leafidx != hint); 1895 1896 hint = leafidx = lpl->lpl_hint; 1897 if ((lpl = lpl->lpl_parent) != NULL) 1898 lpl_leaf = lpl->lpl_rset[hint]; 1899 } while (!tcp && lpl); 1900 1901 splx(s); 1902 1903 /* 1904 * If another queue looks good, and there is still nothing on 1905 * the local queue, try to transfer one or more threads 1906 * from it to our queue. 
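 * disp_getbest() can still come back empty (e.g. only bound threads
 * remained by the time its queue lock is taken), in which case NULL
 * is returned and the CPU stays idle.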
1907 */ 1908 if (tcp && cp->cpu_disp->disp_nrunnable == 0) { 1909 tp = (disp_getbest(tcp->cpu_disp)); 1910 if (tp) 1911 return (disp_ratify(tp, kpq)); 1912 } 1913 return (NULL); 1914 } 1915 1916 1917 /* 1918 * disp_fix_unbound_pri() 1919 * Determines the maximum priority of unbound threads on the queue. 1920 * The priority is kept for the queue, but is only increased, never 1921 * reduced unless some CPU is looking for something on that queue. 1922 * 1923 * The priority argument is the known upper limit. 1924 * 1925 * Perhaps this should be kept accurately, but that probably means 1926 * separate bitmaps for bound and unbound threads. Since only idled 1927 * CPUs will have to do this recalculation, it seems better this way. 1928 */ 1929 static void 1930 disp_fix_unbound_pri(disp_t *dp, pri_t pri) 1931 { 1932 kthread_t *tp; 1933 dispq_t *dq; 1934 ulong_t *dqactmap = dp->disp_qactmap; 1935 ulong_t mapword; 1936 int wx; 1937 1938 ASSERT(DISP_LOCK_HELD(&dp->disp_lock)); 1939 1940 ASSERT(pri >= 0); /* checked by caller */ 1941 1942 /* 1943 * Start the search at the next lowest priority below the supplied 1944 * priority. This depends on the bitmap implementation. 1945 */ 1946 do { 1947 wx = pri >> BT_ULSHIFT; /* index of word in map */ 1948 1949 /* 1950 * Form mask for all lower priorities in the word. 1951 */ 1952 mapword = dqactmap[wx] & (BT_BIW(pri) - 1); 1953 1954 /* 1955 * Get next lower active priority. 1956 */ 1957 if (mapword != 0) { 1958 pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1; 1959 } else if (wx > 0) { 1960 pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */ 1961 if (pri < 0) 1962 break; 1963 } else { 1964 pri = -1; 1965 break; 1966 } 1967 1968 /* 1969 * Search the queue for unbound, runnable threads. 1970 */ 1971 dq = &dp->disp_q[pri]; 1972 tp = dq->dq_first; 1973 1974 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) { 1975 tp = tp->t_link; 1976 } 1977 1978 /* 1979 * If a thread was found, set the priority and return. 1980 */ 1981 } while (tp == NULL); 1982 1983 /* 1984 * pri holds the maximum unbound thread priority or -1. 1985 */ 1986 if (dp->disp_max_unbound_pri != pri) 1987 dp->disp_max_unbound_pri = pri; 1988 } 1989 1990 /* 1991 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should 1992 * check if the CPU to which is was previously bound should have 1993 * its disp_max_unbound_pri increased. 1994 */ 1995 void 1996 disp_adjust_unbound_pri(kthread_t *tp) 1997 { 1998 disp_t *dp; 1999 pri_t tpri; 2000 2001 ASSERT(THREAD_LOCK_HELD(tp)); 2002 2003 /* 2004 * Don't do anything if the thread is not bound, or 2005 * currently not runnable or swapped out. 2006 */ 2007 if (tp->t_bound_cpu == NULL || 2008 tp->t_state != TS_RUN || 2009 tp->t_schedflag & TS_ON_SWAPQ) 2010 return; 2011 2012 tpri = DISP_PRIO(tp); 2013 dp = tp->t_bound_cpu->cpu_disp; 2014 ASSERT(tpri >= 0 && tpri < dp->disp_npri); 2015 if (tpri > dp->disp_max_unbound_pri) 2016 dp->disp_max_unbound_pri = tpri; 2017 } 2018 2019 /* 2020 * disp_getbest() - de-queue the highest priority unbound runnable thread. 2021 * returns with the thread unlocked and onproc 2022 * but at splhigh (like disp()). 2023 * returns NULL if nothing found. 2024 * 2025 * Passed a pointer to a dispatch queue not associated with this CPU. 
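 * As in disp(), TS_DONT_SWAP is set on the chosen thread so the
 * swapper cannot push it out between here and the moment this CPU
 * actually runs it.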
2026 */ 2027 static kthread_t * 2028 disp_getbest(disp_t *dp) 2029 { 2030 kthread_t *tp; 2031 dispq_t *dq; 2032 pri_t pri; 2033 cpu_t *cp; 2034 2035 disp_lock_enter(&dp->disp_lock); 2036 2037 /* 2038 * If there is nothing to run, or the CPU is in the middle of a 2039 * context switch of the only thread, return NULL. 2040 */ 2041 pri = dp->disp_max_unbound_pri; 2042 if (pri == -1 || 2043 (dp->disp_cpu != NULL && 2044 (dp->disp_cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL) && 2045 dp->disp_cpu->cpu_disp->disp_nrunnable == 1)) { 2046 disp_lock_exit_nopreempt(&dp->disp_lock); 2047 return (NULL); 2048 } 2049 2050 dq = &dp->disp_q[pri]; 2051 tp = dq->dq_first; 2052 2053 /* 2054 * Skip over bound threads. 2055 * Bound threads can be here even though disp_max_unbound_pri 2056 * indicated this level. Besides, it not always accurate because it 2057 * isn't reduced until another CPU looks for work. 2058 * Note that tp could be NULL right away due to this. 2059 */ 2060 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) { 2061 tp = tp->t_link; 2062 } 2063 2064 /* 2065 * If there were no unbound threads on this queue, find the queue 2066 * where they are and then return NULL so that other CPUs will be 2067 * considered. 2068 */ 2069 if (tp == NULL) { 2070 disp_fix_unbound_pri(dp, pri); 2071 disp_lock_exit_nopreempt(&dp->disp_lock); 2072 return (NULL); 2073 } 2074 2075 /* 2076 * Found a runnable, unbound thread, so remove it from queue. 2077 * dispdeq() requires that we have the thread locked, and we do, 2078 * by virtue of holding the dispatch queue lock. dispdeq() will 2079 * put the thread in transition state, thereby dropping the dispq 2080 * lock. 2081 */ 2082 #ifdef DEBUG 2083 { 2084 int thread_was_on_queue; 2085 2086 thread_was_on_queue = dispdeq(tp); /* drops disp_lock */ 2087 ASSERT(thread_was_on_queue); 2088 } 2089 #else /* DEBUG */ 2090 (void) dispdeq(tp); /* drops disp_lock */ 2091 #endif /* DEBUG */ 2092 2093 tp->t_schedflag |= TS_DONT_SWAP; 2094 2095 /* 2096 * Setup thread to run on the current CPU. 2097 */ 2098 cp = CPU; 2099 2100 tp->t_disp_queue = cp->cpu_disp; 2101 2102 cp->cpu_dispthread = tp; /* protected by spl only */ 2103 cp->cpu_dispatch_pri = pri; 2104 ASSERT(pri == DISP_PRIO(tp)); 2105 2106 thread_onproc(tp, cp); /* set t_state to TS_ONPROC */ 2107 2108 /* 2109 * Return with spl high so that swtch() won't need to raise it. 2110 * The disp_lock was dropped by dispdeq(). 2111 */ 2112 2113 return (tp); 2114 } 2115 2116 /* 2117 * disp_bound_common() - common routine for higher level functions 2118 * that check for bound threads under certain conditions. 2119 * If 'threadlistsafe' is set then there is no need to acquire 2120 * pidlock to stop the thread list from changing (eg, if 2121 * disp_bound_* is called with cpus paused). 2122 */ 2123 static int 2124 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag) 2125 { 2126 int found = 0; 2127 kthread_t *tp; 2128 2129 ASSERT(flag); 2130 2131 if (!threadlistsafe) 2132 mutex_enter(&pidlock); 2133 tp = curthread; /* faster than allthreads */ 2134 do { 2135 if (tp->t_state != TS_FREE) { 2136 /* 2137 * If an interrupt thread is busy, but the 2138 * caller doesn't care (i.e. BOUND_INTR is off), 2139 * then just ignore it and continue through. 2140 */ 2141 if ((tp->t_flag & T_INTR_THREAD) && 2142 !(flag & BOUND_INTR)) 2143 continue; 2144 2145 /* 2146 * Skip the idle thread for the CPU 2147 * we're about to set offline. 
                         */
                        if (tp == cp->cpu_idle_thread)
                                continue;

                        /*
                         * Skip the pause thread for the CPU
                         * we're about to set offline.
                         */
                        if (tp == cp->cpu_pause_thread)
                                continue;

                        if ((flag & BOUND_CPU) &&
                            (tp->t_bound_cpu == cp ||
                            tp->t_bind_cpu == cp->cpu_id ||
                            tp->t_weakbound_cpu == cp)) {
                                found = 1;
                                break;
                        }

                        if ((flag & BOUND_PARTITION) &&
                            (tp->t_cpupart == cp->cpu_part)) {
                                found = 1;
                                break;
                        }
                }
        } while ((tp = tp->t_next) != curthread && found == 0);
        if (!threadlistsafe)
                mutex_exit(&pidlock);
        return (found);
}

/*
 * disp_bound_threads - return nonzero if threads are bound to the processor.
 *      Called infrequently.  Keep this simple.
 *      Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_threads(cpu_t *cp, int threadlistsafe)
{
        return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
}

/*
 * disp_bound_anythreads - return nonzero if _any_ threads are bound
 * to the given processor, including interrupt threads.
 */
int
disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
{
        return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
}

/*
 * disp_bound_partition - return nonzero if threads are bound to the same
 * partition as the processor.
 *      Called infrequently.  Keep this simple.
 *      Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_partition(cpu_t *cp, int threadlistsafe)
{
        return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
}

/*
 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
 * threads to other CPUs.
 */
void
disp_cpu_inactive(cpu_t *cp)
{
        kthread_t       *tp;
        disp_t          *dp = cp->cpu_disp;
        dispq_t         *dq;
        pri_t           pri;
        int             wasonq;

        disp_lock_enter(&dp->disp_lock);
        while ((pri = dp->disp_max_unbound_pri) != -1) {
                dq = &dp->disp_q[pri];
                tp = dq->dq_first;

                /*
                 * Skip over bound threads.
                 */
                while (tp != NULL && tp->t_bound_cpu != NULL) {
                        tp = tp->t_link;
                }

                if (tp == NULL) {
                        /* disp_max_unbound_pri must be inaccurate, so fix it */
                        disp_fix_unbound_pri(dp, pri);
                        continue;
                }

                wasonq = dispdeq(tp);           /* drops disp_lock */
                ASSERT(wasonq);
                ASSERT(tp->t_weakbound_cpu == NULL);

                setbackdq(tp);
                /*
                 * Called from cpu_offline:
                 *
                 * cp has already been removed from the list of active cpus
                 * and tp->t_cpu has been changed so there is no risk of
                 * tp ending up back on cp.
                 *
                 * Called from cpupart_move_cpu:
                 *
                 * The cpu has moved to a new cpupart.  Any threads that
                 * were on its dispatch queues before the move remain
                 * in the old partition and can't run in the new partition.
                 */
                ASSERT(tp->t_cpu != cp);
                thread_unlock(tp);

                disp_lock_enter(&dp->disp_lock);
        }
        disp_lock_exit(&dp->disp_lock);
}

/*
 * disp_lowpri_cpu - find CPU running the lowest priority thread.
 *      The hint passed in is used as a starting point so we don't favor
 *      CPU 0 or any other CPU.  The caller should pass in the most recently
 *      used CPU for the thread.
 *
 *      The lgroup and priority are used to determine the best CPU to run on
 *      in a NUMA machine.
 *      The lgroup specifies which CPUs are closest while
 *      the thread priority will indicate whether the thread will actually run
 *      there.  To pick the best CPU, the CPUs inside and outside of the given
 *      lgroup which are running the lowest priority threads are found.  The
 *      remote CPU is chosen only if the thread will not run locally on a CPU
 *      within the lgroup, but will run on the remote CPU.  If the thread
 *      cannot immediately run on any CPU, the best local CPU will be chosen.
 *
 *      The lpl specified also identifies the cpu partition from which
 *      disp_lowpri_cpu should select a CPU.
 *
 *      curcpu is used to indicate that disp_lowpri_cpu is being called on
 *      behalf of the current thread.  (curthread is looking for a new CPU.)
 *      In this case, cpu_dispatch_pri for this thread's CPU should be
 *      ignored.
 *
 *      If a CPU is the target of an offline request then try to avoid it.
 *
 *      This function must be called at either high SPL, or with preemption
 *      disabled, so that the "hint" CPU cannot be removed from the online
 *      CPU list while we are traversing it.
 */
cpu_t *
disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
{
        cpu_t   *bestcpu;
        cpu_t   *besthomecpu;
        cpu_t   *cp, *cpstart;

        pri_t   bestpri;
        pri_t   cpupri;

        klgrpset_t      done;
        klgrpset_t      cur_set;

        lpl_t           *lpl_iter, *lpl_leaf;
        int             i;

        /*
         * Scan for a CPU currently running the lowest priority thread.
         * Cannot get cpu_lock here because it is adaptive.
         * We do not require lock on CPU list.
         */
        ASSERT(hint != NULL);
        ASSERT(lpl != NULL);
        ASSERT(lpl->lpl_ncpu > 0);

        /*
         * First examine local CPUs.  Note that it's possible the hint CPU
         * passed in is remote to the specified home lgroup.  If our priority
         * isn't high enough to let us run immediately at home, we then
         * examine CPUs remote to our home lgroup.
         * We would like to give preference to CPUs closest to "home".
         * If we can't find a CPU where we'll run at a given level
         * of locality, we expand our search to include the next level.
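         *
         * Each candidate CPU is "priced" below as the highest of its
         * current dispatch priority, its queue's disp_maxrunpri and its
         * cpu_chosen_level.  An idling CPU is returned immediately; a
         * non-idle CPU is acceptable only if tpri beats that price.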
         */
        bestcpu = besthomecpu = NULL;
        klgrpset_clear(done);
        /* start with lpl we were passed */

        lpl_iter = lpl;

        do {

                bestpri = SHRT_MAX;
                klgrpset_clear(cur_set);

                for (i = 0; i < lpl_iter->lpl_nrset; i++) {
                        lpl_leaf = lpl_iter->lpl_rset[i];
                        if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
                                continue;

                        klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);

                        if (hint->cpu_lpl == lpl_leaf)
                                cp = cpstart = hint;
                        else
                                cp = cpstart = lpl_leaf->lpl_cpus;

                        do {

                                if (cp == curcpu)
                                        cpupri = -1;
                                else if (cp == cpu_inmotion)
                                        cpupri = SHRT_MAX;
                                else
                                        cpupri = cp->cpu_dispatch_pri;

                                if (cp->cpu_disp->disp_maxrunpri > cpupri)
                                        cpupri = cp->cpu_disp->disp_maxrunpri;
                                if (cp->cpu_chosen_level > cpupri)
                                        cpupri = cp->cpu_chosen_level;
                                if (cpupri < bestpri) {
                                        if (CPU_IDLING(cpupri)) {
                                                ASSERT((cp->cpu_flags &
                                                    CPU_QUIESCED) == 0);
                                                return (cp);
                                        }
                                        bestcpu = cp;
                                        bestpri = cpupri;
                                }
                        } while ((cp = cp->cpu_next_lpl) != cpstart);
                }

                if (bestcpu && (tpri > bestpri)) {
                        ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
                        return (bestcpu);
                }
                if (besthomecpu == NULL)
                        besthomecpu = bestcpu;
                /*
                 * Add the lgrps we just considered to the "done" set
                 */
                klgrpset_or(done, cur_set);

        } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);

        /*
         * The specified priority isn't high enough to run immediately
         * anywhere, so just return the best CPU from the home lgroup.
         */
        ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
        return (besthomecpu);
}

/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor-specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
}

/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}

/*
 * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
 *      - t->t_cpu is not in this thread's assigned lgrp
 *      - the time since the thread last came off t->t_cpu exceeds the
 *        rechoose time for this cpu (ignore this if t is curthread in
 *        which case it's on CPU and t->t_disp_time is inaccurate)
 *      - t->t_cpu is presently the target of an offline or partition move
 *        request
 */
static cpu_t *
cpu_choose(kthread_t *t, pri_t tpri)
{
        ASSERT(tpri < kpqpri);

        if ((((lbolt - t->t_disp_time) > t->t_cpu->cpu_rechoose) &&
            t != curthread) || t->t_cpu == cpu_inmotion) {
                return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
        }

        /*
         * Take a trip through disp_lowpri_cpu() if the thread was
         * running outside its home lgroup.
         */
        if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
            t->t_cpu->cpu_lpl->lpl_lgrpid)) {
                return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
                    (t == curthread) ? t->t_cpu : NULL));
        }
        return (t->t_cpu);
}
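/*
 * Illustration of the rechoose test in cpu_choose() above (the numbers are
 * only an example, not defaults): if a thread's effective cpu_rechoose is
 * 3 ticks and the thread (not curthread) last came off t->t_cpu 5 ticks
 * ago, then lbolt - t_disp_time > cpu_rechoose, its cache on that CPU is
 * assumed cold, and cpu_choose() consults disp_lowpri_cpu() for a better
 * CPU instead of simply returning t->t_cpu.
 */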