/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/


#pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysinfo.h>
#include <sys/var.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/inline.h>
#include <sys/disp.h>
#include <sys/class.h>
#include <sys/bitmap.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/chip.h>
#include <sys/schedctl.h>
#include <sys/atomic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>

#include <vm/as.h>

#define	BOUND_CPU	0x1
#define	BOUND_PARTITION	0x2
#define	BOUND_INTR	0x4

/* Dispatch queue allocation structure and functions */
struct disp_queue_info {
	disp_t	*dp;
	dispq_t	*olddispq;
	dispq_t	*newdispq;
	ulong_t	*olddqactmap;
	ulong_t	*newdqactmap;
	int	oldnglobpris;
};
static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void	disp_dq_free(struct disp_queue_info *dptr);

/* platform-specific routine to call when processor is idle */
static void	generic_idle_cpu();
void		(*idle_cpu)() = generic_idle_cpu;

/* routines invoked when a CPU enters/exits the idle loop */
static void	idle_enter();
static void	idle_exit();

/* platform-specific routine to call when thread is enqueued */
static void	generic_enq_thread(cpu_t *, int);
void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;

pri_t	kpreemptpri;		/* priority where kernel preemption applies */
pri_t	upreemptpri = 0;	/* priority where normal preemption applies */
pri_t	intr_pri;		/* interrupt thread priority base level */

#define	KPQPRI	-1		/* pri where cpu affinity is dropped for kp queue */
pri_t	kpqpri = KPQPRI;	/* can be set in /etc/system */
disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
int	nswapped;		/* total number of swapped threads */
void	disp_swapped_enq(kthread_t *tp);
static void	disp_swapped_setrun(kthread_t *tp);
static void	cpu_resched(cpu_t *cp, pri_t tpri);

/*
 * If this is set, only interrupt threads will cause kernel preemptions.
 * This is done by changing the value of kpreemptpri.  kpreemptpri
 * will either be the max sysclass pri + 1 or the min interrupt pri.
 */
int	only_intr_kpreempt;

extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
static void setkpdq(kthread_t *tp, int borf);
#define	SETKP_BACK	0
#define	SETKP_FRONT	1
/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.  The interval is in hertz.
 *
 * The platform may define a per physical processor adjustment of
 * this parameter. For efficiency, the effective rechoose interval
 * (rechoose_interval + per chip adjustment) is maintained in the
 * cpu structures. See cpu_choose()
 */
int	rechoose_interval = RECHOOSE_INTERVAL;

static cpu_t	*cpu_choose(kthread_t *, pri_t);

id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */

disp_lock_t	transition_lock;	/* lock on transitioning threads */
disp_lock_t	stop_lock;		/* lock on stopped threads */

static void	cpu_dispqalloc(int numpris);

static kthread_t	*disp_getwork(cpu_t *to);
static kthread_t	*disp_getbest(disp_t *from);
static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);

void	swtch_to(kthread_t *);

/*
 * dispatcher and scheduler initialization
 */

/*
 * disp_setup - Common code to calculate and allocate dispatcher
 *		variables and structures based on the maximum priority.
 */
static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
{
	pri_t	newnglobpris;

	ASSERT(MUTEX_HELD(&cpu_lock));

	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;

	if (newnglobpris > oldnglobpris) {
		/*
		 * Allocate new kp queues for each CPU partition.
		 */
		cpupart_kpqalloc(newnglobpris);

		/*
		 * Allocate new dispatch queues for each CPU.
		 */
		cpu_dispqalloc(newnglobpris);

		/*
		 * compute new interrupt thread base priority
		 */
		intr_pri = maxglobpri;
		if (only_intr_kpreempt) {
			kpreemptpri = intr_pri + 1;
			if (kpqpri == KPQPRI)
				kpqpri = kpreemptpri;
		}
		v.v_nglobpris = newnglobpris;
	}
}

/*
 * dispinit - Called to initialize all loaded classes and the
 *	      dispatcher framework.
 */
void
dispinit(void)
{
	id_t	cid;
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	maxglobpri = -1;

	/*
	 * Initialize transition lock, which will always be set.
	 */
	DISP_LOCK_INIT(&transition_lock);
	disp_lock_enter_high(&transition_lock);
	DISP_LOCK_INIT(&stop_lock);

	mutex_enter(&cpu_lock);
	CPU->cpu_disp->disp_maxrunpri = -1;
	CPU->cpu_disp->disp_max_unbound_pri = -1;
	/*
	 * Initialize the default CPU partition.
	 */
	cpupart_initialize_default();
	/*
	 * Call the class specific initialization functions for
	 * all pre-installed schedulers.
	 *
	 * We pass the size of a class specific parameter
	 * buffer to each of the initialization functions
	 * to try to catch problems with backward compatibility
	 * of class modules.
	 *
	 * For example a new class module running on an old system
	 * which didn't provide sufficiently large parameter buffers
	 * would be bad news. Class initialization modules can check for
	 * this and take action if they detect a problem.
	 */

	for (cid = 0; cid < nclass; cid++) {
		sclass_t	*sc;

		sc = &sclass[cid];
		if (SCHED_INSTALLED(sc)) {
			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
			    &sc->cl_funcs);
			if (cl_maxglobpri > maxglobpri)
				maxglobpri = cl_maxglobpri;
		}
	}
	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
	if (kpqpri == KPQPRI)
		kpqpri = kpreemptpri;

	ASSERT(maxglobpri >= 0);
	disp_setup(maxglobpri, 0);

	mutex_exit(&cpu_lock);

	/*
	 * Get the default class ID; this may be later modified via
	 * dispadmin(1M).  This will load the class (normally TS) and that will
	 * call disp_add(), which is why we had to drop cpu_lock first.
	 */
	if (getcid(defaultclass, &defaultcid) != 0) {
		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
		    defaultclass);
	}
}

/*
 * disp_add - Called with class pointer to initialize the dispatcher
 *	      for a newly loaded class.
 */
void
disp_add(sclass_t *clp)
{
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	mutex_enter(&cpu_lock);
	/*
	 * Initialize the scheduler class.
	 */
	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
	if (cl_maxglobpri > maxglobpri)
		maxglobpri = cl_maxglobpri;

	/*
	 * Save old queue information.  Since we're initializing a
	 * new scheduling class which has just been loaded, then
	 * the size of the dispq may have changed.  We need to handle
	 * that here.
	 */
	disp_setup(maxglobpri, v.v_nglobpris);

	mutex_exit(&cpu_lock);
}


/*
 * For each CPU, allocate new dispatch queues
 * with the stated number of priorities.
 */
static void
cpu_dispqalloc(int numpris)
{
	cpu_t	*cpup;
	struct disp_queue_info	*disp_mem;
	int i, num;

	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_mem = kmem_zalloc(NCPU *
	    sizeof (struct disp_queue_info), KM_SLEEP);

	/*
	 * This routine must allocate all of the memory before stopping
	 * the cpus because it must not sleep in kmem_alloc while the
	 * CPUs are stopped.  Locks they hold will not be freed until they
	 * are restarted.
	 */
	i = 0;
	cpup = cpu_list;
	do {
		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
		i++;
		cpup = cpup->cpu_next;
	} while (cpup != cpu_list);
	num = i;

	pause_cpus(NULL);
	for (i = 0; i < num; i++)
		disp_dq_assign(&disp_mem[i], numpris);
	start_cpus();

	/*
	 * I must free all of the memory after starting the cpus because
	 * I can not risk sleeping in kmem_free while the cpus are stopped.
	 */
	for (i = 0; i < num; i++)
		disp_dq_free(&disp_mem[i]);

	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
}
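
/*
 * Illustrative sizing example (not part of the original logic; assumes
 * BT_NBIPUL == 64, i.e. a 64-bit kernel): disp_dq_alloc() below gives each
 * disp_t one dispq_t per global priority plus an "active queue" bitmap with
 * one bit per priority.  For 170 global priorities this is roughly:
 *
 *	dispq_t  disp_q[170];			170 list heads, one per pri
 *	ulong_t  disp_qactmap[170 / 64 + 1];	3 words, one bit per pri
 *
 * BT_SET(disp_qactmap, pri) marks a priority level non-empty, and
 * bt_gethighbit() finds the highest non-empty level for disp_maxrunpri.
 */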

static void
disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
{
	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dptr->dp = dp;
}

static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
	disp_t	*dp;

	dp = dptr->dp;
	dptr->olddispq = dp->disp_q;
	dptr->olddqactmap = dp->disp_qactmap;
	dptr->oldnglobpris = dp->disp_npri;

	ASSERT(dptr->oldnglobpris < numpris);

	if (dptr->olddispq != NULL) {
		/*
		 * Use kcopy because bcopy is platform-specific
		 * and could block while we might have paused the cpus.
		 */
		(void) kcopy(dptr->olddispq, dptr->newdispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
		    sizeof (long));
	}
	dp->disp_q = dptr->newdispq;
	dp->disp_qactmap = dptr->newdqactmap;
	dp->disp_q_limit = &dptr->newdispq[numpris];
	dp->disp_npri = numpris;
}

static void
disp_dq_free(struct disp_queue_info *dptr)
{
	if (dptr->olddispq != NULL)
		kmem_free(dptr->olddispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
	if (dptr->olddqactmap != NULL)
		kmem_free(dptr->olddqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
}

/*
 * For a newly created CPU, initialize the dispatch queue.
 * This is called before the CPU is known through cpu[] or on any lists.
 */
void
disp_cpu_init(cpu_t *cp)
{
	disp_t	*dp;
	dispq_t	*newdispq;
	ulong_t	*newdqactmap;

	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */

	if (cp == cpu0_disp.disp_cpu)
		dp = &cpu0_disp;
	else
		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
	bzero(dp, sizeof (disp_t));
	cp->cpu_disp = dp;
	dp->disp_cpu = cp;
	dp->disp_maxrunpri = -1;
	dp->disp_max_unbound_pri = -1;
	DISP_LOCK_INIT(&cp->cpu_thread_lock);
	/*
	 * Allocate memory for the dispatcher queue headers
	 * and the active queue bitmap.
	 */
	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dp->disp_q = newdispq;
	dp->disp_qactmap = newdqactmap;
	dp->disp_q_limit = &newdispq[v.v_nglobpris];
	dp->disp_npri = v.v_nglobpris;
}

void
disp_cpu_fini(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_kp_free(cp->cpu_disp);
	if (cp->cpu_disp != &cpu0_disp)
		kmem_free(cp->cpu_disp, sizeof (disp_t));
}

/*
 * Allocate new, larger kpreempt dispatch queue to replace the old one.
 */
void
disp_kp_alloc(disp_t *dq, pri_t npri)
{
	struct disp_queue_info	mem_info;

	if (npri > dq->disp_npri) {
		/*
		 * Allocate memory for the new array.
		 */
		disp_dq_alloc(&mem_info, npri, dq);

		/*
		 * We need to copy the old structures to the new
		 * and free the old.
		 */
		disp_dq_assign(&mem_info, npri);
		disp_dq_free(&mem_info);
	}
}

/*
 * Free dispatch queue.
 * Used for the kpreempt queues for a removed CPU partition and
 * for the per-CPU queues of deleted CPUs.
 */
void
disp_kp_free(disp_t *dq)
{
	struct disp_queue_info	mem_info;

	mem_info.olddispq = dq->disp_q;
	mem_info.olddqactmap = dq->disp_qactmap;
	mem_info.oldnglobpris = dq->disp_npri;
	disp_dq_free(&mem_info);
}

/*
 * End dispatcher and scheduler initialization.
 */

/*
 * See if there's anything to do other than remain idle.
 * Return non-zero if there is.
 *
 * This function must be called with high spl, or with
 * kernel preemption disabled to prevent the partition's
 * active cpu list from changing while being traversed.
 */
int
disp_anywork(void)
{
	cpu_t	*cp = CPU;
	cpu_t	*ocp;

	if (cp->cpu_disp->disp_nrunnable != 0)
		return (1);

	if (!(cp->cpu_flags & CPU_OFFLINE)) {
		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
			return (1);

		/*
		 * Work can be taken from another CPU if:
		 *	- There is unbound work on the run queue
		 *	- That work isn't a thread undergoing a
		 *	  context switch on an otherwise empty queue.
		 *	- The CPU isn't running the idle loop.
		 */
		for (ocp = cp->cpu_next_part; ocp != cp;
		    ocp = ocp->cpu_next_part) {
			ASSERT(CPU_ACTIVE(ocp));

			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
			    ocp->cpu_disp->disp_nrunnable == 1) &&
			    ocp->cpu_dispatch_pri != -1)
				return (1);
		}
	}
	return (0);
}

/*
 * Called when CPU enters the idle loop
 */
static void
idle_enter()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Called when CPU exits the idle loop
 */
static void
idle_exit()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Idle loop.
 */
void
idle()
{
	struct cpu	*cp = CPU;		/* pointer to this CPU */
	kthread_t	*t;			/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			idle_exit();
			swtch();
		} else {
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			idle_exit();
			restore_mstate(t);
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}


/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues.  If panicking, turn
 * preemption off.
 */
void
preempt()
{
	kthread_t	*t = curthread;
	klwp_t		*lwp = ttolwp(curthread);

	if (panicstr)
		return;

	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

	thread_lock(t);

	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
		/*
		 * This thread has already been chosen to be run on
		 * another CPU.  Clear kprunrun on this CPU since we're
		 * already headed for swtch().
		 */
		CPU->cpu_kprunrun = 0;
		thread_unlock_nopreempt(t);
		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
	} else {
		if (lwp != NULL)
			lwp->lwp_ru.nivcsw++;
		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
		THREAD_TRANSITION(t);
		CL_PREEMPT(t);
		DTRACE_SCHED(preempt);
		thread_unlock_nopreempt(t);

		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

		swtch();		/* clears CPU->cpu_runrun via disp() */
	}
}

extern kthread_t *thread_unpin();

/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			restore_mstate(tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL) {
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		restore_mstate(tp);
		return (tp);
	}

	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.  If nrunnable != 0, we just took
		 * the last runnable thread off the highest queue, so
		 * recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	restore_mstate(tp);
	return (tp);
}

/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Setup and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			if (t == cp->cpu_idle_thread) {
				CHIP_NRUNNING(cp->cpu_chip, 1);
			} else if (next == cp->cpu_idle_thread) {
				CHIP_NRUNNING(cp->cpu_chip, -1);
			}

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			cp->cpu_last_swtch = t->t_disp_time = lbolt;
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}

/*
 * swtch_from_zombie()
 *	Special case of swtch(), which allows checks for TS_ZOMB to be
 *	eliminated from normal resume.
 *	Find best runnable thread and run it.
 *	Called with the current thread zombied.
 *	Zombies cannot migrate, so CPU references are safe.
 */
void
swtch_from_zombie()
{
	kthread_t	*next;
	cpu_t		*cpu = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	ASSERT(curthread->t_state == TS_ZOMB);

	next = disp();			/* returns with spl high */
	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
	ASSERT(next != curthread);
	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	if (next == cpu->cpu_idle_thread)
		CHIP_NRUNNING(cpu->cpu_chip, -1);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume_from_zombie(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we certainly will not
	 * return here
	 */
}

#if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
static int
thread_on_queue(kthread_t *tp)
{
	cpu_t	*cp;
	cpu_t	*self;
	disp_t	*dp;

	self = CPU;
	cp = self->cpu_next_onln;
	dp = cp->cpu_disp;
	for (;;) {
		dispq_t		*dq;
		dispq_t		*eq;

		disp_lock_enter_high(&dp->disp_lock);
		for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
			kthread_t	*rp;

			ASSERT(dq->dq_last == NULL ||
			    dq->dq_last->t_link == NULL);
			for (rp = dq->dq_first; rp; rp = rp->t_link)
				if (tp == rp) {
					disp_lock_exit_high(&dp->disp_lock);
					return (1);
				}
		}
		disp_lock_exit_high(&dp->disp_lock);
		if (cp == NULL)
			break;
		if (cp == self) {
			/* finally, check the partition-wide kp queue */
			dp = &self->cpu_part->cp_kp_queue;
			cp = NULL;
		} else {
			cp = cp->cpu_next_onln;
			dp = cp->cpu_disp;
		}
	}
	return (0);
}	/* end of thread_on_queue */
#else

#define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */

#endif  /* DEBUG */

/*
 * like swtch(), but switch to a specified thread taken from another CPU.
 *	called with spl high..
 */
void
swtch_to(kthread_t *next)
{
	cpu_t	*cp = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	/*
	 * Update context switch statistics.
	 */
	CPU_STATS_ADDQ(cp, sys, pswitch, 1);

	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	if (curthread == cp->cpu_idle_thread)
		CHIP_NRUNNING(cp->cpu_chip, 1);

	/* OK to steal anything left on run queue */
	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

	/* record last execution time */
	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we may not
	 * return here
	 */
}



#define	CPU_IDLING(pri)	((pri) == -1)

static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
	int	call_poke_cpu = 0;
	pri_t   cpupri = cp->cpu_dispatch_pri;

	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
			cp->cpu_runrun = 1;
			aston(cp->cpu_dispthread);
			if (tpri < kpreemptpri && cp != CPU)
				call_poke_cpu = 1;
		}
		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
			cp->cpu_kprunrun = 1;
			if (cp != CPU)
				call_poke_cpu = 1;
		}
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	if (call_poke_cpu)
		poke_cpu(cp->cpu_id);
}
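
/*
 * Worked example of the thresholds used in cpu_resched() above (the numbers
 * are illustrative; the real values come from the loaded scheduling classes):
 * with upreemptpri == 0 and kpreemptpri == v.v_maxsyspri + 1, a newly
 * runnable thread at tpri 60 targeting a CPU currently running at pri 30
 * sets cpu_runrun (user-level preemption) and, if the target is a remote
 * CPU, pokes it because tpri < kpreemptpri.  A thread whose tpri is at or
 * above kpreemptpri (e.g. a real-time thread above the system class)
 * additionally sets cpu_kprunrun, requesting a kernel preemption.
 */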

/*
 * Routine used by setbackdq() to balance load across the physical
 * processors.  Returns a CPU of a less heavily loaded chip in the lgroup
 * if balancing is necessary, or the "hint" CPU if it's not.
 *
 *	- tp is the thread being enqueued
 *	- cp is a hint CPU (chosen by cpu_choose()).
 *	- curchip (if not NULL) is the chip on which the current thread
 *	  is running.
 *
 * The thread lock for "tp" must be held while calling this routine.
 */
static cpu_t *
chip_balance(kthread_t *tp, cpu_t *cp, chip_t *curchip)
{
	int	chp_nrun, ochp_nrun;
	chip_t	*chp, *nchp;

	chp = cp->cpu_chip;
	chp_nrun = chp->chip_nrunning;

	if (chp == curchip)
		chp_nrun--;	/* Ignore curthread */

	/*
	 * If this chip isn't at all idle, then let
	 * run queue balancing do the work.
	 */
	if (chp_nrun == chp->chip_ncpu)
		return (cp);

	nchp = chp->chip_balance;
	do {
		if (nchp == chp ||
		    !CHIP_IN_CPUPART(nchp, tp->t_cpupart))
			continue;

		ochp_nrun = nchp->chip_nrunning;

		/*
		 * If the other chip is running fewer threads,
		 * or if it's running the same number of threads, but
		 * has more online logical CPUs, then choose to balance.
		 */
		if (chp_nrun > ochp_nrun ||
		    (chp_nrun == ochp_nrun &&
		    nchp->chip_ncpu > chp->chip_ncpu)) {
			cp = nchp->chip_cpus;
			nchp->chip_cpus = cp->cpu_next_chip;

			/*
			 * Find a CPU on the chip in the correct
			 * partition.  We know at least one exists
			 * because of the CHIP_IN_CPUPART() check above.
			 */
			while (cp->cpu_part != tp->t_cpupart)
				cp = cp->cpu_next_chip;
		}
		chp->chip_balance = nchp->chip_next_lgrp;
		break;
	} while ((nchp = nchp->chip_next_lgrp) != chp->chip_balance);

	ASSERT(CHIP_IN_CPUPART(cp->cpu_chip, tp->t_cpupart));
	return (cp);
}

/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
 * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
 * try to keep runqs perfectly balanced regardless of the thread priority.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
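
/*
 * Illustrative sketch of the balancing rule as applied in setbackdq() below
 * (values are examples only): for a priority 60 thread (>= RUNQ_MATCH_PRI)
 * without TS_RUNQMATCH, the chosen CPU keeps the thread as long as its
 * queue at that priority is no more than RUNQ_MAX_DIFF entries longer than
 * the neighbour's:
 *
 *	qlen = RUNQ_LEN(cp, 60) - RUNQ_MAX_DIFF;
 *	if (qlen > 0 && RUNQ_LEN(np, 60) < qlen)
 *		cp = np;		// neighbour is clearly shorter
 *
 * For priorities below RUNQ_MATCH_PRI, or when TS_RUNQMATCH is set, the
 * RUNQ_MAX_DIFF slack is not subtracted, so queue lengths are compared
 * directly.
 */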

/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setbackdq(kthread_t *tp)
{
	dispq_t	*dq;
	disp_t		*dp;
	chip_t		*curchip = NULL;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);

	if (tp->t_waitrq == 0) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	tpri = DISP_PRIO(tp);
	if (tp == curthread) {
		curchip = CPU->cpu_chip;
	}

	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_BACK);
			return;
		}
		/*
		 * Let cpu_choose suggest a CPU.
		 */
		cp = cpu_choose(tp, tpri);

		if (tp->t_cpupart == cp->cpu_part) {
			int	qlen;

			/*
			 * Select another CPU if we need
			 * to do some load balancing across the
			 * physical processors.
			 */
			if (CHIP_SHOULD_BALANCE(cp->cpu_chip))
				cp = chip_balance(tp, cp, curchip);

			/*
			 * Balance across the run queues
			 */
			qlen = RUNQ_LEN(cp, tpri);
			if (tpri >= RUNQ_MATCH_PRI &&
			    !(tp->t_schedflag & TS_RUNQMATCH))
				qlen -= RUNQ_MAX_DIFF;
			if (qlen > 0) {
				cpu_t	*np;

				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
					np = cp->cpu_next_part;
				else {
					if ((np = cp->cpu_next_lpl) == cp)
						np = cp->cpu_next_part;
				}
				if (RUNQ_LEN(np, tpri) < qlen)
					cp = np;
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		bound = 0;
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
		bound = 1;
	}
	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
	    tpri, cp, tp);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_first != NULL);
		dq->dq_last->t_link = tp;
		dq->dq_last = tp;
	} else {
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setfrontdq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);

	if (tp->t_waitrq == 0) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_FRONT);
			return;
		}
		cp = tp->t_cpu;
		if (tp->t_cpupart == cp->cpu_part) {
			/*
			 * If we are of higher or equal priority than
			 * the highest priority runnable thread of
			 * the current CPU, just pick this CPU.  Otherwise
			 * let cpu_choose() select the CPU.  If this cpu
			 * is the target of an offline request then do not
			 * pick it - a thread_nomigrate() on the in motion
			 * cpu relies on this when it forces a preempt.
			 */
			if (tpri < cp->cpu_disp->disp_maxrunpri ||
			    cp == cpu_inmotion)
				cp = cpu_choose(tp, tpri);
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		bound = 0;
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
		bound = 1;
	}
	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
	tp->t_disp_queue = dp;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_last != NULL);
		tp->t_link = dq->dq_first;
		dq->dq_first = tp;
	} else {
		ASSERT(dq->dq_last == NULL);
		ASSERT(dq->dq_first == NULL);
		tp->t_link = NULL;
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

/*
 * Put a high-priority unbound thread on the kp queue
 */
static void
setkpdq(kthread_t *tp, int borf)
{
	dispq_t	*dq;
	disp_t	*dp;
	cpu_t	*cp;
	pri_t	tpri;

	tpri = DISP_PRIO(tp);

	dp = &tp->t_cpupart->cp_kp_queue;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	dp->disp_nrunnable++;
	dq = &dp->disp_q[tpri];

	if (dq->dq_sruncnt++ != 0) {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first != NULL);
			tp->t_link = NULL;
			dq->dq_last->t_link = tp;
			dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last != NULL);
			tp->t_link = dq->dq_first;
			dq->dq_first = tp;
		}
	} else {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first == NULL);
			ASSERT(dq->dq_last == NULL);
			dq->dq_first = dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last == NULL);
			ASSERT(dq->dq_first == NULL);
			tp->t_link = NULL;
			dq->dq_first = dq->dq_last = tp;
		}
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_max_unbound_pri)
			dp->disp_max_unbound_pri = tpri;
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
		}
	}

	cp = tp->t_cpu;
	if (tp->t_cpupart != cp->cpu_part) {
		/* migrate to a cpu in the new partition */
		cp = tp->t_cpupart->cp_cpulist;
	}
	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	if (cp->cpu_chosen_level < tpri)
		cp->cpu_chosen_level = tpri;
	cpu_resched(cp, tpri);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
	(*disp_enq_thread)(cp, 0);
}

/*
 * Remove a thread from the dispatcher queue if it is on it.
 * It is not an error if it is not found but we return whether
 * or not it was found in case the caller wants to check.
 */
int
dispdeq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	kthread_t	*rp;
	kthread_t	*trp;
	kthread_t	**ptp;
	int		tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_RUN)
		return (0);

	/*
	 * The thread is "swapped" or is on the swap queue and
	 * hence no longer on the run queue, so return true.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
		return (1);

	tpri = DISP_PRIO(tp);
	dp = tp->t_disp_queue;
	ASSERT(tpri < dp->disp_npri);
	dq = &dp->disp_q[tpri];
	ptp = &dq->dq_first;
	rp = *ptp;
	trp = NULL;

	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

	/*
	 * Search for thread in queue.
	 * Double links would simplify this at the expense of disp/setrun.
	 */
	while (rp != tp && rp != NULL) {
		trp = rp;
		ptp = &trp->t_link;
		rp = trp->t_link;
	}

	if (rp == NULL) {
		panic("dispdeq: thread not on queue");
	}

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	if ((*ptp = rp->t_link) == NULL)
		dq->dq_last = trp;

	dp->disp_nrunnable--;
	if (--dq->dq_sruncnt == 0) {
		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else if (tpri == dp->disp_maxrunpri) {
			int ipri;

			ipri = bt_gethighbit(dp->disp_qactmap,
			    dp->disp_maxrunpri >> BT_ULSHIFT);
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
			dp->disp_maxrunpri = ipri;
		}
	}
	tp->t_link = NULL;
	THREAD_TRANSITION(tp);		/* put in intermediate state */
	return (1);
}


/*
 * dq_sruninc and dq_srundec are public functions for
 * incrementing/decrementing the sruncnts when a thread on
 * a dispatcher queue is made schedulable/unschedulable by
 * resetting the TS_LOAD flag.
 *
 * The caller MUST have the thread lock and therefore the dispatcher
 * queue lock so that the operation which changes
 * the flag, the operation that checks the status of the thread to
 * determine if it's on a disp queue AND the call to this function
 * are one atomic operation with respect to interrupts.
 */

/*
 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 */
void
dq_sruninc(kthread_t *t)
{
	ASSERT(t->t_state == TS_RUN);
	ASSERT(t->t_schedflag & TS_LOAD);

	THREAD_TRANSITION(t);
	setfrontdq(t);
}

/*
 * See comment on calling conventions above.
 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 */
void
dq_srundec(kthread_t *t)
{
	ASSERT(t->t_schedflag & TS_LOAD);

	(void) dispdeq(t);
	disp_swapped_enq(t);
}

/*
 * Change the dispatcher lock of thread to the "swapped_lock"
 * and return with thread lock still held.
 *
 * Called with thread_lock held, in transition state, and at high spl.
 */
void
disp_swapped_enq(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT(tp->t_schedflag & TS_LOAD);

	switch (tp->t_state) {
	case TS_RUN:
		disp_lock_enter_high(&swapped_lock);
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	case TS_ONPROC:
		disp_lock_enter_high(&swapped_lock);
		THREAD_TRANSITION(tp);
		wake_sched_sec = 1;		/* tell clock to wake sched */
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	default:
		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
	}
}
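
/*
 * Illustration of the "swapped" test used by setbackdq(), setfrontdq() and
 * dispdeq() above:
 *
 *	(tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD
 *
 *	TS_LOAD	TS_ON_SWAPQ	test result
 *	  clear	   clear	true	(not loaded)
 *	  clear	   set		true	(not loaded, on swap queue)
 *	  set	   set		true	(loaded, on swap queue)
 *	  set	   clear	false	(loaded, not on swap queue)
 *
 * Only a loaded thread that is not on the swap queue is placed on a
 * dispatch queue; setbackdq()/setfrontdq() hand every other case to
 * disp_swapped_setrun() below.
 */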

/*
 * This routine is called by setbackdq/setfrontdq if the thread is
 * not loaded or loaded and on the swap queue.
 *
 * Thread state TS_SLEEP implies that a swapped thread
 * has been woken up and needs to be swapped in by the swapper.
 *
 * Thread state TS_RUN implies that the priority of a swapped
 * thread is being increased by its scheduling class (e.g. ts_update).
 */
static void
disp_swapped_setrun(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);

	switch (tp->t_state) {
	case TS_SLEEP:
		disp_lock_enter_high(&swapped_lock);
		/*
		 * Wakeup sched immediately (i.e., next tick) if the
		 * thread priority is above maxclsyspri.
		 */
		if (DISP_PRIO(tp) > maxclsyspri)
			wake_sched = 1;
		else
			wake_sched_sec = 1;
		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
		break;
	case TS_RUN:				/* called from ts_update */
		break;
	default:
		panic("disp_swapped_setrun: tp: %p bad t_state", tp);
	}
}


/*
 * Make a thread give up its processor.  Find the processor on
 * which this thread is executing, and have that processor
 * preempt.
 */
void
cpu_surrender(kthread_t *tp)
{
	cpu_t	*cpup;
	int	max_pri;
	int	max_run_pri;
	klwp_t	*lwp;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_ONPROC)
		return;
	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
	if (max_pri < max_run_pri)
		max_pri = max_run_pri;

	cpup->cpu_runrun = 1;
	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
		cpup->cpu_kprunrun = 1;
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	DTRACE_SCHED1(surrender, kthread_t *, tp);

	/*
	 * Make the target thread take an excursion through trap()
	 * to do preempt() (unless we're already in trap or post_syscall,
	 * calling cpu_surrender via CL_TRAPRET).
	 */
	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
	    lwp->lwp_state != LWP_USER) {
		aston(tp);
		if (cpup != CPU)
			poke_cpu(cpup->cpu_id);
	}
	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
	    "cpu_surrender:tid %p cpu %p", tp, cpup);
}


/*
 * Commit to and ratify a scheduling decision
 */
/*ARGSUSED*/
static kthread_t *
disp_ratify(kthread_t *tp, disp_t *kpq)
{
	pri_t	tpri, maxpri;
	pri_t	maxkpri;
	cpu_t	*cpup;

	ASSERT(tp != NULL);
	/*
	 * Commit to, then ratify scheduling decision
	 */
	cpup = CPU;
	if (cpup->cpu_runrun != 0)
		cpup->cpu_runrun = 0;
	if (cpup->cpu_kprunrun != 0)
		cpup->cpu_kprunrun = 0;
	if (cpup->cpu_chosen_level != -1)
		cpup->cpu_chosen_level = -1;
	membar_enter();
	tpri = DISP_PRIO(tp);
	maxpri = cpup->cpu_disp->disp_maxrunpri;
	maxkpri = kpq->disp_maxrunpri;
	if (maxpri < maxkpri)
		maxpri = maxkpri;
	if (tpri < maxpri) {
		/*
		 * should have done better
		 * put this one back and indicate to try again
		 */
		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
		thread_lock_high(tp);
		THREAD_TRANSITION(tp);
		setfrontdq(tp);
		thread_unlock_nopreempt(tp);

		tp = NULL;
	}
	return (tp);
}

/*
 * See if there is any work on the dispatcher queue for other CPUs.
 * If there is, dequeue the best thread and return.
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	pri_t		maxpri;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		hint, leafidx;

	maxpri = -1;
	tcp = NULL;

	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	kpreempt_disable();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU. This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	hint = leafidx = 0;

	/*
	 * This loop traverses the lpl hierarchy. Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around the partition if:
				 *
				 * - Something became runnable on the local
				 *   queue
				 *
				 * - We're at the broadest level of locality and
				 *   we happen across another idle CPU. At the
				 *   highest level of locality, all CPUs will
				 *   walk the partition's CPUs in the same
				 *   order, so we can end our stroll taking
				 *   comfort in knowing the other idle CPU is
				 *   already covering the next portion of the
				 *   list.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0)
					break;
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED)
						continue;
					else if (lpl->lpl_parent == NULL)
						break;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
				    CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					maxpri = pri;
					tcp = ocp;
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != hint);

		hint = leafidx = lpl->lpl_hint;
		if ((lpl = lpl->lpl_parent) != NULL)
			lpl_leaf = lpl->lpl_rset[hint];
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = (disp_getbest(tcp->cpu_disp));
		if (tp)
			return (disp_ratify(tp, kpq));
	}
	return (NULL);
}


/*
 * disp_fix_unbound_pri()
 *	Determines the maximum priority of unbound threads on the queue.
 *	The priority is kept for the queue, but is only increased, never
 *	reduced unless some CPU is looking for something on that queue.
 *
 *	The priority argument is the known upper limit.
 *
 *	Perhaps this should be kept accurately, but that probably means
 *	separate bitmaps for bound and unbound threads.  Since only idled
 *	CPUs will have to do this recalculation, it seems better this way.
 */
static void
disp_fix_unbound_pri(disp_t *dp, pri_t pri)
{
	kthread_t	*tp;
	dispq_t		*dq;
	ulong_t		*dqactmap = dp->disp_qactmap;
	ulong_t		mapword;
	int		wx;

	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));

	ASSERT(pri >= 0);			/* checked by caller */

	/*
	 * Start the search at the next lowest priority below the supplied
	 * priority.  This depends on the bitmap implementation.
	 */
	do {
		wx = pri >> BT_ULSHIFT;		/* index of word in map */

		/*
		 * Form mask for all lower priorities in the word.
		 */
		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);

		/*
		 * Get next lower active priority.
		 */
		if (mapword != 0) {
			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
		} else if (wx > 0) {
			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
			if (pri < 0)
				break;
		} else {
			pri = -1;
			break;
		}

		/*
		 * Search the queue for unbound, runnable threads.
		 */
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
			tp = tp->t_link;
		}

		/*
		 * If a thread was found, set the priority and return.
		 */
	} while (tp == NULL);

	/*
	 * pri holds the maximum unbound thread priority or -1.
	 */
	if (dp->disp_max_unbound_pri != pri)
		dp->disp_max_unbound_pri = pri;
}

/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
	disp_t *dp;
	pri_t tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * Don't do anything if the thread is not bound, or
	 * currently not runnable or swapped out.
	 */
	if (tp->t_bound_cpu == NULL ||
	    tp->t_state != TS_RUN ||
	    tp->t_schedflag & TS_ON_SWAPQ)
		return;

	tpri = DISP_PRIO(tp);
	dp = tp->t_bound_cpu->cpu_disp;
	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	if (tpri > dp->disp_max_unbound_pri)
		dp->disp_max_unbound_pri = tpri;
}
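
/*
 * Worked example of the bitmap search in disp_fix_unbound_pri() above,
 * assuming a 64-bit kernel (BT_ULSHIFT == 6, so 64 priorities per word):
 * starting from pri == 70, wx = 70 >> 6 = 1 and BT_BIW(70) - 1 masks off
 * bit 6 and above, leaving only priorities 64..69 of word 1.  If that word
 * has bits 2 and 5 set (priorities 66 and 69 active), highbit(mapword) == 6
 * and the next candidate priority is (1 << 6) + 6 - 1 == 69, i.e. the
 * highest active level below the supplied upper limit.
 */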
2022 */ 2023 static kthread_t * 2024 disp_getbest(disp_t *dp) 2025 { 2026 kthread_t *tp; 2027 dispq_t *dq; 2028 pri_t pri; 2029 cpu_t *cp; 2030 2031 disp_lock_enter(&dp->disp_lock); 2032 2033 /* 2034 * If there is nothing to run, or the CPU is in the middle of a 2035 * context switch of the only thread, return NULL. 2036 */ 2037 pri = dp->disp_max_unbound_pri; 2038 if (pri == -1 || 2039 (dp->disp_cpu != NULL && 2040 (dp->disp_cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL) && 2041 dp->disp_cpu->cpu_disp->disp_nrunnable == 1)) { 2042 disp_lock_exit_nopreempt(&dp->disp_lock); 2043 return (NULL); 2044 } 2045 2046 dq = &dp->disp_q[pri]; 2047 tp = dq->dq_first; 2048 2049 /* 2050 * Skip over bound threads. 2051 * Bound threads can be here even though disp_max_unbound_pri 2052 * indicated this level. Besides, it not always accurate because it 2053 * isn't reduced until another CPU looks for work. 2054 * Note that tp could be NULL right away due to this. 2055 */ 2056 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) { 2057 tp = tp->t_link; 2058 } 2059 2060 /* 2061 * If there were no unbound threads on this queue, find the queue 2062 * where they are and then return NULL so that other CPUs will be 2063 * considered. 2064 */ 2065 if (tp == NULL) { 2066 disp_fix_unbound_pri(dp, pri); 2067 disp_lock_exit_nopreempt(&dp->disp_lock); 2068 return (NULL); 2069 } 2070 2071 /* 2072 * Found a runnable, unbound thread, so remove it from queue. 2073 * dispdeq() requires that we have the thread locked, and we do, 2074 * by virtue of holding the dispatch queue lock. dispdeq() will 2075 * put the thread in transition state, thereby dropping the dispq 2076 * lock. 2077 */ 2078 #ifdef DEBUG 2079 { 2080 int thread_was_on_queue; 2081 2082 thread_was_on_queue = dispdeq(tp); /* drops disp_lock */ 2083 ASSERT(thread_was_on_queue); 2084 } 2085 #else /* DEBUG */ 2086 (void) dispdeq(tp); /* drops disp_lock */ 2087 #endif /* DEBUG */ 2088 2089 tp->t_schedflag |= TS_DONT_SWAP; 2090 2091 /* 2092 * Setup thread to run on the current CPU. 2093 */ 2094 cp = CPU; 2095 2096 tp->t_disp_queue = cp->cpu_disp; 2097 2098 cp->cpu_dispthread = tp; /* protected by spl only */ 2099 cp->cpu_dispatch_pri = pri; 2100 ASSERT(pri == DISP_PRIO(tp)); 2101 2102 thread_onproc(tp, cp); /* set t_state to TS_ONPROC */ 2103 2104 /* 2105 * Return with spl high so that swtch() won't need to raise it. 2106 * The disp_lock was dropped by dispdeq(). 2107 */ 2108 2109 return (tp); 2110 } 2111 2112 /* 2113 * disp_bound_common() - common routine for higher level functions 2114 * that check for bound threads under certain conditions. 2115 * If 'threadlistsafe' is set then there is no need to acquire 2116 * pidlock to stop the thread list from changing (eg, if 2117 * disp_bound_* is called with cpus paused). 2118 */ 2119 static int 2120 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag) 2121 { 2122 int found = 0; 2123 kthread_t *tp; 2124 2125 ASSERT(flag); 2126 2127 if (!threadlistsafe) 2128 mutex_enter(&pidlock); 2129 tp = curthread; /* faster than allthreads */ 2130 do { 2131 if (tp->t_state != TS_FREE) { 2132 /* 2133 * If an interrupt thread is busy, but the 2134 * caller doesn't care (i.e. BOUND_INTR is off), 2135 * then just ignore it and continue through. 2136 */ 2137 if ((tp->t_flag & T_INTR_THREAD) && 2138 !(flag & BOUND_INTR)) 2139 continue; 2140 2141 /* 2142 * Skip the idle thread for the CPU 2143 * we're about to set offline. 
/*
 * disp_bound_common() - common routine for higher level functions
 *	that check for bound threads under certain conditions.
 *	If 'threadlistsafe' is set then there is no need to acquire
 *	pidlock to stop the thread list from changing (e.g., if
 *	disp_bound_* is called with cpus paused).
 */
static int
disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
{
	int		found = 0;
	kthread_t	*tp;

	ASSERT(flag);

	if (!threadlistsafe)
		mutex_enter(&pidlock);
	tp = curthread;		/* faster than allthreads */
	do {
		if (tp->t_state != TS_FREE) {
			/*
			 * If an interrupt thread is busy, but the
			 * caller doesn't care (i.e. BOUND_INTR is off),
			 * then just ignore it and continue through.
			 */
			if ((tp->t_flag & T_INTR_THREAD) &&
			    !(flag & BOUND_INTR))
				continue;

			/*
			 * Skip the idle thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_idle_thread)
				continue;

			/*
			 * Skip the pause thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_pause_thread)
				continue;

			if ((flag & BOUND_CPU) &&
			    (tp->t_bound_cpu == cp ||
			    tp->t_bind_cpu == cp->cpu_id ||
			    tp->t_weakbound_cpu == cp)) {
				found = 1;
				break;
			}

			if ((flag & BOUND_PARTITION) &&
			    (tp->t_cpupart == cp->cpu_part)) {
				found = 1;
				break;
			}
		}
	} while ((tp = tp->t_next) != curthread && found == 0);
	if (!threadlistsafe)
		mutex_exit(&pidlock);
	return (found);
}

/*
 * disp_bound_threads - return nonzero if threads are bound to the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_threads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
}

/*
 * disp_bound_anythreads - return nonzero if _any_ threads are bound
 *	to the given processor, including interrupt threads.
 */
int
disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
}

/*
 * disp_bound_partition - return nonzero if threads are bound to the same
 *	partition as the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_partition(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
}

/*
 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
 * threads to other CPUs.
 */
void
disp_cpu_inactive(cpu_t *cp)
{
	kthread_t	*tp;
	disp_t		*dp = cp->cpu_disp;
	dispq_t		*dq;
	pri_t		pri;
	int		wasonq;

	disp_lock_enter(&dp->disp_lock);
	while ((pri = dp->disp_max_unbound_pri) != -1) {
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		/*
		 * Skip over bound threads.
		 */
		while (tp != NULL && tp->t_bound_cpu != NULL) {
			tp = tp->t_link;
		}

		if (tp == NULL) {
			/* disp_max_unbound_pri must be inaccurate, so fix it */
			disp_fix_unbound_pri(dp, pri);
			continue;
		}

		wasonq = dispdeq(tp);		/* drops disp_lock */
		ASSERT(wasonq);
		ASSERT(tp->t_weakbound_cpu == NULL);

		setbackdq(tp);
		/*
		 * Called from cpu_offline:
		 *
		 * cp has already been removed from the list of active cpus
		 * and tp->t_cpu has been changed so there is no risk of
		 * tp ending up back on cp.
		 *
		 * Called from cpupart_move_cpu:
		 *
		 * The cpu has moved to a new cpupart.  Any threads that
		 * were on its dispatch queues before the move remain
		 * in the old partition and can't run in the new partition.
		 */
		ASSERT(tp->t_cpu != cp);
		thread_unlock(tp);

		disp_lock_enter(&dp->disp_lock);
	}
	disp_lock_exit(&dp->disp_lock);
}
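/*
 * Illustrative sketch added in editing (hedged; the caller shown is a
 * plausible pattern, not a quote from the original code): the
 * disp_bound_*() predicates above are meant for infrequent administrative
 * paths such as taking a CPU out of service, roughly:
 *
 *	mutex_enter(&cpu_lock);
 *	if (disp_bound_threads(cp, 0)) {
 *		mutex_exit(&cpu_lock);
 *		return (EBUSY);		bound threads block the operation
 *	}
 *
 * Passing threadlistsafe == 0 makes disp_bound_common() acquire pidlock
 * itself, since the thread list may otherwise change underneath the scan;
 * callers that run with all cpus paused can pass a nonzero value and skip
 * that lock.
 */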
/*
 * disp_lowpri_cpu - find CPU running the lowest priority thread.
 *	The hint passed in is used as a starting point so we don't favor
 *	CPU 0 or any other CPU.  The caller should pass in the most recently
 *	used CPU for the thread.
 *
 *	The lgroup and priority are used to determine the best CPU to run on
 *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
 *	the thread priority will indicate whether the thread will actually run
 *	there.  To pick the best CPU, the CPUs inside and outside of the given
 *	lgroup which are running the lowest priority threads are found.  The
 *	remote CPU is chosen only if the thread will not run locally on a CPU
 *	within the lgroup, but will run on the remote CPU.  If the thread
 *	cannot immediately run on any CPU, the best local CPU will be chosen.
 *
 *	The lpl specified also identifies the cpu partition from which
 *	disp_lowpri_cpu should select a CPU.
 *
 *	curcpu is used to indicate that disp_lowpri_cpu is being called on
 *	behalf of the current thread. (curthread is looking for a new cpu)
 *	In this case, cpu_dispatch_pri for this thread's cpu should be
 *	ignored.
 *
 *	If a cpu is the target of an offline request then try to avoid it.
 *
 *	This function must be called at either high SPL, or with preemption
 *	disabled, so that the "hint" CPU cannot be removed from the online
 *	CPU list while we are traversing it.
 */
cpu_t *
disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
{
	cpu_t	*bestcpu;
	cpu_t	*besthomecpu;
	cpu_t	*cp, *cpstart;

	pri_t	bestpri;
	pri_t	cpupri;

	klgrpset_t	done;
	klgrpset_t	cur_set;

	lpl_t		*lpl_iter, *lpl_leaf;
	int		i;

	/*
	 * Scan for a CPU currently running the lowest priority thread.
	 * Cannot get cpu_lock here because it is adaptive.
	 * We do not require lock on CPU list.
	 */
	ASSERT(hint != NULL);
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_ncpu > 0);

	/*
	 * First examine local CPUs.  Note that it's possible the hint CPU
	 * passed in is remote to the specified home lgroup.  If our priority
	 * isn't high enough to run immediately at home, then examine CPUs
	 * remote to our home lgroup.
	 * We would like to give preference to CPUs closest to "home".
	 * If we can't find a CPU where we'll run at a given level
	 * of locality, we expand our search to include the next level.
	 */
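	/*
	 * Worked example added in editing (illustration only, not part of
	 * the original source): for each candidate CPU the effective
	 * priority is the highest of its running thread (cpu_dispatch_pri),
	 * its highest queued thread (disp_maxrunpri), and any priority
	 * already promised to a chosen thread (cpu_chosen_level).  Suppose
	 * tpri == 60 and a leaf lgroup holds two CPUs:
	 *
	 *	cpu A: cpu_dispatch_pri = 59, disp_maxrunpri = 20,
	 *	       cpu_chosen_level = -1   ->  cpupri = 59
	 *	cpu B: cpu_dispatch_pri = 30, disp_maxrunpri = 45,
	 *	       cpu_chosen_level = 50   ->  cpupri = 50
	 *
	 * bestcpu ends up as B with bestpri == 50; since tpri (60) exceeds
	 * bestpri, B is returned right away (an idling CPU, detected via
	 * CPU_IDLING(cpupri), would have been returned even sooner).  If
	 * tpri were only 40, the search would widen to the parent lpl, and
	 * if nothing better turns up the best CPU of the home lgroup
	 * (besthomecpu) is returned.
	 */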
	bestcpu = besthomecpu = NULL;
	klgrpset_clear(done);
	/* start with lpl we were passed */

	lpl_iter = lpl;

	do {

		bestpri = SHRT_MAX;
		klgrpset_clear(cur_set);

		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
			lpl_leaf = lpl_iter->lpl_rset[i];
			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
				continue;

			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);

			if (hint->cpu_lpl == lpl_leaf)
				cp = cpstart = hint;
			else
				cp = cpstart = lpl_leaf->lpl_cpus;

			do {

				if (cp == curcpu)
					cpupri = -1;
				else if (cp == cpu_inmotion)
					cpupri = SHRT_MAX;
				else
					cpupri = cp->cpu_dispatch_pri;

				if (cp->cpu_disp->disp_maxrunpri > cpupri)
					cpupri = cp->cpu_disp->disp_maxrunpri;
				if (cp->cpu_chosen_level > cpupri)
					cpupri = cp->cpu_chosen_level;
				if (cpupri < bestpri) {
					if (CPU_IDLING(cpupri)) {
						ASSERT((cp->cpu_flags &
						    CPU_QUIESCED) == 0);
						return (cp);
					}
					bestcpu = cp;
					bestpri = cpupri;
				}
			} while ((cp = cp->cpu_next_lpl) != cpstart);
		}

		if (bestcpu && (tpri > bestpri)) {
			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
			return (bestcpu);
		}
		if (besthomecpu == NULL)
			besthomecpu = bestcpu;
		/*
		 * Add the lgrps we just considered to the "done" set
		 */
		klgrpset_or(done, cur_set);

	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);

	/*
	 * The specified priority isn't high enough to run immediately
	 * anywhere, so just return the best CPU from the home lgroup.
	 */
	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
	return (besthomecpu);
}

/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor-specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
}

/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}

/*
 * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
 *	- t->t_cpu is not in this thread's assigned lgrp
 *	- the time since the thread last came off t->t_cpu exceeds the
 *	  rechoose time for this cpu (ignore this if t is curthread in
 *	  which case it's on CPU and t->t_disp_time is inaccurate)
 *	- t->t_cpu is presently the target of an offline or partition move
 *	  request
 */
static cpu_t *
cpu_choose(kthread_t *t, pri_t tpri)
{
	ASSERT(tpri < kpqpri);

	if ((((lbolt - t->t_disp_time) > t->t_cpu->cpu_rechoose) &&
	    t != curthread) || t->t_cpu == cpu_inmotion) {
		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
	}

	/*
	 * Take a trip through disp_lowpri_cpu() if the thread was
	 * running outside its home lgroup
	 */
	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
		    (t == curthread) ? t->t_cpu : NULL));
	}
	return (t->t_cpu);
}
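/*
 * Editor's note (illustrative, hedged; not part of the original source):
 * cpu_choose() keeps a thread on t->t_cpu only while its cache footprint
 * there is presumed warm.  The test compares lbolt - t->t_disp_time (clock
 * ticks since the thread last came off that CPU) against cpu_rechoose,
 * which is rechoose_interval plus any per-chip adjustment.  For example,
 * with cpu_rechoose at 3 ticks, a thread (other than curthread, whose
 * t_disp_time is stale while it runs) that has been off-CPU for 5 ticks
 * gets a fresh placement from disp_lowpri_cpu(), while one off-CPU for
 * only 2 ticks stays put, unless its CPU is cpu_inmotion or lies outside
 * the home lgroup's CPU resources.
 */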