/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/


#pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysinfo.h>
#include <sys/var.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/inline.h>
#include <sys/disp.h>
#include <sys/class.h>
#include <sys/bitmap.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/chip.h>
#include <sys/schedctl.h>
#include <sys/atomic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>

#include <vm/as.h>

#define	BOUND_CPU	0x1
#define	BOUND_PARTITION	0x2
#define	BOUND_INTR	0x4

/* Dispatch queue allocation structure and functions */
struct disp_queue_info {
	disp_t	*dp;
	dispq_t	*olddispq;
	dispq_t	*newdispq;
	ulong_t	*olddqactmap;
	ulong_t	*newdqactmap;
	int	oldnglobpris;
};
static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void	disp_dq_free(struct disp_queue_info *dptr);

/* platform-specific routine to call when processor is idle */
static void	generic_idle_cpu();
void		(*idle_cpu)() = generic_idle_cpu;

/* routines invoked when a CPU enters/exits the idle loop */
static void	idle_enter();
static void	idle_exit();

/* platform-specific routine to call when thread is enqueued */
static void	generic_enq_thread(cpu_t *, int);
void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;

pri_t	kpreemptpri;		/* priority where kernel preemption applies */
pri_t	upreemptpri = 0;	/* priority where normal preemption applies */
pri_t	intr_pri;		/* interrupt thread priority base level */

#define	KPQPRI	-1		/* pri where cpu affinity is dropped for kpq */
pri_t	kpqpri = KPQPRI;	/* can be set in /etc/system */
disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
int	nswapped;		/* total number of swapped threads */
void	disp_swapped_enq(kthread_t *tp);
static void	disp_swapped_setrun(kthread_t *tp);
static void	cpu_resched(cpu_t *cp, pri_t tpri);

/*
 * If this is set, only interrupt threads will cause kernel preemptions.
 * This is done by changing the value of kpreemptpri.  kpreemptpri
 * will either be the max sysclass pri + 1 or the min interrupt pri.
 */
int	only_intr_kpreempt;

extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
static void setkpdq(kthread_t *tp, int borf);
#define	SETKP_BACK	0
#define	SETKP_FRONT	1
/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.  The interval is in hertz.
 *
 * The platform may define a per physical processor adjustment of
 * this parameter.  For efficiency, the effective rechoose interval
 * (rechoose_interval + per chip adjustment) is maintained in the
 * cpu structures.  See cpu_choose()
 */
int	rechoose_interval = RECHOOSE_INTERVAL;
static cpu_t	*cpu_choose(kthread_t *, pri_t);

/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * be sitting on a run queue before it can be stolen by another CPU
 * to reduce migrations.
 *
 * nosteal_nsec should be set by platform code to an appropriate value.
 */
hrtime_t nosteal_nsec = 0;

/*
 * Value of nosteal_nsec meaning that nosteal optimization should be disabled
 */
#define	NOSTEAL_DISABLED	1

id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */

disp_lock_t	transition_lock;	/* lock on transitioning threads */
disp_lock_t	stop_lock;		/* lock on stopped threads */

static void	cpu_dispqalloc(int numpris);

/*
 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 * a thread because it was sitting on its run queue for a very short
 * period of time.
 */
#define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */

static kthread_t	*disp_getwork(cpu_t *to);
static kthread_t	*disp_getbest(disp_t *from);
static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);

void	swtch_to(kthread_t *);

/*
 * dispatcher and scheduler initialization
 */

/*
 * disp_setup - Common code to calculate and allocate dispatcher
 *		variables and structures based on the maximum priority.
 */
static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
{
	pri_t	newnglobpris;

	ASSERT(MUTEX_HELD(&cpu_lock));

	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;

	if (newnglobpris > oldnglobpris) {
		/*
		 * Allocate new kp queues for each CPU partition.
		 */
		cpupart_kpqalloc(newnglobpris);

		/*
		 * Allocate new dispatch queues for each CPU.
		 */
		cpu_dispqalloc(newnglobpris);

		/*
		 * compute new interrupt thread base priority
		 */
		intr_pri = maxglobpri;
		if (only_intr_kpreempt) {
			kpreemptpri = intr_pri + 1;
			if (kpqpri == KPQPRI)
				kpqpri = kpreemptpri;
		}
		v.v_nglobpris = newnglobpris;
	}
}

/*
 * dispinit - Called to initialize all loaded classes and the
 *	      dispatcher framework.
 */
void
dispinit(void)
{
	id_t	cid;
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	maxglobpri = -1;

	/*
	 * Initialize transition lock, which will always be set.
	 */
	DISP_LOCK_INIT(&transition_lock);
	disp_lock_enter_high(&transition_lock);
	DISP_LOCK_INIT(&stop_lock);

	mutex_enter(&cpu_lock);
	CPU->cpu_disp->disp_maxrunpri = -1;
	CPU->cpu_disp->disp_max_unbound_pri = -1;
	/*
	 * Initialize the default CPU partition.
	 */
	cpupart_initialize_default();
	/*
	 * Call the class specific initialization functions for
	 * all pre-installed schedulers.
	 *
	 * We pass the size of a class specific parameter
	 * buffer to each of the initialization functions
	 * to try to catch problems with backward compatibility
	 * of class modules.
	 *
	 * For example a new class module running on an old system
	 * which didn't provide sufficiently large parameter buffers
	 * would be bad news. Class initialization modules can check for
	 * this and take action if they detect a problem.
	 */

	for (cid = 0; cid < nclass; cid++) {
		sclass_t	*sc;

		sc = &sclass[cid];
		if (SCHED_INSTALLED(sc)) {
			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
			    &sc->cl_funcs);
			if (cl_maxglobpri > maxglobpri)
				maxglobpri = cl_maxglobpri;
		}
	}
	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
	if (kpqpri == KPQPRI)
		kpqpri = kpreemptpri;

	ASSERT(maxglobpri >= 0);
	disp_setup(maxglobpri, 0);

	mutex_exit(&cpu_lock);

	/*
	 * Get the default class ID; this may be later modified via
	 * dispadmin(1M).  This will load the class (normally TS) and that will
	 * call disp_add(), which is why we had to drop cpu_lock first.
	 */
	if (getcid(defaultclass, &defaultcid) != 0) {
		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
		    defaultclass);
	}
}

/*
 * disp_add - Called with class pointer to initialize the dispatcher
 *	      for a newly loaded class.
 */
void
disp_add(sclass_t *clp)
{
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	mutex_enter(&cpu_lock);
	/*
	 * Initialize the scheduler class.
	 */
	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
	if (cl_maxglobpri > maxglobpri)
		maxglobpri = cl_maxglobpri;

	/*
	 * Save old queue information.  Since we're initializing a
	 * new scheduling class which has just been loaded, the
	 * size of the dispq may have changed.  We need to handle
	 * that here.
	 */
	disp_setup(maxglobpri, v.v_nglobpris);

	mutex_exit(&cpu_lock);
}


/*
 * For each CPU, allocate new dispatch queues
 * with the stated number of priorities.
 */
static void
cpu_dispqalloc(int numpris)
{
	cpu_t	*cpup;
	struct disp_queue_info	*disp_mem;
	int i, num;

	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_mem = kmem_zalloc(NCPU *
	    sizeof (struct disp_queue_info), KM_SLEEP);

	/*
	 * This routine must allocate all of the memory before stopping
	 * the cpus because it must not sleep in kmem_alloc while the
	 * CPUs are stopped.  Locks they hold will not be freed until they
	 * are restarted.
	 */
	i = 0;
	cpup = cpu_list;
	do {
		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
		i++;
		cpup = cpup->cpu_next;
	} while (cpup != cpu_list);
	num = i;

	pause_cpus(NULL);
	for (i = 0; i < num; i++)
		disp_dq_assign(&disp_mem[i], numpris);
	start_cpus();

	/*
	 * I must free all of the memory after starting the cpus because
	 * I can not risk sleeping in kmem_free while the cpus are stopped.
	 */
	for (i = 0; i < num; i++)
		disp_dq_free(&disp_mem[i]);

	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
}

static void
disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
{
	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dptr->dp = dp;
}

static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
	disp_t	*dp;

	dp = dptr->dp;
	dptr->olddispq = dp->disp_q;
	dptr->olddqactmap = dp->disp_qactmap;
	dptr->oldnglobpris = dp->disp_npri;

	ASSERT(dptr->oldnglobpris < numpris);

	if (dptr->olddispq != NULL) {
		/*
		 * Use kcopy because bcopy is platform-specific
		 * and could block while we might have paused the cpus.
		 */
		(void) kcopy(dptr->olddispq, dptr->newdispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
		    sizeof (long));
	}
	dp->disp_q = dptr->newdispq;
	dp->disp_qactmap = dptr->newdqactmap;
	dp->disp_q_limit = &dptr->newdispq[numpris];
	dp->disp_npri = numpris;
}

static void
disp_dq_free(struct disp_queue_info *dptr)
{
	if (dptr->olddispq != NULL)
		kmem_free(dptr->olddispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
	if (dptr->olddqactmap != NULL)
		kmem_free(dptr->olddqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
}

/*
 * For a newly created CPU, initialize the dispatch queue.
 * This is called before the CPU is known through cpu[] or on any lists.
 */
void
disp_cpu_init(cpu_t *cp)
{
	disp_t	*dp;
	dispq_t	*newdispq;
	ulong_t	*newdqactmap;

	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */

	if (cp == cpu0_disp.disp_cpu)
		dp = &cpu0_disp;
	else
		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
	bzero(dp, sizeof (disp_t));
	cp->cpu_disp = dp;
	dp->disp_cpu = cp;
	dp->disp_maxrunpri = -1;
	dp->disp_max_unbound_pri = -1;
	DISP_LOCK_INIT(&cp->cpu_thread_lock);
	/*
	 * Allocate memory for the dispatcher queue headers
	 * and the active queue bitmap.
	 */
	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dp->disp_q = newdispq;
	dp->disp_qactmap = newdqactmap;
	dp->disp_q_limit = &newdispq[v.v_nglobpris];
	dp->disp_npri = v.v_nglobpris;
}

void
disp_cpu_fini(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_kp_free(cp->cpu_disp);
	if (cp->cpu_disp != &cpu0_disp)
		kmem_free(cp->cpu_disp, sizeof (disp_t));
}

/*
 * Allocate new, larger kpreempt dispatch queue to replace the old one.
 */
void
disp_kp_alloc(disp_t *dq, pri_t npri)
{
	struct disp_queue_info	mem_info;

	if (npri > dq->disp_npri) {
		/*
		 * Allocate memory for the new array.
		 */
		disp_dq_alloc(&mem_info, npri, dq);

		/*
		 * We need to copy the old structures to the new
		 * and free the old.
		 */
		disp_dq_assign(&mem_info, npri);
		disp_dq_free(&mem_info);
	}
}

/*
 * Free dispatch queue.
 * Used for the kpreempt queues for a removed CPU partition and
 * for the per-CPU queues of deleted CPUs.
 */
void
disp_kp_free(disp_t *dq)
{
	struct disp_queue_info	mem_info;

	mem_info.olddispq = dq->disp_q;
	mem_info.olddqactmap = dq->disp_qactmap;
	mem_info.oldnglobpris = dq->disp_npri;
	disp_dq_free(&mem_info);
}

/*
 * End dispatcher and scheduler initialization.
 */

/*
 * See if there's anything to do other than remain idle.
 * Return non-zero if there is.
 *
 * This function must be called with high spl, or with
 * kernel preemption disabled to prevent the partition's
 * active cpu list from changing while being traversed.
 *
 */
int
disp_anywork(void)
{
	cpu_t	*cp = CPU;
	cpu_t	*ocp;

	if (cp->cpu_disp->disp_nrunnable != 0)
		return (1);

	if (!(cp->cpu_flags & CPU_OFFLINE)) {
		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
			return (1);

		/*
		 * Work can be taken from another CPU if:
		 *	- There is unbound work on the run queue
		 *	- That work isn't a thread undergoing a
		 *	  context switch on an otherwise empty queue.
		 *	- The CPU isn't running the idle loop.
		 */
		for (ocp = cp->cpu_next_part; ocp != cp;
		    ocp = ocp->cpu_next_part) {
			ASSERT(CPU_ACTIVE(ocp));

			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
			    ocp->cpu_disp->disp_nrunnable == 1) &&
			    ocp->cpu_dispatch_pri != -1)
				return (1);
		}
	}
	return (0);
}

/*
 * Called when CPU enters the idle loop
 */
static void
idle_enter()
{
	cpu_t	*cp = CPU;

	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Called when CPU exits the idle loop
 */
static void
idle_exit()
{
	cpu_t	*cp = CPU;

	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Idle loop.
 */
void
idle()
{
	struct cpu	*cp = CPU;	/* pointer to this CPU */
	kthread_t	*t;		/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			idle_exit();
			swtch();
		} else {
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/*
			 * If there was a thread but we couldn't steal
			 * it, then keep trying.
			 */
			if (t == T_DONTSTEAL)
				continue;
			idle_exit();
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}


/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues.  If panicking, turn
 * preemption off.
 */
void
preempt()
{
	kthread_t	*t = curthread;
	klwp_t		*lwp = ttolwp(curthread);

	if (panicstr)
		return;

	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

	thread_lock(t);

	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
		/*
		 * this thread has already been chosen to be run on
		 * another CPU. Clear kprunrun on this CPU since we're
		 * already headed for swtch().
		 */
		CPU->cpu_kprunrun = 0;
		thread_unlock_nopreempt(t);
		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
	} else {
		if (lwp != NULL)
			lwp->lwp_ru.nivcsw++;
		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
		THREAD_TRANSITION(t);
		CL_PREEMPT(t);
		DTRACE_SCHED(preempt);
		thread_unlock_nopreempt(t);

		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

		swtch();	/* clears CPU->cpu_runrun via disp() */
	}
}

extern kthread_t *thread_unpin();

/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL ||
			    tp == T_DONTSTEAL) {
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		return (tp);
	}

	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.  If nrunnable != 0, we just took
		 * the last runnable thread off the highest queue, so
		 * recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	return (tp);
}

/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Setup and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			if (t == cp->cpu_idle_thread) {
				CHIP_NRUNNING(cp->cpu_chip, 1);
			} else if (next == cp->cpu_idle_thread) {
				CHIP_NRUNNING(cp->cpu_chip, -1);
			}

			/*
			 * If t was previously in the TS_ONPROC state,
			 * setfrontdq and setbackdq won't have set its t_waitrq.
			 * Since we now finally know that we're switching away
			 * from this thread, set its t_waitrq if it is on a run
			 * queue.
			 */
			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
				t->t_waitrq = gethrtime_unscaled();
			}

			/*
			 * restore mstate of thread that we are switching to
			 */
			restore_mstate(next);

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			cp->cpu_last_swtch = t->t_disp_time = lbolt;
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}

/*
 * swtch_from_zombie()
 *	Special case of swtch(), which allows checks for TS_ZOMB to be
 *	eliminated from normal resume.
 *	Find best runnable thread and run it.
 *	Called with the current thread zombied.
 *	Zombies cannot migrate, so CPU references are safe.
 */
void
swtch_from_zombie()
{
	kthread_t	*next;
	cpu_t		*cpu = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	ASSERT(curthread->t_state == TS_ZOMB);

	next = disp();			/* returns with spl high */
	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
	ASSERT(next != curthread);
	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	if (next == cpu->cpu_idle_thread)
		CHIP_NRUNNING(cpu->cpu_chip, -1);

	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume_from_zombie(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we certainly will not
	 * return here
	 */
}

#if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
static int
thread_on_queue(kthread_t *tp)
{
	cpu_t	*cp;
	cpu_t	*self;
	disp_t	*dp;

	self = CPU;
	cp = self->cpu_next_onln;
	dp = cp->cpu_disp;
	for (;;) {
		dispq_t		*dq;
		dispq_t		*eq;

		disp_lock_enter_high(&dp->disp_lock);
		for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
			kthread_t	*rp;

			ASSERT(dq->dq_last == NULL ||
			    dq->dq_last->t_link == NULL);
			for (rp = dq->dq_first; rp; rp = rp->t_link)
				if (tp == rp) {
					disp_lock_exit_high(&dp->disp_lock);
					return (1);
				}
		}
		disp_lock_exit_high(&dp->disp_lock);
		if (cp == NULL)
			break;
		if (cp == self) {
			/*
			 * Check the partition's kp queue last; take its
			 * address before clearing cp.
			 */
			dp = &cp->cpu_part->cp_kp_queue;
			cp = NULL;
		} else {
			cp = cp->cpu_next_onln;
			dp = cp->cpu_disp;
		}
	}
	return (0);
}	/* end of thread_on_queue */
#else

#define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */

#endif	/* DEBUG */

/*
 * like swtch(), but switch to a specified thread taken from another CPU.
 *	called with spl high..
 */
void
swtch_to(kthread_t *next)
{
	cpu_t	*cp = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	/*
	 * Update context switch statistics.
	 */
	CPU_STATS_ADDQ(cp, sys, pswitch, 1);

	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	if (curthread == cp->cpu_idle_thread)
		CHIP_NRUNNING(cp->cpu_chip, 1);

	/* OK to steal anything left on run queue */
	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

	/* record last execution time */
	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;

	/*
	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
	 * won't have set its t_waitrq.  Since we now finally know that we're
	 * switching away from this thread, set its t_waitrq if it is on a run
	 * queue.
	 */
	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
		curthread->t_waitrq = gethrtime_unscaled();
	}

	/* restore next thread to previously running microstate */
	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we may not
	 * return here
	 */
}


#define	CPU_IDLING(pri)	((pri) == -1)

static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
	int	call_poke_cpu = 0;
	pri_t	cpupri = cp->cpu_dispatch_pri;

	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
			cp->cpu_runrun = 1;
			aston(cp->cpu_dispthread);
			if (tpri < kpreemptpri && cp != CPU)
				call_poke_cpu = 1;
		}
		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
			cp->cpu_kprunrun = 1;
			if (cp != CPU)
				call_poke_cpu = 1;
		}
	}

	/*
	 * Propagate cpu_runrun and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	if (call_poke_cpu)
		poke_cpu(cp->cpu_id);
}

/*
 * Routine used by setbackdq() to balance load across the physical
 * processors.  Returns a CPU of a lesser loaded chip in the lgroup
 * if balancing is necessary, or the "hint" CPU if it's not.
 *
 * - tp is the thread being enqueued
 * - cp is a hint CPU (chosen by cpu_choose()).
 * - curchip (if not NULL) is the chip on which the current thread
 *   is running.
 *
 * The thread lock for "tp" must be held while calling this routine.
 */
static cpu_t *
chip_balance(kthread_t *tp, cpu_t *cp, chip_t *curchip)
{
	int	chp_nrun, ochp_nrun;
	chip_t	*chp, *nchp;

	chp = cp->cpu_chip;
	chp_nrun = chp->chip_nrunning;

	if (chp == curchip)
		chp_nrun--;	/* Ignore curthread */

	/*
	 * If this chip isn't at all idle, then let
	 * run queue balancing do the work.
	 */
	if (chp_nrun == chp->chip_ncpu)
		return (cp);

	nchp = chp->chip_balance;
	do {
		if (nchp == chp ||
		    !CHIP_IN_CPUPART(nchp, tp->t_cpupart))
			continue;

		ochp_nrun = nchp->chip_nrunning;

		/*
		 * If the other chip is running fewer threads,
		 * or if it's running the same number of threads, but
		 * has more online logical CPUs, then choose to balance.
		 */
		if (chp_nrun > ochp_nrun ||
		    (chp_nrun == ochp_nrun &&
		    nchp->chip_ncpu > chp->chip_ncpu)) {
			cp = nchp->chip_cpus;
			nchp->chip_cpus = cp->cpu_next_chip;

			/*
			 * Find a CPU on the chip in the correct
			 * partition.  We know at least one exists
			 * because of the CHIP_IN_CPUPART() check above.
			 */
			while (cp->cpu_part != tp->t_cpupart)
				cp = cp->cpu_next_chip;
		}
		chp->chip_balance = nchp->chip_next_lgrp;
		break;
	} while ((nchp = nchp->chip_next_lgrp) != chp->chip_balance);

	ASSERT(CHIP_IN_CPUPART(cp->cpu_chip, tp->t_cpupart));
	return (cp);
}

/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
 * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
 * try to keep runqs perfectly balanced regardless of the thread priority.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)

/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setbackdq(kthread_t *tp)
{
	dispq_t	*dq;
	disp_t	*dp;
	chip_t	*curchip = NULL;
	cpu_t	*cp;
	pri_t	tpri;
	int	bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	tpri = DISP_PRIO(tp);
	if (tp == curthread) {
		curchip = CPU->cpu_chip;
	}

	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_BACK);
			return;
		}
		/*
		 * Let cpu_choose suggest a CPU.
		 */
		cp = cpu_choose(tp, tpri);

		if (tp->t_cpupart == cp->cpu_part) {
			int	qlen;

			/*
			 * Select another CPU if we need
			 * to do some load balancing across the
			 * physical processors.
			 */
			if (CHIP_SHOULD_BALANCE(cp->cpu_chip))
				cp = chip_balance(tp, cp, curchip);

			/*
			 * Balance across the run queues
			 */
			qlen = RUNQ_LEN(cp, tpri);
			if (tpri >= RUNQ_MATCH_PRI &&
			    !(tp->t_schedflag & TS_RUNQMATCH))
				qlen -= RUNQ_MAX_DIFF;
			if (qlen > 0) {
				cpu_t	*newcp;

				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
					newcp = cp->cpu_next_part;
				} else if ((newcp = cp->cpu_next_lpl) == cp) {
					newcp = cp->cpu_next_part;
				}

				if (RUNQ_LEN(newcp, tpri) < qlen) {
					DTRACE_PROBE3(runq__balance,
					    kthread_t *, tp,
					    cpu_t *, cp, cpu_t *, newcp);
					cp = newcp;
				}
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		bound = 0;
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
		bound = 1;
	}
	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((tp != curthread) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
	    tpri, cp, tp);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_first != NULL);
		dq->dq_last->t_link = tp;
		dq->dq_last = tp;
	} else {
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setfrontdq(kthread_t *tp)
{
	disp_t	*dp;
	dispq_t	*dq;
	cpu_t	*cp;
	pri_t	tpri;
	int	bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_FRONT);
			return;
		}
		cp = tp->t_cpu;
		if (tp->t_cpupart == cp->cpu_part) {
			/*
			 * If we are of higher or equal priority than
			 * the highest priority runnable thread of
			 * the current CPU, just pick this CPU.  Otherwise
			 * let cpu_choose() select the CPU.  If this cpu
			 * is the target of an offline request then do not
			 * pick it - a thread_nomigrate() on the in motion
			 * cpu relies on this when it forces a preempt.
			 */
			if (tpri < cp->cpu_disp->disp_maxrunpri ||
			    cp == cpu_inmotion)
				cp = cpu_choose(tp, tpri);
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		bound = 0;
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
		bound = 1;
	}

	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((tp != curthread) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
	tp->t_disp_queue = dp;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_last != NULL);
		tp->t_link = dq->dq_first;
		dq->dq_first = tp;
	} else {
		ASSERT(dq->dq_last == NULL);
		ASSERT(dq->dq_first == NULL);
		tp->t_link = NULL;
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

/*
 * Put a high-priority unbound thread on the kp queue
 */
static void
setkpdq(kthread_t *tp, int borf)
{
	dispq_t	*dq;
	disp_t	*dp;
	cpu_t	*cp;
	pri_t	tpri;

	tpri = DISP_PRIO(tp);

	dp = &tp->t_cpupart->cp_kp_queue;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	dp->disp_nrunnable++;
	dq = &dp->disp_q[tpri];

	if (dq->dq_sruncnt++ != 0) {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first != NULL);
			tp->t_link = NULL;
			dq->dq_last->t_link = tp;
			dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last != NULL);
			tp->t_link = dq->dq_first;
			dq->dq_first = tp;
		}
	} else {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first == NULL);
			ASSERT(dq->dq_last == NULL);
			dq->dq_first = dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last == NULL);
			ASSERT(dq->dq_first == NULL);
			tp->t_link = NULL;
			dq->dq_first = dq->dq_last = tp;
		}
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_max_unbound_pri)
			dp->disp_max_unbound_pri = tpri;
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
		}
	}

	cp = tp->t_cpu;
	if (tp->t_cpupart != cp->cpu_part) {
		/* migrate to a cpu in the new partition */
		cp = tp->t_cpupart->cp_cpulist;
	}
	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	if (cp->cpu_chosen_level < tpri)
		cp->cpu_chosen_level = tpri;
	cpu_resched(cp, tpri);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
	(*disp_enq_thread)(cp, 0);
}

/*
 * Remove a thread from the dispatcher queue if it is on it.
 * It is not an error if it is not found but we return whether
 * or not it was found in case the caller wants to check.
 */
int
dispdeq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	kthread_t	*rp;
	kthread_t	*trp;
	kthread_t	**ptp;
	int		tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_RUN)
		return (0);

	/*
	 * The thread is "swapped" or is on the swap queue and
	 * hence no longer on the run queue, so return true.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
		return (1);

	tpri = DISP_PRIO(tp);
	dp = tp->t_disp_queue;
	ASSERT(tpri < dp->disp_npri);
	dq = &dp->disp_q[tpri];
	ptp = &dq->dq_first;
	rp = *ptp;
	trp = NULL;

	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

	/*
	 * Search for thread in queue.
	 * Double links would simplify this at the expense of disp/setrun.
	 */
	while (rp != tp && rp != NULL) {
		trp = rp;
		ptp = &trp->t_link;
		rp = trp->t_link;
	}

	if (rp == NULL) {
		panic("dispdeq: thread not on queue");
	}

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	if ((*ptp = rp->t_link) == NULL)
		dq->dq_last = trp;

	dp->disp_nrunnable--;
	if (--dq->dq_sruncnt == 0) {
		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else if (tpri == dp->disp_maxrunpri) {
			int ipri;

			ipri = bt_gethighbit(dp->disp_qactmap,
			    dp->disp_maxrunpri >> BT_ULSHIFT);
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
			dp->disp_maxrunpri = ipri;
		}
	}
	tp->t_link = NULL;
	THREAD_TRANSITION(tp);		/* put in intermediate state */
	return (1);
}


/*
 * dq_sruninc and dq_srundec are public functions for
 * incrementing/decrementing the sruncnts when a thread on
 * a dispatcher queue is made schedulable/unschedulable by
 * resetting the TS_LOAD flag.
 *
 * The caller MUST have the thread lock and therefore the dispatcher
 * queue lock so that the operation which changes
 * the flag, the operation that checks the status of the thread to
 * determine if it's on a disp queue AND the call to this function
 * are one atomic operation with respect to interrupts.
 */

/*
 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 */
void
dq_sruninc(kthread_t *t)
{
	ASSERT(t->t_state == TS_RUN);
	ASSERT(t->t_schedflag & TS_LOAD);

	THREAD_TRANSITION(t);
	setfrontdq(t);
}

/*
 * See comment on calling conventions above.
 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 */
void
dq_srundec(kthread_t *t)
{
	ASSERT(t->t_schedflag & TS_LOAD);

	(void) dispdeq(t);
	disp_swapped_enq(t);
}

/*
 * Change the dispatcher lock of thread to the "swapped_lock"
 * and return with thread lock still held.
 *
 * Called with thread_lock held, in transition state, and at high spl.
 */
void
disp_swapped_enq(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT(tp->t_schedflag & TS_LOAD);

	switch (tp->t_state) {
	case TS_RUN:
		disp_lock_enter_high(&swapped_lock);
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	case TS_ONPROC:
		disp_lock_enter_high(&swapped_lock);
		THREAD_TRANSITION(tp);
		wake_sched_sec = 1;		/* tell clock to wake sched */
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	default:
		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
	}
}

/*
 * This routine is called by setbackdq/setfrontdq if the thread is
 * not loaded or loaded and on the swap queue.
 *
 * Thread state TS_SLEEP implies that a swapped thread
 * has been woken up and needs to be swapped in by the swapper.
 *
 * For thread state TS_RUN, it implies that the priority of a swapped
 * thread is being increased by its scheduling class (e.g. ts_update).
 */
static void
disp_swapped_setrun(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);

	switch (tp->t_state) {
	case TS_SLEEP:
		disp_lock_enter_high(&swapped_lock);
		/*
		 * Wakeup sched immediately (i.e., next tick) if the
		 * thread priority is above maxclsyspri.
		 */
		if (DISP_PRIO(tp) > maxclsyspri)
			wake_sched = 1;
		else
			wake_sched_sec = 1;
		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
		break;
	case TS_RUN:				/* called from ts_update */
		break;
	default:
		panic("disp_swapped_setrun: tp: %p bad t_state", tp);
	}
}


/*
 * Make a thread give up its processor.  Find the processor on
 * which this thread is executing, and have that processor
 * preempt.
 */
void
cpu_surrender(kthread_t *tp)
{
	cpu_t	*cpup;
	int	max_pri;
	int	max_run_pri;
	klwp_t	*lwp;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_ONPROC)
		return;
	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
	if (max_pri < max_run_pri)
		max_pri = max_run_pri;

	cpup->cpu_runrun = 1;
	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
		cpup->cpu_kprunrun = 1;
	}

	/*
	 * Propagate cpu_runrun and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	DTRACE_SCHED1(surrender, kthread_t *, tp);

	/*
	 * Make the target thread take an excursion through trap()
	 * to do preempt() (unless we're already in trap or post_syscall,
	 * calling cpu_surrender via CL_TRAPRET).
	 */
	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
	    lwp->lwp_state != LWP_USER) {
		aston(tp);
		if (cpup != CPU)
			poke_cpu(cpup->cpu_id);
	}
	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
	    "cpu_surrender:tid %p cpu %p", tp, cpup);
}


/*
 * Commit to and ratify a scheduling decision
 */
/*ARGSUSED*/
static kthread_t *
disp_ratify(kthread_t *tp, disp_t *kpq)
{
	pri_t	tpri, maxpri;
	pri_t	maxkpri;
	cpu_t	*cpup;

	ASSERT(tp != NULL);
	/*
	 * Commit to, then ratify scheduling decision
	 */
	cpup = CPU;
	if (cpup->cpu_runrun != 0)
		cpup->cpu_runrun = 0;
	if (cpup->cpu_kprunrun != 0)
		cpup->cpu_kprunrun = 0;
	if (cpup->cpu_chosen_level != -1)
		cpup->cpu_chosen_level = -1;
	membar_enter();
	tpri = DISP_PRIO(tp);
	maxpri = cpup->cpu_disp->disp_maxrunpri;
	maxkpri = kpq->disp_maxrunpri;
	if (maxpri < maxkpri)
		maxpri = maxkpri;
	if (tpri < maxpri) {
		/*
		 * should have done better
		 * put this one back and indicate to try again
		 */
		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
		thread_lock_high(tp);
		THREAD_TRANSITION(tp);
		setfrontdq(tp);
		thread_unlock_nopreempt(tp);

		tp = NULL;
	}
	return (tp);
}

/*
 * See if there is any work on the dispatcher queue for other CPUs.
 * If there is, dequeue the best thread and return.
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	kthread_t	*retval = NULL;
	pri_t		maxpri;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		hint, leafidx;
	hrtime_t	stealtime;

	maxpri = -1;
	tcp = NULL;

	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	kpreempt_disable();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU. This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	hint = leafidx = 0;

	/*
	 * This loop traverses the lpl hierarchy. Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around the partition if:
				 *
				 * - Something became runnable on the local
				 *   queue
				 *
				 * - We're at the broadest level of locality and
				 *   we happen across another idle CPU. At the
				 *   highest level of locality, all CPUs will
				 *   walk the partition's CPUs in the same
				 *   order, so we can end our stroll taking
				 *   comfort in knowing the other idle CPU is
				 *   already covering the next portion of the
				 *   list.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0)
					break;
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED)
						continue;
					else if (lpl->lpl_parent == NULL)
						break;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
				    CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					/*
					 * Don't steal threads that we attempted
					 * to steal recently until they're ready
					 * to be stolen again.
					 */
					stealtime = ocp->cpu_disp->disp_steal;
					if (stealtime == 0 ||
					    stealtime - gethrtime() <= 0) {
						maxpri = pri;
						tcp = ocp;
					} else {
						/*
						 * Don't update tcp, just set
						 * the retval to T_DONTSTEAL, so
						 * that if no acceptable CPUs
						 * are found the return value
						 * will be T_DONTSTEAL rather
						 * than NULL.
						 */
						retval = T_DONTSTEAL;
					}
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != hint);

		hint = leafidx = lpl->lpl_hint;
		if ((lpl = lpl->lpl_parent) != NULL)
			lpl_leaf = lpl->lpl_rset[hint];
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = disp_getbest(tcp->cpu_disp);
		if (tp == NULL || tp == T_DONTSTEAL)
			return (tp);
		return (disp_ratify(tp, kpq));
	}
	return (retval);
}


/*
 * disp_fix_unbound_pri()
 *	Determines the maximum priority of unbound threads on the queue.
 *	The priority is kept for the queue, but is only increased, never
 *	reduced unless some CPU is looking for something on that queue.
 *
 *	The priority argument is the known upper limit.
 *
 *	Perhaps this should be kept accurately, but that probably means
 *	separate bitmaps for bound and unbound threads.  Since only idled
 *	CPUs will have to do this recalculation, it seems better this way.
 */
static void
disp_fix_unbound_pri(disp_t *dp, pri_t pri)
{
	kthread_t	*tp;
	dispq_t		*dq;
	ulong_t		*dqactmap = dp->disp_qactmap;
	ulong_t		mapword;
	int		wx;

	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));

	ASSERT(pri >= 0);			/* checked by caller */

	/*
	 * Start the search at the next lowest priority below the supplied
	 * priority.  This depends on the bitmap implementation.
	 */
	do {
		wx = pri >> BT_ULSHIFT;		/* index of word in map */

		/*
		 * Form mask for all lower priorities in the word.
		 */
		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);

		/*
		 * Get next lower active priority.
		 */
		if (mapword != 0) {
			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
		} else if (wx > 0) {
			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
			if (pri < 0)
				break;
		} else {
			pri = -1;
			break;
		}

		/*
		 * Search the queue for unbound, runnable threads.
		 */
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
			tp = tp->t_link;
		}

		/*
		 * If a thread was found, set the priority and return.
		 */
	} while (tp == NULL);

	/*
	 * pri holds the maximum unbound thread priority or -1.
	 */
	if (dp->disp_max_unbound_pri != pri)
		dp->disp_max_unbound_pri = pri;
}

/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
	disp_t *dp;
	pri_t tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * Don't do anything if the thread is not bound, or
	 * currently not runnable or swapped out.

/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
	disp_t *dp;
	pri_t tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * Don't do anything if the thread is not bound, or
	 * currently not runnable or swapped out.
	 */
	if (tp->t_bound_cpu == NULL ||
	    tp->t_state != TS_RUN ||
	    tp->t_schedflag & TS_ON_SWAPQ)
		return;

	tpri = DISP_PRIO(tp);
	dp = tp->t_bound_cpu->cpu_disp;
	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	if (tpri > dp->disp_max_unbound_pri)
		dp->disp_max_unbound_pri = tpri;
}

/*
 * disp_getbest()
 *	De-queue the highest priority unbound runnable thread.
 *	Returns with the thread unlocked and onproc but at splhigh (like
 *	disp()).
 *	Returns NULL if nothing found.
 *	Returns T_DONTSTEAL if the thread was not stealable, so that the
 *	caller will try again later.
 *
 *	Passed a pointer to a dispatch queue not associated with this CPU.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
	kthread_t	*tp;
	dispq_t		*dq;
	pri_t		pri;
	cpu_t		*cp, *tcp;
	boolean_t	allbound;

	disp_lock_enter(&dp->disp_lock);

	/*
	 * If there is nothing to run, or the CPU is in the middle of a
	 * context switch of the only thread, return NULL.
	 */
	tcp = dp->disp_cpu;
	cp = CPU;
	pri = dp->disp_max_unbound_pri;
	if (pri == -1 ||
	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
	    tcp->cpu_disp->disp_nrunnable == 1)) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	dq = &dp->disp_q[pri];

	/*
	 * Assume that all threads are bound on this queue, and change it
	 * later when we find out that it is not the case.
	 */
	allbound = B_TRUE;
	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
		hrtime_t now, nosteal, rqtime;
		chip_type_t chtype;
		chip_t *chip;

		/*
		 * Skip over bound threads which could be here even
		 * though disp_max_unbound_pri indicated this level.
		 */
		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
			continue;

		/*
		 * We've got some unbound threads on this queue, so turn
		 * the allbound flag off now.
		 */
		allbound = B_FALSE;

		/*
		 * The thread is a candidate for stealing from its run queue.
		 * We don't want to steal threads that became runnable just a
		 * moment ago. This improves CPU affinity for threads that get
		 * preempted for short periods of time and go back on the run
		 * queue.
		 *
		 * We want to let it stay on its run queue if it was only
		 * placed there recently and it was running on the same CPU
		 * before that to preserve its cache investment. For the
		 * thread to remain on its run queue, ALL of the following
		 * conditions must be satisfied:
		 *
		 * - the disp queue should not be the kernel preemption queue
		 * - delayed idle stealing should not be disabled
		 * - nosteal_nsec should be non-zero
		 * - it should run with user priority
		 * - it should be on the run queue of the CPU where it was
		 *   running before being placed on the run queue
		 * - it should be the only thread on the run queue (to prevent
		 *   extra scheduling latency for other threads)
		 * - it should sit on the run queue for less than the per-chip
		 *   nosteal interval or the global nosteal interval
		 * - in case of CPUs with shared cache it should sit in a run
		 *   queue of a CPU from a different chip
		 *
		 * The checks are arranged so that the ones that are faster
		 * are placed earlier.
		 */
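		/*
		 * For example (illustrative numbers only): if the effective
		 * nosteal interval below works out to 100000ns and the
		 * thread has so far waited only 40000ns on the queue of the
		 * CPU it last ran on, it is left alone and dp->disp_steal is
		 * pulled forward so idle CPUs know to retry roughly 60000ns
		 * from now; once the wait exceeds the nosteal interval the
		 * thread is stolen immediately.
		 */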
		if (tcp == NULL ||
		    pri >= minclsyspri ||
		    tp->t_cpu != tcp)
			break;

		/*
		 * Steal immediately if the chip has shared cache and we are
		 * sharing the chip with the target thread's CPU.
		 */
		chip = tcp->cpu_chip;
		chtype = chip->chip_type;
		if ((chtype == CHIP_SMT || chtype == CHIP_CMP_SHARED_CACHE) &&
		    chip == cp->cpu_chip)
			break;

		/*
		 * Get the nosteal interval, either from the nosteal_nsec
		 * global variable or from the value specified by the chip.
		 */
		nosteal = nosteal_nsec ? nosteal_nsec : chip->chip_nosteal;
		if (nosteal == 0 || nosteal == NOSTEAL_DISABLED)
			break;

		/*
		 * Calculate time spent sitting on run queue
		 */
		now = gethrtime_unscaled();
		rqtime = now - tp->t_waitrq;
		scalehrtime(&rqtime);

		/*
		 * Steal immediately if the time spent on this run queue is
		 * more than the allowed nosteal delay.
		 *
		 * Negative rqtime check is needed here to avoid infinite
		 * stealing delays caused by unlikely but not impossible
		 * drifts between CPU times on different CPUs.
		 */
		if (rqtime > nosteal || rqtime < 0)
			break;

		DTRACE_PROBE4(nosteal, kthread_t *, tp,
		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
		scalehrtime(&now);

		/*
		 * Calculate when this thread becomes stealable
		 */
		now += (nosteal - rqtime);

		/*
		 * Calculate time when some thread becomes stealable
		 */
		if (now < dp->disp_steal)
			dp->disp_steal = now;
	}

	/*
	 * If there were no unbound threads on this queue, correct the
	 * recorded maximum unbound priority before returning. The value of
	 * disp_max_unbound_pri is not always accurate because it isn't
	 * reduced until another idle CPU looks for work.
	 */
	if (allbound)
		disp_fix_unbound_pri(dp, pri);

	/*
	 * If we reached the end of the queue and found no unbound threads
	 * then return NULL so that other CPUs will be considered. If there
	 * are unbound threads but they cannot yet be stolen, then
	 * return T_DONTSTEAL and try again later.
	 */
	if (tp == NULL) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (allbound ? NULL : T_DONTSTEAL);
	}

	/*
	 * Found a runnable, unbound thread, so remove it from the queue.
	 * dispdeq() requires that we have the thread locked, and we do,
	 * by virtue of holding the dispatch queue lock. dispdeq() will
	 * put the thread in transition state, thereby dropping the dispq
	 * lock.
	 */

#ifdef DEBUG
	{
		int	thread_was_on_queue;

		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
		ASSERT(thread_was_on_queue);
	}

#else /* DEBUG */
	(void) dispdeq(tp);			/* drops disp_lock */
#endif /* DEBUG */

	/*
	 * Reset the disp_queue steal time - we do not know what the smallest
	 * value across the queue is.
	 */
	dp->disp_steal = 0;

	tp->t_schedflag |= TS_DONT_SWAP;

	/*
	 * Setup thread to run on the current CPU.
	 */
	tp->t_disp_queue = cp->cpu_disp;

	cp->cpu_dispthread = tp;		/* protected by spl only */
	cp->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));

	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);

	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */

	/*
	 * Return with spl high so that swtch() won't need to raise it.
	 * The disp_lock was dropped by dispdeq().
	 */

	return (tp);
}
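
/*
 * Illustrative sketch (not compiled) of how a caller such as disp() is
 * expected to consume the values returned by disp_getwork()/disp_getbest();
 * the helper names below are placeholders, not real interfaces:
 *
 *	tp = disp_getwork(cp);
 *	if (tp == NULL)
 *		pick_idle_thread();	-- nothing runnable anywhere
 *	else if (tp == T_DONTSTEAL)
 *		pick_idle_thread();	-- runnable work exists but is not
 *					   yet stealable; retry on the next
 *					   pass through the idle loop
 *	else
 *		switch_to(tp);		-- thread is already TS_ONPROC and
 *					   we are running at splhigh
 */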

/*
 * disp_bound_common() - common routine for higher level functions
 *	that check for bound threads under certain conditions.
 *	If 'threadlistsafe' is set then there is no need to acquire
 *	pidlock to stop the thread list from changing (eg, if
 *	disp_bound_* is called with cpus paused).
 */
static int
disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
{
	int		found = 0;
	kthread_t	*tp;

	ASSERT(flag);

	if (!threadlistsafe)
		mutex_enter(&pidlock);
	tp = curthread;		/* faster than allthreads */
	do {
		if (tp->t_state != TS_FREE) {
			/*
			 * If an interrupt thread is busy, but the
			 * caller doesn't care (i.e. BOUND_INTR is off),
			 * then just ignore it and continue through.
			 */
			if ((tp->t_flag & T_INTR_THREAD) &&
			    !(flag & BOUND_INTR))
				continue;

			/*
			 * Skip the idle thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_idle_thread)
				continue;

			/*
			 * Skip the pause thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_pause_thread)
				continue;

			if ((flag & BOUND_CPU) &&
			    (tp->t_bound_cpu == cp ||
			    tp->t_bind_cpu == cp->cpu_id ||
			    tp->t_weakbound_cpu == cp)) {
				found = 1;
				break;
			}

			if ((flag & BOUND_PARTITION) &&
			    (tp->t_cpupart == cp->cpu_part)) {
				found = 1;
				break;
			}
		}
	} while ((tp = tp->t_next) != curthread && found == 0);
	if (!threadlistsafe)
		mutex_exit(&pidlock);
	return (found);
}

/*
 * disp_bound_threads - return nonzero if threads are bound to the processor.
 *	Called infrequently. Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_threads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
}

/*
 * disp_bound_anythreads - return nonzero if _any_ threads are bound
 *	to the given processor, including interrupt threads.
 */
int
disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
}

/*
 * disp_bound_partition - return nonzero if threads are bound to the same
 *	partition as the processor.
 *	Called infrequently. Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_partition(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
}

/*
 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
 * threads to other CPUs.
 */
void
disp_cpu_inactive(cpu_t *cp)
{
	kthread_t	*tp;
	disp_t		*dp = cp->cpu_disp;
	dispq_t		*dq;
	pri_t		pri;
	int		wasonq;

	disp_lock_enter(&dp->disp_lock);
	while ((pri = dp->disp_max_unbound_pri) != -1) {
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		/*
		 * Skip over bound threads.
		 */
		while (tp != NULL && tp->t_bound_cpu != NULL) {
			tp = tp->t_link;
		}

		if (tp == NULL) {
			/* disp_max_unbound_pri must be inaccurate, so fix it */
			disp_fix_unbound_pri(dp, pri);
			continue;
		}

		wasonq = dispdeq(tp);		/* drops disp_lock */
		ASSERT(wasonq);
		ASSERT(tp->t_weakbound_cpu == NULL);

		setbackdq(tp);
		/*
		 * Called from cpu_offline:
		 *
		 * cp has already been removed from the list of active cpus
		 * and tp->t_cpu has been changed so there is no risk of
		 * tp ending up back on cp.
		 *
		 * Called from cpupart_move_cpu:
		 *
		 * The cpu has moved to a new cpupart. Any threads that
		 * were on its dispatch queues before the move remain
		 * in the old partition and can't run in the new partition.
		 */
		ASSERT(tp->t_cpu != cp);
		thread_unlock(tp);

		disp_lock_enter(&dp->disp_lock);
	}
	disp_lock_exit(&dp->disp_lock);
}

/*
 * disp_lowpri_cpu - find CPU running the lowest priority thread.
 *	The hint passed in is used as a starting point so we don't favor
 *	CPU 0 or any other CPU. The caller should pass in the most recently
 *	used CPU for the thread.
 *
 *	The lgroup and priority are used to determine the best CPU to run on
 *	in a NUMA machine. The lgroup specifies which CPUs are closest while
 *	the thread priority will indicate whether the thread will actually run
 *	there. To pick the best CPU, the CPUs inside and outside of the given
 *	lgroup which are running the lowest priority threads are found. The
 *	remote CPU is chosen only if the thread will not run locally on a CPU
 *	within the lgroup, but will run on the remote CPU. If the thread
 *	cannot immediately run on any CPU, the best local CPU will be chosen.
 *
 *	The lpl specified also identifies the cpu partition from which
 *	disp_lowpri_cpu should select a CPU.
 *
 *	curcpu is used to indicate that disp_lowpri_cpu is being called on
 *	behalf of the current thread (curthread is looking for a new cpu).
 *	In this case, cpu_dispatch_pri for this thread's cpu should be
 *	ignored.
 *
 *	If a cpu is the target of an offline request then try to avoid it.
 *
 *	This function must be called at either high SPL, or with preemption
 *	disabled, so that the "hint" CPU cannot be removed from the online
 *	CPU list while we are traversing it.
 */
cpu_t *
disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
{
	cpu_t	*bestcpu;
	cpu_t	*besthomecpu;
	cpu_t	*cp, *cpstart;

	pri_t	bestpri;
	pri_t	cpupri;

	klgrpset_t	done;
	klgrpset_t	cur_set;

	lpl_t		*lpl_iter, *lpl_leaf;
	int		i;

	/*
	 * Scan for a CPU currently running the lowest priority thread.
	 * Cannot get cpu_lock here because it is adaptive.
	 * We do not require lock on CPU list.
	 */
	ASSERT(hint != NULL);
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_ncpu > 0);

	/*
	 * First examine local CPUs. Note that it's possible the hint CPU
	 * passed in is remote to the specified home lgroup. If our priority
	 * isn't high enough to let us run immediately at home, then examine
	 * CPUs remote to our home lgroup.
	 * We would like to give preference to CPUs closest to "home".
	 * If we can't find a CPU where we'll run at a given level
	 * of locality, we expand our search to include the next level.
	 */
	bestcpu = besthomecpu = NULL;
	klgrpset_clear(done);
	/* start with lpl we were passed */

	lpl_iter = lpl;

	do {

		bestpri = SHRT_MAX;
		klgrpset_clear(cur_set);

		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
			lpl_leaf = lpl_iter->lpl_rset[i];
			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
				continue;

			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);

			if (hint->cpu_lpl == lpl_leaf)
				cp = cpstart = hint;
			else
				cp = cpstart = lpl_leaf->lpl_cpus;

			do {
				if (cp == curcpu)
					cpupri = -1;
				else if (cp == cpu_inmotion)
					cpupri = SHRT_MAX;
				else
					cpupri = cp->cpu_dispatch_pri;
				if (cp->cpu_disp->disp_maxrunpri > cpupri)
					cpupri = cp->cpu_disp->disp_maxrunpri;
				if (cp->cpu_chosen_level > cpupri)
					cpupri = cp->cpu_chosen_level;
				if (cpupri < bestpri) {
					if (CPU_IDLING(cpupri)) {
						ASSERT((cp->cpu_flags &
						    CPU_QUIESCED) == 0);
						return (cp);
					}
					bestcpu = cp;
					bestpri = cpupri;
				}
			} while ((cp = cp->cpu_next_lpl) != cpstart);
		}

		if (bestcpu && (tpri > bestpri)) {
			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
			return (bestcpu);
		}
		if (besthomecpu == NULL)
			besthomecpu = bestcpu;
		/*
		 * Add the lgrps we just considered to the "done" set
		 */
		klgrpset_or(done, cur_set);

	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);

	/*
	 * The specified priority isn't high enough to run immediately
	 * anywhere, so just return the best CPU from the home lgroup.
	 */
	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
	return (besthomecpu);
}

/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor's specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
}

/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}

/*
 * Select a CPU for this thread to run on. Choose t->t_cpu unless:
 *	- t->t_cpu is not in this thread's assigned lgrp
 *	- the time since the thread last came off t->t_cpu exceeds the
 *	  rechoose time for this cpu (ignore this if t is curthread in
 *	  which case it's on CPU and t->t_disp_time is inaccurate)
 *	- t->t_cpu is presently the target of an offline or partition move
 *	  request
 */
static cpu_t *
cpu_choose(kthread_t *t, pri_t tpri)
{
	ASSERT(tpri < kpqpri);

	if ((((lbolt - t->t_disp_time) > t->t_cpu->cpu_rechoose) &&
	    t != curthread) || t->t_cpu == cpu_inmotion) {
		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
	}

	/*
	 * Take a trip through disp_lowpri_cpu() if the thread was
	 * running outside its home lgroup.
	 */
	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
		    (t == curthread) ? t->t_cpu : NULL));
	}
	return (t->t_cpu);
}
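
/*
 * A worked example of the placement policy above (the priorities are
 * illustrative): for a thread with tpri 60, disp_lowpri_cpu() computes each
 * candidate CPU's effective priority as the maximum of cpu_dispatch_pri
 * (or -1/SHRT_MAX when the CPU is curcpu/cpu_inmotion), disp_maxrunpri and
 * cpu_chosen_level. An idling CPU found during the scan is returned
 * immediately; otherwise, if the best CPU in the home lgroup has an
 * effective priority of 59 or lower the thread is placed there, CPUs at the
 * next level of locality are considered only when no home CPU is below 60,
 * and if no CPU anywhere is below 60 the best home CPU is returned anyway.
 */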