/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*   All Rights Reserved   */


#pragma ident "%Z%%M% %I% %E% SMI"	/* from SVr4.0 1.30 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysinfo.h>
#include <sys/var.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/inline.h>
#include <sys/disp.h>
#include <sys/class.h>
#include <sys/bitmap.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/pg.h>
#include <sys/cmt.h>
#include <sys/bitset.h>
#include <sys/schedctl.h>
#include <sys/atomic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>

#include <vm/as.h>

#define BOUND_CPU       0x1
#define BOUND_PARTITION 0x2
#define BOUND_INTR      0x4

/* Dispatch queue allocation structure and functions */
struct disp_queue_info {
    disp_t  *dp;
    dispq_t *olddispq;
    dispq_t *newdispq;
    ulong_t *olddqactmap;
    ulong_t *newdqactmap;
    int     oldnglobpris;
};
static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void disp_dq_free(struct disp_queue_info *dptr);

/* platform-specific routine to call when processor is idle */
static void generic_idle_cpu();
void (*idle_cpu)() = generic_idle_cpu;

/* routines invoked when a CPU enters/exits the idle loop */
static void idle_enter();
static void idle_exit();

/* platform-specific routine to call when thread is enqueued */
static void generic_enq_thread(cpu_t *, int);
void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;

pri_t   kpreemptpri;        /* priority where kernel preemption applies */
pri_t   upreemptpri = 0;    /* priority where normal preemption applies */
pri_t   intr_pri;           /* interrupt thread priority base level */

#define KPQPRI  -1          /* pri where cpu affinity is dropped for kpq */
pri_t   kpqpri = KPQPRI;    /* can be set in /etc/system */
disp_t  cpu0_disp;          /* boot CPU's dispatch queue */
disp_lock_t swapped_lock;   /* lock swapped threads and swap queue */
int     nswapped;           /* total number of swapped threads */
void    disp_swapped_enq(kthread_t *tp);
static void disp_swapped_setrun(kthread_t *tp);
static void cpu_resched(cpu_t *cp, pri_t tpri);
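
/*
 * Roughly speaking (see cpu_resched() below), a thread of priority tpri
 * that becomes runnable on a CPU currently running lower priority work
 * triggers preemption as follows:
 *
 *	tpri >= upreemptpri	set cpu_runrun and post an AST on the
 *				running thread (user preemption)
 *	tpri >= kpreemptpri	also set cpu_kprunrun (kernel preemption)
 *
 * and the target CPU is poked if it is not the current one.
 */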

/*
 * If this is set, only interrupt threads will cause kernel preemptions.
 * This is done by changing the value of kpreemptpri.  kpreemptpri
 * will either be the max sysclass pri + 1 or the min interrupt pri.
 */
int only_intr_kpreempt;

extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
static void setkpdq(kthread_t *tp, int borf);
#define SETKP_BACK  0
#define SETKP_FRONT 1
/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.  The interval is in clock ticks.
 */
#define RECHOOSE_INTERVAL 3
int rechoose_interval = RECHOOSE_INTERVAL;
static cpu_t *cpu_choose(kthread_t *, pri_t);

/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * sit on a run queue before it can be stolen by another CPU, to reduce
 * migrations.
 *
 * nosteal_nsec should be set by platform code via
 * cmp_set_nosteal_interval() to an appropriate value.  It is set to
 * NOSTEAL_UNINITIALIZED here to indicate that it has not been set yet.
 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 */
#define NOSTEAL_UNINITIALIZED   (-1)
hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
extern void cmp_set_nosteal_interval(void);

id_t    defaultcid;     /* system "default" class; see dispadmin(1M) */

disp_lock_t transition_lock;    /* lock on transitioning threads */
disp_lock_t stop_lock;          /* lock on stopped threads */

static void cpu_dispqalloc(int numpris);

/*
 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 * a thread because it was sitting on its run queue for a very short
 * period of time.
 */
#define T_DONTSTEAL (kthread_t *)(-1)   /* returned by disp_getwork/getbest */

static kthread_t *disp_getwork(cpu_t *to);
static kthread_t *disp_getbest(disp_t *from);
static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);

void swtch_to(kthread_t *);

/*
 * dispatcher and scheduler initialization
 */

/*
 * disp_setup - Common code to calculate and allocate dispatcher
 *	variables and structures based on the maximum priority.
 */
static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
{
    pri_t   newnglobpris;

    ASSERT(MUTEX_HELD(&cpu_lock));

    newnglobpris = maxglobpri + 1 + LOCK_LEVEL;

    if (newnglobpris > oldnglobpris) {
        /*
         * Allocate new kp queues for each CPU partition.
         */
        cpupart_kpqalloc(newnglobpris);

        /*
         * Allocate new dispatch queues for each CPU.
         */
        cpu_dispqalloc(newnglobpris);

        /*
         * compute new interrupt thread base priority
         */
        intr_pri = maxglobpri;
        if (only_intr_kpreempt) {
            kpreemptpri = intr_pri + 1;
            if (kpqpri == KPQPRI)
                kpqpri = kpreemptpri;
        }
        v.v_nglobpris = newnglobpris;
    }
}

/*
 * dispinit - Called to initialize all loaded classes and the
 *	dispatcher framework.
 */
void
dispinit(void)
{
    id_t    cid;
    pri_t   maxglobpri;
    pri_t   cl_maxglobpri;

    maxglobpri = -1;

    /*
     * Initialize transition lock, which will always be set.
     */
    DISP_LOCK_INIT(&transition_lock);
    disp_lock_enter_high(&transition_lock);
    DISP_LOCK_INIT(&stop_lock);

    mutex_enter(&cpu_lock);
    CPU->cpu_disp->disp_maxrunpri = -1;
    CPU->cpu_disp->disp_max_unbound_pri = -1;

    /*
     * Initialize the default CPU partition.
     */
    cpupart_initialize_default();
    /*
     * Call the class specific initialization functions for
     * all pre-installed schedulers.
     *
     * We pass the size of a class specific parameter
     * buffer to each of the initialization functions
     * to try to catch problems with backward compatibility
     * of class modules.
     *
     * For example, a new class module running on an old system
     * which didn't provide sufficiently large parameter buffers
     * would be bad news.  Class initialization modules can check for
     * this and take action if they detect a problem.
     */

    for (cid = 0; cid < nclass; cid++) {
        sclass_t *sc;

        sc = &sclass[cid];
        if (SCHED_INSTALLED(sc)) {
            cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
                &sc->cl_funcs);
            if (cl_maxglobpri > maxglobpri)
                maxglobpri = cl_maxglobpri;
        }
    }
    kpreemptpri = (pri_t)v.v_maxsyspri + 1;
    if (kpqpri == KPQPRI)
        kpqpri = kpreemptpri;

    ASSERT(maxglobpri >= 0);
    disp_setup(maxglobpri, 0);

    mutex_exit(&cpu_lock);

    /*
     * Platform specific sticky scheduler setup.
     */
    if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
        cmp_set_nosteal_interval();

    /*
     * Get the default class ID; this may be later modified via
     * dispadmin(1M).  This will load the class (normally TS) and that will
     * call disp_add(), which is why we had to drop cpu_lock first.
     */
    if (getcid(defaultclass, &defaultcid) != 0) {
        cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
            defaultclass);
    }
}

/*
 * disp_add - Called with class pointer to initialize the dispatcher
 *	for a newly loaded class.
 */
void
disp_add(sclass_t *clp)
{
    pri_t   maxglobpri;
    pri_t   cl_maxglobpri;

    mutex_enter(&cpu_lock);
    /*
     * Initialize the scheduler class.
     */
    maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
    cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
    if (cl_maxglobpri > maxglobpri)
        maxglobpri = cl_maxglobpri;

    /*
     * Save old queue information.  Since we're initializing a
     * new scheduling class which has just been loaded, the size
     * of the dispq may have changed.  We need to handle that here.
     */
    disp_setup(maxglobpri, v.v_nglobpris);

    mutex_exit(&cpu_lock);
}


/*
 * For each CPU, allocate new dispatch queues
 * with the stated number of priorities.
 */
static void
cpu_dispqalloc(int numpris)
{
    cpu_t   *cpup;
    struct disp_queue_info  *disp_mem;
    int     i, num;

    ASSERT(MUTEX_HELD(&cpu_lock));

    disp_mem = kmem_zalloc(NCPU *
        sizeof (struct disp_queue_info), KM_SLEEP);

    /*
     * This routine must allocate all of the memory before stopping
     * the cpus because it must not sleep in kmem_alloc while the
     * CPUs are stopped.  Locks they hold will not be freed until they
     * are restarted.
     */
    i = 0;
    cpup = cpu_list;
    do {
        disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
        i++;
        cpup = cpup->cpu_next;
    } while (cpup != cpu_list);
    num = i;

    pause_cpus(NULL);
    for (i = 0; i < num; i++)
        disp_dq_assign(&disp_mem[i], numpris);
    start_cpus();

    /*
     * We must free all of the memory after starting the cpus because
     * we cannot risk sleeping in kmem_free while the cpus are stopped.
     */
    for (i = 0; i < num; i++)
        disp_dq_free(&disp_mem[i]);

    kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
}

static void
disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
{
    dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
    dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
        sizeof (long), KM_SLEEP);
    dptr->dp = dp;
}

static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
    disp_t  *dp;

    dp = dptr->dp;
    dptr->olddispq = dp->disp_q;
    dptr->olddqactmap = dp->disp_qactmap;
    dptr->oldnglobpris = dp->disp_npri;

    ASSERT(dptr->oldnglobpris < numpris);

    if (dptr->olddispq != NULL) {
        /*
         * Use kcopy because bcopy is platform-specific
         * and could block while we might have paused the cpus.
         */
        (void) kcopy(dptr->olddispq, dptr->newdispq,
            dptr->oldnglobpris * sizeof (dispq_t));
        (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
            ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
            sizeof (long));
    }
    dp->disp_q = dptr->newdispq;
    dp->disp_qactmap = dptr->newdqactmap;
    dp->disp_q_limit = &dptr->newdispq[numpris];
    dp->disp_npri = numpris;
}

static void
disp_dq_free(struct disp_queue_info *dptr)
{
    if (dptr->olddispq != NULL)
        kmem_free(dptr->olddispq,
            dptr->oldnglobpris * sizeof (dispq_t));
    if (dptr->olddqactmap != NULL)
        kmem_free(dptr->olddqactmap,
            ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
}

/*
 * For a newly created CPU, initialize the dispatch queue.
 * This is called before the CPU is known through cpu[] or on any lists.
 */
void
disp_cpu_init(cpu_t *cp)
{
    disp_t  *dp;
    dispq_t *newdispq;
    ulong_t *newdqactmap;

    ASSERT(MUTEX_HELD(&cpu_lock));  /* protect dispatcher queue sizes */

    if (cp == cpu0_disp.disp_cpu)
        dp = &cpu0_disp;
    else
        dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
    bzero(dp, sizeof (disp_t));
    cp->cpu_disp = dp;
    dp->disp_cpu = cp;
    dp->disp_maxrunpri = -1;
    dp->disp_max_unbound_pri = -1;
    DISP_LOCK_INIT(&cp->cpu_thread_lock);
    /*
     * Allocate memory for the dispatcher queue headers
     * and the active queue bitmap.
     */
    newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
    newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
        sizeof (long), KM_SLEEP);
    dp->disp_q = newdispq;
    dp->disp_qactmap = newdqactmap;
    dp->disp_q_limit = &newdispq[v.v_nglobpris];
    dp->disp_npri = v.v_nglobpris;
}

void
disp_cpu_fini(cpu_t *cp)
{
    ASSERT(MUTEX_HELD(&cpu_lock));

    disp_kp_free(cp->cpu_disp);
    if (cp->cpu_disp != &cpu0_disp)
        kmem_free(cp->cpu_disp, sizeof (disp_t));
}
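
/*
 * The per-partition kpreempt queues below are sized with the same
 * disp_queue_info machinery used above for the per-CPU queues:
 * disp_kp_alloc() grows a kp queue when the number of global priorities
 * increases, and disp_kp_free() releases the queue memory when a
 * partition or CPU goes away.
 */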

/*
 * Allocate new, larger kpreempt dispatch queue to replace the old one.
 */
void
disp_kp_alloc(disp_t *dq, pri_t npri)
{
    struct disp_queue_info  mem_info;

    if (npri > dq->disp_npri) {
        /*
         * Allocate memory for the new array.
         */
        disp_dq_alloc(&mem_info, npri, dq);

        /*
         * We need to copy the old structures to the new
         * and free the old.
         */
        disp_dq_assign(&mem_info, npri);
        disp_dq_free(&mem_info);
    }
}

/*
 * Free dispatch queue.
 * Used for the kpreempt queues for a removed CPU partition and
 * for the per-CPU queues of deleted CPUs.
 */
void
disp_kp_free(disp_t *dq)
{
    struct disp_queue_info  mem_info;

    mem_info.olddispq = dq->disp_q;
    mem_info.olddqactmap = dq->disp_qactmap;
    mem_info.oldnglobpris = dq->disp_npri;
    disp_dq_free(&mem_info);
}

/*
 * End dispatcher and scheduler initialization.
 */

/*
 * See if there's anything to do other than remain idle.
 * Return non-zero if there is.
 *
 * This function must be called with high spl, or with
 * kernel preemption disabled to prevent the partition's
 * active cpu list from changing while being traversed.
 */
int
disp_anywork(void)
{
    cpu_t   *cp = CPU;
    cpu_t   *ocp;

    if (cp->cpu_disp->disp_nrunnable != 0)
        return (1);

    if (!(cp->cpu_flags & CPU_OFFLINE)) {
        if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
            return (1);

        /*
         * Work can be taken from another CPU if:
         *	- There is unbound work on the run queue
         *	- That work isn't a thread undergoing a
         *	  context switch on an otherwise empty queue.
         *	- The CPU isn't running the idle loop.
         */
        for (ocp = cp->cpu_next_part; ocp != cp;
            ocp = ocp->cpu_next_part) {
            ASSERT(CPU_ACTIVE(ocp));

            if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
                !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
                ocp->cpu_disp->disp_nrunnable == 1) &&
                ocp->cpu_dispatch_pri != -1)
                return (1);
        }
    }
    return (0);
}

/*
 * Called when CPU enters the idle loop
 */
static void
idle_enter()
{
    cpu_t   *cp = CPU;

    new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
    CPU_STATS_ADDQ(cp, sys, idlethread, 1);
    set_idle_cpu(cp->cpu_id);   /* arch-dependent hook */
}

/*
 * Called when CPU exits the idle loop
 */
static void
idle_exit()
{
    cpu_t   *cp = CPU;

    new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
    unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
}

/*
 * Idle loop.
 */
void
idle()
{
    struct cpu  *cp = CPU;      /* pointer to this CPU */
    kthread_t   *t;             /* taken thread */

    idle_enter();

    /*
     * Uniprocessor version of idle loop.
     * Do this until notified that we're on an actual multiprocessor.
     */
    while (ncpus == 1) {
        if (cp->cpu_disp->disp_nrunnable == 0) {
            (*idle_cpu)();
            continue;
        }
        idle_exit();
        swtch();

        idle_enter();   /* returned from swtch */
    }

    /*
     * Multiprocessor idle loop.
     */
    for (;;) {
        /*
         * If CPU is completely quiesced by p_online(2), just wait
         * here with minimal bus traffic until put online.
         */
        while (cp->cpu_flags & CPU_QUIESCED)
            (*idle_cpu)();

        if (cp->cpu_disp->disp_nrunnable != 0) {
            idle_exit();
            swtch();
        } else {
            if (cp->cpu_flags & CPU_OFFLINE)
                continue;
            if ((t = disp_getwork(cp)) == NULL) {
                if (cp->cpu_chosen_level != -1) {
                    disp_t *dp = cp->cpu_disp;
                    disp_t *kpq;

                    disp_lock_enter(&dp->disp_lock);
                    /*
                     * Set kpq under lock to prevent
                     * migration between partitions.
                     */
                    kpq = &cp->cpu_part->cp_kp_queue;
                    if (kpq->disp_maxrunpri == -1)
                        cp->cpu_chosen_level = -1;
                    disp_lock_exit(&dp->disp_lock);
                }
                (*idle_cpu)();
                continue;
            }
            /*
             * If there was a thread but we couldn't steal
             * it, then keep trying.
             */
            if (t == T_DONTSTEAL)
                continue;
            idle_exit();
            swtch_to(t);
        }
        idle_enter();   /* returned from swtch/swtch_to */
    }
}


/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues.  If panicking, turn
 * preemption off.
 */
void
preempt()
{
    kthread_t   *t = curthread;
    klwp_t      *lwp = ttolwp(curthread);

    if (panicstr)
        return;

    TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

    thread_lock(t);

    if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
        /*
         * this thread has already been chosen to be run on
         * another CPU.  Clear kprunrun on this CPU since we're
         * already headed for swtch().
         */
        CPU->cpu_kprunrun = 0;
        thread_unlock_nopreempt(t);
        TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
    } else {
        if (lwp != NULL)
            lwp->lwp_ru.nivcsw++;
        CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
        THREAD_TRANSITION(t);
        CL_PREEMPT(t);
        DTRACE_SCHED(preempt);
        thread_unlock_nopreempt(t);

        TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

        swtch();    /* clears CPU->cpu_runrun via disp() */
    }
}

extern kthread_t *thread_unpin();

/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 */
static kthread_t *
disp()
{
    cpu_t       *cpup;
    disp_t      *dp;
    kthread_t   *tp;
    dispq_t     *dq;
    int         maxrunword;
    pri_t       pri;
    disp_t      *kpq;

    TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

    cpup = CPU;
    /*
     * Find the highest priority loaded, runnable thread.
     */
    dp = cpup->cpu_disp;

reschedule:
    /*
     * If there is more important work on the global queue with a better
     * priority than the maximum on this CPU, take it now.
     */
    kpq = &cpup->cpu_part->cp_kp_queue;
    while ((pri = kpq->disp_maxrunpri) >= 0 &&
        pri >= dp->disp_maxrunpri &&
        (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
        (tp = disp_getbest(kpq)) != NULL) {
        if (disp_ratify(tp, kpq) != NULL) {
            TRACE_1(TR_FAC_DISP, TR_DISP_END,
                "disp_end:tid %p", tp);
            return (tp);
        }
    }

    disp_lock_enter(&dp->disp_lock);
    pri = dp->disp_maxrunpri;

    /*
     * If there is nothing to run, look at what's runnable on other queues.
     * Choose the idle thread if the CPU is quiesced.
     * Note that CPUs that have the CPU_OFFLINE flag set can still run
     * interrupt threads, which will be the only threads on the CPU's own
     * queue, but cannot run threads from other queues.
     */
    if (pri == -1) {
        if (!(cpup->cpu_flags & CPU_OFFLINE)) {
            disp_lock_exit(&dp->disp_lock);
            if ((tp = disp_getwork(cpup)) == NULL ||
                tp == T_DONTSTEAL) {
                tp = cpup->cpu_idle_thread;
                (void) splhigh();
                THREAD_ONPROC(tp, cpup);
                cpup->cpu_dispthread = tp;
                cpup->cpu_dispatch_pri = -1;
                cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
                cpup->cpu_chosen_level = -1;
            }
        } else {
            disp_lock_exit_high(&dp->disp_lock);
            tp = cpup->cpu_idle_thread;
            THREAD_ONPROC(tp, cpup);
            cpup->cpu_dispthread = tp;
            cpup->cpu_dispatch_pri = -1;
            cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
            cpup->cpu_chosen_level = -1;
        }
        TRACE_1(TR_FAC_DISP, TR_DISP_END,
            "disp_end:tid %p", tp);
        return (tp);
    }

    dq = &dp->disp_q[pri];
    tp = dq->dq_first;

    ASSERT(tp != NULL);
    ASSERT(tp->t_schedflag & TS_LOAD);  /* thread must be swapped in */

    DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

    /*
     * Found it so remove it from queue.
     */
    dp->disp_nrunnable--;
    dq->dq_sruncnt--;
    if ((dq->dq_first = tp->t_link) == NULL) {
        ulong_t *dqactmap = dp->disp_qactmap;

        ASSERT(dq->dq_sruncnt == 0);
        dq->dq_last = NULL;

        /*
         * The queue is empty, so the corresponding bit needs to be
         * turned off in dqactmap.  If disp_nrunnable != 0, we just
         * took the last runnable thread off the highest queue, so
         * recompute disp_maxrunpri.
         */
        maxrunword = pri >> BT_ULSHIFT;
        dqactmap[maxrunword] &= ~BT_BIW(pri);

        if (dp->disp_nrunnable == 0) {
            dp->disp_max_unbound_pri = -1;
            dp->disp_maxrunpri = -1;
        } else {
            int ipri;

            ipri = bt_gethighbit(dqactmap, maxrunword);
            dp->disp_maxrunpri = ipri;
            if (ipri < dp->disp_max_unbound_pri)
                dp->disp_max_unbound_pri = ipri;
        }
    } else {
        tp->t_link = NULL;
    }

    /*
     * Set TS_DONT_SWAP flag to prevent another processor from swapping
     * out this thread before we have a chance to run it.
     * While running, it is protected against swapping by t_lock.
     */
    tp->t_schedflag |= TS_DONT_SWAP;
    cpup->cpu_dispthread = tp;              /* protected by spl only */
    cpup->cpu_dispatch_pri = pri;
    ASSERT(pri == DISP_PRIO(tp));
    thread_onproc(tp, cpup);                /* set t_state to TS_ONPROC */
    disp_lock_exit_high(&dp->disp_lock);    /* drop run queue lock */

    ASSERT(tp != NULL);
    TRACE_1(TR_FAC_DISP, TR_DISP_END,
        "disp_end:tid %p", tp);

    if (disp_ratify(tp, kpq) == NULL)
        goto reschedule;

    return (tp);
}

/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 */
void
swtch()
{
    kthread_t   *t = curthread;
    kthread_t   *next;
    cpu_t       *cp;

    TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

    if (t->t_flag & T_INTR_THREAD)
        cpu_intr_swtch_enter(t);

    if (t->t_intr != NULL) {
        /*
         * We are an interrupt thread.  Setup and return
         * the interrupted thread to be resumed.
         */
        (void) splhigh();       /* block other scheduler action */
        cp = CPU;               /* now protected against migration */
        ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
        CPU_STATS_ADDQ(cp, sys, pswitch, 1);
        CPU_STATS_ADDQ(cp, sys, intrblk, 1);
        next = thread_unpin();
        TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
        resume_from_intr(next);
    } else {
#ifdef DEBUG
        if (t->t_state == TS_ONPROC &&
            t->t_disp_queue->disp_cpu == CPU &&
            t->t_preempt == 0) {
            thread_lock(t);
            ASSERT(t->t_state != TS_ONPROC ||
                t->t_disp_queue->disp_cpu != CPU ||
                t->t_preempt != 0); /* cannot migrate */
            thread_unlock_nopreempt(t);
        }
#endif  /* DEBUG */
        cp = CPU;
        next = disp();          /* returns with spl high */
        ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */

        /* OK to steal anything left on run queue */
        cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

        if (next != t) {
            if (t == cp->cpu_idle_thread) {
                PG_NRUN_UPDATE(cp, 1);
            } else if (next == cp->cpu_idle_thread) {
                PG_NRUN_UPDATE(cp, -1);
            }

            /*
             * If t was previously in the TS_ONPROC state,
             * setfrontdq and setbackdq won't have set its t_waitrq.
             * Since we now finally know that we're switching away
             * from this thread, set its t_waitrq if it is on a run
             * queue.
             */
            if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
                t->t_waitrq = gethrtime_unscaled();
            }

            /*
             * restore mstate of thread that we are switching to
             */
            restore_mstate(next);

            CPU_STATS_ADDQ(cp, sys, pswitch, 1);
            cp->cpu_last_swtch = t->t_disp_time = lbolt;
            TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

            if (dtrace_vtime_active)
                dtrace_vtime_switch(next);

            resume(next);
            /*
             * The TR_RESUME_END and TR_SWTCH_END trace points
             * appear at the end of resume(), because we may not
             * return here
             */
        } else {
            if (t->t_flag & T_INTR_THREAD)
                cpu_intr_swtch_exit(t);

            DTRACE_SCHED(remain__cpu);
            TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
            (void) spl0();
        }
    }
}
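
/*
 * Two specialized variants of swtch() follow: swtch_from_zombie(), used
 * when the current thread is exiting (so that resume() need not check
 * for TS_ZOMB), and swtch_to(), used when the caller has already chosen
 * the next thread, typically one just taken from another CPU's queue by
 * the idle loop.
 */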

/*
 * swtch_from_zombie()
 *	Special case of swtch(), which allows checks for TS_ZOMB to be
 *	eliminated from normal resume.
 *	Find best runnable thread and run it.
 *	Called with the current thread zombied.
 *	Zombies cannot migrate, so CPU references are safe.
 */
void
swtch_from_zombie()
{
    kthread_t   *next;
    cpu_t       *cpu = CPU;

    TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

    ASSERT(curthread->t_state == TS_ZOMB);

    next = disp();      /* returns with spl high */
    ASSERT(CPU_ON_INTR(CPU) == 0);  /* not called with PIL > 10 */
    CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
    ASSERT(next != curthread);
    TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

    if (next == cpu->cpu_idle_thread)
        PG_NRUN_UPDATE(cpu, -1);

    restore_mstate(next);

    if (dtrace_vtime_active)
        dtrace_vtime_switch(next);

    resume_from_zombie(next);
    /*
     * The TR_RESUME_END and TR_SWTCH_END trace points
     * appear at the end of resume(), because we certainly will not
     * return here
     */
}

#if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
static int
thread_on_queue(kthread_t *tp)
{
    cpu_t   *cp;
    cpu_t   *self;
    disp_t  *dp;

    self = CPU;
    cp = self->cpu_next_onln;
    dp = cp->cpu_disp;
    for (;;) {
        dispq_t *dq;
        dispq_t *eq;

        disp_lock_enter_high(&dp->disp_lock);
        for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
            kthread_t   *rp;

            ASSERT(dq->dq_last == NULL ||
                dq->dq_last->t_link == NULL);
            for (rp = dq->dq_first; rp; rp = rp->t_link)
                if (tp == rp) {
                    disp_lock_exit_high(&dp->disp_lock);
                    return (1);
                }
        }
        disp_lock_exit_high(&dp->disp_lock);
        if (cp == NULL)
            break;
        if (cp == self) {
            /* check the partition's kp queue last */
            dp = &self->cpu_part->cp_kp_queue;
            cp = NULL;
        } else {
            cp = cp->cpu_next_onln;
            dp = cp->cpu_disp;
        }
    }
    return (0);
}   /* end of thread_on_queue */
#else

#define thread_on_queue(tp) 0   /* ASSERT must be !thread_on_queue */

#endif  /* DEBUG */

/*
 * like swtch(), but switch to a specified thread taken from another CPU.
 *	called with spl high..
 */
void
swtch_to(kthread_t *next)
{
    cpu_t   *cp = CPU;

    TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

    /*
     * Update context switch statistics.
     */
    CPU_STATS_ADDQ(cp, sys, pswitch, 1);

    TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

    if (curthread == cp->cpu_idle_thread)
        PG_NRUN_UPDATE(cp, 1);

    /* OK to steal anything left on run queue */
    cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

    /* record last execution time */
    cp->cpu_last_swtch = curthread->t_disp_time = lbolt;

    /*
     * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
     * won't have set its t_waitrq.  Since we now finally know that we're
     * switching away from this thread, set its t_waitrq if it is on a run
     * queue.
     */
    if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
        curthread->t_waitrq = gethrtime_unscaled();
    }

    /* restore next thread to previously running microstate */
    restore_mstate(next);

    if (dtrace_vtime_active)
        dtrace_vtime_switch(next);

    resume(next);
    /*
     * The TR_RESUME_END and TR_SWTCH_END trace points
     * appear at the end of resume(), because we may not
     * return here
     */
}



#define CPU_IDLING(pri) ((pri) == -1)

static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
    int     call_poke_cpu = 0;
    pri_t   cpupri = cp->cpu_dispatch_pri;

    if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
        TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
            "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
        if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
            cp->cpu_runrun = 1;
            aston(cp->cpu_dispthread);
            if (tpri < kpreemptpri && cp != CPU)
                call_poke_cpu = 1;
        }
        if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
            cp->cpu_kprunrun = 1;
            if (cp != CPU)
                call_poke_cpu = 1;
        }
    }

    /*
     * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
     */
    membar_enter();

    if (call_poke_cpu)
        poke_cpu(cp->cpu_id);
}

/*
 * Perform multi-level CMT load balancing of running threads.
 * tp is the thread being enqueued
 * cp is the hint CPU (chosen by cpu_choose()).
 */
static cpu_t *
cmt_balance(kthread_t *tp, cpu_t *cp)
{
    int         hint, i, cpu, nsiblings;
    int         self = 0;
    group_t     *cmt_pgs, *siblings;
    pg_cmt_t    *pg, *pg_tmp, *tpg = NULL;
    int         pg_nrun, tpg_nrun;
    int         level = 0;
    cpu_t       *newcp;

    ASSERT(THREAD_LOCK_HELD(tp));

    cmt_pgs = &cp->cpu_pg->cmt_pgs;

    if (GROUP_SIZE(cmt_pgs) == 0)
        return (cp);    /* nothing to do */

    if (tp == curthread)
        self = 1;

    /*
     * Balance across siblings in the CPU's CMT lineage
     */
    do {
        pg = GROUP_ACCESS(cmt_pgs, level);

        siblings = pg->cmt_siblings;
        nsiblings = GROUP_SIZE(siblings);   /* self inclusive */
        if (nsiblings == 1)
            continue;   /* nobody to balance against */

        pg_nrun = pg->cmt_nrunning;
        if (self &&
            bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid))
            pg_nrun--;  /* Ignore curthread's effect */

        hint = pg->cmt_hint;
        /*
         * Check for validity of the hint
         * It should reference a valid sibling
         */
        if (hint >= nsiblings)
            hint = pg->cmt_hint = 0;
        else
            pg->cmt_hint++;

        /*
         * Find a balancing candidate from among our siblings
         * "hint" is a hint for where to start looking
         */
        i = hint;
        do {
            ASSERT(i < nsiblings);
            pg_tmp = GROUP_ACCESS(siblings, i);

            /*
             * The candidate must not be us, and must
             * have some CPU resources in the thread's
             * partition
             */
            if (pg_tmp != pg &&
                bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
                ((pg_t *)pg_tmp)->pg_id)) {
                tpg = pg_tmp;
                break;
            }

            if (++i >= nsiblings)
                i = 0;
        } while (i != hint);

        if (!tpg)
            continue;   /* no candidates at this level */

        /*
         * Check if the balancing target is underloaded
         * Decide to balance if the target is running fewer
         * threads, or if it's running the same number of threads
         * with more online CPUs
         */
        tpg_nrun = tpg->cmt_nrunning;
        if (pg_nrun > tpg_nrun ||
            (pg_nrun == tpg_nrun &&
            (GROUP_SIZE(&tpg->cmt_cpus_actv) >
            GROUP_SIZE(&pg->cmt_cpus_actv)))) {
            break;
        }
        tpg = NULL;
    } while (++level < GROUP_SIZE(cmt_pgs));


    if (tpg) {
        /*
         * Select an idle CPU from the target PG
         */
        for (cpu = 0; cpu < GROUP_SIZE(&tpg->cmt_cpus_actv); cpu++) {
            newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
            if (newcp->cpu_part == tp->t_cpupart &&
                newcp->cpu_dispatch_pri == -1) {
                cp = newcp;
                break;
            }
        }
    }

    return (cp);
}

/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq lengths
 * must match.  When the per-thread TS_RUNQMATCH flag is set, setbackdq() will
 * try to keep runqs perfectly balanced regardless of the thread priority.
 */
#define RUNQ_MATCH_PRI  16  /* pri below which queue lengths must match */
#define RUNQ_MAX_DIFF   2   /* maximum runq length difference */
#define RUNQ_LEN(cp, pri)   ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)

/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setbackdq(kthread_t *tp)
{
    dispq_t *dq;
    disp_t  *dp;
    cpu_t   *cp;
    pri_t   tpri;
    int     bound;

    ASSERT(THREAD_LOCK_HELD(tp));
    ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
    ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */

    /*
     * If thread is "swapped" or on the swap queue don't
     * queue it, but wake sched.
     */
    if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
        disp_swapped_setrun(tp);
        return;
    }

    if (tp->t_bound_cpu || tp->t_weakbound_cpu)
        bound = 1;
    else
        bound = 0;

    tpri = DISP_PRIO(tp);
    if (ncpus == 1)
        cp = tp->t_cpu;
    else if (!bound) {
        if (tpri >= kpqpri) {
            setkpdq(tp, SETKP_BACK);
            return;
        }
        /*
         * Let cpu_choose suggest a CPU.
         */
        cp = cpu_choose(tp, tpri);

        if (tp->t_cpupart == cp->cpu_part) {
            int qlen;

            /*
             * Perform any CMT load balancing
             */
            cp = cmt_balance(tp, cp);

            /*
             * Balance across the run queues
             */
            qlen = RUNQ_LEN(cp, tpri);
            if (tpri >= RUNQ_MATCH_PRI &&
                !(tp->t_schedflag & TS_RUNQMATCH))
                qlen -= RUNQ_MAX_DIFF;
            if (qlen > 0) {
                cpu_t *newcp;

                if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
                    newcp = cp->cpu_next_part;
                } else if ((newcp = cp->cpu_next_lpl) == cp) {
                    newcp = cp->cpu_next_part;
                }

                if (RUNQ_LEN(newcp, tpri) < qlen) {
                    DTRACE_PROBE3(runq__balance,
                        kthread_t *, tp,
                        cpu_t *, cp, cpu_t *, newcp);
                    cp = newcp;
                }
            }
        } else {
            /*
             * Migrate to a cpu in the new partition.
             */
            cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
                tp->t_lpl, tp->t_pri, NULL);
        }
        ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
    } else {
        /*
         * It is possible that t_weakbound_cpu != t_bound_cpu (for
         * a short time until weak binding that existed when the
         * strong binding was established has dropped) so we must
         * favour weak binding over strong.
         */
        cp = tp->t_weakbound_cpu ?
            tp->t_weakbound_cpu : tp->t_bound_cpu;
    }
    /*
     * A thread that is ONPROC may be temporarily placed on the run queue
     * but then chosen to run again by disp.  If the thread we're placing on
     * the queue is in TS_ONPROC state, don't set its t_waitrq until a
     * replacement process is actually scheduled in swtch().  In this
     * situation, curthread is the only thread that could be in the ONPROC
     * state.
     */
    if ((tp != curthread) && (tp->t_waitrq == 0)) {
        hrtime_t curtime;

        curtime = gethrtime_unscaled();
        (void) cpu_update_pct(tp, curtime);
        tp->t_waitrq = curtime;
    } else {
        (void) cpu_update_pct(tp, gethrtime_unscaled());
    }

    dp = cp->cpu_disp;
    disp_lock_enter_high(&dp->disp_lock);

    DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
    TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
        tpri, cp, tp);

#ifndef NPROBE
    /* Kernel probe */
    if (tnf_tracing_active)
        tnf_thread_queue(tp, cp, tpri);
#endif  /* NPROBE */

    ASSERT(tpri >= 0 && tpri < dp->disp_npri);

    THREAD_RUN(tp, &dp->disp_lock);     /* set t_state to TS_RUN */
    tp->t_disp_queue = dp;
    tp->t_link = NULL;

    dq = &dp->disp_q[tpri];
    dp->disp_nrunnable++;
    if (!bound)
        dp->disp_steal = 0;
    membar_enter();

    if (dq->dq_sruncnt++ != 0) {
        ASSERT(dq->dq_first != NULL);
        dq->dq_last->t_link = tp;
        dq->dq_last = tp;
    } else {
        ASSERT(dq->dq_first == NULL);
        ASSERT(dq->dq_last == NULL);
        dq->dq_first = dq->dq_last = tp;
        BT_SET(dp->disp_qactmap, tpri);
        if (tpri > dp->disp_maxrunpri) {
            dp->disp_maxrunpri = tpri;
            membar_enter();
            cpu_resched(cp, tpri);
        }
    }

    if (!bound && tpri > dp->disp_max_unbound_pri) {
        if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
            cp == CPU) {
            /*
             * If there are no other unbound threads on the
             * run queue, don't allow other CPUs to steal
             * this thread while we are in the middle of a
             * context switch.  We may just switch to it
             * again right away.  CPU_DISP_DONTSTEAL is cleared
             * in swtch and swtch_to.
             */
            cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
        }
        dp->disp_max_unbound_pri = tpri;
    }
    (*disp_enq_thread)(cp, bound);
}
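
/*
 * setfrontdq() below mirrors setbackdq(), but queues the thread at the
 * front of its priority level and skips the run-queue length and CMT
 * balancing done above: it prefers the thread's t_cpu unless a higher
 * priority thread is already runnable there (or that CPU is the target
 * of an offline request), in which case cpu_choose() picks again.
 */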

/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setfrontdq(kthread_t *tp)
{
    disp_t  *dp;
    dispq_t *dq;
    cpu_t   *cp;
    pri_t   tpri;
    int     bound;

    ASSERT(THREAD_LOCK_HELD(tp));
    ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
    ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */

    /*
     * If thread is "swapped" or on the swap queue don't
     * queue it, but wake sched.
     */
    if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
        disp_swapped_setrun(tp);
        return;
    }

    if (tp->t_bound_cpu || tp->t_weakbound_cpu)
        bound = 1;
    else
        bound = 0;

    tpri = DISP_PRIO(tp);
    if (ncpus == 1)
        cp = tp->t_cpu;
    else if (!bound) {
        if (tpri >= kpqpri) {
            setkpdq(tp, SETKP_FRONT);
            return;
        }
        cp = tp->t_cpu;
        if (tp->t_cpupart == cp->cpu_part) {
            /*
             * If we are of higher or equal priority than
             * the highest priority runnable thread of
             * the current CPU, just pick this CPU.  Otherwise,
             * let cpu_choose() select the CPU.  If this cpu
             * is the target of an offline request then do not
             * pick it - a thread_nomigrate() on the in motion
             * cpu relies on this when it forces a preempt.
             */
            if (tpri < cp->cpu_disp->disp_maxrunpri ||
                cp == cpu_inmotion)
                cp = cpu_choose(tp, tpri);
        } else {
            /*
             * Migrate to a cpu in the new partition.
             */
            cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
                tp->t_lpl, tp->t_pri, NULL);
        }
        ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
    } else {
        /*
         * It is possible that t_weakbound_cpu != t_bound_cpu (for
         * a short time until weak binding that existed when the
         * strong binding was established has dropped) so we must
         * favour weak binding over strong.
         */
        cp = tp->t_weakbound_cpu ?
            tp->t_weakbound_cpu : tp->t_bound_cpu;
    }

    /*
     * A thread that is ONPROC may be temporarily placed on the run queue
     * but then chosen to run again by disp.  If the thread we're placing on
     * the queue is in TS_ONPROC state, don't set its t_waitrq until a
     * replacement process is actually scheduled in swtch().  In this
     * situation, curthread is the only thread that could be in the ONPROC
     * state.
     */
    if ((tp != curthread) && (tp->t_waitrq == 0)) {
        hrtime_t curtime;

        curtime = gethrtime_unscaled();
        (void) cpu_update_pct(tp, curtime);
        tp->t_waitrq = curtime;
    } else {
        (void) cpu_update_pct(tp, gethrtime_unscaled());
    }

    dp = cp->cpu_disp;
    disp_lock_enter_high(&dp->disp_lock);

    TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
    DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
    /* Kernel probe */
    if (tnf_tracing_active)
        tnf_thread_queue(tp, cp, tpri);
#endif  /* NPROBE */

    ASSERT(tpri >= 0 && tpri < dp->disp_npri);

    THREAD_RUN(tp, &dp->disp_lock);     /* set TS_RUN state and lock */
    tp->t_disp_queue = dp;

    dq = &dp->disp_q[tpri];
    dp->disp_nrunnable++;
    if (!bound)
        dp->disp_steal = 0;
    membar_enter();

    if (dq->dq_sruncnt++ != 0) {
        ASSERT(dq->dq_last != NULL);
        tp->t_link = dq->dq_first;
        dq->dq_first = tp;
    } else {
        ASSERT(dq->dq_last == NULL);
        ASSERT(dq->dq_first == NULL);
        tp->t_link = NULL;
        dq->dq_first = dq->dq_last = tp;
        BT_SET(dp->disp_qactmap, tpri);
        if (tpri > dp->disp_maxrunpri) {
            dp->disp_maxrunpri = tpri;
            membar_enter();
            cpu_resched(cp, tpri);
        }
    }

    if (!bound && tpri > dp->disp_max_unbound_pri) {
        if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
            cp == CPU) {
            /*
             * If there are no other unbound threads on the
             * run queue, don't allow other CPUs to steal
             * this thread while we are in the middle of a
             * context switch.  We may just switch to it
             * again right away.  CPU_DISP_DONTSTEAL is cleared
             * in swtch and swtch_to.
             */
            cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
        }
        dp->disp_max_unbound_pri = tpri;
    }
    (*disp_enq_thread)(cp, bound);
}

/*
 * Put a high-priority unbound thread on the kp queue
 */
static void
setkpdq(kthread_t *tp, int borf)
{
    dispq_t *dq;
    disp_t  *dp;
    cpu_t   *cp;
    pri_t   tpri;

    tpri = DISP_PRIO(tp);

    dp = &tp->t_cpupart->cp_kp_queue;
    disp_lock_enter_high(&dp->disp_lock);

    TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);

    ASSERT(tpri >= 0 && tpri < dp->disp_npri);
    DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
    THREAD_RUN(tp, &dp->disp_lock);     /* set t_state to TS_RUN */
    tp->t_disp_queue = dp;
    dp->disp_nrunnable++;
    dq = &dp->disp_q[tpri];

    if (dq->dq_sruncnt++ != 0) {
        if (borf == SETKP_BACK) {
            ASSERT(dq->dq_first != NULL);
            tp->t_link = NULL;
            dq->dq_last->t_link = tp;
            dq->dq_last = tp;
        } else {
            ASSERT(dq->dq_last != NULL);
            tp->t_link = dq->dq_first;
            dq->dq_first = tp;
        }
    } else {
        if (borf == SETKP_BACK) {
            ASSERT(dq->dq_first == NULL);
            ASSERT(dq->dq_last == NULL);
            dq->dq_first = dq->dq_last = tp;
        } else {
            ASSERT(dq->dq_last == NULL);
            ASSERT(dq->dq_first == NULL);
            tp->t_link = NULL;
            dq->dq_first = dq->dq_last = tp;
        }
        BT_SET(dp->disp_qactmap, tpri);
        if (tpri > dp->disp_max_unbound_pri)
            dp->disp_max_unbound_pri = tpri;
        if (tpri > dp->disp_maxrunpri) {
            dp->disp_maxrunpri = tpri;
            membar_enter();
        }
    }

    cp = tp->t_cpu;
    if (tp->t_cpupart != cp->cpu_part) {
        /* migrate to a cpu in the new partition */
        cp = tp->t_cpupart->cp_cpulist;
    }
    cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
    disp_lock_enter_high(&cp->cpu_disp->disp_lock);
    ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);

#ifndef NPROBE
    /* Kernel probe */
    if (tnf_tracing_active)
        tnf_thread_queue(tp, cp, tpri);
#endif  /* NPROBE */

    if (cp->cpu_chosen_level < tpri)
        cp->cpu_chosen_level = tpri;
    cpu_resched(cp, tpri);
    disp_lock_exit_high(&cp->cpu_disp->disp_lock);
    (*disp_enq_thread)(cp, 0);
}

/*
 * Remove a thread from the dispatcher queue if it is on it.
 * It is not an error if it is not found but we return whether
 * or not it was found in case the caller wants to check.
 */
int
dispdeq(kthread_t *tp)
{
    disp_t      *dp;
    dispq_t     *dq;
    kthread_t   *rp;
    kthread_t   *trp;
    kthread_t   **ptp;
    int         tpri;

    ASSERT(THREAD_LOCK_HELD(tp));

    if (tp->t_state != TS_RUN)
        return (0);

    /*
     * The thread is "swapped" or is on the swap queue and
     * hence no longer on the run queue, so return true.
     */
    if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
        return (1);

    tpri = DISP_PRIO(tp);
    dp = tp->t_disp_queue;
    ASSERT(tpri < dp->disp_npri);
    dq = &dp->disp_q[tpri];
    ptp = &dq->dq_first;
    rp = *ptp;
    trp = NULL;

    ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

    /*
     * Search for thread in queue.
     * Double links would simplify this at the expense of disp/setrun.
     */
    while (rp != tp && rp != NULL) {
        trp = rp;
        ptp = &trp->t_link;
        rp = trp->t_link;
    }

    if (rp == NULL) {
        panic("dispdeq: thread not on queue");
    }

    DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

    /*
     * Found it so remove it from queue.
     */
    if ((*ptp = rp->t_link) == NULL)
        dq->dq_last = trp;

    dp->disp_nrunnable--;
    if (--dq->dq_sruncnt == 0) {
        dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
        if (dp->disp_nrunnable == 0) {
            dp->disp_max_unbound_pri = -1;
            dp->disp_maxrunpri = -1;
        } else if (tpri == dp->disp_maxrunpri) {
            int ipri;

            ipri = bt_gethighbit(dp->disp_qactmap,
                dp->disp_maxrunpri >> BT_ULSHIFT);
            if (ipri < dp->disp_max_unbound_pri)
                dp->disp_max_unbound_pri = ipri;
            dp->disp_maxrunpri = ipri;
        }
    }
    tp->t_link = NULL;
    THREAD_TRANSITION(tp);      /* put in intermediate state */
    return (1);
}


/*
 * dq_sruninc and dq_srundec are public functions for
 * incrementing/decrementing the sruncnts when a thread on
 * a dispatcher queue is made schedulable/unschedulable by
 * resetting the TS_LOAD flag.
 *
 * The caller MUST have the thread lock and therefore the dispatcher
 * queue lock so that the operation which changes
 * the flag, the operation that checks the status of the thread to
 * determine if it's on a disp queue AND the call to this function
 * are one atomic operation with respect to interrupts.
 */

/*
 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 */
void
dq_sruninc(kthread_t *t)
{
    ASSERT(t->t_state == TS_RUN);
    ASSERT(t->t_schedflag & TS_LOAD);

    THREAD_TRANSITION(t);
    setfrontdq(t);
}

/*
 * See comment on calling conventions above.
 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 */
void
dq_srundec(kthread_t *t)
{
    ASSERT(t->t_schedflag & TS_LOAD);

    (void) dispdeq(t);
    disp_swapped_enq(t);
}

/*
 * Change the dispatcher lock of the thread to the "swapped_lock"
 * and return with the thread lock still held.
 *
 * Called with thread_lock held, in transition state, and at high spl.
 */
void
disp_swapped_enq(kthread_t *tp)
{
    ASSERT(THREAD_LOCK_HELD(tp));
    ASSERT(tp->t_schedflag & TS_LOAD);

    switch (tp->t_state) {
    case TS_RUN:
        disp_lock_enter_high(&swapped_lock);
        THREAD_SWAP(tp, &swapped_lock);     /* set TS_RUN state and lock */
        break;
    case TS_ONPROC:
        disp_lock_enter_high(&swapped_lock);
        THREAD_TRANSITION(tp);
        wake_sched_sec = 1;                 /* tell clock to wake sched */
        THREAD_SWAP(tp, &swapped_lock);     /* set TS_RUN state and lock */
        break;
    default:
        panic("disp_swapped: tp: %p bad t_state", (void *)tp);
    }
}
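
/*
 * Swapped threads are never left on the per-CPU run queues themselves:
 * setbackdq() and setfrontdq() divert them to disp_swapped_setrun()
 * below, dq_srundec() hands them to disp_swapped_enq() above, and both
 * paths set wake_sched or wake_sched_sec where needed so that the
 * swapper (sched) eventually brings the thread back in.
 */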

/*
 * This routine is called by setbackdq/setfrontdq if the thread is
 * not loaded or loaded and on the swap queue.
 *
 * Thread state TS_SLEEP implies that a swapped thread
 * has been woken up and needs to be swapped in by the swapper.
 *
 * Thread state TS_RUN implies that the priority of a swapped
 * thread is being increased by its scheduling class (e.g. ts_update).
 */
static void
disp_swapped_setrun(kthread_t *tp)
{
    ASSERT(THREAD_LOCK_HELD(tp));
    ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);

    switch (tp->t_state) {
    case TS_SLEEP:
        disp_lock_enter_high(&swapped_lock);
        /*
         * Wakeup sched immediately (i.e., next tick) if the
         * thread priority is above maxclsyspri.
         */
        if (DISP_PRIO(tp) > maxclsyspri)
            wake_sched = 1;
        else
            wake_sched_sec = 1;
        THREAD_RUN(tp, &swapped_lock);  /* set TS_RUN state and lock */
        break;
    case TS_RUN:                        /* called from ts_update */
        break;
    default:
        panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
    }
}


/*
 * Make a thread give up its processor.  Find the processor on
 * which this thread is executing, and have that processor
 * preempt.
 */
void
cpu_surrender(kthread_t *tp)
{
    cpu_t   *cpup;
    int     max_pri;
    int     max_run_pri;
    klwp_t  *lwp;

    ASSERT(THREAD_LOCK_HELD(tp));

    if (tp->t_state != TS_ONPROC)
        return;
    cpup = tp->t_disp_queue->disp_cpu;  /* CPU thread dispatched to */
    max_pri = cpup->cpu_disp->disp_maxrunpri;   /* best pri of that CPU */
    max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
    if (max_pri < max_run_pri)
        max_pri = max_run_pri;

    cpup->cpu_runrun = 1;
    if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
        cpup->cpu_kprunrun = 1;
    }

    /*
     * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
     */
    membar_enter();

    DTRACE_SCHED1(surrender, kthread_t *, tp);

    /*
     * Make the target thread take an excursion through trap()
     * to do preempt() (unless we're already in trap or post_syscall,
     * calling cpu_surrender via CL_TRAPRET).
     */
    if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
        lwp->lwp_state != LWP_USER) {
        aston(tp);
        if (cpup != CPU)
            poke_cpu(cpup->cpu_id);
    }
    TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
        "cpu_surrender:tid %p cpu %p", tp, cpup);
}


/*
 * Commit to and ratify a scheduling decision
 */
/*ARGSUSED*/
static kthread_t *
disp_ratify(kthread_t *tp, disp_t *kpq)
{
    pri_t   tpri, maxpri;
    pri_t   maxkpri;
    cpu_t   *cpup;

    ASSERT(tp != NULL);
    /*
     * Commit to, then ratify scheduling decision
     */
    cpup = CPU;
    if (cpup->cpu_runrun != 0)
        cpup->cpu_runrun = 0;
    if (cpup->cpu_kprunrun != 0)
        cpup->cpu_kprunrun = 0;
    if (cpup->cpu_chosen_level != -1)
        cpup->cpu_chosen_level = -1;
    membar_enter();
    tpri = DISP_PRIO(tp);
    maxpri = cpup->cpu_disp->disp_maxrunpri;
    maxkpri = kpq->disp_maxrunpri;
    if (maxpri < maxkpri)
        maxpri = maxkpri;
    if (tpri < maxpri) {
        /*
         * should have done better
         * put this one back and indicate to try again
         */
        cpup->cpu_dispthread = curthread;   /* fixup dispthread */
        cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
        thread_lock_high(tp);
        THREAD_TRANSITION(tp);
        setfrontdq(tp);
        thread_unlock_nopreempt(tp);

        tp = NULL;
    }
    return (tp);
}
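
/*
 * disp_getwork() below drives the work-stealing path used by the idle
 * loop: it scans the partition's kp queue and then other CPUs' queues
 * in order of lpl (locality group) distance, uses disp_getbest() to
 * dequeue the highest priority unbound thread from the chosen queue,
 * and disp_ratify() above to make sure a better choice has not shown
 * up in the meantime.
 */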

/*
 * See if there is any work on the dispatcher queue for other CPUs.
 * If there is, dequeue the best thread and return.
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
    cpu_t       *ocp;           /* other CPU */
    cpu_t       *ocp_start;
    cpu_t       *tcp;           /* target local CPU */
    kthread_t   *tp;
    kthread_t   *retval = NULL;
    pri_t       maxpri;
    disp_t      *kpq;           /* kp queue for this partition */
    lpl_t       *lpl, *lpl_leaf;
    int         hint, leafidx;
    hrtime_t    stealtime;

    maxpri = -1;
    tcp = NULL;

    kpq = &cp->cpu_part->cp_kp_queue;
    while (kpq->disp_maxrunpri >= 0) {
        /*
         * Try to take a thread from the kp_queue.
         */
        tp = disp_getbest(kpq);
        if (tp)
            return (disp_ratify(tp, kpq));
    }

    kpreempt_disable();     /* protect the cpu_active list */

    /*
     * Try to find something to do on another CPU's run queue.
     * Loop through all other CPUs looking for the one with the highest
     * priority unbound thread.
     *
     * On NUMA machines, the partition's CPUs are consulted in order of
     * distance from the current CPU.  This way, the first available
     * work found is also the closest, and will suffer the least
     * from being migrated.
     */
    lpl = lpl_leaf = cp->cpu_lpl;
    hint = leafidx = 0;

    /*
     * This loop traverses the lpl hierarchy.  Higher level lpls represent
     * broader levels of locality
     */
    do {
        /* This loop iterates over the lpl's leaves */
        do {
            if (lpl_leaf != cp->cpu_lpl)
                ocp = lpl_leaf->lpl_cpus;
            else
                ocp = cp->cpu_next_lpl;

            /* This loop iterates over the CPUs in the leaf */
            ocp_start = ocp;
            do {
                pri_t pri;

                ASSERT(CPU_ACTIVE(ocp));

                /*
                 * End our stroll around this lpl if:
                 *
                 * - Something became runnable on the local
                 *   queue...which also ends our stroll around
                 *   the partition.
                 *
                 * - We happen across another idle CPU.
                 *   Since it is patrolling the next portion
                 *   of the lpl's list (assuming it's not
                 *   halted), move to the next higher level
                 *   of locality.
                 */
                if (cp->cpu_disp->disp_nrunnable != 0) {
                    kpreempt_enable();
                    return (NULL);
                }
                if (ocp->cpu_dispatch_pri == -1) {
                    if (ocp->cpu_disp_flags &
                        CPU_DISP_HALTED)
                        continue;
                    else
                        break;
                }

                /*
                 * If there's only one thread and the CPU
                 * is in the middle of a context switch,
                 * or it's currently running the idle thread,
                 * don't steal it.
                 */
                if ((ocp->cpu_disp_flags &
                    CPU_DISP_DONTSTEAL) &&
                    ocp->cpu_disp->disp_nrunnable == 1)
                    continue;

                pri = ocp->cpu_disp->disp_max_unbound_pri;
                if (pri > maxpri) {
                    /*
                     * Don't steal threads that we attempted
                     * to steal recently until they're ready
                     * to be stolen again.
                     */
                    stealtime = ocp->cpu_disp->disp_steal;
                    if (stealtime == 0 ||
                        stealtime - gethrtime() <= 0) {
                        maxpri = pri;
                        tcp = ocp;
                    } else {
                        /*
                         * Don't update tcp, just set
                         * the retval to T_DONTSTEAL, so
                         * that if no acceptable CPUs
                         * are found the return value
                         * will be T_DONTSTEAL rather
                         * than NULL.
                         */
                        retval = T_DONTSTEAL;
                    }
                }
            } while ((ocp = ocp->cpu_next_lpl) != ocp_start);

            if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
                leafidx = 0;
                lpl_leaf = lpl->lpl_rset[leafidx];
            }
        } while (leafidx != hint);

        hint = leafidx = lpl->lpl_hint;
        if ((lpl = lpl->lpl_parent) != NULL)
            lpl_leaf = lpl->lpl_rset[hint];
    } while (!tcp && lpl);

    kpreempt_enable();

    /*
     * If another queue looks good, and there is still nothing on
     * the local queue, try to transfer one or more threads
     * from it to our queue.
     */
    if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
        tp = disp_getbest(tcp->cpu_disp);
        if (tp == NULL || tp == T_DONTSTEAL)
            return (tp);
        return (disp_ratify(tp, kpq));
    }
    return (retval);
}


/*
 * disp_fix_unbound_pri()
 *	Determines the maximum priority of unbound threads on the queue.
 *	The priority is kept for the queue, but is only increased, never
 *	reduced unless some CPU is looking for something on that queue.
 *
 *	The priority argument is the known upper limit.
 *
 *	Perhaps this should be kept accurately, but that probably means
 *	separate bitmaps for bound and unbound threads.  Since only idled
 *	CPUs will have to do this recalculation, it seems better this way.
 */
static void
disp_fix_unbound_pri(disp_t *dp, pri_t pri)
{
    kthread_t   *tp;
    dispq_t     *dq;
    ulong_t     *dqactmap = dp->disp_qactmap;
    ulong_t     mapword;
    int         wx;

    ASSERT(DISP_LOCK_HELD(&dp->disp_lock));

    ASSERT(pri >= 0);           /* checked by caller */

    /*
     * Start the search at the next lowest priority below the supplied
     * priority.  This depends on the bitmap implementation.
     */
    do {
        wx = pri >> BT_ULSHIFT; /* index of word in map */

        /*
         * Form mask for all lower priorities in the word.
         */
        mapword = dqactmap[wx] & (BT_BIW(pri) - 1);

        /*
         * Get next lower active priority.
         */
        if (mapword != 0) {
            pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
        } else if (wx > 0) {
            pri = bt_gethighbit(dqactmap, wx - 1);  /* sign extend */
            if (pri < 0)
                break;
        } else {
            pri = -1;
            break;
        }

        /*
         * Search the queue for unbound, runnable threads.
         */
        dq = &dp->disp_q[pri];
        tp = dq->dq_first;

        while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
            tp = tp->t_link;
        }

        /*
         * If a thread was found, set the priority and return.
         */
    } while (tp == NULL);

    /*
     * pri holds the maximum unbound thread priority or -1.
     */
    if (dp->disp_max_unbound_pri != pri)
        dp->disp_max_unbound_pri = pri;
}

/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
    disp_t  *dp;
    pri_t   tpri;

    ASSERT(THREAD_LOCK_HELD(tp));

/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
	disp_t *dp;
	pri_t tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * Don't do anything if the thread is not bound, or
	 * currently not runnable or swapped out.
	 */
	if (tp->t_bound_cpu == NULL ||
	    tp->t_state != TS_RUN ||
	    tp->t_schedflag & TS_ON_SWAPQ)
		return;

	tpri = DISP_PRIO(tp);
	dp = tp->t_bound_cpu->cpu_disp;
	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	if (tpri > dp->disp_max_unbound_pri)
		dp->disp_max_unbound_pri = tpri;
}

/*
 * disp_getbest()
 *   De-queue the highest priority unbound runnable thread.
 *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
 *   Returns NULL if nothing found.
 *   Returns T_DONTSTEAL if the thread was not stealable, so that the
 *   caller will try again later.
 *
 *   Passed a pointer to a dispatch queue not associated with this CPU, and
 *   its type.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
	kthread_t	*tp;
	dispq_t		*dq;
	pri_t		pri;
	cpu_t		*cp, *tcp;
	boolean_t	allbound;

	disp_lock_enter(&dp->disp_lock);

	/*
	 * If there is nothing to run, or the CPU is in the middle of a
	 * context switch of the only thread, return NULL.
	 */
	tcp = dp->disp_cpu;
	cp = CPU;
	pri = dp->disp_max_unbound_pri;
	if (pri == -1 ||
	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
	    tcp->cpu_disp->disp_nrunnable == 1)) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	dq = &dp->disp_q[pri];

	/*
	 * Assume that all threads are bound on this queue, and change it
	 * later when we find out that it is not the case.
	 */
	allbound = B_TRUE;
	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
		hrtime_t now, nosteal, rqtime;

		/*
		 * Skip over bound threads which could be here even
		 * though disp_max_unbound_pri indicated this level.
		 */
		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
			continue;

		/*
		 * We've got some unbound threads on this queue, so turn
		 * the allbound flag off now.
		 */
		allbound = B_FALSE;

		/*
		 * The thread is a candidate for stealing from its run queue.
		 * We don't want to steal threads that became runnable just a
		 * moment ago.  This improves CPU affinity for threads that get
		 * preempted for short periods of time and go back on the run
		 * queue.
		 *
		 * We want to let it stay on its run queue if it was only placed
		 * there recently and it was running on the same CPU before that
		 * to preserve its cache investment.  For the thread to remain
		 * on its run queue, ALL of the following conditions must be
		 * satisfied:
		 *
		 * - the disp queue should not be the kernel preemption queue
		 * - delayed idle stealing should not be disabled
		 * - nosteal_nsec should be non-zero
		 * - it should run with user priority
		 * - it should be on the run queue of the CPU where it was
		 *   running before being placed on the run queue
		 * - it should be the only thread on the run queue (to prevent
		 *   extra scheduling latency for other threads)
		 * - it should sit on the run queue for less than per-chip
		 *   nosteal interval or global nosteal interval
		 * - in case of CPUs with shared cache it should sit in a run
		 *   queue of a CPU from a different chip
		 *
		 * The checks are arranged so that the ones that are faster are
		 * placed earlier.
		 */
		if (tcp == NULL ||
		    pri >= minclsyspri ||
		    tp->t_cpu != tcp)
			break;

		/*
		 * Steal immediately if, due to the CMT processor architecture,
		 * migration between cp and tcp would incur no performance
		 * penalty.
		 */
		if (pg_cmt_can_migrate(cp, tcp))
			break;

		nosteal = nosteal_nsec;
		if (nosteal == 0)
			break;

		/*
		 * Calculate time spent sitting on run queue
		 */
		now = gethrtime_unscaled();
		rqtime = now - tp->t_waitrq;
		scalehrtime(&rqtime);

		/*
		 * Steal immediately if the time spent on this run queue is more
		 * than allowed nosteal delay.
		 *
		 * Negative rqtime check is needed here to avoid infinite
		 * stealing delays caused by unlikely but not impossible
		 * drifts between CPU times on different CPUs.
		 */
		if (rqtime > nosteal || rqtime < 0)
			break;

		DTRACE_PROBE4(nosteal, kthread_t *, tp,
		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
		scalehrtime(&now);

		/*
		 * Calculate when this thread becomes stealable
		 */
		now += (nosteal - rqtime);

		/*
		 * Calculate time when some thread becomes stealable
		 */
		if (now < dp->disp_steal)
			dp->disp_steal = now;
	}

	/*
	 * If there were no unbound threads on this queue, find the queue
	 * where they are and then return later.  The value of
	 * disp_max_unbound_pri is not always accurate because it isn't
	 * reduced until another idle CPU looks for work.
	 */
	if (allbound)
		disp_fix_unbound_pri(dp, pri);

	/*
	 * If we reached the end of the queue and found no unbound threads
	 * then return NULL so that other CPUs will be considered.  If there
	 * are unbound threads but they cannot yet be stolen, then
	 * return T_DONTSTEAL and try again later.
	 */
	if (tp == NULL) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (allbound ? NULL : T_DONTSTEAL);
	}

	/*
	 * Found a runnable, unbound thread, so remove it from queue.
	 * dispdeq() requires that we have the thread locked, and we do,
	 * by virtue of holding the dispatch queue lock.  dispdeq() will
	 * put the thread in transition state, thereby dropping the dispq
	 * lock.
	 */

#ifdef DEBUG
	{
		int	thread_was_on_queue;

		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
		ASSERT(thread_was_on_queue);
	}

#else /* DEBUG */
	(void) dispdeq(tp);			/* drops disp_lock */
#endif /* DEBUG */

	/*
	 * Reset the disp_queue steal time - we do not know what the smallest
	 * value across the queue is.
	 */
	dp->disp_steal = 0;

	tp->t_schedflag |= TS_DONT_SWAP;

	/*
	 * Setup thread to run on the current CPU.
	 */
	tp->t_disp_queue = cp->cpu_disp;

	cp->cpu_dispthread = tp;		/* protected by spl only */
	cp->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));

	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);

	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */

	/*
	 * Return with spl high so that swtch() won't need to raise it.
	 * The disp_lock was dropped by dispdeq().
	 */

	return (tp);
}
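
/*
 * Illustrative sketch (not compiled, not part of this file): the timing
 * portion of the nosteal decision in disp_getbest(), with hypothetical names.
 * Given the time a thread has already spent on its run queue (rqtime) and the
 * configured nosteal window, fake_delay_steal() returns nonzero when stealing
 * should be delayed.  Negative rqtime (possible with drift between CPU clocks)
 * always allows the steal, matching the check above.
 */
#if 0
typedef long long fake_hrtime_t;	/* stands in for hrtime_t */

static int
fake_delay_steal(fake_hrtime_t rqtime, fake_hrtime_t nosteal)
{
	if (nosteal == 0)		/* delayed idle stealing disabled */
		return (0);
	if (rqtime < 0)			/* clock drift: steal immediately */
		return (0);
	if (rqtime > nosteal)		/* waited long enough: steal */
		return (0);
	return (1);			/* leave it for its previous CPU */
}
#endif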

/*
 * disp_bound_common() - common routine for higher level functions
 *	that check for bound threads under certain conditions.
 *	If 'threadlistsafe' is set then there is no need to acquire
 *	pidlock to stop the thread list from changing (eg, if
 *	disp_bound_* is called with cpus paused).
 */
static int
disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
{
	int		found = 0;
	kthread_t	*tp;

	ASSERT(flag);

	if (!threadlistsafe)
		mutex_enter(&pidlock);
	tp = curthread;		/* faster than allthreads */
	do {
		if (tp->t_state != TS_FREE) {
			/*
			 * If an interrupt thread is busy, but the
			 * caller doesn't care (i.e. BOUND_INTR is off),
			 * then just ignore it and continue through.
			 */
			if ((tp->t_flag & T_INTR_THREAD) &&
			    !(flag & BOUND_INTR))
				continue;

			/*
			 * Skip the idle thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_idle_thread)
				continue;

			/*
			 * Skip the pause thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_pause_thread)
				continue;

			if ((flag & BOUND_CPU) &&
			    (tp->t_bound_cpu == cp ||
			    tp->t_bind_cpu == cp->cpu_id ||
			    tp->t_weakbound_cpu == cp)) {
				found = 1;
				break;
			}

			if ((flag & BOUND_PARTITION) &&
			    (tp->t_cpupart == cp->cpu_part)) {
				found = 1;
				break;
			}
		}
	} while ((tp = tp->t_next) != curthread && found == 0);
	if (!threadlistsafe)
		mutex_exit(&pidlock);
	return (found);
}

/*
 * disp_bound_threads - return nonzero if threads are bound to the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_threads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
}

/*
 * disp_bound_anythreads - return nonzero if _any_ threads are bound
 *	to the given processor, including interrupt threads.
 */
int
disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
}

/*
 * disp_bound_partition - return nonzero if threads are bound to the same
 *	partition as the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_partition(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
}
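
/*
 * Illustrative sketch (not compiled, not part of this file): how a
 * hypothetical caller might use the disp_bound_*() checks above before taking
 * a CPU out of service.  fake_can_offline() and its policy are invented for
 * this example; the real offline path lives in the CPU management code, not
 * here.
 */
#if 0
static int
fake_can_offline(cpu_t *cp)
{
	/*
	 * Refuse if any thread (including interrupt threads) is bound to
	 * this CPU; such threads could not run anywhere else.
	 */
	if (disp_bound_anythreads(cp, 0))
		return (0);
	return (1);
}
#endif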

/*
 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
 * threads to other CPUs.
 */
void
disp_cpu_inactive(cpu_t *cp)
{
	kthread_t	*tp;
	disp_t		*dp = cp->cpu_disp;
	dispq_t		*dq;
	pri_t		pri;
	int		wasonq;

	disp_lock_enter(&dp->disp_lock);
	while ((pri = dp->disp_max_unbound_pri) != -1) {
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		/*
		 * Skip over bound threads.
		 */
		while (tp != NULL && tp->t_bound_cpu != NULL) {
			tp = tp->t_link;
		}

		if (tp == NULL) {
			/* disp_max_unbound_pri must be inaccurate, so fix it */
			disp_fix_unbound_pri(dp, pri);
			continue;
		}

		wasonq = dispdeq(tp);		/* drops disp_lock */
		ASSERT(wasonq);
		ASSERT(tp->t_weakbound_cpu == NULL);

		setbackdq(tp);
		/*
		 * Called from cpu_offline:
		 *
		 * cp has already been removed from the list of active cpus
		 * and tp->t_cpu has been changed so there is no risk of
		 * tp ending up back on cp.
		 *
		 * Called from cpupart_move_cpu:
		 *
		 * The cpu has moved to a new cpupart.  Any threads that
		 * were on its dispatch queues before the move remain
		 * in the old partition and can't run in the new partition.
		 */
		ASSERT(tp->t_cpu != cp);
		thread_unlock(tp);

		disp_lock_enter(&dp->disp_lock);
	}
	disp_lock_exit(&dp->disp_lock);
}

/*
 * disp_lowpri_cpu - find CPU running the lowest priority thread.
 *	The hint passed in is used as a starting point so we don't favor
 *	CPU 0 or any other CPU.  The caller should pass in the most recently
 *	used CPU for the thread.
 *
 *	The lgroup and priority are used to determine the best CPU to run on
 *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
 *	the thread priority will indicate whether the thread will actually run
 *	there.  To pick the best CPU, the CPUs inside and outside of the given
 *	lgroup which are running the lowest priority threads are found.  The
 *	remote CPU is chosen only if the thread will not run locally on a CPU
 *	within the lgroup, but will run on the remote CPU.  If the thread
 *	cannot immediately run on any CPU, the best local CPU will be chosen.
 *
 *	The lpl specified also identifies the cpu partition from which
 *	disp_lowpri_cpu should select a CPU.
 *
 *	curcpu is used to indicate that disp_lowpri_cpu is being called on
 *	behalf of the current thread.  (curthread is looking for a new cpu)
 *	In this case, cpu_dispatch_pri for this thread's cpu should be
 *	ignored.
 *
 *	If a cpu is the target of an offline request then try to avoid it.
 *
 *	This function must be called at either high SPL, or with preemption
 *	disabled, so that the "hint" CPU cannot be removed from the online
 *	CPU list while we are traversing it.
 */
cpu_t *
disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
{
	cpu_t	*bestcpu;
	cpu_t	*besthomecpu;
	cpu_t	*cp, *cpstart;

	pri_t	bestpri;
	pri_t	cpupri;

	klgrpset_t	done;
	klgrpset_t	cur_set;

	lpl_t		*lpl_iter, *lpl_leaf;
	int		i;

	/*
	 * Scan for a CPU currently running the lowest priority thread.
	 * Cannot get cpu_lock here because it is adaptive.
	 * We do not require lock on CPU list.
	 */
	ASSERT(hint != NULL);
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_ncpu > 0);

	/*
	 * First examine local CPUs.  Note that it's possible the hint CPU
	 * passed in is remote to the specified home lgroup.  If our priority
	 * isn't high enough to run immediately at home, then examine CPUs
	 * remote to our home lgroup.
	 * We would like to give preference to CPUs closest to "home".
	 * If we can't find a CPU where we'll run at a given level
	 * of locality, we expand our search to include the next level.
	 */
	bestcpu = besthomecpu = NULL;
	klgrpset_clear(done);
	/* start with lpl we were passed */

	lpl_iter = lpl;

	do {

		bestpri = SHRT_MAX;
		klgrpset_clear(cur_set);

		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
			lpl_leaf = lpl_iter->lpl_rset[i];
			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
				continue;

			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);

			if (hint->cpu_lpl == lpl_leaf)
				cp = cpstart = hint;
			else
				cp = cpstart = lpl_leaf->lpl_cpus;

			do {
				if (cp == curcpu)
					cpupri = -1;
				else if (cp == cpu_inmotion)
					cpupri = SHRT_MAX;
				else
					cpupri = cp->cpu_dispatch_pri;
				if (cp->cpu_disp->disp_maxrunpri > cpupri)
					cpupri = cp->cpu_disp->disp_maxrunpri;
				if (cp->cpu_chosen_level > cpupri)
					cpupri = cp->cpu_chosen_level;
				if (cpupri < bestpri) {
					if (CPU_IDLING(cpupri)) {
						ASSERT((cp->cpu_flags &
						    CPU_QUIESCED) == 0);
						return (cp);
					}
					bestcpu = cp;
					bestpri = cpupri;
				}
			} while ((cp = cp->cpu_next_lpl) != cpstart);
		}

		if (bestcpu && (tpri > bestpri)) {
			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
			return (bestcpu);
		}
		if (besthomecpu == NULL)
			besthomecpu = bestcpu;
		/*
		 * Add the lgrps we just considered to the "done" set
		 */
		klgrpset_or(done, cur_set);

	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);

	/*
	 * The specified priority isn't high enough to run immediately
	 * anywhere, so just return the best CPU from the home lgroup.
	 */
	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
	return (besthomecpu);
}

/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor's specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
}

/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}

/*
 * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
 *	- t->t_cpu is not in this thread's assigned lgrp
 *	- the time since the thread last came off t->t_cpu exceeds the
 *	  rechoose time for this cpu (ignore this if t is curthread in
 *	  which case it's on CPU and t->t_disp_time is inaccurate)
 *	- t->t_cpu is presently the target of an offline or partition move
 *	  request
 */
static cpu_t *
cpu_choose(kthread_t *t, pri_t tpri)
{
	ASSERT(tpri < kpqpri);

	if ((((lbolt - t->t_disp_time) > rechoose_interval) &&
	    t != curthread) || t->t_cpu == cpu_inmotion) {
		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
	}

	/*
	 * Take a trip through disp_lowpri_cpu() if the thread was
	 * running outside its home lgroup
	 */
	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
		    (t == curthread) ? t->t_cpu : NULL));
	}
	return (t->t_cpu);
}
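
/*
 * Illustrative sketch (not compiled, not part of this file): the "effective
 * priority" a CPU is charged with inside disp_lowpri_cpu() above is the
 * maximum of what it is running now, the best thing waiting on its dispatch
 * queue, and any priority already promised to it via cpu_chosen_level.  The
 * helper name is invented for this example and simply restates that maximum.
 */
#if 0
static pri_t
fake_effective_pri(pri_t dispatch_pri, pri_t maxrunpri, pri_t chosen_level)
{
	pri_t cpupri = dispatch_pri;

	if (maxrunpri > cpupri)
		cpupri = maxrunpri;
	if (chosen_level > cpupri)
		cpupri = chosen_level;

	/* a thread runs here immediately only if its priority exceeds this */
	return (cpupri);
}
#endif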