1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 30 #pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.30 */ 31 32 #include <sys/types.h> 33 #include <sys/param.h> 34 #include <sys/sysmacros.h> 35 #include <sys/signal.h> 36 #include <sys/user.h> 37 #include <sys/systm.h> 38 #include <sys/sysinfo.h> 39 #include <sys/var.h> 40 #include <sys/errno.h> 41 #include <sys/cmn_err.h> 42 #include <sys/debug.h> 43 #include <sys/inline.h> 44 #include <sys/disp.h> 45 #include <sys/class.h> 46 #include <sys/bitmap.h> 47 #include <sys/kmem.h> 48 #include <sys/cpuvar.h> 49 #include <sys/vtrace.h> 50 #include <sys/tnf.h> 51 #include <sys/cpupart.h> 52 #include <sys/lgrp.h> 53 #include <sys/pg.h> 54 #include <sys/cmt.h> 55 #include <sys/bitset.h> 56 #include <sys/schedctl.h> 57 #include <sys/atomic.h> 58 #include <sys/dtrace.h> 59 #include <sys/sdt.h> 60 #include <sys/archsystm.h> 61 62 #include <vm/as.h> 63 64 #define BOUND_CPU 0x1 65 #define BOUND_PARTITION 0x2 66 #define BOUND_INTR 0x4 67 68 /* Dispatch queue allocation structure and functions */ 69 struct disp_queue_info { 70 disp_t *dp; 71 dispq_t *olddispq; 72 dispq_t *newdispq; 73 ulong_t *olddqactmap; 74 ulong_t *newdqactmap; 75 int oldnglobpris; 76 }; 77 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris, 78 disp_t *dp); 79 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris); 80 static void disp_dq_free(struct disp_queue_info *dptr); 81 82 /* platform-specific routine to call when processor is idle */ 83 static void generic_idle_cpu(); 84 void (*idle_cpu)() = generic_idle_cpu; 85 86 /* routines invoked when a CPU enters/exits the idle loop */ 87 static void idle_enter(); 88 static void idle_exit(); 89 90 /* platform-specific routine to call when thread is enqueued */ 91 static void generic_enq_thread(cpu_t *, int); 92 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread; 93 94 pri_t kpreemptpri; /* priority where kernel preemption applies */ 95 pri_t upreemptpri = 0; /* priority where normal preemption applies */ 96 pri_t intr_pri; /* interrupt thread priority base level */ 97 98 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */ 99 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */ 100 disp_t cpu0_disp; /* boot CPU's dispatch queue */ 101 disp_lock_t swapped_lock; /* lock swapped threads and swap queue */ 102 int nswapped; /* total number of swapped threads */ 103 void disp_swapped_enq(kthread_t *tp); 104 static void disp_swapped_setrun(kthread_t *tp); 105 static void cpu_resched(cpu_t *cp, 
pri_t tpri); 106 107 /* 108 * If this is set, only interrupt threads will cause kernel preemptions. 109 * This is done by changing the value of kpreemptpri. kpreemptpri 110 * will either be the max sysclass pri + 1 or the min interrupt pri. 111 */ 112 int only_intr_kpreempt; 113 114 extern void set_idle_cpu(int cpun); 115 extern void unset_idle_cpu(int cpun); 116 static void setkpdq(kthread_t *tp, int borf); 117 #define SETKP_BACK 0 118 #define SETKP_FRONT 1 119 /* 120 * Parameter that determines how recently a thread must have run 121 * on the CPU to be considered loosely-bound to that CPU to reduce 122 * cold cache effects. The interval is in hertz. 123 */ 124 #define RECHOOSE_INTERVAL 3 125 int rechoose_interval = RECHOOSE_INTERVAL; 126 static cpu_t *cpu_choose(kthread_t *, pri_t); 127 128 /* 129 * Parameter that determines how long (in nanoseconds) a thread must 130 * be sitting on a run queue before it can be stolen by another CPU 131 * to reduce migrations. The interval is in nanoseconds. 132 * 133 * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval() 134 * to an appropriate value. nosteal_nsec is set to NOSTEAL_UNINITIALIZED 135 * here indicating it is uninitiallized. 136 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'. 137 * 138 */ 139 #define NOSTEAL_UNINITIALIZED (-1) 140 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED; 141 extern void cmp_set_nosteal_interval(void); 142 143 id_t defaultcid; /* system "default" class; see dispadmin(1M) */ 144 145 disp_lock_t transition_lock; /* lock on transitioning threads */ 146 disp_lock_t stop_lock; /* lock on stopped threads */ 147 148 static void cpu_dispqalloc(int numpris); 149 150 /* 151 * This gets returned by disp_getwork/disp_getbest if we couldn't steal 152 * a thread because it was sitting on its run queue for a very short 153 * period of time. 154 */ 155 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */ 156 157 static kthread_t *disp_getwork(cpu_t *to); 158 static kthread_t *disp_getbest(disp_t *from); 159 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq); 160 161 void swtch_to(kthread_t *); 162 163 /* 164 * dispatcher and scheduler initialization 165 */ 166 167 /* 168 * disp_setup - Common code to calculate and allocate dispatcher 169 * variables and structures based on the maximum priority. 170 */ 171 static void 172 disp_setup(pri_t maxglobpri, pri_t oldnglobpris) 173 { 174 pri_t newnglobpris; 175 176 ASSERT(MUTEX_HELD(&cpu_lock)); 177 178 newnglobpris = maxglobpri + 1 + LOCK_LEVEL; 179 180 if (newnglobpris > oldnglobpris) { 181 /* 182 * Allocate new kp queues for each CPU partition. 183 */ 184 cpupart_kpqalloc(newnglobpris); 185 186 /* 187 * Allocate new dispatch queues for each CPU. 188 */ 189 cpu_dispqalloc(newnglobpris); 190 191 /* 192 * compute new interrupt thread base priority 193 */ 194 intr_pri = maxglobpri; 195 if (only_intr_kpreempt) { 196 kpreemptpri = intr_pri + 1; 197 if (kpqpri == KPQPRI) 198 kpqpri = kpreemptpri; 199 } 200 v.v_nglobpris = newnglobpris; 201 } 202 } 203 204 /* 205 * dispinit - Called to initialize all loaded classes and the 206 * dispatcher framework. 207 */ 208 void 209 dispinit(void) 210 { 211 id_t cid; 212 pri_t maxglobpri; 213 pri_t cl_maxglobpri; 214 215 maxglobpri = -1; 216 217 /* 218 * Initialize transition lock, which will always be set. 
219 */ 220 DISP_LOCK_INIT(&transition_lock); 221 disp_lock_enter_high(&transition_lock); 222 DISP_LOCK_INIT(&stop_lock); 223 224 mutex_enter(&cpu_lock); 225 CPU->cpu_disp->disp_maxrunpri = -1; 226 CPU->cpu_disp->disp_max_unbound_pri = -1; 227 228 /* 229 * Initialize the default CPU partition. 230 */ 231 cpupart_initialize_default(); 232 /* 233 * Call the class specific initialization functions for 234 * all pre-installed schedulers. 235 * 236 * We pass the size of a class specific parameter 237 * buffer to each of the initialization functions 238 * to try to catch problems with backward compatibility 239 * of class modules. 240 * 241 * For example a new class module running on an old system 242 * which didn't provide sufficiently large parameter buffers 243 * would be bad news. Class initialization modules can check for 244 * this and take action if they detect a problem. 245 */ 246 247 for (cid = 0; cid < nclass; cid++) { 248 sclass_t *sc; 249 250 sc = &sclass[cid]; 251 if (SCHED_INSTALLED(sc)) { 252 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ, 253 &sc->cl_funcs); 254 if (cl_maxglobpri > maxglobpri) 255 maxglobpri = cl_maxglobpri; 256 } 257 } 258 kpreemptpri = (pri_t)v.v_maxsyspri + 1; 259 if (kpqpri == KPQPRI) 260 kpqpri = kpreemptpri; 261 262 ASSERT(maxglobpri >= 0); 263 disp_setup(maxglobpri, 0); 264 265 mutex_exit(&cpu_lock); 266 267 /* 268 * Platform specific sticky scheduler setup. 269 */ 270 if (nosteal_nsec == NOSTEAL_UNINITIALIZED) 271 cmp_set_nosteal_interval(); 272 273 /* 274 * Get the default class ID; this may be later modified via 275 * dispadmin(1M). This will load the class (normally TS) and that will 276 * call disp_add(), which is why we had to drop cpu_lock first. 277 */ 278 if (getcid(defaultclass, &defaultcid) != 0) { 279 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'", 280 defaultclass); 281 } 282 } 283 284 /* 285 * disp_add - Called with class pointer to initialize the dispatcher 286 * for a newly loaded class. 287 */ 288 void 289 disp_add(sclass_t *clp) 290 { 291 pri_t maxglobpri; 292 pri_t cl_maxglobpri; 293 294 mutex_enter(&cpu_lock); 295 /* 296 * Initialize the scheduler class. 297 */ 298 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1); 299 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs); 300 if (cl_maxglobpri > maxglobpri) 301 maxglobpri = cl_maxglobpri; 302 303 /* 304 * Save old queue information. Since we're initializing a 305 * new scheduling class which has just been loaded, then 306 * the size of the dispq may have changed. We need to handle 307 * that here. 308 */ 309 disp_setup(maxglobpri, v.v_nglobpris); 310 311 mutex_exit(&cpu_lock); 312 } 313 314 315 /* 316 * For each CPU, allocate new dispatch queues 317 * with the stated number of priorities. 318 */ 319 static void 320 cpu_dispqalloc(int numpris) 321 { 322 cpu_t *cpup; 323 struct disp_queue_info *disp_mem; 324 int i, num; 325 326 ASSERT(MUTEX_HELD(&cpu_lock)); 327 328 disp_mem = kmem_zalloc(NCPU * 329 sizeof (struct disp_queue_info), KM_SLEEP); 330 331 /* 332 * This routine must allocate all of the memory before stopping 333 * the cpus because it must not sleep in kmem_alloc while the 334 * CPUs are stopped. Locks they hold will not be freed until they 335 * are restarted. 
336 */ 337 i = 0; 338 cpup = cpu_list; 339 do { 340 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp); 341 i++; 342 cpup = cpup->cpu_next; 343 } while (cpup != cpu_list); 344 num = i; 345 346 pause_cpus(NULL); 347 for (i = 0; i < num; i++) 348 disp_dq_assign(&disp_mem[i], numpris); 349 start_cpus(); 350 351 /* 352 * I must free all of the memory after starting the cpus because 353 * I can not risk sleeping in kmem_free while the cpus are stopped. 354 */ 355 for (i = 0; i < num; i++) 356 disp_dq_free(&disp_mem[i]); 357 358 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info)); 359 } 360 361 static void 362 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp) 363 { 364 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP); 365 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) * 366 sizeof (long), KM_SLEEP); 367 dptr->dp = dp; 368 } 369 370 static void 371 disp_dq_assign(struct disp_queue_info *dptr, int numpris) 372 { 373 disp_t *dp; 374 375 dp = dptr->dp; 376 dptr->olddispq = dp->disp_q; 377 dptr->olddqactmap = dp->disp_qactmap; 378 dptr->oldnglobpris = dp->disp_npri; 379 380 ASSERT(dptr->oldnglobpris < numpris); 381 382 if (dptr->olddispq != NULL) { 383 /* 384 * Use kcopy because bcopy is platform-specific 385 * and could block while we might have paused the cpus. 386 */ 387 (void) kcopy(dptr->olddispq, dptr->newdispq, 388 dptr->oldnglobpris * sizeof (dispq_t)); 389 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap, 390 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * 391 sizeof (long)); 392 } 393 dp->disp_q = dptr->newdispq; 394 dp->disp_qactmap = dptr->newdqactmap; 395 dp->disp_q_limit = &dptr->newdispq[numpris]; 396 dp->disp_npri = numpris; 397 } 398 399 static void 400 disp_dq_free(struct disp_queue_info *dptr) 401 { 402 if (dptr->olddispq != NULL) 403 kmem_free(dptr->olddispq, 404 dptr->oldnglobpris * sizeof (dispq_t)); 405 if (dptr->olddqactmap != NULL) 406 kmem_free(dptr->olddqactmap, 407 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long)); 408 } 409 410 /* 411 * For a newly created CPU, initialize the dispatch queue. 412 * This is called before the CPU is known through cpu[] or on any lists. 413 */ 414 void 415 disp_cpu_init(cpu_t *cp) 416 { 417 disp_t *dp; 418 dispq_t *newdispq; 419 ulong_t *newdqactmap; 420 421 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */ 422 423 if (cp == cpu0_disp.disp_cpu) 424 dp = &cpu0_disp; 425 else 426 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP); 427 bzero(dp, sizeof (disp_t)); 428 cp->cpu_disp = dp; 429 dp->disp_cpu = cp; 430 dp->disp_maxrunpri = -1; 431 dp->disp_max_unbound_pri = -1; 432 DISP_LOCK_INIT(&cp->cpu_thread_lock); 433 /* 434 * Allocate memory for the dispatcher queue headers 435 * and the active queue bitmap. 436 */ 437 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP); 438 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) * 439 sizeof (long), KM_SLEEP); 440 dp->disp_q = newdispq; 441 dp->disp_qactmap = newdqactmap; 442 dp->disp_q_limit = &newdispq[v.v_nglobpris]; 443 dp->disp_npri = v.v_nglobpris; 444 } 445 446 void 447 disp_cpu_fini(cpu_t *cp) 448 { 449 ASSERT(MUTEX_HELD(&cpu_lock)); 450 451 disp_kp_free(cp->cpu_disp); 452 if (cp->cpu_disp != &cpu0_disp) 453 kmem_free(cp->cpu_disp, sizeof (disp_t)); 454 } 455 456 /* 457 * Allocate new, larger kpreempt dispatch queue to replace the old one. 
458 */ 459 void 460 disp_kp_alloc(disp_t *dq, pri_t npri) 461 { 462 struct disp_queue_info mem_info; 463 464 if (npri > dq->disp_npri) { 465 /* 466 * Allocate memory for the new array. 467 */ 468 disp_dq_alloc(&mem_info, npri, dq); 469 470 /* 471 * We need to copy the old structures to the new 472 * and free the old. 473 */ 474 disp_dq_assign(&mem_info, npri); 475 disp_dq_free(&mem_info); 476 } 477 } 478 479 /* 480 * Free dispatch queue. 481 * Used for the kpreempt queues for a removed CPU partition and 482 * for the per-CPU queues of deleted CPUs. 483 */ 484 void 485 disp_kp_free(disp_t *dq) 486 { 487 struct disp_queue_info mem_info; 488 489 mem_info.olddispq = dq->disp_q; 490 mem_info.olddqactmap = dq->disp_qactmap; 491 mem_info.oldnglobpris = dq->disp_npri; 492 disp_dq_free(&mem_info); 493 } 494 495 /* 496 * End dispatcher and scheduler initialization. 497 */ 498 499 /* 500 * See if there's anything to do other than remain idle. 501 * Return non-zero if there is. 502 * 503 * This function must be called with high spl, or with 504 * kernel preemption disabled to prevent the partition's 505 * active cpu list from changing while being traversed. 506 * 507 */ 508 int 509 disp_anywork(void) 510 { 511 cpu_t *cp = CPU; 512 cpu_t *ocp; 513 514 if (cp->cpu_disp->disp_nrunnable != 0) 515 return (1); 516 517 if (!(cp->cpu_flags & CPU_OFFLINE)) { 518 if (CP_MAXRUNPRI(cp->cpu_part) >= 0) 519 return (1); 520 521 /* 522 * Work can be taken from another CPU if: 523 * - There is unbound work on the run queue 524 * - That work isn't a thread undergoing a 525 * - context switch on an otherwise empty queue. 526 * - The CPU isn't running the idle loop. 527 */ 528 for (ocp = cp->cpu_next_part; ocp != cp; 529 ocp = ocp->cpu_next_part) { 530 ASSERT(CPU_ACTIVE(ocp)); 531 532 if (ocp->cpu_disp->disp_max_unbound_pri != -1 && 533 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) && 534 ocp->cpu_disp->disp_nrunnable == 1) && 535 ocp->cpu_dispatch_pri != -1) 536 return (1); 537 } 538 } 539 return (0); 540 } 541 542 /* 543 * Called when CPU enters the idle loop 544 */ 545 static void 546 idle_enter() 547 { 548 cpu_t *cp = CPU; 549 550 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled()); 551 CPU_STATS_ADDQ(cp, sys, idlethread, 1); 552 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */ 553 } 554 555 /* 556 * Called when CPU exits the idle loop 557 */ 558 static void 559 idle_exit() 560 { 561 cpu_t *cp = CPU; 562 563 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled()); 564 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */ 565 } 566 567 /* 568 * Idle loop. 569 */ 570 void 571 idle() 572 { 573 struct cpu *cp = CPU; /* pointer to this CPU */ 574 kthread_t *t; /* taken thread */ 575 576 idle_enter(); 577 578 /* 579 * Uniprocessor version of idle loop. 580 * Do this until notified that we're on an actual multiprocessor. 581 */ 582 while (ncpus == 1) { 583 if (cp->cpu_disp->disp_nrunnable == 0) { 584 (*idle_cpu)(); 585 continue; 586 } 587 idle_exit(); 588 swtch(); 589 590 idle_enter(); /* returned from swtch */ 591 } 592 593 /* 594 * Multiprocessor idle loop. 595 */ 596 for (;;) { 597 /* 598 * If CPU is completely quiesced by p_online(2), just wait 599 * here with minimal bus traffic until put online. 
600 */ 601 while (cp->cpu_flags & CPU_QUIESCED) 602 (*idle_cpu)(); 603 604 if (cp->cpu_disp->disp_nrunnable != 0) { 605 idle_exit(); 606 swtch(); 607 } else { 608 if (cp->cpu_flags & CPU_OFFLINE) 609 continue; 610 if ((t = disp_getwork(cp)) == NULL) { 611 if (cp->cpu_chosen_level != -1) { 612 disp_t *dp = cp->cpu_disp; 613 disp_t *kpq; 614 615 disp_lock_enter(&dp->disp_lock); 616 /* 617 * Set kpq under lock to prevent 618 * migration between partitions. 619 */ 620 kpq = &cp->cpu_part->cp_kp_queue; 621 if (kpq->disp_maxrunpri == -1) 622 cp->cpu_chosen_level = -1; 623 disp_lock_exit(&dp->disp_lock); 624 } 625 (*idle_cpu)(); 626 continue; 627 } 628 /* 629 * If there was a thread but we couldn't steal 630 * it, then keep trying. 631 */ 632 if (t == T_DONTSTEAL) 633 continue; 634 idle_exit(); 635 swtch_to(t); 636 } 637 idle_enter(); /* returned from swtch/swtch_to */ 638 } 639 } 640 641 642 /* 643 * Preempt the currently running thread in favor of the highest 644 * priority thread. The class of the current thread controls 645 * where it goes on the dispatcher queues. If panicking, turn 646 * preemption off. 647 */ 648 void 649 preempt() 650 { 651 kthread_t *t = curthread; 652 klwp_t *lwp = ttolwp(curthread); 653 654 if (panicstr) 655 return; 656 657 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start"); 658 659 thread_lock(t); 660 661 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) { 662 /* 663 * this thread has already been chosen to be run on 664 * another CPU. Clear kprunrun on this CPU since we're 665 * already headed for swtch(). 666 */ 667 CPU->cpu_kprunrun = 0; 668 thread_unlock_nopreempt(t); 669 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end"); 670 } else { 671 if (lwp != NULL) 672 lwp->lwp_ru.nivcsw++; 673 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1); 674 THREAD_TRANSITION(t); 675 CL_PREEMPT(t); 676 DTRACE_SCHED(preempt); 677 thread_unlock_nopreempt(t); 678 679 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end"); 680 681 swtch(); /* clears CPU->cpu_runrun via disp() */ 682 } 683 } 684 685 extern kthread_t *thread_unpin(); 686 687 /* 688 * disp() - find the highest priority thread for this processor to run, and 689 * set it in TS_ONPROC state so that resume() can be called to run it. 690 */ 691 static kthread_t * 692 disp() 693 { 694 cpu_t *cpup; 695 disp_t *dp; 696 kthread_t *tp; 697 dispq_t *dq; 698 int maxrunword; 699 pri_t pri; 700 disp_t *kpq; 701 702 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start"); 703 704 cpup = CPU; 705 /* 706 * Find the highest priority loaded, runnable thread. 707 */ 708 dp = cpup->cpu_disp; 709 710 reschedule: 711 /* 712 * If there is more important work on the global queue with a better 713 * priority than the maximum on this CPU, take it now. 714 */ 715 kpq = &cpup->cpu_part->cp_kp_queue; 716 while ((pri = kpq->disp_maxrunpri) >= 0 && 717 pri >= dp->disp_maxrunpri && 718 (cpup->cpu_flags & CPU_OFFLINE) == 0 && 719 (tp = disp_getbest(kpq)) != NULL) { 720 if (disp_ratify(tp, kpq) != NULL) { 721 TRACE_1(TR_FAC_DISP, TR_DISP_END, 722 "disp_end:tid %p", tp); 723 return (tp); 724 } 725 } 726 727 disp_lock_enter(&dp->disp_lock); 728 pri = dp->disp_maxrunpri; 729 730 /* 731 * If there is nothing to run, look at what's runnable on other queues. 732 * Choose the idle thread if the CPU is quiesced. 733 * Note that CPUs that have the CPU_OFFLINE flag set can still run 734 * interrupt threads, which will be the only threads on the CPU's own 735 * queue, but cannot run threads from other queues. 
736 */ 737 if (pri == -1) { 738 if (!(cpup->cpu_flags & CPU_OFFLINE)) { 739 disp_lock_exit(&dp->disp_lock); 740 if ((tp = disp_getwork(cpup)) == NULL || 741 tp == T_DONTSTEAL) { 742 tp = cpup->cpu_idle_thread; 743 (void) splhigh(); 744 THREAD_ONPROC(tp, cpup); 745 cpup->cpu_dispthread = tp; 746 cpup->cpu_dispatch_pri = -1; 747 cpup->cpu_runrun = cpup->cpu_kprunrun = 0; 748 cpup->cpu_chosen_level = -1; 749 } 750 } else { 751 disp_lock_exit_high(&dp->disp_lock); 752 tp = cpup->cpu_idle_thread; 753 THREAD_ONPROC(tp, cpup); 754 cpup->cpu_dispthread = tp; 755 cpup->cpu_dispatch_pri = -1; 756 cpup->cpu_runrun = cpup->cpu_kprunrun = 0; 757 cpup->cpu_chosen_level = -1; 758 } 759 TRACE_1(TR_FAC_DISP, TR_DISP_END, 760 "disp_end:tid %p", tp); 761 return (tp); 762 } 763 764 dq = &dp->disp_q[pri]; 765 tp = dq->dq_first; 766 767 ASSERT(tp != NULL); 768 ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */ 769 770 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp); 771 772 /* 773 * Found it so remove it from queue. 774 */ 775 dp->disp_nrunnable--; 776 dq->dq_sruncnt--; 777 if ((dq->dq_first = tp->t_link) == NULL) { 778 ulong_t *dqactmap = dp->disp_qactmap; 779 780 ASSERT(dq->dq_sruncnt == 0); 781 dq->dq_last = NULL; 782 783 /* 784 * The queue is empty, so the corresponding bit needs to be 785 * turned off in dqactmap. If nrunnable != 0 just took the 786 * last runnable thread off the 787 * highest queue, so recompute disp_maxrunpri. 788 */ 789 maxrunword = pri >> BT_ULSHIFT; 790 dqactmap[maxrunword] &= ~BT_BIW(pri); 791 792 if (dp->disp_nrunnable == 0) { 793 dp->disp_max_unbound_pri = -1; 794 dp->disp_maxrunpri = -1; 795 } else { 796 int ipri; 797 798 ipri = bt_gethighbit(dqactmap, maxrunword); 799 dp->disp_maxrunpri = ipri; 800 if (ipri < dp->disp_max_unbound_pri) 801 dp->disp_max_unbound_pri = ipri; 802 } 803 } else { 804 tp->t_link = NULL; 805 } 806 807 /* 808 * Set TS_DONT_SWAP flag to prevent another processor from swapping 809 * out this thread before we have a chance to run it. 810 * While running, it is protected against swapping by t_lock. 811 */ 812 tp->t_schedflag |= TS_DONT_SWAP; 813 cpup->cpu_dispthread = tp; /* protected by spl only */ 814 cpup->cpu_dispatch_pri = pri; 815 ASSERT(pri == DISP_PRIO(tp)); 816 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */ 817 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */ 818 819 ASSERT(tp != NULL); 820 TRACE_1(TR_FAC_DISP, TR_DISP_END, 821 "disp_end:tid %p", tp); 822 823 if (disp_ratify(tp, kpq) == NULL) 824 goto reschedule; 825 826 return (tp); 827 } 828 829 /* 830 * swtch() 831 * Find best runnable thread and run it. 832 * Called with the current thread already switched to a new state, 833 * on a sleep queue, run queue, stopped, and not zombied. 834 * May be called at any spl level less than or equal to LOCK_LEVEL. 835 * Always drops spl to the base level (spl0()). 836 */ 837 void 838 swtch() 839 { 840 kthread_t *t = curthread; 841 kthread_t *next; 842 cpu_t *cp; 843 844 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 845 846 if (t->t_flag & T_INTR_THREAD) 847 cpu_intr_swtch_enter(t); 848 849 if (t->t_intr != NULL) { 850 /* 851 * We are an interrupt thread. Setup and return 852 * the interrupted thread to be resumed. 
853 */ 854 (void) splhigh(); /* block other scheduler action */ 855 cp = CPU; /* now protected against migration */ 856 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */ 857 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 858 CPU_STATS_ADDQ(cp, sys, intrblk, 1); 859 next = thread_unpin(); 860 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 861 resume_from_intr(next); 862 } else { 863 #ifdef DEBUG 864 if (t->t_state == TS_ONPROC && 865 t->t_disp_queue->disp_cpu == CPU && 866 t->t_preempt == 0) { 867 thread_lock(t); 868 ASSERT(t->t_state != TS_ONPROC || 869 t->t_disp_queue->disp_cpu != CPU || 870 t->t_preempt != 0); /* cannot migrate */ 871 thread_unlock_nopreempt(t); 872 } 873 #endif /* DEBUG */ 874 cp = CPU; 875 next = disp(); /* returns with spl high */ 876 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */ 877 878 /* OK to steal anything left on run queue */ 879 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL; 880 881 if (next != t) { 882 if (t == cp->cpu_idle_thread) { 883 PG_NRUN_UPDATE(cp, 1); 884 } else if (next == cp->cpu_idle_thread) { 885 PG_NRUN_UPDATE(cp, -1); 886 } 887 888 /* 889 * If t was previously in the TS_ONPROC state, 890 * setfrontdq and setbackdq won't have set its t_waitrq. 891 * Since we now finally know that we're switching away 892 * from this thread, set its t_waitrq if it is on a run 893 * queue. 894 */ 895 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) { 896 t->t_waitrq = gethrtime_unscaled(); 897 } 898 899 /* 900 * restore mstate of thread that we are switching to 901 */ 902 restore_mstate(next); 903 904 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 905 cp->cpu_last_swtch = t->t_disp_time = lbolt; 906 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 907 908 if (dtrace_vtime_active) 909 dtrace_vtime_switch(next); 910 911 resume(next); 912 /* 913 * The TR_RESUME_END and TR_SWTCH_END trace points 914 * appear at the end of resume(), because we may not 915 * return here 916 */ 917 } else { 918 if (t->t_flag & T_INTR_THREAD) 919 cpu_intr_swtch_exit(t); 920 921 DTRACE_SCHED(remain__cpu); 922 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end"); 923 (void) spl0(); 924 } 925 } 926 } 927 928 /* 929 * swtch_from_zombie() 930 * Special case of swtch(), which allows checks for TS_ZOMB to be 931 * eliminated from normal resume. 932 * Find best runnable thread and run it. 933 * Called with the current thread zombied. 934 * Zombies cannot migrate, so CPU references are safe. 935 */ 936 void 937 swtch_from_zombie() 938 { 939 kthread_t *next; 940 cpu_t *cpu = CPU; 941 942 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 943 944 ASSERT(curthread->t_state == TS_ZOMB); 945 946 next = disp(); /* returns with spl high */ 947 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */ 948 CPU_STATS_ADDQ(CPU, sys, pswitch, 1); 949 ASSERT(next != curthread); 950 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 951 952 if (next == cpu->cpu_idle_thread) 953 PG_NRUN_UPDATE(cpu, -1); 954 955 restore_mstate(next); 956 957 if (dtrace_vtime_active) 958 dtrace_vtime_switch(next); 959 960 resume_from_zombie(next); 961 /* 962 * The TR_RESUME_END and TR_SWTCH_END trace points 963 * appear at the end of resume(), because we certainly will not 964 * return here 965 */ 966 } 967 968 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint)) 969 970 /* 971 * search_disp_queues() 972 * Search the given dispatch queues for thread tp. 973 * Return 1 if tp is found, otherwise return 0. 
974 */ 975 static int 976 search_disp_queues(disp_t *dp, kthread_t *tp) 977 { 978 dispq_t *dq; 979 dispq_t *eq; 980 981 disp_lock_enter_high(&dp->disp_lock); 982 983 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) { 984 kthread_t *rp; 985 986 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL); 987 988 for (rp = dq->dq_first; rp; rp = rp->t_link) 989 if (tp == rp) { 990 disp_lock_exit_high(&dp->disp_lock); 991 return (1); 992 } 993 } 994 disp_lock_exit_high(&dp->disp_lock); 995 996 return (0); 997 } 998 999 /* 1000 * thread_on_queue() 1001 * Search all per-CPU dispatch queues and all partition-wide kpreempt 1002 * queues for thread tp. Return 1 if tp is found, otherwise return 0. 1003 */ 1004 static int 1005 thread_on_queue(kthread_t *tp) 1006 { 1007 cpu_t *cp; 1008 struct cpupart *part; 1009 1010 ASSERT(getpil() >= DISP_LEVEL); 1011 1012 /* 1013 * Search the per-CPU dispatch queues for tp. 1014 */ 1015 cp = CPU; 1016 do { 1017 if (search_disp_queues(cp->cpu_disp, tp)) 1018 return (1); 1019 } while ((cp = cp->cpu_next_onln) != CPU); 1020 1021 /* 1022 * Search the partition-wide kpreempt queues for tp. 1023 */ 1024 part = CPU->cpu_part; 1025 do { 1026 if (search_disp_queues(&part->cp_kp_queue, tp)) 1027 return (1); 1028 } while ((part = part->cp_next) != CPU->cpu_part); 1029 1030 return (0); 1031 } 1032 1033 #else 1034 1035 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */ 1036 1037 #endif /* DEBUG */ 1038 1039 /* 1040 * like swtch(), but switch to a specified thread taken from another CPU. 1041 * called with spl high.. 1042 */ 1043 void 1044 swtch_to(kthread_t *next) 1045 { 1046 cpu_t *cp = CPU; 1047 1048 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 1049 1050 /* 1051 * Update context switch statistics. 1052 */ 1053 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 1054 1055 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 1056 1057 if (curthread == cp->cpu_idle_thread) 1058 PG_NRUN_UPDATE(cp, 1); 1059 1060 /* OK to steal anything left on run queue */ 1061 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL; 1062 1063 /* record last execution time */ 1064 cp->cpu_last_swtch = curthread->t_disp_time = lbolt; 1065 1066 /* 1067 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq 1068 * won't have set its t_waitrq. Since we now finally know that we're 1069 * switching away from this thread, set its t_waitrq if it is on a run 1070 * queue. 
1071 */ 1072 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) { 1073 curthread->t_waitrq = gethrtime_unscaled(); 1074 } 1075 1076 /* restore next thread to previously running microstate */ 1077 restore_mstate(next); 1078 1079 if (dtrace_vtime_active) 1080 dtrace_vtime_switch(next); 1081 1082 resume(next); 1083 /* 1084 * The TR_RESUME_END and TR_SWTCH_END trace points 1085 * appear at the end of resume(), because we may not 1086 * return here 1087 */ 1088 } 1089 1090 1091 1092 #define CPU_IDLING(pri) ((pri) == -1) 1093 1094 static void 1095 cpu_resched(cpu_t *cp, pri_t tpri) 1096 { 1097 int call_poke_cpu = 0; 1098 pri_t cpupri = cp->cpu_dispatch_pri; 1099 1100 if (!CPU_IDLING(cpupri) && (cpupri < tpri)) { 1101 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED, 1102 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri); 1103 if (tpri >= upreemptpri && cp->cpu_runrun == 0) { 1104 cp->cpu_runrun = 1; 1105 aston(cp->cpu_dispthread); 1106 if (tpri < kpreemptpri && cp != CPU) 1107 call_poke_cpu = 1; 1108 } 1109 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) { 1110 cp->cpu_kprunrun = 1; 1111 if (cp != CPU) 1112 call_poke_cpu = 1; 1113 } 1114 } 1115 1116 /* 1117 * Propagate cpu_runrun, and cpu_kprunrun to global visibility. 1118 */ 1119 membar_enter(); 1120 1121 if (call_poke_cpu) 1122 poke_cpu(cp->cpu_id); 1123 } 1124 1125 /* 1126 * Perform multi-level CMT load balancing of running threads. 1127 * tp is the thread being enqueued 1128 * cp is the hint CPU (chosen by cpu_choose()). 1129 */ 1130 static cpu_t * 1131 cmt_balance(kthread_t *tp, cpu_t *cp) 1132 { 1133 int hint, i, cpu, nsiblings; 1134 int self = 0; 1135 group_t *cmt_pgs, *siblings; 1136 pg_cmt_t *pg, *pg_tmp, *tpg = NULL; 1137 int pg_nrun, tpg_nrun; 1138 int level = 0; 1139 cpu_t *newcp; 1140 1141 ASSERT(THREAD_LOCK_HELD(tp)); 1142 1143 cmt_pgs = &cp->cpu_pg->cmt_pgs; 1144 1145 if (GROUP_SIZE(cmt_pgs) == 0) 1146 return (cp); /* nothing to do */ 1147 1148 if (tp == curthread) 1149 self = 1; 1150 1151 /* 1152 * Balance across siblings in the CPUs CMT lineage 1153 */ 1154 do { 1155 pg = GROUP_ACCESS(cmt_pgs, level); 1156 1157 siblings = pg->cmt_siblings; 1158 nsiblings = GROUP_SIZE(siblings); /* self inclusive */ 1159 if (nsiblings == 1) 1160 continue; /* nobody to balance against */ 1161 1162 pg_nrun = pg->cmt_nrunning; 1163 if (self && 1164 bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid)) 1165 pg_nrun--; /* Ignore curthread's effect */ 1166 1167 hint = pg->cmt_hint; 1168 /* 1169 * Check for validity of the hint 1170 * It should reference a valid sibling 1171 */ 1172 if (hint >= nsiblings) 1173 hint = pg->cmt_hint = 0; 1174 else 1175 pg->cmt_hint++; 1176 1177 /* 1178 * Find a balancing candidate from among our siblings 1179 * "hint" is a hint for where to start looking 1180 */ 1181 i = hint; 1182 do { 1183 ASSERT(i < nsiblings); 1184 pg_tmp = GROUP_ACCESS(siblings, i); 1185 1186 /* 1187 * The candidate must not be us, and must 1188 * have some CPU resources in the thread's 1189 * partition 1190 */ 1191 if (pg_tmp != pg && 1192 bitset_in_set(&tp->t_cpupart->cp_cmt_pgs, 1193 ((pg_t *)pg_tmp)->pg_id)) { 1194 tpg = pg_tmp; 1195 break; 1196 } 1197 1198 if (++i >= nsiblings) 1199 i = 0; 1200 } while (i != hint); 1201 1202 if (!tpg) 1203 continue; /* no candidates at this level */ 1204 1205 /* 1206 * Check if the balancing target is underloaded 1207 * Decide to balance if the target is running fewer 1208 * threads, or if it's running the same number of threads 1209 * with more online CPUs 1210 */ 1211 tpg_nrun = tpg->cmt_nrunning; 1212 if 
(pg_nrun > tpg_nrun || 1213 (pg_nrun == tpg_nrun && 1214 (GROUP_SIZE(&tpg->cmt_cpus_actv) > 1215 GROUP_SIZE(&pg->cmt_cpus_actv)))) { 1216 break; 1217 } 1218 tpg = NULL; 1219 } while (++level < GROUP_SIZE(cmt_pgs)); 1220 1221 1222 if (tpg) { 1223 /* 1224 * Select an idle CPU from the target PG 1225 */ 1226 for (cpu = 0; cpu < GROUP_SIZE(&tpg->cmt_cpus_actv); cpu++) { 1227 newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu); 1228 if (newcp->cpu_part == tp->t_cpupart && 1229 newcp->cpu_dispatch_pri == -1) { 1230 cp = newcp; 1231 break; 1232 } 1233 } 1234 } 1235 1236 return (cp); 1237 } 1238 1239 /* 1240 * setbackdq() keeps runqs balanced such that the difference in length 1241 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF. 1242 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths 1243 * must match. When per-thread TS_RUNQMATCH flag is set, setbackdq() will 1244 * try to keep runqs perfectly balanced regardless of the thread priority. 1245 */ 1246 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */ 1247 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */ 1248 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt) 1249 1250 /* 1251 * Put the specified thread on the back of the dispatcher 1252 * queue corresponding to its current priority. 1253 * 1254 * Called with the thread in transition, onproc or stopped state 1255 * and locked (transition implies locked) and at high spl. 1256 * Returns with the thread in TS_RUN state and still locked. 1257 */ 1258 void 1259 setbackdq(kthread_t *tp) 1260 { 1261 dispq_t *dq; 1262 disp_t *dp; 1263 cpu_t *cp; 1264 pri_t tpri; 1265 int bound; 1266 1267 ASSERT(THREAD_LOCK_HELD(tp)); 1268 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0); 1269 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */ 1270 1271 /* 1272 * If thread is "swapped" or on the swap queue don't 1273 * queue it, but wake sched. 1274 */ 1275 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) { 1276 disp_swapped_setrun(tp); 1277 return; 1278 } 1279 1280 if (tp->t_bound_cpu || tp->t_weakbound_cpu) 1281 bound = 1; 1282 else 1283 bound = 0; 1284 1285 tpri = DISP_PRIO(tp); 1286 if (ncpus == 1) 1287 cp = tp->t_cpu; 1288 else if (!bound) { 1289 if (tpri >= kpqpri) { 1290 setkpdq(tp, SETKP_BACK); 1291 return; 1292 } 1293 /* 1294 * Let cpu_choose suggest a CPU. 1295 */ 1296 cp = cpu_choose(tp, tpri); 1297 1298 if (tp->t_cpupart == cp->cpu_part) { 1299 int qlen; 1300 1301 /* 1302 * Perform any CMT load balancing 1303 */ 1304 cp = cmt_balance(tp, cp); 1305 1306 /* 1307 * Balance across the run queues 1308 */ 1309 qlen = RUNQ_LEN(cp, tpri); 1310 if (tpri >= RUNQ_MATCH_PRI && 1311 !(tp->t_schedflag & TS_RUNQMATCH)) 1312 qlen -= RUNQ_MAX_DIFF; 1313 if (qlen > 0) { 1314 cpu_t *newcp; 1315 1316 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) { 1317 newcp = cp->cpu_next_part; 1318 } else if ((newcp = cp->cpu_next_lpl) == cp) { 1319 newcp = cp->cpu_next_part; 1320 } 1321 1322 if (RUNQ_LEN(newcp, tpri) < qlen) { 1323 DTRACE_PROBE3(runq__balance, 1324 kthread_t *, tp, 1325 cpu_t *, cp, cpu_t *, newcp); 1326 cp = newcp; 1327 } 1328 } 1329 } else { 1330 /* 1331 * Migrate to a cpu in the new partition. 
1332 */ 1333 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, 1334 tp->t_lpl, tp->t_pri, NULL); 1335 } 1336 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); 1337 } else { 1338 /* 1339 * It is possible that t_weakbound_cpu != t_bound_cpu (for 1340 * a short time until weak binding that existed when the 1341 * strong binding was established has dropped) so we must 1342 * favour weak binding over strong. 1343 */ 1344 cp = tp->t_weakbound_cpu ? 1345 tp->t_weakbound_cpu : tp->t_bound_cpu; 1346 } 1347 /* 1348 * A thread that is ONPROC may be temporarily placed on the run queue 1349 * but then chosen to run again by disp. If the thread we're placing on 1350 * the queue is in TS_ONPROC state, don't set its t_waitrq until a 1351 * replacement process is actually scheduled in swtch(). In this 1352 * situation, curthread is the only thread that could be in the ONPROC 1353 * state. 1354 */ 1355 if ((tp != curthread) && (tp->t_waitrq == 0)) { 1356 hrtime_t curtime; 1357 1358 curtime = gethrtime_unscaled(); 1359 (void) cpu_update_pct(tp, curtime); 1360 tp->t_waitrq = curtime; 1361 } else { 1362 (void) cpu_update_pct(tp, gethrtime_unscaled()); 1363 } 1364 1365 dp = cp->cpu_disp; 1366 disp_lock_enter_high(&dp->disp_lock); 1367 1368 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0); 1369 TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p", 1370 tpri, cp, tp); 1371 1372 #ifndef NPROBE 1373 /* Kernel probe */ 1374 if (tnf_tracing_active) 1375 tnf_thread_queue(tp, cp, tpri); 1376 #endif /* NPROBE */ 1377 1378 ASSERT(tpri >= 0 && tpri < dp->disp_npri); 1379 1380 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */ 1381 tp->t_disp_queue = dp; 1382 tp->t_link = NULL; 1383 1384 dq = &dp->disp_q[tpri]; 1385 dp->disp_nrunnable++; 1386 if (!bound) 1387 dp->disp_steal = 0; 1388 membar_enter(); 1389 1390 if (dq->dq_sruncnt++ != 0) { 1391 ASSERT(dq->dq_first != NULL); 1392 dq->dq_last->t_link = tp; 1393 dq->dq_last = tp; 1394 } else { 1395 ASSERT(dq->dq_first == NULL); 1396 ASSERT(dq->dq_last == NULL); 1397 dq->dq_first = dq->dq_last = tp; 1398 BT_SET(dp->disp_qactmap, tpri); 1399 if (tpri > dp->disp_maxrunpri) { 1400 dp->disp_maxrunpri = tpri; 1401 membar_enter(); 1402 cpu_resched(cp, tpri); 1403 } 1404 } 1405 1406 if (!bound && tpri > dp->disp_max_unbound_pri) { 1407 if (tp == curthread && dp->disp_max_unbound_pri == -1 && 1408 cp == CPU) { 1409 /* 1410 * If there are no other unbound threads on the 1411 * run queue, don't allow other CPUs to steal 1412 * this thread while we are in the middle of a 1413 * context switch. We may just switch to it 1414 * again right away. CPU_DISP_DONTSTEAL is cleared 1415 * in swtch and swtch_to. 1416 */ 1417 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL; 1418 } 1419 dp->disp_max_unbound_pri = tpri; 1420 } 1421 (*disp_enq_thread)(cp, bound); 1422 } 1423 1424 /* 1425 * Put the specified thread on the front of the dispatcher 1426 * queue corresponding to its current priority. 1427 * 1428 * Called with the thread in transition, onproc or stopped state 1429 * and locked (transition implies locked) and at high spl. 1430 * Returns with the thread in TS_RUN state and still locked. 
1431 */ 1432 void 1433 setfrontdq(kthread_t *tp) 1434 { 1435 disp_t *dp; 1436 dispq_t *dq; 1437 cpu_t *cp; 1438 pri_t tpri; 1439 int bound; 1440 1441 ASSERT(THREAD_LOCK_HELD(tp)); 1442 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0); 1443 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */ 1444 1445 /* 1446 * If thread is "swapped" or on the swap queue don't 1447 * queue it, but wake sched. 1448 */ 1449 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) { 1450 disp_swapped_setrun(tp); 1451 return; 1452 } 1453 1454 if (tp->t_bound_cpu || tp->t_weakbound_cpu) 1455 bound = 1; 1456 else 1457 bound = 0; 1458 1459 tpri = DISP_PRIO(tp); 1460 if (ncpus == 1) 1461 cp = tp->t_cpu; 1462 else if (!bound) { 1463 if (tpri >= kpqpri) { 1464 setkpdq(tp, SETKP_FRONT); 1465 return; 1466 } 1467 cp = tp->t_cpu; 1468 if (tp->t_cpupart == cp->cpu_part) { 1469 /* 1470 * If we are of higher or equal priority than 1471 * the highest priority runnable thread of 1472 * the current CPU, just pick this CPU. Otherwise 1473 * Let cpu_choose() select the CPU. If this cpu 1474 * is the target of an offline request then do not 1475 * pick it - a thread_nomigrate() on the in motion 1476 * cpu relies on this when it forces a preempt. 1477 */ 1478 if (tpri < cp->cpu_disp->disp_maxrunpri || 1479 cp == cpu_inmotion) 1480 cp = cpu_choose(tp, tpri); 1481 } else { 1482 /* 1483 * Migrate to a cpu in the new partition. 1484 */ 1485 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, 1486 tp->t_lpl, tp->t_pri, NULL); 1487 } 1488 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); 1489 } else { 1490 /* 1491 * It is possible that t_weakbound_cpu != t_bound_cpu (for 1492 * a short time until weak binding that existed when the 1493 * strong binding was established has dropped) so we must 1494 * favour weak binding over strong. 1495 */ 1496 cp = tp->t_weakbound_cpu ? 1497 tp->t_weakbound_cpu : tp->t_bound_cpu; 1498 } 1499 1500 /* 1501 * A thread that is ONPROC may be temporarily placed on the run queue 1502 * but then chosen to run again by disp. If the thread we're placing on 1503 * the queue is in TS_ONPROC state, don't set its t_waitrq until a 1504 * replacement process is actually scheduled in swtch(). In this 1505 * situation, curthread is the only thread that could be in the ONPROC 1506 * state. 
1507 */ 1508 if ((tp != curthread) && (tp->t_waitrq == 0)) { 1509 hrtime_t curtime; 1510 1511 curtime = gethrtime_unscaled(); 1512 (void) cpu_update_pct(tp, curtime); 1513 tp->t_waitrq = curtime; 1514 } else { 1515 (void) cpu_update_pct(tp, gethrtime_unscaled()); 1516 } 1517 1518 dp = cp->cpu_disp; 1519 disp_lock_enter_high(&dp->disp_lock); 1520 1521 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp); 1522 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1); 1523 1524 #ifndef NPROBE 1525 /* Kernel probe */ 1526 if (tnf_tracing_active) 1527 tnf_thread_queue(tp, cp, tpri); 1528 #endif /* NPROBE */ 1529 1530 ASSERT(tpri >= 0 && tpri < dp->disp_npri); 1531 1532 THREAD_RUN(tp, &dp->disp_lock); /* set TS_RUN state and lock */ 1533 tp->t_disp_queue = dp; 1534 1535 dq = &dp->disp_q[tpri]; 1536 dp->disp_nrunnable++; 1537 if (!bound) 1538 dp->disp_steal = 0; 1539 membar_enter(); 1540 1541 if (dq->dq_sruncnt++ != 0) { 1542 ASSERT(dq->dq_last != NULL); 1543 tp->t_link = dq->dq_first; 1544 dq->dq_first = tp; 1545 } else { 1546 ASSERT(dq->dq_last == NULL); 1547 ASSERT(dq->dq_first == NULL); 1548 tp->t_link = NULL; 1549 dq->dq_first = dq->dq_last = tp; 1550 BT_SET(dp->disp_qactmap, tpri); 1551 if (tpri > dp->disp_maxrunpri) { 1552 dp->disp_maxrunpri = tpri; 1553 membar_enter(); 1554 cpu_resched(cp, tpri); 1555 } 1556 } 1557 1558 if (!bound && tpri > dp->disp_max_unbound_pri) { 1559 if (tp == curthread && dp->disp_max_unbound_pri == -1 && 1560 cp == CPU) { 1561 /* 1562 * If there are no other unbound threads on the 1563 * run queue, don't allow other CPUs to steal 1564 * this thread while we are in the middle of a 1565 * context switch. We may just switch to it 1566 * again right away. CPU_DISP_DONTSTEAL is cleared 1567 * in swtch and swtch_to. 
1568 */ 1569 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL; 1570 } 1571 dp->disp_max_unbound_pri = tpri; 1572 } 1573 (*disp_enq_thread)(cp, bound); 1574 } 1575 1576 /* 1577 * Put a high-priority unbound thread on the kp queue 1578 */ 1579 static void 1580 setkpdq(kthread_t *tp, int borf) 1581 { 1582 dispq_t *dq; 1583 disp_t *dp; 1584 cpu_t *cp; 1585 pri_t tpri; 1586 1587 tpri = DISP_PRIO(tp); 1588 1589 dp = &tp->t_cpupart->cp_kp_queue; 1590 disp_lock_enter_high(&dp->disp_lock); 1591 1592 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp); 1593 1594 ASSERT(tpri >= 0 && tpri < dp->disp_npri); 1595 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf); 1596 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */ 1597 tp->t_disp_queue = dp; 1598 dp->disp_nrunnable++; 1599 dq = &dp->disp_q[tpri]; 1600 1601 if (dq->dq_sruncnt++ != 0) { 1602 if (borf == SETKP_BACK) { 1603 ASSERT(dq->dq_first != NULL); 1604 tp->t_link = NULL; 1605 dq->dq_last->t_link = tp; 1606 dq->dq_last = tp; 1607 } else { 1608 ASSERT(dq->dq_last != NULL); 1609 tp->t_link = dq->dq_first; 1610 dq->dq_first = tp; 1611 } 1612 } else { 1613 if (borf == SETKP_BACK) { 1614 ASSERT(dq->dq_first == NULL); 1615 ASSERT(dq->dq_last == NULL); 1616 dq->dq_first = dq->dq_last = tp; 1617 } else { 1618 ASSERT(dq->dq_last == NULL); 1619 ASSERT(dq->dq_first == NULL); 1620 tp->t_link = NULL; 1621 dq->dq_first = dq->dq_last = tp; 1622 } 1623 BT_SET(dp->disp_qactmap, tpri); 1624 if (tpri > dp->disp_max_unbound_pri) 1625 dp->disp_max_unbound_pri = tpri; 1626 if (tpri > dp->disp_maxrunpri) { 1627 dp->disp_maxrunpri = tpri; 1628 membar_enter(); 1629 } 1630 } 1631 1632 cp = tp->t_cpu; 1633 if (tp->t_cpupart != cp->cpu_part) { 1634 /* migrate to a cpu in the new partition */ 1635 cp = tp->t_cpupart->cp_cpulist; 1636 } 1637 cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL); 1638 disp_lock_enter_high(&cp->cpu_disp->disp_lock); 1639 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); 1640 1641 #ifndef NPROBE 1642 /* Kernel probe */ 1643 if (tnf_tracing_active) 1644 tnf_thread_queue(tp, cp, tpri); 1645 #endif /* NPROBE */ 1646 1647 if (cp->cpu_chosen_level < tpri) 1648 cp->cpu_chosen_level = tpri; 1649 cpu_resched(cp, tpri); 1650 disp_lock_exit_high(&cp->cpu_disp->disp_lock); 1651 (*disp_enq_thread)(cp, 0); 1652 } 1653 1654 /* 1655 * Remove a thread from the dispatcher queue if it is on it. 1656 * It is not an error if it is not found but we return whether 1657 * or not it was found in case the caller wants to check. 1658 */ 1659 int 1660 dispdeq(kthread_t *tp) 1661 { 1662 disp_t *dp; 1663 dispq_t *dq; 1664 kthread_t *rp; 1665 kthread_t *trp; 1666 kthread_t **ptp; 1667 int tpri; 1668 1669 ASSERT(THREAD_LOCK_HELD(tp)); 1670 1671 if (tp->t_state != TS_RUN) 1672 return (0); 1673 1674 /* 1675 * The thread is "swapped" or is on the swap queue and 1676 * hence no longer on the run queue, so return true. 1677 */ 1678 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) 1679 return (1); 1680 1681 tpri = DISP_PRIO(tp); 1682 dp = tp->t_disp_queue; 1683 ASSERT(tpri < dp->disp_npri); 1684 dq = &dp->disp_q[tpri]; 1685 ptp = &dq->dq_first; 1686 rp = *ptp; 1687 trp = NULL; 1688 1689 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL); 1690 1691 /* 1692 * Search for thread in queue. 1693 * Double links would simplify this at the expense of disp/setrun. 
1694 */ 1695 while (rp != tp && rp != NULL) { 1696 trp = rp; 1697 ptp = &trp->t_link; 1698 rp = trp->t_link; 1699 } 1700 1701 if (rp == NULL) { 1702 panic("dispdeq: thread not on queue"); 1703 } 1704 1705 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp); 1706 1707 /* 1708 * Found it so remove it from queue. 1709 */ 1710 if ((*ptp = rp->t_link) == NULL) 1711 dq->dq_last = trp; 1712 1713 dp->disp_nrunnable--; 1714 if (--dq->dq_sruncnt == 0) { 1715 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri); 1716 if (dp->disp_nrunnable == 0) { 1717 dp->disp_max_unbound_pri = -1; 1718 dp->disp_maxrunpri = -1; 1719 } else if (tpri == dp->disp_maxrunpri) { 1720 int ipri; 1721 1722 ipri = bt_gethighbit(dp->disp_qactmap, 1723 dp->disp_maxrunpri >> BT_ULSHIFT); 1724 if (ipri < dp->disp_max_unbound_pri) 1725 dp->disp_max_unbound_pri = ipri; 1726 dp->disp_maxrunpri = ipri; 1727 } 1728 } 1729 tp->t_link = NULL; 1730 THREAD_TRANSITION(tp); /* put in intermediate state */ 1731 return (1); 1732 } 1733 1734 1735 /* 1736 * dq_sruninc and dq_srundec are public functions for 1737 * incrementing/decrementing the sruncnts when a thread on 1738 * a dispatcher queue is made schedulable/unschedulable by 1739 * resetting the TS_LOAD flag. 1740 * 1741 * The caller MUST have the thread lock and therefore the dispatcher 1742 * queue lock so that the operation which changes 1743 * the flag, the operation that checks the status of the thread to 1744 * determine if it's on a disp queue AND the call to this function 1745 * are one atomic operation with respect to interrupts. 1746 */ 1747 1748 /* 1749 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread. 1750 */ 1751 void 1752 dq_sruninc(kthread_t *t) 1753 { 1754 ASSERT(t->t_state == TS_RUN); 1755 ASSERT(t->t_schedflag & TS_LOAD); 1756 1757 THREAD_TRANSITION(t); 1758 setfrontdq(t); 1759 } 1760 1761 /* 1762 * See comment on calling conventions above. 1763 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread. 1764 */ 1765 void 1766 dq_srundec(kthread_t *t) 1767 { 1768 ASSERT(t->t_schedflag & TS_LOAD); 1769 1770 (void) dispdeq(t); 1771 disp_swapped_enq(t); 1772 } 1773 1774 /* 1775 * Change the dispatcher lock of thread to the "swapped_lock" 1776 * and return with thread lock still held. 1777 * 1778 * Called with thread_lock held, in transition state, and at high spl. 1779 */ 1780 void 1781 disp_swapped_enq(kthread_t *tp) 1782 { 1783 ASSERT(THREAD_LOCK_HELD(tp)); 1784 ASSERT(tp->t_schedflag & TS_LOAD); 1785 1786 switch (tp->t_state) { 1787 case TS_RUN: 1788 disp_lock_enter_high(&swapped_lock); 1789 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */ 1790 break; 1791 case TS_ONPROC: 1792 disp_lock_enter_high(&swapped_lock); 1793 THREAD_TRANSITION(tp); 1794 wake_sched_sec = 1; /* tell clock to wake sched */ 1795 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */ 1796 break; 1797 default: 1798 panic("disp_swapped: tp: %p bad t_state", (void *)tp); 1799 } 1800 } 1801 1802 /* 1803 * This routine is called by setbackdq/setfrontdq if the thread is 1804 * not loaded or loaded and on the swap queue. 1805 * 1806 * Thread state TS_SLEEP implies that a swapped thread 1807 * has been woken up and needs to be swapped in by the swapper. 1808 * 1809 * Thread state TS_RUN, it implies that the priority of a swapped 1810 * thread is being increased by scheduling class (e.g. ts_update). 
1811 */ 1812 static void 1813 disp_swapped_setrun(kthread_t *tp) 1814 { 1815 ASSERT(THREAD_LOCK_HELD(tp)); 1816 ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD); 1817 1818 switch (tp->t_state) { 1819 case TS_SLEEP: 1820 disp_lock_enter_high(&swapped_lock); 1821 /* 1822 * Wakeup sched immediately (i.e., next tick) if the 1823 * thread priority is above maxclsyspri. 1824 */ 1825 if (DISP_PRIO(tp) > maxclsyspri) 1826 wake_sched = 1; 1827 else 1828 wake_sched_sec = 1; 1829 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */ 1830 break; 1831 case TS_RUN: /* called from ts_update */ 1832 break; 1833 default: 1834 panic("disp_swapped_setrun: tp: %p bad t_state", tp); 1835 } 1836 } 1837 1838 1839 /* 1840 * Make a thread give up its processor. Find the processor on 1841 * which this thread is executing, and have that processor 1842 * preempt. 1843 */ 1844 void 1845 cpu_surrender(kthread_t *tp) 1846 { 1847 cpu_t *cpup; 1848 int max_pri; 1849 int max_run_pri; 1850 klwp_t *lwp; 1851 1852 ASSERT(THREAD_LOCK_HELD(tp)); 1853 1854 if (tp->t_state != TS_ONPROC) 1855 return; 1856 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */ 1857 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */ 1858 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part); 1859 if (max_pri < max_run_pri) 1860 max_pri = max_run_pri; 1861 1862 cpup->cpu_runrun = 1; 1863 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) { 1864 cpup->cpu_kprunrun = 1; 1865 } 1866 1867 /* 1868 * Propagate cpu_runrun, and cpu_kprunrun to global visibility. 1869 */ 1870 membar_enter(); 1871 1872 DTRACE_SCHED1(surrender, kthread_t *, tp); 1873 1874 /* 1875 * Make the target thread take an excursion through trap() 1876 * to do preempt() (unless we're already in trap or post_syscall, 1877 * calling cpu_surrender via CL_TRAPRET). 1878 */ 1879 if (tp != curthread || (lwp = tp->t_lwp) == NULL || 1880 lwp->lwp_state != LWP_USER) { 1881 aston(tp); 1882 if (cpup != CPU) 1883 poke_cpu(cpup->cpu_id); 1884 } 1885 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER, 1886 "cpu_surrender:tid %p cpu %p", tp, cpup); 1887 } 1888 1889 1890 /* 1891 * Commit to and ratify a scheduling decision 1892 */ 1893 /*ARGSUSED*/ 1894 static kthread_t * 1895 disp_ratify(kthread_t *tp, disp_t *kpq) 1896 { 1897 pri_t tpri, maxpri; 1898 pri_t maxkpri; 1899 cpu_t *cpup; 1900 1901 ASSERT(tp != NULL); 1902 /* 1903 * Commit to, then ratify scheduling decision 1904 */ 1905 cpup = CPU; 1906 if (cpup->cpu_runrun != 0) 1907 cpup->cpu_runrun = 0; 1908 if (cpup->cpu_kprunrun != 0) 1909 cpup->cpu_kprunrun = 0; 1910 if (cpup->cpu_chosen_level != -1) 1911 cpup->cpu_chosen_level = -1; 1912 membar_enter(); 1913 tpri = DISP_PRIO(tp); 1914 maxpri = cpup->cpu_disp->disp_maxrunpri; 1915 maxkpri = kpq->disp_maxrunpri; 1916 if (maxpri < maxkpri) 1917 maxpri = maxkpri; 1918 if (tpri < maxpri) { 1919 /* 1920 * should have done better 1921 * put this one back and indicate to try again 1922 */ 1923 cpup->cpu_dispthread = curthread; /* fixup dispthread */ 1924 cpup->cpu_dispatch_pri = DISP_PRIO(curthread); 1925 thread_lock_high(tp); 1926 THREAD_TRANSITION(tp); 1927 setfrontdq(tp); 1928 thread_unlock_nopreempt(tp); 1929 1930 tp = NULL; 1931 } 1932 return (tp); 1933 } 1934 1935 /* 1936 * See if there is any work on the dispatcher queue for other CPUs. 1937 * If there is, dequeue the best thread and return. 
1938 */ 1939 static kthread_t * 1940 disp_getwork(cpu_t *cp) 1941 { 1942 cpu_t *ocp; /* other CPU */ 1943 cpu_t *ocp_start; 1944 cpu_t *tcp; /* target local CPU */ 1945 kthread_t *tp; 1946 kthread_t *retval = NULL; 1947 pri_t maxpri; 1948 disp_t *kpq; /* kp queue for this partition */ 1949 lpl_t *lpl, *lpl_leaf; 1950 int hint, leafidx; 1951 hrtime_t stealtime; 1952 1953 maxpri = -1; 1954 tcp = NULL; 1955 1956 kpq = &cp->cpu_part->cp_kp_queue; 1957 while (kpq->disp_maxrunpri >= 0) { 1958 /* 1959 * Try to take a thread from the kp_queue. 1960 */ 1961 tp = (disp_getbest(kpq)); 1962 if (tp) 1963 return (disp_ratify(tp, kpq)); 1964 } 1965 1966 kpreempt_disable(); /* protect the cpu_active list */ 1967 1968 /* 1969 * Try to find something to do on another CPU's run queue. 1970 * Loop through all other CPUs looking for the one with the highest 1971 * priority unbound thread. 1972 * 1973 * On NUMA machines, the partition's CPUs are consulted in order of 1974 * distance from the current CPU. This way, the first available 1975 * work found is also the closest, and will suffer the least 1976 * from being migrated. 1977 */ 1978 lpl = lpl_leaf = cp->cpu_lpl; 1979 hint = leafidx = 0; 1980 1981 /* 1982 * This loop traverses the lpl hierarchy. Higher level lpls represent 1983 * broader levels of locality 1984 */ 1985 do { 1986 /* This loop iterates over the lpl's leaves */ 1987 do { 1988 if (lpl_leaf != cp->cpu_lpl) 1989 ocp = lpl_leaf->lpl_cpus; 1990 else 1991 ocp = cp->cpu_next_lpl; 1992 1993 /* This loop iterates over the CPUs in the leaf */ 1994 ocp_start = ocp; 1995 do { 1996 pri_t pri; 1997 1998 ASSERT(CPU_ACTIVE(ocp)); 1999 2000 /* 2001 * End our stroll around this lpl if: 2002 * 2003 * - Something became runnable on the local 2004 * queue...which also ends our stroll around 2005 * the partition. 2006 * 2007 * - We happen across another idle CPU. 2008 * Since it is patrolling the next portion 2009 * of the lpl's list (assuming it's not 2010 * halted), move to the next higher level 2011 * of locality. 2012 */ 2013 if (cp->cpu_disp->disp_nrunnable != 0) { 2014 kpreempt_enable(); 2015 return (NULL); 2016 } 2017 if (ocp->cpu_dispatch_pri == -1) { 2018 if (ocp->cpu_disp_flags & 2019 CPU_DISP_HALTED) 2020 continue; 2021 else 2022 break; 2023 } 2024 2025 /* 2026 * If there's only one thread and the CPU 2027 * is in the middle of a context switch, 2028 * or it's currently running the idle thread, 2029 * don't steal it. 2030 */ 2031 if ((ocp->cpu_disp_flags & 2032 CPU_DISP_DONTSTEAL) && 2033 ocp->cpu_disp->disp_nrunnable == 1) 2034 continue; 2035 2036 pri = ocp->cpu_disp->disp_max_unbound_pri; 2037 if (pri > maxpri) { 2038 /* 2039 * Don't steal threads that we attempted 2040 * to steal recently until they're ready 2041 * to be stolen again. 2042 */ 2043 stealtime = ocp->cpu_disp->disp_steal; 2044 if (stealtime == 0 || 2045 stealtime - gethrtime() <= 0) { 2046 maxpri = pri; 2047 tcp = ocp; 2048 } else { 2049 /* 2050 * Don't update tcp, just set 2051 * the retval to T_DONTSTEAL, so 2052 * that if no acceptable CPUs 2053 * are found the return value 2054 * will be T_DONTSTEAL rather 2055 * then NULL. 
2056 */ 2057 retval = T_DONTSTEAL; 2058 } 2059 } 2060 } while ((ocp = ocp->cpu_next_lpl) != ocp_start); 2061 2062 if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) { 2063 leafidx = 0; 2064 lpl_leaf = lpl->lpl_rset[leafidx]; 2065 } 2066 } while (leafidx != hint); 2067 2068 hint = leafidx = lpl->lpl_hint; 2069 if ((lpl = lpl->lpl_parent) != NULL) 2070 lpl_leaf = lpl->lpl_rset[hint]; 2071 } while (!tcp && lpl); 2072 2073 kpreempt_enable(); 2074 2075 /* 2076 * If another queue looks good, and there is still nothing on 2077 * the local queue, try to transfer one or more threads 2078 * from it to our queue. 2079 */ 2080 if (tcp && cp->cpu_disp->disp_nrunnable == 0) { 2081 tp = disp_getbest(tcp->cpu_disp); 2082 if (tp == NULL || tp == T_DONTSTEAL) 2083 return (tp); 2084 return (disp_ratify(tp, kpq)); 2085 } 2086 return (retval); 2087 } 2088 2089 2090 /* 2091 * disp_fix_unbound_pri() 2092 * Determines the maximum priority of unbound threads on the queue. 2093 * The priority is kept for the queue, but is only increased, never 2094 * reduced unless some CPU is looking for something on that queue. 2095 * 2096 * The priority argument is the known upper limit. 2097 * 2098 * Perhaps this should be kept accurately, but that probably means 2099 * separate bitmaps for bound and unbound threads. Since only idled 2100 * CPUs will have to do this recalculation, it seems better this way. 2101 */ 2102 static void 2103 disp_fix_unbound_pri(disp_t *dp, pri_t pri) 2104 { 2105 kthread_t *tp; 2106 dispq_t *dq; 2107 ulong_t *dqactmap = dp->disp_qactmap; 2108 ulong_t mapword; 2109 int wx; 2110 2111 ASSERT(DISP_LOCK_HELD(&dp->disp_lock)); 2112 2113 ASSERT(pri >= 0); /* checked by caller */ 2114 2115 /* 2116 * Start the search at the next lowest priority below the supplied 2117 * priority. This depends on the bitmap implementation. 2118 */ 2119 do { 2120 wx = pri >> BT_ULSHIFT; /* index of word in map */ 2121 2122 /* 2123 * Form mask for all lower priorities in the word. 2124 */ 2125 mapword = dqactmap[wx] & (BT_BIW(pri) - 1); 2126 2127 /* 2128 * Get next lower active priority. 2129 */ 2130 if (mapword != 0) { 2131 pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1; 2132 } else if (wx > 0) { 2133 pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */ 2134 if (pri < 0) 2135 break; 2136 } else { 2137 pri = -1; 2138 break; 2139 } 2140 2141 /* 2142 * Search the queue for unbound, runnable threads. 2143 */ 2144 dq = &dp->disp_q[pri]; 2145 tp = dq->dq_first; 2146 2147 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) { 2148 tp = tp->t_link; 2149 } 2150 2151 /* 2152 * If a thread was found, set the priority and return. 2153 */ 2154 } while (tp == NULL); 2155 2156 /* 2157 * pri holds the maximum unbound thread priority or -1. 2158 */ 2159 if (dp->disp_max_unbound_pri != pri) 2160 dp->disp_max_unbound_pri = pri; 2161 } 2162 2163 /* 2164 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should 2165 * check if the CPU to which is was previously bound should have 2166 * its disp_max_unbound_pri increased. 2167 */ 2168 void 2169 disp_adjust_unbound_pri(kthread_t *tp) 2170 { 2171 disp_t *dp; 2172 pri_t tpri; 2173 2174 ASSERT(THREAD_LOCK_HELD(tp)); 2175 2176 /* 2177 * Don't do anything if the thread is not bound, or 2178 * currently not runnable or swapped out. 
/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
    disp_t  *dp;
    pri_t   tpri;

    ASSERT(THREAD_LOCK_HELD(tp));

    /*
     * Don't do anything if the thread is not bound, or
     * currently not runnable or swapped out.
     */
    if (tp->t_bound_cpu == NULL ||
        tp->t_state != TS_RUN ||
        tp->t_schedflag & TS_ON_SWAPQ)
        return;

    tpri = DISP_PRIO(tp);
    dp = tp->t_bound_cpu->cpu_disp;
    ASSERT(tpri >= 0 && tpri < dp->disp_npri);
    if (tpri > dp->disp_max_unbound_pri)
        dp->disp_max_unbound_pri = tpri;
}
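/*
 * Illustrative call pattern (not code from this file): code that clears a
 * thread's CPU binding while the thread may still be sitting on that CPU's
 * dispatch queue would hold the thread lock and notify the dispatcher
 * before the binding disappears, since disp_adjust_unbound_pri() locates
 * the queue through t_bound_cpu.  The function below is hypothetical and
 * omits the bookkeeping a real unbind path would also do.
 */
#if 0   /* example only, never compiled */
static void
example_clear_binding(kthread_t *tp)
{
    thread_lock(tp);
    disp_adjust_unbound_pri(tp);    /* queue may now hold unbound work */
    tp->t_bound_cpu = NULL;         /* thread is no longer bound */
    thread_unlock(tp);
}
#endif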
/*
 * disp_getbest()
 *	De-queue the highest priority unbound runnable thread.
 *	Returns with the thread unlocked and onproc but at splhigh (like disp()).
 *	Returns NULL if nothing found.
 *	Returns T_DONTSTEAL if the thread was not stealable,
 *	so that the caller will try again later.
 *
 *	Passed a pointer to a dispatch queue not associated with this CPU.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
    kthread_t   *tp;
    dispq_t     *dq;
    pri_t       pri;
    cpu_t       *cp, *tcp;
    boolean_t   allbound;

    disp_lock_enter(&dp->disp_lock);

    /*
     * If there is nothing to run, or the CPU is in the middle of a
     * context switch of the only thread, return NULL.
     */
    tcp = dp->disp_cpu;
    cp = CPU;
    pri = dp->disp_max_unbound_pri;
    if (pri == -1 ||
        (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
        tcp->cpu_disp->disp_nrunnable == 1)) {
        disp_lock_exit_nopreempt(&dp->disp_lock);
        return (NULL);
    }

    dq = &dp->disp_q[pri];

    /*
     * Assume that all threads are bound on this queue, and change it
     * later when we find out that it is not the case.
     */
    allbound = B_TRUE;
    for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
        hrtime_t now, nosteal, rqtime;

        /*
         * Skip over bound threads which could be here even
         * though disp_max_unbound_pri indicated this level.
         */
        if (tp->t_bound_cpu || tp->t_weakbound_cpu)
            continue;

        /*
         * We've got some unbound threads on this queue, so turn
         * the allbound flag off now.
         */
        allbound = B_FALSE;

        /*
         * The thread is a candidate for stealing from its run queue.
         * We don't want to steal threads that became runnable just a
         * moment ago.  This improves CPU affinity for threads that get
         * preempted for short periods of time and go back on the run
         * queue.
         *
         * We want to let it stay on its run queue if it was only placed
         * there recently and it was running on the same CPU before that
         * to preserve its cache investment.  For the thread to remain on
         * its run queue, ALL of the following conditions must be
         * satisfied:
         *
         * - the disp queue should not be the kernel preemption queue
         * - delayed idle stealing should not be disabled
         * - nosteal_nsec should be non-zero
         * - it should run with user priority
         * - it should be on the run queue of the CPU where it was
         *   running before being placed on the run queue
         * - it should be the only thread on the run queue (to prevent
         *   extra scheduling latency for other threads)
         * - it should sit on the run queue for less than per-chip
         *   nosteal interval or global nosteal interval
         * - in case of CPUs with shared cache it should sit in a run
         *   queue of a CPU from a different chip
         *
         * The checks are arranged so that the ones that are faster are
         * placed earlier.
         */
        if (tcp == NULL || pri >= minclsyspri || tp->t_cpu != tcp)
            break;

        /*
         * Steal immediately if, due to CMT processor architecture,
         * migration between cp and tcp would incur no performance
         * penalty.
         */
        if (pg_cmt_can_migrate(cp, tcp))
            break;

        nosteal = nosteal_nsec;
        if (nosteal == 0)
            break;

        /*
         * Calculate time spent sitting on run queue
         */
        now = gethrtime_unscaled();
        rqtime = now - tp->t_waitrq;
        scalehrtime(&rqtime);

        /*
         * Steal immediately if the time spent on this run queue is more
         * than the allowed nosteal delay.
         *
         * Negative rqtime check is needed here to avoid infinite
         * stealing delays caused by unlikely but not impossible
         * drifts between CPU times on different CPUs.
         */
        if (rqtime > nosteal || rqtime < 0)
            break;

        DTRACE_PROBE4(nosteal, kthread_t *, tp,
            cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
        scalehrtime(&now);
        /*
         * Calculate when this thread becomes stealable
         */
        now += (nosteal - rqtime);

        /*
         * Calculate time when some thread becomes stealable
         */
        if (now < dp->disp_steal)
            dp->disp_steal = now;
    }

    /*
     * If there were no unbound threads on this queue, find the queue
     * where they are and then return later.  The value of
     * disp_max_unbound_pri is not always accurate because it isn't
     * reduced until another idle CPU looks for work.
     */
    if (allbound)
        disp_fix_unbound_pri(dp, pri);

    /*
     * If we reached the end of the queue and found no unbound threads
     * then return NULL so that other CPUs will be considered.  If there
     * are unbound threads but they cannot yet be stolen, then
     * return T_DONTSTEAL and try again later.
     */
    if (tp == NULL) {
        disp_lock_exit_nopreempt(&dp->disp_lock);
        return (allbound ? NULL : T_DONTSTEAL);
    }

    /*
     * Found a runnable, unbound thread, so remove it from queue.
     * dispdeq() requires that we have the thread locked, and we do,
     * by virtue of holding the dispatch queue lock.  dispdeq() will
     * put the thread in transition state, thereby dropping the dispq
     * lock.
     */

#ifdef DEBUG
    {
        int thread_was_on_queue;

        thread_was_on_queue = dispdeq(tp);      /* drops disp_lock */
        ASSERT(thread_was_on_queue);
    }

#else /* DEBUG */
    (void) dispdeq(tp);                 /* drops disp_lock */
#endif /* DEBUG */

    /*
     * Reset the disp_queue steal time; we do not know what the smallest
     * value across the queue is.
     */
    dp->disp_steal = 0;

    tp->t_schedflag |= TS_DONT_SWAP;

    /*
     * Setup thread to run on the current CPU.
     */
    tp->t_disp_queue = cp->cpu_disp;

    cp->cpu_dispthread = tp;            /* protected by spl only */
    cp->cpu_dispatch_pri = pri;
    ASSERT(pri == DISP_PRIO(tp));

    DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);

    thread_onproc(tp, cp);              /* set t_state to TS_ONPROC */

    /*
     * Return with spl high so that swtch() won't need to raise it.
     * The disp_lock was dropped by dispdeq().
     */

    return (tp);
}
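/*
 * Illustrative sketch (not code from this file) of the nosteal decision made
 * in the loop above: a thread is left on its run queue only while the
 * (scaled) time it has spent there is within nosteal_nsec, and the queue's
 * disp_steal timestamp remembers when the earliest such thread becomes
 * stealable so that disp_getwork() can avoid rescanning too soon.  The
 * helper name below is hypothetical and ignores the other conditions
 * listed in the comment above.
 */
#if 0   /* example only, never compiled */
static boolean_t
example_thread_stealable(kthread_t *tp, hrtime_t nosteal)
{
    hrtime_t rqtime = gethrtime_unscaled() - tp->t_waitrq;

    scalehrtime(&rqtime);               /* convert to nanoseconds */

    /* A negative value guards against clock drift between CPUs. */
    return (nosteal == 0 || rqtime > nosteal || rqtime < 0);
}
#endif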
/*
 * disp_bound_common() - common routine for higher level functions
 *	that check for bound threads under certain conditions.
 *	If 'threadlistsafe' is set then there is no need to acquire
 *	pidlock to stop the thread list from changing (e.g., if
 *	disp_bound_* is called with cpus paused).
 */
static int
disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
{
    int         found = 0;
    kthread_t   *tp;

    ASSERT(flag);

    if (!threadlistsafe)
        mutex_enter(&pidlock);
    tp = curthread;             /* faster than allthreads */
    do {
        if (tp->t_state != TS_FREE) {
            /*
             * If an interrupt thread is busy, but the
             * caller doesn't care (i.e. BOUND_INTR is off),
             * then just ignore it and continue through.
             */
            if ((tp->t_flag & T_INTR_THREAD) &&
                !(flag & BOUND_INTR))
                continue;

            /*
             * Skip the idle thread for the CPU
             * we're about to set offline.
             */
            if (tp == cp->cpu_idle_thread)
                continue;

            /*
             * Skip the pause thread for the CPU
             * we're about to set offline.
             */
            if (tp == cp->cpu_pause_thread)
                continue;

            if ((flag & BOUND_CPU) &&
                (tp->t_bound_cpu == cp ||
                tp->t_bind_cpu == cp->cpu_id ||
                tp->t_weakbound_cpu == cp)) {
                found = 1;
                break;
            }

            if ((flag & BOUND_PARTITION) &&
                (tp->t_cpupart == cp->cpu_part)) {
                found = 1;
                break;
            }
        }
    } while ((tp = tp->t_next) != curthread && found == 0);
    if (!threadlistsafe)
        mutex_exit(&pidlock);
    return (found);
}

/*
 * disp_bound_threads - return nonzero if threads are bound to the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_threads(cpu_t *cp, int threadlistsafe)
{
    return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
}

/*
 * disp_bound_anythreads - return nonzero if _any_ threads are bound
 * to the given processor, including interrupt threads.
 */
int
disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
{
    return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
}

/*
 * disp_bound_partition - return nonzero if threads are bound to the same
 * partition as the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_partition(cpu_t *cp, int threadlistsafe)
{
    return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
}
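/*
 * Illustrative sketch (not code from this file): a caller preparing to take
 * a CPU offline might use the predicates above roughly as follows, refusing
 * the operation while ordinary threads are bound to the CPU and applying the
 * stricter check (which includes interrupt threads) when the CPU must be
 * able to disappear completely.  The actual policy lives with the CPU
 * management code, not here; the function and error choices below are
 * hypothetical.
 */
#if 0   /* example only, never compiled */
static int
example_can_offline(cpu_t *cp)
{
    ASSERT(MUTEX_HELD(&cpu_lock));

    /* Ordinary bound threads (not interrupt threads) block the offline. */
    if (disp_bound_threads(cp, 0))
        return (EBUSY);

    /* A stricter caller could also insist no interrupt thread is bound. */
    if (disp_bound_anythreads(cp, 0))
        return (EBUSY);

    return (0);
}
#endif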
/*
 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
 * threads to other CPUs.
 */
void
disp_cpu_inactive(cpu_t *cp)
{
    kthread_t   *tp;
    disp_t      *dp = cp->cpu_disp;
    dispq_t     *dq;
    pri_t       pri;
    int         wasonq;

    disp_lock_enter(&dp->disp_lock);
    while ((pri = dp->disp_max_unbound_pri) != -1) {
        dq = &dp->disp_q[pri];
        tp = dq->dq_first;

        /*
         * Skip over bound threads.
         */
        while (tp != NULL && tp->t_bound_cpu != NULL) {
            tp = tp->t_link;
        }

        if (tp == NULL) {
            /* disp_max_unbound_pri must be inaccurate, so fix it */
            disp_fix_unbound_pri(dp, pri);
            continue;
        }

        wasonq = dispdeq(tp);           /* drops disp_lock */
        ASSERT(wasonq);
        ASSERT(tp->t_weakbound_cpu == NULL);

        setbackdq(tp);
        /*
         * Called from cpu_offline:
         *
         * cp has already been removed from the list of active cpus
         * and tp->t_cpu has been changed so there is no risk of
         * tp ending up back on cp.
         *
         * Called from cpupart_move_cpu:
         *
         * The cpu has moved to a new cpupart.  Any threads that
         * were on its dispatch queues before the move remain
         * in the old partition and can't run in the new partition.
         */
        ASSERT(tp->t_cpu != cp);
        thread_unlock(tp);

        disp_lock_enter(&dp->disp_lock);
    }
    disp_lock_exit(&dp->disp_lock);
}

/*
 * disp_lowpri_cpu - find CPU running the lowest priority thread.
 *	The hint passed in is used as a starting point so we don't favor
 *	CPU 0 or any other CPU.  The caller should pass in the most recently
 *	used CPU for the thread.
 *
 *	The lgroup and priority are used to determine the best CPU to run on
 *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
 *	the thread priority will indicate whether the thread will actually run
 *	there.  To pick the best CPU, the CPUs inside and outside of the given
 *	lgroup which are running the lowest priority threads are found.  The
 *	remote CPU is chosen only if the thread will not run locally on a CPU
 *	within the lgroup, but will run on the remote CPU.  If the thread
 *	cannot immediately run on any CPU, the best local CPU will be chosen.
 *
 *	The lpl specified also identifies the cpu partition from which
 *	disp_lowpri_cpu should select a CPU.
 *
 *	curcpu is used to indicate that disp_lowpri_cpu is being called on
 *	behalf of the current thread (curthread is looking for a new cpu).
 *	In this case, cpu_dispatch_pri for this thread's cpu should be
 *	ignored.
 *
 *	If a cpu is the target of an offline request then try to avoid it.
 *
 *	This function must be called at either high SPL, or with preemption
 *	disabled, so that the "hint" CPU cannot be removed from the online
 *	CPU list while we are traversing it.
 */
cpu_t *
disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
{
    cpu_t       *bestcpu;
    cpu_t       *besthomecpu;
    cpu_t       *cp, *cpstart;

    pri_t       bestpri;
    pri_t       cpupri;

    klgrpset_t  done;
    klgrpset_t  cur_set;

    lpl_t       *lpl_iter, *lpl_leaf;
    int         i;

    /*
     * Scan for a CPU currently running the lowest priority thread.
     * Cannot get cpu_lock here because it is adaptive.
     * We do not require lock on CPU list.
     */
    ASSERT(hint != NULL);
    ASSERT(lpl != NULL);
    ASSERT(lpl->lpl_ncpu > 0);

    /*
     * First examine local CPUs.  Note that it's possible the hint CPU
     * passed in is remote to the specified home lgroup.  If our priority
     * isn't sufficient to run immediately at home, then examine CPUs
     * remote to our home lgroup.
     * We would like to give preference to CPUs closest to "home".
     * If we can't find a CPU where we'll run at a given level
     * of locality, we expand our search to include the next level.
     */
    bestcpu = besthomecpu = NULL;
    klgrpset_clear(done);
    /* start with lpl we were passed */

    lpl_iter = lpl;

    do {

        bestpri = SHRT_MAX;
        klgrpset_clear(cur_set);

        for (i = 0; i < lpl_iter->lpl_nrset; i++) {
            lpl_leaf = lpl_iter->lpl_rset[i];
            if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
                continue;

            klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);

            if (hint->cpu_lpl == lpl_leaf)
                cp = cpstart = hint;
            else
                cp = cpstart = lpl_leaf->lpl_cpus;

            do {
                if (cp == curcpu)
                    cpupri = -1;
                else if (cp == cpu_inmotion)
                    cpupri = SHRT_MAX;
                else
                    cpupri = cp->cpu_dispatch_pri;
                if (cp->cpu_disp->disp_maxrunpri > cpupri)
                    cpupri = cp->cpu_disp->disp_maxrunpri;
                if (cp->cpu_chosen_level > cpupri)
                    cpupri = cp->cpu_chosen_level;
                if (cpupri < bestpri) {
                    if (CPU_IDLING(cpupri)) {
                        ASSERT((cp->cpu_flags &
                            CPU_QUIESCED) == 0);
                        return (cp);
                    }
                    bestcpu = cp;
                    bestpri = cpupri;
                }
            } while ((cp = cp->cpu_next_lpl) != cpstart);
        }

        if (bestcpu && (tpri > bestpri)) {
            ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
            return (bestcpu);
        }
        if (besthomecpu == NULL)
            besthomecpu = bestcpu;
        /*
         * Add the lgrps we just considered to the "done" set
         */
        klgrpset_or(done, cur_set);

    } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);

    /*
     * The specified priority isn't high enough to run immediately
     * anywhere, so just return the best CPU from the home lgroup.
     */
    ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
    return (besthomecpu);
}
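/*
 * Illustrative sketch (not code from this file) of how the loop above rates
 * a candidate CPU: the effective priority the thread must beat is the
 * highest of what the CPU is currently running, what is waiting on its
 * dispatch queue, and any priority already reserved via cpu_chosen_level,
 * with the caller's own CPU treated as free and an offlining/moving CPU
 * treated as unusable.  The helper name below is hypothetical.
 */
#if 0   /* example only, never compiled */
static pri_t
example_cpu_effective_pri(cpu_t *cp, cpu_t *curcpu)
{
    pri_t cpupri;

    if (cp == curcpu)
        cpupri = -1;                    /* our own CPU is free to us */
    else if (cp == cpu_inmotion)
        cpupri = SHRT_MAX;              /* target of offline/move request */
    else
        cpupri = cp->cpu_dispatch_pri;

    if (cp->cpu_disp->disp_maxrunpri > cpupri)
        cpupri = cp->cpu_disp->disp_maxrunpri;
    if (cp->cpu_chosen_level > cpupri)
        cpupri = cp->cpu_chosen_level;

    return (cpupri);    /* the thread runs here only if tpri > cpupri */
}
#endif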
/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor's specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
}

/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}

/*
 * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
 *	- t->t_cpu is not in this thread's assigned lgrp
 *	- the time since the thread last came off t->t_cpu exceeds the
 *	  rechoose time for this cpu (ignore this if t is curthread in
 *	  which case it's on CPU and t->t_disp_time is inaccurate)
 *	- t->t_cpu is presently the target of an offline or partition move
 *	  request
 */
static cpu_t *
cpu_choose(kthread_t *t, pri_t tpri)
{
    ASSERT(tpri < kpqpri);

    if ((((lbolt - t->t_disp_time) > rechoose_interval) &&
        t != curthread) || t->t_cpu == cpu_inmotion) {
        return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
    }

    /*
     * Take a trip through disp_lowpri_cpu() if the thread was
     * running outside its home lgroup.
     */
    if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
        t->t_cpu->cpu_lpl->lpl_lgrpid)) {
        return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
            (t == curthread) ? t->t_cpu : NULL));
    }
    return (t->t_cpu);
}
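/*
 * Illustrative note (not code from this file): rechoose_interval is measured
 * in clock ticks, so with the default of 3 ticks and a typical 100 Hz clock
 * a thread that has been off its last CPU for more than roughly 30ms no
 * longer gets automatic affinity for t->t_cpu and goes through
 * disp_lowpri_cpu() instead.  Like other dispatcher tunables it can be
 * adjusted in /etc/system, e.g.:
 *
 *	set rechoose_interval = 10
 *
 * The value above is only an example, not a recommendation.
 */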