/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/


#pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysinfo.h>
#include <sys/var.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/inline.h>
#include <sys/disp.h>
#include <sys/class.h>
#include <sys/bitmap.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/pg.h>
#include <sys/cmt.h>
#include <sys/bitset.h>
#include <sys/schedctl.h>
#include <sys/atomic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>

#include <vm/as.h>

#define	BOUND_CPU	0x1
#define	BOUND_PARTITION	0x2
#define	BOUND_INTR	0x4

/* Dispatch queue allocation structure and functions */
struct disp_queue_info {
	disp_t	*dp;
	dispq_t	*olddispq;
	dispq_t	*newdispq;
	ulong_t	*olddqactmap;
	ulong_t	*newdqactmap;
	int	oldnglobpris;
};
static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void	disp_dq_free(struct disp_queue_info *dptr);

/* platform-specific routine to call when processor is idle */
static void	generic_idle_cpu();
void		(*idle_cpu)() = generic_idle_cpu;

/* routines invoked when a CPU enters/exits the idle loop */
static void	idle_enter();
static void	idle_exit();

/* platform-specific routine to call when thread is enqueued */
static void	generic_enq_thread(cpu_t *, int);
void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;

pri_t	kpreemptpri;		/* priority where kernel preemption applies */
pri_t	upreemptpri = 0;	/* priority where normal preemption applies */
pri_t	intr_pri;		/* interrupt thread priority base level */

#define	KPQPRI	-1		/* pri where cpu affinity is dropped for kpq */
pri_t	kpqpri = KPQPRI;	/* can be set in /etc/system */
disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
int	nswapped;		/* total number of swapped threads */
void	disp_swapped_enq(kthread_t *tp);
static void	disp_swapped_setrun(kthread_t *tp);
static void	cpu_resched(cpu_t *cp, pri_t tpri);
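/*
 * Preemption sketch (illustrative values, assuming a typical configuration
 * where kpreemptpri ends up as v.v_maxsyspri + 1, e.g. 100, and upreemptpri
 * is 0): when a thread of priority p is enqueued on a CPU running
 * lower-priority work, cpu_resched() sets
 *
 *	cpu_runrun	for p >= upreemptpri (preempt at the next return
 *			to user level), and additionally
 *	cpu_kprunrun	for p >= kpreemptpri (force a kernel preemption).
 *
 * kpqpri may be tuned from /etc/system (e.g. "set kpqpri=100" -- the value
 * shown is only an example) so that unbound threads at or above that
 * priority are placed on the partition-wide kp queue by setkpdq() rather
 * than on a per-CPU queue.
 */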
/*
 * If this is set, only interrupt threads will cause kernel preemptions.
 * This is done by changing the value of kpreemptpri.  kpreemptpri
 * will either be the max sysclass pri + 1 or the min interrupt pri.
 */
int	only_intr_kpreempt;

extern void	set_idle_cpu(int cpun);
extern void	unset_idle_cpu(int cpun);
static void	setkpdq(kthread_t *tp, int borf);
#define	SETKP_BACK	0
#define	SETKP_FRONT	1
/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.  The interval is measured in clock ticks.
 */
#define	RECHOOSE_INTERVAL 3
int	rechoose_interval = RECHOOSE_INTERVAL;
static cpu_t	*cpu_choose(kthread_t *, pri_t);

/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * sit on a run queue before it can be stolen by another CPU, to reduce
 * migrations.
 *
 * nosteal_nsec should be set by platform code to an appropriate value.
 * Setting it to 0 effectively disables the nosteal 'protection'.
 */
hrtime_t nosteal_nsec = -1;

id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */

disp_lock_t	transition_lock;	/* lock on transitioning threads */
disp_lock_t	stop_lock;		/* lock on stopped threads */

static void	cpu_dispqalloc(int numpris);

/*
 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 * a thread because it was sitting on its run queue for a very short
 * period of time.
 */
#define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */

static kthread_t	*disp_getwork(cpu_t *to);
static kthread_t	*disp_getbest(disp_t *from);
static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);

void	swtch_to(kthread_t *);

/*
 * dispatcher and scheduler initialization
 */

/*
 * disp_setup - Common code to calculate and allocate dispatcher
 *		variables and structures based on the maximum priority.
 */
static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
{
	pri_t	newnglobpris;

	ASSERT(MUTEX_HELD(&cpu_lock));

	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;

	if (newnglobpris > oldnglobpris) {
		/*
		 * Allocate new kp queues for each CPU partition.
		 */
		cpupart_kpqalloc(newnglobpris);

		/*
		 * Allocate new dispatch queues for each CPU.
		 */
		cpu_dispqalloc(newnglobpris);

		/*
		 * compute new interrupt thread base priority
		 */
		intr_pri = maxglobpri;
		if (only_intr_kpreempt) {
			kpreemptpri = intr_pri + 1;
			if (kpqpri == KPQPRI)
				kpqpri = kpreemptpri;
		}
		v.v_nglobpris = newnglobpris;
	}
}

/*
 * dispinit - Called to initialize all loaded classes and the
 *	      dispatcher framework.
 */
void
dispinit(void)
{
	id_t	cid;
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	maxglobpri = -1;

	/*
	 * Initialize transition lock, which will always be set.
	 */
	DISP_LOCK_INIT(&transition_lock);
	disp_lock_enter_high(&transition_lock);
	DISP_LOCK_INIT(&stop_lock);

	mutex_enter(&cpu_lock);
	CPU->cpu_disp->disp_maxrunpri = -1;
	CPU->cpu_disp->disp_max_unbound_pri = -1;

	/*
	 * Initialize the default CPU partition.
	 */
	cpupart_initialize_default();
	/*
	 * Call the class specific initialization functions for
	 * all pre-installed schedulers.
	 *
	 * We pass the size of a class specific parameter
	 * buffer to each of the initialization functions
	 * to try to catch problems with backward compatibility
	 * of class modules.
	 *
	 * For example, a new class module running on an old system
	 * which didn't provide sufficiently large parameter buffers
	 * would be bad news.  Class initialization modules can check for
	 * this and take action if they detect a problem.
	 */

	for (cid = 0; cid < nclass; cid++) {
		sclass_t	*sc;

		sc = &sclass[cid];
		if (SCHED_INSTALLED(sc)) {
			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
			    &sc->cl_funcs);
			if (cl_maxglobpri > maxglobpri)
				maxglobpri = cl_maxglobpri;
		}
	}
	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
	if (kpqpri == KPQPRI)
		kpqpri = kpreemptpri;

	ASSERT(maxglobpri >= 0);
	disp_setup(maxglobpri, 0);

	mutex_exit(&cpu_lock);

	/*
	 * Get the default class ID; this may be later modified via
	 * dispadmin(1M).  This will load the class (normally TS) and that
	 * will call disp_add(), which is why we had to drop cpu_lock first.
	 */
	if (getcid(defaultclass, &defaultcid) != 0) {
		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
		    defaultclass);
	}
}

/*
 * disp_add - Called with class pointer to initialize the dispatcher
 *	      for a newly loaded class.
 */
void
disp_add(sclass_t *clp)
{
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	mutex_enter(&cpu_lock);
	/*
	 * Initialize the scheduler class.
	 */
	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
	if (cl_maxglobpri > maxglobpri)
		maxglobpri = cl_maxglobpri;

	/*
	 * Save old queue information.  Since we're initializing a
	 * new scheduling class which has just been loaded, the size
	 * of the dispq may have changed.  We need to handle that here.
	 */
	disp_setup(maxglobpri, v.v_nglobpris);

	mutex_exit(&cpu_lock);
}


/*
 * For each CPU, allocate new dispatch queues
 * with the stated number of priorities.
 */
static void
cpu_dispqalloc(int numpris)
{
	cpu_t	*cpup;
	struct disp_queue_info	*disp_mem;
	int i, num;

	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_mem = kmem_zalloc(NCPU *
	    sizeof (struct disp_queue_info), KM_SLEEP);

	/*
	 * This routine must allocate all of the memory before stopping
	 * the cpus because it must not sleep in kmem_alloc while the
	 * CPUs are stopped.  Locks they hold will not be freed until they
	 * are restarted.
	 */
	i = 0;
	cpup = cpu_list;
	do {
		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
		i++;
		cpup = cpup->cpu_next;
	} while (cpup != cpu_list);
	num = i;

	pause_cpus(NULL);
	for (i = 0; i < num; i++)
		disp_dq_assign(&disp_mem[i], numpris);
	start_cpus();

	/*
	 * All of the memory must be freed after starting the cpus because
	 * we cannot risk sleeping in kmem_free while the cpus are stopped.
	 */
	for (i = 0; i < num; i++)
		disp_dq_free(&disp_mem[i]);

	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
}

static void
disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
{
	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dptr->dp = dp;
}

static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
	disp_t	*dp;

	dp = dptr->dp;
	dptr->olddispq = dp->disp_q;
	dptr->olddqactmap = dp->disp_qactmap;
	dptr->oldnglobpris = dp->disp_npri;

	ASSERT(dptr->oldnglobpris < numpris);

	if (dptr->olddispq != NULL) {
		/*
		 * Use kcopy because bcopy is platform-specific
		 * and could block while we might have paused the cpus.
		 */
		(void) kcopy(dptr->olddispq, dptr->newdispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
		    sizeof (long));
	}
	dp->disp_q = dptr->newdispq;
	dp->disp_qactmap = dptr->newdqactmap;
	dp->disp_q_limit = &dptr->newdispq[numpris];
	dp->disp_npri = numpris;
}

static void
disp_dq_free(struct disp_queue_info *dptr)
{
	if (dptr->olddispq != NULL)
		kmem_free(dptr->olddispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
	if (dptr->olddqactmap != NULL)
		kmem_free(dptr->olddqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
}

/*
 * For a newly created CPU, initialize the dispatch queue.
 * This is called before the CPU is known through cpu[] or on any lists.
 */
void
disp_cpu_init(cpu_t *cp)
{
	disp_t	*dp;
	dispq_t	*newdispq;
	ulong_t	*newdqactmap;

	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */

	if (cp == cpu0_disp.disp_cpu)
		dp = &cpu0_disp;
	else
		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
	bzero(dp, sizeof (disp_t));
	cp->cpu_disp = dp;
	dp->disp_cpu = cp;
	dp->disp_maxrunpri = -1;
	dp->disp_max_unbound_pri = -1;
	DISP_LOCK_INIT(&cp->cpu_thread_lock);
	/*
	 * Allocate memory for the dispatcher queue headers
	 * and the active queue bitmap.
	 */
	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dp->disp_q = newdispq;
	dp->disp_qactmap = newdqactmap;
	dp->disp_q_limit = &newdispq[v.v_nglobpris];
	dp->disp_npri = v.v_nglobpris;
}

void
disp_cpu_fini(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_kp_free(cp->cpu_disp);
	if (cp->cpu_disp != &cpu0_disp)
		kmem_free(cp->cpu_disp, sizeof (disp_t));
}

/*
 * Allocate new, larger kpreempt dispatch queue to replace the old one.
 */
void
disp_kp_alloc(disp_t *dq, pri_t npri)
{
	struct disp_queue_info	mem_info;

	if (npri > dq->disp_npri) {
		/*
		 * Allocate memory for the new array.
		 */
		disp_dq_alloc(&mem_info, npri, dq);

		/*
		 * We need to copy the old structures to the new
		 * and free the old.
		 */
		disp_dq_assign(&mem_info, npri);
		disp_dq_free(&mem_info);
	}
}

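/*
 * Sizing sketch (illustrative numbers): the active-queue bitmap needs one
 * bit per priority, rounded up to whole ulong_t words.  For example, with
 * 170 global priorities on a 64-bit kernel (BT_NBIPUL == 64), disp_dq_alloc()
 * and disp_cpu_init() above allocate (170 / 64) + 1 == 3 words for
 * disp_qactmap.
 */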
/*
 * Free dispatch queue.
 * Used for the kpreempt queues for a removed CPU partition and
 * for the per-CPU queues of deleted CPUs.
 */
void
disp_kp_free(disp_t *dq)
{
	struct disp_queue_info	mem_info;

	mem_info.olddispq = dq->disp_q;
	mem_info.olddqactmap = dq->disp_qactmap;
	mem_info.oldnglobpris = dq->disp_npri;
	disp_dq_free(&mem_info);
}

/*
 * End dispatcher and scheduler initialization.
 */

/*
 * See if there's anything to do other than remain idle.
 * Return non-zero if there is.
 *
 * This function must be called with high spl, or with
 * kernel preemption disabled to prevent the partition's
 * active cpu list from changing while being traversed.
 */
int
disp_anywork(void)
{
	cpu_t	*cp = CPU;
	cpu_t	*ocp;

	if (cp->cpu_disp->disp_nrunnable != 0)
		return (1);

	if (!(cp->cpu_flags & CPU_OFFLINE)) {
		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
			return (1);

		/*
		 * Work can be taken from another CPU if:
		 *	- There is unbound work on the run queue
		 *	- That work isn't a thread undergoing a
		 *	  context switch on an otherwise empty queue.
		 *	- The CPU isn't running the idle loop.
		 */
		for (ocp = cp->cpu_next_part; ocp != cp;
		    ocp = ocp->cpu_next_part) {
			ASSERT(CPU_ACTIVE(ocp));

			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
			    ocp->cpu_disp->disp_nrunnable == 1) &&
			    ocp->cpu_dispatch_pri != -1)
				return (1);
		}
	}
	return (0);
}

/*
 * Called when CPU enters the idle loop
 */
static void
idle_enter()
{
	cpu_t	*cp = CPU;

	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Called when CPU exits the idle loop
 */
static void
idle_exit()
{
	cpu_t	*cp = CPU;

	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Idle loop.
 */
void
idle()
{
	struct cpu	*cp = CPU;	/* pointer to this CPU */
	kthread_t	*t;		/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter();	/* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			idle_exit();
			swtch();
		} else {
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/*
			 * If there was a thread but we couldn't steal
			 * it, then keep trying.
			 */
			if (t == T_DONTSTEAL)
				continue;
			idle_exit();
			swtch_to(t);
		}
		idle_enter();	/* returned from swtch/swtch_to */
	}
}


/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues.  If panicking, turn
 * preemption off.
 */
void
preempt()
{
	kthread_t	*t = curthread;
	klwp_t		*lwp = ttolwp(curthread);

	if (panicstr)
		return;

	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

	thread_lock(t);

	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
		/*
		 * This thread has already been chosen to be run on
		 * another CPU.  Clear kprunrun on this CPU since we're
		 * already headed for swtch().
		 */
		CPU->cpu_kprunrun = 0;
		thread_unlock_nopreempt(t);
		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
	} else {
		if (lwp != NULL)
			lwp->lwp_ru.nivcsw++;
		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
		THREAD_TRANSITION(t);
		CL_PREEMPT(t);
		DTRACE_SCHED(preempt);
		thread_unlock_nopreempt(t);

		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

		swtch();	/* clears CPU->cpu_runrun via disp() */
	}
}

extern kthread_t *thread_unpin();

/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL ||
			    tp == T_DONTSTEAL) {
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		return (tp);
	}

	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.  If disp_nrunnable != 0, we just
		 * took the last runnable thread off the highest non-empty
		 * queue, so recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	return (tp);
}

/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, or stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Set up and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			if (t == cp->cpu_idle_thread) {
				PG_NRUN_UPDATE(cp, 1);
			} else if (next == cp->cpu_idle_thread) {
				PG_NRUN_UPDATE(cp, -1);
			}

			/*
			 * If t was previously in the TS_ONPROC state,
			 * setfrontdq and setbackdq won't have set its t_waitrq.
			 * Since we now finally know that we're switching away
			 * from this thread, set its t_waitrq if it is on a run
			 * queue.
			 */
			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
				t->t_waitrq = gethrtime_unscaled();
			}

			/*
			 * restore mstate of thread that we are switching to
			 */
			restore_mstate(next);

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			cp->cpu_last_swtch = t->t_disp_time = lbolt;
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}

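/*
 * Usage sketch (simplified and illustrative only): a typical blocking path
 * parks the current thread under thread_lock() and then calls swtch(),
 * roughly as cv_wait() does with its condvar-internal helper cv_block():
 *
 *	thread_lock(curthread);
 *	cv_block(cvp);				changes t_state to TS_SLEEP
 *	thread_unlock_nopreempt(curthread);
 *	swtch();				picks the next thread via disp()
 *
 * Turnstiles, signal handling and other details are omitted here.
 */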
/*
 * swtch_from_zombie()
 *	Special case of swtch(), which allows checks for TS_ZOMB to be
 *	eliminated from normal resume.
 *	Find best runnable thread and run it.
 *	Called with the current thread zombied.
 *	Zombies cannot migrate, so CPU references are safe.
 */
void
swtch_from_zombie()
{
	kthread_t	*next;
	cpu_t		*cpu = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	ASSERT(curthread->t_state == TS_ZOMB);

	next = disp();			/* returns with spl high */
	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
	ASSERT(next != curthread);
	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	if (next == cpu->cpu_idle_thread)
		PG_NRUN_UPDATE(cpu, -1);

	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume_from_zombie(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we certainly will not
	 * return here
	 */
}

#if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
static int
thread_on_queue(kthread_t *tp)
{
	cpu_t	*cp;
	cpu_t	*self;
	disp_t	*dp;

	self = CPU;
	cp = self->cpu_next_onln;
	dp = cp->cpu_disp;
	for (;;) {
		dispq_t		*dq;
		dispq_t		*eq;

		disp_lock_enter_high(&dp->disp_lock);
		for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
			kthread_t	*rp;

			ASSERT(dq->dq_last == NULL ||
			    dq->dq_last->t_link == NULL);
			for (rp = dq->dq_first; rp; rp = rp->t_link)
				if (tp == rp) {
					disp_lock_exit_high(&dp->disp_lock);
					return (1);
				}
		}
		disp_lock_exit_high(&dp->disp_lock);
		if (cp == NULL)
			break;
		if (cp == self) {
			/* check the partition-wide kp queue last */
			dp = &self->cpu_part->cp_kp_queue;
			cp = NULL;
		} else {
			cp = cp->cpu_next_onln;
			dp = cp->cpu_disp;
		}
	}
	return (0);
}	/* end of thread_on_queue */
#else

#define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */

#endif	/* DEBUG */

/*
 * Like swtch(), but switch to a specified thread taken from another CPU.
 * Called with spl high.
 */
void
swtch_to(kthread_t *next)
{
	cpu_t	*cp = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	/*
	 * Update context switch statistics.
	 */
	CPU_STATS_ADDQ(cp, sys, pswitch, 1);

	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	if (curthread == cp->cpu_idle_thread)
		PG_NRUN_UPDATE(cp, 1);

	/* OK to steal anything left on run queue */
	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

	/* record last execution time */
	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;

	/*
	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
	 * won't have set its t_waitrq.  Since we now finally know that we're
	 * switching away from this thread, set its t_waitrq if it is on a run
	 * queue.
	 */
	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
		curthread->t_waitrq = gethrtime_unscaled();
	}

	/* restore next thread to previously running microstate */
	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we may not
	 * return here
	 */
}



#define	CPU_IDLING(pri)	((pri) == -1)

static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
	int	call_poke_cpu = 0;
	pri_t	cpupri = cp->cpu_dispatch_pri;

	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
			cp->cpu_runrun = 1;
			aston(cp->cpu_dispthread);
			if (tpri < kpreemptpri && cp != CPU)
				call_poke_cpu = 1;
		}
		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
			cp->cpu_kprunrun = 1;
			if (cp != CPU)
				call_poke_cpu = 1;
		}
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	if (call_poke_cpu)
		poke_cpu(cp->cpu_id);
}

/*
 * Perform multi-level CMT load balancing of running threads.
 * tp is the thread being enqueued.
 * cp is the hint CPU (chosen by cpu_choose()).
 */
static cpu_t *
cmt_balance(kthread_t *tp, cpu_t *cp)
{
	int		hint, i, cpu, nsiblings;
	int		self = 0;
	group_t		*cmt_pgs, *siblings;
	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
	int		pg_nrun, tpg_nrun;
	int		level = 0;
	cpu_t		*newcp;

	ASSERT(THREAD_LOCK_HELD(tp));

	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	if (GROUP_SIZE(cmt_pgs) == 0)
		return (cp);	/* nothing to do */

	if (tp == curthread)
		self = 1;

	/*
	 * Balance across siblings in the CPU's CMT lineage
	 */
	do {
		pg = GROUP_ACCESS(cmt_pgs, level);

		siblings = pg->cmt_siblings;
		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */
		if (nsiblings == 1)
			continue;	/* nobody to balance against */

		pg_nrun = pg->cmt_nrunning;
		if (self &&
		    bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid))
			pg_nrun--;	/* Ignore curthread's effect */

		hint = pg->cmt_hint;
		/*
		 * Check for validity of the hint.
		 * It should reference a valid sibling.
		 */
		if (hint >= nsiblings)
			hint = pg->cmt_hint = 0;
		else
			pg->cmt_hint++;

		/*
		 * Find a balancing candidate from among our siblings.
		 * "hint" is a hint for where to start looking.
		 */
		i = hint;
		do {
			ASSERT(i < nsiblings);
			pg_tmp = GROUP_ACCESS(siblings, i);

			/*
			 * The candidate must not be us, and must
			 * have some CPU resources in the thread's
			 * partition
			 */
			if (pg_tmp != pg &&
			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
			    ((pg_t *)pg_tmp)->pg_id)) {
				tpg = pg_tmp;
				break;
			}

			if (++i >= nsiblings)
				i = 0;
		} while (i != hint);

		if (!tpg)
			continue;	/* no candidates at this level */

		/*
		 * Check if the balancing target is underloaded.
		 * Decide to balance if the target is running fewer
		 * threads, or if it's running the same number of threads
		 * with more online CPUs.
		 */
		tpg_nrun = tpg->cmt_nrunning;
		if (pg_nrun > tpg_nrun ||
		    (pg_nrun == tpg_nrun &&
		    (GROUP_SIZE(&tpg->cmt_cpus_actv) >
		    GROUP_SIZE(&pg->cmt_cpus_actv)))) {
			break;
		}
		tpg = NULL;
	} while (++level < GROUP_SIZE(cmt_pgs));


	if (tpg) {
		/*
		 * Select an idle CPU from the target PG
		 */
		for (cpu = 0; cpu < GROUP_SIZE(&tpg->cmt_cpus_actv); cpu++) {
			newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
			if (newcp->cpu_part == tp->t_cpupart &&
			    newcp->cpu_dispatch_pri == -1) {
				cp = newcp;
				break;
			}
		}
	}

	return (cp);
}

/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq
 * lengths must match.  When the per-thread TS_RUNQMATCH flag is set,
 * setbackdq() will try to keep runqs perfectly balanced regardless of
 * the thread priority.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)

/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setbackdq(kthread_t *tp)
{
	dispq_t		*dq;
	disp_t		*dp;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_BACK);
			return;
		}
		/*
		 * Let cpu_choose suggest a CPU.
		 */
		cp = cpu_choose(tp, tpri);

		if (tp->t_cpupart == cp->cpu_part) {
			int	qlen;

			/*
			 * Perform any CMT load balancing
			 */
			cp = cmt_balance(tp, cp);

			/*
			 * Balance across the run queues
			 */
			qlen = RUNQ_LEN(cp, tpri);
			if (tpri >= RUNQ_MATCH_PRI &&
			    !(tp->t_schedflag & TS_RUNQMATCH))
				qlen -= RUNQ_MAX_DIFF;
			if (qlen > 0) {
				cpu_t *newcp;

				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
					newcp = cp->cpu_next_part;
				} else if ((newcp = cp->cpu_next_lpl) == cp) {
					newcp = cp->cpu_next_part;
				}

				if (RUNQ_LEN(newcp, tpri) < qlen) {
					DTRACE_PROBE3(runq__balance,
					    kthread_t *, tp,
					    cpu_t *, cp, cpu_t *, newcp);
					cp = newcp;
				}
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}
	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((tp != curthread) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
	    tpri, cp, tp);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_first != NULL);
		dq->dq_last->t_link = tp;
		dq->dq_last = tp;
	} else {
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch.  We may just switch to it
			 * again right away.  CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

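/*
 * Balancing sketch (illustrative numbers): with RUNQ_MATCH_PRI == 16 and
 * RUNQ_MAX_DIFF == 2, a priority-59 thread whose chosen CPU already has
 * three runnable threads at that priority is treated as qlen == 3 - 2 == 1,
 * so setbackdq() only moves it to the alternate CPU if that CPU's
 * priority-59 queue is shorter than 1, i.e. empty.  Threads below priority
 * 16, or threads with TS_RUNQMATCH set, compare raw queue lengths instead.
 */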
/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setfrontdq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_FRONT);
			return;
		}
		cp = tp->t_cpu;
		if (tp->t_cpupart == cp->cpu_part) {
			/*
			 * If we are of higher or equal priority than
			 * the highest priority runnable thread of
			 * the current CPU, just pick this CPU.  Otherwise,
			 * let cpu_choose() select the CPU.  If this cpu
			 * is the target of an offline request, then do not
			 * pick it - a thread_nomigrate() on the in-motion
			 * cpu relies on this when it forces a preempt.
			 */
			if (tpri < cp->cpu_disp->disp_maxrunpri ||
			    cp == cpu_inmotion)
				cp = cpu_choose(tp, tpri);
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}

	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((tp != curthread) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
	tp->t_disp_queue = dp;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_last != NULL);
		tp->t_link = dq->dq_first;
		dq->dq_first = tp;
	} else {
		ASSERT(dq->dq_last == NULL);
		ASSERT(dq->dq_first == NULL);
		tp->t_link = NULL;
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch.  We may just switch to it
			 * again right away.  CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

/*
 * Put a high-priority unbound thread on the kp queue
 */
static void
setkpdq(kthread_t *tp, int borf)
{
	dispq_t	*dq;
	disp_t	*dp;
	cpu_t	*cp;
	pri_t	tpri;

	tpri = DISP_PRIO(tp);

	dp = &tp->t_cpupart->cp_kp_queue;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	dp->disp_nrunnable++;
	dq = &dp->disp_q[tpri];

	if (dq->dq_sruncnt++ != 0) {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first != NULL);
			tp->t_link = NULL;
			dq->dq_last->t_link = tp;
			dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last != NULL);
			tp->t_link = dq->dq_first;
			dq->dq_first = tp;
		}
	} else {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first == NULL);
			ASSERT(dq->dq_last == NULL);
			dq->dq_first = dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last == NULL);
			ASSERT(dq->dq_first == NULL);
			tp->t_link = NULL;
			dq->dq_first = dq->dq_last = tp;
		}
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_max_unbound_pri)
			dp->disp_max_unbound_pri = tpri;
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
		}
	}

	cp = tp->t_cpu;
	if (tp->t_cpupart != cp->cpu_part) {
		/* migrate to a cpu in the new partition */
		cp = tp->t_cpupart->cp_cpulist;
	}
	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	if (cp->cpu_chosen_level < tpri)
		cp->cpu_chosen_level = tpri;
	cpu_resched(cp, tpri);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
	(*disp_enq_thread)(cp, 0);
}

/*
 * Remove a thread from the dispatcher queue if it is on it.
 * It is not an error if it is not found, but we return whether
 * or not it was found in case the caller wants to check.
 */
int
dispdeq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	kthread_t	*rp;
	kthread_t	*trp;
	kthread_t	**ptp;
	int		tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_RUN)
		return (0);

	/*
	 * The thread is "swapped" or is on the swap queue and
	 * hence no longer on the run queue, so return true.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
		return (1);

	tpri = DISP_PRIO(tp);
	dp = tp->t_disp_queue;
	ASSERT(tpri < dp->disp_npri);
	dq = &dp->disp_q[tpri];
	ptp = &dq->dq_first;
	rp = *ptp;
	trp = NULL;

	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

	/*
	 * Search for thread in queue.
	 * Double links would simplify this at the expense of disp/setrun.
	 */
	while (rp != tp && rp != NULL) {
		trp = rp;
		ptp = &trp->t_link;
		rp = trp->t_link;
	}

	if (rp == NULL) {
		panic("dispdeq: thread not on queue");
	}

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	if ((*ptp = rp->t_link) == NULL)
		dq->dq_last = trp;

	dp->disp_nrunnable--;
	if (--dq->dq_sruncnt == 0) {
		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else if (tpri == dp->disp_maxrunpri) {
			int ipri;

			ipri = bt_gethighbit(dp->disp_qactmap,
			    dp->disp_maxrunpri >> BT_ULSHIFT);
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
			dp->disp_maxrunpri = ipri;
		}
	}
	tp->t_link = NULL;
	THREAD_TRANSITION(tp);		/* put in intermediate state */
	return (1);
}


/*
 * dq_sruninc and dq_srundec are public functions for
 * incrementing/decrementing the sruncnts when a thread on
 * a dispatcher queue is made schedulable/unschedulable by
 * resetting the TS_LOAD flag.
 *
 * The caller MUST have the thread lock and therefore the dispatcher
 * queue lock so that the operation which changes
 * the flag, the operation that checks the status of the thread to
 * determine if it's on a disp queue AND the call to this function
 * are one atomic operation with respect to interrupts.
 */

/*
 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 */
void
dq_sruninc(kthread_t *t)
{
	ASSERT(t->t_state == TS_RUN);
	ASSERT(t->t_schedflag & TS_LOAD);

	THREAD_TRANSITION(t);
	setfrontdq(t);
}

/*
 * See comment on calling conventions above.
 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 */
void
dq_srundec(kthread_t *t)
{
	ASSERT(t->t_schedflag & TS_LOAD);

	(void) dispdeq(t);
	disp_swapped_enq(t);
}

/*
 * Change the dispatcher lock of thread to the "swapped_lock"
 * and return with thread lock still held.
 *
 * Called with thread_lock held, in transition state, and at high spl.
 */
void
disp_swapped_enq(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT(tp->t_schedflag & TS_LOAD);

	switch (tp->t_state) {
	case TS_RUN:
		disp_lock_enter_high(&swapped_lock);
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	case TS_ONPROC:
		disp_lock_enter_high(&swapped_lock);
		THREAD_TRANSITION(tp);
		wake_sched_sec = 1;		/* tell clock to wake sched */
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	default:
		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
	}
}

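/*
 * Flag-test sketch: the check used by setbackdq()/setfrontdq(),
 *
 *	(tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD
 *
 * is true when the thread is either not loaded (TS_LOAD clear) or sitting
 * on the swap queue (TS_ON_SWAPQ set); only a loaded thread that is not on
 * the swap queue is placed on a real run queue.
 */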
/*
 * This routine is called by setbackdq/setfrontdq if the thread is
 * not loaded or loaded and on the swap queue.
 *
 * Thread state TS_SLEEP implies that a swapped thread
 * has been woken up and needs to be swapped in by the swapper.
 *
 * Thread state TS_RUN implies that the priority of a swapped
 * thread is being increased by its scheduling class (e.g. ts_update).
 */
static void
disp_swapped_setrun(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);

	switch (tp->t_state) {
	case TS_SLEEP:
		disp_lock_enter_high(&swapped_lock);
		/*
		 * Wake up sched immediately (i.e., next tick) if the
		 * thread priority is above maxclsyspri.
		 */
		if (DISP_PRIO(tp) > maxclsyspri)
			wake_sched = 1;
		else
			wake_sched_sec = 1;
		THREAD_RUN(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	case TS_RUN:				/* called from ts_update */
		break;
	default:
		panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
	}
}


/*
 * Make a thread give up its processor.  Find the processor on
 * which this thread is executing, and have that processor
 * preempt.
 */
void
cpu_surrender(kthread_t *tp)
{
	cpu_t	*cpup;
	int	max_pri;
	int	max_run_pri;
	klwp_t	*lwp;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_ONPROC)
		return;
	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
	if (max_pri < max_run_pri)
		max_pri = max_run_pri;

	cpup->cpu_runrun = 1;
	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
		cpup->cpu_kprunrun = 1;
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	DTRACE_SCHED1(surrender, kthread_t *, tp);

	/*
	 * Make the target thread take an excursion through trap()
	 * to do preempt() (unless we're already in trap or post_syscall,
	 * calling cpu_surrender via CL_TRAPRET).
	 */
	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
	    lwp->lwp_state != LWP_USER) {
		aston(tp);
		if (cpup != CPU)
			poke_cpu(cpup->cpu_id);
	}
	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
	    "cpu_surrender:tid %p cpu %p", tp, cpup);
}


/*
 * Commit to and ratify a scheduling decision
 */
/*ARGSUSED*/
static kthread_t *
disp_ratify(kthread_t *tp, disp_t *kpq)
{
	pri_t	tpri, maxpri;
	pri_t	maxkpri;
	cpu_t	*cpup;

	ASSERT(tp != NULL);
	/*
	 * Commit to, then ratify scheduling decision
	 */
	cpup = CPU;
	if (cpup->cpu_runrun != 0)
		cpup->cpu_runrun = 0;
	if (cpup->cpu_kprunrun != 0)
		cpup->cpu_kprunrun = 0;
	if (cpup->cpu_chosen_level != -1)
		cpup->cpu_chosen_level = -1;
	membar_enter();
	tpri = DISP_PRIO(tp);
	maxpri = cpup->cpu_disp->disp_maxrunpri;
	maxkpri = kpq->disp_maxrunpri;
	if (maxpri < maxkpri)
		maxpri = maxkpri;
	if (tpri < maxpri) {
		/*
		 * We should have done better; put this one back and
		 * indicate that the caller should try again.
		 */
		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
		thread_lock_high(tp);
		THREAD_TRANSITION(tp);
		setfrontdq(tp);
		thread_unlock_nopreempt(tp);

		tp = NULL;
	}
	return (tp);
}

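/*
 * Ratify sketch: disp_ratify() first commits to the choice (clears
 * cpu_runrun and cpu_kprunrun, publishes the dispatch decision) and only
 * then re-checks the local and kp queue maxima.  If a higher-priority
 * thread appeared in that window, the chosen thread is requeued with
 * setfrontdq() and the caller retries; disp() loops back to its
 * reschedule label in that case.
 */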
/*
 * See if there is any work on the dispatcher queues of other CPUs.
 * If there is, dequeue the best thread and return.
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	kthread_t	*retval = NULL;
	pri_t		maxpri;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		hint, leafidx;
	hrtime_t	stealtime;

	maxpri = -1;
	tcp = NULL;

	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	kpreempt_disable();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU.  This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	hint = leafidx = 0;

	/*
	 * This loop traverses the lpl hierarchy.  Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around this lpl if:
				 *
				 * - Something became runnable on the local
				 *   queue...which also ends our stroll around
				 *   the partition.
				 *
				 * - We happen across another idle CPU.
				 *   Since it is patrolling the next portion
				 *   of the lpl's list (assuming it's not
				 *   halted), move to the next higher level
				 *   of locality.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0) {
					kpreempt_enable();
					return (NULL);
				}
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED)
						continue;
					else
						break;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
				    CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					/*
					 * Don't steal threads that we attempted
					 * to steal recently until they're ready
					 * to be stolen again.
					 */
					stealtime = ocp->cpu_disp->disp_steal;
					if (stealtime == 0 ||
					    stealtime - gethrtime() <= 0) {
						maxpri = pri;
						tcp = ocp;
					} else {
						/*
						 * Don't update tcp, just set
						 * the retval to T_DONTSTEAL, so
						 * that if no acceptable CPUs
						 * are found the return value
						 * will be T_DONTSTEAL rather
						 * than NULL.
						 */
						retval = T_DONTSTEAL;
					}
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != hint);

		hint = leafidx = lpl->lpl_hint;
		if ((lpl = lpl->lpl_parent) != NULL)
			lpl_leaf = lpl->lpl_rset[hint];
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = disp_getbest(tcp->cpu_disp);
		if (tp == NULL || tp == T_DONTSTEAL)
			return (tp);
		return (disp_ratify(tp, kpq));
	}
	return (retval);
}


/*
 * disp_fix_unbound_pri()
 *	Determines the maximum priority of unbound threads on the queue.
 *	The priority is kept for the queue, but is only increased, never
 *	reduced unless some CPU is looking for something on that queue.
 *
 *	The priority argument is the known upper limit.
 *
 *	Perhaps this should be kept accurately, but that probably means
 *	separate bitmaps for bound and unbound threads.  Since only idled
 *	CPUs will have to do this recalculation, it seems better this way.
 */
static void
disp_fix_unbound_pri(disp_t *dp, pri_t pri)
{
	kthread_t	*tp;
	dispq_t		*dq;
	ulong_t		*dqactmap = dp->disp_qactmap;
	ulong_t		mapword;
	int		wx;

	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));

	ASSERT(pri >= 0);			/* checked by caller */

	/*
	 * Start the search at the next lowest priority below the supplied
	 * priority.  This depends on the bitmap implementation.
	 */
	do {
		wx = pri >> BT_ULSHIFT;		/* index of word in map */

		/*
		 * Form mask for all lower priorities in the word.
		 */
		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);

		/*
		 * Get next lower active priority.
		 */
		if (mapword != 0) {
			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
		} else if (wx > 0) {
			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
			if (pri < 0)
				break;
		} else {
			pri = -1;
			break;
		}

		/*
		 * Search the queue for unbound, runnable threads.
		 */
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
			tp = tp->t_link;
		}

		/*
		 * If a thread was found, set the priority and return.
		 */
	} while (tp == NULL);

	/*
	 * pri holds the maximum unbound thread priority or -1.
	 */
	if (dp->disp_max_unbound_pri != pri)
		dp->disp_max_unbound_pri = pri;
}

/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
	disp_t *dp;
	pri_t tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * Don't do anything if the thread is not bound, or
	 * currently not runnable or swapped out.

/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
	disp_t *dp;
	pri_t tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * Don't do anything if the thread is not bound, or
	 * currently not runnable or swapped out.
	 */
	if (tp->t_bound_cpu == NULL ||
	    tp->t_state != TS_RUN ||
	    tp->t_schedflag & TS_ON_SWAPQ)
		return;

	tpri = DISP_PRIO(tp);
	dp = tp->t_bound_cpu->cpu_disp;
	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	if (tpri > dp->disp_max_unbound_pri)
		dp->disp_max_unbound_pri = tpri;
}
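
/*
 * Editor's note (illustrative sketch, not part of the original dispatcher):
 * disp_max_unbound_pri is a hint that is raised eagerly (as in
 * disp_adjust_unbound_pri() above) but only lowered lazily, when an idle
 * CPU actually searches the queue.  The standalone sketch below shows that
 * "overestimate, fix up on demand" pattern; lazymax_t and its functions
 * are invented names used only for illustration.
 */
#if 0	/* standalone example; intentionally not compiled with this file */
#include <stdio.h>

#define	NPRI	8

typedef struct lazymax {
	int	counts[NPRI];	/* number of items at each priority */
	int	max_hint;	/* always >= true maximum, or -1 if empty */
} lazymax_t;

static void
lazymax_insert(lazymax_t *lm, int pri)
{
	lm->counts[pri]++;
	if (pri > lm->max_hint)
		lm->max_hint = pri;	/* raise the hint eagerly */
}

/* Remove and return the highest-priority item, or -1 if none. */
static int
lazymax_consume(lazymax_t *lm)
{
	int pri = lm->max_hint;

	while (pri >= 0 && lm->counts[pri] == 0)
		pri--;			/* lower the hint only when searching */
	lm->max_hint = pri;
	if (pri >= 0)
		lm->counts[pri]--;
	return (pri);
}

int
main(void)
{
	lazymax_t lm = { { 0 }, -1 };

	lazymax_insert(&lm, 2);
	lazymax_insert(&lm, 5);
	(void) printf("%d\n", lazymax_consume(&lm));	/* 5 */
	(void) printf("%d\n", lazymax_consume(&lm));	/* 2 */
	(void) printf("%d\n", lazymax_consume(&lm));	/* -1 */
	return (0);
}
#endif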

/*
 * disp_getbest()
 *	De-queue the highest priority unbound runnable thread.
 *	Returns with the thread unlocked and onproc but at splhigh (like disp()).
 *	Returns NULL if nothing found.
 *	Returns T_DONTSTEAL if the thread was not stealable,
 *	so that the caller will try again later.
 *
 *	Passed a pointer to a dispatch queue not associated with this CPU, and
 *	its type.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
	kthread_t	*tp;
	dispq_t		*dq;
	pri_t		pri;
	cpu_t		*cp, *tcp;
	boolean_t	allbound;

	disp_lock_enter(&dp->disp_lock);

	/*
	 * If there is nothing to run, or the CPU is in the middle of a
	 * context switch of the only thread, return NULL.
	 */
	tcp = dp->disp_cpu;
	cp = CPU;
	pri = dp->disp_max_unbound_pri;
	if (pri == -1 ||
	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
	    tcp->cpu_disp->disp_nrunnable == 1)) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	dq = &dp->disp_q[pri];


	/*
	 * Assume that all threads are bound on this queue, and change it
	 * later when we find out that it is not the case.
	 */
	allbound = B_TRUE;
	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
		hrtime_t now, nosteal, rqtime;

		/*
		 * Skip over bound threads which could be here even
		 * though disp_max_unbound_pri indicated this level.
		 */
		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
			continue;

		/*
		 * We've got some unbound threads on this queue, so turn
		 * the allbound flag off now.
		 */
		allbound = B_FALSE;

		/*
		 * The thread is a candidate for stealing from its run queue. We
		 * don't want to steal threads that became runnable just a
		 * moment ago. This improves CPU affinity for threads that get
		 * preempted for short periods of time and go back on the run
		 * queue.
		 *
		 * We want to let it stay on its run queue if it was only placed
		 * there recently and it was running on the same CPU before that
		 * to preserve its cache investment. For the thread to remain on
		 * its run queue, ALL of the following conditions must be
		 * satisfied:
		 *
		 * - the disp queue should not be the kernel preemption queue
		 * - delayed idle stealing should not be disabled
		 * - nosteal_nsec should be non-zero
		 * - it should run with user priority
		 * - it should be on the run queue of the CPU where it was
		 *   running before being placed on the run queue
		 * - it should be the only thread on the run queue (to prevent
		 *   extra scheduling latency for other threads)
		 * - it should sit on the run queue for less than per-chip
		 *   nosteal interval or global nosteal interval
		 * - in case of CPUs with shared cache it should sit in a run
		 *   queue of a CPU from a different chip
		 *
		 * The checks are arranged so that the ones that are faster are
		 * placed earlier.
		 */
		if (tcp == NULL ||
		    pri >= minclsyspri ||
		    tp->t_cpu != tcp)
			break;

		/*
		 * Steal immediately if, due to CMT processor architecture,
		 * migration between cp and tcp would incur no performance
		 * penalty.
		 */
		if (pg_cmt_can_migrate(cp, tcp))
			break;

		nosteal = nosteal_nsec;
		if (nosteal == 0)
			break;

		/*
		 * Calculate time spent sitting on run queue
		 */
		now = gethrtime_unscaled();
		rqtime = now - tp->t_waitrq;
		scalehrtime(&rqtime);

		/*
		 * Steal immediately if the time spent on this run queue is more
		 * than allowed nosteal delay.
		 *
		 * Negative rqtime check is needed here to avoid infinite
		 * stealing delays caused by unlikely but not impossible
		 * drifts between CPU times on different CPUs.
		 */
		if (rqtime > nosteal || rqtime < 0)
			break;

		DTRACE_PROBE4(nosteal, kthread_t *, tp,
		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
		scalehrtime(&now);
		/*
		 * Calculate when this thread becomes stealable
		 */
		now += (nosteal - rqtime);

		/*
		 * Calculate time when some thread becomes stealable
		 */
		if (now < dp->disp_steal)
			dp->disp_steal = now;
	}

	/*
	 * If there were no unbound threads on this queue, find the queue
	 * where they are and then return later. The value of
	 * disp_max_unbound_pri is not always accurate because it isn't
	 * reduced until another idle CPU looks for work.
	 */
	if (allbound)
		disp_fix_unbound_pri(dp, pri);

	/*
	 * If we reached the end of the queue and found no unbound threads
	 * then return NULL so that other CPUs will be considered.  If there
	 * are unbound threads but they cannot yet be stolen, then
	 * return T_DONTSTEAL and try again later.
	 */
	if (tp == NULL) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (allbound ? NULL : T_DONTSTEAL);
	}

	/*
	 * Found a runnable, unbound thread, so remove it from queue.
	 * dispdeq() requires that we have the thread locked, and we do,
	 * by virtue of holding the dispatch queue lock.  dispdeq() will
	 * put the thread in transition state, thereby dropping the dispq
	 * lock.
	 */

#ifdef DEBUG
	{
		int	thread_was_on_queue;

		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
		ASSERT(thread_was_on_queue);
	}

#else /* DEBUG */
	(void) dispdeq(tp);			/* drops disp_lock */
#endif /* DEBUG */

	/*
	 * Reset the disp_queue steal time - we do not know what the smallest
	 * value across the queue is.
	 */
	dp->disp_steal = 0;

	tp->t_schedflag |= TS_DONT_SWAP;

	/*
	 * Setup thread to run on the current CPU.
	 */
	tp->t_disp_queue = cp->cpu_disp;

	cp->cpu_dispthread = tp;		/* protected by spl only */
	cp->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));

	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);

	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */

	/*
	 * Return with spl high so that swtch() won't need to raise it.
	 * The disp_lock was dropped by dispdeq().
	 */

	return (tp);
}
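
/*
 * Editor's note (illustrative sketch, not part of the original dispatcher):
 * the nosteal logic above declines to steal a thread that has waited on
 * its old CPU's run queue for less than the nosteal window, but a negative
 * wait (possible with per-CPU time drift) must still permit the steal so
 * the delay can never become unbounded.  The standalone sketch below shows
 * that check in isolation; ok_to_steal() is an invented name.
 */
#if 0	/* standalone example; intentionally not compiled with this file */
#include <stdio.h>

/* Times are in arbitrary monotonic units (nanoseconds in the dispatcher). */
static int
ok_to_steal(long long now, long long waitrq, long long nosteal)
{
	long long rqtime = now - waitrq;

	if (nosteal == 0)
		return (1);	/* nosteal protection disabled */
	if (rqtime < 0)
		return (1);	/* clock drift; don't delay forever */
	return (rqtime > nosteal);
}

int
main(void)
{
	(void) printf("%d\n", ok_to_steal(1000, 990, 100));	/* 0: too fresh */
	(void) printf("%d\n", ok_to_steal(1000, 800, 100));	/* 1: waited long enough */
	(void) printf("%d\n", ok_to_steal(1000, 1200, 100));	/* 1: negative wait */
	return (0);
}
#endif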

/*
 * disp_bound_common() - common routine for higher level functions
 *	that check for bound threads under certain conditions.
 *	If 'threadlistsafe' is set then there is no need to acquire
 *	pidlock to stop the thread list from changing (eg, if
 *	disp_bound_* is called with cpus paused).
 */
static int
disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
{
	int		found = 0;
	kthread_t	*tp;

	ASSERT(flag);

	if (!threadlistsafe)
		mutex_enter(&pidlock);
	tp = curthread;		/* faster than allthreads */
	do {
		if (tp->t_state != TS_FREE) {
			/*
			 * If an interrupt thread is busy, but the
			 * caller doesn't care (i.e. BOUND_INTR is off),
			 * then just ignore it and continue through.
			 */
			if ((tp->t_flag & T_INTR_THREAD) &&
			    !(flag & BOUND_INTR))
				continue;

			/*
			 * Skip the idle thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_idle_thread)
				continue;

			/*
			 * Skip the pause thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_pause_thread)
				continue;

			if ((flag & BOUND_CPU) &&
			    (tp->t_bound_cpu == cp ||
			    tp->t_bind_cpu == cp->cpu_id ||
			    tp->t_weakbound_cpu == cp)) {
				found = 1;
				break;
			}

			if ((flag & BOUND_PARTITION) &&
			    (tp->t_cpupart == cp->cpu_part)) {
				found = 1;
				break;
			}
		}
	} while ((tp = tp->t_next) != curthread && found == 0);
	if (!threadlistsafe)
		mutex_exit(&pidlock);
	return (found);
}

/*
 * disp_bound_threads - return nonzero if threads are bound to the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_threads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
}

/*
 * disp_bound_anythreads - return nonzero if _any_ threads are bound
 * to the given processor, including interrupt threads.
 */
int
disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
}

/*
 * disp_bound_partition - return nonzero if threads are bound to the same
 * partition as the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_partition(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
}

/*
 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
 * threads to other CPUs.
 */
void
disp_cpu_inactive(cpu_t *cp)
{
	kthread_t	*tp;
	disp_t		*dp = cp->cpu_disp;
	dispq_t		*dq;
	pri_t		pri;
	int		wasonq;

	disp_lock_enter(&dp->disp_lock);
	while ((pri = dp->disp_max_unbound_pri) != -1) {
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		/*
		 * Skip over bound threads.
		 */
		while (tp != NULL && tp->t_bound_cpu != NULL) {
			tp = tp->t_link;
		}

		if (tp == NULL) {
			/* disp_max_unbound_pri must be inaccurate, so fix it */
			disp_fix_unbound_pri(dp, pri);
			continue;
		}

		wasonq = dispdeq(tp);		/* drops disp_lock */
		ASSERT(wasonq);
		ASSERT(tp->t_weakbound_cpu == NULL);

		setbackdq(tp);
		/*
		 * Called from cpu_offline:
		 *
		 * cp has already been removed from the list of active cpus
		 * and tp->t_cpu has been changed so there is no risk of
		 * tp ending up back on cp.
		 *
		 * Called from cpupart_move_cpu:
		 *
		 * The cpu has moved to a new cpupart.  Any threads that
		 * were on its dispatch queues before the move remain
		 * in the old partition and can't run in the new partition.
		 */
		ASSERT(tp->t_cpu != cp);
		thread_unlock(tp);

		disp_lock_enter(&dp->disp_lock);
	}
	disp_lock_exit(&dp->disp_lock);
}

/*
 * disp_lowpri_cpu - find CPU running the lowest priority thread.
 *	The hint passed in is used as a starting point so we don't favor
 *	CPU 0 or any other CPU.  The caller should pass in the most recently
 *	used CPU for the thread.
 *
 *	The lgroup and priority are used to determine the best CPU to run on
 *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
 *	the thread priority will indicate whether the thread will actually run
 *	there.  To pick the best CPU, the CPUs inside and outside of the given
 *	lgroup which are running the lowest priority threads are found.  The
 *	remote CPU is chosen only if the thread will not run locally on a CPU
 *	within the lgroup, but will run on the remote CPU.  If the thread
 *	cannot immediately run on any CPU, the best local CPU will be chosen.
 *
 *	The lpl specified also identifies the cpu partition from which
 *	disp_lowpri_cpu should select a CPU.
 *
 *	curcpu is used to indicate that disp_lowpri_cpu is being called on
 *	behalf of the current thread. (curthread is looking for a new cpu)
 *	In this case, cpu_dispatch_pri for this thread's cpu should be
 *	ignored.
 *
 *	If a cpu is the target of an offline request then try to avoid it.
 *
 *	This function must be called at either high SPL, or with preemption
 *	disabled, so that the "hint" CPU cannot be removed from the online
 *	CPU list while we are traversing it.
 */
cpu_t *
disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
{
	cpu_t	*bestcpu;
	cpu_t	*besthomecpu;
	cpu_t	*cp, *cpstart;

	pri_t	bestpri;
	pri_t	cpupri;

	klgrpset_t	done;
	klgrpset_t	cur_set;

	lpl_t		*lpl_iter, *lpl_leaf;
	int		i;

	/*
	 * Scan for a CPU currently running the lowest priority thread.
	 * Cannot get cpu_lock here because it is adaptive.
	 * We do not require lock on CPU list.
	 */
	ASSERT(hint != NULL);
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_ncpu > 0);

	/*
	 * First examine local CPUs.  Note that it's possible the hint CPU
	 * passed in is remote to the specified home lgroup.  If our priority
	 * isn't sufficient for us to run immediately at home, then examine
	 * CPUs remote to our home lgroup.
	 * We would like to give preference to CPUs closest to "home".
	 * If we can't find a CPU where we'll run at a given level
	 * of locality, we expand our search to include the next level.
	 */
	bestcpu = besthomecpu = NULL;
	klgrpset_clear(done);
	/* start with lpl we were passed */

	lpl_iter = lpl;

	do {

		bestpri = SHRT_MAX;
		klgrpset_clear(cur_set);

		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
			lpl_leaf = lpl_iter->lpl_rset[i];
			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
				continue;

			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);

			if (hint->cpu_lpl == lpl_leaf)
				cp = cpstart = hint;
			else
				cp = cpstart = lpl_leaf->lpl_cpus;

			do {
				if (cp == curcpu)
					cpupri = -1;
				else if (cp == cpu_inmotion)
					cpupri = SHRT_MAX;
				else
					cpupri = cp->cpu_dispatch_pri;
				if (cp->cpu_disp->disp_maxrunpri > cpupri)
					cpupri = cp->cpu_disp->disp_maxrunpri;
				if (cp->cpu_chosen_level > cpupri)
					cpupri = cp->cpu_chosen_level;
				if (cpupri < bestpri) {
					if (CPU_IDLING(cpupri)) {
						ASSERT((cp->cpu_flags &
						    CPU_QUIESCED) == 0);
						return (cp);
					}
					bestcpu = cp;
					bestpri = cpupri;
				}
			} while ((cp = cp->cpu_next_lpl) != cpstart);
		}

		if (bestcpu && (tpri > bestpri)) {
			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
			return (bestcpu);
		}
		if (besthomecpu == NULL)
			besthomecpu = bestcpu;
		/*
		 * Add the lgrps we just considered to the "done" set
		 */
		klgrpset_or(done, cur_set);

	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);

	/*
	 * The specified priority isn't high enough to run immediately
	 * anywhere, so just return the best CPU from the home lgroup.
	 */
	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
	return (besthomecpu);
}
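
/*
 * Editor's note (illustrative sketch, not part of the original dispatcher):
 * disp_lowpri_cpu() above searches locality levels from closest to
 * farthest, returns as soon as it finds a CPU the thread would actually
 * preempt, and otherwise falls back to the best CPU found at the home
 * (closest) level.  The standalone sketch below shows that widening-search
 * shape; pick_cpu(), NLEVELS and NCPUS_PER_LEVEL are invented names.
 */
#if 0	/* standalone example; intentionally not compiled with this file */
#include <stdio.h>
#include <limits.h>

#define	NLEVELS		3	/* locality levels, closest first */
#define	NCPUS_PER_LEVEL	4

/* load[l][c] is the priority currently running on CPU c at level l. */
static int
pick_cpu(int load[NLEVELS][NCPUS_PER_LEVEL], int tpri)
{
	int level, c;
	int homebest = -1;

	for (level = 0; level < NLEVELS; level++) {
		int best = -1;
		int bestload = INT_MAX;

		for (c = 0; c < NCPUS_PER_LEVEL; c++) {
			if (load[level][c] < bestload) {
				best = c;
				bestload = load[level][c];
			}
		}
		/* Use this level only if we would actually run right away. */
		if (tpri > bestload)
			return (level * NCPUS_PER_LEVEL + best);
		if (homebest == -1)
			homebest = best;	/* best CPU at the home level */
	}
	return (homebest);	/* can't run anywhere now; stay close to home */
}

int
main(void)
{
	int load[NLEVELS][NCPUS_PER_LEVEL] = {
		{ 60, 60, 60, 60 },	/* home level: everyone is busier */
		{ 60, 10, 60, 60 },	/* next level: CPU 5 is nearly idle */
		{ 60, 60, 60, 60 },
	};

	(void) printf("%d\n", pick_cpu(load, 30));	/* 5: runs remotely */
	(void) printf("%d\n", pick_cpu(load, 5));	/* 0: stays at home */
	return (0);
}
#endif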

/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor's specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
}

/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}

/*
 * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
 *	- t->t_cpu is not in this thread's assigned lgrp
 *	- the time since the thread last came off t->t_cpu exceeds the
 *	  rechoose time for this cpu (ignore this if t is curthread in
 *	  which case it's on CPU and t->t_disp_time is inaccurate)
 *	- t->t_cpu is presently the target of an offline or partition move
 *	  request
 */
static cpu_t *
cpu_choose(kthread_t *t, pri_t tpri)
{
	ASSERT(tpri < kpqpri);

	if ((((lbolt - t->t_disp_time) > rechoose_interval) &&
	    t != curthread) || t->t_cpu == cpu_inmotion) {
		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
	}

	/*
	 * Take a trip through disp_lowpri_cpu() if the thread was
	 * running outside its home lgroup
	 */
	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
		    (t == curthread) ? t->t_cpu : NULL));
	}
	return (t->t_cpu);
}
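
/*
 * Editor's note (illustrative sketch, not part of the original dispatcher):
 * cpu_choose() above keeps a thread on its previous CPU only while the
 * cache there is likely still warm (the thread came off that CPU within
 * rechoose_interval ticks), the CPU is still in the thread's home lgroup,
 * and the CPU is not being taken offline or moved.  The standalone sketch
 * below shows that decision in isolation; keep_last_cpu() is an invented
 * name.
 */
#if 0	/* standalone example; intentionally not compiled with this file */
#include <stdio.h>

static int
keep_last_cpu(long now_tick, long last_ran_tick, int rechoose_ticks,
    int cpu_in_home_lgrp, int cpu_in_motion)
{
	if (cpu_in_motion)
		return (0);	/* CPU is being offlined or moved */
	if (!cpu_in_home_lgrp)
		return (0);	/* last ran outside its home lgroup */
	if (now_tick - last_ran_tick > rechoose_ticks)
		return (0);	/* cache is probably cold; choose again */
	return (1);		/* warm cache: reuse the previous CPU */
}

int
main(void)
{
	(void) printf("%d\n", keep_last_cpu(100, 99, 3, 1, 0));	/* 1 */
	(void) printf("%d\n", keep_last_cpu(100, 50, 3, 1, 0));	/* 0 */
	return (0);
}
#endif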