/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/


#pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysinfo.h>
#include <sys/var.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/inline.h>
#include <sys/disp.h>
#include <sys/class.h>
#include <sys/bitmap.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/chip.h>
#include <sys/schedctl.h>
#include <sys/atomic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>

#include <vm/as.h>

#define	BOUND_CPU	0x1
#define	BOUND_PARTITION	0x2
#define	BOUND_INTR	0x4

/* Dispatch queue allocation structure and functions */
struct disp_queue_info {
	disp_t	*dp;
	dispq_t	*olddispq;
	dispq_t	*newdispq;
	ulong_t	*olddqactmap;
	ulong_t	*newdqactmap;
	int	oldnglobpris;
};
static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void	disp_dq_free(struct disp_queue_info *dptr);

/* platform-specific routine to call when processor is idle */
static void	generic_idle_cpu();
void		(*idle_cpu)() = generic_idle_cpu;

/* routines invoked when a CPU enters/exits the idle loop */
static void	idle_enter();
static void	idle_exit();

/* platform-specific routine to call when thread is enqueued */
static void	generic_enq_thread(cpu_t *, int);
void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;

pri_t	kpreemptpri;		/* priority where kernel preemption applies */
pri_t	upreemptpri = 0;	/* priority where normal preemption applies */
pri_t	intr_pri;		/* interrupt thread priority base level */

#define	KPQPRI	-1		/* pri where cpu affinity is dropped for kpq */
pri_t	kpqpri = KPQPRI;	/* can be set in /etc/system */
disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
int	nswapped;		/* total number of swapped threads */
void	disp_swapped_enq(kthread_t *tp);
static void	disp_swapped_setrun(kthread_t *tp);
static void	cpu_resched(cpu_t *cp, pri_t tpri);

/*
 * If this is set, only interrupt threads will cause kernel
 * preemptions.  This is done by changing the value of kpreemptpri.
 * kpreemptpri will either be the max sysclass pri + 1 or the min
 * interrupt pri.
 */
int	only_intr_kpreempt;

extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
static void setkpdq(kthread_t *tp, int borf);
#define	SETKP_BACK	0
#define	SETKP_FRONT	1
/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.  The interval is measured in clock ticks (hz).
 *
 * The platform may define a per physical processor adjustment of
 * this parameter.  For efficiency, the effective rechoose interval
 * (rechoose_interval + per chip adjustment) is maintained in the
 * cpu structures.  See cpu_choose().
 */
int	rechoose_interval = RECHOOSE_INTERVAL;
static cpu_t	*cpu_choose(kthread_t *, pri_t);

/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * sit on a run queue before it can be stolen by another CPU, to
 * reduce migrations.
 *
 * nosteal_nsec should be set by platform code to an appropriate value.
 */
hrtime_t nosteal_nsec = 0;

/*
 * Value of nosteal_nsec meaning that nosteal optimization should be disabled
 */
#define	NOSTEAL_DISABLED 1

id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */

disp_lock_t	transition_lock;	/* lock on transitioning threads */
disp_lock_t	stop_lock;		/* lock on stopped threads */

static void	cpu_dispqalloc(int numpris);

/*
 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 * a thread because it was sitting on its run queue for a very short
 * period of time.
 */
#define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */

static kthread_t	*disp_getwork(cpu_t *to);
static kthread_t	*disp_getbest(disp_t *from);
static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);

void	swtch_to(kthread_t *);

/*
 * dispatcher and scheduler initialization
 */

/*
 * disp_setup - Common code to calculate and allocate dispatcher
 *		variables and structures based on the maximum priority.
 */
static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
{
	pri_t	newnglobpris;

	ASSERT(MUTEX_HELD(&cpu_lock));

	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;

	if (newnglobpris > oldnglobpris) {
		/*
		 * Allocate new kp queues for each CPU partition.
		 */
		cpupart_kpqalloc(newnglobpris);

		/*
		 * Allocate new dispatch queues for each CPU.
		 */
		cpu_dispqalloc(newnglobpris);

		/*
		 * compute new interrupt thread base priority
		 */
		intr_pri = maxglobpri;
		if (only_intr_kpreempt) {
			kpreemptpri = intr_pri + 1;
			if (kpqpri == KPQPRI)
				kpqpri = kpreemptpri;
		}
		v.v_nglobpris = newnglobpris;
	}
}

/*
 * dispinit - Called to initialize all loaded classes and the
 *	      dispatcher framework.
 */
void
dispinit(void)
{
	id_t	cid;
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	maxglobpri = -1;

	/*
	 * Initialize transition lock, which will always be set.
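	 * It is acquired at high PIL below and never released; threads in
	 * the intermediate transition state point their dispatcher lock at
	 * it, so they appear locked until they are placed somewhere else.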
	 */
	DISP_LOCK_INIT(&transition_lock);
	disp_lock_enter_high(&transition_lock);
	DISP_LOCK_INIT(&stop_lock);

	mutex_enter(&cpu_lock);
	CPU->cpu_disp->disp_maxrunpri = -1;
	CPU->cpu_disp->disp_max_unbound_pri = -1;
	/*
	 * Initialize the default CPU partition.
	 */
	cpupart_initialize_default();
	/*
	 * Call the class specific initialization functions for
	 * all pre-installed schedulers.
	 *
	 * We pass the size of a class specific parameter
	 * buffer to each of the initialization functions
	 * to try to catch problems with backward compatibility
	 * of class modules.
	 *
	 * For example, a new class module running on an old system
	 * which didn't provide sufficiently large parameter buffers
	 * would be bad news.  Class initialization modules can check for
	 * this and take action if they detect a problem.
	 */

	for (cid = 0; cid < nclass; cid++) {
		sclass_t	*sc;

		sc = &sclass[cid];
		if (SCHED_INSTALLED(sc)) {
			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
			    &sc->cl_funcs);
			if (cl_maxglobpri > maxglobpri)
				maxglobpri = cl_maxglobpri;
		}
	}
	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
	if (kpqpri == KPQPRI)
		kpqpri = kpreemptpri;

	ASSERT(maxglobpri >= 0);
	disp_setup(maxglobpri, 0);

	mutex_exit(&cpu_lock);

	/*
	 * Get the default class ID; this may be later modified via
	 * dispadmin(1M).  This will load the class (normally TS) and that will
	 * call disp_add(), which is why we had to drop cpu_lock first.
	 */
	if (getcid(defaultclass, &defaultcid) != 0) {
		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
		    defaultclass);
	}
}

/*
 * disp_add - Called with class pointer to initialize the dispatcher
 *	      for a newly loaded class.
 */
void
disp_add(sclass_t *clp)
{
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	mutex_enter(&cpu_lock);
	/*
	 * Initialize the scheduler class.
	 */
	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
	if (cl_maxglobpri > maxglobpri)
		maxglobpri = cl_maxglobpri;

	/*
	 * Save old queue information.  Since we're initializing a
	 * newly loaded scheduling class, the size of the dispq may
	 * have changed.  We need to handle that here.
	 */
	disp_setup(maxglobpri, v.v_nglobpris);

	mutex_exit(&cpu_lock);
}


/*
 * For each CPU, allocate new dispatch queues
 * with the stated number of priorities.
 */
static void
cpu_dispqalloc(int numpris)
{
	cpu_t	*cpup;
	struct disp_queue_info	*disp_mem;
	int i, num;

	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_mem = kmem_zalloc(NCPU *
	    sizeof (struct disp_queue_info), KM_SLEEP);

	/*
	 * This routine must allocate all of the memory before stopping
	 * the cpus because it must not sleep in kmem_alloc while the
	 * CPUs are stopped.  Locks held by the stopped CPUs will not be
	 * released until they are restarted.
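	 * The work is therefore done in three phases: allocate everything
	 * (which may sleep), install the new queues while the CPUs are
	 * paused, and free the old queues only after the CPUs are running
	 * again.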
	 */
	i = 0;
	cpup = cpu_list;
	do {
		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
		i++;
		cpup = cpup->cpu_next;
	} while (cpup != cpu_list);
	num = i;

	pause_cpus(NULL);
	for (i = 0; i < num; i++)
		disp_dq_assign(&disp_mem[i], numpris);
	start_cpus();

	/*
	 * All of the memory must be freed after starting the cpus because
	 * we can not risk sleeping in kmem_free while the cpus are stopped.
	 */
	for (i = 0; i < num; i++)
		disp_dq_free(&disp_mem[i]);

	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
}

static void
disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
{
	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dptr->dp = dp;
}

static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
	disp_t	*dp;

	dp = dptr->dp;
	dptr->olddispq = dp->disp_q;
	dptr->olddqactmap = dp->disp_qactmap;
	dptr->oldnglobpris = dp->disp_npri;

	ASSERT(dptr->oldnglobpris < numpris);

	if (dptr->olddispq != NULL) {
		/*
		 * Use kcopy because bcopy is platform-specific
		 * and could block while the CPUs are paused.
		 */
		(void) kcopy(dptr->olddispq, dptr->newdispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
		    sizeof (long));
	}
	dp->disp_q = dptr->newdispq;
	dp->disp_qactmap = dptr->newdqactmap;
	dp->disp_q_limit = &dptr->newdispq[numpris];
	dp->disp_npri = numpris;
}

static void
disp_dq_free(struct disp_queue_info *dptr)
{
	if (dptr->olddispq != NULL)
		kmem_free(dptr->olddispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
	if (dptr->olddqactmap != NULL)
		kmem_free(dptr->olddqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
}

/*
 * For a newly created CPU, initialize the dispatch queue.
 * This is called before the CPU is known through cpu[] or on any lists.
 */
void
disp_cpu_init(cpu_t *cp)
{
	disp_t	*dp;
	dispq_t	*newdispq;
	ulong_t	*newdqactmap;

	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */

	if (cp == cpu0_disp.disp_cpu)
		dp = &cpu0_disp;
	else
		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
	bzero(dp, sizeof (disp_t));
	cp->cpu_disp = dp;
	dp->disp_cpu = cp;
	dp->disp_maxrunpri = -1;
	dp->disp_max_unbound_pri = -1;
	DISP_LOCK_INIT(&cp->cpu_thread_lock);
	/*
	 * Allocate memory for the dispatcher queue headers
	 * and the active queue bitmap.
	 */
	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dp->disp_q = newdispq;
	dp->disp_qactmap = newdqactmap;
	dp->disp_q_limit = &newdispq[v.v_nglobpris];
	dp->disp_npri = v.v_nglobpris;
}

void
disp_cpu_fini(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_kp_free(cp->cpu_disp);
	if (cp->cpu_disp != &cpu0_disp)
		kmem_free(cp->cpu_disp, sizeof (disp_t));
}

/*
 * Allocate new, larger kpreempt dispatch queue to replace the old one.
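 * This is a no-op if the existing queue already has at least npri
 * priority levels (npri <= dq->disp_npri).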
 */
void
disp_kp_alloc(disp_t *dq, pri_t npri)
{
	struct disp_queue_info	mem_info;

	if (npri > dq->disp_npri) {
		/*
		 * Allocate memory for the new array.
		 */
		disp_dq_alloc(&mem_info, npri, dq);

		/*
		 * We need to copy the old structures to the new
		 * and free the old.
		 */
		disp_dq_assign(&mem_info, npri);
		disp_dq_free(&mem_info);
	}
}

/*
 * Free dispatch queue.
 * Used for the kpreempt queues for a removed CPU partition and
 * for the per-CPU queues of deleted CPUs.
 */
void
disp_kp_free(disp_t *dq)
{
	struct disp_queue_info	mem_info;

	mem_info.olddispq = dq->disp_q;
	mem_info.olddqactmap = dq->disp_qactmap;
	mem_info.oldnglobpris = dq->disp_npri;
	disp_dq_free(&mem_info);
}

/*
 * End dispatcher and scheduler initialization.
 */

/*
 * See if there's anything to do other than remain idle.
 * Return non-zero if there is.
 *
 * This function must be called with high spl, or with
 * kernel preemption disabled to prevent the partition's
 * active cpu list from changing while being traversed.
 */
int
disp_anywork(void)
{
	cpu_t		*cp = CPU;
	cpu_t		*ocp;

	if (cp->cpu_disp->disp_nrunnable != 0)
		return (1);

	if (!(cp->cpu_flags & CPU_OFFLINE)) {
		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
			return (1);

		/*
		 * Work can be taken from another CPU if:
		 *	- There is unbound work on the run queue
		 *	- That work isn't a thread undergoing a
		 *	  context switch on an otherwise empty queue.
		 *	- The CPU isn't running the idle loop.
		 */
		for (ocp = cp->cpu_next_part; ocp != cp;
		    ocp = ocp->cpu_next_part) {
			ASSERT(CPU_ACTIVE(ocp));

			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
			    ocp->cpu_disp->disp_nrunnable == 1) &&
			    ocp->cpu_dispatch_pri != -1)
				return (1);
		}
	}
	return (0);
}

/*
 * Called when CPU enters the idle loop
 */
static void
idle_enter()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Called when CPU exits the idle loop
 */
static void
idle_exit()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Idle loop.
 */
void
idle()
{
	struct cpu	*cp = CPU;		/* pointer to this CPU */
	kthread_t	*t;			/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter();	/* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
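		 * The platform-specific idle_cpu() routine is expected to
		 * wait cheaply (for example by halting the processor)
		 * rather than spinning.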
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			idle_exit();
			swtch();
		} else {
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/*
			 * If there was a thread but we couldn't steal
			 * it, then keep trying.
			 */
			if (t == T_DONTSTEAL)
				continue;
			idle_exit();
			restore_mstate(t);
			swtch_to(t);
		}
		idle_enter();	/* returned from swtch/swtch_to */
	}
}


/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues.  If panicking, turn
 * preemption off.
 */
void
preempt()
{
	kthread_t	*t = curthread;
	klwp_t		*lwp = ttolwp(curthread);

	if (panicstr)
		return;

	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

	thread_lock(t);

	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
		/*
		 * this thread has already been chosen to be run on
		 * another CPU.  Clear kprunrun on this CPU since we're
		 * already headed for swtch().
		 */
		CPU->cpu_kprunrun = 0;
		thread_unlock_nopreempt(t);
		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
	} else {
		if (lwp != NULL)
			lwp->lwp_ru.nivcsw++;
		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
		THREAD_TRANSITION(t);
		CL_PREEMPT(t);
		DTRACE_SCHED(preempt);
		thread_unlock_nopreempt(t);

		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

		swtch();		/* clears CPU->cpu_runrun via disp() */
	}
}

extern kthread_t *thread_unpin();

/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			restore_mstate(tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
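	 * In the offline case below we therefore skip disp_getwork() and
	 * simply select the idle thread when this CPU's own queue is empty.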
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL ||
			    tp == T_DONTSTEAL) {
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		restore_mstate(tp);
		return (tp);
	}

	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);
	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.  If nrunnable != 0, we just took
		 * the last runnable thread off the highest queue, so
		 * recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	/*
	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
	 * out this thread before we have a chance to run it.
	 * While running, it is protected against swapping by t_lock.
	 */
	tp->t_schedflag |= TS_DONT_SWAP;
	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	restore_mstate(tp);
	return (tp);
}

/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Setup and return
		 * the interrupted thread to be resumed.
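		 * thread_unpin() below hands back the pinned thread that
		 * this interrupt thread has been running on top of.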
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			if (t == cp->cpu_idle_thread) {
				CHIP_NRUNNING(cp->cpu_chip, 1);
			} else if (next == cp->cpu_idle_thread) {
				CHIP_NRUNNING(cp->cpu_chip, -1);
			}

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			cp->cpu_last_swtch = t->t_disp_time = lbolt;
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}

/*
 * swtch_from_zombie()
 *	Special case of swtch(), which allows checks for TS_ZOMB to be
 *	eliminated from normal resume.
 *	Find best runnable thread and run it.
 *	Called with the current thread zombied.
 *	Zombies cannot migrate, so CPU references are safe.
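 *	The chosen thread is handed to resume_from_zombie() rather than
 *	resume(), since the zombie caller never runs again.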
 */
void
swtch_from_zombie()
{
	kthread_t	*next;
	cpu_t		*cpu = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	ASSERT(curthread->t_state == TS_ZOMB);

	next = disp();			/* returns with spl high */
	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
	ASSERT(next != curthread);
	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	if (next == cpu->cpu_idle_thread)
		CHIP_NRUNNING(cpu->cpu_chip, -1);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume_from_zombie(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we certainly will not
	 * return here
	 */
}

#if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
static int
thread_on_queue(kthread_t *tp)
{
	cpu_t	*cp;
	cpu_t	*self;
	disp_t	*dp;

	self = CPU;
	cp = self->cpu_next_onln;
	dp = cp->cpu_disp;
	for (;;) {
		dispq_t		*dq;
		dispq_t		*eq;

		disp_lock_enter_high(&dp->disp_lock);
		for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
			kthread_t	*rp;

			ASSERT(dq->dq_last == NULL ||
			    dq->dq_last->t_link == NULL);
			for (rp = dq->dq_first; rp; rp = rp->t_link)
				if (tp == rp) {
					disp_lock_exit_high(&dp->disp_lock);
					return (1);
				}
		}
		disp_lock_exit_high(&dp->disp_lock);
		if (cp == NULL)
			break;
		if (cp == self) {
			/*
			 * All per-CPU queues have been searched; check the
			 * partition-wide kp queue last, then terminate the
			 * loop on the next pass (cp == NULL).
			 */
			dp = &self->cpu_part->cp_kp_queue;
			cp = NULL;
		} else {
			cp = cp->cpu_next_onln;
			dp = cp->cpu_disp;
		}
	}
	return (0);
}	/* end of thread_on_queue */
#else

#define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */

#endif  /* DEBUG */

/*
 * like swtch(), but switch to a specified thread taken from another CPU.
 *	called with spl high..
 */
void
swtch_to(kthread_t *next)
{
	cpu_t			*cp = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	/*
	 * Update context switch statistics.
	 */
	CPU_STATS_ADDQ(cp, sys, pswitch, 1);

	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	if (curthread == cp->cpu_idle_thread)
		CHIP_NRUNNING(cp->cpu_chip, 1);

	/* OK to steal anything left on run queue */
	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

	/* record last execution time */
	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we may not
	 * return here
	 */
}



#define	CPU_IDLING(pri)	((pri) == -1)

static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
	int	call_poke_cpu = 0;
	pri_t	cpupri = cp->cpu_dispatch_pri;

	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
			cp->cpu_runrun = 1;
			aston(cp->cpu_dispthread);
			if (tpri < kpreemptpri && cp != CPU)
				call_poke_cpu = 1;
		}
		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
			cp->cpu_kprunrun = 1;
			if (cp != CPU)
				call_poke_cpu = 1;
		}
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
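	 * The membar_enter() below ensures the flag stores are visible to
	 * other CPUs before we poke the target CPU.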
	 */
	membar_enter();

	if (call_poke_cpu)
		poke_cpu(cp->cpu_id);
}

/*
 * Routine used by setbackdq() to balance load across the physical
 * processors.  Returns a CPU of a lesser loaded chip in the lgroup
 * if balancing is necessary, or the "hint" CPU if it's not.
 *
 * - tp is the thread being enqueued
 * - cp is a hint CPU (chosen by cpu_choose()).
 * - curchip (if not NULL) is the chip on which the current thread
 *   is running.
 *
 * The thread lock for "tp" must be held while calling this routine.
 */
static cpu_t *
chip_balance(kthread_t *tp, cpu_t *cp, chip_t *curchip)
{
	int	chp_nrun, ochp_nrun;
	chip_t	*chp, *nchp;

	chp = cp->cpu_chip;
	chp_nrun = chp->chip_nrunning;

	if (chp == curchip)
		chp_nrun--;	/* Ignore curthread */

	/*
	 * If this chip isn't at all idle, then let
	 * run queue balancing do the work.
	 */
	if (chp_nrun == chp->chip_ncpu)
		return (cp);

	nchp = chp->chip_balance;
	do {
		if (nchp == chp ||
		    !CHIP_IN_CPUPART(nchp, tp->t_cpupart))
			continue;

		ochp_nrun = nchp->chip_nrunning;

		/*
		 * If the other chip is running fewer threads,
		 * or if it's running the same number of threads, but
		 * has more online logical CPUs, then choose to balance.
		 */
		if (chp_nrun > ochp_nrun ||
		    (chp_nrun == ochp_nrun &&
		    nchp->chip_ncpu > chp->chip_ncpu)) {
			cp = nchp->chip_cpus;
			nchp->chip_cpus = cp->cpu_next_chip;

			/*
			 * Find a CPU on the chip in the correct
			 * partition.  We know at least one exists
			 * because of the CHIP_IN_CPUPART() check above.
			 */
			while (cp->cpu_part != tp->t_cpupart)
				cp = cp->cpu_next_chip;
		}
		chp->chip_balance = nchp->chip_next_lgrp;
		break;
	} while ((nchp = nchp->chip_next_lgrp) != chp->chip_balance);

	ASSERT(CHIP_IN_CPUPART(cp->cpu_chip, tp->t_cpupart));
	return (cp);
}

/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq lengths
 * must match.  When the per-thread TS_RUNQMATCH flag is set, setbackdq() will
 * try to keep runqs perfectly balanced regardless of the thread priority.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)

/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
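 *
 * Unbound threads may be redirected to another CPU's queue here for
 * load balancing; bound threads always go to their (weakly) bound CPU.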
 */
void
setbackdq(kthread_t *tp)
{
	dispq_t	*dq;
	disp_t		*dp;
	chip_t		*curchip = NULL;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);

	if (tp->t_waitrq == 0) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	tpri = DISP_PRIO(tp);
	if (tp == curthread) {
		curchip = CPU->cpu_chip;
	}

	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_BACK);
			return;
		}
		/*
		 * Let cpu_choose suggest a CPU.
		 */
		cp = cpu_choose(tp, tpri);

		if (tp->t_cpupart == cp->cpu_part) {
			int	qlen;

			/*
			 * Select another CPU if we need
			 * to do some load balancing across the
			 * physical processors.
			 */
			if (CHIP_SHOULD_BALANCE(cp->cpu_chip))
				cp = chip_balance(tp, cp, curchip);

			/*
			 * Balance across the run queues
			 */
			qlen = RUNQ_LEN(cp, tpri);
			if (tpri >= RUNQ_MATCH_PRI &&
			    !(tp->t_schedflag & TS_RUNQMATCH))
				qlen -= RUNQ_MAX_DIFF;
			if (qlen > 0) {
				cpu_t	*newcp;

				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
					newcp = cp->cpu_next_part;
				} else if ((newcp = cp->cpu_next_lpl) == cp) {
					newcp = cp->cpu_next_part;
				}

				if (RUNQ_LEN(newcp, tpri) < qlen) {
					DTRACE_PROBE3(runq__balance,
					    kthread_t *, tp,
					    cpu_t *, cp, cpu_t *, newcp);
					cp = newcp;
				}
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		bound = 0;
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
		bound = 1;
	}
	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
	    tpri, cp, tp);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_first != NULL);
		dq->dq_last->t_link = tp;
		dq->dq_last = tp;
	} else {
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setfrontdq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);

	if (tp->t_waitrq == 0) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!tp->t_bound_cpu && !tp->t_weakbound_cpu) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_FRONT);
			return;
		}
		cp = tp->t_cpu;
		if (tp->t_cpupart == cp->cpu_part) {
			/*
			 * If we are of higher or equal priority than
			 * the highest priority runnable thread of
			 * the current CPU, just pick this CPU.  Otherwise
			 * let cpu_choose() select the CPU.  If this cpu
			 * is the target of an offline request then do not
			 * pick it - a thread_nomigrate() on the in motion
			 * cpu relies on this when it forces a preempt.
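			 * Keeping the thread's own CPU in that case avoids
			 * a needless migration, since the thread will
			 * preempt whatever is running there anyway.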
			 */
			if (tpri < cp->cpu_disp->disp_maxrunpri ||
			    cp == cpu_inmotion)
				cp = cpu_choose(tp, tpri);
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		bound = 0;
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
		bound = 1;
	}
	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
	tp->t_disp_queue = dp;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_last != NULL);
		tp->t_link = dq->dq_first;
		dq->dq_first = tp;
	} else {
		ASSERT(dq->dq_last == NULL);
		ASSERT(dq->dq_first == NULL);
		tp->t_link = NULL;
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
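			 * (Same logic as the corresponding check in
			 * setbackdq().)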
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

/*
 * Put a high-priority unbound thread on the kp queue
 */
static void
setkpdq(kthread_t *tp, int borf)
{
	dispq_t	*dq;
	disp_t	*dp;
	cpu_t	*cp;
	pri_t	tpri;

	tpri = DISP_PRIO(tp);

	dp = &tp->t_cpupart->cp_kp_queue;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	dp->disp_nrunnable++;
	dq = &dp->disp_q[tpri];

	if (dq->dq_sruncnt++ != 0) {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first != NULL);
			tp->t_link = NULL;
			dq->dq_last->t_link = tp;
			dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last != NULL);
			tp->t_link = dq->dq_first;
			dq->dq_first = tp;
		}
	} else {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first == NULL);
			ASSERT(dq->dq_last == NULL);
			dq->dq_first = dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last == NULL);
			ASSERT(dq->dq_first == NULL);
			tp->t_link = NULL;
			dq->dq_first = dq->dq_last = tp;
		}
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_max_unbound_pri)
			dp->disp_max_unbound_pri = tpri;
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
		}
	}

	cp = tp->t_cpu;
	if (tp->t_cpupart != cp->cpu_part) {
		/* migrate to a cpu in the new partition */
		cp = tp->t_cpupart->cp_cpulist;
	}
	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	if (cp->cpu_chosen_level < tpri)
		cp->cpu_chosen_level = tpri;
	cpu_resched(cp, tpri);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
	(*disp_enq_thread)(cp, 0);
}

/*
 * Remove a thread from the dispatcher queue if it is on it.
 * It is not an error if it is not found but we return whether
 * or not it was found in case the caller wants to check.
 */
int
dispdeq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	kthread_t	*rp;
	kthread_t	*trp;
	kthread_t	**ptp;
	int		tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_RUN)
		return (0);

	/*
	 * The thread is "swapped" or is on the swap queue and
	 * hence no longer on the run queue, so return true.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
		return (1);

	tpri = DISP_PRIO(tp);
	dp = tp->t_disp_queue;
	ASSERT(tpri < dp->disp_npri);
	dq = &dp->disp_q[tpri];
	ptp = &dq->dq_first;
	rp = *ptp;
	trp = NULL;

	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

	/*
	 * Search for thread in queue.
	 * Double links would simplify this at the expense of disp/setrun.
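	 * trp trails one element behind rp so that dq_last can be fixed
	 * up if the removed thread turns out to be the tail of the queue.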
	 */
	while (rp != tp && rp != NULL) {
		trp = rp;
		ptp = &trp->t_link;
		rp = trp->t_link;
	}

	if (rp == NULL) {
		panic("dispdeq: thread not on queue");
	}

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	if ((*ptp = rp->t_link) == NULL)
		dq->dq_last = trp;

	dp->disp_nrunnable--;
	if (--dq->dq_sruncnt == 0) {
		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else if (tpri == dp->disp_maxrunpri) {
			int ipri;

			ipri = bt_gethighbit(dp->disp_qactmap,
			    dp->disp_maxrunpri >> BT_ULSHIFT);
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
			dp->disp_maxrunpri = ipri;
		}
	}
	tp->t_link = NULL;
	THREAD_TRANSITION(tp);		/* put in intermediate state */
	return (1);
}


/*
 * dq_sruninc and dq_srundec are public functions for
 * incrementing/decrementing the sruncnts when a thread on
 * a dispatcher queue is made schedulable/unschedulable by
 * resetting the TS_LOAD flag.
 *
 * The caller MUST have the thread lock and therefore the dispatcher
 * queue lock so that the operation which changes
 * the flag, the operation that checks the status of the thread to
 * determine if it's on a disp queue AND the call to this function
 * are one atomic operation with respect to interrupts.
 */

/*
 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 */
void
dq_sruninc(kthread_t *t)
{
	ASSERT(t->t_state == TS_RUN);
	ASSERT(t->t_schedflag & TS_LOAD);

	THREAD_TRANSITION(t);
	setfrontdq(t);
}

/*
 * See comment on calling conventions above.
 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 */
void
dq_srundec(kthread_t *t)
{
	ASSERT(t->t_schedflag & TS_LOAD);

	(void) dispdeq(t);
	disp_swapped_enq(t);
}

/*
 * Change the dispatcher lock of thread to the "swapped_lock"
 * and return with thread lock still held.
 *
 * Called with thread_lock held, in transition state, and at high spl.
 */
void
disp_swapped_enq(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT(tp->t_schedflag & TS_LOAD);

	switch (tp->t_state) {
	case TS_RUN:
		disp_lock_enter_high(&swapped_lock);
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	case TS_ONPROC:
		disp_lock_enter_high(&swapped_lock);
		THREAD_TRANSITION(tp);
		wake_sched_sec = 1;		/* tell clock to wake sched */
		THREAD_SWAP(tp, &swapped_lock);	/* set TS_RUN state and lock */
		break;
	default:
		panic("disp_swapped: tp: %p bad t_state", (void *)tp);
	}
}

/*
 * This routine is called by setbackdq/setfrontdq if the thread is
 * not loaded or loaded and on the swap queue.
 *
 * Thread state TS_SLEEP implies that a swapped thread
 * has been woken up and needs to be swapped in by the swapper.
 *
 * Thread state TS_RUN implies that the priority of a swapped
 * thread is being increased by its scheduling class (e.g. ts_update).
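 * In the TS_RUN case there is nothing more to do here; the priority
 * change has already been recorded and the thread stays swapped.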
 */
static void
disp_swapped_setrun(kthread_t *tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);

	switch (tp->t_state) {
	case TS_SLEEP:
		disp_lock_enter_high(&swapped_lock);
		/*
		 * Wakeup sched immediately (i.e., next tick) if the
		 * thread priority is above maxclsyspri.
		 */
		if (DISP_PRIO(tp) > maxclsyspri)
			wake_sched = 1;
		else
			wake_sched_sec = 1;
		THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
		break;
	case TS_RUN:				/* called from ts_update */
		break;
	default:
		panic("disp_swapped_setrun: tp: %p bad t_state", tp);
	}
}


/*
 *	Make a thread give up its processor.  Find the processor on
 *	which this thread is executing, and have that processor
 *	preempt.
 */
void
cpu_surrender(kthread_t *tp)
{
	cpu_t	*cpup;
	int	max_pri;
	int	max_run_pri;
	klwp_t	*lwp;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_ONPROC)
		return;
	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
	if (max_pri < max_run_pri)
		max_pri = max_run_pri;

	cpup->cpu_runrun = 1;
	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
		cpup->cpu_kprunrun = 1;
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	DTRACE_SCHED1(surrender, kthread_t *, tp);

	/*
	 * Make the target thread take an excursion through trap()
	 * to do preempt() (unless we're already in trap or post_syscall,
	 * calling cpu_surrender via CL_TRAPRET).
	 */
	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
	    lwp->lwp_state != LWP_USER) {
		aston(tp);
		if (cpup != CPU)
			poke_cpu(cpup->cpu_id);
	}
	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
	    "cpu_surrender:tid %p cpu %p", tp, cpup);
}


/*
 * Commit to and ratify a scheduling decision
 */
/*ARGSUSED*/
static kthread_t *
disp_ratify(kthread_t *tp, disp_t *kpq)
{
	pri_t	tpri, maxpri;
	pri_t	maxkpri;
	cpu_t	*cpup;

	ASSERT(tp != NULL);
	/*
	 * Commit to, then ratify scheduling decision
	 */
	cpup = CPU;
	if (cpup->cpu_runrun != 0)
		cpup->cpu_runrun = 0;
	if (cpup->cpu_kprunrun != 0)
		cpup->cpu_kprunrun = 0;
	if (cpup->cpu_chosen_level != -1)
		cpup->cpu_chosen_level = -1;
	membar_enter();
	tpri = DISP_PRIO(tp);
	maxpri = cpup->cpu_disp->disp_maxrunpri;
	maxkpri = kpq->disp_maxrunpri;
	if (maxpri < maxkpri)
		maxpri = maxkpri;
	if (tpri < maxpri) {
		/*
		 * should have done better
		 * put this one back and indicate to try again
		 */
		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
		thread_lock_high(tp);
		THREAD_TRANSITION(tp);
		setfrontdq(tp);
		thread_unlock_nopreempt(tp);

		tp = NULL;
	}
	return (tp);
}

/*
 * See if there is any work on the dispatcher queue for other CPUs.
 * If there is, dequeue the best thread and return.
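 * Returns NULL if nothing suitable was found, or T_DONTSTEAL if the
 * only candidates were placed on their run queues too recently to be
 * stolen yet.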
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	kthread_t	*retval = NULL;
	pri_t		maxpri;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		hint, leafidx;
	hrtime_t	stealtime;

	maxpri = -1;
	tcp = NULL;

	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	kpreempt_disable();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU. This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	hint = leafidx = 0;

	/*
	 * This loop traverses the lpl hierarchy. Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around the partition if:
				 *
				 * - Something became runnable on the local
				 *   queue
				 *
				 * - We're at the broadest level of locality and
				 *   we happen across another idle CPU. At the
				 *   highest level of locality, all CPUs will
				 *   walk the partition's CPUs in the same
				 *   order, so we can end our stroll taking
				 *   comfort in knowing the other idle CPU is
				 *   already covering the next portion of the
				 *   list.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0)
					break;
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED)
						continue;
					else if (lpl->lpl_parent == NULL)
						break;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
				    CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					/*
					 * Don't steal threads that we
					 * attempted to steal very recently,
					 * until they're ready to be stolen
					 * again.
					 */
					stealtime = ocp->cpu_disp->disp_steal;
					if (stealtime == 0 ||
					    stealtime - gethrtime() <= 0) {
						maxpri = pri;
						tcp = ocp;
					} else {
						/*
						 * Don't update tcp, just set
						 * the retval to T_DONTSTEAL, so
						 * that if no acceptable CPUs
						 * are found the return value
						 * will be T_DONTSTEAL rather
						 * than NULL.
						 */
						retval = T_DONTSTEAL;
					}
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != hint);

		hint = leafidx = lpl->lpl_hint;
		if ((lpl = lpl->lpl_parent) != NULL)
			lpl_leaf = lpl->lpl_rset[hint];
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = disp_getbest(tcp->cpu_disp);
		if (tp == NULL || tp == T_DONTSTEAL)
			return (tp);
		return (disp_ratify(tp, kpq));
	}
	return (retval);
}


/*
 * disp_fix_unbound_pri()
 *	Determines the maximum priority of unbound threads on the queue.
 *	The priority is kept for the queue, but is only increased, never
 *	reduced unless some CPU is looking for something on that queue.
 *
 *	The priority argument is the known upper limit.
 *
 *	Perhaps this should be kept accurately, but that probably means
 *	separate bitmaps for bound and unbound threads.  Since only idled
 *	CPUs will have to do this recalculation, it seems better this way.
 */
static void
disp_fix_unbound_pri(disp_t *dp, pri_t pri)
{
	kthread_t	*tp;
	dispq_t		*dq;
	ulong_t		*dqactmap = dp->disp_qactmap;
	ulong_t		mapword;
	int		wx;

	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));

	ASSERT(pri >= 0);			/* checked by caller */

	/*
	 * Start the search at the next lowest priority below the supplied
	 * priority.  This depends on the bitmap implementation.
	 */
	do {
		wx = pri >> BT_ULSHIFT;		/* index of word in map */

		/*
		 * Form mask for all lower priorities in the word.
		 */
		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);

		/*
		 * Get next lower active priority.
		 */
		if (mapword != 0) {
			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
		} else if (wx > 0) {
			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
			if (pri < 0)
				break;
		} else {
			pri = -1;
			break;
		}

		/*
		 * Search the queue for unbound, runnable threads.
		 */
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
			tp = tp->t_link;
		}

		/*
		 * If a thread was found, set the priority and return.
		 */
	} while (tp == NULL);

	/*
	 * pri holds the maximum unbound thread priority or -1.
	 */
	if (dp->disp_max_unbound_pri != pri)
		dp->disp_max_unbound_pri = pri;
}

/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
	disp_t *dp;
	pri_t tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * Don't do anything if the thread is not bound, or
	 * currently not runnable or swapped out.
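	 * (If t_bound_cpu is already NULL there is no per-CPU queue whose
	 * disp_max_unbound_pri could need raising on this thread's behalf.)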
	 */
	if (tp->t_bound_cpu == NULL ||
	    tp->t_state != TS_RUN ||
	    tp->t_schedflag & TS_ON_SWAPQ)
		return;

	tpri = DISP_PRIO(tp);
	dp = tp->t_bound_cpu->cpu_disp;
	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	if (tpri > dp->disp_max_unbound_pri)
		dp->disp_max_unbound_pri = tpri;
}

/*
 * disp_getbest()
 *	De-queue the highest priority unbound runnable thread.
 *	Returns with the thread unlocked and onproc but at splhigh
 *	(like disp()).
 *	Returns NULL if nothing found.
 *	Returns T_DONTSTEAL if the thread was not stealable,
 *	so that the caller will try again later.
 *
 *	Passed a pointer to a dispatch queue not associated with this CPU.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
	kthread_t	*tp;
	dispq_t		*dq;
	pri_t		pri;
	cpu_t		*cp, *tcp;
	boolean_t	allbound;

	disp_lock_enter(&dp->disp_lock);

	/*
	 * If there is nothing to run, or the CPU is in the middle of a
	 * context switch of the only thread, return NULL.
	 */
	tcp = dp->disp_cpu;
	cp = CPU;
	pri = dp->disp_max_unbound_pri;
	if (pri == -1 ||
	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
	    tcp->cpu_disp->disp_nrunnable == 1)) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	dq = &dp->disp_q[pri];

	/*
	 * Assume that all threads are bound on this queue, and change it
	 * later when we find out that it is not the case.
	 */
	allbound = B_TRUE;
	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
		hrtime_t now, nosteal, rqtime;
		chip_type_t chtype;
		chip_t *chip;

		/*
		 * Skip over bound threads which could be here even
		 * though disp_max_unbound_pri indicated this level.
		 */
		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
			continue;

		/*
		 * We've got some unbound threads on this queue, so turn
		 * the allbound flag off now.
		 */
		allbound = B_FALSE;

		/*
		 * The thread is a candidate for stealing from its run queue.
		 * We don't want to steal threads that became runnable just a
		 * moment ago. This improves CPU affinity for threads that get
		 * preempted for short periods of time and go back on the run
		 * queue.
		 *
		 * We want to let it stay on its run queue if it was only placed
		 * there recently and it was running on the same CPU before that
		 * to preserve its cache investment. For the thread to remain on
		 * its run queue, ALL of the following conditions must be
		 * satisfied:
		 *
		 * - the disp queue should not be the kernel preemption queue
		 * - delayed idle stealing should not be disabled
		 * - nosteal_nsec should be non-zero
		 * - it should run with user priority
		 * - it should be on the run queue of the CPU where it was
		 *   running before being placed on the run queue
		 * - it should be the only thread on the run queue (to prevent
		 *   extra scheduling latency for other threads)
		 * - it should sit on the run queue for less than the per-chip
		 *   or global nosteal interval
		 * - in case of CPUs with shared cache it should sit in a run
		 *   queue of a CPU from a different chip
		 *
		 * The checks are arranged so that the ones that are faster are
		 * placed earlier.
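		 * If any of the conditions above does not hold, one of the
		 * break statements below fires and the thread is stolen
		 * immediately.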
		 */
		if (tcp == NULL ||
		    pri >= minclsyspri ||
		    tp->t_cpu != tcp)
			break;

		/*
		 * Steal immediately if the chip has shared cache and we are
		 * sharing the chip with the target thread's CPU.
		 */
		chip = tcp->cpu_chip;
		chtype = chip->chip_type;
		if ((chtype == CHIP_SMT || chtype == CHIP_CMP_SHARED_CACHE) &&
		    chip == cp->cpu_chip)
			break;

		/*
		 * Get the nosteal interval, either from the nosteal_nsec
		 * global variable or from the value specified by the chip.
		 */
		nosteal = nosteal_nsec ? nosteal_nsec : chip->chip_nosteal;
		if (nosteal == 0 || nosteal == NOSTEAL_DISABLED)
			break;

		/*
		 * Calculate time spent sitting on run queue
		 */
		now = gethrtime_unscaled();
		rqtime = now - tp->t_waitrq;
		scalehrtime(&rqtime);

		/*
		 * Steal immediately if the time spent on this run queue is
		 * more than the allowed nosteal delay.
		 *
		 * Negative rqtime check is needed here to avoid infinite
		 * stealing delays caused by unlikely but not impossible
		 * drifts between CPU times on different CPUs.
		 */
		if (rqtime > nosteal || rqtime < 0)
			break;

		DTRACE_PROBE4(nosteal, kthread_t *, tp,
		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
		scalehrtime(&now);
		/*
		 * Calculate when this thread becomes stealable
		 */
		now += (nosteal - rqtime);

		/*
		 * Calculate time when some thread becomes stealable
		 */
		if (now < dp->disp_steal)
			dp->disp_steal = now;
	}

	/*
	 * If there were no unbound threads on this queue, fix
	 * disp_max_unbound_pri before returning below.  Its value is not
	 * always accurate because it isn't reduced until another idle CPU
	 * looks for work.
	 */
	if (allbound)
		disp_fix_unbound_pri(dp, pri);

	/*
	 * If we reached the end of the queue and found no unbound threads
	 * then return NULL so that other CPUs will be considered.  If there
	 * are unbound threads but they cannot yet be stolen, then
	 * return T_DONTSTEAL and try again later.
	 */
	if (tp == NULL) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (allbound ? NULL : T_DONTSTEAL);
	}

	/*
	 * Found a runnable, unbound thread, so remove it from queue.
	 * dispdeq() requires that we have the thread locked, and we do,
	 * by virtue of holding the dispatch queue lock.  dispdeq() will
	 * put the thread in transition state, thereby dropping the dispq
	 * lock.
	 */

#ifdef DEBUG
	{
		int	thread_was_on_queue;

		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
		ASSERT(thread_was_on_queue);
	}

#else /* DEBUG */
	(void) dispdeq(tp);			/* drops disp_lock */
#endif /* DEBUG */

	/*
	 * Reset the disp_queue steal time - we do not know what the smallest
	 * value across the queue is.
	 */
	dp->disp_steal = 0;

	tp->t_schedflag |= TS_DONT_SWAP;

	/*
	 * Set up the thread to run on the current CPU.
	 */
	tp->t_disp_queue = cp->cpu_disp;

	cp->cpu_dispthread = tp;		/* protected by spl only */
	cp->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));

	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);

	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */

	/*
	 * Return with spl high so that swtch() won't need to raise it.
	 * The disp_lock was dropped by dispdeq().
	 */

	return (tp);
}

/*
 * disp_bound_common() - common routine for higher level functions
 *	that check for bound threads under certain conditions.
 *	If 'threadlistsafe' is set then there is no need to acquire
 *	pidlock to stop the thread list from changing (eg, if
 *	disp_bound_* is called with cpus paused).
 */
static int
disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
{
	int		found = 0;
	kthread_t	*tp;

	ASSERT(flag);

	if (!threadlistsafe)
		mutex_enter(&pidlock);
	tp = curthread;		/* faster than allthreads */
	do {
		if (tp->t_state != TS_FREE) {
			/*
			 * If an interrupt thread is busy, but the
			 * caller doesn't care (i.e. BOUND_INTR is off),
			 * then just ignore it and continue through.
			 */
			if ((tp->t_flag & T_INTR_THREAD) &&
			    !(flag & BOUND_INTR))
				continue;

			/*
			 * Skip the idle thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_idle_thread)
				continue;

			/*
			 * Skip the pause thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_pause_thread)
				continue;

			if ((flag & BOUND_CPU) &&
			    (tp->t_bound_cpu == cp ||
			    tp->t_bind_cpu == cp->cpu_id ||
			    tp->t_weakbound_cpu == cp)) {
				found = 1;
				break;
			}

			if ((flag & BOUND_PARTITION) &&
			    (tp->t_cpupart == cp->cpu_part)) {
				found = 1;
				break;
			}
		}
	} while ((tp = tp->t_next) != curthread && found == 0);
	if (!threadlistsafe)
		mutex_exit(&pidlock);
	return (found);
}

/*
 * disp_bound_threads - return nonzero if threads are bound to the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_threads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
}

/*
 * disp_bound_anythreads - return nonzero if _any_ threads are bound
 *	to the given processor, including interrupt threads.
 */
int
disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
}

/*
 * disp_bound_partition - return nonzero if threads are bound to the same
 *	partition as the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_partition(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
}

/*
 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
 *	threads to other CPUs.
 */
void
disp_cpu_inactive(cpu_t *cp)
{
	kthread_t	*tp;
	disp_t		*dp = cp->cpu_disp;
	dispq_t		*dq;
	pri_t		pri;
	int		wasonq;

	disp_lock_enter(&dp->disp_lock);
	while ((pri = dp->disp_max_unbound_pri) != -1) {
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		/*
		 * Skip over bound threads.
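		 * They are bound to this CPU and cannot be migrated; only
		 * unbound threads are moved below.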
		 */
		while (tp != NULL && tp->t_bound_cpu != NULL) {
			tp = tp->t_link;
		}

		if (tp == NULL) {
			/* disp_max_unbound_pri must be inaccurate, so fix it */
			disp_fix_unbound_pri(dp, pri);
			continue;
		}

		wasonq = dispdeq(tp);		/* drops disp_lock */
		ASSERT(wasonq);
		ASSERT(tp->t_weakbound_cpu == NULL);

		setbackdq(tp);
		/*
		 * Called from cpu_offline:
		 *
		 * cp has already been removed from the list of active cpus
		 * and tp->t_cpu has been changed so there is no risk of
		 * tp ending up back on cp.
		 *
		 * Called from cpupart_move_cpu:
		 *
		 * The cpu has moved to a new cpupart.  Any threads that
		 * were on its dispatch queues before the move remain
		 * in the old partition and can't run in the new partition.
		 */
		ASSERT(tp->t_cpu != cp);
		thread_unlock(tp);

		disp_lock_enter(&dp->disp_lock);
	}
	disp_lock_exit(&dp->disp_lock);
}

/*
 * disp_lowpri_cpu - find CPU running the lowest priority thread.
 *	The hint passed in is used as a starting point so we don't favor
 *	CPU 0 or any other CPU.  The caller should pass in the most recently
 *	used CPU for the thread.
 *
 *	The lgroup and priority are used to determine the best CPU to run on
 *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
 *	the thread priority will indicate whether the thread will actually run
 *	there.  To pick the best CPU, the CPUs inside and outside of the given
 *	lgroup which are running the lowest priority threads are found.  The
 *	remote CPU is chosen only if the thread will not run locally on a CPU
 *	within the lgroup, but will run on the remote CPU.  If the thread
 *	cannot immediately run on any CPU, the best local CPU will be chosen.
 *
 *	The lpl specified also identifies the cpu partition from which
 *	disp_lowpri_cpu should select a CPU.
 *
 *	curcpu is used to indicate that disp_lowpri_cpu is being called on
 *	behalf of the current thread.  (curthread is looking for a new cpu)
 *	In this case, cpu_dispatch_pri for this thread's cpu should be
 *	ignored.
 *
 *	If a cpu is the target of an offline request then try to avoid it.
 *
 *	This function must be called at either high SPL, or with preemption
 *	disabled, so that the "hint" CPU cannot be removed from the online
 *	CPU list while we are traversing it.
 */
cpu_t *
disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
{
	cpu_t	*bestcpu;
	cpu_t	*besthomecpu;
	cpu_t	*cp, *cpstart;

	pri_t	bestpri;
	pri_t	cpupri;

	klgrpset_t	done;
	klgrpset_t	cur_set;

	lpl_t		*lpl_iter, *lpl_leaf;
	int		i;

	/*
	 * Scan for a CPU currently running the lowest priority thread.
	 * Cannot get cpu_lock here because it is adaptive.
	 * We do not require lock on CPU list.
	 */
	ASSERT(hint != NULL);
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_ncpu > 0);

	/*
	 * First examine local CPUs.  Note that it's possible the hint CPU
	 * passed in is remote to the specified home lgroup.  If our priority
	 * isn't high enough to run immediately at home, then examine CPUs
	 * remote to our home lgroup.
	 * We would like to give preference to CPUs closest to "home".
	 * If we can't find a CPU where we'll run at a given level
	 * of locality, we expand our search to include the next level.
	 */
	bestcpu = besthomecpu = NULL;
	klgrpset_clear(done);
	/* start with lpl we were passed */

	lpl_iter = lpl;

	do {

		bestpri = SHRT_MAX;
		klgrpset_clear(cur_set);

		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
			lpl_leaf = lpl_iter->lpl_rset[i];
			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
				continue;

			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);

			if (hint->cpu_lpl == lpl_leaf)
				cp = cpstart = hint;
			else
				cp = cpstart = lpl_leaf->lpl_cpus;

			do {
				if (cp == curcpu)
					cpupri = -1;
				else if (cp == cpu_inmotion)
					cpupri = SHRT_MAX;
				else
					cpupri = cp->cpu_dispatch_pri;
				if (cp->cpu_disp->disp_maxrunpri > cpupri)
					cpupri = cp->cpu_disp->disp_maxrunpri;
				if (cp->cpu_chosen_level > cpupri)
					cpupri = cp->cpu_chosen_level;
				if (cpupri < bestpri) {
					if (CPU_IDLING(cpupri)) {
						ASSERT((cp->cpu_flags &
						    CPU_QUIESCED) == 0);
						return (cp);
					}
					bestcpu = cp;
					bestpri = cpupri;
				}
			} while ((cp = cp->cpu_next_lpl) != cpstart);
		}

		if (bestcpu && (tpri > bestpri)) {
			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
			return (bestcpu);
		}
		if (besthomecpu == NULL)
			besthomecpu = bestcpu;
		/*
		 * Add the lgrps we just considered to the "done" set
		 */
		klgrpset_or(done, cur_set);

	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);

	/*
	 * The specified priority isn't high enough to run immediately
	 * anywhere, so just return the best CPU from the home lgroup.
	 */
	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
	return (besthomecpu);
}

/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor's specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
}

/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}

/*
 * Select a CPU for this thread to run on.  Choose t->t_cpu unless:
 *	- t->t_cpu is not in this thread's assigned lgrp
 *	- the time since the thread last came off t->t_cpu exceeds the
 *	  rechoose time for this cpu (ignore this if t is curthread in
 *	  which case it's on CPU and t->t_disp_time is inaccurate)
 *	- t->t_cpu is presently the target of an offline or partition move
 *	  request
 */
static cpu_t *
cpu_choose(kthread_t *t, pri_t tpri)
{
	ASSERT(tpri < kpqpri);

	if ((((lbolt - t->t_disp_time) > t->t_cpu->cpu_rechoose) &&
	    t != curthread) || t->t_cpu == cpu_inmotion) {
		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri, NULL));
	}

	/*
	 * Take a trip through disp_lowpri_cpu() if the thread was
	 * running outside its home lgroup
	 */
	if (!klgrpset_ismember(t->t_lpl->lpl_lgrp->lgrp_set[LGRP_RSRC_CPU],
	    t->t_cpu->cpu_lpl->lpl_lgrpid)) {
		return (disp_lowpri_cpu(t->t_cpu, t->t_lpl, tpri,
		    (t == curthread) ? t->t_cpu : NULL));
	}
	return (t->t_cpu);
}
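
/*
 * Illustrative sketch only - not part of the dispatcher.  It shows how the
 * routines above fit together when a runnable thread 't' is being placed:
 * cpu_choose() either keeps t->t_cpu or falls back to disp_lowpri_cpu(),
 * roughly as:
 *
 *	cpu_t *target;
 *
 *	target = disp_lowpri_cpu(t->t_cpu, t->t_lpl, DISP_PRIO(t),
 *	    (t == curthread) ? t->t_cpu : NULL);
 *
 * The hint (t->t_cpu) keeps the search from always starting at CPU 0, and
 * curcpu is passed only when t is curthread so that the priority of the
 * thread about to give up that CPU is ignored.
 */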