1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright 2019 Joyent, Inc. 
28 */ 29 30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 31 /* All Rights Reserved */ 32 33 34 #include <sys/types.h> 35 #include <sys/param.h> 36 #include <sys/sysmacros.h> 37 #include <sys/signal.h> 38 #include <sys/user.h> 39 #include <sys/systm.h> 40 #include <sys/sysinfo.h> 41 #include <sys/var.h> 42 #include <sys/errno.h> 43 #include <sys/cmn_err.h> 44 #include <sys/debug.h> 45 #include <sys/inline.h> 46 #include <sys/disp.h> 47 #include <sys/class.h> 48 #include <sys/bitmap.h> 49 #include <sys/kmem.h> 50 #include <sys/cpuvar.h> 51 #include <sys/vtrace.h> 52 #include <sys/tnf.h> 53 #include <sys/cpupart.h> 54 #include <sys/lgrp.h> 55 #include <sys/pg.h> 56 #include <sys/cmt.h> 57 #include <sys/bitset.h> 58 #include <sys/schedctl.h> 59 #include <sys/atomic.h> 60 #include <sys/dtrace.h> 61 #include <sys/sdt.h> 62 #include <sys/archsystm.h> 63 #include <sys/smt.h> 64 65 #include <vm/as.h> 66 67 #define BOUND_CPU 0x1 68 #define BOUND_PARTITION 0x2 69 #define BOUND_INTR 0x4 70 71 /* Dispatch queue allocation structure and functions */ 72 struct disp_queue_info { 73 disp_t *dp; 74 dispq_t *olddispq; 75 dispq_t *newdispq; 76 ulong_t *olddqactmap; 77 ulong_t *newdqactmap; 78 int oldnglobpris; 79 }; 80 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris, 81 disp_t *dp); 82 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris); 83 static void disp_dq_free(struct disp_queue_info *dptr); 84 85 /* platform-specific routine to call when processor is idle */ 86 static void generic_idle_cpu(); 87 void (*idle_cpu)() = generic_idle_cpu; 88 89 /* routines invoked when a CPU enters/exits the idle loop */ 90 static void idle_enter(); 91 static void idle_exit(); 92 93 /* platform-specific routine to call when thread is enqueued */ 94 static void generic_enq_thread(cpu_t *, int); 95 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread; 96 97 pri_t kpreemptpri; /* priority where kernel preemption applies */ 98 pri_t upreemptpri = 0; /* 
priority where normal preemption applies */ 99 pri_t intr_pri; /* interrupt thread priority base level */ 100 101 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */ 102 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */ 103 disp_t cpu0_disp; /* boot CPU's dispatch queue */ 104 disp_lock_t swapped_lock; /* lock swapped threads and swap queue */ 105 int nswapped; /* total number of swapped threads */ 106 void disp_swapped_enq(kthread_t *tp); 107 static void disp_swapped_setrun(kthread_t *tp); 108 static void cpu_resched(cpu_t *cp, pri_t tpri); 109 110 /* 111 * If this is set, only interrupt threads will cause kernel preemptions. 112 * This is done by changing the value of kpreemptpri. kpreemptpri 113 * will either be the max sysclass pri + 1 or the min interrupt pri. 114 */ 115 int only_intr_kpreempt; 116 117 extern void set_idle_cpu(int cpun); 118 extern void unset_idle_cpu(int cpun); 119 static void setkpdq(kthread_t *tp, int borf); 120 #define SETKP_BACK 0 121 #define SETKP_FRONT 1 122 /* 123 * Parameter that determines how recently a thread must have run 124 * on the CPU to be considered loosely-bound to that CPU to reduce 125 * cold cache effects. The interval is in clock ticks. 126 */ 127 #define RECHOOSE_INTERVAL 3 128 int rechoose_interval = RECHOOSE_INTERVAL; 129 130 /* 131 * Parameter that determines how long (in nanoseconds) a thread must 132 * be sitting on a run queue before it can be stolen by another CPU 133 * to reduce migrations. The interval is in nanoseconds. 134 * 135 * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval() 136 * to an appropriate value. nosteal_nsec is set to NOSTEAL_UNINITIALIZED 137 * here indicating it is uninitialized. 138 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'. 
139 * 140 */ 141 #define NOSTEAL_UNINITIALIZED (-1) 142 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED; 143 extern void cmp_set_nosteal_interval(void); 144 145 id_t defaultcid; /* system "default" class; see dispadmin(1M) */ 146 147 disp_lock_t transition_lock; /* lock on transitioning threads */ 148 disp_lock_t stop_lock; /* lock on stopped threads */ 149 150 static void cpu_dispqalloc(int numpris); 151 152 /* 153 * This gets returned by disp_getwork/disp_getbest if we couldn't steal 154 * a thread because it was sitting on its run queue for a very short 155 * period of time. 156 */ 157 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */ 158 159 static kthread_t *disp_getwork(cpu_t *to); 160 static kthread_t *disp_getbest(disp_t *from); 161 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq); 162 163 void swtch_to(kthread_t *); 164 165 /* 166 * dispatcher and scheduler initialization 167 */ 168 169 /* 170 * disp_setup - Common code to calculate and allocate dispatcher 171 * variables and structures based on the maximum priority. 172 */ 173 static void 174 disp_setup(pri_t maxglobpri, pri_t oldnglobpris) 175 { 176 pri_t newnglobpris; 177 178 ASSERT(MUTEX_HELD(&cpu_lock)); 179 180 newnglobpris = maxglobpri + 1 + LOCK_LEVEL; 181 182 if (newnglobpris > oldnglobpris) { 183 /* 184 * Allocate new kp queues for each CPU partition. 185 */ 186 cpupart_kpqalloc(newnglobpris); 187 188 /* 189 * Allocate new dispatch queues for each CPU. 190 */ 191 cpu_dispqalloc(newnglobpris); 192 193 /* 194 * compute new interrupt thread base priority 195 */ 196 intr_pri = maxglobpri; 197 if (only_intr_kpreempt) { 198 kpreemptpri = intr_pri + 1; 199 if (kpqpri == KPQPRI) 200 kpqpri = kpreemptpri; 201 } 202 v.v_nglobpris = newnglobpris; 203 } 204 } 205 206 /* 207 * dispinit - Called to initialize all loaded classes and the 208 * dispatcher framework. 
209 */ 210 void 211 dispinit(void) 212 { 213 id_t cid; 214 pri_t maxglobpri; 215 pri_t cl_maxglobpri; 216 217 maxglobpri = -1; 218 219 /* 220 * Initialize transition lock, which will always be set. 221 */ 222 DISP_LOCK_INIT(&transition_lock); 223 disp_lock_enter_high(&transition_lock); 224 DISP_LOCK_INIT(&stop_lock); 225 226 mutex_enter(&cpu_lock); 227 CPU->cpu_disp->disp_maxrunpri = -1; 228 CPU->cpu_disp->disp_max_unbound_pri = -1; 229 230 /* 231 * Initialize the default CPU partition. 232 */ 233 cpupart_initialize_default(); 234 /* 235 * Call the class specific initialization functions for 236 * all pre-installed schedulers. 237 * 238 * We pass the size of a class specific parameter 239 * buffer to each of the initialization functions 240 * to try to catch problems with backward compatibility 241 * of class modules. 242 * 243 * For example a new class module running on an old system 244 * which didn't provide sufficiently large parameter buffers 245 * would be bad news. Class initialization modules can check for 246 * this and take action if they detect a problem. 247 */ 248 249 for (cid = 0; cid < nclass; cid++) { 250 sclass_t *sc; 251 252 sc = &sclass[cid]; 253 if (SCHED_INSTALLED(sc)) { 254 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ, 255 &sc->cl_funcs); 256 if (cl_maxglobpri > maxglobpri) 257 maxglobpri = cl_maxglobpri; 258 } 259 } 260 kpreemptpri = (pri_t)v.v_maxsyspri + 1; 261 if (kpqpri == KPQPRI) 262 kpqpri = kpreemptpri; 263 264 ASSERT(maxglobpri >= 0); 265 disp_setup(maxglobpri, 0); 266 267 mutex_exit(&cpu_lock); 268 269 /* 270 * Platform specific sticky scheduler setup. 271 */ 272 if (nosteal_nsec == NOSTEAL_UNINITIALIZED) 273 cmp_set_nosteal_interval(); 274 275 /* 276 * Get the default class ID; this may be later modified via 277 * dispadmin(1M). This will load the class (normally TS) and that will 278 * call disp_add(), which is why we had to drop cpu_lock first. 
279 */ 280 if (getcid(defaultclass, &defaultcid) != 0) { 281 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'", 282 defaultclass); 283 } 284 } 285 286 /* 287 * disp_add - Called with class pointer to initialize the dispatcher 288 * for a newly loaded class. 289 */ 290 void 291 disp_add(sclass_t *clp) 292 { 293 pri_t maxglobpri; 294 pri_t cl_maxglobpri; 295 296 mutex_enter(&cpu_lock); 297 /* 298 * Initialize the scheduler class. 299 */ 300 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1); 301 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs); 302 if (cl_maxglobpri > maxglobpri) 303 maxglobpri = cl_maxglobpri; 304 305 /* 306 * Save old queue information. Since we're initializing a 307 * new scheduling class which has just been loaded, then 308 * the size of the dispq may have changed. We need to handle 309 * that here. 310 */ 311 disp_setup(maxglobpri, v.v_nglobpris); 312 313 mutex_exit(&cpu_lock); 314 } 315 316 317 /* 318 * For each CPU, allocate new dispatch queues 319 * with the stated number of priorities. 320 */ 321 static void 322 cpu_dispqalloc(int numpris) 323 { 324 cpu_t *cpup; 325 struct disp_queue_info *disp_mem; 326 int i, num; 327 328 ASSERT(MUTEX_HELD(&cpu_lock)); 329 330 disp_mem = kmem_zalloc(NCPU * 331 sizeof (struct disp_queue_info), KM_SLEEP); 332 333 /* 334 * This routine must allocate all of the memory before stopping 335 * the cpus because it must not sleep in kmem_alloc while the 336 * CPUs are stopped. Locks they hold will not be freed until they 337 * are restarted. 
338 */ 339 i = 0; 340 cpup = cpu_list; 341 do { 342 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp); 343 i++; 344 cpup = cpup->cpu_next; 345 } while (cpup != cpu_list); 346 num = i; 347 348 pause_cpus(NULL, NULL); 349 for (i = 0; i < num; i++) 350 disp_dq_assign(&disp_mem[i], numpris); 351 start_cpus(); 352 353 /* 354 * I must free all of the memory after starting the cpus because 355 * I can not risk sleeping in kmem_free while the cpus are stopped. 356 */ 357 for (i = 0; i < num; i++) 358 disp_dq_free(&disp_mem[i]); 359 360 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info)); 361 } 362 363 static void 364 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp) 365 { 366 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP); 367 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) * 368 sizeof (long), KM_SLEEP); 369 dptr->dp = dp; 370 } 371 372 static void 373 disp_dq_assign(struct disp_queue_info *dptr, int numpris) 374 { 375 disp_t *dp; 376 377 dp = dptr->dp; 378 dptr->olddispq = dp->disp_q; 379 dptr->olddqactmap = dp->disp_qactmap; 380 dptr->oldnglobpris = dp->disp_npri; 381 382 ASSERT(dptr->oldnglobpris < numpris); 383 384 if (dptr->olddispq != NULL) { 385 /* 386 * Use kcopy because bcopy is platform-specific 387 * and could block while we might have paused the cpus. 
388 */ 389 (void) kcopy(dptr->olddispq, dptr->newdispq, 390 dptr->oldnglobpris * sizeof (dispq_t)); 391 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap, 392 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * 393 sizeof (long)); 394 } 395 dp->disp_q = dptr->newdispq; 396 dp->disp_qactmap = dptr->newdqactmap; 397 dp->disp_q_limit = &dptr->newdispq[numpris]; 398 dp->disp_npri = numpris; 399 } 400 401 static void 402 disp_dq_free(struct disp_queue_info *dptr) 403 { 404 if (dptr->olddispq != NULL) 405 kmem_free(dptr->olddispq, 406 dptr->oldnglobpris * sizeof (dispq_t)); 407 if (dptr->olddqactmap != NULL) 408 kmem_free(dptr->olddqactmap, 409 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long)); 410 } 411 412 /* 413 * For a newly created CPU, initialize the dispatch queue. 414 * This is called before the CPU is known through cpu[] or on any lists. 415 */ 416 void 417 disp_cpu_init(cpu_t *cp) 418 { 419 disp_t *dp; 420 dispq_t *newdispq; 421 ulong_t *newdqactmap; 422 423 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */ 424 425 if (cp == cpu0_disp.disp_cpu) 426 dp = &cpu0_disp; 427 else 428 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP); 429 bzero(dp, sizeof (disp_t)); 430 cp->cpu_disp = dp; 431 dp->disp_cpu = cp; 432 dp->disp_maxrunpri = -1; 433 dp->disp_max_unbound_pri = -1; 434 DISP_LOCK_INIT(&cp->cpu_thread_lock); 435 /* 436 * Allocate memory for the dispatcher queue headers 437 * and the active queue bitmap. 
438 */ 439 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP); 440 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) * 441 sizeof (long), KM_SLEEP); 442 dp->disp_q = newdispq; 443 dp->disp_qactmap = newdqactmap; 444 dp->disp_q_limit = &newdispq[v.v_nglobpris]; 445 dp->disp_npri = v.v_nglobpris; 446 } 447 448 void 449 disp_cpu_fini(cpu_t *cp) 450 { 451 ASSERT(MUTEX_HELD(&cpu_lock)); 452 453 disp_kp_free(cp->cpu_disp); 454 if (cp->cpu_disp != &cpu0_disp) 455 kmem_free(cp->cpu_disp, sizeof (disp_t)); 456 } 457 458 /* 459 * Allocate new, larger kpreempt dispatch queue to replace the old one. 460 */ 461 void 462 disp_kp_alloc(disp_t *dq, pri_t npri) 463 { 464 struct disp_queue_info mem_info; 465 466 if (npri > dq->disp_npri) { 467 /* 468 * Allocate memory for the new array. 469 */ 470 disp_dq_alloc(&mem_info, npri, dq); 471 472 /* 473 * We need to copy the old structures to the new 474 * and free the old. 475 */ 476 disp_dq_assign(&mem_info, npri); 477 disp_dq_free(&mem_info); 478 } 479 } 480 481 /* 482 * Free dispatch queue. 483 * Used for the kpreempt queues for a removed CPU partition and 484 * for the per-CPU queues of deleted CPUs. 485 */ 486 void 487 disp_kp_free(disp_t *dq) 488 { 489 struct disp_queue_info mem_info; 490 491 mem_info.olddispq = dq->disp_q; 492 mem_info.olddqactmap = dq->disp_qactmap; 493 mem_info.oldnglobpris = dq->disp_npri; 494 disp_dq_free(&mem_info); 495 } 496 497 /* 498 * End dispatcher and scheduler initialization. 499 */ 500 501 /* 502 * See if there's anything to do other than remain idle. 503 * Return non-zero if there is. 504 * 505 * This function must be called with high spl, or with 506 * kernel preemption disabled to prevent the partition's 507 * active cpu list from changing while being traversed. 508 * 509 * This is essentially a simpler version of disp_getwork() 510 * to be called by CPUs preparing to "halt". 
511 */ 512 int 513 disp_anywork(void) 514 { 515 cpu_t *cp = CPU; 516 cpu_t *ocp; 517 volatile int *local_nrunnable = &cp->cpu_disp->disp_nrunnable; 518 519 if (!(cp->cpu_flags & CPU_OFFLINE)) { 520 if (CP_MAXRUNPRI(cp->cpu_part) >= 0) 521 return (1); 522 523 for (ocp = cp->cpu_next_part; ocp != cp; 524 ocp = ocp->cpu_next_part) { 525 ASSERT(CPU_ACTIVE(ocp)); 526 527 /* 528 * Something has appeared on the local run queue. 529 */ 530 if (*local_nrunnable > 0) 531 return (1); 532 /* 533 * If we encounter another idle CPU that will 534 * soon be trolling around through disp_anywork() 535 * terminate our walk here and let this other CPU 536 * patrol the next part of the list. 537 */ 538 if (ocp->cpu_dispatch_pri == -1 && 539 (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0) 540 return (0); 541 /* 542 * Work can be taken from another CPU if: 543 * - There is unbound work on the run queue 544 * - That work isn't a thread undergoing a 545 * - context switch on an otherwise empty queue. 546 * - The CPU isn't running the idle loop. 547 */ 548 if (ocp->cpu_disp->disp_max_unbound_pri != -1 && 549 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) && 550 ocp->cpu_disp->disp_nrunnable == 1) && 551 ocp->cpu_dispatch_pri != -1) 552 return (1); 553 } 554 } 555 return (0); 556 } 557 558 /* 559 * Called when CPU enters the idle loop 560 */ 561 static void 562 idle_enter() 563 { 564 cpu_t *cp = CPU; 565 566 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled()); 567 CPU_STATS_ADDQ(cp, sys, idlethread, 1); 568 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */ 569 } 570 571 /* 572 * Called when CPU exits the idle loop 573 */ 574 static void 575 idle_exit() 576 { 577 cpu_t *cp = CPU; 578 579 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled()); 580 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */ 581 } 582 583 /* 584 * Idle loop. 
585 */ 586 void 587 idle() 588 { 589 struct cpu *cp = CPU; /* pointer to this CPU */ 590 kthread_t *t; /* taken thread */ 591 592 idle_enter(); 593 594 /* 595 * Uniprocessor version of idle loop. 596 * Do this until notified that we're on an actual multiprocessor. 597 */ 598 while (ncpus == 1) { 599 if (cp->cpu_disp->disp_nrunnable == 0) { 600 (*idle_cpu)(); 601 continue; 602 } 603 idle_exit(); 604 swtch(); 605 606 idle_enter(); /* returned from swtch */ 607 } 608 609 /* 610 * Multiprocessor idle loop. 611 */ 612 for (;;) { 613 /* 614 * If CPU is completely quiesced by p_online(2), just wait 615 * here with minimal bus traffic until put online. 616 */ 617 while (cp->cpu_flags & CPU_QUIESCED) 618 (*idle_cpu)(); 619 620 if (cp->cpu_disp->disp_nrunnable != 0) { 621 idle_exit(); 622 swtch(); 623 } else { 624 if (cp->cpu_flags & CPU_OFFLINE) 625 continue; 626 if ((t = disp_getwork(cp)) == NULL) { 627 if (cp->cpu_chosen_level != -1) { 628 disp_t *dp = cp->cpu_disp; 629 disp_t *kpq; 630 631 disp_lock_enter(&dp->disp_lock); 632 /* 633 * Set kpq under lock to prevent 634 * migration between partitions. 635 */ 636 kpq = &cp->cpu_part->cp_kp_queue; 637 if (kpq->disp_maxrunpri == -1) 638 cp->cpu_chosen_level = -1; 639 disp_lock_exit(&dp->disp_lock); 640 } 641 (*idle_cpu)(); 642 continue; 643 } 644 /* 645 * If there was a thread but we couldn't steal 646 * it, then keep trying. 647 */ 648 if (t == T_DONTSTEAL) 649 continue; 650 idle_exit(); 651 swtch_to(t); 652 } 653 idle_enter(); /* returned from swtch/swtch_to */ 654 } 655 } 656 657 658 /* 659 * Preempt the currently running thread in favor of the highest 660 * priority thread. The class of the current thread controls 661 * where it goes on the dispatcher queues. If panicking, turn 662 * preemption off. 
663 */ 664 void 665 preempt() 666 { 667 kthread_t *t = curthread; 668 klwp_t *lwp = ttolwp(curthread); 669 670 if (panicstr) 671 return; 672 673 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start"); 674 675 thread_lock(t); 676 677 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) { 678 /* 679 * this thread has already been chosen to be run on 680 * another CPU. Clear kprunrun on this CPU since we're 681 * already headed for swtch(). 682 */ 683 CPU->cpu_kprunrun = 0; 684 thread_unlock_nopreempt(t); 685 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end"); 686 } else { 687 if (lwp != NULL) 688 lwp->lwp_ru.nivcsw++; 689 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1); 690 THREAD_TRANSITION(t); 691 CL_PREEMPT(t); 692 DTRACE_SCHED(preempt); 693 thread_unlock_nopreempt(t); 694 695 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end"); 696 697 swtch(); /* clears CPU->cpu_runrun via disp() */ 698 } 699 } 700 701 extern kthread_t *thread_unpin(); 702 703 /* 704 * disp() - find the highest priority thread for this processor to run, and 705 * set it in TS_ONPROC state so that resume() can be called to run it. 706 */ 707 static kthread_t * 708 disp() 709 { 710 cpu_t *cpup; 711 disp_t *dp; 712 kthread_t *tp; 713 dispq_t *dq; 714 int maxrunword; 715 pri_t pri; 716 disp_t *kpq; 717 718 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start"); 719 720 cpup = CPU; 721 /* 722 * Find the highest priority loaded, runnable thread. 723 */ 724 dp = cpup->cpu_disp; 725 726 reschedule: 727 /* 728 * If there is more important work on the global queue with a better 729 * priority than the maximum on this CPU, take it now. 
730 */ 731 kpq = &cpup->cpu_part->cp_kp_queue; 732 while ((pri = kpq->disp_maxrunpri) >= 0 && 733 pri >= dp->disp_maxrunpri && 734 (cpup->cpu_flags & CPU_OFFLINE) == 0 && 735 (tp = disp_getbest(kpq)) != NULL) { 736 if (disp_ratify(tp, kpq) != NULL) { 737 TRACE_1(TR_FAC_DISP, TR_DISP_END, 738 "disp_end:tid %p", tp); 739 return (tp); 740 } 741 } 742 743 disp_lock_enter(&dp->disp_lock); 744 pri = dp->disp_maxrunpri; 745 746 /* 747 * If there is nothing to run, look at what's runnable on other queues. 748 * Choose the idle thread if the CPU is quiesced. 749 * Note that CPUs that have the CPU_OFFLINE flag set can still run 750 * interrupt threads, which will be the only threads on the CPU's own 751 * queue, but cannot run threads from other queues. 752 */ 753 if (pri == -1) { 754 if (!(cpup->cpu_flags & CPU_OFFLINE)) { 755 disp_lock_exit(&dp->disp_lock); 756 if ((tp = disp_getwork(cpup)) == NULL || 757 tp == T_DONTSTEAL) { 758 tp = cpup->cpu_idle_thread; 759 (void) splhigh(); 760 THREAD_ONPROC(tp, cpup); 761 cpup->cpu_dispthread = tp; 762 cpup->cpu_dispatch_pri = -1; 763 cpup->cpu_runrun = cpup->cpu_kprunrun = 0; 764 cpup->cpu_chosen_level = -1; 765 } 766 } else { 767 disp_lock_exit_high(&dp->disp_lock); 768 tp = cpup->cpu_idle_thread; 769 THREAD_ONPROC(tp, cpup); 770 cpup->cpu_dispthread = tp; 771 cpup->cpu_dispatch_pri = -1; 772 cpup->cpu_runrun = cpup->cpu_kprunrun = 0; 773 cpup->cpu_chosen_level = -1; 774 } 775 TRACE_1(TR_FAC_DISP, TR_DISP_END, 776 "disp_end:tid %p", tp); 777 return (tp); 778 } 779 780 dq = &dp->disp_q[pri]; 781 tp = dq->dq_first; 782 783 ASSERT(tp != NULL); 784 ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */ 785 786 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp); 787 788 /* 789 * Found it so remove it from queue. 
790 */ 791 dp->disp_nrunnable--; 792 dq->dq_sruncnt--; 793 if ((dq->dq_first = tp->t_link) == NULL) { 794 ulong_t *dqactmap = dp->disp_qactmap; 795 796 ASSERT(dq->dq_sruncnt == 0); 797 dq->dq_last = NULL; 798 799 /* 800 * The queue is empty, so the corresponding bit needs to be 801 * turned off in dqactmap. If nrunnable != 0 just took the 802 * last runnable thread off the 803 * highest queue, so recompute disp_maxrunpri. 804 */ 805 maxrunword = pri >> BT_ULSHIFT; 806 dqactmap[maxrunword] &= ~BT_BIW(pri); 807 808 if (dp->disp_nrunnable == 0) { 809 dp->disp_max_unbound_pri = -1; 810 dp->disp_maxrunpri = -1; 811 } else { 812 int ipri; 813 814 ipri = bt_gethighbit(dqactmap, maxrunword); 815 dp->disp_maxrunpri = ipri; 816 if (ipri < dp->disp_max_unbound_pri) 817 dp->disp_max_unbound_pri = ipri; 818 } 819 } else { 820 tp->t_link = NULL; 821 } 822 823 /* 824 * Set TS_DONT_SWAP flag to prevent another processor from swapping 825 * out this thread before we have a chance to run it. 826 * While running, it is protected against swapping by t_lock. 827 */ 828 tp->t_schedflag |= TS_DONT_SWAP; 829 cpup->cpu_dispthread = tp; /* protected by spl only */ 830 cpup->cpu_dispatch_pri = pri; 831 ASSERT(pri == DISP_PRIO(tp)); 832 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */ 833 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */ 834 835 ASSERT(tp != NULL); 836 TRACE_1(TR_FAC_DISP, TR_DISP_END, 837 "disp_end:tid %p", tp); 838 839 if (disp_ratify(tp, kpq) == NULL) 840 goto reschedule; 841 842 return (tp); 843 } 844 845 /* 846 * swtch() 847 * Find best runnable thread and run it. 848 * Called with the current thread already switched to a new state, 849 * on a sleep queue, run queue, stopped, and not zombied. 850 * May be called at any spl level less than or equal to LOCK_LEVEL. 851 * Always drops spl to the base level (spl0()). 
852 */ 853 void 854 swtch() 855 { 856 kthread_t *t = curthread; 857 kthread_t *next; 858 cpu_t *cp; 859 860 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 861 862 if (t->t_flag & T_INTR_THREAD) 863 cpu_intr_swtch_enter(t); 864 865 if (t->t_intr != NULL) { 866 /* 867 * We are an interrupt thread. Setup and return 868 * the interrupted thread to be resumed. 869 */ 870 (void) splhigh(); /* block other scheduler action */ 871 cp = CPU; /* now protected against migration */ 872 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */ 873 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 874 CPU_STATS_ADDQ(cp, sys, intrblk, 1); 875 next = thread_unpin(); 876 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 877 resume_from_intr(next); 878 } else { 879 #ifdef DEBUG 880 if (t->t_state == TS_ONPROC && 881 t->t_disp_queue->disp_cpu == CPU && 882 t->t_preempt == 0) { 883 thread_lock(t); 884 ASSERT(t->t_state != TS_ONPROC || 885 t->t_disp_queue->disp_cpu != CPU || 886 t->t_preempt != 0); /* cannot migrate */ 887 thread_unlock_nopreempt(t); 888 } 889 #endif /* DEBUG */ 890 cp = CPU; 891 next = disp(); /* returns with spl high */ 892 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */ 893 894 /* OK to steal anything left on run queue */ 895 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL; 896 897 if (next != t) { 898 hrtime_t now; 899 900 now = gethrtime_unscaled(); 901 pg_ev_thread_swtch(cp, now, t, next); 902 903 /* 904 * If t was previously in the TS_ONPROC state, 905 * setfrontdq and setbackdq won't have set its t_waitrq. 906 * Since we now finally know that we're switching away 907 * from this thread, set its t_waitrq if it is on a run 908 * queue. 
909 */ 910 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) { 911 t->t_waitrq = now; 912 } 913 914 /* 915 * restore mstate of thread that we are switching to 916 */ 917 restore_mstate(next); 918 919 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 920 cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt(); 921 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 922 923 if (dtrace_vtime_active) 924 dtrace_vtime_switch(next); 925 926 resume(next); 927 /* 928 * The TR_RESUME_END and TR_SWTCH_END trace points 929 * appear at the end of resume(), because we may not 930 * return here 931 */ 932 } else { 933 if (t->t_flag & T_INTR_THREAD) 934 cpu_intr_swtch_exit(t); 935 /* 936 * Threads that enqueue themselves on a run queue defer 937 * setting t_waitrq. It is then either set in swtch() 938 * when the CPU is actually yielded, or not at all if it 939 * is remaining on the CPU. 940 * There is however a window between where the thread 941 * placed itself on a run queue, and where it selects 942 * itself in disp(), where a third party (eg. clock() 943 * doing tick processing) may have re-enqueued this 944 * thread, setting t_waitrq in the process. We detect 945 * this race by noticing that despite switching to 946 * ourself, our t_waitrq has been set, and should be 947 * cleared. 948 */ 949 if (t->t_waitrq != 0) 950 t->t_waitrq = 0; 951 952 pg_ev_thread_remain(cp, t); 953 954 DTRACE_SCHED(remain__cpu); 955 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end"); 956 (void) spl0(); 957 } 958 } 959 } 960 961 /* 962 * swtch_from_zombie() 963 * Special case of swtch(), which allows checks for TS_ZOMB to be 964 * eliminated from normal resume. 965 * Find best runnable thread and run it. 966 * Called with the current thread zombied. 967 * Zombies cannot migrate, so CPU references are safe. 
968 */ 969 void 970 swtch_from_zombie() 971 { 972 kthread_t *next; 973 cpu_t *cpu = CPU; 974 975 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 976 977 ASSERT(curthread->t_state == TS_ZOMB); 978 979 next = disp(); /* returns with spl high */ 980 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */ 981 CPU_STATS_ADDQ(CPU, sys, pswitch, 1); 982 ASSERT(next != curthread); 983 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 984 985 pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next); 986 987 restore_mstate(next); 988 989 if (dtrace_vtime_active) 990 dtrace_vtime_switch(next); 991 992 resume_from_zombie(next); 993 /* 994 * The TR_RESUME_END and TR_SWTCH_END trace points 995 * appear at the end of resume(), because we certainly will not 996 * return here 997 */ 998 } 999 1000 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint)) 1001 1002 /* 1003 * search_disp_queues() 1004 * Search the given dispatch queues for thread tp. 1005 * Return 1 if tp is found, otherwise return 0. 1006 */ 1007 static int 1008 search_disp_queues(disp_t *dp, kthread_t *tp) 1009 { 1010 dispq_t *dq; 1011 dispq_t *eq; 1012 1013 disp_lock_enter_high(&dp->disp_lock); 1014 1015 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) { 1016 kthread_t *rp; 1017 1018 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL); 1019 1020 for (rp = dq->dq_first; rp; rp = rp->t_link) 1021 if (tp == rp) { 1022 disp_lock_exit_high(&dp->disp_lock); 1023 return (1); 1024 } 1025 } 1026 disp_lock_exit_high(&dp->disp_lock); 1027 1028 return (0); 1029 } 1030 1031 /* 1032 * thread_on_queue() 1033 * Search all per-CPU dispatch queues and all partition-wide kpreempt 1034 * queues for thread tp. Return 1 if tp is found, otherwise return 0. 1035 */ 1036 static int 1037 thread_on_queue(kthread_t *tp) 1038 { 1039 cpu_t *cp; 1040 struct cpupart *part; 1041 1042 ASSERT(getpil() >= DISP_LEVEL); 1043 1044 /* 1045 * Search the per-CPU dispatch queues for tp. 
1046 */ 1047 cp = CPU; 1048 do { 1049 if (search_disp_queues(cp->cpu_disp, tp)) 1050 return (1); 1051 } while ((cp = cp->cpu_next_onln) != CPU); 1052 1053 /* 1054 * Search the partition-wide kpreempt queues for tp. 1055 */ 1056 part = CPU->cpu_part; 1057 do { 1058 if (search_disp_queues(&part->cp_kp_queue, tp)) 1059 return (1); 1060 } while ((part = part->cp_next) != CPU->cpu_part); 1061 1062 return (0); 1063 } 1064 1065 #else 1066 1067 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */ 1068 1069 #endif /* DEBUG */ 1070 1071 /* 1072 * like swtch(), but switch to a specified thread taken from another CPU. 1073 * called with spl high.. 1074 */ 1075 void 1076 swtch_to(kthread_t *next) 1077 { 1078 cpu_t *cp = CPU; 1079 hrtime_t now; 1080 1081 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 1082 1083 /* 1084 * Update context switch statistics. 1085 */ 1086 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 1087 1088 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 1089 1090 now = gethrtime_unscaled(); 1091 pg_ev_thread_swtch(cp, now, curthread, next); 1092 1093 /* OK to steal anything left on run queue */ 1094 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL; 1095 1096 /* record last execution time */ 1097 cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt(); 1098 1099 /* 1100 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq 1101 * won't have set its t_waitrq. Since we now finally know that we're 1102 * switching away from this thread, set its t_waitrq if it is on a run 1103 * queue. 
1104 */ 1105 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) { 1106 curthread->t_waitrq = now; 1107 } 1108 1109 /* restore next thread to previously running microstate */ 1110 restore_mstate(next); 1111 1112 if (dtrace_vtime_active) 1113 dtrace_vtime_switch(next); 1114 1115 resume(next); 1116 /* 1117 * The TR_RESUME_END and TR_SWTCH_END trace points 1118 * appear at the end of resume(), because we may not 1119 * return here 1120 */ 1121 } 1122 1123 static void 1124 cpu_resched(cpu_t *cp, pri_t tpri) 1125 { 1126 int call_poke_cpu = 0; 1127 pri_t cpupri = cp->cpu_dispatch_pri; 1128 1129 if (cpupri != CPU_IDLE_PRI && cpupri < tpri) { 1130 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED, 1131 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri); 1132 if (tpri >= upreemptpri && cp->cpu_runrun == 0) { 1133 cp->cpu_runrun = 1; 1134 aston(cp->cpu_dispthread); 1135 if (tpri < kpreemptpri && cp != CPU) 1136 call_poke_cpu = 1; 1137 } 1138 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) { 1139 cp->cpu_kprunrun = 1; 1140 if (cp != CPU) 1141 call_poke_cpu = 1; 1142 } 1143 } 1144 1145 /* 1146 * Propagate cpu_runrun, and cpu_kprunrun to global visibility. 1147 */ 1148 membar_enter(); 1149 1150 if (call_poke_cpu) 1151 poke_cpu(cp->cpu_id); 1152 } 1153 1154 /* 1155 * setbackdq() keeps runqs balanced such that the difference in length 1156 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF. 1157 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths 1158 * must match. When per-thread TS_RUNQMATCH flag is set, setbackdq() will 1159 * try to keep runqs perfectly balanced regardless of the thread priority. 1160 */ 1161 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */ 1162 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */ 1163 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt) 1164 1165 /* 1166 * Macro that evaluates to true if it is likely that the thread has cache 1167 * warmth. 
This is based on the amount of time that has elapsed since the 1168 * thread last ran. If that amount of time is less than "rechoose_interval" 1169 * ticks, then we decide that the thread has enough cache warmth to warrant 1170 * some affinity for t->t_cpu. 1171 */ 1172 #define THREAD_HAS_CACHE_WARMTH(thread) \ 1173 ((thread == curthread) || \ 1174 ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval)) 1175 /* 1176 * Put the specified thread on the back of the dispatcher 1177 * queue corresponding to its current priority. 1178 * 1179 * Called with the thread in transition, onproc or stopped state 1180 * and locked (transition implies locked) and at high spl. 1181 * Returns with the thread in TS_RUN state and still locked. 1182 */ 1183 void 1184 setbackdq(kthread_t *tp) 1185 { 1186 dispq_t *dq; 1187 disp_t *dp; 1188 cpu_t *cp; 1189 pri_t tpri; 1190 int bound; 1191 boolean_t self; 1192 1193 ASSERT(THREAD_LOCK_HELD(tp)); 1194 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0); 1195 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */ 1196 1197 /* 1198 * If thread is "swapped" or on the swap queue don't 1199 * queue it, but wake sched. 1200 */ 1201 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) { 1202 disp_swapped_setrun(tp); 1203 return; 1204 } 1205 1206 self = (tp == curthread); 1207 1208 if (tp->t_bound_cpu || tp->t_weakbound_cpu) 1209 bound = 1; 1210 else 1211 bound = 0; 1212 1213 tpri = DISP_PRIO(tp); 1214 if (ncpus == 1) 1215 cp = tp->t_cpu; 1216 else if (!bound) { 1217 if (tpri >= kpqpri) { 1218 setkpdq(tp, SETKP_BACK); 1219 return; 1220 } 1221 1222 /* 1223 * We'll generally let this thread continue to run where 1224 * it last ran...but will consider migration if: 1225 * - The thread probably doesn't have much cache warmth. 1226 * - SMT exclusion would prefer us to run elsewhere 1227 * - The CPU where it last ran is the target of an offline 1228 * request. 1229 * - The thread last ran outside its home lgroup. 
1230 */ 1231 if ((!THREAD_HAS_CACHE_WARMTH(tp)) || 1232 !smt_should_run(tp, tp->t_cpu) || 1233 (tp->t_cpu == cpu_inmotion) || 1234 !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) { 1235 cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri); 1236 } else { 1237 cp = tp->t_cpu; 1238 } 1239 1240 if (tp->t_cpupart == cp->cpu_part) { 1241 int qlen; 1242 1243 /* 1244 * Perform any CMT load balancing 1245 */ 1246 cp = cmt_balance(tp, cp); 1247 1248 /* 1249 * Balance across the run queues 1250 */ 1251 qlen = RUNQ_LEN(cp, tpri); 1252 if (tpri >= RUNQ_MATCH_PRI && 1253 !(tp->t_schedflag & TS_RUNQMATCH)) 1254 qlen -= RUNQ_MAX_DIFF; 1255 if (qlen > 0) { 1256 cpu_t *newcp; 1257 1258 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) { 1259 newcp = cp->cpu_next_part; 1260 } else if ((newcp = cp->cpu_next_lpl) == cp) { 1261 newcp = cp->cpu_next_part; 1262 } 1263 1264 if (smt_should_run(tp, newcp) && 1265 RUNQ_LEN(newcp, tpri) < qlen) { 1266 DTRACE_PROBE3(runq__balance, 1267 kthread_t *, tp, 1268 cpu_t *, cp, cpu_t *, newcp); 1269 cp = newcp; 1270 } 1271 } 1272 } else { 1273 /* 1274 * Migrate to a cpu in the new partition. 1275 */ 1276 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp, 1277 tp->t_pri); 1278 } 1279 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); 1280 } else { 1281 /* 1282 * It is possible that t_weakbound_cpu != t_bound_cpu (for 1283 * a short time until weak binding that existed when the 1284 * strong binding was established has dropped) so we must 1285 * favour weak binding over strong. 1286 */ 1287 cp = tp->t_weakbound_cpu ? 1288 tp->t_weakbound_cpu : tp->t_bound_cpu; 1289 } 1290 /* 1291 * A thread that is ONPROC may be temporarily placed on the run queue 1292 * but then chosen to run again by disp. If the thread we're placing on 1293 * the queue is in TS_ONPROC state, don't set its t_waitrq until a 1294 * replacement process is actually scheduled in swtch(). In this 1295 * situation, curthread is the only thread that could be in the ONPROC 1296 * state. 
1297 */ 1298 if ((!self) && (tp->t_waitrq == 0)) { 1299 hrtime_t curtime; 1300 1301 curtime = gethrtime_unscaled(); 1302 (void) cpu_update_pct(tp, curtime); 1303 tp->t_waitrq = curtime; 1304 } else { 1305 (void) cpu_update_pct(tp, gethrtime_unscaled()); 1306 } 1307 1308 dp = cp->cpu_disp; 1309 disp_lock_enter_high(&dp->disp_lock); 1310 1311 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0); 1312 TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p", 1313 tpri, cp, tp); 1314 1315 #ifndef NPROBE 1316 /* Kernel probe */ 1317 if (tnf_tracing_active) 1318 tnf_thread_queue(tp, cp, tpri); 1319 #endif /* NPROBE */ 1320 1321 ASSERT(tpri >= 0 && tpri < dp->disp_npri); 1322 1323 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */ 1324 tp->t_disp_queue = dp; 1325 tp->t_link = NULL; 1326 1327 dq = &dp->disp_q[tpri]; 1328 dp->disp_nrunnable++; 1329 if (!bound) 1330 dp->disp_steal = 0; 1331 membar_enter(); 1332 1333 if (dq->dq_sruncnt++ != 0) { 1334 ASSERT(dq->dq_first != NULL); 1335 dq->dq_last->t_link = tp; 1336 dq->dq_last = tp; 1337 } else { 1338 ASSERT(dq->dq_first == NULL); 1339 ASSERT(dq->dq_last == NULL); 1340 dq->dq_first = dq->dq_last = tp; 1341 BT_SET(dp->disp_qactmap, tpri); 1342 if (tpri > dp->disp_maxrunpri) { 1343 dp->disp_maxrunpri = tpri; 1344 membar_enter(); 1345 cpu_resched(cp, tpri); 1346 } 1347 } 1348 1349 if (!bound && tpri > dp->disp_max_unbound_pri) { 1350 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) { 1351 /* 1352 * If there are no other unbound threads on the 1353 * run queue, don't allow other CPUs to steal 1354 * this thread while we are in the middle of a 1355 * context switch. We may just switch to it 1356 * again right away. CPU_DISP_DONTSTEAL is cleared 1357 * in swtch and swtch_to. 
1358 */ 1359 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL; 1360 } 1361 dp->disp_max_unbound_pri = tpri; 1362 } 1363 (*disp_enq_thread)(cp, bound); 1364 } 1365 1366 /* 1367 * Put the specified thread on the front of the dispatcher 1368 * queue corresponding to its current priority. 1369 * 1370 * Called with the thread in transition, onproc or stopped state 1371 * and locked (transition implies locked) and at high spl. 1372 * Returns with the thread in TS_RUN state and still locked. 1373 */ 1374 void 1375 setfrontdq(kthread_t *tp) 1376 { 1377 disp_t *dp; 1378 dispq_t *dq; 1379 cpu_t *cp; 1380 pri_t tpri; 1381 int bound; 1382 1383 ASSERT(THREAD_LOCK_HELD(tp)); 1384 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0); 1385 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */ 1386 1387 /* 1388 * If thread is "swapped" or on the swap queue don't 1389 * queue it, but wake sched. 1390 */ 1391 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) { 1392 disp_swapped_setrun(tp); 1393 return; 1394 } 1395 1396 if (tp->t_bound_cpu || tp->t_weakbound_cpu) 1397 bound = 1; 1398 else 1399 bound = 0; 1400 1401 tpri = DISP_PRIO(tp); 1402 if (ncpus == 1) 1403 cp = tp->t_cpu; 1404 else if (!bound) { 1405 if (tpri >= kpqpri) { 1406 setkpdq(tp, SETKP_FRONT); 1407 return; 1408 } 1409 cp = tp->t_cpu; 1410 if (tp->t_cpupart == cp->cpu_part) { 1411 /* 1412 * We'll generally let this thread continue to run 1413 * where it last ran, but will consider migration if: 1414 * - The thread last ran outside its home lgroup. 1415 * - The CPU where it last ran is the target of an 1416 * offline request (a thread_nomigrate() on the in 1417 * motion CPU relies on this when forcing a preempt). 1418 * - The thread isn't the highest priority thread where 1419 * it last ran, and it is considered not likely to 1420 * have significant cache warmth. 
1421 */ 1422 if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) || 1423 cp == cpu_inmotion || 1424 (tpri < cp->cpu_disp->disp_maxrunpri && 1425 !THREAD_HAS_CACHE_WARMTH(tp))) { 1426 cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri); 1427 } 1428 } else { 1429 /* 1430 * Migrate to a cpu in the new partition. 1431 */ 1432 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, 1433 tp, tp->t_pri); 1434 } 1435 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); 1436 } else { 1437 /* 1438 * It is possible that t_weakbound_cpu != t_bound_cpu (for 1439 * a short time until weak binding that existed when the 1440 * strong binding was established has dropped) so we must 1441 * favour weak binding over strong. 1442 */ 1443 cp = tp->t_weakbound_cpu ? 1444 tp->t_weakbound_cpu : tp->t_bound_cpu; 1445 } 1446 1447 /* 1448 * A thread that is ONPROC may be temporarily placed on the run queue 1449 * but then chosen to run again by disp. If the thread we're placing on 1450 * the queue is in TS_ONPROC state, don't set its t_waitrq until a 1451 * replacement process is actually scheduled in swtch(). In this 1452 * situation, curthread is the only thread that could be in the ONPROC 1453 * state. 
1454 */ 1455 if ((tp != curthread) && (tp->t_waitrq == 0)) { 1456 hrtime_t curtime; 1457 1458 curtime = gethrtime_unscaled(); 1459 (void) cpu_update_pct(tp, curtime); 1460 tp->t_waitrq = curtime; 1461 } else { 1462 (void) cpu_update_pct(tp, gethrtime_unscaled()); 1463 } 1464 1465 dp = cp->cpu_disp; 1466 disp_lock_enter_high(&dp->disp_lock); 1467 1468 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp); 1469 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1); 1470 1471 #ifndef NPROBE 1472 /* Kernel probe */ 1473 if (tnf_tracing_active) 1474 tnf_thread_queue(tp, cp, tpri); 1475 #endif /* NPROBE */ 1476 1477 ASSERT(tpri >= 0 && tpri < dp->disp_npri); 1478 1479 THREAD_RUN(tp, &dp->disp_lock); /* set TS_RUN state and lock */ 1480 tp->t_disp_queue = dp; 1481 1482 dq = &dp->disp_q[tpri]; 1483 dp->disp_nrunnable++; 1484 if (!bound) 1485 dp->disp_steal = 0; 1486 membar_enter(); 1487 1488 if (dq->dq_sruncnt++ != 0) { 1489 ASSERT(dq->dq_last != NULL); 1490 tp->t_link = dq->dq_first; 1491 dq->dq_first = tp; 1492 } else { 1493 ASSERT(dq->dq_last == NULL); 1494 ASSERT(dq->dq_first == NULL); 1495 tp->t_link = NULL; 1496 dq->dq_first = dq->dq_last = tp; 1497 BT_SET(dp->disp_qactmap, tpri); 1498 if (tpri > dp->disp_maxrunpri) { 1499 dp->disp_maxrunpri = tpri; 1500 membar_enter(); 1501 cpu_resched(cp, tpri); 1502 } 1503 } 1504 1505 if (!bound && tpri > dp->disp_max_unbound_pri) { 1506 if (tp == curthread && dp->disp_max_unbound_pri == -1 && 1507 cp == CPU) { 1508 /* 1509 * If there are no other unbound threads on the 1510 * run queue, don't allow other CPUs to steal 1511 * this thread while we are in the middle of a 1512 * context switch. We may just switch to it 1513 * again right away. CPU_DISP_DONTSTEAL is cleared 1514 * in swtch and swtch_to. 
1515 */ 1516 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL; 1517 } 1518 dp->disp_max_unbound_pri = tpri; 1519 } 1520 (*disp_enq_thread)(cp, bound); 1521 } 1522 1523 /* 1524 * Put a high-priority unbound thread on the kp queue 1525 */ 1526 static void 1527 setkpdq(kthread_t *tp, int borf) 1528 { 1529 dispq_t *dq; 1530 disp_t *dp; 1531 cpu_t *cp; 1532 pri_t tpri; 1533 1534 tpri = DISP_PRIO(tp); 1535 1536 dp = &tp->t_cpupart->cp_kp_queue; 1537 disp_lock_enter_high(&dp->disp_lock); 1538 1539 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp); 1540 1541 ASSERT(tpri >= 0 && tpri < dp->disp_npri); 1542 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf); 1543 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */ 1544 tp->t_disp_queue = dp; 1545 dp->disp_nrunnable++; 1546 dq = &dp->disp_q[tpri]; 1547 1548 if (dq->dq_sruncnt++ != 0) { 1549 if (borf == SETKP_BACK) { 1550 ASSERT(dq->dq_first != NULL); 1551 tp->t_link = NULL; 1552 dq->dq_last->t_link = tp; 1553 dq->dq_last = tp; 1554 } else { 1555 ASSERT(dq->dq_last != NULL); 1556 tp->t_link = dq->dq_first; 1557 dq->dq_first = tp; 1558 } 1559 } else { 1560 if (borf == SETKP_BACK) { 1561 ASSERT(dq->dq_first == NULL); 1562 ASSERT(dq->dq_last == NULL); 1563 dq->dq_first = dq->dq_last = tp; 1564 } else { 1565 ASSERT(dq->dq_last == NULL); 1566 ASSERT(dq->dq_first == NULL); 1567 tp->t_link = NULL; 1568 dq->dq_first = dq->dq_last = tp; 1569 } 1570 BT_SET(dp->disp_qactmap, tpri); 1571 if (tpri > dp->disp_max_unbound_pri) 1572 dp->disp_max_unbound_pri = tpri; 1573 if (tpri > dp->disp_maxrunpri) { 1574 dp->disp_maxrunpri = tpri; 1575 membar_enter(); 1576 } 1577 } 1578 1579 cp = tp->t_cpu; 1580 if (tp->t_cpupart != cp->cpu_part) { 1581 /* migrate to a cpu in the new partition */ 1582 cp = tp->t_cpupart->cp_cpulist; 1583 } 1584 cp = disp_lowpri_cpu(cp, tp, tp->t_pri); 1585 disp_lock_enter_high(&cp->cpu_disp->disp_lock); 1586 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); 1587 1588 #ifndef NPROBE 1589 /* Kernel 
probe */ 1590 if (tnf_tracing_active) 1591 tnf_thread_queue(tp, cp, tpri); 1592 #endif /* NPROBE */ 1593 1594 if (cp->cpu_chosen_level < tpri) 1595 cp->cpu_chosen_level = tpri; 1596 cpu_resched(cp, tpri); 1597 disp_lock_exit_high(&cp->cpu_disp->disp_lock); 1598 (*disp_enq_thread)(cp, 0); 1599 } 1600 1601 /* 1602 * Remove a thread from the dispatcher queue if it is on it. 1603 * It is not an error if it is not found but we return whether 1604 * or not it was found in case the caller wants to check. 1605 */ 1606 int 1607 dispdeq(kthread_t *tp) 1608 { 1609 disp_t *dp; 1610 dispq_t *dq; 1611 kthread_t *rp; 1612 kthread_t *trp; 1613 kthread_t **ptp; 1614 int tpri; 1615 1616 ASSERT(THREAD_LOCK_HELD(tp)); 1617 1618 if (tp->t_state != TS_RUN) 1619 return (0); 1620 1621 /* 1622 * The thread is "swapped" or is on the swap queue and 1623 * hence no longer on the run queue, so return true. 1624 */ 1625 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) 1626 return (1); 1627 1628 tpri = DISP_PRIO(tp); 1629 dp = tp->t_disp_queue; 1630 ASSERT(tpri < dp->disp_npri); 1631 dq = &dp->disp_q[tpri]; 1632 ptp = &dq->dq_first; 1633 rp = *ptp; 1634 trp = NULL; 1635 1636 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL); 1637 1638 /* 1639 * Search for thread in queue. 1640 * Double links would simplify this at the expense of disp/setrun. 1641 */ 1642 while (rp != tp && rp != NULL) { 1643 trp = rp; 1644 ptp = &trp->t_link; 1645 rp = trp->t_link; 1646 } 1647 1648 if (rp == NULL) { 1649 panic("dispdeq: thread not on queue"); 1650 } 1651 1652 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp); 1653 1654 /* 1655 * Found it so remove it from queue. 
1656 */ 1657 if ((*ptp = rp->t_link) == NULL) 1658 dq->dq_last = trp; 1659 1660 dp->disp_nrunnable--; 1661 if (--dq->dq_sruncnt == 0) { 1662 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri); 1663 if (dp->disp_nrunnable == 0) { 1664 dp->disp_max_unbound_pri = -1; 1665 dp->disp_maxrunpri = -1; 1666 } else if (tpri == dp->disp_maxrunpri) { 1667 int ipri; 1668 1669 ipri = bt_gethighbit(dp->disp_qactmap, 1670 dp->disp_maxrunpri >> BT_ULSHIFT); 1671 if (ipri < dp->disp_max_unbound_pri) 1672 dp->disp_max_unbound_pri = ipri; 1673 dp->disp_maxrunpri = ipri; 1674 } 1675 } 1676 tp->t_link = NULL; 1677 THREAD_TRANSITION(tp); /* put in intermediate state */ 1678 return (1); 1679 } 1680 1681 1682 /* 1683 * dq_sruninc and dq_srundec are public functions for 1684 * incrementing/decrementing the sruncnts when a thread on 1685 * a dispatcher queue is made schedulable/unschedulable by 1686 * resetting the TS_LOAD flag. 1687 * 1688 * The caller MUST have the thread lock and therefore the dispatcher 1689 * queue lock so that the operation which changes 1690 * the flag, the operation that checks the status of the thread to 1691 * determine if it's on a disp queue AND the call to this function 1692 * are one atomic operation with respect to interrupts. 1693 */ 1694 1695 /* 1696 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread. 1697 */ 1698 void 1699 dq_sruninc(kthread_t *t) 1700 { 1701 ASSERT(t->t_state == TS_RUN); 1702 ASSERT(t->t_schedflag & TS_LOAD); 1703 1704 THREAD_TRANSITION(t); 1705 setfrontdq(t); 1706 } 1707 1708 /* 1709 * See comment on calling conventions above. 1710 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread. 1711 */ 1712 void 1713 dq_srundec(kthread_t *t) 1714 { 1715 ASSERT(t->t_schedflag & TS_LOAD); 1716 1717 (void) dispdeq(t); 1718 disp_swapped_enq(t); 1719 } 1720 1721 /* 1722 * Change the dispatcher lock of thread to the "swapped_lock" 1723 * and return with thread lock still held. 
1724 * 1725 * Called with thread_lock held, in transition state, and at high spl. 1726 */ 1727 void 1728 disp_swapped_enq(kthread_t *tp) 1729 { 1730 ASSERT(THREAD_LOCK_HELD(tp)); 1731 ASSERT(tp->t_schedflag & TS_LOAD); 1732 1733 switch (tp->t_state) { 1734 case TS_RUN: 1735 disp_lock_enter_high(&swapped_lock); 1736 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */ 1737 break; 1738 case TS_ONPROC: 1739 disp_lock_enter_high(&swapped_lock); 1740 THREAD_TRANSITION(tp); 1741 wake_sched_sec = 1; /* tell clock to wake sched */ 1742 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */ 1743 break; 1744 default: 1745 panic("disp_swapped: tp: %p bad t_state", (void *)tp); 1746 } 1747 } 1748 1749 /* 1750 * This routine is called by setbackdq/setfrontdq if the thread is 1751 * not loaded or loaded and on the swap queue. 1752 * 1753 * Thread state TS_SLEEP implies that a swapped thread 1754 * has been woken up and needs to be swapped in by the swapper. 1755 * 1756 * Thread state TS_RUN, it implies that the priority of a swapped 1757 * thread is being increased by scheduling class (e.g. ts_update). 1758 */ 1759 static void 1760 disp_swapped_setrun(kthread_t *tp) 1761 { 1762 ASSERT(THREAD_LOCK_HELD(tp)); 1763 ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD); 1764 1765 switch (tp->t_state) { 1766 case TS_SLEEP: 1767 disp_lock_enter_high(&swapped_lock); 1768 /* 1769 * Wakeup sched immediately (i.e., next tick) if the 1770 * thread priority is above maxclsyspri. 1771 */ 1772 if (DISP_PRIO(tp) > maxclsyspri) 1773 wake_sched = 1; 1774 else 1775 wake_sched_sec = 1; 1776 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */ 1777 break; 1778 case TS_RUN: /* called from ts_update */ 1779 break; 1780 default: 1781 panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp); 1782 } 1783 } 1784 1785 /* 1786 * Make a thread give up its processor. Find the processor on 1787 * which this thread is executing, and have that processor 1788 * preempt. 
1789 * 1790 * We allow System Duty Cycle (SDC) threads to be preempted even if 1791 * they are running at kernel priorities. To implement this, we always 1792 * set cpu_kprunrun; this ensures preempt() will be called. Since SDC 1793 * calls cpu_surrender() very often, we only preempt if there is anyone 1794 * competing with us. 1795 */ 1796 void 1797 cpu_surrender(kthread_t *tp) 1798 { 1799 cpu_t *cpup; 1800 int max_pri; 1801 int max_run_pri; 1802 klwp_t *lwp; 1803 1804 ASSERT(THREAD_LOCK_HELD(tp)); 1805 1806 if (tp->t_state != TS_ONPROC) 1807 return; 1808 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */ 1809 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */ 1810 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part); 1811 if (max_pri < max_run_pri) 1812 max_pri = max_run_pri; 1813 1814 if (tp->t_cid == sysdccid) { 1815 uint_t t_pri = DISP_PRIO(tp); 1816 if (t_pri > max_pri) 1817 return; /* we are not competing w/ anyone */ 1818 cpup->cpu_runrun = cpup->cpu_kprunrun = 1; 1819 } else { 1820 cpup->cpu_runrun = 1; 1821 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) { 1822 cpup->cpu_kprunrun = 1; 1823 } 1824 } 1825 1826 /* 1827 * Propagate cpu_runrun, and cpu_kprunrun to global visibility. 1828 */ 1829 membar_enter(); 1830 1831 DTRACE_SCHED1(surrender, kthread_t *, tp); 1832 1833 /* 1834 * Make the target thread take an excursion through trap() 1835 * to do preempt() (unless we're already in trap or post_syscall, 1836 * calling cpu_surrender via CL_TRAPRET). 
1837 */ 1838 if (tp != curthread || (lwp = tp->t_lwp) == NULL || 1839 lwp->lwp_state != LWP_USER) { 1840 aston(tp); 1841 if (cpup != CPU) 1842 poke_cpu(cpup->cpu_id); 1843 } 1844 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER, 1845 "cpu_surrender:tid %p cpu %p", tp, cpup); 1846 } 1847 1848 /* 1849 * Commit to and ratify a scheduling decision 1850 */ 1851 /*ARGSUSED*/ 1852 static kthread_t * 1853 disp_ratify(kthread_t *tp, disp_t *kpq) 1854 { 1855 pri_t tpri, maxpri; 1856 pri_t maxkpri; 1857 cpu_t *cpup; 1858 1859 ASSERT(tp != NULL); 1860 /* 1861 * Commit to, then ratify scheduling decision 1862 */ 1863 cpup = CPU; 1864 if (cpup->cpu_runrun != 0) 1865 cpup->cpu_runrun = 0; 1866 if (cpup->cpu_kprunrun != 0) 1867 cpup->cpu_kprunrun = 0; 1868 if (cpup->cpu_chosen_level != -1) 1869 cpup->cpu_chosen_level = -1; 1870 membar_enter(); 1871 tpri = DISP_PRIO(tp); 1872 maxpri = cpup->cpu_disp->disp_maxrunpri; 1873 maxkpri = kpq->disp_maxrunpri; 1874 if (maxpri < maxkpri) 1875 maxpri = maxkpri; 1876 if (tpri < maxpri) { 1877 /* 1878 * should have done better 1879 * put this one back and indicate to try again 1880 */ 1881 cpup->cpu_dispthread = curthread; /* fixup dispthread */ 1882 cpup->cpu_dispatch_pri = DISP_PRIO(curthread); 1883 thread_lock_high(tp); 1884 THREAD_TRANSITION(tp); 1885 setfrontdq(tp); 1886 thread_unlock_nopreempt(tp); 1887 1888 tp = NULL; 1889 } 1890 return (tp); 1891 } 1892 1893 /* 1894 * See if there is any work on the dispatcher queue for other CPUs. 1895 * If there is, dequeue the best thread and return. 
*/
/*
 * Return values of disp_getwork(cp):
 *   - a thread taken from the partition's kp queue or from another CPU's
 *     queue and accepted by disp_ratify();
 *   - NULL if nothing was found, if work appeared on cp's own queue while
 *     searching, or if disp_ratify() rejected the chosen thread;
 *   - T_DONTSTEAL if the only candidates seen may not be stolen yet.
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	kthread_t	*retval = NULL;
	pri_t		maxpri;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		leafidx, startidx;
	hrtime_t	stealtime;
	lgrp_id_t	local_id;

	maxpri = -1;
	tcp = NULL;

	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	kpreempt_disable();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU. This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	local_id = lpl_leaf->lpl_lgrpid;
	leafidx = startidx = 0;

	/*
	 * This loop traverses the lpl hierarchy. Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			/* In cp's own leaf, start with the CPU after cp. */
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around this lpl if:
				 *
				 * - Something became runnable on the local
				 *   queue...which also ends our stroll around
				 *   the partition.
				 *
				 * - We happen across another idle CPU.
				 *   Since it is patrolling the next portion
				 *   of the lpl's list (assuming it's not
				 *   halted, or busy servicing an interrupt),
				 *   move to the next higher level of locality.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0) {
					kpreempt_enable();
					return (NULL);
				}
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED ||
					    ocp->cpu_intr_actv != 0)
						continue;
					else
						goto next_level;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
				    CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					/*
					 * Don't steal threads that we attempted
					 * to steal recently until they're ready
					 * to be stolen again.
					 */
					stealtime = ocp->cpu_disp->disp_steal;
					if (stealtime == 0 ||
					    stealtime - gethrtime() <= 0) {
						maxpri = pri;
						tcp = ocp;
					} else {
						/*
						 * Don't update tcp, just set
						 * the retval to T_DONTSTEAL, so
						 * that if no acceptable CPUs
						 * are found the return value
						 * will be T_DONTSTEAL rather
						 * than NULL.
						 */
						retval = T_DONTSTEAL;
					}
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			/*
			 * Iterate to the next leaf lpl in the resource set
			 * at this level of locality. If we hit the end of
			 * the set, wrap back around to the beginning.
			 *
			 * Note: This iteration is NULL terminated for a reason
			 * see lpl_topo_bootstrap() in lgrp.c for details.
			 */
			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != startidx);

next_level:
		/*
		 * Expand the search to include farther away CPUs (next
		 * locality level). The closer CPUs that have already been
		 * checked will be checked again. In doing so, idle CPUs
		 * will tend to be more aggressive about stealing from CPUs
		 * that are closer (since the closer CPUs will be considered
		 * more often).
		 * Begin at this level with the CPUs local leaf lpl.
		 */
		if ((lpl = lpl->lpl_parent) != NULL) {
			leafidx = startidx = lpl->lpl_id2rset[local_id];
			lpl_leaf = lpl->lpl_rset[leafidx];
		}
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = disp_getbest(tcp->cpu_disp);
		/* NULL and T_DONTSTEAL are passed straight to the caller. */
		if (tp == NULL || tp == T_DONTSTEAL)
			return (tp);
		return (disp_ratify(tp, kpq));
	}
	return (retval);
}


/*
 * disp_fix_unbound_pri()
 *	Determines the maximum priority of unbound threads on the queue.
 *	The priority is kept for the queue, but is only increased, never
 *	reduced unless some CPU is looking for something on that queue.
 *
 *	The priority argument is the known upper limit.
 *
 *	Perhaps this should be kept accurately, but that probably means
 *	separate bitmaps for bound and unbound threads.  Since only idled
 *	CPUs will have to do this recalculation, it seems better this way.
*/
static void
disp_fix_unbound_pri(disp_t *dp, pri_t pri)
{
	kthread_t	*tp;
	dispq_t		*dq;
	ulong_t		*dqactmap = dp->disp_qactmap;
	ulong_t		mapword;
	int		wx;

	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));

	ASSERT(pri >= 0);			/* checked by caller */

	/*
	 * Start the search at the next lowest priority below the supplied
	 * priority.  This depends on the bitmap implementation.
	 */
	do {
		wx = pri >> BT_ULSHIFT;		/* index of word in map */

		/*
		 * Form mask for all lower priorities in the word.
		 */
		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);

		/*
		 * Get next lower active priority.
		 */
		if (mapword != 0) {
			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
		} else if (wx > 0) {
			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
			if (pri < 0)
				break;
		} else {
			/* Nothing active below pri in the lowest map word. */
			pri = -1;
			break;
		}

		/*
		 * Search the queue for unbound, runnable threads.
		 */
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
			tp = tp->t_link;
		}

		/*
		 * If a thread was found, set the priority and return.
		 */
	} while (tp == NULL);

	/*
	 * pri holds the maximum unbound thread priority or -1.
	 */
	if (dp->disp_max_unbound_pri != pri)
		dp->disp_max_unbound_pri = pri;
}

/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 *
 *	The caller must hold the thread lock.  disp_max_unbound_pri is only
 *	ever raised here, never lowered.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
	disp_t *dp;
	pri_t tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * Don't do anything if the thread is not bound, or
	 * currently not runnable or swapped out.
	 */
	if (tp->t_bound_cpu == NULL ||
	    tp->t_state != TS_RUN ||
	    tp->t_schedflag & TS_ON_SWAPQ)
		return;

	tpri = DISP_PRIO(tp);
	dp = tp->t_bound_cpu->cpu_disp;
	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	if (tpri > dp->disp_max_unbound_pri)
		dp->disp_max_unbound_pri = tpri;
}

/*
 * disp_getbest()
 *   De-queue the highest priority unbound runnable thread.
 *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
 *   Returns NULL if nothing found.
 *   Returns T_DONTSTEAL if the thread was not stealable.
 *   so that the caller will try again later.
 *
 *   Passed a pointer to a dispatch queue not associated with this CPU, and
 *   its type.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
	kthread_t	*tp;
	dispq_t		*dq;
	pri_t		pri;
	cpu_t		*cp, *tcp;
	boolean_t	allbound;

	disp_lock_enter(&dp->disp_lock);

	/*
	 * If there is nothing to run, or the CPU is in the middle of a
	 * context switch of the only thread, return NULL.
	 */
	tcp = dp->disp_cpu;
	cp = CPU;
	pri = dp->disp_max_unbound_pri;
	if (pri == -1 ||
	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
	    tcp->cpu_disp->disp_nrunnable == 1)) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	dq = &dp->disp_q[pri];


	/*
	 * Assume that all threads are bound on this queue, and change it
	 * later when we find out that it is not the case.
	 */
	allbound = B_TRUE;
	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
		hrtime_t now, nosteal, rqtime;

		/*
		 * Skip over bound threads which could be here even
		 * though disp_max_unbound_pri indicated this level.
2222 */ 2223 if (tp->t_bound_cpu || tp->t_weakbound_cpu) 2224 continue; 2225 2226 /* 2227 * We've got some unbound threads on this queue, so turn 2228 * the allbound flag off now. 2229 */ 2230 allbound = B_FALSE; 2231 2232 /* 2233 * The thread is a candidate for stealing from its run queue. We 2234 * don't want to steal threads that became runnable just a 2235 * moment ago. This improves CPU affinity for threads that get 2236 * preempted for short periods of time and go back on the run 2237 * queue. 2238 * 2239 * We want to let it stay on its run queue if it was only placed 2240 * there recently and it was running on the same CPU before that 2241 * to preserve its cache investment. For the thread to remain on 2242 * its run queue, ALL of the following conditions must be 2243 * satisfied: 2244 * 2245 * - the disp queue should not be the kernel preemption queue 2246 * - delayed idle stealing should not be disabled 2247 * - nosteal_nsec should be non-zero 2248 * - it should run with user priority 2249 * - it should be on the run queue of the CPU where it was 2250 * running before being placed on the run queue 2251 * - it should be the only thread on the run queue (to prevent 2252 * extra scheduling latency for other threads) 2253 * - it should sit on the run queue for less than per-chip 2254 * nosteal interval or global nosteal interval 2255 * - in case of CPUs with shared cache it should sit in a run 2256 * queue of a CPU from a different chip 2257 * 2258 * The checks are arranged so that the ones that are faster are 2259 * placed earlier. 2260 */ 2261 if (tcp == NULL || 2262 pri >= minclsyspri || 2263 tp->t_cpu != tcp) 2264 break; 2265 2266 /* 2267 * Steal immediately if, due to CMT processor architecture 2268 * migraiton between cp and tcp would incur no performance 2269 * penalty. 
2270 */ 2271 if (pg_cmt_can_migrate(cp, tcp)) 2272 break; 2273 2274 nosteal = nosteal_nsec; 2275 if (nosteal == 0) 2276 break; 2277 2278 /* 2279 * Calculate time spent sitting on run queue 2280 */ 2281 now = gethrtime_unscaled(); 2282 rqtime = now - tp->t_waitrq; 2283 scalehrtime(&rqtime); 2284 2285 /* 2286 * Steal immediately if the time spent on this run queue is more 2287 * than allowed nosteal delay. 2288 * 2289 * Negative rqtime check is needed here to avoid infinite 2290 * stealing delays caused by unlikely but not impossible 2291 * drifts between CPU times on different CPUs. 2292 */ 2293 if (rqtime > nosteal || rqtime < 0) 2294 break; 2295 2296 DTRACE_PROBE4(nosteal, kthread_t *, tp, 2297 cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime); 2298 scalehrtime(&now); 2299 /* 2300 * Calculate when this thread becomes stealable 2301 */ 2302 now += (nosteal - rqtime); 2303 2304 /* 2305 * Calculate time when some thread becomes stealable 2306 */ 2307 if (now < dp->disp_steal) 2308 dp->disp_steal = now; 2309 } 2310 2311 /* 2312 * If there were no unbound threads on this queue, find the queue 2313 * where they are and then return later. The value of 2314 * disp_max_unbound_pri is not always accurate because it isn't 2315 * reduced until another idle CPU looks for work. 2316 */ 2317 if (allbound) 2318 disp_fix_unbound_pri(dp, pri); 2319 2320 /* 2321 * If we reached the end of the queue and found no unbound threads 2322 * then return NULL so that other CPUs will be considered. If there 2323 * are unbound threads but they cannot yet be stolen, then 2324 * return T_DONTSTEAL and try again later. 2325 */ 2326 if (tp == NULL) { 2327 disp_lock_exit_nopreempt(&dp->disp_lock); 2328 return (allbound ? NULL : T_DONTSTEAL); 2329 } 2330 2331 /* 2332 * Found a runnable, unbound thread, so remove it from queue. 2333 * dispdeq() requires that we have the thread locked, and we do, 2334 * by virtue of holding the dispatch queue lock. 
dispdeq() will 2335 * put the thread in transition state, thereby dropping the dispq 2336 * lock. 2337 */ 2338 2339 #ifdef DEBUG 2340 { 2341 int thread_was_on_queue; 2342 2343 thread_was_on_queue = dispdeq(tp); /* drops disp_lock */ 2344 ASSERT(thread_was_on_queue); 2345 } 2346 2347 #else /* DEBUG */ 2348 (void) dispdeq(tp); /* drops disp_lock */ 2349 #endif /* DEBUG */ 2350 2351 /* 2352 * Reset the disp_queue steal time - we do not know what is the smallest 2353 * value across the queue is. 2354 */ 2355 dp->disp_steal = 0; 2356 2357 tp->t_schedflag |= TS_DONT_SWAP; 2358 2359 /* 2360 * Setup thread to run on the current CPU. 2361 */ 2362 tp->t_disp_queue = cp->cpu_disp; 2363 2364 cp->cpu_dispthread = tp; /* protected by spl only */ 2365 cp->cpu_dispatch_pri = pri; 2366 2367 /* 2368 * There can be a memory synchronization race between disp_getbest() 2369 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying 2370 * to preempt the current thread to run the enqueued thread while 2371 * disp_getbest() and disp_ratify() are changing the current thread 2372 * to the stolen thread. This may lead to a situation where 2373 * cpu_resched() tries to preempt the wrong thread and the 2374 * stolen thread continues to run on the CPU which has been tagged 2375 * for preemption. 2376 * Later the clock thread gets enqueued but doesn't get to run on the 2377 * CPU causing the system to hang. 2378 * 2379 * To avoid this, grabbing and dropping the disp_lock (which does 2380 * a memory barrier) is needed to synchronize the execution of 2381 * cpu_resched() with disp_getbest() and disp_ratify() and 2382 * synchronize the memory read and written by cpu_resched(), 2383 * disp_getbest(), and disp_ratify() with each other. 2384 * (see CR#6482861 for more details). 
2385 */ 2386 disp_lock_enter_high(&cp->cpu_disp->disp_lock); 2387 disp_lock_exit_high(&cp->cpu_disp->disp_lock); 2388 2389 ASSERT(pri == DISP_PRIO(tp)); 2390 2391 DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp); 2392 2393 thread_onproc(tp, cp); /* set t_state to TS_ONPROC */ 2394 2395 /* 2396 * Return with spl high so that swtch() won't need to raise it. 2397 * The disp_lock was dropped by dispdeq(). 2398 */ 2399 2400 return (tp); 2401 } 2402 2403 /* 2404 * disp_bound_common() - common routine for higher level functions 2405 * that check for bound threads under certain conditions. 2406 * If 'threadlistsafe' is set then there is no need to acquire 2407 * pidlock to stop the thread list from changing (eg, if 2408 * disp_bound_* is called with cpus paused). 2409 */ 2410 static int 2411 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag) 2412 { 2413 int found = 0; 2414 kthread_t *tp; 2415 2416 ASSERT(flag); 2417 2418 if (!threadlistsafe) 2419 mutex_enter(&pidlock); 2420 tp = curthread; /* faster than allthreads */ 2421 do { 2422 if (tp->t_state != TS_FREE) { 2423 /* 2424 * If an interrupt thread is busy, but the 2425 * caller doesn't care (i.e. BOUND_INTR is off), 2426 * then just ignore it and continue through. 2427 */ 2428 if ((tp->t_flag & T_INTR_THREAD) && 2429 !(flag & BOUND_INTR)) 2430 continue; 2431 2432 /* 2433 * Skip the idle thread for the CPU 2434 * we're about to set offline. 2435 */ 2436 if (tp == cp->cpu_idle_thread) 2437 continue; 2438 2439 /* 2440 * Skip the pause thread for the CPU 2441 * we're about to set offline. 
2442 */ 2443 if (tp == cp->cpu_pause_thread) 2444 continue; 2445 2446 if ((flag & BOUND_CPU) && 2447 (tp->t_bound_cpu == cp || 2448 tp->t_bind_cpu == cp->cpu_id || 2449 tp->t_weakbound_cpu == cp)) { 2450 found = 1; 2451 break; 2452 } 2453 2454 if ((flag & BOUND_PARTITION) && 2455 (tp->t_cpupart == cp->cpu_part)) { 2456 found = 1; 2457 break; 2458 } 2459 } 2460 } while ((tp = tp->t_next) != curthread && found == 0); 2461 if (!threadlistsafe) 2462 mutex_exit(&pidlock); 2463 return (found); 2464 } 2465 2466 /* 2467 * disp_bound_threads - return nonzero if threads are bound to the processor. 2468 * Called infrequently. Keep this simple. 2469 * Includes threads that are asleep or stopped but not onproc. 2470 */ 2471 int 2472 disp_bound_threads(cpu_t *cp, int threadlistsafe) 2473 { 2474 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU)); 2475 } 2476 2477 /* 2478 * disp_bound_anythreads - return nonzero if _any_ threads are bound 2479 * to the given processor, including interrupt threads. 2480 */ 2481 int 2482 disp_bound_anythreads(cpu_t *cp, int threadlistsafe) 2483 { 2484 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR)); 2485 } 2486 2487 /* 2488 * disp_bound_partition - return nonzero if threads are bound to the same 2489 * partition as the processor. 2490 * Called infrequently. Keep this simple. 2491 * Includes threads that are asleep or stopped but not onproc. 2492 */ 2493 int 2494 disp_bound_partition(cpu_t *cp, int threadlistsafe) 2495 { 2496 return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION)); 2497 } 2498 2499 /* 2500 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound 2501 * threads to other CPUs. 
2502 */ 2503 void 2504 disp_cpu_inactive(cpu_t *cp) 2505 { 2506 kthread_t *tp; 2507 disp_t *dp = cp->cpu_disp; 2508 dispq_t *dq; 2509 pri_t pri; 2510 int wasonq; 2511 2512 disp_lock_enter(&dp->disp_lock); 2513 while ((pri = dp->disp_max_unbound_pri) != -1) { 2514 dq = &dp->disp_q[pri]; 2515 tp = dq->dq_first; 2516 2517 /* 2518 * Skip over bound threads. 2519 */ 2520 while (tp != NULL && tp->t_bound_cpu != NULL) { 2521 tp = tp->t_link; 2522 } 2523 2524 if (tp == NULL) { 2525 /* disp_max_unbound_pri must be inaccurate, so fix it */ 2526 disp_fix_unbound_pri(dp, pri); 2527 continue; 2528 } 2529 2530 wasonq = dispdeq(tp); /* drops disp_lock */ 2531 ASSERT(wasonq); 2532 ASSERT(tp->t_weakbound_cpu == NULL); 2533 2534 setbackdq(tp); 2535 /* 2536 * Called from cpu_offline: 2537 * 2538 * cp has already been removed from the list of active cpus 2539 * and tp->t_cpu has been changed so there is no risk of 2540 * tp ending up back on cp. 2541 * 2542 * Called from cpupart_move_cpu: 2543 * 2544 * The cpu has moved to a new cpupart. Any threads that 2545 * were on it's dispatch queues before the move remain 2546 * in the old partition and can't run in the new partition. 2547 */ 2548 ASSERT(tp->t_cpu != cp); 2549 thread_unlock(tp); 2550 2551 disp_lock_enter(&dp->disp_lock); 2552 } 2553 disp_lock_exit(&dp->disp_lock); 2554 } 2555 2556 /* 2557 * Return a score rating this CPU for running this thread: lower is better. 2558 * 2559 * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for 2560 * curcpu (as that's our own priority). 2561 * 2562 * If a cpu is the target of an offline request, then try to avoid it. 2563 * 2564 * Otherwise we'll use double the effective dispatcher priority for the CPU. 2565 * 2566 * We do this so smt_adjust_cpu_score() can increment the score if needed, 2567 * without ending up over-riding a dispatcher priority. 
2568 */ 2569 static pri_t 2570 cpu_score(cpu_t *cp, kthread_t *tp) 2571 { 2572 pri_t score; 2573 2574 if (tp == curthread && cp == curthread->t_cpu) 2575 score = 2 * CPU_IDLE_PRI; 2576 else if (cp == cpu_inmotion) 2577 score = SHRT_MAX; 2578 else 2579 score = 2 * cp->cpu_dispatch_pri; 2580 2581 if (2 * cp->cpu_disp->disp_maxrunpri > score) 2582 score = 2 * cp->cpu_disp->disp_maxrunpri; 2583 if (2 * cp->cpu_chosen_level > score) 2584 score = 2 * cp->cpu_chosen_level; 2585 2586 return (smt_adjust_cpu_score(tp, cp, score)); 2587 } 2588 2589 /* 2590 * disp_lowpri_cpu - find a suitable CPU to run the given thread. 2591 * 2592 * We are looking for a CPU with an effective dispatch priority lower than the 2593 * thread's, so that the thread will run immediately rather than be enqueued. 2594 * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group. 2595 * If we don't find an available CPU there, we will expand our search to include 2596 * wider locality levels. (Note these groups are already divided by CPU 2597 * partition.) 2598 * 2599 * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on 2600 * the best home CPU we found. 2601 * 2602 * The hint passed in is used as a starting point so we don't favor CPU 0 or any 2603 * other CPU. The caller should pass in the most recently used CPU for the 2604 * thread; it's of course possible that this CPU isn't in the home lgroup. 2605 * 2606 * This function must be called at either high SPL, or with preemption disabled, 2607 * so that the "hint" CPU cannot be removed from the online CPU list while we 2608 * are traversing it. 
2609 */ 2610 cpu_t * 2611 disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri) 2612 { 2613 cpu_t *bestcpu; 2614 cpu_t *besthomecpu; 2615 cpu_t *cp, *cpstart; 2616 2617 klgrpset_t done; 2618 2619 lpl_t *lpl_iter, *lpl_leaf; 2620 2621 ASSERT(hint != NULL); 2622 ASSERT(tp->t_lpl->lpl_ncpu > 0); 2623 2624 bestcpu = besthomecpu = NULL; 2625 klgrpset_clear(done); 2626 2627 lpl_iter = tp->t_lpl; 2628 2629 do { 2630 pri_t best = SHRT_MAX; 2631 klgrpset_t cur_set; 2632 2633 klgrpset_clear(cur_set); 2634 2635 for (int i = 0; i < lpl_iter->lpl_nrset; i++) { 2636 lpl_leaf = lpl_iter->lpl_rset[i]; 2637 if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid)) 2638 continue; 2639 2640 klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid); 2641 2642 if (hint->cpu_lpl == lpl_leaf) 2643 cp = cpstart = hint; 2644 else 2645 cp = cpstart = lpl_leaf->lpl_cpus; 2646 2647 do { 2648 pri_t score = cpu_score(cp, tp); 2649 2650 if (score < best) { 2651 best = score; 2652 bestcpu = cp; 2653 2654 /* An idle CPU: we're done. */ 2655 if (score / 2 == CPU_IDLE_PRI) 2656 goto out; 2657 } 2658 } while ((cp = cp->cpu_next_lpl) != cpstart); 2659 } 2660 2661 if (bestcpu != NULL && tpri > (best / 2)) 2662 goto out; 2663 2664 if (besthomecpu == NULL) 2665 besthomecpu = bestcpu; 2666 2667 /* 2668 * Add the lgrps we just considered to the "done" set 2669 */ 2670 klgrpset_or(done, cur_set); 2671 2672 } while ((lpl_iter = lpl_iter->lpl_parent) != NULL); 2673 2674 /* 2675 * The specified priority isn't high enough to run immediately 2676 * anywhere, so just return the best CPU from the home lgroup. 2677 */ 2678 bestcpu = besthomecpu; 2679 2680 out: 2681 ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0); 2682 return (bestcpu); 2683 } 2684 2685 /* 2686 * This routine provides the generic idle cpu function for all processors. 
2687 * If a processor has some specific code to execute when idle (say, to stop 2688 * the pipeline and save power) then that routine should be defined in the 2689 * processors specific code (module_xx.c) and the global variable idle_cpu 2690 * set to that function. 2691 */ 2692 static void 2693 generic_idle_cpu(void) 2694 { 2695 } 2696 2697 /*ARGSUSED*/ 2698 static void 2699 generic_enq_thread(cpu_t *cpu, int bound) 2700 { 2701 } 2702 2703 cpu_t * 2704 disp_choose_best_cpu(void) 2705 { 2706 kthread_t *t = curthread; 2707 cpu_t *curcpu = CPU; 2708 2709 ASSERT(t->t_preempt > 0); 2710 ASSERT(t->t_state == TS_ONPROC); 2711 ASSERT(t->t_schedflag & TS_VCPU); 2712 2713 if (smt_should_run(t, curcpu)) 2714 return (curcpu); 2715 2716 return (disp_lowpri_cpu(curcpu, t, t->t_pri)); 2717 } 2718