/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/umtx.h>
#include <machine/pcb.h>
#include <machine/smp.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

/*
 * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
 * the range 100-256 Hz (approximately).
 */
#define	ESTCPULIM(e) \
    min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
    RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
#ifdef SMP
#define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
#else
#define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
#endif
#define	NICE_WEIGHT		1	/* Priorities per nice level. */
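/*
 * Illustrative note (not from the original source): assuming the stock
 * values PRIO_MIN = -20, PRIO_MAX = 20 and RQ_PPQ = 4, the UP case above
 * (INVERSE_ESTCPU_WEIGHT == 8, NICE_WEIGHT == 1) makes ESTCPULIM() clamp
 * td_estcpu to 8 * (1 * 40 - 4) + 8 - 1 = 295, so the td_estcpu term that
 * resetpriority() below adds to the user priority can never push a
 * timesharing thread outside its priority range.
 */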
/*
 * The schedulable entity that runs a context.
 * This is an extension to the thread structure and is tailored to
 * the requirements of this scheduler.
 */
struct td_sched {
	fixpt_t		ts_pctcpu;	/* (j) %cpu during p_swtime. */
	int		ts_cpticks;	/* (j) Ticks of cpu time. */
	int		ts_slptime;	/* (j) Seconds !RUNNING. */
	struct runq	*ts_runq;	/* runq the thread is currently on */
};

/* flags kept in td_flags */
#define	TDF_DIDRUN	TDF_SCHED0	/* thread actually ran. */
#define	TDF_BOUND	TDF_SCHED1	/* Bound to one CPU. */

#define	SKE_RUNQ_PCPU(ts)						\
    ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)

static struct td_sched td_sched0;
struct mtx sched_lock;

static int	sched_tdcnt;	/* Total runnable threads in the system. */
static int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
#define	SCHED_QUANTUM	(hz / 10)	/* Default sched quantum */

static void	setup_runqs(void);
static void	schedcpu(void);
static void	schedcpu_thread(void);
static void	sched_priority(struct thread *td, u_char prio);
static void	sched_setup(void *dummy);
static void	maybe_resched(struct thread *td);
static void	updatepri(struct thread *td);
static void	resetpriority(struct thread *td);
static void	resetpriority_thread(struct thread *td);
#ifdef SMP
static int	forward_wakeup(int cpunum);
#endif

static struct kproc_desc sched_kp = {
	"schedcpu",
	schedcpu_thread,
	NULL
};
SYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start,
    &sched_kp);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);

/*
 * Global run queue.
 */
static struct runq runq;

#ifdef SMP
/*
 * Per-CPU run queues
 */
static struct runq runq_pcpu[MAXCPU];
#endif

static void
setup_runqs(void)
{
#ifdef SMP
	int i;

	for (i = 0; i < MAXCPU; ++i)
		runq_init(&runq_pcpu[i]);
#endif

	runq_init(&runq);
}

static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val;

	new_val = sched_quantum * tick;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val < tick)
		return (EINVAL);
	sched_quantum = new_val / tick;
	hogticks = 2 * sched_quantum;
	return (0);
}

SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
    "Scheduler name");

SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
    0, sizeof sched_quantum, sysctl_kern_quantum, "I",
    "Roundrobin scheduling quantum in microseconds");
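/*
 * Illustrative note (not from the original source): sched_quantum is kept
 * in hz ticks while the sysctl above is expressed in microseconds, using
 * `tick' microseconds per hz tick.  For example, with hz = 1000 (so
 * tick = 1000), the default quantum of hz / 10 = 100 ticks reads back as
 * 100000 us (100 ms); writing 50000 to kern.sched.quantum would set
 * sched_quantum to 50 ticks and hogticks to 100.
 */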
#ifdef SMP
/* Enable forwarding of wakeups to all other cpus */
SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP");

static int runq_fuzz = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");

static int forward_wakeup_enabled = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
	   &forward_wakeup_enabled, 0,
	   "Forwarding of wakeup to idle CPUs");

static int forward_wakeups_requested = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
	   &forward_wakeups_requested, 0,
	   "Requests for Forwarding of wakeup to idle CPUs");

static int forward_wakeups_delivered = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
	   &forward_wakeups_delivered, 0,
	   "Completed Forwarding of wakeup to idle CPUs");

static int forward_wakeup_use_mask = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
	   &forward_wakeup_use_mask, 0,
	   "Use the mask of idle cpus");

static int forward_wakeup_use_loop = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
	   &forward_wakeup_use_loop, 0,
	   "Use a loop to find idle cpus");

static int forward_wakeup_use_single = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW,
	   &forward_wakeup_use_single, 0,
	   "Only signal one idle cpu");

static int forward_wakeup_use_htt = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW,
	   &forward_wakeup_use_htt, 0,
	   "account for htt");

#endif
#if 0
static int sched_followon = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
	   &sched_followon, 0,
	   "allow threads to share a quantum");
#endif

static __inline void
sched_load_add(void)
{
	sched_tdcnt++;
	CTR1(KTR_SCHED, "global load: %d", sched_tdcnt);
}

static __inline void
sched_load_rem(void)
{
	sched_tdcnt--;
	CTR1(KTR_SCHED, "global load: %d", sched_tdcnt);
}
/*
 * Arrange to reschedule if necessary, taking the priorities and
 * schedulers into account.
 */
static void
maybe_resched(struct thread *td)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * This function is called when a thread is about to be put on run queue
 * because it has been made runnable or its priority has been adjusted.  It
 * determines if the new thread should immediately preempt the current
 * thread.  If so, it switches to it and eventually returns true.  If not,
 * it returns false so that the caller may place the thread on an
 * appropriate run queue.
 */
int
maybe_preempt(struct thread *td)
{
#ifdef PREEMPTION
	struct thread *ctd;
	int cpri, pri;
#endif

#ifdef PREEMPTION
	/*
	 * The new thread should not preempt the current thread if any of the
	 * following conditions are true:
	 *
	 *  - The kernel is in the throes of crashing (panicstr).
	 *  - The current thread has a higher (numerically lower) or
	 *    equivalent priority.  Note that this prevents curthread from
	 *    trying to preempt to itself.
	 *  - It is too early in the boot for context switches (cold is set).
	 *  - The current thread has an inhibitor set or is in the process of
	 *    exiting.  In this case, the current thread is about to switch
	 *    out anyway, so there's no point in preempting.  If we did,
	 *    the current thread would not be properly resumed as well, so
	 *    just avoid that whole landmine.
	 *  - If the new thread's priority is not a realtime priority and
	 *    the current thread's priority is not an idle priority and
	 *    FULL_PREEMPTION is disabled.
	 *
	 * If all of these conditions are false, but the current thread is in
	 * a nested critical section, then we have to defer the preemption
	 * until we exit the critical section.  Otherwise, switch immediately
	 * to the new thread.
	 */
	ctd = curthread;
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("maybe_preempt: trying to run inhibited thread"));
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
	    TD_IS_INHIBITED(ctd))
		return (0);
#ifndef FULL_PREEMPTION
	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
		return (0);
#endif

	if (ctd->td_critnest > 1) {
		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
		    ctd->td_critnest);
		ctd->td_owepreempt = 1;
		return (0);
	}
	/*
	 * Thread is runnable but not yet put on system run queue.
	 */
	MPASS(ctd->td_lock == td->td_lock);
	MPASS(TD_ON_RUNQ(td));
	TD_SET_RUNNING(td);
	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
	    td->td_proc->p_pid, td->td_name);
	mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, td);
	/*
	 * td's lock pointer may have changed.  We have to return with it
	 * locked.
	 */
	spinlock_enter();
	thread_unlock(ctd);
	thread_lock(td);
	spinlock_exit();
	return (1);
#else
	return (0);
#endif
}

/*
 * Constants for digital decay and forget:
 *	90% of (td_estcpu) usage in 5 * loadav time
 *	95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedclock() updates td_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of td_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 *	for (i = 0; i < (5 * loadavg); i++)
 *		td_estcpu *= decay;
 * will compute
 *	td_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 *	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 *	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 *	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 *	b = 2 * loadavg
 * then
 *	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *		exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *		therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1
 *		therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *	Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *	ln(factor) =~ (-2.30/5*loadav), or
 *	factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	    exp(-1/b) =~ (b-1)/b =~ b/(b+1).  QED
 *
 * Proof of (2):
 *	Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *	power*ln(b/(b+1)) =~ -2.30, or
 *	power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav:	1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */
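/*
 * Illustrative sketch (not part of the original file): a userland check,
 * under the approximations above, that power = ln(.1) / ln(b/(b+1)) with
 * b = 2 * loadav reproduces the table of power values.  Kept under #if 0
 * since the kernel has no libm; the function and variable names here are
 * hypothetical.
 */
#if 0
#include <math.h>
#include <stdio.h>

static void
decay_power_table(void)
{
	int loadav;
	double b, power;

	for (loadav = 1; loadav <= 4; loadav++) {
		b = 2.0 * loadav;			/* b = 2 * loadavg */
		power = log(0.1) / log(b / (b + 1.0));
		/* Prints 5.68, 10.32, 14.94 and 19.55 for loadav 1..4. */
		printf("loadav %d: power %.2f\n", loadav, power);
	}
}
#endif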

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11

/*
 * Recompute process priorities, every hz ticks.
 * MP-safe, called without the Giant mutex.
 */
/* ARGSUSED */
static void
schedcpu(void)
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct thread *td;
	struct proc *p;
	struct td_sched *ts;
	int awake, realstathz;

	realstathz = stathz ? stathz : hz;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		FOREACH_THREAD_IN_PROC(p, td) {
			awake = 0;
			thread_lock(td);
			ts = td->td_sched;
			/*
			 * Increment sleep time (if sleeping).  We
			 * ignore overflow, as above.
			 */
			/*
			 * The td_sched slptimes are not touched in wakeup
			 * because the thread may not HAVE everything in
			 * memory? XXX I think this is out of date.
			 */
			if (TD_ON_RUNQ(td)) {
				awake = 1;
				td->td_flags &= ~TDF_DIDRUN;
			} else if (TD_IS_RUNNING(td)) {
				awake = 1;
				/* Do not clear TDF_DIDRUN */
			} else if (td->td_flags & TDF_DIDRUN) {
				awake = 1;
				td->td_flags &= ~TDF_DIDRUN;
			}

			/*
			 * ts_pctcpu is only for ps and ttyinfo().
			 */
			ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
			/*
			 * If the td_sched has been idle the entire second,
			 * stop recalculating its priority until
			 * it wakes up.
			 */
			if (ts->ts_cpticks != 0) {
#if	(FSHIFT >= CCPU_SHIFT)
				ts->ts_pctcpu += (realstathz == 100)
				    ? ((fixpt_t) ts->ts_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) ts->ts_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
				ts->ts_pctcpu += ((FSCALE - ccpu) *
				    (ts->ts_cpticks *
				    FSCALE / realstathz)) >> FSHIFT;
#endif
				ts->ts_cpticks = 0;
			}
			/*
			 * If there are ANY running threads in this process,
			 * then don't count it as sleeping.
			 * XXX: this is broken.
			 */
			if (awake) {
				if (ts->ts_slptime > 1) {
					/*
					 * In an ideal world, this should not
					 * happen, because whoever woke us
					 * up from the long sleep should have
					 * unwound the slptime and reset our
					 * priority before we run at the stale
					 * priority.  Should KASSERT at some
					 * point when all the cases are fixed.
					 */
					updatepri(td);
				}
				ts->ts_slptime = 0;
			} else
				ts->ts_slptime++;
			if (ts->ts_slptime > 1) {
				thread_unlock(td);
				continue;
			}
			td->td_estcpu = decay_cpu(loadfac, td->td_estcpu);
			resetpriority(td);
			resetpriority_thread(td);
			thread_unlock(td);
		} /* end of thread loop */
		PROC_UNLOCK(p);
	} /* end of process loop */
	sx_sunlock(&allproc_lock);
}

/*
 * Main loop for a kthread that executes schedcpu once a second.
 */
static void
schedcpu_thread(void)
{

	for (;;) {
		schedcpu();
		pause("-", hz);
	}
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max td_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay td_estcpu to zero.
 */
static void
updatepri(struct thread *td)
{
	struct td_sched *ts;
	fixpt_t loadfac;
	unsigned int newcpu;

	ts = td->td_sched;
	loadfac = loadfactor(averunnable.ldavg[0]);
	if (ts->ts_slptime > 5 * loadfac)
		td->td_estcpu = 0;
	else {
		newcpu = td->td_estcpu;
		ts->ts_slptime--;	/* was incremented in schedcpu() */
		while (newcpu && --ts->ts_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		td->td_estcpu = newcpu;
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
resetpriority(struct thread *td)
{
	register unsigned int newpriority;

	if (td->td_pri_class == PRI_TIMESHARE) {
		newpriority = PUSER + td->td_estcpu / INVERSE_ESTCPU_WEIGHT +
		    NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
		    PRI_MAX_TIMESHARE);
		sched_user_prio(td, newpriority);
	}
}

/*
 * Update the thread's priority when the associated process's user
 * priority changes.
 */
static void
resetpriority_thread(struct thread *td)
{

	/* Only change threads with a time sharing user priority. */
	if (td->td_priority < PRI_MIN_TIMESHARE ||
	    td->td_priority > PRI_MAX_TIMESHARE)
		return;

	/* XXX the whole needresched thing is broken, but not silly. */
	maybe_resched(td);

	sched_prio(td, td->td_user_pri);
}

/* ARGSUSED */
static void
sched_setup(void *dummy)
{
	setup_runqs();

	if (sched_quantum == 0)
		sched_quantum = SCHED_QUANTUM;
	hogticks = 2 * sched_quantum;

	/* Account for thread0. */
	sched_load_add();
}
/* External interfaces start here */

/*
 * Very early in the boot some setup of scheduler-specific
 * parts of proc0 and of some scheduler resources needs to be done.
 * Called from:
 *  proc0_init()
 */
void
schedinit(void)
{
	/*
	 * Set up the scheduler specific parts of proc0.
	 */
	proc0.p_sched = NULL; /* XXX */
	thread0.td_sched = &td_sched0;
	thread0.td_lock = &sched_lock;
	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
}

int
sched_runnable(void)
{
#ifdef SMP
	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
#else
	return runq_check(&runq);
#endif
}

int
sched_rr_interval(void)
{
	if (sched_quantum == 0)
		sched_quantum = SCHED_QUANTUM;
	return (sched_quantum);
}

/*
 * We adjust the priority of the current process.  The priority of
 * a process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (td_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time td_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached).  The cpu usage
 * estimator ramps up quite quickly when the process is running
 * (linearly), and decays away exponentially, at a rate which is
 * proportionally slower when the system is busy.  The basic principle
 * is that the system will 90% forget that the process used a lot of
 * CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin
 * among other processes.
 */
void
sched_clock(struct thread *td)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td->td_sched;

	ts->ts_cpticks++;
	td->td_estcpu = ESTCPULIM(td->td_estcpu + 1);
	if ((td->td_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
		resetpriority(td);
		resetpriority_thread(td);
	}

	/*
	 * Force a context switch if the current thread has used up a full
	 * quantum (default quantum is 100ms).
	 */
	if (!TD_IS_IDLETHREAD(td) &&
	    ticks - PCPU_GET(switchticks) >= sched_quantum)
		td->td_flags |= TDF_NEEDRESCHED;
}
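/*
 * Illustrative sketch (not part of the original file): the user-priority
 * calculation that resetpriority() performs above, with the UP value
 * INVERSE_ESTCPU_WEIGHT == 8, NICE_WEIGHT == 1 and made-up stand-ins for
 * PUSER and the timesharing range; the real constants live in the
 * priority/resource headers.  Kept under #if 0, names are hypothetical.
 */
#if 0
#define	EX_PUSER		160	/* hypothetical stand-in for PUSER */
#define	EX_PRI_MIN_TIMESHARE	160	/* hypothetical */
#define	EX_PRI_MAX_TIMESHARE	223	/* hypothetical */
#define	EX_PRIO_MIN		(-20)

static unsigned int
example_user_prio(unsigned int estcpu, int nice)
{
	unsigned int newpriority;

	/* One priority step per 8 ticks of estcpu, one per nice level. */
	newpriority = EX_PUSER + estcpu / 8 + 1 * (nice - EX_PRIO_MIN);
	if (newpriority < EX_PRI_MIN_TIMESHARE)
		newpriority = EX_PRI_MIN_TIMESHARE;
	if (newpriority > EX_PRI_MAX_TIMESHARE)
		newpriority = EX_PRI_MAX_TIMESHARE;
	/* e.g. example_user_prio(0, 0) == 180, example_user_prio(80, 0) == 190 */
	return (newpriority);
}
#endif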
/*
 * Charge child's scheduling CPU usage to parent.
 */
void
sched_exit(struct proc *p, struct thread *td)
{

	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
	    td, td->td_name, td->td_priority);
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{

	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
	    child, child->td_name, child->td_priority);
	thread_lock(td);
	td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu);
	thread_unlock(td);
	mtx_lock_spin(&sched_lock);
	if ((child->td_proc->p_flag & P_NOLOAD) == 0)
		sched_load_rem();
	mtx_unlock_spin(&sched_lock);
}

void
sched_fork(struct thread *td, struct thread *childtd)
{
	sched_fork_thread(td, childtd);
}

void
sched_fork_thread(struct thread *td, struct thread *childtd)
{
	struct td_sched *ts;

	childtd->td_estcpu = td->td_estcpu;
	childtd->td_lock = &sched_lock;
	childtd->td_cpuset = cpuset_ref(td->td_cpuset);
	ts = childtd->td_sched;
	bzero(ts, sizeof(*ts));
}

void
sched_nice(struct proc *p, int nice)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	p->p_nice = nice;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		resetpriority(td);
		resetpriority_thread(td);
		thread_unlock(td);
	}
}

void
sched_class(struct thread *td, int class)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_pri_class = class;
}

/*
 * Adjust the priority of a thread.
 */
static void
sched_priority(struct thread *td, u_char prio)
{
	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
	    td, td->td_name, td->td_priority, prio, curthread,
	    curthread->td_name);

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_priority == prio)
		return;
	td->td_priority = prio;
	if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) {
		sched_rem(td);
		sched_add(td, SRQ_BORING);
	}
}

/*
 * Update a thread's priority when it is lent another thread's
 * priority.
 */
void
sched_lend_prio(struct thread *td, u_char prio)
{

	td->td_flags |= TDF_BORROWING;
	sched_priority(td, prio);
}

/*
 * Restore a thread's priority when priority propagation is
 * over.  The prio argument is the minimum priority the thread
 * needs to have to satisfy other possible priority lending
 * requests.  If the thread's regular priority is less
 * important than prio, the thread will keep a priority boost
 * of prio.
 */
void
sched_unlend_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
	    td->td_base_pri <= PRI_MAX_TIMESHARE)
		base_pri = td->td_user_pri;
	else
		base_pri = td->td_base_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_BORROWING;
		sched_prio(td, base_pri);
	} else
		sched_lend_prio(td, prio);
}

void
sched_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	/* First, update the base priority. */
	td->td_base_pri = prio;

	/*
	 * If the thread is borrowing another thread's priority, don't ever
	 * lower the priority.
	 */
	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
		return;

	/* Change the real priority. */
	oldprio = td->td_priority;
	sched_priority(td, prio);

	/*
	 * If the thread is on a turnstile, then let the turnstile update
	 * its state.
	 */
	if (TD_ON_LOCK(td) && oldprio != prio)
		turnstile_adjust(td, oldprio);
}

void
sched_user_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_base_user_pri = prio;
	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
		return;
	oldprio = td->td_user_pri;
	td->td_user_pri = prio;
}

void
sched_lend_user_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_flags |= TDF_UBORROWING;
	oldprio = td->td_user_pri;
	td->td_user_pri = prio;
}

void
sched_unlend_user_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	base_pri = td->td_base_user_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_UBORROWING;
		sched_user_prio(td, base_pri);
	} else {
		sched_lend_user_prio(td, prio);
	}
}

void
sched_sleep(struct thread *td, int pri)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_slptick = ticks;
	td->td_sched->ts_slptime = 0;
	if (pri)
		sched_prio(td, pri);
	if (TD_IS_SUSPENDED(td) || pri <= PSOCK)
		td->td_flags |= TDF_CANSWAP;
}

void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct td_sched *ts;
	struct proc *p;

	ts = td->td_sched;
	p = td->td_proc;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	/*
	 * Switch to the sched lock to fix things up and pick
	 * a new thread.
	 */
	if (td->td_lock != &sched_lock) {
		mtx_lock_spin(&sched_lock);
		thread_unlock(td);
	}

	if ((p->p_flag & P_NOLOAD) == 0)
		sched_load_rem();

	if (newtd)
		newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED);

	td->td_lastcpu = td->td_oncpu;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_owepreempt = 0;
	td->td_oncpu = NOCPU;
	/*
	 * At the last moment, if this thread is still marked RUNNING,
	 * then put it back on the run queue as it has not been suspended
	 * or stopped or anything else similar.  We never put the idle
	 * threads on the run queue, however.
	 */
	if (td->td_flags & TDF_IDLETD) {
		TD_SET_CAN_RUN(td);
#ifdef SMP
		idle_cpus_mask &= ~PCPU_GET(cpumask);
#endif
	} else {
		if (TD_IS_RUNNING(td)) {
			/* Put us back on the run queue. */
			sched_add(td, (flags & SW_PREEMPT) ?
			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
			    SRQ_OURSELF|SRQ_YIELDING);
		}
	}
	if (newtd) {
		/*
		 * The thread we are about to run needs to be counted
		 * as if it had been added to the run queue and selected.
		 * It came from:
		 *  * A preemption
		 *  * An upcall
		 *  * A followon
		 */
		KASSERT((newtd->td_inhibitors == 0),
		    ("trying to run inhibited thread"));
		newtd->td_flags |= TDF_DIDRUN;
		TD_SET_RUNNING(newtd);
		if ((newtd->td_proc->p_flag & P_NOLOAD) == 0)
			sched_load_add();
	} else {
		newtd = choosethread();
	}
	MPASS(newtd->td_lock == &sched_lock);

	if (td != newtd) {
#ifdef	HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
		/* I feel sleepy */
		lock_profile_release_lock(&sched_lock.lock_object);
		cpu_switch(td, newtd, td->td_lock);
		lock_profile_obtain_lock_success(&sched_lock.lock_object,
		    0, 0, __FILE__, __LINE__);
		/*
		 * Where am I?  What year is it?
		 * We are in the same thread that went to sleep above,
		 * but any amount of time may have passed.  All our context
		 * will still be available as will local variables.
		 * PCPU values however may have changed as we may have
		 * changed CPU so don't trust cached values of them.
		 * New threads will go to fork_exit() instead of here
		 * so if you change things here you may need to change
		 * things there too.
		 * If the thread above was exiting it will never wake
		 * up again here, so either it has saved everything it
		 * needed to, or the thread_wait() or wait() will
		 * need to reap it.
		 */
#ifdef	HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
	}

#ifdef SMP
	if (td->td_flags & TDF_IDLETD)
		idle_cpus_mask |= PCPU_GET(cpumask);
#endif
	sched_lock.mtx_lock = (uintptr_t)td;
	td->td_oncpu = PCPU_GET(cpuid);
	MPASS(td->td_lock == &sched_lock);
}

void
sched_wakeup(struct thread *td)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td->td_sched;
	td->td_flags &= ~TDF_CANSWAP;
	if (ts->ts_slptime > 1) {
		updatepri(td);
		resetpriority(td);
	}
	td->td_slptick = ticks;
	ts->ts_slptime = 0;
	sched_add(td, SRQ_BORING);
}
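/*
 * Illustrative sketch (not part of the scheduler): the two bit tricks used
 * by forward_wakeup() below, shown on a plain unsigned mask.  The helper
 * names are hypothetical and the block is kept under #if 0.
 */
#if 0
/* Isolate the lowest set bit: "pick exactly one idle CPU". */
static unsigned int
example_lowest_bit(unsigned int map)
{
	return (map & ((~map) + 1));		/* 0x0c -> 0x04 */
}

/*
 * Keep bit 2n only when bits 2n and 2n+1 are both set, i.e. both HTT
 * siblings of a physical CPU are idle ("try to choose an idle die").
 */
static unsigned int
example_idle_pairs(unsigned int map)
{
	return ((map & (map >> 1)) & 0x5555);	/* 0x0f -> 0x05 */
}
#endif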
#ifdef SMP
/* enable HTT_2 if you have a 2-way HTT cpu.*/
static int
forward_wakeup(int cpunum)
{
	cpumask_t map, me, dontuse;
	cpumask_t map2;
	struct pcpu *pc;
	cpumask_t id, map3;

	mtx_assert(&sched_lock, MA_OWNED);

	CTR0(KTR_RUNQ, "forward_wakeup()");

	if ((!forward_wakeup_enabled) ||
	    (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
		return (0);
	if (!smp_started || cold || panicstr)
		return (0);

	forward_wakeups_requested++;

	/*
	 * Check the idle mask we received against what we calculated
	 * before in the old version.
	 */
	me = PCPU_GET(cpumask);
	/*
	 * Don't bother if we should be doing it ourselves.
	 */
	if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum)))
		return (0);

	dontuse = me | stopped_cpus | hlt_cpus_mask;
	map3 = 0;
	if (forward_wakeup_use_loop) {
		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
			id = pc->pc_cpumask;
			if ((id & dontuse) == 0 &&
			    pc->pc_curthread == pc->pc_idlethread) {
				map3 |= id;
			}
		}
	}

	if (forward_wakeup_use_mask) {
		map = 0;
		map = idle_cpus_mask & ~dontuse;

		/* If they are both on, compare and use loop if different. */
		if (forward_wakeup_use_loop) {
			if (map != map3) {
				printf("map (%02X) != map3 (%02X)\n",
				    map, map3);
				map = map3;
			}
		}
	} else {
		map = map3;
	}
	/* If we only allow a specific CPU, then mask off all the others. */
	if (cpunum != NOCPU) {
		KASSERT((cpunum <= mp_maxcpus),
		    ("forward_wakeup: bad cpunum."));
		map &= (1 << cpunum);
	} else {
		/* Try to choose an idle die. */
		if (forward_wakeup_use_htt) {
			map2 = (map & (map >> 1)) & 0x5555;
			if (map2) {
				map = map2;
			}
		}

		/* Set only one bit. */
		if (forward_wakeup_use_single) {
			map = map & ((~map) + 1);
		}
	}
	if (map) {
		forward_wakeups_delivered++;
		ipi_selected(map, IPI_AST);
		return (1);
	}
	if (cpunum == NOCPU)
		printf("forward_wakeup: Idle processor not found\n");
	return (0);
}
#endif

#ifdef SMP
static void kick_other_cpu(int pri, int cpuid);

static void
kick_other_cpu(int pri, int cpuid)
{
	struct pcpu *pcpu = pcpu_find(cpuid);
	int cpri = pcpu->pc_curthread->td_priority;

	if (idle_cpus_mask & pcpu->pc_cpumask) {
		forward_wakeups_delivered++;
		ipi_selected(pcpu->pc_cpumask, IPI_AST);
		return;
	}

	if (pri >= cpri)
		return;

#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
#if !defined(FULL_PREEMPTION)
	if (pri <= PRI_MAX_ITHD)
#endif /* ! FULL_PREEMPTION */
	{
		ipi_selected(pcpu->pc_cpumask, IPI_PREEMPT);
		return;
	}
#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */

	pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
	ipi_selected(pcpu->pc_cpumask, IPI_AST);
	return;
}
#endif /* SMP */

void
sched_add(struct thread *td, int flags)
#ifdef SMP
{
	struct td_sched *ts;
	int forwarded = 0;
	int cpu;
	int single_cpu = 0;

	ts = td->td_sched;
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("sched_add: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("sched_add: bad thread state"));
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_add: thread swapped out"));
	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
	    td, td->td_name, td->td_priority, curthread,
	    curthread->td_name);
	/*
	 * Now that the thread is moving to the run-queue, set the lock
	 * to the scheduler's lock.
	 */
	if (td->td_lock != &sched_lock) {
		mtx_lock_spin(&sched_lock);
		thread_lock_set(td, &sched_lock);
	}
	TD_SET_RUNQ(td);

	if (td->td_pinned != 0) {
		cpu = td->td_lastcpu;
		ts->ts_runq = &runq_pcpu[cpu];
		single_cpu = 1;
		CTR3(KTR_RUNQ,
		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
		    cpu);
	} else if ((td)->td_flags & TDF_BOUND) {
		/* Find CPU from bound runq. */
		KASSERT(SKE_RUNQ_PCPU(ts),
		    ("sched_add: bound td_sched not on cpu runq"));
		cpu = ts->ts_runq - &runq_pcpu[0];
		single_cpu = 1;
		CTR3(KTR_RUNQ,
		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
		    cpu);
	} else {
		CTR2(KTR_RUNQ,
		    "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts,
		    td);
		cpu = NOCPU;
		ts->ts_runq = &runq;
	}

	if (single_cpu && (cpu != PCPU_GET(cpuid))) {
		kick_other_cpu(td->td_priority, cpu);
	} else {
		if (!single_cpu) {
			cpumask_t me = PCPU_GET(cpumask);
			int idle = idle_cpus_mask & me;

			if (!idle && ((flags & SRQ_INTR) == 0) &&
			    (idle_cpus_mask & ~(hlt_cpus_mask | me)))
				forwarded = forward_wakeup(cpu);
		}

		if (!forwarded) {
			if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td))
				return;
			else
				maybe_resched(td);
		}
	}

	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_load_add();
	runq_add(ts->ts_runq, td, flags);
}
#else /* SMP */
{
	struct td_sched *ts;

	ts = td->td_sched;
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("sched_add: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("sched_add: bad thread state"));
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_add: thread swapped out"));
	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
	    td, td->td_name, td->td_priority, curthread,
	    curthread->td_name);
	/*
	 * Now that the thread is moving to the run-queue, set the lock
	 * to the scheduler's lock.
	 */
	if (td->td_lock != &sched_lock) {
		mtx_lock_spin(&sched_lock);
		thread_lock_set(td, &sched_lock);
	}
	TD_SET_RUNQ(td);
	CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
	ts->ts_runq = &runq;

	/*
	 * If we are yielding (on the way out anyhow) or the thread being
	 * saved is US, then don't try to be smart about preemption or
	 * kicking off another CPU, as it won't help and may hinder.
	 * In the YIELDING case, we are about to run whoever is being put
	 * in the queue anyhow, and in the OURSELF case, we are putting
	 * ourselves on the run queue, which also only happens when we
	 * are about to yield.
	 */
	if ((flags & SRQ_YIELDING) == 0) {
		if (maybe_preempt(td))
			return;
	}
	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_load_add();
	runq_add(ts->ts_runq, td, flags);
	maybe_resched(td);
}
#endif /* SMP */

void
sched_rem(struct thread *td)
{
	struct td_sched *ts;

	ts = td->td_sched;
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_rem: thread swapped out"));
	KASSERT(TD_ON_RUNQ(td),
	    ("sched_rem: thread not on run queue"));
	mtx_assert(&sched_lock, MA_OWNED);
	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
	    td, td->td_name, td->td_priority, curthread,
	    curthread->td_name);

	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_load_rem();
	runq_remove(ts->ts_runq, td);
	TD_SET_CAN_RUN(td);
}

/*
 * Select threads to run.
 * Notice that the running threads still consume a slot.
 */
struct thread *
sched_choose(void)
{
	struct thread *td;
	struct runq *rq;

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
	struct thread *tdcpu;

	rq = &runq;
	td = runq_choose_fuzz(&runq, runq_fuzz);
	tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);

	if (td == NULL ||
	    (tdcpu != NULL &&
	     tdcpu->td_priority < td->td_priority)) {
		CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu,
		    PCPU_GET(cpuid));
		td = tdcpu;
		rq = &runq_pcpu[PCPU_GET(cpuid)];
	} else {
		CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td);
	}

#else
	rq = &runq;
	td = runq_choose(&runq);
#endif

	if (td) {
		runq_remove(rq, td);
		td->td_flags |= TDF_DIDRUN;

		KASSERT(td->td_flags & TDF_INMEM,
		    ("sched_choose: thread swapped out"));
		return (td);
	}
	return (PCPU_GET(idlethread));
}

void
sched_preempt(struct thread *td)
{
	thread_lock(td);
	if (td->td_critnest > 1)
		td->td_owepreempt = 1;
	else
		mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL);
	thread_unlock(td);
}

void
sched_userret(struct thread *td)
{
	/*
	 * XXX we cheat slightly on the locking here to avoid locking in
	 * the usual case.  Setting td_priority here is essentially an
	 * incomplete workaround for not setting it properly elsewhere.
	 * Now that some interrupt handlers are threads, not setting it
	 * properly elsewhere can clobber it in the window between setting
	 * it here and returning to user mode, so don't waste time setting
	 * it perfectly here.
	 */
	KASSERT((td->td_flags & TDF_BORROWING) == 0,
	    ("thread with borrowed priority returning to userland"));
	if (td->td_priority != td->td_user_pri) {
		thread_lock(td);
		td->td_priority = td->td_user_pri;
		td->td_base_pri = td->td_user_pri;
		thread_unlock(td);
	}
}

void
sched_bind(struct thread *td, int cpu)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT(TD_IS_RUNNING(td),
	    ("sched_bind: cannot bind non-running thread"));

	ts = td->td_sched;

	td->td_flags |= TDF_BOUND;
#ifdef SMP
	ts->ts_runq = &runq_pcpu[cpu];
	if (PCPU_GET(cpuid) == cpu)
		return;

	mi_switch(SW_VOL, NULL);
#endif
}

void
sched_unbind(struct thread *td)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_flags &= ~TDF_BOUND;
}

int
sched_is_bound(struct thread *td)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	return (td->td_flags & TDF_BOUND);
}

void
sched_relinquish(struct thread *td)
{
	thread_lock(td);
	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
	thread_unlock(td);
}

int
sched_load(void)
{
	return (sched_tdcnt);
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	struct td_sched *ts;

	ts = td->td_sched;
	return (ts->ts_pctcpu);
}

void
sched_tick(void)
{
}

/*
 * The actual idle process.
 */
void
sched_idletd(void *dummy)
{

	for (;;) {
		mtx_assert(&Giant, MA_NOTOWNED);

		while (sched_runnable() == 0)
			cpu_idle(0);

		mtx_lock_spin(&sched_lock);
		mi_switch(SW_VOL | SWT_IDLE, NULL);
		mtx_unlock_spin(&sched_lock);
	}
}

/*
 * A CPU is entering for the first time or a thread is exiting.
 */
void
sched_throw(struct thread *td)
{
	/*
	 * Correct spinlock nesting.  The idle thread context that we are
	 * borrowing was created so that it would start out with a single
	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
	 * explicitly acquired locks in this function, the nesting count
	 * is now 2 rather than 1.  Since we are nested, calling
	 * spinlock_exit() will simply adjust the counts without allowing
	 * spin lock using code to interrupt us.
	 */
	if (td == NULL) {
		mtx_lock_spin(&sched_lock);
		spinlock_exit();
	} else {
		lock_profile_release_lock(&sched_lock.lock_object);
		MPASS(td->td_lock == &sched_lock);
	}
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
	PCPU_SET(switchtime, cpu_ticks());
	PCPU_SET(switchticks, ticks);
	cpu_throw(td, choosethread());	/* doesn't return */
}

void
sched_fork_exit(struct thread *td)
{

	/*
	 * Finish setting up thread glue so that it begins execution in a
	 * non-nested critical section with sched_lock held but not recursed.
	 */
	td->td_oncpu = PCPU_GET(cpuid);
	sched_lock.mtx_lock = (uintptr_t)td;
	lock_profile_obtain_lock_success(&sched_lock.lock_object,
	    0, 0, __FILE__, __LINE__);
	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
}

void
sched_affinity(struct thread *td)
{
}