/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/umtx.h>
#include <machine/pcb.h>
#include <machine/smp.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

/*
 * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
 * the range 100-256 Hz (approximately).
 */
#define	ESTCPULIM(e) \
    min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
    RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
#ifdef SMP
#define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
#else
#define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
#endif
#define	NICE_WEIGHT		1	/* Priorities per nice level. */
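
/*
 * Worked example (editorial note, not from the original source): with
 * NICE_WEIGHT == 1, RQ_PPQ == 4 and a nice range of PRIO_MIN..PRIO_MAX
 * == -20..20, the uniprocessor case gives
 *	ESTCPULIM(e) = min(e, 8 * (1 * 40 - 4) + 8 - 1) = min(e, 295),
 * i.e. td_estcpu saturates once it can no longer change the computed
 * user priority.  The exact bound depends on the PRIO_* and RQ_PPQ
 * values in the headers, so treat the numbers as illustrative.
 */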

/*
 * The schedulable entity that runs a context.
 * This is an extension to the thread structure and is tailored to
 * the requirements of this scheduler.
 */
struct td_sched {
	fixpt_t		ts_pctcpu;	/* (j) %cpu during p_swtime. */
	int		ts_cpticks;	/* (j) Ticks of cpu time. */
	int		ts_slptime;	/* (j) Seconds !RUNNING. */
	struct runq	*ts_runq;	/* runq the thread is currently on */
};

/* flags kept in td_flags */
#define	TDF_DIDRUN	TDF_SCHED0	/* thread actually ran. */
#define	TDF_BOUND	TDF_SCHED1	/* Bound to one CPU. */

#define	SKE_RUNQ_PCPU(ts)						\
    ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)

static struct td_sched td_sched0;
struct mtx sched_lock;

static int	sched_tdcnt;	/* Total runnable threads in the system. */
static int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
#define	SCHED_QUANTUM	(hz / 10)	/* Default sched quantum */

static void	setup_runqs(void);
static void	schedcpu(void);
static void	schedcpu_thread(void);
static void	sched_priority(struct thread *td, u_char prio);
static void	sched_setup(void *dummy);
static void	maybe_resched(struct thread *td);
static void	updatepri(struct thread *td);
static void	resetpriority(struct thread *td);
static void	resetpriority_thread(struct thread *td);
#ifdef SMP
static int	forward_wakeup(int cpunum);
#endif

static struct kproc_desc sched_kp = {
	"schedcpu",
	schedcpu_thread,
	NULL
};
SYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start,
    &sched_kp);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);

/*
 * Global run queue.
 */
static struct runq runq;

#ifdef SMP
/*
 * Per-CPU run queues
 */
static struct runq runq_pcpu[MAXCPU];
#endif

static void
setup_runqs(void)
{
#ifdef SMP
	int i;

	for (i = 0; i < MAXCPU; ++i)
		runq_init(&runq_pcpu[i]);
#endif

	runq_init(&runq);
}

static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val;

	new_val = sched_quantum * tick;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val < tick)
		return (EINVAL);
	sched_quantum = new_val / tick;
	hogticks = 2 * sched_quantum;
	return (0);
}

SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
    "Scheduler name");

SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
    0, sizeof sched_quantum, sysctl_kern_quantum, "I",
    "Roundrobin scheduling quantum in microseconds");
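
/*
 * Usage sketch (editorial, not part of the original file): the handler
 * above exports the quantum in microseconds while sched_quantum itself
 * is kept in ticks, so a written value is rounded down to a whole number
 * of ticks.  For example, with hz = 1000 (tick = 1000 us), an assumed
 * invocation of
 *	sysctl kern.sched.quantum=150000
 * stores sched_quantum = 150 ticks and hogticks = 300 ticks; values
 * smaller than one tick are rejected with EINVAL.
 */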

#ifdef SMP
/* Enable forwarding of wakeups to all other cpus */
SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP");

static int runq_fuzz = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");

static int forward_wakeup_enabled = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
    &forward_wakeup_enabled, 0,
    "Forwarding of wakeup to idle CPUs");

static int forward_wakeups_requested = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
    &forward_wakeups_requested, 0,
    "Requests for Forwarding of wakeup to idle CPUs");

static int forward_wakeups_delivered = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
    &forward_wakeups_delivered, 0,
    "Completed Forwarding of wakeup to idle CPUs");

static int forward_wakeup_use_mask = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
    &forward_wakeup_use_mask, 0,
    "Use the mask of idle cpus");

static int forward_wakeup_use_loop = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
    &forward_wakeup_use_loop, 0,
    "Use a loop to find idle cpus");

static int forward_wakeup_use_single = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW,
    &forward_wakeup_use_single, 0,
    "Only signal one idle cpu");

static int forward_wakeup_use_htt = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW,
    &forward_wakeup_use_htt, 0,
    "account for htt");

#endif
#if 0
static int sched_followon = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
    &sched_followon, 0,
    "allow threads to share a quantum");
#endif

static __inline void
sched_load_add(void)
{
	sched_tdcnt++;
	CTR1(KTR_SCHED, "global load: %d", sched_tdcnt);
}

static __inline void
sched_load_rem(void)
{
	sched_tdcnt--;
	CTR1(KTR_SCHED, "global load: %d", sched_tdcnt);
}

/*
 * Arrange to reschedule if necessary, taking the priorities and
 * schedulers into account.
 */
static void
maybe_resched(struct thread *td)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * This function is called when a thread is about to be put on a run queue
 * because it has been made runnable or its priority has been adjusted.  It
 * determines whether the current thread should be preempted immediately in
 * favor of the new thread.  If so, it switches to the new thread and
 * eventually returns true.  If not, it returns false so that the caller
 * may place the thread on an appropriate run queue.
 */
int
maybe_preempt(struct thread *td)
{
#ifdef PREEMPTION
	struct thread *ctd;
	int cpri, pri;
#endif

#ifdef PREEMPTION
	/*
	 * The new thread should not preempt the current thread if any of the
	 * following conditions are true:
	 *
	 *  - The kernel is in the throes of crashing (panicstr).
	 *  - The current thread has a higher (numerically lower) or
	 *    equivalent priority.  Note that this prevents curthread from
	 *    trying to preempt to itself.
	 *  - It is too early in the boot for context switches (cold is set).
	 *  - The current thread has an inhibitor set or is in the process of
	 *    exiting.  In this case, the current thread is about to switch
	 *    out anyways, so there's no point in preempting.  If we did,
	 *    the current thread would not be properly resumed as well, so
	 *    just avoid that whole landmine.
	 *  - If the new thread's priority is not a realtime priority and
	 *    the current thread's priority is not an idle priority and
	 *    FULL_PREEMPTION is disabled.
	 *
	 * If all of these conditions are false, but the current thread is in
	 * a nested critical section, then we have to defer the preemption
	 * until we exit the critical section.  Otherwise, switch immediately
	 * to the new thread.
	 */
	ctd = curthread;
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("maybe_preempt: trying to run inhibited thread"));
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
	    TD_IS_INHIBITED(ctd))
		return (0);
#ifndef FULL_PREEMPTION
	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
		return (0);
#endif

	if (ctd->td_critnest > 1) {
		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
		    ctd->td_critnest);
		ctd->td_owepreempt = 1;
		return (0);
	}
	/*
	 * Thread is runnable but not yet put on system run queue.
	 */
	MPASS(ctd->td_lock == td->td_lock);
	MPASS(TD_ON_RUNQ(td));
	TD_SET_RUNNING(td);
	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
	    td->td_proc->p_pid, td->td_name);
	SCHED_STAT_INC(switch_preempt);
	mi_switch(SW_INVOL | SW_PREEMPT, td);
	/*
	 * td's lock pointer may have changed.  We have to return with it
	 * locked.
	 */
	spinlock_enter();
	thread_unlock(ctd);
	thread_lock(td);
	spinlock_exit();
	return (1);
#else
	return (0);
#endif
}

/*
 * Constants for digital decay and forget:
 *	90% of (td_estcpu) usage in 5 * loadav time
 *	95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedclock() updates td_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of td_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		td_estcpu *= decay;
 * will compute
 * 	td_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *		exp(x) = 0! + x**1/1! + x**2/2! + ... ;
 *		therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1;
 *		therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *	ln(factor) =~ (-2.30/5*loadav), or
 *	factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	    exp(-1/b) =~ (b-1)/b =~ b/(b+1).			QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *	power*ln(b/(b+1)) =~ -2.30, or
 *	power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.	QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav:	1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
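
/*
 * Worked example (editorial, values illustrative): with a load average
 * of 1.0, loadfactor() gives 2.0 in fixed point, so decay_cpu()
 * multiplies td_estcpu by 2/(2+1) = 2/3 each second; after 5 seconds
 * roughly (2/3)^5 ~= 0.13 of the original estcpu remains, matching the
 * "90% in 5*loadav seconds" rule above (the table gives the exact power,
 * 5.68, needed to reach 10%).  Likewise ts_pctcpu is scaled by
 * ccpu = exp(-1/20) ~= 0.951 once a second in schedcpu(), which decays
 * about 95% of it away over 60 seconds (0.951^60 ~= 0.05).
 */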

/*
 * Recompute process priorities, every hz ticks.
 * MP-safe, called without the Giant mutex.
 */
/* ARGSUSED */
static void
schedcpu(void)
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct thread *td;
	struct proc *p;
	struct td_sched *ts;
	int awake, realstathz;

	realstathz = stathz ? stathz : hz;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		FOREACH_THREAD_IN_PROC(p, td) {
			awake = 0;
			thread_lock(td);
			ts = td->td_sched;
			/*
			 * Increment sleep time (if sleeping).  We
			 * ignore overflow, as above.
			 */
			/*
			 * The td_sched slptimes are not touched in wakeup
			 * because the thread may not HAVE everything in
			 * memory? XXX I think this is out of date.
			 */
			if (TD_ON_RUNQ(td)) {
				awake = 1;
				td->td_flags &= ~TDF_DIDRUN;
			} else if (TD_IS_RUNNING(td)) {
				awake = 1;
				/* Do not clear TDF_DIDRUN */
			} else if (td->td_flags & TDF_DIDRUN) {
				awake = 1;
				td->td_flags &= ~TDF_DIDRUN;
			}

			/*
			 * ts_pctcpu is only for ps and ttyinfo().
			 */
			ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
			/*
			 * If the td_sched has been idle the entire second,
			 * stop recalculating its priority until
			 * it wakes up.
			 */
			if (ts->ts_cpticks != 0) {
#if	(FSHIFT >= CCPU_SHIFT)
				ts->ts_pctcpu += (realstathz == 100)
				    ? ((fixpt_t) ts->ts_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) ts->ts_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
				ts->ts_pctcpu += ((FSCALE - ccpu) *
				    (ts->ts_cpticks *
				    FSCALE / realstathz)) >> FSHIFT;
#endif
				ts->ts_cpticks = 0;
			}
			/*
			 * If there are ANY running threads in this process,
			 * then don't count it as sleeping.
			 * XXX: this is broken.
			 */
			if (awake) {
				if (ts->ts_slptime > 1) {
					/*
					 * In an ideal world, this should not
					 * happen, because whoever woke us
					 * up from the long sleep should have
					 * unwound the slptime and reset our
					 * priority before we run at the stale
					 * priority.  Should KASSERT at some
					 * point when all the cases are fixed.
					 */
					updatepri(td);
				}
				ts->ts_slptime = 0;
			} else
				ts->ts_slptime++;
			if (ts->ts_slptime > 1) {
				thread_unlock(td);
				continue;
			}
			td->td_estcpu = decay_cpu(loadfac, td->td_estcpu);
			resetpriority(td);
			resetpriority_thread(td);
			thread_unlock(td);
		} /* end of thread loop */
		PROC_UNLOCK(p);
	} /* end of process loop */
	sx_sunlock(&allproc_lock);
}

/*
 * Main loop for a kthread that executes schedcpu once a second.
 */
static void
schedcpu_thread(void)
{

	for (;;) {
		schedcpu();
		pause("-", hz);
	}
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max td_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay td_estcpu to zero.
 */
static void
updatepri(struct thread *td)
{
	struct td_sched *ts;
	fixpt_t loadfac;
	unsigned int newcpu;

	ts = td->td_sched;
	loadfac = loadfactor(averunnable.ldavg[0]);
	if (ts->ts_slptime > 5 * loadfac)
		td->td_estcpu = 0;
	else {
		newcpu = td->td_estcpu;
		ts->ts_slptime--;	/* was incremented in schedcpu() */
		while (newcpu && --ts->ts_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		td->td_estcpu = newcpu;
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
resetpriority(struct thread *td)
{
	register unsigned int newpriority;

	if (td->td_pri_class == PRI_TIMESHARE) {
		newpriority = PUSER + td->td_estcpu / INVERSE_ESTCPU_WEIGHT +
		    NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
		    PRI_MAX_TIMESHARE);
		sched_user_prio(td, newpriority);
	}
}
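
/*
 * Worked example (editorial; the exact PUSER and PRI_*_TIMESHARE values
 * come from priority.h and may differ between releases): for a nice-0
 * thread the formula above is
 *	newpriority = PUSER + estcpu/INVERSE_ESTCPU_WEIGHT
 *	    + NICE_WEIGHT * (0 - (-20)),
 * so every INVERSE_ESTCPU_WEIGHT ticks of accumulated estcpu cost one
 * priority step and every nice level costs NICE_WEIGHT (== 1) steps,
 * with the result clamped to the timeshare range before being handed
 * to sched_user_prio().
 */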

/*
 * Update the thread's priority when the associated process's user
 * priority changes.
 */
static void
resetpriority_thread(struct thread *td)
{

	/* Only change threads with a time sharing user priority. */
	if (td->td_priority < PRI_MIN_TIMESHARE ||
	    td->td_priority > PRI_MAX_TIMESHARE)
		return;

	/* XXX the whole needresched thing is broken, but not silly. */
	maybe_resched(td);

	sched_prio(td, td->td_user_pri);
}

/* ARGSUSED */
static void
sched_setup(void *dummy)
{
	setup_runqs();

	if (sched_quantum == 0)
		sched_quantum = SCHED_QUANTUM;
	hogticks = 2 * sched_quantum;

	/* Account for thread0. */
	sched_load_add();
}

/* External interfaces start here */

/*
 * Very early in the boot some setup of scheduler-specific
 * parts of proc0 and of some scheduler resources needs to be done.
 * Called from:
 *  proc0_init()
 */
void
schedinit(void)
{
	/*
	 * Set up the scheduler specific parts of proc0.
	 */
	proc0.p_sched = NULL; /* XXX */
	thread0.td_sched = &td_sched0;
	thread0.td_lock = &sched_lock;
	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
}

int
sched_runnable(void)
{
#ifdef SMP
	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
#else
	return runq_check(&runq);
#endif
}

int
sched_rr_interval(void)
{
	if (sched_quantum == 0)
		sched_quantum = SCHED_QUANTUM;
	return (sched_quantum);
}

/*
 * We adjust the priority of the current process.  The priority of
 * a process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (td_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time td_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached).  The cpu usage
 * estimator ramps up quite quickly when the process is running
 * (linearly), and decays away exponentially, at a rate which is
 * proportionally slower when the system is busy.  The basic principle
 * is that the system will 90% forget that the process used a lot of
 * CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin
 * among other processes.
 */
void
sched_clock(struct thread *td)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td->td_sched;

	ts->ts_cpticks++;
	td->td_estcpu = ESTCPULIM(td->td_estcpu + 1);
	if ((td->td_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
		resetpriority(td);
		resetpriority_thread(td);
	}

	/*
	 * Force a context switch if the current thread has used up a full
	 * quantum (default quantum is 100ms).
	 */
	if (!TD_IS_IDLETHREAD(td) &&
	    ticks - PCPU_GET(switchticks) >= sched_quantum)
		td->td_flags |= TDF_NEEDRESCHED;
}
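
/*
 * Editorial note: with the default SCHED_QUANTUM of hz/10 ticks the test
 * above forces an involuntary switch after roughly 100 ms of continuous
 * running regardless of the hz setting, while the priority itself is
 * only recomputed every INVERSE_ESTCPU_WEIGHT estcpu ticks.
 */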

/*
 * Charge child's scheduling CPU usage to parent.
 */
void
sched_exit(struct proc *p, struct thread *td)
{

	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
	    td, td->td_name, td->td_priority);
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{

	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
	    child, child->td_name, child->td_priority);
	thread_lock(td);
	td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu);
	thread_unlock(td);
	mtx_lock_spin(&sched_lock);
	if ((child->td_proc->p_flag & P_NOLOAD) == 0)
		sched_load_rem();
	mtx_unlock_spin(&sched_lock);
}

void
sched_fork(struct thread *td, struct thread *childtd)
{
	sched_fork_thread(td, childtd);
}

void
sched_fork_thread(struct thread *td, struct thread *childtd)
{
	struct td_sched *ts;

	childtd->td_estcpu = td->td_estcpu;
	childtd->td_lock = &sched_lock;
	childtd->td_cpuset = cpuset_ref(td->td_cpuset);
	ts = childtd->td_sched;
	bzero(ts, sizeof(*ts));
}

void
sched_nice(struct proc *p, int nice)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	p->p_nice = nice;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		resetpriority(td);
		resetpriority_thread(td);
		thread_unlock(td);
	}
}

void
sched_class(struct thread *td, int class)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_pri_class = class;
}

/*
 * Adjust the priority of a thread.
 */
static void
sched_priority(struct thread *td, u_char prio)
{
	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
	    td, td->td_name, td->td_priority, prio, curthread,
	    curthread->td_name);

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_priority == prio)
		return;
	td->td_priority = prio;
	if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) {
		sched_rem(td);
		sched_add(td, SRQ_BORING);
	}
}

/*
 * Update a thread's priority when it is lent another thread's
 * priority.
 */
void
sched_lend_prio(struct thread *td, u_char prio)
{

	td->td_flags |= TDF_BORROWING;
	sched_priority(td, prio);
}

/*
 * Restore a thread's priority when priority propagation is
 * over.  The prio argument is the minimum priority the thread
 * needs to have to satisfy other possible priority lending
 * requests.  If the thread's regular priority is less
 * important than prio, the thread will keep a priority boost
 * of prio.
 */
void
sched_unlend_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
	    td->td_base_pri <= PRI_MAX_TIMESHARE)
		base_pri = td->td_user_pri;
	else
		base_pri = td->td_base_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_BORROWING;
		sched_prio(td, base_pri);
	} else
		sched_lend_prio(td, prio);
}

void
sched_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	/* First, update the base priority. */
	td->td_base_pri = prio;

	/*
	 * If the thread is borrowing another thread's priority, don't ever
	 * lower the priority.
	 */
	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
		return;

	/* Change the real priority. */
	oldprio = td->td_priority;
	sched_priority(td, prio);

	/*
	 * If the thread is on a turnstile, then let the turnstile update
	 * its state.
	 */
	if (TD_ON_LOCK(td) && oldprio != prio)
		turnstile_adjust(td, oldprio);
}

void
sched_user_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_base_user_pri = prio;
	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
		return;
	oldprio = td->td_user_pri;
	td->td_user_pri = prio;
}

void
sched_lend_user_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_flags |= TDF_UBORROWING;
	oldprio = td->td_user_pri;
	td->td_user_pri = prio;
}

void
sched_unlend_user_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	base_pri = td->td_base_user_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_UBORROWING;
		sched_user_prio(td, base_pri);
	} else {
		sched_lend_user_prio(td, prio);
	}
}

void
sched_sleep(struct thread *td, int pri)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_slptick = ticks;
	td->td_sched->ts_slptime = 0;
	if (pri)
		sched_prio(td, pri);
	if (TD_IS_SUSPENDED(td) || pri <= PSOCK)
		td->td_flags |= TDF_CANSWAP;
}

void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct td_sched *ts;
	struct proc *p;

	ts = td->td_sched;
	p = td->td_proc;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	/*
	 * Switch to the sched lock to fix things up and pick
	 * a new thread.
	 */
	if (td->td_lock != &sched_lock) {
		mtx_lock_spin(&sched_lock);
		thread_unlock(td);
	}

	if ((p->p_flag & P_NOLOAD) == 0)
		sched_load_rem();

	if (newtd)
		newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED);

	td->td_lastcpu = td->td_oncpu;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_owepreempt = 0;
	td->td_oncpu = NOCPU;
	/*
	 * At the last moment, if this thread is still marked RUNNING,
	 * then put it back on the run queue as it has not been suspended
	 * or stopped or any thing else similar.  We never put the idle
	 * threads on the run queue, however.
	 */
	if (td->td_flags & TDF_IDLETD) {
		TD_SET_CAN_RUN(td);
#ifdef SMP
		idle_cpus_mask &= ~PCPU_GET(cpumask);
#endif
	} else {
		if (TD_IS_RUNNING(td)) {
			/* Put us back on the run queue. */
			sched_add(td, (flags & SW_PREEMPT) ?
			    SRQ_OURSELF | SRQ_YIELDING | SRQ_PREEMPTED :
			    SRQ_OURSELF | SRQ_YIELDING);
		}
	}
	if (newtd) {
		/*
		 * The thread we are about to run needs to be counted
		 * as if it had been added to the run queue and selected.
		 * It came from:
		 *  * A preemption
		 *  * An upcall
		 *  * A followon
		 */
		KASSERT((newtd->td_inhibitors == 0),
		    ("trying to run inhibited thread"));
		newtd->td_flags |= TDF_DIDRUN;
		TD_SET_RUNNING(newtd);
		if ((newtd->td_proc->p_flag & P_NOLOAD) == 0)
			sched_load_add();
	} else {
		newtd = choosethread();
	}
	MPASS(newtd->td_lock == &sched_lock);

	if (td != newtd) {
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
		/* I feel sleepy */
		lock_profile_release_lock(&sched_lock.lock_object);
		cpu_switch(td, newtd, td->td_lock);
		lock_profile_obtain_lock_success(&sched_lock.lock_object,
		    0, 0, __FILE__, __LINE__);
		/*
		 * Where am I?  What year is it?
		 * We are in the same thread that went to sleep above,
		 * but any amount of time may have passed.  All our context
		 * will still be available, as will local variables.
		 * PCPU values, however, may have changed as we may have
		 * changed CPU, so don't trust cached values of them.
		 * New threads will go to fork_exit() instead of here,
		 * so if you change things here you may need to change
		 * things there too.
		 * If the thread above was exiting it will never wake
		 * up again here, so either it has saved everything it
		 * needed to, or the thread_wait() or wait() will
		 * need to reap it.
		 */
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
	}

#ifdef SMP
	if (td->td_flags & TDF_IDLETD)
		idle_cpus_mask |= PCPU_GET(cpumask);
#endif
	sched_lock.mtx_lock = (uintptr_t)td;
	td->td_oncpu = PCPU_GET(cpuid);
	MPASS(td->td_lock == &sched_lock);
}

void
sched_wakeup(struct thread *td)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td->td_sched;
	td->td_flags &= ~TDF_CANSWAP;
	if (ts->ts_slptime > 1) {
		updatepri(td);
		resetpriority(td);
	}
	td->td_slptick = ticks;
	ts->ts_slptime = 0;
	sched_add(td, SRQ_BORING);
}

#ifdef SMP
/* Enable HTT_2 if you have a 2-way HTT cpu. */
static int
forward_wakeup(int cpunum)
{
	cpumask_t map, me, dontuse;
	cpumask_t map2;
	struct pcpu *pc;
	cpumask_t id, map3;

	mtx_assert(&sched_lock, MA_OWNED);

	CTR0(KTR_RUNQ, "forward_wakeup()");

	if ((!forward_wakeup_enabled) ||
	    (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
		return (0);
	if (!smp_started || cold || panicstr)
		return (0);

	forward_wakeups_requested++;

	/*
	 * Check the idle mask we received against what we calculated
	 * before in the old version.
	 */
	me = PCPU_GET(cpumask);

	/*
	 * Don't bother if we should be doing it ourselves.
	 */
	if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum)))
		return (0);

	dontuse = me | stopped_cpus | hlt_cpus_mask;
	map3 = 0;
	if (forward_wakeup_use_loop) {
		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
			id = pc->pc_cpumask;
			if ((id & dontuse) == 0 &&
			    pc->pc_curthread == pc->pc_idlethread) {
				map3 |= id;
			}
		}
	}

	if (forward_wakeup_use_mask) {
		map = 0;
		map = idle_cpus_mask & ~dontuse;

		/* If they are both on, compare and use loop if different. */
		if (forward_wakeup_use_loop) {
			if (map != map3) {
				printf("map (%02X) != map3 (%02X)\n",
				    map, map3);
				map = map3;
			}
		}
	} else {
		map = map3;
	}

	/* If we only allow a specific CPU, then mask off all the others. */
	if (cpunum != NOCPU) {
		KASSERT((cpunum <= mp_maxcpus),
		    ("forward_wakeup: bad cpunum."));
		map &= (1 << cpunum);
	} else {
		/* Try to choose an idle die. */
		if (forward_wakeup_use_htt) {
			map2 = (map & (map >> 1)) & 0x5555;
			if (map2) {
				map = map2;
			}
		}

		/* Set only one bit. */
		if (forward_wakeup_use_single) {
			map = map & ((~map) + 1);
		}
	}
	if (map) {
		forward_wakeups_delivered++;
		ipi_selected(map, IPI_AST);
		return (1);
	}
	if (cpunum == NOCPU)
		printf("forward_wakeup: Idle processor not found\n");
	return (0);
}
#endif
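
/*
 * Editorial notes on the two bit tricks above (assuming HTT siblings get
 * adjacent logical CPU ids): "map2 = (map & (map >> 1)) & 0x5555" keeps a
 * bit only when both logical CPUs of a sibling pair are idle, preferring
 * packages whose sibling is also free, and "map = map & ((~map) + 1)" is
 * the usual two's-complement idiom that isolates the lowest set bit so
 * that only a single idle CPU receives the IPI when onecpu is enabled.
 */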

#ifdef SMP
static void kick_other_cpu(int pri, int cpuid);

static void
kick_other_cpu(int pri, int cpuid)
{
	struct pcpu *pcpu = pcpu_find(cpuid);
	int cpri = pcpu->pc_curthread->td_priority;

	if (idle_cpus_mask & pcpu->pc_cpumask) {
		forward_wakeups_delivered++;
		ipi_selected(pcpu->pc_cpumask, IPI_AST);
		return;
	}

	if (pri >= cpri)
		return;

#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
#if !defined(FULL_PREEMPTION)
	if (pri <= PRI_MAX_ITHD)
#endif /* ! FULL_PREEMPTION */
	{
		ipi_selected(pcpu->pc_cpumask, IPI_PREEMPT);
		return;
	}
#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */

	pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
	ipi_selected(pcpu->pc_cpumask, IPI_AST);
	return;
}
#endif /* SMP */

void
sched_add(struct thread *td, int flags)
#ifdef SMP
{
	struct td_sched *ts;
	int forwarded = 0;
	int cpu;
	int single_cpu = 0;

	ts = td->td_sched;
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("sched_add: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("sched_add: bad thread state"));
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_add: thread swapped out"));
	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
	    td, td->td_name, td->td_priority, curthread,
	    curthread->td_name);

	/*
	 * Now that the thread is moving to the run-queue, set the lock
	 * to the scheduler's lock.
	 */
	if (td->td_lock != &sched_lock) {
		mtx_lock_spin(&sched_lock);
		thread_lock_set(td, &sched_lock);
	}
	TD_SET_RUNQ(td);

	if (td->td_pinned != 0) {
		cpu = td->td_lastcpu;
		ts->ts_runq = &runq_pcpu[cpu];
		single_cpu = 1;
		CTR3(KTR_RUNQ,
		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq",
		    ts, td, cpu);
	} else if ((td)->td_flags & TDF_BOUND) {
		/* Find CPU from bound runq. */
		KASSERT(SKE_RUNQ_PCPU(ts),
		    ("sched_add: bound td_sched not on cpu runq"));
		cpu = ts->ts_runq - &runq_pcpu[0];
		single_cpu = 1;
		CTR3(KTR_RUNQ,
		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq",
		    ts, td, cpu);
	} else {
		CTR2(KTR_RUNQ,
		    "sched_add: adding td_sched:%p (td:%p) to gbl runq",
		    ts, td);
		cpu = NOCPU;
		ts->ts_runq = &runq;
	}

	if (single_cpu && (cpu != PCPU_GET(cpuid))) {
		kick_other_cpu(td->td_priority, cpu);
	} else {
		if (!single_cpu) {
			cpumask_t me = PCPU_GET(cpumask);
			int idle = idle_cpus_mask & me;

			if (!idle && ((flags & SRQ_INTR) == 0) &&
			    (idle_cpus_mask & ~(hlt_cpus_mask | me)))
				forwarded = forward_wakeup(cpu);
		}

		if (!forwarded) {
			if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td))
				return;
			else
				maybe_resched(td);
		}
	}

	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_load_add();
	runq_add(ts->ts_runq, td, flags);
}
#else /* SMP */
{
	struct td_sched *ts;

	ts = td->td_sched;
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("sched_add: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("sched_add: bad thread state"));
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_add: thread swapped out"));
	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
	    td, td->td_name, td->td_priority, curthread,
	    curthread->td_name);

	/*
	 * Now that the thread is moving to the run-queue, set the lock
	 * to the scheduler's lock.
	 */
	if (td->td_lock != &sched_lock) {
		mtx_lock_spin(&sched_lock);
		thread_lock_set(td, &sched_lock);
	}
	TD_SET_RUNQ(td);
	CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
	ts->ts_runq = &runq;

	/*
	 * If we are yielding (on the way out anyhow) or the thread
	 * being saved is US, then don't try to be smart about preemption
	 * or kicking off another CPU, as it won't help and may hinder.
	 * In the YIELDING case, we are about to run whoever is being
	 * put in the queue anyhow, and in the OURSELF case, we are
	 * putting ourselves on the run queue, which also only happens
	 * when we are about to yield.
	 */
	if ((flags & SRQ_YIELDING) == 0) {
		if (maybe_preempt(td))
			return;
	}
	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_load_add();
	runq_add(ts->ts_runq, td, flags);
	maybe_resched(td);
}
#endif /* SMP */

void
sched_rem(struct thread *td)
{
	struct td_sched *ts;

	ts = td->td_sched;
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_rem: thread swapped out"));
	KASSERT(TD_ON_RUNQ(td),
	    ("sched_rem: thread not on run queue"));
	mtx_assert(&sched_lock, MA_OWNED);
	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
	    td, td->td_name, td->td_priority, curthread,
	    curthread->td_name);

	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_load_rem();
	runq_remove(ts->ts_runq, td);
	TD_SET_CAN_RUN(td);
}

/*
 * Select threads to run.
 * Notice that the running threads still consume a slot.
 */
struct thread *
sched_choose(void)
{
	struct thread *td;
	struct runq *rq;

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
	struct thread *tdcpu;

	rq = &runq;
	td = runq_choose_fuzz(&runq, runq_fuzz);
	tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);

	if (td == NULL ||
	    (tdcpu != NULL &&
	     tdcpu->td_priority < td->td_priority)) {
		CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu,
		    PCPU_GET(cpuid));
		td = tdcpu;
		rq = &runq_pcpu[PCPU_GET(cpuid)];
	} else {
		CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td);
	}

#else
	rq = &runq;
	td = runq_choose(&runq);
#endif

	if (td) {
		runq_remove(rq, td);
		td->td_flags |= TDF_DIDRUN;

		KASSERT(td->td_flags & TDF_INMEM,
		    ("sched_choose: thread swapped out"));
		return (td);
	}
	return (PCPU_GET(idlethread));
}

void
sched_preempt(struct thread *td)
{
	thread_lock(td);
	if (td->td_critnest > 1)
		td->td_owepreempt = 1;
	else
		mi_switch(SW_INVOL | SW_PREEMPT, NULL);
	thread_unlock(td);
}

void
sched_userret(struct thread *td)
{
	/*
	 * XXX we cheat slightly on the locking here to avoid locking in
	 * the usual case.  Setting td_priority here is essentially an
	 * incomplete workaround for not setting it properly elsewhere.
	 * Now that some interrupt handlers are threads, not setting it
	 * properly elsewhere can clobber it in the window between setting
	 * it here and returning to user mode, so don't waste time setting
	 * it perfectly here.
	 */
	KASSERT((td->td_flags & TDF_BORROWING) == 0,
	    ("thread with borrowed priority returning to userland"));
	if (td->td_priority != td->td_user_pri) {
		thread_lock(td);
		td->td_priority = td->td_user_pri;
		td->td_base_pri = td->td_user_pri;
		thread_unlock(td);
	}
}

void
sched_bind(struct thread *td, int cpu)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT(TD_IS_RUNNING(td),
	    ("sched_bind: cannot bind non-running thread"));

	ts = td->td_sched;

	td->td_flags |= TDF_BOUND;
#ifdef SMP
	ts->ts_runq = &runq_pcpu[cpu];
	if (PCPU_GET(cpuid) == cpu)
		return;

	mi_switch(SW_VOL, NULL);
#endif
}

void
sched_unbind(struct thread *td)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_flags &= ~TDF_BOUND;
}

int
sched_is_bound(struct thread *td)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	return (td->td_flags & TDF_BOUND);
}

void
sched_relinquish(struct thread *td)
{
	thread_lock(td);
	SCHED_STAT_INC(switch_relinquish);
	mi_switch(SW_VOL, NULL);
	thread_unlock(td);
}

int
sched_load(void)
{
	return (sched_tdcnt);
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	struct td_sched *ts;

	ts = td->td_sched;
	return (ts->ts_pctcpu);
}

void
sched_tick(void)
{
}

/*
 * The actual idle process.
 */
void
sched_idletd(void *dummy)
{

	for (;;) {
		mtx_assert(&Giant, MA_NOTOWNED);

		while (sched_runnable() == 0)
			cpu_idle();

		mtx_lock_spin(&sched_lock);
		mi_switch(SW_VOL, NULL);
		mtx_unlock_spin(&sched_lock);
	}
}

/*
 * A CPU is entering for the first time or a thread is exiting.
 */
void
sched_throw(struct thread *td)
{
	/*
	 * Correct spinlock nesting.  The idle thread context that we are
	 * borrowing was created so that it would start out with a single
	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
	 * explicitly acquired locks in this function, the nesting count
	 * is now 2 rather than 1.  Since we are nested, calling
	 * spinlock_exit() will simply adjust the counts without allowing
	 * spin lock using code to interrupt us.
	 */
	if (td == NULL) {
		mtx_lock_spin(&sched_lock);
		spinlock_exit();
	} else {
		lock_profile_release_lock(&sched_lock.lock_object);
		MPASS(td->td_lock == &sched_lock);
	}
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
	PCPU_SET(switchtime, cpu_ticks());
	PCPU_SET(switchticks, ticks);
	cpu_throw(td, choosethread());	/* doesn't return */
}

void
sched_fork_exit(struct thread *td)
{

	/*
	 * Finish setting up thread glue so that it begins execution in a
	 * non-nested critical section with sched_lock held but not recursed.
	 */
	td->td_oncpu = PCPU_GET(cpuid);
	sched_lock.mtx_lock = (uintptr_t)td;
	lock_profile_obtain_lock_success(&sched_lock.lock_object,
	    0, 0, __FILE__, __LINE__);
	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
}

void
sched_affinity(struct thread *td)
{
}