/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/umtx.h>
#include <machine/pcb.h>
#include <machine/smp.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

/*
 * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
 * the range 100-256 Hz (approximately).
 */
#define	ESTCPULIM(e) \
    min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
    RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
#ifdef SMP
#define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
#else
#define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
#endif
#define	NICE_WEIGHT		1	/* Priorities per nice level. */
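
/*
 * Illustrative bound (an editorial addition; it assumes the stock values
 * PRIO_MIN = -20, PRIO_MAX = 20 and RQ_PPQ = 4 defined elsewhere in the
 * kernel): in the UP case above, ESTCPULIM() caps td_estcpu at
 * 8 * (1 * 40 - 4) + 8 - 1 = 295, which is what keeps the user priority
 * computed in resetpriority() from wandering out of the timeshare range.
 */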

/*
 * The schedulable entity that runs a context.
 * This is an extension to the thread structure and is tailored to
 * the requirements of this scheduler.
 */
struct td_sched {
	TAILQ_ENTRY(td_sched) ts_procq;	/* (j/z) Run queue. */
	struct thread	*ts_thread;	/* (*) Active associated thread. */
	fixpt_t		ts_pctcpu;	/* (j) %cpu during p_swtime. */
	u_char		ts_rqindex;	/* (j) Run queue index. */
	enum {
		TSS_THREAD = 0x0,	/* slaved to thread state */
		TSS_ONRUNQ
	} ts_state;			/* (j) TD_STAT in scheduler status. */
	int		ts_cpticks;	/* (j) Ticks of cpu time. */
	struct runq	*ts_runq;	/* runq the thread is currently on */
};

/* flags kept in td_flags */
#define	TDF_DIDRUN	TDF_SCHED0	/* thread actually ran. */
#define	TDF_EXIT	TDF_SCHED1	/* thread is being killed. */
#define	TDF_BOUND	TDF_SCHED2

#define	ts_flags	ts_thread->td_flags
#define	TSF_DIDRUN	TDF_DIDRUN	/* thread actually ran. */
#define	TSF_EXIT	TDF_EXIT	/* thread is being killed. */
#define	TSF_BOUND	TDF_BOUND	/* stuck to one CPU */

#define	SKE_RUNQ_PCPU(ts)						\
    ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)

static struct td_sched td_sched0;

static int	sched_tdcnt;	/* Total runnable threads in the system. */
static int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
#define	SCHED_QUANTUM	(hz / 10)	/* Default sched quantum */

static struct callout roundrobin_callout;

static struct td_sched *sched_choose(void);

static void	setup_runqs(void);
static void	roundrobin(void *arg);
static void	schedcpu(void);
static void	schedcpu_thread(void);
static void	sched_priority(struct thread *td, u_char prio);
static void	sched_setup(void *dummy);
static void	maybe_resched(struct thread *td);
static void	updatepri(struct thread *td);
static void	resetpriority(struct thread *td);
static void	resetpriority_thread(struct thread *td);
#ifdef SMP
static int	forward_wakeup(int cpunum);
#endif

static struct kproc_desc sched_kp = {
	"schedcpu",
	schedcpu_thread,
	NULL
};
SYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start, &sched_kp)
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

/*
 * Global run queue.
 */
static struct runq runq;

#ifdef SMP
/*
 * Per-CPU run queues.
 */
static struct runq runq_pcpu[MAXCPU];
#endif

static void
setup_runqs(void)
{
#ifdef SMP
	int i;

	for (i = 0; i < MAXCPU; ++i)
		runq_init(&runq_pcpu[i]);
#endif

	runq_init(&runq);
}

static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val;

	new_val = sched_quantum * tick;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val < tick)
		return (EINVAL);
	sched_quantum = new_val / tick;
	hogticks = 2 * sched_quantum;
	return (0);
}

SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
    "Scheduler name");

SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
    0, sizeof sched_quantum, sysctl_kern_quantum, "I",
    "Roundrobin scheduling quantum in microseconds");

#ifdef SMP
/* Enable forwarding of wakeups to all other cpus */
SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP");

static int forward_wakeup_enabled = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
    &forward_wakeup_enabled, 0,
    "Forwarding of wakeup to idle CPUs");

static int forward_wakeups_requested = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
    &forward_wakeups_requested, 0,
    "Requests for Forwarding of wakeup to idle CPUs");

static int forward_wakeups_delivered = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
    &forward_wakeups_delivered, 0,
    "Completed Forwarding of wakeup to idle CPUs");

static int forward_wakeup_use_mask = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
    &forward_wakeup_use_mask, 0,
    "Use the mask of idle cpus");

static int forward_wakeup_use_loop = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
    &forward_wakeup_use_loop, 0,
    "Use a loop to find idle cpus");

static int forward_wakeup_use_single = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW,
    &forward_wakeup_use_single, 0,
    "Only signal one idle cpu");

static int forward_wakeup_use_htt = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW,
    &forward_wakeup_use_htt, 0,
    "account for htt");

#endif
#if 0
static int sched_followon = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
    &sched_followon, 0,
    "allow threads to share a quantum");
#endif

static __inline void
sched_load_add(void)
{
	sched_tdcnt++;
	CTR1(KTR_SCHED, "global load: %d", sched_tdcnt);
}

static __inline void
sched_load_rem(void)
{
	sched_tdcnt--;
	CTR1(KTR_SCHED, "global load: %d", sched_tdcnt);
}

/*
 * Arrange to reschedule if necessary, taking the priorities and
 * schedulers into account.
 */
static void
maybe_resched(struct thread *td)
{

	mtx_assert(&sched_lock, MA_OWNED);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Force switch among equal priority processes every 100ms.
 * We don't actually need to force a context switch of the current process.
 * The act of firing the event triggers a context switch to softclock() and
 * then switching back out again which is equivalent to a preemption, thus
 * no further work is needed on the local CPU.
 */
/* ARGSUSED */
static void
roundrobin(void *arg)
{

#ifdef SMP
	mtx_lock_spin(&sched_lock);
	forward_roundrobin();
	mtx_unlock_spin(&sched_lock);
#endif

	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
}

/*
 * Constants for digital decay and forget:
 *	90% of (td_estcpu) usage in 5 * loadav time
 *	95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
 *	Note that, as ps(1) mentions, this can let percentages
 *	total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedclock() updates td_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of td_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 *	for (i = 0; i < (5 * loadavg); i++)
 *		td_estcpu *= decay;
 * will compute
 *	td_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 *	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 *	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 *	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 *	b = 2 * loadavg
 * then
 *	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *		exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *		therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1
 *		therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *	Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	  solving for factor,
 *	  ln(factor) =~ (-2.30/5*loadav), or
 *	  factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	      exp(-1/b) =~ (b-1)/b =~ b/(b+1).			QED
 *
 * Proof of (2):
 *	Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	  solving for power,
 *	  power*ln(b/(b+1)) =~ -2.30, or
 *	  power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav: 1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
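
/*
 * Worked example (illustrative only, added for clarity): with a load average
 * of 2.0, loadfactor() yields 4 * FSCALE, so each once-per-second
 * decay_cpu() step scales td_estcpu by 4 * FSCALE / (4 * FSCALE + FSCALE),
 * i.e. 4/5.  After 5 * loadavg = 10 such steps, (4/5)^10 ~= 0.107, so
 * roughly 90% of the accumulated estcpu has been forgotten, matching the
 * "90% in 5 * loadav seconds" goal stated above.
 */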

/* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11

/*
 * Recompute process priorities, every hz ticks.
 * MP-safe, called without the Giant mutex.
 */
/* ARGSUSED */
static void
schedcpu(void)
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct thread *td;
	struct proc *p;
	struct td_sched *ts;
	int awake, realstathz;

	realstathz = stathz ? stathz : hz;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		/*
		 * Prevent state changes and protect run queue.
		 */
		mtx_lock_spin(&sched_lock);
		/*
		 * Increment time in/out of memory.  We ignore overflow; with
		 * 16-bit int's (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		FOREACH_THREAD_IN_PROC(p, td) {
			awake = 0;
			ts = td->td_sched;
			/*
			 * Increment sleep time (if sleeping).  We
			 * ignore overflow, as above.
			 */
			/*
			 * The td_sched slptimes are not touched in wakeup
			 * because the thread may not HAVE everything in
			 * memory? XXX I think this is out of date.
			 */
			if (ts->ts_state == TSS_ONRUNQ) {
				awake = 1;
				ts->ts_flags &= ~TSF_DIDRUN;
			} else if ((ts->ts_state == TSS_THREAD) &&
			    (TD_IS_RUNNING(td))) {
				awake = 1;
				/* Do not clear TSF_DIDRUN */
			} else if (ts->ts_flags & TSF_DIDRUN) {
				awake = 1;
				ts->ts_flags &= ~TSF_DIDRUN;
			}

			/*
			 * ts_pctcpu is only for ps and ttyinfo().
			 * Do it per td_sched, and add them up at the end?
			 * XXXKSE
			 */
			ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
			/*
			 * If the td_sched has been idle the entire second,
			 * stop recalculating its priority until
			 * it wakes up.
			 */
			if (ts->ts_cpticks != 0) {
#if	(FSHIFT >= CCPU_SHIFT)
				ts->ts_pctcpu += (realstathz == 100)
				    ? ((fixpt_t) ts->ts_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) ts->ts_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
				ts->ts_pctcpu += ((FSCALE - ccpu) *
				    (ts->ts_cpticks *
				    FSCALE / realstathz)) >> FSHIFT;
#endif
				ts->ts_cpticks = 0;
			}
			/*
			 * If there are ANY running threads in this process,
			 * then don't count it as sleeping.
			 * XXX: this is broken.
			 */
			if (awake) {
				if (p->p_slptime > 1) {
					/*
					 * In an ideal world, this should not
					 * happen, because whoever woke us
					 * up from the long sleep should have
					 * unwound the slptime and reset our
					 * priority before we run at the stale
					 * priority.  Should KASSERT at some
					 * point when all the cases are fixed.
					 */
					updatepri(td);
				}
				td->td_slptime = 0;
			} else
				td->td_slptime++;
			if (td->td_slptime > 1)
				continue;
			td->td_estcpu = decay_cpu(loadfac, td->td_estcpu);
			resetpriority(td);
			resetpriority_thread(td);
		} /* end of thread loop */
		mtx_unlock_spin(&sched_lock);
	} /* end of process loop */
	sx_sunlock(&allproc_lock);
}

/*
 * Main loop for a kthread that executes schedcpu once a second.
 */
static void
schedcpu_thread(void)
{
	int nowake;

	for (;;) {
		schedcpu();
		tsleep(&nowake, 0, "-", hz);
	}
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max td_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay td_estcpu to zero.
 */
static void
updatepri(struct thread *td)
{
	register fixpt_t loadfac;
	register unsigned int newcpu;

	loadfac = loadfactor(averunnable.ldavg[0]);
	if (td->td_slptime > 5 * loadfac)
		td->td_estcpu = 0;
	else {
		newcpu = td->td_estcpu;
		td->td_slptime--;	/* was incremented in schedcpu() */
		while (newcpu && --td->td_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		td->td_estcpu = newcpu;
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
resetpriority(struct thread *td)
{
	register unsigned int newpriority;

	if (td->td_pri_class == PRI_TIMESHARE) {
		newpriority = PUSER + td->td_estcpu / INVERSE_ESTCPU_WEIGHT +
		    NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
		    PRI_MAX_TIMESHARE);
		sched_user_prio(td, newpriority);
	}
}

/*
 * Update the thread's priority when the associated process's user
 * priority changes.
 */
static void
resetpriority_thread(struct thread *td)
{

	/* Only change threads with a time sharing user priority. */
	if (td->td_priority < PRI_MIN_TIMESHARE ||
	    td->td_priority > PRI_MAX_TIMESHARE)
		return;

	/* XXX the whole needresched thing is broken, but not silly. */
	maybe_resched(td);

	sched_prio(td, td->td_user_pri);
}

/* ARGSUSED */
static void
sched_setup(void *dummy)
{
	setup_runqs();

	if (sched_quantum == 0)
		sched_quantum = SCHED_QUANTUM;
	hogticks = 2 * sched_quantum;

	callout_init(&roundrobin_callout, CALLOUT_MPSAFE);

	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);

	/* Account for thread0. */
	sched_load_add();
}

/* External interfaces start here */

/*
 * Very early in the boot some setup of scheduler-specific
 * parts of proc0 and of some scheduler resources needs to be done.
 * Called from:
 *  proc0_init()
 */
void
schedinit(void)
{
	/*
	 * Set up the scheduler specific parts of proc0.
	 */
	proc0.p_sched = NULL; /* XXX */
	thread0.td_sched = &td_sched0;
	td_sched0.ts_thread = &thread0;
	td_sched0.ts_state = TSS_THREAD;
}

int
sched_runnable(void)
{
#ifdef SMP
	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
#else
	return runq_check(&runq);
#endif
}

int
sched_rr_interval(void)
{
	if (sched_quantum == 0)
		sched_quantum = SCHED_QUANTUM;
	return (sched_quantum);
}
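
/*
 * Illustrative numbers (an editorial addition; it assumes a statclock rate
 * of 128 Hz and the UP INVERSE_ESTCPU_WEIGHT of 8): a thread that runs
 * continuously gains roughly 128 units of td_estcpu per second in
 * sched_clock() below, so resetpriority() sees its user priority worsen by
 * about 16 steps per second of CPU time, until the once-a-second decay in
 * schedcpu() and the ESTCPULIM()/PRI_MAX_TIMESHARE clamps catch up with it.
 */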

/*
 * We adjust the priority of the current process.  The priority of
 * a process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (td_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time td_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT
 * (until MAXPRI is reached).  The cpu usage estimator ramps up
 * quite quickly when the process is running (linearly), and decays
 * away exponentially, at a rate which is proportionally slower when
 * the system is busy.  The basic principle is that the system will
 * 90% forget that the process used a lot of CPU time in 5 * loadav
 * seconds.  This causes the system to favor processes which haven't
 * run much recently, and to round-robin among other processes.
 */
void
sched_clock(struct thread *td)
{
	struct td_sched *ts;

	mtx_assert(&sched_lock, MA_OWNED);
	ts = td->td_sched;

	ts->ts_cpticks++;
	td->td_estcpu = ESTCPULIM(td->td_estcpu + 1);
	if ((td->td_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
		resetpriority(td);
		resetpriority_thread(td);
	}
}

/*
 * Charge a child's scheduling CPU usage to the parent.
 */
void
sched_exit(struct proc *p, struct thread *td)
{

	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
	    td, td->td_proc->p_comm, td->td_priority);

	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
	struct proc *childproc = child->td_proc;

	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
	    child, childproc->p_comm, child->td_priority);
	td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu);
	childproc->p_estcpu = ESTCPULIM(childproc->p_estcpu +
	    child->td_estcpu);
	if ((child->td_proc->p_flag & P_NOLOAD) == 0)
		sched_load_rem();
}

void
sched_fork(struct thread *td, struct thread *childtd)
{
	sched_fork_thread(td, childtd);
}

void
sched_fork_thread(struct thread *td, struct thread *childtd)
{
	childtd->td_estcpu = td->td_estcpu;
	sched_newthread(childtd);
}

void
sched_nice(struct proc *p, int nice)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	p->p_nice = nice;
	FOREACH_THREAD_IN_PROC(p, td) {
		resetpriority(td);
		resetpriority_thread(td);
	}
}

void
sched_class(struct thread *td, int class)
{
	mtx_assert(&sched_lock, MA_OWNED);
	td->td_pri_class = class;
}

/*
 * Adjust the priority of a thread.
 */
static void
sched_priority(struct thread *td, u_char prio)
{
	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
	    curthread->td_proc->p_comm);

	mtx_assert(&sched_lock, MA_OWNED);
	if (td->td_priority == prio)
		return;
	if (TD_ON_RUNQ(td)) {
		adjustrunqueue(td, prio);
	} else {
		td->td_priority = prio;
	}
}

/*
 * Update a thread's priority when it is lent another thread's
 * priority.
 */
void
sched_lend_prio(struct thread *td, u_char prio)
{

	td->td_flags |= TDF_BORROWING;
	sched_priority(td, prio);
}

/*
 * Restore a thread's priority when priority propagation is
 * over.  The prio argument is the minimum priority the thread
 * needs to have to satisfy other possible priority lending
 * requests.  If the thread's regular priority is less
 * important than prio, the thread will keep a priority boost
 * of prio.
 */
void
sched_unlend_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
	    td->td_base_pri <= PRI_MAX_TIMESHARE)
		base_pri = td->td_user_pri;
	else
		base_pri = td->td_base_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_BORROWING;
		sched_prio(td, base_pri);
	} else
		sched_lend_prio(td, prio);
}

void
sched_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	/* First, update the base priority. */
	td->td_base_pri = prio;

	/*
	 * If the thread is borrowing another thread's priority, don't ever
	 * lower the priority.
	 */
	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
		return;

	/* Change the real priority. */
	oldprio = td->td_priority;
	sched_priority(td, prio);

	/*
	 * If the thread is on a turnstile, then let the turnstile update
	 * its state.
	 */
	if (TD_ON_LOCK(td) && oldprio != prio)
		turnstile_adjust(td, oldprio);
}

void
sched_user_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	td->td_base_user_pri = prio;
	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
		return;
	oldprio = td->td_user_pri;
	td->td_user_pri = prio;

	if (TD_ON_UPILOCK(td) && oldprio != prio)
		umtx_pi_adjust(td, oldprio);
}

void
sched_lend_user_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	td->td_flags |= TDF_UBORROWING;

	oldprio = td->td_user_pri;
	td->td_user_pri = prio;

	if (TD_ON_UPILOCK(td) && oldprio != prio)
		umtx_pi_adjust(td, oldprio);
}

void
sched_unlend_user_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	base_pri = td->td_base_user_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_UBORROWING;
		sched_user_prio(td, base_pri);
	} else
		sched_lend_user_prio(td, prio);
}

void
sched_sleep(struct thread *td)
{

	mtx_assert(&sched_lock, MA_OWNED);
	td->td_slptime = 0;
}

void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct td_sched *ts;
	struct proc *p;

	ts = td->td_sched;
	p = td->td_proc;

	mtx_assert(&sched_lock, MA_OWNED);

	if ((p->p_flag & P_NOLOAD) == 0)
		sched_load_rem();
#if 0
	/*
	 * We are volunteering to switch out so we get to nominate
	 * a successor for the rest of our quantum.
	 * First try another thread in our process.
	 *
	 * This is too expensive to do without per-process run queues,
	 * so skip it for now.
	 * XXX keep this comment as a marker.
	 */
	if (sched_followon &&
	    (p->p_flag & P_HADTHREADS) &&
	    (flags & SW_VOL) &&
	    newtd == NULL)
		newtd = mumble();
#endif

	if (newtd)
		newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED);

	td->td_lastcpu = td->td_oncpu;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_owepreempt = 0;
	td->td_oncpu = NOCPU;
	/*
	 * At the last moment, if this thread is still marked RUNNING,
	 * then put it back on the run queue as it has not been suspended
	 * or stopped or anything else similar.  We never put the idle
	 * threads on the run queue, however.
	 */
	if (td == PCPU_GET(idlethread))
		TD_SET_CAN_RUN(td);
	else {
		if (TD_IS_RUNNING(td)) {
			/* Put us back on the run queue. */
			setrunqueue(td, (flags & SW_PREEMPT) ?
			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
			    SRQ_OURSELF|SRQ_YIELDING);
		}
	}
	if (newtd) {
		/*
		 * The thread we are about to run needs to be counted
		 * as if it had been added to the run queue and selected.
		 * It came from:
		 * * A preemption
		 * * An upcall
		 * * A followon
		 */
		KASSERT((newtd->td_inhibitors == 0),
		    ("trying to run inhibited thread"));
		newtd->td_sched->ts_flags |= TSF_DIDRUN;
		TD_SET_RUNNING(newtd);
		if ((newtd->td_proc->p_flag & P_NOLOAD) == 0)
			sched_load_add();
	} else {
		newtd = choosethread();
	}

	if (td != newtd) {
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif

		cpu_switch(td, newtd);
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
	}

	sched_lock.mtx_lock = (uintptr_t)td;
	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	if (td->td_slptime > 1) {
		updatepri(td);
		resetpriority(td);
	}
	td->td_slptime = 0;
	setrunqueue(td, SRQ_BORING);
}

#ifdef SMP
/* Enable HTT_2 if you have a 2-way HTT cpu. */
static int
forward_wakeup(int cpunum)
{
	cpumask_t map, me, dontuse;
	cpumask_t map2;
	struct pcpu *pc;
	cpumask_t id, map3;

	mtx_assert(&sched_lock, MA_OWNED);

	CTR0(KTR_RUNQ, "forward_wakeup()");

	if ((!forward_wakeup_enabled) ||
	    (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
		return (0);
	if (!smp_started || cold || panicstr)
		return (0);

	forward_wakeups_requested++;

	/*
	 * Check the idle mask we received against what we calculated
	 * before in the old version.
	 */
	me = PCPU_GET(cpumask);
	/*
	 * Don't bother if we should be doing it ourselves.
	 */
	if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum)))
		return (0);

	dontuse = me | stopped_cpus | hlt_cpus_mask;
	map3 = 0;
	if (forward_wakeup_use_loop) {
		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
			id = pc->pc_cpumask;
			if ((id & dontuse) == 0 &&
			    pc->pc_curthread == pc->pc_idlethread) {
				map3 |= id;
			}
		}
	}

	if (forward_wakeup_use_mask) {
		map = 0;
		map = idle_cpus_mask & ~dontuse;

		/* If they are both on, compare and use loop if different. */
		if (forward_wakeup_use_loop) {
			if (map != map3) {
				printf("map (%02X) != map3 (%02X)\n",
				    map, map3);
				map = map3;
			}
		}
	} else {
		map = map3;
	}

	/* If we only allow a specific CPU, then mask off all the others. */
	if (cpunum != NOCPU) {
		KASSERT((cpunum <= mp_maxcpus),
		    ("forward_wakeup: bad cpunum."));
		map &= (1 << cpunum);
	} else {
		/* Try to choose an idle die. */
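		/*
		 * Illustrative note (an editorial addition; it assumes 2-way
		 * HTT with sibling logical CPUs numbered 2k and 2k+1): in
		 * the htt2 case below, map & (map >> 1) has bit 2k set only
		 * when both siblings are idle, and masking with 0x5555 keeps
		 * one bit per such fully idle physical CPU, so the wakeup
		 * IPI prefers packages that are running nothing at all.
		 */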
		if (forward_wakeup_use_htt) {
			map2 = (map & (map >> 1)) & 0x5555;
			if (map2) {
				map = map2;
			}
		}

		/* Set only one bit. */
		if (forward_wakeup_use_single) {
			map = map & ((~map) + 1);
		}
	}
	if (map) {
		forward_wakeups_delivered++;
		ipi_selected(map, IPI_AST);
		return (1);
	}
	if (cpunum == NOCPU)
		printf("forward_wakeup: Idle processor not found\n");
	return (0);
}
#endif

#ifdef SMP
static void kick_other_cpu(int pri, int cpuid);

static void
kick_other_cpu(int pri, int cpuid)
{
	struct pcpu *pcpu = pcpu_find(cpuid);
	int cpri = pcpu->pc_curthread->td_priority;

	if (idle_cpus_mask & pcpu->pc_cpumask) {
		forward_wakeups_delivered++;
		ipi_selected(pcpu->pc_cpumask, IPI_AST);
		return;
	}

	if (pri >= cpri)
		return;

#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
#if !defined(FULL_PREEMPTION)
	if (pri <= PRI_MAX_ITHD)
#endif /* ! FULL_PREEMPTION */
	{
		ipi_selected(pcpu->pc_cpumask, IPI_PREEMPT);
		return;
	}
#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */

	pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
	ipi_selected(pcpu->pc_cpumask, IPI_AST);
	return;
}
#endif /* SMP */

void
sched_add(struct thread *td, int flags)
#ifdef SMP
{
	struct td_sched *ts;
	int forwarded = 0;
	int cpu;
	int single_cpu = 0;

	ts = td->td_sched;
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT(ts->ts_state != TSS_ONRUNQ,
	    ("sched_add: td_sched %p (%s) already in run queue", ts,
	    td->td_proc->p_comm));
	KASSERT(td->td_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);

	if (td->td_pinned != 0) {
		cpu = td->td_lastcpu;
		ts->ts_runq = &runq_pcpu[cpu];
		single_cpu = 1;
		CTR3(KTR_RUNQ,
		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu);
	} else if ((ts)->ts_flags & TSF_BOUND) {
		/* Find CPU from bound runq. */
		KASSERT(SKE_RUNQ_PCPU(ts),
		    ("sched_add: bound td_sched not on cpu runq"));
		cpu = ts->ts_runq - &runq_pcpu[0];
		single_cpu = 1;
		CTR3(KTR_RUNQ,
		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu);
	} else {
		CTR2(KTR_RUNQ,
		    "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts, td);
		cpu = NOCPU;
		ts->ts_runq = &runq;
	}

	if (single_cpu && (cpu != PCPU_GET(cpuid))) {
		kick_other_cpu(td->td_priority, cpu);
	} else {
		if (!single_cpu) {
			cpumask_t me = PCPU_GET(cpumask);
			int idle = idle_cpus_mask & me;

			if (!idle && ((flags & SRQ_INTR) == 0) &&
			    (idle_cpus_mask & ~(hlt_cpus_mask | me)))
				forwarded = forward_wakeup(cpu);
		}

		if (!forwarded) {
			if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td))
				return;
			else
				maybe_resched(td);
		}
	}

	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_load_add();
	runq_add(ts->ts_runq, ts, flags);
	ts->ts_state = TSS_ONRUNQ;
}
#else /* SMP */
{
	struct td_sched *ts;

	ts = td->td_sched;
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT(ts->ts_state != TSS_ONRUNQ,
	    ("sched_add: td_sched %p (%s) already in run queue", ts,
	    td->td_proc->p_comm));
	KASSERT(td->td_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);
	CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
	ts->ts_runq = &runq;

	/*
	 * If we are yielding (on the way out anyhow) or the thread
	 * being saved is US, then don't try to be smart about preemption
	 * or kicking off another CPU, as it won't help and may hinder.
	 * In the YIELDING case, we are about to run whoever is being
	 * put in the queue anyhow, and in the OURSELF case, we are
	 * putting ourselves on the run queue, which also only happens
	 * when we are about to yield.
	 */
	if ((flags & SRQ_YIELDING) == 0) {
		if (maybe_preempt(td))
			return;
	}
	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_load_add();
	runq_add(ts->ts_runq, ts, flags);
	ts->ts_state = TSS_ONRUNQ;
	maybe_resched(td);
}
#endif /* SMP */

void
sched_rem(struct thread *td)
{
	struct td_sched *ts;

	ts = td->td_sched;
	KASSERT(td->td_proc->p_sflag & PS_INMEM,
	    ("sched_rem: process swapped out"));
	KASSERT((ts->ts_state == TSS_ONRUNQ),
	    ("sched_rem: thread not on run queue"));
	mtx_assert(&sched_lock, MA_OWNED);
	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);

	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_load_rem();
	runq_remove(ts->ts_runq, ts);

	ts->ts_state = TSS_THREAD;
}

/*
 * Select threads to run.
 * Notice that the running threads still consume a slot.
 */
struct td_sched *
sched_choose(void)
{
	struct td_sched *ts;
	struct runq *rq;

#ifdef SMP
	struct td_sched *kecpu;

	rq = &runq;
	ts = runq_choose(&runq);
	kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);

	if (ts == NULL ||
	    (kecpu != NULL &&
	    kecpu->ts_thread->td_priority < ts->ts_thread->td_priority)) {
		CTR2(KTR_RUNQ, "choosing td_sched %p from pcpu runq %d", kecpu,
		    PCPU_GET(cpuid));
		ts = kecpu;
		rq = &runq_pcpu[PCPU_GET(cpuid)];
	} else {
		CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", ts);
	}

#else
	rq = &runq;
	ts = runq_choose(&runq);
#endif

	if (ts) {
		runq_remove(rq, ts);
		ts->ts_state = TSS_THREAD;

		KASSERT(ts->ts_thread->td_proc->p_sflag & PS_INMEM,
		    ("sched_choose: process swapped out"));
	}
	return (ts);
}

void
sched_userret(struct thread *td)
{
	/*
	 * XXX we cheat slightly on the locking here to avoid locking in
	 * the usual case.  Setting td_priority here is essentially an
	 * incomplete workaround for not setting it properly elsewhere.
	 * Now that some interrupt handlers are threads, not setting it
	 * properly elsewhere can clobber it in the window between setting
	 * it here and returning to user mode, so don't waste time setting
	 * it perfectly here.
	 */
	KASSERT((td->td_flags & TDF_BORROWING) == 0,
	    ("thread with borrowed priority returning to userland"));
	if (td->td_priority != td->td_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = td->td_user_pri;
		td->td_base_pri = td->td_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

void
sched_bind(struct thread *td, int cpu)
{
	struct td_sched *ts;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT(TD_IS_RUNNING(td),
	    ("sched_bind: cannot bind non-running thread"));

	ts = td->td_sched;

	ts->ts_flags |= TSF_BOUND;
#ifdef SMP
	ts->ts_runq = &runq_pcpu[cpu];
	if (PCPU_GET(cpuid) == cpu)
		return;

	ts->ts_state = TSS_THREAD;

	mi_switch(SW_VOL, NULL);
#endif
}

void
sched_unbind(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	td->td_sched->ts_flags &= ~TSF_BOUND;
}

int
sched_is_bound(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	return (td->td_sched->ts_flags & TSF_BOUND);
}

void
sched_relinquish(struct thread *td)
{
	mtx_lock_spin(&sched_lock);
	if (td->td_pri_class == PRI_TIMESHARE)
		sched_prio(td, PRI_MAX_TIMESHARE);
	mi_switch(SW_VOL, NULL);
	mtx_unlock_spin(&sched_lock);
}

int
sched_load(void)
{
	return (sched_tdcnt);
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	struct td_sched *ts;

	ts = td->td_sched;
	return (ts->ts_pctcpu);
}

void
sched_tick(void)
{
}

#define	KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"