/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#define	kse td_sched

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <machine/smp.h>

/*
 * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
 * the range 100-256 Hz (approximately).
 */
#define	ESTCPULIM(e) \
    min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
    RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
#ifdef SMP
#define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
#else
#define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
#endif
#define	NICE_WEIGHT		1	/* Priorities per nice level. */
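
/*
 * Worked example (assuming the stock values PRIO_MIN = -20, PRIO_MAX = 20
 * and RQ_PPQ = 4): on a UP kernel each INVERSE_ESTCPU_WEIGHT (8) points of
 * kg_estcpu cost one step of user priority in resetpriority(), and
 * ESTCPULIM() clamps kg_estcpu at 8 * (1 * 40 - 4) + 8 - 1 = 295, keeping
 * the computed priority within the timesharing range.
 */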

/*
 * The schedulable entity that can be given a context to run.  A process may
 * have several of these: there is probably one per processor, but possibly
 * a few more.  In this universe they are grouped with a KSEG that contains
 * the priority and niceness for the group.
 */
struct kse {
	TAILQ_ENTRY(kse) ke_kglist;	/* (*) Queue of KSEs in ke_ksegrp. */
	TAILQ_ENTRY(kse) ke_kgrlist;	/* (*) Queue of KSEs in this state. */
	TAILQ_ENTRY(kse) ke_procq;	/* (j/z) Run queue. */
	struct thread	*ke_thread;	/* (*) Active associated thread. */
	fixpt_t		ke_pctcpu;	/* (j) %cpu during p_swtime. */
	u_char		ke_oncpu;	/* (j) Which cpu we are on. */
	char		ke_rqindex;	/* (j) Run queue index. */
	enum {
		KES_THREAD = 0x0,	/* slaved to thread state */
		KES_ONRUNQ
	} ke_state;			/* (j) KSE status. */
	int		ke_cpticks;	/* (j) Ticks of cpu time. */
	struct runq	*ke_runq;	/* runq the kse is currently on */
};

#define	ke_proc		ke_thread->td_proc
#define	ke_ksegrp	ke_thread->td_ksegrp

#define	td_kse	td_sched

/* flags kept in td_flags */
#define	TDF_DIDRUN	TDF_SCHED0	/* KSE actually ran. */
#define	TDF_EXIT	TDF_SCHED1	/* KSE is being killed. */
#define	TDF_BOUND	TDF_SCHED2

#define	ke_flags	ke_thread->td_flags
#define	KEF_DIDRUN	TDF_DIDRUN	/* KSE actually ran. */
#define	KEF_EXIT	TDF_EXIT	/* KSE is being killed. */
#define	KEF_BOUND	TDF_BOUND	/* stuck to one CPU */

#define	SKE_RUNQ_PCPU(ke)						\
    ((ke)->ke_runq != 0 && (ke)->ke_runq != &runq)

struct kg_sched {
	struct thread	*skg_last_assigned; /* (j) Last thread assigned to */
					    /* the system scheduler. */
	int	skg_avail_opennings;	/* (j) Num unfilled slots in group. */
	int	skg_concurrency;	/* (j) Num KSEs requested in group. */
	int	skg_runq_kses;		/* (j) Num KSEs on runq. */
};
#define	kg_last_assigned	kg_sched->skg_last_assigned
#define	kg_avail_opennings	kg_sched->skg_avail_opennings
#define	kg_concurrency		kg_sched->skg_concurrency
#define	kg_runq_kses		kg_sched->skg_runq_kses

#define	SLOT_RELEASE(kg)						\
do {									\
	kg->kg_avail_opennings++;					\
	CTR3(KTR_RUNQ, "kg %p(%d) Slot released (->%d)",		\
	    kg,								\
	    kg->kg_concurrency,						\
	    kg->kg_avail_opennings);					\
/*	KASSERT((kg->kg_avail_opennings <= kg->kg_concurrency),	\
	    ("slots out of whack"));*/					\
} while (0)

#define	SLOT_USE(kg)							\
do {									\
	kg->kg_avail_opennings--;					\
	CTR3(KTR_RUNQ, "kg %p(%d) Slot used (->%d)",			\
	    kg,								\
	    kg->kg_concurrency,						\
	    kg->kg_avail_opennings);					\
/*	KASSERT((kg->kg_avail_opennings >= 0),				\
	    ("slots out of whack"));*/					\
} while (0)

/*
 * KSE_CAN_MIGRATE macro returns true if the kse can migrate between
 * cpus.
 */
#define	KSE_CAN_MIGRATE(ke)						\
    ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)

static struct kse kse0;
static struct kg_sched kg_sched0;

static int	sched_tdcnt;	/* Total runnable threads in the system. */
static int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
#define	SCHED_QUANTUM	(hz / 10)	/* Default sched quantum */

static struct callout roundrobin_callout;

static void	slot_fill(struct ksegrp *kg);
static struct kse *sched_choose(void);	/* XXX Should be thread * */

static void	setup_runqs(void);
static void	roundrobin(void *arg);
static void	schedcpu(void);
static void	schedcpu_thread(void);
static void	sched_setup(void *dummy);
static void	maybe_resched(struct thread *td);
static void	updatepri(struct ksegrp *kg);
static void	resetpriority(struct ksegrp *kg);
#ifdef SMP
static int	forward_wakeup(int cpunum);
#endif

static struct kproc_desc sched_kp = {
	"schedcpu",
	schedcpu_thread,
	NULL
};
SYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start, &sched_kp)
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

/*
 * Global run queue.
 */
static struct runq runq;

#ifdef SMP
/*
 * Per-CPU run queues
 */
static struct runq runq_pcpu[MAXCPU];
#endif

static void
setup_runqs(void)
{
#ifdef SMP
	int i;

	for (i = 0; i < MAXCPU; ++i)
		runq_init(&runq_pcpu[i]);
#endif

	runq_init(&runq);
}

static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val;

	new_val = sched_quantum * tick;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val < tick)
		return (EINVAL);
	sched_quantum = new_val / tick;
	hogticks = 2 * sched_quantum;
	return (0);
}

SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
    "Scheduler name");

SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
    0, sizeof sched_quantum, sysctl_kern_quantum, "I",
    "Roundrobin scheduling quantum in microseconds");

#ifdef SMP
/* Enable forwarding of wakeups to all other cpus */
SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP");

static int forward_wakeup_enabled = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
    &forward_wakeup_enabled, 0,
    "Forwarding of wakeup to idle CPUs");

static int forward_wakeups_requested = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
    &forward_wakeups_requested, 0,
    "Requests for Forwarding of wakeup to idle CPUs");

static int forward_wakeups_delivered = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
    &forward_wakeups_delivered, 0,
    "Completed Forwarding of wakeup to idle CPUs");

static int forward_wakeup_use_mask = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
    &forward_wakeup_use_mask, 0,
    "Use the mask of idle cpus");

static int forward_wakeup_use_loop = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
    &forward_wakeup_use_loop, 0,
    "Use a loop to find idle cpus");

static int forward_wakeup_use_single = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW,
    &forward_wakeup_use_single, 0,
    "Only signal one idle cpu");

static int forward_wakeup_use_htt = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW,
    &forward_wakeup_use_htt, 0,
    "account for htt");

#endif
static int sched_followon = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
    &sched_followon, 0,
    "allow threads to share a quantum");

static int sched_pfollowons = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, pfollowons, CTLFLAG_RD,
    &sched_pfollowons, 0,
    "number of followons done to a different ksegrp");

static int sched_kgfollowons = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, kgfollowons, CTLFLAG_RD,
    &sched_kgfollowons, 0,
    "number of followons done in a ksegrp");

/*
 * Arrange to reschedule if necessary, taking the priorities and
 * schedulers into account.
 */
static void
maybe_resched(struct thread *td)
{

	mtx_assert(&sched_lock, MA_OWNED);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Force a context switch among equal-priority processes every 100ms.
 * We don't actually need to force a context switch of the current process.
 * The act of firing the event triggers a context switch to softclock() and
 * then switching back out again, which is equivalent to a preemption, thus
 * no further work is needed on the local CPU.
 */
/* ARGSUSED */
static void
roundrobin(void *arg)
{

#ifdef SMP
	mtx_lock_spin(&sched_lock);
	forward_roundrobin();
	mtx_unlock_spin(&sched_lock);
#endif

	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
}
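
/*
 * Timing note: sched_quantum is kept in ticks, so the default SCHED_QUANTUM
 * of hz / 10 re-arms the callout above roughly every 100ms whatever hz is,
 * and hogticks (2 * sched_quantum) works out to about 200ms.
 */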

/*
 * Constants for digital decay and forget:
 *	90% of (kg_estcpu) usage in 5 * loadav time
 *	95% of (ke_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		kg_estcpu *= decay;
 * will compute
 * 	kg_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ (-2.30/5*loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).  QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2      3      4
 *      power:  5.68    10.32  14.94  19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
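
/*
 * Sanity check of the constants above (not used by the code): with a load
 * average of 2 the decay factor is 4 / (4 + 1) = 0.8, and 0.8 ** 10 ~= 0.107,
 * i.e. roughly 90% of kg_estcpu is indeed forgotten in 5 * loadavg = 10
 * seconds.  Likewise ccpu ** 60 = exp(-60/20) = exp(-3) ~= 0.05, which is
 * the "95% of ke_pctcpu in 60 seconds" figure quoted above.
 */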

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11

/*
 * Recompute process priorities, every hz ticks.
 * MP-safe, called without the Giant mutex.
 */
/* ARGSUSED */
static void
schedcpu(void)
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct thread *td;
	struct proc *p;
	struct kse *ke;
	struct ksegrp *kg;
	int awake, realstathz;

	realstathz = stathz ? stathz : hz;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		/*
		 * Prevent state changes and protect run queue.
		 */
		mtx_lock_spin(&sched_lock);
		/*
		 * Increment time in/out of memory.  We ignore overflow; with
		 * 16-bit int's (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		FOREACH_KSEGRP_IN_PROC(p, kg) {
			awake = 0;
			FOREACH_THREAD_IN_GROUP(kg, td) {
				ke = td->td_kse;
				/*
				 * Increment sleep time (if sleeping).  We
				 * ignore overflow, as above.
				 */
				/*
				 * The kse slptimes are not touched in wakeup
				 * because the thread may not HAVE a KSE.
				 */
				if (ke->ke_state == KES_ONRUNQ) {
					awake = 1;
					ke->ke_flags &= ~KEF_DIDRUN;
				} else if ((ke->ke_state == KES_THREAD) &&
				    (TD_IS_RUNNING(td))) {
					awake = 1;
					/* Do not clear KEF_DIDRUN */
				} else if (ke->ke_flags & KEF_DIDRUN) {
					awake = 1;
					ke->ke_flags &= ~KEF_DIDRUN;
				}

				/*
				 * ke_pctcpu is only for ps and ttyinfo().
				 * Do it per kse, and add them up at the end?
				 * XXXKSE
				 */
				ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >>
				    FSHIFT;
				/*
				 * If the kse has been idle the entire second,
				 * stop recalculating its priority until
				 * it wakes up.
				 */
				if (ke->ke_cpticks == 0)
					continue;
#if	(FSHIFT >= CCPU_SHIFT)
				ke->ke_pctcpu += (realstathz == 100)
				    ? ((fixpt_t) ke->ke_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) ke->ke_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
				ke->ke_pctcpu += ((FSCALE - ccpu) *
				    (ke->ke_cpticks *
				    FSCALE / realstathz)) >> FSHIFT;
#endif
				ke->ke_cpticks = 0;
			} /* end of kse loop */
			/*
			 * If there are ANY running threads in this KSEGRP,
			 * then don't count it as sleeping.
			 */
			if (awake) {
				if (kg->kg_slptime > 1) {
					/*
					 * In an ideal world, this should not
					 * happen, because whoever woke us
					 * up from the long sleep should have
					 * unwound the slptime and reset our
					 * priority before we run at the stale
					 * priority.  Should KASSERT at some
					 * point when all the cases are fixed.
					 */
					updatepri(kg);
				}
				kg->kg_slptime = 0;
			} else
				kg->kg_slptime++;
			if (kg->kg_slptime > 1)
				continue;
			kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu);
			resetpriority(kg);
			FOREACH_THREAD_IN_GROUP(kg, td) {
				if (td->td_priority >= PUSER) {
					sched_prio(td, kg->kg_user_pri);
				}
			}
		} /* end of ksegrp loop */
		mtx_unlock_spin(&sched_lock);
	} /* end of process loop */
	sx_sunlock(&allproc_lock);
}
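
/*
 * The ke_pctcpu update above is the recurrence
 *	pctcpu' = pctcpu * ccpu/FSCALE + (1 - ccpu/FSCALE) * cpticks/realstathz
 * (the FSHIFT >= CCPU_SHIFT branch is just a cheaper approximation of the
 * same thing).  For example, a thread that ran for the whole of the last
 * second has cpticks == realstathz, so its ke_pctcpu converges towards
 * FSCALE (100%); one that never ran decays by a factor of exp(-1/20) each
 * second, losing ~95% of its %cpu over 60 seconds.
 */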

/*
 * Main loop for a kthread that executes schedcpu once a second.
 */
static void
schedcpu_thread(void)
{
	int nowake;

	for (;;) {
		schedcpu();
		tsleep(&nowake, curthread->td_priority, "-", hz);
	}
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay kg_estcpu to zero.
 */
static void
updatepri(struct ksegrp *kg)
{
	register fixpt_t loadfac;
	register unsigned int newcpu;

	loadfac = loadfactor(averunnable.ldavg[0]);
	if (kg->kg_slptime > 5 * loadfac)
		kg->kg_estcpu = 0;
	else {
		newcpu = kg->kg_estcpu;
		kg->kg_slptime--;	/* was incremented in schedcpu() */
		while (newcpu && --kg->kg_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		kg->kg_estcpu = newcpu;
	}
	resetpriority(kg);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
resetpriority(struct ksegrp *kg)
{
	register unsigned int newpriority;
	struct thread *td;

	if (kg->kg_pri_class == PRI_TIMESHARE) {
		newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT +
		    NICE_WEIGHT * (kg->kg_proc->p_nice - PRIO_MIN);
		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
		    PRI_MAX_TIMESHARE);
		kg->kg_user_pri = newpriority;
	}
	FOREACH_THREAD_IN_GROUP(kg, td) {
		maybe_resched(td);	/* XXXKSE silly */
	}
}

/* ARGSUSED */
static void
sched_setup(void *dummy)
{
	setup_runqs();

	if (sched_quantum == 0)
		sched_quantum = SCHED_QUANTUM;
	hogticks = 2 * sched_quantum;

	callout_init(&roundrobin_callout, CALLOUT_MPSAFE);

	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);

	/* Account for thread0. */
	sched_tdcnt++;
}

/* External interfaces start here */
/*
 * Very early in the boot, some setup of scheduler-specific
 * parts of proc0 and of some scheduler resources needs to be done.
 * Called from:
 *  proc0_init()
 */
void
schedinit(void)
{
	/*
	 * Set up the scheduler specific parts of proc0.
	 */
	proc0.p_sched = NULL; /* XXX */
	ksegrp0.kg_sched = &kg_sched0;
	thread0.td_sched = &kse0;
	kse0.ke_thread = &thread0;
	kse0.ke_oncpu = NOCPU; /* wrong.. can we use PCPU(cpuid) yet? */
	kse0.ke_state = KES_THREAD;
	kg_sched0.skg_concurrency = 1;
	kg_sched0.skg_avail_opennings = 0; /* we are already running */
}

int
sched_runnable(void)
{
#ifdef SMP
	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
#else
	return runq_check(&runq);
#endif
}

int
sched_rr_interval(void)
{
	if (sched_quantum == 0)
		sched_quantum = SCHED_QUANTUM;
	return (sched_quantum);
}

/*
 * We adjust the priority of the current process.  The priority of
 * a process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (kg_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time kg_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached).  The cpu usage
 * estimator ramps up quite quickly when the process is running
 * (linearly), and decays away exponentially, at a rate which is
 * proportionally slower when the system is busy.  The basic principle
 * is that the system will 90% forget that the process used a lot of
 * CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin
 * among other processes.
 */
void
sched_clock(struct thread *td)
{
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kg = td->td_ksegrp;
	ke = td->td_kse;

	ke->ke_cpticks++;
	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1);
	if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
		resetpriority(kg);
		if (td->td_priority >= PUSER)
			td->td_priority = kg->kg_user_pri;
	}
}
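
/*
 * To put numbers on the comment above: with the usual stathz of 128 and
 * INVERSE_ESTCPU_WEIGHT of 8 (UP), a thread that stays on a CPU accumulates
 * 128 points of kg_estcpu per second, so resetpriority() degrades its user
 * priority by roughly 16 steps per second of CPU used,
 *	kg_user_pri = PUSER + kg_estcpu / INVERSE_ESTCPU_WEIGHT +
 *	    NICE_WEIGHT * (p_nice - PRIO_MIN)
 * clamped to PRI_MIN_TIMESHARE..PRI_MAX_TIMESHARE, until the ESTCPULIM cap
 * and the schedcpu() decay pull kg_estcpu back down.
 */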

/*
 * Charge the child's scheduling CPU usage to the parent.
 *
 * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp.
 * Charge it to the ksegrp that did the wait; since process estcpu is the
 * sum of all ksegrps, this is strictly as expected.  Assume that the child
 * process aggregated all the estcpu into the 'built-in' ksegrp.
 */
void
sched_exit(struct proc *p, struct thread *td)
{
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct thread *childtd)
{

	mtx_assert(&sched_lock, MA_OWNED);
	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + childtd->td_ksegrp->kg_estcpu);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
	if ((child->td_proc->p_flag & P_NOLOAD) == 0)
		sched_tdcnt--;
}

void
sched_fork(struct thread *td, struct thread *childtd)
{
	sched_fork_ksegrp(td, childtd->td_ksegrp);
	sched_fork_thread(td, childtd);
}

void
sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
{
	mtx_assert(&sched_lock, MA_OWNED);
	child->kg_estcpu = td->td_ksegrp->kg_estcpu;
}

void
sched_fork_thread(struct thread *td, struct thread *childtd)
{
	sched_newthread(childtd);
}

void
sched_nice(struct proc *p, int nice)
{
	struct ksegrp *kg;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	p->p_nice = nice;
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		resetpriority(kg);
	}
}

void
sched_class(struct ksegrp *kg, int class)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kg->kg_pri_class = class;
}

/*
 * Adjust the priority of a thread.
 * This may include moving the thread within the KSEGRP,
 * changing the assignment of a kse to the thread,
 * and moving a KSE in the system run queue.
 */
void
sched_prio(struct thread *td, u_char prio)
{

	mtx_assert(&sched_lock, MA_OWNED);
	if (TD_ON_RUNQ(td)) {
		adjustrunqueue(td, prio);
	} else {
		td->td_priority = prio;
	}
}

void
sched_sleep(struct thread *td)
{

	mtx_assert(&sched_lock, MA_OWNED);
	td->td_ksegrp->kg_slptime = 0;
	td->td_base_pri = td->td_priority;
}

static void remrunqueue(struct thread *td);

void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct kse *ke;
	struct ksegrp *kg;
	struct proc *p;

	ke = td->td_kse;
	p = td->td_proc;

	mtx_assert(&sched_lock, MA_OWNED);

	if ((p->p_flag & P_NOLOAD) == 0)
		sched_tdcnt--;
	/*
	 * We are volunteering to switch out so we get to nominate
	 * a successor for the rest of our quantum.
	 * First try another thread in our ksegrp, and then look for
	 * other ksegrps in our process.
	 */
	if (sched_followon &&
	    (p->p_flag & P_HADTHREADS) &&
	    (flags & SW_VOL) &&
	    newtd == NULL) {
		/* Let's schedule another thread from this process. */
		kg = td->td_ksegrp;
		if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
			remrunqueue(newtd);
			sched_kgfollowons++;
		} else {
			FOREACH_KSEGRP_IN_PROC(p, kg) {
				if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
					sched_pfollowons++;
					remrunqueue(newtd);
					break;
				}
			}
		}
	}

	if (newtd)
		newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED);

	td->td_lastcpu = td->td_oncpu;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_pflags &= ~TDP_OWEPREEMPT;
	td->td_oncpu = NOCPU;
	/*
	 * At the last moment, if this thread is still marked RUNNING,
	 * then put it back on the run queue as it has not been suspended
	 * or stopped or anything else similar.  We never put the idle
	 * threads on the run queue, however.
	 */
	if (td == PCPU_GET(idlethread))
		TD_SET_CAN_RUN(td);
	else {
		SLOT_RELEASE(td->td_ksegrp);
		if (TD_IS_RUNNING(td)) {
			/* Put us back on the run queue (kse and all). */
			setrunqueue(td, (flags & SW_PREEMPT) ?
			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
			    SRQ_OURSELF|SRQ_YIELDING);
		} else if (p->p_flag & P_HADTHREADS) {
			/*
			 * We will not be on the run queue, so we must be
			 * sleeping or similar.  As it's available,
			 * someone else can use the KSE if they need it.
			 * It's NOT available if we are about to need it.
			 */
			if (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp)
				slot_fill(td->td_ksegrp);
		}
	}
	if (newtd) {
		/*
		 * The thread we are about to run needs to be counted
		 * as if it had been added to the run queue and selected.
		 * It came from:
		 *  * A preemption
		 *  * An upcall
		 *  * A followon
		 */
		KASSERT((newtd->td_inhibitors == 0),
		    ("trying to run inhibited thread"));
		SLOT_USE(newtd->td_ksegrp);
		newtd->td_kse->ke_flags |= KEF_DIDRUN;
		TD_SET_RUNNING(newtd);
		if ((newtd->td_proc->p_flag & P_NOLOAD) == 0)
			sched_tdcnt++;
	} else {
		newtd = choosethread();
	}

	if (td != newtd)
		cpu_switch(td, newtd);
	sched_lock.mtx_lock = (uintptr_t)td;
	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_wakeup(struct thread *td)
{
	struct ksegrp *kg;

	mtx_assert(&sched_lock, MA_OWNED);
	kg = td->td_ksegrp;
	if (kg->kg_slptime > 1)
		updatepri(kg);
	kg->kg_slptime = 0;
	setrunqueue(td, SRQ_BORING);
}

#ifdef SMP
/* Enable HTT_2 if you have a 2-way HTT cpu. */
static int
forward_wakeup(int cpunum)
{
	cpumask_t map, me, dontuse;
	cpumask_t map2;
	struct pcpu *pc;
	cpumask_t id, map3;

	mtx_assert(&sched_lock, MA_OWNED);

	CTR0(KTR_RUNQ, "forward_wakeup()");

	if ((!forward_wakeup_enabled) ||
	    (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
		return (0);
	if (!smp_started || cold || panicstr)
		return (0);

	forward_wakeups_requested++;

	/*
	 * Check the idle mask we received against what we calculated
	 * before in the old version.
	 */
	me = PCPU_GET(cpumask);
	/*
	 * Don't bother if we should be doing it ourselves.
	 */
	if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum)))
		return (0);

	dontuse = me | stopped_cpus | hlt_cpus_mask;
	map3 = 0;
	if (forward_wakeup_use_loop) {
		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
			id = pc->pc_cpumask;
			if ((id & dontuse) == 0 &&
			    pc->pc_curthread == pc->pc_idlethread) {
				map3 |= id;
			}
		}
	}

	if (forward_wakeup_use_mask) {
		map = 0;
		map = idle_cpus_mask & ~dontuse;

		/* If they are both on, compare and use loop if different. */
		if (forward_wakeup_use_loop) {
			if (map != map3) {
				printf("map (%02X) != map3 (%02X)\n",
				    map, map3);
				map = map3;
			}
		}
	} else {
		map = map3;
	}
	/* If we only allow a specific CPU, then mask off all the others. */
	if (cpunum != NOCPU) {
		KASSERT((cpunum <= mp_maxcpus),
		    ("forward_wakeup: bad cpunum."));
		map &= (1 << cpunum);
	} else {
		/* Try to choose an idle die. */
		if (forward_wakeup_use_htt) {
			map2 = (map & (map >> 1)) & 0x5555;
			if (map2) {
				map = map2;
			}
		}

		/* Set only one bit. */
		if (forward_wakeup_use_single) {
			map = map & ((~map) + 1);
		}
	}
	if (map) {
		forward_wakeups_delivered++;
		ipi_selected(map, IPI_AST);
		return (1);
	}
	if (cpunum == NOCPU)
		printf("forward_wakeup: Idle processor not found\n");
	return (0);
}
#endif
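
/*
 * Notes on the masks used above: (map & (map >> 1)) & 0x5555 keeps only the
 * even bit of each HTT sibling pair when both siblings are idle (e.g. a map
 * of 0x0c, CPUs 2 and 3 idle, yields 0x04), so with the htt2 option the IPI
 * is steered towards a package whose logical CPUs are all idle.
 * map & (~map + 1) is the usual two's-complement trick that isolates the
 * lowest set bit, i.e. picks a single idle CPU.
 */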

void
sched_add(struct thread *td, int flags)
{
	struct kse *ke;
#ifdef SMP
	int forwarded = 0;
	int cpu;
#endif

	ke = td->td_kse;
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));

#ifdef SMP
	if (KSE_CAN_MIGRATE(ke)) {
		CTR2(KTR_RUNQ,
		    "sched_add: adding kse:%p (td:%p) to gbl runq", ke, td);
		cpu = NOCPU;
		ke->ke_runq = &runq;
	} else {
		if (!SKE_RUNQ_PCPU(ke))
			ke->ke_runq = &runq_pcpu[(cpu = PCPU_GET(cpuid))];
		else
			cpu = td->td_lastcpu;
		CTR3(KTR_RUNQ,
		    "sched_add: Put kse:%p(td:%p) on cpu%d runq", ke, td, cpu);
	}
#else
	CTR2(KTR_RUNQ, "sched_add: adding kse:%p (td:%p) to runq", ke, td);
	ke->ke_runq = &runq;

#endif
	/*
	 * If we are yielding (on the way out anyhow)
	 * or the thread being saved is US,
	 * then don't try to be smart about preemption
	 * or kicking off another CPU
	 * as it won't help and may hinder.
	 * In the YIELDING case, we are about to run whoever is
	 * being put in the queue anyhow, and in the
	 * OURSELF case, we are putting ourselves on the run queue
	 * which also only happens when we are about to yield.
	 */
	if ((flags & SRQ_YIELDING) == 0) {
#ifdef SMP
		cpumask_t me = PCPU_GET(cpumask);
		int idle = idle_cpus_mask & me;
		/*
		 * Only try to kick off another CPU if
		 * the thread is unpinned
		 * or pinned to another cpu,
		 * and there are other available and idle CPUs.
		 * If we are idle, or it's an interrupt,
		 * then skip straight to preemption.
		 */
		if ((!idle) && ((flags & SRQ_INTR) == 0) &&
		    (idle_cpus_mask & ~(hlt_cpus_mask | me)) &&
		    (KSE_CAN_MIGRATE(ke) ||
		    ke->ke_runq != &runq_pcpu[PCPU_GET(cpuid)])) {
			forwarded = forward_wakeup(cpu);
		}
		/*
		 * If we failed to kick off another cpu, then look to
		 * see if we should preempt this CPU.  Only allow this
		 * if it is not pinned or IS pinned to this CPU.
		 * If we are the idle thread, we also try to preempt,
		 * as it will be quicker and, being idle, we won't
		 * lose by doing so.
		 */
		if ((!forwarded) &&
		    (ke->ke_runq == &runq ||
		    ke->ke_runq == &runq_pcpu[PCPU_GET(cpuid)]))
#endif

		{
			if (maybe_preempt(td))
				return;
		}
	}
	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_tdcnt++;
	SLOT_USE(td->td_ksegrp);
	runq_add(ke->ke_runq, ke, flags);
	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;
	maybe_resched(td);
}

void
sched_rem(struct thread *td)
{
	struct kse *ke;

	ke = td->td_kse;
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_rem: process swapped out"));
	KASSERT((ke->ke_state == KES_ONRUNQ),
	    ("sched_rem: KSE not on run queue"));
	mtx_assert(&sched_lock, MA_OWNED);

	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_tdcnt--;
	SLOT_RELEASE(td->td_ksegrp);
	runq_remove(ke->ke_runq, ke);

	ke->ke_state = KES_THREAD;
	td->td_ksegrp->kg_runq_kses--;
}

/*
 * Select threads to run.
 * Notice that the running threads still consume a slot.
 */
struct kse *
sched_choose(void)
{
	struct kse *ke;
	struct runq *rq;

#ifdef SMP
	struct kse *kecpu;

	rq = &runq;
	ke = runq_choose(&runq);
	kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);

	if (ke == NULL ||
	    (kecpu != NULL &&
	    kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) {
		CTR2(KTR_RUNQ, "choosing kse %p from pcpu runq %d", kecpu,
		    PCPU_GET(cpuid));
		ke = kecpu;
		rq = &runq_pcpu[PCPU_GET(cpuid)];
	} else {
		CTR1(KTR_RUNQ, "choosing kse %p from main runq", ke);
	}

#else
	rq = &runq;
	ke = runq_choose(&runq);
#endif

	if (ke != NULL) {
		runq_remove(rq, ke);
		ke->ke_state = KES_THREAD;
		ke->ke_ksegrp->kg_runq_kses--;

		KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
		    ("sched_choose: process swapped out"));
	}
	return (ke);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;
	/*
	 * XXX we cheat slightly on the locking here to avoid locking in
	 * the usual case.  Setting td_priority here is essentially an
	 * incomplete workaround for not setting it properly elsewhere.
	 * Now that some interrupt handlers are threads, not setting it
	 * properly elsewhere can clobber it in the window between setting
	 * it here and returning to user mode, so don't waste time setting
	 * it perfectly here.
	 */
	kg = td->td_ksegrp;
	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

void
sched_bind(struct thread *td, int cpu)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT(TD_IS_RUNNING(td),
	    ("sched_bind: cannot bind non-running thread"));

	ke = td->td_kse;

	ke->ke_flags |= KEF_BOUND;
#ifdef SMP
	ke->ke_runq = &runq_pcpu[cpu];
	if (PCPU_GET(cpuid) == cpu)
		return;

	ke->ke_state = KES_THREAD;

	mi_switch(SW_VOL, NULL);
#endif
}

void
sched_unbind(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	td->td_kse->ke_flags &= ~KEF_BOUND;
}

int
sched_load(void)
{
	return (sched_tdcnt);
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct kse));
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	struct kse *ke;

	ke = td->td_kse;
	return (ke->ke_pctcpu);
}

#define	KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"