/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sx.h>

#define	KTR_4BSD	0x0

/*
 * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
 * the range 100-256 Hz (approximately).
 */
#define	ESTCPULIM(e) \
    min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
    RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
#ifdef SMP
#define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
#else
#define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
#endif
#define	NICE_WEIGHT		1	/* Priorities per nice level. */

struct ke_sched {
	int		ske_cpticks;	/* (j) Ticks of cpu time. */
	struct runq	*ske_runq;	/* runq the kse is currently on */
};
#define	ke_runq		ke_sched->ske_runq
#define	KEF_BOUND	KEF_SCHED1

#define	SKE_RUNQ_PCPU(ke)						\
    ((ke)->ke_runq != 0 && (ke)->ke_runq != &runq)

/*
 * KSE_CAN_MIGRATE macro returns true if the kse can migrate between
 * cpus.
 */
#define	KSE_CAN_MIGRATE(ke)						\
    ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
static struct ke_sched ke_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = NULL;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = NULL;

static int	sched_tdcnt;	/* Total runnable threads in the system. */
static int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
#define	SCHED_QUANTUM	(hz / 10)	/* Default sched quantum */

static struct callout roundrobin_callout;

static void	setup_runqs(void);
static void	roundrobin(void *arg);
static void	schedcpu(void);
static void	schedcpu_thread(void);
static void	sched_setup(void *dummy);
static void	maybe_resched(struct thread *td);
static void	updatepri(struct ksegrp *kg);
static void	resetpriority(struct ksegrp *kg);

static struct kproc_desc sched_kp = {
	"schedcpu",
	schedcpu_thread,
	NULL
};
SYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start, &sched_kp)
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

/*
 * Global run queue.
 */
static struct runq runq;

#ifdef SMP
/*
 * Per-CPU run queues
 */
static struct runq runq_pcpu[MAXCPU];
#endif

static void
setup_runqs(void)
{
#ifdef SMP
	int i;

	for (i = 0; i < MAXCPU; ++i)
		runq_init(&runq_pcpu[i]);
#endif

	runq_init(&runq);
}

static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val;

	new_val = sched_quantum * tick;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val < tick)
		return (EINVAL);
	sched_quantum = new_val / tick;
	hogticks = 2 * sched_quantum;
	return (0);
}

SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
    "Scheduler name");

SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
    0, sizeof sched_quantum, sysctl_kern_quantum, "I",
    "Roundrobin scheduling quantum in microseconds");

/*
 * Arrange to reschedule if necessary, taking the priorities and
 * schedulers into account.
 */
static void
maybe_resched(struct thread *td)
{

	mtx_assert(&sched_lock, MA_OWNED);
	if (td->td_priority < curthread->td_priority && curthread->td_kse)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Force switch among equal priority processes every 100ms.
 * We don't actually need to force a context switch of the current process.
 * The act of firing the event triggers a context switch to softclock() and
 * then switching back out again which is equivalent to a preemption, thus
 * no further work is needed on the local CPU.
 */
/* ARGSUSED */
static void
roundrobin(void *arg)
{

#ifdef SMP
	mtx_lock_spin(&sched_lock);
	forward_roundrobin();
	mtx_unlock_spin(&sched_lock);
#endif

	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
}
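
/*
 * Illustrative note (added by the editor, not part of the original file):
 * with hz = 1000, SCHED_QUANTUM is hz / 10 = 100 ticks, i.e. a 100 ms
 * round-robin quantum, and hogticks is set to twice that.  The
 * kern.sched.quantum sysctl above is expressed in microseconds;
 * sysctl_kern_quantum() converts with 'tick' (assumed here to be the
 * usual microseconds-per-hz-tick value), so writing 50000 to it with
 * hz = 1000 would give a 50 tick (50 ms) quantum.
 */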

/*
 * Constants for digital decay and forget:
 *	90% of (kg_estcpu) usage in 5 * loadav time
 *	95% of (ke_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		kg_estcpu *= decay;
 * will compute
 * 	kg_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ (-2.30/5*loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
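
/*
 * Worked example (added by the editor for illustration, not part of the
 * original sources): with a load average of 1, loadfactor() gives 2 (in
 * FSCALE fixed point), so decay_cpu() multiplies kg_estcpu by 2/3 each
 * second.  Since (2/3)**5.68 ~= 0.1, roughly 90% of kg_estcpu is
 * forgotten after about 5 * loadav seconds, matching the power table
 * above.  Likewise, because ccpu = exp(-1/20), ke_pctcpu retains only
 * exp(-3) ~= 5% of its value after 60 seconds, which is the
 * "95% in 60 seconds" figure quoted above.
 */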

/*
 * Recompute process priorities, every hz ticks.
 * MP-safe, called without the Giant mutex.
 */
/* ARGSUSED */
static void
schedcpu(void)
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct thread *td;
	struct proc *p;
	struct kse *ke;
	struct ksegrp *kg;
	int awake, realstathz;

	realstathz = stathz ? stathz : hz;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		/*
		 * Prevent state changes and protect run queue.
		 */
		mtx_lock_spin(&sched_lock);
		/*
		 * Increment time in/out of memory.  We ignore overflow; with
		 * 16-bit int's (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		FOREACH_KSEGRP_IN_PROC(p, kg) {
			awake = 0;
			FOREACH_KSE_IN_GROUP(kg, ke) {
				/*
				 * Increment sleep time (if sleeping).  We
				 * ignore overflow, as above.
				 */
				/*
				 * The kse slptimes are not touched in wakeup
				 * because the thread may not HAVE a KSE.
				 */
				if (ke->ke_state == KES_ONRUNQ) {
					awake = 1;
					ke->ke_flags &= ~KEF_DIDRUN;
				} else if ((ke->ke_state == KES_THREAD) &&
				    (TD_IS_RUNNING(ke->ke_thread))) {
					awake = 1;
					/* Do not clear KEF_DIDRUN */
				} else if (ke->ke_flags & KEF_DIDRUN) {
					awake = 1;
					ke->ke_flags &= ~KEF_DIDRUN;
				}

				/*
				 * ke_pctcpu is only for ps and ttyinfo().
				 * Do it per kse, and add them up at the end?
				 * XXXKSE
				 */
				ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >>
				    FSHIFT;
				/*
				 * If the kse has been idle the entire second,
				 * stop recalculating its priority until
				 * it wakes up.
				 */
				if (ke->ke_sched->ske_cpticks == 0)
					continue;
#if	(FSHIFT >= CCPU_SHIFT)
				ke->ke_pctcpu += (realstathz == 100)
				    ? ((fixpt_t) ke->ke_sched->ske_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) ke->ke_sched->ske_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
				ke->ke_pctcpu += ((FSCALE - ccpu) *
				    (ke->ke_sched->ske_cpticks *
				    FSCALE / realstathz)) >> FSHIFT;
#endif
				ke->ke_sched->ske_cpticks = 0;
			} /* end of kse loop */
			/*
			 * If there are ANY running threads in this KSEGRP,
			 * then don't count it as sleeping.
			 */
			if (awake) {
				if (kg->kg_slptime > 1) {
					/*
					 * In an ideal world, this should not
					 * happen, because whoever woke us
					 * up from the long sleep should have
					 * unwound the slptime and reset our
					 * priority before we run at the stale
					 * priority.  Should KASSERT at some
					 * point when all the cases are fixed.
					 */
					updatepri(kg);
				}
				kg->kg_slptime = 0;
			} else
				kg->kg_slptime++;
			if (kg->kg_slptime > 1)
				continue;
			kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu);
			resetpriority(kg);
			FOREACH_THREAD_IN_GROUP(kg, td) {
				if (td->td_priority >= PUSER) {
					sched_prio(td, kg->kg_user_pri);
				}
			}
		} /* end of ksegrp loop */
		mtx_unlock_spin(&sched_lock);
	} /* end of process loop */
	sx_sunlock(&allproc_lock);
}

/*
 * Main loop for a kthread that executes schedcpu once a second.
 */
static void
schedcpu_thread(void)
{
	int nowake;

	for (;;) {
		schedcpu();
		tsleep(&nowake, curthread->td_priority, "-", hz);
	}
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay kg_estcpu to zero.
 */
static void
updatepri(struct ksegrp *kg)
{
	register fixpt_t loadfac;
	register unsigned int newcpu;

	loadfac = loadfactor(averunnable.ldavg[0]);
	if (kg->kg_slptime > 5 * loadfac)
		kg->kg_estcpu = 0;
	else {
		newcpu = kg->kg_estcpu;
		kg->kg_slptime--;	/* was incremented in schedcpu() */
		while (newcpu && --kg->kg_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		kg->kg_estcpu = newcpu;
	}
	resetpriority(kg);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
resetpriority(struct ksegrp *kg)
{
	register unsigned int newpriority;
	struct thread *td;

	if (kg->kg_pri_class == PRI_TIMESHARE) {
		newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT +
		    NICE_WEIGHT * (kg->kg_proc->p_nice - PRIO_MIN);
		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
		    PRI_MAX_TIMESHARE);
		kg->kg_user_pri = newpriority;
	}
	FOREACH_THREAD_IN_GROUP(kg, td) {
		maybe_resched(td);			/* XXXKSE silly */
	}
}

/* ARGSUSED */
static void
sched_setup(void *dummy)
{
	setup_runqs();

	if (sched_quantum == 0)
		sched_quantum = SCHED_QUANTUM;
	hogticks = 2 * sched_quantum;

	callout_init(&roundrobin_callout, CALLOUT_MPSAFE);

	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);

	/* Account for thread0. */
	sched_tdcnt++;
}

/* External interfaces start here */
int
sched_runnable(void)
{
#ifdef SMP
	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
#else
	return runq_check(&runq);
#endif
}

int
sched_rr_interval(void)
{
	if (sched_quantum == 0)
		sched_quantum = SCHED_QUANTUM;
	return (sched_quantum);
}

/*
 * We adjust the priority of the current process.  The priority of
 * a process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (kg_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time kg_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT
 * (until MAXPRI is reached).  The cpu usage estimator ramps up
 * quite quickly when the process is running (linearly), and decays
 * away exponentially, at a rate which is proportionally slower when
 * the system is busy.  The basic principle is that the system will
 * 90% forget that the process used a lot of CPU time in 5 * loadav
 * seconds.  This causes the system to favor processes which haven't
 * run much recently, and to round-robin among other processes.
 */
void
sched_clock(struct thread *td)
{
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kg = td->td_ksegrp;
	ke = td->td_kse;

	ke->ke_sched->ske_cpticks++;
	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1);
	if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
		resetpriority(kg);
		if (td->td_priority >= PUSER)
			td->td_priority = kg->kg_user_pri;
	}
}
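
/*
 * Illustrative sketch (added by the editor, not part of the original
 * file): on a UP kernel INVERSE_ESTCPU_WEIGHT is 8 and NICE_WEIGHT is 1,
 * so resetpriority() computes roughly
 *
 *	kg_user_pri = PUSER + kg_estcpu / 8 + (p_nice - PRIO_MIN)
 *
 * clamped to [PRI_MIN_TIMESHARE, PRI_MAX_TIMESHARE].  Assuming the usual
 * PRIO_MIN of -20, a nice value of 0 contributes a constant 20, and every
 * 8 statclock ticks accumulated in kg_estcpu by sched_clock() above make
 * the user priority one step worse (numerically larger), until decay or
 * ESTCPULIM bounds it.
 */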

/*
 * Charge the child's scheduling CPU usage to the parent.
 *
 * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp.
 * Charge it to the ksegrp that did the wait since process estcpu is sum of
 * all ksegrps, this is strictly as expected.  Assume that the child process
 * aggregated all the estcpu into the 'built-in' ksegrp.
 */
void
sched_exit(struct proc *p, struct thread *td)
{
	sched_exit_kse(FIRST_KSE_IN_PROC(p), td);
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
}

void
sched_exit_kse(struct kse *ke, struct thread *child)
{
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct thread *childtd)
{

	mtx_assert(&sched_lock, MA_OWNED);
	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + childtd->td_ksegrp->kg_estcpu);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
	if ((child->td_proc->p_flag & P_NOLOAD) == 0)
		sched_tdcnt--;
}

void
sched_fork(struct thread *td, struct proc *p1)
{
	sched_fork_kse(td, FIRST_KSE_IN_PROC(p1));
	sched_fork_ksegrp(td, FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_thread(td, FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct thread *td, struct kse *child)
{
	child->ke_sched->ske_cpticks = 0;
}

void
sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
{
	mtx_assert(&sched_lock, MA_OWNED);
	child->kg_estcpu = td->td_ksegrp->kg_estcpu;
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

void
sched_nice(struct proc *p, int nice)
{
	struct ksegrp *kg;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	p->p_nice = nice;
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		resetpriority(kg);
	}
}

void
sched_class(struct ksegrp *kg, int class)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kg->kg_pri_class = class;
}

/*
 * Adjust the priority of a thread.
 * This may include moving the thread within the KSEGRP,
 * changing the assignment of a kse to the thread,
 * and moving a KSE in the system run queue.
 */
void
sched_prio(struct thread *td, u_char prio)
{

	mtx_assert(&sched_lock, MA_OWNED);
	if (TD_ON_RUNQ(td)) {
		adjustrunqueue(td, prio);
	} else {
		td->td_priority = prio;
	}
}

void
sched_sleep(struct thread *td)
{

	mtx_assert(&sched_lock, MA_OWNED);
	td->td_ksegrp->kg_slptime = 0;
	td->td_base_pri = td->td_priority;
}

void
sched_switch(struct thread *td, struct thread *newtd)
{
	struct kse *ke;
	struct proc *p;

	ke = td->td_kse;
	p = td->td_proc;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_state == KES_THREAD), ("sched_switch: kse state?"));

	if ((p->p_flag & P_NOLOAD) == 0)
		sched_tdcnt--;
	if (newtd != NULL && (newtd->td_proc->p_flag & P_NOLOAD) == 0)
		sched_tdcnt++;
	td->td_lastcpu = td->td_oncpu;
	td->td_last_kse = ke;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_pflags &= ~TDP_OWEPREEMPT;
	td->td_oncpu = NOCPU;
	/*
	 * At the last moment, if this thread is still marked RUNNING,
	 * then put it back on the run queue as it has not been suspended
	 * or stopped or any thing else similar.  We never put the idle
	 * threads on the run queue, however.
	 */
	if (td == PCPU_GET(idlethread))
		TD_SET_CAN_RUN(td);
	else if (TD_IS_RUNNING(td)) {
		/* Put us back on the run queue (kse and all). */
		setrunqueue(td);
	} else if (p->p_flag & P_SA) {
		/*
		 * We will not be on the run queue.  So we must be
		 * sleeping or similar.  As it's available,
		 * someone else can use the KSE if they need it.
		 */
		kse_reassign(ke);
	}
	if (newtd == NULL)
		newtd = choosethread();
	if (td != newtd)
		cpu_switch(td, newtd);
	sched_lock.mtx_lock = (uintptr_t)td;
	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_wakeup(struct thread *td)
{
	struct ksegrp *kg;

	mtx_assert(&sched_lock, MA_OWNED);
	kg = td->td_ksegrp;
	if (kg->kg_slptime > 1)
		updatepri(kg);
	kg->kg_slptime = 0;
	setrunqueue(td);
}

void
sched_add(struct thread *td)
{
	struct kse *ke;

	ke = td->td_kse;
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));

#ifdef SMP
	/*
	 * Only try to preempt if the thread is unpinned or pinned to the
	 * current CPU.
	 */
	if (KSE_CAN_MIGRATE(ke) || ke->ke_runq == &runq_pcpu[PCPU_GET(cpuid)])
#endif
	if (maybe_preempt(td))
		return;
	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

#ifdef SMP
	if (KSE_CAN_MIGRATE(ke)) {
		CTR1(KTR_4BSD, "adding kse:%p to gbl runq", ke);
		ke->ke_runq = &runq;
	} else {
		CTR1(KTR_4BSD, "adding kse:%p to pcpu runq", ke);
		if (!SKE_RUNQ_PCPU(ke))
			ke->ke_runq = &runq_pcpu[PCPU_GET(cpuid)];
	}
#else
	ke->ke_runq = &runq;
#endif
	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_tdcnt++;
	runq_add(ke->ke_runq, ke);
	maybe_resched(td);
}

void
sched_rem(struct thread *td)
{
	struct kse *ke;

	ke = td->td_kse;
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_rem: process swapped out"));
	KASSERT((ke->ke_state == KES_ONRUNQ),
	    ("sched_rem: KSE not on run queue"));
	mtx_assert(&sched_lock, MA_OWNED);

	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_tdcnt--;
	runq_remove(ke->ke_sched->ske_runq, ke);

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
}

struct kse *
sched_choose(void)
{
	struct kse *ke;
	struct runq *rq;

#ifdef SMP
	struct kse *kecpu;

	rq = &runq;
	ke = runq_choose(&runq);
	kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);

	if (ke == NULL ||
	    (kecpu != NULL &&
	    kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) {
		CTR2(KTR_4BSD, "choosing kse %p from pcpu runq %d", kecpu,
		    PCPU_GET(cpuid));
		ke = kecpu;
		rq = &runq_pcpu[PCPU_GET(cpuid)];
	} else {
		CTR1(KTR_4BSD, "choosing kse %p from main runq", ke);
	}

#else
	rq = &runq;
	ke = runq_choose(&runq);
#endif

	if (ke != NULL) {
		runq_remove(rq, ke);
		ke->ke_state = KES_THREAD;

		KASSERT((ke->ke_thread != NULL),
		    ("sched_choose: No thread on KSE"));
		KASSERT((ke->ke_thread->td_kse != NULL),
		    ("sched_choose: No KSE on thread"));
		KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
		    ("sched_choose: process swapped out"));
	}
	return (ke);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;
	/*
	 * XXX we cheat slightly on the locking here to avoid locking in
	 * the usual case.  Setting td_priority here is essentially an
	 * incomplete workaround for not setting it properly elsewhere.
	 * Now that some interrupt handlers are threads, not setting it
	 * properly elsewhere can clobber it in the window between setting
	 * it here and returning to user mode, so don't waste time setting
	 * it perfectly here.
	 */
	kg = td->td_ksegrp;
	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

void
sched_bind(struct thread *td, int cpu)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT(TD_IS_RUNNING(td),
	    ("sched_bind: cannot bind non-running thread"));

	ke = td->td_kse;

	ke->ke_flags |= KEF_BOUND;
#ifdef SMP
	ke->ke_runq = &runq_pcpu[cpu];
	if (PCPU_GET(cpuid) == cpu)
		return;

	ke->ke_state = KES_THREAD;

	mi_switch(SW_VOL, NULL);
#endif
}

void
sched_unbind(struct thread* td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	td->td_kse->ke_flags &= ~KEF_BOUND;
}

int
sched_load(void)
{
	return (sched_tdcnt);
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}
int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp));
}
int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}
int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread));
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	struct kse *ke;

	ke = td->td_kse;
	if (ke == NULL)
		ke = td->td_last_kse;
	if (ke)
		return (ke->ke_pctcpu);

	return (0);
}