/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *      The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)kern_synch.c        8.9 (Berkeley) 5/19/95
 * $FreeBSD$
 */

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)

int     hogticks;
int     lbolt;

static struct callout loadav_callout;

struct loadavg averunnable =
        { {0, 0, 0}, FSCALE };  /* load average, of runnable procs */
/*
 * Constants for averages over 1, 5, and 15 minutes
 * when sampling at 5 second intervals.
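 * Each constant is exp(-t/T) scaled by FSCALE, where t is the 5 second
 * sampling interval and T is the averaging period in seconds (60, 300,
 * and 900); e.g. exp(-5/60) == exp(-1/12) for the 1 minute average.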
 */
static fixpt_t cexp[3] = {
        0.9200444146293232 * FSCALE,    /* exp(-1/12) */
        0.9834714538216174 * FSCALE,    /* exp(-1/60) */
        0.9944598480048967 * FSCALE,    /* exp(-1/180) */
};

static void     endtsleep(void *);
static void     loadav(void *arg);

/*
 * We're only looking at 7 bits of the address; everything is
 * aligned to 4, lots of things are aligned to greater powers
 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 */
#define TABLESIZE       128
static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE];
#define LOOKUP(x)       (((intptr_t)(x) >> 8) & (TABLESIZE - 1))

void
sleepinit(void)
{
        int i;

        hogticks = (hz / 10) * 2;       /* Default only. */
        for (i = 0; i < TABLESIZE; i++)
                TAILQ_INIT(&slpque[i]);
}

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping; otherwise signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 *
 * The mutex argument is exited before the caller is suspended, and
 * entered before msleep returns.  If priority includes the PDROP
 * flag the mutex is not entered before returning.
 */

int
msleep(ident, mtx, priority, wmesg, timo)
        void *ident;
        struct mtx *mtx;
        int priority, timo;
        const char *wmesg;
{
        struct thread *td = curthread;
        struct proc *p = td->td_proc;
        int sig, catch = priority & PCATCH;
        int rval = 0;
        WITNESS_SAVE_DECL(mtx);

#ifdef KTRACE
        if (KTRPOINT(td, KTR_CSW))
                ktrcsw(1, 0);
#endif
        WITNESS_SLEEP(0, &mtx->mtx_object);
        KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL,
            ("sleeping without a mutex"));
        /*
         * If we are capable of async syscalls and there isn't already
         * another one ready to return, start a new thread
         * and queue it as ready to run.  Note that there is danger here
         * because we need to make sure that we don't sleep allocating
         * the thread (recursion here might be bad).
         * Hence the TDF_INMSLEEP flag.
         */
        if (p->p_flag & P_KSES) {
                /*
                 * Just don't bother if we are exiting
                 * and not the exiting thread.
                 */
                if ((p->p_flag & P_WEXIT) && catch && (p->p_singlethread != td))
                        return (EINTR);
                mtx_lock_spin(&sched_lock);
                if ((td->td_flags & (TDF_UNBOUND|TDF_INMSLEEP)) ==
                    TDF_UNBOUND) {
                        /*
                         * Arrange for an upcall to be readied.
                         * It will not actually happen until all
                         * pending in-kernel work for this KSEGRP
                         * has been done.
                         */
                        /* Don't recurse here! */
                        td->td_flags |= TDF_INMSLEEP;
                        thread_schedule_upcall(td, td->td_kse);
                        td->td_flags &= ~TDF_INMSLEEP;
                }
        } else {
                mtx_lock_spin(&sched_lock);
        }
        if (cold) {
                /*
                 * During autoconfiguration, just give interrupts
                 * a chance, then just return.
                 * Don't run any other procs or panic below,
                 * in case this is the idle process and already asleep.
                 */
                if (mtx != NULL && priority & PDROP)
                        mtx_unlock(mtx);
                mtx_unlock_spin(&sched_lock);
                return (0);
        }

        DROP_GIANT();

        if (mtx != NULL) {
                mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
                WITNESS_SAVE(&mtx->mtx_object, mtx);
                mtx_unlock(mtx);
                if (priority & PDROP)
                        mtx = NULL;
        }

        KASSERT(p != NULL, ("msleep1"));
        KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));

        CTR5(KTR_PROC, "msleep: thread %p (pid %d, %s) on %s (%p)",
            td, p->p_pid, p->p_comm, wmesg, ident);

        td->td_wchan = ident;
        td->td_wmesg = wmesg;
        TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], td, td_slpq);
        TD_SET_ON_SLEEPQ(td);
        if (timo)
                callout_reset(&td->td_slpcallout, timo, endtsleep, td);
        /*
         * We put ourselves on the sleep queue and start our timeout
         * before calling thread_suspend_check, as we could stop there, and
         * a wakeup or a SIGCONT (or both) could occur while we were stopped
         * without resuming us; thus we must be ready for sleep
         * when cursig is called.  If the wakeup happens while we're
         * stopped, td->td_wchan will be 0 upon return from cursig.
         */
        if (catch) {
                CTR3(KTR_PROC, "msleep caught: thread %p (pid %d, %s)", td,
                    p->p_pid, p->p_comm);
                td->td_flags |= TDF_SINTR;
                mtx_unlock_spin(&sched_lock);
                PROC_LOCK(p);
                sig = cursig(td);
                if (sig == 0 && thread_suspend_check(1))
                        sig = SIGSTOP;
                mtx_lock_spin(&sched_lock);
                PROC_UNLOCK(p);
                if (sig != 0) {
                        if (TD_ON_SLEEPQ(td))
                                unsleep(td);
                } else if (!TD_ON_SLEEPQ(td))
                        catch = 0;
        } else
                sig = 0;

        /*
         * Let the scheduler know we're about to voluntarily go to sleep.
         */
        sched_sleep(td, priority & PRIMASK);

        if (TD_ON_SLEEPQ(td)) {
                p->p_stats->p_ru.ru_nvcsw++;
                TD_SET_SLEEPING(td);
                mi_switch();
        }
        /*
         * We're awake from voluntary sleep.
         */
        CTR3(KTR_PROC, "msleep resume: thread %p (pid %d, %s)", td, p->p_pid,
            p->p_comm);
        KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
        td->td_flags &= ~TDF_SINTR;
        if (td->td_flags & TDF_TIMEOUT) {
                td->td_flags &= ~TDF_TIMEOUT;
                if (sig == 0)
                        rval = EWOULDBLOCK;
        } else if (td->td_flags & TDF_TIMOFAIL) {
                td->td_flags &= ~TDF_TIMOFAIL;
        } else if (timo && callout_stop(&td->td_slpcallout) == 0) {
                /*
                 * This isn't supposed to be pretty.  If we are here, then
                 * the endtsleep() callout is currently executing on another
                 * CPU and is either spinning on the sched_lock or will be
                 * soon.  If we don't synchronize here, there is a chance
                 * that this process may msleep() again before the callout
                 * has a chance to run and the callout may end up waking up
                 * the wrong msleep().  Yuck.
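                 * To close the race we go back to sleep below and let
                 * endtsleep() set TDF_TIMOFAIL and wake us up once it has
                 * acquired the sched_lock, rather than returning right away.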
                 */
                TD_SET_SLEEPING(td);
                p->p_stats->p_ru.ru_nivcsw++;
                mi_switch();
                td->td_flags &= ~TDF_TIMOFAIL;
        }
        mtx_unlock_spin(&sched_lock);

        if (rval == 0 && catch) {
                PROC_LOCK(p);
                /* XXX: shouldn't we always be calling cursig()? */
                if (sig != 0 || (sig = cursig(td))) {
                        if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
                                rval = EINTR;
                        else
                                rval = ERESTART;
                }
                PROC_UNLOCK(p);
        }
#ifdef KTRACE
        if (KTRPOINT(td, KTR_CSW))
                ktrcsw(0, 0);
#endif
        PICKUP_GIANT();
        if (mtx != NULL) {
                mtx_lock(mtx);
                WITNESS_RESTORE(&mtx->mtx_object, mtx);
        }
        return (rval);
}

/*
 * Implement the timeout for msleep().
 *
 * If the process has not been awakened (wchan non-zero),
 * set the timeout flag and undo the sleep.  If the process
 * is stopped, just unsleep so it will remain stopped.
 * MP-safe, called without the Giant mutex.
 */
static void
endtsleep(arg)
        void *arg;
{
        register struct thread *td = arg;

        CTR3(KTR_PROC, "endtsleep: thread %p (pid %d, %s)",
            td, td->td_proc->p_pid, td->td_proc->p_comm);
        mtx_lock_spin(&sched_lock);
        /*
         * This is the other half of the synchronization with msleep()
         * described above.  If the TDF_TIMEOUT flag is set, we lost the
         * race and just need to put the process back on the runqueue.
         */
        if (TD_ON_SLEEPQ(td)) {
                TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq);
                TD_CLR_ON_SLEEPQ(td);
                td->td_flags |= TDF_TIMEOUT;
        } else {
                td->td_flags |= TDF_TIMOFAIL;
        }
        TD_CLR_SLEEPING(td);
        setrunnable(td);
        mtx_unlock_spin(&sched_lock);
}

/*
 * Abort a thread, as if an interrupt had occurred.  Only abort
 * interruptible waits (unfortunately it isn't safe to abort others).
 * This is about identical to cv_abort().
 * Think about merging them?
 * Also, whatever the signal code does...
 */
void
abortsleep(struct thread *td)
{

        mtx_assert(&sched_lock, MA_OWNED);
        /*
         * If the TDF_TIMEOUT flag is set, just leave.  A
         * timeout is scheduled anyhow.
         */
        if ((td->td_flags & (TDF_TIMEOUT | TDF_SINTR)) == TDF_SINTR) {
                if (TD_ON_SLEEPQ(td)) {
                        unsleep(td);
                        TD_CLR_SLEEPING(td);
                        setrunnable(td);
                }
        }
}

/*
 * Remove a process from its wait queue.
 */
void
unsleep(struct thread *td)
{

        mtx_lock_spin(&sched_lock);
        if (TD_ON_SLEEPQ(td)) {
                TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq);
                TD_CLR_ON_SLEEPQ(td);
        }
        mtx_unlock_spin(&sched_lock);
}

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(ident)
        register void *ident;
{
        register struct slpquehead *qp;
        register struct thread *td;
        struct thread *ntd;
        struct proc *p;

        mtx_lock_spin(&sched_lock);
        qp = &slpque[LOOKUP(ident)];
restart:
        for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
                ntd = TAILQ_NEXT(td, td_slpq);
                if (td->td_wchan == ident) {
                        unsleep(td);
                        TD_CLR_SLEEPING(td);
                        setrunnable(td);
                        p = td->td_proc;
                        CTR3(KTR_PROC, "wakeup: thread %p (pid %d, %s)",
                            td, p->p_pid, p->p_comm);
                        goto restart;
                }
        }
        mtx_unlock_spin(&sched_lock);
}

/*
 * Make a process sleeping on the specified identifier runnable.
 * May wake more than one process if a target process is currently
 * swapped out.
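 * (The extra wakeup happens in setrunnable(), which for a swapped-out
 * process requests a swap-in and calls wakeup(&proc0) rather than
 * putting the thread on the run queue directly.)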
 */
void
wakeup_one(ident)
        register void *ident;
{
        register struct slpquehead *qp;
        register struct thread *td;
        register struct proc *p;
        struct thread *ntd;

        mtx_lock_spin(&sched_lock);
        qp = &slpque[LOOKUP(ident)];
        for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
                ntd = TAILQ_NEXT(td, td_slpq);
                if (td->td_wchan == ident) {
                        unsleep(td);
                        TD_CLR_SLEEPING(td);
                        setrunnable(td);
                        p = td->td_proc;
                        CTR3(KTR_PROC, "wakeup1: thread %p (pid %d, %s)",
                            td, p->p_pid, p->p_comm);
                        break;
                }
        }
        mtx_unlock_spin(&sched_lock);
}

/*
 * The machine independent parts of mi_switch().
 */
void
mi_switch(void)
{
        struct bintime new_switchtime;
        struct thread *td = curthread;  /* XXX */
        struct proc *p = td->td_proc;   /* XXX */
        struct kse *ke = td->td_kse;
        u_int sched_nest;

        mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);

        KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
#ifdef INVARIANTS
        if (!TD_ON_LOCK(td) &&
            !TD_ON_RUNQ(td) &&
            !TD_IS_RUNNING(td))
                mtx_assert(&Giant, MA_NOTOWNED);
#endif
        KASSERT(td->td_critnest == 1,
            ("mi_switch: switch in a critical section"));

        /*
         * Compute the amount of time during which the current
         * process was running, and add that to its total so far.
         */
        binuptime(&new_switchtime);
        bintime_add(&p->p_runtime, &new_switchtime);
        bintime_sub(&p->p_runtime, PCPU_PTR(switchtime));

#ifdef DDB
        /*
         * Don't perform context switches from the debugger.
         */
        if (db_active) {
                mtx_unlock_spin(&sched_lock);
                db_error("Context switches not allowed in the debugger.");
        }
#endif

        /*
         * Check if the process exceeds its cpu resource allocation.  If
         * over max, arrange to kill the process in ast().
         */
        if (p->p_cpulimit != RLIM_INFINITY &&
            p->p_runtime.sec > p->p_cpulimit) {
                p->p_sflag |= PS_XCPU;
                ke->ke_flags |= KEF_ASTPENDING;
        }

        /*
         * Finish up stats for outgoing thread.
         */
        cnt.v_swtch++;
        PCPU_SET(switchtime, new_switchtime);
        CTR3(KTR_PROC, "mi_switch: old thread %p (pid %d, %s)", td, p->p_pid,
            p->p_comm);

        sched_nest = sched_lock.mtx_recurse;
        sched_switchout(td);

        cpu_switch();           /* SHAZAM!! */

        sched_lock.mtx_recurse = sched_nest;
        sched_lock.mtx_lock = (uintptr_t)td;
        sched_switchin(td);

        /*
         * Start setting up stats etc. for the incoming thread.
         * Similar code in fork_exit() is returned to by cpu_switch()
         * in the case of a new thread/process.
         */
        CTR3(KTR_PROC, "mi_switch: new thread %p (pid %d, %s)", td, p->p_pid,
            p->p_comm);
        if (PCPU_GET(switchtime.sec) == 0)
                binuptime(PCPU_PTR(switchtime));
        PCPU_SET(switchticks, ticks);

        /*
         * Call the switchin function while still holding the scheduler lock
         * (used by the idlezero code and the general page-zeroing code).
         */
        if (td->td_switchin)
                td->td_switchin();
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
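 * A swapped-out thread is not put on the run queue here; instead
 * PS_SWAPINREQ is set and the swapper (proc0) is awakened to bring
 * the process back into memory.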
 */
void
setrunnable(struct thread *td)
{
        struct proc *p = td->td_proc;

        mtx_assert(&sched_lock, MA_OWNED);
        switch (p->p_state) {
        case PRS_ZOMBIE:
                panic("setrunnable(1)");
        default:
                break;
        }
        switch (td->td_state) {
        case TDS_RUNNING:
        case TDS_RUNQ:
                return;
        case TDS_INHIBITED:
                /*
                 * If we are only inhibited because we are swapped out
                 * then arrange to swap in this process.  Otherwise just return.
                 */
                if (td->td_inhibitors != TDI_SWAPPED)
                        return;
        case TDS_CAN_RUN:
                break;
        default:
                printf("state is 0x%x", td->td_state);
                panic("setrunnable(2)");
        }
        if ((p->p_sflag & PS_INMEM) == 0) {
                if ((p->p_sflag & PS_SWAPPINGIN) == 0) {
                        p->p_sflag |= PS_SWAPINREQ;
                        wakeup(&proc0);
                }
        } else
                sched_wakeup(td);
}

/*
 * Compute a tenex style load average of a quantity on
 * 1, 5 and 15 minute intervals.
 * XXXKSE Needs complete rewrite when correct info is available.
 * Completely Bogus.. only works with 1:1 (but compiles ok now :-)
 */
static void
loadav(void *arg)
{
        int i, nrun;
        struct loadavg *avg;
        struct proc *p;
        struct thread *td;

        avg = &averunnable;
        sx_slock(&allproc_lock);
        nrun = 0;
        FOREACH_PROC_IN_SYSTEM(p) {
                FOREACH_THREAD_IN_PROC(p, td) {
                        switch (td->td_state) {
                        case TDS_RUNQ:
                        case TDS_RUNNING:
                                if ((p->p_flag & P_NOLOAD) != 0)
                                        goto nextproc;
                                nrun++; /* XXXKSE */
                        default:
                                break;
                        }
nextproc:
                        continue;
                }
        }
        sx_sunlock(&allproc_lock);
        for (i = 0; i < 3; i++)
                avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
                    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;

        /*
         * Schedule the next update to occur after 5 seconds, but add a
         * random variation to avoid synchronisation with processes that
         * run at regular intervals.
         */
        callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
            loadav, NULL);
}

/* ARGSUSED */
static void
sched_setup(dummy)
        void *dummy;
{
        callout_init(&loadav_callout, 0);

        /* Kick off timeout driven events by calling first time. */
        loadav(NULL);
}

/*
 * General purpose yield system call.
 */
int
yield(struct thread *td, struct yield_args *uap)
{
        struct ksegrp *kg = td->td_ksegrp;

        mtx_assert(&Giant, MA_NOTOWNED);
        mtx_lock_spin(&sched_lock);
        kg->kg_proc->p_stats->p_ru.ru_nvcsw++;
        sched_prio(td, PRI_MAX_TIMESHARE);
        mi_switch();
        mtx_unlock_spin(&sched_lock);
        td->td_retval[0] = 0;

        return (0);
}