xref: /freebsd/sys/kern/kern_synch.c (revision 6e8394b8baa7d5d9153ab90de6824bcd19b3b4e1)
1 /*-
2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
39  * $Id: kern_synch.c,v 1.75 1999/03/03 18:15:29 julian Exp $
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/proc.h>
47 #include <sys/kernel.h>
48 #include <sys/signalvar.h>
49 #include <sys/resourcevar.h>
50 #include <sys/vmmeter.h>
51 #include <sys/sysctl.h>
52 #include <vm/vm.h>
53 #include <vm/vm_extern.h>
54 #ifdef KTRACE
55 #include <sys/uio.h>
56 #include <sys/ktrace.h>
57 #endif
58 
59 #include <machine/cpu.h>
60 #ifdef SMP
61 #include <machine/smp.h>
62 #endif
63 #include <machine/limits.h>	/* for UCHAR_MAX = typeof(p_priority)_MAX */
64 
65 static void rqinit __P((void *));
66 SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
67 static void sched_setup __P((void *dummy));
68 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
69 
70 u_char	curpriority;
71 int	hogticks;
72 int	lbolt;
73 int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */
74 
75 static void	endtsleep __P((void *));
76 static void	roundrobin __P((void *arg));
77 static void	schedcpu __P((void *arg));
78 static void	updatepri __P((struct proc *p));
79 
80 static int
81 sysctl_kern_quantum SYSCTL_HANDLER_ARGS
82 {
83 	int error, new_val;
84 
85 	new_val = sched_quantum * tick;
86 	error = sysctl_handle_int(oidp, &new_val, 0, req);
87 	if (error != 0 || req->newptr == NULL)
88 		return (error);
89 	if (new_val < tick)
90 		return (EINVAL);
91 	sched_quantum = new_val / tick;
92 	hogticks = 2 * sched_quantum;
93 	return (0);
94 }
95 
96 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
97 	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
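
/*
 * kern.quantum is exported in microseconds; with the hz/10 default set in
 * sleepinit() it reads back as roughly 100000 (100 ms).  A hedged usage
 * sketch from userland (FreeBSD 3.x sysctl syntax):
 *
 *	# sysctl kern.quantum			(reports kern.quantum: 100000)
 *	# sysctl -w kern.quantum=50000		(50 ms quantum; hogticks tracks two quanta)
 *
 * The handler above rejects values smaller than one clock tick with EINVAL.
 */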
98 
99 /* maybe_resched: Decide whether we need to reschedule,
100  * taking the priorities and schedulers into account.
101  */
102 static void maybe_resched(struct proc *chk)
103 {
104 	struct proc *p = curproc; /* XXX */
105 
106 	/*
107 	 * Compare priorities if the new process is on the same scheduler,
108 	 * otherwise the one on the more realtimeish scheduler wins.
109 	 *
110 	 * XXX idle scheduler still broken because process stays on idle
111 	 * scheduler during waits (such as when getting FS locks).  If a
112 	 * standard process becomes runaway cpu-bound, the system can lock up
113 	 * due to idle-scheduler processes in wakeup never getting any cpu.
114 	 */
115 	if (p == 0 ||
116 		(chk->p_priority < curpriority && RTP_PRIO_BASE(p->p_rtprio.type) == RTP_PRIO_BASE(chk->p_rtprio.type)) ||
117 		RTP_PRIO_BASE(chk->p_rtprio.type) < RTP_PRIO_BASE(p->p_rtprio.type)
118 	) {
119 		need_resched();
120 	}
121 }
122 
123 int
124 roundrobin_interval(void)
125 {
126 	return (sched_quantum);
127 }
128 
129 /*
130  * Force switch among equal priority processes every 100ms.
131  */
132 /* ARGSUSED */
133 static void
134 roundrobin(arg)
135 	void *arg;
136 {
137 #ifndef SMP
138  	struct proc *p = curproc; /* XXX */
139 #endif
140 
141 #ifdef SMP
142 	need_resched();
143 	forward_roundrobin();
144 #else
145  	if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
146  		need_resched();
147 #endif
148 
149  	timeout(roundrobin, NULL, sched_quantum);
150 }
151 
152 /*
153  * Constants for digital decay and forget:
154  *	90% of (p_estcpu) usage in 5 * loadav time
155  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
156  *          Note that, as ps(1) mentions, this can let percentages
157  *          total over 100% (I've seen 137.9% for 3 processes).
158  *
159  * Note that statclock() updates p_estcpu and p_cpticks asynchronously.
160  *
161  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
162  * That is, the system wants to compute a value of decay such
163  * that the following for loop:
164  * 	for (i = 0; i < (5 * loadavg); i++)
165  * 		p_estcpu *= decay;
166  * will compute
167  * 	p_estcpu *= 0.1;
168  * for all values of loadavg:
169  *
170  * Mathematically this loop can be expressed by saying:
171  * 	decay ** (5 * loadavg) ~= .1
172  *
173  * The system computes decay as:
174  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
175  *
176  * We wish to prove that the system's computation of decay
177  * will always fulfill the equation:
178  * 	decay ** (5 * loadavg) ~= .1
179  *
180  * If we compute b as:
181  * 	b = 2 * loadavg
182  * then
183  * 	decay = b / (b + 1)
184  *
185  * We now need to prove two things:
186  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
187  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
188  *
189  * Facts:
190  *         For x close to zero, exp(x) =~ 1 + x, since
191  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
192  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
193  *         For x close to zero, ln(1+x) =~ x, since
194  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
195  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
196  *         ln(.1) =~ -2.30
197  *
198  * Proof of (1):
199  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
200  *	solving for factor,
201  *      ln(factor) =~ (-2.30/5*loadav), or
202  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
203  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
204  *
205  * Proof of (2):
206  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
207  *	solving for power,
208  *      power*ln(b/(b+1)) =~ -2.30, or
209  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
210  *
211  * Actual power values for the implemented algorithm are as follows:
212  *      loadav: 1       2       3       4
213  *      power:  5.68    10.32   14.94   19.55
214  */
215 
216 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
217 #define	loadfactor(loadav)	(2 * (loadav))
218 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
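
/*
 * A minimal userland sketch (not compiled as part of this file) of how the
 * two macros above realize the "90% in 5*loadav seconds" behaviour described
 * in the comment block: applying decay_cpu() once per simulated second for
 * 5*loadav seconds shrinks a maximal p_estcpu of 255 to roughly a tenth.
 * FSHIFT/FSCALE are assumed to match their <sys/param.h> definitions.
 */
#if 0
#include <stdio.h>

#define	FSHIFT	11			/* assumed fixed-point shift */
#define	FSCALE	(1 << FSHIFT)
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

int
main(void)
{
	int loadav, i;

	for (loadav = 1; loadav <= 4; loadav++) {
		long loadfac = loadfactor((long)loadav * FSCALE);  /* ldavg is fixpt */
		unsigned int cpu = 255;		/* UCHAR_MAX, the p_estcpu ceiling */

		for (i = 0; i < 5 * loadav; i++)	/* one decay per schedcpu() pass */
			cpu = decay_cpu(loadfac, cpu);
		printf("loadav %d: 255 decays to %u after %d seconds\n",
		    loadav, cpu, 5 * loadav);
	}
	return (0);
}
#endif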
219 
220 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
221 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
222 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
223 
224 /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
225 static int	fscale __unused = FSCALE;
226 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
227 
228 /*
229  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
230  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
231  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
232  *
233  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
234  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
235  *
236  * If you don't want to bother with the faster/more-accurate formula, you
237  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
238  * (more general) method of calculating the %age of CPU used by a process.
239  */
240 #define	CCPU_SHIFT	11
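
/*
 * A hedged userland check (not compiled here) of why CCPU_SHIFT is 11 when
 * stathz == 100: the fast path in schedcpu() scales p_cpticks by
 * 1/2^CCPU_SHIFT == 1/2048 of FSCALE, while the general formula scales it by
 * (1 - exp(-1/20)) / stathz ~= 0.0488 / 100; the two factors differ by only
 * about 0.1%.
 */
#if 0
#include <math.h>
#include <stdio.h>

int
main(void)
{
	double fast = 1.0 / 2048.0;	/* 1 << (FSHIFT - CCPU_SHIFT), per FSCALE unit */
	double general = (1.0 - exp(-1.0 / 20.0)) / 100.0;  /* ((FSCALE-ccpu)/FSCALE)/stathz */

	printf("fast %.8f general %.8f error %.2f%%\n",
	    fast, general, 100.0 * (fast - general) / general);
	return (0);
}
#endif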
241 
242 /*
243  * Recompute process priorities, every hz ticks.
244  */
245 /* ARGSUSED */
246 static void
247 schedcpu(arg)
248 	void *arg;
249 {
250 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
251 	register struct proc *p;
252 	register int realstathz, s;
253 	register unsigned int newcpu;
254 
255 	realstathz = stathz ? stathz : hz;
256 	for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
257 		/*
258 		 * Increment time in/out of memory and sleep time
259 		 * (if sleeping).  We ignore overflow; with 16-bit int's
260 		 * (remember them?) overflow takes 45 days.
261 		 */
262 		p->p_swtime++;
263 		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
264 			p->p_slptime++;
265 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
266 		/*
267 		 * If the process has slept the entire second,
268 		 * stop recalculating its priority until it wakes up.
269 		 */
270 		if (p->p_slptime > 1)
271 			continue;
272 		s = splhigh();	/* prevent state changes and protect run queue */
273 		/*
274 		 * p_pctcpu is only for ps.
275 		 */
276 #if	(FSHIFT >= CCPU_SHIFT)
277 		p->p_pctcpu += (realstathz == 100)?
278 			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
279                 	100 * (((fixpt_t) p->p_cpticks)
280 				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
281 #else
282 		p->p_pctcpu += ((FSCALE - ccpu) *
283 			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
284 #endif
285 		p->p_cpticks = 0;
286 		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu) + p->p_nice;
287 		p->p_estcpu = min(newcpu, UCHAR_MAX);
288 		resetpriority(p);
289 		if (p->p_priority >= PUSER) {
290 #define	PPQ	(128 / NQS)		/* priorities per queue */
291 			if ((p != curproc) &&
292 #ifdef SMP
293 			    p->p_oncpu == 0xff && 	/* idle */
294 #endif
295 			    p->p_stat == SRUN &&
296 			    (p->p_flag & P_INMEM) &&
297 			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
298 				remrq(p);
299 				p->p_priority = p->p_usrpri;
300 				setrunqueue(p);
301 			} else
302 				p->p_priority = p->p_usrpri;
303 		}
304 		splx(s);
305 	}
306 	vmmeter();
307 	wakeup((caddr_t)&lbolt);
308 	timeout(schedcpu, (void *)0, hz);
309 }
310 
311 /*
312  * Recalculate the priority of a process after it has slept for a while.
313  * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
314  * least six times the loadfactor will decay p_estcpu to zero.
315  */
316 static void
317 updatepri(p)
318 	register struct proc *p;
319 {
320 	register unsigned int newcpu = p->p_estcpu;
321 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
322 
323 	if (p->p_slptime > 5 * loadfac)
324 		p->p_estcpu = 0;
325 	else {
326 		p->p_slptime--;	/* the first time was done in schedcpu */
327 		while (newcpu && --p->p_slptime)
328 			newcpu = (int) decay_cpu(loadfac, newcpu);
329 		p->p_estcpu = min(newcpu, UCHAR_MAX);
330 	}
331 	resetpriority(p);
332 }
333 
334 /*
335  * We're only looking at 7 bits of the address; everything is
336  * aligned to 4, lots of things are aligned to greater powers
337  * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
338  */
339 #define TABLESIZE	128
340 static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
341 #define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
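
/*
 * Illustrative only: LOOKUP() drops the low 8 bits of the wait channel
 * address and keeps the next 7, so channels less than 256 bytes apart tend
 * to share a chain.  A userland sketch with made-up addresses:
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define	TABLESIZE	128
#define	LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))

int
main(void)
{
	/* 0x1000 and 0x10ff share chain 16; 0x1100 moves on to chain 17. */
	printf("%ld %ld %ld\n", (long)LOOKUP(0x1000), (long)LOOKUP(0x10ff),
	    (long)LOOKUP(0x1100));
	return (0);
}
#endif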
342 
343 /*
344  * During autoconfiguration or after a panic, a sleep will simply
345  * lower the priority briefly to allow interrupts, then return.
346  * The priority to be used (safepri) is machine-dependent, thus this
347  * value is initialized and maintained in the machine-dependent layers.
348  * This priority will typically be 0, or the lowest priority
349  * that is safe for use on the interrupt stack; it can be made
350  * higher to block network software interrupts after panics.
351  */
352 int safepri;
353 
354 void
355 sleepinit(void)
356 {
357 	int i;
358 
359 	sched_quantum = hz/10;
360 	hogticks = 2 * sched_quantum;
361 	for (i = 0; i < TABLESIZE; i++)
362 		TAILQ_INIT(&slpque[i]);
363 }
364 
365 /*
366  * General sleep call.  Suspends the current process until a wakeup is
367  * performed on the specified identifier.  The process will then be made
368  * runnable with the specified priority.  Sleeps at most timo/hz seconds
369  * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
370  * before and after sleeping, else signals are not checked.  Returns 0 if
371  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
372  * signal needs to be delivered, ERESTART is returned if the current system
373  * call should be restarted if possible, and EINTR is returned if the system
374  * call should be interrupted by the signal.
375  */
376 int
377 tsleep(ident, priority, wmesg, timo)
378 	void *ident;
379 	int priority, timo;
380 	const char *wmesg;
381 {
382 	struct proc *p = curproc;
383 	int s, sig, catch = priority & PCATCH;
384 	struct callout_handle thandle;
385 
386 #ifdef KTRACE
387 	if (KTRPOINT(p, KTR_CSW))
388 		ktrcsw(p->p_tracep, 1, 0);
389 #endif
390 	s = splhigh();
391 	if (cold || panicstr) {
392 		/*
393 		 * After a panic, or during autoconfiguration,
394 		 * just give interrupts a chance, then just return;
395 		 * don't run any other procs or panic below,
396 		 * in case this is the idle process and already asleep.
397 		 */
398 		splx(safepri);
399 		splx(s);
400 		return (0);
401 	}
402 	KASSERT(p != NULL, ("tsleep1"));
403 	KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep"));
404 	/*
405 	 * The process may be sitting on a slpque if asleep() was called;
406 	 * remove it before re-adding.
407 	 */
408 	if (p->p_wchan != NULL)
409 		unsleep(p);
410 
411 	p->p_wchan = ident;
412 	p->p_wmesg = wmesg;
413 	p->p_slptime = 0;
414 	p->p_priority = priority & PRIMASK;
415 	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
416 	if (timo)
417 		thandle = timeout(endtsleep, (void *)p, timo);
418 	/*
419 	 * We put ourselves on the sleep queue and start our timeout
420 	 * before calling CURSIG, as we could stop there, and a wakeup
421 	 * or a SIGCONT (or both) could occur while we were stopped.
422 	 * A SIGCONT would cause us to be marked as SSLEEP
423 	 * without resuming us, thus we must be ready for sleep
424 	 * when CURSIG is called.  If the wakeup happens while we're
425 	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
426 	 */
427 	if (catch) {
428 		p->p_flag |= P_SINTR;
429 		if ((sig = CURSIG(p))) {
430 			if (p->p_wchan)
431 				unsleep(p);
432 			p->p_stat = SRUN;
433 			goto resume;
434 		}
435 		if (p->p_wchan == 0) {
436 			catch = 0;
437 			goto resume;
438 		}
439 	} else
440 		sig = 0;
441 	p->p_stat = SSLEEP;
442 	p->p_stats->p_ru.ru_nvcsw++;
443 	mi_switch();
444 resume:
445 	curpriority = p->p_usrpri;
446 	splx(s);
447 	p->p_flag &= ~P_SINTR;
448 	if (p->p_flag & P_TIMEOUT) {
449 		p->p_flag &= ~P_TIMEOUT;
450 		if (sig == 0) {
451 #ifdef KTRACE
452 			if (KTRPOINT(p, KTR_CSW))
453 				ktrcsw(p->p_tracep, 0, 0);
454 #endif
455 			return (EWOULDBLOCK);
456 		}
457 	} else if (timo)
458 		untimeout(endtsleep, (void *)p, thandle);
459 	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
460 #ifdef KTRACE
461 		if (KTRPOINT(p, KTR_CSW))
462 			ktrcsw(p->p_tracep, 0, 0);
463 #endif
464 		if (p->p_sigacts->ps_sigintr & sigmask(sig))
465 			return (EINTR);
466 		return (ERESTART);
467 	}
468 #ifdef KTRACE
469 	if (KTRPOINT(p, KTR_CSW))
470 		ktrcsw(p->p_tracep, 0, 0);
471 #endif
472 	return (0);
473 }
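
/*
 * A hedged sketch (not compiled here) of the canonical tsleep()/wakeup()
 * pattern used by kernel code of this era.  The softc structure, EX_READY
 * flag and "exwait" message are hypothetical; the point is that the waiter
 * re-tests its condition at raised spl so a wakeup() cannot be lost between
 * the test and the call to tsleep().
 */
#if 0
struct example_softc {			/* hypothetical driver state */
	int	sc_flags;
#define	EX_READY	0x01		/* made-up completion flag */
};

static int
example_wait(struct example_softc *sc)
{
	int error, s;

	s = splhigh();
	while ((sc->sc_flags & EX_READY) == 0) {
		error = tsleep(sc, PZERO | PCATCH, "exwait", 5 * hz);
		if (error) {		/* EWOULDBLOCK, EINTR or ERESTART */
			splx(s);
			return (error);
		}
	}
	sc->sc_flags &= ~EX_READY;
	splx(s);
	return (0);
}

static void
example_done(struct example_softc *sc)	/* called when the event occurs */
{
	sc->sc_flags |= EX_READY;
	wakeup(sc);			/* wakes every process sleeping on sc */
}
#endif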
474 
475 /*
476  * asleep() - async sleep call.  Place process on wait queue and return
477  * immediately without blocking.  The process stays runnable until await()
478  * is called.  If ident is NULL, remove process from wait queue if it is still
479  * on one.
480  *
481  * Only the most recent sleep condition is effective when making successive
482  * calls to asleep() or when calling tsleep().
483  *
484  * The timeout, if any, is not initiated until await() is called.  The sleep
485  * priority, signal, and timeout are specified in the asleep() call but may be
486  * overridden in the await() call.
487  *
488  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
489  */
490 
491 int
492 asleep(void *ident, int priority, const char *wmesg, int timo)
493 {
494 	struct proc *p = curproc;
495 	int s;
496 
497 	/*
498 	 * splhigh() while manipulating sleep structures and slpque.
499 	 *
500 	 * Remove preexisting wait condition (if any) and place process
501 	 * on appropriate slpque, but do not put process to sleep.
502 	 */
503 
504 	s = splhigh();
505 
506 	if (p->p_wchan != NULL)
507 		unsleep(p);
508 
509 	if (ident) {
510 		p->p_wchan = ident;
511 		p->p_wmesg = wmesg;
512 		p->p_slptime = 0;
513 		p->p_asleep.as_priority = priority;
514 		p->p_asleep.as_timo = timo;
515 		TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
516 	}
517 
518 	splx(s);
519 
520 	return(0);
521 }
522 
523 /*
524  * await() - wait for async condition to occur.   The process blocks until
525  * wakeup() is called on the most recent asleep() address.  If wakeup is called
526  * prior to await(), await() winds up being a NOP.
527  *
528  * If await() is called more than once (without an intervening asleep() call),
529  * await() is still effectively a NOP but it calls mi_switch() to give other
530  * processes some cpu before returning.  The process is left runnable.
531  *
532  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
533  */
534 
535 int
536 await(int priority, int timo)
537 {
538 	struct proc *p = curproc;
539 	int s;
540 
541 	s = splhigh();
542 
543 	if (p->p_wchan != NULL) {
544 		struct callout_handle thandle;
545 		int sig;
546 		int catch;
547 
548 		/*
549 		 * The call to await() can override defaults specified in
550 		 * the original asleep().
551 		 */
552 		if (priority < 0)
553 			priority = p->p_asleep.as_priority;
554 		if (timo < 0)
555 			timo = p->p_asleep.as_timo;
556 
557 		/*
558 		 * Install timeout
559 		 */
560 
561 		if (timo)
562 			thandle = timeout(endtsleep, (void *)p, timo);
563 
564 		sig = 0;
565 		catch = priority & PCATCH;
566 
567 		if (catch) {
568 			p->p_flag |= P_SINTR;
569 			if ((sig = CURSIG(p))) {
570 				if (p->p_wchan)
571 					unsleep(p);
572 				p->p_stat = SRUN;
573 				goto resume;
574 			}
575 			if (p->p_wchan == NULL) {
576 				catch = 0;
577 				goto resume;
578 			}
579 		}
580 		p->p_stat = SSLEEP;
581 		p->p_stats->p_ru.ru_nvcsw++;
582 		mi_switch();
583 resume:
584 		curpriority = p->p_usrpri;
585 
586 		splx(s);
587 		p->p_flag &= ~P_SINTR;
588 		if (p->p_flag & P_TIMEOUT) {
589 			p->p_flag &= ~P_TIMEOUT;
590 			if (sig == 0) {
591 #ifdef KTRACE
592 				if (KTRPOINT(p, KTR_CSW))
593 					ktrcsw(p->p_tracep, 0, 0);
594 #endif
595 				return (EWOULDBLOCK);
596 			}
597 		} else if (timo)
598 			untimeout(endtsleep, (void *)p, thandle);
599 		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
600 #ifdef KTRACE
601 			if (KTRPOINT(p, KTR_CSW))
602 				ktrcsw(p->p_tracep, 0, 0);
603 #endif
604 			if (p->p_sigacts->ps_sigintr & sigmask(sig))
605 				return (EINTR);
606 			return (ERESTART);
607 		}
608 #ifdef KTRACE
609 		if (KTRPOINT(p, KTR_CSW))
610 			ktrcsw(p->p_tracep, 0, 0);
611 #endif
612 	} else {
613 		/*
614 		 * If as_priority is 0, await() has been called without an
615 		 * intervening asleep().  We are still effectively a NOP,
616 		 * but we call mi_switch() for safety.
617 		 */
618 
619 		if (p->p_asleep.as_priority == 0) {
620 			p->p_stats->p_ru.ru_nvcsw++;
621 			mi_switch();
622 		}
623 		splx(s);
624 	}
625 
626 	/*
627 	 * clear p_asleep.as_priority as an indication that await() has been
628 	 * called.  If await() is called again without an intervening asleep(),
629 	 * await() is still effectively a NOP but the above mi_switch() code
630 	 * is triggered as a safety.
631 	 */
632 	p->p_asleep.as_priority = 0;
633 
634 	return (0);
635 }
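
/*
 * A hedged sketch (not compiled here) of the intended asleep()/await()
 * split; example_start_io() and the softc are hypothetical.  The wait
 * condition is registered first, more work is done while the process stays
 * runnable, and the block in await() only happens if the wakeup has not
 * already arrived in the meantime.
 */
#if 0
static int
example_async_wait(struct example_softc *sc)
{
	int error;

	asleep(sc, PZERO | PCATCH, "exasleep", hz);	/* queue the wait, do not block */
	example_start_io(sc);	/* hypothetical work; its completion calls wakeup(sc) */
	error = await(-1, -1);	/* -1/-1 keeps the priority and timeout from asleep() */
	return (error);		/* 0, EWOULDBLOCK, EINTR or ERESTART */
}
#endif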
636 
637 /*
638  * Implement timeout for tsleep or asleep()/await()
639  *
640  * If process hasn't been awakened (wchan non-zero),
641  * set timeout flag and undo the sleep.  If proc
642  * is stopped, just unsleep so it will remain stopped.
643  */
644 static void
645 endtsleep(arg)
646 	void *arg;
647 {
648 	register struct proc *p;
649 	int s;
650 
651 	p = (struct proc *)arg;
652 	s = splhigh();
653 	if (p->p_wchan) {
654 		if (p->p_stat == SSLEEP)
655 			setrunnable(p);
656 		else
657 			unsleep(p);
658 		p->p_flag |= P_TIMEOUT;
659 	}
660 	splx(s);
661 }
662 
663 /*
664  * Remove a process from its wait queue
665  */
666 void
667 unsleep(p)
668 	register struct proc *p;
669 {
670 	int s;
671 
672 	s = splhigh();
673 	if (p->p_wchan) {
674 		TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq);
675 		p->p_wchan = 0;
676 	}
677 	splx(s);
678 }
679 
680 /*
681  * Make all processes sleeping on the specified identifier runnable.
682  */
683 void
684 wakeup(ident)
685 	register void *ident;
686 {
687 	register struct slpquehead *qp;
688 	register struct proc *p;
689 	int s;
690 
691 	s = splhigh();
692 	qp = &slpque[LOOKUP(ident)];
693 restart:
694 	for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) {
695 		if (p->p_wchan == ident) {
696 			TAILQ_REMOVE(qp, p, p_procq);
697 			p->p_wchan = 0;
698 			if (p->p_stat == SSLEEP) {
699 				/* OPTIMIZED EXPANSION OF setrunnable(p); */
700 				if (p->p_slptime > 1)
701 					updatepri(p);
702 				p->p_slptime = 0;
703 				p->p_stat = SRUN;
704 				if (p->p_flag & P_INMEM) {
705 					setrunqueue(p);
706 					maybe_resched(p);
707 				} else {
708 					p->p_flag |= P_SWAPINREQ;
709 					wakeup((caddr_t)&proc0);
710 				}
711 				/* END INLINE EXPANSION */
712 				goto restart;
713 			}
714 		}
715 	}
716 	splx(s);
717 }
718 
719 /*
720  * Make a process sleeping on the specified identifier runnable.
721  * May wake more than one process if a target process is currently
722  * swapped out.
723  */
724 void
725 wakeup_one(ident)
726 	register void *ident;
727 {
728 	register struct slpquehead *qp;
729 	register struct proc *p;
730 	int s;
731 
732 	s = splhigh();
733 	qp = &slpque[LOOKUP(ident)];
734 
735 	for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) {
736 		if (p->p_wchan == ident) {
737 			TAILQ_REMOVE(qp, p, p_procq);
738 			p->p_wchan = 0;
739 			if (p->p_stat == SSLEEP) {
740 				/* OPTIMIZED EXPANSION OF setrunnable(p); */
741 				if (p->p_slptime > 1)
742 					updatepri(p);
743 				p->p_slptime = 0;
744 				p->p_stat = SRUN;
745 				if (p->p_flag & P_INMEM) {
746 					setrunqueue(p);
747 					maybe_resched(p);
748 					break;
749 				} else {
750 					p->p_flag |= P_SWAPINREQ;
751 					wakeup((caddr_t)&proc0);
752 				}
753 				/* END INLINE EXPANSION */
754 			}
755 		}
756 	}
757 	splx(s);
758 }
759 
760 /*
761  * The machine independent parts of mi_switch().
762  * Must be called at splstatclock() or higher.
763  */
764 void
765 mi_switch()
766 {
767 	struct timeval new_switchtime;
768 	register struct proc *p = curproc;	/* XXX */
769 	register struct rlimit *rlim;
770 	int x;
771 
772 	/*
773 	 * XXX this spl is almost unnecessary.  It is partly to allow for
774 	 * sloppy callers that don't do it (issignal() via CURSIG() is the
775 	 * main offender).  It is partly to work around a bug in the i386
776 	 * cpu_switch() (the ipl is not preserved).  We ran for years
777 	 * without it.  I think there was only a interrupt latency problem.
778 	 * without it.  I think there was only an interrupt latency problem.
779 	 * after calling here.  The buggy caller, issignal(), usually calls
780 	 * here at spl0() and sometimes returns at splhigh().  The process
781 	 * then runs for a little too long at splhigh().  The ipl gets fixed
782 	 * when the process returns to user mode (or earlier).
783 	 *
784 	 * It would probably be better to always call here at spl0(). Callers
785 	 * are prepared to give up control to another process, so they must
786 	 * be prepared to be interrupted.  The clock stuff here may not
787 	 * actually need splstatclock().
788 	 */
789 	x = splstatclock();
790 
791 #ifdef SIMPLELOCK_DEBUG
792 	if (p->p_simple_locks)
793 		printf("sleep: holding simple lock\n");
794 #endif
795 	/*
796 	 * Compute the amount of time during which the current
797 	 * process was running, and add that to its total so far.
798 	 */
799 	microuptime(&new_switchtime);
800 	p->p_runtime += (new_switchtime.tv_usec - switchtime.tv_usec) +
801 	    (new_switchtime.tv_sec - switchtime.tv_sec) * (int64_t)1000000;
802 
803 	/*
804 	 * Check if the process exceeds its cpu resource allocation.
805 	 * If over max, kill it.
806 	 */
807 	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
808 	    p->p_runtime > p->p_limit->p_cpulimit) {
809 		rlim = &p->p_rlimit[RLIMIT_CPU];
810 		if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
811 			killproc(p, "exceeded maximum CPU limit");
812 		} else {
813 			psignal(p, SIGXCPU);
814 			if (rlim->rlim_cur < rlim->rlim_max) {
815 				/* XXX: we should make a private copy */
816 				rlim->rlim_cur += 5;
817 			}
818 		}
819 	}
820 
821 	/*
822 	 * Pick a new current process and record its start time.
823 	 */
824 	cnt.v_swtch++;
825 	switchtime = new_switchtime;
826 	cpu_switch(p);
827 	if (switchtime.tv_sec == 0)
828 		microuptime(&switchtime);
829 	switchticks = ticks;
830 
831 	splx(x);
832 }
833 
834 /*
835  * Initialize the (doubly-linked) run queues
836  * to be empty.
837  */
838 /* ARGSUSED*/
839 static void
840 rqinit(dummy)
841 	void *dummy;
842 {
843 	register int i;
844 
845 	for (i = 0; i < NQS; i++) {
846 		qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i];
847 		rtqs[i].ph_link = rtqs[i].ph_rlink = (struct proc *)&rtqs[i];
848 		idqs[i].ph_link = idqs[i].ph_rlink = (struct proc *)&idqs[i];
849 	}
850 }
851 
852 /*
853  * Change process state to be runnable,
854  * placing it on the run queue if it is in memory,
855  * and awakening the swapper if it isn't in memory.
856  */
857 void
858 setrunnable(p)
859 	register struct proc *p;
860 {
861 	register int s;
862 
863 	s = splhigh();
864 	switch (p->p_stat) {
865 	case 0:
866 	case SRUN:
867 	case SZOMB:
868 	default:
869 		panic("setrunnable");
870 	case SSTOP:
871 	case SSLEEP:
872 		unsleep(p);		/* e.g. when sending signals */
873 		break;
874 
875 	case SIDL:
876 		break;
877 	}
878 	p->p_stat = SRUN;
879 	if (p->p_flag & P_INMEM)
880 		setrunqueue(p);
881 	splx(s);
882 	if (p->p_slptime > 1)
883 		updatepri(p);
884 	p->p_slptime = 0;
885 	if ((p->p_flag & P_INMEM) == 0) {
886 		p->p_flag |= P_SWAPINREQ;
887 		wakeup((caddr_t)&proc0);
888 	}
889 	else
890 		maybe_resched(p);
891 }
892 
893 /*
894  * Compute the priority of a process when running in user mode.
895  * Arrange to reschedule if the resulting priority is better
896  * than that of the current process.
897  */
898 void
899 resetpriority(p)
900 	register struct proc *p;
901 {
902 	register unsigned int newpriority;
903 
904 	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
905 		newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice;
906 		newpriority = min(newpriority, MAXPRI);
907 		p->p_usrpri = newpriority;
908 	}
909 	maybe_resched(p);
910 }
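
/*
 * Worked example of the formula above (numbers chosen for illustration
 * only): with p_estcpu == 36 and p_nice == 4, newpriority becomes
 * PUSER + 36/4 + 2*4 == PUSER + 17; values past MAXPRI are simply clamped
 * by the min() call.
 */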
911 
912 /* ARGSUSED */
913 static void
914 sched_setup(dummy)
915 	void *dummy;
916 {
917 	/* Kick off timeout-driven events by calling them for the first time. */
918 	roundrobin(NULL);
919 	schedcpu(NULL);
920 }
921 
922