/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 * $FreeBSD$
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>
#include <machine/ipl.h>
#include <machine/smp.h>

static void sched_setup __P((void *dummy));
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)

u_char	curpriority;
int	hogticks;
int	lbolt;
int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */

static int	curpriority_cmp __P((struct proc *p));
static void	endtsleep __P((void *));
static void	maybe_resched __P((struct proc *chk));
static void	roundrobin __P((void *arg));
static void	schedcpu __P((void *arg));
static void	updatepri __P((struct proc *p));

static int
sysctl_kern_quantum SYSCTL_HANDLER_ARGS
{
	int error, new_val;

	new_val = sched_quantum * tick;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val < tick)
		return (EINVAL);
	sched_quantum = new_val / tick;
	hogticks = 2 * sched_quantum;
	return (0);
}

SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
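
/*
 * Editorial note (not from the original source): the handler above exposes
 * the quantum in microseconds, so with the default quantum of hz/10 ticks a
 * read reports 100000 (100ms) regardless of hz, and something like
 * `sysctl -w kern.quantum=200000' would double it.  Values smaller than one
 * clock tick are rejected with EINVAL, and hogticks is kept at twice the
 * quantum.
 */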

/*-
 * Compare priorities.  Return:
 *     <0: priority of p < current priority
 *      0: priority of p == current priority
 *     >0: priority of p > current priority
 * The priorities are the normal priorities or the normal realtime priorities
 * if p is on the same scheduler as curproc.  Otherwise the process on the
 * more realtimeish scheduler has lowest priority.  As usual, a higher
 * priority really means a lower priority.
 */
static int
curpriority_cmp(p)
	struct proc *p;
{
	int c_class, p_class;

	c_class = RTP_PRIO_BASE(curproc->p_rtprio.type);
	p_class = RTP_PRIO_BASE(p->p_rtprio.type);
	if (p_class != c_class)
		return (p_class - c_class);
	if (p_class == RTP_PRIO_NORMAL)
		return (((int)p->p_priority - (int)curpriority) / PPQ);
	return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio);
}

/*
 * Arrange to reschedule if necessary, taking the priorities and
 * schedulers into account.
 */
static void
maybe_resched(chk)
	struct proc *chk;
{
	struct proc *p = curproc; /* XXX */

	/*
	 * XXX idle scheduler still broken because a process stays on the
	 * idle scheduler during waits (such as when getting FS locks).  If a
	 * standard process becomes runaway cpu-bound, the system can lock up
	 * due to idle-scheduler processes in wakeup never getting any cpu.
	 */
	if (p == NULL) {
#if 0
		need_resched();
#endif
	} else if (chk == p) {
		/* We may need to yield if our priority has been raised. */
		if (curpriority_cmp(chk) > 0)
			need_resched();
	} else if (curpriority_cmp(chk) < 0)
		need_resched();
}

int
roundrobin_interval(void)
{
	return (sched_quantum);
}

/*
 * Force switch among equal priority processes every 100ms.
 */
/* ARGSUSED */
static void
roundrobin(arg)
	void *arg;
{
#ifndef SMP
	struct proc *p = curproc; /* XXX */
#endif

#ifdef SMP
	need_resched();
	forward_roundrobin();
#else
	if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
		need_resched();
#endif

	timeout(roundrobin, NULL, sched_quantum);
}

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg.
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ -2.30/(5*loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
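
/*
 * Worked example (editorial sketch, not from the original source): with a
 * load average of 1.00, ldavg[0] == FSCALE, so loadfactor() yields
 * 2 * FSCALE and decay_cpu() reduces to
 *
 *	(2 * FSCALE * cpu) / (3 * FSCALE) == (2 * cpu) / 3
 *
 * i.e. each schedcpu() pass multiplies p_estcpu by 2/3.  Since
 * ln(0.1) / ln(2/3) =~ 5.68, about 5.68 passes decay away 90% of the usage,
 * matching the loadav == 1 entry in the power table above.
 */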

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
static int	fscale __unused = FSCALE;
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
static void
schedcpu(arg)
	void *arg;
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	register struct proc *p;
	register int realstathz, s;

	realstathz = stathz ? stathz : hz;
	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		s = splhigh();	/* prevent state changes and protect run queue */
		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (realstathz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		resetpriority(p);
		if (p->p_priority >= PUSER) {
			if ((p != curproc) &&
#ifdef SMP
			    p->p_oncpu == 0xff &&	/* idle */
#endif
			    p->p_stat == SRUN &&
			    (p->p_flag & P_INMEM) &&
			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
				remrunqueue(p);
				p->p_priority = p->p_usrpri;
				setrunqueue(p);
			} else
				p->p_priority = p->p_usrpri;
		}
		splx(s);
	}
	vmmeter();
	wakeup((caddr_t)&lbolt);
	timeout(schedcpu, (void *)0, hz);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
static void
updatepri(p)
	register struct proc *p;
{
	register unsigned int newcpu = p->p_estcpu;
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);

	if (p->p_slptime > 5 * loadfac)
		p->p_estcpu = 0;
	else {
		p->p_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --p->p_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(p);
}

/*
 * We're only looking at 7 bits of the address; everything is
 * aligned to 4, lots of things are aligned to greater powers
 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 */
#define TABLESIZE	128
static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
#define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
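
/*
 * Illustrative example (editorial, the address is purely hypothetical): a
 * wait channel such as ident == 0x00c4f260 hashes to
 *
 *	LOOKUP(0x00c4f260) == (0x00c4f260 >> 8) & 127 == 0xc4f2 & 0x7f == 0x72
 *
 * so every process sleeping on that identifier sits on slpque[114], and
 * wakeup() only has to scan that one bucket for matching p_wchan values.
 */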

/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

void
sleepinit(void)
{
	int i;

	sched_quantum = hz/10;
	hogticks = 2 * sched_quantum;
	for (i = 0; i < TABLESIZE; i++)
		TAILQ_INIT(&slpque[i]);
}
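
/*
 * Editorial note: assuming the traditional default of hz = 100, the
 * initialization above gives sched_quantum = 10 ticks and hogticks = 20
 * ticks, i.e. a 100ms round-robin quantum, which is what the roundrobin()
 * comment earlier in this file refers to.  A kernel built with a different
 * hz scales accordingly.
 */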

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping, else signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 */
int
tsleep(ident, priority, wmesg, timo)
	void *ident;
	int priority, timo;
	const char *wmesg;
{
	struct proc *p = curproc;
	int s, sig, catch = priority & PCATCH;
	struct callout_handle thandle;

#ifdef KTRACE
	if (p && KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 1, 0);
#endif
	s = splhigh();
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		splx(safepri);
		splx(s);
		return (0);
	}
	KASSERT(p != NULL, ("tsleep1"));
	KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep"));
	/*
	 * The process may be sitting on a slpque if asleep() was called;
	 * remove it before re-adding.
	 */
	if (p->p_wchan != NULL)
		unsleep(p);

	p->p_wchan = ident;
	p->p_wmesg = wmesg;
	p->p_slptime = 0;
	p->p_priority = priority & PRIMASK;
	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
	if (timo)
		thandle = timeout(endtsleep, (void *)p, timo);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		p->p_flag |= P_SINTR;
		if ((sig = CURSIG(p))) {
			if (p->p_wchan)
				unsleep(p);
			p->p_stat = SRUN;
			goto resume;
		}
		if (p->p_wchan == 0) {
			catch = 0;
			goto resume;
		}
	} else
		sig = 0;
	p->p_stat = SSLEEP;
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
resume:
	curpriority = p->p_usrpri;
	splx(s);
	p->p_flag &= ~P_SINTR;
	if (p->p_flag & P_TIMEOUT) {
		p->p_flag &= ~P_TIMEOUT;
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			return (EWOULDBLOCK);
		}
	} else if (timo)
		untimeout(endtsleep, (void *)p, thandle);
	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
		if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
			return (EINTR);
		return (ERESTART);
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 0, 0);
#endif
	return (0);
}
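
/*
 * Editorial usage sketch (not part of the original file): a typical caller
 * loops on its wakeup condition, since tsleep() can return for reasons
 * other than the event of interest.  The `sc' softc, its SC_READY flag and
 * the "scrdy" wmesg below are hypothetical names.
 */
#if 0
	while ((sc->sc_flags & SC_READY) == 0) {
		error = tsleep(&sc->sc_flags, PZERO | PCATCH, "scrdy", hz);
		if (error == EWOULDBLOCK)
			continue;		/* timed out, re-test */
		if (error)
			return (error);		/* EINTR or ERESTART */
	}
#endif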

/*
 * asleep() - async sleep call.  Place process on wait queue and return
 * immediately without blocking.  The process stays runnable until await()
 * is called.  If ident is NULL, remove process from wait queue if it is still
 * on one.
 *
 * Only the most recent sleep condition is effective when making successive
 * calls to asleep() or when calling tsleep().
 *
 * The timeout, if any, is not initiated until await() is called.  The sleep
 * priority, signal, and timeout are specified in the asleep() call but may be
 * overridden in the await() call.
 *
 * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
 */

int
asleep(void *ident, int priority, const char *wmesg, int timo)
{
	struct proc *p = curproc;
	int s;

	/*
	 * splhigh() while manipulating sleep structures and slpque.
	 *
	 * Remove preexisting wait condition (if any) and place process
	 * on appropriate slpque, but do not put process to sleep.
	 */

	s = splhigh();

	if (p->p_wchan != NULL)
		unsleep(p);

	if (ident) {
		p->p_wchan = ident;
		p->p_wmesg = wmesg;
		p->p_slptime = 0;
		p->p_asleep.as_priority = priority;
		p->p_asleep.as_timo = timo;
		TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
	}

	splx(s);

	return(0);
}

/*
 * await() - wait for an async condition to occur.  The process blocks until
 * wakeup() is called on the most recent asleep() address.  If wakeup is
 * called prior to await(), await() winds up being a NOP.
 *
 * If await() is called more than once (without an intervening asleep() call),
 * await() is still effectively a NOP but it calls mi_switch() to give other
 * processes some cpu before returning.  The process is left runnable.
 *
 * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
 */

int
await(int priority, int timo)
{
	struct proc *p = curproc;
	int s;

	s = splhigh();

	if (p->p_wchan != NULL) {
		struct callout_handle thandle;
		int sig;
		int catch;

		/*
		 * The call to await() can override defaults specified in
		 * the original asleep().
		 */
		if (priority < 0)
			priority = p->p_asleep.as_priority;
		if (timo < 0)
			timo = p->p_asleep.as_timo;

		/*
		 * Install timeout
		 */

		if (timo)
			thandle = timeout(endtsleep, (void *)p, timo);

		sig = 0;
		catch = priority & PCATCH;

		if (catch) {
			p->p_flag |= P_SINTR;
			if ((sig = CURSIG(p))) {
				if (p->p_wchan)
					unsleep(p);
				p->p_stat = SRUN;
				goto resume;
			}
			if (p->p_wchan == NULL) {
				catch = 0;
				goto resume;
			}
		}
		p->p_stat = SSLEEP;
		p->p_stats->p_ru.ru_nvcsw++;
		mi_switch();
resume:
		curpriority = p->p_usrpri;

		splx(s);
		p->p_flag &= ~P_SINTR;
		if (p->p_flag & P_TIMEOUT) {
			p->p_flag &= ~P_TIMEOUT;
			if (sig == 0) {
#ifdef KTRACE
				if (KTRPOINT(p, KTR_CSW))
					ktrcsw(p->p_tracep, 0, 0);
#endif
				return (EWOULDBLOCK);
			}
		} else if (timo)
			untimeout(endtsleep, (void *)p, thandle);
		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
				return (EINTR);
			return (ERESTART);
		}
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
	} else {
		/*
		 * If as_priority is 0, await() has been called without an
		 * intervening asleep().  We are still effectively a NOP,
		 * but we call mi_switch() for safety.
		 */

		if (p->p_asleep.as_priority == 0) {
			p->p_stats->p_ru.ru_nvcsw++;
			mi_switch();
		}
		splx(s);
	}

	/*
	 * clear p_asleep.as_priority as an indication that await() has been
	 * called.  If await() is called again without an intervening asleep(),
	 * await() is still effectively a NOP but the above mi_switch() code
	 * is triggered as a safety.
	 */
	p->p_asleep.as_priority = 0;

	return (0);
}
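
/*
 * Editorial usage sketch (not part of the original file): the intended
 * pattern is to register the wait channel first, do other work while still
 * runnable, and only then block.  The `sc' softc and its SC_DONE flag are
 * hypothetical; a wakeup(&sc->sc_flags) arriving between the two calls
 * simply makes await() return immediately.
 */
#if 0
	asleep(&sc->sc_flags, PZERO | PCATCH, "scdone", 5 * hz);
	/* ... release locks, poke the hardware, etc. ... */
	error = await(-1, -1);		/* -1, -1: keep asleep()'s prio/timo */
	if (error == EWOULDBLOCK)
		printf("scdone: timed out waiting for completion\n");
#endif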

/*
 * Implement timeout for tsleep or asleep()/await()
 *
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 */
static void
endtsleep(arg)
	void *arg;
{
	register struct proc *p;
	int s;

	p = (struct proc *)arg;
	s = splhigh();
	if (p->p_wchan) {
		if (p->p_stat == SSLEEP)
			setrunnable(p);
		else
			unsleep(p);
		p->p_flag |= P_TIMEOUT;
	}
	splx(s);
}

/*
 * Remove a process from its wait queue
 */
void
unsleep(p)
	register struct proc *p;
{
	int s;

	s = splhigh();
	if (p->p_wchan) {
		TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq);
		p->p_wchan = 0;
	}
	splx(s);
}

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(ident)
	register void *ident;
{
	register struct slpquehead *qp;
	register struct proc *p;
	int s;

	s = splhigh();
	qp = &slpque[LOOKUP(ident)];
restart:
	TAILQ_FOREACH(p, qp, p_procq) {
		if (p->p_wchan == ident) {
			TAILQ_REMOVE(qp, p, p_procq);
			p->p_wchan = 0;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
					maybe_resched(p);
				} else {
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
				goto restart;
			}
		}
	}
	splx(s);
}

/*
 * Make a process sleeping on the specified identifier runnable.
 * May wake more than one process if a target process is currently
 * swapped out.
 */
void
wakeup_one(ident)
	register void *ident;
{
	register struct slpquehead *qp;
	register struct proc *p;
	int s;

	s = splhigh();
	qp = &slpque[LOOKUP(ident)];

	TAILQ_FOREACH(p, qp, p_procq) {
		if (p->p_wchan == ident) {
			TAILQ_REMOVE(qp, p, p_procq);
			p->p_wchan = 0;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
					maybe_resched(p);
					break;
				} else {
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
			}
		}
	}
	splx(s);
}

/*
 * The machine-independent parts of mi_switch().
 * Must be called at splstatclock() or higher.
 */
void
mi_switch()
{
	struct timeval new_switchtime;
	register struct proc *p = curproc;	/* XXX */
	register struct rlimit *rlim;
	int x;

	/*
	 * XXX this spl is almost unnecessary.  It is partly to allow for
	 * sloppy callers that don't do it (issignal() via CURSIG() is the
	 * main offender).  It is partly to work around a bug in the i386
	 * cpu_switch() (the ipl is not preserved).  We ran for years
	 * without it.  I think there was only an interrupt latency problem.
	 * The main caller, tsleep(), does an splx() a couple of instructions
	 * after calling here.  The buggy caller, issignal(), usually calls
	 * here at spl0() and sometimes returns at splhigh().  The process
	 * then runs for a little too long at splhigh().  The ipl gets fixed
	 * when the process returns to user mode (or earlier).
	 *
	 * It would probably be better to always call here at spl0(). Callers
	 * are prepared to give up control to another process, so they must
	 * be prepared to be interrupted.  The clock stuff here may not
	 * actually need splstatclock().
	 */
	x = splstatclock();

#ifdef SIMPLELOCK_DEBUG
	if (p->p_simple_locks)
		printf("sleep: holding simple lock\n");
#endif
	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 */
	microuptime(&new_switchtime);
	if (timevalcmp(&new_switchtime, &switchtime, <)) {
		printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
		    switchtime.tv_sec, switchtime.tv_usec,
		    new_switchtime.tv_sec, new_switchtime.tv_usec);
		new_switchtime = switchtime;
	} else {
		p->p_runtime += (new_switchtime.tv_usec - switchtime.tv_usec) +
		    (new_switchtime.tv_sec - switchtime.tv_sec) * (int64_t)1000000;
	}

	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.
	 */
	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
	    p->p_runtime > p->p_limit->p_cpulimit) {
		rlim = &p->p_rlimit[RLIMIT_CPU];
		if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
			killproc(p, "exceeded maximum CPU limit");
		} else {
			psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max) {
				/* XXX: we should make a private copy */
				rlim->rlim_cur += 5;
			}
		}
	}

	/*
	 * Pick a new current process and record its start time.
	 */
	cnt.v_swtch++;
	switchtime = new_switchtime;
	cpu_switch(p);
	if (switchtime.tv_sec == 0)
		microuptime(&switchtime);
	switchticks = ticks;

	splx(x);
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(p)
	register struct proc *p;
{
	register int s;

	s = splhigh();
	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SZOMB:
	default:
		panic("setrunnable");
	case SSTOP:
	case SSLEEP:
		unsleep(p);		/* e.g. when sending signals */
		break;

	case SIDL:
		break;
	}
	p->p_stat = SRUN;
	if (p->p_flag & P_INMEM)
		setrunqueue(p);
	splx(s);
	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	if ((p->p_flag & P_INMEM) == 0) {
		p->p_flag |= P_SWAPINREQ;
		wakeup((caddr_t)&proc0);
	}
	else
		maybe_resched(p);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(p)
	register struct proc *p;
{
	register unsigned int newpriority;

	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
		    NICE_WEIGHT * (p->p_nice - PRIO_MIN);
		newpriority = min(newpriority, MAXPRI);
		p->p_usrpri = newpriority;
	}
	maybe_resched(p);
}

/* ARGSUSED */
static void
sched_setup(dummy)
	void *dummy;
{
	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);
	schedcpu(NULL);
}

/*
 * We adjust the priority of the current process.  The priority of
 * a process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (p_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time p_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached).  The cpu usage
 * estimator ramps up quite quickly when the process is running
 * (linearly), and decays away exponentially, at a rate which is
 * proportionally slower when the system is busy.  The basic principle
 * is that the system will 90% forget that the process used a lot of
 * CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin
 * among other processes.
 */
void
schedclock(p)
	struct proc *p;
{

	p->p_cpticks++;
	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
		resetpriority(p);
		if (p->p_priority >= PUSER)
			p->p_priority = p->p_usrpri;
	}
}