xref: /freebsd/sys/kern/kern_synch.c (revision 4cf49a43559ed9fdad601bdcccd2c55963008675)
1 /*-
2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/proc.h>
47 #include <sys/kernel.h>
48 #include <sys/signalvar.h>
49 #include <sys/resourcevar.h>
50 #include <sys/vmmeter.h>
51 #include <sys/sysctl.h>
52 #include <vm/vm.h>
53 #include <vm/vm_extern.h>
54 #ifdef KTRACE
55 #include <sys/uio.h>
56 #include <sys/ktrace.h>
57 #endif
58 
59 #include <machine/cpu.h>
60 #ifdef SMP
61 #include <machine/smp.h>
62 #endif
63 #include <machine/limits.h>	/* for UCHAR_MAX = typeof(p_priority)_MAX */
64 
65 static void sched_setup __P((void *dummy));
66 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
67 
68 u_char	curpriority;
69 int	hogticks;
70 int	lbolt;
71 int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */
72 
73 static void	endtsleep __P((void *));
74 static void	roundrobin __P((void *arg));
75 static void	schedcpu __P((void *arg));
76 static void	updatepri __P((struct proc *p));
77 
78 static int
79 sysctl_kern_quantum SYSCTL_HANDLER_ARGS
80 {
81 	int error, new_val;
82 
83 	new_val = sched_quantum * tick;
84 	error = sysctl_handle_int(oidp, &new_val, 0, req);
85         if (error != 0 || req->newptr == NULL)
86 		return (error);
87 	if (new_val < tick)
88 		return (EINVAL);
89 	sched_quantum = new_val / tick;
90 	hogticks = 2 * sched_quantum;
91 	return (0);
92 }
93 
94 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
95 	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
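
/*
 * For illustration (assuming hz = 100, so tick = 10000 us): setting
 * kern.quantum to 20000 with sysctl(8) passes the new_val >= tick check
 * above and yields sched_quantum = 2 ticks and hogticks = 4 ticks.
 */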
96 
97 /* maybe_resched: Decide whether we need to reschedule,
98  * taking the priorities and schedulers into account.
99  */
100 static void maybe_resched(struct proc *chk)
101 {
102 	struct proc *p = curproc; /* XXX */
103 
104 	/*
105 	 * Compare priorities if the new process is on the same scheduler,
106 	 * otherwise the one on the more realtimeish scheduler wins.
107 	 *
108  * XXX idle scheduler still broken because process stays on idle
109 	 * scheduler during waits (such as when getting FS locks).  If a
110  * standard process becomes runaway cpu-bound, the system can lock up
111 	 * due to idle-scheduler processes in wakeup never getting any cpu.
112 	 */
113 	if (p == 0 ||
114 		(chk->p_priority < curpriority && RTP_PRIO_BASE(p->p_rtprio.type) == RTP_PRIO_BASE(chk->p_rtprio.type)) ||
115 		RTP_PRIO_BASE(chk->p_rtprio.type) < RTP_PRIO_BASE(p->p_rtprio.type)
116 	) {
117 		need_resched();
118 	}
119 }
120 
121 int
122 roundrobin_interval(void)
123 {
124 	return (sched_quantum);
125 }
126 
127 /*
128  * Force switch among equal priority processes every 100ms.
129  */
130 /* ARGSUSED */
131 static void
132 roundrobin(arg)
133 	void *arg;
134 {
135 #ifndef SMP
136  	struct proc *p = curproc; /* XXX */
137 #endif
138 
139 #ifdef SMP
140 	need_resched();
141 	forward_roundrobin();
142 #else
143  	if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
144  		need_resched();
145 #endif
146 
147  	timeout(roundrobin, NULL, sched_quantum);
148 }
149 
150 /*
151  * Constants for digital decay and forget:
152  *	90% of (p_estcpu) usage in 5 * loadav time
153  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
154  *          Note that, as ps(1) mentions, this can let percentages
155  *          total over 100% (I've seen 137.9% for 3 processes).
156  *
157  * Note that statclock() updates p_estcpu and p_cpticks asynchronously.
158  *
159  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
160  * That is, the system wants to compute a value of decay such
161  * that the following for loop:
162  * 	for (i = 0; i < (5 * loadavg); i++)
163  * 		p_estcpu *= decay;
164  * will compute
165  * 	p_estcpu *= 0.1;
166  * for all values of loadavg:
167  *
168  * Mathematically this loop can be expressed by saying:
169  * 	decay ** (5 * loadavg) ~= .1
170  *
171  * The system computes decay as:
172  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
173  *
174  * We wish to prove that the system's computation of decay
175  * will always fulfill the equation:
176  * 	decay ** (5 * loadavg) ~= .1
177  *
178  * If we compute b as:
179  * 	b = 2 * loadavg
180  * then
181  * 	decay = b / (b + 1)
182  *
183  * We now need to prove two things:
184  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
185  *	2) Given (b/(b+1)) ** power ~= .1, prove power == (5 * loadavg)
186  *
187  * Facts:
188  *         For x close to zero, exp(x) =~ 1 + x, since
189  *              exp(x) = 1 + x**1/1! + x**2/2! + ... .
190  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
191  *         For x close to zero, ln(1+x) =~ x, since
192  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
193  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
194  *         ln(.1) =~ -2.30
195  *
196  * Proof of (1):
197  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
198  *	solving for factor,
199  *      ln(factor) =~ -2.30/(5*loadav), or
200  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
201  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
202  *
203  * Proof of (2):
204  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
205  *	solving for power,
206  *      power*ln(b/(b+1)) =~ -2.30, or
207  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
208  *
209  * Actual power values for the implemented algorithm are as follows:
210  *      loadav: 1       2       3       4
211  *      power:  5.68    10.32   14.94   19.55
212  */
213 
214 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
215 #define	loadfactor(loadav)	(2 * (loadav))
216 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
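
/*
 * Worked example: with a load average of 1.0 (ldavg == FSCALE),
 * loadfactor() yields 2 * FSCALE and decay_cpu() scales p_estcpu by
 * 2*FSCALE / (2*FSCALE + FSCALE) = 2/3 each second; (2/3)^5.68 ~= 0.1,
 * matching the "power" row for loadav == 1 in the table above.
 */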
217 
218 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
219 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
220 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
221 
222 /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
223 static int	fscale __unused = FSCALE;
224 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
225 
226 /*
227  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
228  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
229  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
230  *
231  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
232  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
233  *
234  * If you don't want to bother with the faster/more-accurate formula, you
235  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
236  * (more general) method of calculating the %age of CPU used by a process.
237  */
238 #define	CCPU_SHIFT	11
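
/*
 * Worked example: ccpu/FSCALE == exp(-1/20), so the once-a-second
 * "p_pctcpu = (p_pctcpu * ccpu) >> FSHIFT" step in schedcpu() leaves
 * exp(-60/20) = exp(-3) ~= 0.05 of the original value after 60 seconds,
 * i.e. the ~95% decay promised above.
 */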
239 
240 /*
241  * Recompute process priorities, every hz ticks.
242  */
243 /* ARGSUSED */
244 static void
245 schedcpu(arg)
246 	void *arg;
247 {
248 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
249 	register struct proc *p;
250 	register int realstathz, s;
251 	register unsigned int newcpu;
252 
253 	realstathz = stathz ? stathz : hz;
254 	for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
255 		/*
256 		 * Increment time in/out of memory and sleep time
257 		 * (if sleeping).  We ignore overflow; with 16-bit int's
258 		 * (remember them?) overflow takes 45 days.
259 		 */
260 		p->p_swtime++;
261 		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
262 			p->p_slptime++;
263 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
264 		/*
265 		 * If the process has slept the entire second,
266 		 * stop recalculating its priority until it wakes up.
267 		 */
268 		if (p->p_slptime > 1)
269 			continue;
270 		s = splhigh();	/* prevent state changes and protect run queue */
271 		/*
272 		 * p_pctcpu is only for ps.
273 		 */
274 #if	(FSHIFT >= CCPU_SHIFT)
275 		p->p_pctcpu += (realstathz == 100)?
276 			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
277                 	100 * (((fixpt_t) p->p_cpticks)
278 				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
279 #else
280 		p->p_pctcpu += ((FSCALE - ccpu) *
281 			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
282 #endif
283 		p->p_cpticks = 0;
284 		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu) + p->p_nice;
285 		p->p_estcpu = min(newcpu, UCHAR_MAX);
286 		resetpriority(p);
287 		if (p->p_priority >= PUSER) {
288 #define	PPQ	(128 / NQS)		/* priorities per queue */
289 			if ((p != curproc) &&
290 #ifdef SMP
291 			    p->p_oncpu == 0xff && 	/* idle */
292 #endif
293 			    p->p_stat == SRUN &&
294 			    (p->p_flag & P_INMEM) &&
295 			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
296 				remrunqueue(p);
297 				p->p_priority = p->p_usrpri;
298 				setrunqueue(p);
299 			} else
300 				p->p_priority = p->p_usrpri;
301 		}
302 		splx(s);
303 	}
304 	vmmeter();
305 	wakeup((caddr_t)&lbolt);
306 	timeout(schedcpu, (void *)0, hz);
307 }
308 
309 /*
310  * Recalculate the priority of a process after it has slept for a while.
311  * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
312  * least six times the loadfactor will decay p_estcpu to zero.
313  */
314 static void
315 updatepri(p)
316 	register struct proc *p;
317 {
318 	register unsigned int newcpu = p->p_estcpu;
319 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
320 
321 	if (p->p_slptime > 5 * loadfac)
322 		p->p_estcpu = 0;
323 	else {
324 		p->p_slptime--;	/* the first time was done in schedcpu */
325 		while (newcpu && --p->p_slptime)
326 			newcpu = (int) decay_cpu(loadfac, newcpu);
327 		p->p_estcpu = min(newcpu, UCHAR_MAX);
328 	}
329 	resetpriority(p);
330 }
331 
332 /*
333  * We're only looking at 7 bits of the address; everything is
334  * aligned to 4, lots of things are aligned to greater powers
335  * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
336  */
337 #define TABLESIZE	128
338 static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
339 #define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
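
/*
 * For example, a hypothetical wait channel at address 0xc0123456 hashes
 * to slot ((0xc0123456 >> 8) & 127) == 0x34; channels whose addresses
 * differ only in the low 8 bits share a slot and are told apart by the
 * p_wchan comparison in wakeup() and wakeup_one().
 */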
340 
341 /*
342  * During autoconfiguration or after a panic, a sleep will simply
343  * lower the priority briefly to allow interrupts, then return.
344  * The priority to be used (safepri) is machine-dependent, thus this
345  * value is initialized and maintained in the machine-dependent layers.
346  * This priority will typically be 0, or the lowest priority
347  * that is safe for use on the interrupt stack; it can be made
348  * higher to block network software interrupts after panics.
349  */
350 int safepri;
351 
352 void
353 sleepinit(void)
354 {
355 	int i;
356 
357 	sched_quantum = hz/10;
358 	hogticks = 2 * sched_quantum;
359 	for (i = 0; i < TABLESIZE; i++)
360 		TAILQ_INIT(&slpque[i]);
361 }
362 
363 /*
364  * General sleep call.  Suspends the current process until a wakeup is
365  * performed on the specified identifier.  The process will then be made
366  * runnable with the specified priority.  Sleeps at most timo/hz seconds
367  * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
368  * before and after sleeping, else signals are not checked.  Returns 0 if
369  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
370  * signal needs to be delivered, ERESTART is returned if the current system
371  * call should be restarted if possible, and EINTR is returned if the system
372  * call should be interrupted by the signal.
373  */
374 int
375 tsleep(ident, priority, wmesg, timo)
376 	void *ident;
377 	int priority, timo;
378 	const char *wmesg;
379 {
380 	struct proc *p = curproc;
381 	int s, sig, catch = priority & PCATCH;
382 	struct callout_handle thandle;
383 
384 #ifdef KTRACE
385 	if (KTRPOINT(p, KTR_CSW))
386 		ktrcsw(p->p_tracep, 1, 0);
387 #endif
388 	s = splhigh();
389 	if (cold || panicstr) {
390 		/*
391 		 * After a panic, or during autoconfiguration,
392 		 * just give interrupts a chance, then just return;
393 		 * don't run any other procs or panic below,
394 		 * in case this is the idle process and already asleep.
395 		 */
396 		splx(safepri);
397 		splx(s);
398 		return (0);
399 	}
400 	KASSERT(p != NULL, ("tsleep1"));
401 	KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep"));
402 	/*
403 	 * Process may be sitting on a slpque if asleep() was called, remove
404 	 * it before re-adding.
405 	 */
406 	if (p->p_wchan != NULL)
407 		unsleep(p);
408 
409 	p->p_wchan = ident;
410 	p->p_wmesg = wmesg;
411 	p->p_slptime = 0;
412 	p->p_priority = priority & PRIMASK;
413 	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
414 	if (timo)
415 		thandle = timeout(endtsleep, (void *)p, timo);
416 	/*
417 	 * We put ourselves on the sleep queue and start our timeout
418 	 * before calling CURSIG, as we could stop there, and a wakeup
419 	 * or a SIGCONT (or both) could occur while we were stopped.
420 	 * A SIGCONT would cause us to be marked as SSLEEP
421 	 * without resuming us, thus we must be ready for sleep
422 	 * when CURSIG is called.  If the wakeup happens while we're
423 	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
424 	 */
425 	if (catch) {
426 		p->p_flag |= P_SINTR;
427 		if ((sig = CURSIG(p))) {
428 			if (p->p_wchan)
429 				unsleep(p);
430 			p->p_stat = SRUN;
431 			goto resume;
432 		}
433 		if (p->p_wchan == 0) {
434 			catch = 0;
435 			goto resume;
436 		}
437 	} else
438 		sig = 0;
439 	p->p_stat = SSLEEP;
440 	p->p_stats->p_ru.ru_nvcsw++;
441 	mi_switch();
442 resume:
443 	curpriority = p->p_usrpri;
444 	splx(s);
445 	p->p_flag &= ~P_SINTR;
446 	if (p->p_flag & P_TIMEOUT) {
447 		p->p_flag &= ~P_TIMEOUT;
448 		if (sig == 0) {
449 #ifdef KTRACE
450 			if (KTRPOINT(p, KTR_CSW))
451 				ktrcsw(p->p_tracep, 0, 0);
452 #endif
453 			return (EWOULDBLOCK);
454 		}
455 	} else if (timo)
456 		untimeout(endtsleep, (void *)p, thandle);
457 	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
458 #ifdef KTRACE
459 		if (KTRPOINT(p, KTR_CSW))
460 			ktrcsw(p->p_tracep, 0, 0);
461 #endif
462 		if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
463 			return (EINTR);
464 		return (ERESTART);
465 	}
466 #ifdef KTRACE
467 	if (KTRPOINT(p, KTR_CSW))
468 		ktrcsw(p->p_tracep, 0, 0);
469 #endif
470 	return (0);
471 }
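
/*
 * Typical usage (hypothetical driver code, shown only for illustration):
 *
 *	error = tsleep(&sc->sc_flags, PZERO | PCATCH, "xxwait", hz);
 *
 * where EWOULDBLOCK means the one-second timeout expired and EINTR or
 * ERESTART means a signal arrived (possible because PCATCH was given);
 * the waker calls wakeup(&sc->sc_flags) on the same address.
 */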
472 
473 /*
474  * asleep() - async sleep call.  Place process on wait queue and return
475  * immediately without blocking.  The process stays runnable until await()
476  * is called.  If ident is NULL, remove process from wait queue if it is still
477  * on one.
478  *
479  * Only the most recent sleep condition is effective when making successive
480  * calls to asleep() or when calling tsleep().
481  *
482  * The timeout, if any, is not initiated until await() is called.  The sleep
483  * priority, signal, and timeout are specified in the asleep() call but may be
484  * overridden in the await() call.
485  *
486  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
487  */
488 
489 int
490 asleep(void *ident, int priority, const char *wmesg, int timo)
491 {
492 	struct proc *p = curproc;
493 	int s;
494 
495 	/*
496 	 * splhigh() while manipulating sleep structures and slpque.
497 	 *
498 	 * Remove preexisting wait condition (if any) and place process
499 	 * on appropriate slpque, but do not put process to sleep.
500 	 */
501 
502 	s = splhigh();
503 
504 	if (p->p_wchan != NULL)
505 		unsleep(p);
506 
507 	if (ident) {
508 		p->p_wchan = ident;
509 		p->p_wmesg = wmesg;
510 		p->p_slptime = 0;
511 		p->p_asleep.as_priority = priority;
512 		p->p_asleep.as_timo = timo;
513 		TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
514 	}
515 
516 	splx(s);
517 
518 	return(0);
519 }
520 
521 /*
522  * await() - wait for async condition to occur.   The process blocks until
523  * wakeup() is called on the most recent asleep() address.  If wakeup is called
524  * prior to await(), await() winds up being a NOP.
525  *
526  * If await() is called more than once (without an intervening asleep() call),
527  * await() is still effectively a NOP but it calls mi_switch() to give other
528  * processes some cpu before returning.  The process is left runnable.
529  *
530  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
531  */
532 
533 int
534 await(int priority, int timo)
535 {
536 	struct proc *p = curproc;
537 	int s;
538 
539 	s = splhigh();
540 
541 	if (p->p_wchan != NULL) {
542 		struct callout_handle thandle;
543 		int sig;
544 		int catch;
545 
546 		/*
547 		 * The call to await() can override defaults specified in
548 		 * the original asleep().
549 		 */
550 		if (priority < 0)
551 			priority = p->p_asleep.as_priority;
552 		if (timo < 0)
553 			timo = p->p_asleep.as_timo;
554 
555 		/*
556 		 * Install timeout
557 		 */
558 
559 		if (timo)
560 			thandle = timeout(endtsleep, (void *)p, timo);
561 
562 		sig = 0;
563 		catch = priority & PCATCH;
564 
565 		if (catch) {
566 			p->p_flag |= P_SINTR;
567 			if ((sig = CURSIG(p))) {
568 				if (p->p_wchan)
569 					unsleep(p);
570 				p->p_stat = SRUN;
571 				goto resume;
572 			}
573 			if (p->p_wchan == NULL) {
574 				catch = 0;
575 				goto resume;
576 			}
577 		}
578 		p->p_stat = SSLEEP;
579 		p->p_stats->p_ru.ru_nvcsw++;
580 		mi_switch();
581 resume:
582 		curpriority = p->p_usrpri;
583 
584 		splx(s);
585 		p->p_flag &= ~P_SINTR;
586 		if (p->p_flag & P_TIMEOUT) {
587 			p->p_flag &= ~P_TIMEOUT;
588 			if (sig == 0) {
589 #ifdef KTRACE
590 				if (KTRPOINT(p, KTR_CSW))
591 					ktrcsw(p->p_tracep, 0, 0);
592 #endif
593 				return (EWOULDBLOCK);
594 			}
595 		} else if (timo)
596 			untimeout(endtsleep, (void *)p, thandle);
597 		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
598 #ifdef KTRACE
599 			if (KTRPOINT(p, KTR_CSW))
600 				ktrcsw(p->p_tracep, 0, 0);
601 #endif
602 			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
603 				return (EINTR);
604 			return (ERESTART);
605 		}
606 #ifdef KTRACE
607 		if (KTRPOINT(p, KTR_CSW))
608 			ktrcsw(p->p_tracep, 0, 0);
609 #endif
610 	} else {
611 		/*
612 		 * If as_priority is 0, await() has been called without an
613 		 * intervening asleep().  We are still effectively a NOP,
614 		 * but we call mi_switch() for safety.
615 		 */
616 
617 		if (p->p_asleep.as_priority == 0) {
618 			p->p_stats->p_ru.ru_nvcsw++;
619 			mi_switch();
620 		}
621 		splx(s);
622 	}
623 
624 	/*
625 	 * clear p_asleep.as_priority as an indication that await() has been
626 	 * called.  If await() is called again without an intervening asleep(),
627 	 * await() is still effectively a NOP but the above mi_switch() code
628 	 * is triggered as a safety.
629 	 */
630 	p->p_asleep.as_priority = 0;
631 
632 	return (0);
633 }
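
/*
 * Illustrative use of the pair (hypothetical code): register interest
 * first, do the work that will eventually trigger the event, then block,
 * so that a wakeup() arriving in between is not lost:
 *
 *	asleep(&sc->sc_done, PZERO, "xxdone", hz);
 *	start_hardware_io(sc);			(hypothetical helper)
 *	error = await(-1, -1);
 *
 * Passing -1 for both arguments makes await() reuse the priority and
 * timeout recorded by asleep(), as handled at the top of await() above.
 */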
634 
635 /*
636  * Implement timeout for tsleep or asleep()/await()
637  *
638  * If process hasn't been awakened (wchan non-zero),
639  * set timeout flag and undo the sleep.  If proc
640  * is stopped, just unsleep so it will remain stopped.
641  */
642 static void
643 endtsleep(arg)
644 	void *arg;
645 {
646 	register struct proc *p;
647 	int s;
648 
649 	p = (struct proc *)arg;
650 	s = splhigh();
651 	if (p->p_wchan) {
652 		if (p->p_stat == SSLEEP)
653 			setrunnable(p);
654 		else
655 			unsleep(p);
656 		p->p_flag |= P_TIMEOUT;
657 	}
658 	splx(s);
659 }
660 
661 /*
662  * Remove a process from its wait queue
663  */
664 void
665 unsleep(p)
666 	register struct proc *p;
667 {
668 	int s;
669 
670 	s = splhigh();
671 	if (p->p_wchan) {
672 		TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq);
673 		p->p_wchan = 0;
674 	}
675 	splx(s);
676 }
677 
678 /*
679  * Make all processes sleeping on the specified identifier runnable.
680  */
681 void
682 wakeup(ident)
683 	register void *ident;
684 {
685 	register struct slpquehead *qp;
686 	register struct proc *p;
687 	int s;
688 
689 	s = splhigh();
690 	qp = &slpque[LOOKUP(ident)];
691 restart:
692 	for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) {
693 		if (p->p_wchan == ident) {
694 			TAILQ_REMOVE(qp, p, p_procq);
695 			p->p_wchan = 0;
696 			if (p->p_stat == SSLEEP) {
697 				/* OPTIMIZED EXPANSION OF setrunnable(p); */
698 				if (p->p_slptime > 1)
699 					updatepri(p);
700 				p->p_slptime = 0;
701 				p->p_stat = SRUN;
702 				if (p->p_flag & P_INMEM) {
703 					setrunqueue(p);
704 					maybe_resched(p);
705 				} else {
706 					p->p_flag |= P_SWAPINREQ;
707 					wakeup((caddr_t)&proc0);
708 				}
709 				/* END INLINE EXPANSION */
710 				goto restart;
711 			}
712 		}
713 	}
714 	splx(s);
715 }
716 
717 /*
718  * Make a process sleeping on the specified identifier runnable.
719  * May wake more than one process if a target process is currently
720  * swapped out.
721  */
722 void
723 wakeup_one(ident)
724 	register void *ident;
725 {
726 	register struct slpquehead *qp;
727 	register struct proc *p;
728 	int s;
729 
730 	s = splhigh();
731 	qp = &slpque[LOOKUP(ident)];
732 
733 	for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) {
734 		if (p->p_wchan == ident) {
735 			TAILQ_REMOVE(qp, p, p_procq);
736 			p->p_wchan = 0;
737 			if (p->p_stat == SSLEEP) {
738 				/* OPTIMIZED EXPANSION OF setrunnable(p); */
739 				if (p->p_slptime > 1)
740 					updatepri(p);
741 				p->p_slptime = 0;
742 				p->p_stat = SRUN;
743 				if (p->p_flag & P_INMEM) {
744 					setrunqueue(p);
745 					maybe_resched(p);
746 					break;
747 				} else {
748 					p->p_flag |= P_SWAPINREQ;
749 					wakeup((caddr_t)&proc0);
750 				}
751 				/* END INLINE EXPANSION */
752 			}
753 		}
754 	}
755 	splx(s);
756 }
757 
758 /*
759  * The machine independent parts of mi_switch().
760  * Must be called at splstatclock() or higher.
761  */
762 void
763 mi_switch()
764 {
765 	struct timeval new_switchtime;
766 	register struct proc *p = curproc;	/* XXX */
767 	register struct rlimit *rlim;
768 	int x;
769 
770 	/*
771 	 * XXX this spl is almost unnecessary.  It is partly to allow for
772 	 * sloppy callers that don't do it (issignal() via CURSIG() is the
773 	 * main offender).  It is partly to work around a bug in the i386
774 	 * cpu_switch() (the ipl is not preserved).  We ran for years
775  * without it.  I think there was only an interrupt latency problem.
776 	 * The main caller, tsleep(), does an splx() a couple of instructions
777 	 * after calling here.  The buggy caller, issignal(), usually calls
778 	 * here at spl0() and sometimes returns at splhigh().  The process
779 	 * then runs for a little too long at splhigh().  The ipl gets fixed
780 	 * when the process returns to user mode (or earlier).
781 	 *
782 	 * It would probably be better to always call here at spl0(). Callers
783 	 * are prepared to give up control to another process, so they must
784 	 * be prepared to be interrupted.  The clock stuff here may not
785 	 * actually need splstatclock().
786 	 */
787 	x = splstatclock();
788 
789 #ifdef SIMPLELOCK_DEBUG
790 	if (p->p_simple_locks)
791 		printf("sleep: holding simple lock\n");
792 #endif
793 	/*
794 	 * Compute the amount of time during which the current
795 	 * process was running, and add that to its total so far.
796 	 */
797 	microuptime(&new_switchtime);
798 	p->p_runtime += (new_switchtime.tv_usec - switchtime.tv_usec) +
799 	    (new_switchtime.tv_sec - switchtime.tv_sec) * (int64_t)1000000;
800 
801 	/*
802 	 * Check if the process exceeds its cpu resource allocation.
803 	 * If over max, kill it.
804 	 */
805 	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
806 	    p->p_runtime > p->p_limit->p_cpulimit) {
807 		rlim = &p->p_rlimit[RLIMIT_CPU];
808 		if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
809 			killproc(p, "exceeded maximum CPU limit");
810 		} else {
811 			psignal(p, SIGXCPU);
812 			if (rlim->rlim_cur < rlim->rlim_max) {
813 				/* XXX: we should make a private copy */
814 				rlim->rlim_cur += 5;
815 			}
816 		}
817 	}
818 
819 	/*
820 	 * Pick a new current process and record its start time.
821 	 */
822 	cnt.v_swtch++;
823 	switchtime = new_switchtime;
824 	cpu_switch(p);
825 	if (switchtime.tv_sec == 0)
826 		microuptime(&switchtime);
827 	switchticks = ticks;
828 
829 	splx(x);
830 }
831 
832 /*
833  * Change process state to be runnable,
834  * placing it on the run queue if it is in memory,
835  * and awakening the swapper if it isn't in memory.
836  */
837 void
838 setrunnable(p)
839 	register struct proc *p;
840 {
841 	register int s;
842 
843 	s = splhigh();
844 	switch (p->p_stat) {
845 	case 0:
846 	case SRUN:
847 	case SZOMB:
848 	default:
849 		panic("setrunnable");
850 	case SSTOP:
851 	case SSLEEP:
852 		unsleep(p);		/* e.g. when sending signals */
853 		break;
854 
855 	case SIDL:
856 		break;
857 	}
858 	p->p_stat = SRUN;
859 	if (p->p_flag & P_INMEM)
860 		setrunqueue(p);
861 	splx(s);
862 	if (p->p_slptime > 1)
863 		updatepri(p);
864 	p->p_slptime = 0;
865 	if ((p->p_flag & P_INMEM) == 0) {
866 		p->p_flag |= P_SWAPINREQ;
867 		wakeup((caddr_t)&proc0);
868 	}
869 	else
870 		maybe_resched(p);
871 }
872 
873 /*
874  * Compute the priority of a process when running in user mode.
875  * Arrange to reschedule if the resulting priority is better
876  * than that of the current process.
877  */
878 void
879 resetpriority(p)
880 	register struct proc *p;
881 {
882 	register unsigned int newpriority;
883 
884 	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
885 		newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice;
886 		newpriority = min(newpriority, MAXPRI);
887 		p->p_usrpri = newpriority;
888 	}
889 	maybe_resched(p);
890 }
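
/*
 * Worked example (assuming the traditional PUSER value of 50): a normal
 * process with p_estcpu == 60 and p_nice == 0 gets p_usrpri = 50 + 60/4
 * + 2*0 = 65, and a nice value of 20 would add another 40, still below
 * the MAXPRI clamp.
 */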
891 
892 /* ARGSUSED */
893 static void
894 sched_setup(dummy)
895 	void *dummy;
896 {
897 	/* Kick off timeout driven events by calling first time. */
898 	roundrobin(NULL);
899 	schedcpu(NULL);
900 }
901 
902