xref: /freebsd/sys/kern/kern_synch.c (revision 77a0943ded95b9e6438f7db70c4a28e4d93946d4)
1 /*-
2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/proc.h>
47 #include <sys/ipl.h>
48 #include <sys/kernel.h>
49 #include <sys/ktr.h>
50 #include <sys/mutex.h>
51 #include <sys/signalvar.h>
52 #include <sys/resourcevar.h>
53 #include <sys/vmmeter.h>
54 #include <sys/sysctl.h>
55 #include <vm/vm.h>
56 #include <vm/vm_extern.h>
57 #ifdef KTRACE
58 #include <sys/uio.h>
59 #include <sys/ktrace.h>
60 #endif
61 
62 #include <machine/cpu.h>
63 #include <machine/smp.h>
64 
65 static void sched_setup __P((void *dummy));
66 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
67 
68 u_char	curpriority;
69 int	hogticks;
70 int	lbolt;
71 int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */
72 
73 static int	curpriority_cmp __P((struct proc *p));
74 static void	endtsleep __P((void *));
75 static void	maybe_resched __P((struct proc *chk));
76 static void	roundrobin __P((void *arg));
77 static void	schedcpu __P((void *arg));
78 static void	updatepri __P((struct proc *p));
79 
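/*
 * kern.quantum handler (descriptive note): the scheduling quantum is exported
 * to userland in microseconds (sched_quantum ticks * `tick' microseconds per
 * tick) and converted back to ticks on write.  Writes smaller than one tick
 * are rejected with EINVAL, and hogticks is kept at twice the quantum.
 */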
80 static int
81 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
82 {
83 	int error, new_val;
84 
85 	new_val = sched_quantum * tick;
86 	error = sysctl_handle_int(oidp, &new_val, 0, req);
87 	if (error != 0 || req->newptr == NULL)
88 		return (error);
89 	if (new_val < tick)
90 		return (EINVAL);
91 	sched_quantum = new_val / tick;
92 	hogticks = 2 * sched_quantum;
93 	return (0);
94 }
95 
96 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
97 	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
98 
99 /*-
100  * Compare priorities.  Return:
101  *     <0: priority of p < current priority
102  *      0: priority of p == current priority
103  *     >0: priority of p > current priority
104  * The priorities are the normal priorities or the normal realtime priorities
105  * if p is on the same scheduler as curproc.  Otherwise the process on the
106  * more realtimeish scheduler has the lower priority.  As usual, a higher
107  * numeric priority really means a lower scheduling priority.
108  */
109 static int
110 curpriority_cmp(p)
111 	struct proc *p;
112 {
113 	int c_class, p_class;
114 
115 	c_class = RTP_PRIO_BASE(curproc->p_rtprio.type);
116 	p_class = RTP_PRIO_BASE(p->p_rtprio.type);
117 	if (p_class != c_class)
118 		return (p_class - c_class);
119 	if (p_class == RTP_PRIO_NORMAL)
120 		return (((int)p->p_priority - (int)curpriority) / PPQ);
121 	return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio);
122 }
123 
124 /*
125  * Arrange to reschedule if necessary, taking the priorities and
126  * schedulers into account.
127  */
128 static void
129 maybe_resched(chk)
130 	struct proc *chk;
131 {
132 	struct proc *p = curproc; /* XXX */
133 
134 	/*
135 	 * XXX idle scheduler still broken because a process stays on the idle
136 	 * scheduler during waits (such as when getting FS locks).  If a
137 	 * standard process becomes runaway cpu-bound, the system can lock up
138 	 * due to idle-scheduler processes in wakeup never getting any cpu.
139 	 */
140 	if (p == idleproc) {
141 #if 0
142 		need_resched();
143 #endif
144 	} else if (chk == p) {
145 		/* We may need to yield if our priority has been raised. */
146 		if (curpriority_cmp(chk) > 0)
147 			need_resched();
148 	} else if (curpriority_cmp(chk) < 0)
149 		need_resched();
150 }
151 
152 int
153 roundrobin_interval(void)
154 {
155 	return (sched_quantum);
156 }
157 
158 /*
159  * Force a switch among equal-priority processes every 100ms.
160  */
161 /* ARGSUSED */
162 static void
163 roundrobin(arg)
164 	void *arg;
165 {
166 #ifndef SMP
167  	struct proc *p = curproc; /* XXX */
168 #endif
169 
170 #ifdef SMP
171 	need_resched();
172 	forward_roundrobin();
173 #else
174  	if (p == idleproc || RTP_PRIO_NEED_RR(p->p_rtprio.type))
175  		need_resched();
176 #endif
177 
178  	timeout(roundrobin, NULL, sched_quantum);
179 }
180 
181 /*
182  * Constants for digital decay and forget:
183  *	90% of (p_estcpu) usage in 5 * loadav time
184  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
185  *          Note that, as ps(1) mentions, this can let percentages
186  *          total over 100% (I've seen 137.9% for 3 processes).
187  *
188  * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
189  *
190  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
191  * That is, the system wants to compute a value of decay such
192  * that the following for loop:
193  * 	for (i = 0; i < (5 * loadavg); i++)
194  * 		p_estcpu *= decay;
195  * will compute
196  * 	p_estcpu *= 0.1;
197  * for all values of loadavg:
198  *
199  * Mathematically this loop can be expressed by saying:
200  * 	decay ** (5 * loadavg) ~= .1
201  *
202  * The system computes decay as:
203  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
204  *
205  * We wish to prove that the system's computation of decay
206  * will always fulfill the equation:
207  * 	decay ** (5 * loadavg) ~= .1
208  *
209  * If we compute b as:
210  * 	b = 2 * loadavg
211  * then
212  * 	decay = b / (b + 1)
213  *
214  * We now need to prove two things:
215  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
216  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
217  *
218  * Facts:
219  *         For x close to zero, exp(x) =~ 1 + x, since
220  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
221  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
222  *         For x close to zero, ln(1+x) =~ x, since
223  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
224  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
225  *         ln(.1) =~ -2.30
226  *
227  * Proof of (1):
228  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
229  *	solving for factor,
230  *      ln(factor) =~ (-2.30/5*loadav), or
231  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
232  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
233  *
234  * Proof of (2):
235  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
236  *	solving for power,
237  *      power*ln(b/(b+1)) =~ -2.30, or
238  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
239  *
240  * Actual power values for the implemented algorithm are as follows:
241  *      loadav: 1       2       3       4
242  *      power:  5.68    10.32   14.94   19.55
243  */
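/*
 * Worked example of the above: with loadav == 1, b = 2 and decay = 2/3.
 * Then (2/3)^5.68 =~ 0.1 (matching the power of 5.68 in the table above),
 * and after the nominal 5 iterations (2/3)^5 =~ 0.13, i.e. roughly 90% of
 * p_estcpu has been forgotten in about 5 seconds, as intended.
 */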
244 
245 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
246 #define	loadfactor(loadav)	(2 * (loadav))
247 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
248 
249 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
250 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
251 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
252 
253 /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
254 static int	fscale __unused = FSCALE;
255 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
256 
257 /*
258  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
259  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
260  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
261  *
262  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
263  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
264  *
265  * If you don't want to bother with the faster/more-accurate formula, you
266  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
267  * (more general) method of calculating the percentage of CPU used by a process.
268  */
269 #define	CCPU_SHIFT	11
270 
271 /*
272  * Recompute process priorities, every hz ticks.
273  */
274 /* ARGSUSED */
275 static void
276 schedcpu(arg)
277 	void *arg;
278 {
279 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
280 	register struct proc *p;
281 	register int realstathz, s;
282 
283 	realstathz = stathz ? stathz : hz;
284 	lockmgr(&allproc_lock, LK_SHARED, NULL, CURPROC);
285 	LIST_FOREACH(p, &allproc, p_list) {
286 		/*
287 		 * Increment time in/out of memory and sleep time
288 		 * (if sleeping).  We ignore overflow; with 16-bit int's
289 		 * (remember them?) overflow takes 45 days.
290 		if (p->p_stat == SWAIT)
291 			continue;
292 		 */
293 		mtx_enter(&sched_lock, MTX_SPIN);
294 		p->p_swtime++;
295 		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
296 			p->p_slptime++;
297 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
298 		/*
299 		 * If the process has slept the entire second,
300 		 * stop recalculating its priority until it wakes up.
301 		 */
302 		if (p->p_slptime > 1) {
303 			mtx_exit(&sched_lock, MTX_SPIN);
304 			continue;
305 		}
306 
307 		/*
308 		 * prevent state changes and protect run queue
309 		 */
310 		s = splhigh();
311 
312 		/*
313 		 * p_pctcpu is only for ps.
314 		 */
315 #if	(FSHIFT >= CCPU_SHIFT)
316 		p->p_pctcpu += (realstathz == 100)?
317 			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
318                 	100 * (((fixpt_t) p->p_cpticks)
319 				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
320 #else
321 		p->p_pctcpu += ((FSCALE - ccpu) *
322 			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
323 #endif
324 		p->p_cpticks = 0;
325 		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
326 		resetpriority(p);
327 		if (p->p_priority >= PUSER) {
328 			if ((p != curproc) &&
329 #ifdef SMP
330 			    p->p_oncpu == 0xff && 	/* idle */
331 #endif
332 			    p->p_stat == SRUN &&
333 			    (p->p_flag & P_INMEM) &&
334 			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
335 				remrunqueue(p);
336 				p->p_priority = p->p_usrpri;
337 				setrunqueue(p);
338 			} else
339 				p->p_priority = p->p_usrpri;
340 		}
341 		mtx_exit(&sched_lock, MTX_SPIN);
342 		splx(s);
343 	}
344 	lockmgr(&allproc_lock, LK_RELEASE, NULL, CURPROC);
345 	vmmeter();
346 	wakeup((caddr_t)&lbolt);
347 	timeout(schedcpu, (void *)0, hz);
348 }
349 
350 /*
351  * Recalculate the priority of a process after it has slept for a while.
352  * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
353  * least six times the loadfactor will decay p_estcpu to zero.
354  */
355 static void
356 updatepri(p)
357 	register struct proc *p;
358 {
359 	register unsigned int newcpu = p->p_estcpu;
360 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
361 
362 	if (p->p_slptime > 5 * loadfac)
363 		p->p_estcpu = 0;
364 	else {
365 		p->p_slptime--;	/* the first time was done in schedcpu */
366 		while (newcpu && --p->p_slptime)
367 			newcpu = decay_cpu(loadfac, newcpu);
368 		p->p_estcpu = newcpu;
369 	}
370 	resetpriority(p);
371 }
372 
373 /*
374  * We're only looking at 7 bits of the address; everything is
375  * aligned to 4, lots of things are aligned to greater powers
376  * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
377  */
378 #define TABLESIZE	128
379 static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
380 #define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
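/*
 * For example, two wait channels whose addresses differ by 256 bytes hash to
 * adjacent slpque buckets, while channels 256 * TABLESIZE bytes apart collide
 * and share a bucket; wakeup() still matches on p_wchan, so a collision only
 * costs a longer list walk.
 */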
381 
382 void
383 sleepinit(void)
384 {
385 	int i;
386 
387 	sched_quantum = hz/10;
388 	hogticks = 2 * sched_quantum;
389 	for (i = 0; i < TABLESIZE; i++)
390 		TAILQ_INIT(&slpque[i]);
391 }
392 
393 /*
394  * General sleep call.  Suspends the current process until a wakeup is
395  * performed on the specified identifier.  The process will then be made
396  * runnable with the specified priority.  Sleeps at most timo/hz seconds
397  * (0 means no timeout).  If pri includes the PCATCH flag, signals are
398  * checked before and after sleeping; otherwise signals are not checked.
399  * Returns 0 if awakened and EWOULDBLOCK if the timeout expires.  If PCATCH
400  * is set and a signal needs to be delivered, ERESTART is returned if the
401  * current system call should be restarted if possible, and EINTR is
402  * returned if the system call should be interrupted by the signal.
403  *
404  * The mutex argument is exited before the caller is suspended, and
405  * entered before msleep returns.  If priority includes the PDROP
406  * flag the mutex is not entered before returning.
407  */
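/*
 * A hypothetical caller sketch (not taken from this file), assuming a
 * condition protected by mutex `m':
 *
 *	mtx_enter(&m, MTX_DEF);
 *	while (!condition)
 *		(void) msleep(&condition, &m, PZERO, "cond", 0);
 *	mtx_exit(&m, MTX_DEF);
 *
 * The mutex is released while the process sleeps and reacquired before
 * msleep() returns (unless PDROP is passed), so the condition can safely
 * be rechecked after each wakeup.
 */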
408 int
409 msleep(ident, mtx, priority, wmesg, timo)
410 	void *ident;
411 	struct mtx *mtx;
412 	int priority, timo;
413 	const char *wmesg;
414 {
415 	struct proc *p = curproc;
416 	int s, sig, catch = priority & PCATCH;
417 	struct callout_handle thandle;
418 	int rval = 0;
419 	WITNESS_SAVE_DECL(mtx);
420 
421 #ifdef KTRACE
422 	if (p && KTRPOINT(p, KTR_CSW))
423 		ktrcsw(p->p_tracep, 1, 0);
424 #endif
425 	WITNESS_SLEEP(0, mtx);
426 	mtx_enter(&sched_lock, MTX_SPIN);
427 	DROP_GIANT_NOSWITCH();
428 
429 	if (mtx != NULL) {
430 		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
431 		WITNESS_SAVE(mtx, mtx);
432 		mtx_exit(mtx, MTX_DEF | MTX_NOSWITCH);
433 		if (priority & PDROP)
434 			mtx = NULL;
435 	}
436 
437 	s = splhigh();
438 	if (cold || panicstr) {
439 		/*
440 		 * After a panic, or during autoconfiguration,
441 		 * just give interrupts a chance, then just return;
442 		 * don't run any other procs or panic below,
443 		 * in case this is the idle process and already asleep.
444 		 */
445 		mtx_exit(&sched_lock, MTX_SPIN);
446 		splx(s);
447 		return (0);
448 	}
449 
450 	KASSERT(p != NULL, ("msleep1"));
451 	KASSERT(ident != NULL && p->p_stat == SRUN, ("msleep"));
452 	/*
453 	 * Process may be sitting on a slpque if asleep() was called, remove
454 	 * it before re-adding.
455 	 */
456 	if (p->p_wchan != NULL)
457 		unsleep(p);
458 
459 	p->p_wchan = ident;
460 	p->p_wmesg = wmesg;
461 	p->p_slptime = 0;
462 	p->p_priority = priority & PRIMASK;
463 	p->p_nativepri = p->p_priority;
464 	CTR4(KTR_PROC, "msleep: proc %p (pid %d, %s), schedlock %p",
465 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
466 	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
467 	if (timo)
468 		thandle = timeout(endtsleep, (void *)p, timo);
469 	/*
470 	 * We put ourselves on the sleep queue and start our timeout
471 	 * before calling CURSIG, as we could stop there, and a wakeup
472 	 * or a SIGCONT (or both) could occur while we were stopped.
473 	 * A SIGCONT would cause us to be marked as SSLEEP
474 	 * without resuming us, thus we must be ready for sleep
475 	 * when CURSIG is called.  If the wakeup happens while we're
476 	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
477 	 */
478 	if (catch) {
479 		CTR4(KTR_PROC,
480 		        "msleep caught: proc %p (pid %d, %s), schedlock %p",
481 			p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
482 		p->p_flag |= P_SINTR;
483 		mtx_exit(&sched_lock, MTX_SPIN);
484 		if ((sig = CURSIG(p))) {
485 			mtx_enter(&sched_lock, MTX_SPIN);
486 			if (p->p_wchan)
487 				unsleep(p);
488 			p->p_stat = SRUN;
489 			goto resume;
490 		}
491 		mtx_enter(&sched_lock, MTX_SPIN);
492 		if (p->p_wchan == 0) {
493 			catch = 0;
494 			goto resume;
495 		}
496 	} else
497 		sig = 0;
498 	p->p_stat = SSLEEP;
499 	p->p_stats->p_ru.ru_nvcsw++;
500 	mi_switch();
501 	CTR4(KTR_PROC,
502 	        "msleep resume: proc %p (pid %d, %s), schedlock %p",
503 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
504 resume:
505 	curpriority = p->p_usrpri;
506 	splx(s);
507 	p->p_flag &= ~P_SINTR;
508 	if (p->p_flag & P_TIMEOUT) {
509 		p->p_flag &= ~P_TIMEOUT;
510 		if (sig == 0) {
511 #ifdef KTRACE
512 			if (KTRPOINT(p, KTR_CSW))
513 				ktrcsw(p->p_tracep, 0, 0);
514 #endif
515 			rval = EWOULDBLOCK;
516 			mtx_exit(&sched_lock, MTX_SPIN);
517 			goto out;
518 		}
519 	} else if (timo)
520 		untimeout(endtsleep, (void *)p, thandle);
521 	mtx_exit(&sched_lock, MTX_SPIN);
522 
523 	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
524 #ifdef KTRACE
525 		if (KTRPOINT(p, KTR_CSW))
526 			ktrcsw(p->p_tracep, 0, 0);
527 #endif
528 		if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
529 			rval = EINTR;
530 		else
531 			rval = ERESTART;
532 		goto out;
533 	}
534 out:
535 #ifdef KTRACE
536 	if (KTRPOINT(p, KTR_CSW))
537 		ktrcsw(p->p_tracep, 0, 0);
538 #endif
539 	PICKUP_GIANT();
540 	if (mtx != NULL) {
541 		mtx_enter(mtx, MTX_DEF);
542 		WITNESS_RESTORE(mtx, mtx);
543 	}
544 	return (rval);
545 }
546 
547 /*
548  * asleep() - async sleep call.  Place process on wait queue and return
549  * immediately without blocking.  The process stays runnable until mawait()
550  * is called.  If ident is NULL, remove process from wait queue if it is still
551  * on one.
552  *
553  * Only the most recent sleep condition is effective when making successive
554  * calls to asleep() or when calling msleep().
555  *
556  * The timeout, if any, is not initiated until mawait() is called.  The sleep
557  * priority, signal, and timeout are specified in the asleep() call but may be
558  * overridden in the mawait() call.
559  *
560  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
561  */
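/*
 * A hypothetical usage sketch (this interface is marked experimental above):
 *
 *	asleep(&resource, PZERO, "res", 0);	register interest, stay runnable
 *	... do other work ...
 *	error = mawait(NULL, -1, -1);		block; -1 picks up asleep() values
 *
 * A negative priority or timeout passed to mawait() falls back to the values
 * recorded by the preceding asleep(), as handled in mawait() below.
 */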
562 
563 int
564 asleep(void *ident, int priority, const char *wmesg, int timo)
565 {
566 	struct proc *p = curproc;
567 	int s;
568 
569 	/*
570 	 * obtain sched_lock while manipulating sleep structures and slpque.
571 	 *
572 	 * Remove preexisting wait condition (if any) and place process
573 	 * on appropriate slpque, but do not put process to sleep.
574 	 */
575 
576 	s = splhigh();
577 	mtx_enter(&sched_lock, MTX_SPIN);
578 
579 	if (p->p_wchan != NULL)
580 		unsleep(p);
581 
582 	if (ident) {
583 		p->p_wchan = ident;
584 		p->p_wmesg = wmesg;
585 		p->p_slptime = 0;
586 		p->p_asleep.as_priority = priority;
587 		p->p_asleep.as_timo = timo;
588 		TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
589 	}
590 
591 	mtx_exit(&sched_lock, MTX_SPIN);
592 	splx(s);
593 
594 	return(0);
595 }
596 
597 /*
598  * mawait() - wait for async condition to occur.   The process blocks until
599  * wakeup() is called on the most recent asleep() address.  If wakeup is called
600  * prior to mawait(), mawait() winds up being a NOP.
601  *
602  * If mawait() is called more than once (without an intervening asleep() call),
603  * mawait() is still effectively a NOP but it calls mi_switch() to give other
604  * processes some cpu before returning.  The process is left runnable.
605  *
606  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
607  */
608 
609 int
610 mawait(struct mtx *mtx, int priority, int timo)
611 {
612 	struct proc *p = curproc;
613 	int rval = 0;
614 	int s;
615 	WITNESS_SAVE_DECL(mtx);
616 
617 	WITNESS_SLEEP(0, mtx);
618 	mtx_enter(&sched_lock, MTX_SPIN);
619 	DROP_GIANT_NOSWITCH();
620 	if (mtx != NULL) {
621 		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
622 		WITNESS_SAVE(mtx, mtx);
623 		mtx_exit(mtx, MTX_DEF | MTX_NOSWITCH);
624 		if (priority & PDROP)
625 			mtx = NULL;
626 	}
627 
628 	s = splhigh();
629 
630 	if (p->p_wchan != NULL) {
631 		struct callout_handle thandle;
632 		int sig;
633 		int catch;
634 
635 		/*
636 		 * The call to mawait() can override defaults specified in
637 		 * the original asleep().
638 		 */
639 		if (priority < 0)
640 			priority = p->p_asleep.as_priority;
641 		if (timo < 0)
642 			timo = p->p_asleep.as_timo;
643 
644 		/*
645 		 * Install timeout
646 		 */
647 
648 		if (timo)
649 			thandle = timeout(endtsleep, (void *)p, timo);
650 
651 		sig = 0;
652 		catch = priority & PCATCH;
653 
654 		if (catch) {
655 			p->p_flag |= P_SINTR;
656 			mtx_exit(&sched_lock, MTX_SPIN);
657 			if ((sig = CURSIG(p))) {
658 				mtx_enter(&sched_lock, MTX_SPIN);
659 				if (p->p_wchan)
660 					unsleep(p);
661 				p->p_stat = SRUN;
662 				goto resume;
663 			}
664 			mtx_enter(&sched_lock, MTX_SPIN);
665 			if (p->p_wchan == NULL) {
666 				catch = 0;
667 				goto resume;
668 			}
669 		}
670 		p->p_stat = SSLEEP;
671 		p->p_stats->p_ru.ru_nvcsw++;
672 		mi_switch();
673 resume:
674 		curpriority = p->p_usrpri;
675 
676 		splx(s);
677 		p->p_flag &= ~P_SINTR;
678 		if (p->p_flag & P_TIMEOUT) {
679 			p->p_flag &= ~P_TIMEOUT;
680 			if (sig == 0) {
681 #ifdef KTRACE
682 				if (KTRPOINT(p, KTR_CSW))
683 					ktrcsw(p->p_tracep, 0, 0);
684 #endif
685 				rval = EWOULDBLOCK;
686 				mtx_exit(&sched_lock, MTX_SPIN);
687 				goto out;
688 			}
689 		} else if (timo)
690 			untimeout(endtsleep, (void *)p, thandle);
691 		mtx_exit(&sched_lock, MTX_SPIN);
692 
693 		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
694 #ifdef KTRACE
695 			if (KTRPOINT(p, KTR_CSW))
696 				ktrcsw(p->p_tracep, 0, 0);
697 #endif
698 			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
699 				rval = EINTR;
700 			else
701 				rval = ERESTART;
702 			goto out;
703 		}
704 #ifdef KTRACE
705 		if (KTRPOINT(p, KTR_CSW))
706 			ktrcsw(p->p_tracep, 0, 0);
707 #endif
708 	} else {
709 		/*
710 		 * If as_priority is 0, mawait() has been called without an
711 		 * intervening asleep().  We are still effectively a NOP,
712 		 * but we call mi_switch() for safety.
713 		 */
714 
715 		if (p->p_asleep.as_priority == 0) {
716 			p->p_stats->p_ru.ru_nvcsw++;
717 			mi_switch();
718 		}
719 		mtx_exit(&sched_lock, MTX_SPIN);
720 		splx(s);
721 	}
722 
723 	/*
724 	 * clear p_asleep.as_priority as an indication that mawait() has been
725 	 * called.  If mawait() is called again without an intervening asleep(),
726 	 * mawait() is still effectively a NOP but the above mi_switch() code
727 	 * is triggered as a safety.
728 	 */
729 	p->p_asleep.as_priority = 0;
730 
731 out:
732 	PICKUP_GIANT();
733 	if (mtx != NULL) {
734 		mtx_enter(mtx, MTX_DEF);
735 		WITNESS_RESTORE(mtx, mtx);
736 	}
737 	return (rval);
738 }
739 
740 /*
741  * Implement the timeout for msleep() or asleep()/mawait().
742  *
743  * If the process hasn't been awakened (wchan non-zero),
744  * set the timeout flag and undo the sleep.  If the process
745  * is stopped, just unsleep it so it will remain stopped.
746  */
747 static void
748 endtsleep(arg)
749 	void *arg;
750 {
751 	register struct proc *p;
752 	int s;
753 
754 	p = (struct proc *)arg;
755 	CTR4(KTR_PROC,
756 	        "endtsleep: proc %p (pid %d, %s), schedlock %p",
757 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
758 	s = splhigh();
759 	mtx_enter(&sched_lock, MTX_SPIN);
760 	if (p->p_wchan) {
761 		if (p->p_stat == SSLEEP)
762 			setrunnable(p);
763 		else
764 			unsleep(p);
765 		p->p_flag |= P_TIMEOUT;
766 	}
767 	mtx_exit(&sched_lock, MTX_SPIN);
768 	splx(s);
769 }
770 
771 /*
772  * Remove a process from its wait queue
773  */
774 void
775 unsleep(p)
776 	register struct proc *p;
777 {
778 	int s;
779 
780 	s = splhigh();
781 	mtx_enter(&sched_lock, MTX_SPIN);
782 	if (p->p_wchan) {
783 		TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq);
784 		p->p_wchan = 0;
785 	}
786 	mtx_exit(&sched_lock, MTX_SPIN);
787 	splx(s);
788 }
789 
790 /*
791  * Make all processes sleeping on the specified identifier runnable.
792  */
793 void
794 wakeup(ident)
795 	register void *ident;
796 {
797 	register struct slpquehead *qp;
798 	register struct proc *p;
799 	int s;
800 
801 	s = splhigh();
802 	mtx_enter(&sched_lock, MTX_SPIN);
803 	qp = &slpque[LOOKUP(ident)];
804 restart:
805 	TAILQ_FOREACH(p, qp, p_slpq) {
806 		if (p->p_wchan == ident) {
807 			TAILQ_REMOVE(qp, p, p_slpq);
808 			p->p_wchan = 0;
809 			if (p->p_stat == SSLEEP) {
810 				/* OPTIMIZED EXPANSION OF setrunnable(p); */
811 				CTR4(KTR_PROC,
812 				        "wakeup: proc %p (pid %d, %s), schedlock %p",
813 					p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
814 				if (p->p_slptime > 1)
815 					updatepri(p);
816 				p->p_slptime = 0;
817 				p->p_stat = SRUN;
818 				if (p->p_flag & P_INMEM) {
819 					setrunqueue(p);
820 					maybe_resched(p);
821 				} else {
822 					p->p_flag |= P_SWAPINREQ;
823 					wakeup((caddr_t)&proc0);
824 				}
825 				/* END INLINE EXPANSION */
826 				goto restart;
827 			}
828 		}
829 	}
830 	mtx_exit(&sched_lock, MTX_SPIN);
831 	splx(s);
832 }
833 
834 /*
835  * Make a process sleeping on the specified identifier runnable.
836  * May wake more than one process if a target process is currently
837  * swapped out.
838  */
839 void
840 wakeup_one(ident)
841 	register void *ident;
842 {
843 	register struct slpquehead *qp;
844 	register struct proc *p;
845 	int s;
846 
847 	s = splhigh();
848 	mtx_enter(&sched_lock, MTX_SPIN);
849 	qp = &slpque[LOOKUP(ident)];
850 
851 	TAILQ_FOREACH(p, qp, p_slpq) {
852 		if (p->p_wchan == ident) {
853 			TAILQ_REMOVE(qp, p, p_slpq);
854 			p->p_wchan = 0;
855 			if (p->p_stat == SSLEEP) {
856 				/* OPTIMIZED EXPANSION OF setrunnable(p); */
857 				CTR4(KTR_PROC,
858 				        "wakeup1: proc %p (pid %d, %s), schedlock %p",
859 					p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
860 				if (p->p_slptime > 1)
861 					updatepri(p);
862 				p->p_slptime = 0;
863 				p->p_stat = SRUN;
864 				if (p->p_flag & P_INMEM) {
865 					setrunqueue(p);
866 					maybe_resched(p);
867 					break;
868 				} else {
869 					p->p_flag |= P_SWAPINREQ;
870 					wakeup((caddr_t)&proc0);
871 				}
872 				/* END INLINE EXPANSION */
873 			}
874 		}
875 	}
876 	mtx_exit(&sched_lock, MTX_SPIN);
877 	splx(s);
878 }
879 
880 /*
881  * The machine independent parts of mi_switch().
882  * Must be called at splstatclock() or higher.
883  */
884 void
885 mi_switch()
886 {
887 	struct timeval new_switchtime;
888 	register struct proc *p = curproc;	/* XXX */
889 	register struct rlimit *rlim;
890 	int x;
891 
892 	/*
893 	 * XXX this spl is almost unnecessary.  It is partly to allow for
894 	 * sloppy callers that don't do it (issignal() via CURSIG() is the
895 	 * main offender).  It is partly to work around a bug in the i386
896 	 * cpu_switch() (the ipl is not preserved).  We ran for years
897  * without it.  I think there was only an interrupt latency problem.
898 	 * The main caller, msleep(), does an splx() a couple of instructions
899 	 * after calling here.  The buggy caller, issignal(), usually calls
900 	 * here at spl0() and sometimes returns at splhigh().  The process
901 	 * then runs for a little too long at splhigh().  The ipl gets fixed
902 	 * when the process returns to user mode (or earlier).
903 	 *
904 	 * It would probably be better to always call here at spl0(). Callers
905 	 * are prepared to give up control to another process, so they must
906 	 * be prepared to be interrupted.  The clock stuff here may not
907 	 * actually need splstatclock().
908 	 */
909 	x = splstatclock();
910 
911 	mtx_assert(&sched_lock, MA_OWNED);
912 
913 #ifdef SIMPLELOCK_DEBUG
914 	if (p->p_simple_locks)
915 		printf("sleep: holding simple lock\n");
916 #endif
917 	/*
918 	 * Compute the amount of time during which the current
919 	 * process was running, and add that to its total so far.
920 	 */
921 	microuptime(&new_switchtime);
922 	if (timevalcmp(&new_switchtime, &switchtime, <)) {
923 		printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
924 		    switchtime.tv_sec, switchtime.tv_usec,
925 		    new_switchtime.tv_sec, new_switchtime.tv_usec);
926 		new_switchtime = switchtime;
927 	} else {
928 		p->p_runtime += (new_switchtime.tv_usec - switchtime.tv_usec) +
929 		    (new_switchtime.tv_sec - switchtime.tv_sec) * (int64_t)1000000;
930 	}
931 
932 	/*
933 	 * Check if the process exceeds its cpu resource allocation.
934 	 * If over max, kill it.
935 	 *
936 	 * XXX drop sched_lock, pickup Giant
937 	 */
938 	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
939 	    p->p_runtime > p->p_limit->p_cpulimit) {
940 		rlim = &p->p_rlimit[RLIMIT_CPU];
941 		if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
942 			killproc(p, "exceeded maximum CPU limit");
943 		} else {
944 			psignal(p, SIGXCPU);
945 			if (rlim->rlim_cur < rlim->rlim_max) {
946 				/* XXX: we should make a private copy */
947 				rlim->rlim_cur += 5;
948 			}
949 		}
950 	}
951 
952 	/*
953 	 * Pick a new current process and record its start time.
954 	 */
955 	cnt.v_swtch++;
956 	switchtime = new_switchtime;
957 	CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %p",
958 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
959 	cpu_switch();
960 	CTR4(KTR_PROC, "mi_switch: new proc %p (pid %d, %s), schedlock %p",
961 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
962 	if (switchtime.tv_sec == 0)
963 		microuptime(&switchtime);
964 	switchticks = ticks;
965 	splx(x);
966 }
967 
968 /*
969  * Change process state to be runnable,
970  * placing it on the run queue if it is in memory,
971  * and awakening the swapper if it isn't in memory.
972  */
973 void
974 setrunnable(p)
975 	register struct proc *p;
976 {
977 	register int s;
978 
979 	s = splhigh();
980 	mtx_enter(&sched_lock, MTX_SPIN);
981 	switch (p->p_stat) {
982 	case 0:
983 	case SRUN:
984 	case SZOMB:
985 	case SWAIT:
986 	default:
987 		panic("setrunnable");
988 	case SSTOP:
989 	case SSLEEP:
990 		unsleep(p);		/* e.g. when sending signals */
991 		break;
992 
993 	case SIDL:
994 		break;
995 	}
996 	p->p_stat = SRUN;
997 	if (p->p_flag & P_INMEM)
998 		setrunqueue(p);
999 	splx(s);
1000 	if (p->p_slptime > 1)
1001 		updatepri(p);
1002 	p->p_slptime = 0;
1003 	if ((p->p_flag & P_INMEM) == 0) {
1004 		p->p_flag |= P_SWAPINREQ;
1005 		wakeup((caddr_t)&proc0);
1006 	}
1007 	else
1008 		maybe_resched(p);
1009 	mtx_exit(&sched_lock, MTX_SPIN);
1010 }
1011 
1012 /*
1013  * Compute the priority of a process when running in user mode.
1014  * Arrange to reschedule if the resulting priority is better
1015  * than that of the current process.
1016  */
1017 void
1018 resetpriority(p)
1019 	register struct proc *p;
1020 {
1021 	register unsigned int newpriority;
1022 
1023 	mtx_enter(&sched_lock, MTX_SPIN);
1024 	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
1025 		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
1026 		    NICE_WEIGHT * (p->p_nice - PRIO_MIN);
1027 		newpriority = min(newpriority, MAXPRI);
1028 		p->p_usrpri = newpriority;
1029 	}
1030 	maybe_resched(p);
1031 	mtx_exit(&sched_lock, MTX_SPIN);
1032 }
1033 
1034 /* ARGSUSED */
1035 static void
1036 sched_setup(dummy)
1037 	void *dummy;
1038 {
1039 	/* Kick off timeout driven events by calling first time. */
1040 	roundrobin(NULL);
1041 	schedcpu(NULL);
1042 }
1043 
1044 /*
1045  * We adjust the priority of the current process.  The priority of
1046  * a process gets worse as it accumulates CPU time.  The cpu usage
1047  * estimator (p_estcpu) is increased here.  resetpriority() will
1048  * compute a different priority each time p_estcpu increases by
1049  * INVERSE_ESTCPU_WEIGHT
1050  * (until MAXPRI is reached).  The cpu usage estimator ramps up
1051  * quite quickly when the process is running (linearly), and decays
1052  * away exponentially, at a rate which is proportionally slower when
1053  * the system is busy.  The basic principle is that the system will
1054  * 90% forget that the process used a lot of CPU time in 5 * loadav
1055  * seconds.  This causes the system to favor processes which haven't
1056  * run much recently, and to round-robin among other processes.
1057  */
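/*
 * Descriptive note: each call here charges the process one p_cpticks tick and
 * one unit of p_estcpu (capped by ESTCPULIM); since schedcpu() above divides
 * p_cpticks by the stat clock rate, these calls correspond to statclock ticks.
 * A continuously running process therefore drifts toward MAXPRI one priority
 * step per INVERSE_ESTCPU_WEIGHT ticks, while schedcpu() decays p_estcpu once
 * per second via decay_cpu().
 */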
1058 void
1059 schedclock(p)
1060 	struct proc *p;
1061 {
1062 
1063 	p->p_cpticks++;
1064 	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
1065 	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
1066 		resetpriority(p);
1067 		if (p->p_priority >= PUSER)
1068 			p->p_priority = p->p_usrpri;
1069 	}
1070 }
1071