xref: /freebsd/sys/kern/kern_synch.c (revision 6de306ecee3831f48debaad1d0b22418faa48e10)
1 /*-
2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/proc.h>
47 #include <sys/ipl.h>
48 #include <sys/kernel.h>
49 #include <sys/ktr.h>
50 #include <sys/condvar.h>
51 #include <sys/lock.h>
52 #include <sys/mutex.h>
53 #include <sys/signalvar.h>
54 #include <sys/resourcevar.h>
55 #include <sys/vmmeter.h>
56 #include <sys/sysctl.h>
57 #include <sys/sysproto.h>
58 #include <vm/vm.h>
59 #include <vm/vm_extern.h>
60 #ifdef KTRACE
61 #include <sys/uio.h>
62 #include <sys/ktrace.h>
63 #endif
64 
65 #include <machine/cpu.h>
66 #include <machine/smp.h>
67 
68 static void sched_setup __P((void *dummy));
69 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
70 
71 int	hogticks;
72 int	lbolt;
73 int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */
74 
75 static struct callout schedcpu_callout;
76 static struct callout roundrobin_callout;
77 
78 static void	endtsleep __P((void *));
79 static void	roundrobin __P((void *arg));
80 static void	schedcpu __P((void *arg));
81 
82 static int
83 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
84 {
85 	int error, new_val;
86 
87 	new_val = sched_quantum * tick;
88 	error = sysctl_handle_int(oidp, &new_val, 0, req);
89 	if (error != 0 || req->newptr == NULL)
90 		return (error);
91 	if (new_val < tick)
92 		return (EINVAL);
93 	sched_quantum = new_val / tick;
94 	hogticks = 2 * sched_quantum;
95 	return (0);
96 }
97 
98 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
99 	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
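/*
 * Worked example (editor's note, not part of the original source): the
 * sysctl above works in microseconds while sched_quantum is kept in ticks.
 * Assuming hz = 100, so that `tick' is 10000 microseconds:
 *
 *	sleepinit() sets sched_quantum = hz / 10 = 10 ticks,
 *	so reading kern.quantum returns 10 * 10000 = 100000 us;
 *	writing kern.quantum=50000 sets sched_quantum = 5 and hogticks = 10;
 *	writing kern.quantum=5000 fails with EINVAL (less than one tick).
 */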
100 
101 /*
102  * Arrange to reschedule if necessary, taking the priorities and
103  * schedulers into account.
104  */
105 void
106 maybe_resched(p)
107 	struct proc *p;
108 {
109 
110 	mtx_assert(&sched_lock, MA_OWNED);
111 	if (p->p_pri.pri_level < curproc->p_pri.pri_level)
112 		need_resched();
113 }
114 
115 int
116 roundrobin_interval(void)
117 {
118 	return (sched_quantum);
119 }
120 
121 /*
122  * Force switch among equal priority processes every 100ms.
123  */
124 /* ARGSUSED */
125 static void
126 roundrobin(arg)
127 	void *arg;
128 {
129 
130 	mtx_lock_spin(&sched_lock);
131 	need_resched();
132 	mtx_unlock_spin(&sched_lock);
133 #ifdef SMP
134 	forward_roundrobin();
135 #endif
136 
137 	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
138 }
139 
140 /*
141  * Constants for digital decay and forget:
142  *	90% of (p_estcpu) usage in 5 * loadav time
143  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
144  *          Note that, as ps(1) mentions, this can let percentages
145  *          total over 100% (I've seen 137.9% for 3 processes).
146  *
147  * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
148  *
149  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
150  * That is, the system wants to compute a value of decay such
151  * that the following for loop:
152  * 	for (i = 0; i < (5 * loadavg); i++)
153  * 		p_estcpu *= decay;
154  * will compute
155  * 	p_estcpu *= 0.1;
156  * for all values of loadavg:
157  *
158  * Mathematically this loop can be expressed by saying:
159  * 	decay ** (5 * loadavg) ~= .1
160  *
161  * The system computes decay as:
162  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
163  *
164  * We wish to prove that the system's computation of decay
165  * will always fulfill the equation:
166  * 	decay ** (5 * loadavg) ~= .1
167  *
168  * If we compute b as:
169  * 	b = 2 * loadavg
170  * then
171  * 	decay = b / (b + 1)
172  *
173  * We now need to prove two things:
174  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
175  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
176  *
177  * Facts:
178  *         For x close to zero, exp(x) =~ 1 + x, since
179  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
180  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
181  *         For x close to zero, ln(1+x) =~ x, since
182  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
183  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
184  *         ln(.1) =~ -2.30
185  *
186  * Proof of (1):
187  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
188  *	solving for factor,
189  *      ln(factor) =~ (-2.30/5*loadav), or
190  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
191  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
192  *
193  * Proof of (2):
194  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
195  *	solving for power,
196  *      power*ln(b/(b+1)) =~ -2.30, or
197  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
198  *
199  * Actual power values for the implemented algorithm are as follows:
200  *      loadav: 1       2       3       4
201  *      power:  5.68    10.32   14.94   19.55
202  */
203 
204 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
205 #define	loadfactor(loadav)	(2 * (loadav))
206 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
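/*
 * Worked example (editor's note, not part of the original source): with a
 * load average of 1.0 (ldavg[0] == FSCALE), loadfactor() yields 2 * FSCALE,
 * so decay_cpu() scales p_estcpu by 2*FSCALE / (2*FSCALE + FSCALE) = 2/3
 * each second.  (2/3)^n reaches 0.1 at n = ln(0.1)/ln(2/3) ~= 5.68, the
 * "power" tabulated above for loadav 1; for loadav 2 the factor is 4/5 and
 * n ~= 10.32, matching the table as well.
 */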
207 
208 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
209 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
210 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
211 
212 /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
213 static int	fscale __unused = FSCALE;
214 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
215 
216 /*
217  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
218  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
219  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
220  *
221  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
222  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
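 *
 * Worked check (editor's note, illustrative only): with FSHIFT == 11,
 * FSCALE == 2048 and ccpu ~= 0.95123 * 2048 ~= 1948, so FSCALE - ccpu ~= 100
 * and 100 / 2^11 ~= 0.0488 ~= 1 - exp(-1/20).  This is why, when
 * realstathz == 100, schedcpu() below may simply add
 * p_cpticks << (FSHIFT - CCPU_SHIFT) (i.e. p_cpticks) instead of doing the
 * slower (FSCALE - ccpu) multiply.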
223  *
224  * If you don't want to bother with the faster/more-accurate formula, you
225  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
226  * (more general) method of calculating the %age of CPU used by a process.
227  */
228 #define	CCPU_SHIFT	11
229 
230 /*
231  * Recompute process priorities, every hz ticks.
232  * MP-safe, called without the Giant mutex.
233  */
234 /* ARGSUSED */
235 static void
236 schedcpu(arg)
237 	void *arg;
238 {
239 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
240 	register struct proc *p;
241 	register int realstathz, s;
242 
243 	realstathz = stathz ? stathz : hz;
244 	ALLPROC_LOCK(AP_SHARED);
245 	LIST_FOREACH(p, &allproc, p_list) {
246 		/*
247 		 * Increment time in/out of memory and sleep time
248 		 * (if sleeping).  We ignore overflow; with 16-bit int's
249 		 * (remember them?) overflow takes 45 days.
250 		 *	if (p->p_stat == SWAIT)
251 		 *		continue;
252 		 */
253 		mtx_lock_spin(&sched_lock);
254 		p->p_swtime++;
255 		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
256 			p->p_slptime++;
257 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
258 		/*
259 		 * If the process has slept the entire second,
260 		 * stop recalculating its priority until it wakes up.
261 		 */
262 		if (p->p_slptime > 1) {
263 			mtx_unlock_spin(&sched_lock);
264 			continue;
265 		}
266 
267 		/*
268 		 * prevent state changes and protect run queue
269 		 */
270 		s = splhigh();
271 
272 		/*
273 		 * p_pctcpu is only for ps.
274 		 */
275 #if	(FSHIFT >= CCPU_SHIFT)
276 		p->p_pctcpu += (realstathz == 100)?
277 			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
278                 	100 * (((fixpt_t) p->p_cpticks)
279 				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
280 #else
281 		p->p_pctcpu += ((FSCALE - ccpu) *
282 			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
283 #endif
284 		p->p_cpticks = 0;
285 		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
286 		resetpriority(p);
287 		if (p->p_pri.pri_level >= PUSER) {
288 			if ((p != curproc) &&
289 #ifdef SMP
290 			    p->p_oncpu == NOCPU && 	/* idle */
291 #endif
292 			    p->p_stat == SRUN &&
293 			    (p->p_sflag & PS_INMEM) &&
294 			    (p->p_pri.pri_level / RQ_PPQ) !=
295 			    (p->p_pri.pri_user / RQ_PPQ)) {
296 				remrunqueue(p);
297 				p->p_pri.pri_level = p->p_pri.pri_user;
298 				setrunqueue(p);
299 			} else
300 				p->p_pri.pri_level = p->p_pri.pri_user;
301 		}
302 		mtx_unlock_spin(&sched_lock);
303 		splx(s);
304 	}
305 	ALLPROC_LOCK(AP_RELEASE);
306 	vmmeter();
307 	wakeup((caddr_t)&lbolt);
308 	callout_reset(&schedcpu_callout, hz, schedcpu, NULL);
309 }
310 
311 /*
312  * Recalculate the priority of a process after it has slept for a while.
313  * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
314  * least six times the loadfactor will decay p_estcpu to zero.
315  */
316 void
317 updatepri(p)
318 	register struct proc *p;
319 {
320 	register unsigned int newcpu = p->p_estcpu;
321 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
322 
323 	if (p->p_slptime > 5 * loadfac)
324 		p->p_estcpu = 0;
325 	else {
326 		p->p_slptime--;	/* the first time was done in schedcpu */
327 		while (newcpu && --p->p_slptime)
328 			newcpu = decay_cpu(loadfac, newcpu);
329 		p->p_estcpu = newcpu;
330 	}
331 	resetpriority(p);
332 }
333 
334 /*
335  * We're only looking at 7 bits of the address; everything is
336  * aligned to 4, lots of things are aligned to greater powers
337  * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
338  */
339 #define TABLESIZE	128
340 static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
341 #define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
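/*
 * Example (editor's note, not part of the original source):
 * LOOKUP(0x12345678) is (0x12345678 >> 8) & 127 == 0x123456 & 0x7f == 0x56,
 * so every ident in 0x12345600-0x123456ff hashes to slpque[0x56].  Such
 * collisions are harmless: wakeup() still compares p_wchan against the
 * exact ident before removing a process from the queue.
 */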
342 
343 void
344 sleepinit(void)
345 {
346 	int i;
347 
348 	sched_quantum = hz/10;
349 	hogticks = 2 * sched_quantum;
350 	for (i = 0; i < TABLESIZE; i++)
351 		TAILQ_INIT(&slpque[i]);
352 }
353 
354 /*
355  * General sleep call.  Suspends the current process until a wakeup is
356  * performed on the specified identifier.  The process will then be made
357  * runnable with the specified priority.  Sleeps at most timo/hz seconds
358  * (0 means no timeout).  If pri includes the PCATCH flag, signals are checked
359  * before and after sleeping, else signals are not checked.  Returns 0 if
360  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
361  * signal needs to be delivered, ERESTART is returned if the current system
362  * call should be restarted if possible, and EINTR is returned if the system
363  * call should be interrupted by the signal.
364  *
365  * The mutex argument is exited before the caller is suspended, and
366  * entered before msleep returns.  If priority includes the PDROP
367  * flag the mutex is not entered before returning.
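 *
 * Minimal usage sketch (editor's note; sc, sc_mtx and sc_flag are
 * hypothetical names, not from this file).  The caller holds sc_mtx, which
 * msleep() drops while asleep and reacquires before returning; the hz
 * timeout bounds each sleep to roughly one second:
 *
 *	mtx_lock(&sc->sc_mtx);
 *	while (sc->sc_flag == 0)
 *		if (msleep(&sc->sc_flag, &sc->sc_mtx, PZERO | PCATCH,
 *		    "scwait", hz) == EWOULDBLOCK)
 *			break;
 *	mtx_unlock(&sc->sc_mtx);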
368  */
369 int
370 msleep(ident, mtx, priority, wmesg, timo)
371 	void *ident;
372 	struct mtx *mtx;
373 	int priority, timo;
374 	const char *wmesg;
375 {
376 	struct proc *p = curproc;
377 	int s, sig, catch = priority & PCATCH;
378 	int rval = 0;
379 	WITNESS_SAVE_DECL(mtx);
380 
381 #ifdef KTRACE
382 	if (p && KTRPOINT(p, KTR_CSW))
383 		ktrcsw(p->p_tracep, 1, 0);
384 #endif
385 	WITNESS_SLEEP(0, mtx);
386 	mtx_lock_spin(&sched_lock);
387 	s = splhigh();
388 	if (cold || panicstr) {
389 		/*
390 		 * After a panic, or during autoconfiguration,
391 		 * just give interrupts a chance, then just return;
392 		 * don't run any other procs or panic below,
393 		 * in case this is the idle process and already asleep.
394 		 */
395 		if (mtx != NULL && priority & PDROP)
396 			mtx_unlock_flags(mtx, MTX_NOSWITCH);
397 		mtx_unlock_spin(&sched_lock);
398 		splx(s);
399 		return (0);
400 	}
401 
402 	DROP_GIANT_NOSWITCH();
403 
404 	if (mtx != NULL) {
405 		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
406 		WITNESS_SAVE(mtx, mtx);
407 		mtx_unlock_flags(mtx, MTX_NOSWITCH);
408 		if (priority & PDROP)
409 			mtx = NULL;
410 	}
411 
412 	KASSERT(p != NULL, ("msleep1"));
413 	KASSERT(ident != NULL && p->p_stat == SRUN, ("msleep"));
414 	/*
415 	 * Process may be sitting on a slpque if asleep() was called; remove
416 	 * it before re-adding.
417 	 */
418 	if (p->p_wchan != NULL)
419 		unsleep(p);
420 
421 	p->p_wchan = ident;
422 	p->p_wmesg = wmesg;
423 	p->p_slptime = 0;
424 	p->p_pri.pri_level = priority & PRIMASK;
425 	CTR4(KTR_PROC, "msleep: proc %p (pid %d, %s), schedlock %p",
426 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
427 	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
428 	if (timo)
429 		callout_reset(&p->p_slpcallout, timo, endtsleep, p);
430 	/*
431 	 * We put ourselves on the sleep queue and start our timeout
432 	 * before calling CURSIG, as we could stop there, and a wakeup
433 	 * or a SIGCONT (or both) could occur while we were stopped.
434 	 * A SIGCONT would cause us to be marked as SSLEEP
435 	 * without resuming us, thus we must be ready for sleep
436 	 * when CURSIG is called.  If the wakeup happens while we're
437 	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
438 	 */
439 	if (catch) {
440 		CTR4(KTR_PROC,
441 		        "msleep caught: proc %p (pid %d, %s), schedlock %p",
442 			p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
443 		p->p_sflag |= PS_SINTR;
444 		mtx_unlock_spin(&sched_lock);
445 		if ((sig = CURSIG(p))) {
446 			mtx_lock_spin(&sched_lock);
447 			if (p->p_wchan)
448 				unsleep(p);
449 			p->p_stat = SRUN;
450 			goto resume;
451 		}
452 		mtx_lock_spin(&sched_lock);
453 		if (p->p_wchan == NULL) {
454 			catch = 0;
455 			goto resume;
456 		}
457 	} else
458 		sig = 0;
459 	p->p_stat = SSLEEP;
460 	p->p_stats->p_ru.ru_nvcsw++;
461 	mi_switch();
462 	CTR4(KTR_PROC,
463 	        "msleep resume: proc %p (pid %d, %s), schedlock %p",
464 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
465 resume:
466 	splx(s);
467 	p->p_sflag &= ~PS_SINTR;
468 	if (p->p_sflag & PS_TIMEOUT) {
469 		p->p_sflag &= ~PS_TIMEOUT;
470 		if (sig == 0) {
471 #ifdef KTRACE
472 			if (KTRPOINT(p, KTR_CSW))
473 				ktrcsw(p->p_tracep, 0, 0);
474 #endif
475 			rval = EWOULDBLOCK;
476 			mtx_unlock_spin(&sched_lock);
477 			goto out;
478 		}
479 	} else if (timo)
480 		callout_stop(&p->p_slpcallout);
481 	mtx_unlock_spin(&sched_lock);
482 
483 	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
484 #ifdef KTRACE
485 		if (KTRPOINT(p, KTR_CSW))
486 			ktrcsw(p->p_tracep, 0, 0);
487 #endif
488 		if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
489 			rval = EINTR;
490 		else
491 			rval = ERESTART;
492 		goto out;
493 	}
494 out:
495 #ifdef KTRACE
496 	if (KTRPOINT(p, KTR_CSW))
497 		ktrcsw(p->p_tracep, 0, 0);
498 #endif
499 	PICKUP_GIANT();
500 	if (mtx != NULL) {
501 		mtx_lock(mtx);
502 		WITNESS_RESTORE(mtx, mtx);
503 	}
504 	return (rval);
505 }
506 
507 /*
508  * asleep() - async sleep call.  Place process on wait queue and return
509  * immediately without blocking.  The process stays runnable until mawait()
510  * is called.  If ident is NULL, remove process from wait queue if it is still
511  * on one.
512  *
513  * Only the most recent sleep condition is effective when making successive
514  * calls to asleep() or when calling msleep().
515  *
516  * The timeout, if any, is not initiated until mawait() is called.  The sleep
517  * priority, signal, and timeout are specified in the asleep() call but may be
518  * overridden in the mawait() call.
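 *
 * Usage sketch (editor's note; work_done and do_other_work() are
 * hypothetical names):
 *
 *	asleep(&work_done, PWAIT, "await", hz);
 *	do_other_work();		(still runnable, not yet blocked)
 *	error = mawait(NULL, -1, -1);	(block now, using the asleep() values)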
519  *
520  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
521  */
522 
523 int
524 asleep(void *ident, int priority, const char *wmesg, int timo)
525 {
526 	struct proc *p = curproc;
527 	int s;
528 
529 	/*
530 	 * obtain sched_lock while manipulating sleep structures and slpque.
531 	 *
532 	 * Remove preexisting wait condition (if any) and place process
533 	 * on appropriate slpque, but do not put process to sleep.
534 	 */
535 
536 	s = splhigh();
537 	mtx_lock_spin(&sched_lock);
538 
539 	if (p->p_wchan != NULL)
540 		unsleep(p);
541 
542 	if (ident) {
543 		p->p_wchan = ident;
544 		p->p_wmesg = wmesg;
545 		p->p_slptime = 0;
546 		p->p_asleep.as_priority = priority;
547 		p->p_asleep.as_timo = timo;
548 		TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
549 	}
550 
551 	mtx_unlock_spin(&sched_lock);
552 	splx(s);
553 
554 	return(0);
555 }
556 
557 /*
558  * mawait() - wait for async condition to occur.   The process blocks until
559  * wakeup() is called on the most recent asleep() address.  If wakeup() is called
560  * prior to mawait(), mawait() winds up being a NOP.
561  *
562  * If mawait() is called more than once (without an intervening asleep() call),
563  * mawait() is still effectively a NOP but it calls mi_switch() to give other
564  * processes some cpu before returning.  The process is left runnable.
565  *
566  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
567  */
568 
569 int
570 mawait(struct mtx *mtx, int priority, int timo)
571 {
572 	struct proc *p = curproc;
573 	int rval = 0;
574 	int s;
575 	WITNESS_SAVE_DECL(mtx);
576 
577 	WITNESS_SLEEP(0, mtx);
578 	mtx_lock_spin(&sched_lock);
579 	DROP_GIANT_NOSWITCH();
580 	if (mtx != NULL) {
581 		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
582 		WITNESS_SAVE(mtx, mtx);
583 		mtx_unlock_flags(mtx, MTX_NOSWITCH);
584 		if (priority & PDROP)
585 			mtx = NULL;
586 	}
587 
588 	s = splhigh();
589 
590 	if (p->p_wchan != NULL) {
591 		int sig;
592 		int catch;
593 
594 		/*
595 		 * The call to mawait() can override defaults specified in
596 		 * the original asleep().
597 		 */
598 		if (priority < 0)
599 			priority = p->p_asleep.as_priority;
600 		if (timo < 0)
601 			timo = p->p_asleep.as_timo;
602 
603 		/*
604 		 * Install timeout
605 		 */
606 
607 		if (timo)
608 			callout_reset(&p->p_slpcallout, timo, endtsleep, p);
609 
610 		sig = 0;
611 		catch = priority & PCATCH;
612 
613 		if (catch) {
614 			p->p_sflag |= PS_SINTR;
615 			mtx_unlock_spin(&sched_lock);
616 			if ((sig = CURSIG(p))) {
617 				mtx_lock_spin(&sched_lock);
618 				if (p->p_wchan)
619 					unsleep(p);
620 				p->p_stat = SRUN;
621 				goto resume;
622 			}
623 			mtx_lock_spin(&sched_lock);
624 			if (p->p_wchan == NULL) {
625 				catch = 0;
626 				goto resume;
627 			}
628 		}
629 		p->p_stat = SSLEEP;
630 		p->p_stats->p_ru.ru_nvcsw++;
631 		mi_switch();
632 resume:
633 
634 		splx(s);
635 		p->p_sflag &= ~PS_SINTR;
636 		if (p->p_sflag & PS_TIMEOUT) {
637 			p->p_sflag &= ~PS_TIMEOUT;
638 			if (sig == 0) {
639 #ifdef KTRACE
640 				if (KTRPOINT(p, KTR_CSW))
641 					ktrcsw(p->p_tracep, 0, 0);
642 #endif
643 				rval = EWOULDBLOCK;
644 				mtx_unlock_spin(&sched_lock);
645 				goto out;
646 			}
647 		} else if (timo)
648 			callout_stop(&p->p_slpcallout);
649 		mtx_unlock_spin(&sched_lock);
650 
651 		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
652 #ifdef KTRACE
653 			if (KTRPOINT(p, KTR_CSW))
654 				ktrcsw(p->p_tracep, 0, 0);
655 #endif
656 			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
657 				rval = EINTR;
658 			else
659 				rval = ERESTART;
660 			goto out;
661 		}
662 #ifdef KTRACE
663 		if (KTRPOINT(p, KTR_CSW))
664 			ktrcsw(p->p_tracep, 0, 0);
665 #endif
666 	} else {
667 		/*
668 		 * If as_priority is 0, mawait() has been called without an
669 		 * intervening asleep().  We are still effectively a NOP,
670 		 * but we call mi_switch() for safety.
671 		 */
672 
673 		if (p->p_asleep.as_priority == 0) {
674 			p->p_stats->p_ru.ru_nvcsw++;
675 			mi_switch();
676 		}
677 		mtx_unlock_spin(&sched_lock);
678 		splx(s);
679 	}
680 
681 	/*
682 	 * clear p_asleep.as_priority as an indication that mawait() has been
683 	 * called.  If mawait() is called again without an intervening asleep(),
684 	 * mawait() is still effectively a NOP but the above mi_switch() code
685 	 * is triggered as a safety.
686 	 */
687 	p->p_asleep.as_priority = 0;
688 
689 out:
690 	PICKUP_GIANT();
691 	if (mtx != NULL) {
692 		mtx_lock(mtx);
693 		WITNESS_RESTORE(mtx, mtx);
694 	}
695 	return (rval);
696 }
697 
698 /*
699  * Implement the timeout for msleep() or asleep()/mawait().
700  *
701  * If process hasn't been awakened (wchan non-zero),
702  * set timeout flag and undo the sleep.  If proc
703  * is stopped, just unsleep so it will remain stopped.
704  * MP-safe, called without the Giant mutex.
705  */
706 static void
707 endtsleep(arg)
708 	void *arg;
709 {
710 	register struct proc *p;
711 	int s;
712 
713 	p = (struct proc *)arg;
714 	CTR4(KTR_PROC,
715 	        "endtsleep: proc %p (pid %d, %s), schedlock %p",
716 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
717 	s = splhigh();
718 	mtx_lock_spin(&sched_lock);
719 	if (p->p_wchan) {
720 		if (p->p_stat == SSLEEP)
721 			setrunnable(p);
722 		else
723 			unsleep(p);
724 		p->p_sflag |= PS_TIMEOUT;
725 	}
726 	mtx_unlock_spin(&sched_lock);
727 	splx(s);
728 }
729 
730 /*
731  * Remove a process from its wait queue
732  */
733 void
734 unsleep(p)
735 	register struct proc *p;
736 {
737 	int s;
738 
739 	s = splhigh();
740 	mtx_lock_spin(&sched_lock);
741 	if (p->p_wchan) {
742 		TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq);
743 		p->p_wchan = NULL;
744 	}
745 	mtx_unlock_spin(&sched_lock);
746 	splx(s);
747 }
748 
749 /*
750  * Make all processes sleeping on the specified identifier runnable.
751  */
752 void
753 wakeup(ident)
754 	register void *ident;
755 {
756 	register struct slpquehead *qp;
757 	register struct proc *p;
758 	int s;
759 
760 	s = splhigh();
761 	mtx_lock_spin(&sched_lock);
762 	qp = &slpque[LOOKUP(ident)];
763 restart:
764 	TAILQ_FOREACH(p, qp, p_slpq) {
765 		if (p->p_wchan == ident) {
766 			TAILQ_REMOVE(qp, p, p_slpq);
767 			p->p_wchan = NULL;
768 			if (p->p_stat == SSLEEP) {
769 				/* OPTIMIZED EXPANSION OF setrunnable(p); */
770 				CTR4(KTR_PROC,
771 				        "wakeup: proc %p (pid %d, %s), schedlock %p",
772 					p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
773 				if (p->p_slptime > 1)
774 					updatepri(p);
775 				p->p_slptime = 0;
776 				p->p_stat = SRUN;
777 				if (p->p_sflag & PS_INMEM) {
778 					setrunqueue(p);
779 					maybe_resched(p);
780 				} else {
781 					p->p_sflag |= PS_SWAPINREQ;
782 					wakeup((caddr_t)&proc0);
783 				}
784 				/* END INLINE EXPANSION */
785 				goto restart;
786 			}
787 		}
788 	}
789 	mtx_unlock_spin(&sched_lock);
790 	splx(s);
791 }
792 
793 /*
794  * Make a process sleeping on the specified identifier runnable.
795  * May wake more than one process if a target process is currently
796  * swapped out.
797  */
798 void
799 wakeup_one(ident)
800 	register void *ident;
801 {
802 	register struct slpquehead *qp;
803 	register struct proc *p;
804 	int s;
805 
806 	s = splhigh();
807 	mtx_lock_spin(&sched_lock);
808 	qp = &slpque[LOOKUP(ident)];
809 
810 	TAILQ_FOREACH(p, qp, p_slpq) {
811 		if (p->p_wchan == ident) {
812 			TAILQ_REMOVE(qp, p, p_slpq);
813 			p->p_wchan = NULL;
814 			if (p->p_stat == SSLEEP) {
815 				/* OPTIMIZED EXPANSION OF setrunnable(p); */
816 				CTR4(KTR_PROC,
817 				        "wakeup1: proc %p (pid %d, %s), schedlock %p",
818 					p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
819 				if (p->p_slptime > 1)
820 					updatepri(p);
821 				p->p_slptime = 0;
822 				p->p_stat = SRUN;
823 				if (p->p_sflag & PS_INMEM) {
824 					setrunqueue(p);
825 					maybe_resched(p);
826 					break;
827 				} else {
828 					p->p_sflag |= PS_SWAPINREQ;
829 					wakeup((caddr_t)&proc0);
830 				}
831 				/* END INLINE EXPANSION */
832 			}
833 		}
834 	}
835 	mtx_unlock_spin(&sched_lock);
836 	splx(s);
837 }
838 
839 /*
840  * The machine independent parts of mi_switch().
841  * Must be called at splstatclock() or higher.
842  */
843 void
844 mi_switch()
845 {
846 	struct timeval new_switchtime;
847 	register struct proc *p = curproc;	/* XXX */
848 #if 0
849 	register struct rlimit *rlim;
850 #endif
851 	int x;
852 	u_int sched_nest;
853 
854 	/*
855 	 * XXX this spl is almost unnecessary.  It is partly to allow for
856 	 * sloppy callers that don't do it (issignal() via CURSIG() is the
857 	 * main offender).  It is partly to work around a bug in the i386
858 	 * cpu_switch() (the ipl is not preserved).  We ran for years
859  * without it.  I think there was only an interrupt latency problem.
860 	 * The main caller, msleep(), does an splx() a couple of instructions
861 	 * after calling here.  The buggy caller, issignal(), usually calls
862 	 * here at spl0() and sometimes returns at splhigh().  The process
863 	 * then runs for a little too long at splhigh().  The ipl gets fixed
864 	 * when the process returns to user mode (or earlier).
865 	 *
866 	 * It would probably be better to always call here at spl0(). Callers
867 	 * are prepared to give up control to another process, so they must
868 	 * be prepared to be interrupted.  The clock stuff here may not
869 	 * actually need splstatclock().
870 	 */
871 	x = splstatclock();
872 
873 	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
874 
875 	/*
876 	 * Compute the amount of time during which the current
877 	 * process was running, and add that to its total so far.
878 	 */
879 	microuptime(&new_switchtime);
880 	if (timevalcmp(&new_switchtime, PCPU_PTR(switchtime), <)) {
881 #if 0
882 		/* XXX: This doesn't play well with sched_lock right now. */
883 		printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
884 		    PCPU_GET(switchtime.tv_sec), PCPU_GET(switchtime.tv_usec),
885 		    new_switchtime.tv_sec, new_switchtime.tv_usec);
886 #endif
887 		new_switchtime = PCPU_GET(switchtime);
888 	} else {
889 		p->p_runtime += (new_switchtime.tv_usec - PCPU_GET(switchtime.tv_usec)) +
890 		    (new_switchtime.tv_sec - PCPU_GET(switchtime.tv_sec)) *
891 		    (int64_t)1000000;
892 	}
893 
894 #if 0
895 	/*
896 	 * Check if the process exceeds its cpu resource allocation.
897 	 * If over max, kill it.
898 	 *
899 	 * XXX drop sched_lock, pickup Giant
900 	 */
901 	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
902 	    p->p_runtime > p->p_limit->p_cpulimit) {
903 		rlim = &p->p_rlimit[RLIMIT_CPU];
904 		if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
905 			mtx_unlock_spin(&sched_lock);
906 			killproc(p, "exceeded maximum CPU limit");
907 			mtx_lock_spin(&sched_lock);
908 		} else {
909 			mtx_unlock_spin(&sched_lock);
910 			psignal(p, SIGXCPU);
911 			mtx_lock_spin(&sched_lock);
912 			if (rlim->rlim_cur < rlim->rlim_max) {
913 				/* XXX: we should make a private copy */
914 				rlim->rlim_cur += 5;
915 			}
916 		}
917 	}
918 #endif
919 
920 	/*
921 	 * Pick a new current process and record its start time.
922 	 */
923 	cnt.v_swtch++;
924 	PCPU_SET(switchtime, new_switchtime);
925 	CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %p",
926 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
927 	sched_nest = sched_lock.mtx_recurse;
928 	curproc->p_lastcpu = curproc->p_oncpu;
929 	curproc->p_oncpu = NOCPU;
930 	clear_resched();
931 	cpu_switch();
932 	curproc->p_oncpu = PCPU_GET(cpuid);
933 	sched_lock.mtx_recurse = sched_nest;
934 	sched_lock.mtx_lock = (uintptr_t)curproc;
935 	CTR4(KTR_PROC, "mi_switch: new proc %p (pid %d, %s), schedlock %p",
936 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
937 	if (PCPU_GET(switchtime.tv_sec) == 0)
938 		microuptime(PCPU_PTR(switchtime));
939 	PCPU_SET(switchticks, ticks);
940 	splx(x);
941 }
942 
943 /*
944  * Change process state to be runnable,
945  * placing it on the run queue if it is in memory,
946  * and awakening the swapper if it isn't in memory.
947  */
948 void
949 setrunnable(p)
950 	register struct proc *p;
951 {
952 	register int s;
953 
954 	s = splhigh();
955 	mtx_lock_spin(&sched_lock);
956 	switch (p->p_stat) {
957 	case 0:
958 	case SRUN:
959 	case SZOMB:
960 	case SWAIT:
961 	default:
962 		panic("setrunnable");
963 	case SSTOP:
964 	case SSLEEP:			/* e.g. when sending signals */
965 		if (p->p_sflag & PS_CVWAITQ)
966 			cv_waitq_remove(p);
967 		else
968 			unsleep(p);
969 		break;
970 
971 	case SIDL:
972 		break;
973 	}
974 	p->p_stat = SRUN;
975 	if (p->p_sflag & PS_INMEM)
976 		setrunqueue(p);
977 	splx(s);
978 	if (p->p_slptime > 1)
979 		updatepri(p);
980 	p->p_slptime = 0;
981 	if ((p->p_sflag & PS_INMEM) == 0) {
982 		p->p_sflag |= PS_SWAPINREQ;
983 		wakeup((caddr_t)&proc0);
984 	}
985 	else
986 		maybe_resched(p);
987 	mtx_unlock_spin(&sched_lock);
988 }
989 
990 /*
991  * Compute the priority of a process when running in user mode.
992  * Arrange to reschedule if the resulting priority is better
993  * than that of the current process.
994  */
995 void
996 resetpriority(p)
997 	register struct proc *p;
998 {
999 	register unsigned int newpriority;
1000 
1001 	mtx_lock_spin(&sched_lock);
1002 	if (p->p_pri.pri_class == PRI_TIMESHARE) {
1003 		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
1004 		    NICE_WEIGHT * (p->p_nice - PRIO_MIN);
1005 		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
1006 		    PRI_MAX_TIMESHARE);
1007 		p->p_pri.pri_user = newpriority;
1008 	}
1009 	maybe_resched(p);
1010 	mtx_unlock_spin(&sched_lock);
1011 }
1012 
1013 /* ARGSUSED */
1014 static void
1015 sched_setup(dummy)
1016 	void *dummy;
1017 {
1018 
1019 	callout_init(&schedcpu_callout, 1);
1020 	callout_init(&roundrobin_callout, 0);
1021 
1022 	/* Kick off timeout driven events by calling first time. */
1023 	roundrobin(NULL);
1024 	schedcpu(NULL);
1025 }
1026 
1027 /*
1028  * We adjust the priority of the current process.  The priority of
1029  * a process gets worse as it accumulates CPU time.  The cpu usage
1030  * estimator (p_estcpu) is increased here.  resetpriority() will
1031  * compute a different priority each time p_estcpu increases by
1032  * INVERSE_ESTCPU_WEIGHT
1033  * (until MAXPRI is reached).  The cpu usage estimator ramps up
1034  * quite quickly when the process is running (linearly), and decays
1035  * away exponentially, at a rate which is proportionally slower when
1036  * the system is busy.  The basic principle is that the system will
1037  * 90% forget that the process used a lot of CPU time in 5 * loadav
1038  * seconds.  This causes the system to favor processes which haven't
1039  * run much recently, and to round-robin among other processes.
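 *
 * Worked example (editor's note): each statclock tick adds one to p_estcpu,
 * and resetpriority() above computes the user priority as
 * PUSER + p_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (p_nice - PRIO_MIN),
 * clamped to the timeshare range.  Every INVERSE_ESTCPU_WEIGHT ticks of CPU
 * use therefore pushes pri_user one step higher (numerically worse), which
 * is why schedclock() only calls resetpriority() when p_estcpu has just
 * crossed such a boundary.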
1040  */
1041 void
1042 schedclock(p)
1043 	struct proc *p;
1044 {
1045 
1046 	p->p_cpticks++;
1047 	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
1048 	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
1049 		resetpriority(p);
1050 		if (p->p_pri.pri_level >= PUSER)
1051 			p->p_pri.pri_level = p->p_pri.pri_user;
1052 	}
1053 }
1054 
1055 /*
1056  * General purpose yield system call
1057  */
1058 int
1059 yield(struct proc *p, struct yield_args *uap)
1060 {
1061 	int s;
1062 
1063 	p->p_retval[0] = 0;
1064 
1065 	s = splhigh();
1066 	mtx_lock_spin(&sched_lock);
1067 	DROP_GIANT_NOSWITCH();
1068 	p->p_pri.pri_level = PRI_MAX_TIMESHARE;
1069 	setrunqueue(p);
1070 	p->p_stats->p_ru.ru_nvcsw++;
1071 	mi_switch();
1072 	mtx_unlock_spin(&sched_lock);
1073 	PICKUP_GIANT();
1074 	splx(s);
1075 
1076 	return (0);
1077 }
1078