xref: /freebsd/sys/kern/kern_synch.c (revision 1b6c76a2fe091c74f08427e6c870851025a9cf67)
1 /*-
2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/condvar.h>
47 #include <sys/kernel.h>
48 #include <sys/ktr.h>
49 #include <sys/lock.h>
50 #include <sys/mutex.h>
51 #include <sys/proc.h>
52 #include <sys/resourcevar.h>
53 #include <sys/signalvar.h>
54 #include <sys/smp.h>
55 #include <sys/sx.h>
56 #include <sys/sysctl.h>
57 #include <sys/sysproto.h>
58 #include <sys/vmmeter.h>
59 #include <vm/vm.h>
60 #include <vm/vm_extern.h>
61 #ifdef KTRACE
62 #include <sys/uio.h>
63 #include <sys/ktrace.h>
64 #endif
65 
66 #include <machine/cpu.h>
67 
68 static void sched_setup __P((void *dummy));
69 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
70 
71 int	hogticks;
72 int	lbolt;
73 int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */
74 
75 static struct callout schedcpu_callout;
76 static struct callout roundrobin_callout;
77 
78 static void	endtsleep __P((void *));
79 static void	roundrobin __P((void *arg));
80 static void	schedcpu __P((void *arg));
81 
82 static int
83 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
84 {
85 	int error, new_val;
86 
87 	new_val = sched_quantum * tick;
88 	error = sysctl_handle_int(oidp, &new_val, 0, req);
89 	if (error != 0 || req->newptr == NULL)
90 		return (error);
91 	if (new_val < tick)
92 		return (EINVAL);
93 	sched_quantum = new_val / tick;
94 	hogticks = 2 * sched_quantum;
95 	return (0);
96 }
97 
98 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
99 	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
100 
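/*
 * Illustrative note (not part of the original source): kern.quantum is
 * expressed in microseconds, because the handler above reports
 * sched_quantum * tick and tick is the length of one clock tick in
 * microseconds.  With the traditional default of hz = 100 and
 * sched_quantum = hz / 10 (set in sleepinit() below), the sysctl reads
 * back as 100000, i.e. the 100ms round-robin interval.  A hypothetical
 * tuning session might look like:
 *
 *	sysctl kern.quantum		(reports 100000)
 *	sysctl -w kern.quantum=50000	(shrink the quantum to 50ms)
 *
 * Values smaller than one tick are rejected with EINVAL by the handler.
 */
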
101 /*
102  * Arrange to reschedule if necessary, taking the priorities and
103  * schedulers into account.
104  */
105 void
106 maybe_resched(p)
107 	struct proc *p;
108 {
109 
110 	mtx_assert(&sched_lock, MA_OWNED);
111 	if (p->p_pri.pri_level < curproc->p_pri.pri_level)
112 		need_resched(curproc);
113 }
114 
115 int
116 roundrobin_interval(void)
117 {
118 	return (sched_quantum);
119 }
120 
121 /*
122  * Force switch among equal priority processes every 100ms.
123  */
124 /* ARGSUSED */
125 static void
126 roundrobin(arg)
127 	void *arg;
128 {
129 
130 	mtx_lock_spin(&sched_lock);
131 	need_resched(curproc);
132 #ifdef SMP
133 	forward_roundrobin();
134 #endif
135 	mtx_unlock_spin(&sched_lock);
136 
137 	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
138 }
139 
140 /*
141  * Constants for digital decay and forget:
142  *	90% of (p_estcpu) usage in 5 * loadav time
143  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
144  *          Note that, as ps(1) mentions, this can let percentages
145  *          total over 100% (I've seen 137.9% for 3 processes).
146  *
147  * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
148  *
149  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
150  * That is, the system wants to compute a value of decay such
151  * that the following for loop:
152  * 	for (i = 0; i < (5 * loadavg); i++)
153  * 		p_estcpu *= decay;
154  * will compute
155  * 	p_estcpu *= 0.1;
156  * for all values of loadavg:
157  *
158  * Mathematically this loop can be expressed by saying:
159  * 	decay ** (5 * loadavg) ~= .1
160  *
161  * The system computes decay as:
162  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
163  *
164  * We wish to prove that the system's computation of decay
165  * will always fulfill the equation:
166  * 	decay ** (5 * loadavg) ~= .1
167  *
168  * If we compute b as:
169  * 	b = 2 * loadavg
170  * then
171  * 	decay = b / (b + 1)
172  *
173  * We now need to prove two things:
174  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
175  *	2) Given (b/(b+1)) ** power ~= .1, prove power == (5 * loadavg)
176  *
177  * Facts:
178  *         For x close to zero, exp(x) =~ 1 + x, since
179  *              exp(x) = x**0/0! + x**1/1! + x**2/2! + ... .
180  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
181  *         For x close to zero, ln(1+x) =~ x, since
182  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
183  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
184  *         ln(.1) =~ -2.30
185  *
186  * Proof of (1):
187  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
188  *	solving for factor,
189  *      ln(factor) =~ (-2.30/(5*loadav)), or
190  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
191  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
192  *
193  * Proof of (2):
194  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
195  *	solving for power,
196  *      power*ln(b/(b+1)) =~ -2.30, or
197  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
198  *
199  * Actual power values for the implemented algorithm are as follows:
200  *      loadav: 1       2       3       4
201  *      power:  5.68    10.32   14.94   19.55
202  */
203 
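/*
 * A worked check of the numbers above (added for illustration): with a
 * load average of 1, b = 2 and decay = 2/3.  Using the tabulated power
 * of 5.68, (2/3)**5.68 =~ 0.100, whereas the nominal 5 * loadavg = 5
 * gives (2/3)**5 =~ 0.13 -- close to, but not exactly, the intended
 * 10%.  Likewise for a load average of 2, decay = 4/5 and
 * (4/5)**10.32 =~ 0.100 versus (4/5)**10 =~ 0.107.
 */
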
204 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
205 #define	loadfactor(loadav)	(2 * (loadav))
206 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
207 
208 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
209 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
210 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
211 
212 /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
213 static int	fscale __unused = FSCALE;
214 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
215 
216 /*
217  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
218  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
219  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
220  *
221  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
222  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
223  *
224  * If you don't want to bother with the faster/more-accurate formula, you
225  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
226  * (more general) method of calculating the %age of CPU used by a process.
227  */
228 #define	CCPU_SHIFT	11
229 
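/*
 * Illustrative note: schedcpu() below multiplies p_pctcpu by
 * ccpu = exp(-1/20) (in fixed point) once per second, so after 60
 * seconds an old contribution has been scaled by exp(-60/20) =
 * exp(-3) =~ 0.05; that is where the "decay 95% ... in 60 seconds"
 * figure above comes from.
 */
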
230 /*
231  * Recompute process priorities once every hz ticks.
232  * MP-safe, called without the Giant mutex.
233  */
234 /* ARGSUSED */
235 static void
236 schedcpu(arg)
237 	void *arg;
238 {
239 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
240 	register struct proc *p;
241 	register int realstathz, s;
242 
243 	realstathz = stathz ? stathz : hz;
244 	sx_slock(&allproc_lock);
245 	LIST_FOREACH(p, &allproc, p_list) {
246 		/*
247 		 * Increment time in/out of memory and sleep time
248 		 * (if sleeping).  We ignore overflow; with 16-bit int's
249 		 * (remember them?) overflow takes 45 days.
250 		if (p->p_stat == SWAIT)
251 			continue;
252 		 */
253 		mtx_lock_spin(&sched_lock);
254 		p->p_swtime++;
255 		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
256 			p->p_slptime++;
257 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
258 		/*
259 		 * If the process has slept the entire second,
260 		 * stop recalculating its priority until it wakes up.
261 		 */
262 		if (p->p_slptime > 1) {
263 			mtx_unlock_spin(&sched_lock);
264 			continue;
265 		}
266 
267 		/*
268 		 * prevent state changes and protect run queue
269 		 */
270 		s = splhigh();
271 
272 		/*
273 		 * p_pctcpu is only for ps.
274 		 */
275 #if	(FSHIFT >= CCPU_SHIFT)
276 		p->p_pctcpu += (realstathz == 100)?
277 			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
278 			100 * (((fixpt_t) p->p_cpticks)
279 				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
280 #else
281 		p->p_pctcpu += ((FSCALE - ccpu) *
282 			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
283 #endif
284 		p->p_cpticks = 0;
285 		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
286 		resetpriority(p);
287 		if (p->p_pri.pri_level >= PUSER) {
288 			if ((p != curproc) &&
289 #ifdef SMP
290 			    p->p_oncpu == NOCPU && 	/* idle */
291 #endif
292 			    p->p_stat == SRUN &&
293 			    (p->p_sflag & PS_INMEM) &&
294 			    (p->p_pri.pri_level / RQ_PPQ) !=
295 			    (p->p_pri.pri_user / RQ_PPQ)) {
296 				remrunqueue(p);
297 				p->p_pri.pri_level = p->p_pri.pri_user;
298 				setrunqueue(p);
299 			} else
300 				p->p_pri.pri_level = p->p_pri.pri_user;
301 		}
302 		mtx_unlock_spin(&sched_lock);
303 		splx(s);
304 	}
305 	sx_sunlock(&allproc_lock);
306 	vmmeter();
307 	wakeup((caddr_t)&lbolt);
308 	callout_reset(&schedcpu_callout, hz, schedcpu, NULL);
309 }
310 
311 /*
312  * Recalculate the priority of a process after it has slept for a while.
313  * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
314  * least six times the loadfactor will decay p_estcpu to zero.
315  */
316 void
317 updatepri(p)
318 	register struct proc *p;
319 {
320 	register unsigned int newcpu = p->p_estcpu;
321 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
322 
323 	if (p->p_slptime > 5 * loadfac)
324 		p->p_estcpu = 0;
325 	else {
326 		p->p_slptime--;	/* the first time was done in schedcpu */
327 		while (newcpu && --p->p_slptime)
328 			newcpu = decay_cpu(loadfac, newcpu);
329 		p->p_estcpu = newcpu;
330 	}
331 	resetpriority(p);
332 }
333 
334 /*
335  * We're only looking at 7 bits of the address; everything is
336  * aligned to 4, lots of things are aligned to greater powers
337  * of 2.  Shift right by 8, i.e. drop the bottom 256 bytes' worth.
338  */
339 #define TABLESIZE	128
340 static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
341 #define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
342 
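/*
 * Illustrative example with hypothetical wait channels: addresses
 * 0xc0a01200 and 0xc0a01300 hash to (0xc0a01200 >> 8) & 127 = 0x12 and
 * (0xc0a01300 >> 8) & 127 = 0x13, i.e. distinct buckets, while two
 * channels that differ only in their low 8 bits share a bucket and are
 * told apart by the p_wchan comparison in wakeup() and wakeup_one().
 */
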
343 void
344 sleepinit(void)
345 {
346 	int i;
347 
348 	sched_quantum = hz/10;
349 	hogticks = 2 * sched_quantum;
350 	for (i = 0; i < TABLESIZE; i++)
351 		TAILQ_INIT(&slpque[i]);
352 }
353 
354 /*
355  * General sleep call.  Suspends the current process until a wakeup is
356  * performed on the specified identifier.  The process will then be made
357  * runnable with the specified priority.  Sleeps at most timo/hz seconds
358  * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
359  * before and after sleeping, else signals are not checked.  Returns 0 if
360  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
361  * signal needs to be delivered, ERESTART is returned if the current system
362  * call should be restarted if possible, and EINTR is returned if the system
363  * call should be interrupted by the signal (return EINTR).
364  *
365  * The mutex argument is exited before the caller is suspended, and
366  * entered before msleep returns.  If priority includes the PDROP
367  * flag the mutex is not entered before returning.
368  */
369 int
370 msleep(ident, mtx, priority, wmesg, timo)
371 	void *ident;
372 	struct mtx *mtx;
373 	int priority, timo;
374 	const char *wmesg;
375 {
376 	struct proc *p = curproc;
377 	int sig, catch = priority & PCATCH;
378 	int rval = 0;
379 	WITNESS_SAVE_DECL(mtx);
380 
381 #ifdef KTRACE
382 	if (p && KTRPOINT(p, KTR_CSW))
383 		ktrcsw(p->p_tracep, 1, 0);
384 #endif
385 	WITNESS_SLEEP(0, &mtx->mtx_object);
386 	KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL,
387 	    ("sleeping without a mutex"));
388 	mtx_lock_spin(&sched_lock);
389 	if (cold || panicstr) {
390 		/*
391 		 * After a panic, or during autoconfiguration,
392 		 * just give interrupts a chance, then just return;
393 		 * don't run any other procs or panic below,
394 		 * in case this is the idle process and already asleep.
395 		 */
396 		if (mtx != NULL && priority & PDROP)
397 			mtx_unlock_flags(mtx, MTX_NOSWITCH);
398 		mtx_unlock_spin(&sched_lock);
399 		return (0);
400 	}
401 
402 	DROP_GIANT_NOSWITCH();
403 
404 	if (mtx != NULL) {
405 		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
406 		WITNESS_SAVE(&mtx->mtx_object, mtx);
407 		mtx_unlock_flags(mtx, MTX_NOSWITCH);
408 		if (priority & PDROP)
409 			mtx = NULL;
410 	}
411 
412 	KASSERT(p != NULL, ("msleep1"));
413 	KASSERT(ident != NULL && p->p_stat == SRUN, ("msleep"));
414 	/*
415 	 * Process may be sitting on a slpque if asleep() was called, remove
416 	 * it before re-adding.
417 	 */
418 	if (p->p_wchan != NULL)
419 		unsleep(p);
420 
421 	p->p_wchan = ident;
422 	p->p_wmesg = wmesg;
423 	p->p_slptime = 0;
424 	p->p_pri.pri_level = priority & PRIMASK;
425 	CTR3(KTR_PROC, "msleep: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm);
426 	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
427 	if (timo)
428 		callout_reset(&p->p_slpcallout, timo, endtsleep, p);
429 	/*
430 	 * We put ourselves on the sleep queue and start our timeout
431 	 * before calling CURSIG, as we could stop there, and a wakeup
432 	 * or a SIGCONT (or both) could occur while we were stopped.
433 	 * A SIGCONT would cause us to be marked as SSLEEP
434 	 * without resuming us, thus we must be ready for sleep
435 	 * when CURSIG is called.  If the wakeup happens while we're
436 	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
437 	 */
438 	if (catch) {
439 		CTR3(KTR_PROC, "msleep caught: proc %p (pid %d, %s)", p,
440 		    p->p_pid, p->p_comm);
441 		p->p_sflag |= PS_SINTR;
442 		mtx_unlock_spin(&sched_lock);
443 		PROC_LOCK(p);
444 		sig = CURSIG(p);
445 		mtx_lock_spin(&sched_lock);
446 		PROC_UNLOCK_NOSWITCH(p);
447 		if (sig != 0) {
448 			if (p->p_wchan)
449 				unsleep(p);
450 		} else if (p->p_wchan == NULL)
451 			catch = 0;
452 	} else
453 		sig = 0;
454 	if (p->p_wchan != NULL) {
455 		p->p_stat = SSLEEP;
456 		p->p_stats->p_ru.ru_nvcsw++;
457 		mi_switch();
458 	}
459 	CTR3(KTR_PROC, "msleep resume: proc %p (pid %d, %s)", p, p->p_pid,
460 	    p->p_comm);
461 	KASSERT(p->p_stat == SRUN, ("running but not SRUN"));
462 	p->p_sflag &= ~PS_SINTR;
463 	if (p->p_sflag & PS_TIMEOUT) {
464 		p->p_sflag &= ~PS_TIMEOUT;
465 		if (sig == 0)
466 			rval = EWOULDBLOCK;
467 	} else if (timo)
468 		callout_stop(&p->p_slpcallout);
469 	mtx_unlock_spin(&sched_lock);
470 
471 	if (rval == 0 && catch) {
472 		PROC_LOCK(p);
473 		/* XXX: shouldn't we always be calling CURSIG()? */
474 		if (sig != 0 || (sig = CURSIG(p))) {
475 			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
476 				rval = EINTR;
477 			else
478 				rval = ERESTART;
479 		}
480 		PROC_UNLOCK(p);
481 	}
482 	PICKUP_GIANT();
483 #ifdef KTRACE
484 	mtx_lock(&Giant);
485 	if (KTRPOINT(p, KTR_CSW))
486 		ktrcsw(p->p_tracep, 0, 0);
487 	mtx_unlock(&Giant);
488 #endif
489 	if (mtx != NULL) {
490 		mtx_lock(mtx);
491 		WITNESS_RESTORE(&mtx->mtx_object, mtx);
492 	}
493 	return (rval);
494 }
495 
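/*
 * Illustrative usage sketch (the softc and its fields are hypothetical,
 * not part of this file): a typical caller holds a mutex protecting its
 * wait condition, rechecks the condition in a loop because a wakeup
 * does not guarantee that it still holds, and lets msleep() drop and
 * reacquire the mutex around the sleep:
 *
 *	mtx_lock(&sc->sc_mtx);
 *	while (!sc->sc_ready) {
 *		error = msleep(&sc->sc_ready, &sc->sc_mtx,
 *		    PZERO | PCATCH, "rdywait", hz);
 *		if (error != 0 && error != EWOULDBLOCK)
 *			break;
 *	}
 *	mtx_unlock(&sc->sc_mtx);
 *
 * An error other than EWOULDBLOCK above would be EINTR or ERESTART,
 * delivered because of PCATCH.  The waker sets sc_ready while holding
 * the same mutex and then calls wakeup(&sc->sc_ready).
 */
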
496 /*
497  * asleep() - async sleep call.  Place process on wait queue and return
498  * immediately without blocking.  The process stays runnable until mawait()
499  * is called.  If ident is NULL, remove process from wait queue if it is still
500  * on one.
501  *
502  * Only the most recent sleep condition is effective when making successive
503  * calls to asleep() or when calling msleep().
504  *
505  * The timeout, if any, is not initiated until mawait() is called.  The sleep
506  * priority, signal, and timeout are specified in the asleep() call but may be
507  * overridden in the mawait() call.
508  *
509  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
510  */
511 
512 int
513 asleep(void *ident, int priority, const char *wmesg, int timo)
514 {
515 	struct proc *p = curproc;
516 
517 	/*
518 	 * Remove preexisting wait condition (if any) and place process
519 	 * on appropriate slpque, but do not put process to sleep.
520 	 */
521 
522 	mtx_lock_spin(&sched_lock);
523 
524 	if (p->p_wchan != NULL)
525 		unsleep(p);
526 
527 	if (ident) {
528 		p->p_wchan = ident;
529 		p->p_wmesg = wmesg;
530 		p->p_slptime = 0;
531 		p->p_asleep.as_priority = priority;
532 		p->p_asleep.as_timo = timo;
533 		TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
534 	}
535 
536 	mtx_unlock_spin(&sched_lock);
537 
538 	return (0);
539 }
540 
541 /*
542  * mawait() - wait for async condition to occur.   The process blocks until
543  * wakeup() is called on the most recent asleep() address.  If wakeup is called
544  * prior to mawait(), mawait() winds up being a NOP.
545  *
546  * If mawait() is called more than once (without an intervening asleep() call),
547  * mawait() is still effectively a NOP but it calls mi_switch() to give other
548  * processes some cpu before returning.  The process is left runnable.
549  *
550  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
551  */
552 
553 int
554 mawait(struct mtx *mtx, int priority, int timo)
555 {
556 	struct proc *p = curproc;
557 	int rval = 0;
558 	WITNESS_SAVE_DECL(mtx);
559 
560 	WITNESS_SLEEP(0, &mtx->mtx_object);
561 	KASSERT(timo > 0 || mtx_owned(&Giant) || mtx != NULL,
562 	    ("sleeping without a mutex"));
563 	mtx_lock_spin(&sched_lock);
564 	DROP_GIANT_NOSWITCH();
565 	if (mtx != NULL) {
566 		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
567 		WITNESS_SAVE(&mtx->mtx_object, mtx);
568 		mtx_unlock_flags(mtx, MTX_NOSWITCH);
569 		if (priority & PDROP)
570 			mtx = NULL;
571 	}
572 
573 	if (p->p_wchan != NULL) {
574 		int sig;
575 		int catch;
576 
577 #ifdef KTRACE
578 		if (p && KTRPOINT(p, KTR_CSW))
579 			ktrcsw(p->p_tracep, 1, 0);
580 #endif
581 		/*
582 		 * The call to mawait() can override defaults specified in
583 		 * the original asleep().
584 		 */
585 		if (priority < 0)
586 			priority = p->p_asleep.as_priority;
587 		if (timo < 0)
588 			timo = p->p_asleep.as_timo;
589 
590 		/*
591 		 * Install timeout
592 		 */
593 
594 		if (timo)
595 			callout_reset(&p->p_slpcallout, timo, endtsleep, p);
596 
597 		sig = 0;
598 		catch = priority & PCATCH;
599 
600 		if (catch) {
601 			p->p_sflag |= PS_SINTR;
602 			mtx_unlock_spin(&sched_lock);
603 			PROC_LOCK(p);
604 			sig = CURSIG(p);
605 			mtx_lock_spin(&sched_lock);
606 			PROC_UNLOCK_NOSWITCH(p);
607 			if (sig != 0) {
608 				if (p->p_wchan)
609 					unsleep(p);
610 			} else if (p->p_wchan == NULL)
611 				catch = 0;
612 		}
613 		if (p->p_wchan != NULL) {
614 			p->p_stat = SSLEEP;
615 			p->p_stats->p_ru.ru_nvcsw++;
616 			mi_switch();
617 		}
618 		KASSERT(p->p_stat == SRUN, ("running but not SRUN"));
619 		p->p_sflag &= ~PS_SINTR;
620 		if (p->p_sflag & PS_TIMEOUT) {
621 			p->p_sflag &= ~PS_TIMEOUT;
622 			if (sig == 0)
623 				rval = EWOULDBLOCK;
624 		} else if (timo)
625 			callout_stop(&p->p_slpcallout);
626 		mtx_unlock_spin(&sched_lock);
627 		if (rval == 0 && catch) {
628 			PROC_LOCK(p);
629 			if (sig != 0 || (sig = CURSIG(p))) {
630 				if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
631 					rval = EINTR;
632 				else
633 					rval = ERESTART;
634 			}
635 			PROC_UNLOCK(p);
636 		}
637 #ifdef KTRACE
638 		mtx_lock(&Giant);
639 		if (KTRPOINT(p, KTR_CSW))
640 			ktrcsw(p->p_tracep, 0, 0);
641 		mtx_unlock(&Giant);
642 #endif
643 	} else {
644 		/*
645 		 * If as_priority is 0, mawait() has been called without an
646 		 * intervening asleep().  We are still effectively a NOP,
647 		 * but we call mi_switch() for safety.
648 		 */
649 
650 		if (p->p_asleep.as_priority == 0) {
651 			p->p_stats->p_ru.ru_nvcsw++;
652 			mi_switch();
653 		}
654 		mtx_unlock_spin(&sched_lock);
655 	}
656 
657 	/*
658 	 * clear p_asleep.as_priority as an indication that mawait() has been
659 	 * called.  If mawait() is called again without an intervening asleep(),
660 	 * mawait() is still effectively a NOP but the above mi_switch() code
661 	 * is triggered as a safety.
662 	 */
663 	if (rval == 0)
664 		p->p_asleep.as_priority = 0;
665 
666 	PICKUP_GIANT();
667 	if (mtx != NULL) {
668 		mtx_lock(mtx);
669 		WITNESS_RESTORE(&mtx->mtx_object, mtx);
670 	}
671 	return (rval);
672 }
673 
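/*
 * Illustrative asleep()/mawait() sketch (hypothetical caller; note the
 * EXPERIMENTAL warning above): the point of the pair is that the wait
 * condition is registered before further work is started, so a wakeup()
 * that arrives in between is not lost -- it simply turns the later
 * mawait() into a NOP:
 *
 *	asleep(&sc->sc_event, PZERO, "evwait", hz);
 *	error = start_request(sc);
 *	if (error == 0)
 *		error = mawait(NULL, -1, -1);
 *
 * Passing -1 for priority and timo makes mawait() fall back to the
 * values recorded by asleep(), as handled above.  The sketch assumes
 * the caller still holds Giant (as most code of this era does), which
 * satisfies the "sleeping without a mutex" assertion in mawait().
 */
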
674 /*
675  * Implement the timeout for msleep() or asleep()/mawait().
676  *
677  * If process hasn't been awakened (wchan non-zero),
678  * set timeout flag and undo the sleep.  If proc
679  * is stopped, just unsleep so it will remain stopped.
680  * MP-safe, called without the Giant mutex.
681  */
682 static void
683 endtsleep(arg)
684 	void *arg;
685 {
686 	register struct proc *p;
687 	int s;
688 
689 	p = (struct proc *)arg;
690 	CTR3(KTR_PROC, "endtsleep: proc %p (pid %d, %s)", p, p->p_pid,
691 	    p->p_comm);
692 	s = splhigh();
693 	mtx_lock_spin(&sched_lock);
694 	if (p->p_wchan) {
695 		if (p->p_stat == SSLEEP)
696 			setrunnable(p);
697 		else
698 			unsleep(p);
699 		p->p_sflag |= PS_TIMEOUT;
700 	}
701 	mtx_unlock_spin(&sched_lock);
702 	splx(s);
703 }
704 
705 /*
706  * Remove a process from its wait queue
707  */
708 void
709 unsleep(p)
710 	register struct proc *p;
711 {
712 	int s;
713 
714 	s = splhigh();
715 	mtx_lock_spin(&sched_lock);
716 	if (p->p_wchan) {
717 		TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq);
718 		p->p_wchan = NULL;
719 	}
720 	mtx_unlock_spin(&sched_lock);
721 	splx(s);
722 }
723 
724 /*
725  * Make all processes sleeping on the specified identifier runnable.
726  */
727 void
728 wakeup(ident)
729 	register void *ident;
730 {
731 	register struct slpquehead *qp;
732 	register struct proc *p;
733 	int s;
734 
735 	s = splhigh();
736 	mtx_lock_spin(&sched_lock);
737 	qp = &slpque[LOOKUP(ident)];
738 restart:
739 	TAILQ_FOREACH(p, qp, p_slpq) {
740 		if (p->p_wchan == ident) {
741 			TAILQ_REMOVE(qp, p, p_slpq);
742 			p->p_wchan = NULL;
743 			if (p->p_stat == SSLEEP) {
744 				/* OPTIMIZED EXPANSION OF setrunnable(p); */
745 				CTR3(KTR_PROC, "wakeup: proc %p (pid %d, %s)",
746 				    p, p->p_pid, p->p_comm);
747 				if (p->p_slptime > 1)
748 					updatepri(p);
749 				p->p_slptime = 0;
750 				p->p_stat = SRUN;
751 				if (p->p_sflag & PS_INMEM) {
752 					setrunqueue(p);
753 					maybe_resched(p);
754 				} else {
755 					p->p_sflag |= PS_SWAPINREQ;
756 					wakeup((caddr_t)&proc0);
757 				}
758 				/* END INLINE EXPANSION */
759 				goto restart;
760 			}
761 		}
762 	}
763 	mtx_unlock_spin(&sched_lock);
764 	splx(s);
765 }
766 
767 /*
768  * Make a process sleeping on the specified identifier runnable.
769  * May wake more than one process if a target process is currently
770  * swapped out.
771  */
772 void
773 wakeup_one(ident)
774 	register void *ident;
775 {
776 	register struct slpquehead *qp;
777 	register struct proc *p;
778 	int s;
779 
780 	s = splhigh();
781 	mtx_lock_spin(&sched_lock);
782 	qp = &slpque[LOOKUP(ident)];
783 
784 	TAILQ_FOREACH(p, qp, p_slpq) {
785 		if (p->p_wchan == ident) {
786 			TAILQ_REMOVE(qp, p, p_slpq);
787 			p->p_wchan = NULL;
788 			if (p->p_stat == SSLEEP) {
789 				/* OPTIMIZED EXPANSION OF setrunnable(p); */
790 				CTR3(KTR_PROC, "wakeup1: proc %p (pid %d, %s)",
791 				    p, p->p_pid, p->p_comm);
792 				if (p->p_slptime > 1)
793 					updatepri(p);
794 				p->p_slptime = 0;
795 				p->p_stat = SRUN;
796 				if (p->p_sflag & PS_INMEM) {
797 					setrunqueue(p);
798 					maybe_resched(p);
799 					break;
800 				} else {
801 					p->p_sflag |= PS_SWAPINREQ;
802 					wakeup((caddr_t)&proc0);
803 				}
804 				/* END INLINE EXPANSION */
805 			}
806 		}
807 	}
808 	mtx_unlock_spin(&sched_lock);
809 	splx(s);
810 }
811 
812 /*
813  * The machine independent parts of mi_switch().
814  * Must be called at splstatclock() or higher.
815  */
816 void
817 mi_switch()
818 {
819 	struct timeval new_switchtime;
820 	register struct proc *p = curproc;	/* XXX */
821 #if 0
822 	register struct rlimit *rlim;
823 #endif
824 	critical_t sched_crit;
825 	u_int sched_nest;
826 
827 	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
828 
829 	/*
830 	 * Compute the amount of time during which the current
831 	 * process was running, and add that to its total so far.
832 	 */
833 	microuptime(&new_switchtime);
834 	if (timevalcmp(&new_switchtime, PCPU_PTR(switchtime), <)) {
835 #if 0
836 		/* XXX: This doesn't play well with sched_lock right now. */
837 		printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
838 		    PCPU_GET(switchtime.tv_sec), PCPU_GET(switchtime.tv_usec),
839 		    new_switchtime.tv_sec, new_switchtime.tv_usec);
840 #endif
841 		new_switchtime = PCPU_GET(switchtime);
842 	} else {
843 		p->p_runtime += (new_switchtime.tv_usec - PCPU_GET(switchtime.tv_usec)) +
844 		    (new_switchtime.tv_sec - PCPU_GET(switchtime.tv_sec)) *
845 		    (int64_t)1000000;
846 	}
847 
848 #if 0
849 	/*
850 	 * Check if the process exceeds its cpu resource allocation.
851 	 * If over max, kill it.
852 	 *
853 	 * XXX drop sched_lock, pickup Giant
854 	 */
855 	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
856 	    p->p_runtime > p->p_limit->p_cpulimit) {
857 		rlim = &p->p_rlimit[RLIMIT_CPU];
858 		if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
859 			mtx_unlock_spin(&sched_lock);
860 			PROC_LOCK(p);
861 			killproc(p, "exceeded maximum CPU limit");
862 			mtx_lock_spin(&sched_lock);
863 			PROC_UNLOCK_NOSWITCH(p);
864 		} else {
865 			mtx_unlock_spin(&sched_lock);
866 			PROC_LOCK(p);
867 			psignal(p, SIGXCPU);
868 			mtx_lock_spin(&sched_lock);
869 			PROC_UNLOCK_NOSWITCH(p);
870 			if (rlim->rlim_cur < rlim->rlim_max) {
871 				/* XXX: we should make a private copy */
872 				rlim->rlim_cur += 5;
873 			}
874 		}
875 	}
876 #endif
877 
878 	/*
879 	 * Pick a new current process and record its start time.
880 	 */
881 	cnt.v_swtch++;
882 	PCPU_SET(switchtime, new_switchtime);
883 	CTR3(KTR_PROC, "mi_switch: old proc %p (pid %d, %s)", p, p->p_pid,
884 	    p->p_comm);
885 	sched_crit = sched_lock.mtx_savecrit;
886 	sched_nest = sched_lock.mtx_recurse;
887 	curproc->p_lastcpu = curproc->p_oncpu;
888 	curproc->p_oncpu = NOCPU;
889 	clear_resched(curproc);
890 	cpu_switch();
891 	curproc->p_oncpu = PCPU_GET(cpuid);
892 	sched_lock.mtx_savecrit = sched_crit;
893 	sched_lock.mtx_recurse = sched_nest;
894 	sched_lock.mtx_lock = (uintptr_t)curproc;
895 	CTR3(KTR_PROC, "mi_switch: new proc %p (pid %d, %s)", p, p->p_pid,
896 	    p->p_comm);
897 	if (PCPU_GET(switchtime.tv_sec) == 0)
898 		microuptime(PCPU_PTR(switchtime));
899 	PCPU_SET(switchticks, ticks);
900 }
901 
902 /*
903  * Change process state to be runnable,
904  * placing it on the run queue if it is in memory,
905  * and awakening the swapper if it isn't in memory.
906  */
907 void
908 setrunnable(p)
909 	register struct proc *p;
910 {
911 	register int s;
912 
913 	s = splhigh();
914 	mtx_lock_spin(&sched_lock);
915 	switch (p->p_stat) {
916 	case 0:
917 	case SRUN:
918 	case SZOMB:
919 	case SWAIT:
920 	default:
921 		panic("setrunnable");
922 	case SSTOP:
923 	case SSLEEP:			/* e.g. when sending signals */
924 		if (p->p_sflag & PS_CVWAITQ)
925 			cv_waitq_remove(p);
926 		else
927 			unsleep(p);
928 		break;
929 
930 	case SIDL:
931 		break;
932 	}
933 	p->p_stat = SRUN;
934 	if (p->p_sflag & PS_INMEM)
935 		setrunqueue(p);
936 	splx(s);
937 	if (p->p_slptime > 1)
938 		updatepri(p);
939 	p->p_slptime = 0;
940 	if ((p->p_sflag & PS_INMEM) == 0) {
941 		p->p_sflag |= PS_SWAPINREQ;
942 		wakeup((caddr_t)&proc0);
943 	}
944 	else
945 		maybe_resched(p);
946 	mtx_unlock_spin(&sched_lock);
947 }
948 
949 /*
950  * Compute the priority of a process when running in user mode.
951  * Arrange to reschedule if the resulting priority is better
952  * than that of the current process.
953  */
954 void
955 resetpriority(p)
956 	register struct proc *p;
957 {
958 	register unsigned int newpriority;
959 
960 	mtx_lock_spin(&sched_lock);
961 	if (p->p_pri.pri_class == PRI_TIMESHARE) {
962 		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
963 		    NICE_WEIGHT * (p->p_nice - PRIO_MIN);
964 		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
965 		    PRI_MAX_TIMESHARE);
966 		p->p_pri.pri_user = newpriority;
967 	}
968 	maybe_resched(p);
969 	mtx_unlock_spin(&sched_lock);
970 }
971 
972 /* ARGSUSED */
973 static void
974 sched_setup(dummy)
975 	void *dummy;
976 {
977 
978 	callout_init(&schedcpu_callout, 1);
979 	callout_init(&roundrobin_callout, 0);
980 
981 	/* Kick off timeout driven events by calling first time. */
982 	roundrobin(NULL);
983 	schedcpu(NULL);
984 }
985 
986 /*
987  * We adjust the priority of the current process.  The priority of
988  * a process gets worse as it accumulates CPU time.  The cpu usage
989  * estimator (p_estcpu) is increased here.  resetpriority() will
990  * compute a different priority each time p_estcpu increases by
991  * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached).  The cpu usage
992  * estimator ramps up
993  * quite quickly when the process is running (linearly), and decays
994  * away exponentially, at a rate which is proportionally slower when
995  * the system is busy.  The basic principle is that the system will
996  * 90% forget that the process used a lot of CPU time in 5 * loadav
997  * seconds.  This causes the system to favor processes which haven't
998  * run much recently, and to round-robin among other processes.
999  */
1000 void
1001 schedclock(p)
1002 	struct proc *p;
1003 {
1004 
1005 	p->p_cpticks++;
1006 	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
1007 	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
1008 		resetpriority(p);
1009 		if (p->p_pri.pri_level >= PUSER)
1010 			p->p_pri.pri_level = p->p_pri.pri_user;
1011 	}
1012 }
1013 
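/*
 * Illustrative note: since schedclock() adds one to p_estcpu for every
 * statclock tick the process runs, the user priority computed by
 * resetpriority() above worsens by one level for each additional
 * INVERSE_ESTCPU_WEIGHT ticks of CPU time (INVERSE_ESTCPU_WEIGHT is
 * defined in a header outside this file), until ESTCPULIM() caps
 * p_estcpu or the decay in schedcpu() catches up.
 */
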
1014 /*
1015  * General purpose yield system call
1016  */
1017 int
1018 yield(struct proc *p, struct yield_args *uap)
1019 {
1020 	int s;
1021 
1022 	p->p_retval[0] = 0;
1023 
1024 	s = splhigh();
1025 	mtx_lock_spin(&sched_lock);
1026 	DROP_GIANT_NOSWITCH();
1027 	p->p_pri.pri_level = PRI_MAX_TIMESHARE;
1028 	setrunqueue(p);
1029 	p->p_stats->p_ru.ru_nvcsw++;
1030 	mi_switch();
1031 	mtx_unlock_spin(&sched_lock);
1032 	PICKUP_GIANT();
1033 	splx(s);
1034 
1035 	return (0);
1036 }
1037