1 /*-
2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/condvar.h>
47 #include <sys/ipl.h>
48 #include <sys/kernel.h>
49 #include <sys/ktr.h>
50 #include <sys/lock.h>
51 #include <sys/mutex.h>
52 #include <sys/proc.h>
53 #include <sys/resourcevar.h>
54 #include <sys/signalvar.h>
55 #include <sys/smp.h>
56 #include <sys/sx.h>
57 #include <sys/sysctl.h>
58 #include <sys/sysproto.h>
59 #include <sys/vmmeter.h>
60 #include <vm/vm.h>
61 #include <vm/vm_extern.h>
62 #ifdef KTRACE
63 #include <sys/uio.h>
64 #include <sys/ktrace.h>
65 #endif
66 
67 #include <machine/cpu.h>
68 
69 static void sched_setup __P((void *dummy));
70 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
71 
72 int	hogticks;
73 int	lbolt;
74 int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */
75 
76 static struct callout schedcpu_callout;
77 static struct callout roundrobin_callout;
78 
79 static void	endtsleep __P((void *));
80 static void	roundrobin __P((void *arg));
81 static void	schedcpu __P((void *arg));
82 
83 static int
84 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
85 {
86 	int error, new_val;
87 
88 	new_val = sched_quantum * tick;
89 	error = sysctl_handle_int(oidp, &new_val, 0, req);
90 	if (error != 0 || req->newptr == NULL)
91 		return (error);
92 	if (new_val < tick)
93 		return (EINVAL);
94 	sched_quantum = new_val / tick;
95 	hogticks = 2 * sched_quantum;
96 	return (0);
97 }
98 
99 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
100 	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
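
/*
 * Illustrative only: how the kern.quantum sysctl behaves with the handler
 * above.  The numbers assume hz == 100, so tick (one hz tick expressed in
 * microseconds) is 10000; neither value is guaranteed by this file.
 *
 *	# sysctl kern.quantum
 *	kern.quantum: 100000			(sched_quantum = hz/10 = 10 ticks)
 *	# sysctl kern.quantum=50000
 *	kern.quantum: 100000 -> 50000		(sched_quantum becomes 5, hogticks 10)
 *	# sysctl kern.quantum=5000
 *	(rejected with EINVAL: the new value is less than one tick)
 */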
101 
102 /*
103  * Arrange to reschedule if necessary, taking the priorities and
104  * schedulers into account.
105  */
106 void
107 maybe_resched(p)
108 	struct proc *p;
109 {
110 
111 	mtx_assert(&sched_lock, MA_OWNED);
112 	if (p->p_pri.pri_level < curproc->p_pri.pri_level)
113 		need_resched(curproc);
114 }
115 
116 int
117 roundrobin_interval(void)
118 {
119 	return (sched_quantum);
120 }
121 
122 /*
123  * Force switch among equal priority processes every 100ms.
124  */
125 /* ARGSUSED */
126 static void
127 roundrobin(arg)
128 	void *arg;
129 {
130 
131 	mtx_lock_spin(&sched_lock);
132 	need_resched(curproc);
133 #ifdef SMP
134 	forward_roundrobin();
135 #endif
136 	mtx_unlock_spin(&sched_lock);
137 
138 	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
139 }
140 
141 /*
142  * Constants for digital decay and forget:
143  *	90% of (p_estcpu) usage in 5 * loadav time
144  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
145  *          Note that, as ps(1) mentions, this can let percentages
146  *          total over 100% (I've seen 137.9% for 3 processes).
147  *
148  * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
149  *
150  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
151  * That is, the system wants to compute a value of decay such
152  * that the following for loop:
153  * 	for (i = 0; i < (5 * loadavg); i++)
154  * 		p_estcpu *= decay;
155  * will compute
156  * 	p_estcpu *= 0.1;
157  * for all values of loadavg:
158  *
159  * Mathematically this loop can be expressed by saying:
160  * 	decay ** (5 * loadavg) ~= .1
161  *
162  * The system computes decay as:
163  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
164  *
165  * We wish to prove that the system's computation of decay
166  * will always fulfill the equation:
167  * 	decay ** (5 * loadavg) ~= .1
168  *
169  * If we compute b as:
170  * 	b = 2 * loadavg
171  * then
172  * 	decay = b / (b + 1)
173  *
174  * We now need to prove two things:
175  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
176  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
177  *
178  * Facts:
179  *         For x close to zero, exp(x) =~ 1 + x, since
180  *              exp(x) = x**0/0! + x**1/1! + x**2/2! + ... .
181  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
182  *         For x close to zero, ln(1+x) =~ x, since
183  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
184  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
185  *         ln(.1) =~ -2.30
186  *
187  * Proof of (1):
188  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
189  *	solving for factor,
190  *      ln(factor) =~ (-2.30/5*loadav), or
191  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
192  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
193  *
194  * Proof of (2):
195  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
196  *	solving for power,
197  *      power*ln(b/(b+1)) =~ -2.30, or
198  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
199  *
200  * Actual power values for the implemented algorithm are as follows:
201  *      loadav: 1       2       3       4
202  *      power:  5.68    10.32   14.94   19.55
203  */
204 
205 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
206 #define	loadfactor(loadav)	(2 * (loadav))
207 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
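
/*
 * Worked example (illustrative only): with a load average of 1.0,
 * ldavg[0] == FSCALE, so loadfactor() yields 2*FSCALE and decay_cpu()
 * multiplies by 2*FSCALE / (2*FSCALE + FSCALE) == 2/3 each second.
 * Applying that factor ~5.68 times cuts p_estcpu by a factor of ten
 * ((2/3)**5.68 ~= 0.1), matching the loadav == 1 entry in the power table
 * above.  Concretely, p_estcpu == 90 decays 90 -> 60 -> 40 -> 26 -> 17 -> 11
 * over five idle seconds (integer arithmetic).
 */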
208 
209 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
210 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
211 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
212 
213 /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
214 static int	fscale __unused = FSCALE;
215 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
216 
217 /*
218  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
219  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
220  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
221  *
222  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
223  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
224  *
225  * If you don't want to bother with the faster/more-accurate formula, you
226  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
227  * (more general) method of calculating the %age of CPU used by a process.
228  */
229 #define	CCPU_SHIFT	11
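
/*
 * Rough numbers, assuming the stock FSHIFT of 11 (FSCALE == 2048; check
 * "param.h" before relying on this): ccpu above works out to about
 * 0.95123 * 2048 ~= 1948, so schedcpu() scales p_pctcpu by 1948/2048 once
 * per second.  Sixty such steps leave exp(-60/20) == exp(-3) ~= 5% of the
 * original value, i.e. the 95%-in-60-seconds decay promised above.
 */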
230 
231 /*
232  * Recompute process priorities, every hz ticks.
233  * MP-safe, called without the Giant mutex.
234  */
235 /* ARGSUSED */
236 static void
237 schedcpu(arg)
238 	void *arg;
239 {
240 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
241 	register struct proc *p;
242 	register int realstathz, s;
243 
244 	realstathz = stathz ? stathz : hz;
245 	sx_slock(&allproc_lock);
246 	LIST_FOREACH(p, &allproc, p_list) {
247 		/*
248 		 * Increment time in/out of memory and sleep time
249 		 * (if sleeping).  We ignore overflow; with 16-bit int's
250 		 * (remember them?) overflow takes 45 days.
251 		if (p->p_stat == SWAIT)
252 			continue;
253 		 */
254 		mtx_lock_spin(&sched_lock);
255 		p->p_swtime++;
256 		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
257 			p->p_slptime++;
258 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
259 		/*
260 		 * If the process has slept the entire second,
261 		 * stop recalculating its priority until it wakes up.
262 		 */
263 		if (p->p_slptime > 1) {
264 			mtx_unlock_spin(&sched_lock);
265 			continue;
266 		}
267 
268 		/*
269 		 * prevent state changes and protect run queue
270 		 */
271 		s = splhigh();
272 
273 		/*
274 		 * p_pctcpu is only for ps.
275 		 */
276 #if	(FSHIFT >= CCPU_SHIFT)
277 		p->p_pctcpu += (realstathz == 100)?
278 			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
279 			100 * (((fixpt_t) p->p_cpticks)
280 				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
281 #else
282 		p->p_pctcpu += ((FSCALE - ccpu) *
283 			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
284 #endif
285 		p->p_cpticks = 0;
286 		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
287 		resetpriority(p);
288 		if (p->p_pri.pri_level >= PUSER) {
289 			if ((p != curproc) &&
290 #ifdef SMP
291 			    p->p_oncpu == NOCPU && 	/* idle */
292 #endif
293 			    p->p_stat == SRUN &&
294 			    (p->p_sflag & PS_INMEM) &&
295 			    (p->p_pri.pri_level / RQ_PPQ) !=
296 			    (p->p_pri.pri_user / RQ_PPQ)) {
297 				remrunqueue(p);
298 				p->p_pri.pri_level = p->p_pri.pri_user;
299 				setrunqueue(p);
300 			} else
301 				p->p_pri.pri_level = p->p_pri.pri_user;
302 		}
303 		mtx_unlock_spin(&sched_lock);
304 		splx(s);
305 	}
306 	sx_sunlock(&allproc_lock);
307 	vmmeter();
308 	wakeup((caddr_t)&lbolt);
309 	callout_reset(&schedcpu_callout, hz, schedcpu, NULL);
310 }
311 
312 /*
313  * Recalculate the priority of a process after it has slept for a while.
314  * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
315  * least six times the loadfactor will decay p_estcpu to zero.
316  */
317 void
318 updatepri(p)
319 	register struct proc *p;
320 {
321 	register unsigned int newcpu = p->p_estcpu;
322 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
323 
324 	if (p->p_slptime > 5 * loadfac)
325 		p->p_estcpu = 0;
326 	else {
327 		p->p_slptime--;	/* the first time was done in schedcpu */
328 		while (newcpu && --p->p_slptime)
329 			newcpu = decay_cpu(loadfac, newcpu);
330 		p->p_estcpu = newcpu;
331 	}
332 	resetpriority(p);
333 }
334 
335 /*
336  * We're only looking at 7 bits of the address; everything is
337  * aligned to 4, lots of things are aligned to greater powers
338  * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
339  */
340 #define TABLESIZE	128
341 static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
342 #define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
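
/*
 * Example (illustrative only): a wait channel of 0xc0a1b2c4 hashes to
 * ((0xc0a1b2c4 >> 8) & 127) == 0x32, i.e. slpque[50].  Two idents that lie
 * within the same 256-byte span share a bucket, which is harmless: wakeup()
 * still compares p_wchan against the exact ident.
 */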
343 
344 void
345 sleepinit(void)
346 {
347 	int i;
348 
349 	sched_quantum = hz/10;
350 	hogticks = 2 * sched_quantum;
351 	for (i = 0; i < TABLESIZE; i++)
352 		TAILQ_INIT(&slpque[i]);
353 }
354 
355 /*
356  * General sleep call.  Suspends the current process until a wakeup is
357  * performed on the specified identifier.  The process will then be made
358  * runnable with the specified priority.  Sleeps at most timo/hz seconds
359  * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
360  * before and after sleeping, else signals are not checked.  Returns 0 if
361  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
362  * signal needs to be delivered, ERESTART is returned if the current system
363  * call should be restarted if possible, and EINTR is returned if the system
364  * call should be interrupted by the signal.
365  *
366  * The mutex argument is exited before the caller is suspended, and
367  * entered before msleep returns.  If priority includes the PDROP
368  * flag the mutex is not entered before returning.
369  */
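/*
 * A minimal usage sketch (not part of this file; the foo_* names are
 * hypothetical).  The caller rechecks its predicate in a loop because a
 * wakeup only means the condition may have changed:
 *
 *	mtx_lock(&foo_mtx);
 *	while (!foo_ready)
 *		msleep(&foo_ready, &foo_mtx, PZERO, "foowt", 0);
 *	mtx_unlock(&foo_mtx);
 *
 * The producer side sets foo_ready while holding foo_mtx and then calls
 * wakeup(&foo_ready).
 */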
370 int
371 msleep(ident, mtx, priority, wmesg, timo)
372 	void *ident;
373 	struct mtx *mtx;
374 	int priority, timo;
375 	const char *wmesg;
376 {
377 	struct proc *p = curproc;
378 	int sig, catch = priority & PCATCH;
379 	int rval = 0;
380 	WITNESS_SAVE_DECL(mtx);
381 
382 #ifdef KTRACE
383 	if (p && KTRPOINT(p, KTR_CSW))
384 		ktrcsw(p->p_tracep, 1, 0);
385 #endif
386 	WITNESS_SLEEP(0, &mtx->mtx_object);
387 	mtx_lock_spin(&sched_lock);
388 	if (cold || panicstr) {
389 		/*
390 		 * After a panic, or during autoconfiguration,
391 		 * just give interrupts a chance, then just return;
392 		 * don't run any other procs or panic below,
393 		 * in case this is the idle process and already asleep.
394 		 */
395 		if (mtx != NULL && priority & PDROP)
396 			mtx_unlock_flags(mtx, MTX_NOSWITCH);
397 		mtx_unlock_spin(&sched_lock);
398 		return (0);
399 	}
400 
401 	DROP_GIANT_NOSWITCH();
402 
403 	if (mtx != NULL) {
404 		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
405 		WITNESS_SAVE(&mtx->mtx_object, mtx);
406 		mtx_unlock_flags(mtx, MTX_NOSWITCH);
407 		if (priority & PDROP)
408 			mtx = NULL;
409 	}
410 
411 	KASSERT(p != NULL, ("msleep1"));
412 	KASSERT(ident != NULL && p->p_stat == SRUN, ("msleep"));
413 	/*
414 	 * Process may be sitting on a slpque if asleep() was called, remove
415 	 * it before re-adding.
416 	 */
417 	if (p->p_wchan != NULL)
418 		unsleep(p);
419 
420 	p->p_wchan = ident;
421 	p->p_wmesg = wmesg;
422 	p->p_slptime = 0;
423 	p->p_pri.pri_level = priority & PRIMASK;
424 	CTR4(KTR_PROC, "msleep: proc %p (pid %d, %s), schedlock %p",
425 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
426 	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
427 	if (timo)
428 		callout_reset(&p->p_slpcallout, timo, endtsleep, p);
429 	/*
430 	 * We put ourselves on the sleep queue and start our timeout
431 	 * before calling CURSIG, as we could stop there, and a wakeup
432 	 * or a SIGCONT (or both) could occur while we were stopped.
433 	 * A SIGCONT would cause us to be marked as SSLEEP
434 	 * without resuming us, thus we must be ready for sleep
435 	 * when CURSIG is called.  If the wakeup happens while we're
436 	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
437 	 */
438 	if (catch) {
439 		CTR4(KTR_PROC,
440 		        "msleep caught: proc %p (pid %d, %s), schedlock %p",
441 			p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
442 		p->p_sflag |= PS_SINTR;
443 		mtx_unlock_spin(&sched_lock);
444 		if ((sig = CURSIG(p))) {
445 			mtx_lock_spin(&sched_lock);
446 			if (p->p_wchan)
447 				unsleep(p);
448 			p->p_stat = SRUN;
449 			goto resume;
450 		}
451 		mtx_lock_spin(&sched_lock);
452 		if (p->p_wchan == NULL) {
453 			catch = 0;
454 			goto resume;
455 		}
456 	} else
457 		sig = 0;
458 	p->p_stat = SSLEEP;
459 	p->p_stats->p_ru.ru_nvcsw++;
460 	mi_switch();
461 	CTR4(KTR_PROC,
462 	        "msleep resume: proc %p (pid %d, %s), schedlock %p",
463 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
464 resume:
465 	p->p_sflag &= ~PS_SINTR;
466 	if (p->p_sflag & PS_TIMEOUT) {
467 		p->p_sflag &= ~PS_TIMEOUT;
468 		if (sig == 0) {
469 #ifdef KTRACE
470 			if (KTRPOINT(p, KTR_CSW))
471 				ktrcsw(p->p_tracep, 0, 0);
472 #endif
473 			rval = EWOULDBLOCK;
474 			mtx_unlock_spin(&sched_lock);
475 			goto out;
476 		}
477 	} else if (timo)
478 		callout_stop(&p->p_slpcallout);
479 	mtx_unlock_spin(&sched_lock);
480 
481 	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
482 #ifdef KTRACE
483 		if (KTRPOINT(p, KTR_CSW))
484 			ktrcsw(p->p_tracep, 0, 0);
485 #endif
486 		PROC_LOCK(p);
487 		if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
488 			rval = EINTR;
489 		else
490 			rval = ERESTART;
491 		PROC_UNLOCK(p);
492 		goto out;
493 	}
494 out:
495 #ifdef KTRACE
496 	if (KTRPOINT(p, KTR_CSW))
497 		ktrcsw(p->p_tracep, 0, 0);
498 #endif
499 	PICKUP_GIANT();
500 	if (mtx != NULL) {
501 		mtx_lock(mtx);
502 		WITNESS_RESTORE(&mtx->mtx_object, mtx);
503 	}
504 	return (rval);
505 }
506 
507 /*
508  * asleep() - async sleep call.  Place process on wait queue and return
509  * immediately without blocking.  The process stays runnable until mawait()
510  * is called.  If ident is NULL, remove process from wait queue if it is still
511  * on one.
512  *
513  * Only the most recent sleep condition is effective when making successive
514  * calls to asleep() or when calling msleep().
515  *
516  * The timeout, if any, is not initiated until mawait() is called.  The sleep
517  * priority, signal, and timeout are specified in the asleep() call but may be
518  * overridden in the mawait() call.
519  *
520  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
521  */
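/*
 * A sketch of the intended split (illustrative only; the foo_* names are
 * hypothetical and no mutex is interlocked here):
 *
 *	asleep(&foo_done, PZERO, "fooas", hz);
 *	start_foo_io();			(a wakeup may arrive right away)
 *	error = mawait(NULL, -1, -1);	(-1/-1 keep the asleep() defaults)
 *
 * If the wakeup lands between asleep() and mawait(), mawait() returns
 * without blocking.
 */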
522 
523 int
524 asleep(void *ident, int priority, const char *wmesg, int timo)
525 {
526 	struct proc *p = curproc;
527 	int s;
528 
529 	/*
530 	 * obtain sched_lock while manipulating sleep structures and slpque.
531 	 *
532 	 * Remove preexisting wait condition (if any) and place process
533 	 * on appropriate slpque, but do not put process to sleep.
534 	 */
535 
536 	s = splhigh();
537 	mtx_lock_spin(&sched_lock);
538 
539 	if (p->p_wchan != NULL)
540 		unsleep(p);
541 
542 	if (ident) {
543 		p->p_wchan = ident;
544 		p->p_wmesg = wmesg;
545 		p->p_slptime = 0;
546 		p->p_asleep.as_priority = priority;
547 		p->p_asleep.as_timo = timo;
548 		TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
549 	}
550 
551 	mtx_unlock_spin(&sched_lock);
552 	splx(s);
553 
554 	return(0);
555 }
556 
557 /*
558  * mawait() - wait for async condition to occur.   The process blocks until
559  * wakeup() is called on the most recent asleep() address.  If wakeup is called
560  * prior to mawait(), mawait() winds up being a NOP.
561  *
562  * If mawait() is called more than once (without an intervening asleep() call),
563  * mawait() is still effectively a NOP but it calls mi_switch() to give other
564  * processes some cpu before returning.  The process is left runnable.
565  *
566  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
567  */
568 
569 int
570 mawait(struct mtx *mtx, int priority, int timo)
571 {
572 	struct proc *p = curproc;
573 	int rval = 0;
574 	int s;
575 	WITNESS_SAVE_DECL(mtx);
576 
577 	WITNESS_SLEEP(0, &mtx->mtx_object);
578 	mtx_lock_spin(&sched_lock);
579 	DROP_GIANT_NOSWITCH();
580 	if (mtx != NULL) {
581 		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
582 		WITNESS_SAVE(&mtx->mtx_object, mtx);
583 		mtx_unlock_flags(mtx, MTX_NOSWITCH);
584 		if (priority & PDROP)
585 			mtx = NULL;
586 	}
587 
588 	s = splhigh();
589 
590 	if (p->p_wchan != NULL) {
591 		int sig;
592 		int catch;
593 
594 		/*
595 		 * The call to mawait() can override defaults specified in
596 		 * the original asleep().
597 		 */
598 		if (priority < 0)
599 			priority = p->p_asleep.as_priority;
600 		if (timo < 0)
601 			timo = p->p_asleep.as_timo;
602 
603 		/*
604 		 * Install timeout
605 		 */
606 
607 		if (timo)
608 			callout_reset(&p->p_slpcallout, timo, endtsleep, p);
609 
610 		sig = 0;
611 		catch = priority & PCATCH;
612 
613 		if (catch) {
614 			p->p_sflag |= PS_SINTR;
615 			mtx_unlock_spin(&sched_lock);
616 			if ((sig = CURSIG(p))) {
617 				mtx_lock_spin(&sched_lock);
618 				if (p->p_wchan)
619 					unsleep(p);
620 				p->p_stat = SRUN;
621 				goto resume;
622 			}
623 			mtx_lock_spin(&sched_lock);
624 			if (p->p_wchan == NULL) {
625 				catch = 0;
626 				goto resume;
627 			}
628 		}
629 		p->p_stat = SSLEEP;
630 		p->p_stats->p_ru.ru_nvcsw++;
631 		mi_switch();
632 resume:
633 
634 		splx(s);
635 		p->p_sflag &= ~PS_SINTR;
636 		if (p->p_sflag & PS_TIMEOUT) {
637 			p->p_sflag &= ~PS_TIMEOUT;
638 			if (sig == 0) {
639 #ifdef KTRACE
640 				if (KTRPOINT(p, KTR_CSW))
641 					ktrcsw(p->p_tracep, 0, 0);
642 #endif
643 				rval = EWOULDBLOCK;
644 				mtx_unlock_spin(&sched_lock);
645 				goto out;
646 			}
647 		} else if (timo)
648 			callout_stop(&p->p_slpcallout);
649 		mtx_unlock_spin(&sched_lock);
650 
651 		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
652 #ifdef KTRACE
653 			if (KTRPOINT(p, KTR_CSW))
654 				ktrcsw(p->p_tracep, 0, 0);
655 #endif
656 			PROC_LOCK(p);
657 			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
658 				rval = EINTR;
659 			else
660 				rval = ERESTART;
661 			PROC_UNLOCK(p);
662 			goto out;
663 		}
664 #ifdef KTRACE
665 		if (KTRPOINT(p, KTR_CSW))
666 			ktrcsw(p->p_tracep, 0, 0);
667 #endif
668 	} else {
669 		/*
670 		 * If as_priority is 0, mawait() has been called without an
671 		 * intervening asleep().  We are still effectively a NOP,
672 		 * but we call mi_switch() for safety.
673 		 */
674 
675 		if (p->p_asleep.as_priority == 0) {
676 			p->p_stats->p_ru.ru_nvcsw++;
677 			mi_switch();
678 		}
679 		mtx_unlock_spin(&sched_lock);
680 		splx(s);
681 	}
682 
683 	/*
684 	 * clear p_asleep.as_priority as an indication that mawait() has been
685 	 * called.  If mawait() is called again without an intervening asleep(),
686 	 * mawait() is still effectively a NOP but the above mi_switch() code
687 	 * is triggered as a safety.
688 	 */
689 	p->p_asleep.as_priority = 0;
690 
691 out:
692 	PICKUP_GIANT();
693 	if (mtx != NULL) {
694 		mtx_lock(mtx);
695 		WITNESS_RESTORE(&mtx->mtx_object, mtx);
696 	}
697 	return (rval);
698 }
699 
700 /*
701  * Implement timeout for msleep or asleep()/mawait()
702  *
703  * If process hasn't been awakened (wchan non-zero),
704  * set timeout flag and undo the sleep.  If proc
705  * is stopped, just unsleep so it will remain stopped.
706  * MP-safe, called without the Giant mutex.
707  */
708 static void
709 endtsleep(arg)
710 	void *arg;
711 {
712 	register struct proc *p;
713 	int s;
714 
715 	p = (struct proc *)arg;
716 	CTR4(KTR_PROC,
717 	        "endtsleep: proc %p (pid %d, %s), schedlock %p",
718 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
719 	s = splhigh();
720 	mtx_lock_spin(&sched_lock);
721 	if (p->p_wchan) {
722 		if (p->p_stat == SSLEEP)
723 			setrunnable(p);
724 		else
725 			unsleep(p);
726 		p->p_sflag |= PS_TIMEOUT;
727 	}
728 	mtx_unlock_spin(&sched_lock);
729 	splx(s);
730 }
731 
732 /*
733  * Remove a process from its wait queue
734  */
735 void
736 unsleep(p)
737 	register struct proc *p;
738 {
739 	int s;
740 
741 	s = splhigh();
742 	mtx_lock_spin(&sched_lock);
743 	if (p->p_wchan) {
744 		TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq);
745 		p->p_wchan = NULL;
746 	}
747 	mtx_unlock_spin(&sched_lock);
748 	splx(s);
749 }
750 
751 /*
752  * Make all processes sleeping on the specified identifier runnable.
753  */
754 void
755 wakeup(ident)
756 	register void *ident;
757 {
758 	register struct slpquehead *qp;
759 	register struct proc *p;
760 	int s;
761 
762 	s = splhigh();
763 	mtx_lock_spin(&sched_lock);
764 	qp = &slpque[LOOKUP(ident)];
765 restart:
766 	TAILQ_FOREACH(p, qp, p_slpq) {
767 		if (p->p_wchan == ident) {
768 			TAILQ_REMOVE(qp, p, p_slpq);
769 			p->p_wchan = NULL;
770 			if (p->p_stat == SSLEEP) {
771 				/* OPTIMIZED EXPANSION OF setrunnable(p); */
772 				CTR4(KTR_PROC,
773 				        "wakeup: proc %p (pid %d, %s), schedlock %p",
774 					p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
775 				if (p->p_slptime > 1)
776 					updatepri(p);
777 				p->p_slptime = 0;
778 				p->p_stat = SRUN;
779 				if (p->p_sflag & PS_INMEM) {
780 					setrunqueue(p);
781 					maybe_resched(p);
782 				} else {
783 					p->p_sflag |= PS_SWAPINREQ;
784 					wakeup((caddr_t)&proc0);
785 				}
786 				/* END INLINE EXPANSION */
787 				goto restart;
788 			}
789 		}
790 	}
791 	mtx_unlock_spin(&sched_lock);
792 	splx(s);
793 }
794 
795 /*
796  * Make a process sleeping on the specified identifier runnable.
797  * May wake more than one process if a target process is currently
798  * swapped out.
799  */
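/*
 * Illustrative use (hypothetical names): a producer that frees exactly one
 * slot can call wakeup_one(&foo_freecnt) rather than wakeup(), so that only
 * one msleep()er is made runnable; the waiter must still recheck the count,
 * since a wakeup carries no data.  The "may wake more than one" caveat comes
 * from the swapped-out case below: the loop keeps scanning (after poking the
 * swapper) instead of stopping at the first match.
 */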
800 void
801 wakeup_one(ident)
802 	register void *ident;
803 {
804 	register struct slpquehead *qp;
805 	register struct proc *p;
806 	int s;
807 
808 	s = splhigh();
809 	mtx_lock_spin(&sched_lock);
810 	qp = &slpque[LOOKUP(ident)];
811 
812 	TAILQ_FOREACH(p, qp, p_slpq) {
813 		if (p->p_wchan == ident) {
814 			TAILQ_REMOVE(qp, p, p_slpq);
815 			p->p_wchan = NULL;
816 			if (p->p_stat == SSLEEP) {
817 				/* OPTIMIZED EXPANSION OF setrunnable(p); */
818 				CTR4(KTR_PROC,
819 				        "wakeup1: proc %p (pid %d, %s), schedlock %p",
820 					p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
821 				if (p->p_slptime > 1)
822 					updatepri(p);
823 				p->p_slptime = 0;
824 				p->p_stat = SRUN;
825 				if (p->p_sflag & PS_INMEM) {
826 					setrunqueue(p);
827 					maybe_resched(p);
828 					break;
829 				} else {
830 					p->p_sflag |= PS_SWAPINREQ;
831 					wakeup((caddr_t)&proc0);
832 				}
833 				/* END INLINE EXPANSION */
834 			}
835 		}
836 	}
837 	mtx_unlock_spin(&sched_lock);
838 	splx(s);
839 }
840 
841 /*
842  * The machine-independent parts of a context switch.
843  * Must be called at splstatclock() or higher.
844  */
845 void
846 mi_switch()
847 {
848 	struct timeval new_switchtime;
849 	register struct proc *p = curproc;	/* XXX */
850 #if 0
851 	register struct rlimit *rlim;
852 #endif
853 	u_int sched_nest;
854 
855 	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
856 
857 	/*
858 	 * Compute the amount of time during which the current
859 	 * process was running, and add that to its total so far.
860 	 */
861 	microuptime(&new_switchtime);
862 	if (timevalcmp(&new_switchtime, PCPU_PTR(switchtime), <)) {
863 #if 0
864 		/* XXX: This doesn't play well with sched_lock right now. */
865 		printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
866 		    PCPU_GET(switchtime.tv_sec), PCPU_GET(switchtime.tv_usec),
867 		    new_switchtime.tv_sec, new_switchtime.tv_usec);
868 #endif
869 		new_switchtime = PCPU_GET(switchtime);
870 	} else {
871 		p->p_runtime += (new_switchtime.tv_usec - PCPU_GET(switchtime.tv_usec)) +
872 		    (new_switchtime.tv_sec - PCPU_GET(switchtime.tv_sec)) *
873 		    (int64_t)1000000;
874 	}
875 
876 #if 0
877 	/*
878 	 * Check if the process exceeds its cpu resource allocation.
879 	 * If over max, kill it.
880 	 *
881 	 * XXX drop sched_lock, pickup Giant
882 	 */
883 	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
884 	    p->p_runtime > p->p_limit->p_cpulimit) {
885 		rlim = &p->p_rlimit[RLIMIT_CPU];
886 		if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
887 			mtx_unlock_spin(&sched_lock);
888 			killproc(p, "exceeded maximum CPU limit");
889 			mtx_lock_spin(&sched_lock);
890 		} else {
891 			mtx_unlock_spin(&sched_lock);
892 			PROC_LOCK(p);
893 			psignal(p, SIGXCPU);
894 			mtx_lock_spin(&sched_lock);
895 			PROC_UNLOCK_NOSWITCH(p);
896 			if (rlim->rlim_cur < rlim->rlim_max) {
897 				/* XXX: we should make a private copy */
898 				rlim->rlim_cur += 5;
899 			}
900 		}
901 	}
902 #endif
903 
904 	/*
905 	 * Pick a new current process and record its start time.
906 	 */
907 	cnt.v_swtch++;
908 	PCPU_SET(switchtime, new_switchtime);
909 	CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %p",
910 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
911 	sched_nest = sched_lock.mtx_recurse;
912 	curproc->p_lastcpu = curproc->p_oncpu;
913 	curproc->p_oncpu = NOCPU;
914 	clear_resched(curproc);
915 	cpu_switch();
916 	curproc->p_oncpu = PCPU_GET(cpuid);
917 	sched_lock.mtx_recurse = sched_nest;
918 	sched_lock.mtx_lock = (uintptr_t)curproc;
919 	CTR4(KTR_PROC, "mi_switch: new proc %p (pid %d, %s), schedlock %p",
920 		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
921 	if (PCPU_GET(switchtime.tv_sec) == 0)
922 		microuptime(PCPU_PTR(switchtime));
923 	PCPU_SET(switchticks, ticks);
924 }
925 
926 /*
927  * Change process state to be runnable,
928  * placing it on the run queue if it is in memory,
929  * and awakening the swapper if it isn't in memory.
930  */
931 void
932 setrunnable(p)
933 	register struct proc *p;
934 {
935 	register int s;
936 
937 	s = splhigh();
938 	mtx_lock_spin(&sched_lock);
939 	switch (p->p_stat) {
940 	case 0:
941 	case SRUN:
942 	case SZOMB:
943 	case SWAIT:
944 	default:
945 		panic("setrunnable");
946 	case SSTOP:
947 	case SSLEEP:			/* e.g. when sending signals */
948 		if (p->p_sflag & PS_CVWAITQ)
949 			cv_waitq_remove(p);
950 		else
951 			unsleep(p);
952 		break;
953 
954 	case SIDL:
955 		break;
956 	}
957 	p->p_stat = SRUN;
958 	if (p->p_sflag & PS_INMEM)
959 		setrunqueue(p);
960 	splx(s);
961 	if (p->p_slptime > 1)
962 		updatepri(p);
963 	p->p_slptime = 0;
964 	if ((p->p_sflag & PS_INMEM) == 0) {
965 		p->p_sflag |= PS_SWAPINREQ;
966 		wakeup((caddr_t)&proc0);
967 	}
968 	else
969 		maybe_resched(p);
970 	mtx_unlock_spin(&sched_lock);
971 }
972 
973 /*
974  * Compute the priority of a process when running in user mode.
975  * Arrange to reschedule if the resulting priority is better
976  * than that of the current process.
977  */
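/*
 * Worked example, assuming the stock weights NICE_WEIGHT == 1 and
 * INVERSE_ESTCPU_WEIGHT == 8 (both defined elsewhere; verify before relying
 * on the numbers): a timesharing process with p_estcpu == 80 and p_nice == 0
 * gets pri_user = PUSER + 80/8 + 1 * (0 - PRIO_MIN) = PUSER + 10 + 20,
 * clamped to the PRI_MIN_TIMESHARE..PRI_MAX_TIMESHARE range.
 */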
978 void
979 resetpriority(p)
980 	register struct proc *p;
981 {
982 	register unsigned int newpriority;
983 
984 	mtx_lock_spin(&sched_lock);
985 	if (p->p_pri.pri_class == PRI_TIMESHARE) {
986 		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
987 		    NICE_WEIGHT * (p->p_nice - PRIO_MIN);
988 		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
989 		    PRI_MAX_TIMESHARE);
990 		p->p_pri.pri_user = newpriority;
991 	}
992 	maybe_resched(p);
993 	mtx_unlock_spin(&sched_lock);
994 }
995 
996 /* ARGSUSED */
997 static void
998 sched_setup(dummy)
999 	void *dummy;
1000 {
1001 
1002 	callout_init(&schedcpu_callout, 1);
1003 	callout_init(&roundrobin_callout, 0);
1004 
1005 	/* Kick off timeout driven events by calling first time. */
1006 	roundrobin(NULL);
1007 	schedcpu(NULL);
1008 }
1009 
1010 /*
1011  * We adjust the priority of the current process.  The priority of
1012  * a process gets worse as it accumulates CPU time.  The cpu usage
1013  * estimator (p_estcpu) is increased here.  resetpriority() will
1014  * compute a different priority each time p_estcpu increases by
1015  * INVERSE_ESTCPU_WEIGHT
1016  * (until MAXPRI is reached).  The cpu usage estimator ramps up
1017  * quite quickly when the process is running (linearly), and decays
1018  * away exponentially, at a rate which is proportionally slower when
1019  * the system is busy.  The basic principle is that the system will
1020  * 90% forget that the process used a lot of CPU time in 5 * loadav
1021  * seconds.  This causes the system to favor processes which haven't
1022  * run much recently, and to round-robin among other processes.
1023  */
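/*
 * Back-of-the-envelope (stathz and INVERSE_ESTCPU_WEIGHT are assumptions,
 * not guarantees from this file): with stathz == 128 and
 * INVERSE_ESTCPU_WEIGHT == 8, a process that monopolizes a CPU for one
 * second has schedclock() called 128 times, so p_estcpu rises by up to 128
 * (subject to ESTCPULIM) and resetpriority() re-derives its user priority
 * about 16 times along the way.
 */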
1024 void
1025 schedclock(p)
1026 	struct proc *p;
1027 {
1028 
1029 	p->p_cpticks++;
1030 	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
1031 	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
1032 		resetpriority(p);
1033 		if (p->p_pri.pri_level >= PUSER)
1034 			p->p_pri.pri_level = p->p_pri.pri_user;
1035 	}
1036 }
1037 
1038 /*
1039  * General purpose yield system call
1040  */
1041 int
1042 yield(struct proc *p, struct yield_args *uap)
1043 {
1044 	int s;
1045 
1046 	p->p_retval[0] = 0;
1047 
1048 	s = splhigh();
1049 	mtx_lock_spin(&sched_lock);
1050 	DROP_GIANT_NOSWITCH();
1051 	p->p_pri.pri_level = PRI_MAX_TIMESHARE;
1052 	setrunqueue(p);
1053 	p->p_stats->p_ru.ru_nvcsw++;
1054 	mi_switch();
1055 	mtx_unlock_spin(&sched_lock);
1056 	PICKUP_GIANT();
1057 	splx(s);
1058 
1059 	return (0);
1060 }
1061