xref: /freebsd/sys/kern/sched_4bsd.c (revision 2357939bc239bd5334a169b62313806178dd8f30)
1 /*-
2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/ktr.h>
42 #include <sys/lock.h>
43 #include <sys/kthread.h>
44 #include <sys/mutex.h>
45 #include <sys/proc.h>
46 #include <sys/resourcevar.h>
47 #include <sys/sched.h>
48 #include <sys/smp.h>
49 #include <sys/sysctl.h>
50 #include <sys/sx.h>
51 
52 #define KTR_4BSD	0x0
53 
54 /*
55  * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
56  * the range 100-256 Hz (approximately).
57  */
58 #define	ESTCPULIM(e) \
59     min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
60     RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
61 #ifdef SMP
62 #define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
63 #else
64 #define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
65 #endif
66 #define	NICE_WEIGHT		1	/* Priorities per nice level. */
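/*
 * Worked example of the ESTCPULIM() bound (an editorial sketch, assuming
 * the stock PRIO_MIN = -20, PRIO_MAX = 20 and RQ_PPQ = 4 from the standard
 * headers; check the values actually in use): on a UP kernel
 * INVERSE_ESTCPU_WEIGHT is 8, so kg_estcpu is clamped to
 *
 *	8 * (1 * (20 - (-20)) - 4) + 8 - 1 = 8 * 36 + 7 = 295,
 *
 * which caps the kg_estcpu / INVERSE_ESTCPU_WEIGHT term used by
 * resetpriority() at 36 priority slots.
 */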
67 
68 struct ke_sched {
69 	int		ske_cpticks;	/* (j) Ticks of cpu time. */
70 	struct runq	*ske_runq;	/* runq the kse is currently on */
71 };
72 #define ke_runq 	ke_sched->ske_runq
73 #define KEF_BOUND	KEF_SCHED1
74 
75 #define SKE_RUNQ_PCPU(ke)						\
76     ((ke)->ke_runq != 0 && (ke)->ke_runq != &runq)
77 
78 /*
79  * KSE_CAN_MIGRATE macro returns true if the kse can migrate between
80  * The KSE_CAN_MIGRATE() macro returns true if the kse can migrate
81  * between CPUs.
82 #define KSE_CAN_MIGRATE(ke)						\
83     ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
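/*
 * Editorial note: a kse fails this test either when its thread has pinned
 * itself to its current CPU (td_pinned nonzero, e.g. via sched_pin() if
 * that interface is present in this tree) or when it has been bound to a
 * specific CPU by sched_bind() below, which sets KEF_BOUND.  sched_add()
 * uses the result to choose between the global runq and a per-CPU runq.
 */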
84 static struct ke_sched ke_sched;
85 
86 struct ke_sched *kse0_sched = &ke_sched;
87 struct kg_sched *ksegrp0_sched = NULL;
88 struct p_sched *proc0_sched = NULL;
89 struct td_sched *thread0_sched = NULL;
90 
91 static int	sched_tdcnt;	/* Total runnable threads in the system. */
92 static int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
93 #define	SCHED_QUANTUM	(hz / 10)	/* Default sched quantum */
94 
95 static struct callout roundrobin_callout;
96 
97 static void	setup_runqs(void);
98 static void	roundrobin(void *arg);
99 static void	schedcpu(void);
100 static void	schedcpu_thread(void);
101 static void	sched_setup(void *dummy);
102 static void	maybe_resched(struct thread *td);
103 static void	updatepri(struct ksegrp *kg);
104 static void	resetpriority(struct ksegrp *kg);
105 
106 static struct kproc_desc sched_kp = {
107         "schedcpu",
108         schedcpu_thread,
109         NULL
110 };
111 SYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start, &sched_kp)
112 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
113 
114 /*
115  * Global run queue.
116  */
117 static struct runq runq;
118 
119 #ifdef SMP
120 /*
121  * Per-CPU run queues
122  */
123 static struct runq runq_pcpu[MAXCPU];
124 #endif
125 
126 static void
127 setup_runqs(void)
128 {
129 #ifdef SMP
130 	int i;
131 
132 	for (i = 0; i < MAXCPU; ++i)
133 		runq_init(&runq_pcpu[i]);
134 #endif
135 
136 	runq_init(&runq);
137 }
138 
139 static int
140 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
141 {
142 	int error, new_val;
143 
144 	new_val = sched_quantum * tick;
145 	error = sysctl_handle_int(oidp, &new_val, 0, req);
146 	if (error != 0 || req->newptr == NULL)
147 		return (error);
148 	if (new_val < tick)
149 		return (EINVAL);
150 	sched_quantum = new_val / tick;
151 	hogticks = 2 * sched_quantum;
152 	return (0);
153 }
154 
155 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
156 	0, sizeof sched_quantum, sysctl_kern_quantum, "I",
157 	"Roundrobin scheduling quantum in microseconds");
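/*
 * Editorial sketch of the unit conversion done by sysctl_kern_quantum(),
 * assuming the historical default of hz = 100 (so `tick' is 10000
 * microseconds): the default quantum of SCHED_QUANTUM = hz / 10 = 10 ticks
 * reads back as 100000 us, i.e. 100 ms.  Writing, for example,
 *
 *	sysctl kern.quantum=50000
 *
 * stores 50000 / 10000 = 5 ticks in sched_quantum and sets hogticks to 10
 * ticks; requests smaller than one tick are rejected with EINVAL.
 */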
158 
159 /*
160  * Arrange to reschedule if necessary, taking the priorities and
161  * schedulers into account.
162  */
163 static void
164 maybe_resched(struct thread *td)
165 {
166 
167 	mtx_assert(&sched_lock, MA_OWNED);
168 	if (td->td_priority < curthread->td_priority && curthread->td_kse)
169 		curthread->td_flags |= TDF_NEEDRESCHED;
170 }
171 
172 /*
173  * Force switch among equal priority processes every 100ms.
174  * We don't actually need to force a context switch of the current process.
175  * The act of firing the event triggers a context switch to softclock() and
176  * then switching back out again, which is equivalent to a preemption; thus
177  * no further work is needed on the local CPU.
178  */
179 /* ARGSUSED */
180 static void
181 roundrobin(void *arg)
182 {
183 
184 #ifdef SMP
185 	mtx_lock_spin(&sched_lock);
186 	forward_roundrobin();
187 	mtx_unlock_spin(&sched_lock);
188 #endif
189 
190 	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
191 }
192 
193 /*
194  * Constants for digital decay and forget:
195  *	90% of (kg_estcpu) usage in 5 * loadav time
196  *	95% of (ke_pctcpu) usage in 60 seconds (load insensitive)
197  *          Note that, as ps(1) mentions, this can let percentages
198  *          total over 100% (I've seen 137.9% for 3 processes).
199  *
200  * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously.
201  *
202  * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds.
203  * That is, the system wants to compute a value of decay such
204  * that the following for loop:
205  * 	for (i = 0; i < (5 * loadavg); i++)
206  * 		kg_estcpu *= decay;
207  * will compute
208  * 	kg_estcpu *= 0.1;
209  * for all values of loadavg:
210  *
211  * Mathematically this loop can be expressed by saying:
212  * 	decay ** (5 * loadavg) ~= .1
213  *
214  * The system computes decay as:
215  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
216  *
217  * We wish to prove that the system's computation of decay
218  * will always fulfill the equation:
219  * 	decay ** (5 * loadavg) ~= .1
220  *
221  * If we compute b as:
222  * 	b = 2 * loadavg
223  * then
224  * 	decay = b / (b + 1)
225  *
226  * We now need to prove two things:
227  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
228  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
229  *
230  * Facts:
231  *         For x close to zero, exp(x) =~ 1 + x, since
232  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
233  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
234  *         For x close to zero, ln(1+x) =~ x, since
235  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
236  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
237  *         ln(.1) =~ -2.30
238  *
239  * Proof of (1):
240  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
241  *	solving for factor,
242  *      ln(factor) =~ (-2.30/5*loadav), or
243  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
244  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
245  *
246  * Proof of (2):
247  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
248  *	solving for power,
249  *      power*ln(b/(b+1)) =~ -2.30, or
250  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
251  *
252  * Actual power values for the implemented algorithm are as follows:
253  *      loadav: 1       2       3       4
254  *      power:  5.68    10.32   14.94   19.55
255  */
256 
257 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
258 #define	loadfactor(loadav)	(2 * (loadav))
259 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
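/*
 * Worked fixed-point example (editorial sketch, assuming the usual FSHIFT
 * of 11, i.e. FSCALE = 2048): with a load average of exactly 1.0,
 * averunnable.ldavg[0] == FSCALE, so
 *
 *	loadfac = loadfactor(FSCALE) = 2 * FSCALE = 4096
 *	decay_cpu(4096, cpu) = 4096 * cpu / (4096 + 2048) = 2 * cpu / 3
 *
 * which is the decay = b / (b + 1) = 2 / 3 factor from the derivation
 * above.  Applied once per second by schedcpu(), about 5.68 such steps cut
 * kg_estcpu to roughly a tenth of its value, matching the "loadav: 1,
 * power: 5.68" entry in the table.
 */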
260 
261 /* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
262 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
263 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
264 
265 /*
266  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
267  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
268  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
269  *
270  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
271  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
272  *
273  * If you don't want to bother with the faster/more-accurate formula, you
274  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
275  * (more general) method of calculating the %age of CPU used by a process.
276  */
277 #define	CCPU_SHIFT	11
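/*
 * Editorial sketch of the 95%-in-60-seconds claim: schedcpu() runs once a
 * second and multiplies ke_pctcpu by ccpu = exp(-1/20) each pass, so after
 * 60 idle seconds the remaining fraction is exp(-60/20) = exp(-3), about
 * 4.98%; roughly 95% of the old %cpu has been forgotten.  In fixed point
 * that one-second step is simply
 *
 *	ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >> FSHIFT;
 *
 * as done in the loop below.
 */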
278 
279 /*
280  * Recompute process priorities, every hz ticks.
281  * MP-safe, called without the Giant mutex.
282  */
283 /* ARGSUSED */
284 static void
285 schedcpu(void)
286 {
287 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
288 	struct thread *td;
289 	struct proc *p;
290 	struct kse *ke;
291 	struct ksegrp *kg;
292 	int awake, realstathz;
293 
294 	realstathz = stathz ? stathz : hz;
295 	sx_slock(&allproc_lock);
296 	FOREACH_PROC_IN_SYSTEM(p) {
297 		/*
298 		 * Prevent state changes and protect run queue.
299 		 */
300 		mtx_lock_spin(&sched_lock);
301 		/*
302 		 * Increment time in/out of memory.  We ignore overflow; with
303 		 * 16-bit int's (remember them?) overflow takes 45 days.
304 		 */
305 		p->p_swtime++;
306 		FOREACH_KSEGRP_IN_PROC(p, kg) {
307 			awake = 0;
308 			FOREACH_KSE_IN_GROUP(kg, ke) {
309 				/*
310 				 * Increment sleep time (if sleeping).  We
311 				 * ignore overflow, as above.
312 				 */
313 				/*
314 				 * The kse slptimes are not touched in wakeup
315 				 * because the thread may not HAVE a KSE.
316 				 */
317 				if (ke->ke_state == KES_ONRUNQ) {
318 					awake = 1;
319 					ke->ke_flags &= ~KEF_DIDRUN;
320 				} else if ((ke->ke_state == KES_THREAD) &&
321 				    (TD_IS_RUNNING(ke->ke_thread))) {
322 					awake = 1;
323 					/* Do not clear KEF_DIDRUN */
324 				} else if (ke->ke_flags & KEF_DIDRUN) {
325 					awake = 1;
326 					ke->ke_flags &= ~KEF_DIDRUN;
327 				}
328 
329 				/*
330 				 * ke_pctcpu is only for ps and ttyinfo().
331 				 * Do it per kse, and add them up at the end?
332 				 * XXXKSE
333 				 */
334 				ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >>
335 				    FSHIFT;
336 				/*
337 				 * If the kse has been idle the entire second,
338 				 * stop recalculating its priority until
339 				 * it wakes up.
340 				 */
341 				if (ke->ke_sched->ske_cpticks == 0)
342 					continue;
343 #if	(FSHIFT >= CCPU_SHIFT)
344 				ke->ke_pctcpu += (realstathz == 100)
345 				    ? ((fixpt_t) ke->ke_sched->ske_cpticks) <<
346 				    (FSHIFT - CCPU_SHIFT) :
347 				    100 * (((fixpt_t) ke->ke_sched->ske_cpticks)
348 				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
349 #else
350 				ke->ke_pctcpu += ((FSCALE - ccpu) *
351 				    (ke->ke_sched->ske_cpticks *
352 				    FSCALE / realstathz)) >> FSHIFT;
353 #endif
354 				ke->ke_sched->ske_cpticks = 0;
355 			} /* end of kse loop */
356 			/*
357 			 * If there are ANY running threads in this KSEGRP,
358 			 * then don't count it as sleeping.
359 			 */
360 			if (awake) {
361 				if (kg->kg_slptime > 1) {
362 					/*
363 					 * In an ideal world, this should not
364 					 * happen, because whoever woke us
365 					 * up from the long sleep should have
366 					 * unwound the slptime and reset our
367 					 * priority before we run at the stale
368 					 * priority.  This should become a
369 					 * KASSERT once all the cases are fixed.
370 					 */
371 					updatepri(kg);
372 				}
373 				kg->kg_slptime = 0;
374 			} else
375 				kg->kg_slptime++;
376 			if (kg->kg_slptime > 1)
377 				continue;
378 			kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu);
379 			resetpriority(kg);
380 			FOREACH_THREAD_IN_GROUP(kg, td) {
381 				if (td->td_priority >= PUSER) {
382 					sched_prio(td, kg->kg_user_pri);
383 				}
384 			}
385 		} /* end of ksegrp loop */
386 		mtx_unlock_spin(&sched_lock);
387 	} /* end of process loop */
388 	sx_sunlock(&allproc_lock);
389 }
390 
391 /*
392  * Main loop for a kthread that executes schedcpu once a second.
393  */
394 static void
395 schedcpu_thread(void)
396 {
397 	int nowake;
398 
399 	for (;;) {
400 		schedcpu();
401 		tsleep(&nowake, curthread->td_priority, "-", hz);
402 	}
403 }
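/*
 * Editorial note: `nowake' is never signalled; sleeping on its address is
 * just a way to block for hz ticks, so tsleep() always times out and
 * schedcpu() runs once per second, as the comment above says.
 */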
404 
405 /*
406  * Recalculate the priority of a process after it has slept for a while.
407  * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at
408  * least six times the loadfactor will decay kg_estcpu to zero.
409  */
410 static void
411 updatepri(struct ksegrp *kg)
412 {
413 	register fixpt_t loadfac;
414 	register unsigned int newcpu;
415 
416 	loadfac = loadfactor(averunnable.ldavg[0]);
417 	if (kg->kg_slptime > 5 * loadfac)
418 		kg->kg_estcpu = 0;
419 	else {
420 		newcpu = kg->kg_estcpu;
421 		kg->kg_slptime--;	/* was incremented in schedcpu() */
422 		while (newcpu && --kg->kg_slptime)
423 			newcpu = decay_cpu(loadfac, newcpu);
424 		kg->kg_estcpu = newcpu;
425 	}
426 	resetpriority(kg);
427 }
428 
429 /*
430  * Compute the priority of a process when running in user mode.
431  * Arrange to reschedule if the resulting priority is better
432  * than that of the current process.
433  */
434 static void
435 resetpriority(struct ksegrp *kg)
436 {
437 	register unsigned int newpriority;
438 	struct thread *td;
439 
440 	if (kg->kg_pri_class == PRI_TIMESHARE) {
441 		newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT +
442 		    NICE_WEIGHT * (kg->kg_nice - PRIO_MIN);
443 		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
444 		    PRI_MAX_TIMESHARE);
445 		kg->kg_user_pri = newpriority;
446 	}
447 	FOREACH_THREAD_IN_GROUP(kg, td) {
448 		maybe_resched(td);			/* XXXKSE silly */
449 	}
450 }
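/*
 * Worked example of the formula above (editorial sketch, assuming the
 * stock PRIO_MIN = -20 and the UP INVERSE_ESTCPU_WEIGHT of 8): a
 * timesharing ksegrp with kg_estcpu = 80 and kg_nice = 0 gets
 *
 *	newpriority = PUSER + 80 / 8 + 1 * (0 - (-20)) = PUSER + 30
 *
 * before the clamp to [PRI_MIN_TIMESHARE, PRI_MAX_TIMESHARE].  Raising
 * nice to 20 would add another 20 slots (a worse priority); accumulated
 * CPU time has the same effect at a rate of one slot per 8 estcpu.
 */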
451 
452 /* ARGSUSED */
453 static void
454 sched_setup(void *dummy)
455 {
456 	setup_runqs();
457 
458 	if (sched_quantum == 0)
459 		sched_quantum = SCHED_QUANTUM;
460 	hogticks = 2 * sched_quantum;
461 
462 	callout_init(&roundrobin_callout, CALLOUT_MPSAFE);
463 
464 	/* Kick off timeout-driven events by making the first call. */
465 	roundrobin(NULL);
466 
467 	/* Account for thread0. */
468 	sched_tdcnt++;
469 }
470 
471 /* External interfaces start here */
472 int
473 sched_runnable(void)
474 {
475 #ifdef SMP
476 	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
477 #else
478 	return runq_check(&runq);
479 #endif
480 }
481 
482 int
483 sched_rr_interval(void)
484 {
485 	if (sched_quantum == 0)
486 		sched_quantum = SCHED_QUANTUM;
487 	return (sched_quantum);
488 }
489 
490 /*
491  * We adjust the priority of the current process.  The priority of
492  * a process gets worse as it accumulates CPU time.  The cpu usage
493  * estimator (kg_estcpu) is increased here.  resetpriority() will
494  * compute a different priority each time kg_estcpu increases by
495  * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached).  The cpu usage
496  * estimator ramps up quite quickly when the process is running
497  * (linearly), and decays away exponentially, at a rate which is
498  * proportionally slower when the system is busy.  The basic principle
499  * is that the system will forget 90% of the CPU time a process used
500  * within 5 * loadav seconds.  This causes the system to favor
501  * processes which haven't run much recently, and to round-robin
502  * among other processes.
503  */
504 void
505 sched_clock(struct thread *td)
506 {
507 	struct ksegrp *kg;
508 	struct kse *ke;
509 
510 	mtx_assert(&sched_lock, MA_OWNED);
511 	kg = td->td_ksegrp;
512 	ke = td->td_kse;
513 
514 	ke->ke_sched->ske_cpticks++;
515 	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1);
516 	if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
517 		resetpriority(kg);
518 		if (td->td_priority >= PUSER)
519 			td->td_priority = kg->kg_user_pri;
520 	}
521 }
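/*
 * Editorial sketch of the ramp rate, assuming a statclock frequency of
 * stathz = 128 (within the 100-256 Hz range assumed by the comment above
 * ESTCPULIM()): a thread that monopolizes a CPU gains one kg_estcpu per
 * statclock tick, so on a UP kernel its user priority worsens by one slot
 * roughly every 8 / 128 of a second until ESTCPULIM() stops further
 * growth.
 */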
522 
523 /*
524  * Charge a child's scheduling CPU usage to its parent.
525  *
526  * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp.
527  * Charge it to the ksegrp that did the wait; since process estcpu is the
528  * sum over all ksegrps, this is strictly as expected.  Assume that the
529  * child process aggregated all the estcpu into the 'built-in' ksegrp.
530  */
531 void
532 sched_exit(struct proc *p, struct proc *p1)
533 {
534 	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
535 	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
536 	sched_exit_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
537 }
538 
539 void
540 sched_exit_kse(struct kse *ke, struct kse *child)
541 {
542 }
543 
544 void
545 sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
546 {
547 
548 	mtx_assert(&sched_lock, MA_OWNED);
549 	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + child->kg_estcpu);
550 }
551 
552 void
553 sched_exit_thread(struct thread *td, struct thread *child)
554 {
555 	if ((child->td_proc->p_flag & P_NOLOAD) == 0)
556 		sched_tdcnt--;
557 }
558 
559 void
560 sched_fork(struct proc *p, struct proc *p1)
561 {
562 	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
563 	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
564 	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
565 }
566 
567 void
568 sched_fork_kse(struct kse *ke, struct kse *child)
569 {
570 	child->ke_sched->ske_cpticks = 0;
571 }
572 
573 void
574 sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
575 {
576 	mtx_assert(&sched_lock, MA_OWNED);
577 	child->kg_estcpu = kg->kg_estcpu;
578 }
579 
580 void
581 sched_fork_thread(struct thread *td, struct thread *child)
582 {
583 }
584 
585 void
586 sched_nice(struct ksegrp *kg, int nice)
587 {
588 
589 	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
590 	mtx_assert(&sched_lock, MA_OWNED);
591 	kg->kg_nice = nice;
592 	resetpriority(kg);
593 }
594 
595 void
596 sched_class(struct ksegrp *kg, int class)
597 {
598 	mtx_assert(&sched_lock, MA_OWNED);
599 	kg->kg_pri_class = class;
600 }
601 
602 /*
603  * Adjust the priority of a thread.
604  * This may include moving the thread within the KSEGRP,
605  * changing the assignment of a kse to the thread,
606  * and moving a KSE in the system run queue.
607  */
608 void
609 sched_prio(struct thread *td, u_char prio)
610 {
611 
612 	mtx_assert(&sched_lock, MA_OWNED);
613 	if (TD_ON_RUNQ(td)) {
614 		adjustrunqueue(td, prio);
615 	} else {
616 		td->td_priority = prio;
617 	}
618 }
619 
620 void
621 sched_sleep(struct thread *td)
622 {
623 
624 	mtx_assert(&sched_lock, MA_OWNED);
625 	td->td_ksegrp->kg_slptime = 0;
626 	td->td_base_pri = td->td_priority;
627 }
628 
629 void
630 sched_switch(struct thread *td)
631 {
632 	struct thread *newtd;
633 	struct kse *ke;
634 	struct proc *p;
635 
636 	ke = td->td_kse;
637 	p = td->td_proc;
638 
639 	mtx_assert(&sched_lock, MA_OWNED);
640 	KASSERT((ke->ke_state == KES_THREAD), ("sched_switch: kse state?"));
641 
642 	if ((p->p_flag & P_NOLOAD) == 0)
643 		sched_tdcnt--;
644 	td->td_lastcpu = td->td_oncpu;
645 	td->td_last_kse = ke;
646 	td->td_flags &= ~TDF_NEEDRESCHED;
647 	td->td_oncpu = NOCPU;
648 	/*
649 	 * At the last moment, if this thread is still marked RUNNING,
650 	 * then put it back on the run queue, as it has not been suspended,
651 	 * stopped, or anything else similar.
652 	 */
653 	if (TD_IS_RUNNING(td)) {
654 		/* Put us back on the run queue (kse and all). */
655 		setrunqueue(td);
656 	} else if (p->p_flag & P_SA) {
657 		/*
658 		 * We will not be on the run queue. So we must be
659 		 * sleeping or similar. As it's available,
660 		 * someone else can use the KSE if they need it.
661 		 */
662 		kse_reassign(ke);
663 	}
664 	newtd = choosethread();
665 	if (td != newtd)
666 		cpu_switch(td, newtd);
667 	sched_lock.mtx_lock = (uintptr_t)td;
668 	td->td_oncpu = PCPU_GET(cpuid);
669 }
670 
671 void
672 sched_wakeup(struct thread *td)
673 {
674 	struct ksegrp *kg;
675 
676 	mtx_assert(&sched_lock, MA_OWNED);
677 	kg = td->td_ksegrp;
678 	if (kg->kg_slptime > 1)
679 		updatepri(kg);
680 	kg->kg_slptime = 0;
681 	setrunqueue(td);
682 	maybe_resched(td);
683 }
684 
685 void
686 sched_add(struct thread *td)
687 {
688 	struct kse *ke;
689 
690 	ke = td->td_kse;
691 	mtx_assert(&sched_lock, MA_OWNED);
692 	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
693 	KASSERT((ke->ke_thread->td_kse != NULL),
694 	    ("sched_add: No KSE on thread"));
695 	KASSERT(ke->ke_state != KES_ONRUNQ,
696 	    ("sched_add: kse %p (%s) already in run queue", ke,
697 	    ke->ke_proc->p_comm));
698 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
699 	    ("sched_add: process swapped out"));
700 	ke->ke_ksegrp->kg_runq_kses++;
701 	ke->ke_state = KES_ONRUNQ;
702 
703 #ifdef SMP
704 	if (KSE_CAN_MIGRATE(ke)) {
705 		CTR1(KTR_4BSD, "adding kse:%p to gbl runq", ke);
706 		ke->ke_runq = &runq;
707 	} else {
708 		CTR1(KTR_4BSD, "adding kse:%p to pcpu runq", ke);
709 		if (!SKE_RUNQ_PCPU(ke))
710 			ke->ke_runq = &runq_pcpu[PCPU_GET(cpuid)];
711 	}
712 #else
713 	ke->ke_runq = &runq;
714 #endif
715 	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
716 		sched_tdcnt++;
717 	runq_add(ke->ke_runq, ke);
718 }
719 
720 void
721 sched_rem(struct thread *td)
722 {
723 	struct kse *ke;
724 
725 	ke = td->td_kse;
726 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
727 	    ("sched_rem: process swapped out"));
728 	KASSERT((ke->ke_state == KES_ONRUNQ),
729 	    ("sched_rem: KSE not on run queue"));
730 	mtx_assert(&sched_lock, MA_OWNED);
731 
732 	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
733 		sched_tdcnt--;
734 	runq_remove(ke->ke_sched->ske_runq, ke);
735 
736 	ke->ke_state = KES_THREAD;
737 	ke->ke_ksegrp->kg_runq_kses--;
738 }
739 
740 struct kse *
741 sched_choose(void)
742 {
743 	struct kse *ke;
744 	struct runq *rq;
745 
746 #ifdef SMP
747 	struct kse *kecpu;
748 
749 	rq = &runq;
750 	ke = runq_choose(&runq);
751 	kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
752 
753 	if (ke == NULL ||
754 	    (kecpu != NULL &&
755 	     kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) {
756 		CTR2(KTR_4BSD, "choosing kse %p from pcpu runq %d", kecpu,
757 		     PCPU_GET(cpuid));
758 		ke = kecpu;
759 		rq = &runq_pcpu[PCPU_GET(cpuid)];
760 	} else {
761 		CTR1(KTR_4BSD, "choosing kse %p from main runq", ke);
762 	}
763 
764 #else
765 	rq = &runq;
766 	ke = runq_choose(&runq);
767 #endif
768 
769 	if (ke != NULL) {
770 		runq_remove(rq, ke);
771 		ke->ke_state = KES_THREAD;
772 
773 		KASSERT((ke->ke_thread != NULL),
774 		    ("sched_choose: No thread on KSE"));
775 		KASSERT((ke->ke_thread->td_kse != NULL),
776 		    ("sched_choose: No KSE on thread"));
777 		KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
778 		    ("sched_choose: process swapped out"));
779 	}
780 	return (ke);
781 }
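/*
 * Editorial note: lower td_priority values are better, so the test above
 * prefers the per-CPU queue's candidate only when the global queue is
 * empty or the per-CPU candidate is strictly better; ties go to the
 * global queue.
 */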
782 
783 void
784 sched_userret(struct thread *td)
785 {
786 	struct ksegrp *kg;
787 	/*
788 	 * XXX we cheat slightly on the locking here to avoid locking in
789 	 * the usual case.  Setting td_priority here is essentially an
790 	 * incomplete workaround for not setting it properly elsewhere.
791 	 * Now that some interrupt handlers are threads, not setting it
792 	 * properly elsewhere can clobber it in the window between setting
793 	 * it here and returning to user mode, so don't waste time setting
794 	 * it perfectly here.
795 	 */
796 	kg = td->td_ksegrp;
797 	if (td->td_priority != kg->kg_user_pri) {
798 		mtx_lock_spin(&sched_lock);
799 		td->td_priority = kg->kg_user_pri;
800 		mtx_unlock_spin(&sched_lock);
801 	}
802 }
803 
804 void
805 sched_bind(struct thread *td, int cpu)
806 {
807 	struct kse *ke;
808 
809 	mtx_assert(&sched_lock, MA_OWNED);
810 	KASSERT(TD_IS_RUNNING(td),
811 	    ("sched_bind: cannot bind non-running thread"));
812 
813 	ke = td->td_kse;
814 
815 	ke->ke_flags |= KEF_BOUND;
816 #ifdef SMP
817 	ke->ke_runq = &runq_pcpu[cpu];
818 	if (PCPU_GET(cpuid) == cpu)
819 		return;
820 
821 	ke->ke_state = KES_THREAD;
822 
823 	mi_switch(SW_VOL);
824 #endif
825 }
826 
827 void
828 sched_unbind(struct thread* td)
829 {
830 	mtx_assert(&sched_lock, MA_OWNED);
831 	td->td_kse->ke_flags &= ~KEF_BOUND;
832 }
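/*
 * Editorial usage sketch (a hypothetical caller, not taken from this
 * file): code that must run on a particular CPU for a while would
 * typically do something like
 *
 *	mtx_lock_spin(&sched_lock);
 *	sched_bind(curthread, cpu);
 *	mtx_unlock_spin(&sched_lock);
 *	... per-CPU work ...
 *	mtx_lock_spin(&sched_lock);
 *	sched_unbind(curthread);
 *	mtx_unlock_spin(&sched_lock);
 *
 * sched_bind() points ke_runq at runq_pcpu[cpu] so that subsequent
 * setrunqueue() calls keep the thread there, and, if we are not already
 * on that CPU, calls mi_switch() so the migration happens immediately.
 */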
833 
834 int
835 sched_load(void)
836 {
837 	return (sched_tdcnt);
838 }
839 
840 int
841 sched_sizeof_kse(void)
842 {
843 	return (sizeof(struct kse) + sizeof(struct ke_sched));
844 }
845 int
846 sched_sizeof_ksegrp(void)
847 {
848 	return (sizeof(struct ksegrp));
849 }
850 int
851 sched_sizeof_proc(void)
852 {
853 	return (sizeof(struct proc));
854 }
855 int
856 sched_sizeof_thread(void)
857 {
858 	return (sizeof(struct thread));
859 }
860 
861 fixpt_t
862 sched_pctcpu(struct thread *td)
863 {
864 	struct kse *ke;
865 
866 	ke = td->td_kse;
867 	if (ke == NULL)
868 		ke = td->td_last_kse;
869 	if (ke)
870 		return (ke->ke_pctcpu);
871 
872 	return (0);
873 }
874