/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

#define	KTR_ULE		KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");

static int sched_strict;
SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");

static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1;

#ifdef SMP
/* Callout to handle load balancing SMP systems. */
static struct callout kseq_lb_callout;
#endif

/*
 * These data structures are allocated within their parent data structure
 * but are scheduler specific.
 */

struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	/* CPU that we have affinity for. */
	u_char		ske_cpu;
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu

struct kg_sched {
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * This priority range has 20 priorities on either end that are reachable
 * only through nice values.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Reserved priorities for nice.
 * PRI_BASE:	The start of the dynamic range.
 * DYN_RANGE:	Number of priorities that are available in the dynamic
 *		priority range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		PRIO_TOTAL
#define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
#define	SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)
#define	SCHED_PRI_BASE		((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE)
#define	SCHED_DYN_RANGE		(SCHED_PRI_RANGE - SCHED_PRI_NRESV)
#define	SCHED_PRI_INTERACT(score)					\
    ((score) * SCHED_DYN_RANGE / SCHED_INTERACT_MAX)
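/*
 * Example (illustrative): an interactivity score of SCHED_INTERACT_MAX / 2
 * maps to the middle of SCHED_DYN_RANGE, so with a nice value of 0
 * sched_priority() below places the thread in the middle of the timeshare
 * range.
 */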

/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 2) << 10)
#define	SCHED_SLP_RUN_THROTTLE	(2)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(20)
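/*
 * Roughly speaking (illustrative, see sched_interact_score()): a ksegrp
 * that sleeps more than about two and a half times as long as it runs
 * scores below SCHED_INTERACT_THRESH and is treated as interactive, while
 * one that runs more than it sleeps scores at least SCHED_INTERACT_HALF.
 */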

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determine the amount of slice granted to a scaled nice.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))
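/*
 * Example (illustrative): a non-interactive kse whose nice value matches the
 * least nice kse on its kseq receives SCHED_SLICE_MAX.  Each nice point
 * beyond the queue's minimum costs roughly
 * SCHED_SLICE_RANGE / SCHED_PRI_NTHRESH ticks of slice, and anything more
 * than SCHED_PRI_NTHRESH points away gets no slice at all; see sched_slice().
 */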

/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 *
 * XXX nice value should affect how interactive a kg is.
 */
#define	SCHED_INTERACTIVE(kg)						\
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
    (ke->ke_thread->td_priority < PRI_MIN_TIMESHARE || SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */

#define	KSEQ_NCLASS	(PRI_IDLE + 1)	/* Number of run classes. */

struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_loads[KSEQ_NCLASS];	/* Load for each class */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	unsigned int	ksq_rslices;	/* Slices on run queue */
#endif
};

/*
 * One kse queue per processor.
 */
#ifdef SMP
struct kseq	kseq_cpu[MAXCPU];
#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
#else
struct kseq	kseq_cpu;
#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
void sched_pctcpu_update(struct kse *ke);
int sched_pickcpu(void);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_add(struct kseq *kseq, struct kse *ke);
static void kseq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
struct kseq * kseq_load_highest(void);
void kseq_balance(void *arg);
void kseq_move(struct kseq *from, int cpu);
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload:           %d\n", kseq->ksq_load);
	printf("\tload ITHD:      %d\n", kseq->ksq_loads[PRI_ITHD]);
	printf("\tload REALTIME:  %d\n", kseq->ksq_loads[PRI_REALTIME]);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
	printf("\tload IDLE:      %d\n", kseq->ksq_loads[PRI_IDLE]);
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < PRIO_TOTAL + 1; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static void
kseq_add(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++;
	kseq->ksq_load++;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
		CTR6(KTR_ULE,
		    "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
	}
#ifdef SMP
	kseq->ksq_rslices += ke->ke_slice;
#endif
}

static void
kseq_rem(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--;
	kseq->ksq_load--;
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
	kseq->ksq_rslices -= ke->ke_slice;
#endif
}

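/*
 * The ksq_nice[] array counts the KSEs on this kseq at each nice value,
 * with the index offset by SCHED_PRI_NHALF so that negative nice values
 * land in valid bins.  ksq_nicemin caches the smallest nice value present
 * so that sched_slice() can size slices relative to the least nice ksegrp.
 */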
static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_loads[PRI_TIMESHARE] == 0)
		return;

	for (; n < SCHED_PRI_NRESV + 1; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}

#ifdef SMP
/*
 * kseq_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm chooses
 * simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi-random algorithm below may work as well as any.
 */
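/*
 * Example (illustrative): with a high load of 5 and a low load of 2 the
 * difference is 3, which rounds up to 2 moves, so two kses are pulled off
 * the busiest cpu and handed to the least loaded one.
 */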
void
kseq_balance(void *arg)
{
	struct kseq *kseq;
	int high_load;
	int low_load;
	int high_cpu;
	int low_cpu;
	int move;
	int diff;
	int i;

	high_cpu = 0;
	low_cpu = 0;
	high_load = 0;
	low_load = -1;

	mtx_lock_spin(&sched_lock);
	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > high_load) {
			high_load = kseq->ksq_load;
			high_cpu = i;
		}
		if (low_load == -1 || kseq->ksq_load < low_load) {
			low_load = kseq->ksq_load;
			low_cpu = i;
		}
	}

	/*
	 * Nothing to do.
	 */
	if (high_load < 2 || low_load == high_load)
		goto out;

	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;

	for (i = 0; i < move; i++)
		kseq_move(KSEQ_CPU(high_cpu), low_cpu);

out:
	mtx_unlock_spin(&sched_lock);
	callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL);

	return;
}

struct kseq *
kseq_load_highest(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	cpu = 0;
	load = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > load) {
			load = kseq->ksq_load;
			cpu = i;
		}
	}
	if (load > 1)
		return (KSEQ_CPU(cpu));

	return (NULL);
}

void
kseq_move(struct kseq *from, int cpu)
{
	struct kse *ke;

	ke = kseq_choose(from);
	runq_remove(ke->ke_runq, ke);
	ke->ke_state = KES_THREAD;
	kseq_rem(from, ke);

	ke->ke_cpu = cpu;
	sched_add(ke);
}
#endif

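/*
 * Pick the next kse to run on this kseq.  The current and next timeshare
 * queues are swapped when the current one is empty; a kse that comes up
 * with a slice of zero is resliced and moved to the next queue, and the
 * idle queue is only consulted when both timeshare queues are empty.
 */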
struct kse *
kseq_choose(struct kseq *kseq)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		if (ke->ke_slice == 0) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}

static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);

	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];

	kseq->ksq_loads[PRI_ITHD] = 0;
	kseq->ksq_loads[PRI_REALTIME] = 0;
	kseq->ksq_loads[PRI_TIMESHARE] = 0;
	kseq->ksq_loads[PRI_IDLE] = 0;
	kseq->ksq_load = 0;
#ifdef SMP
	kseq->ksq_rslices = 0;
#endif
}

static void
sched_setup(void *dummy)
{
	int i;

	slice_min = (hz/100);
	slice_max = (hz/10);

	mtx_lock_spin(&sched_lock);
	/* init kseqs */
	for (i = 0; i < MAXCPU; i++)
		kseq_setup(KSEQ_CPU(i));

	kseq_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
#ifdef SMP
	callout_init(&kseq_lb_callout, 1);
	kseq_balance(NULL);
#endif
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get the minimum slice so that we
	 * quickly notice if it abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the kseg's nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse's distance from the least nice ksegrp.
	 *
	 * If you are outside of the window you will get no slice and
	 * you will be reevaluated each time you are selected on the
	 * run queue.
	 */

	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
		    kg->kg_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_PRI_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_MIN;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
	    kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));

	/*
	 * Check to see if we need to scale back the slp and run time
	 * in the kg.  This will cause us to forget old interactivity
	 * while maintaining the current ratio.
	 */
	sched_interact_update(kg);

	return;
}

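/*
 * Scale back the accumulated sleep and run time once their sum exceeds
 * SCHED_SLP_RUN_MAX.  Both are reduced by the same factor (4/5), so the
 * interactivity ratio is preserved while older history decays away.
 */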
static void
sched_interact_update(struct ksegrp *kg)
{
	if ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) {
		kg->kg_runtime = (kg->kg_runtime / 5) * 4;
		kg->kg_slptime = (kg->kg_slptime / 5) * 4;
	}
}

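/*
 * Map the kseg's sleep/run history to an interactivity score in the range
 * [0, SCHED_INTERACT_MAX].  Scores below SCHED_INTERACT_HALF mean the kseg
 * sleeps more than it runs; lower is more interactive.
 */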
static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	}
	if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);
}

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */

	/*
	 * Shift the tick count out so that the divide doesn't round away
	 * our results.
	 */
	ke->ke_ticks <<= 10;
	ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) *
	    SCHED_CPU_TICKS;
	ke->ke_ticks >>= 10;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}

#ifdef SMP
/* XXX Should be changed to kseq_load_lowest() */
int
sched_pickcpu(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	if (!smp_started)
		return (0);

	load = -1;
	cpu = 0;

	/* Pick the cpu with the lowest aggregate load. */
	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		kseq = KSEQ_CPU(i);
		if (load == -1 || kseq->ksq_load < load) {
			cpu = i;
			load = kseq->ksq_load;
		}
	}

	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
	return (cpu);
}
#else
int
sched_pickcpu(void)
{
	return (0);
}
#endif

void
sched_prio(struct thread *td, u_char prio)
{
	struct kse *ke;
	struct runq *rq;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	td->td_priority = prio;

	if (TD_ON_RUNQ(td)) {
		rq = ke->ke_runq;

		runq_remove(rq, ke);
		runq_add(rq, ke);
	}
}

void
sched_switchout(struct thread *td)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;

	if (TD_IS_RUNNING(td)) {
		/*
		 * This queue is always correct except for idle threads which
		 * have a higher priority due to priority propagation.
		 */
		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE &&
		    ke->ke_thread->td_priority > PRI_MIN_IDLE)
			ke->ke_runq = KSEQ_SELF()->ksq_curr;
		runq_add(ke->ke_runq, ke);
		/* setrunqueue(td); */
		return;
	}
	if (ke->ke_runq)
		kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
	/*
	 * We will not be on the run queue. So we must be
	 * sleeping or similar.
	 */
	if (td->td_proc->p_flag & P_SA)
		kse_reassign(ke);
}

void
sched_switchin(struct thread *td)
{
	/* struct kse *ke = td->td_kse; */
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct ksegrp *kg, int nice)
{
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	if (kg->kg_pri_class == PRI_TIMESHARE)
		FOREACH_KSE_IN_GROUP(kg, ke) {
			if (ke->ke_state != KES_ONRUNQ &&
			    ke->ke_state != KES_THREAD)
				continue;
			kseq = KSEQ_CPU(ke->ke_cpu);
			kseq_nice_rem(kseq, kg->kg_nice);
			kseq_nice_add(kseq, nice);
		}
	kg->kg_nice = nice;
	sched_priority(kg);
	FOREACH_THREAD_IN_GROUP(kg, td)
		td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, u_char prio)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_priority = prio;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = ticks - td->td_slptime;
		kg->kg_slptime += hzticks << 10;
		sched_interact_update(kg);
		sched_priority(kg);
		if (td->td_kse)
			sched_slice(td->td_kse);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct proc *p, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct kse *ke, struct kse *child)
{

	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */
	child->ke_runq = NULL;

	/*
	 * Claim that we've been running for one second for statistical
	 * purposes.
	 */
	child->ke_ticks = 0;
	child->ke_ltick = ticks;
	child->ke_ftick = ticks - hz;
}

void
sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{

	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
	/* XXX Need something better here */

	child->kg_slptime = kg->kg_slptime;
	child->kg_runtime = kg->kg_runtime;
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	child->kg_user_pri = kg->kg_user_pri;
	child->kg_nice = kg->kg_nice;
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

		kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--;
		kseq->ksq_loads[PRI_BASE(class)]++;

		if (kg->kg_pri_class == PRI_TIMESHARE)
			kseq_nice_rem(kseq, kg->kg_nice);
		else if (class == PRI_TIMESHARE)
			kseq_nice_add(kseq, kg->kg_nice);
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct proc *child)
{
	/* XXX Need something better here */
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
}

void
sched_exit_kse(struct kse *ke, struct kse *child)
{
	kseq_rem(KSEQ_CPU(child->ke_cpu), child);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	/* kg->kg_slptime += child->kg_slptime; */
	kg->kg_runtime += child->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}

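/*
 * Clock tick hook.  Charges the tick to the running kse for pctcpu and
 * interactivity accounting and, for timeshare ksegrps, consumes one tick
 * of slice; once the slice is used up the kse is resliced, reprioritized
 * and requeued with TDF_NEEDRESCHED set.
 */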
void
sched_clock(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct thread *td;
#if 0
	struct kse *nke;
#endif

	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}

	td = ke->ke_thread;
	kg = ke->ke_ksegrp;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td != NULL), ("schedclock: null thread pointer"));

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);

	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * Check for a higher priority task on the run queue.  This can happen
	 * on SMP if another processor woke up a process on our runq.
	 */
	kseq = KSEQ_SELF();
#if 0
	if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq)) != NULL) {
		if (sched_strict &&
		    nke->ke_thread->td_priority < td->td_priority)
			td->td_flags |= TDF_NEEDRESCHED;
		else if (nke->ke_thread->td_priority <
		    td->td_priority SCHED_PRIO_SLOP)

		if (nke->ke_thread->td_priority < td->td_priority)
			td->td_flags |= TDF_NEEDRESCHED;
	}
#endif
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	ke->ke_slice--;
#ifdef SMP
	kseq->ksq_rslices--;
#endif

	if (ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	kseq_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}

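/*
 * Report whether there is anything for this cpu to run.  On SMP a remote
 * kseq with more than one runnable kse also counts, since sched_choose()
 * can steal from it.
 */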
int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	mtx_lock_spin(&sched_lock);
	kseq = KSEQ_SELF();

	if (kseq->ksq_load)
		goto out;
#ifdef SMP
	/*
	 * For SMP we may steal other processors' KSEs.  Just search until we
	 * verify that at least one other cpu has a runnable task.
	 */
	if (smp_started) {
		int i;

		for (i = 0; i < mp_maxid; i++) {
			if (CPU_ABSENT(i))
				continue;
			kseq = KSEQ_CPU(i);
			if (kseq->ksq_load > 1)
				goto out;
		}
	}
#endif
	load = 0;
out:
	mtx_unlock_spin(&sched_lock);
	return (load);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;
	struct kseq *kseq;
	struct kse *ke;

	kg = td->td_ksegrp;

	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		kseq = KSEQ_SELF();
		if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
		    kseq->ksq_load > 1 &&
		    (ke = kseq_choose(kseq)) != NULL &&
		    ke->ke_thread->td_priority < td->td_priority)
			curthread->td_flags |= TDF_NEEDRESCHED;
		mtx_unlock_spin(&sched_lock);
	}
}

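/*
 * Select the next kse for this cpu to run.  If the local kseq is empty and
 * SMP is running, a kse is pulled from the most loaded remote kseq and the
 * selection is retried.
 */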
struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
retry:
#endif
	kseq = KSEQ_SELF();
	ke = kseq_choose(kseq);
	if (ke) {
		runq_remove(ke->ke_runq, ke);
		ke->ke_state = KES_THREAD;

		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
			    ke, ke->ke_runq, ke->ke_slice,
			    ke->ke_thread->td_priority);
		}
		return (ke);
	}

#ifdef SMP
	if (smp_started) {
		/*
		 * Find the cpu with the highest load and steal one proc.
		 */
		if ((kseq = kseq_load_highest()) == NULL)
			return (NULL);

		/*
		 * Remove this kse from this kseq and runq and then requeue
		 * on the current processor.  Then we will dequeue it
		 * normally above.
		 */
		kseq_move(kseq, PCPU_GET(cpuid));
		goto retry;
	}
#endif

	return (NULL);
}

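/*
 * Place a kse on the appropriate run queue for its priority class: ithread
 * and realtime kses always go to the current queue of the local cpu,
 * timeshare kses go to the current or next queue depending on
 * interactivity, and idle class kses go to the idle queue unless priority
 * propagation dictates otherwise.
 */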
void
sched_add(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));

	kg = ke->ke_ksegrp;

	switch (PRI_BASE(kg->kg_pri_class)) {
	case PRI_ITHD:
	case PRI_REALTIME:
		kseq = KSEQ_SELF();
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		/*
		 * This is for priority prop.
		 */
		if (ke->ke_thread->td_priority > PRI_MIN_IDLE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.\n");
		break;
	}

	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

	runq_add(ke->ke_runq, ke);
	kseq_add(kseq, ke);
}

void
sched_rem(struct kse *ke)
{
	struct kseq *kseq;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq = KSEQ_CPU(ke->ke_cpu);
	runq_remove(ke->ke_runq, ke);
	kseq_rem(kseq, ke);
}

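/*
 * Return an estimate of the kse's recent cpu usage as a fixed point
 * fraction scaled by FSCALE, derived from the ticks accumulated over the
 * last SCHED_CPU_TIME seconds.
 */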
fixpt_t
sched_pctcpu(struct kse *ke)
{
	fixpt_t pctcpu;

	pctcpu = 0;

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/* Update to account for time potentially spent sleeping */
		ke->ke_ltick = ticks;
		/*
		 * Don't update more frequently than twice a second.  Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.
		 */
		if (ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);

		/* How many rtick per second ? */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}