xref: /freebsd/sys/kern/sched_ule.c (revision c36ccfa22bc39764d3b634e96e1d88086bad8639)
1 /*-
2  * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice unmodified, this list of conditions, and the following
10  *    disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
32 #include <sys/ktr.h>
33 #include <sys/lock.h>
34 #include <sys/mutex.h>
35 #include <sys/proc.h>
36 #include <sys/resource.h>
37 #include <sys/sched.h>
38 #include <sys/smp.h>
39 #include <sys/sx.h>
40 #include <sys/sysctl.h>
41 #include <sys/sysproto.h>
42 #include <sys/vmmeter.h>
43 #ifdef DDB
44 #include <ddb/ddb.h>
45 #endif
46 #ifdef KTRACE
47 #include <sys/uio.h>
48 #include <sys/ktrace.h>
49 #endif
50 
51 #include <machine/cpu.h>
52 
53 #define KTR_ULE         KTR_NFS
54 
55 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
56  * XXX This is bogus compatibility crap for ps
57 static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
58 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
59 
60 static void sched_setup(void *dummy);
61 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
62 
63 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");
64 
65 static int sched_strict;
66 SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");
67 
68 static int slice_min = 1;
69 SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
70 
71 static int slice_max = 2;
72 SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
73 
74 int realstathz;
75 int tickincr = 1;
76 
77 /*
78  * These data structures are allocated within their parent data structure but
79  * are scheduler specific.
80  */
81 
82 struct ke_sched {
83 	int		ske_slice;
84 	struct runq	*ske_runq;
85 	/* The following variables are only used for pctcpu calculation */
86 	int		ske_ltick;	/* Last tick that we were running on */
87 	int		ske_ftick;	/* First tick that we were running on */
88 	int		ske_ticks;	/* Tick count */
89 	/* CPU that we have affinity for. */
90 	u_char		ske_cpu;
91 };
92 #define	ke_slice	ke_sched->ske_slice
93 #define	ke_runq		ke_sched->ske_runq
94 #define	ke_ltick	ke_sched->ske_ltick
95 #define	ke_ftick	ke_sched->ske_ftick
96 #define	ke_ticks	ke_sched->ske_ticks
97 #define	ke_cpu		ke_sched->ske_cpu
98 
99 struct kg_sched {
100 	int	skg_slptime;		/* Number of ticks we vol. slept */
101 	int	skg_runtime;		/* Number of ticks we were running */
102 };
103 #define	kg_slptime	kg_sched->skg_slptime
104 #define	kg_runtime	kg_sched->skg_runtime
105 
106 struct td_sched {
107 	int	std_slptime;
108 };
109 #define	td_slptime	td_sched->std_slptime
110 
111 struct td_sched td_sched;
112 struct ke_sched ke_sched;
113 struct kg_sched kg_sched;
114 
115 struct ke_sched *kse0_sched = &ke_sched;
116 struct kg_sched *ksegrp0_sched = &kg_sched;
117 struct p_sched *proc0_sched = NULL;
118 struct td_sched *thread0_sched = &td_sched;
119 
120 /*
121  * This priority range has 20 priorities on either end that are reachable
122  * only through nice values.
123  *
124  * PRI_RANGE:	Total priority range for timeshare threads.
125  * PRI_NRESV:	Reserved priorities for nice.
126  * PRI_BASE:	The start of the dynamic range.
127  * DYN_RANGE:	Number of priorities that are available in the dynamic
128  *		priority range.
129  */
130 #define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
131 #define	SCHED_PRI_NRESV		PRIO_TOTAL
132 #define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
133 #define	SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)
134 #define	SCHED_PRI_BASE		((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE)
135 #define	SCHED_DYN_RANGE		(SCHED_PRI_RANGE - SCHED_PRI_NRESV)
136 #define	SCHED_PRI_INTERACT(score)					\
137     ((score) * SCHED_DYN_RANGE / SCHED_INTERACT_RANGE)
138 
139 /*
140  * These determine the interactivity of a process.
141  *
142  * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
143  *		before throttling back.
144  * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time.
145  * INTERACT_RANGE:	Range of interactivity values.  Smaller is better.
146  * INTERACT_HALF:	Convenience define, half of the interactivity range.
147  * INTERACT_THRESH:	Threshold for placement on the current runq.
148  */
149 #define	SCHED_SLP_RUN_MAX	((hz / 10) << 10)
150 #define	SCHED_SLP_RUN_THROTTLE	(10)
151 #define	SCHED_INTERACT_RANGE	(100)
152 #define	SCHED_INTERACT_HALF	(SCHED_INTERACT_RANGE / 2)
153 #define	SCHED_INTERACT_THRESH	(10)
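
/*
 * Note that kg_slptime and kg_runtime are stored as tick counts shifted
 * left by 10 bits, so SLP_RUN_MAX above is likewise expressed as a tick
 * count << 10.
 */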
154 
155 /*
156  * These parameters and macros determine the size of the time slice that is
157  * granted to each thread.
158  *
159  * SLICE_MIN:	Minimum time slice granted, in units of ticks.
160  * SLICE_MAX:	Maximum time slice granted.
161  * SLICE_RANGE:	Range of available time slices scaled by hz.
162  * SLICE_SCALE:	The number of slices granted per unit of val in [0, max].
163  * SLICE_NICE:	Determines the size of the slice granted for a scaled nice value.
164  */
165 #define	SCHED_SLICE_MIN			(slice_min)
166 #define	SCHED_SLICE_MAX			(slice_max)
167 #define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
168 #define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
169 #define	SCHED_SLICE_NICE(nice)						\
170     (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))
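
/*
 * For example, assuming hz = 1000, sched_setup() below sets slice_min to
 * 10 ticks and slice_max to 100 ticks.  A kse at the least nice value on
 * its cpu (scaled nice 0) then receives the full 100 tick slice, and each
 * additional point of scaled nice costs roughly SLICE_RANGE / PRI_NTHRESH
 * (4 or 5) ticks, falling to roughly the minimum at the edge of the
 * nice window.
 */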
171 
172 /*
173  * This macro determines whether or not the kse belongs on the current or
174  * next run queue.
175  *
176  * XXX nice value should affect how interactive a kg is.
177  */
178 #define	SCHED_INTERACTIVE(kg)						\
179     (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
180 #define	SCHED_CURR(kg, ke)	SCHED_INTERACTIVE(kg)
181 #if 0
182     (ke->ke_thread->td_priority < PRI_MIN_TIMESHARE || SCHED_INTERACTIVE(kg))
183 #endif
184 
185 /*
186  * Cpu percentage computation macros and defines.
187  *
188  * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
189  * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
190  */
191 
192 #define	SCHED_CPU_TIME	10
193 #define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)
194 
195 /*
196  * kseq - per processor runqs and statistics.
197  */
198 
199 #define	KSEQ_NCLASS	(PRI_IDLE + 1)	/* Number of run classes. */
200 
201 struct kseq {
202 	struct runq	ksq_idle;		/* Queue of IDLE threads. */
203 	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
204 	struct runq	*ksq_next;		/* Next timeshare queue. */
205 	struct runq	*ksq_curr;		/* Current queue. */
206 	int		ksq_loads[KSEQ_NCLASS];	/* Load for each class */
207 	int		ksq_load;		/* Aggregate load. */
208 	short		ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
209 	short		ksq_nicemin;		/* Least nice. */
210 #ifdef SMP
211 	unsigned int	ksq_rslices;	/* Slices on run queue */
212 #endif
213 };
214 
215 /*
216  * One kse queue per processor.
217  */
218 #ifdef SMP
219 struct kseq	kseq_cpu[MAXCPU];
220 #define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
221 #define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
222 #else
223 struct kseq	kseq_cpu;
224 #define	KSEQ_SELF()	(&kseq_cpu)
225 #define	KSEQ_CPU(x)	(&kseq_cpu)
226 #endif
227 
228 static void sched_slice(struct kse *ke);
229 static void sched_priority(struct ksegrp *kg);
230 static int sched_interact_score(struct ksegrp *kg);
231 void sched_pctcpu_update(struct kse *ke);
232 int sched_pickcpu(void);
233 
234 /* Operations on per processor queues */
235 static struct kse * kseq_choose(struct kseq *kseq);
236 static void kseq_setup(struct kseq *kseq);
237 static void kseq_add(struct kseq *kseq, struct kse *ke);
238 static void kseq_rem(struct kseq *kseq, struct kse *ke);
239 static void kseq_nice_add(struct kseq *kseq, int nice);
240 static void kseq_nice_rem(struct kseq *kseq, int nice);
241 void kseq_print(struct kseq *kseq);
242 #ifdef SMP
243 struct kseq * kseq_load_highest(void);
244 #endif
245 
246 void
247 kseq_print(struct kseq *kseq)
248 {
249 	int i;
250 
251 	if (kseq == NULL)
252 		kseq = KSEQ_SELF();
253 
254 	printf("kseq:\n");
255 	printf("\tload:           %d\n", kseq->ksq_load);
256 	printf("\tload ITHD:      %d\n", kseq->ksq_loads[PRI_ITHD]);
257 	printf("\tload REALTIME:  %d\n", kseq->ksq_loads[PRI_REALTIME]);
258 	printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
259 	printf("\tload IDLE:      %d\n", kseq->ksq_loads[PRI_IDLE]);
260 	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
261 	printf("\tnice counts:\n");
262 	for (i = 0; i < PRIO_TOTAL + 1; i++)
263 		if (kseq->ksq_nice[i])
264 			printf("\t\t%d = %d\n",
265 			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
266 }
267 
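/*
 * kseq_add() and kseq_rem() account for a kse entering or leaving this
 * cpu's run queues: they update the per-class and aggregate load counts,
 * the nice distribution used by sched_slice() for TIMESHARE ksegrps, and,
 * on SMP, the count of queued slices.
 */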
268 static void
269 kseq_add(struct kseq *kseq, struct kse *ke)
270 {
271 	kseq->ksq_loads[ke->ke_ksegrp->kg_pri_class]++;
272 	kseq->ksq_load++;
273 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
274 		CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
275 		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
276 		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
277 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
278 		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
279 #ifdef SMP
280 	kseq->ksq_rslices += ke->ke_slice;
281 #endif
282 }
283 
284 static void
285 kseq_rem(struct kseq *kseq, struct kse *ke)
286 {
287 	kseq->ksq_loads[ke->ke_ksegrp->kg_pri_class]--;
288 	kseq->ksq_load--;
289 	ke->ke_runq = NULL;
290 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
291 		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
292 #ifdef SMP
293 	kseq->ksq_rslices -= ke->ke_slice;
294 #endif
295 }
296 
297 static void
298 kseq_nice_add(struct kseq *kseq, int nice)
299 {
300 	/* Normalize to zero. */
301 	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
302 	if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 0)
303 		kseq->ksq_nicemin = nice;
304 }
305 
306 static void
307 kseq_nice_rem(struct kseq *kseq, int nice)
308 {
309 	int n;
310 
311 	/* Normalize to zero. */
312 	n = nice + SCHED_PRI_NHALF;
313 	kseq->ksq_nice[n]--;
314 	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
315 
316 	/*
317 	 * If this wasn't the smallest nice value or there are more in
318 	 * this bucket we can just return.  Otherwise we have to recalculate
319 	 * the smallest nice.
320 	 */
321 	if (nice != kseq->ksq_nicemin ||
322 	    kseq->ksq_nice[n] != 0 ||
323 	    kseq->ksq_loads[PRI_TIMESHARE] == 0)
324 		return;
325 
326 	for (; n < SCHED_PRI_NRESV + 1; n++)
327 		if (kseq->ksq_nice[n]) {
328 			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
329 			return;
330 		}
331 }
332 
333 #ifdef SMP
334 struct kseq *
335 kseq_load_highest(void)
336 {
337 	struct kseq *kseq;
338 	int load;
339 	int cpu;
340 	int i;
341 
342 	cpu = 0;
343 	load = 0;
344 
345 	for (i = 0; i < mp_maxid; i++) {
346 		if (CPU_ABSENT(i))
347 			continue;
348 		kseq = KSEQ_CPU(i);
349 		if (kseq->ksq_load > load) {
350 			load = kseq->ksq_load;
351 			cpu = i;
352 		}
353 	}
354 	if (load)
355 		return (KSEQ_CPU(cpu));
356 
357 	return (NULL);
358 }
359 #endif
360 
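/*
 * Select the next kse to run from this kseq.  The current timeshare queue
 * is tried first; if it is empty it is swapped with the next queue and the
 * search is retried once.  A kse that comes off the queue with a zero
 * slice is given a fresh slice and requeued on the next queue.  If both
 * timeshare queues are empty the idle queue is used.
 */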
361 struct kse *
362 kseq_choose(struct kseq *kseq)
363 {
364 	struct kse *ke;
365 	struct runq *swap;
366 
367 	swap = NULL;
368 
369 	for (;;) {
370 		ke = runq_choose(kseq->ksq_curr);
371 		if (ke == NULL) {
372 			/*
373 			 * We already swapped once and didn't get anywhere.
374 			 */
375 			if (swap)
376 				break;
377 			swap = kseq->ksq_curr;
378 			kseq->ksq_curr = kseq->ksq_next;
379 			kseq->ksq_next = swap;
380 			continue;
381 		}
382 		/*
383 		 * If we encounter a slice of 0 the kse is in a
384 		 * TIMESHARE kse group and its nice was too far out
385 		 * of the range that receives slices.
386 		 */
387 		if (ke->ke_slice == 0) {
388 			runq_remove(ke->ke_runq, ke);
389 			sched_slice(ke);
390 			ke->ke_runq = kseq->ksq_next;
391 			runq_add(ke->ke_runq, ke);
392 			continue;
393 		}
394 		return (ke);
395 	}
396 
397 	return (runq_choose(&kseq->ksq_idle));
398 }
399 
400 static void
401 kseq_setup(struct kseq *kseq)
402 {
403 	runq_init(&kseq->ksq_timeshare[0]);
404 	runq_init(&kseq->ksq_timeshare[1]);
405 	runq_init(&kseq->ksq_idle);
406 
407 	kseq->ksq_curr = &kseq->ksq_timeshare[0];
408 	kseq->ksq_next = &kseq->ksq_timeshare[1];
409 
410 	kseq->ksq_loads[PRI_ITHD] = 0;
411 	kseq->ksq_loads[PRI_REALTIME] = 0;
412 	kseq->ksq_loads[PRI_TIMESHARE] = 0;
413 	kseq->ksq_loads[PRI_IDLE] = 0;
414 #ifdef SMP
415 	kseq->ksq_rslices = 0;
416 #endif
417 }
418 
419 static void
420 sched_setup(void *dummy)
421 {
422 	int i;
423 
424 	slice_min = (hz/100);
425 	slice_max = (hz/10);
426 
427 	mtx_lock_spin(&sched_lock);
428 	/* init kseqs */
429 	for (i = 0; i < MAXCPU; i++)
430 		kseq_setup(KSEQ_CPU(i));
431 
432 	kseq_add(KSEQ_SELF(), &kse0);
433 	mtx_unlock_spin(&sched_lock);
434 }
435 
436 /*
437  * Scale the scheduling priority according to the "interactivity" of this
438  * process.
439  */
440 static void
441 sched_priority(struct ksegrp *kg)
442 {
443 	int pri;
444 
445 	if (kg->kg_pri_class != PRI_TIMESHARE)
446 		return;
447 
448 	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
449 	pri += SCHED_PRI_BASE;
450 	pri += kg->kg_nice;
451 
452 	if (pri > PRI_MAX_TIMESHARE)
453 		pri = PRI_MAX_TIMESHARE;
454 	else if (pri < PRI_MIN_TIMESHARE)
455 		pri = PRI_MIN_TIMESHARE;
456 
457 	kg->kg_user_pri = pri;
458 
459 	return;
460 }
461 
462 /*
463  * Calculate a time slice based on the properties of the kseg and the runq
464  * that we're on.  This is only for PRI_TIMESHARE ksegrps.
465  */
466 static void
467 sched_slice(struct kse *ke)
468 {
469 	struct kseq *kseq;
470 	struct ksegrp *kg;
471 
472 	kg = ke->ke_ksegrp;
473 	kseq = KSEQ_CPU(ke->ke_cpu);
474 
475 	/*
476 	 * Rationale:
477 	 * KSEs in interactive ksegs get the minimum slice so that we
478 	 * quickly notice if it abuses its advantage.
479 	 *
480 	 * KSEs in non-interactive ksegs are assigned a slice that is
481 	 * based on the kseg's nice value relative to the least nice kseg
482 	 * on the run queue for this cpu.
483 	 *
484 	 * If the KSE is less nice than all others it gets the maximum
485 	 * slice and other KSEs will adjust their slice relative to
486 	 * this when they first expire.
487 	 *
488 	 * There is a 20 point window that starts relative to the least
489 	 * nice kse on the run queue.  Slice size is determined by the
490 	 * kse's distance from the least nice ksegrp.
491 	 *
492 	 * If you are outside of the window you will get no slice and
493 	 * you will be reevaluated each time you are selected on the
494 	 * run queue.
495 	 *
496 	 */
497 
498 	if (!SCHED_INTERACTIVE(kg)) {
499 		int nice;
500 
501 		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
502 		if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
503 		    kg->kg_nice < kseq->ksq_nicemin)
504 			ke->ke_slice = SCHED_SLICE_MAX;
505 		else if (nice <= SCHED_PRI_NTHRESH)
506 			ke->ke_slice = SCHED_SLICE_NICE(nice);
507 		else
508 			ke->ke_slice = 0;
509 	} else
510 		ke->ke_slice = SCHED_SLICE_MIN;
511 
512 	CTR6(KTR_ULE,
513 	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
514 	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
515 	    kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));
516 
517 	/*
518 	 * Check to see if we need to scale back the slp and run time
519 	 * in the kg.  This will cause us to forget old interactivity
520 	 * while maintaining the current ratio.
521 	 */
522 	CTR4(KTR_ULE, "Slp vs Run %p (Slp %d, Run %d, Score %d)",
523 	    ke, kg->kg_slptime >> 10, kg->kg_runtime >> 10,
524 	    sched_interact_score(kg));
525 
526 	if ((kg->kg_runtime + kg->kg_slptime) >  SCHED_SLP_RUN_MAX) {
527 		kg->kg_runtime /= SCHED_SLP_RUN_THROTTLE;
528 		kg->kg_slptime /= SCHED_SLP_RUN_THROTTLE;
529 	}
530 	CTR4(KTR_ULE, "Slp vs Run(2) %p (Slp %d, Run %d, Score %d)",
531 	    ke, kg->kg_slptime >> 10, kg->kg_runtime >> 10,
532 	    sched_interact_score(kg));
533 
534 	return;
535 }
536 
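/*
 * Compute the interactivity score for a ksegrp.  Ksegrps that sleep at
 * least as much as they run score no higher than SCHED_INTERACT_HALF and
 * ksegrps that run more than they sleep score no lower than it.  Within
 * each half the score scales with the ratio of the smaller of the two
 * times to the larger.  Lower scores are considered more interactive.
 */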
537 static int
538 sched_interact_score(struct ksegrp *kg)
539 {
540 	int big;
541 	int small;
542 	int base;
543 
544 	if (kg->kg_runtime > kg->kg_slptime) {
545 		big = kg->kg_runtime;
546 		small = kg->kg_slptime;
547 		base = SCHED_INTERACT_HALF;
548 	} else {
549 		big = kg->kg_slptime;
550 		small = kg->kg_runtime;
551 		base = 0;
552 	}
553 
554 	big /= SCHED_INTERACT_HALF;
555 	if (big != 0)
556 		small /= big;
557 	else
558 		small = 0;
559 
560 	small += base;
561 	/* XXX Factor in nice */
562 	return (small);
563 }
564 
565 /*
566  * This is only somewhat accurate since given many processes of the same
567  * priority they will switch when their slices run out, which will be
568  * at most SCHED_SLICE_MAX.
569  */
570 int
571 sched_rr_interval(void)
572 {
573 	return (SCHED_SLICE_MAX);
574 }
575 
576 void
577 sched_pctcpu_update(struct kse *ke)
578 {
579 	/*
580 	 * Adjust counters and watermark for pctcpu calc.
581 	 *
582 	 * Shift the tick count out so that the divide doesn't round away
583 	 * our results.
584 	 */
585 	ke->ke_ticks <<= 10;
586 	ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) *
587 		    SCHED_CPU_TICKS;
588 	ke->ke_ticks >>= 10;
589 	ke->ke_ltick = ticks;
590 	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
591 }
592 
593 #ifdef SMP
594 /* XXX Should be changed to kseq_load_lowest() */
595 int
596 sched_pickcpu(void)
597 {
598 	struct kseq *kseq;
599 	int load;
600 	int cpu;
601 	int i;
602 
603 	if (!smp_started)
604 		return (0);
605 
606 	load = -1;
607 	cpu = 0;
608 
609 	for (i = 0; i < mp_maxid; i++) {
610 		if (CPU_ABSENT(i))
611 			continue;
612 		kseq = KSEQ_CPU(i);
613 		if (load == -1 || kseq->ksq_load < load) {
614 			cpu = i;
615 			load = kseq->ksq_load;
616 		}
617 	}
618 
619 	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
620 	return (cpu);
621 }
622 #else
623 int
624 sched_pickcpu(void)
625 {
626 	return (0);
627 }
628 #endif
629 
630 void
631 sched_prio(struct thread *td, u_char prio)
632 {
633 	struct kse *ke;
634 	struct runq *rq;
635 
636 	mtx_assert(&sched_lock, MA_OWNED);
637 	ke = td->td_kse;
638 	td->td_priority = prio;
639 
640 	if (TD_ON_RUNQ(td)) {
641 		rq = ke->ke_runq;
642 
643 		runq_remove(rq, ke);
644 		runq_add(rq, ke);
645 	}
646 }
647 
648 void
649 sched_switchout(struct thread *td)
650 {
651 	struct kse *ke;
652 
653 	mtx_assert(&sched_lock, MA_OWNED);
654 
655 	ke = td->td_kse;
656 
657 	td->td_last_kse = ke;
658 	td->td_lastcpu = td->td_oncpu;
659 	td->td_oncpu = NOCPU;
660 	td->td_flags &= ~TDF_NEEDRESCHED;
661 
662 	if (TD_IS_RUNNING(td)) {
663 		runq_add(ke->ke_runq, ke);
664 		/* setrunqueue(td); */
665 		return;
666 	}
667 	if (ke->ke_runq)
668 		kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
669 	/*
670 	 * We will not be on the run queue. So we must be
671 	 * sleeping or similar.
672 	 */
673 	if (td->td_proc->p_flag & P_THREADED)
674 		kse_reassign(ke);
675 }
676 
677 void
678 sched_switchin(struct thread *td)
679 {
680 	/* struct kse *ke = td->td_kse; */
681 	mtx_assert(&sched_lock, MA_OWNED);
682 
683 	td->td_oncpu = PCPU_GET(cpuid);
684 
685 	if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
686 	    td->td_priority != td->td_ksegrp->kg_user_pri)
687 		curthread->td_flags |= TDF_NEEDRESCHED;
688 }
689 
690 void
691 sched_nice(struct ksegrp *kg, int nice)
692 {
693 	struct kse *ke;
694 	struct thread *td;
695 	struct kseq *kseq;
696 
697 	/*
698 	 * We need to adjust the nice counts for running KSEs.
699 	 */
700 	if (kg->kg_pri_class == PRI_TIMESHARE)
701 		FOREACH_KSE_IN_GROUP(kg, ke) {
702 			if (ke->ke_state != KES_ONRUNQ &&
703 			    ke->ke_state != KES_THREAD)
704 				continue;
705 			kseq = KSEQ_CPU(ke->ke_cpu);
706 			kseq_nice_rem(kseq, kg->kg_nice);
707 			kseq_nice_add(kseq, nice);
708 		}
709 	kg->kg_nice = nice;
710 	sched_priority(kg);
711 	FOREACH_THREAD_IN_GROUP(kg, td)
712 		td->td_flags |= TDF_NEEDRESCHED;
713 }
714 
715 void
716 sched_sleep(struct thread *td, u_char prio)
717 {
718 	mtx_assert(&sched_lock, MA_OWNED);
719 
720 	td->td_slptime = ticks;
721 	td->td_priority = prio;
722 
723 	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
724 	    td->td_kse, td->td_slptime);
725 }
726 
727 void
728 sched_wakeup(struct thread *td)
729 {
730 	mtx_assert(&sched_lock, MA_OWNED);
731 
732 	/*
733 	 * Let the kseg know how long we slept for.  This is because process
734 	 * interactivity behavior is modeled in the kseg.
735 	 */
736 	if (td->td_slptime) {
737 		struct ksegrp *kg;
738 		int hzticks;
739 
740 		kg = td->td_ksegrp;
741 		hzticks = ticks - td->td_slptime;
742 		kg->kg_slptime += hzticks << 10;
743 		sched_priority(kg);
744 		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
745 		    td->td_kse, hzticks);
746 		td->td_slptime = 0;
747 	}
748 	setrunqueue(td);
749 	if (td->td_priority < curthread->td_priority)
750 		curthread->td_flags |= TDF_NEEDRESCHED;
751 }
752 
753 /*
754  * Penalize the parent for creating a new child and initialize the child's
755  * priority.
756  */
757 void
758 sched_fork(struct proc *p, struct proc *p1)
759 {
760 
761 	mtx_assert(&sched_lock, MA_OWNED);
762 
763 	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
764 	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
765 	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
766 }
767 
768 void
769 sched_fork_kse(struct kse *ke, struct kse *child)
770 {
771 	child->ke_slice = ke->ke_slice;
772 	child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */
773 	child->ke_runq = NULL;
774 
775 	/*
776 	 * Claim that we've been running for one second for statistical
777 	 * purposes.
778 	 */
779 	child->ke_ticks = 0;
780 	child->ke_ltick = ticks;
781 	child->ke_ftick = ticks - hz;
782 }
783 
784 void
785 sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
786 {
787 	/* XXX Need something better here */
788 	if (kg->kg_slptime > kg->kg_runtime) {
789 		child->kg_slptime = SCHED_DYN_RANGE;
790 		child->kg_runtime = kg->kg_slptime / SCHED_DYN_RANGE;
791 	} else {
792 		child->kg_runtime = SCHED_DYN_RANGE;
793 		child->kg_slptime = kg->kg_runtime / SCHED_DYN_RANGE;
794 	}
795 
796 	child->kg_user_pri = kg->kg_user_pri;
797 	child->kg_nice = kg->kg_nice;
798 }
799 
800 void
801 sched_fork_thread(struct thread *td, struct thread *child)
802 {
803 }
804 
805 void
806 sched_class(struct ksegrp *kg, int class)
807 {
808 	struct kseq *kseq;
809 	struct kse *ke;
810 
811 	if (kg->kg_pri_class == class)
812 		return;
813 
814 	FOREACH_KSE_IN_GROUP(kg, ke) {
815 		if (ke->ke_state != KES_ONRUNQ &&
816 		    ke->ke_state != KES_THREAD)
817 			continue;
818 		kseq = KSEQ_CPU(ke->ke_cpu);
819 
820 		kseq->ksq_loads[kg->kg_pri_class]--;
821 		kseq->ksq_loads[class]++;
822 
823 		if (kg->kg_pri_class == PRI_TIMESHARE)
824 			kseq_nice_rem(kseq, kg->kg_nice);
825 		else if (class == PRI_TIMESHARE)
826 			kseq_nice_add(kseq, kg->kg_nice);
827 	}
828 
829 	kg->kg_pri_class = class;
830 }
831 
832 /*
833  * Return some of the child's priority and interactivity to the parent.
834  */
835 void
836 sched_exit(struct proc *p, struct proc *child)
837 {
838 	struct ksegrp *kg;
839 	struct kse *ke;
840 
841 	/* XXX Need something better here */
842 	mtx_assert(&sched_lock, MA_OWNED);
843 	kg = FIRST_KSEGRP_IN_PROC(child);
844 	ke = FIRST_KSE_IN_KSEGRP(kg);
845 	kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
846 }
847 
848 void
849 sched_clock(struct kse *ke)
850 {
851 	struct kseq *kseq;
852 	struct ksegrp *kg;
853 	struct thread *td;
854 #if 0
855 	struct kse *nke;
856 #endif
857 
858 	/*
859 	 * sched_setup() apparently happens prior to stathz being set.  We
860 	 * need to resolve the timers earlier in the boot so we can avoid
861 	 * calculating this here.
862 	 */
863 	if (realstathz == 0) {
864 		realstathz = stathz ? stathz : hz;
865 		tickincr = hz / realstathz;
866 		/*
867 		 * XXX This does not work for values of stathz that are much
868 		 * larger than hz.
869 		 */
870 		if (tickincr == 0)
871 			tickincr = 1;
872 	}
873 
874 	td = ke->ke_thread;
875 	kg = ke->ke_ksegrp;
876 
877 	mtx_assert(&sched_lock, MA_OWNED);
878 	KASSERT((td != NULL), ("schedclock: null thread pointer"));
879 
880 	/* Adjust ticks for pctcpu */
881 	ke->ke_ticks++;
882 	ke->ke_ltick = ticks;
883 
884 	/* Go up to one second beyond our max and then trim back down */
885 	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
886 		sched_pctcpu_update(ke);
887 
888 	if (td->td_kse->ke_flags & KEF_IDLEKSE)
889 		return;
890 
891 	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
892 	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
893 
894 	/*
895 	 * We only do slice accounting for TIMESHARE ksegrps.
896 	 */
897 	if (kg->kg_pri_class != PRI_TIMESHARE)
898 		return;
899 	/*
900 	 * Check for a higher priority task on the run queue.  This can happen
901 	 * on SMP if another processor woke up a process on our runq.
902 	 */
903 	kseq = KSEQ_SELF();
904 #if 0
905 	if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq)) != NULL) {
906 		if (sched_strict &&
907 		    nke->ke_thread->td_priority < td->td_priority)
908 			td->td_flags |= TDF_NEEDRESCHED;
909 		else if (nke->ke_thread->td_priority <
910 		    td->td_priority SCHED_PRIO_SLOP)
911 
912 		if (nke->ke_thread->td_priority < td->td_priority)
913 			td->td_flags |= TDF_NEEDRESCHED;
914 	}
915 #endif
916 	/*
917 	 * We used a tick; charge it to the ksegrp so that we can compute our
918 	 * interactivity.
919 	 */
920 	kg->kg_runtime += tickincr << 10;
921 
922 	/*
923 	 * We used up one time slice.
924 	 */
925 	ke->ke_slice--;
926 #ifdef SMP
927 	kseq->ksq_rslices--;
928 #endif
929 
930 	if (ke->ke_slice > 0)
931 		return;
932 	/*
933 	 * We're out of time, recompute priorities and requeue.
934 	 */
935 	kseq_rem(kseq, ke);
936 	sched_priority(kg);
937 	sched_slice(ke);
938 	if (SCHED_CURR(kg, ke))
939 		ke->ke_runq = kseq->ksq_curr;
940 	else
941 		ke->ke_runq = kseq->ksq_next;
942 	kseq_add(kseq, ke);
943 	td->td_flags |= TDF_NEEDRESCHED;
944 }
945 
946 int
947 sched_runnable(void)
948 {
949 	struct kseq *kseq;
950 
951 	kseq = KSEQ_SELF();
952 
953 	if (kseq->ksq_load)
954 		return (1);
955 #ifdef SMP
956 	/*
957 	 * For SMP we may steal other processors' KSEs.  Just search until we
958 	 * verify that at least one other cpu has a runnable task.
959 	 */
960 	if (smp_started) {
961 		int i;
962 
963 		for (i = 0; i < mp_maxid; i++) {
964 			if (CPU_ABSENT(i))
965 				continue;
966 			kseq = KSEQ_CPU(i);
967 			if (kseq->ksq_load)
968 				return (1);
969 		}
970 	}
971 #endif
972 	return (0);
973 }
974 
975 void
976 sched_userret(struct thread *td)
977 {
978 	struct ksegrp *kg;
979 
980 	kg = td->td_ksegrp;
981 
982 	if (td->td_priority != kg->kg_user_pri) {
983 		mtx_lock_spin(&sched_lock);
984 		td->td_priority = kg->kg_user_pri;
985 		mtx_unlock_spin(&sched_lock);
986 	}
987 }
988 
989 struct kse *
990 sched_choose(void)
991 {
992 	struct kseq *kseq;
993 	struct kse *ke;
994 
995 #ifdef SMP
996 retry:
997 #endif
998 	kseq = KSEQ_SELF();
999 	ke = kseq_choose(kseq);
1000 	if (ke) {
1001 		runq_remove(ke->ke_runq, ke);
1002 		ke->ke_state = KES_THREAD;
1003 
1004 		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
1005 			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
1006 			    ke, ke->ke_runq, ke->ke_slice,
1007 			    ke->ke_thread->td_priority);
1008 		}
1009 		return (ke);
1010 	}
1011 
1012 #ifdef SMP
1013 	if (smp_started) {
1014 		/*
1015 		 * Find the cpu with the highest load and steal one proc.
1016 		 */
1017 		if ((kseq = kseq_load_highest()) == NULL)
1018 			return (NULL);
1019 
1020 		/*
1021 		 * Remove this kse from this kseq and runq and then requeue
1022 		 * on the current processor.  Then we will dequeue it
1023 		 * normally above.
1024 		 */
1025 		ke = kseq_choose(kseq);
1026 		runq_remove(ke->ke_runq, ke);
1027 		ke->ke_state = KES_THREAD;
1028 		kseq_rem(kseq, ke);
1029 
1030 		ke->ke_cpu = PCPU_GET(cpuid);
1031 		sched_add(ke);
1032 		goto retry;
1033 	}
1034 #endif
1035 
1036 	return (NULL);
1037 }
1038 
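/*
 * Place a kse on a run queue appropriate to its priority class and account
 * for it in the kseq's load: interrupt and realtime threads always go on
 * the current queue of this cpu's kseq, timeshare threads go on the current
 * or next queue depending on their interactivity, and idle class threads
 * normally go on the idle queue unless priority propagation has raised
 * their priority into the timeshare range.
 */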
1039 void
1040 sched_add(struct kse *ke)
1041 {
1042 	struct kseq *kseq;
1043 	struct ksegrp *kg;
1044 
1045 	mtx_assert(&sched_lock, MA_OWNED);
1046 	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
1047 	KASSERT((ke->ke_thread->td_kse != NULL),
1048 	    ("sched_add: No KSE on thread"));
1049 	KASSERT(ke->ke_state != KES_ONRUNQ,
1050 	    ("sched_add: kse %p (%s) already in run queue", ke,
1051 	    ke->ke_proc->p_comm));
1052 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1053 	    ("sched_add: process swapped out"));
1054 
1055 	kg = ke->ke_ksegrp;
1056 
1057 	if (ke->ke_runq)
1058 		Debugger("hrm?");
1059 
1060 	switch (kg->kg_pri_class) {
1061 	case PRI_ITHD:
1062 	case PRI_REALTIME:
1063 		kseq = KSEQ_SELF();
1064 		if (ke->ke_runq == NULL)
1065 			kseq_add(kseq, ke);
1066 		ke->ke_runq = kseq->ksq_curr;
1067 		ke->ke_slice = SCHED_SLICE_MAX;
1068 		break;
1069 	case PRI_TIMESHARE:
1070 		kseq = KSEQ_CPU(ke->ke_cpu);
1071 		if (ke->ke_runq == NULL) {
1072 			if (SCHED_CURR(kg, ke))
1073 				ke->ke_runq = kseq->ksq_curr;
1074 			else
1075 				ke->ke_runq = kseq->ksq_next;
1076 			kseq_add(kseq, ke);
1077 		}
1078 		break;
1079 	case PRI_IDLE:
1080 		kseq = KSEQ_CPU(ke->ke_cpu);
1081 
1082 		if (ke->ke_runq == NULL)
1083 			kseq_add(kseq, ke);
1084 		/*
1085 		 * This is for priority propagation.
1086 		 */
1087 		if (ke->ke_thread->td_priority < PRI_MAX_TIMESHARE)
1088 			ke->ke_runq = kseq->ksq_curr;
1089 		else
1090 			ke->ke_runq = &kseq->ksq_idle;
1091 		ke->ke_slice = SCHED_SLICE_MIN;
1092 		break;
1093 	default:
1094 		panic("Unknown pri class.\n");
1095 		break;
1096 	}
1097 
1098 	ke->ke_ksegrp->kg_runq_kses++;
1099 	ke->ke_state = KES_ONRUNQ;
1100 
1101 	runq_add(ke->ke_runq, ke);
1102 }
1103 
1104 void
1105 sched_rem(struct kse *ke)
1106 {
1107 	struct kseq *kseq;
1108 
1109 	mtx_assert(&sched_lock, MA_OWNED);
1110 	/* KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); */
1111 	panic("WTF\n");
1112 
1113 	ke->ke_state = KES_THREAD;
1114 	ke->ke_ksegrp->kg_runq_kses--;
1115 	kseq = KSEQ_CPU(ke->ke_cpu);
1116 	runq_remove(ke->ke_runq, ke);
1117 	kseq_rem(kseq, ke);
1118 }
1119 
1120 fixpt_t
1121 sched_pctcpu(struct kse *ke)
1122 {
1123 	fixpt_t pctcpu;
1124 
1125 	pctcpu = 0;
1126 
1127 	if (ke->ke_ticks) {
1128 		int rtick;
1129 
1130 		/* Update to account for time potentially spent sleeping */
1131 		ke->ke_ltick = ticks;
1132 		sched_pctcpu_update(ke);
1133 
1134 		/* How many rticks per second? */
1135 		rtick = ke->ke_ticks / SCHED_CPU_TIME;
1136 		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1137 	}
1138 
1139 	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1140 
1141 	return (pctcpu);
1142 }
1143 
1144 int
1145 sched_sizeof_kse(void)
1146 {
1147 	return (sizeof(struct kse) + sizeof(struct ke_sched));
1148 }
1149 
1150 int
1151 sched_sizeof_ksegrp(void)
1152 {
1153 	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1154 }
1155 
1156 int
1157 sched_sizeof_proc(void)
1158 {
1159 	return (sizeof(struct proc));
1160 }
1161 
1162 int
1163 sched_sizeof_thread(void)
1164 {
1165 	return (sizeof(struct thread) + sizeof(struct td_sched));
1166 }
1167