xref: /freebsd/sys/kern/sched_ule.c (revision 2056d0a168da61194845626fcbfc1722893a2d2c)
1 /*-
2  * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice unmodified, this list of conditions, and the following
10  *    disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
32 #include <sys/ktr.h>
33 #include <sys/lock.h>
34 #include <sys/mutex.h>
35 #include <sys/proc.h>
36 #include <sys/resource.h>
37 #include <sys/sched.h>
38 #include <sys/smp.h>
39 #include <sys/sx.h>
40 #include <sys/sysctl.h>
41 #include <sys/sysproto.h>
42 #include <sys/vmmeter.h>
43 #ifdef DDB
44 #include <ddb/ddb.h>
45 #endif
46 #ifdef KTRACE
47 #include <sys/uio.h>
48 #include <sys/ktrace.h>
49 #endif
50 
51 #include <machine/cpu.h>
52 
53 #define KTR_ULE         KTR_NFS
54 
55 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
56 /* XXX This is bogus compatibility crap for ps */
57 static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
58 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
59 
60 static void sched_setup(void *dummy);
61 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
62 
63 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");
64 
65 static int sched_strict;
66 SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");
67 
68 static int slice_min = 1;
69 SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
70 
71 static int slice_max = 2;
72 SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
73 
74 int realstathz;
75 int tickincr = 1;
76 
77 /*
78  * These data structures are allocated within their parent data structure
79  * but are scheduler specific.
80  */
81 
82 struct ke_sched {
83 	int		ske_slice;
84 	struct runq	*ske_runq;
85 	/* The following variables are only used for pctcpu calculation */
86 	int		ske_ltick;	/* Last tick that we were running on */
87 	int		ske_ftick;	/* First tick that we were running on */
88 	int		ske_ticks;	/* Tick count */
89 	/* CPU that we have affinity for. */
90 	u_char		ske_cpu;
91 };
92 #define	ke_slice	ke_sched->ske_slice
93 #define	ke_runq		ke_sched->ske_runq
94 #define	ke_ltick	ke_sched->ske_ltick
95 #define	ke_ftick	ke_sched->ske_ftick
96 #define	ke_ticks	ke_sched->ske_ticks
97 #define	ke_cpu		ke_sched->ske_cpu
98 
99 struct kg_sched {
100 	int	skg_slptime;		/* Number of ticks we vol. slept */
101 	int	skg_runtime;		/* Number of ticks we were running */
102 };
103 #define	kg_slptime	kg_sched->skg_slptime
104 #define	kg_runtime	kg_sched->skg_runtime
105 
106 struct td_sched {
107 	int	std_slptime;
108 };
109 #define	td_slptime	td_sched->std_slptime
110 
111 struct td_sched td_sched;
112 struct ke_sched ke_sched;
113 struct kg_sched kg_sched;
114 
115 struct ke_sched *kse0_sched = &ke_sched;
116 struct kg_sched *ksegrp0_sched = &kg_sched;
117 struct p_sched *proc0_sched = NULL;
118 struct td_sched *thread0_sched = &td_sched;
119 
120 /*
121  * This priority range has 20 priorities on either end that are reachable
122  * only through nice values.
123  *
124  * PRI_RANGE:	Total priority range for timeshare threads.
125  * PRI_NRESV:	Reserved priorities for nice.
126  * PRI_BASE:	The start of the dynamic range.
127  * DYN_RANGE:	Number of priorities that are available in the dynamic
128  *		priority range.
129  */
130 #define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
131 #define	SCHED_PRI_NRESV		PRIO_TOTAL
132 #define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
133 #define	SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)
134 #define	SCHED_PRI_BASE		((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE)
135 #define	SCHED_DYN_RANGE		(SCHED_PRI_RANGE - SCHED_PRI_NRESV)
136 #define	SCHED_PRI_INTERACT(score)					\
137     ((score) * SCHED_DYN_RANGE / SCHED_INTERACT_RANGE)
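/*
 * For example (illustrative): an interactivity score of 0 adds nothing to
 * SCHED_PRI_BASE, while a score of SCHED_INTERACT_RANGE adds the full
 * SCHED_DYN_RANGE, yielding a numerically higher (less favorable) priority.
 * Nice values are added on top of this in sched_priority().
 */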
138 
139 /*
140  * These determine the interactivity of a process.
141  *
142  * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
143  *		before throttling back.
144  * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time.
145  * INTERACT_RANGE:	Range of interactivity values.  Smaller is better.
146  * INTERACT_HALF:	Convenience define, half of the interactivity range.
147  * INTERACT_THRESH:	Threshold for placement on the current runq.
148  */
149 #define	SCHED_SLP_RUN_MAX	((hz / 10) << 10)
150 #define	SCHED_SLP_RUN_THROTTLE	(10)
151 #define	SCHED_INTERACT_RANGE	(100)
152 #define	SCHED_INTERACT_HALF	(SCHED_INTERACT_RANGE / 2)
153 #define	SCHED_INTERACT_THRESH	(10)
154 
155 /*
156  * These parameters and macros determine the size of the time slice that is
157  * granted to each thread.
158  *
159  * SLICE_MIN:	Minimum time slice granted, in units of ticks.
160  * SLICE_MAX:	Maximum time slice granted.
161  * SLICE_RANGE:	Range of available time slices scaled by hz.
162  * SLICE_SCALE:	The number of slices granted per unit of val in the range [0, max].
163  * SLICE_NICE:  Determines the amount of slice granted for a scaled nice value.
164  */
165 #define	SCHED_SLICE_MIN			(slice_min)
166 #define	SCHED_SLICE_MAX			(slice_max)
167 #define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
168 #define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
169 #define	SCHED_SLICE_NICE(nice)						\
170     (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))
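/*
 * For example (illustrative, assuming hz = 100 so that sched_setup() picks
 * slice_min = 1 and slice_max = 10 ticks): a kse whose nice value equals the
 * queue's ksq_nicemin has a scaled nice of 0 and receives SCHED_SLICE_MAX;
 * each point of nice beyond that costs roughly SCHED_SLICE_RANGE /
 * SCHED_PRI_NTHRESH ticks, and a kse more than SCHED_PRI_NTHRESH points away
 * from ksq_nicemin receives no slice at all (see sched_slice()).
 */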
171 
172 /*
173  * This macro determines whether the kse belongs on the current or
174  * next run queue.
175  *
176  * XXX nice value should affect how interactive a kg is.
177  */
178 #define	SCHED_INTERACTIVE(kg)						\
179     (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
180 #define	SCHED_CURR(kg, ke)						\
181     (ke->ke_thread->td_priority < PRI_MIN_TIMESHARE || SCHED_INTERACTIVE(kg))
182 
183 /*
184  * Cpu percentage computation macros and defines.
185  *
186  * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
187  * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
188  */
189 
190 #define	SCHED_CPU_TIME	10
191 #define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)
192 
193 /*
194  * kseq - per processor runqs and statistics.
195  */
196 
197 #define	KSEQ_NCLASS	(PRI_IDLE + 1)	/* Number of run classes. */
198 
199 struct kseq {
200 	struct runq	ksq_idle;		/* Queue of IDLE threads. */
201 	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
202 	struct runq	*ksq_next;		/* Next timeshare queue. */
203 	struct runq	*ksq_curr;		/* Current queue. */
204 	int		ksq_loads[KSEQ_NCLASS];	/* Load for each class */
205 	int		ksq_load;		/* Aggregate load. */
206 	short		ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
207 	short		ksq_nicemin;		/* Least nice. */
208 #ifdef SMP
209 	unsigned int	ksq_rslices;	/* Slices on run queue */
210 #endif
211 };
212 
213 /*
214  * One kse queue per processor.
215  */
216 #ifdef SMP
217 struct kseq	kseq_cpu[MAXCPU];
218 #define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
219 #define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
220 #else
221 struct kseq	kseq_cpu;
222 #define	KSEQ_SELF()	(&kseq_cpu)
223 #define	KSEQ_CPU(x)	(&kseq_cpu)
224 #endif
225 
226 static void sched_slice(struct kse *ke);
227 static void sched_priority(struct ksegrp *kg);
228 static int sched_interact_score(struct ksegrp *kg);
229 void sched_pctcpu_update(struct kse *ke);
230 int sched_pickcpu(void);
231 
232 /* Operations on per processor queues */
233 static struct kse * kseq_choose(struct kseq *kseq);
234 static void kseq_setup(struct kseq *kseq);
235 static void kseq_add(struct kseq *kseq, struct kse *ke);
236 static void kseq_rem(struct kseq *kseq, struct kse *ke);
237 static void kseq_nice_add(struct kseq *kseq, int nice);
238 static void kseq_nice_rem(struct kseq *kseq, int nice);
239 void kseq_print(int cpu);
240 #ifdef SMP
241 struct kseq * kseq_load_highest(void);
242 #endif
243 
244 void
245 kseq_print(int cpu)
246 {
247 	struct kseq *kseq;
248 	int i;
249 
250 	kseq = KSEQ_CPU(cpu);
251 
252 	printf("kseq:\n");
253 	printf("\tload:           %d\n", kseq->ksq_load);
254 	printf("\tload ITHD:      %d\n", kseq->ksq_loads[PRI_ITHD]);
255 	printf("\tload REALTIME:  %d\n", kseq->ksq_loads[PRI_REALTIME]);
256 	printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
257 	printf("\tload IDLE:      %d\n", kseq->ksq_loads[PRI_IDLE]);
258 	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
259 	printf("\tnice counts:\n");
260 	for (i = 0; i < PRIO_TOTAL + 1; i++)
261 		if (kseq->ksq_nice[i])
262 			printf("\t\t%d = %d\n",
263 			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
264 }
265 
266 static void
267 kseq_add(struct kseq *kseq, struct kse *ke)
268 {
269 	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++;
270 	kseq->ksq_load++;
271 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
272 		CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
273 		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
274 		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
275 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
276 		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
277 #ifdef SMP
278 	kseq->ksq_rslices += ke->ke_slice;
279 #endif
280 }
281 
282 static void
283 kseq_rem(struct kseq *kseq, struct kse *ke)
284 {
285 	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--;
286 	kseq->ksq_load--;
287 	ke->ke_runq = NULL;
288 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
289 		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
290 #ifdef SMP
291 	kseq->ksq_rslices -= ke->ke_slice;
292 #endif
293 }
294 
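/*
 * The two helpers below keep ksq_nice[] (a histogram of the nice values of
 * the TIMESHARE kses on this kseq, indexed from zero by adding
 * SCHED_PRI_NHALF) and ksq_nicemin (the smallest nice value currently
 * queued) consistent.  ksq_nicemin is the reference point that
 * sched_slice() measures nice offsets against.
 */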
295 static void
296 kseq_nice_add(struct kseq *kseq, int nice)
297 {
298 	/* Normalize to zero. */
299 	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
300 	if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 0)
301 		kseq->ksq_nicemin = nice;
302 }
303 
304 static void
305 kseq_nice_rem(struct kseq *kseq, int nice)
306 {
307 	int n;
308 
309 	/* Normalize to zero. */
310 	n = nice + SCHED_PRI_NHALF;
311 	kseq->ksq_nice[n]--;
312 	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
313 
314 	/*
315 	 * If this wasn't the smallest nice value or there are more in
316 	 * this bucket we can just return.  Otherwise we have to recalculate
317 	 * the smallest nice.
318 	 */
319 	if (nice != kseq->ksq_nicemin ||
320 	    kseq->ksq_nice[n] != 0 ||
321 	    kseq->ksq_loads[PRI_TIMESHARE] == 0)
322 		return;
323 
324 	for (; n < SCHED_PRI_NRESV + 1; n++)
325 		if (kseq->ksq_nice[n]) {
326 			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
327 			return;
328 		}
329 }
330 
331 #ifdef SMP
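/*
 * Return the kseq of the most loaded cpu, or NULL if no cpu has more than
 * one runnable kse (stealing the only kse from a queue would not reduce the
 * imbalance).
 */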
332 struct kseq *
333 kseq_load_highest(void)
334 {
335 	struct kseq *kseq;
336 	int load;
337 	int cpu;
338 	int i;
339 
340 	cpu = 0;
341 	load = 0;
342 
343 	for (i = 0; i < mp_maxid; i++) {
344 		if (CPU_ABSENT(i))
345 			continue;
346 		kseq = KSEQ_CPU(i);
347 		if (kseq->ksq_load > load) {
348 			load = kseq->ksq_load;
349 			cpu = i;
350 		}
351 	}
352 	if (load > 1)
353 		return (KSEQ_CPU(cpu));
354 
355 	return (NULL);
356 }
357 #endif
358 
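/*
 * Pick the next kse to run from this kseq.  Timeshare kses are taken from
 * ksq_curr; once that queue is empty the curr and next queues are swapped so
 * that kses whose slices expired (and were requeued on ksq_next) get their
 * turn.  The idle queue is consulted only when both timeshare queues are
 * empty.  A kse that comes up with a zero slice is given a fresh slice and
 * pushed back onto the next queue.
 */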
359 struct kse *
360 kseq_choose(struct kseq *kseq)
361 {
362 	struct kse *ke;
363 	struct runq *swap;
364 
365 	swap = NULL;
366 
367 	for (;;) {
368 		ke = runq_choose(kseq->ksq_curr);
369 		if (ke == NULL) {
370 			/*
371 			 * We already swapped once and didn't get anywhere.
372 			 */
373 			if (swap)
374 				break;
375 			swap = kseq->ksq_curr;
376 			kseq->ksq_curr = kseq->ksq_next;
377 			kseq->ksq_next = swap;
378 			continue;
379 		}
380 		/*
381 			 * If we encounter a slice of 0, the kse is in a
382 			 * TIMESHARE kse group and its nice value was too far
383 			 * out of the range that receives slices.
384 		 */
385 		if (ke->ke_slice == 0) {
386 			runq_remove(ke->ke_runq, ke);
387 			sched_slice(ke);
388 			ke->ke_runq = kseq->ksq_next;
389 			runq_add(ke->ke_runq, ke);
390 			continue;
391 		}
392 		return (ke);
393 	}
394 
395 	return (runq_choose(&kseq->ksq_idle));
396 }
397 
398 static void
399 kseq_setup(struct kseq *kseq)
400 {
401 	runq_init(&kseq->ksq_timeshare[0]);
402 	runq_init(&kseq->ksq_timeshare[1]);
403 	runq_init(&kseq->ksq_idle);
404 
405 	kseq->ksq_curr = &kseq->ksq_timeshare[0];
406 	kseq->ksq_next = &kseq->ksq_timeshare[1];
407 
408 	kseq->ksq_loads[PRI_ITHD] = 0;
409 	kseq->ksq_loads[PRI_REALTIME] = 0;
410 	kseq->ksq_loads[PRI_TIMESHARE] = 0;
411 	kseq->ksq_loads[PRI_IDLE] = 0;
412 	kseq->ksq_load = 0;
413 #ifdef SMP
414 	kseq->ksq_rslices = 0;
415 #endif
416 }
417 
418 static void
419 sched_setup(void *dummy)
420 {
421 	int i;
422 
423 	slice_min = (hz/100);
424 	slice_max = (hz/10);
425 
426 	mtx_lock_spin(&sched_lock);
427 	/* init kseqs */
428 	for (i = 0; i < MAXCPU; i++)
429 		kseq_setup(KSEQ_CPU(i));
430 
431 	kseq_add(KSEQ_SELF(), &kse0);
432 	mtx_unlock_spin(&sched_lock);
433 }
434 
435 /*
436  * Scale the scheduling priority according to the "interactivity" of this
437  * process.
438  */
439 static void
440 sched_priority(struct ksegrp *kg)
441 {
442 	int pri;
443 
444 	if (kg->kg_pri_class != PRI_TIMESHARE)
445 		return;
446 
447 	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
448 	pri += SCHED_PRI_BASE;
449 	pri += kg->kg_nice;
450 
451 	if (pri > PRI_MAX_TIMESHARE)
452 		pri = PRI_MAX_TIMESHARE;
453 	else if (pri < PRI_MIN_TIMESHARE)
454 		pri = PRI_MIN_TIMESHARE;
455 
456 	kg->kg_user_pri = pri;
457 
458 	return;
459 }
460 
461 /*
462  * Calculate a time slice based on the properties of the kseg and the runq
463  * that we're on.  This is only for PRI_TIMESHARE ksegrps.
464  */
465 static void
466 sched_slice(struct kse *ke)
467 {
468 	struct kseq *kseq;
469 	struct ksegrp *kg;
470 
471 	kg = ke->ke_ksegrp;
472 	kseq = KSEQ_CPU(ke->ke_cpu);
473 
474 	/*
475 	 * Rationale:
476 	 * KSEs in interactive ksegs get the minimum slice so that we
477 	 * quickly notice if it abuses its advantage.
478 	 *
479 	 * KSEs in non-interactive ksegs are assigned a slice that is
480 	 * based on the kseg's nice value relative to the least nice kseg
481 	 * on the run queue for this cpu.
482 	 *
483 	 * If the KSE is less nice than all others it gets the maximum
484 	 * slice and other KSEs will adjust their slice relative to
485 	 * this when they first expire.
486 	 *
487 	 * There is a 20 point window that starts relative to the least
488 	 * nice kse on the run queue.  Slice size is determined by
489 	 * the kse's distance from the least nice ksegrp.
490 	 *
491 	 * If you are outside of the window you will get no slice and
492 	 * you will be reevaluated each time you are selected on the
493 	 * run queue.
494 	 *
495 	 */
496 
497 	if (!SCHED_INTERACTIVE(kg)) {
498 		int nice;
499 
500 		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
501 		if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
502 		    kg->kg_nice < kseq->ksq_nicemin)
503 			ke->ke_slice = SCHED_SLICE_MAX;
504 		else if (nice <= SCHED_PRI_NTHRESH)
505 			ke->ke_slice = SCHED_SLICE_NICE(nice);
506 		else
507 			ke->ke_slice = 0;
508 	} else
509 		ke->ke_slice = SCHED_SLICE_MIN;
510 
511 	CTR6(KTR_ULE,
512 	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
513 	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
514 	    kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));
515 
516 	/*
517 	 * Check to see if we need to scale back the slp and run time
518 	 * in the kg.  This will cause us to forget old interactivity
519 	 * while maintaining the current ratio.
520 	 */
521 	CTR4(KTR_ULE, "Slp vs Run %p (Slp %d, Run %d, Score %d)",
522 	    ke, kg->kg_slptime >> 10, kg->kg_runtime >> 10,
523 	    sched_interact_score(kg));
524 
525 	if ((kg->kg_runtime + kg->kg_slptime) >  SCHED_SLP_RUN_MAX) {
526 		kg->kg_runtime /= SCHED_SLP_RUN_THROTTLE;
527 		kg->kg_slptime /= SCHED_SLP_RUN_THROTTLE;
528 	}
529 	CTR4(KTR_ULE, "Slp vs Run(2) %p (Slp %d, Run %d, Score %d)",
530 	    ke, kg->kg_slptime >> 10, kg->kg_runtime >> 10,
531 	    sched_interact_score(kg));
532 
533 	return;
534 }
535 
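/*
 * Compute the interactivity score: 0..SCHED_INTERACT_RANGE, smaller is more
 * interactive.  Scores below SCHED_INTERACT_HALF mean the ksegrp has slept
 * more than it has run; scores above mean the opposite.  For example
 * (illustrative), a ksegrp that sleeps five times as long as it runs scores
 * roughly 50 * run / sleep = 10, right at SCHED_INTERACT_THRESH.
 */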
536 static int
537 sched_interact_score(struct ksegrp *kg)
538 {
539 	int big;
540 	int small;
541 	int base;
542 
543 	if (kg->kg_runtime > kg->kg_slptime) {
544 		big = kg->kg_runtime;
545 		small = kg->kg_slptime;
546 		base = SCHED_INTERACT_HALF;
547 	} else {
548 		big = kg->kg_slptime;
549 		small = kg->kg_runtime;
550 		base = 0;
551 	}
552 
553 	big /= SCHED_INTERACT_HALF;
554 	if (big != 0)
555 		small /= big;
556 	else
557 		small = 0;
558 
559 	small += base;
560 	/* XXX Factor in nice */
561 	return (small);
562 }
563 
564 /*
565  * This is only somewhat accurate since, given many processes of the same
566  * priority, they will switch when their slices run out, which will be
567  * at most SCHED_SLICE_MAX.
568  */
569 int
570 sched_rr_interval(void)
571 {
572 	return (SCHED_SLICE_MAX);
573 }
574 
575 void
576 sched_pctcpu_update(struct kse *ke)
577 {
578 	/*
579 	 * Adjust counters and watermark for pctcpu calc.
580 	 *
581 	 * Shift the tick count out so that the divide doesn't round away
582 	 * our results.
583 	 */
584 	ke->ke_ticks <<= 10;
585 	ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) *
586 		    SCHED_CPU_TICKS;
587 	ke->ke_ticks >>= 10;
588 	ke->ke_ltick = ticks;
589 	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
590 }
591 
592 #ifdef SMP
593 /* XXX Should be changed to kseq_load_lowest() */
594 int
595 sched_pickcpu(void)
596 {
597 	struct kseq *kseq;
598 	int load;
599 	int cpu;
600 	int i;
601 
602 	if (!smp_started)
603 		return (0);
604 
605 	load = KSEQ_CPU(0)->ksq_load;	/* seed so the loop below can find the lowest load */
606 	cpu = 0;
607 
608 	for (i = 0; i < mp_maxid; i++) {
609 		if (CPU_ABSENT(i))
610 			continue;
611 		kseq = KSEQ_CPU(i);
612 		if (kseq->ksq_load < load) {
613 			cpu = i;
614 			load = kseq->ksq_load;
615 		}
616 	}
617 
618 	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
619 	return (cpu);
620 }
621 #else
622 int
623 sched_pickcpu(void)
624 {
625 	return (0);
626 }
627 #endif
628 
629 void
630 sched_prio(struct thread *td, u_char prio)
631 {
632 	struct kse *ke;
633 	struct runq *rq;
634 
635 	mtx_assert(&sched_lock, MA_OWNED);
636 	ke = td->td_kse;
637 	td->td_priority = prio;
638 
639 	if (TD_ON_RUNQ(td)) {
640 		rq = ke->ke_runq;
641 
642 		runq_remove(rq, ke);
643 		runq_add(rq, ke);
644 	}
645 }
646 
647 void
648 sched_switchout(struct thread *td)
649 {
650 	struct kse *ke;
651 
652 	mtx_assert(&sched_lock, MA_OWNED);
653 
654 	ke = td->td_kse;
655 
656 	td->td_last_kse = ke;
657 	td->td_lastcpu = td->td_oncpu;
658 	td->td_oncpu = NOCPU;
659 	td->td_flags &= ~TDF_NEEDRESCHED;
660 
661 	if (TD_IS_RUNNING(td)) {
662 		runq_add(ke->ke_runq, ke);
663 		/* setrunqueue(td); */
664 		return;
665 	}
666 	if (ke->ke_runq)
667 		kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
668 	/*
669 	 * We will not be on the run queue. So we must be
670 	 * sleeping or similar.
671 	 */
672 	if (td->td_proc->p_flag & P_THREADED)
673 		kse_reassign(ke);
674 }
675 
676 void
677 sched_switchin(struct thread *td)
678 {
679 	/* struct kse *ke = td->td_kse; */
680 	mtx_assert(&sched_lock, MA_OWNED);
681 
682 	td->td_oncpu = PCPU_GET(cpuid);
683 
684 	if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
685 	    td->td_priority != td->td_ksegrp->kg_user_pri)
686 		curthread->td_flags |= TDF_NEEDRESCHED;
687 }
688 
689 void
690 sched_nice(struct ksegrp *kg, int nice)
691 {
692 	struct kse *ke;
693 	struct thread *td;
694 	struct kseq *kseq;
695 
696 	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
697 	mtx_assert(&sched_lock, MA_OWNED);
698 	/*
699 	 * We need to adjust the nice counts for running KSEs.
700 	 */
701 	if (kg->kg_pri_class == PRI_TIMESHARE)
702 		FOREACH_KSE_IN_GROUP(kg, ke) {
703 			if (ke->ke_state != KES_ONRUNQ &&
704 			    ke->ke_state != KES_THREAD)
705 				continue;
706 			kseq = KSEQ_CPU(ke->ke_cpu);
707 			kseq_nice_rem(kseq, kg->kg_nice);
708 			kseq_nice_add(kseq, nice);
709 		}
710 	kg->kg_nice = nice;
711 	sched_priority(kg);
712 	FOREACH_THREAD_IN_GROUP(kg, td)
713 		td->td_flags |= TDF_NEEDRESCHED;
714 }
715 
716 void
717 sched_sleep(struct thread *td, u_char prio)
718 {
719 	mtx_assert(&sched_lock, MA_OWNED);
720 
721 	td->td_slptime = ticks;
722 	td->td_priority = prio;
723 
724 	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
725 	    td->td_kse, td->td_slptime);
726 }
727 
728 void
729 sched_wakeup(struct thread *td)
730 {
731 	mtx_assert(&sched_lock, MA_OWNED);
732 
733 	/*
734 	 * Let the kseg know how long we slept for.  This is because process
735 	 * interactivity behavior is modeled in the kseg.
736 	 */
737 	if (td->td_slptime) {
738 		struct ksegrp *kg;
739 		int hzticks;
740 
741 		kg = td->td_ksegrp;
742 		hzticks = ticks - td->td_slptime;
743 		kg->kg_slptime += hzticks << 10;
744 		sched_priority(kg);
745 		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
746 		    td->td_kse, hzticks);
747 		td->td_slptime = 0;
748 	}
749 	setrunqueue(td);
750 	if (td->td_priority < curthread->td_priority)
751 		curthread->td_flags |= TDF_NEEDRESCHED;
752 }
753 
754 /*
755  * Penalize the parent for creating a new child and initialize the child's
756  * priority.
757  */
758 void
759 sched_fork(struct proc *p, struct proc *p1)
760 {
761 
762 	mtx_assert(&sched_lock, MA_OWNED);
763 
764 	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
765 	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
766 	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
767 }
768 
769 void
770 sched_fork_kse(struct kse *ke, struct kse *child)
771 {
772 
773 	child->ke_slice = ke->ke_slice;
774 	child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */
775 	child->ke_runq = NULL;
776 
777 	/*
778 	 * Claim that we've been running for one second for statistical
779 	 * purposes.
780 	 */
781 	child->ke_ticks = 0;
782 	child->ke_ltick = ticks;
783 	child->ke_ftick = ticks - hz;
784 }
785 
786 void
787 sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
788 {
789 
790 	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
791 	/* XXX Need something better here */
792 	if (kg->kg_slptime > kg->kg_runtime) {
793 		child->kg_slptime = SCHED_DYN_RANGE;
794 		child->kg_runtime = kg->kg_slptime / SCHED_DYN_RANGE;
795 	} else {
796 		child->kg_runtime = SCHED_DYN_RANGE;
797 		child->kg_slptime = kg->kg_runtime / SCHED_DYN_RANGE;
798 	}
799 
800 	child->kg_user_pri = kg->kg_user_pri;
801 	child->kg_nice = kg->kg_nice;
802 }
803 
804 void
805 sched_fork_thread(struct thread *td, struct thread *child)
806 {
807 }
808 
809 void
810 sched_class(struct ksegrp *kg, int class)
811 {
812 	struct kseq *kseq;
813 	struct kse *ke;
814 
815 	mtx_assert(&sched_lock, MA_OWNED);
816 	if (kg->kg_pri_class == class)
817 		return;
818 
819 	FOREACH_KSE_IN_GROUP(kg, ke) {
820 		if (ke->ke_state != KES_ONRUNQ &&
821 		    ke->ke_state != KES_THREAD)
822 			continue;
823 		kseq = KSEQ_CPU(ke->ke_cpu);
824 
825 		kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--;
826 		kseq->ksq_loads[PRI_BASE(class)]++;
827 
828 		if (kg->kg_pri_class == PRI_TIMESHARE)
829 			kseq_nice_rem(kseq, kg->kg_nice);
830 		else if (class == PRI_TIMESHARE)
831 			kseq_nice_add(kseq, kg->kg_nice);
832 	}
833 
834 	kg->kg_pri_class = class;
835 }
836 
837 /*
838  * Return some of the child's priority and interactivity to the parent.
839  */
840 void
841 sched_exit(struct proc *p, struct proc *child)
842 {
843 	/* XXX Need something better here */
844 	mtx_assert(&sched_lock, MA_OWNED);
845 	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
846 }
847 
848 void
849 sched_exit_kse(struct kse *ke, struct kse *child)
850 {
851 	kseq_rem(KSEQ_CPU(child->ke_cpu), child);
852 }
853 
854 void
855 sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
856 {
857 }
858 
859 void
860 sched_exit_thread(struct thread *td, struct thread *child)
861 {
862 }
863 
864 void
865 sched_clock(struct kse *ke)
866 {
867 	struct kseq *kseq;
868 	struct ksegrp *kg;
869 	struct thread *td;
870 #if 0
871 	struct kse *nke;
872 #endif
873 
874 	/*
875 	 * sched_setup() apparently happens prior to stathz being set.  We
876 	 * need to resolve the timers earlier in the boot so we can avoid
877 	 * calculating this here.
878 	 */
879 	if (realstathz == 0) {
880 		realstathz = stathz ? stathz : hz;
881 		tickincr = hz / realstathz;
882 		/*
883 		 * XXX This does not work for values of stathz that are much
884 		 * larger than hz.
885 		 */
886 		if (tickincr == 0)
887 			tickincr = 1;
888 	}
889 
890 	td = ke->ke_thread;
891 	kg = ke->ke_ksegrp;
892 
893 	mtx_assert(&sched_lock, MA_OWNED);
894 	KASSERT((td != NULL), ("schedclock: null thread pointer"));
895 
896 	/* Adjust ticks for pctcpu */
897 	ke->ke_ticks++;
898 	ke->ke_ltick = ticks;
899 
900 	/* Go up to one second beyond our max and then trim back down */
901 	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
902 		sched_pctcpu_update(ke);
903 
904 	if (td->td_kse->ke_flags & KEF_IDLEKSE)
905 		return;
906 
907 	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
908 	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
909 
910 	/*
911 	 * We only run the slicing code for TIMESHARE ksegrps.
912 	 */
913 	if (kg->kg_pri_class != PRI_TIMESHARE)
914 		return;
915 	/*
916 	 * Check for a higher priority task on the run queue.  This can happen
917 	 * on SMP if another processor woke up a process on our runq.
918 	 */
919 	kseq = KSEQ_SELF();
920 #if 0
921 	if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq)) != NULL) {
922 		if (sched_strict &&
923 		    nke->ke_thread->td_priority < td->td_priority)
924 			td->td_flags |= TDF_NEEDRESCHED;
925 		else if (nke->ke_thread->td_priority <
926 		    td->td_priority SCHED_PRIO_SLOP)
927 
928 		if (nke->ke_thread->td_priority < td->td_priority)
929 			td->td_flags |= TDF_NEEDRESCHED;
930 	}
931 #endif
932 	/*
933 	 * We used a tick; charge it to the ksegrp so that we can compute our
934 	 * interactivity.
935 	 */
936 	kg->kg_runtime += tickincr << 10;
937 
938 	/*
939 	 * We used up one time slice.
940 	 */
941 	ke->ke_slice--;
942 #ifdef SMP
943 	kseq->ksq_rslices--;
944 #endif
945 
946 	if (ke->ke_slice > 0)
947 		return;
948 	/*
949 	 * We're out of time, recompute priorities and requeue.
950 	 */
951 	kseq_rem(kseq, ke);
952 	sched_priority(kg);
953 	sched_slice(ke);
954 	if (SCHED_CURR(kg, ke))
955 		ke->ke_runq = kseq->ksq_curr;
956 	else
957 		ke->ke_runq = kseq->ksq_next;
958 	kseq_add(kseq, ke);
959 	td->td_flags |= TDF_NEEDRESCHED;
960 }
961 
962 int
963 sched_runnable(void)
964 {
965 	struct kseq *kseq;
966 
967 	kseq = KSEQ_SELF();
968 
969 	if (kseq->ksq_load)
970 		return (1);
971 #ifdef SMP
972 	/*
973 	 * For SMP we may steal other processors' KSEs.  Just search until we
974 	 * verify that at least one other cpu has a runnable task.
975 	 */
976 	if (smp_started) {
977 		int i;
978 
979 		for (i = 0; i < mp_maxid; i++) {
980 			if (CPU_ABSENT(i))
981 				continue;
982 			kseq = KSEQ_CPU(i);
983 			if (kseq->ksq_load > 1)
984 				return (1);
985 		}
986 	}
987 #endif
988 	return (0);
989 }
990 
991 void
992 sched_userret(struct thread *td)
993 {
994 	struct ksegrp *kg;
995 
996 	kg = td->td_ksegrp;
997 
998 	if (td->td_priority != kg->kg_user_pri) {
999 		mtx_lock_spin(&sched_lock);
1000 		td->td_priority = kg->kg_user_pri;
1001 		mtx_unlock_spin(&sched_lock);
1002 	}
1003 }
1004 
1005 struct kse *
1006 sched_choose(void)
1007 {
1008 	struct kseq *kseq;
1009 	struct kse *ke;
1010 
1011 #ifdef SMP
1012 retry:
1013 #endif
1014 	kseq = KSEQ_SELF();
1015 	ke = kseq_choose(kseq);
1016 	if (ke) {
1017 		runq_remove(ke->ke_runq, ke);
1018 		ke->ke_state = KES_THREAD;
1019 
1020 		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
1021 			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
1022 			    ke, ke->ke_runq, ke->ke_slice,
1023 			    ke->ke_thread->td_priority);
1024 		}
1025 		return (ke);
1026 	}
1027 
1028 #ifdef SMP
1029 	if (smp_started) {
1030 		/*
1031 		 * Find the cpu with the highest load and steal one proc.
1032 		 */
1033 		if ((kseq = kseq_load_highest()) == NULL)
1034 			return (NULL);
1035 
1036 		/*
1037 		 * Remove this kse from this kseq and runq and then requeue
1038 		 * on the current processor.  Then we will dequeue it
1039 		 * normally above.
1040 		 */
1041 		ke = kseq_choose(kseq);
1042 		runq_remove(ke->ke_runq, ke);
1043 		ke->ke_state = KES_THREAD;
1044 		kseq_rem(kseq, ke);
1045 
1046 		ke->ke_cpu = PCPU_GET(cpuid);
1047 		sched_add(ke);
1048 		goto retry;
1049 	}
1050 #endif
1051 
1052 	return (NULL);
1053 }
1054 
1055 void
1056 sched_add(struct kse *ke)
1057 {
1058 	struct kseq *kseq;
1059 	struct ksegrp *kg;
1060 
1061 	mtx_assert(&sched_lock, MA_OWNED);
1062 	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
1063 	KASSERT((ke->ke_thread->td_kse != NULL),
1064 	    ("sched_add: No KSE on thread"));
1065 	KASSERT(ke->ke_state != KES_ONRUNQ,
1066 	    ("sched_add: kse %p (%s) already in run queue", ke,
1067 	    ke->ke_proc->p_comm));
1068 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1069 	    ("sched_add: process swapped out"));
1070 	KASSERT(ke->ke_runq == NULL,
1071 	    ("sched_add: KSE %p is still assigned to a run queue", ke));
1072 
1073 	kg = ke->ke_ksegrp;
1074 
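	/*
	 * Pick a run queue for this kse.  Interrupt and realtime kses always
	 * go on the current queue of this cpu with a full slice; timeshare
	 * kses go on the current or next queue of their assigned cpu
	 * depending on interactivity; idle class kses go on the idle queue
	 * unless their priority has been propagated up into the timeshare
	 * range.
	 */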
1075 	switch (PRI_BASE(kg->kg_pri_class)) {
1076 	case PRI_ITHD:
1077 	case PRI_REALTIME:
1078 		kseq = KSEQ_SELF();
1079 		ke->ke_runq = kseq->ksq_curr;
1080 		ke->ke_slice = SCHED_SLICE_MAX;
1081 		ke->ke_cpu = PCPU_GET(cpuid);
1082 		break;
1083 	case PRI_TIMESHARE:
1084 		kseq = KSEQ_CPU(ke->ke_cpu);
1085 		if (SCHED_CURR(kg, ke))
1086 			ke->ke_runq = kseq->ksq_curr;
1087 		else
1088 			ke->ke_runq = kseq->ksq_next;
1089 		break;
1090 	case PRI_IDLE:
1091 		kseq = KSEQ_CPU(ke->ke_cpu);
1092 		/*
1093 		 * This is for priority propagation.
1094 		 */
1095 		if (ke->ke_thread->td_priority < PRI_MAX_TIMESHARE)
1096 			ke->ke_runq = kseq->ksq_curr;
1097 		else
1098 			ke->ke_runq = &kseq->ksq_idle;
1099 		ke->ke_slice = SCHED_SLICE_MIN;
1100 		break;
1101 	default:
1102 		panic("Unknown pri class.\n");
1103 		break;
1104 	}
1105 
1106 	ke->ke_ksegrp->kg_runq_kses++;
1107 	ke->ke_state = KES_ONRUNQ;
1108 
1109 	runq_add(ke->ke_runq, ke);
1110 	kseq_add(kseq, ke);
1111 }
1112 
1113 void
1114 sched_rem(struct kse *ke)
1115 {
1116 	struct kseq *kseq;
1117 
1118 	mtx_assert(&sched_lock, MA_OWNED);
1119 	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));
1120 
1121 	ke->ke_state = KES_THREAD;
1122 	ke->ke_ksegrp->kg_runq_kses--;
1123 	kseq = KSEQ_CPU(ke->ke_cpu);
1124 	runq_remove(ke->ke_runq, ke);
1125 	kseq_rem(kseq, ke);
1126 }
1127 
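/*
 * Report the fraction of a cpu that this kse has used recently.  ke_ticks
 * approximates the number of stathz ticks the kse ran during the last
 * SCHED_CPU_TIME seconds, so rtick below is ticks-per-second and the result
 * is roughly rtick / realstathz expressed as a fixpt_t.
 */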
1128 fixpt_t
1129 sched_pctcpu(struct kse *ke)
1130 {
1131 	fixpt_t pctcpu;
1132 
1133 	pctcpu = 0;
1134 
1135 	if (ke->ke_ticks) {
1136 		int rtick;
1137 
1138 		/* Update to account for time potentially spent sleeping */
1139 		ke->ke_ltick = ticks;
1140 		sched_pctcpu_update(ke);
1141 
1142 		/* How many rtick per second ? */
1143 		rtick = ke->ke_ticks / SCHED_CPU_TIME;
1144 		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1145 	}
1146 
1147 	mtx_lock_spin(&sched_lock);
1148 	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1149 	mtx_unlock_spin(&sched_lock);
1150 
1151 	return (pctcpu);
1152 }
1153 
1154 int
1155 sched_sizeof_kse(void)
1156 {
1157 	return (sizeof(struct kse) + sizeof(struct ke_sched));
1158 }
1159 
1160 int
1161 sched_sizeof_ksegrp(void)
1162 {
1163 	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1164 }
1165 
1166 int
1167 sched_sizeof_proc(void)
1168 {
1169 	return (sizeof(struct proc));
1170 }
1171 
1172 int
1173 sched_sizeof_thread(void)
1174 {
1175 	return (sizeof(struct thread) + sizeof(struct td_sched));
1176 }
1177