/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
 * $Id: kern_clock.c,v 1.3 1994/08/02 07:41:54 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dkstat.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>

#include <machine/cpu.h>

#ifdef GPROF
#include <sys/gmon.h>
#endif

/* Does anybody else really care about these? */
struct callout *callfree, *callout, calltodo;
int ncallout;

/* Some of these don't belong here, but it's easiest to concentrate them. */
long cp_time[CPUSTATES];
long dk_seek[DK_NDRIVE];
long dk_time[DK_NDRIVE];
long dk_wds[DK_NDRIVE];
long dk_wpms[DK_NDRIVE];
long dk_xfer[DK_NDRIVE];

int dk_busy;
int dk_ndrive = DK_NDRIVE;

long tk_cancc;
long tk_nin;
long tk_nout;
long tk_rawcc;

/*
 * Clock handling routines.
 *
 * This code is written to operate with two timers that run independently of
 * each other.  The main clock, running hz times per second, is used to keep
 * track of real time.  The second timer handles kernel and user profiling,
 * and does resource use estimation.  If the second timer is programmable,
 * it is randomized to avoid aliasing between the two clocks.  For example,
 * the randomization prevents an adversary from always giving up the cpu
 * just before its quantum expires.  Otherwise, it would never accumulate
 * cpu ticks.  The mean frequency of the second timer is stathz.
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
 */
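
/*
 * Worked example with illustrative rates (not taken from this file):
 * with stathz = 128 and profhz = 1024, initclocks() computes
 * psratio = 1024 / 128 = 8.  While any process is being profiled,
 * startprofclock() sets psdiv = 8, and statclock() then charges
 * statistics only on every 8th tick of the fast clock via its
 * "if (--pscnt > 0) return;" divider.
 */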

/*
 * TODO:
 *	allocate more timeout table slots when table overflows.
 */

/*
 * Bump a timeval by a small number of usec's.
 */
#define BUMPTIME(t, usec) { \
	register volatile struct timeval *tp = (t); \
	register long us; \
 \
	tp->tv_usec = us = tp->tv_usec + (usec); \
	if (us >= 1000000) { \
		tp->tv_usec = us - 1000000; \
		tp->tv_sec++; \
	} \
}
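
/*
 * Worked example (hypothetical values): if tp->tv_usec is 999900 and
 * usec is 200, then us = 1000100 >= 1000000, so tv_usec becomes 100
 * and tv_sec is incremented.  The macro carries at most one second,
 * so the usec argument must stay well below 1000000.
 */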

int	stathz;
int	profhz;
int	profprocs;
int	ticks;
static int psdiv, pscnt;	/* prof => stat divider */
int	psratio;		/* ratio: prof / stat */

volatile struct	timeval time;
volatile struct	timeval mono_time;

/*
 * Initialize clock frequencies and start both clocks running.
 */
void
initclocks()
{
	register int i;

	/*
	 * Set divisors to 1 (normal case) and let the machine-specific
	 * code do its bit.
	 */
	psdiv = pscnt = 1;
	cpu_initclocks();

	/*
	 * Compute profhz/stathz, and fix profhz if needed.
	 */
	i = stathz ? stathz : hz;
	if (profhz == 0)
		profhz = i;
	psratio = profhz / i;
}

/*
 * The real-time timer, interrupting hz times per second.
 */
void
hardclock(frame)
	register struct clockframe *frame;
{
	register struct callout *p1;
	register struct proc *p;
	register int delta, needsoft;
	extern int tickdelta;
	extern long timedelta;

	/*
	 * Update real-time timeout queue.
	 * At front of queue are some number of events which are ``due''.
	 * The time to these is <= 0 and if negative represents the
	 * number of ticks which have passed since it was supposed to happen.
	 * The rest of the q elements (times > 0) are events yet to happen,
	 * where the time for each is given as a delta from the previous.
	 * Decrementing just the first of these serves to decrement the time
	 * to all events.
	 */
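	/*
	 * Worked example (hypothetical tick counts): events due in 2, 5,
	 * 5, and 9 ticks are stored as the deltas 2, 3, 0, 4.  One
	 * hardclock() decrements only the head, giving 1, 3, 0, 4, which
	 * still encodes absolute times 1, 4, 4, 8: the whole queue ages
	 * with a single decrement.
	 */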
	needsoft = 0;
	for (p1 = calltodo.c_next; p1 != NULL; p1 = p1->c_next) {
		if (--p1->c_time > 0)
			break;
		needsoft = 1;
		if (p1->c_time == 0)
			break;
	}

	p = curproc;
	if (p) {
		register struct pstats *pstats;

		/*
		 * Run current process's virtual and profile time, as needed.
		 */
		pstats = p->p_stats;
		if (CLKF_USERMODE(frame) &&
		    timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
			psignal(p, SIGVTALRM);
		if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
			psignal(p, SIGPROF);
	}

	/*
	 * If no separate statistics clock is available, run it from here.
	 */
	if (stathz == 0)
		statclock(frame);

	/*
	 * Increment the time-of-day.  The increment is just ``tick'' unless
	 * we are still adjusting the clock; see adjtime().
	 */
	ticks++;
	if (timedelta == 0)
		delta = tick;
	else {
		delta = tick + tickdelta;
		timedelta -= tickdelta;
	}
	BUMPTIME(&time, delta);
	BUMPTIME(&mono_time, delta);

	/*
	 * Process callouts at a very low cpu priority, so we don't keep the
	 * relatively high clock interrupt priority any longer than necessary.
	 */
	if (needsoft) {
		if (CLKF_BASEPRI(frame)) {
			/*
			 * Save the overhead of a software interrupt;
			 * it will happen as soon as we return, so do it now.
			 */
			(void)splsoftclock();
			softclock();
		} else
			setsoftclock();
	}
}

/*
 * Software (low priority) clock interrupt.
 * Run periodic events from timeout queue.
 */
/*ARGSUSED*/
void
softclock()
{
	register struct callout *c;
	register void *arg;
	register void (*func) __P((void *));
	register int s;

	s = splhigh();
	while ((c = calltodo.c_next) != NULL && c->c_time <= 0) {
		func = c->c_func;
		arg = c->c_arg;
		calltodo.c_next = c->c_next;
		c->c_next = callfree;
		callfree = c;
		splx(s);
		(*func)(arg);
		(void) splhigh();
	}
	splx(s);
}

/*
 * timeout --
 *	Execute a function after a specified length of time.
 *
 * untimeout --
 *	Cancel previous timeout function call.
 *
 *	See AT&T BCI Driver Reference Manual for specification.  This
 *	implementation differs from that one in that no identification
 *	value is returned from timeout, rather, the original arguments
 *	to timeout are used to identify entries for untimeout.
 */
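/*
 * Usage sketch (foo_tick and sc are hypothetical driver names, not
 * symbols from this file): schedule foo_tick(sc) to run in roughly one
 * second, then cancel it with the identical function/argument pair:
 *
 *	timeout(foo_tick, sc, hz);
 *	...
 *	untimeout(foo_tick, sc);
 */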
void
timeout(ftn, arg, ticks)
	timeout_t ftn;
	void *arg;
	register int ticks;
{
	register struct callout *new, *p, *t;
	register int s;

	if (ticks <= 0)
		ticks = 1;

	/* Lock out the clock. */
	s = splhigh();

	/* Fill in the next free callout structure. */
	if (callfree == NULL)
		panic("timeout table full");
	new = callfree;
	callfree = new->c_next;
	new->c_arg = arg;
	new->c_func = ftn;

	/*
	 * The time for each event is stored as a difference from the time
	 * of the previous event on the queue.  Walk the queue, correcting
	 * the ticks argument for queue entries passed.  Correct the ticks
	 * value for the queue entry immediately after the insertion point
	 * as well.  Watch out for negative c_time values; these represent
	 * overdue events.
	 */
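	/*
	 * Worked example (hypothetical values): inserting ticks = 6 into
	 * the delta queue 2, 3, 4 (absolute times 2, 5, 9): 2 and 3 are
	 * subtracted, leaving ticks = 1; the walk stops at the 4-entry,
	 * the new entry is inserted before it with c_time 1, and the
	 * 4-entry becomes 3.  Result: deltas 2, 3, 1, 3, i.e. absolute
	 * times 2, 5, 6, 9.
	 */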
	for (p = &calltodo;
	    (t = p->c_next) != NULL && ticks > t->c_time; p = t)
		if (t->c_time > 0)
			ticks -= t->c_time;
	new->c_time = ticks;
	if (t != NULL)
		t->c_time -= ticks;

	/* Insert the new entry into the queue. */
	p->c_next = new;
	new->c_next = t;
	splx(s);
}

void
untimeout(ftn, arg)
	timeout_t ftn;
	void *arg;
{
	register struct callout *p, *t;
	register int s;

	s = splhigh();
	for (p = &calltodo; (t = p->c_next) != NULL; p = t)
		if (t->c_func == ftn && t->c_arg == arg) {
			/* Increment next entry's tick count. */
			if (t->c_next && t->c_time > 0)
				t->c_next->c_time += t->c_time;

			/* Move entry from callout queue to callfree queue. */
			p->c_next = t->c_next;
			t->c_next = callfree;
			callfree = t;
			break;
		}
	splx(s);
}

/*
 * Compute number of hz until specified time.  Used to
 * compute third argument to timeout() from an absolute time.
 */
int
hzto(tv)
	struct timeval *tv;
{
	register long ticks, sec;
	int s;

	/*
	 * If the number of milliseconds will fit in 32 bit arithmetic,
	 * compute the number of milliseconds to the target time and
	 * scale it to ticks.  Otherwise just compute the number of hz
	 * in the time, rounding times greater than representable to
	 * the maximum value.
	 *
	 * Delta times less than 25 days can be computed ``exactly''.
	 * Maximum value for any timeout in 10ms ticks is 250 days.
	 */
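	/*
	 * Checking the bounds above (assuming 10ms ticks, i.e. hz = 100):
	 * 0x7fffffff milliseconds is about 2147483 seconds, or roughly
	 * 24.8 days, and 0x7fffffff / 100 seconds is about 21474836
	 * seconds, or roughly 248.5 days.
	 */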
	s = splhigh();
	sec = tv->tv_sec - time.tv_sec;
	if (sec <= 0x7fffffff / 1000 - 1000)
		ticks = ((tv->tv_sec - time.tv_sec) * 1000 +
			(tv->tv_usec - time.tv_usec) / 1000) / (tick / 1000);
	else if (sec <= 0x7fffffff / hz)
		ticks = sec * hz;
	else
		ticks = 0x7fffffff;
	splx(s);
	return (ticks);
}
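
/*
 * Usage sketch (wakeup_fn, arg, and tv are hypothetical): convert an
 * absolute timeval into the relative tick count timeout() expects:
 *
 *	timeout(wakeup_fn, arg, hzto(&tv));
 */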

/*
 * Start profiling on a process.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
void
startprofclock(p)
	register struct proc *p;
{
	int s;

	if ((p->p_flag & P_PROFIL) == 0) {
		p->p_flag |= P_PROFIL;
		if (++profprocs == 1 && stathz != 0) {
			s = splstatclock();
			psdiv = pscnt = psratio;
			setstatclockrate(profhz);
			splx(s);
		}
	}
}

/*
 * Stop profiling on a process.
 */
void
stopprofclock(p)
	register struct proc *p;
{
	int s;

	if (p->p_flag & P_PROFIL) {
		p->p_flag &= ~P_PROFIL;
		if (--profprocs == 0 && stathz != 0) {
			s = splstatclock();
			psdiv = pscnt = 1;
			setstatclockrate(stathz);
			splx(s);
		}
	}
}

/*
 * Statistics clock.  Grab profile sample, and if divider reaches 0,
 * do process and kernel statistics.
 */
void
statclock(frame)
	register struct clockframe *frame;
{
#ifdef GPROF
	register struct gmonparam *g;
#endif
	register struct proc *p;
	register int i;

	if (CLKF_USERMODE(frame)) {
		p = curproc;
		if (p->p_flag & P_PROFIL)
			addupc_intr(p, CLKF_PC(frame), 1);
		if (--pscnt > 0)
			return;
		/*
		 * Came from user mode; CPU was in user state.
		 * If this process is being profiled record the tick.
		 */
		p->p_uticks++;
		if (p->p_nice > NZERO)
			cp_time[CP_NICE]++;
		else
			cp_time[CP_USER]++;
	} else {
#ifdef GPROF
		/*
		 * Kernel statistics are just like addupc_intr, only easier.
		 */
		g = &_gmonparam;
		if (g->state == GMON_PROF_ON) {
			i = CLKF_PC(frame) - g->lowpc;
			if (i < g->textsize) {
				i /= HISTFRACTION * sizeof(*g->kcount);
				g->kcount[i]++;
			}
		}
#endif
		if (--pscnt > 0)
			return;
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		p = curproc;
		if (CLKF_INTR(frame)) {
			if (p != NULL)
				p->p_iticks++;
			cp_time[CP_INTR]++;
		} else if (p != NULL) {
			p->p_sticks++;
			cp_time[CP_SYS]++;
		} else
			cp_time[CP_IDLE]++;
	}
	pscnt = psdiv;

	/*
	 * We maintain statistics shown by user-level statistics
	 * programs:  the amount of time in each cpu state, and
	 * the amount of time each of DK_NDRIVE ``drives'' is busy.
	 *
	 * XXX	should either run linked list of drives, or (better)
	 *	grab timestamps in the start & done code.
	 */
	for (i = 0; i < DK_NDRIVE; i++)
		if (dk_busy & (1 << i))
			dk_time[i]++;

	/*
	 * We adjust the priority of the current process.  The priority of
	 * a process gets worse as it accumulates CPU time.  The cpu usage
	 * estimator (p_estcpu) is increased here.  The formula for computing
	 * priorities (in kern_synch.c) will compute a different value each
	 * time p_estcpu increases by 4.  The cpu usage estimator ramps up
	 * quite quickly when the process is running (linearly), and decays
	 * away exponentially, at a rate which is proportionally slower when
	 * the system is busy.  The basic principle is that the system will
	 * 90% forget that the process used a lot of CPU time in 5 * loadav
	 * seconds.  This causes the system to favor processes which haven't
	 * run much recently, and to round-robin among other processes.
	 */
	if (p != NULL) {
		p->p_cpticks++;
		if (++p->p_estcpu == 0)
			p->p_estcpu--;
		if ((p->p_estcpu & 3) == 0) {
			resetpriority(p);
			if (p->p_priority >= PUSER)
				p->p_priority = p->p_usrpri;
		}
	}
}

/*
 * Return information about system clocks.
 */
int
sysctl_clockrate(where, sizep)
	register char *where;
	size_t *sizep;
{
	struct clockinfo clkinfo;

	/*
	 * Construct clockinfo structure.
	 */
	clkinfo.hz = hz;
	clkinfo.tick = tick;
	clkinfo.profhz = profhz;
	clkinfo.stathz = stathz ? stathz : hz;
	return (sysctl_rdstruct(where, sizep, NULL, &clkinfo, sizeof(clkinfo)));
}
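
/*
 * Userland sketch (standard sysctl(3) usage; the variable names are
 * illustrative, not from this file): these values are exported through
 * the kern.clockrate MIB:
 *
 *	int mib[2] = { CTL_KERN, KERN_CLOCKRATE };
 *	struct clockinfo ci;
 *	size_t len = sizeof(ci);
 *
 *	if (sysctl(mib, 2, &ci, &len, NULL, 0) == 0)
 *		printf("hz=%d tick=%d stathz=%d profhz=%d\n",
 *		    ci.hz, ci.tick, ci.stathz, ci.profhz);
 */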