/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ntp.h"
#include "opt_ddb.h"
#include "opt_watchdog.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/sysctl.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/limits.h>
#include <sys/timetc.h>

#include <machine/cpu.h>

#ifdef GPROF
#include <sys/gmon.h>
#endif

#ifdef DDB
#include <ddb/ddb.h>
#endif

#ifdef DEVICE_POLLING
extern void hardclock_device_poll(void);
#endif /* DEVICE_POLLING */

static void initclocks(void *dummy);
SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)

/* Some of these don't belong here, but it's easiest to concentrate them. */
long cp_time[CPUSTATES];

SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time),
    "LU", "CPU time statistics");

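/*
 * Example (illustrative, not part of the kernel): userland can read the
 * cumulative per-state tick counters exported above via sysctl(3); error
 * handling is omitted in this sketch.
 *
 *	long cp[CPUSTATES];
 *	size_t len = sizeof(cp);
 *	if (sysctlbyname("kern.cp_time", cp, &len, NULL, 0) == 0)
 *		printf("user %ld nice %ld sys %ld intr %ld idle %ld\n",
 *		    cp[CP_USER], cp[CP_NICE], cp[CP_SYS],
 *		    cp[CP_INTR], cp[CP_IDLE]);
 */
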
#ifdef WATCHDOG
static int sysctl_watchdog_reset(SYSCTL_HANDLER_ARGS);
static void watchdog_fire(void);

static int watchdog_enabled;
static unsigned int watchdog_ticks;
static int watchdog_timeout = 20;

SYSCTL_NODE(_debug, OID_AUTO, watchdog, CTLFLAG_RW, 0, "System watchdog");
SYSCTL_INT(_debug_watchdog, OID_AUTO, enabled, CTLFLAG_RW, &watchdog_enabled,
	0, "Enable the watchdog");
SYSCTL_INT(_debug_watchdog, OID_AUTO, timeout, CTLFLAG_RW, &watchdog_timeout,
	0, "Timeout for watchdog check-ins");

#endif /* WATCHDOG */

/*
 * Clock handling routines.
 *
 * This code is written to operate with two timers that run independently
 * of each other.
 *
 * The main timer, running hz times per second, is used to trigger interval
 * timers, timeouts and rescheduling as needed.
 *
 * The second timer handles kernel and user profiling, and does resource
 * use estimation.  If the second timer is programmable, it is randomized
 * to avoid aliasing between the two clocks.  For example, the randomization
 * prevents an adversary from always giving up the cpu just before its
 * quantum expires; otherwise it would never accumulate cpu ticks.  The
 * mean frequency of the second timer is stathz.
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
 *
 * Time-of-day is maintained using a "timecounter", which may or may not
 * be related to the hardware generating the above mentioned interrupts.
 */

int	stathz;
int	profhz;
int	profprocs;
int	ticks;
int	psratio;

/*
 * Initialize clock frequencies and start both clocks running.
 */
/* ARGSUSED*/
static void
initclocks(void *dummy)
{
	int i;

	/*
	 * Set divisors to 1 (normal case) and let the machine-specific
	 * code do its bit.
	 */
	cpu_initclocks();

	/*
	 * Compute profhz/stathz, and fix profhz if needed.
	 */
	i = stathz ? stathz : hz;
	if (profhz == 0)
		profhz = i;
	psratio = profhz / i;
}

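/*
 * Example (illustrative): with stathz = 128 and profhz = 1024 (typical
 * i386-era values), psratio = 1024 / 128 = 8, so when the statistics
 * clock is sped up to profhz while profiling, only every 8th tick counts
 * toward statistics, per the profhz/stathz division rule above.  With no
 * separate statistics clock (stathz == 0) and profhz left at 0, both
 * default to hz and psratio is 1.
 */
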
/*
 * Each time the real-time timer fires, this function is called on all CPUs.
 * Note that hardclock() calls hardclock_process() for the boot CPU, so only
 * the other CPUs in the system need to call this function.
 */
void
hardclock_process(struct clockframe *frame)
{
	struct pstats *pstats;
	struct thread *td = curthread;
	struct proc *p = td->td_proc;

	/*
	 * Run current process's virtual and profile time, as needed.
	 */
	mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
	if (p->p_flag & P_SA) {
		/* XXXKSE What to do? */
	} else {
		pstats = p->p_stats;
		if (CLKF_USERMODE(frame) &&
		    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) {
			p->p_sflag |= PS_ALRMPEND;
			td->td_flags |= TDF_ASTPENDING;
		}
		if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) {
			p->p_sflag |= PS_PROFPEND;
			td->td_flags |= TDF_ASTPENDING;
		}
	}
	mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
}

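/*
 * Example (illustrative, userland): the ITIMER_VIRTUAL decrement above
 * services setitimer(2).  A process arming a one-shot 100ms virtual-time
 * timer receives SIGVTALRM once it has consumed that much user-mode CPU:
 *
 *	struct itimerval itv = { { 0, 0 }, { 0, 100000 } };
 *	(void)signal(SIGVTALRM, handler);
 *	(void)setitimer(ITIMER_VIRTUAL, &itv, NULL);
 */
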
/*
 * The real-time timer, interrupting hz times per second.
 */
void
hardclock(struct clockframe *frame)
{
	int need_softclock = 0;

	CTR0(KTR_CLK, "hardclock fired");
	hardclock_process(frame);

	tc_ticktock();
	/*
	 * If no separate statistics clock is available, run it from here.
	 *
	 * XXX: this only works for UP
	 */
	if (stathz == 0) {
		profclock(frame);
		statclock(frame);
	}

#ifdef DEVICE_POLLING
	hardclock_device_poll();	/* this is very short and quick */
#endif /* DEVICE_POLLING */

	/*
	 * Process callouts at a very low cpu priority, so we don't keep the
	 * relatively high clock interrupt priority any longer than necessary.
	 */
	mtx_lock_spin_flags(&callout_lock, MTX_QUIET);
	ticks++;
	if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) {
		need_softclock = 1;
	} else if (softticks + 1 == ticks)
		++softticks;
	mtx_unlock_spin_flags(&callout_lock, MTX_QUIET);

	/*
	 * swi_sched() acquires sched_lock, so we don't want to call it
	 * with callout_lock held; that would be an incorrect lock order.
	 */
	if (need_softclock)
		swi_sched(softclock_ih, 0);

#ifdef WATCHDOG
	if (watchdog_enabled > 0 &&
	    (int)(ticks - watchdog_ticks) >= (hz * watchdog_timeout))
		watchdog_fire();
#endif /* WATCHDOG */
}

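/*
 * Example (illustrative): the callwheel is a power-of-two ring of callout
 * buckets indexed by tick, so hardclock() inspects exactly one bucket per
 * tick.  With callwheelsize = 256 (callwheelmask = 255), a callout due at
 * tick 1000 hashes to bucket 1000 & 255 = 232.
 */
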
/*
 * Compute number of ticks in the specified amount of time.
 */
int
tvtohz(struct timeval *tv)
{
	unsigned long ticks;
	long sec, usec;

	/*
	 * If the number of usecs in the whole seconds part of the time
	 * difference fits in a long, then the total number of usecs will
	 * fit in an unsigned long.  Compute the total and convert it to
	 * ticks, rounding up and adding 1 to allow for the current tick
	 * to expire.  Rounding also depends on unsigned long arithmetic
	 * to avoid overflow.
	 *
	 * Otherwise, if the number of ticks in the whole seconds part of
	 * the time difference fits in a long, then convert the parts to
	 * ticks separately and add, using similar rounding methods and
	 * overflow avoidance.  This method would work in the previous
	 * case, but it is slightly slower and assumes that hz is integral.
	 *
	 * Otherwise, round the time difference down to the maximum
	 * representable value.
	 *
	 * If ints have 32 bits, then the maximum value for any timeout in
	 * 10ms ticks is 248 days.
	 */
	sec = tv->tv_sec;
	usec = tv->tv_usec;
	if (usec < 0) {
		sec--;
		usec += 1000000;
	}
	if (sec < 0) {
#ifdef DIAGNOSTIC
		if (usec > 0) {
			sec++;
			usec -= 1000000;
		}
		printf("tvtohz: negative time difference %ld sec %ld usec\n",
		       sec, usec);
#endif
		ticks = 1;
	} else if (sec <= LONG_MAX / 1000000)
		ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
			/ tick + 1;
	else if (sec <= LONG_MAX / hz)
		ticks = sec * hz
			+ ((unsigned long)usec + (tick - 1)) / tick + 1;
	else
		ticks = LONG_MAX;
	if (ticks > INT_MAX)
		ticks = INT_MAX;
	return ((int)ticks);
}

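/*
 * Example (illustrative): with hz = 100 (tick = 10000 usec), a timeval
 * of 25000 usec takes the first branch:
 *
 *	(0 * 1000000 + 25000 + 9999) / 10000 + 1 = 3 + 1 = 4 ticks
 *
 * i.e., 2.5 ticks rounded up to 3, plus one for the current, partially
 * elapsed tick to expire.
 */
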
/*
 * Start profiling on a process.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
void
startprofclock(struct proc *p)
{

	/*
	 * XXX: Right now sched_lock protects statclock(), but perhaps
	 * it should be protected later on by a time_lock, which would
	 * cover psdiv, etc. as well.
	 */
	PROC_LOCK_ASSERT(p, MA_OWNED);
	if (p->p_flag & P_STOPPROF)
		return;
	if ((p->p_flag & P_PROFIL) == 0) {
		mtx_lock_spin(&sched_lock);
		p->p_flag |= P_PROFIL;
		if (++profprocs == 1)
			cpu_startprofclock();
		mtx_unlock_spin(&sched_lock);
	}
}

/*
 * Stop profiling on a process.
 */
void
stopprofclock(struct proc *p)
{

	PROC_LOCK_ASSERT(p, MA_OWNED);
	if (p->p_flag & P_PROFIL) {
		if (p->p_profthreads != 0) {
			p->p_flag |= P_STOPPROF;
			while (p->p_profthreads != 0)
				msleep(&p->p_profthreads, &p->p_mtx, PPAUSE,
				    "stopprof", 0);
			p->p_flag &= ~P_STOPPROF;
		}
		mtx_lock_spin(&sched_lock);
		p->p_flag &= ~P_PROFIL;
		if (--profprocs == 0)
			cpu_stopprofclock();
		mtx_unlock_spin(&sched_lock);
	}
}

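/*
 * Example (illustrative, userland): these routines are reached via
 * profil(2), which asks the kernel to sample the user pc into a
 * histogram buffer:
 *
 *	unsigned short buf[8192];
 *	(void)profil((char *)buf, sizeof(buf), offset, 0x10000);
 *
 * A scale of 0x10000 maps pc's to histogram words 1:1; "offset" here
 * stands for the lowest text address of interest.  Calling profil()
 * again with a scale of 0 stops sampling and ends in stopprofclock().
 */
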
/*
 * Statistics clock.  Grab profile sample, and if divider reaches 0,
 * do process and kernel statistics.  Most of the statistics are only
 * used by user-level statistics programs.  The main exceptions are
 * p->p_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu.
 * This should be called by all active processors.
 */
void
statclock(struct clockframe *frame)
{
	struct pstats *pstats;
	struct rusage *ru;
	struct vmspace *vm;
	struct thread *td;
	struct proc *p;
	long rss;

	td = curthread;
	p = td->td_proc;

	mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
	if (CLKF_USERMODE(frame)) {
		/*
		 * Charge the time as appropriate.
		 */
		if (p->p_flag & P_SA)
			thread_statclock(1);
		p->p_uticks++;
		if (td->td_ksegrp->kg_nice > NZERO)
			cp_time[CP_NICE]++;
		else
			cp_time[CP_USER]++;
	} else {
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		if ((td->td_ithd != NULL) || td->td_intr_nesting_level >= 2) {
			p->p_iticks++;
			cp_time[CP_INTR]++;
		} else {
			if (p->p_flag & P_SA)
				thread_statclock(0);
			td->td_sticks++;
			p->p_sticks++;
			if (p != PCPU_GET(idlethread)->td_proc)
				cp_time[CP_SYS]++;
			else
				cp_time[CP_IDLE]++;
		}
	}

	sched_clock(td);

	/* Update resource usage integrals and maximums. */
	if ((pstats = p->p_stats) != NULL &&
	    (ru = &pstats->p_ru) != NULL &&
	    (vm = p->p_vmspace) != NULL) {
		ru->ru_ixrss += pgtok(vm->vm_tsize);
		ru->ru_idrss += pgtok(vm->vm_dsize);
		ru->ru_isrss += pgtok(vm->vm_ssize);
		rss = pgtok(vmspace_resident_count(vm));
		if (ru->ru_maxrss < rss)
			ru->ru_maxrss = rss;
	}
	mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
}

void
profclock(struct clockframe *frame)
{
	struct thread *td;
#ifdef GPROF
	struct gmonparam *g;
	int i;
#endif

	td = curthread;
	if (CLKF_USERMODE(frame)) {
		/*
		 * Came from user mode; CPU was in user state.
		 * If this process is being profiled, record the tick;
		 * if there is no related user location yet, don't
		 * bother trying to count it.
		 */
		if (td->td_proc->p_flag & P_PROFIL)
			addupc_intr(td, CLKF_PC(frame), 1);
	}
#ifdef GPROF
	else {
		/*
		 * Kernel statistics are just like addupc_intr(), only easier.
		 */
		g = &_gmonparam;
		if (g->state == GMON_PROF_ON) {
			i = CLKF_PC(frame) - g->lowpc;
			if (i < g->textsize) {
				i /= HISTFRACTION * sizeof(*g->kcount);
				g->kcount[i]++;
			}
		}
	}
#endif
}

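/*
 * Example (illustrative): with the common HISTFRACTION of 2 and 2-byte
 * kcount entries, each kernel histogram bucket covers
 * 2 * sizeof(*g->kcount) = 4 bytes of text, so a pc at lowpc + 0x1234
 * increments kcount[0x1234 / 4] = kcount[1165].
 */
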
/*
 * Return information about system clocks.
 */
static int
sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
{
	struct clockinfo clkinfo;

	/*
	 * Construct clockinfo structure.
	 */
	bzero(&clkinfo, sizeof(clkinfo));
	clkinfo.hz = hz;
	clkinfo.tick = tick;
	clkinfo.profhz = profhz;
	clkinfo.stathz = stathz ? stathz : hz;
	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof(clkinfo), req));
}

SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
	0, 0, sysctl_kern_clockrate, "S,clockinfo",
	"Rate and period of various kernel clocks");

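/*
 * Example (illustrative): from userland this reads back as, e.g.,
 *
 *	$ sysctl kern.clockrate
 *	kern.clockrate: { hz = 100, tick = 10000, profhz = 1024, stathz = 128 }
 *
 * The exact numbers are machine-dependent; stathz reports as hz when
 * there is no separate statistics clock.
 */
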
#ifdef WATCHDOG
/*
 * Reset the watchdog timer to the current tick count, thus preventing
 * the watchdog from firing for another watchdog timeout period.
 */
static int
sysctl_watchdog_reset(SYSCTL_HANDLER_ARGS)
{
	int ret;

	ret = 0;
	watchdog_ticks = ticks;
	return (sysctl_handle_int(oidp, &ret, 0, req));
}

SYSCTL_PROC(_debug_watchdog, OID_AUTO, reset, CTLFLAG_RW, 0, 0,
    sysctl_watchdog_reset, "I", "Reset the watchdog");

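/*
 * Example (illustrative): arming the watchdog from userland.  Some
 * process must then poke the reset node at least once per timeout
 * period, or watchdog_fire() runs:
 *
 *	# sysctl debug.watchdog.timeout=20
 *	# sysctl debug.watchdog.enabled=1
 *	# sysctl debug.watchdog.reset=1
 */
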
/*
 * Handle a watchdog timeout by dumping interrupt information and
 * then either dropping to DDB or panicking.
 */
static void
watchdog_fire(void)
{
	int nintr;
	u_int64_t inttotal;
	u_long *curintr;
	char *curname;

	curintr = intrcnt;
	curname = intrnames;
	inttotal = 0;
	nintr = eintrcnt - intrcnt;

	printf("interrupt                   total\n");
	while (--nintr >= 0) {
		if (*curintr)
			printf("%-12s %20lu\n", curname, *curintr);
		curname += strlen(curname) + 1;
		inttotal += *curintr++;
	}
	printf("Total        %20ju\n", (uintmax_t)inttotal);

#ifdef DDB
	db_print_backtrace();
	Debugger("watchdog timeout");
#else /* !DDB */
	panic("watchdog timeout");
#endif /* DDB */
}

#endif /* WATCHDOG */
564