1 /*- 2 * Copyright (c) 1982, 1986, 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ntp.h" 41 #include "opt_watchdog.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/callout.h> 46 #include <sys/kdb.h> 47 #include <sys/kernel.h> 48 #include <sys/lock.h> 49 #include <sys/ktr.h> 50 #include <sys/mutex.h> 51 #include <sys/proc.h> 52 #include <sys/resource.h> 53 #include <sys/resourcevar.h> 54 #include <sys/sched.h> 55 #include <sys/signalvar.h> 56 #include <sys/smp.h> 57 #include <vm/vm.h> 58 #include <vm/pmap.h> 59 #include <vm/vm_map.h> 60 #include <sys/sysctl.h> 61 #include <sys/bus.h> 62 #include <sys/interrupt.h> 63 #include <sys/limits.h> 64 #include <sys/timetc.h> 65 66 #include <machine/cpu.h> 67 68 #ifdef GPROF 69 #include <sys/gmon.h> 70 #endif 71 72 #ifdef DEVICE_POLLING 73 extern void hardclock_device_poll(void); 74 #endif /* DEVICE_POLLING */ 75 76 static void initclocks(void *dummy); 77 SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) 78 79 /* Some of these don't belong here, but it's easiest to concentrate them. */ 80 long cp_time[CPUSTATES]; 81 82 SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time), 83 "LU", "CPU time statistics"); 84 85 #ifdef SW_WATCHDOG 86 #include <sys/watchdog.h> 87 88 static int watchdog_ticks; 89 static int watchdog_enabled; 90 static void watchdog_fire(void); 91 static void watchdog_config(void *, u_int, int *); 92 #endif /* SW_WATCHDOG */ 93 94 /* 95 * Clock handling routines. 96 * 97 * This code is written to operate with two timers that run independently of 98 * each other. 99 * 100 * The main timer, running hz times per second, is used to trigger interval 101 * timers, timeouts and rescheduling as needed. 102 * 103 * The second timer handles kernel and user profiling, 104 * and does resource use estimation. If the second timer is programmable, 105 * it is randomized to avoid aliasing between the two clocks. For example, 106 * the randomization prevents an adversary from always giving up the cpu 107 * just before its quantum expires. Otherwise, it would never accumulate 108 * cpu ticks. The mean frequency of the second timer is stathz. 109 * 110 * If no second timer exists, stathz will be zero; in this case we drive 111 * profiling and statistics off the main clock. This WILL NOT be accurate; 112 * do not do it unless absolutely necessary. 113 * 114 * The statistics clock may (or may not) be run at a higher rate while 115 * profiling. This profile clock runs at profhz. We require that profhz 116 * be an integral multiple of stathz. 117 * 118 * If the statistics clock is running fast, it must be divided by the ratio 119 * profhz/stathz for statistics. (For profiling, every tick counts.) 120 * 121 * Time-of-day is maintained using a "timecounter", which may or may 122 * not be related to the hardware generating the above mentioned 123 * interrupts. 124 */ 125 126 int stathz; 127 int profhz; 128 int profprocs; 129 int ticks; 130 int psratio; 131 132 /* 133 * Initialize clock frequencies and start both clocks running. 134 */ 135 /* ARGSUSED*/ 136 static void 137 initclocks(dummy) 138 void *dummy; 139 { 140 register int i; 141 142 /* 143 * Set divisors to 1 (normal case) and let the machine-specific 144 * code do its bit. 145 */ 146 cpu_initclocks(); 147 148 /* 149 * Compute profhz/stathz, and fix profhz if needed. 150 */ 151 i = stathz ? stathz : hz; 152 if (profhz == 0) 153 profhz = i; 154 psratio = profhz / i; 155 #ifdef SW_WATCHDOG 156 EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0); 157 #endif 158 } 159 160 /* 161 * Each time the real-time timer fires, this function is called on all CPUs. 162 * Note that hardclock() calls hardclock_process() for the boot CPU, so only 163 * the other CPUs in the system need to call this function. 164 */ 165 void 166 hardclock_process(frame) 167 register struct clockframe *frame; 168 { 169 struct pstats *pstats; 170 struct thread *td = curthread; 171 struct proc *p = td->td_proc; 172 173 /* 174 * Run current process's virtual and profile time, as needed. 175 */ 176 mtx_lock_spin_flags(&sched_lock, MTX_QUIET); 177 if (p->p_flag & P_SA) { 178 /* XXXKSE What to do? */ 179 } else { 180 pstats = p->p_stats; 181 if (CLKF_USERMODE(frame) && 182 timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && 183 itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { 184 p->p_sflag |= PS_ALRMPEND; 185 td->td_flags |= TDF_ASTPENDING; 186 } 187 if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && 188 itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { 189 p->p_sflag |= PS_PROFPEND; 190 td->td_flags |= TDF_ASTPENDING; 191 } 192 } 193 mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); 194 } 195 196 /* 197 * The real-time timer, interrupting hz times per second. 198 */ 199 void 200 hardclock(frame) 201 register struct clockframe *frame; 202 { 203 int need_softclock = 0; 204 205 CTR0(KTR_CLK, "hardclock fired"); 206 hardclock_process(frame); 207 208 tc_ticktock(); 209 /* 210 * If no separate statistics clock is available, run it from here. 211 * 212 * XXX: this only works for UP 213 */ 214 if (stathz == 0) { 215 profclock(frame); 216 statclock(frame); 217 } 218 219 #ifdef DEVICE_POLLING 220 hardclock_device_poll(); /* this is very short and quick */ 221 #endif /* DEVICE_POLLING */ 222 223 /* 224 * Process callouts at a very low cpu priority, so we don't keep the 225 * relatively high clock interrupt priority any longer than necessary. 226 */ 227 mtx_lock_spin_flags(&callout_lock, MTX_QUIET); 228 ticks++; 229 if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) { 230 need_softclock = 1; 231 } else if (softticks + 1 == ticks) 232 ++softticks; 233 mtx_unlock_spin_flags(&callout_lock, MTX_QUIET); 234 235 /* 236 * swi_sched acquires sched_lock, so we don't want to call it with 237 * callout_lock held; incorrect locking order. 238 */ 239 if (need_softclock) 240 swi_sched(softclock_ih, 0); 241 242 #ifdef SW_WATCHDOG 243 if (watchdog_enabled > 0 && --watchdog_ticks <= 0) 244 watchdog_fire(); 245 #endif /* SW_WATCHDOG */ 246 } 247 248 /* 249 * Compute number of ticks in the specified amount of time. 250 */ 251 int 252 tvtohz(tv) 253 struct timeval *tv; 254 { 255 register unsigned long ticks; 256 register long sec, usec; 257 258 /* 259 * If the number of usecs in the whole seconds part of the time 260 * difference fits in a long, then the total number of usecs will 261 * fit in an unsigned long. Compute the total and convert it to 262 * ticks, rounding up and adding 1 to allow for the current tick 263 * to expire. Rounding also depends on unsigned long arithmetic 264 * to avoid overflow. 265 * 266 * Otherwise, if the number of ticks in the whole seconds part of 267 * the time difference fits in a long, then convert the parts to 268 * ticks separately and add, using similar rounding methods and 269 * overflow avoidance. This method would work in the previous 270 * case but it is slightly slower and assumes that hz is integral. 271 * 272 * Otherwise, round the time difference down to the maximum 273 * representable value. 274 * 275 * If ints have 32 bits, then the maximum value for any timeout in 276 * 10ms ticks is 248 days. 277 */ 278 sec = tv->tv_sec; 279 usec = tv->tv_usec; 280 if (usec < 0) { 281 sec--; 282 usec += 1000000; 283 } 284 if (sec < 0) { 285 #ifdef DIAGNOSTIC 286 if (usec > 0) { 287 sec++; 288 usec -= 1000000; 289 } 290 printf("tvotohz: negative time difference %ld sec %ld usec\n", 291 sec, usec); 292 #endif 293 ticks = 1; 294 } else if (sec <= LONG_MAX / 1000000) 295 ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) 296 / tick + 1; 297 else if (sec <= LONG_MAX / hz) 298 ticks = sec * hz 299 + ((unsigned long)usec + (tick - 1)) / tick + 1; 300 else 301 ticks = LONG_MAX; 302 if (ticks > INT_MAX) 303 ticks = INT_MAX; 304 return ((int)ticks); 305 } 306 307 /* 308 * Start profiling on a process. 309 * 310 * Kernel profiling passes proc0 which never exits and hence 311 * keeps the profile clock running constantly. 312 */ 313 void 314 startprofclock(p) 315 register struct proc *p; 316 { 317 318 /* 319 * XXX; Right now sched_lock protects statclock(), but perhaps 320 * it should be protected later on by a time_lock, which would 321 * cover psdiv, etc. as well. 322 */ 323 PROC_LOCK_ASSERT(p, MA_OWNED); 324 if (p->p_flag & P_STOPPROF) 325 return; 326 if ((p->p_flag & P_PROFIL) == 0) { 327 mtx_lock_spin(&sched_lock); 328 p->p_flag |= P_PROFIL; 329 if (++profprocs == 1) 330 cpu_startprofclock(); 331 mtx_unlock_spin(&sched_lock); 332 } 333 } 334 335 /* 336 * Stop profiling on a process. 337 */ 338 void 339 stopprofclock(p) 340 register struct proc *p; 341 { 342 343 PROC_LOCK_ASSERT(p, MA_OWNED); 344 if (p->p_flag & P_PROFIL) { 345 if (p->p_profthreads != 0) { 346 p->p_flag |= P_STOPPROF; 347 while (p->p_profthreads != 0) 348 msleep(&p->p_profthreads, &p->p_mtx, PPAUSE, 349 "stopprof", 0); 350 p->p_flag &= ~P_STOPPROF; 351 } 352 if ((p->p_flag & P_PROFIL) == 0) 353 return; 354 mtx_lock_spin(&sched_lock); 355 p->p_flag &= ~P_PROFIL; 356 if (--profprocs == 0) 357 cpu_stopprofclock(); 358 mtx_unlock_spin(&sched_lock); 359 } 360 } 361 362 /* 363 * Statistics clock. Grab profile sample, and if divider reaches 0, 364 * do process and kernel statistics. Most of the statistics are only 365 * used by user-level statistics programs. The main exceptions are 366 * ke->ke_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu. 367 * This should be called by all active processors. 368 */ 369 void 370 statclock(frame) 371 register struct clockframe *frame; 372 { 373 struct rusage *ru; 374 struct vmspace *vm; 375 struct thread *td; 376 struct proc *p; 377 long rss; 378 379 td = curthread; 380 p = td->td_proc; 381 382 mtx_lock_spin_flags(&sched_lock, MTX_QUIET); 383 if (CLKF_USERMODE(frame)) { 384 /* 385 * Charge the time as appropriate. 386 */ 387 if (p->p_flag & P_SA) 388 thread_statclock(1); 389 p->p_uticks++; 390 if (p->p_nice > NZERO) 391 cp_time[CP_NICE]++; 392 else 393 cp_time[CP_USER]++; 394 } else { 395 /* 396 * Came from kernel mode, so we were: 397 * - handling an interrupt, 398 * - doing syscall or trap work on behalf of the current 399 * user process, or 400 * - spinning in the idle loop. 401 * Whichever it is, charge the time as appropriate. 402 * Note that we charge interrupts to the current process, 403 * regardless of whether they are ``for'' that process, 404 * so that we know how much of its real time was spent 405 * in ``non-process'' (i.e., interrupt) work. 406 */ 407 if ((td->td_ithd != NULL) || td->td_intr_nesting_level >= 2) { 408 p->p_iticks++; 409 cp_time[CP_INTR]++; 410 } else { 411 if (p->p_flag & P_SA) 412 thread_statclock(0); 413 td->td_sticks++; 414 p->p_sticks++; 415 if (p != PCPU_GET(idlethread)->td_proc) 416 cp_time[CP_SYS]++; 417 else 418 cp_time[CP_IDLE]++; 419 } 420 } 421 422 sched_clock(td); 423 424 /* Update resource usage integrals and maximums. */ 425 MPASS(p->p_stats != NULL); 426 MPASS(p->p_vmspace != NULL); 427 vm = p->p_vmspace; 428 ru = &p->p_stats->p_ru; 429 ru->ru_ixrss += pgtok(vm->vm_tsize); 430 ru->ru_idrss += pgtok(vm->vm_dsize); 431 ru->ru_isrss += pgtok(vm->vm_ssize); 432 rss = pgtok(vmspace_resident_count(vm)); 433 if (ru->ru_maxrss < rss) 434 ru->ru_maxrss = rss; 435 mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); 436 } 437 438 void 439 profclock(frame) 440 register struct clockframe *frame; 441 { 442 struct thread *td; 443 #ifdef GPROF 444 struct gmonparam *g; 445 int i; 446 #endif 447 448 td = curthread; 449 if (CLKF_USERMODE(frame)) { 450 /* 451 * Came from user mode; CPU was in user state. 452 * If this process is being profiled, record the tick. 453 * if there is no related user location yet, don't 454 * bother trying to count it. 455 */ 456 if (td->td_proc->p_flag & P_PROFIL) 457 addupc_intr(td, CLKF_PC(frame), 1); 458 } 459 #ifdef GPROF 460 else { 461 /* 462 * Kernel statistics are just like addupc_intr, only easier. 463 */ 464 g = &_gmonparam; 465 if (g->state == GMON_PROF_ON) { 466 i = CLKF_PC(frame) - g->lowpc; 467 if (i < g->textsize) { 468 i /= HISTFRACTION * sizeof(*g->kcount); 469 g->kcount[i]++; 470 } 471 } 472 } 473 #endif 474 } 475 476 /* 477 * Return information about system clocks. 478 */ 479 static int 480 sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS) 481 { 482 struct clockinfo clkinfo; 483 /* 484 * Construct clockinfo structure. 485 */ 486 bzero(&clkinfo, sizeof(clkinfo)); 487 clkinfo.hz = hz; 488 clkinfo.tick = tick; 489 clkinfo.profhz = profhz; 490 clkinfo.stathz = stathz ? stathz : hz; 491 return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); 492 } 493 494 SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, 495 0, 0, sysctl_kern_clockrate, "S,clockinfo", 496 "Rate and period of various kernel clocks"); 497 498 #ifdef SW_WATCHDOG 499 500 static void 501 watchdog_config(void *unused __unused, u_int cmd, int *err) 502 { 503 u_int u; 504 505 u = cmd & WD_INTERVAL; 506 if (cmd && u >= WD_TO_1SEC) { 507 u = cmd & WD_INTERVAL; 508 watchdog_ticks = (1 << (u - WD_TO_1SEC)) * hz; 509 watchdog_enabled = 1; 510 *err = 0; 511 } else { 512 watchdog_enabled = 0; 513 } 514 } 515 516 /* 517 * Handle a watchdog timeout by dumping interrupt information and 518 * then either dropping to DDB or panicing. 519 */ 520 static void 521 watchdog_fire(void) 522 { 523 int nintr; 524 u_int64_t inttotal; 525 u_long *curintr; 526 char *curname; 527 528 curintr = intrcnt; 529 curname = intrnames; 530 inttotal = 0; 531 nintr = eintrcnt - intrcnt; 532 533 printf("interrupt total\n"); 534 while (--nintr >= 0) { 535 if (*curintr) 536 printf("%-12s %20lu\n", curname, *curintr); 537 curname += strlen(curname) + 1; 538 inttotal += *curintr++; 539 } 540 printf("Total %20ju\n", (uintmax_t)inttotal); 541 542 #ifdef KDB 543 kdb_backtrace(); 544 kdb_enter("watchdog timeout"); 545 #else 546 panic("watchdog timeout"); 547 #endif /* KDB */ 548 } 549 550 #endif /* SW_WATCHDOG */ 551