xref: /illumos-gate/usr/src/uts/i86pc/os/timestamp.c (revision f875b4ebb1dd9fdbeb043557cab38ab3bf7f6e01)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/disp.h>
32 #include <sys/var.h>
33 #include <sys/cmn_err.h>
34 #include <sys/debug.h>
35 #include <sys/x86_archext.h>
36 #include <sys/archsystm.h>
37 #include <sys/cpuvar.h>
38 #include <sys/psm_defs.h>
39 #include <sys/clock.h>
40 #include <sys/atomic.h>
41 #include <sys/lockstat.h>
42 #include <sys/smp_impldefs.h>
43 #include <sys/dtrace.h>
44 #include <sys/time.h>
45 
46 /*
47  * Using the Pentium's TSC register for gethrtime()
48  * ------------------------------------------------
49  *
50  * The Pentium family, like many chip architectures, has a high-resolution
51  * timestamp counter ("TSC") which increments once per CPU cycle.  The contents
52  * of the timestamp counter are read with the RDTSC instruction.
53  *
54  * As with its UltraSPARC equivalent (the %tick register), TSC's cycle count
55  * must be translated into nanoseconds in order to implement gethrtime().
56  * We avoid inducing floating point operations in this conversion by
57  * implementing the same nsec_scale algorithm as that found in the sun4u
58  * platform code.  The sun4u NATIVE_TIME_TO_NSEC_SCALE block comment contains
59  * a detailed description of the algorithm; the comment is not reproduced
60  * here.  This implementation differs only in its value for NSEC_SHIFT:
61  * we implement an NSEC_SHIFT of 5 (instead of sun4u's 4) to allow for
62  * 60 MHz Pentiums.
63  *
64  * While TSC and %tick are both cycle counting registers, TSC's functionality
65  * falls short in several critical ways:
66  *
67  *  (a)	TSCs on different CPUs are not guaranteed to be in sync.  While in
68  *	practice they often _are_ in sync, this isn't guaranteed by the
69  *	architecture.
70  *
71  *  (b)	The TSC cannot be reliably set to an arbitrary value.  The architecture
72  *	only supports writing the low 32-bits of TSC, making it impractical
73  *	to rewrite.
74  *
75  *  (c)	The architecture doesn't have the capacity to interrupt based on
76  *	arbitrary values of TSC; there is no TICK_CMPR equivalent.
77  *
78  * Together, (a) and (b) imply that software must track the skew between
79  * TSCs and account for it (it is assumed that while there may exist skew,
80  * there does not exist drift).  To determine the skew between CPUs, we
81  * have newly onlined CPUs call tsc_sync_slave(), while the CPU performing
82  * the online operation calls tsc_sync_master().  Once both CPUs are ready,
83  * the master sets a shared flag, and each reads its TSC register.  To reduce
84  * bias, we then wait until both CPUs are ready again, but this time the
85  * slave sets the shared flag, and each reads its TSC register again. The
86  * master compares the average of the two sample values, and, if observable
87  * skew is found, changes the gethrtimef function pointer to point to a
88  * gethrtime() implementation which will take the discovered skew into
89  * consideration.
90  *
91  * In the absence of time-of-day clock adjustments, gethrtime() must stay in
92  * sync with gettimeofday().  This is problematic; given (c), the software
93  * cannot drive its time-of-day source from TSC, and yet they must somehow be
94  * kept in sync.  We implement this by having a routine, tsc_tick(), which
95  * is called once per second from the interrupt which drives time-of-day.
96  * tsc_tick() recalculates nsec_scale based on the number of the CPU cycles
97  * since boot versus the number of seconds since boot.  This algorithm
98  * becomes more accurate over time and converges quickly; the error in
99  * nsec_scale is typically under 1 ppm less than 10 seconds after boot, and
100  * is less than 100 ppb 1 minute after boot.
101  *
102  * Note that the hrtime base for gethrtime, tsc_hrtime_base, is modified
103  * atomically with nsec_scale under CLOCK_LOCK.  This assures that time
104  * monotonically increases.
105  */
106 
/*
 * Shift used by the TSC_CONVERT*() macros when scaling TSC ticks to
 * nanoseconds; tsc_hrtimeinit() uses the same shift to derive nsec_scale.
 */
#define	NSEC_SHIFT 5

/*
 * Tick-to-nanosecond scale factor, set by tsc_hrtimeinit() to
 * (NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz.
 */
static uint_t nsec_scale;
110 
/*
 * These two variables used to be grouped together inside of a structure that
 * lived on a single cache line. A regression (bug ID 4623398) caused the
 * compiler to emit code that "optimized" away the while-loops below. The
 * result was that no synchronization between the onlining and onlined CPUs
 * took place.
 *
 * tsc_ready is set to TSC_SYNC_GO and TSC_SYNC_STOP by tsc_sync_slave() and
 * to TSC_SYNC_AGAIN by tsc_sync_master(); tsc_sync_go is set to TSC_SYNC_GO
 * and TSC_SYNC_STOP by tsc_sync_master() and to TSC_SYNC_AGAIN by
 * tsc_sync_slave().  Each CPU spins waiting for the value that the other
 * CPU stores.
 */
static volatile int tsc_ready;
static volatile int tsc_sync_go;

/*
 * Used as indices into the tsc_sync_snaps[] array.
 */
#define	TSC_MASTER		0
#define	TSC_SLAVE		1

/*
 * Used in the tsc_sync_master()/tsc_sync_slave() rendezvous.
 */
#define	TSC_SYNC_STOP		1
#define	TSC_SYNC_GO		2
#define	TSC_SYNC_AGAIN		3
133 
/*
 * XX64	Is there a faster way to do this with a 64-bit ABI?
 */
137 
/*
 * Scale the 64-bit tick count `tsc' to nanoseconds using the 32-bit
 * fixed-point factor `scale', and add the result to `hrt'.  The high and low
 * 32-bit halves of `tsc' are multiplied separately with mul32(); the high
 * product is shifted left by NSEC_SHIFT and the low product is shifted right
 * by (32 - NSEC_SHIFT).
 *
 * The halves are extracted with shifts rather than by aliasing `tsc' through
 * an unsigned int pointer, so the macro does not depend on byte order and
 * `tsc' is evaluated exactly once.  The body is wrapped in do { } while (0)
 * so that an invocation followed by a semicolon is a single statement.
 */
#define	TSC_CONVERT_AND_ADD(tsc, hrt, scale) do {			\
	uint64_t _tsc64 = (uint64_t)(tsc);				\
	(hrt) += mul32((uint32_t)(_tsc64 >> 32), (scale)) << NSEC_SHIFT; \
	(hrt) += mul32((uint32_t)_tsc64, (scale)) >>			\
	    (32 - NSEC_SHIFT);						\
} while (0)
143 
/*
 * Scale the 64-bit tick count `tsc' to nanoseconds using the 32-bit
 * fixed-point factor `scale', and store the result in `hrt' (any previous
 * value of `hrt' is overwritten).  The high and low 32-bit halves of `tsc'
 * are multiplied separately with mul32(); the high product is shifted left
 * by NSEC_SHIFT and the low product is shifted right by (32 - NSEC_SHIFT).
 *
 * The halves are extracted with shifts rather than by aliasing `tsc' through
 * an unsigned int pointer, so the macro does not depend on byte order and
 * `tsc' is evaluated exactly once.  The body is wrapped in do { } while (0)
 * so that an invocation followed by a semicolon is a single statement.
 */
#define	TSC_CONVERT(tsc, hrt, scale) do {				\
	uint64_t _tsc64 = (uint64_t)(tsc);				\
	(hrt) = mul32((uint32_t)(_tsc64 >> 32), (scale)) << NSEC_SHIFT;	\
	(hrt) += mul32((uint32_t)_tsc64, (scale)) >>			\
	    (32 - NSEC_SHIFT);						\
} while (0)
149 
/*
 * When zero, tsc_sync_master() and tsc_sync_slave() return immediately
 * without performing the rendezvous.
 */
int tsc_master_slave_sync_needed = 1;

/* TSC ticks measured across one tsc_gethrtime() call; see tsc_hrtimeinit() */
static int	tsc_max_delta;
/* Summed TSC samples from the rendezvous, indexed by TSC_MASTER/TSC_SLAVE */
static hrtime_t tsc_sync_snaps[2];
/* Per-CPU skew in nanoseconds, as computed by tsc_digest() */
static hrtime_t tsc_sync_delta[NCPU];
/* Per-CPU skew in TSC ticks; added to raw tsc_read() values */
static hrtime_t tsc_sync_tick_delta[NCPU];
/* (Skew-adjusted) TSC value recorded by the most recent tsc_tick() */
static hrtime_t	tsc_last = 0;
/* Sum of the tsc_last values in effect whenever tsc_tick() saw TSC go back */
static hrtime_t	tsc_last_jumped = 0;
/* hrtime, in nanoseconds, accumulated by tsc_tick() as of tsc_last */
static hrtime_t	tsc_hrtime_base = 0;
/* Set to 1 by tsc_tick() once TSC has been observed to go backwards */
static int	tsc_jumped = 0;

/*
 * Copies of tsc_hrtime_base, tsc_last and nsec_scale taken by tsc_tick()
 * under shadow_hres_lock before it updates the live values; used as a
 * fallback by dtrace_gethrtime().
 */
static hrtime_t	shadow_tsc_hrtime_base;
static hrtime_t	shadow_tsc_last;
static uint_t	shadow_nsec_scale;
static uint32_t	shadow_hres_lock;
165 
166 /*
167  * Called by the master after the sync operation is complete.  If the
168  * slave is discovered to lag, gethrtimef will be changed to point to
169  * tsc_gethrtime_delta().
170  */
171 static void
172 tsc_digest(processorid_t target)
173 {
174 	hrtime_t tdelta, hdelta = 0;
175 	int max = tsc_max_delta;
176 	processorid_t source = CPU->cpu_id;
177 	int update;
178 
179 	update = tsc_sync_delta[source] != 0 ||
180 	    gethrtimef == tsc_gethrtime_delta;
181 
182 	/*
183 	 * We divide by 2 since each of the data points is the sum of two TSC
184 	 * reads; this takes the average of the two.
185 	 */
186 	tdelta = (tsc_sync_snaps[TSC_SLAVE] - tsc_sync_snaps[TSC_MASTER]) / 2;
187 	if ((tdelta > max) || ((tdelta >= 0) && update)) {
188 		TSC_CONVERT_AND_ADD(tdelta, hdelta, nsec_scale);
189 		tsc_sync_delta[target] = tsc_sync_delta[source] - hdelta;
190 		tsc_sync_tick_delta[target] = -tdelta;
191 		gethrtimef = tsc_gethrtime_delta;
192 		gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
193 		return;
194 	}
195 
196 	tdelta = -tdelta;
197 	if ((tdelta > max) || update) {
198 		TSC_CONVERT_AND_ADD(tdelta, hdelta, nsec_scale);
199 		tsc_sync_delta[target] = tsc_sync_delta[source] + hdelta;
200 		tsc_sync_tick_delta[target] = tdelta;
201 		gethrtimef = tsc_gethrtime_delta;
202 		gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
203 	}
204 
205 }
206 
207 /*
208  * Called by a CPU which has just performed an online operation on another
209  * CPU.  It is expected that the newly onlined CPU will call tsc_sync_slave().
210  */
211 void
212 tsc_sync_master(processorid_t slave)
213 {
214 	ulong_t flags;
215 	hrtime_t hrt;
216 
217 	if (!tsc_master_slave_sync_needed)
218 		return;
219 
220 	ASSERT(tsc_sync_go != TSC_SYNC_GO);
221 
222 	flags = clear_int_flag();
223 
224 	/*
225 	 * Wait for the slave CPU to arrive.
226 	 */
227 	while (tsc_ready != TSC_SYNC_GO)
228 		continue;
229 
230 	/*
231 	 * Tell the slave CPU to begin reading its TSC; read our own.
232 	 */
233 	tsc_sync_go = TSC_SYNC_GO;
234 	hrt = tsc_read();
235 
236 	/*
237 	 * Tell the slave that we're ready, and wait for the slave to tell us
238 	 * to read our TSC again.
239 	 */
240 	tsc_ready = TSC_SYNC_AGAIN;
241 	while (tsc_sync_go != TSC_SYNC_AGAIN)
242 		continue;
243 
244 	hrt += tsc_read();
245 	tsc_sync_snaps[TSC_MASTER] = hrt;
246 
247 	/*
248 	 * Wait for the slave to finish reading its TSC.
249 	 */
250 	while (tsc_ready != TSC_SYNC_STOP)
251 		continue;
252 
253 	/*
254 	 * At this point, both CPUs have performed their tsc_read() calls.
255 	 * We'll digest it now before letting the slave CPU return.
256 	 */
257 	tsc_digest(slave);
258 	tsc_sync_go = TSC_SYNC_STOP;
259 
260 	restore_int_flag(flags);
261 }
262 
263 /*
264  * Called by a CPU which has just been onlined.  It is expected that the CPU
265  * performing the online operation will call tsc_sync_master().
266  */
267 void
268 tsc_sync_slave(void)
269 {
270 	ulong_t flags;
271 	hrtime_t hrt;
272 
273 	if (!tsc_master_slave_sync_needed)
274 		return;
275 
276 	ASSERT(tsc_sync_go != TSC_SYNC_GO);
277 
278 	flags = clear_int_flag();
279 
280 	/* to test tsc_gethrtime_delta, add wrmsr(REG_TSC, 0) here */
281 
282 	/*
283 	 * Tell the master CPU that we're ready, and wait for the master to
284 	 * tell us to begin reading our TSC.
285 	 */
286 	tsc_ready = TSC_SYNC_GO;
287 	while (tsc_sync_go != TSC_SYNC_GO)
288 		continue;
289 
290 	hrt = tsc_read();
291 
292 	/*
293 	 * Wait for the master CPU to be ready to read its TSC again.
294 	 */
295 	while (tsc_ready != TSC_SYNC_AGAIN)
296 		continue;
297 
298 	/*
299 	 * Tell the master CPU to read its TSC again; read ours again.
300 	 */
301 	tsc_sync_go = TSC_SYNC_AGAIN;
302 
303 	hrt += tsc_read();
304 	tsc_sync_snaps[TSC_SLAVE] = hrt;
305 
306 	/*
307 	 * Tell the master that we're done, and wait to be dismissed.
308 	 */
309 	tsc_ready = TSC_SYNC_STOP;
310 	while (tsc_sync_go != TSC_SYNC_STOP)
311 		continue;
312 
313 	restore_int_flag(flags);
314 }
315 
316 void
317 tsc_hrtimeinit(uint64_t cpu_freq_hz)
318 {
319 	longlong_t tsc;
320 	ulong_t flags;
321 
322 	/*
323 	 * cpu_freq_hz is the measured cpu frequency in hertz
324 	 */
325 
326 	/*
327 	 * We can't accommodate CPUs slower than 31.25 MHz.
328 	 */
329 	ASSERT(cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT));
330 	nsec_scale =
331 	    (uint_t)
332 		(((uint64_t)NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz);
333 
334 	flags = clear_int_flag();
335 	tsc = tsc_read();
336 	(void) tsc_gethrtime();
337 	tsc_max_delta = tsc_read() - tsc;
338 	restore_int_flag(flags);
339 }
340 
341 /*
342  * Called once per second on a CPU from the cyclic subsystem's
343  * CY_HIGH_LEVEL interrupt.  (No longer just cpu0-only)
344  */
345 void
346 tsc_tick(void)
347 {
348 	hrtime_t now, delta;
349 	ushort_t spl;
350 
351 	/*
352 	 * Before we set the new variables, we set the shadow values.  This
353 	 * allows for lock free operation in dtrace_gethrtime().
354 	 */
355 	lock_set_spl((lock_t *)&shadow_hres_lock + HRES_LOCK_OFFSET,
356 	    ipltospl(CBE_HIGH_PIL), &spl);
357 
358 	shadow_tsc_hrtime_base = tsc_hrtime_base;
359 	shadow_tsc_last = tsc_last;
360 	shadow_nsec_scale = nsec_scale;
361 
362 	shadow_hres_lock++;
363 	splx(spl);
364 
365 	CLOCK_LOCK(&spl);
366 
367 	now = tsc_read();
368 
369 	if (gethrtimef == tsc_gethrtime_delta)
370 		now += tsc_sync_tick_delta[CPU->cpu_id];
371 
372 	if (now < tsc_last) {
373 		/*
374 		 * The TSC has just jumped into the past.  We assume that
375 		 * this is due to a suspend/resume cycle, and we're going
376 		 * to use the _current_ value of TSC as the delta.  This
377 		 * will keep tsc_hrtime_base correct.  We're also going to
378 		 * assume that rate of tsc does not change after a suspend
379 		 * resume (i.e nsec_scale remains the same).
380 		 */
381 		delta = now;
382 		tsc_last_jumped += tsc_last;
383 		tsc_jumped = 1;
384 	} else {
385 		/*
386 		 * Determine the number of TSC ticks since the last clock
387 		 * tick, and add that to the hrtime base.
388 		 */
389 		delta = now - tsc_last;
390 	}
391 
392 	TSC_CONVERT_AND_ADD(delta, tsc_hrtime_base, nsec_scale);
393 	tsc_last = now;
394 
395 	CLOCK_UNLOCK(spl);
396 }
397 
398 hrtime_t
399 tsc_gethrtime(void)
400 {
401 	uint32_t old_hres_lock;
402 	hrtime_t tsc, hrt;
403 
404 	do {
405 		old_hres_lock = hres_lock;
406 
407 		if ((tsc = tsc_read()) >= tsc_last) {
408 			/*
409 			 * It would seem to be obvious that this is true
410 			 * (that is, the past is less than the present),
411 			 * but it isn't true in the presence of suspend/resume
412 			 * cycles.  If we manage to call gethrtime()
413 			 * after a resume, but before the first call to
414 			 * tsc_tick(), we will see the jump.  In this case,
415 			 * we will simply use the value in TSC as the delta.
416 			 */
417 			tsc -= tsc_last;
418 		} else if (tsc >= tsc_last - 2*tsc_max_delta) {
419 			/*
420 			 * There is a chance that tsc_tick() has just run on
421 			 * another CPU, and we have drifted just enough so that
422 			 * we appear behind tsc_last.  In this case, force the
423 			 * delta to be zero.
424 			 */
425 			tsc = 0;
426 		}
427 		hrt = tsc_hrtime_base;
428 
429 		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
430 	} while ((old_hres_lock & ~1) != hres_lock);
431 
432 	return (hrt);
433 }
434 
435 /*
436  * This is similar to the above, but it cannot actually spin on hres_lock.
437  * As a result, it caches all of the variables it needs; if the variables
438  * don't change, it's done.
439  */
440 hrtime_t
441 dtrace_gethrtime(void)
442 {
443 	uint32_t old_hres_lock;
444 	hrtime_t tsc, hrt;
445 	int flags;
446 
447 	do {
448 		old_hres_lock = hres_lock;
449 
450 		/*
451 		 * Interrupts are disabled to ensure that the thread isn't
452 		 * migrated between the tsc_read() and adding the CPU's
453 		 * TSC tick delta.
454 		 */
455 		flags = clear_int_flag();
456 
457 		tsc = tsc_read();
458 
459 		if (gethrtimef == tsc_gethrtime_delta)
460 			tsc += tsc_sync_tick_delta[CPU->cpu_id];
461 
462 		restore_int_flag(flags);
463 
464 		/*
465 		 * See the comments in tsc_gethrtime(), above.
466 		 */
467 		if (tsc >= tsc_last)
468 			tsc -= tsc_last;
469 		else if (tsc >= tsc_last - 2*tsc_max_delta)
470 			tsc = 0;
471 
472 		hrt = tsc_hrtime_base;
473 
474 		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
475 
476 		if ((old_hres_lock & ~1) == hres_lock)
477 			break;
478 
479 		/*
480 		 * If we're here, the clock lock is locked -- or it has been
481 		 * unlocked and locked since we looked.  This may be due to
482 		 * tsc_tick() running on another CPU -- or it may be because
483 		 * some code path has ended up in dtrace_probe() with
484 		 * CLOCK_LOCK held.  We'll try to determine that we're in
485 		 * the former case by taking another lap if the lock has
486 		 * changed since when we first looked at it.
487 		 */
488 		if (old_hres_lock != hres_lock)
489 			continue;
490 
491 		/*
492 		 * So the lock was and is locked.  We'll use the old data
493 		 * instead.
494 		 */
495 		old_hres_lock = shadow_hres_lock;
496 
497 		/*
498 		 * Again, disable interrupts to ensure that the thread
499 		 * isn't migrated between the tsc_read() and adding
500 		 * the CPU's TSC tick delta.
501 		 */
502 		flags = clear_int_flag();
503 
504 		tsc = tsc_read();
505 
506 		if (gethrtimef == tsc_gethrtime_delta)
507 			tsc += tsc_sync_tick_delta[CPU->cpu_id];
508 
509 		restore_int_flag(flags);
510 
511 		/*
512 		 * See the comments in tsc_gethrtime(), above.
513 		 */
514 		if (tsc >= shadow_tsc_last)
515 			tsc -= shadow_tsc_last;
516 		else if (tsc >= shadow_tsc_last - 2 * tsc_max_delta)
517 			tsc = 0;
518 
519 		hrt = shadow_tsc_hrtime_base;
520 
521 		TSC_CONVERT_AND_ADD(tsc, hrt, shadow_nsec_scale);
522 	} while ((old_hres_lock & ~1) != shadow_hres_lock);
523 
524 	return (hrt);
525 }
526 
527 hrtime_t
528 tsc_gethrtime_delta(void)
529 {
530 	uint32_t old_hres_lock;
531 	hrtime_t tsc, hrt;
532 	int flags;
533 
534 	do {
535 		old_hres_lock = hres_lock;
536 
537 		/*
538 		 * We need to disable interrupts here to assure that we
539 		 * don't migrate between the call to tsc_read() and
540 		 * adding the CPU's TSC tick delta. Note that disabling
541 		 * and reenabling preemption is forbidden here because
542 		 * we may be in the middle of a fast trap. In the amd64
543 		 * kernel we cannot tolerate preemption during a fast
544 		 * trap. See _update_sregs().
545 		 */
546 
547 		flags = clear_int_flag();
548 		tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id];
549 		restore_int_flag(flags);
550 
551 		/* See comments in tsc_gethrtime() above */
552 
553 		if (tsc >= tsc_last) {
554 			tsc -= tsc_last;
555 		} else if (tsc >= tsc_last - 2 * tsc_max_delta) {
556 			tsc = 0;
557 		}
558 
559 		hrt = tsc_hrtime_base;
560 
561 		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
562 	} while ((old_hres_lock & ~1) != hres_lock);
563 
564 	return (hrt);
565 }
566 
567 extern uint64_t cpu_freq_hz;
568 extern int tsc_gethrtime_enable;
569 
570 /*
571  * The following converts nanoseconds of highres-time to ticks
572  */
573 
574 static uint64_t
575 hrtime2tick(hrtime_t ts)
576 {
577 	hrtime_t q = ts / NANOSEC;
578 	hrtime_t r = ts - (q * NANOSEC);
579 
580 	return (q * cpu_freq_hz + ((r * cpu_freq_hz) / NANOSEC));
581 }
582 
583 /*
584  * This is used to convert scaled high-res time from nanoseconds to
585  * unscaled hardware ticks.  (Read from hardware timestamp counter)
586  */
587 
588 uint64_t
589 unscalehrtime(hrtime_t ts)
590 {
591 	if (tsc_gethrtime_enable) {
592 		uint64_t unscale = 0;
593 		hrtime_t rescale;
594 		hrtime_t diff = ts;
595 
596 		while (diff > (nsec_per_tick)) {
597 			unscale += hrtime2tick(diff);
598 			rescale = unscale;
599 			scalehrtime(&rescale);
600 			diff = ts - rescale;
601 		}
602 
603 		return (unscale);
604 	}
605 	return (0);
606 }
607 
608 
609 hrtime_t
610 tsc_gethrtimeunscaled(void)
611 {
612 	uint32_t old_hres_lock;
613 	hrtime_t tsc;
614 
615 	do {
616 		old_hres_lock = hres_lock;
617 
618 		/* See tsc_tick(). */
619 		tsc = tsc_read() + tsc_last_jumped;
620 	} while ((old_hres_lock & ~1) != hres_lock);
621 
622 	return (tsc);
623 }
624 
625 
626 /* Convert a tsc timestamp to nanoseconds */
627 void
628 tsc_scalehrtime(hrtime_t *tsc)
629 {
630 	hrtime_t hrt;
631 	hrtime_t mytsc;
632 
633 	if (tsc == NULL)
634 		return;
635 	mytsc = *tsc;
636 
637 	TSC_CONVERT(mytsc, hrt, nsec_scale);
638 	*tsc  = hrt;
639 }
640 
641 hrtime_t
642 tsc_gethrtimeunscaled_delta(void)
643 {
644 	hrtime_t hrt;
645 	int flags;
646 
647 	/*
648 	 * Similarly to tsc_gethrtime_delta, we need to disable preemption
649 	 * to prevent migration between the call to tsc_gethrtimeunscaled
650 	 * and adding the CPU's hrtime delta. Note that disabling and
651 	 * reenabling preemption is forbidden here because we may be in the
652 	 * middle of a fast trap. In the amd64 kernel we cannot tolerate
653 	 * preemption during a fast trap. See _update_sregs().
654 	 */
655 
656 	flags = clear_int_flag();
657 	hrt = tsc_gethrtimeunscaled() + tsc_sync_tick_delta[CPU->cpu_id];
658 	restore_int_flag(flags);
659 
660 	return (hrt);
661 }
662