/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
 * Copyright 2016 Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/disp.h>
#include <sys/var.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/x86_archext.h>
#include <sys/archsystm.h>
#include <sys/cpuvar.h>
#include <sys/psm_defs.h>
#include <sys/clock.h>
#include <sys/atomic.h>
#include <sys/lockstat.h>
#include <sys/smp_impldefs.h>
#include <sys/dtrace.h>
#include <sys/time.h>
#include <sys/panic.h>
#include <sys/cpu.h>
#include <sys/sdt.h>
#include <sys/comm_page.h>
/*
 * Using the Pentium's TSC register for gethrtime()
 * ------------------------------------------------
 *
 * The Pentium family, like many chip architectures, has a high-resolution
 * timestamp counter ("TSC") which increments once per CPU cycle. The contents
 * of the timestamp counter are read with the RDTSC instruction.
 *
 * As with its UltraSPARC equivalent (the %tick register), TSC's cycle count
 * must be translated into nanoseconds in order to implement gethrtime().
 * We avoid inducing floating point operations in this conversion by
 * implementing the same nsec_scale algorithm as that found in the sun4u
 * platform code. The sun4u NATIVE_TIME_TO_NSEC_SCALE block comment contains
 * a detailed description of the algorithm; the comment is not reproduced
 * here. This implementation differs only in its value for NSEC_SHIFT:
 * we implement an NSEC_SHIFT of 5 (instead of sun4u's 4) to allow for
 * 60 MHz Pentiums.
 *
 * While TSC and %tick are both cycle counting registers, TSC's functionality
 * falls short in several critical ways:
 *
 *  (a)	TSCs on different CPUs are not guaranteed to be in sync. While in
 *	practice they often _are_ in sync, this isn't guaranteed by the
 *	architecture.
 *
 *  (b)	The TSC cannot be reliably set to an arbitrary value. The architecture
 *	only supports writing the low 32-bits of TSC, making it impractical
 *	to rewrite.
 *
 *  (c)	The architecture doesn't have the capacity to interrupt based on
 *	arbitrary values of TSC; there is no TICK_CMPR equivalent.
 *
 * Together, (a) and (b) imply that software must track the skew between
 * TSCs and account for it (it is assumed that while there may exist skew,
 * there does not exist drift). To determine the skew between CPUs, we
 * have newly onlined CPUs call tsc_sync_slave(), while the CPU performing
 * the online operation calls tsc_sync_master().
 *
 * In the absence of time-of-day clock adjustments, gethrtime() must stay in
 * sync with gettimeofday(). This is problematic; given (c), the software
 * cannot drive its time-of-day source from TSC, and yet they must somehow be
 * kept in sync. We implement this by having a routine, tsc_tick(), which
 * is called once per second from the interrupt which drives time-of-day.
 *
 * Note that the hrtime base for gethrtime, tsc_hrtime_base, is modified
 * atomically with nsec_scale under CLOCK_LOCK. This assures that time
 * monotonically increases.
 */

#define	NSEC_SHIFT	5

static uint_t nsec_unscale;

/*
 * These two variables used to be grouped together inside of a structure that
 * lived on a single cache line. A regression (bug ID 4623398) caused the
 * compiler to emit code that "optimized" away the while-loops below. The
 * result was that no synchronization between the onlining and onlined CPUs
 * took place.
 */
static volatile int tsc_ready;
static volatile int tsc_sync_go;

/*
 * Used as indices into the tsc_sync_snaps[] array.
 */
#define	TSC_MASTER		0
#define	TSC_SLAVE		1

/*
 * Used in the tsc_sync_master()/tsc_sync_slave() rendezvous.
 */
#define	TSC_SYNC_STOP		1
#define	TSC_SYNC_GO		2
#define	TSC_SYNC_DONE		3
#define	SYNC_ITERATIONS		10

#define	TSC_CONVERT_AND_ADD(tsc, hrt, scale) {		\
	unsigned int *_l = (unsigned int *)&(tsc);	\
	(hrt) += mul32(_l[1], scale) << NSEC_SHIFT;	\
	(hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT);	\
}

#define	TSC_CONVERT(tsc, hrt, scale) {			\
	unsigned int *_l = (unsigned int *)&(tsc);	\
	(hrt) = mul32(_l[1], scale) << NSEC_SHIFT;	\
	(hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT);	\
}
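
/*
 * Illustrative note (added for clarity; not part of the original comments):
 * with NSEC_SHIFT of 5, the two macros above compute, in 32x32->64 bit
 * pieces,
 *
 *	nsec = (tsc * nsec_scale) >> (32 - NSEC_SHIFT)
 *
 * where nsec_scale = (NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz, as set up
 * in tsc_hrtimeinit() below. For a hypothetical 1 GHz CPU this gives
 * nsec_scale = 10^9 * 2^27 / 10^9 = 2^27, so one TSC tick converts to exactly
 * one nanosecond. For nsec_scale to fit in 32 bits, cpu_freq_hz must exceed
 * NANOSEC / 2^NSEC_SHIFT = 31.25 MHz, which is the bound asserted in
 * tsc_hrtimeinit().
 */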

int tsc_master_slave_sync_needed = 1;

typedef struct tsc_sync {
	volatile hrtime_t master_tsc, slave_tsc;
} tsc_sync_t;
static tsc_sync_t *tscp;
static hrtime_t largest_tsc_delta = 0;
static ulong_t shortest_write_time = ~0UL;

static hrtime_t	tsc_last_jumped = 0;
static int	tsc_jumped = 0;
static uint32_t	tsc_wayback = 0;
/*
 * The cap of 1 second was chosen because tsc_tick() runs once per second,
 * which means that by the time gethrtime() is called, tsc_last should never
 * be more than 1 second old.
 */
static hrtime_t tsc_resume_cap_ns = NANOSEC;	/* 1s */

static hrtime_t shadow_tsc_hrtime_base;
static hrtime_t shadow_tsc_last;
static uint_t shadow_nsec_scale;
static uint32_t shadow_hres_lock;
int get_tsc_ready();

static inline
hrtime_t tsc_protect(hrtime_t a) {
	if (a > tsc_resume_cap) {
		atomic_inc_32(&tsc_wayback);
		DTRACE_PROBE3(tsc__wayback, hrtime_t, a, hrtime_t, tsc_last,
		    uint32_t, tsc_wayback);
		return (tsc_resume_cap);
	}
	return (a);
}

hrtime_t
tsc_gethrtime(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc, hrt;

	do {
		old_hres_lock = hres_lock;

		if ((tsc = tsc_read()) >= tsc_last) {
			/*
			 * It would seem to be obvious that this is true
			 * (that is, the past is less than the present),
			 * but it isn't true in the presence of suspend/resume
			 * cycles. If we manage to call gethrtime()
			 * after a resume, but before the first call to
			 * tsc_tick(), we will see the jump. In this case,
			 * we will simply use the value in TSC as the delta.
			 */
			tsc -= tsc_last;
		} else if (tsc >= tsc_last - 2 * tsc_max_delta) {
			/*
			 * There is a chance that tsc_tick() has just run on
			 * another CPU, and we have drifted just enough so that
			 * we appear behind tsc_last. In this case, force the
			 * delta to be zero.
			 */
			tsc = 0;
		} else {
			/*
			 * If we reach this else clause we assume that we have
			 * gone through a suspend/resume cycle and use the
			 * current tsc value as the delta.
			 *
			 * In rare cases we can reach this else clause due to
			 * a lack of monotonicity in the TSC value. In such
			 * cases using the current TSC value as the delta would
			 * cause us to return a value ~2x of what it should
			 * be. To protect against these cases we cap the
			 * suspend/resume delta at tsc_resume_cap.
			 */
			tsc = tsc_protect(tsc);
		}

		hrt = tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
	} while ((old_hres_lock & ~1) != hres_lock);

	return (hrt);
}

hrtime_t
tsc_gethrtime_delta(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc, hrt;
	ulong_t flags;

	do {
		old_hres_lock = hres_lock;

		/*
		 * We need to disable interrupts here to assure that we
		 * don't migrate between the call to tsc_read() and
		 * adding the CPU's TSC tick delta. Note that disabling
		 * and reenabling preemption is forbidden here because
		 * we may be in the middle of a fast trap. In the amd64
		 * kernel we cannot tolerate preemption during a fast
		 * trap. See _update_sregs().
		 */

		flags = clear_int_flag();
		tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id];
		restore_int_flag(flags);

		/* See comments in tsc_gethrtime() above */

		if (tsc >= tsc_last) {
			tsc -= tsc_last;
		} else if (tsc >= tsc_last - 2 * tsc_max_delta) {
			tsc = 0;
		} else {
			tsc = tsc_protect(tsc);
		}

		hrt = tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
	} while ((old_hres_lock & ~1) != hres_lock);

	return (hrt);
}

hrtime_t
tsc_gethrtime_tick_delta(void)
{
	hrtime_t hrt;
	ulong_t flags;

	flags = clear_int_flag();
	hrt = tsc_sync_tick_delta[CPU->cpu_id];
	restore_int_flag(flags);

	return (hrt);
}

/*
 * This is similar to the above, but it cannot actually spin on hres_lock.
 * As a result, it caches all of the variables it needs; if the variables
 * don't change, it's done.
 */
hrtime_t
dtrace_gethrtime(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc, hrt;
	ulong_t flags;

	do {
		old_hres_lock = hres_lock;

		/*
		 * Interrupts are disabled to ensure that the thread isn't
		 * migrated between the tsc_read() and adding the CPU's
		 * TSC tick delta.
		 */
		flags = clear_int_flag();

		tsc = tsc_read();

		if (gethrtimef == tsc_gethrtime_delta)
			tsc += tsc_sync_tick_delta[CPU->cpu_id];

		restore_int_flag(flags);

		/*
		 * See the comments in tsc_gethrtime(), above.
		 */
		if (tsc >= tsc_last)
			tsc -= tsc_last;
		else if (tsc >= tsc_last - 2 * tsc_max_delta)
			tsc = 0;
		else
			tsc = tsc_protect(tsc);

		hrt = tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);

		if ((old_hres_lock & ~1) == hres_lock)
			break;

		/*
		 * If we're here, the clock lock is locked -- or it has been
		 * unlocked and locked since we looked. This may be due to
		 * tsc_tick() running on another CPU -- or it may be because
		 * some code path has ended up in dtrace_probe() with
		 * CLOCK_LOCK held. We'll try to determine that we're in
		 * the former case by taking another lap if the lock has
		 * changed since when we first looked at it.
		 */
		if (old_hres_lock != hres_lock)
			continue;

		/*
		 * So the lock was and is locked. We'll use the old data
		 * instead.
		 */
		old_hres_lock = shadow_hres_lock;

		/*
		 * Again, disable interrupts to ensure that the thread
		 * isn't migrated between the tsc_read() and adding
		 * the CPU's TSC tick delta.
		 */
		flags = clear_int_flag();

		tsc = tsc_read();

		if (gethrtimef == tsc_gethrtime_delta)
			tsc += tsc_sync_tick_delta[CPU->cpu_id];

		restore_int_flag(flags);

		/*
		 * See the comments in tsc_gethrtime(), above.
		 */
		if (tsc >= shadow_tsc_last)
			tsc -= shadow_tsc_last;
		else if (tsc >= shadow_tsc_last - 2 * tsc_max_delta)
			tsc = 0;
		else
			tsc = tsc_protect(tsc);

		hrt = shadow_tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, shadow_nsec_scale);
	} while ((old_hres_lock & ~1) != shadow_hres_lock);

	return (hrt);
}

hrtime_t
tsc_gethrtimeunscaled(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc;

	do {
		old_hres_lock = hres_lock;

		/* See tsc_tick(). */
		tsc = tsc_read() + tsc_last_jumped;
	} while ((old_hres_lock & ~1) != hres_lock);

	return (tsc);
}

/*
 * Convert a nanosecond based timestamp to tsc
 */
uint64_t
tsc_unscalehrtime(hrtime_t nsec)
{
	hrtime_t tsc;

	if (tsc_gethrtime_enable) {
		TSC_CONVERT(nsec, tsc, nsec_unscale);
		return (tsc);
	}
	return ((uint64_t)nsec);
}

/* Convert a tsc timestamp to nanoseconds */
void
tsc_scalehrtime(hrtime_t *tsc)
{
	hrtime_t hrt;
	hrtime_t mytsc;

	if (tsc == NULL)
		return;
	mytsc = *tsc;

	TSC_CONVERT(mytsc, hrt, nsec_scale);
	*tsc = hrt;
}

hrtime_t
tsc_gethrtimeunscaled_delta(void)
{
	hrtime_t hrt;
	ulong_t flags;

	/*
	 * Similarly to tsc_gethrtime_delta, we need to disable interrupts
	 * here to prevent migration between the call to
	 * tsc_gethrtimeunscaled and adding the CPU's hrtime delta. Note
	 * that disabling and reenabling preemption is forbidden here
	 * because we may be in the middle of a fast trap. In the amd64
	 * kernel we cannot tolerate preemption during a fast trap. See
	 * _update_sregs().
	 */

	flags = clear_int_flag();
	hrt = tsc_gethrtimeunscaled() + tsc_sync_tick_delta[CPU->cpu_id];
	restore_int_flag(flags);

	return (hrt);
}

/*
 * Called by the master in the TSC sync operation (usually the boot CPU).
 * If the slave is discovered to have a skew, gethrtimef will be changed to
 * point to tsc_gethrtime_delta(). Calculating skews is precise only when
 * the master and slave TSCs are read simultaneously; however, there is no
 * algorithm that can read both CPUs in perfect simultaneity. The algorithm
 * used here is an approximation based on the behaviour of cache management.
 * The slave CPU continuously reads its TSC while polling a global variable
 * that the master CPU updates. The moment the master's update becomes
 * visible to the slave (the write being forced out by an mfence operation),
 * the most recent TSC reading taken on the slave is used. A corresponding
 * TSC read is taken on the master as soon as possible after the mfence
 * completes. But the delay between causing the slave to notice the
 * invalidated cache line and the completion of the mfence is not repeatable.
 * This error is heuristically assumed to be 1/4th of the total write time,
 * as measured by the two TSC reads on the master that sandwich the mfence.
 * Furthermore, due to the nature of bus arbitration, contention on the
 * memory bus, etc., the time taken for the write to become globally visible
 * can vary a lot. So instead of taking a single reading, a set of readings
 * is taken and the one with the smallest write time is used to calculate
 * the final skew.
 *
 * TSC sync is disabled in the context of virtualization because the CPUs
 * assigned to the guest are virtual CPUs, which means the real CPUs on which
 * the guest runs can change over the lifetime of the guest OS. We would
 * therefore end up calculating TSC skews for one set of CPUs during boot,
 * while the guest might migrate to a different set of physical CPUs at a
 * later point in time.
 */
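/*
 * Worked example of the skew heuristic (added for illustration; the numbers
 * are hypothetical): suppose the master reads a TSC of 1000 just before the
 * mfence and mtsc_after = 1040 just after it (write_time = 40), and the
 * slave's snapshot is slave_tsc = 1100. The raw skew estimate is
 * 1100 - 1040 = 60; since 60 > 40/4, the code below credits 1/4th of the
 * write time back to the master, giving tdelta = 1100 - (1040 - 10) = 70,
 * and the slave's entry becomes
 * tsc_sync_tick_delta[slave] = tsc_sync_tick_delta[source] - 70.
 */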
void
tsc_sync_master(processorid_t slave)
{
	ulong_t flags, source, min_write_time = ~0UL;
	hrtime_t write_time, x, mtsc_after, tdelta;
	tsc_sync_t *tsc = tscp;
	int cnt;
	int hwtype;

	hwtype = get_hwenv();
	if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
		return;

	flags = clear_int_flag();
	source = CPU->cpu_id;

	for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
		while (tsc_sync_go != TSC_SYNC_GO)
			SMT_PAUSE();

		tsc->master_tsc = tsc_read();
		membar_enter();
		mtsc_after = tsc_read();
		while (tsc_sync_go != TSC_SYNC_DONE)
			SMT_PAUSE();
		write_time = mtsc_after - tsc->master_tsc;
		if (write_time <= min_write_time) {
			min_write_time = write_time;
			/*
			 * Apply heuristic adjustment only if the calculated
			 * delta is > 1/4th of the write time.
			 */
			x = tsc->slave_tsc - mtsc_after;
			if (x < 0)
				x = -x;
			if (x > (min_write_time/4))
				/*
				 * Subtract 1/4th of the measured write time
				 * from the master's TSC value, as an estimate
				 * of how late the mfence completion came
				 * after the slave noticed the cache line
				 * change.
				 */
				tdelta = tsc->slave_tsc -
				    (mtsc_after - (min_write_time/4));
			else
				tdelta = tsc->slave_tsc - mtsc_after;
			tsc_sync_tick_delta[slave] =
			    tsc_sync_tick_delta[source] - tdelta;
		}

		tsc->master_tsc = tsc->slave_tsc = write_time = 0;
		membar_enter();
		tsc_sync_go = TSC_SYNC_STOP;
	}
	if (tdelta < 0)
		tdelta = -tdelta;
	if (tdelta > largest_tsc_delta)
		largest_tsc_delta = tdelta;
	if (min_write_time < shortest_write_time)
		shortest_write_time = min_write_time;
	/*
	 * Enable the delta variants of the tsc functions if the largest of
	 * all the chosen deltas is greater than the smallest of the measured
	 * write times.
	 */
	if (largest_tsc_delta > shortest_write_time) {
		gethrtimef = tsc_gethrtime_delta;
		gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
		tsc_ncpu = NCPU;
	}
	restore_int_flag(flags);
}

/*
 * Called by a CPU which has just been onlined. It is expected that the CPU
 * performing the online operation will call tsc_sync_master().
 *
 * TSC sync is disabled in the context of virtualization. See comments
 * above tsc_sync_master.
 */
void
tsc_sync_slave(void)
{
	ulong_t flags;
	hrtime_t s1;
	tsc_sync_t *tsc = tscp;
	int cnt;
	int hwtype;

	hwtype = get_hwenv();
	if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
		return;

	flags = clear_int_flag();

	for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
		/* Re-fill the cache line */
		s1 = tsc->master_tsc;
		membar_enter();
		tsc_sync_go = TSC_SYNC_GO;
		do {
			/*
			 * Do not put an SMT_PAUSE here. For instance,
			 * if the master and slave are really the same
			 * hyper-threaded CPU, then you want the master
			 * to yield to the slave as quickly as possible here,
			 * but not the other way.
			 */
			s1 = tsc_read();
		} while (tsc->master_tsc == 0);
		tsc->slave_tsc = s1;
		membar_enter();
		tsc_sync_go = TSC_SYNC_DONE;

		while (tsc_sync_go != TSC_SYNC_STOP)
			SMT_PAUSE();
	}

	restore_int_flag(flags);
}

/*
 * Called once per second on a CPU from the cyclic subsystem's
 * CY_HIGH_LEVEL interrupt. (No longer just cpu0-only.)
 */
void
tsc_tick(void)
{
	hrtime_t now, delta;
	ushort_t spl;

	/*
	 * Before we set the new variables, we set the shadow values. This
	 * allows for lock-free operation in dtrace_gethrtime().
	 */
	lock_set_spl((lock_t *)&shadow_hres_lock + HRES_LOCK_OFFSET,
	    ipltospl(CBE_HIGH_PIL), &spl);

	shadow_tsc_hrtime_base = tsc_hrtime_base;
	shadow_tsc_last = tsc_last;
	shadow_nsec_scale = nsec_scale;

	shadow_hres_lock++;
	splx(spl);

	CLOCK_LOCK(&spl);

	now = tsc_read();

	if (gethrtimef == tsc_gethrtime_delta)
		now += tsc_sync_tick_delta[CPU->cpu_id];

	if (now < tsc_last) {
		/*
		 * The TSC has just jumped into the past. We assume that
		 * this is due to a suspend/resume cycle, and we're going
		 * to use the _current_ value of TSC as the delta. This
		 * will keep tsc_hrtime_base correct. We're also going to
		 * assume that the rate of the TSC does not change across a
		 * suspend/resume (i.e., nsec_scale remains the same).
		 */
		delta = now;
		delta = tsc_protect(delta);
		tsc_last_jumped += tsc_last;
		tsc_jumped = 1;
	} else {
		/*
		 * Determine the number of TSC ticks since the last clock
		 * tick, and add that to the hrtime base.
		 */
		delta = now - tsc_last;
	}

	TSC_CONVERT_AND_ADD(delta, tsc_hrtime_base, nsec_scale);
	tsc_last = now;

	CLOCK_UNLOCK(spl);
}

void
tsc_hrtimeinit(uint64_t cpu_freq_hz)
{
	extern int gethrtime_hires;
	longlong_t tsc;
	ulong_t flags;

	/*
	 * cpu_freq_hz is the measured cpu frequency in hertz
	 */

	/*
	 * We can't accommodate CPUs slower than 31.25 MHz.
	 */
	ASSERT(cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT));
	nsec_scale =
	    (uint_t)(((uint64_t)NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz);
	nsec_unscale =
	    (uint_t)(((uint64_t)cpu_freq_hz << (32 - NSEC_SHIFT)) / NANOSEC);

	flags = clear_int_flag();
	tsc = tsc_read();
	(void) tsc_gethrtime();
	tsc_max_delta = tsc_read() - tsc;
	restore_int_flag(flags);
	gethrtimef = tsc_gethrtime;
	gethrtimeunscaledf = tsc_gethrtimeunscaled;
	scalehrtimef = tsc_scalehrtime;
	unscalehrtimef = tsc_unscalehrtime;
	hrtime_tick = tsc_tick;
	gethrtime_hires = 1;
	/*
	 * Being part of the comm page, tsc_ncpu communicates the published
	 * length of the tsc_sync_tick_delta array. This is kept zeroed to
	 * ignore the absent delta data while the TSCs are synced.
	 */
	tsc_ncpu = 0;
	/*
	 * Allocate memory for the structure used in the tsc sync logic.
	 * This structure should be aligned on a multiple of cache line size.
	 */
	tscp = kmem_zalloc(PAGESIZE, KM_SLEEP);

	/*
	 * Convert the TSC resume cap ns value into its unscaled TSC value.
	 * See tsc_gethrtime().
	 */
	if (tsc_resume_cap == 0)
		TSC_CONVERT(tsc_resume_cap_ns, tsc_resume_cap, nsec_unscale);
}

int
get_tsc_ready()
{
	return (tsc_ready);
}

/*
 * Adjust all the deltas by adding the passed value to the array.
 * Then use the "delta" versions of the gethrtime functions.
 * Note that 'tdelta' _could_ be a negative number, which should
 * reduce the values in the array (used, for example, if the Solaris
 * instance was moved by a virtual manager to a machine with a higher
 * value of tsc).
 */
void
tsc_adjust_delta(hrtime_t tdelta)
{
	int i;

	for (i = 0; i < NCPU; i++) {
		tsc_sync_tick_delta[i] += tdelta;
	}

	gethrtimef = tsc_gethrtime_delta;
	gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
	tsc_ncpu = NCPU;
}

/*
 * Functions to manage TSC and high-res time on suspend and resume.
 */

/*
 * declarations needed for time adjustment
 */
extern void	rtcsync(void);
extern tod_ops_t *tod_ops;
/* There must be a better way than exposing nsec_scale! */
extern uint_t	nsec_scale;
static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
static timestruc_t tsc_saved_ts;
static int	tsc_needs_resume = 0;	/* We only want to do this once. */
int	tsc_delta_onsuspend = 0;
int	tsc_adjust_seconds = 1;
int	tsc_suspend_count = 0;
int	tsc_resume_in_cyclic = 0;

/*
 * Let timestamp.c know that we are suspending. It needs to take
 * snapshots of the current time, and do any pre-suspend work.
 */
void
tsc_suspend(void)
{
	/*
	 * What we need to do here is record the time at which we suspended,
	 * so that we know how much to add when we resume.
	 * This routine is called by each CPU, so we need to handle reentry.
	 */
	if (tsc_gethrtime_enable) {
		/*
		 * We put the tsc_read() inside the lock as it
		 * has no locking constraints, and it puts the
		 * acquired value closer to the time stamp (in
		 * case we delay getting the lock).
		 */
		mutex_enter(&tod_lock);
		tsc_saved_tsc = tsc_read();
		tsc_saved_ts = TODOP_GET(tod_ops);
		mutex_exit(&tod_lock);
		/* We only want to do this once. */
		if (tsc_needs_resume == 0) {
			if (tsc_delta_onsuspend) {
				tsc_adjust_delta(tsc_saved_tsc);
			} else {
				tsc_adjust_delta(nsec_scale);
			}
			tsc_suspend_count++;
		}
	}

	invalidate_cache();
	tsc_needs_resume = 1;
}

/*
 * Restore all timestamp state based on the snapshots taken at
 * suspend time.
 */
void
tsc_resume(void)
{
	/*
	 * We only need to (and want to) do this once. So let the first
	 * caller handle this (we are locked by the cpu lock), as it
	 * is preferential that we get the earliest sync.
	 */
	if (tsc_needs_resume) {
		/*
		 * If using the TSC, adjust the delta based on how long
		 * we were sleeping (or away). We also adjust for
		 * migration and a grown TSC.
		 */
		if (tsc_saved_tsc != 0) {
			timestruc_t	ts;
			hrtime_t	now, sleep_tsc = 0;
			int		sleep_sec;
			extern void	tsc_tick(void);
			extern uint64_t cpu_freq_hz;

			/* tsc_read() MUST be before TODOP_GET() */
			mutex_enter(&tod_lock);
			now = tsc_read();
			ts = TODOP_GET(tod_ops);
			mutex_exit(&tod_lock);

			/* Compute seconds of sleep time */
			sleep_sec = ts.tv_sec - tsc_saved_ts.tv_sec;

			/*
			 * If the current TOD time is not later than the
			 * value saved at suspend time, there is likely a
			 * problem with the clock. Assume at least one
			 * second has passed, so that time goes forward.
			 */
			if (sleep_sec <= 0) {
				sleep_sec = 1;
			}

			/* How many TSC ticks should have elapsed asleep */
			if (tsc_adjust_seconds)
				sleep_tsc = sleep_sec * cpu_freq_hz;

			/*
			 * We also want to subtract from the "sleep_tsc"
			 * the current value of tsc_read(), so that our
			 * adjustment accounts for the amount of time we
			 * have been resumed _or_ an adjustment based on
			 * the fact that we didn't actually power off the
			 * CPU (migration is another issue, but _should_
			 * also comply with this calculation). If the CPU
			 * never powered off, then:
			 *	'now == sleep_tsc + saved_tsc'
			 * and the delta will effectively be "0".
			 */
			sleep_tsc -= now;
			if (tsc_delta_onsuspend) {
				tsc_adjust_delta(sleep_tsc);
			} else {
				tsc_adjust_delta(tsc_saved_tsc + sleep_tsc);
			}
			tsc_saved_tsc = 0;

			tsc_tick();
		}
		tsc_needs_resume = 0;
	}
}