/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/disp.h>
#include <sys/var.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/x86_archext.h>
#include <sys/archsystm.h>
#include <sys/cpuvar.h>
#include <sys/psm_defs.h>
#include <sys/clock.h>
#include <sys/atomic.h>
#include <sys/lockstat.h>
#include <sys/smp_impldefs.h>
#include <sys/dtrace.h>
#include <sys/time.h>
#include <sys/panic.h>

/*
 * Using the Pentium's TSC register for gethrtime()
 * ------------------------------------------------
 *
 * The Pentium family, like many chip architectures, has a high-resolution
 * timestamp counter ("TSC") which increments once per CPU cycle.  The
 * contents of the timestamp counter are read with the RDTSC instruction.
 *
 * As with its UltraSPARC equivalent (the %tick register), TSC's cycle count
 * must be translated into nanoseconds in order to implement gethrtime().
 * We avoid inducing floating point operations in this conversion by
 * implementing the same nsec_scale algorithm as that found in the sun4u
 * platform code.  The sun4u NATIVE_TIME_TO_NSEC_SCALE block comment contains
 * a detailed description of the algorithm; the comment is not reproduced
 * here.  This implementation differs only in its value for NSEC_SHIFT:
 * we implement an NSEC_SHIFT of 5 (instead of sun4u's 4) to allow for
 * 60 MHz Pentiums.
 *
 * While TSC and %tick are both cycle counting registers, TSC's functionality
 * falls short in several critical ways:
 *
 *  (a) TSCs on different CPUs are not guaranteed to be in sync.  While in
 *      practice they often _are_ in sync, this isn't guaranteed by the
 *      architecture.
 *
 *  (b) The TSC cannot be reliably set to an arbitrary value.  The
 *      architecture only supports writing the low 32 bits of TSC, making it
 *      impractical to rewrite.
 *
 *  (c) The architecture doesn't have the capacity to interrupt based on
 *      arbitrary values of TSC; there is no TICK_CMPR equivalent.
 *
 * Together, (a) and (b) imply that software must track the skew between
 * TSCs and account for it (it is assumed that while there may exist skew,
 * there does not exist drift).  To determine the skew between CPUs, we
 * have newly onlined CPUs call tsc_sync_slave(), while the CPU performing
 * the online operation calls tsc_sync_master().  Once both CPUs are ready,
 * the master sets a shared flag, and each reads its TSC register.  To reduce
 * bias, we then wait until both CPUs are ready again, but this time the
 * slave sets the shared flag, and each reads its TSC register again.  The
 * master compares the average of the two sample values, and, if observable
 * skew is found, changes the gethrtimef function pointer to point to a
 * gethrtime() implementation which will take the discovered skew into
 * consideration.
 *
 * In the absence of time-of-day clock adjustments, gethrtime() must stay in
 * sync with gettimeofday().  This is problematic; given (c), the software
 * cannot drive its time-of-day source from TSC, and yet they must somehow be
 * kept in sync.  We implement this by having a routine, tsc_tick(), which
 * is called once per second from the interrupt which drives time-of-day.
 * tsc_tick() recalculates nsec_scale based on the number of CPU cycles
 * since boot versus the number of seconds since boot.  This algorithm
 * becomes more accurate over time and converges quickly; the error in
 * nsec_scale is typically under 1 ppm less than 10 seconds after boot, and
 * is less than 100 ppb 1 minute after boot.
 *
 * Note that the hrtime base for gethrtime, tsc_hrtime_base, is modified
 * atomically with nsec_scale under CLOCK_LOCK.  This assures that time
 * monotonically increases.
 */

#define NSEC_SHIFT 5

static uint_t nsec_scale;

/*
 * These two variables used to be grouped together inside of a structure that
 * lived on a single cache line.  A regression (bug ID 4623398) caused the
 * compiler to emit code that "optimized" away the while-loops below.  The
 * result was that no synchronization between the onlining and onlined CPUs
 * took place.
 */
static volatile int tsc_ready;
static volatile int tsc_sync_go;

/*
 * Used as indices into the tsc_sync_snaps[] array.
 */
#define TSC_MASTER  0
#define TSC_SLAVE   1

/*
 * Used in the tsc_sync_master()/tsc_sync_slave() rendezvous.
 */
#define TSC_SYNC_STOP   1
#define TSC_SYNC_GO     2
#define TSC_SYNC_AGAIN  3

#define TSC_CONVERT_AND_ADD(tsc, hrt, scale) { \
        unsigned int *_l = (unsigned int *)&(tsc); \
        (hrt) += mul32(_l[1], scale) << NSEC_SHIFT; \
        (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
}

#define TSC_CONVERT(tsc, hrt, scale) { \
        unsigned int *_l = (unsigned int *)&(tsc); \
        (hrt) = mul32(_l[1], scale) << NSEC_SHIFT; \
        (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
}
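
/*
 * An illustrative sketch (not part of the implementation) of what the
 * macros above compute.  tsc_hrtimeinit() sets
 *
 *      nsec_scale = (NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz
 *
 * so nanoseconds are approximately (tsc * nsec_scale) >> (32 - NSEC_SHIFT).
 * Splitting the 64-bit cycle count into 32-bit halves (tsc = hi * 2^32 + lo,
 * where _l[1] is the high word on little-endian x86) yields the two mul32()
 * terms used by TSC_CONVERT_AND_ADD() and TSC_CONVERT():
 *
 *      nsec = (mul32(hi, nsec_scale) << NSEC_SHIFT) +
 *          (mul32(lo, nsec_scale) >> (32 - NSEC_SHIFT))
 *
 * As a concrete (hypothetical) example, a 1 GHz CPU gives
 * nsec_scale = (10^9 << 27) / 10^9 = 2^27, and the conversion collapses to
 * nsec == tsc, as expected for a one-nanosecond cycle time.
 */
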
int tsc_master_slave_sync_needed = 1;

static int tsc_max_delta;
static hrtime_t tsc_sync_snaps[2];
static hrtime_t tsc_sync_delta[NCPU];
static hrtime_t tsc_sync_tick_delta[NCPU];
static hrtime_t tsc_last = 0;
static hrtime_t tsc_last_jumped = 0;
static hrtime_t tsc_hrtime_base = 0;
static int tsc_jumped = 0;

static hrtime_t shadow_tsc_hrtime_base;
static hrtime_t shadow_tsc_last;
static uint_t shadow_nsec_scale;
static uint32_t shadow_hres_lock;

hrtime_t
tsc_gethrtime(void)
{
        uint32_t old_hres_lock;
        hrtime_t tsc, hrt;

        do {
                old_hres_lock = hres_lock;

                if ((tsc = tsc_read()) >= tsc_last) {
                        /*
                         * It would seem to be obvious that this is true
                         * (that is, the past is less than the present),
                         * but it isn't true in the presence of suspend/resume
                         * cycles.  If we manage to call gethrtime()
                         * after a resume, but before the first call to
                         * tsc_tick(), we will see the jump.  In this case,
                         * we will simply use the value in TSC as the delta.
                         */
                        tsc -= tsc_last;
                } else if (tsc >= tsc_last - 2 * tsc_max_delta) {
                        /*
                         * There is a chance that tsc_tick() has just run on
                         * another CPU, and we have drifted just enough so that
                         * we appear behind tsc_last.  In this case, force the
                         * delta to be zero.
                         */
                        tsc = 0;
                }

                hrt = tsc_hrtime_base;

                TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
        } while ((old_hres_lock & ~1) != hres_lock);

        return (hrt);
}

hrtime_t
tsc_gethrtime_delta(void)
{
        uint32_t old_hres_lock;
        hrtime_t tsc, hrt;
        int flags;

        do {
                old_hres_lock = hres_lock;

                /*
                 * We need to disable interrupts here to assure that we
                 * don't migrate between the call to tsc_read() and
                 * adding the CPU's TSC tick delta.  Note that disabling
                 * and reenabling preemption is forbidden here because
                 * we may be in the middle of a fast trap.  In the amd64
                 * kernel we cannot tolerate preemption during a fast
                 * trap.  See _update_sregs().
                 */

                flags = clear_int_flag();
                tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id];
                restore_int_flag(flags);

                /* See comments in tsc_gethrtime() above */

                if (tsc >= tsc_last) {
                        tsc -= tsc_last;
                } else if (tsc >= tsc_last - 2 * tsc_max_delta) {
                        tsc = 0;
                }

                hrt = tsc_hrtime_base;

                TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
        } while ((old_hres_lock & ~1) != hres_lock);

        return (hrt);
}
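
/*
 * A sketch of the reader side of the hres_lock protocol used above and in
 * dtrace_gethrtime() below.  The precise encoding of hres_lock lives in the
 * machine clock headers, so this is illustrative only: CLOCK_LOCK() in
 * tsc_tick() holds the low bit of hres_lock while tsc_hrtime_base, tsc_last
 * and nsec_scale are being updated, and CLOCK_UNLOCK() bumps the value when
 * the update is complete.  A reader therefore loops until it sees the same,
 * even, value on both sides of its reads:
 *
 *      do {
 *              old_hres_lock = hres_lock;
 *              (read tsc_last, tsc_hrtime_base, nsec_scale)
 *      } while ((old_hres_lock & ~1) != hres_lock);
 *
 * The "& ~1" forces another lap both when the lock was held at the first
 * sample (low bit set) and when an update completed between the two samples
 * (the value changed).
 */
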
/*
 * This is similar to the above, but it cannot actually spin on hres_lock.
 * As a result, it caches all of the variables it needs; if the variables
 * don't change, it's done.
 */
hrtime_t
dtrace_gethrtime(void)
{
        uint32_t old_hres_lock;
        hrtime_t tsc, hrt;
        int flags;

        do {
                old_hres_lock = hres_lock;

                /*
                 * Interrupts are disabled to ensure that the thread isn't
                 * migrated between the tsc_read() and adding the CPU's
                 * TSC tick delta.
                 */
                flags = clear_int_flag();

                tsc = tsc_read();

                if (gethrtimef == tsc_gethrtime_delta)
                        tsc += tsc_sync_tick_delta[CPU->cpu_id];

                restore_int_flag(flags);

                /*
                 * See the comments in tsc_gethrtime(), above.
                 */
                if (tsc >= tsc_last)
                        tsc -= tsc_last;
                else if (tsc >= tsc_last - 2 * tsc_max_delta)
                        tsc = 0;

                hrt = tsc_hrtime_base;

                TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);

                if ((old_hres_lock & ~1) == hres_lock)
                        break;

                /*
                 * If we're here, the clock lock is locked -- or it has been
                 * unlocked and locked since we looked.  This may be due to
                 * tsc_tick() running on another CPU -- or it may be because
                 * some code path has ended up in dtrace_probe() with
                 * CLOCK_LOCK held.  We'll try to determine that we're in
                 * the former case by taking another lap if the lock has
                 * changed since when we first looked at it.
                 */
                if (old_hres_lock != hres_lock)
                        continue;

                /*
                 * So the lock was and is locked.  We'll use the old data
                 * instead.
                 */
                old_hres_lock = shadow_hres_lock;

                /*
                 * Again, disable interrupts to ensure that the thread
                 * isn't migrated between the tsc_read() and adding
                 * the CPU's TSC tick delta.
                 */
                flags = clear_int_flag();

                tsc = tsc_read();

                if (gethrtimef == tsc_gethrtime_delta)
                        tsc += tsc_sync_tick_delta[CPU->cpu_id];

                restore_int_flag(flags);

                /*
                 * See the comments in tsc_gethrtime(), above.
                 */
                if (tsc >= shadow_tsc_last)
                        tsc -= shadow_tsc_last;
                else if (tsc >= shadow_tsc_last - 2 * tsc_max_delta)
                        tsc = 0;

                hrt = shadow_tsc_hrtime_base;

                TSC_CONVERT_AND_ADD(tsc, hrt, shadow_nsec_scale);
        } while ((old_hres_lock & ~1) != shadow_hres_lock);

        return (hrt);
}

hrtime_t
tsc_gethrtimeunscaled(void)
{
        uint32_t old_hres_lock;
        hrtime_t tsc;

        do {
                old_hres_lock = hres_lock;

                /* See tsc_tick(). */
                tsc = tsc_read() + tsc_last_jumped;
        } while ((old_hres_lock & ~1) != hres_lock);

        return (tsc);
}

/* Convert a TSC timestamp to nanoseconds */
void
tsc_scalehrtime(hrtime_t *tsc)
{
        hrtime_t hrt;
        hrtime_t mytsc;

        if (tsc == NULL)
                return;
        mytsc = *tsc;

        TSC_CONVERT(mytsc, hrt, nsec_scale);
        *tsc = hrt;
}

hrtime_t
tsc_gethrtimeunscaled_delta(void)
{
        hrtime_t hrt;
        int flags;

        /*
         * As in tsc_gethrtime_delta(), we need to disable interrupts here
         * to prevent migration between the call to tsc_gethrtimeunscaled()
         * and adding the CPU's hrtime delta.  Note that disabling and
         * reenabling preemption is forbidden here because we may be in the
         * middle of a fast trap.  In the amd64 kernel we cannot tolerate
         * preemption during a fast trap.  See _update_sregs().
         */

        flags = clear_int_flag();
        hrt = tsc_gethrtimeunscaled() + tsc_sync_tick_delta[CPU->cpu_id];
        restore_int_flag(flags);

        return (hrt);
}
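
/*
 * An illustrative (hypothetical) caller of the unscaled interfaces above:
 * a consumer that wants cheap timestamps in a hot path can record raw cycle
 * counts and defer the conversion to nanoseconds until later, e.g.:
 *
 *      hrtime_t start, elapsed;
 *
 *      start = gethrtimeunscaledf();           (raw cycle count)
 *      (... work to be timed ...)
 *      elapsed = gethrtimeunscaledf() - start;
 *      scalehrtimef(&elapsed);                 (elapsed is now in nanoseconds)
 *
 * This is only a sketch; gethrtimeunscaledf and scalehrtimef are the
 * function pointers set up in tsc_hrtimeinit() below (tsc_digest() may later
 * switch gethrtimeunscaledf to its delta-aware variant).
 */
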
/*
 * Called by the master after the sync operation is complete.  If observable
 * skew is found (in either direction), gethrtimef will be changed to point
 * to tsc_gethrtime_delta().
 */
static void
tsc_digest(processorid_t target)
{
        hrtime_t tdelta, hdelta = 0;
        int max = tsc_max_delta;
        processorid_t source = CPU->cpu_id;
        int update;

        update = tsc_sync_delta[source] != 0 ||
            gethrtimef == tsc_gethrtime_delta;

        /*
         * We divide by 2 since each of the data points is the sum of two TSC
         * reads; this takes the average of the two.
         */
        tdelta = (tsc_sync_snaps[TSC_SLAVE] - tsc_sync_snaps[TSC_MASTER]) / 2;
        if ((tdelta > max) || ((tdelta >= 0) && update)) {
                TSC_CONVERT_AND_ADD(tdelta, hdelta, nsec_scale);
                tsc_sync_delta[target] = tsc_sync_delta[source] - hdelta;
                tsc_sync_tick_delta[target] = -tdelta;
                gethrtimef = tsc_gethrtime_delta;
                gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
                return;
        }

        tdelta = -tdelta;
        if ((tdelta > max) || update) {
                TSC_CONVERT_AND_ADD(tdelta, hdelta, nsec_scale);
                tsc_sync_delta[target] = tsc_sync_delta[source] + hdelta;
                tsc_sync_tick_delta[target] = tdelta;
                gethrtimef = tsc_gethrtime_delta;
                gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
        }
}
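
/*
 * A worked example of the computation above (the numbers are illustrative):
 * if the master's two snapshots sum to 2,000,000 cycles and the slave's sum
 * to 2,000,600, then tdelta = (2,000,600 - 2,000,000) / 2 = 300; that is,
 * the slave's TSC reads roughly 300 cycles ahead of the master's.  Provided
 * 300 exceeds tsc_max_delta, tsc_sync_tick_delta[slave] is set to -300 so
 * that tsc_gethrtime_delta() pulls the slave's readings back into line, and
 * tsc_sync_delta[slave] records the equivalent adjustment pre-converted to
 * nanoseconds.
 */
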
/*
 * Called by a CPU which has just performed an online operation on another
 * CPU.  It is expected that the newly onlined CPU will call tsc_sync_slave().
 */
void
tsc_sync_master(processorid_t slave)
{
        ulong_t flags;
        hrtime_t hrt;

        if (!tsc_master_slave_sync_needed)
                return;

        ASSERT(tsc_sync_go != TSC_SYNC_GO);

        flags = clear_int_flag();

        /*
         * Wait for the slave CPU to arrive.
         */
        while (tsc_ready != TSC_SYNC_GO)
                continue;

        /*
         * Tell the slave CPU to begin reading its TSC; read our own.
         */
        tsc_sync_go = TSC_SYNC_GO;
        hrt = tsc_read();

        /*
         * Tell the slave that we're ready, and wait for the slave to tell us
         * to read our TSC again.
         */
        tsc_ready = TSC_SYNC_AGAIN;
        while (tsc_sync_go != TSC_SYNC_AGAIN)
                continue;

        hrt += tsc_read();
        tsc_sync_snaps[TSC_MASTER] = hrt;

        /*
         * Wait for the slave to finish reading its TSC.
         */
        while (tsc_ready != TSC_SYNC_STOP)
                continue;

        /*
         * At this point, both CPUs have performed their tsc_read() calls.
         * We'll digest it now before letting the slave CPU return.
         */
        tsc_digest(slave);
        tsc_sync_go = TSC_SYNC_STOP;

        restore_int_flag(flags);
}

/*
 * Called by a CPU which has just been onlined.  It is expected that the CPU
 * performing the online operation will call tsc_sync_master().
 */
void
tsc_sync_slave(void)
{
        ulong_t flags;
        hrtime_t hrt;

        if (!tsc_master_slave_sync_needed)
                return;

        ASSERT(tsc_sync_go != TSC_SYNC_GO);

        flags = clear_int_flag();

        /* to test tsc_gethrtime_delta, add wrmsr(REG_TSC, 0) here */

        /*
         * Tell the master CPU that we're ready, and wait for the master to
         * tell us to begin reading our TSC.
         */
        tsc_ready = TSC_SYNC_GO;
        while (tsc_sync_go != TSC_SYNC_GO)
                continue;

        hrt = tsc_read();

        /*
         * Wait for the master CPU to be ready to read its TSC again.
         */
        while (tsc_ready != TSC_SYNC_AGAIN)
                continue;

        /*
         * Tell the master CPU to read its TSC again; read ours again.
         */
        tsc_sync_go = TSC_SYNC_AGAIN;

        hrt += tsc_read();
        tsc_sync_snaps[TSC_SLAVE] = hrt;

        /*
         * Tell the master that we're done, and wait to be dismissed.
         */
        tsc_ready = TSC_SYNC_STOP;
        while (tsc_sync_go != TSC_SYNC_STOP)
                continue;

        restore_int_flag(flags);
}
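
/*
 * The rendezvous implemented by tsc_sync_master() and tsc_sync_slave()
 * above, summarized as a timeline (master on the left, slave on the right):
 *
 *      wait for tsc_ready == GO                tsc_ready = TSC_SYNC_GO
 *      tsc_sync_go = TSC_SYNC_GO               wait for tsc_sync_go == GO
 *      hrt = tsc_read()                        hrt = tsc_read()
 *      tsc_ready = TSC_SYNC_AGAIN              wait for tsc_ready == AGAIN
 *      wait for tsc_sync_go == AGAIN           tsc_sync_go = TSC_SYNC_AGAIN
 *      hrt += tsc_read()                       hrt += tsc_read()
 *      wait for tsc_ready == STOP              tsc_ready = TSC_SYNC_STOP
 *      tsc_digest(slave)                       wait for tsc_sync_go == STOP
 *      tsc_sync_go = TSC_SYNC_STOP             (dismissed)
 *
 * Having each side trigger one of the two sample rounds is what reduces the
 * sampling bias described in the block comment at the top of this file.
 */
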
/*
 * Called once per second on a CPU from the cyclic subsystem's
 * CY_HIGH_LEVEL interrupt (no longer restricted to cpu0).
 */
void
tsc_tick(void)
{
        hrtime_t now, delta;
        ushort_t spl;

        /*
         * Before we set the new variables, we set the shadow values.  This
         * allows for lock-free operation in dtrace_gethrtime().
         */
        lock_set_spl((lock_t *)&shadow_hres_lock + HRES_LOCK_OFFSET,
            ipltospl(CBE_HIGH_PIL), &spl);

        shadow_tsc_hrtime_base = tsc_hrtime_base;
        shadow_tsc_last = tsc_last;
        shadow_nsec_scale = nsec_scale;

        shadow_hres_lock++;
        splx(spl);

        CLOCK_LOCK(&spl);

        now = tsc_read();

        if (gethrtimef == tsc_gethrtime_delta)
                now += tsc_sync_tick_delta[CPU->cpu_id];

        if (now < tsc_last) {
                /*
                 * The TSC has just jumped into the past.  We assume that
                 * this is due to a suspend/resume cycle, and we're going
                 * to use the _current_ value of TSC as the delta.  This
                 * will keep tsc_hrtime_base correct.  We're also going to
                 * assume that the rate of the TSC does not change after a
                 * suspend/resume cycle (i.e., nsec_scale remains the same).
                 */
                delta = now;
                tsc_last_jumped += tsc_last;
                tsc_jumped = 1;
        } else {
                /*
                 * Determine the number of TSC ticks since the last clock
                 * tick, and add that to the hrtime base.
                 */
                delta = now - tsc_last;
        }

        TSC_CONVERT_AND_ADD(delta, tsc_hrtime_base, nsec_scale);
        tsc_last = now;

        CLOCK_UNLOCK(spl);
}

void
tsc_hrtimeinit(uint64_t cpu_freq_hz)
{
        extern int gethrtime_hires;
        longlong_t tsc;
        ulong_t flags;

        /*
         * cpu_freq_hz is the measured CPU frequency in hertz.
         */

        /*
         * We can't accommodate CPUs slower than 31.25 MHz.
         */
        ASSERT(cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT));
        nsec_scale =
            (uint_t)(((uint64_t)NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz);

        flags = clear_int_flag();
        tsc = tsc_read();
        (void) tsc_gethrtime();
        tsc_max_delta = tsc_read() - tsc;
        restore_int_flag(flags);
        gethrtimef = tsc_gethrtime;
        gethrtimeunscaledf = tsc_gethrtimeunscaled;
        scalehrtimef = tsc_scalehrtime;
        hrtime_tick = tsc_tick;
        gethrtime_hires = 1;
}
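
/*
 * A worked example for tsc_hrtimeinit() (the frequency is hypothetical):
 * with cpu_freq_hz = 2,400,000,000 (2.4 GHz),
 *
 *      nsec_scale = (NANOSEC << 27) / 2,400,000,000 ~= 55,924,053
 *
 * so each TSC tick scales to about 55,924,053 / 2^27 ~= 0.4167 ns, i.e.
 * 1/2.4 ns, as expected.  The ASSERT above reflects the requirement that
 * nsec_scale fit in 32 bits: (NANOSEC << 27) / cpu_freq_hz < 2^32 requires
 * cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT), or 31.25 MHz.
 */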