1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Xen time implementation. 4 * 5 * This is implemented in terms of a clocksource driver which uses 6 * the hypervisor clock as a nanosecond timebase, and a clockevent 7 * driver which uses the hypervisor's timer mechanism. 8 * 9 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 10 */ 11 #include <linux/kernel.h> 12 #include <linux/interrupt.h> 13 #include <linux/clocksource.h> 14 #include <linux/clockchips.h> 15 #include <linux/gfp.h> 16 #include <linux/slab.h> 17 #include <linux/pvclock_gtod.h> 18 #include <linux/timekeeper_internal.h> 19 #include <linux/sched/cputime.h> 20 21 #include <asm/cpuid/api.h> 22 #include <asm/pvclock.h> 23 #include <asm/timer.h> 24 #include <asm/xen/hypervisor.h> 25 #include <asm/xen/hypercall.h> 26 #include <asm/xen/cpuid.h> 27 28 #include <xen/events.h> 29 #include <xen/features.h> 30 #include <xen/interface/xen.h> 31 #include <xen/interface/vcpu.h> 32 33 #include "xen-ops.h" 34 35 /* Minimum amount of time until next clock event fires */ 36 #define TIMER_SLOP 1 37 38 static u64 xen_sched_clock_offset __read_mostly; 39 40 /* Get the TSC speed from Xen */ 41 static unsigned long xen_tsc_khz(void) 42 { 43 struct pvclock_vcpu_time_info *info = 44 &HYPERVISOR_shared_info->vcpu_info[0].time; 45 46 setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); 47 return pvclock_tsc_khz(info); 48 } 49 50 static u64 xen_clocksource_read(void) 51 { 52 struct pvclock_vcpu_time_info *src; 53 u64 ret; 54 55 preempt_disable_notrace(); 56 src = &__this_cpu_read(xen_vcpu)->time; 57 ret = pvclock_clocksource_read(src); 58 preempt_enable_notrace(); 59 return ret; 60 } 61 62 static u64 xen_clocksource_get_cycles(struct clocksource *cs) 63 { 64 return xen_clocksource_read(); 65 } 66 67 static noinstr u64 xen_sched_clock(void) 68 { 69 struct pvclock_vcpu_time_info *src; 70 u64 ret; 71 72 src = &__this_cpu_read(xen_vcpu)->time; 73 ret = pvclock_clocksource_read_nowd(src); 74 ret -= xen_sched_clock_offset; 75 76 return ret; 77 } 78 79 static void xen_read_wallclock(struct timespec64 *ts) 80 { 81 struct shared_info *s = HYPERVISOR_shared_info; 82 struct pvclock_wall_clock *wall_clock = &(s->wc); 83 struct pvclock_vcpu_time_info *vcpu_time; 84 85 vcpu_time = &get_cpu_var(xen_vcpu)->time; 86 pvclock_read_wallclock(wall_clock, vcpu_time, ts); 87 put_cpu_var(xen_vcpu); 88 } 89 90 static void xen_get_wallclock(struct timespec64 *now) 91 { 92 xen_read_wallclock(now); 93 } 94 95 static int xen_set_wallclock(const struct timespec64 *now) 96 { 97 return -ENODEV; 98 } 99 100 static int xen_pvclock_gtod_notify(struct notifier_block *nb, 101 unsigned long was_set, void *priv) 102 { 103 /* Protected by the calling core code serialization */ 104 static struct timespec64 next_sync; 105 106 struct xen_platform_op op; 107 struct timespec64 now; 108 struct timekeeper *tk = priv; 109 static bool settime64_supported = true; 110 int ret; 111 112 now.tv_sec = tk->xtime_sec; 113 now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); 114 115 /* 116 * We only take the expensive HV call when the clock was set 117 * or when the 11 minutes RTC synchronization time elapsed. 118 */ 119 if (!was_set && timespec64_compare(&now, &next_sync) < 0) 120 return NOTIFY_OK; 121 122 again: 123 if (settime64_supported) { 124 op.cmd = XENPF_settime64; 125 op.u.settime64.mbz = 0; 126 op.u.settime64.secs = now.tv_sec; 127 op.u.settime64.nsecs = now.tv_nsec; 128 op.u.settime64.system_time = xen_clocksource_read(); 129 } else { 130 op.cmd = XENPF_settime32; 131 op.u.settime32.secs = now.tv_sec; 132 op.u.settime32.nsecs = now.tv_nsec; 133 op.u.settime32.system_time = xen_clocksource_read(); 134 } 135 136 ret = HYPERVISOR_platform_op(&op); 137 138 if (ret == -ENOSYS && settime64_supported) { 139 settime64_supported = false; 140 goto again; 141 } 142 if (ret < 0) 143 return NOTIFY_BAD; 144 145 /* 146 * Move the next drift compensation time 11 minutes 147 * ahead. That's emulating the sync_cmos_clock() update for 148 * the hardware RTC. 149 */ 150 next_sync = now; 151 next_sync.tv_sec += 11 * 60; 152 153 return NOTIFY_OK; 154 } 155 156 static struct notifier_block xen_pvclock_gtod_notifier = { 157 .notifier_call = xen_pvclock_gtod_notify, 158 }; 159 160 static int xen_cs_enable(struct clocksource *cs) 161 { 162 vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK); 163 return 0; 164 } 165 166 static struct clocksource xen_clocksource __read_mostly = { 167 .name = "xen", 168 .rating = 400, 169 .read = xen_clocksource_get_cycles, 170 .mask = CLOCKSOURCE_MASK(64), 171 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 172 .enable = xen_cs_enable, 173 }; 174 175 /* 176 Xen clockevent implementation 177 178 Xen has two clockevent implementations: 179 180 The old timer_op one works with all released versions of Xen prior 181 to version 3.0.4. This version of the hypervisor provides a 182 single-shot timer with nanosecond resolution. However, sharing the 183 same event channel is a 100Hz tick which is delivered while the 184 vcpu is running. We don't care about or use this tick, but it will 185 cause the core time code to think the timer fired too soon, and 186 will end up resetting it each time. It could be filtered, but 187 doing so has complications when the ktime clocksource is not yet 188 the xen clocksource (ie, at boot time). 189 190 The new vcpu_op-based timer interface allows the tick timer period 191 to be changed or turned off. The tick timer is not useful as a 192 periodic timer because events are only delivered to running vcpus. 193 The one-shot timer can report when a timeout is in the past, so 194 set_next_event is capable of returning -ETIME when appropriate. 195 This interface is used when available. 196 */ 197 198 199 /* 200 Get a hypervisor absolute time. In theory we could maintain an 201 offset between the kernel's time and the hypervisor's time, and 202 apply that to a kernel's absolute timeout. Unfortunately the 203 hypervisor and kernel times can drift even if the kernel is using 204 the Xen clocksource, because ntp can warp the kernel's clocksource. 205 */ 206 static s64 get_abs_timeout(unsigned long delta) 207 { 208 return xen_clocksource_read() + delta; 209 } 210 211 static int xen_timerop_shutdown(struct clock_event_device *evt) 212 { 213 /* cancel timeout */ 214 HYPERVISOR_set_timer_op(0); 215 216 return 0; 217 } 218 219 static int xen_timerop_set_next_event(unsigned long delta, 220 struct clock_event_device *evt) 221 { 222 WARN_ON(!clockevent_state_oneshot(evt)); 223 224 if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0) 225 BUG(); 226 227 /* We may have missed the deadline, but there's no real way of 228 knowing for sure. If the event was in the past, then we'll 229 get an immediate interrupt. */ 230 231 return 0; 232 } 233 234 static struct clock_event_device xen_timerop_clockevent __ro_after_init = { 235 .name = "xen", 236 .features = CLOCK_EVT_FEAT_ONESHOT, 237 238 .max_delta_ns = 0xffffffff, 239 .max_delta_ticks = 0xffffffff, 240 .min_delta_ns = TIMER_SLOP, 241 .min_delta_ticks = TIMER_SLOP, 242 243 .mult = 1, 244 .shift = 0, 245 .rating = 500, 246 247 .set_state_shutdown = xen_timerop_shutdown, 248 .set_next_event = xen_timerop_set_next_event, 249 }; 250 251 static int xen_vcpuop_shutdown(struct clock_event_device *evt) 252 { 253 int cpu = smp_processor_id(); 254 255 if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, xen_vcpu_nr(cpu), 256 NULL) || 257 HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu), 258 NULL)) 259 BUG(); 260 261 return 0; 262 } 263 264 static int xen_vcpuop_set_oneshot(struct clock_event_device *evt) 265 { 266 int cpu = smp_processor_id(); 267 268 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu), 269 NULL)) 270 BUG(); 271 272 return 0; 273 } 274 275 static int xen_vcpuop_set_next_event(unsigned long delta, 276 struct clock_event_device *evt) 277 { 278 int cpu = smp_processor_id(); 279 struct vcpu_set_singleshot_timer single; 280 int ret; 281 282 WARN_ON(!clockevent_state_oneshot(evt)); 283 284 single.timeout_abs_ns = get_abs_timeout(delta); 285 /* Get an event anyway, even if the timeout is already expired */ 286 single.flags = 0; 287 288 ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, xen_vcpu_nr(cpu), 289 &single); 290 BUG_ON(ret != 0); 291 292 return ret; 293 } 294 295 static struct clock_event_device xen_vcpuop_clockevent __ro_after_init = { 296 .name = "xen", 297 .features = CLOCK_EVT_FEAT_ONESHOT, 298 299 .max_delta_ns = 0xffffffff, 300 .max_delta_ticks = 0xffffffff, 301 .min_delta_ns = TIMER_SLOP, 302 .min_delta_ticks = TIMER_SLOP, 303 304 .mult = 1, 305 .shift = 0, 306 .rating = 500, 307 308 .set_state_shutdown = xen_vcpuop_shutdown, 309 .set_state_oneshot = xen_vcpuop_set_oneshot, 310 .set_next_event = xen_vcpuop_set_next_event, 311 }; 312 313 static const struct clock_event_device *xen_clockevent = 314 &xen_timerop_clockevent; 315 316 struct xen_clock_event_device { 317 struct clock_event_device evt; 318 char name[16]; 319 }; 320 static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 }; 321 322 static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) 323 { 324 struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt); 325 irqreturn_t ret; 326 327 ret = IRQ_NONE; 328 if (evt->event_handler) { 329 evt->event_handler(evt); 330 ret = IRQ_HANDLED; 331 } 332 333 return ret; 334 } 335 336 void xen_teardown_timer(int cpu) 337 { 338 struct clock_event_device *evt; 339 evt = &per_cpu(xen_clock_events, cpu).evt; 340 341 if (evt->irq >= 0) { 342 unbind_from_irqhandler(evt->irq, NULL); 343 evt->irq = -1; 344 } 345 } 346 347 void xen_setup_timer(int cpu) 348 { 349 struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu); 350 struct clock_event_device *evt = &xevt->evt; 351 int irq; 352 353 WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu); 354 if (evt->irq >= 0) 355 xen_teardown_timer(cpu); 356 357 printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); 358 359 snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu); 360 361 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, 362 IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER| 363 IRQF_FORCE_RESUME|IRQF_EARLY_RESUME, 364 xevt->name, NULL); 365 (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX); 366 367 memcpy(evt, xen_clockevent, sizeof(*evt)); 368 369 evt->cpumask = cpumask_of(cpu); 370 evt->irq = irq; 371 } 372 373 374 void xen_setup_cpu_clockevents(void) 375 { 376 clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt)); 377 } 378 379 void xen_timer_resume(void) 380 { 381 int cpu; 382 383 if (xen_clockevent != &xen_vcpuop_clockevent) 384 return; 385 386 for_each_online_cpu(cpu) { 387 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, 388 xen_vcpu_nr(cpu), NULL)) 389 BUG(); 390 } 391 } 392 393 static struct pvclock_vsyscall_time_info *xen_clock __read_mostly; 394 static u64 xen_clock_value_saved; 395 396 void xen_save_time_memory_area(void) 397 { 398 struct vcpu_register_time_memory_area t; 399 int ret; 400 401 xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset; 402 403 if (!xen_clock) 404 return; 405 406 t.addr.v = NULL; 407 408 ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); 409 if (ret != 0) 410 pr_notice("Cannot save secondary vcpu_time_info (err %d)", 411 ret); 412 else 413 clear_page(xen_clock); 414 } 415 416 void xen_restore_time_memory_area(void) 417 { 418 struct vcpu_register_time_memory_area t; 419 int ret; 420 421 if (!xen_clock) 422 goto out; 423 424 t.addr.v = &xen_clock->pvti; 425 426 ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); 427 428 /* 429 * We don't disable VDSO_CLOCKMODE_PVCLOCK entirely if it fails to 430 * register the secondary time info with Xen or if we migrated to a 431 * host without the necessary flags. On both of these cases what 432 * happens is either process seeing a zeroed out pvti or seeing no 433 * PVCLOCK_TSC_STABLE_BIT bit set. Userspace checks the latter and 434 * if 0, it discards the data in pvti and fallbacks to a system 435 * call for a reliable timestamp. 436 */ 437 if (ret != 0) 438 pr_notice("Cannot restore secondary vcpu_time_info (err %d)", 439 ret); 440 441 out: 442 /* Need pvclock_resume() before using xen_clocksource_read(). */ 443 pvclock_resume(); 444 xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved; 445 } 446 447 static void xen_setup_vsyscall_time_info(void) 448 { 449 struct vcpu_register_time_memory_area t; 450 struct pvclock_vsyscall_time_info *ti; 451 int ret; 452 453 ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL); 454 if (!ti) 455 return; 456 457 t.addr.v = &ti->pvti; 458 459 ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); 460 if (ret) { 461 pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (err %d)\n", ret); 462 free_page((unsigned long)ti); 463 return; 464 } 465 466 /* 467 * If primary time info had this bit set, secondary should too since 468 * it's the same data on both just different memory regions. But we 469 * still check it in case hypervisor is buggy. 470 */ 471 if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) { 472 t.addr.v = NULL; 473 ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 474 0, &t); 475 if (!ret) 476 free_page((unsigned long)ti); 477 478 pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (tsc unstable)\n"); 479 return; 480 } 481 482 xen_clock = ti; 483 pvclock_set_pvti_cpu0_va(xen_clock); 484 485 xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; 486 } 487 488 /* 489 * Check if it is possible to safely use the tsc as a clocksource. This is 490 * only true if the hypervisor notifies the guest that its tsc is invariant, 491 * the tsc is stable, and the tsc instruction will never be emulated. 492 */ 493 static int __init xen_tsc_safe_clocksource(void) 494 { 495 u32 eax, ebx, ecx, edx; 496 497 if (!(boot_cpu_has(X86_FEATURE_CONSTANT_TSC))) 498 return 0; 499 500 if (!(boot_cpu_has(X86_FEATURE_NONSTOP_TSC))) 501 return 0; 502 503 if (check_tsc_unstable()) 504 return 0; 505 506 /* Leaf 4, sub-leaf 0 (0x40000x03) */ 507 cpuid_count(xen_cpuid_base() + 3, 0, &eax, &ebx, &ecx, &edx); 508 509 return ebx == XEN_CPUID_TSC_MODE_NEVER_EMULATE; 510 } 511 512 static void __init xen_time_init(void) 513 { 514 struct pvclock_vcpu_time_info *pvti; 515 int cpu = smp_processor_id(); 516 struct timespec64 tp; 517 518 /* 519 * As Dom0 is never moved, no penalty on using TSC there. 520 * 521 * If it is possible for the guest to determine that the tsc is a safe 522 * clocksource, then set xen_clocksource rating below that of the tsc 523 * so that the system prefers tsc instead. 524 */ 525 if (xen_initial_domain()) 526 xen_clocksource.rating = 275; 527 else if (xen_tsc_safe_clocksource()) 528 xen_clocksource.rating = 299; 529 530 clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC); 531 532 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu), 533 NULL) == 0) { 534 /* Successfully turned off 100Hz tick, so we have the 535 vcpuop-based timer interface */ 536 printk(KERN_DEBUG "Xen: using vcpuop timer interface\n"); 537 xen_clockevent = &xen_vcpuop_clockevent; 538 } 539 540 /* Set initial system time with full resolution */ 541 xen_read_wallclock(&tp); 542 do_settimeofday64(&tp); 543 544 setup_force_cpu_cap(X86_FEATURE_TSC); 545 546 /* 547 * We check ahead on the primary time info if this 548 * bit is supported hence speeding up Xen clocksource. 549 */ 550 pvti = &__this_cpu_read(xen_vcpu)->time; 551 if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) { 552 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); 553 xen_setup_vsyscall_time_info(); 554 } 555 556 xen_setup_runstate_info(cpu); 557 xen_setup_timer(cpu); 558 xen_setup_cpu_clockevents(); 559 560 xen_time_setup_guest(); 561 562 if (xen_initial_domain()) 563 pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); 564 } 565 566 static void __init xen_init_time_common(void) 567 { 568 xen_sched_clock_offset = xen_clocksource_read(); 569 static_call_update(pv_steal_clock, xen_steal_clock); 570 paravirt_set_sched_clock(xen_sched_clock); 571 572 x86_platform.calibrate_tsc = xen_tsc_khz; 573 x86_platform.get_wallclock = xen_get_wallclock; 574 } 575 576 void __init xen_init_time_ops(void) 577 { 578 xen_init_time_common(); 579 580 x86_init.timers.timer_init = xen_time_init; 581 x86_init.timers.setup_percpu_clockev = x86_init_noop; 582 x86_cpuinit.setup_percpu_clockev = x86_init_noop; 583 584 /* Dom0 uses the native method to set the hardware RTC. */ 585 if (!xen_initial_domain()) 586 x86_platform.set_wallclock = xen_set_wallclock; 587 } 588 589 #ifdef CONFIG_XEN_PVHVM 590 static void xen_hvm_setup_cpu_clockevents(void) 591 { 592 int cpu = smp_processor_id(); 593 xen_setup_runstate_info(cpu); 594 /* 595 * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence 596 * doing it xen_hvm_cpu_notify (which gets called by smp_init during 597 * early bootup and also during CPU hotplug events). 598 */ 599 xen_setup_cpu_clockevents(); 600 } 601 602 void __init xen_hvm_init_time_ops(void) 603 { 604 static bool hvm_time_initialized; 605 606 if (hvm_time_initialized) 607 return; 608 609 /* 610 * vector callback is needed otherwise we cannot receive interrupts 611 * on cpu > 0 and at this point we don't know how many cpus are 612 * available. 613 */ 614 if (!xen_have_vector_callback) 615 return; 616 617 if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { 618 pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer"); 619 return; 620 } 621 622 /* 623 * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'. 624 * The __this_cpu_read(xen_vcpu) is still NULL when Xen HVM guest 625 * boots on vcpu >= MAX_VIRT_CPUS (e.g., kexec), To access 626 * __this_cpu_read(xen_vcpu) via xen_clocksource_read() will panic. 627 * 628 * The xen_hvm_init_time_ops() should be called again later after 629 * __this_cpu_read(xen_vcpu) is available. 630 */ 631 if (!__this_cpu_read(xen_vcpu)) { 632 pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n", 633 xen_vcpu_nr(0)); 634 return; 635 } 636 637 xen_init_time_common(); 638 639 x86_init.timers.setup_percpu_clockev = xen_time_init; 640 x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; 641 642 x86_platform.set_wallclock = xen_set_wallclock; 643 644 hvm_time_initialized = true; 645 } 646 #endif 647 648 /* Kernel parameter to specify Xen timer slop */ 649 static int __init parse_xen_timer_slop(char *ptr) 650 { 651 unsigned long slop = memparse(ptr, NULL); 652 653 xen_timerop_clockevent.min_delta_ns = slop; 654 xen_timerop_clockevent.min_delta_ticks = slop; 655 xen_vcpuop_clockevent.min_delta_ns = slop; 656 xen_vcpuop_clockevent.min_delta_ticks = slop; 657 658 return 0; 659 } 660 early_param("xen_timer_slop", parse_xen_timer_slop); 661