1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * Clocksource driver for the synthetic counter and timers 5 * provided by the Hyper-V hypervisor to guest VMs, as described 6 * in the Hyper-V Top Level Functional Spec (TLFS). This driver 7 * is instruction set architecture independent. 8 * 9 * Copyright (C) 2019, Microsoft, Inc. 10 * 11 * Author: Michael Kelley <mikelley@microsoft.com> 12 */ 13 14 #include <linux/percpu.h> 15 #include <linux/cpumask.h> 16 #include <linux/clockchips.h> 17 #include <linux/clocksource.h> 18 #include <linux/sched_clock.h> 19 #include <linux/mm.h> 20 #include <linux/cpuhotplug.h> 21 #include <linux/interrupt.h> 22 #include <linux/irq.h> 23 #include <linux/acpi.h> 24 #include <linux/hyperv.h> 25 #include <clocksource/hyperv_timer.h> 26 #include <asm/hyperv-tlfs.h> 27 #include <asm/mshyperv.h> 28 29 static struct clock_event_device __percpu *hv_clock_event; 30 /* Note: offset can hold negative values after hibernation. */ 31 static u64 hv_sched_clock_offset __read_mostly; 32 33 /* 34 * If false, we're using the old mechanism for stimer0 interrupts 35 * where it sends a VMbus message when it expires. The old 36 * mechanism is used when running on older versions of Hyper-V 37 * that don't support Direct Mode. While Hyper-V provides 38 * four stimer's per CPU, Linux uses only stimer0. 39 * 40 * Because Direct Mode does not require processing a VMbus 41 * message, stimer interrupts can be enabled earlier in the 42 * process of booting a CPU, and consistent with when timer 43 * interrupts are enabled for other clocksource drivers. 44 * However, for legacy versions of Hyper-V when Direct Mode 45 * is not enabled, setting up stimer interrupts must be 46 * delayed until VMbus is initialized and can process the 47 * interrupt message. 48 */ 49 static bool direct_mode_enabled; 50 51 static int stimer0_irq = -1; 52 static int stimer0_message_sint; 53 static __maybe_unused DEFINE_PER_CPU(long, stimer0_evt); 54 55 /* 56 * Common code for stimer0 interrupts coming via Direct Mode or 57 * as a VMbus message. 58 */ 59 void hv_stimer0_isr(void) 60 { 61 struct clock_event_device *ce; 62 63 ce = this_cpu_ptr(hv_clock_event); 64 ce->event_handler(ce); 65 } 66 EXPORT_SYMBOL_GPL(hv_stimer0_isr); 67 68 /* 69 * stimer0 interrupt handler for architectures that support 70 * per-cpu interrupts, which also implies Direct Mode. 71 */ 72 static irqreturn_t __maybe_unused hv_stimer0_percpu_isr(int irq, void *dev_id) 73 { 74 hv_stimer0_isr(); 75 return IRQ_HANDLED; 76 } 77 78 static int hv_ce_set_next_event(unsigned long delta, 79 struct clock_event_device *evt) 80 { 81 u64 current_tick; 82 83 current_tick = hv_read_reference_counter(); 84 current_tick += delta; 85 hv_set_msr(HV_MSR_STIMER0_COUNT, current_tick); 86 return 0; 87 } 88 89 static int hv_ce_shutdown(struct clock_event_device *evt) 90 { 91 hv_set_msr(HV_MSR_STIMER0_COUNT, 0); 92 hv_set_msr(HV_MSR_STIMER0_CONFIG, 0); 93 if (direct_mode_enabled && stimer0_irq >= 0) 94 disable_percpu_irq(stimer0_irq); 95 96 return 0; 97 } 98 99 static int hv_ce_set_oneshot(struct clock_event_device *evt) 100 { 101 union hv_stimer_config timer_cfg; 102 103 timer_cfg.as_uint64 = 0; 104 timer_cfg.enable = 1; 105 timer_cfg.auto_enable = 1; 106 if (direct_mode_enabled) { 107 /* 108 * When it expires, the timer will directly interrupt 109 * on the specified hardware vector/IRQ. 110 */ 111 timer_cfg.direct_mode = 1; 112 timer_cfg.apic_vector = HYPERV_STIMER0_VECTOR; 113 if (stimer0_irq >= 0) 114 enable_percpu_irq(stimer0_irq, IRQ_TYPE_NONE); 115 } else { 116 /* 117 * When it expires, the timer will generate a VMbus message, 118 * to be handled by the normal VMbus interrupt handler. 119 */ 120 timer_cfg.direct_mode = 0; 121 timer_cfg.sintx = stimer0_message_sint; 122 } 123 hv_set_msr(HV_MSR_STIMER0_CONFIG, timer_cfg.as_uint64); 124 return 0; 125 } 126 127 /* 128 * hv_stimer_init - Per-cpu initialization of the clockevent 129 */ 130 static int hv_stimer_init(unsigned int cpu) 131 { 132 struct clock_event_device *ce; 133 134 if (!hv_clock_event) 135 return 0; 136 137 ce = per_cpu_ptr(hv_clock_event, cpu); 138 ce->name = "Hyper-V clockevent"; 139 ce->features = CLOCK_EVT_FEAT_ONESHOT; 140 ce->cpumask = cpumask_of(cpu); 141 142 /* 143 * Lower the rating of the Hyper-V timer in a TDX VM without paravisor, 144 * so the local APIC timer (lapic_clockevent) is the default timer in 145 * such a VM. The Hyper-V timer is not preferred in such a VM because 146 * it depends on the slow VM Reference Counter MSR (the Hyper-V TSC 147 * page is not enbled in such a VM because the VM uses Invariant TSC 148 * as a better clocksource and it's challenging to mark the Hyper-V 149 * TSC page shared in very early boot). 150 */ 151 if (!ms_hyperv.paravisor_present && hv_isolation_type_tdx()) 152 ce->rating = 90; 153 else 154 ce->rating = 1000; 155 156 ce->set_state_shutdown = hv_ce_shutdown; 157 ce->set_state_oneshot = hv_ce_set_oneshot; 158 ce->set_next_event = hv_ce_set_next_event; 159 160 clockevents_config_and_register(ce, 161 HV_CLOCK_HZ, 162 HV_MIN_DELTA_TICKS, 163 HV_MAX_MAX_DELTA_TICKS); 164 return 0; 165 } 166 167 /* 168 * hv_stimer_cleanup - Per-cpu cleanup of the clockevent 169 */ 170 int hv_stimer_cleanup(unsigned int cpu) 171 { 172 struct clock_event_device *ce; 173 174 if (!hv_clock_event) 175 return 0; 176 177 /* 178 * In the legacy case where Direct Mode is not enabled 179 * (which can only be on x86/64), stimer cleanup happens 180 * relatively early in the CPU offlining process. We 181 * must unbind the stimer-based clockevent device so 182 * that the LAPIC timer can take over until clockevents 183 * are no longer needed in the offlining process. Note 184 * that clockevents_unbind_device() eventually calls 185 * hv_ce_shutdown(). 186 * 187 * The unbind should not be done when Direct Mode is 188 * enabled because we may be on an architecture where 189 * there are no other clockevent devices to fallback to. 190 */ 191 ce = per_cpu_ptr(hv_clock_event, cpu); 192 if (direct_mode_enabled) 193 hv_ce_shutdown(ce); 194 else 195 clockevents_unbind_device(ce, cpu); 196 197 return 0; 198 } 199 EXPORT_SYMBOL_GPL(hv_stimer_cleanup); 200 201 /* 202 * These placeholders are overridden by arch specific code on 203 * architectures that need special setup of the stimer0 IRQ because 204 * they don't support per-cpu IRQs (such as x86/x64). 205 */ 206 void __weak hv_setup_stimer0_handler(void (*handler)(void)) 207 { 208 }; 209 210 void __weak hv_remove_stimer0_handler(void) 211 { 212 }; 213 214 #ifdef CONFIG_ACPI 215 /* Called only on architectures with per-cpu IRQs (i.e., not x86/x64) */ 216 static int hv_setup_stimer0_irq(void) 217 { 218 int ret; 219 220 ret = acpi_register_gsi(NULL, HYPERV_STIMER0_VECTOR, 221 ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_HIGH); 222 if (ret < 0) { 223 pr_err("Can't register Hyper-V stimer0 GSI. Error %d", ret); 224 return ret; 225 } 226 stimer0_irq = ret; 227 228 ret = request_percpu_irq(stimer0_irq, hv_stimer0_percpu_isr, 229 "Hyper-V stimer0", &stimer0_evt); 230 if (ret) { 231 pr_err("Can't request Hyper-V stimer0 IRQ %d. Error %d", 232 stimer0_irq, ret); 233 acpi_unregister_gsi(stimer0_irq); 234 stimer0_irq = -1; 235 } 236 return ret; 237 } 238 239 static void hv_remove_stimer0_irq(void) 240 { 241 if (stimer0_irq == -1) { 242 hv_remove_stimer0_handler(); 243 } else { 244 free_percpu_irq(stimer0_irq, &stimer0_evt); 245 acpi_unregister_gsi(stimer0_irq); 246 stimer0_irq = -1; 247 } 248 } 249 #else 250 static int hv_setup_stimer0_irq(void) 251 { 252 return 0; 253 } 254 255 static void hv_remove_stimer0_irq(void) 256 { 257 } 258 #endif 259 260 /* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */ 261 int hv_stimer_alloc(bool have_percpu_irqs) 262 { 263 int ret; 264 265 /* 266 * Synthetic timers are always available except on old versions of 267 * Hyper-V on x86. In that case, return as error as Linux will use a 268 * clockevent based on emulated LAPIC timer hardware. 269 */ 270 if (!(ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE)) 271 return -EINVAL; 272 273 hv_clock_event = alloc_percpu(struct clock_event_device); 274 if (!hv_clock_event) 275 return -ENOMEM; 276 277 direct_mode_enabled = ms_hyperv.misc_features & 278 HV_STIMER_DIRECT_MODE_AVAILABLE; 279 280 /* 281 * If Direct Mode isn't enabled, the remainder of the initialization 282 * is done later by hv_stimer_legacy_init() 283 */ 284 if (!direct_mode_enabled) 285 return 0; 286 287 if (have_percpu_irqs) { 288 ret = hv_setup_stimer0_irq(); 289 if (ret) 290 goto free_clock_event; 291 } else { 292 hv_setup_stimer0_handler(hv_stimer0_isr); 293 } 294 295 /* 296 * Since we are in Direct Mode, stimer initialization 297 * can be done now with a CPUHP value in the same range 298 * as other clockevent devices. 299 */ 300 ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING, 301 "clockevents/hyperv/stimer:starting", 302 hv_stimer_init, hv_stimer_cleanup); 303 if (ret < 0) { 304 hv_remove_stimer0_irq(); 305 goto free_clock_event; 306 } 307 return ret; 308 309 free_clock_event: 310 free_percpu(hv_clock_event); 311 hv_clock_event = NULL; 312 return ret; 313 } 314 EXPORT_SYMBOL_GPL(hv_stimer_alloc); 315 316 /* 317 * hv_stimer_legacy_init -- Called from the VMbus driver to handle 318 * the case when Direct Mode is not enabled, and the stimer 319 * must be initialized late in the CPU onlining process. 320 * 321 */ 322 void hv_stimer_legacy_init(unsigned int cpu, int sint) 323 { 324 if (direct_mode_enabled) 325 return; 326 327 /* 328 * This function gets called by each vCPU, so setting the 329 * global stimer_message_sint value each time is conceptually 330 * not ideal, but the value passed in is always the same and 331 * it avoids introducing yet another interface into this 332 * clocksource driver just to set the sint in the legacy case. 333 */ 334 stimer0_message_sint = sint; 335 (void)hv_stimer_init(cpu); 336 } 337 EXPORT_SYMBOL_GPL(hv_stimer_legacy_init); 338 339 /* 340 * hv_stimer_legacy_cleanup -- Called from the VMbus driver to 341 * handle the case when Direct Mode is not enabled, and the 342 * stimer must be cleaned up early in the CPU offlining 343 * process. 344 */ 345 void hv_stimer_legacy_cleanup(unsigned int cpu) 346 { 347 if (direct_mode_enabled) 348 return; 349 (void)hv_stimer_cleanup(cpu); 350 } 351 EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup); 352 353 /* 354 * Do a global cleanup of clockevents for the cases of kexec and 355 * vmbus exit 356 */ 357 void hv_stimer_global_cleanup(void) 358 { 359 int cpu; 360 361 /* 362 * hv_stime_legacy_cleanup() will stop the stimer if Direct 363 * Mode is not enabled, and fallback to the LAPIC timer. 364 */ 365 for_each_present_cpu(cpu) { 366 hv_stimer_legacy_cleanup(cpu); 367 } 368 369 if (!hv_clock_event) 370 return; 371 372 if (direct_mode_enabled) { 373 cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING); 374 hv_remove_stimer0_irq(); 375 stimer0_irq = -1; 376 } 377 free_percpu(hv_clock_event); 378 hv_clock_event = NULL; 379 380 } 381 EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); 382 383 static __always_inline u64 read_hv_clock_msr(void) 384 { 385 /* 386 * Read the partition counter to get the current tick count. This count 387 * is set to 0 when the partition is created and is incremented in 100 388 * nanosecond units. 389 * 390 * Use hv_raw_get_msr() because this function is used from 391 * noinstr. Notable; while HV_MSR_TIME_REF_COUNT is a synthetic 392 * register it doesn't need the GHCB path. 393 */ 394 return hv_raw_get_msr(HV_MSR_TIME_REF_COUNT); 395 } 396 397 /* 398 * Code and definitions for the Hyper-V clocksources. Two 399 * clocksources are defined: one that reads the Hyper-V defined MSR, and 400 * the other that uses the TSC reference page feature as defined in the 401 * TLFS. The MSR version is for compatibility with old versions of 402 * Hyper-V and 32-bit x86. The TSC reference page version is preferred. 403 */ 404 405 static union { 406 struct ms_hyperv_tsc_page page; 407 u8 reserved[PAGE_SIZE]; 408 } tsc_pg __bss_decrypted __aligned(PAGE_SIZE); 409 410 static struct ms_hyperv_tsc_page *tsc_page = &tsc_pg.page; 411 static unsigned long tsc_pfn; 412 413 unsigned long hv_get_tsc_pfn(void) 414 { 415 return tsc_pfn; 416 } 417 EXPORT_SYMBOL_GPL(hv_get_tsc_pfn); 418 419 struct ms_hyperv_tsc_page *hv_get_tsc_page(void) 420 { 421 return tsc_page; 422 } 423 EXPORT_SYMBOL_GPL(hv_get_tsc_page); 424 425 static __always_inline u64 read_hv_clock_tsc(void) 426 { 427 u64 cur_tsc, time; 428 429 /* 430 * The Hyper-V Top-Level Function Spec (TLFS), section Timers, 431 * subsection Refererence Counter, guarantees that the TSC and MSR 432 * times are in sync and monotonic. Therefore we can fall back 433 * to the MSR in case the TSC page indicates unavailability. 434 */ 435 if (!hv_read_tsc_page_tsc(tsc_page, &cur_tsc, &time)) 436 time = read_hv_clock_msr(); 437 438 return time; 439 } 440 441 static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg) 442 { 443 return read_hv_clock_tsc(); 444 } 445 446 static u64 noinstr read_hv_sched_clock_tsc(void) 447 { 448 return (read_hv_clock_tsc() - hv_sched_clock_offset) * 449 (NSEC_PER_SEC / HV_CLOCK_HZ); 450 } 451 452 static void suspend_hv_clock_tsc(struct clocksource *arg) 453 { 454 union hv_reference_tsc_msr tsc_msr; 455 456 /* Disable the TSC page */ 457 tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); 458 tsc_msr.enable = 0; 459 hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); 460 } 461 462 463 static void resume_hv_clock_tsc(struct clocksource *arg) 464 { 465 union hv_reference_tsc_msr tsc_msr; 466 467 /* Re-enable the TSC page */ 468 tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); 469 tsc_msr.enable = 1; 470 tsc_msr.pfn = tsc_pfn; 471 hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); 472 } 473 474 /* 475 * Called during resume from hibernation, from overridden 476 * x86_platform.restore_sched_clock_state routine. This is to adjust offsets 477 * used to calculate time for hv tsc page based sched_clock, to account for 478 * time spent before hibernation. 479 */ 480 void hv_adj_sched_clock_offset(u64 offset) 481 { 482 hv_sched_clock_offset -= offset; 483 } 484 485 #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK 486 static int hv_cs_enable(struct clocksource *cs) 487 { 488 vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); 489 return 0; 490 } 491 #endif 492 493 static struct clocksource hyperv_cs_tsc = { 494 .name = "hyperv_clocksource_tsc_page", 495 .rating = 500, 496 .read = read_hv_clock_tsc_cs, 497 .mask = CLOCKSOURCE_MASK(64), 498 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 499 .suspend= suspend_hv_clock_tsc, 500 .resume = resume_hv_clock_tsc, 501 #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK 502 .enable = hv_cs_enable, 503 .vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK, 504 #else 505 .vdso_clock_mode = VDSO_CLOCKMODE_NONE, 506 #endif 507 }; 508 509 static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) 510 { 511 return read_hv_clock_msr(); 512 } 513 514 static struct clocksource hyperv_cs_msr = { 515 .name = "hyperv_clocksource_msr", 516 .rating = 495, 517 .read = read_hv_clock_msr_cs, 518 .mask = CLOCKSOURCE_MASK(64), 519 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 520 }; 521 522 /* 523 * Reference to pv_ops must be inline so objtool 524 * detection of noinstr violations can work correctly. 525 */ 526 #ifdef CONFIG_GENERIC_SCHED_CLOCK 527 static __always_inline void hv_setup_sched_clock(void *sched_clock) 528 { 529 /* 530 * We're on an architecture with generic sched clock (not x86/x64). 531 * The Hyper-V sched clock read function returns nanoseconds, not 532 * the normal 100ns units of the Hyper-V synthetic clock. 533 */ 534 sched_clock_register(sched_clock, 64, NSEC_PER_SEC); 535 } 536 #elif defined CONFIG_PARAVIRT 537 static __always_inline void hv_setup_sched_clock(void *sched_clock) 538 { 539 /* We're on x86/x64 *and* using PV ops */ 540 paravirt_set_sched_clock(sched_clock); 541 } 542 #else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */ 543 static __always_inline void hv_setup_sched_clock(void *sched_clock) {} 544 #endif /* CONFIG_GENERIC_SCHED_CLOCK */ 545 546 static void __init hv_init_tsc_clocksource(void) 547 { 548 union hv_reference_tsc_msr tsc_msr; 549 550 /* 551 * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly 552 * handles frequency and offset changes due to live migration, 553 * pause/resume, and other VM management operations. So lower the 554 * Hyper-V Reference TSC rating, causing the generic TSC to be used. 555 * TSC_INVARIANT is not offered on ARM64, so the Hyper-V Reference 556 * TSC will be preferred over the virtualized ARM64 arch counter. 557 */ 558 if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { 559 hyperv_cs_tsc.rating = 250; 560 hyperv_cs_msr.rating = 245; 561 } 562 563 if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 564 return; 565 566 hv_read_reference_counter = read_hv_clock_tsc; 567 568 /* 569 * TSC page mapping works differently in root compared to guest. 570 * - In guest partition the guest PFN has to be passed to the 571 * hypervisor. 572 * - In root partition it's other way around: it has to map the PFN 573 * provided by the hypervisor. 574 * But it can't be mapped right here as it's too early and MMU isn't 575 * ready yet. So, we only set the enable bit here and will remap the 576 * page later in hv_remap_tsc_clocksource(). 577 * 578 * It worth mentioning, that TSC clocksource read function 579 * (read_hv_clock_tsc) has a MSR-based fallback mechanism, used when 580 * TSC page is zeroed (which is the case until the PFN is remapped) and 581 * thus TSC clocksource will work even without the real TSC page 582 * mapped. 583 */ 584 tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); 585 if (hv_root_partition) 586 tsc_pfn = tsc_msr.pfn; 587 else 588 tsc_pfn = HVPFN_DOWN(virt_to_phys(tsc_page)); 589 tsc_msr.enable = 1; 590 tsc_msr.pfn = tsc_pfn; 591 hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); 592 593 clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); 594 595 /* 596 * If TSC is invariant, then let it stay as the sched clock since it 597 * will be faster than reading the TSC page. But if not invariant, use 598 * the TSC page so that live migrations across hosts with different 599 * frequencies is handled correctly. 600 */ 601 if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) { 602 hv_sched_clock_offset = hv_read_reference_counter(); 603 hv_setup_sched_clock(read_hv_sched_clock_tsc); 604 } 605 } 606 607 void __init hv_init_clocksource(void) 608 { 609 /* 610 * Try to set up the TSC page clocksource, then the MSR clocksource. 611 * At least one of these will always be available except on very old 612 * versions of Hyper-V on x86. In that case we won't have a Hyper-V 613 * clocksource, but Linux will still run with a clocksource based 614 * on the emulated PIT or LAPIC timer. 615 * 616 * Never use the MSR clocksource as sched clock. It's too slow. 617 * Better to use the native sched clock as the fallback. 618 */ 619 hv_init_tsc_clocksource(); 620 621 if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE) 622 clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); 623 } 624 625 void __init hv_remap_tsc_clocksource(void) 626 { 627 if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 628 return; 629 630 if (!hv_root_partition) { 631 WARN(1, "%s: attempt to remap TSC page in guest partition\n", 632 __func__); 633 return; 634 } 635 636 tsc_page = memremap(tsc_pfn << HV_HYP_PAGE_SHIFT, sizeof(tsc_pg), 637 MEMREMAP_WB); 638 if (!tsc_page) 639 pr_err("Failed to remap Hyper-V TSC page.\n"); 640 } 641