1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * Clocksource driver for the synthetic counter and timers 5 * provided by the Hyper-V hypervisor to guest VMs, as described 6 * in the Hyper-V Top Level Functional Spec (TLFS). This driver 7 * is instruction set architecture independent. 8 * 9 * Copyright (C) 2019, Microsoft, Inc. 10 * 11 * Author: Michael Kelley <mikelley@microsoft.com> 12 */ 13 14 #include <linux/percpu.h> 15 #include <linux/cpumask.h> 16 #include <linux/clockchips.h> 17 #include <linux/clocksource.h> 18 #include <linux/sched_clock.h> 19 #include <linux/mm.h> 20 #include <linux/cpuhotplug.h> 21 #include <linux/interrupt.h> 22 #include <linux/irq.h> 23 #include <linux/acpi.h> 24 #include <linux/hyperv.h> 25 #include <linux/export.h> 26 #include <clocksource/hyperv_timer.h> 27 #include <hyperv/hvhdk.h> 28 #include <asm/mshyperv.h> 29 30 static struct clock_event_device __percpu *hv_clock_event; 31 /* Note: offset can hold negative values after hibernation. */ 32 static u64 hv_sched_clock_offset __read_mostly; 33 34 /* 35 * If false, we're using the old mechanism for stimer0 interrupts 36 * where it sends a VMbus message when it expires. The old 37 * mechanism is used when running on older versions of Hyper-V 38 * that don't support Direct Mode. While Hyper-V provides 39 * four stimer's per CPU, Linux uses only stimer0. 40 * 41 * Because Direct Mode does not require processing a VMbus 42 * message, stimer interrupts can be enabled earlier in the 43 * process of booting a CPU, and consistent with when timer 44 * interrupts are enabled for other clocksource drivers. 45 * However, for legacy versions of Hyper-V when Direct Mode 46 * is not enabled, setting up stimer interrupts must be 47 * delayed until VMbus is initialized and can process the 48 * interrupt message. 49 */ 50 static bool direct_mode_enabled; 51 52 static int stimer0_irq = -1; 53 static int stimer0_message_sint; 54 static __maybe_unused DEFINE_PER_CPU(long, stimer0_evt); 55 56 /* 57 * Common code for stimer0 interrupts coming via Direct Mode or 58 * as a VMbus message. 59 */ 60 void hv_stimer0_isr(void) 61 { 62 struct clock_event_device *ce; 63 64 ce = this_cpu_ptr(hv_clock_event); 65 ce->event_handler(ce); 66 } 67 EXPORT_SYMBOL_GPL(hv_stimer0_isr); 68 69 /* 70 * stimer0 interrupt handler for architectures that support 71 * per-cpu interrupts, which also implies Direct Mode. 72 */ 73 static irqreturn_t __maybe_unused hv_stimer0_percpu_isr(int irq, void *dev_id) 74 { 75 hv_stimer0_isr(); 76 return IRQ_HANDLED; 77 } 78 79 static int hv_ce_set_next_event(unsigned long delta, 80 struct clock_event_device *evt) 81 { 82 u64 current_tick; 83 84 current_tick = hv_read_reference_counter(); 85 current_tick += delta; 86 hv_set_msr(HV_MSR_STIMER0_COUNT, current_tick); 87 return 0; 88 } 89 90 static int hv_ce_shutdown(struct clock_event_device *evt) 91 { 92 hv_set_msr(HV_MSR_STIMER0_COUNT, 0); 93 hv_set_msr(HV_MSR_STIMER0_CONFIG, 0); 94 if (direct_mode_enabled && stimer0_irq >= 0) 95 disable_percpu_irq(stimer0_irq); 96 97 return 0; 98 } 99 100 static int hv_ce_set_oneshot(struct clock_event_device *evt) 101 { 102 union hv_stimer_config timer_cfg; 103 104 timer_cfg.as_uint64 = 0; 105 timer_cfg.enable = 1; 106 timer_cfg.auto_enable = 1; 107 if (direct_mode_enabled) { 108 /* 109 * When it expires, the timer will directly interrupt 110 * on the specified hardware vector/IRQ. 111 */ 112 timer_cfg.direct_mode = 1; 113 timer_cfg.apic_vector = HYPERV_STIMER0_VECTOR; 114 if (stimer0_irq >= 0) 115 enable_percpu_irq(stimer0_irq, IRQ_TYPE_NONE); 116 } else { 117 /* 118 * When it expires, the timer will generate a VMbus message, 119 * to be handled by the normal VMbus interrupt handler. 120 */ 121 timer_cfg.direct_mode = 0; 122 timer_cfg.sintx = stimer0_message_sint; 123 } 124 hv_set_msr(HV_MSR_STIMER0_CONFIG, timer_cfg.as_uint64); 125 return 0; 126 } 127 128 /* 129 * hv_stimer_init - Per-cpu initialization of the clockevent 130 */ 131 static int hv_stimer_init(unsigned int cpu) 132 { 133 struct clock_event_device *ce; 134 135 if (!hv_clock_event) 136 return 0; 137 138 ce = per_cpu_ptr(hv_clock_event, cpu); 139 ce->name = "Hyper-V clockevent"; 140 ce->features = CLOCK_EVT_FEAT_ONESHOT; 141 ce->cpumask = cpumask_of(cpu); 142 143 /* 144 * Lower the rating of the Hyper-V timer in a TDX VM without paravisor, 145 * so the local APIC timer (lapic_clockevent) is the default timer in 146 * such a VM. The Hyper-V timer is not preferred in such a VM because 147 * it depends on the slow VM Reference Counter MSR (the Hyper-V TSC 148 * page is not enbled in such a VM because the VM uses Invariant TSC 149 * as a better clocksource and it's challenging to mark the Hyper-V 150 * TSC page shared in very early boot). 151 */ 152 if (!ms_hyperv.paravisor_present && hv_isolation_type_tdx()) 153 ce->rating = 90; 154 else 155 ce->rating = 1000; 156 157 ce->set_state_shutdown = hv_ce_shutdown; 158 ce->set_state_oneshot = hv_ce_set_oneshot; 159 ce->set_next_event = hv_ce_set_next_event; 160 161 clockevents_config_and_register(ce, 162 HV_CLOCK_HZ, 163 HV_MIN_DELTA_TICKS, 164 HV_MAX_MAX_DELTA_TICKS); 165 return 0; 166 } 167 168 /* 169 * hv_stimer_cleanup - Per-cpu cleanup of the clockevent 170 */ 171 int hv_stimer_cleanup(unsigned int cpu) 172 { 173 struct clock_event_device *ce; 174 175 if (!hv_clock_event) 176 return 0; 177 178 /* 179 * In the legacy case where Direct Mode is not enabled 180 * (which can only be on x86/64), stimer cleanup happens 181 * relatively early in the CPU offlining process. We 182 * must unbind the stimer-based clockevent device so 183 * that the LAPIC timer can take over until clockevents 184 * are no longer needed in the offlining process. Note 185 * that clockevents_unbind_device() eventually calls 186 * hv_ce_shutdown(). 187 * 188 * The unbind should not be done when Direct Mode is 189 * enabled because we may be on an architecture where 190 * there are no other clockevent devices to fallback to. 191 */ 192 ce = per_cpu_ptr(hv_clock_event, cpu); 193 if (direct_mode_enabled) 194 hv_ce_shutdown(ce); 195 else 196 clockevents_unbind_device(ce, cpu); 197 198 return 0; 199 } 200 EXPORT_SYMBOL_GPL(hv_stimer_cleanup); 201 202 /* 203 * These placeholders are overridden by arch specific code on 204 * architectures that need special setup of the stimer0 IRQ because 205 * they don't support per-cpu IRQs (such as x86/x64). 206 */ 207 void __weak hv_setup_stimer0_handler(void (*handler)(void)) 208 { 209 }; 210 211 void __weak hv_remove_stimer0_handler(void) 212 { 213 }; 214 215 #ifdef CONFIG_ACPI 216 /* Called only on architectures with per-cpu IRQs (i.e., not x86/x64) */ 217 static int hv_setup_stimer0_irq(void) 218 { 219 int ret; 220 221 ret = acpi_register_gsi(NULL, HYPERV_STIMER0_VECTOR, 222 ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_HIGH); 223 if (ret < 0) { 224 pr_err("Can't register Hyper-V stimer0 GSI. Error %d", ret); 225 return ret; 226 } 227 stimer0_irq = ret; 228 229 ret = request_percpu_irq(stimer0_irq, hv_stimer0_percpu_isr, 230 "Hyper-V stimer0", &stimer0_evt); 231 if (ret) { 232 pr_err("Can't request Hyper-V stimer0 IRQ %d. Error %d", 233 stimer0_irq, ret); 234 acpi_unregister_gsi(stimer0_irq); 235 stimer0_irq = -1; 236 } 237 return ret; 238 } 239 240 static void hv_remove_stimer0_irq(void) 241 { 242 if (stimer0_irq == -1) { 243 hv_remove_stimer0_handler(); 244 } else { 245 free_percpu_irq(stimer0_irq, &stimer0_evt); 246 acpi_unregister_gsi(stimer0_irq); 247 stimer0_irq = -1; 248 } 249 } 250 #else 251 static int hv_setup_stimer0_irq(void) 252 { 253 return 0; 254 } 255 256 static void hv_remove_stimer0_irq(void) 257 { 258 } 259 #endif 260 261 /* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */ 262 int hv_stimer_alloc(bool have_percpu_irqs) 263 { 264 int ret; 265 266 /* 267 * Synthetic timers are always available except on old versions of 268 * Hyper-V on x86. In that case, return as error as Linux will use a 269 * clockevent based on emulated LAPIC timer hardware. 270 */ 271 if (!(ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE)) 272 return -EINVAL; 273 274 hv_clock_event = alloc_percpu(struct clock_event_device); 275 if (!hv_clock_event) 276 return -ENOMEM; 277 278 direct_mode_enabled = ms_hyperv.misc_features & 279 HV_STIMER_DIRECT_MODE_AVAILABLE; 280 281 /* 282 * If Direct Mode isn't enabled, the remainder of the initialization 283 * is done later by hv_stimer_legacy_init() 284 */ 285 if (!direct_mode_enabled) 286 return 0; 287 288 if (have_percpu_irqs) { 289 ret = hv_setup_stimer0_irq(); 290 if (ret) 291 goto free_clock_event; 292 } else { 293 hv_setup_stimer0_handler(hv_stimer0_isr); 294 } 295 296 /* 297 * Since we are in Direct Mode, stimer initialization 298 * can be done now with a CPUHP value in the same range 299 * as other clockevent devices. 300 */ 301 ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING, 302 "clockevents/hyperv/stimer:starting", 303 hv_stimer_init, hv_stimer_cleanup); 304 if (ret < 0) { 305 hv_remove_stimer0_irq(); 306 goto free_clock_event; 307 } 308 return ret; 309 310 free_clock_event: 311 free_percpu(hv_clock_event); 312 hv_clock_event = NULL; 313 return ret; 314 } 315 EXPORT_SYMBOL_GPL(hv_stimer_alloc); 316 317 /* 318 * hv_stimer_legacy_init -- Called from the VMbus driver to handle 319 * the case when Direct Mode is not enabled, and the stimer 320 * must be initialized late in the CPU onlining process. 321 * 322 */ 323 void hv_stimer_legacy_init(unsigned int cpu, int sint) 324 { 325 if (direct_mode_enabled) 326 return; 327 328 /* 329 * This function gets called by each vCPU, so setting the 330 * global stimer_message_sint value each time is conceptually 331 * not ideal, but the value passed in is always the same and 332 * it avoids introducing yet another interface into this 333 * clocksource driver just to set the sint in the legacy case. 334 */ 335 stimer0_message_sint = sint; 336 (void)hv_stimer_init(cpu); 337 } 338 EXPORT_SYMBOL_GPL(hv_stimer_legacy_init); 339 340 /* 341 * hv_stimer_legacy_cleanup -- Called from the VMbus driver to 342 * handle the case when Direct Mode is not enabled, and the 343 * stimer must be cleaned up early in the CPU offlining 344 * process. 345 */ 346 void hv_stimer_legacy_cleanup(unsigned int cpu) 347 { 348 if (direct_mode_enabled) 349 return; 350 (void)hv_stimer_cleanup(cpu); 351 } 352 EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup); 353 354 /* 355 * Do a global cleanup of clockevents for the cases of kexec and 356 * vmbus exit 357 */ 358 void hv_stimer_global_cleanup(void) 359 { 360 int cpu; 361 362 /* 363 * hv_stime_legacy_cleanup() will stop the stimer if Direct 364 * Mode is not enabled, and fallback to the LAPIC timer. 365 */ 366 for_each_present_cpu(cpu) { 367 hv_stimer_legacy_cleanup(cpu); 368 } 369 370 if (!hv_clock_event) 371 return; 372 373 if (direct_mode_enabled) { 374 cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING); 375 hv_remove_stimer0_irq(); 376 stimer0_irq = -1; 377 } 378 free_percpu(hv_clock_event); 379 hv_clock_event = NULL; 380 381 } 382 EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); 383 384 static __always_inline u64 read_hv_clock_msr(void) 385 { 386 /* 387 * Read the partition counter to get the current tick count. This count 388 * is set to 0 when the partition is created and is incremented in 100 389 * nanosecond units. 390 * 391 * Use hv_raw_get_msr() because this function is used from 392 * noinstr. Notable; while HV_MSR_TIME_REF_COUNT is a synthetic 393 * register it doesn't need the GHCB path. 394 */ 395 return hv_raw_get_msr(HV_MSR_TIME_REF_COUNT); 396 } 397 398 /* 399 * Code and definitions for the Hyper-V clocksources. Two 400 * clocksources are defined: one that reads the Hyper-V defined MSR, and 401 * the other that uses the TSC reference page feature as defined in the 402 * TLFS. The MSR version is for compatibility with old versions of 403 * Hyper-V and 32-bit x86. The TSC reference page version is preferred. 404 */ 405 406 static union { 407 struct ms_hyperv_tsc_page page; 408 u8 reserved[PAGE_SIZE]; 409 } tsc_pg __bss_decrypted __aligned(PAGE_SIZE); 410 411 static struct ms_hyperv_tsc_page *tsc_page = &tsc_pg.page; 412 static unsigned long tsc_pfn; 413 414 unsigned long hv_get_tsc_pfn(void) 415 { 416 return tsc_pfn; 417 } 418 EXPORT_SYMBOL_GPL(hv_get_tsc_pfn); 419 420 struct ms_hyperv_tsc_page *hv_get_tsc_page(void) 421 { 422 return tsc_page; 423 } 424 EXPORT_SYMBOL_GPL(hv_get_tsc_page); 425 426 static __always_inline u64 read_hv_clock_tsc(void) 427 { 428 u64 cur_tsc, time; 429 430 /* 431 * The Hyper-V Top-Level Function Spec (TLFS), section Timers, 432 * subsection Refererence Counter, guarantees that the TSC and MSR 433 * times are in sync and monotonic. Therefore we can fall back 434 * to the MSR in case the TSC page indicates unavailability. 435 */ 436 if (!hv_read_tsc_page_tsc(tsc_page, &cur_tsc, &time)) 437 time = read_hv_clock_msr(); 438 439 return time; 440 } 441 442 static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg) 443 { 444 return read_hv_clock_tsc(); 445 } 446 447 static u64 noinstr read_hv_sched_clock_tsc(void) 448 { 449 return (read_hv_clock_tsc() - hv_sched_clock_offset) * 450 (NSEC_PER_SEC / HV_CLOCK_HZ); 451 } 452 453 static void suspend_hv_clock_tsc(struct clocksource *arg) 454 { 455 union hv_reference_tsc_msr tsc_msr; 456 457 /* Disable the TSC page */ 458 tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); 459 tsc_msr.enable = 0; 460 hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); 461 } 462 463 464 static void resume_hv_clock_tsc(struct clocksource *arg) 465 { 466 union hv_reference_tsc_msr tsc_msr; 467 468 /* Re-enable the TSC page */ 469 tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); 470 tsc_msr.enable = 1; 471 tsc_msr.pfn = tsc_pfn; 472 hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); 473 } 474 475 /* 476 * Called during resume from hibernation, from overridden 477 * x86_platform.restore_sched_clock_state routine. This is to adjust offsets 478 * used to calculate time for hv tsc page based sched_clock, to account for 479 * time spent before hibernation. 480 */ 481 void hv_adj_sched_clock_offset(u64 offset) 482 { 483 hv_sched_clock_offset -= offset; 484 } 485 486 #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK 487 static int hv_cs_enable(struct clocksource *cs) 488 { 489 vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); 490 return 0; 491 } 492 #endif 493 494 static struct clocksource hyperv_cs_tsc = { 495 .name = "hyperv_clocksource_tsc_page", 496 .rating = 500, 497 .read = read_hv_clock_tsc_cs, 498 .mask = CLOCKSOURCE_MASK(64), 499 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 500 .suspend= suspend_hv_clock_tsc, 501 .resume = resume_hv_clock_tsc, 502 #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK 503 .enable = hv_cs_enable, 504 .vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK, 505 #else 506 .vdso_clock_mode = VDSO_CLOCKMODE_NONE, 507 #endif 508 }; 509 510 static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) 511 { 512 return read_hv_clock_msr(); 513 } 514 515 static struct clocksource hyperv_cs_msr = { 516 .name = "hyperv_clocksource_msr", 517 .rating = 495, 518 .read = read_hv_clock_msr_cs, 519 .mask = CLOCKSOURCE_MASK(64), 520 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 521 }; 522 523 /* 524 * Reference to pv_ops must be inline so objtool 525 * detection of noinstr violations can work correctly. 526 */ 527 #ifdef CONFIG_GENERIC_SCHED_CLOCK 528 static __always_inline void hv_setup_sched_clock(void *sched_clock) 529 { 530 /* 531 * We're on an architecture with generic sched clock (not x86/x64). 532 * The Hyper-V sched clock read function returns nanoseconds, not 533 * the normal 100ns units of the Hyper-V synthetic clock. 534 */ 535 sched_clock_register(sched_clock, 64, NSEC_PER_SEC); 536 } 537 #elif defined CONFIG_PARAVIRT 538 static __always_inline void hv_setup_sched_clock(void *sched_clock) 539 { 540 /* We're on x86/x64 *and* using PV ops */ 541 paravirt_set_sched_clock(sched_clock); 542 } 543 #else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */ 544 static __always_inline void hv_setup_sched_clock(void *sched_clock) {} 545 #endif /* CONFIG_GENERIC_SCHED_CLOCK */ 546 547 static void __init hv_init_tsc_clocksource(void) 548 { 549 union hv_reference_tsc_msr tsc_msr; 550 551 /* 552 * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly 553 * handles frequency and offset changes due to live migration, 554 * pause/resume, and other VM management operations. So lower the 555 * Hyper-V Reference TSC rating, causing the generic TSC to be used. 556 * TSC_INVARIANT is not offered on ARM64, so the Hyper-V Reference 557 * TSC will be preferred over the virtualized ARM64 arch counter. 558 */ 559 if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { 560 hyperv_cs_tsc.rating = 250; 561 hyperv_cs_msr.rating = 245; 562 } 563 564 if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 565 return; 566 567 hv_read_reference_counter = read_hv_clock_tsc; 568 569 /* 570 * TSC page mapping works differently in root compared to guest. 571 * - In guest partition the guest PFN has to be passed to the 572 * hypervisor. 573 * - In root partition it's other way around: it has to map the PFN 574 * provided by the hypervisor. 575 * But it can't be mapped right here as it's too early and MMU isn't 576 * ready yet. So, we only set the enable bit here and will remap the 577 * page later in hv_remap_tsc_clocksource(). 578 * 579 * It worth mentioning, that TSC clocksource read function 580 * (read_hv_clock_tsc) has a MSR-based fallback mechanism, used when 581 * TSC page is zeroed (which is the case until the PFN is remapped) and 582 * thus TSC clocksource will work even without the real TSC page 583 * mapped. 584 */ 585 tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); 586 if (hv_root_partition()) 587 tsc_pfn = tsc_msr.pfn; 588 else 589 tsc_pfn = HVPFN_DOWN(virt_to_phys(tsc_page)); 590 tsc_msr.enable = 1; 591 tsc_msr.pfn = tsc_pfn; 592 hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); 593 594 clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); 595 596 /* 597 * If TSC is invariant, then let it stay as the sched clock since it 598 * will be faster than reading the TSC page. But if not invariant, use 599 * the TSC page so that live migrations across hosts with different 600 * frequencies is handled correctly. 601 */ 602 if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) { 603 hv_sched_clock_offset = hv_read_reference_counter(); 604 hv_setup_sched_clock(read_hv_sched_clock_tsc); 605 } 606 } 607 608 void __init hv_init_clocksource(void) 609 { 610 /* 611 * Try to set up the TSC page clocksource, then the MSR clocksource. 612 * At least one of these will always be available except on very old 613 * versions of Hyper-V on x86. In that case we won't have a Hyper-V 614 * clocksource, but Linux will still run with a clocksource based 615 * on the emulated PIT or LAPIC timer. 616 * 617 * Never use the MSR clocksource as sched clock. It's too slow. 618 * Better to use the native sched clock as the fallback. 619 */ 620 hv_init_tsc_clocksource(); 621 622 if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE) 623 clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); 624 } 625 626 void __init hv_remap_tsc_clocksource(void) 627 { 628 if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 629 return; 630 631 if (!hv_root_partition()) { 632 WARN(1, "%s: attempt to remap TSC page in guest partition\n", 633 __func__); 634 return; 635 } 636 637 tsc_page = memremap(tsc_pfn << HV_HYP_PAGE_SHIFT, sizeof(tsc_pg), 638 MEMREMAP_WB); 639 if (!tsc_page) 640 pr_err("Failed to remap Hyper-V TSC page.\n"); 641 } 642