1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * Clocksource driver for the synthetic counter and timers 5 * provided by the Hyper-V hypervisor to guest VMs, as described 6 * in the Hyper-V Top Level Functional Spec (TLFS). This driver 7 * is instruction set architecture independent. 8 * 9 * Copyright (C) 2019, Microsoft, Inc. 10 * 11 * Author: Michael Kelley <mikelley@microsoft.com> 12 */ 13 14 #include <linux/percpu.h> 15 #include <linux/cpumask.h> 16 #include <linux/clockchips.h> 17 #include <linux/clocksource.h> 18 #include <linux/sched_clock.h> 19 #include <linux/mm.h> 20 #include <linux/cpuhotplug.h> 21 #include <linux/interrupt.h> 22 #include <linux/irq.h> 23 #include <linux/acpi.h> 24 #include <linux/hyperv.h> 25 #include <clocksource/hyperv_timer.h> 26 #include <asm/hyperv-tlfs.h> 27 #include <asm/mshyperv.h> 28 29 static struct clock_event_device __percpu *hv_clock_event; 30 static u64 hv_sched_clock_offset __ro_after_init; 31 32 /* 33 * If false, we're using the old mechanism for stimer0 interrupts 34 * where it sends a VMbus message when it expires. The old 35 * mechanism is used when running on older versions of Hyper-V 36 * that don't support Direct Mode. While Hyper-V provides 37 * four stimer's per CPU, Linux uses only stimer0. 38 * 39 * Because Direct Mode does not require processing a VMbus 40 * message, stimer interrupts can be enabled earlier in the 41 * process of booting a CPU, and consistent with when timer 42 * interrupts are enabled for other clocksource drivers. 43 * However, for legacy versions of Hyper-V when Direct Mode 44 * is not enabled, setting up stimer interrupts must be 45 * delayed until VMbus is initialized and can process the 46 * interrupt message. 47 */ 48 static bool direct_mode_enabled; 49 50 static int stimer0_irq = -1; 51 static int stimer0_message_sint; 52 static __maybe_unused DEFINE_PER_CPU(long, stimer0_evt); 53 54 /* 55 * Common code for stimer0 interrupts coming via Direct Mode or 56 * as a VMbus message. 57 */ 58 void hv_stimer0_isr(void) 59 { 60 struct clock_event_device *ce; 61 62 ce = this_cpu_ptr(hv_clock_event); 63 ce->event_handler(ce); 64 } 65 EXPORT_SYMBOL_GPL(hv_stimer0_isr); 66 67 /* 68 * stimer0 interrupt handler for architectures that support 69 * per-cpu interrupts, which also implies Direct Mode. 70 */ 71 static irqreturn_t __maybe_unused hv_stimer0_percpu_isr(int irq, void *dev_id) 72 { 73 hv_stimer0_isr(); 74 return IRQ_HANDLED; 75 } 76 77 static int hv_ce_set_next_event(unsigned long delta, 78 struct clock_event_device *evt) 79 { 80 u64 current_tick; 81 82 current_tick = hv_read_reference_counter(); 83 current_tick += delta; 84 hv_set_msr(HV_MSR_STIMER0_COUNT, current_tick); 85 return 0; 86 } 87 88 static int hv_ce_shutdown(struct clock_event_device *evt) 89 { 90 hv_set_msr(HV_MSR_STIMER0_COUNT, 0); 91 hv_set_msr(HV_MSR_STIMER0_CONFIG, 0); 92 if (direct_mode_enabled && stimer0_irq >= 0) 93 disable_percpu_irq(stimer0_irq); 94 95 return 0; 96 } 97 98 static int hv_ce_set_oneshot(struct clock_event_device *evt) 99 { 100 union hv_stimer_config timer_cfg; 101 102 timer_cfg.as_uint64 = 0; 103 timer_cfg.enable = 1; 104 timer_cfg.auto_enable = 1; 105 if (direct_mode_enabled) { 106 /* 107 * When it expires, the timer will directly interrupt 108 * on the specified hardware vector/IRQ. 109 */ 110 timer_cfg.direct_mode = 1; 111 timer_cfg.apic_vector = HYPERV_STIMER0_VECTOR; 112 if (stimer0_irq >= 0) 113 enable_percpu_irq(stimer0_irq, IRQ_TYPE_NONE); 114 } else { 115 /* 116 * When it expires, the timer will generate a VMbus message, 117 * to be handled by the normal VMbus interrupt handler. 118 */ 119 timer_cfg.direct_mode = 0; 120 timer_cfg.sintx = stimer0_message_sint; 121 } 122 hv_set_msr(HV_MSR_STIMER0_CONFIG, timer_cfg.as_uint64); 123 return 0; 124 } 125 126 /* 127 * hv_stimer_init - Per-cpu initialization of the clockevent 128 */ 129 static int hv_stimer_init(unsigned int cpu) 130 { 131 struct clock_event_device *ce; 132 133 if (!hv_clock_event) 134 return 0; 135 136 ce = per_cpu_ptr(hv_clock_event, cpu); 137 ce->name = "Hyper-V clockevent"; 138 ce->features = CLOCK_EVT_FEAT_ONESHOT; 139 ce->cpumask = cpumask_of(cpu); 140 141 /* 142 * Lower the rating of the Hyper-V timer in a TDX VM without paravisor, 143 * so the local APIC timer (lapic_clockevent) is the default timer in 144 * such a VM. The Hyper-V timer is not preferred in such a VM because 145 * it depends on the slow VM Reference Counter MSR (the Hyper-V TSC 146 * page is not enbled in such a VM because the VM uses Invariant TSC 147 * as a better clocksource and it's challenging to mark the Hyper-V 148 * TSC page shared in very early boot). 149 */ 150 if (!ms_hyperv.paravisor_present && hv_isolation_type_tdx()) 151 ce->rating = 90; 152 else 153 ce->rating = 1000; 154 155 ce->set_state_shutdown = hv_ce_shutdown; 156 ce->set_state_oneshot = hv_ce_set_oneshot; 157 ce->set_next_event = hv_ce_set_next_event; 158 159 clockevents_config_and_register(ce, 160 HV_CLOCK_HZ, 161 HV_MIN_DELTA_TICKS, 162 HV_MAX_MAX_DELTA_TICKS); 163 return 0; 164 } 165 166 /* 167 * hv_stimer_cleanup - Per-cpu cleanup of the clockevent 168 */ 169 int hv_stimer_cleanup(unsigned int cpu) 170 { 171 struct clock_event_device *ce; 172 173 if (!hv_clock_event) 174 return 0; 175 176 /* 177 * In the legacy case where Direct Mode is not enabled 178 * (which can only be on x86/64), stimer cleanup happens 179 * relatively early in the CPU offlining process. We 180 * must unbind the stimer-based clockevent device so 181 * that the LAPIC timer can take over until clockevents 182 * are no longer needed in the offlining process. Note 183 * that clockevents_unbind_device() eventually calls 184 * hv_ce_shutdown(). 185 * 186 * The unbind should not be done when Direct Mode is 187 * enabled because we may be on an architecture where 188 * there are no other clockevent devices to fallback to. 189 */ 190 ce = per_cpu_ptr(hv_clock_event, cpu); 191 if (direct_mode_enabled) 192 hv_ce_shutdown(ce); 193 else 194 clockevents_unbind_device(ce, cpu); 195 196 return 0; 197 } 198 EXPORT_SYMBOL_GPL(hv_stimer_cleanup); 199 200 /* 201 * These placeholders are overridden by arch specific code on 202 * architectures that need special setup of the stimer0 IRQ because 203 * they don't support per-cpu IRQs (such as x86/x64). 204 */ 205 void __weak hv_setup_stimer0_handler(void (*handler)(void)) 206 { 207 }; 208 209 void __weak hv_remove_stimer0_handler(void) 210 { 211 }; 212 213 #ifdef CONFIG_ACPI 214 /* Called only on architectures with per-cpu IRQs (i.e., not x86/x64) */ 215 static int hv_setup_stimer0_irq(void) 216 { 217 int ret; 218 219 ret = acpi_register_gsi(NULL, HYPERV_STIMER0_VECTOR, 220 ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_HIGH); 221 if (ret < 0) { 222 pr_err("Can't register Hyper-V stimer0 GSI. Error %d", ret); 223 return ret; 224 } 225 stimer0_irq = ret; 226 227 ret = request_percpu_irq(stimer0_irq, hv_stimer0_percpu_isr, 228 "Hyper-V stimer0", &stimer0_evt); 229 if (ret) { 230 pr_err("Can't request Hyper-V stimer0 IRQ %d. Error %d", 231 stimer0_irq, ret); 232 acpi_unregister_gsi(stimer0_irq); 233 stimer0_irq = -1; 234 } 235 return ret; 236 } 237 238 static void hv_remove_stimer0_irq(void) 239 { 240 if (stimer0_irq == -1) { 241 hv_remove_stimer0_handler(); 242 } else { 243 free_percpu_irq(stimer0_irq, &stimer0_evt); 244 acpi_unregister_gsi(stimer0_irq); 245 stimer0_irq = -1; 246 } 247 } 248 #else 249 static int hv_setup_stimer0_irq(void) 250 { 251 return 0; 252 } 253 254 static void hv_remove_stimer0_irq(void) 255 { 256 } 257 #endif 258 259 /* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */ 260 int hv_stimer_alloc(bool have_percpu_irqs) 261 { 262 int ret; 263 264 /* 265 * Synthetic timers are always available except on old versions of 266 * Hyper-V on x86. In that case, return as error as Linux will use a 267 * clockevent based on emulated LAPIC timer hardware. 268 */ 269 if (!(ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE)) 270 return -EINVAL; 271 272 hv_clock_event = alloc_percpu(struct clock_event_device); 273 if (!hv_clock_event) 274 return -ENOMEM; 275 276 direct_mode_enabled = ms_hyperv.misc_features & 277 HV_STIMER_DIRECT_MODE_AVAILABLE; 278 279 /* 280 * If Direct Mode isn't enabled, the remainder of the initialization 281 * is done later by hv_stimer_legacy_init() 282 */ 283 if (!direct_mode_enabled) 284 return 0; 285 286 if (have_percpu_irqs) { 287 ret = hv_setup_stimer0_irq(); 288 if (ret) 289 goto free_clock_event; 290 } else { 291 hv_setup_stimer0_handler(hv_stimer0_isr); 292 } 293 294 /* 295 * Since we are in Direct Mode, stimer initialization 296 * can be done now with a CPUHP value in the same range 297 * as other clockevent devices. 298 */ 299 ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING, 300 "clockevents/hyperv/stimer:starting", 301 hv_stimer_init, hv_stimer_cleanup); 302 if (ret < 0) { 303 hv_remove_stimer0_irq(); 304 goto free_clock_event; 305 } 306 return ret; 307 308 free_clock_event: 309 free_percpu(hv_clock_event); 310 hv_clock_event = NULL; 311 return ret; 312 } 313 EXPORT_SYMBOL_GPL(hv_stimer_alloc); 314 315 /* 316 * hv_stimer_legacy_init -- Called from the VMbus driver to handle 317 * the case when Direct Mode is not enabled, and the stimer 318 * must be initialized late in the CPU onlining process. 319 * 320 */ 321 void hv_stimer_legacy_init(unsigned int cpu, int sint) 322 { 323 if (direct_mode_enabled) 324 return; 325 326 /* 327 * This function gets called by each vCPU, so setting the 328 * global stimer_message_sint value each time is conceptually 329 * not ideal, but the value passed in is always the same and 330 * it avoids introducing yet another interface into this 331 * clocksource driver just to set the sint in the legacy case. 332 */ 333 stimer0_message_sint = sint; 334 (void)hv_stimer_init(cpu); 335 } 336 EXPORT_SYMBOL_GPL(hv_stimer_legacy_init); 337 338 /* 339 * hv_stimer_legacy_cleanup -- Called from the VMbus driver to 340 * handle the case when Direct Mode is not enabled, and the 341 * stimer must be cleaned up early in the CPU offlining 342 * process. 343 */ 344 void hv_stimer_legacy_cleanup(unsigned int cpu) 345 { 346 if (direct_mode_enabled) 347 return; 348 (void)hv_stimer_cleanup(cpu); 349 } 350 EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup); 351 352 /* 353 * Do a global cleanup of clockevents for the cases of kexec and 354 * vmbus exit 355 */ 356 void hv_stimer_global_cleanup(void) 357 { 358 int cpu; 359 360 /* 361 * hv_stime_legacy_cleanup() will stop the stimer if Direct 362 * Mode is not enabled, and fallback to the LAPIC timer. 363 */ 364 for_each_present_cpu(cpu) { 365 hv_stimer_legacy_cleanup(cpu); 366 } 367 368 if (!hv_clock_event) 369 return; 370 371 if (direct_mode_enabled) { 372 cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING); 373 hv_remove_stimer0_irq(); 374 stimer0_irq = -1; 375 } 376 free_percpu(hv_clock_event); 377 hv_clock_event = NULL; 378 379 } 380 EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); 381 382 static __always_inline u64 read_hv_clock_msr(void) 383 { 384 /* 385 * Read the partition counter to get the current tick count. This count 386 * is set to 0 when the partition is created and is incremented in 100 387 * nanosecond units. 388 * 389 * Use hv_raw_get_msr() because this function is used from 390 * noinstr. Notable; while HV_MSR_TIME_REF_COUNT is a synthetic 391 * register it doesn't need the GHCB path. 392 */ 393 return hv_raw_get_msr(HV_MSR_TIME_REF_COUNT); 394 } 395 396 /* 397 * Code and definitions for the Hyper-V clocksources. Two 398 * clocksources are defined: one that reads the Hyper-V defined MSR, and 399 * the other that uses the TSC reference page feature as defined in the 400 * TLFS. The MSR version is for compatibility with old versions of 401 * Hyper-V and 32-bit x86. The TSC reference page version is preferred. 402 */ 403 404 static union { 405 struct ms_hyperv_tsc_page page; 406 u8 reserved[PAGE_SIZE]; 407 } tsc_pg __bss_decrypted __aligned(PAGE_SIZE); 408 409 static struct ms_hyperv_tsc_page *tsc_page = &tsc_pg.page; 410 static unsigned long tsc_pfn; 411 412 unsigned long hv_get_tsc_pfn(void) 413 { 414 return tsc_pfn; 415 } 416 EXPORT_SYMBOL_GPL(hv_get_tsc_pfn); 417 418 struct ms_hyperv_tsc_page *hv_get_tsc_page(void) 419 { 420 return tsc_page; 421 } 422 EXPORT_SYMBOL_GPL(hv_get_tsc_page); 423 424 static __always_inline u64 read_hv_clock_tsc(void) 425 { 426 u64 cur_tsc, time; 427 428 /* 429 * The Hyper-V Top-Level Function Spec (TLFS), section Timers, 430 * subsection Refererence Counter, guarantees that the TSC and MSR 431 * times are in sync and monotonic. Therefore we can fall back 432 * to the MSR in case the TSC page indicates unavailability. 433 */ 434 if (!hv_read_tsc_page_tsc(tsc_page, &cur_tsc, &time)) 435 time = read_hv_clock_msr(); 436 437 return time; 438 } 439 440 static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg) 441 { 442 return read_hv_clock_tsc(); 443 } 444 445 static u64 noinstr read_hv_sched_clock_tsc(void) 446 { 447 return (read_hv_clock_tsc() - hv_sched_clock_offset) * 448 (NSEC_PER_SEC / HV_CLOCK_HZ); 449 } 450 451 static void suspend_hv_clock_tsc(struct clocksource *arg) 452 { 453 union hv_reference_tsc_msr tsc_msr; 454 455 /* Disable the TSC page */ 456 tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); 457 tsc_msr.enable = 0; 458 hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); 459 } 460 461 462 static void resume_hv_clock_tsc(struct clocksource *arg) 463 { 464 union hv_reference_tsc_msr tsc_msr; 465 466 /* Re-enable the TSC page */ 467 tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); 468 tsc_msr.enable = 1; 469 tsc_msr.pfn = tsc_pfn; 470 hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); 471 } 472 473 #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK 474 static int hv_cs_enable(struct clocksource *cs) 475 { 476 vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); 477 return 0; 478 } 479 #endif 480 481 static struct clocksource hyperv_cs_tsc = { 482 .name = "hyperv_clocksource_tsc_page", 483 .rating = 500, 484 .read = read_hv_clock_tsc_cs, 485 .mask = CLOCKSOURCE_MASK(64), 486 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 487 .suspend= suspend_hv_clock_tsc, 488 .resume = resume_hv_clock_tsc, 489 #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK 490 .enable = hv_cs_enable, 491 .vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK, 492 #else 493 .vdso_clock_mode = VDSO_CLOCKMODE_NONE, 494 #endif 495 }; 496 497 static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) 498 { 499 return read_hv_clock_msr(); 500 } 501 502 static struct clocksource hyperv_cs_msr = { 503 .name = "hyperv_clocksource_msr", 504 .rating = 495, 505 .read = read_hv_clock_msr_cs, 506 .mask = CLOCKSOURCE_MASK(64), 507 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 508 }; 509 510 /* 511 * Reference to pv_ops must be inline so objtool 512 * detection of noinstr violations can work correctly. 513 */ 514 #ifdef CONFIG_GENERIC_SCHED_CLOCK 515 static __always_inline void hv_setup_sched_clock(void *sched_clock) 516 { 517 /* 518 * We're on an architecture with generic sched clock (not x86/x64). 519 * The Hyper-V sched clock read function returns nanoseconds, not 520 * the normal 100ns units of the Hyper-V synthetic clock. 521 */ 522 sched_clock_register(sched_clock, 64, NSEC_PER_SEC); 523 } 524 #elif defined CONFIG_PARAVIRT 525 static __always_inline void hv_setup_sched_clock(void *sched_clock) 526 { 527 /* We're on x86/x64 *and* using PV ops */ 528 paravirt_set_sched_clock(sched_clock); 529 } 530 #else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */ 531 static __always_inline void hv_setup_sched_clock(void *sched_clock) {} 532 #endif /* CONFIG_GENERIC_SCHED_CLOCK */ 533 534 static void __init hv_init_tsc_clocksource(void) 535 { 536 union hv_reference_tsc_msr tsc_msr; 537 538 /* 539 * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly 540 * handles frequency and offset changes due to live migration, 541 * pause/resume, and other VM management operations. So lower the 542 * Hyper-V Reference TSC rating, causing the generic TSC to be used. 543 * TSC_INVARIANT is not offered on ARM64, so the Hyper-V Reference 544 * TSC will be preferred over the virtualized ARM64 arch counter. 545 */ 546 if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { 547 hyperv_cs_tsc.rating = 250; 548 hyperv_cs_msr.rating = 245; 549 } 550 551 if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 552 return; 553 554 hv_read_reference_counter = read_hv_clock_tsc; 555 556 /* 557 * TSC page mapping works differently in root compared to guest. 558 * - In guest partition the guest PFN has to be passed to the 559 * hypervisor. 560 * - In root partition it's other way around: it has to map the PFN 561 * provided by the hypervisor. 562 * But it can't be mapped right here as it's too early and MMU isn't 563 * ready yet. So, we only set the enable bit here and will remap the 564 * page later in hv_remap_tsc_clocksource(). 565 * 566 * It worth mentioning, that TSC clocksource read function 567 * (read_hv_clock_tsc) has a MSR-based fallback mechanism, used when 568 * TSC page is zeroed (which is the case until the PFN is remapped) and 569 * thus TSC clocksource will work even without the real TSC page 570 * mapped. 571 */ 572 tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); 573 if (hv_root_partition) 574 tsc_pfn = tsc_msr.pfn; 575 else 576 tsc_pfn = HVPFN_DOWN(virt_to_phys(tsc_page)); 577 tsc_msr.enable = 1; 578 tsc_msr.pfn = tsc_pfn; 579 hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); 580 581 clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); 582 583 /* 584 * If TSC is invariant, then let it stay as the sched clock since it 585 * will be faster than reading the TSC page. But if not invariant, use 586 * the TSC page so that live migrations across hosts with different 587 * frequencies is handled correctly. 588 */ 589 if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) { 590 hv_sched_clock_offset = hv_read_reference_counter(); 591 hv_setup_sched_clock(read_hv_sched_clock_tsc); 592 } 593 } 594 595 void __init hv_init_clocksource(void) 596 { 597 /* 598 * Try to set up the TSC page clocksource, then the MSR clocksource. 599 * At least one of these will always be available except on very old 600 * versions of Hyper-V on x86. In that case we won't have a Hyper-V 601 * clocksource, but Linux will still run with a clocksource based 602 * on the emulated PIT or LAPIC timer. 603 * 604 * Never use the MSR clocksource as sched clock. It's too slow. 605 * Better to use the native sched clock as the fallback. 606 */ 607 hv_init_tsc_clocksource(); 608 609 if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE) 610 clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); 611 } 612 613 void __init hv_remap_tsc_clocksource(void) 614 { 615 if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 616 return; 617 618 if (!hv_root_partition) { 619 WARN(1, "%s: attempt to remap TSC page in guest partition\n", 620 __func__); 621 return; 622 } 623 624 tsc_page = memremap(tsc_pfn << HV_HYP_PAGE_SHIFT, sizeof(tsc_pg), 625 MEMREMAP_WB); 626 if (!tsc_page) 627 pr_err("Failed to remap Hyper-V TSC page.\n"); 628 } 629