1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * Clocksource driver for the synthetic counter and timers 5 * provided by the Hyper-V hypervisor to guest VMs, as described 6 * in the Hyper-V Top Level Functional Spec (TLFS). This driver 7 * is instruction set architecture independent. 8 * 9 * Copyright (C) 2019, Microsoft, Inc. 10 * 11 * Author: Michael Kelley <mikelley@microsoft.com> 12 */ 13 14 #include <linux/percpu.h> 15 #include <linux/cpumask.h> 16 #include <linux/clockchips.h> 17 #include <linux/clocksource.h> 18 #include <linux/sched_clock.h> 19 #include <linux/mm.h> 20 #include <linux/cpuhotplug.h> 21 #include <clocksource/hyperv_timer.h> 22 #include <asm/hyperv-tlfs.h> 23 #include <asm/mshyperv.h> 24 25 static struct clock_event_device __percpu *hv_clock_event; 26 static u64 hv_sched_clock_offset __ro_after_init; 27 28 /* 29 * If false, we're using the old mechanism for stimer0 interrupts 30 * where it sends a VMbus message when it expires. The old 31 * mechanism is used when running on older versions of Hyper-V 32 * that don't support Direct Mode. While Hyper-V provides 33 * four stimer's per CPU, Linux uses only stimer0. 34 * 35 * Because Direct Mode does not require processing a VMbus 36 * message, stimer interrupts can be enabled earlier in the 37 * process of booting a CPU, and consistent with when timer 38 * interrupts are enabled for other clocksource drivers. 39 * However, for legacy versions of Hyper-V when Direct Mode 40 * is not enabled, setting up stimer interrupts must be 41 * delayed until VMbus is initialized and can process the 42 * interrupt message. 43 */ 44 static bool direct_mode_enabled; 45 46 static int stimer0_irq; 47 static int stimer0_vector; 48 static int stimer0_message_sint; 49 50 /* 51 * ISR for when stimer0 is operating in Direct Mode. Direct Mode 52 * does not use VMbus or any VMbus messages, so process here and not 53 * in the VMbus driver code. 54 */ 55 void hv_stimer0_isr(void) 56 { 57 struct clock_event_device *ce; 58 59 ce = this_cpu_ptr(hv_clock_event); 60 ce->event_handler(ce); 61 } 62 EXPORT_SYMBOL_GPL(hv_stimer0_isr); 63 64 static int hv_ce_set_next_event(unsigned long delta, 65 struct clock_event_device *evt) 66 { 67 u64 current_tick; 68 69 current_tick = hv_read_reference_counter(); 70 current_tick += delta; 71 hv_init_timer(0, current_tick); 72 return 0; 73 } 74 75 static int hv_ce_shutdown(struct clock_event_device *evt) 76 { 77 hv_init_timer(0, 0); 78 hv_init_timer_config(0, 0); 79 if (direct_mode_enabled) 80 hv_disable_stimer0_percpu_irq(stimer0_irq); 81 82 return 0; 83 } 84 85 static int hv_ce_set_oneshot(struct clock_event_device *evt) 86 { 87 union hv_stimer_config timer_cfg; 88 89 timer_cfg.as_uint64 = 0; 90 timer_cfg.enable = 1; 91 timer_cfg.auto_enable = 1; 92 if (direct_mode_enabled) { 93 /* 94 * When it expires, the timer will directly interrupt 95 * on the specified hardware vector/IRQ. 96 */ 97 timer_cfg.direct_mode = 1; 98 timer_cfg.apic_vector = stimer0_vector; 99 hv_enable_stimer0_percpu_irq(stimer0_irq); 100 } else { 101 /* 102 * When it expires, the timer will generate a VMbus message, 103 * to be handled by the normal VMbus interrupt handler. 104 */ 105 timer_cfg.direct_mode = 0; 106 timer_cfg.sintx = stimer0_message_sint; 107 } 108 hv_init_timer_config(0, timer_cfg.as_uint64); 109 return 0; 110 } 111 112 /* 113 * hv_stimer_init - Per-cpu initialization of the clockevent 114 */ 115 static int hv_stimer_init(unsigned int cpu) 116 { 117 struct clock_event_device *ce; 118 119 if (!hv_clock_event) 120 return 0; 121 122 ce = per_cpu_ptr(hv_clock_event, cpu); 123 ce->name = "Hyper-V clockevent"; 124 ce->features = CLOCK_EVT_FEAT_ONESHOT; 125 ce->cpumask = cpumask_of(cpu); 126 ce->rating = 1000; 127 ce->set_state_shutdown = hv_ce_shutdown; 128 ce->set_state_oneshot = hv_ce_set_oneshot; 129 ce->set_next_event = hv_ce_set_next_event; 130 131 clockevents_config_and_register(ce, 132 HV_CLOCK_HZ, 133 HV_MIN_DELTA_TICKS, 134 HV_MAX_MAX_DELTA_TICKS); 135 return 0; 136 } 137 138 /* 139 * hv_stimer_cleanup - Per-cpu cleanup of the clockevent 140 */ 141 int hv_stimer_cleanup(unsigned int cpu) 142 { 143 struct clock_event_device *ce; 144 145 if (!hv_clock_event) 146 return 0; 147 148 /* 149 * In the legacy case where Direct Mode is not enabled 150 * (which can only be on x86/64), stimer cleanup happens 151 * relatively early in the CPU offlining process. We 152 * must unbind the stimer-based clockevent device so 153 * that the LAPIC timer can take over until clockevents 154 * are no longer needed in the offlining process. Note 155 * that clockevents_unbind_device() eventually calls 156 * hv_ce_shutdown(). 157 * 158 * The unbind should not be done when Direct Mode is 159 * enabled because we may be on an architecture where 160 * there are no other clockevent devices to fallback to. 161 */ 162 ce = per_cpu_ptr(hv_clock_event, cpu); 163 if (direct_mode_enabled) 164 hv_ce_shutdown(ce); 165 else 166 clockevents_unbind_device(ce, cpu); 167 168 return 0; 169 } 170 EXPORT_SYMBOL_GPL(hv_stimer_cleanup); 171 172 /* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */ 173 int hv_stimer_alloc(void) 174 { 175 int ret = 0; 176 177 /* 178 * Synthetic timers are always available except on old versions of 179 * Hyper-V on x86. In that case, return as error as Linux will use a 180 * clockevent based on emulated LAPIC timer hardware. 181 */ 182 if (!(ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE)) 183 return -EINVAL; 184 185 hv_clock_event = alloc_percpu(struct clock_event_device); 186 if (!hv_clock_event) 187 return -ENOMEM; 188 189 direct_mode_enabled = ms_hyperv.misc_features & 190 HV_STIMER_DIRECT_MODE_AVAILABLE; 191 if (direct_mode_enabled) { 192 ret = hv_setup_stimer0_irq(&stimer0_irq, &stimer0_vector, 193 hv_stimer0_isr); 194 if (ret) 195 goto free_percpu; 196 197 /* 198 * Since we are in Direct Mode, stimer initialization 199 * can be done now with a CPUHP value in the same range 200 * as other clockevent devices. 201 */ 202 ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING, 203 "clockevents/hyperv/stimer:starting", 204 hv_stimer_init, hv_stimer_cleanup); 205 if (ret < 0) 206 goto free_stimer0_irq; 207 } 208 return ret; 209 210 free_stimer0_irq: 211 hv_remove_stimer0_irq(stimer0_irq); 212 stimer0_irq = 0; 213 free_percpu: 214 free_percpu(hv_clock_event); 215 hv_clock_event = NULL; 216 return ret; 217 } 218 EXPORT_SYMBOL_GPL(hv_stimer_alloc); 219 220 /* 221 * hv_stimer_legacy_init -- Called from the VMbus driver to handle 222 * the case when Direct Mode is not enabled, and the stimer 223 * must be initialized late in the CPU onlining process. 224 * 225 */ 226 void hv_stimer_legacy_init(unsigned int cpu, int sint) 227 { 228 if (direct_mode_enabled) 229 return; 230 231 /* 232 * This function gets called by each vCPU, so setting the 233 * global stimer_message_sint value each time is conceptually 234 * not ideal, but the value passed in is always the same and 235 * it avoids introducing yet another interface into this 236 * clocksource driver just to set the sint in the legacy case. 237 */ 238 stimer0_message_sint = sint; 239 (void)hv_stimer_init(cpu); 240 } 241 EXPORT_SYMBOL_GPL(hv_stimer_legacy_init); 242 243 /* 244 * hv_stimer_legacy_cleanup -- Called from the VMbus driver to 245 * handle the case when Direct Mode is not enabled, and the 246 * stimer must be cleaned up early in the CPU offlining 247 * process. 248 */ 249 void hv_stimer_legacy_cleanup(unsigned int cpu) 250 { 251 if (direct_mode_enabled) 252 return; 253 (void)hv_stimer_cleanup(cpu); 254 } 255 EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup); 256 257 258 /* hv_stimer_free - Free global resources allocated by hv_stimer_alloc() */ 259 void hv_stimer_free(void) 260 { 261 if (!hv_clock_event) 262 return; 263 264 if (direct_mode_enabled) { 265 cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING); 266 hv_remove_stimer0_irq(stimer0_irq); 267 stimer0_irq = 0; 268 } 269 free_percpu(hv_clock_event); 270 hv_clock_event = NULL; 271 } 272 EXPORT_SYMBOL_GPL(hv_stimer_free); 273 274 /* 275 * Do a global cleanup of clockevents for the cases of kexec and 276 * vmbus exit 277 */ 278 void hv_stimer_global_cleanup(void) 279 { 280 int cpu; 281 282 /* 283 * hv_stime_legacy_cleanup() will stop the stimer if Direct 284 * Mode is not enabled, and fallback to the LAPIC timer. 285 */ 286 for_each_present_cpu(cpu) { 287 hv_stimer_legacy_cleanup(cpu); 288 } 289 290 /* 291 * If Direct Mode is enabled, the cpuhp teardown callback 292 * (hv_stimer_cleanup) will be run on all CPUs to stop the 293 * stimers. 294 */ 295 hv_stimer_free(); 296 } 297 EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); 298 299 /* 300 * Code and definitions for the Hyper-V clocksources. Two 301 * clocksources are defined: one that reads the Hyper-V defined MSR, and 302 * the other that uses the TSC reference page feature as defined in the 303 * TLFS. The MSR version is for compatibility with old versions of 304 * Hyper-V and 32-bit x86. The TSC reference page version is preferred. 305 * 306 * The Hyper-V clocksource ratings of 250 are chosen to be below the 307 * TSC clocksource rating of 300. In configurations where Hyper-V offers 308 * an InvariantTSC, the TSC is not marked "unstable", so the TSC clocksource 309 * is available and preferred. With the higher rating, it will be the 310 * default. On older hardware and Hyper-V versions, the TSC is marked 311 * "unstable", so no TSC clocksource is created and the selected Hyper-V 312 * clocksource will be the default. 313 */ 314 315 u64 (*hv_read_reference_counter)(void); 316 EXPORT_SYMBOL_GPL(hv_read_reference_counter); 317 318 static union { 319 struct ms_hyperv_tsc_page page; 320 u8 reserved[PAGE_SIZE]; 321 } tsc_pg __aligned(PAGE_SIZE); 322 323 struct ms_hyperv_tsc_page *hv_get_tsc_page(void) 324 { 325 return &tsc_pg.page; 326 } 327 EXPORT_SYMBOL_GPL(hv_get_tsc_page); 328 329 static u64 notrace read_hv_clock_tsc(void) 330 { 331 u64 current_tick = hv_read_tsc_page(hv_get_tsc_page()); 332 333 if (current_tick == U64_MAX) 334 hv_get_time_ref_count(current_tick); 335 336 return current_tick; 337 } 338 339 static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg) 340 { 341 return read_hv_clock_tsc(); 342 } 343 344 static u64 notrace read_hv_sched_clock_tsc(void) 345 { 346 return (read_hv_clock_tsc() - hv_sched_clock_offset) * 347 (NSEC_PER_SEC / HV_CLOCK_HZ); 348 } 349 350 static void suspend_hv_clock_tsc(struct clocksource *arg) 351 { 352 u64 tsc_msr; 353 354 /* Disable the TSC page */ 355 hv_get_reference_tsc(tsc_msr); 356 tsc_msr &= ~BIT_ULL(0); 357 hv_set_reference_tsc(tsc_msr); 358 } 359 360 361 static void resume_hv_clock_tsc(struct clocksource *arg) 362 { 363 phys_addr_t phys_addr = virt_to_phys(&tsc_pg); 364 u64 tsc_msr; 365 366 /* Re-enable the TSC page */ 367 hv_get_reference_tsc(tsc_msr); 368 tsc_msr &= GENMASK_ULL(11, 0); 369 tsc_msr |= BIT_ULL(0) | (u64)phys_addr; 370 hv_set_reference_tsc(tsc_msr); 371 } 372 373 static int hv_cs_enable(struct clocksource *cs) 374 { 375 hv_enable_vdso_clocksource(); 376 return 0; 377 } 378 379 static struct clocksource hyperv_cs_tsc = { 380 .name = "hyperv_clocksource_tsc_page", 381 .rating = 250, 382 .read = read_hv_clock_tsc_cs, 383 .mask = CLOCKSOURCE_MASK(64), 384 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 385 .suspend= suspend_hv_clock_tsc, 386 .resume = resume_hv_clock_tsc, 387 .enable = hv_cs_enable, 388 }; 389 390 static u64 notrace read_hv_clock_msr(void) 391 { 392 u64 current_tick; 393 /* 394 * Read the partition counter to get the current tick count. This count 395 * is set to 0 when the partition is created and is incremented in 396 * 100 nanosecond units. 397 */ 398 hv_get_time_ref_count(current_tick); 399 return current_tick; 400 } 401 402 static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) 403 { 404 return read_hv_clock_msr(); 405 } 406 407 static u64 notrace read_hv_sched_clock_msr(void) 408 { 409 return (read_hv_clock_msr() - hv_sched_clock_offset) * 410 (NSEC_PER_SEC / HV_CLOCK_HZ); 411 } 412 413 static struct clocksource hyperv_cs_msr = { 414 .name = "hyperv_clocksource_msr", 415 .rating = 250, 416 .read = read_hv_clock_msr_cs, 417 .mask = CLOCKSOURCE_MASK(64), 418 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 419 }; 420 421 static bool __init hv_init_tsc_clocksource(void) 422 { 423 u64 tsc_msr; 424 phys_addr_t phys_addr; 425 426 if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 427 return false; 428 429 if (hv_root_partition) 430 return false; 431 432 hv_read_reference_counter = read_hv_clock_tsc; 433 phys_addr = virt_to_phys(hv_get_tsc_page()); 434 435 /* 436 * The Hyper-V TLFS specifies to preserve the value of reserved 437 * bits in registers. So read the existing value, preserve the 438 * low order 12 bits, and add in the guest physical address 439 * (which already has at least the low 12 bits set to zero since 440 * it is page aligned). Also set the "enable" bit, which is bit 0. 441 */ 442 hv_get_reference_tsc(tsc_msr); 443 tsc_msr &= GENMASK_ULL(11, 0); 444 tsc_msr = tsc_msr | 0x1 | (u64)phys_addr; 445 hv_set_reference_tsc(tsc_msr); 446 447 hv_set_clocksource_vdso(hyperv_cs_tsc); 448 clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); 449 450 hv_sched_clock_offset = hv_read_reference_counter(); 451 hv_setup_sched_clock(read_hv_sched_clock_tsc); 452 453 return true; 454 } 455 456 void __init hv_init_clocksource(void) 457 { 458 /* 459 * Try to set up the TSC page clocksource. If it succeeds, we're 460 * done. Otherwise, set up the MSR clocksoruce. At least one of 461 * these will always be available except on very old versions of 462 * Hyper-V on x86. In that case we won't have a Hyper-V 463 * clocksource, but Linux will still run with a clocksource based 464 * on the emulated PIT or LAPIC timer. 465 */ 466 if (hv_init_tsc_clocksource()) 467 return; 468 469 if (!(ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE)) 470 return; 471 472 hv_read_reference_counter = read_hv_clock_msr; 473 clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); 474 475 hv_sched_clock_offset = hv_read_reference_counter(); 476 hv_setup_sched_clock(read_hv_sched_clock_msr); 477 } 478 EXPORT_SYMBOL_GPL(hv_init_clocksource); 479