1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Virtual PTP 1588 clock for use with LM-safe VMclock device. 4 * 5 * Copyright © 2024 Amazon.com, Inc. or its affiliates. 6 */ 7 8 #include <linux/acpi.h> 9 #include <linux/device.h> 10 #include <linux/err.h> 11 #include <linux/file.h> 12 #include <linux/fs.h> 13 #include <linux/init.h> 14 #include <linux/kernel.h> 15 #include <linux/miscdevice.h> 16 #include <linux/mm.h> 17 #include <linux/module.h> 18 #include <linux/platform_device.h> 19 #include <linux/slab.h> 20 21 #include <uapi/linux/vmclock-abi.h> 22 23 #include <linux/ptp_clock_kernel.h> 24 25 #ifdef CONFIG_X86 26 #include <asm/pvclock.h> 27 #include <asm/kvmclock.h> 28 #endif 29 30 #ifdef CONFIG_KVM_GUEST 31 #define SUPPORT_KVMCLOCK 32 #endif 33 34 static DEFINE_IDA(vmclock_ida); 35 36 ACPI_MODULE_NAME("vmclock"); 37 38 struct vmclock_state { 39 struct resource res; 40 struct vmclock_abi *clk; 41 struct miscdevice miscdev; 42 struct ptp_clock_info ptp_clock_info; 43 struct ptp_clock *ptp_clock; 44 enum clocksource_ids cs_id, sys_cs_id; 45 int index; 46 char *name; 47 }; 48 49 #define VMCLOCK_MAX_WAIT ms_to_ktime(100) 50 51 /* Require at least the flags field to be present. All else can be optional. */ 52 #define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad) 53 54 #define VMCLOCK_FIELD_PRESENT(_c, _f) \ 55 (le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) + \ 56 sizeof((_c)->_f))) 57 58 /* 59 * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64 60 * and add the fractional second part of the reference time. 61 * 62 * The result is a 128-bit value, the top 64 bits of which are seconds, and 63 * the low 64 bits are (seconds >> 64). 64 */ 65 static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta, 66 uint64_t period, uint8_t shift, 67 uint64_t frac_sec) 68 { 69 unsigned __int128 res = (unsigned __int128)delta * period; 70 71 res >>= shift; 72 res += frac_sec; 73 *res_hi = res >> 64; 74 return (uint64_t)res; 75 } 76 77 static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec) 78 { 79 if (likely(clk->time_type == VMCLOCK_TIME_UTC)) 80 return true; 81 82 if (clk->time_type == VMCLOCK_TIME_TAI && 83 (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) { 84 if (sec) 85 *sec += (int16_t)le16_to_cpu(clk->tai_offset_sec); 86 return true; 87 } 88 return false; 89 } 90 91 static int vmclock_get_crosststamp(struct vmclock_state *st, 92 struct ptp_system_timestamp *sts, 93 struct system_counterval_t *system_counter, 94 struct timespec64 *tspec) 95 { 96 ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); 97 struct system_time_snapshot systime_snapshot; 98 uint64_t cycle, delta, seq, frac_sec; 99 100 #ifdef CONFIG_X86 101 /* 102 * We'd expect the hypervisor to know this and to report the clock 103 * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid. 104 */ 105 if (check_tsc_unstable()) 106 return -EINVAL; 107 #endif 108 109 while (1) { 110 seq = le32_to_cpu(st->clk->seq_count) & ~1ULL; 111 112 /* 113 * This pairs with a write barrier in the hypervisor 114 * which populates this structure. 115 */ 116 virt_rmb(); 117 118 if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE) 119 return -EINVAL; 120 121 /* 122 * When invoked for gettimex64(), fill in the pre/post system 123 * times. The simple case is when system time is based on the 124 * same counter as st->cs_id, in which case all three times 125 * will be derived from the *same* counter value. 126 * 127 * If the system isn't using the same counter, then the value 128 * from ktime_get_snapshot() will still be used as pre_ts, and 129 * ptp_read_system_postts() is called to populate postts after 130 * calling get_cycles(). 131 * 132 * The conversion to timespec64 happens further down, outside 133 * the seq_count loop. 134 */ 135 if (sts) { 136 ktime_get_snapshot(&systime_snapshot); 137 if (systime_snapshot.cs_id == st->cs_id) { 138 cycle = systime_snapshot.cycles; 139 } else { 140 cycle = get_cycles(); 141 ptp_read_system_postts(sts); 142 } 143 } else { 144 cycle = get_cycles(); 145 } 146 147 delta = cycle - le64_to_cpu(st->clk->counter_value); 148 149 frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta, 150 le64_to_cpu(st->clk->counter_period_frac_sec), 151 st->clk->counter_period_shift, 152 le64_to_cpu(st->clk->time_frac_sec)); 153 tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64); 154 tspec->tv_sec += le64_to_cpu(st->clk->time_sec); 155 156 if (!tai_adjust(st->clk, &tspec->tv_sec)) 157 return -EINVAL; 158 159 /* 160 * This pairs with a write barrier in the hypervisor 161 * which populates this structure. 162 */ 163 virt_rmb(); 164 if (seq == le32_to_cpu(st->clk->seq_count)) 165 break; 166 167 if (ktime_after(ktime_get(), deadline)) 168 return -ETIMEDOUT; 169 } 170 171 if (system_counter) { 172 system_counter->cycles = cycle; 173 system_counter->cs_id = st->cs_id; 174 } 175 176 if (sts) { 177 sts->pre_ts = ktime_to_timespec64(systime_snapshot.real); 178 if (systime_snapshot.cs_id == st->cs_id) 179 sts->post_ts = sts->pre_ts; 180 } 181 182 return 0; 183 } 184 185 #ifdef SUPPORT_KVMCLOCK 186 /* 187 * In the case where the system is using the KVM clock for timekeeping, convert 188 * the TSC value into a KVM clock time in order to return a paired reading that 189 * get_device_system_crosststamp() can cope with. 190 */ 191 static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st, 192 struct ptp_system_timestamp *sts, 193 struct system_counterval_t *system_counter, 194 struct timespec64 *tspec) 195 { 196 struct pvclock_vcpu_time_info *pvti = this_cpu_pvti(); 197 unsigned int pvti_ver; 198 int ret; 199 200 preempt_disable_notrace(); 201 202 do { 203 pvti_ver = pvclock_read_begin(pvti); 204 205 ret = vmclock_get_crosststamp(st, sts, system_counter, tspec); 206 if (ret) 207 break; 208 209 system_counter->cycles = __pvclock_read_cycles(pvti, 210 system_counter->cycles); 211 system_counter->cs_id = CSID_X86_KVM_CLK; 212 213 /* 214 * This retry should never really happen; if the TSC is 215 * stable and reliable enough across vCPUS that it is sane 216 * for the hypervisor to expose a VMCLOCK device which uses 217 * it as the reference counter, then the KVM clock sohuld be 218 * in 'master clock mode' and basically never changed. But 219 * the KVM clock is a fickle and often broken thing, so do 220 * it "properly" just in case. 221 */ 222 } while (pvclock_read_retry(pvti, pvti_ver)); 223 224 preempt_enable_notrace(); 225 226 return ret; 227 } 228 #endif 229 230 static int ptp_vmclock_get_time_fn(ktime_t *device_time, 231 struct system_counterval_t *system_counter, 232 void *ctx) 233 { 234 struct vmclock_state *st = ctx; 235 struct timespec64 tspec; 236 int ret; 237 238 #ifdef SUPPORT_KVMCLOCK 239 if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK) 240 ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter, 241 &tspec); 242 else 243 #endif 244 ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec); 245 246 if (!ret) 247 *device_time = timespec64_to_ktime(tspec); 248 249 return ret; 250 } 251 252 static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp, 253 struct system_device_crosststamp *xtstamp) 254 { 255 struct vmclock_state *st = container_of(ptp, struct vmclock_state, 256 ptp_clock_info); 257 int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st, 258 NULL, xtstamp); 259 #ifdef SUPPORT_KVMCLOCK 260 /* 261 * On x86, the KVM clock may be used for the system time. We can 262 * actually convert a TSC reading to that, and return a paired 263 * timestamp that get_device_system_crosststamp() *can* handle. 264 */ 265 if (ret == -ENODEV) { 266 struct system_time_snapshot systime_snapshot; 267 268 ktime_get_snapshot(&systime_snapshot); 269 270 if (systime_snapshot.cs_id == CSID_X86_TSC || 271 systime_snapshot.cs_id == CSID_X86_KVM_CLK) { 272 WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id); 273 ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, 274 st, NULL, xtstamp); 275 } 276 } 277 #endif 278 return ret; 279 } 280 281 /* 282 * PTP clock operations 283 */ 284 285 static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta) 286 { 287 return -EOPNOTSUPP; 288 } 289 290 static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta) 291 { 292 return -EOPNOTSUPP; 293 } 294 295 static int ptp_vmclock_settime(struct ptp_clock_info *ptp, 296 const struct timespec64 *ts) 297 { 298 return -EOPNOTSUPP; 299 } 300 301 static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, 302 struct ptp_system_timestamp *sts) 303 { 304 struct vmclock_state *st = container_of(ptp, struct vmclock_state, 305 ptp_clock_info); 306 307 return vmclock_get_crosststamp(st, sts, NULL, ts); 308 } 309 310 static int ptp_vmclock_enable(struct ptp_clock_info *ptp, 311 struct ptp_clock_request *rq, int on) 312 { 313 return -EOPNOTSUPP; 314 } 315 316 static const struct ptp_clock_info ptp_vmclock_info = { 317 .owner = THIS_MODULE, 318 .max_adj = 0, 319 .n_ext_ts = 0, 320 .n_pins = 0, 321 .pps = 0, 322 .adjfine = ptp_vmclock_adjfine, 323 .adjtime = ptp_vmclock_adjtime, 324 .gettimex64 = ptp_vmclock_gettimex, 325 .settime64 = ptp_vmclock_settime, 326 .enable = ptp_vmclock_enable, 327 .getcrosststamp = ptp_vmclock_getcrosststamp, 328 }; 329 330 static struct ptp_clock *vmclock_ptp_register(struct device *dev, 331 struct vmclock_state *st) 332 { 333 enum clocksource_ids cs_id; 334 335 if (IS_ENABLED(CONFIG_ARM64) && 336 st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) { 337 /* Can we check it's the virtual counter? */ 338 cs_id = CSID_ARM_ARCH_COUNTER; 339 } else if (IS_ENABLED(CONFIG_X86) && 340 st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) { 341 cs_id = CSID_X86_TSC; 342 } else { 343 return NULL; 344 } 345 346 /* Only UTC, or TAI with offset */ 347 if (!tai_adjust(st->clk, NULL)) { 348 dev_info(dev, "vmclock does not provide unambiguous UTC\n"); 349 return NULL; 350 } 351 352 st->sys_cs_id = cs_id; 353 st->cs_id = cs_id; 354 st->ptp_clock_info = ptp_vmclock_info; 355 strscpy(st->ptp_clock_info.name, st->name); 356 357 return ptp_clock_register(&st->ptp_clock_info, dev); 358 } 359 360 static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) 361 { 362 struct vmclock_state *st = container_of(fp->private_data, 363 struct vmclock_state, miscdev); 364 365 if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ) 366 return -EROFS; 367 368 if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff) 369 return -EINVAL; 370 371 if (io_remap_pfn_range(vma, vma->vm_start, 372 st->res.start >> PAGE_SHIFT, PAGE_SIZE, 373 vma->vm_page_prot)) 374 return -EAGAIN; 375 376 return 0; 377 } 378 379 static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, 380 size_t count, loff_t *ppos) 381 { 382 struct vmclock_state *st = container_of(fp->private_data, 383 struct vmclock_state, miscdev); 384 ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); 385 size_t max_count; 386 uint32_t seq; 387 388 if (*ppos >= PAGE_SIZE) 389 return 0; 390 391 max_count = PAGE_SIZE - *ppos; 392 if (count > max_count) 393 count = max_count; 394 395 while (1) { 396 seq = le32_to_cpu(st->clk->seq_count) & ~1U; 397 /* Pairs with hypervisor wmb */ 398 virt_rmb(); 399 400 if (copy_to_user(buf, ((char *)st->clk) + *ppos, count)) 401 return -EFAULT; 402 403 /* Pairs with hypervisor wmb */ 404 virt_rmb(); 405 if (seq == le32_to_cpu(st->clk->seq_count)) 406 break; 407 408 if (ktime_after(ktime_get(), deadline)) 409 return -ETIMEDOUT; 410 } 411 412 *ppos += count; 413 return count; 414 } 415 416 static const struct file_operations vmclock_miscdev_fops = { 417 .mmap = vmclock_miscdev_mmap, 418 .read = vmclock_miscdev_read, 419 }; 420 421 /* module operations */ 422 423 static void vmclock_remove(struct platform_device *pdev) 424 { 425 struct device *dev = &pdev->dev; 426 struct vmclock_state *st = dev_get_drvdata(dev); 427 428 if (st->ptp_clock) 429 ptp_clock_unregister(st->ptp_clock); 430 431 if (st->miscdev.minor != MISC_DYNAMIC_MINOR) 432 misc_deregister(&st->miscdev); 433 } 434 435 static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data) 436 { 437 struct vmclock_state *st = data; 438 struct resource_win win; 439 struct resource *res = &win.res; 440 441 if (ares->type == ACPI_RESOURCE_TYPE_END_TAG) 442 return AE_OK; 443 444 /* There can be only one */ 445 if (resource_type(&st->res) == IORESOURCE_MEM) 446 return AE_ERROR; 447 448 if (acpi_dev_resource_memory(ares, res) || 449 acpi_dev_resource_address_space(ares, &win)) { 450 451 if (resource_type(res) != IORESOURCE_MEM || 452 resource_size(res) < sizeof(st->clk)) 453 return AE_ERROR; 454 455 st->res = *res; 456 return AE_OK; 457 } 458 459 return AE_ERROR; 460 } 461 462 static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) 463 { 464 struct acpi_device *adev = ACPI_COMPANION(dev); 465 acpi_status status; 466 467 /* 468 * This should never happen as this function is only called when 469 * has_acpi_companion(dev) is true, but the logic is sufficiently 470 * complex that Coverity can't see the tautology. 471 */ 472 if (!adev) 473 return -ENODEV; 474 475 status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS, 476 vmclock_acpi_resources, st); 477 if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) { 478 dev_err(dev, "failed to get resources\n"); 479 return -ENODEV; 480 } 481 482 return 0; 483 } 484 485 static void vmclock_put_idx(void *data) 486 { 487 struct vmclock_state *st = data; 488 489 ida_free(&vmclock_ida, st->index); 490 } 491 492 static int vmclock_probe(struct platform_device *pdev) 493 { 494 struct device *dev = &pdev->dev; 495 struct vmclock_state *st; 496 int ret; 497 498 st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL); 499 if (!st) 500 return -ENOMEM; 501 502 if (has_acpi_companion(dev)) 503 ret = vmclock_probe_acpi(dev, st); 504 else 505 ret = -EINVAL; /* Only ACPI for now */ 506 507 if (ret) { 508 dev_info(dev, "Failed to obtain physical address: %d\n", ret); 509 goto out; 510 } 511 512 if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) { 513 dev_info(dev, "Region too small (0x%llx)\n", 514 resource_size(&st->res)); 515 ret = -EINVAL; 516 goto out; 517 } 518 st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res), 519 MEMREMAP_WB | MEMREMAP_DEC); 520 if (IS_ERR(st->clk)) { 521 ret = PTR_ERR(st->clk); 522 dev_info(dev, "failed to map shared memory\n"); 523 st->clk = NULL; 524 goto out; 525 } 526 527 if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC || 528 le32_to_cpu(st->clk->size) > resource_size(&st->res) || 529 le16_to_cpu(st->clk->version) != 1) { 530 dev_info(dev, "vmclock magic fields invalid\n"); 531 ret = -EINVAL; 532 goto out; 533 } 534 535 ret = ida_alloc(&vmclock_ida, GFP_KERNEL); 536 if (ret < 0) 537 goto out; 538 539 st->index = ret; 540 ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st); 541 if (ret) 542 goto out; 543 544 st->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index); 545 if (!st->name) { 546 ret = -ENOMEM; 547 goto out; 548 } 549 550 /* 551 * If the structure is big enough, it can be mapped to userspace. 552 * Theoretically a guest OS even using larger pages could still 553 * use 4KiB PTEs to map smaller MMIO regions like this, but let's 554 * cross that bridge if/when we come to it. 555 */ 556 if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) { 557 st->miscdev.minor = MISC_DYNAMIC_MINOR; 558 st->miscdev.fops = &vmclock_miscdev_fops; 559 st->miscdev.name = st->name; 560 561 ret = misc_register(&st->miscdev); 562 if (ret) 563 goto out; 564 } 565 566 /* If there is valid clock information, register a PTP clock */ 567 if (VMCLOCK_FIELD_PRESENT(st->clk, time_frac_sec)) { 568 /* Can return a silent NULL, or an error. */ 569 st->ptp_clock = vmclock_ptp_register(dev, st); 570 if (IS_ERR(st->ptp_clock)) { 571 ret = PTR_ERR(st->ptp_clock); 572 st->ptp_clock = NULL; 573 vmclock_remove(pdev); 574 goto out; 575 } 576 } 577 578 if (!st->miscdev.minor && !st->ptp_clock) { 579 /* Neither miscdev nor PTP registered */ 580 dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n"); 581 ret = -ENODEV; 582 goto out; 583 } 584 585 dev_info(dev, "%s: registered %s%s%s\n", st->name, 586 st->miscdev.minor ? "miscdev" : "", 587 (st->miscdev.minor && st->ptp_clock) ? ", " : "", 588 st->ptp_clock ? "PTP" : ""); 589 590 dev_set_drvdata(dev, st); 591 592 out: 593 return ret; 594 } 595 596 static const struct acpi_device_id vmclock_acpi_ids[] = { 597 { "AMZNC10C", 0 }, 598 {} 599 }; 600 MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); 601 602 static struct platform_driver vmclock_platform_driver = { 603 .probe = vmclock_probe, 604 .remove_new = vmclock_remove, 605 .driver = { 606 .name = "vmclock", 607 .acpi_match_table = vmclock_acpi_ids, 608 }, 609 }; 610 611 module_platform_driver(vmclock_platform_driver) 612 613 MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>"); 614 MODULE_DESCRIPTION("PTP clock using VMCLOCK"); 615 MODULE_LICENSE("GPL"); 616