1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Virtual PTP 1588 clock for use with LM-safe VMclock device.
4 *
5 * Copyright © 2024 Amazon.com, Inc. or its affiliates.
6 */
7
8 #include "linux/poll.h"
9 #include "linux/types.h"
10 #include "linux/wait.h"
11 #include <linux/acpi.h>
12 #include <linux/device.h>
13 #include <linux/err.h>
14 #include <linux/file.h>
15 #include <linux/fs.h>
16 #include <linux/init.h>
17 #include <linux/io.h>
18 #include <linux/interrupt.h>
19 #include <linux/kernel.h>
20 #include <linux/miscdevice.h>
21 #include <linux/mm.h>
22 #include <linux/module.h>
23 #include <linux/of.h>
24 #include <linux/platform_device.h>
25 #include <linux/slab.h>
26
27 #include <uapi/linux/vmclock-abi.h>
28
29 #include <linux/ptp_clock_kernel.h>
30
31 #ifdef CONFIG_X86
32 #include <asm/pvclock.h>
33 #include <asm/kvmclock.h>
34 #endif
35
36 #ifdef CONFIG_KVM_GUEST
37 #define SUPPORT_KVMCLOCK
38 #endif
39
/* Hands out the per-device index that vmclock_probe() puts in "vmclock%d". */
static DEFINE_IDA(vmclock_ida);

ACPI_MODULE_NAME("vmclock");
43
44 struct vmclock_state {
45 struct resource res;
46 struct vmclock_abi *clk;
47 struct miscdevice miscdev;
48 wait_queue_head_t disrupt_wait;
49 struct ptp_clock_info ptp_clock_info;
50 struct ptp_clock *ptp_clock;
51 enum clocksource_ids cs_id, sys_cs_id;
52 int index;
53 char *name;
54 };
55
/* How long readers keep retrying for a stable seq_count before giving up */
#define VMCLOCK_MAX_WAIT ms_to_ktime(100)

/* Require at least the flags field to be present. All else can be optional. */
#define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad)

/*
 * True when the size advertised in (_c)->size is large enough to contain
 * the whole of field _f.
 */
#define VMCLOCK_FIELD_PRESENT(_c, _f)			  \
	(le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) + \
				     sizeof((_c)->_f)))
64
/*
 * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64
 * and add the fractional second part of the reference time.
 *
 * The 128-bit product is shifted right by @shift before @frac_sec is added.
 * The result is a 128-bit value, the top 64 bits of which are seconds, and
 * the low 64 bits are (seconds >> 64).
 *
 * @res_hi:   receives the top 64 bits of the result
 * @delta:    counter ticks elapsed
 * @period:   tick period
 * @shift:    right shift applied to the product
 * @frac_sec: value added after the shift
 *
 * Returns the low 64 bits of the result.
 *
 * @shift is a full uint8_t and callers pass it straight from device-provided
 * data. Shifting a 128-bit value by 128 or more is undefined behaviour in C,
 * so such a shift is handled explicitly as what it mathematically is: the
 * product shifted entirely out, i.e. zero.
 */
static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta,
					uint64_t period, uint8_t shift,
					uint64_t frac_sec)
{
	unsigned __int128 res = (unsigned __int128)delta * period;

	if (shift >= 8 * sizeof(res))
		res = 0;
	else
		res >>= shift;

	res += frac_sec;
	*res_hi = res >> 64;
	return (uint64_t)res;
}
83
tai_adjust(struct vmclock_abi * clk,uint64_t * sec)84 static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec)
85 {
86 if (clk->time_type == VMCLOCK_TIME_TAI)
87 return true;
88
89 if (clk->time_type == VMCLOCK_TIME_UTC &&
90 (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) {
91 if (sec)
92 *sec -= (int16_t)le16_to_cpu(clk->tai_offset_sec);
93 return true;
94 }
95 return false;
96 }
97
/*
 * Read the device's current time, retrying until a consistent snapshot of
 * the shared structure has been observed.
 *
 * @st:             device state
 * @sts:            if non-NULL, filled in with system time taken before and
 *                  after the counter read
 * @system_counter: if non-NULL, receives the counter value that was used and
 *                  st->cs_id
 * @tspec:          receives the computed device time
 *
 * Returns 0 on success; -EINVAL if the clock status is
 * VMCLOCK_STATUS_UNRELIABLE, if (on x86) the TSC is marked unstable, or if
 * tai_adjust() rejects the device's time type; -ETIMEDOUT if no stable
 * snapshot was obtained within VMCLOCK_MAX_WAIT.
 */
static int vmclock_get_crosststamp(struct vmclock_state *st,
				   struct ptp_system_timestamp *sts,
				   struct system_counterval_t *system_counter,
				   struct timespec64 *tspec)
{
	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
	struct system_time_snapshot systime_snapshot;
	uint64_t cycle, delta, seq, frac_sec;

#ifdef CONFIG_X86
	/*
	 * We'd expect the hypervisor to know this and to report the clock
	 * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid.
	 */
	if (check_tsc_unstable())
		return -EINVAL;
#endif

	while (1) {
		/*
		 * The low bit is masked off, so an odd seq_count (presumably
		 * an update in progress) can never match the re-read at the
		 * bottom of the loop.
		 */
		seq = le32_to_cpu(st->clk->seq_count) & ~1ULL;

		/*
		 * This pairs with a write barrier in the hypervisor
		 * which populates this structure.
		 */
		virt_rmb();

		if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE)
			return -EINVAL;

		/*
		 * When invoked for gettimex64(), fill in the pre/post system
		 * times. The simple case is when system time is based on the
		 * same counter as st->cs_id, in which case all three times
		 * will be derived from the *same* counter value.
		 *
		 * If the system isn't using the same counter, then the value
		 * from ktime_get_snapshot() will still be used as pre_ts, and
		 * ptp_read_system_postts() is called to populate postts after
		 * calling get_cycles().
		 *
		 * The conversion to timespec64 happens further down, outside
		 * the seq_count loop.
		 */
		if (sts) {
			ktime_get_snapshot(&systime_snapshot);
			if (systime_snapshot.cs_id == st->cs_id) {
				cycle = systime_snapshot.cycles;
			} else {
				cycle = get_cycles();
				ptp_read_system_postts(sts);
			}
		} else {
			cycle = get_cycles();
		}

		/* Ticks elapsed since the device's reference counter value */
		delta = cycle - le64_to_cpu(st->clk->counter_value);

		/*
		 * Scale the elapsed ticks by the period and add the reference
		 * fractional second; whole seconds land in tv_sec and the
		 * remaining 64-bit fraction is converted to nanoseconds.
		 */
		frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta,
						   le64_to_cpu(st->clk->counter_period_frac_sec),
						   st->clk->counter_period_shift,
						   le64_to_cpu(st->clk->time_frac_sec));
		tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64);
		tspec->tv_sec += le64_to_cpu(st->clk->time_sec);

		if (!tai_adjust(st->clk, &tspec->tv_sec))
			return -EINVAL;

		/*
		 * This pairs with a write barrier in the hypervisor
		 * which populates this structure.
		 */
		virt_rmb();
		if (seq == le32_to_cpu(st->clk->seq_count))
			break;

		/* Structure changed underneath us: retry, but not for ever */
		if (ktime_after(ktime_get(), deadline))
			return -ETIMEDOUT;
	}

	if (system_counter) {
		system_counter->cycles = cycle;
		system_counter->cs_id = st->cs_id;
	}

	if (sts) {
		sts->pre_ts = ktime_to_timespec64(systime_snapshot.real);
		/* Same counter: pre and post are the very same instant */
		if (systime_snapshot.cs_id == st->cs_id)
			sts->post_ts = sts->pre_ts;
	}

	return 0;
}
191
192 #ifdef SUPPORT_KVMCLOCK
193 /*
194 * In the case where the system is using the KVM clock for timekeeping, convert
195 * the TSC value into a KVM clock time in order to return a paired reading that
196 * get_device_system_crosststamp() can cope with.
197 */
vmclock_get_crosststamp_kvmclock(struct vmclock_state * st,struct ptp_system_timestamp * sts,struct system_counterval_t * system_counter,struct timespec64 * tspec)198 static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st,
199 struct ptp_system_timestamp *sts,
200 struct system_counterval_t *system_counter,
201 struct timespec64 *tspec)
202 {
203 struct pvclock_vcpu_time_info *pvti = this_cpu_pvti();
204 unsigned int pvti_ver;
205 int ret;
206
207 preempt_disable_notrace();
208
209 do {
210 pvti_ver = pvclock_read_begin(pvti);
211
212 ret = vmclock_get_crosststamp(st, sts, system_counter, tspec);
213 if (ret)
214 break;
215
216 system_counter->cycles = __pvclock_read_cycles(pvti,
217 system_counter->cycles);
218 system_counter->cs_id = CSID_X86_KVM_CLK;
219
220 /*
221 * This retry should never really happen; if the TSC is
222 * stable and reliable enough across vCPUS that it is sane
223 * for the hypervisor to expose a VMCLOCK device which uses
224 * it as the reference counter, then the KVM clock sohuld be
225 * in 'master clock mode' and basically never changed. But
226 * the KVM clock is a fickle and often broken thing, so do
227 * it "properly" just in case.
228 */
229 } while (pvclock_read_retry(pvti, pvti_ver));
230
231 preempt_enable_notrace();
232
233 return ret;
234 }
235 #endif
236
/*
 * Callback handed to get_device_system_crosststamp().
 *
 * @device_time:    receives the device time as a ktime_t on success
 * @system_counter: receives the paired system counter reading
 * @ctx:            the struct vmclock_state of the device
 *
 * When KVM clock support is built in and the recorded system clocksource is
 * the KVM clock, the reading is taken via the KVM clock variant; otherwise
 * the plain cross-timestamp is used. Returns 0 or a negative errno, in which
 * case *device_time is left untouched.
 */
static int ptp_vmclock_get_time_fn(ktime_t *device_time,
				   struct system_counterval_t *system_counter,
				   void *ctx)
{
	struct vmclock_state *st = ctx;
	struct timespec64 now;
	int err;

#ifdef SUPPORT_KVMCLOCK
	if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK)
		err = vmclock_get_crosststamp_kvmclock(st, NULL,
						       system_counter, &now);
	else
#endif
		err = vmclock_get_crosststamp(st, NULL, system_counter, &now);

	if (err)
		return err;

	*device_time = timespec64_to_ktime(now);
	return 0;
}
258
ptp_vmclock_getcrosststamp(struct ptp_clock_info * ptp,struct system_device_crosststamp * xtstamp)259 static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp,
260 struct system_device_crosststamp *xtstamp)
261 {
262 struct vmclock_state *st = container_of(ptp, struct vmclock_state,
263 ptp_clock_info);
264 int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st,
265 NULL, xtstamp);
266 #ifdef SUPPORT_KVMCLOCK
267 /*
268 * On x86, the KVM clock may be used for the system time. We can
269 * actually convert a TSC reading to that, and return a paired
270 * timestamp that get_device_system_crosststamp() *can* handle.
271 */
272 if (ret == -ENODEV) {
273 struct system_time_snapshot systime_snapshot;
274
275 ktime_get_snapshot(&systime_snapshot);
276
277 if (systime_snapshot.cs_id == CSID_X86_TSC ||
278 systime_snapshot.cs_id == CSID_X86_KVM_CLK) {
279 WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id);
280 ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn,
281 st, NULL, xtstamp);
282 }
283 }
284 #endif
285 return ret;
286 }
287
288 /*
289 * PTP clock operations
290 */
291
/*
 * PTP .adjfine operation. Frequency adjustment is not supported by this
 * clock: both arguments are ignored and -EOPNOTSUPP is always returned.
 */
static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta)
{
	return -EOPNOTSUPP;
}
296
/*
 * PTP .adjtime operation. Stepping the clock is not supported: both
 * arguments are ignored and -EOPNOTSUPP is always returned.
 */
static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
{
	return -EOPNOTSUPP;
}
301
/*
 * PTP .settime64 operation. Setting the clock is not supported: both
 * arguments are ignored and -EOPNOTSUPP is always returned.
 */
static int ptp_vmclock_settime(struct ptp_clock_info *ptp,
			       const struct timespec64 *ts)
{
	return -EOPNOTSUPP;
}
307
ptp_vmclock_gettimex(struct ptp_clock_info * ptp,struct timespec64 * ts,struct ptp_system_timestamp * sts)308 static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts,
309 struct ptp_system_timestamp *sts)
310 {
311 struct vmclock_state *st = container_of(ptp, struct vmclock_state,
312 ptp_clock_info);
313
314 return vmclock_get_crosststamp(st, sts, NULL, ts);
315 }
316
/*
 * PTP .enable operation. No ancillary features can be switched on or off:
 * all arguments are ignored and -EOPNOTSUPP is always returned.
 */
static int ptp_vmclock_enable(struct ptp_clock_info *ptp,
			      struct ptp_clock_request *rq, int on)
{
	return -EOPNOTSUPP;
}
322
323 static const struct ptp_clock_info ptp_vmclock_info = {
324 .owner = THIS_MODULE,
325 .max_adj = 0,
326 .n_ext_ts = 0,
327 .n_pins = 0,
328 .pps = 0,
329 .adjfine = ptp_vmclock_adjfine,
330 .adjtime = ptp_vmclock_adjtime,
331 .gettimex64 = ptp_vmclock_gettimex,
332 .settime64 = ptp_vmclock_settime,
333 .enable = ptp_vmclock_enable,
334 .getcrosststamp = ptp_vmclock_getcrosststamp,
335 };
336
vmclock_ptp_register(struct device * dev,struct vmclock_state * st)337 static struct ptp_clock *vmclock_ptp_register(struct device *dev,
338 struct vmclock_state *st)
339 {
340 enum clocksource_ids cs_id;
341
342 if (IS_ENABLED(CONFIG_ARM64) &&
343 st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) {
344 /* Can we check it's the virtual counter? */
345 cs_id = CSID_ARM_ARCH_COUNTER;
346 } else if (IS_ENABLED(CONFIG_X86) &&
347 st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) {
348 cs_id = CSID_X86_TSC;
349 } else {
350 return NULL;
351 }
352
353 /* Accept TAI directly, or UTC with valid offset for conversion to TAI */
354 if (!tai_adjust(st->clk, NULL)) {
355 dev_info(dev, "vmclock does not provide unambiguous time\n");
356 return NULL;
357 }
358
359 st->sys_cs_id = cs_id;
360 st->cs_id = cs_id;
361 st->ptp_clock_info = ptp_vmclock_info;
362 strscpy(st->ptp_clock_info.name, st->name);
363
364 return ptp_clock_register(&st->ptp_clock_info, dev);
365 }
366
/* Per-open-file state, allocated in vmclock_miscdev_open(). */
struct vmclock_file_state {
	/* Device this file was opened on */
	struct vmclock_state *st;
	/* seq_count value recorded by the last successful read(); poll() compares against it */
	atomic_t seq;
};
371
vmclock_miscdev_mmap(struct file * fp,struct vm_area_struct * vma)372 static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
373 {
374 struct vmclock_file_state *fst = fp->private_data;
375 struct vmclock_state *st = fst->st;
376
377 if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
378 return -EROFS;
379
380 if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff)
381 return -EINVAL;
382
383 if (io_remap_pfn_range(vma, vma->vm_start,
384 st->res.start >> PAGE_SHIFT, PAGE_SIZE,
385 vma->vm_page_prot))
386 return -EAGAIN;
387
388 return 0;
389 }
390
/*
 * read() handler: copy a consistent snapshot of the shared structure to
 * userspace.
 *
 * At most the bytes between *ppos and PAGE_SIZE are returned. The copy is
 * repeated until seq_count is unchanged across it and fst->seq has been
 * brought up to date with the value observed.
 *
 * Returns the number of bytes copied, 0 when *ppos is at or beyond
 * PAGE_SIZE, -EFAULT if the copy to userspace fails, or -ETIMEDOUT if no
 * stable snapshot was obtained within VMCLOCK_MAX_WAIT.
 */
static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
				    size_t count, loff_t *ppos)
{
	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
	struct vmclock_file_state *fst = fp->private_data;
	struct vmclock_state *st = fst->st;
	uint32_t seq, old_seq;
	size_t max_count;

	if (*ppos >= PAGE_SIZE)
		return 0;

	/* Never read past the end of the first page */
	max_count = PAGE_SIZE - *ppos;
	if (count > max_count)
		count = max_count;

	old_seq = atomic_read(&fst->seq);
	while (1) {
		/* Low bit masked off: an odd seq_count never matches the re-read below */
		seq = le32_to_cpu(st->clk->seq_count) & ~1U;
		/* Pairs with hypervisor wmb */
		virt_rmb();

		if (copy_to_user(buf, ((char *)st->clk) + *ppos, count))
			return -EFAULT;

		/* Pairs with hypervisor wmb */
		virt_rmb();
		if (seq == le32_to_cpu(st->clk->seq_count)) {
			/*
			 * Either we updated fst->seq to seq (the latest version we observed)
			 * or someone else did (old_seq == seq), so we can break.
			 */
			if (atomic_try_cmpxchg(&fst->seq, &old_seq, seq) ||
			    old_seq == seq) {
				break;
			}
		}

		if (ktime_after(ktime_get(), deadline))
			return -ETIMEDOUT;
	}

	*ppos += count;
	return count;
}
436
vmclock_miscdev_poll(struct file * fp,poll_table * wait)437 static __poll_t vmclock_miscdev_poll(struct file *fp, poll_table *wait)
438 {
439 struct vmclock_file_state *fst = fp->private_data;
440 struct vmclock_state *st = fst->st;
441 uint32_t seq;
442
443 /*
444 * Hypervisor will not send us any notifications, so fail immediately
445 * to avoid having caller sleeping for ever.
446 */
447 if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT))
448 return POLLHUP;
449
450 poll_wait(fp, &st->disrupt_wait, wait);
451
452 seq = le32_to_cpu(st->clk->seq_count);
453 if (atomic_read(&fst->seq) != seq)
454 return POLLIN | POLLRDNORM;
455
456 return 0;
457 }
458
vmclock_miscdev_open(struct inode * inode,struct file * fp)459 static int vmclock_miscdev_open(struct inode *inode, struct file *fp)
460 {
461 struct vmclock_state *st = container_of(fp->private_data,
462 struct vmclock_state, miscdev);
463 struct vmclock_file_state *fst = kzalloc_obj(*fst);
464
465 if (!fst)
466 return -ENOMEM;
467
468 fst->st = st;
469 atomic_set(&fst->seq, 0);
470
471 fp->private_data = fst;
472
473 return 0;
474 }
475
vmclock_miscdev_release(struct inode * inode,struct file * fp)476 static int vmclock_miscdev_release(struct inode *inode, struct file *fp)
477 {
478 kfree(fp->private_data);
479 return 0;
480 }
481
482 static const struct file_operations vmclock_miscdev_fops = {
483 .owner = THIS_MODULE,
484 .open = vmclock_miscdev_open,
485 .release = vmclock_miscdev_release,
486 .mmap = vmclock_miscdev_mmap,
487 .read = vmclock_miscdev_read,
488 .poll = vmclock_miscdev_poll,
489 };
490
491 /* module operations */
492
493 #if IS_ENABLED(CONFIG_ACPI)
vmclock_acpi_resources(struct acpi_resource * ares,void * data)494 static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data)
495 {
496 struct vmclock_state *st = data;
497 struct resource_win win;
498 struct resource *res = &win.res;
499
500 if (ares->type == ACPI_RESOURCE_TYPE_END_TAG)
501 return AE_OK;
502
503 /* There can be only one */
504 if (resource_type(&st->res) == IORESOURCE_MEM)
505 return AE_ERROR;
506
507 if (acpi_dev_resource_memory(ares, res) ||
508 acpi_dev_resource_address_space(ares, &win)) {
509
510 if (resource_type(res) != IORESOURCE_MEM ||
511 resource_size(res) < sizeof(st->clk))
512 return AE_ERROR;
513
514 st->res = *res;
515 return AE_OK;
516 }
517
518 return AE_ERROR;
519 }
520
521 static void
vmclock_acpi_notification_handler(acpi_handle __always_unused handle,u32 __always_unused event,void * dev)522 vmclock_acpi_notification_handler(acpi_handle __always_unused handle,
523 u32 __always_unused event, void *dev)
524 {
525 struct device *device = dev;
526 struct vmclock_state *st = device->driver_data;
527
528 wake_up_interruptible(&st->disrupt_wait);
529 }
530
vmclock_setup_acpi_notification(struct device * dev)531 static int vmclock_setup_acpi_notification(struct device *dev)
532 {
533 struct acpi_device *adev = ACPI_COMPANION(dev);
534 acpi_status status;
535
536 /*
537 * This should never happen as this function is only called when
538 * has_acpi_companion(dev) is true, but the logic is sufficiently
539 * complex that Coverity can't see the tautology.
540 */
541 if (!adev)
542 return -ENODEV;
543
544 status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY,
545 vmclock_acpi_notification_handler,
546 dev);
547 if (ACPI_FAILURE(status)) {
548 dev_err(dev, "failed to install notification handler");
549 return -ENODEV;
550 }
551
552 return 0;
553 }
554
vmclock_probe_acpi(struct device * dev,struct vmclock_state * st)555 static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
556 {
557 struct acpi_device *adev = ACPI_COMPANION(dev);
558 acpi_status status;
559
560 /*
561 * This should never happen as this function is only called when
562 * has_acpi_companion(dev) is true, but the logic is sufficiently
563 * complex that Coverity can't see the tautology.
564 */
565 if (!adev)
566 return -ENODEV;
567
568 status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS,
569 vmclock_acpi_resources, st);
570 if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) {
571 dev_err(dev, "failed to get resources\n");
572 return -ENODEV;
573 }
574
575 return 0;
576 }
577 #endif /* CONFIG_ACPI */
578
vmclock_of_irq_handler(int __always_unused irq,void * _st)579 static irqreturn_t vmclock_of_irq_handler(int __always_unused irq, void *_st)
580 {
581 struct vmclock_state *st = _st;
582
583 wake_up_interruptible(&st->disrupt_wait);
584 return IRQ_HANDLED;
585 }
586
vmclock_probe_dt(struct device * dev,struct vmclock_state * st)587 static int vmclock_probe_dt(struct device *dev, struct vmclock_state *st)
588 {
589 struct platform_device *pdev = to_platform_device(dev);
590 struct resource *res;
591
592 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
593 if (!res)
594 return -ENODEV;
595
596 st->res = *res;
597
598 return 0;
599 }
600
vmclock_setup_of_notification(struct device * dev)601 static int vmclock_setup_of_notification(struct device *dev)
602 {
603 struct platform_device *pdev = to_platform_device(dev);
604 int irq;
605
606 irq = platform_get_irq(pdev, 0);
607 if (irq < 0)
608 return irq;
609
610 return devm_request_irq(dev, irq, vmclock_of_irq_handler, IRQF_SHARED,
611 "vmclock", dev->driver_data);
612 }
613
vmclock_setup_notification(struct device * dev,struct vmclock_state * st)614 static int vmclock_setup_notification(struct device *dev,
615 struct vmclock_state *st)
616 {
617 /* The device does not support notifications. Nothing else to do */
618 if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT))
619 return 0;
620
621 #if IS_ENABLED(CONFIG_ACPI)
622 if (has_acpi_companion(dev))
623 return vmclock_setup_acpi_notification(dev);
624 #endif
625 return vmclock_setup_of_notification(dev);
626 }
627
vmclock_remove(void * data)628 static void vmclock_remove(void *data)
629 {
630 struct device *dev = data;
631 struct vmclock_state *st = dev->driver_data;
632
633 if (!st) {
634 dev_err(dev, "%s called with NULL driver_data", __func__);
635 return;
636 }
637
638 #if IS_ENABLED(CONFIG_ACPI)
639 if (has_acpi_companion(dev))
640 acpi_remove_notify_handler(ACPI_COMPANION(dev)->handle,
641 ACPI_DEVICE_NOTIFY,
642 vmclock_acpi_notification_handler);
643 #endif
644
645 if (st->ptp_clock)
646 ptp_clock_unregister(st->ptp_clock);
647
648 if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
649 misc_deregister(&st->miscdev);
650
651 dev->driver_data = NULL;
652 }
653
vmclock_put_idx(void * data)654 static void vmclock_put_idx(void *data)
655 {
656 struct vmclock_state *st = data;
657
658 ida_free(&vmclock_ida, st->index);
659 }
660
vmclock_probe(struct platform_device * pdev)661 static int vmclock_probe(struct platform_device *pdev)
662 {
663 struct device *dev = &pdev->dev;
664 struct vmclock_state *st;
665 int ret;
666
667 st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL);
668 if (!st)
669 return -ENOMEM;
670
671 #if IS_ENABLED(CONFIG_ACPI)
672 if (has_acpi_companion(dev))
673 ret = vmclock_probe_acpi(dev, st);
674 else
675 #endif
676 ret = vmclock_probe_dt(dev, st);
677
678 if (ret) {
679 dev_info(dev, "Failed to obtain physical address: %d\n", ret);
680 return ret;
681 }
682
683 if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) {
684 dev_info(dev, "Region too small (0x%llx)\n",
685 resource_size(&st->res));
686 return -EINVAL;
687 }
688 st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res),
689 MEMREMAP_WB | MEMREMAP_DEC);
690 if (IS_ERR(st->clk)) {
691 ret = PTR_ERR(st->clk);
692 dev_info(dev, "failed to map shared memory\n");
693 st->clk = NULL;
694 return ret;
695 }
696
697 if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC ||
698 le32_to_cpu(st->clk->size) > resource_size(&st->res) ||
699 le16_to_cpu(st->clk->version) != 1) {
700 dev_info(dev, "vmclock magic fields invalid\n");
701 return -EINVAL;
702 }
703
704 ret = ida_alloc(&vmclock_ida, GFP_KERNEL);
705 if (ret < 0)
706 return ret;
707
708 st->index = ret;
709 ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st);
710 if (ret)
711 return ret;
712
713 st->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index);
714 if (!st->name)
715 return -ENOMEM;
716
717 st->miscdev.minor = MISC_DYNAMIC_MINOR;
718
719 init_waitqueue_head(&st->disrupt_wait);
720 dev->driver_data = st;
721
722 ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, dev);
723 if (ret)
724 return ret;
725
726 ret = vmclock_setup_notification(dev, st);
727 if (ret)
728 return ret;
729
730 /*
731 * If the structure is big enough, it can be mapped to userspace.
732 * Theoretically a guest OS even using larger pages could still
733 * use 4KiB PTEs to map smaller MMIO regions like this, but let's
734 * cross that bridge if/when we come to it.
735 */
736 if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) {
737 st->miscdev.fops = &vmclock_miscdev_fops;
738 st->miscdev.name = st->name;
739
740 ret = misc_register(&st->miscdev);
741 if (ret)
742 return ret;
743 }
744
745 /* If there is valid clock information, register a PTP clock */
746 if (VMCLOCK_FIELD_PRESENT(st->clk, time_frac_sec)) {
747 /* Can return a silent NULL, or an error. */
748 st->ptp_clock = vmclock_ptp_register(dev, st);
749 if (IS_ERR(st->ptp_clock)) {
750 ret = PTR_ERR(st->ptp_clock);
751 st->ptp_clock = NULL;
752 return ret;
753 }
754 }
755
756 if (!st->miscdev.minor && !st->ptp_clock) {
757 /* Neither miscdev nor PTP registered */
758 dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n");
759 return -ENODEV;
760 }
761
762 dev_info(dev, "%s: registered %s%s%s\n", st->name,
763 st->miscdev.minor ? "miscdev" : "",
764 (st->miscdev.minor && st->ptp_clock) ? ", " : "",
765 st->ptp_clock ? "PTP" : "");
766
767 return 0;
768 }
769
/* ACPI hardware IDs this driver binds to; terminated by an empty entry. */
static const struct acpi_device_id vmclock_acpi_ids[] = {
	{ "AMZNC10C", 0 },
	{ "VMCLOCK", 0 },
	{}
};
MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids);
776
/* Device-tree compatible strings this driver binds to; terminated by an empty entry. */
static const struct of_device_id vmclock_of_ids[] = {
	{ .compatible = "amazon,vmclock", },
	{ },
};
MODULE_DEVICE_TABLE(of, vmclock_of_ids);
782
/*
 * Platform driver glue. There is no .remove callback here; teardown is
 * done by the devm actions that vmclock_probe() registers.
 */
static struct platform_driver vmclock_platform_driver = {
	.probe		= vmclock_probe,
	.driver	= {
		.name	= "vmclock",
		.acpi_match_table = vmclock_acpi_ids,
		.of_match_table = vmclock_of_ids,
	},
};

module_platform_driver(vmclock_platform_driver)
793
794 MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
795 MODULE_DESCRIPTION("PTP clock using VMCLOCK");
796 MODULE_LICENSE("GPL");
797