xref: /linux/drivers/ptp/ptp_vmclock.c (revision 348f968b89bfeec0bb53dd82dba58b94d97fbd34)
1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * Virtual PTP 1588 clock for use with LM-safe VMclock device.
4   *
5   * Copyright © 2024 Amazon.com, Inc. or its affiliates.
6   */
7  
8  #include <linux/acpi.h>
9  #include <linux/device.h>
10  #include <linux/err.h>
11  #include <linux/file.h>
12  #include <linux/fs.h>
13  #include <linux/init.h>
14  #include <linux/kernel.h>
15  #include <linux/miscdevice.h>
16  #include <linux/mm.h>
17  #include <linux/module.h>
18  #include <linux/platform_device.h>
19  #include <linux/slab.h>
20  
21  #include <uapi/linux/vmclock-abi.h>
22  
23  #include <linux/ptp_clock_kernel.h>
24  
25  #ifdef CONFIG_X86
26  #include <asm/pvclock.h>
27  #include <asm/kvmclock.h>
28  #endif
29  
30  #ifdef CONFIG_KVM_GUEST
31  #define SUPPORT_KVMCLOCK
32  #endif
33  
34  static DEFINE_IDA(vmclock_ida);
35  
36  ACPI_MODULE_NAME("vmclock");
37  
38  struct vmclock_state {
39  	struct resource res;
40  	struct vmclock_abi *clk;
41  	struct miscdevice miscdev;
42  	struct ptp_clock_info ptp_clock_info;
43  	struct ptp_clock *ptp_clock;
44  	enum clocksource_ids cs_id, sys_cs_id;
45  	int index;
46  	char *name;
47  };
48  
/* How long readers keep retrying while seq_count does not settle. */
#define VMCLOCK_MAX_WAIT ms_to_ktime(100)

/* Require at least the flags field to be present. All else can be optional. */
#define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad)

/* True when the size advertised in (_c)->size covers all of field _f. */
#define VMCLOCK_FIELD_PRESENT(_c, _f)					\
	(le32_to_cpu((_c)->size) >=					\
	 (offsetof(struct vmclock_abi, _f) + sizeof((_c)->_f)))
57  
/*
 * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64
 * and add the fractional second part of the reference time.
 *
 * @res_hi:   receives the top 64 bits of the result
 * @delta:    counter ticks elapsed since the reference point
 * @period:   tick period, scaled up by @shift
 * @shift:    right shift applied to the 128-bit product before the addition
 * @frac_sec: fractional seconds added to the shifted product
 *
 * The result is a 128-bit value, the top 64 bits of which are seconds, and
 * the low 64 bits are (seconds >> 64). The low 64 bits are returned.
 *
 * @shift comes straight from the shared structure and is not range-checked
 * by the caller. Shifting a 128-bit value by 128 or more is undefined in C,
 * so such a shift is treated as discarding the whole product.
 */
static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta,
					uint64_t period, uint8_t shift,
					uint64_t frac_sec)
{
	unsigned __int128 res = 0;

	if (shift < 128)
		res = ((unsigned __int128)delta * period) >> shift;

	res += frac_sec;
	*res_hi = (uint64_t)(res >> 64);
	return (uint64_t)res;
}
76  
tai_adjust(struct vmclock_abi * clk,uint64_t * sec)77  static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec)
78  {
79  	if (likely(clk->time_type == VMCLOCK_TIME_UTC))
80  		return true;
81  
82  	if (clk->time_type == VMCLOCK_TIME_TAI &&
83  	    (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) {
84  		if (sec)
85  			*sec += (int16_t)le16_to_cpu(clk->tai_offset_sec);
86  		return true;
87  	}
88  	return false;
89  }
90  
vmclock_get_crosststamp(struct vmclock_state * st,struct ptp_system_timestamp * sts,struct system_counterval_t * system_counter,struct timespec64 * tspec)91  static int vmclock_get_crosststamp(struct vmclock_state *st,
92  				   struct ptp_system_timestamp *sts,
93  				   struct system_counterval_t *system_counter,
94  				   struct timespec64 *tspec)
95  {
96  	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
97  	struct system_time_snapshot systime_snapshot;
98  	uint64_t cycle, delta, seq, frac_sec;
99  
100  #ifdef CONFIG_X86
101  	/*
102  	 * We'd expect the hypervisor to know this and to report the clock
103  	 * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid.
104  	 */
105  	if (check_tsc_unstable())
106  		return -EINVAL;
107  #endif
108  
109  	while (1) {
110  		seq = le32_to_cpu(st->clk->seq_count) & ~1ULL;
111  
112  		/*
113  		 * This pairs with a write barrier in the hypervisor
114  		 * which populates this structure.
115  		 */
116  		virt_rmb();
117  
118  		if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE)
119  			return -EINVAL;
120  
121  		/*
122  		 * When invoked for gettimex64(), fill in the pre/post system
123  		 * times. The simple case is when system time is based on the
124  		 * same counter as st->cs_id, in which case all three times
125  		 * will be derived from the *same* counter value.
126  		 *
127  		 * If the system isn't using the same counter, then the value
128  		 * from ktime_get_snapshot() will still be used as pre_ts, and
129  		 * ptp_read_system_postts() is called to populate postts after
130  		 * calling get_cycles().
131  		 *
132  		 * The conversion to timespec64 happens further down, outside
133  		 * the seq_count loop.
134  		 */
135  		if (sts) {
136  			ktime_get_snapshot(&systime_snapshot);
137  			if (systime_snapshot.cs_id == st->cs_id) {
138  				cycle = systime_snapshot.cycles;
139  			} else {
140  				cycle = get_cycles();
141  				ptp_read_system_postts(sts);
142  			}
143  		} else {
144  			cycle = get_cycles();
145  		}
146  
147  		delta = cycle - le64_to_cpu(st->clk->counter_value);
148  
149  		frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta,
150  						   le64_to_cpu(st->clk->counter_period_frac_sec),
151  						   st->clk->counter_period_shift,
152  						   le64_to_cpu(st->clk->time_frac_sec));
153  		tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64);
154  		tspec->tv_sec += le64_to_cpu(st->clk->time_sec);
155  
156  		if (!tai_adjust(st->clk, &tspec->tv_sec))
157  			return -EINVAL;
158  
159  		/*
160  		 * This pairs with a write barrier in the hypervisor
161  		 * which populates this structure.
162  		 */
163  		virt_rmb();
164  		if (seq == le32_to_cpu(st->clk->seq_count))
165  			break;
166  
167  		if (ktime_after(ktime_get(), deadline))
168  			return -ETIMEDOUT;
169  	}
170  
171  	if (system_counter) {
172  		system_counter->cycles = cycle;
173  		system_counter->cs_id = st->cs_id;
174  	}
175  
176  	if (sts) {
177  		sts->pre_ts = ktime_to_timespec64(systime_snapshot.real);
178  		if (systime_snapshot.cs_id == st->cs_id)
179  			sts->post_ts = sts->pre_ts;
180  	}
181  
182  	return 0;
183  }
184  
185  #ifdef SUPPORT_KVMCLOCK
186  /*
187   * In the case where the system is using the KVM clock for timekeeping, convert
188   * the TSC value into a KVM clock time in order to return a paired reading that
189   * get_device_system_crosststamp() can cope with.
190   */
vmclock_get_crosststamp_kvmclock(struct vmclock_state * st,struct ptp_system_timestamp * sts,struct system_counterval_t * system_counter,struct timespec64 * tspec)191  static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st,
192  					    struct ptp_system_timestamp *sts,
193  					    struct system_counterval_t *system_counter,
194  					    struct timespec64 *tspec)
195  {
196  	struct pvclock_vcpu_time_info *pvti = this_cpu_pvti();
197  	unsigned int pvti_ver;
198  	int ret;
199  
200  	preempt_disable_notrace();
201  
202  	do {
203  		pvti_ver = pvclock_read_begin(pvti);
204  
205  		ret = vmclock_get_crosststamp(st, sts, system_counter, tspec);
206  		if (ret)
207  			break;
208  
209  		system_counter->cycles = __pvclock_read_cycles(pvti,
210  							       system_counter->cycles);
211  		system_counter->cs_id = CSID_X86_KVM_CLK;
212  
213  		/*
214  		 * This retry should never really happen; if the TSC is
215  		 * stable and reliable enough across vCPUS that it is sane
216  		 * for the hypervisor to expose a VMCLOCK device which uses
217  		 * it as the reference counter, then the KVM clock sohuld be
218  		 * in 'master clock mode' and basically never changed. But
219  		 * the KVM clock is a fickle and often broken thing, so do
220  		 * it "properly" just in case.
221  		 */
222  	} while (pvclock_read_retry(pvti, pvti_ver));
223  
224  	preempt_enable_notrace();
225  
226  	return ret;
227  }
228  #endif
229  
/*
 * Callback handed to get_device_system_crosststamp().
 *
 * @device_time:    receives the device time as a ktime_t on success
 * @system_counter: receives the paired system counter reading
 * @ctx:            the struct vmclock_state for this device
 *
 * Returns 0 on success or the error from the underlying cross-timestamp
 * helper, in which case *device_time is left untouched.
 */
static int ptp_vmclock_get_time_fn(ktime_t *device_time,
				   struct system_counterval_t *system_counter,
				   void *ctx)
{
	struct vmclock_state *st = ctx;
	struct timespec64 now;
	int err;

#ifdef SUPPORT_KVMCLOCK
	/* sys_cs_id is updated with WRITE_ONCE() by the getcrosststamp path */
	if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK)
		err = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter,
						       &now);
	else
#endif
		err = vmclock_get_crosststamp(st, NULL, system_counter, &now);

	if (err)
		return err;

	*device_time = timespec64_to_ktime(now);
	return 0;
}
251  
ptp_vmclock_getcrosststamp(struct ptp_clock_info * ptp,struct system_device_crosststamp * xtstamp)252  static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp,
253  				      struct system_device_crosststamp *xtstamp)
254  {
255  	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
256  						ptp_clock_info);
257  	int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st,
258  						NULL, xtstamp);
259  #ifdef SUPPORT_KVMCLOCK
260  	/*
261  	 * On x86, the KVM clock may be used for the system time. We can
262  	 * actually convert a TSC reading to that, and return a paired
263  	 * timestamp that get_device_system_crosststamp() *can* handle.
264  	 */
265  	if (ret == -ENODEV) {
266  		struct system_time_snapshot systime_snapshot;
267  
268  		ktime_get_snapshot(&systime_snapshot);
269  
270  		if (systime_snapshot.cs_id == CSID_X86_TSC ||
271  		    systime_snapshot.cs_id == CSID_X86_KVM_CLK) {
272  			WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id);
273  			ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn,
274  							    st, NULL, xtstamp);
275  		}
276  	}
277  #endif
278  	return ret;
279  }
280  
281  /*
282   * PTP clock operations
283   */
284  
ptp_vmclock_adjfine(struct ptp_clock_info * ptp,long delta)285  static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta)
286  {
287  	return -EOPNOTSUPP;
288  }
289  
/* .adjtime handler: stepping the clock is not supported. */
static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
{
	return -EOPNOTSUPP;
}
294  
ptp_vmclock_settime(struct ptp_clock_info * ptp,const struct timespec64 * ts)295  static int ptp_vmclock_settime(struct ptp_clock_info *ptp,
296  			   const struct timespec64 *ts)
297  {
298  	return -EOPNOTSUPP;
299  }
300  
ptp_vmclock_gettimex(struct ptp_clock_info * ptp,struct timespec64 * ts,struct ptp_system_timestamp * sts)301  static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts,
302  				struct ptp_system_timestamp *sts)
303  {
304  	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
305  						ptp_clock_info);
306  
307  	return vmclock_get_crosststamp(st, sts, NULL, ts);
308  }
309  
ptp_vmclock_enable(struct ptp_clock_info * ptp,struct ptp_clock_request * rq,int on)310  static int ptp_vmclock_enable(struct ptp_clock_info *ptp,
311  			  struct ptp_clock_request *rq, int on)
312  {
313  	return -EOPNOTSUPP;
314  }
315  
316  static const struct ptp_clock_info ptp_vmclock_info = {
317  	.owner		= THIS_MODULE,
318  	.max_adj	= 0,
319  	.n_ext_ts	= 0,
320  	.n_pins		= 0,
321  	.pps		= 0,
322  	.adjfine	= ptp_vmclock_adjfine,
323  	.adjtime	= ptp_vmclock_adjtime,
324  	.gettimex64	= ptp_vmclock_gettimex,
325  	.settime64	= ptp_vmclock_settime,
326  	.enable		= ptp_vmclock_enable,
327  	.getcrosststamp = ptp_vmclock_getcrosststamp,
328  };
329  
vmclock_ptp_register(struct device * dev,struct vmclock_state * st)330  static struct ptp_clock *vmclock_ptp_register(struct device *dev,
331  					      struct vmclock_state *st)
332  {
333  	enum clocksource_ids cs_id;
334  
335  	if (IS_ENABLED(CONFIG_ARM64) &&
336  	    st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) {
337  		/* Can we check it's the virtual counter? */
338  		cs_id = CSID_ARM_ARCH_COUNTER;
339  	} else if (IS_ENABLED(CONFIG_X86) &&
340  		   st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) {
341  		cs_id = CSID_X86_TSC;
342  	} else {
343  		return NULL;
344  	}
345  
346  	/* Only UTC, or TAI with offset */
347  	if (!tai_adjust(st->clk, NULL)) {
348  		dev_info(dev, "vmclock does not provide unambiguous UTC\n");
349  		return NULL;
350  	}
351  
352  	st->sys_cs_id = cs_id;
353  	st->cs_id = cs_id;
354  	st->ptp_clock_info = ptp_vmclock_info;
355  	strscpy(st->ptp_clock_info.name, st->name);
356  
357  	return ptp_clock_register(&st->ptp_clock_info, dev);
358  }
359  
vmclock_miscdev_mmap(struct file * fp,struct vm_area_struct * vma)360  static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
361  {
362  	struct vmclock_state *st = container_of(fp->private_data,
363  						struct vmclock_state, miscdev);
364  
365  	if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
366  		return -EROFS;
367  
368  	if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff)
369  		return -EINVAL;
370  
371  	if (io_remap_pfn_range(vma, vma->vm_start,
372  			       st->res.start >> PAGE_SHIFT, PAGE_SIZE,
373  			       vma->vm_page_prot))
374  		return -EAGAIN;
375  
376  	return 0;
377  }
378  
vmclock_miscdev_read(struct file * fp,char __user * buf,size_t count,loff_t * ppos)379  static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
380  				    size_t count, loff_t *ppos)
381  {
382  	struct vmclock_state *st = container_of(fp->private_data,
383  						struct vmclock_state, miscdev);
384  	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
385  	size_t max_count;
386  	uint32_t seq;
387  
388  	if (*ppos >= PAGE_SIZE)
389  		return 0;
390  
391  	max_count = PAGE_SIZE - *ppos;
392  	if (count > max_count)
393  		count = max_count;
394  
395  	while (1) {
396  		seq = le32_to_cpu(st->clk->seq_count) & ~1U;
397  		/* Pairs with hypervisor wmb */
398  		virt_rmb();
399  
400  		if (copy_to_user(buf, ((char *)st->clk) + *ppos, count))
401  			return -EFAULT;
402  
403  		/* Pairs with hypervisor wmb */
404  		virt_rmb();
405  		if (seq == le32_to_cpu(st->clk->seq_count))
406  			break;
407  
408  		if (ktime_after(ktime_get(), deadline))
409  			return -ETIMEDOUT;
410  	}
411  
412  	*ppos += count;
413  	return count;
414  }
415  
416  static const struct file_operations vmclock_miscdev_fops = {
417  	.owner = THIS_MODULE,
418  	.mmap = vmclock_miscdev_mmap,
419  	.read = vmclock_miscdev_read,
420  };
421  
422  /* module operations */
423  
vmclock_remove(void * data)424  static void vmclock_remove(void *data)
425  {
426  	struct vmclock_state *st = data;
427  
428  	if (st->ptp_clock)
429  		ptp_clock_unregister(st->ptp_clock);
430  
431  	if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
432  		misc_deregister(&st->miscdev);
433  }
434  
vmclock_acpi_resources(struct acpi_resource * ares,void * data)435  static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data)
436  {
437  	struct vmclock_state *st = data;
438  	struct resource_win win;
439  	struct resource *res = &win.res;
440  
441  	if (ares->type == ACPI_RESOURCE_TYPE_END_TAG)
442  		return AE_OK;
443  
444  	/* There can be only one */
445  	if (resource_type(&st->res) == IORESOURCE_MEM)
446  		return AE_ERROR;
447  
448  	if (acpi_dev_resource_memory(ares, res) ||
449  	    acpi_dev_resource_address_space(ares, &win)) {
450  
451  		if (resource_type(res) != IORESOURCE_MEM ||
452  		    resource_size(res) < sizeof(st->clk))
453  			return AE_ERROR;
454  
455  		st->res = *res;
456  		return AE_OK;
457  	}
458  
459  	return AE_ERROR;
460  }
461  
vmclock_probe_acpi(struct device * dev,struct vmclock_state * st)462  static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
463  {
464  	struct acpi_device *adev = ACPI_COMPANION(dev);
465  	acpi_status status;
466  
467  	/*
468  	 * This should never happen as this function is only called when
469  	 * has_acpi_companion(dev) is true, but the logic is sufficiently
470  	 * complex that Coverity can't see the tautology.
471  	 */
472  	if (!adev)
473  		return -ENODEV;
474  
475  	status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS,
476  				     vmclock_acpi_resources, st);
477  	if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) {
478  		dev_err(dev, "failed to get resources\n");
479  		return -ENODEV;
480  	}
481  
482  	return 0;
483  }
484  
vmclock_put_idx(void * data)485  static void vmclock_put_idx(void *data)
486  {
487  	struct vmclock_state *st = data;
488  
489  	ida_free(&vmclock_ida, st->index);
490  }
491  
vmclock_probe(struct platform_device * pdev)492  static int vmclock_probe(struct platform_device *pdev)
493  {
494  	struct device *dev = &pdev->dev;
495  	struct vmclock_state *st;
496  	int ret;
497  
498  	st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL);
499  	if (!st)
500  		return -ENOMEM;
501  
502  	if (has_acpi_companion(dev))
503  		ret = vmclock_probe_acpi(dev, st);
504  	else
505  		ret = -EINVAL; /* Only ACPI for now */
506  
507  	if (ret) {
508  		dev_info(dev, "Failed to obtain physical address: %d\n", ret);
509  		return ret;
510  	}
511  
512  	if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) {
513  		dev_info(dev, "Region too small (0x%llx)\n",
514  			 resource_size(&st->res));
515  		return -EINVAL;
516  	}
517  	st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res),
518  				MEMREMAP_WB | MEMREMAP_DEC);
519  	if (IS_ERR(st->clk)) {
520  		ret = PTR_ERR(st->clk);
521  		dev_info(dev, "failed to map shared memory\n");
522  		st->clk = NULL;
523  		return ret;
524  	}
525  
526  	if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC ||
527  	    le32_to_cpu(st->clk->size) > resource_size(&st->res) ||
528  	    le16_to_cpu(st->clk->version) != 1) {
529  		dev_info(dev, "vmclock magic fields invalid\n");
530  		return -EINVAL;
531  	}
532  
533  	ret = ida_alloc(&vmclock_ida, GFP_KERNEL);
534  	if (ret < 0)
535  		return ret;
536  
537  	st->index = ret;
538  	ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st);
539  	if (ret)
540  		return ret;
541  
542  	st->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index);
543  	if (!st->name)
544  		return -ENOMEM;
545  
546  	st->miscdev.minor = MISC_DYNAMIC_MINOR;
547  
548  	ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, st);
549  	if (ret)
550  		return ret;
551  
552  	/*
553  	 * If the structure is big enough, it can be mapped to userspace.
554  	 * Theoretically a guest OS even using larger pages could still
555  	 * use 4KiB PTEs to map smaller MMIO regions like this, but let's
556  	 * cross that bridge if/when we come to it.
557  	 */
558  	if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) {
559  		st->miscdev.fops = &vmclock_miscdev_fops;
560  		st->miscdev.name = st->name;
561  
562  		ret = misc_register(&st->miscdev);
563  		if (ret)
564  			return ret;
565  	}
566  
567  	/* If there is valid clock information, register a PTP clock */
568  	if (VMCLOCK_FIELD_PRESENT(st->clk, time_frac_sec)) {
569  		/* Can return a silent NULL, or an error. */
570  		st->ptp_clock = vmclock_ptp_register(dev, st);
571  		if (IS_ERR(st->ptp_clock)) {
572  			ret = PTR_ERR(st->ptp_clock);
573  			st->ptp_clock = NULL;
574  			return ret;
575  		}
576  	}
577  
578  	if (!st->miscdev.minor && !st->ptp_clock) {
579  		/* Neither miscdev nor PTP registered */
580  		dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n");
581  		return -ENODEV;
582  	}
583  
584  	dev_info(dev, "%s: registered %s%s%s\n", st->name,
585  		 st->miscdev.minor ? "miscdev" : "",
586  		 (st->miscdev.minor && st->ptp_clock) ? ", " : "",
587  		 st->ptp_clock ? "PTP" : "");
588  
589  	return 0;
590  }
591  
592  static const struct acpi_device_id vmclock_acpi_ids[] = {
593  	{ "AMZNC10C", 0 },
594  	{}
595  };
596  MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids);
597  
598  static struct platform_driver vmclock_platform_driver = {
599  	.probe		= vmclock_probe,
600  	.driver	= {
601  		.name	= "vmclock",
602  		.acpi_match_table = vmclock_acpi_ids,
603  	},
604  };
605  
606  module_platform_driver(vmclock_platform_driver)
607  
608  MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
609  MODULE_DESCRIPTION("PTP clock using VMCLOCK");
610  MODULE_LICENSE("GPL");
611