xref: /linux/drivers/ptp/ptp_vmclock.c (revision fcc79e1714e8c2b8e216dc3149812edd37884eef)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Virtual PTP 1588 clock for use with LM-safe VMclock device.
4  *
5  * Copyright © 2024 Amazon.com, Inc. or its affiliates.
6  */
7 
8 #include <linux/acpi.h>
9 #include <linux/device.h>
10 #include <linux/err.h>
11 #include <linux/file.h>
12 #include <linux/fs.h>
13 #include <linux/init.h>
14 #include <linux/kernel.h>
15 #include <linux/miscdevice.h>
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/platform_device.h>
19 #include <linux/slab.h>
20 
21 #include <uapi/linux/vmclock-abi.h>
22 
23 #include <linux/ptp_clock_kernel.h>
24 
25 #ifdef CONFIG_X86
26 #include <asm/pvclock.h>
27 #include <asm/kvmclock.h>
28 #endif
29 
30 #ifdef CONFIG_KVM_GUEST
31 #define SUPPORT_KVMCLOCK
32 #endif
33 
/* Allocator for the per-device index that vmclock_probe() turns into "vmclock%d". */
static DEFINE_IDA(vmclock_ida);
35 
36 ACPI_MODULE_NAME("vmclock");
37 
/*
 * Per-device driver state. Allocated (zeroed) in vmclock_probe() and looked
 * up again from the miscdevice / ptp_clock_info members via container_of().
 */
struct vmclock_state {
	/* Memory resource describing the shared structure (filled from ACPI _CRS) */
	struct resource res;
	/* Kernel mapping of the shared structure written by the hypervisor */
	struct vmclock_abi *clk;
	/* Character device offering read()/mmap() of the structure */
	struct miscdevice miscdev;
	/* Per-device copy of ptp_vmclock_info, with the name filled in */
	struct ptp_clock_info ptp_clock_info;
	/* Registered PTP clock, or NULL if none was registered */
	struct ptp_clock *ptp_clock;
	/*
	 * cs_id: clocksource ID of the counter the device's time is expressed in.
	 * sys_cs_id: clocksource ID used to pick the cross-timestamp path; may
	 * be switched at runtime in ptp_vmclock_getcrosststamp().
	 */
	enum clocksource_ids cs_id, sys_cs_id;
	/* Index allocated from vmclock_ida */
	int index;
	/* "vmclock<index>", used for both the miscdev and the PTP clock */
	char *name;
};
48 
/* Upper bound on how long the seq_count retry loops are allowed to spin. */
#define VMCLOCK_MAX_WAIT ms_to_ktime(100)

/* Require at least the flags field to be present. All else can be optional. */
#define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad)

/*
 * Evaluates to true when the 'size' advertised inside the structure _c is
 * large enough to contain the whole of member _f.
 */
#define VMCLOCK_FIELD_PRESENT(_c, _f)			  \
	(le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) +	\
				     sizeof((_c)->_f)))
57 
58 /*
59  * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64
60  * and add the fractional second part of the reference time.
61  *
62  * The result is a 128-bit value, the top 64 bits of which are seconds, and
63  * the low 64 bits are (seconds >> 64).
64  */
65 static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta,
66 					uint64_t period, uint8_t shift,
67 					uint64_t frac_sec)
68 {
69 	unsigned __int128 res = (unsigned __int128)delta * period;
70 
71 	res >>= shift;
72 	res += frac_sec;
73 	*res_hi = res >> 64;
74 	return (uint64_t)res;
75 }
76 
77 static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec)
78 {
79 	if (likely(clk->time_type == VMCLOCK_TIME_UTC))
80 		return true;
81 
82 	if (clk->time_type == VMCLOCK_TIME_TAI &&
83 	    (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) {
84 		if (sec)
85 			*sec += (int16_t)le16_to_cpu(clk->tai_offset_sec);
86 		return true;
87 	}
88 	return false;
89 }
90 
/*
 * Read the device time from the shared structure, paired with the counter
 * value it was computed from.
 *
 * @st:             device state; st->clk is the shared structure
 * @sts:            optional; when non-NULL the pre/post system timestamps
 *                  are filled in for gettimex64()
 * @system_counter: optional; when non-NULL receives the counter value used
 *                  and st->cs_id
 * @tspec:          receives the computed device time
 *
 * Returns 0 on success, -EINVAL if the TSC is unstable (x86), the clock
 * status is VMCLOCK_STATUS_UNRELIABLE or the time type is unusable, and
 * -ETIMEDOUT if no consistent snapshot could be read within
 * VMCLOCK_MAX_WAIT.
 */
static int vmclock_get_crosststamp(struct vmclock_state *st,
				   struct ptp_system_timestamp *sts,
				   struct system_counterval_t *system_counter,
				   struct timespec64 *tspec)
{
	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
	struct system_time_snapshot systime_snapshot;
	uint64_t cycle, delta, seq, frac_sec;

#ifdef CONFIG_X86
	/*
	 * We'd expect the hypervisor to know this and to report the clock
	 * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid.
	 */
	if (check_tsc_unstable())
		return -EINVAL;
#endif

	while (1) {
		/*
		 * The low bit is masked off, so if seq_count was odd when
		 * sampled here it can never compare equal to the unmasked
		 * re-read at the bottom of the loop, forcing a retry.
		 */
		seq = le32_to_cpu(st->clk->seq_count) & ~1ULL;

		/*
		 * This pairs with a write barrier in the hypervisor
		 * which populates this structure.
		 */
		virt_rmb();

		if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE)
			return -EINVAL;

		/*
		 * When invoked for gettimex64(), fill in the pre/post system
		 * times. The simple case is when system time is based on the
		 * same counter as st->cs_id, in which case all three times
		 * will be derived from the *same* counter value.
		 *
		 * If the system isn't using the same counter, then the value
		 * from ktime_get_snapshot() will still be used as pre_ts, and
		 * ptp_read_system_postts() is called to populate postts after
		 * calling get_cycles().
		 *
		 * The conversion to timespec64 happens further down, outside
		 * the seq_count loop.
		 */
		if (sts) {
			ktime_get_snapshot(&systime_snapshot);
			if (systime_snapshot.cs_id == st->cs_id) {
				cycle = systime_snapshot.cycles;
			} else {
				cycle = get_cycles();
				ptp_read_system_postts(sts);
			}
		} else {
			cycle = get_cycles();
		}

		/* Counter ticks elapsed since the reference point in the structure */
		delta = cycle - le64_to_cpu(st->clk->counter_value);

		/*
		 * Scale the elapsed ticks by the tick period and add the
		 * reference time's fractional part: whole seconds land in
		 * tv_sec, the remaining fraction is converted to nanoseconds.
		 */
		frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta,
						   le64_to_cpu(st->clk->counter_period_frac_sec),
						   st->clk->counter_period_shift,
						   le64_to_cpu(st->clk->time_frac_sec));
		tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64);
		tspec->tv_sec += le64_to_cpu(st->clk->time_sec);

		if (!tai_adjust(st->clk, &tspec->tv_sec))
			return -EINVAL;

		/*
		 * This pairs with a write barrier in the hypervisor
		 * which populates this structure.
		 */
		virt_rmb();
		/* Unchanged, even seq_count: the fields read above are consistent */
		if (seq == le32_to_cpu(st->clk->seq_count))
			break;

		if (ktime_after(ktime_get(), deadline))
			return -ETIMEDOUT;
	}

	if (system_counter) {
		system_counter->cycles = cycle;
		system_counter->cs_id = st->cs_id;
	}

	if (sts) {
		sts->pre_ts = ktime_to_timespec64(systime_snapshot.real);
		/* Same counter for system and device time: pre and post coincide */
		if (systime_snapshot.cs_id == st->cs_id)
			sts->post_ts = sts->pre_ts;
	}

	return 0;
}
184 
#ifdef SUPPORT_KVMCLOCK
/*
 * In the case where the system is using the KVM clock for timekeeping, convert
 * the TSC value into a KVM clock time in order to return a paired reading that
 * get_device_system_crosststamp() can cope with.
 *
 * @system_counter must be non-NULL: on success its cycles value is replaced
 * by the KVM clock equivalent and its cs_id is set to CSID_X86_KVM_CLK.
 *
 * Returns whatever vmclock_get_crosststamp() returned.
 */
static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st,
					    struct ptp_system_timestamp *sts,
					    struct system_counterval_t *system_counter,
					    struct timespec64 *tspec)
{
	struct pvclock_vcpu_time_info *pvti;
	unsigned int pvti_ver;
	int ret;

	preempt_disable_notrace();

	/*
	 * Look up this CPU's pvclock area only once preemption is off.
	 * Doing it earlier would let the task migrate between the lookup
	 * and the read, leaving us reading another CPU's time info.
	 */
	pvti = this_cpu_pvti();

	do {
		pvti_ver = pvclock_read_begin(pvti);

		ret = vmclock_get_crosststamp(st, sts, system_counter, tspec);
		if (ret)
			break;

		system_counter->cycles = __pvclock_read_cycles(pvti,
							       system_counter->cycles);
		system_counter->cs_id = CSID_X86_KVM_CLK;

		/*
		 * This retry should never really happen; if the TSC is
		 * stable and reliable enough across vCPUS that it is sane
		 * for the hypervisor to expose a VMCLOCK device which uses
		 * it as the reference counter, then the KVM clock should be
		 * in 'master clock mode' and basically never changed. But
		 * the KVM clock is a fickle and often broken thing, so do
		 * it "properly" just in case.
		 */
	} while (pvclock_read_retry(pvti, pvti_ver));

	preempt_enable_notrace();

	return ret;
}
#endif
229 
230 static int ptp_vmclock_get_time_fn(ktime_t *device_time,
231 				   struct system_counterval_t *system_counter,
232 				   void *ctx)
233 {
234 	struct vmclock_state *st = ctx;
235 	struct timespec64 tspec;
236 	int ret;
237 
238 #ifdef SUPPORT_KVMCLOCK
239 	if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK)
240 		ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter,
241 						       &tspec);
242 	else
243 #endif
244 		ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec);
245 
246 	if (!ret)
247 		*device_time = timespec64_to_ktime(tspec);
248 
249 	return ret;
250 }
251 
252 static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp,
253 				      struct system_device_crosststamp *xtstamp)
254 {
255 	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
256 						ptp_clock_info);
257 	int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st,
258 						NULL, xtstamp);
259 #ifdef SUPPORT_KVMCLOCK
260 	/*
261 	 * On x86, the KVM clock may be used for the system time. We can
262 	 * actually convert a TSC reading to that, and return a paired
263 	 * timestamp that get_device_system_crosststamp() *can* handle.
264 	 */
265 	if (ret == -ENODEV) {
266 		struct system_time_snapshot systime_snapshot;
267 
268 		ktime_get_snapshot(&systime_snapshot);
269 
270 		if (systime_snapshot.cs_id == CSID_X86_TSC ||
271 		    systime_snapshot.cs_id == CSID_X86_KVM_CLK) {
272 			WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id);
273 			ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn,
274 							    st, NULL, xtstamp);
275 		}
276 	}
277 #endif
278 	return ret;
279 }
280 
281 /*
282  * PTP clock operations
283  */
284 
/* Frequency adjustment is not supported by this clock; always -EOPNOTSUPP. */
static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta)
{
	return -EOPNOTSUPP;
}
289 
/* Stepping the clock is not supported; always -EOPNOTSUPP. */
static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
{
	return -EOPNOTSUPP;
}
294 
/* Setting the clock is not supported; always -EOPNOTSUPP. */
static int ptp_vmclock_settime(struct ptp_clock_info *ptp,
			   const struct timespec64 *ts)
{
	return -EOPNOTSUPP;
}
300 
301 static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts,
302 				struct ptp_system_timestamp *sts)
303 {
304 	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
305 						ptp_clock_info);
306 
307 	return vmclock_get_crosststamp(st, sts, NULL, ts);
308 }
309 
/* No ancillary features can be enabled on this clock; always -EOPNOTSUPP. */
static int ptp_vmclock_enable(struct ptp_clock_info *ptp,
			  struct ptp_clock_request *rq, int on)
{
	return -EOPNOTSUPP;
}
315 
/*
 * Template PTP clock description. vmclock_ptp_register() copies it into each
 * device's state and fills in the name. The clock is read-only: every
 * adjust/set/enable operation returns -EOPNOTSUPP and no adjustment range,
 * external timestamps, pins or PPS are advertised.
 */
static const struct ptp_clock_info ptp_vmclock_info = {
	.owner		= THIS_MODULE,
	.max_adj	= 0,
	.n_ext_ts	= 0,
	.n_pins		= 0,
	.pps		= 0,
	.adjfine	= ptp_vmclock_adjfine,
	.adjtime	= ptp_vmclock_adjtime,
	.gettimex64	= ptp_vmclock_gettimex,
	.settime64	= ptp_vmclock_settime,
	.enable		= ptp_vmclock_enable,
	.getcrosststamp = ptp_vmclock_getcrosststamp,
};
329 
330 static struct ptp_clock *vmclock_ptp_register(struct device *dev,
331 					      struct vmclock_state *st)
332 {
333 	enum clocksource_ids cs_id;
334 
335 	if (IS_ENABLED(CONFIG_ARM64) &&
336 	    st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) {
337 		/* Can we check it's the virtual counter? */
338 		cs_id = CSID_ARM_ARCH_COUNTER;
339 	} else if (IS_ENABLED(CONFIG_X86) &&
340 		   st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) {
341 		cs_id = CSID_X86_TSC;
342 	} else {
343 		return NULL;
344 	}
345 
346 	/* Only UTC, or TAI with offset */
347 	if (!tai_adjust(st->clk, NULL)) {
348 		dev_info(dev, "vmclock does not provide unambiguous UTC\n");
349 		return NULL;
350 	}
351 
352 	st->sys_cs_id = cs_id;
353 	st->cs_id = cs_id;
354 	st->ptp_clock_info = ptp_vmclock_info;
355 	strscpy(st->ptp_clock_info.name, st->name);
356 
357 	return ptp_clock_register(&st->ptp_clock_info, dev);
358 }
359 
360 static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
361 {
362 	struct vmclock_state *st = container_of(fp->private_data,
363 						struct vmclock_state, miscdev);
364 
365 	if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
366 		return -EROFS;
367 
368 	if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff)
369 		return -EINVAL;
370 
371 	if (io_remap_pfn_range(vma, vma->vm_start,
372 			       st->res.start >> PAGE_SHIFT, PAGE_SIZE,
373 			       vma->vm_page_prot))
374 		return -EAGAIN;
375 
376 	return 0;
377 }
378 
/*
 * read() handler: copy a consistent snapshot of the shared structure to
 * userspace.
 *
 * Reads are limited to the first PAGE_SIZE bytes; a position at or beyond
 * that returns 0. The copy is repeated until seq_count is unchanged (and
 * even) across it. Returns the number of bytes copied, -EFAULT if the copy
 * to userspace fails, or -ETIMEDOUT if no consistent copy could be taken
 * within VMCLOCK_MAX_WAIT.
 */
static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
				    size_t count, loff_t *ppos)
{
	struct vmclock_state *st = container_of(fp->private_data,
						struct vmclock_state, miscdev);
	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
	size_t max_count;
	uint32_t seq;

	if (*ppos >= PAGE_SIZE)
		return 0;

	/* Clamp the request to what is left of the page */
	max_count = PAGE_SIZE - *ppos;
	if (count > max_count)
		count = max_count;

	while (1) {
		/*
		 * Low bit masked off: an odd seq_count sampled here can never
		 * match the unmasked re-read below, so the copy is retried.
		 */
		seq = le32_to_cpu(st->clk->seq_count) & ~1U;
		/* Pairs with hypervisor wmb */
		virt_rmb();

		if (copy_to_user(buf, ((char *)st->clk) + *ppos, count))
			return -EFAULT;

		/* Pairs with hypervisor wmb */
		virt_rmb();
		if (seq == le32_to_cpu(st->clk->seq_count))
			break;

		if (ktime_after(ktime_get(), deadline))
			return -ETIMEDOUT;
	}

	*ppos += count;
	return count;
}
415 
416 static const struct file_operations vmclock_miscdev_fops = {
417 	.mmap = vmclock_miscdev_mmap,
418 	.read = vmclock_miscdev_read,
419 };
420 
421 /* module operations */
422 
423 static void vmclock_remove(struct platform_device *pdev)
424 {
425 	struct device *dev = &pdev->dev;
426 	struct vmclock_state *st = dev_get_drvdata(dev);
427 
428 	if (st->ptp_clock)
429 		ptp_clock_unregister(st->ptp_clock);
430 
431 	if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
432 		misc_deregister(&st->miscdev);
433 }
434 
435 static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data)
436 {
437 	struct vmclock_state *st = data;
438 	struct resource_win win;
439 	struct resource *res = &win.res;
440 
441 	if (ares->type == ACPI_RESOURCE_TYPE_END_TAG)
442 		return AE_OK;
443 
444 	/* There can be only one */
445 	if (resource_type(&st->res) == IORESOURCE_MEM)
446 		return AE_ERROR;
447 
448 	if (acpi_dev_resource_memory(ares, res) ||
449 	    acpi_dev_resource_address_space(ares, &win)) {
450 
451 		if (resource_type(res) != IORESOURCE_MEM ||
452 		    resource_size(res) < sizeof(st->clk))
453 			return AE_ERROR;
454 
455 		st->res = *res;
456 		return AE_OK;
457 	}
458 
459 	return AE_ERROR;
460 }
461 
462 static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
463 {
464 	struct acpi_device *adev = ACPI_COMPANION(dev);
465 	acpi_status status;
466 
467 	/*
468 	 * This should never happen as this function is only called when
469 	 * has_acpi_companion(dev) is true, but the logic is sufficiently
470 	 * complex that Coverity can't see the tautology.
471 	 */
472 	if (!adev)
473 		return -ENODEV;
474 
475 	status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS,
476 				     vmclock_acpi_resources, st);
477 	if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) {
478 		dev_err(dev, "failed to get resources\n");
479 		return -ENODEV;
480 	}
481 
482 	return 0;
483 }
484 
/*
 * devm action registered by vmclock_probe(): give the device's index back
 * to vmclock_ida. @data is the struct vmclock_state.
 */
static void vmclock_put_idx(void *data)
{
	struct vmclock_state *st = data;

	ida_free(&vmclock_ida, st->index);
}
491 
492 static int vmclock_probe(struct platform_device *pdev)
493 {
494 	struct device *dev = &pdev->dev;
495 	struct vmclock_state *st;
496 	int ret;
497 
498 	st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL);
499 	if (!st)
500 		return -ENOMEM;
501 
502 	if (has_acpi_companion(dev))
503 		ret = vmclock_probe_acpi(dev, st);
504 	else
505 		ret = -EINVAL; /* Only ACPI for now */
506 
507 	if (ret) {
508 		dev_info(dev, "Failed to obtain physical address: %d\n", ret);
509 		goto out;
510 	}
511 
512 	if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) {
513 		dev_info(dev, "Region too small (0x%llx)\n",
514 			 resource_size(&st->res));
515 		ret = -EINVAL;
516 		goto out;
517 	}
518 	st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res),
519 				MEMREMAP_WB | MEMREMAP_DEC);
520 	if (IS_ERR(st->clk)) {
521 		ret = PTR_ERR(st->clk);
522 		dev_info(dev, "failed to map shared memory\n");
523 		st->clk = NULL;
524 		goto out;
525 	}
526 
527 	if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC ||
528 	    le32_to_cpu(st->clk->size) > resource_size(&st->res) ||
529 	    le16_to_cpu(st->clk->version) != 1) {
530 		dev_info(dev, "vmclock magic fields invalid\n");
531 		ret = -EINVAL;
532 		goto out;
533 	}
534 
535 	ret = ida_alloc(&vmclock_ida, GFP_KERNEL);
536 	if (ret < 0)
537 		goto out;
538 
539 	st->index = ret;
540 	ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st);
541 	if (ret)
542 		goto out;
543 
544 	st->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index);
545 	if (!st->name) {
546 		ret = -ENOMEM;
547 		goto out;
548 	}
549 
550 	/*
551 	 * If the structure is big enough, it can be mapped to userspace.
552 	 * Theoretically a guest OS even using larger pages could still
553 	 * use 4KiB PTEs to map smaller MMIO regions like this, but let's
554 	 * cross that bridge if/when we come to it.
555 	 */
556 	if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) {
557 		st->miscdev.minor = MISC_DYNAMIC_MINOR;
558 		st->miscdev.fops = &vmclock_miscdev_fops;
559 		st->miscdev.name = st->name;
560 
561 		ret = misc_register(&st->miscdev);
562 		if (ret)
563 			goto out;
564 	}
565 
566 	/* If there is valid clock information, register a PTP clock */
567 	if (VMCLOCK_FIELD_PRESENT(st->clk, time_frac_sec)) {
568 		/* Can return a silent NULL, or an error. */
569 		st->ptp_clock = vmclock_ptp_register(dev, st);
570 		if (IS_ERR(st->ptp_clock)) {
571 			ret = PTR_ERR(st->ptp_clock);
572 			st->ptp_clock = NULL;
573 			vmclock_remove(pdev);
574 			goto out;
575 		}
576 	}
577 
578 	if (!st->miscdev.minor && !st->ptp_clock) {
579 		/* Neither miscdev nor PTP registered */
580 		dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n");
581 		ret = -ENODEV;
582 		goto out;
583 	}
584 
585 	dev_info(dev, "%s: registered %s%s%s\n", st->name,
586 		 st->miscdev.minor ? "miscdev" : "",
587 		 (st->miscdev.minor && st->ptp_clock) ? ", " : "",
588 		 st->ptp_clock ? "PTP" : "");
589 
590 	dev_set_drvdata(dev, st);
591 
592  out:
593 	return ret;
594 }
595 
/* ACPI hardware IDs this driver binds to; exported for module autoloading. */
static const struct acpi_device_id vmclock_acpi_ids[] = {
	{ "AMZNC10C", 0 },
	{}
};
MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids);
601 
/* Platform driver glue: matched through the ACPI ID table above. */
static struct platform_driver vmclock_platform_driver = {
	.probe		= vmclock_probe,
	.remove_new	= vmclock_remove,
	.driver	= {
		.name	= "vmclock",
		.acpi_match_table = vmclock_acpi_ids,
	},
};

/* Generates the module init/exit functions that register this driver */
module_platform_driver(vmclock_platform_driver)
612 
613 MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
614 MODULE_DESCRIPTION("PTP clock using VMCLOCK");
615 MODULE_LICENSE("GPL");
616