xref: /linux/arch/x86/xen/time.c (revision 2b8232ce512105e28453f301d1510de8363bccd1)
1 /*
2  * Xen time implementation.
3  *
4  * This is implemented in terms of a clocksource driver which uses
5  * the hypervisor clock as a nanosecond timebase, and a clockevent
6  * driver which uses the hypervisor's timer mechanism.
7  *
8  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
9  */
10 #include <linux/kernel.h>
11 #include <linux/interrupt.h>
12 #include <linux/clocksource.h>
13 #include <linux/clockchips.h>
14 #include <linux/kernel_stat.h>
15 
16 #include <asm/xen/hypervisor.h>
17 #include <asm/xen/hypercall.h>
18 
19 #include <xen/events.h>
20 #include <xen/interface/xen.h>
21 #include <xen/interface/vcpu.h>
22 
23 #include "xen-ops.h"
24 
25 #define XEN_SHIFT 22
26 
27 /* Xen may fire a timer up to this many ns early */
28 #define TIMER_SLOP	100000
29 #define NS_PER_TICK	(1000000000LL / HZ)
30 
31 static cycle_t xen_clocksource_read(void);
32 
33 /* These are periodically updated in shared_info, and then copied here. */
34 struct shadow_time_info {
35 	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
36 	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
37 	u32 tsc_to_nsec_mul;
38 	int tsc_shift;
39 	u32 version;
40 };
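/*
 * The current system time is reconstructed from a shadow snapshot as
 *
 *   now = system_timestamp +
 *         scale_delta(tsc_now - tsc_timestamp, tsc_to_nsec_mul, tsc_shift)
 *
 * i.e. the hypervisor's system time at the last update plus the TSC
 * delta since then, converted to nanoseconds with the multiplier and
 * shift supplied by Xen (see get_nsec_offset() below).
 */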
41 
42 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
43 
44 /* runstate info updated by Xen */
45 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
46 
47 /* snapshots of runstate info */
48 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
49 
50 /* unused ns of stolen and blocked time */
51 static DEFINE_PER_CPU(u64, residual_stolen);
52 static DEFINE_PER_CPU(u64, residual_blocked);
53 
54 /* return a consistent snapshot of a 64-bit time/counter value */
55 static u64 get64(const u64 *p)
56 {
57 	u64 ret;
58 
59 	if (BITS_PER_LONG < 64) {
60 		u32 *p32 = (u32 *)p;
61 		u32 h, l;
62 
63 		/*
64 		 * Read high then low, and then make sure high is
65 		 * still the same; this will only loop if low wraps
66 		 * and carries into high.
67 		 * XXX some clean way to make this endian-proof?
68 		 */
69 		do {
70 			h = p32[1];
71 			barrier();
72 			l = p32[0];
73 			barrier();
74 		} while (p32[1] != h);
75 
76 		ret = (((u64)h) << 32) | l;
77 	} else
78 		ret = *p;
79 
80 	return ret;
81 }
82 
83 /*
84  * Runstate accounting
85  */
86 static void get_runstate_snapshot(struct vcpu_runstate_info *res)
87 {
88 	u64 state_time;
89 	struct vcpu_runstate_info *state;
90 
91 	BUG_ON(preemptible());
92 
93 	state = &__get_cpu_var(runstate);
94 
95 	/*
96 	 * The runstate info is always updated by the hypervisor on
97 	 * the current CPU, so there's no need to use anything
98 	 * stronger than a compiler barrier when fetching it.
99 	 */
100 	do {
101 		state_time = get64(&state->state_entry_time);
102 		barrier();
103 		*res = *state;
104 		barrier();
105 	} while (get64(&state->state_entry_time) != state_time);
106 }
107 
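/*
 * Register the per-cpu runstate structure above as this vcpu's runstate
 * memory area, so that the hypervisor writes updated runstate times
 * straight into it.
 */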
108 static void setup_runstate_info(int cpu)
109 {
110 	struct vcpu_register_runstate_memory_area area;
111 
112 	area.addr.v = &per_cpu(runstate, cpu);
113 
114 	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
115 			       cpu, &area))
116 		BUG();
117 }
118 
119 static void do_stolen_accounting(void)
120 {
121 	struct vcpu_runstate_info state;
122 	struct vcpu_runstate_info *snap;
123 	s64 blocked, runnable, offline, stolen;
124 	cputime_t ticks;
125 
126 	get_runstate_snapshot(&state);
127 
128 	WARN_ON(state.state != RUNSTATE_running);
129 
130 	snap = &__get_cpu_var(runstate_snapshot);
131 
132 	/* work out how much time the VCPU has not been running */
133 	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
134 	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
135 	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
136 
137 	*snap = state;
138 
139 	/* Add the appropriate number of ticks of stolen time,
140 	   including any left-overs from last time.  Passing NULL to
141 	   account_steal_time accounts the time as stolen. */
142 	stolen = runnable + offline + __get_cpu_var(residual_stolen);
143 
144 	if (stolen < 0)
145 		stolen = 0;
146 
147 	ticks = 0;
148 	while (stolen >= NS_PER_TICK) {
149 		ticks++;
150 		stolen -= NS_PER_TICK;
151 	}
152 	__get_cpu_var(residual_stolen) = stolen;
153 	account_steal_time(NULL, ticks);
154 
155 	/* Add the appropriate number of ticks of blocked time,
156 	   including any left-overs from last time.  Passing idle to
157 	   account_steal_time accounts the time as idle/wait. */
158 	blocked += __get_cpu_var(residual_blocked);
159 
160 	if (blocked < 0)
161 		blocked = 0;
162 
163 	ticks = 0;
164 	while (blocked >= NS_PER_TICK) {
165 		ticks++;
166 		blocked -= NS_PER_TICK;
167 	}
168 	__get_cpu_var(residual_blocked) = blocked;
169 	account_steal_time(idle_task(smp_processor_id()), ticks);
170 }
171 
172 /*
173  * Xen sched_clock implementation.  Returns the number of unstolen
174  * nanoseconds, i.e. the nanoseconds the VCPU has spent in the
175  * RUNNING and BLOCKED states.
176  */
177 unsigned long long xen_sched_clock(void)
178 {
179 	struct vcpu_runstate_info state;
180 	cycle_t now;
181 	u64 ret;
182 	s64 offset;
183 
184 	/*
185 	 * Ideally sched_clock should be called on a per-cpu basis
186 	 * anyway, so preemption should already be disabled, but that is
187 	 * not currently the case.
188 	 */
189 	preempt_disable();
190 
191 	now = xen_clocksource_read();
192 
193 	get_runstate_snapshot(&state);
194 
195 	WARN_ON(state.state != RUNSTATE_running);
196 
197 	offset = now - state.state_entry_time;
198 	if (offset < 0)
199 		offset = 0;
200 
201 	ret = state.time[RUNSTATE_blocked] +
202 		state.time[RUNSTATE_running] +
203 		offset;
204 
205 	preempt_enable();
206 
207 	return ret;
208 }
209 
210 
211 /* Get the CPU speed from Xen */
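/*
 * Xen converts a TSC delta to nanoseconds by shifting it by tsc_shift
 * and then multiplying by the 32.32 fixed-point fraction
 * tsc_to_system_mul.  Inverting that relation gives the TSC frequency
 * in kHz: ((10^6 << 32) / tsc_to_system_mul), shifted back by tsc_shift.
 */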
212 unsigned long xen_cpu_khz(void)
213 {
214 	u64 cpu_khz = 1000000ULL << 32;
215 	const struct vcpu_time_info *info =
216 		&HYPERVISOR_shared_info->vcpu_info[0].time;
217 
218 	do_div(cpu_khz, info->tsc_to_system_mul);
219 	if (info->tsc_shift < 0)
220 		cpu_khz <<= -info->tsc_shift;
221 	else
222 		cpu_khz >>= info->tsc_shift;
223 
224 	return cpu_khz;
225 }
226 
227 /*
228  * Reads a consistent set of time-base values from Xen, into a shadow data
229  * area.
230  */
231 static unsigned get_time_values_from_xen(void)
232 {
233 	struct vcpu_time_info   *src;
234 	struct shadow_time_info *dst;
235 
236 	/* src is shared memory with the hypervisor, so we need to
237 	   make sure we get a consistent snapshot, even in the face of
238 	   being preempted. */
239 	src = &__get_cpu_var(xen_vcpu)->time;
240 	dst = &__get_cpu_var(shadow_time);
241 
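	/*
	 * Xen uses a seqlock-style protocol: the version is made odd
	 * while an update is in progress and bumped to a new even value
	 * when it completes.  Loop until we copy a snapshot under an
	 * even, unchanged version.
	 */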
242 	do {
243 		dst->version = src->version;
244 		rmb();		/* fetch version before data */
245 		dst->tsc_timestamp     = src->tsc_timestamp;
246 		dst->system_timestamp  = src->system_time;
247 		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
248 		dst->tsc_shift         = src->tsc_shift;
249 		rmb();		/* test version after fetching data */
250 	} while ((src->version & 1) | (dst->version ^ src->version));
251 
252 	return dst->version;
253 }
254 
255 /*
256  * Scale a 64-bit delta by shifting it and multiplying by a 32-bit fraction,
257  * yielding a 64-bit result.
258  */
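/*
 * For example, with shift = 0 and mul_frac = 0x80000000 (0.5 as a
 * 32.32 fixed-point fraction), a delta of 1000 cycles scales to
 * (1000 * 0x80000000) >> 32 = 500 ns.
 */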
259 static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
260 {
261 	u64 product;
262 #ifdef __i386__
263 	u32 tmp1, tmp2;
264 #endif
265 
266 	if (shift < 0)
267 		delta >>= -shift;
268 	else
269 		delta <<= shift;
270 
271 #ifdef __i386__
272 	__asm__ (
273 		"mul  %5       ; "
274 		"mov  %4,%%eax ; "
275 		"mov  %%edx,%4 ; "
276 		"mul  %5       ; "
277 		"xor  %5,%5    ; "
278 		"add  %4,%%eax ; "
279 		"adc  %5,%%edx ; "
280 		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
281 		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
282 #elif defined(__x86_64__)
283 	__asm__ (
284 		"mul %%rdx ; shrd $32,%%rdx,%%rax"
285 		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
286 #else
287 #error implement me!
288 #endif
289 
290 	return product;
291 }
292 
293 static u64 get_nsec_offset(struct shadow_time_info *shadow)
294 {
295 	u64 now, delta;
296 	now = native_read_tsc();
297 	delta = now - shadow->tsc_timestamp;
298 	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
299 }
300 
301 static cycle_t xen_clocksource_read(void)
302 {
303 	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
304 	cycle_t ret;
305 	unsigned version;
306 
307 	do {
308 		version = get_time_values_from_xen();
309 		barrier();
310 		ret = shadow->system_timestamp + get_nsec_offset(shadow);
311 		barrier();
312 	} while (version != __get_cpu_var(xen_vcpu)->time.version);
313 
314 	put_cpu_var(shadow_time);
315 
316 	return ret;
317 }
318 
319 static void xen_read_wallclock(struct timespec *ts)
320 {
321 	const struct shared_info *s = HYPERVISOR_shared_info;
322 	u32 version;
323 	u64 delta;
324 	struct timespec now;
325 
326 	/* get wallclock at system boot */
327 	do {
328 		version = s->wc_version;
329 		rmb();		/* fetch version before time */
330 		now.tv_sec  = s->wc_sec;
331 		now.tv_nsec = s->wc_nsec;
332 		rmb();		/* fetch time before checking version */
333 	} while ((s->wc_version & 1) | (version ^ s->wc_version));
334 
335 	delta = xen_clocksource_read();	/* time since system boot */
336 	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
337 
338 	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
339 	now.tv_sec = delta;
340 
341 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
342 }
343 
344 unsigned long xen_get_wallclock(void)
345 {
346 	struct timespec ts;
347 
348 	xen_read_wallclock(&ts);
349 
350 	return ts.tv_sec;
351 }
352 
353 int xen_set_wallclock(unsigned long now)
354 {
355 	/* do nothing for domU */
356 	return -1;
357 }
358 
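/*
 * With .mult == 1 << .shift, the generic (cycles * mult) >> shift
 * conversion is an identity, so the nanosecond values returned by
 * xen_clocksource_read() reach the timekeeping code unchanged.
 */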
359 static struct clocksource xen_clocksource __read_mostly = {
360 	.name = "xen",
361 	.rating = 400,
362 	.read = xen_clocksource_read,
363 	.mask = ~0,
364 	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
365 	.shift = XEN_SHIFT,
366 	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
367 };
368 
369 /*
370    Xen clockevent implementation
371 
372    Xen has two clockevent implementations:
373 
374    The old timer_op one works with all released versions of Xen prior
375    to version 3.0.4.  These versions of the hypervisor provide a
376    single-shot timer with nanosecond resolution.  However, the same
377    event channel also carries a 100Hz tick which is delivered while
378    the vcpu is running.  We don't care about or use this tick, but it
379    will cause the core time code to think the timer fired too soon,
380    and will end up resetting it each time.  It could be filtered, but
381    doing so has complications when the ktime clocksource is not yet
382    the xen clocksource (i.e. at boot time).
383 
384    The new vcpu_op-based timer interface allows the tick timer period
385    to be changed or turned off.  The tick timer is not useful as a
386    periodic timer because events are only delivered to running vcpus.
387    The one-shot timer can report when a timeout is in the past, so
388    set_next_event is capable of returning -ETIME when appropriate.
389    This interface is used when available.
390 */
391 
392 
393 /*
394   Get an absolute timeout in the hypervisor's time base.  In theory we
395   could maintain an offset between the kernel's time and the hypervisor's
396   time, and apply that to the kernel's absolute timeout.  Unfortunately
397   the hypervisor and kernel times can drift even if the kernel is using
398   the Xen clocksource, because NTP can warp the kernel's clocksource.
399 */
400 static s64 get_abs_timeout(unsigned long delta)
401 {
402 	return xen_clocksource_read() + delta;
403 }
404 
405 static void xen_timerop_set_mode(enum clock_event_mode mode,
406 				 struct clock_event_device *evt)
407 {
408 	switch (mode) {
409 	case CLOCK_EVT_MODE_PERIODIC:
410 		/* unsupported */
411 		WARN_ON(1);
412 		break;
413 
414 	case CLOCK_EVT_MODE_ONESHOT:
415 	case CLOCK_EVT_MODE_RESUME:
416 		break;
417 
418 	case CLOCK_EVT_MODE_UNUSED:
419 	case CLOCK_EVT_MODE_SHUTDOWN:
420 		HYPERVISOR_set_timer_op(0);  /* cancel timeout */
421 		break;
422 	}
423 }
424 
425 static int xen_timerop_set_next_event(unsigned long delta,
426 				      struct clock_event_device *evt)
427 {
428 	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
429 
430 	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
431 		BUG();
432 
433 	/* We may have missed the deadline, but there's no real way of
434 	   knowing for sure.  If the event was in the past, then we'll
435 	   get an immediate interrupt. */
436 
437 	return 0;
438 }
439 
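/*
 * Both Xen clockevent devices use .mult = 1 and .shift = 0, so the
 * clockevents core's ns->cycles conversion is an identity and
 * set_next_event() receives its delta directly in nanoseconds.
 */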
440 static const struct clock_event_device xen_timerop_clockevent = {
441 	.name = "xen",
442 	.features = CLOCK_EVT_FEAT_ONESHOT,
443 
444 	.max_delta_ns = 0xffffffff,
445 	.min_delta_ns = TIMER_SLOP,
446 
447 	.mult = 1,
448 	.shift = 0,
449 	.rating = 500,
450 
451 	.set_mode = xen_timerop_set_mode,
452 	.set_next_event = xen_timerop_set_next_event,
453 };
454 
455 
456 
457 static void xen_vcpuop_set_mode(enum clock_event_mode mode,
458 				struct clock_event_device *evt)
459 {
460 	int cpu = smp_processor_id();
461 
462 	switch (mode) {
463 	case CLOCK_EVT_MODE_PERIODIC:
464 		WARN_ON(1);	/* unsupported */
465 		break;
466 
467 	case CLOCK_EVT_MODE_ONESHOT:
468 		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
469 			BUG();
470 		break;
471 
472 	case CLOCK_EVT_MODE_UNUSED:
473 	case CLOCK_EVT_MODE_SHUTDOWN:
474 		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
475 		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
476 			BUG();
477 		break;
478 	case CLOCK_EVT_MODE_RESUME:
479 		break;
480 	}
481 }
482 
483 static int xen_vcpuop_set_next_event(unsigned long delta,
484 				     struct clock_event_device *evt)
485 {
486 	int cpu = smp_processor_id();
487 	struct vcpu_set_singleshot_timer single;
488 	int ret;
489 
490 	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
491 
492 	single.timeout_abs_ns = get_abs_timeout(delta);
493 	single.flags = VCPU_SSHOTTMR_future;
494 
495 	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
496 
497 	BUG_ON(ret != 0 && ret != -ETIME);
498 
499 	return ret;
500 }
501 
502 static const struct clock_event_device xen_vcpuop_clockevent = {
503 	.name = "xen",
504 	.features = CLOCK_EVT_FEAT_ONESHOT,
505 
506 	.max_delta_ns = 0xffffffff,
507 	.min_delta_ns = TIMER_SLOP,
508 
509 	.mult = 1,
510 	.shift = 0,
511 	.rating = 500,
512 
513 	.set_mode = xen_vcpuop_set_mode,
514 	.set_next_event = xen_vcpuop_set_next_event,
515 };
516 
517 static const struct clock_event_device *xen_clockevent =
518 	&xen_timerop_clockevent;
519 static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
520 
521 static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
522 {
523 	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
524 	irqreturn_t ret;
525 
526 	ret = IRQ_NONE;
527 	if (evt->event_handler) {
528 		evt->event_handler(evt);
529 		ret = IRQ_HANDLED;
530 	}
531 
532 	do_stolen_accounting();
533 
534 	return ret;
535 }
536 
537 void xen_setup_timer(int cpu)
538 {
539 	const char *name;
540 	struct clock_event_device *evt;
541 	int irq;
542 
543 	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
544 
545 	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
546 	if (!name)
547 		name = "<timer kasprintf failed>";
548 
549 	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
550 				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
551 				      name, NULL);
552 
553 	evt = &per_cpu(xen_clock_events, cpu);
554 	memcpy(evt, xen_clockevent, sizeof(*evt));
555 
556 	evt->cpumask = cpumask_of_cpu(cpu);
557 	evt->irq = irq;
558 
559 	setup_runstate_info(cpu);
560 }
561 
562 void xen_setup_cpu_clockevents(void)
563 {
564 	BUG_ON(preemptible());
565 
566 	clockevents_register_device(&__get_cpu_var(xen_clock_events));
567 }
568 
569 __init void xen_time_init(void)
570 {
571 	int cpu = smp_processor_id();
572 
573 	get_time_values_from_xen();
574 
575 	clocksource_register(&xen_clocksource);
576 
577 	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
578 		/* Successfully turned off 100Hz tick, so we have the
579 		   vcpuop-based timer interface */
580 		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
581 		xen_clockevent = &xen_vcpuop_clockevent;
582 	}
583 
584 	/* Set initial system time with full resolution */
585 	xen_read_wallclock(&xtime);
586 	set_normalized_timespec(&wall_to_monotonic,
587 				-xtime.tv_sec, -xtime.tv_nsec);
588 
589 	tsc_disable = 0;
590 
591 	xen_setup_timer(cpu);
592 	xen_setup_cpu_clockevents();
593 }
594