xref: /linux/arch/powerpc/kernel/time.c (revision 26deb04342e343ac58ab05bc7d2345ff0be9b667)
1 /*
2  * Common time routines among all ppc machines.
3  *
4  * Written by Cort Dougan (cort@cs.nmt.edu) to merge
5  * Paul Mackerras' version and mine for PReP and Pmac.
6  * MPC8xx/MBX changes by Dan Malek (dmalek@jlc.net).
7  * Converted for 64-bit by Mike Corrigan (mikejc@us.ibm.com)
8  *
9  * First round of bugfixes by Gabriel Paubert (paubert@iram.es)
10  * to make clock more stable (2.4.0-test5). The only thing
11  * that this code assumes is that the timebases have been synchronized
12  * by firmware on SMP and are never stopped (never do sleep
13  * on SMP then, nap and doze are OK).
14  *
15  * Speeded up do_gettimeofday by getting rid of references to
16  * xtime (which required locks for consistency). (mikejc@us.ibm.com)
17  *
18  * TODO (not necessarily in this file):
19  * - improve precision and reproducibility of timebase frequency
20  * measurement at boot time.
21  * - for astronomical applications: add a new function to get
22  * unambiguous timestamps even around leap seconds. This needs
23  * a new timestamp format and a good name.
24  *
25  * 1997-09-10  Updated NTP code according to technical memorandum Jan '96
26  *             "A Kernel Model for Precision Timekeeping" by Dave Mills
27  *
28  *      This program is free software; you can redistribute it and/or
29  *      modify it under the terms of the GNU General Public License
30  *      as published by the Free Software Foundation; either version
31  *      2 of the License, or (at your option) any later version.
32  */
33 
34 #include <linux/errno.h>
35 #include <linux/export.h>
36 #include <linux/sched.h>
37 #include <linux/sched/clock.h>
38 #include <linux/kernel.h>
39 #include <linux/param.h>
40 #include <linux/string.h>
41 #include <linux/mm.h>
42 #include <linux/interrupt.h>
43 #include <linux/timex.h>
44 #include <linux/kernel_stat.h>
45 #include <linux/time.h>
46 #include <linux/init.h>
47 #include <linux/profile.h>
48 #include <linux/cpu.h>
49 #include <linux/security.h>
50 #include <linux/percpu.h>
51 #include <linux/rtc.h>
52 #include <linux/jiffies.h>
53 #include <linux/posix-timers.h>
54 #include <linux/irq.h>
55 #include <linux/delay.h>
56 #include <linux/irq_work.h>
57 #include <linux/clk-provider.h>
58 #include <linux/suspend.h>
59 #include <linux/sched/cputime.h>
60 #include <linux/processor.h>
61 #include <asm/trace.h>
62 
63 #include <asm/io.h>
64 #include <asm/nvram.h>
65 #include <asm/cache.h>
66 #include <asm/machdep.h>
67 #include <linux/uaccess.h>
68 #include <asm/time.h>
69 #include <asm/prom.h>
70 #include <asm/irq.h>
71 #include <asm/div64.h>
72 #include <asm/smp.h>
73 #include <asm/vdso_datapage.h>
74 #include <asm/firmware.h>
75 #include <asm/asm-prototypes.h>
76 
77 /* powerpc clocksource/clockevent code */
78 
79 #include <linux/clockchips.h>
80 #include <linux/timekeeper_internal.h>
81 
82 static u64 rtc_read(struct clocksource *);
83 static struct clocksource clocksource_rtc = {
84 	.name         = "rtc",
85 	.rating       = 400,
86 	.flags        = CLOCK_SOURCE_IS_CONTINUOUS,
87 	.mask         = CLOCKSOURCE_MASK(64),
88 	.read         = rtc_read,
89 };
90 
91 static u64 timebase_read(struct clocksource *);
92 static struct clocksource clocksource_timebase = {
93 	.name         = "timebase",
94 	.rating       = 400,
95 	.flags        = CLOCK_SOURCE_IS_CONTINUOUS,
96 	.mask         = CLOCKSOURCE_MASK(64),
97 	.read         = timebase_read,
98 };
99 
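/*
 * Prior to the large decrementer (ISA v3.0) the decrementer is a signed
 * 32-bit counter, so 2^31 - 1 is the largest interval we can program.
 */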
100 #define DECREMENTER_DEFAULT_MAX 0x7FFFFFFF
101 u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
102 
103 static int decrementer_set_next_event(unsigned long evt,
104 				      struct clock_event_device *dev);
105 static int decrementer_shutdown(struct clock_event_device *evt);
106 
107 struct clock_event_device decrementer_clockevent = {
108 	.name			= "decrementer",
109 	.rating			= 200,
110 	.irq			= 0,
111 	.set_next_event		= decrementer_set_next_event,
112 	.set_state_oneshot_stopped = decrementer_shutdown,
113 	.set_state_shutdown	= decrementer_shutdown,
114 	.tick_resume		= decrementer_shutdown,
115 	.features		= CLOCK_EVT_FEAT_ONESHOT |
116 				  CLOCK_EVT_FEAT_C3STOP,
117 };
118 EXPORT_SYMBOL(decrementer_clockevent);
119 
120 DEFINE_PER_CPU(u64, decrementers_next_tb);
121 static DEFINE_PER_CPU(struct clock_event_device, decrementers);
122 
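/* An "xsec" is 2^-20 of a second; stamp_xsec below is kept in these units
 * for legacy systemcfg/vdso users. */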
123 #define XSEC_PER_SEC (1024*1024)
124 
125 #ifdef CONFIG_PPC64
126 #define SCALE_XSEC(xsec, max)	(((xsec) * max) / XSEC_PER_SEC)
127 #else
128 /* compute ((xsec << 12) * max) >> 32 */
129 #define SCALE_XSEC(xsec, max)	mulhwu((xsec) << 12, max)
130 #endif
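/* Both variants compute (xsec * max) / 2^20, e.g.
 * SCALE_XSEC(XSEC_PER_SEC / 2, 1000) == 500. */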
131 
132 unsigned long tb_ticks_per_jiffy;
133 unsigned long tb_ticks_per_usec = 100; /* sane default */
134 EXPORT_SYMBOL(tb_ticks_per_usec);
135 unsigned long tb_ticks_per_sec;
136 EXPORT_SYMBOL(tb_ticks_per_sec);	/* for cputime_t conversions */
137 
138 DEFINE_SPINLOCK(rtc_lock);
139 EXPORT_SYMBOL_GPL(rtc_lock);
140 
141 static u64 tb_to_ns_scale __read_mostly;
142 static unsigned tb_to_ns_shift __read_mostly;
143 static u64 boot_tb __read_mostly;
144 
145 extern struct timezone sys_tz;
146 static long timezone_offset;
147 
148 unsigned long ppc_proc_freq;
149 EXPORT_SYMBOL_GPL(ppc_proc_freq);
150 unsigned long ppc_tb_freq;
151 EXPORT_SYMBOL_GPL(ppc_tb_freq);
152 
153 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
154 /*
155  * Factor for converting from cputime_t (timebase ticks) to
156  * microseconds. This is stored as 0.64 fixed-point binary fraction.
157  */
158 u64 __cputime_usec_factor;
159 EXPORT_SYMBOL(__cputime_usec_factor);
160 
161 #ifdef CONFIG_PPC_SPLPAR
162 void (*dtl_consumer)(struct dtl_entry *, u64);
163 #endif
164 
165 static void calc_cputime_factors(void)
166 {
167 	struct div_result res;
168 
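	/*
	 * (10^6 << 64) / tb_ticks_per_sec; the timebase runs faster than
	 * 1 MHz, so the quotient fits in result_low and is exactly the
	 * 0.64 fixed-point value of 10^6 / tb_ticks_per_sec.
	 */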
169 	div128_by_32(1000000, 0, tb_ticks_per_sec, &res);
170 	__cputime_usec_factor = res.result_low;
171 }
172 
173 /*
174  * Read the SPURR on systems that have it, otherwise the PURR,
175  * or if that doesn't exist return the timebase value passed in.
176  */
177 static inline unsigned long read_spurr(unsigned long tb)
178 {
179 	if (cpu_has_feature(CPU_FTR_SPURR))
180 		return mfspr(SPRN_SPURR);
181 	if (cpu_has_feature(CPU_FTR_PURR))
182 		return mfspr(SPRN_PURR);
183 	return tb;
184 }
185 
186 #ifdef CONFIG_PPC_SPLPAR
187 
188 /*
189  * Scan the dispatch trace log and count up the stolen time.
190  * Should be called with interrupts disabled.
191  */
192 static u64 scan_dispatch_log(u64 stop_tb)
193 {
194 	u64 i = local_paca->dtl_ridx;
195 	struct dtl_entry *dtl = local_paca->dtl_curr;
196 	struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
197 	struct lppaca *vpa = local_paca->lppaca_ptr;
198 	u64 tb_delta;
199 	u64 stolen = 0;
200 	u64 dtb;
201 
202 	if (!dtl)
203 		return 0;
204 
205 	if (i == be64_to_cpu(vpa->dtl_idx))
206 		return 0;
207 	while (i < be64_to_cpu(vpa->dtl_idx)) {
208 		dtb = be64_to_cpu(dtl->timebase);
209 		tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) +
210 			be32_to_cpu(dtl->ready_to_enqueue_time);
211 		barrier();
212 		if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
213 			/* buffer has overflowed */
214 			i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
215 			dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
216 			continue;
217 		}
218 		if (dtb > stop_tb)
219 			break;
220 		if (dtl_consumer)
221 			dtl_consumer(dtl, i);
222 		stolen += tb_delta;
223 		++i;
224 		++dtl;
225 		if (dtl == dtl_end)
226 			dtl = local_paca->dispatch_log;
227 	}
228 	local_paca->dtl_ridx = i;
229 	local_paca->dtl_curr = dtl;
230 	return stolen;
231 }
232 
233 /*
234  * Accumulate stolen time by scanning the dispatch trace log.
235  * Called on entry from user mode.
236  */
237 void accumulate_stolen_time(void)
238 {
239 	u64 sst, ust;
240 	unsigned long save_irq_soft_mask = irq_soft_mask_return();
241 	struct cpu_accounting_data *acct = &local_paca->accounting;
242 
243 	/* We are called early in the exception entry, before
244 	 * soft/hard_enabled are sync'ed to the expected state
245 	 * for the exception. We are hard disabled but the PACA
246 	 * needs to reflect that so various debug stuff doesn't
247 	 * complain.
248 	 */
249 	irq_soft_mask_set(IRQS_DISABLED);
250 
251 	sst = scan_dispatch_log(acct->starttime_user);
252 	ust = scan_dispatch_log(acct->starttime);
253 	acct->stime -= sst;
254 	acct->utime -= ust;
255 	acct->steal_time += ust + sst;
256 
257 	irq_soft_mask_set(save_irq_soft_mask);
258 }
259 
260 static inline u64 calculate_stolen_time(u64 stop_tb)
261 {
262 	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
263 		return 0;
264 
265 	if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx))
266 		return scan_dispatch_log(stop_tb);
267 
268 	return 0;
269 }
270 
271 #else /* CONFIG_PPC_SPLPAR */
272 static inline u64 calculate_stolen_time(u64 stop_tb)
273 {
274 	return 0;
275 }
276 
277 #endif /* CONFIG_PPC_SPLPAR */
278 
279 /*
280  * Account time for a transition between system, hard irq
281  * or soft irq state.
282  */
283 static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct,
284 					unsigned long now, unsigned long stime)
285 {
286 	unsigned long stime_scaled = 0;
287 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
288 	unsigned long nowscaled, deltascaled;
289 	unsigned long utime, utime_scaled;
290 
291 	nowscaled = read_spurr(now);
292 	deltascaled = nowscaled - acct->startspurr;
293 	acct->startspurr = nowscaled;
294 	utime = acct->utime - acct->utime_sspurr;
295 	acct->utime_sspurr = acct->utime;
296 
297 	/*
298 	 * Because we don't read the SPURR on every kernel entry/exit,
299 	 * deltascaled includes both user and system SPURR ticks.
300 	 * Apportion these ticks to system SPURR ticks and user
301 	 * SPURR ticks in the same ratio as the system time (delta)
302 	 * and user time (udelta) values obtained from the timebase
303 	 * over the same interval.  The system ticks get accounted here;
304 	 * the user ticks get saved up in paca->user_time_scaled to be
305 	 * used by account_process_tick.
306 	 */
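	/* E.g. stime = 30, utime = 10, deltascaled = 60 ticks gives
	 * stime_scaled = 45 and utime_scaled = 15. */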
307 	stime_scaled = stime;
308 	utime_scaled = utime;
309 	if (deltascaled != stime + utime) {
310 		if (utime) {
311 			stime_scaled = deltascaled * stime / (stime + utime);
312 			utime_scaled = deltascaled - stime_scaled;
313 		} else {
314 			stime_scaled = deltascaled;
315 		}
316 	}
317 	acct->utime_scaled += utime_scaled;
318 #endif
319 
320 	return stime_scaled;
321 }
322 
323 static unsigned long vtime_delta(struct task_struct *tsk,
324 				 unsigned long *stime_scaled,
325 				 unsigned long *steal_time)
326 {
327 	unsigned long now, stime;
328 	struct cpu_accounting_data *acct = get_accounting(tsk);
329 
330 	WARN_ON_ONCE(!irqs_disabled());
331 
332 	now = mftb();
333 	stime = now - acct->starttime;
334 	acct->starttime = now;
335 
336 	*stime_scaled = vtime_delta_scaled(acct, now, stime);
337 
338 	*steal_time = calculate_stolen_time(now);
339 
340 	return stime;
341 }
342 
343 void vtime_account_system(struct task_struct *tsk)
344 {
345 	unsigned long stime, stime_scaled, steal_time;
346 	struct cpu_accounting_data *acct = get_accounting(tsk);
347 
348 	stime = vtime_delta(tsk, &stime_scaled, &steal_time);
349 
350 	stime -= min(stime, steal_time);
351 	acct->steal_time += steal_time;
352 
353 	if ((tsk->flags & PF_VCPU) && !irq_count()) {
354 		acct->gtime += stime;
355 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
356 		acct->utime_scaled += stime_scaled;
357 #endif
358 	} else {
359 		if (hardirq_count())
360 			acct->hardirq_time += stime;
361 		else if (in_serving_softirq())
362 			acct->softirq_time += stime;
363 		else
364 			acct->stime += stime;
365 
366 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
367 		acct->stime_scaled += stime_scaled;
368 #endif
369 	}
370 }
371 EXPORT_SYMBOL_GPL(vtime_account_system);
372 
373 void vtime_account_idle(struct task_struct *tsk)
374 {
375 	unsigned long stime, stime_scaled, steal_time;
376 	struct cpu_accounting_data *acct = get_accounting(tsk);
377 
378 	stime = vtime_delta(tsk, &stime_scaled, &steal_time);
379 	acct->idle_time += stime + steal_time;
380 }
381 
382 static void vtime_flush_scaled(struct task_struct *tsk,
383 			       struct cpu_accounting_data *acct)
384 {
385 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
386 	if (acct->utime_scaled)
387 		tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled);
388 	if (acct->stime_scaled)
389 		tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled);
390 
391 	acct->utime_scaled = 0;
392 	acct->utime_sspurr = 0;
393 	acct->stime_scaled = 0;
394 #endif
395 }
396 
397 /*
398  * Account the whole cputime accumulated in the paca
399  * Must be called with interrupts disabled.
400  * Assumes that vtime_account_system/idle() has been called
401  * recently (i.e. since the last entry from usermode) so that
402  * get_paca()->user_time_scaled is up to date.
403  */
404 void vtime_flush(struct task_struct *tsk)
405 {
406 	struct cpu_accounting_data *acct = get_accounting(tsk);
407 
408 	if (acct->utime)
409 		account_user_time(tsk, cputime_to_nsecs(acct->utime));
410 
411 	if (acct->gtime)
412 		account_guest_time(tsk, cputime_to_nsecs(acct->gtime));
413 
414 	if (IS_ENABLED(CONFIG_PPC_SPLPAR) && acct->steal_time) {
415 		account_steal_time(cputime_to_nsecs(acct->steal_time));
416 		acct->steal_time = 0;
417 	}
418 
419 	if (acct->idle_time)
420 		account_idle_time(cputime_to_nsecs(acct->idle_time));
421 
422 	if (acct->stime)
423 		account_system_index_time(tsk, cputime_to_nsecs(acct->stime),
424 					  CPUTIME_SYSTEM);
425 
426 	if (acct->hardirq_time)
427 		account_system_index_time(tsk, cputime_to_nsecs(acct->hardirq_time),
428 					  CPUTIME_IRQ);
429 	if (acct->softirq_time)
430 		account_system_index_time(tsk, cputime_to_nsecs(acct->softirq_time),
431 					  CPUTIME_SOFTIRQ);
432 
433 	vtime_flush_scaled(tsk, acct);
434 
435 	acct->utime = 0;
436 	acct->gtime = 0;
437 	acct->idle_time = 0;
438 	acct->stime = 0;
439 	acct->hardirq_time = 0;
440 	acct->softirq_time = 0;
441 }
442 
443 #else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
444 #define calc_cputime_factors()
445 #endif
446 
447 void __delay(unsigned long loops)
448 {
449 	unsigned long start;
450 	int diff;
451 
452 	spin_begin();
453 	if (__USE_RTC()) {
454 		start = get_rtcl();
455 		do {
456 			/* the RTCL register wraps at 1000000000 */
457 			diff = get_rtcl() - start;
458 			if (diff < 0)
459 				diff += 1000000000;
460 			spin_cpu_relax();
461 		} while (diff < loops);
462 	} else {
463 		start = get_tbl();
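		/* Unsigned subtraction stays correct across timebase-low wraparound */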
464 		while (get_tbl() - start < loops)
465 			spin_cpu_relax();
466 	}
467 	spin_end();
468 }
469 EXPORT_SYMBOL(__delay);
470 
471 void udelay(unsigned long usecs)
472 {
473 	__delay(tb_ticks_per_usec * usecs);
474 }
475 EXPORT_SYMBOL(udelay);
476 
477 #ifdef CONFIG_SMP
478 unsigned long profile_pc(struct pt_regs *regs)
479 {
480 	unsigned long pc = instruction_pointer(regs);
481 
482 	if (in_lock_functions(pc))
483 		return regs->link;
484 
485 	return pc;
486 }
487 EXPORT_SYMBOL(profile_pc);
488 #endif
489 
490 #ifdef CONFIG_IRQ_WORK
491 
492 /*
493  * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
494  */
495 #ifdef CONFIG_PPC64
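/* GPR 13 always holds the PACA pointer in the 64-bit kernel, so the flag
 * can be accessed with a single load or store relative to r13. */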
496 static inline unsigned long test_irq_work_pending(void)
497 {
498 	unsigned long x;
499 
500 	asm volatile("lbz %0,%1(13)"
501 		: "=r" (x)
502 		: "i" (offsetof(struct paca_struct, irq_work_pending)));
503 	return x;
504 }
505 
506 static inline void set_irq_work_pending_flag(void)
507 {
508 	asm volatile("stb %0,%1(13)" : :
509 		"r" (1),
510 		"i" (offsetof(struct paca_struct, irq_work_pending)));
511 }
512 
513 static inline void clear_irq_work_pending(void)
514 {
515 	asm volatile("stb %0,%1(13)" : :
516 		"r" (0),
517 		"i" (offsetof(struct paca_struct, irq_work_pending)));
518 }
519 
520 void arch_irq_work_raise(void)
521 {
522 	preempt_disable();
523 	set_irq_work_pending_flag();
524 	/*
525 	 * Non-nmi code running with interrupts disabled will replay
526 	 * irq_happened before it re-enables interrupts, so set the
527 	 * decrementer there instead of causing a hardware exception
528 	 * which would immediately hit the masked interrupt handler
529 	 * and have the net effect of setting the decrementer in
530 	 * irq_happened.
531 	 *
532 	 * NMI interrupts can not check this when they return, so the
533 	 * decrementer hardware exception is raised, which will fire
534 	 * when interrupts are next enabled.
535 	 *
536 	 * BookE does not support this yet, it must audit all NMI
537 	 * interrupt handlers to ensure they call nmi_enter() so this
538 	 * check would be correct.
539 	 */
540 	if (IS_ENABLED(CONFIG_BOOKE) || !irqs_disabled() || in_nmi()) {
541 		set_dec(1);
542 	} else {
543 		hard_irq_disable();
544 		local_paca->irq_happened |= PACA_IRQ_DEC;
545 	}
546 	preempt_enable();
547 }
548 
549 #else /* 32-bit */
550 
551 DEFINE_PER_CPU(u8, irq_work_pending);
552 
553 #define set_irq_work_pending_flag()	__this_cpu_write(irq_work_pending, 1)
554 #define test_irq_work_pending()		__this_cpu_read(irq_work_pending)
555 #define clear_irq_work_pending()	__this_cpu_write(irq_work_pending, 0)
556 
557 void arch_irq_work_raise(void)
558 {
559 	preempt_disable();
560 	set_irq_work_pending_flag();
561 	set_dec(1);
562 	preempt_enable();
563 }
564 
565 #endif /* 32 vs 64 bit */
566 
567 #else  /* CONFIG_IRQ_WORK */
568 
569 #define test_irq_work_pending()	0
570 #define clear_irq_work_pending()
571 
572 #endif /* CONFIG_IRQ_WORK */
573 
574 /*
575  * timer_interrupt - gets called when the decrementer overflows,
576  * with interrupts disabled.
577  */
578 void timer_interrupt(struct pt_regs *regs)
579 {
580 	struct clock_event_device *evt = this_cpu_ptr(&decrementers);
581 	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
582 	struct pt_regs *old_regs;
583 	u64 now;
584 
585 	/* Some implementations of hotplug will get timer interrupts while
586 	 * offline; just ignore these.  We also need to set
587 	 * decrementers_next_tb to the maximum so that __check_irq_replay
588 	 * doesn't replay the timer interrupt on return, otherwise we'll trap
589 	 * here infinitely :(
590 	 */
591 	if (unlikely(!cpu_online(smp_processor_id()))) {
592 		*next_tb = ~(u64)0;
593 		set_dec(decrementer_max);
594 		return;
595 	}
596 
597 	/* Ensure a positive value is written to the decrementer, or else
598 	 * some CPUs will continue to take decrementer exceptions. When the
599 	 * PPC_WATCHDOG (decrementer based) is configured, keep this at most
600 	 * 31 bits, which is about 4 seconds on most systems, giving
601 	 * the watchdog a chance of catching timer interrupt hard lockups.
602 	 */
603 	if (IS_ENABLED(CONFIG_PPC_WATCHDOG))
604 		set_dec(0x7fffffff);
605 	else
606 		set_dec(decrementer_max);
607 
608 	/* Conditionally hard-enable interrupts now that the DEC has been
609 	 * bumped to its maximum value
610 	 */
611 	may_hard_irq_enable();
612 
613 
614 #if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC)
615 	if (atomic_read(&ppc_n_lost_interrupts) != 0)
616 		do_IRQ(regs);
617 #endif
618 
619 	old_regs = set_irq_regs(regs);
620 	irq_enter();
621 	trace_timer_interrupt_entry(regs);
622 
623 	if (test_irq_work_pending()) {
624 		clear_irq_work_pending();
625 		irq_work_run();
626 	}
627 
628 	now = get_tb_or_rtc();
629 	if (now >= *next_tb) {
630 		*next_tb = ~(u64)0;
631 		if (evt->event_handler)
632 			evt->event_handler(evt);
633 		__this_cpu_inc(irq_stat.timer_irqs_event);
634 	} else {
635 		now = *next_tb - now;
636 		if (now <= decrementer_max)
637 			set_dec(now);
638 		/* We may have raced with new irq work */
639 		if (test_irq_work_pending())
640 			set_dec(1);
641 		__this_cpu_inc(irq_stat.timer_irqs_others);
642 	}
643 
644 	trace_timer_interrupt_exit(regs);
645 	irq_exit();
646 	set_irq_regs(old_regs);
647 }
648 EXPORT_SYMBOL(timer_interrupt);
649 
650 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
651 void timer_broadcast_interrupt(void)
652 {
653 	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
654 
655 	*next_tb = ~(u64)0;
656 	tick_receive_broadcast();
657 	__this_cpu_inc(irq_stat.broadcast_irqs_event);
658 }
659 #endif
660 
661 /*
662  * Hypervisor decrementer interrupts shouldn't occur but are sometimes
663  * left pending on exit from a KVM guest.  We don't need to do anything
664  * to clear them, as they are edge-triggered.
665  */
666 void hdec_interrupt(struct pt_regs *regs)
667 {
668 }
669 
670 #ifdef CONFIG_SUSPEND
671 static void generic_suspend_disable_irqs(void)
672 {
673 	/* Disable the decrementer, so that it doesn't interfere
674 	 * with suspending.
675 	 */
676 
677 	set_dec(decrementer_max);
678 	local_irq_disable();
679 	set_dec(decrementer_max);
680 }
681 
682 static void generic_suspend_enable_irqs(void)
683 {
684 	local_irq_enable();
685 }
686 
687 /* Overrides the weak version in kernel/power/main.c */
688 void arch_suspend_disable_irqs(void)
689 {
690 	if (ppc_md.suspend_disable_irqs)
691 		ppc_md.suspend_disable_irqs();
692 	generic_suspend_disable_irqs();
693 }
694 
695 /* Overrides the weak version in kernel/power/main.c */
696 void arch_suspend_enable_irqs(void)
697 {
698 	generic_suspend_enable_irqs();
699 	if (ppc_md.suspend_enable_irqs)
700 		ppc_md.suspend_enable_irqs();
701 }
702 #endif
703 
704 unsigned long long tb_to_ns(unsigned long long ticks)
705 {
706 	return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift;
707 }
708 EXPORT_SYMBOL_GPL(tb_to_ns);
709 
710 /*
711  * Scheduler clock - returns current time in nanosec units.
712  *
713  * Note: mulhdu(a, b) (multiply high double unsigned) returns
714  * the high 64 bits of a * b, i.e. (a * b) >> 64, where a and b
715  * are 64-bit unsigned numbers.
716  */
717 notrace unsigned long long sched_clock(void)
718 {
719 	if (__USE_RTC())
720 		return get_rtc();
721 	return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
722 }
723 
724 
725 #ifdef CONFIG_PPC_PSERIES
726 
727 /*
728  * Running clock - attempts to give a view of time passing for a virtualised
729  * kernel.
730  * Uses the VTB register if available, otherwise a next-best guess.
731  */
732 unsigned long long running_clock(void)
733 {
734 	/*
735 	 * Don't read the VTB as a host, since KVM does not switch the host
736 	 * timebase into the VTB when it takes a guest off the CPU; reading the
737 	 * VTB would then return the last switched-out guest's VTB.
738 	 *
739 	 * Host kernels are often compiled with CONFIG_PPC_PSERIES enabled, so it
740 	 * would be unsafe to rely only on the #ifdef above.
741 	 */
742 	if (firmware_has_feature(FW_FEATURE_LPAR) &&
743 	    cpu_has_feature(CPU_FTR_ARCH_207S))
744 		return mulhdu(get_vtb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
745 
746 	/*
747 	 * This is the next-best approximation without a VTB.
748 	 * On a bare-metal host there should never be any stolen time, and on
749 	 * a host which doesn't do any virtualisation TB *should* equal VTB,
750 	 * so it makes no difference anyway.
751 	 */
752 	return local_clock() - kcpustat_this_cpu->cpustat[CPUTIME_STEAL];
753 }
754 #endif
755 
756 static int __init get_freq(char *name, int cells, unsigned long *val)
757 {
758 	struct device_node *cpu;
759 	const __be32 *fp;
760 	int found = 0;
761 
762 	/* The cpu node should have timebase and clock frequency properties */
763 	cpu = of_find_node_by_type(NULL, "cpu");
764 
765 	if (cpu) {
766 		fp = of_get_property(cpu, name, NULL);
767 		if (fp) {
768 			found = 1;
769 			*val = of_read_ulong(fp, cells);
770 		}
771 
772 		of_node_put(cpu);
773 	}
774 
775 	return found;
776 }
777 
778 static void start_cpu_decrementer(void)
779 {
780 #if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
781 	unsigned int tcr;
782 
783 	/* Clear any pending timer interrupts */
784 	mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);
785 
786 	tcr = mfspr(SPRN_TCR);
787 	/*
788 	 * The watchdog may have already been enabled by u-boot. So leave
789 	 * TCR[WP] (Watchdog Period) alone.
790 	 */
791 	tcr &= TCR_WP_MASK;	/* Clear all bits except for TCR[WP] */
792 	tcr |= TCR_DIE;		/* Enable decrementer */
793 	mtspr(SPRN_TCR, tcr);
794 #endif
795 }
796 
797 void __init generic_calibrate_decr(void)
798 {
799 	ppc_tb_freq = DEFAULT_TB_FREQ;		/* hardcoded default */
800 
801 	if (!get_freq("ibm,extended-timebase-frequency", 2, &ppc_tb_freq) &&
802 	    !get_freq("timebase-frequency", 1, &ppc_tb_freq)) {
803 
804 		printk(KERN_ERR "WARNING: Estimating decrementer frequency "
805 				"(not found)\n");
806 	}
807 
808 	ppc_proc_freq = DEFAULT_PROC_FREQ;	/* hardcoded default */
809 
810 	if (!get_freq("ibm,extended-clock-frequency", 2, &ppc_proc_freq) &&
811 	    !get_freq("clock-frequency", 1, &ppc_proc_freq)) {
812 
813 		printk(KERN_ERR "WARNING: Estimating processor frequency "
814 				"(not found)\n");
815 	}
816 }
817 
818 int update_persistent_clock64(struct timespec64 now)
819 {
820 	struct rtc_time tm;
821 
822 	if (!ppc_md.set_rtc_time)
823 		return -ENODEV;
824 
825 	rtc_time64_to_tm(now.tv_sec + 1 + timezone_offset, &tm);
826 
827 	return ppc_md.set_rtc_time(&tm);
828 }
829 
830 static void __read_persistent_clock(struct timespec64 *ts)
831 {
832 	struct rtc_time tm;
833 	static int first = 1;
834 
835 	ts->tv_nsec = 0;
836 	/* XXX this is a little fragile but will work okay in the short term */
837 	if (first) {
838 		first = 0;
839 		if (ppc_md.time_init)
840 			timezone_offset = ppc_md.time_init();
841 
842 		/* get_boot_time() isn't guaranteed to be safe to call late */
843 		if (ppc_md.get_boot_time) {
844 			ts->tv_sec = ppc_md.get_boot_time() - timezone_offset;
845 			return;
846 		}
847 	}
848 	if (!ppc_md.get_rtc_time) {
849 		ts->tv_sec = 0;
850 		return;
851 	}
852 	ppc_md.get_rtc_time(&tm);
853 
854 	ts->tv_sec = rtc_tm_to_time64(&tm);
855 }
856 
857 void read_persistent_clock64(struct timespec64 *ts)
858 {
859 	__read_persistent_clock(ts);
860 
861 	/* Sanitize it in case the real time clock is set before the epoch */
862 	if (ts->tv_sec < 0) {
863 		ts->tv_sec = 0;
864 		ts->tv_nsec = 0;
865 	}
866 
867 }
868 
869 /* clocksource code */
870 static notrace u64 rtc_read(struct clocksource *cs)
871 {
872 	return (u64)get_rtc();
873 }
874 
875 static notrace u64 timebase_read(struct clocksource *cs)
876 {
877 	return (u64)get_tb();
878 }
879 
880 
881 void update_vsyscall(struct timekeeper *tk)
882 {
883 	struct timespec xt;
884 	struct clocksource *clock = tk->tkr_mono.clock;
885 	u32 mult = tk->tkr_mono.mult;
886 	u32 shift = tk->tkr_mono.shift;
887 	u64 cycle_last = tk->tkr_mono.cycle_last;
888 	u64 new_tb_to_xs, new_stamp_xsec;
889 	u64 frac_sec;
890 
891 	if (clock != &clocksource_timebase)
892 		return;
893 
894 	xt.tv_sec = tk->xtime_sec;
895 	xt.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
896 
897 	/* Make userspace gettimeofday spin until we're done. */
898 	++vdso_data->tb_update_count;
899 	smp_mb();
900 
901 	/*
902 	 * This computes ((2^20 / 1e9) * mult) >> shift as a
903 	 * 0.64 fixed-point fraction.
904 	 * The computation in the else clause below won't overflow
905 	 * (as long as the timebase frequency is >= 1.049 MHz)
906 	 * but loses precision because we lose the low bits of the constant
907 	 * in the shift.  Note that 19342813113834067 ~= 2^(20+64) / 1e9.
908 	 * For a shift of 24 the error is about 0.5e-9, or about 0.5ns
909 	 * over a second.  (Shift values are usually 22, 23 or 24.)
910 	 * For high frequency clocks such as the 512MHz timebase clock
911 	 * on POWER[6789], the mult value is small (e.g. 32768000)
912 	 * and so we can shift the constant by 16 initially
913 	 * (295147905179 ~= 2^(20+64-16) / 1e9) and then do the
914 	 * remaining shifts after the multiplication, which gives a
915 	 * more accurate result (e.g. with mult = 32768000, shift = 24,
916 	 * the error is only about 1.2e-12, or 0.7ns over 10 minutes).
917 	 */
918 	if (mult <= 62500000 && clock->shift >= 16)
919 		new_tb_to_xs = ((u64) mult * 295147905179ULL) >> (clock->shift - 16);
920 	else
921 		new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift);
922 
923 	/*
924 	 * Compute the fractional second in units of 2^-32 seconds.
925 	 * The fractional second is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift
926 	 * in nanoseconds, so multiplying that by 2^32 / 1e9 gives
927 	 * it in units of 2^-32 seconds.
928 	 * We assume shift <= 32 because clocks_calc_mult_shift()
929 	 * generates shift values in the range 0 - 32.
930 	 */
931 	frac_sec = tk->tkr_mono.xtime_nsec << (32 - shift);
932 	do_div(frac_sec, NSEC_PER_SEC);
933 
934 	/*
935 	 * Work out new stamp_xsec value for any legacy users of systemcfg.
936 	 * stamp_xsec is in units of 2^-20 seconds.
937 	 */
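	/* frac_sec is in 2^-32 s units; >> 12 converts it to 2^-20 s (xsec). */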
938 	new_stamp_xsec = frac_sec >> 12;
939 	new_stamp_xsec += tk->xtime_sec * XSEC_PER_SEC;
940 
941 	/*
942 	 * tb_update_count is used to allow the userspace gettimeofday code
943 	 * to assure itself that it sees a consistent view of the tb_to_xs and
944 	 * stamp_xsec variables.  It reads the tb_update_count, then reads
945 	 * tb_to_xs and stamp_xsec and then reads tb_update_count again.  If
946 	 * the two values of tb_update_count match and are even then the
947 	 * tb_to_xs and stamp_xsec values are consistent.  If not, then it
948 	 * loops back and reads them again until this criteria is met.
949 	 */
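	/*
	 * A minimal sketch of such a reader (illustrative only, not the
	 * actual VDSO implementation):
	 *
	 *	do {
	 *		seq = vdso_data->tb_update_count;
	 *		smp_rmb();
	 *		xs   = vdso_data->tb_to_xs;
	 *		xsec = vdso_data->stamp_xsec;
	 *		smp_rmb();
	 *	} while ((seq & 1) || seq != vdso_data->tb_update_count);
	 */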
950 	vdso_data->tb_orig_stamp = cycle_last;
951 	vdso_data->stamp_xsec = new_stamp_xsec;
952 	vdso_data->tb_to_xs = new_tb_to_xs;
953 	vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec;
954 	vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec;
955 	vdso_data->stamp_xtime = xt;
956 	vdso_data->stamp_sec_fraction = frac_sec;
957 	smp_wmb();
958 	++(vdso_data->tb_update_count);
959 }
960 
961 void update_vsyscall_tz(void)
962 {
963 	vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
964 	vdso_data->tz_dsttime = sys_tz.tz_dsttime;
965 }
966 
967 static void __init clocksource_init(void)
968 {
969 	struct clocksource *clock;
970 
971 	if (__USE_RTC())
972 		clock = &clocksource_rtc;
973 	else
974 		clock = &clocksource_timebase;
975 
976 	if (clocksource_register_hz(clock, tb_ticks_per_sec)) {
977 		printk(KERN_ERR "clocksource: %s is already registered\n",
978 		       clock->name);
979 		return;
980 	}
981 
982 	printk(KERN_INFO "clocksource: %s mult[%x] shift[%d] registered\n",
983 	       clock->name, clock->mult, clock->shift);
984 }
985 
986 static int decrementer_set_next_event(unsigned long evt,
987 				      struct clock_event_device *dev)
988 {
989 	__this_cpu_write(decrementers_next_tb, get_tb_or_rtc() + evt);
990 	set_dec(evt);
991 
992 	/* We may have raced with new irq work */
993 	if (test_irq_work_pending())
994 		set_dec(1);
995 
996 	return 0;
997 }
998 
999 static int decrementer_shutdown(struct clock_event_device *dev)
1000 {
1001 	decrementer_set_next_event(decrementer_max, dev);
1002 	return 0;
1003 }
1004 
1005 static void register_decrementer_clockevent(int cpu)
1006 {
1007 	struct clock_event_device *dec = &per_cpu(decrementers, cpu);
1008 
1009 	*dec = decrementer_clockevent;
1010 	dec->cpumask = cpumask_of(cpu);
1011 
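	/* Minimum delta of 2 timebase ticks, maximum of decrementer_max ticks */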
1012 	clockevents_config_and_register(dec, ppc_tb_freq, 2, decrementer_max);
1013 
1014 	printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
1015 		    dec->name, dec->mult, dec->shift, cpu);
1016 
1017 	/* Set values for KVM, see kvm_emulate_dec() */
1018 	decrementer_clockevent.mult = dec->mult;
1019 	decrementer_clockevent.shift = dec->shift;
1020 }
1021 
1022 static void enable_large_decrementer(void)
1023 {
1024 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
1025 		return;
1026 
1027 	if (decrementer_max <= DECREMENTER_DEFAULT_MAX)
1028 		return;
1029 
1030 	/*
1031 	 * If we're running as the hypervisor we need to enable the large
1032 	 * decrementer (LD) manually; otherwise firmware should have done it for us.
1033 	 */
1034 	if (cpu_has_feature(CPU_FTR_HVMODE))
1035 		mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_LD);
1036 }
1037 
1038 static void __init set_decrementer_max(void)
1039 {
1040 	struct device_node *cpu;
1041 	u32 bits = 32;
1042 
1043 	/* Prior to ISAv3 the decrementer is always 32 bit */
1044 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
1045 		return;
1046 
1047 	cpu = of_find_node_by_type(NULL, "cpu");
1048 
1049 	if (of_property_read_u32(cpu, "ibm,dec-bits", &bits) == 0) {
1050 		if (bits > 64 || bits < 32) {
1051 			pr_warn("time_init: firmware supplied invalid ibm,dec-bits\n");
1052 			bits = 32;
1053 		}
1054 
1055 		/* calculate the signed maximum given this many bits */
1056 		decrementer_max = (1ul << (bits - 1)) - 1;
1057 	}
1058 
1059 	of_node_put(cpu);
1060 
1061 	pr_info("time_init: %u bit decrementer (max: %llx)\n",
1062 		bits, decrementer_max);
1063 }
1064 
1065 static void __init init_decrementer_clockevent(void)
1066 {
1067 	register_decrementer_clockevent(smp_processor_id());
1068 }
1069 
1070 void secondary_cpu_time_init(void)
1071 {
1072 	/* Enable and test the large decrementer for this cpu */
1073 	enable_large_decrementer();
1074 
1075 	/* Start the decrementer on CPUs that have manual control
1076 	 * such as BookE
1077 	 */
1078 	start_cpu_decrementer();
1079 
1080 	/* FIXME: Should make an unrelated change to move the snapshot_timebase
1081 	 * call here! */
1082 	register_decrementer_clockevent(smp_processor_id());
1083 }
1084 
1085 /* This function is only called on the boot processor */
1086 void __init time_init(void)
1087 {
1088 	struct div_result res;
1089 	u64 scale;
1090 	unsigned shift;
1091 
1092 	if (__USE_RTC()) {
1093 		/* 601 processor: dec counts down by 128 every 128ns */
1094 		ppc_tb_freq = 1000000000;
1095 	} else {
1096 		/* Normal PowerPC with timebase register */
1097 		ppc_md.calibrate_decr();
1098 		printk(KERN_DEBUG "time_init: decrementer frequency = %lu.%.6lu MHz\n",
1099 		       ppc_tb_freq / 1000000, ppc_tb_freq % 1000000);
1100 		printk(KERN_DEBUG "time_init: processor frequency   = %lu.%.6lu MHz\n",
1101 		       ppc_proc_freq / 1000000, ppc_proc_freq % 1000000);
1102 	}
1103 
1104 	tb_ticks_per_jiffy = ppc_tb_freq / HZ;
1105 	tb_ticks_per_sec = ppc_tb_freq;
1106 	tb_ticks_per_usec = ppc_tb_freq / 1000000;
1107 	calc_cputime_factors();
1108 
1109 	/*
1110 	 * Compute scale factor for sched_clock.
1111 	 * The calibrate_decr() function has set tb_ticks_per_sec,
1112 	 * which is the timebase frequency.
1113 	 * We compute 1e9 * 2^64 / tb_ticks_per_sec and interpret
1114 	 * the 128-bit result as a 64.64 fixed-point number.
1115 	 * We then shift that number right until it is less than 1.0,
1116 	 * giving us the scale factor and shift count to use in
1117 	 * sched_clock().
1118 	 */
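	/*
	 * E.g. a 512 MHz timebase gives 10^9 / 512e6 = 1.953125, which ends
	 * up as shift = 1 and scale = 0.9765625 * 2^64.
	 */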
1119 	div128_by_32(1000000000, 0, tb_ticks_per_sec, &res);
1120 	scale = res.result_low;
1121 	for (shift = 0; res.result_high != 0; ++shift) {
1122 		scale = (scale >> 1) | (res.result_high << 63);
1123 		res.result_high >>= 1;
1124 	}
1125 	tb_to_ns_scale = scale;
1126 	tb_to_ns_shift = shift;
1127 	/* Save the current timebase to pretty up CONFIG_PRINTK_TIME */
1128 	boot_tb = get_tb_or_rtc();
1129 
1130 	/* If platform provided a timezone (pmac), we correct the time */
1131 	if (timezone_offset) {
1132 		sys_tz.tz_minuteswest = -timezone_offset / 60;
1133 		sys_tz.tz_dsttime = 0;
1134 	}
1135 
1136 	vdso_data->tb_update_count = 0;
1137 	vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
1138 
1139 	/* initialise and enable the large decrementer (if we have one) */
1140 	set_decrementer_max();
1141 	enable_large_decrementer();
1142 
1143 	/* Start the decrementer on CPUs that have manual control
1144 	 * such as BookE
1145 	 */
1146 	start_cpu_decrementer();
1147 
1148 	/* Register the clocksource */
1149 	clocksource_init();
1150 
1151 	init_decrementer_clockevent();
1152 	tick_setup_hrtimer_broadcast();
1153 
1154 #ifdef CONFIG_COMMON_CLK
1155 	of_clk_init(NULL);
1156 #endif
1157 }
1158 
1159 /*
1160  * Divide a 128-bit dividend by a 32-bit divisor, leaving a 128 bit
1161  * result.
1162  */
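/* Classic long division with base 2^32 digits: w, x, y, z end up as the
 * four 32-bit digits of the 128-bit quotient. */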
1163 void div128_by_32(u64 dividend_high, u64 dividend_low,
1164 		  unsigned divisor, struct div_result *dr)
1165 {
1166 	unsigned long a, b, c, d;
1167 	unsigned long w, x, y, z;
1168 	u64 ra, rb, rc;
1169 
1170 	a = dividend_high >> 32;
1171 	b = dividend_high & 0xffffffff;
1172 	c = dividend_low >> 32;
1173 	d = dividend_low & 0xffffffff;
1174 
1175 	w = a / divisor;
1176 	ra = ((u64)(a - (w * divisor)) << 32) + b;
1177 
1178 	rb = ((u64) do_div(ra, divisor) << 32) + c;
1179 	x = ra;
1180 
1181 	rc = ((u64) do_div(rb, divisor) << 32) + d;
1182 	y = rb;
1183 
1184 	do_div(rc, divisor);
1185 	z = rc;
1186 
1187 	dr->result_high = ((u64)w << 32) + x;
1188 	dr->result_low  = ((u64)y << 32) + z;
1189 
1190 }
1191 
1192 /* We don't need to calibrate delay, we use the CPU timebase for that */
1193 void calibrate_delay(void)
1194 {
1195 	/* Some generic code (such as spinlock debug) use loops_per_jiffy
1196 	 * as the number of __delay(1) in a jiffy, so make it so
1197 	 */
1198 	loops_per_jiffy = tb_ticks_per_jiffy;
1199 }
1200 
1201 #if IS_ENABLED(CONFIG_RTC_DRV_GENERIC)
1202 static int rtc_generic_get_time(struct device *dev, struct rtc_time *tm)
1203 {
1204 	ppc_md.get_rtc_time(tm);
1205 	return 0;
1206 }
1207 
1208 static int rtc_generic_set_time(struct device *dev, struct rtc_time *tm)
1209 {
1210 	if (!ppc_md.set_rtc_time)
1211 		return -EOPNOTSUPP;
1212 
1213 	if (ppc_md.set_rtc_time(tm) < 0)
1214 		return -EOPNOTSUPP;
1215 
1216 	return 0;
1217 }
1218 
1219 static const struct rtc_class_ops rtc_generic_ops = {
1220 	.read_time = rtc_generic_get_time,
1221 	.set_time = rtc_generic_set_time,
1222 };
1223 
1224 static int __init rtc_init(void)
1225 {
1226 	struct platform_device *pdev;
1227 
1228 	if (!ppc_md.get_rtc_time)
1229 		return -ENODEV;
1230 
1231 	pdev = platform_device_register_data(NULL, "rtc-generic", -1,
1232 					     &rtc_generic_ops,
1233 					     sizeof(rtc_generic_ops));
1234 
1235 	return PTR_ERR_OR_ZERO(pdev);
1236 }
1237 
1238 device_initcall(rtc_init);
1239 #endif
1240