xref: /linux/kernel/time/timekeeping.c (revision 9b7fc3f14576c268f62fe0b882fac5e61239b659)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Kernel timekeeping code and accessor functions. Based on code from
4  *  timer.c, moved in commit 8524070b7982.
5  */
6 #include <linux/timekeeper_internal.h>
7 #include <linux/module.h>
8 #include <linux/interrupt.h>
9 #include <linux/kobject.h>
10 #include <linux/percpu.h>
11 #include <linux/init.h>
12 #include <linux/mm.h>
13 #include <linux/nmi.h>
14 #include <linux/sched.h>
15 #include <linux/sched/loadavg.h>
16 #include <linux/sched/clock.h>
17 #include <linux/syscore_ops.h>
18 #include <linux/clocksource.h>
19 #include <linux/jiffies.h>
20 #include <linux/time.h>
21 #include <linux/timex.h>
22 #include <linux/tick.h>
23 #include <linux/stop_machine.h>
24 #include <linux/pvclock_gtod.h>
25 #include <linux/compiler.h>
26 #include <linux/audit.h>
27 #include <linux/random.h>
28 
29 #include <vdso/auxclock.h>
30 
31 #include "tick-internal.h"
32 #include "ntp_internal.h"
33 #include "timekeeping_internal.h"
34 
/* Action flags for timekeeping updates */
#define TK_CLEAR_NTP		(1 << 0)	/* Reset NTP state as part of the update */
#define TK_CLOCK_WAS_SET	(1 << 1)	/* The clock was stepped/set; notify listeners */

#define TK_UPDATE_ALL		(TK_CLEAR_NTP | TK_CLOCK_WAS_SET)
39 
/* Reason for advancing the timekeeper */
enum timekeeping_adv_mode {
	/* Update timekeeper when a tick has passed */
	TK_ADV_TICK,

	/* Update timekeeper on a direct frequency change */
	TK_ADV_FREQ
};
47 
/*
 * The most important data for readout fits into a single 64 byte
 * cache line.
 */
struct tk_data {
	seqcount_raw_spinlock_t	seq;			/* Protects readers against concurrent updates */
	struct timekeeper	timekeeper;		/* The readout copy used by readers */
	struct timekeeper	shadow_timekeeper;	/* Updated under @lock, then copied to @timekeeper */
	raw_spinlock_t		lock;			/* Serializes writers */
} ____cacheline_aligned;
58 
/* One tk_data instance per timekeeper ID (core plus auxiliary clocks) */
static struct tk_data timekeeper_data[TIMEKEEPERS_MAX];

/* The core timekeeper */
#define tk_core		(timekeeper_data[TIMEKEEPER_CORE])
63 
#ifdef CONFIG_POSIX_AUX_CLOCKS
/* Map an auxiliary timekeeper ID to its CLOCK_AUX clockid and read it */
static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
{
	return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts);
}
#else
/* No auxiliary clocks configured: the readout always fails */
static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
{
	return false;
}
#endif
75 
76 /* flag for if timekeeping is suspended */
77 int __read_mostly timekeeping_suspended;
78 
79 /**
80  * struct tk_fast - NMI safe timekeeper
81  * @seq:	Sequence counter for protecting updates. The lowest bit
82  *		is the index for the tk_read_base array
83  * @base:	tk_read_base array. Access is indexed by the lowest bit of
84  *		@seq.
85  *
86  * See @update_fast_timekeeper() below.
87  */
88 struct tk_fast {
89 	seqcount_latch_t	seq;
90 	struct tk_read_base	base[2];
91 };
92 
93 /* Suspend-time cycles value for halted fast timekeeper. */
94 static u64 cycles_at_suspend;
95 
96 static u64 dummy_clock_read(struct clocksource *cs)
97 {
98 	if (timekeeping_suspended)
99 		return cycles_at_suspend;
100 	return local_clock();
101 }
102 
103 static struct clocksource dummy_clock = {
104 	.read = dummy_clock_read,
105 };
106 
107 /*
108  * Boot time initialization which allows local_clock() to be utilized
109  * during early boot when clocksources are not available. local_clock()
110  * returns nanoseconds already so no conversion is required, hence mult=1
111  * and shift=0. When the first proper clocksource is installed then
112  * the fast time keepers are updated with the correct values.
113  */
114 #define FAST_TK_INIT						\
115 	{							\
116 		.clock		= &dummy_clock,			\
117 		.mask		= CLOCKSOURCE_MASK(64),		\
118 		.mult		= 1,				\
119 		.shift		= 0,				\
120 	}
121 
122 static struct tk_fast tk_fast_mono ____cacheline_aligned = {
123 	.seq     = SEQCNT_LATCH_ZERO(tk_fast_mono.seq),
124 	.base[0] = FAST_TK_INIT,
125 	.base[1] = FAST_TK_INIT,
126 };
127 
128 static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
129 	.seq     = SEQCNT_LATCH_ZERO(tk_fast_raw.seq),
130 	.base[0] = FAST_TK_INIT,
131 	.base[1] = FAST_TK_INIT,
132 };
133 
134 #ifdef CONFIG_POSIX_AUX_CLOCKS
135 static __init void tk_aux_setup(void);
136 static void tk_aux_update_clocksource(void);
137 static void tk_aux_advance(void);
138 #else
139 static inline void tk_aux_setup(void) { }
140 static inline void tk_aux_update_clocksource(void) { }
141 static inline void tk_aux_advance(void) { }
142 #endif
143 
/*
 * Acquire the core timekeeper writer lock with interrupts disabled.
 *
 * Returns the saved interrupt flags which must be handed back to
 * timekeeper_unlock_irqrestore().
 */
unsigned long timekeeper_lock_irqsave(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&tk_core.lock, flags);
	return flags;
}
151 
/* Counterpart to timekeeper_lock_irqsave(): drop the lock and restore @flags */
void timekeeper_unlock_irqrestore(unsigned long flags)
{
	raw_spin_unlock_irqrestore(&tk_core.lock, flags);
}
156 
157 /*
158  * Multigrain timestamps require tracking the latest fine-grained timestamp
159  * that has been issued, and never returning a coarse-grained timestamp that is
160  * earlier than that value.
161  *
162  * mg_floor represents the latest fine-grained time that has been handed out as
163  * a file timestamp on the system. This is tracked as a monotonic ktime_t, and
164  * converted to a realtime clock value on an as-needed basis.
165  *
166  * Maintaining mg_floor ensures the multigrain interfaces never issue a
167  * timestamp earlier than one that has been previously issued.
168  *
169  * The exception to this rule is when there is a backward realtime clock jump. If
170  * such an event occurs, a timestamp can appear to be earlier than a previous one.
171  */
172 static __cacheline_aligned_in_smp atomic64_t mg_floor;
173 
174 static inline void tk_normalize_xtime(struct timekeeper *tk)
175 {
176 	while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
177 		tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
178 		tk->xtime_sec++;
179 	}
180 	while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
181 		tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
182 		tk->raw_sec++;
183 	}
184 }
185 
186 static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
187 {
188 	struct timespec64 ts;
189 
190 	ts.tv_sec = tk->xtime_sec;
191 	ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
192 	return ts;
193 }
194 
195 static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk)
196 {
197 	struct timespec64 ts;
198 
199 	ts.tv_sec = tk->xtime_sec;
200 	ts.tv_nsec = tk->coarse_nsec;
201 	return ts;
202 }
203 
/*
 * Update the nanoseconds part for the coarse time keepers. They can't rely
 * on xtime_nsec because xtime_nsec could be adjusted by a small negative
 * amount when the multiplication factor of the clock is adjusted, which
 * could cause the coarse clocks to go slightly backwards. See
 * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse
 * clockids which only is updated when the clock has been set or we have
 * accumulated time.
 */
static inline void tk_update_coarse_nsecs(struct timekeeper *tk)
{
	tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
}
217 
/* Set CLOCK_REALTIME in @tk from @ts and sync the coarse readout */
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
	tk->xtime_sec = ts->tv_sec;
	tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
	tk_update_coarse_nsecs(tk);
}
224 
/* Add @ts to CLOCK_REALTIME in @tk, renormalize and sync the coarse readout */
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
	tk->xtime_sec += ts->tv_sec;
	tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
	tk_normalize_xtime(tk);
	tk_update_coarse_nsecs(tk);
}
232 
/*
 * Replace wall_to_monotonic with @wtm and keep the derived ktime_t offsets
 * (offs_real, offs_tai) consistent with it.
 */
static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
{
	struct timespec64 tmp;

	/*
	 * Verify consistency of: offset_real = -wall_to_monotonic
	 * before modifying anything
	 */
	set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
					-tk->wall_to_monotonic.tv_nsec);
	WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp));
	tk->wall_to_monotonic = wtm;
	set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
	/* Paired with READ_ONCE() in ktime_mono_to_any() */
	WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp));
	WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)));
}
250 
/* Account sleep time @delta into the boot clock offset of @tk */
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
	/* Paired with READ_ONCE() in ktime_mono_to_any() */
	WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta));
	/*
	 * Timespec representation for VDSO update to avoid 64bit division
	 * on every update.
	 */
	tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);
}
261 
/*
 * tk_clock_read - atomic clocksource read() helper
 *
 * This helper is necessary to use in the read paths because, while the
 * seqcount ensures we don't return a bad value while structures are updated,
 * it doesn't protect from potential crashes. There is the possibility that
 * the tkr's clocksource may change between the read reference, and the
 * clock reference passed to the read function.  This can cause crashes if
 * the wrong clocksource is passed to the wrong read function.
 * This isn't necessary to use when holding the tk_core.lock or doing
 * a read of the fast-timekeeper tkrs (which is protected by its own locking
 * and update logic).
 *
 * The READ_ONCE() ensures the clock pointer used for ->read() is the same
 * one that was loaded, even if tkr->clock changes concurrently.
 */
static inline u64 tk_clock_read(const struct tk_read_base *tkr)
{
	struct clocksource *clock = READ_ONCE(tkr->clock);

	return clock->read(clock);
}
281 
/**
 * tk_setup_internals - Set up internals to use clocksource clock.
 *
 * @tk:		The target timekeeper to setup.
 * @clock:		Pointer to clocksource.
 *
 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
 * pair and interval request.
 *
 * Unless you're the timekeeping code, you should not be using this!
 */
static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
{
	u64 interval;
	u64 tmp, ntpinterval;
	struct clocksource *old_clock;

	/* Bump the sequence so snapshot users can detect the clocksource change */
	++tk->cs_was_changed_seq;
	old_clock = tk->tkr_mono.clock;
	tk->tkr_mono.clock = clock;
	tk->tkr_mono.mask = clock->mask;
	tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);

	/* The raw readout base shares clock and last cycle with the mono base */
	tk->tkr_raw.clock = clock;
	tk->tkr_raw.mask = clock->mask;
	tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;

	/* Do the ns -> cycle conversion first, using original mult */
	tmp = NTP_INTERVAL_LENGTH;
	tmp <<= clock->shift;
	ntpinterval = tmp;
	/* Round to nearest cycle count, but never allow a zero interval */
	tmp += clock->mult/2;
	do_div(tmp, clock->mult);
	if (tmp == 0)
		tmp = 1;

	interval = (u64) tmp;
	tk->cycle_interval = interval;

	/* Go back from cycles -> shifted ns */
	tk->xtime_interval = interval * clock->mult;
	/* Remainder compensates for the rounding in the cycle conversion above */
	tk->xtime_remainder = ntpinterval - tk->xtime_interval;
	tk->raw_interval = interval * clock->mult;

	 /* if changing clocks, convert xtime_nsec shift units */
	if (old_clock) {
		int shift_change = clock->shift - old_clock->shift;
		if (shift_change < 0) {
			tk->tkr_mono.xtime_nsec >>= -shift_change;
			tk->tkr_raw.xtime_nsec >>= -shift_change;
		} else {
			tk->tkr_mono.xtime_nsec <<= shift_change;
			tk->tkr_raw.xtime_nsec <<= shift_change;
		}
	}

	tk->tkr_mono.shift = clock->shift;
	tk->tkr_raw.shift = clock->shift;

	tk->ntp_error = 0;
	tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
	tk->ntp_tick = ntpinterval << tk->ntp_error_shift;

	/*
	 * The timekeeper keeps its own mult values for the currently
	 * active clocksource. These values will be adjusted via NTP
	 * to counteract clock drifting.
	 */
	tk->tkr_mono.mult = clock->mult;
	tk->tkr_raw.mult = clock->mult;
	tk->ntp_err_mult = 0;
	tk->skip_second_overflow = 0;
}
355 
/* Timekeeper helper functions. */

/*
 * Overflow-safe (cycles, shifted-ns) -> ns conversion using a wide
 * multiply-add-shift; noinline keeps the common fast path compact.
 */
static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
{
	return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
}
361 
/* Convert a raw cycle counter readout to nanoseconds since the last update */
static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
	/* Calculate the delta since the last update_wall_time() */
	u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;

	/*
	 * This detects both negative motion and the case where the delta
	 * overflows the multiplication with tkr->mult.
	 */
	if (unlikely(delta > tkr->clock->max_cycles)) {
		/*
		 * Handle clocksource inconsistency between CPUs to prevent
		 * time from going backwards by checking for the MSB of the
		 * mask being set in the delta.
		 */
		if (delta & ~(mask >> 1))
			return tkr->xtime_nsec >> tkr->shift;

		return delta_to_ns_safe(tkr, delta);
	}

	/* Fast path: delta * mult cannot overflow here */
	return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
}
385 
/* Read the current clocksource and convert to ns since the last update */
static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
{
	return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr));
}
390 
/**
 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
 * @tkr: Timekeeping readout base from which we take the update
 * @tkf: Pointer to NMI safe timekeeper
 *
 * We want to use this from any context including NMI and tracing /
 * instrumenting the timekeeping code itself.
 *
 * Employ the latch technique; see @write_seqcount_latch.
 *
 * So if a NMI hits the update of base[0] then it will use base[1]
 * which is still consistent. In the worst case this can result in a
 * slightly wrong timestamp (a few nanoseconds). See
 * @ktime_get_mono_fast_ns.
 */
static void update_fast_timekeeper(const struct tk_read_base *tkr,
				   struct tk_fast *tkf)
{
	struct tk_read_base *base = tkf->base;

	/* Force readers off to base[1] */
	write_seqcount_latch_begin(&tkf->seq);

	/* Update base[0] */
	memcpy(base, tkr, sizeof(*base));

	/* Force readers back to base[0] */
	write_seqcount_latch(&tkf->seq);

	/* Update base[1] */
	memcpy(base + 1, base, sizeof(*base));

	write_seqcount_latch_end(&tkf->seq);
}
425 
/*
 * NMI safe readout: the lowest bit of the latch sequence selects which
 * half of the base[] array is currently stable for readers.
 */
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
	struct tk_read_base *tkr;
	unsigned int seq;
	u64 now;

	do {
		seq = read_seqcount_latch(&tkf->seq);
		tkr = tkf->base + (seq & 0x01);
		now = ktime_to_ns(tkr->base);
		now += timekeeping_get_ns(tkr);
	} while (read_seqcount_latch_retry(&tkf->seq, seq));

	return now;
}
441 
/**
 * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
 *
 * This timestamp is not guaranteed to be monotonic across an update.
 * The timestamp is calculated by:
 *
 *	now = base_mono + clock_delta * slope
 *
 * So if the update lowers the slope, readers who are forced to the
 * not yet updated second array are still using the old steeper slope.
 *
 * tmono
 * ^
 * |    o  n
 * |   o n
 * |  u
 * | o
 * |o
 * |12345678---> reader order
 *
 * o = old slope
 * u = update
 * n = new slope
 *
 * So reader 6 will observe time going backwards versus reader 5.
 *
 * While other CPUs are likely to be able to observe that, the only way
 * for a CPU local observation is when an NMI hits in the middle of
 * the update. Timestamps taken from that NMI context might be ahead
 * of the following timestamps. Callers need to be aware of that and
 * deal with it.
 *
 * Return: CLOCK_MONOTONIC in nanoseconds
 */
u64 notrace ktime_get_mono_fast_ns(void)
{
	return __ktime_get_fast_ns(&tk_fast_mono);
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
479 
/**
 * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw
 *
 * Contrary to ktime_get_mono_fast_ns() this is always correct because the
 * conversion factor is not affected by NTP/PTP correction.
 *
 * Return: CLOCK_MONOTONIC_RAW in nanoseconds
 */
u64 notrace ktime_get_raw_fast_ns(void)
{
	return __ktime_get_fast_ns(&tk_fast_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
491 
/**
 * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
 *
 * To keep it NMI safe since we're accessing from tracing, we're not using a
 * separate timekeeper with updates to monotonic clock and boot offset
 * protected with seqcounts. This has the following minor side effects:
 *
 * (1) It's possible that a timestamp be taken after the boot offset is updated
 * but before the timekeeper is updated. If this happens, the new boot offset
 * is added to the old timekeeping making the clock appear to update slightly
 * earlier:
 *    CPU 0                                        CPU 1
 *    timekeeping_inject_sleeptime64()
 *    __timekeeping_inject_sleeptime(tk, delta);
 *                                                 timestamp();
 *    timekeeping_update_staged(tkd, TK_CLEAR_NTP...);
 *
 * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
 * partially updated.  Since the tk->offs_boot update is a rare event, this
 * should be a rare occurrence which postprocessing should be able to handle.
 *
 * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns()
 * apply as well.
 *
 * Return: CLOCK_BOOTTIME in nanoseconds
 */
u64 notrace ktime_get_boot_fast_ns(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;

	return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot)));
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
523 
/**
 * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock.
 *
 * The same limitations as described for ktime_get_boot_fast_ns() apply. The
 * mono time and the TAI offset are not read atomically which may yield wrong
 * readouts. However, an update of the TAI offset is a rare event e.g., caused
 * by settime or adjtimex with an offset. The user of this function has to deal
 * with the possibility of wrong timestamps in post processing.
 *
 * Return: CLOCK_TAI in nanoseconds
 */
u64 notrace ktime_get_tai_fast_ns(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;

	return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai)));
}
EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);
540 
/**
 * ktime_get_real_fast_ns - NMI safe and fast access to clock realtime.
 *
 * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering.
 *
 * Return: CLOCK_REALTIME in nanoseconds
 */
u64 ktime_get_real_fast_ns(void)
{
	struct tk_fast *tkf = &tk_fast_mono;
	struct tk_read_base *tkr;
	u64 baser, delta;
	unsigned int seq;

	do {
		seq = raw_read_seqcount_latch(&tkf->seq);
		tkr = tkf->base + (seq & 0x01);
		baser = ktime_to_ns(tkr->base_real);
		delta = timekeeping_get_ns(tkr);
	} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));

	return baser + delta;
}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
563 
/**
 * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
 * @tk: Timekeeper to snapshot.
 *
 * It generally is unsafe to access the clocksource after timekeeping has been
 * suspended, so take a snapshot of the readout base of @tk and use it as the
 * fast timekeeper's readout base while suspended.  It will return the same
 * number of cycles every time until timekeeping is resumed at which time the
 * proper readout base for the fast timekeeper will be restored automatically.
 */
static void halt_fast_timekeeper(const struct timekeeper *tk)
{
	static struct tk_read_base tkr_dummy;
	const struct tk_read_base *tkr = &tk->tkr_mono;

	/* Snapshot the mono readout base and redirect it to the dummy clock */
	memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
	cycles_at_suspend = tk_clock_read(tkr);
	tkr_dummy.clock = &dummy_clock;
	tkr_dummy.base_real = tkr->base + tk->offs_real;
	update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);

	/* Same for the raw fast timekeeper */
	tkr = &tk->tkr_raw;
	memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
	tkr_dummy.clock = &dummy_clock;
	update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
590 
/* Notifier chain for listeners interested in timekeeping data updates */
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);

/* Invoke the pvclock/gtod notifier chain with the updated timekeeper */
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
{
	raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
}
597 
/**
 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
 * @nb: Pointer to the notifier block to register
 *
 * The notifier is invoked once immediately on registration so the new
 * listener starts with current time data.
 *
 * Return: 0 on success, negative error code otherwise
 */
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	int ret;

	guard(raw_spinlock_irqsave)(&tk_core.lock);
	ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
	update_pvclock_gtod(tk, true);

	return ret;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
614 
/**
 * pvclock_gtod_unregister_notifier - unregister a pvclock
 * timedata update listener
 * @nb: Pointer to the notifier block to unregister
 *
 * Return: 0 on success, negative error code otherwise
 */
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
	guard(raw_spinlock_irqsave)(&tk_core.lock);
	return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
626 
/*
 * tk_update_leap_state - helper to update the next_leap_ktime
 *
 * NTP reports the next leap edge in CLOCK_REALTIME; convert it to
 * CLOCK_MONOTONIC unless no leap is pending (KTIME_MAX).
 */
static inline void tk_update_leap_state(struct timekeeper *tk)
{
	tk->next_leap_ktime = ntp_get_next_leap(tk->id);
	if (tk->next_leap_ktime != KTIME_MAX)
		/* Convert to monotonic time */
		tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
}
637 
/*
 * Leap state update for both shadow and the real timekeeper
 * Separate to spare a full memcpy() of the timekeeper.
 */
static void tk_update_leap_state_all(struct tk_data *tkd)
{
	write_seqcount_begin(&tkd->seq);
	tk_update_leap_state(&tkd->shadow_timekeeper);
	/* Propagate only the single changed field to the readout copy */
	tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime;
	write_seqcount_end(&tkd->seq);
}
649 
/*
 * Update the ktime_t based scalar nsec members of the timekeeper
 */
static inline void tk_update_ktime_data(struct timekeeper *tk)
{
	u64 seconds;
	u32 nsec;

	/*
	 * The xtime based monotonic readout is:
	 *	nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
	 * The ktime based monotonic readout is:
	 *	nsec = base_mono + now();
	 * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
	 */
	seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
	nsec = (u32) tk->wall_to_monotonic.tv_nsec;
	tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);

	/*
	 * The sum of the nanoseconds portions of xtime and
	 * wall_to_monotonic can be greater/equal one second. Take
	 * this into account before updating tk->ktime_sec.
	 */
	nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
	if (nsec >= NSEC_PER_SEC)
		seconds++;
	tk->ktime_sec = seconds;

	/* Update the monotonic raw base */
	tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
682 
/*
 * Restore the shadow timekeeper from the real timekeeper.
 * Used to discard half-done modifications of the shadow copy.
 */
static void timekeeping_restore_shadow(struct tk_data *tkd)
{
	lockdep_assert_held(&tkd->lock);
	memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper));
}
691 
/*
 * Publish the prepared shadow timekeeper of @tkd: update derived state,
 * the VDSO and fast timekeepers, then copy the shadow over the readout
 * copy. @action is a combination of TK_* flags.
 */
static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action)
{
	struct timekeeper *tk = &tkd->shadow_timekeeper;

	lockdep_assert_held(&tkd->lock);

	/*
	 * Block out readers before running the updates below because that
	 * updates VDSO and other time related infrastructure. Not blocking
	 * the readers might let a reader see time going backwards when
	 * reading from the VDSO after the VDSO update and then reading in
	 * the kernel from the timekeeper before that got updated.
	 */
	write_seqcount_begin(&tkd->seq);

	if (action & TK_CLEAR_NTP) {
		tk->ntp_error = 0;
		ntp_clear(tk->id);
	}

	tk_update_leap_state(tk);
	tk_update_ktime_data(tk);
	tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;

	/* VDSO, pvclock and fast timekeepers only exist for the core clock */
	if (tk->id == TIMEKEEPER_CORE) {
		update_vsyscall(tk);
		update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);

		update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
		update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);
	}

	if (action & TK_CLOCK_WAS_SET)
		tk->clock_was_set_seq++;

	/*
	 * Update the real timekeeper.
	 *
	 * We could avoid this memcpy() by switching pointers, but that has
	 * the downside that the reader side would no longer benefit from
	 * the cacheline optimized data layout of the timekeeper and requires
	 * another indirection.
	 */
	memcpy(&tkd->timekeeper, tk, sizeof(*tk));
	write_seqcount_end(&tkd->seq);
}
738 
/**
 * timekeeping_forward_now - update clock to the current time
 * @tk:		Pointer to the timekeeper to update
 *
 * Forward the current clock to update its state since the last call to
 * update_wall_time(). This is useful before significant clock changes,
 * as it avoids having to deal with this time offset explicitly.
 */
static void timekeeping_forward_now(struct timekeeper *tk)
{
	u64 cycle_now, delta;

	cycle_now = tk_clock_read(&tk->tkr_mono);
	delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
				  tk->tkr_mono.clock->max_raw_delta);
	tk->tkr_mono.cycle_last = cycle_now;
	tk->tkr_raw.cycle_last  = cycle_now;

	/*
	 * Accumulate in chunks of at most max_cycles so the multiplication
	 * with mult stays within bounds (see timekeeping_cycles_to_ns()).
	 */
	while (delta > 0) {
		u64 max = tk->tkr_mono.clock->max_cycles;
		u64 incr = delta < max ? delta : max;

		tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult;
		tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult;
		tk_normalize_xtime(tk);
		delta -= incr;
	}
	tk_update_coarse_nsecs(tk);
}
768 
/**
 * ktime_get_real_ts64 - Returns the time of day in a timespec64.
 * @ts:		pointer to the timespec to be set
 *
 * Returns the time of day in a timespec64 (WARN if suspended).
 */
void ktime_get_real_ts64(struct timespec64 *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;
	u64 nsecs;

	WARN_ON(timekeeping_suspended);

	do {
		seq = read_seqcount_begin(&tk_core.seq);

		ts->tv_sec = tk->xtime_sec;
		nsecs = timekeeping_get_ns(&tk->tkr_mono);

	} while (read_seqcount_retry(&tk_core.seq, seq));

	/* Normalize outside the seqcount loop; nsecs may exceed one second */
	ts->tv_nsec = 0;
	timespec64_add_ns(ts, nsecs);
}
EXPORT_SYMBOL(ktime_get_real_ts64);
795 
/**
 * ktime_get - Returns CLOCK_MONOTONIC in ktime_t format (WARN if suspended)
 */
ktime_t ktime_get(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;
	ktime_t base;
	u64 nsecs;

	WARN_ON(timekeeping_suspended);

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		base = tk->tkr_mono.base;
		nsecs = timekeeping_get_ns(&tk->tkr_mono);

	} while (read_seqcount_retry(&tk_core.seq, seq));

	return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get);
815 
/**
 * ktime_get_resolution_ns - Clock resolution in nanoseconds
 *
 * Derived from the active clocksource's mult/shift pair.
 */
u32 ktime_get_resolution_ns(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;
	u32 nsecs;

	WARN_ON(timekeeping_suspended);

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift;
	} while (read_seqcount_retry(&tk_core.seq, seq));

	return nsecs;
}
EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
832 
/* Lookup table: tk_offsets enum -> the corresponding core timekeeper offset */
static ktime_t *offsets[TK_OFFS_MAX] = {
	[TK_OFFS_REAL]	= &tk_core.timekeeper.offs_real,
	[TK_OFFS_BOOT]	= &tk_core.timekeeper.offs_boot,
	[TK_OFFS_TAI]	= &tk_core.timekeeper.offs_tai,
};
838 
/**
 * ktime_get_with_offset - CLOCK_MONOTONIC plus the selected clock offset
 * @offs:	which offset from the offsets[] table to apply
 */
ktime_t ktime_get_with_offset(enum tk_offsets offs)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;
	ktime_t base, *offset = offsets[offs];
	u64 nsecs;

	WARN_ON(timekeeping_suspended);

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		base = ktime_add(tk->tkr_mono.base, *offset);
		nsecs = timekeeping_get_ns(&tk->tkr_mono);

	} while (read_seqcount_retry(&tk_core.seq, seq));

	return ktime_add_ns(base, nsecs);

}
EXPORT_SYMBOL_GPL(ktime_get_with_offset);
859 
/**
 * ktime_get_coarse_with_offset - Coarse monotonic time plus clock offset
 * @offs:	which offset from the offsets[] table to apply
 *
 * Uses the cached coarse_nsec instead of reading the clocksource.
 */
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	ktime_t base, *offset = offsets[offs];
	unsigned int seq;
	u64 nsecs;

	WARN_ON(timekeeping_suspended);

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		base = ktime_add(tk->tkr_mono.base, *offset);
		nsecs = tk->coarse_nsec;

	} while (read_seqcount_retry(&tk_core.seq, seq));

	return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
879 
/**
 * ktime_mono_to_any() - convert monotonic time to any other time
 * @tmono:	time to convert.
 * @offs:	which offset to use
 */
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
	ktime_t *offset = offsets[offs];
	unsigned int seq;
	ktime_t tconv;

	if (IS_ENABLED(CONFIG_64BIT)) {
		/*
		 * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and
		 * tk_update_sleep_time().
		 */
		return ktime_add(tmono, READ_ONCE(*offset));
	}

	/* 32bit: the 64bit offset can be torn, protect with the seqcount */
	do {
		seq = read_seqcount_begin(&tk_core.seq);
		tconv = ktime_add(tmono, *offset);
	} while (read_seqcount_retry(&tk_core.seq, seq));

	return tconv;
}
EXPORT_SYMBOL_GPL(ktime_mono_to_any);
907 
/**
 * ktime_get_raw - Returns the raw monotonic time in ktime_t format
 */
ktime_t ktime_get_raw(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;
	ktime_t base;
	u64 nsecs;

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		base = tk->tkr_raw.base;
		nsecs = timekeeping_get_ns(&tk->tkr_raw);

	} while (read_seqcount_retry(&tk_core.seq, seq));

	return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_raw);
928 
/**
 * ktime_get_ts64 - get the monotonic clock in timespec64 format
 * @ts:		pointer to timespec variable
 *
 * The function calculates the monotonic clock from the realtime
 * clock and the wall_to_monotonic offset and stores the result
 * in normalized timespec64 format in the variable pointed to by @ts.
 */
void ktime_get_ts64(struct timespec64 *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	struct timespec64 tomono;
	unsigned int seq;
	u64 nsec;

	WARN_ON(timekeeping_suspended);

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		ts->tv_sec = tk->xtime_sec;
		nsec = timekeeping_get_ns(&tk->tkr_mono);
		tomono = tk->wall_to_monotonic;

	} while (read_seqcount_retry(&tk_core.seq, seq));

	/* Combine and normalize outside the seqcount loop */
	ts->tv_sec += tomono.tv_sec;
	ts->tv_nsec = 0;
	timespec64_add_ns(ts, nsec + tomono.tv_nsec);
}
EXPORT_SYMBOL_GPL(ktime_get_ts64);
959 
/**
 * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
 *
 * Returns the seconds portion of CLOCK_MONOTONIC with a single non
 * serialized read. tk->ktime_sec is of type 'unsigned long' so this
 * works on both 32 and 64 bit systems. On 32 bit systems the readout
 * covers ~136 years of uptime which should be enough to prevent
 * premature wrap arounds.
 */
time64_t ktime_get_seconds(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;

	WARN_ON(timekeeping_suspended);
	return tk->ktime_sec;
}
EXPORT_SYMBOL_GPL(ktime_get_seconds);
977 
978 /**
979  * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
980  *
981  * Returns the wall clock seconds since 1970.
982  *
983  * For 64bit systems the fast access to tk->xtime_sec is preserved. On
984  * 32bit systems the access must be protected with the sequence
985  * counter to provide "atomic" access to the 64bit tk->xtime_sec
986  * value.
987  */
988 time64_t ktime_get_real_seconds(void)
989 {
990 	struct timekeeper *tk = &tk_core.timekeeper;
991 	time64_t seconds;
992 	unsigned int seq;
993 
994 	if (IS_ENABLED(CONFIG_64BIT))
995 		return tk->xtime_sec;
996 
997 	do {
998 		seq = read_seqcount_begin(&tk_core.seq);
999 		seconds = tk->xtime_sec;
1000 
1001 	} while (read_seqcount_retry(&tk_core.seq, seq));
1002 
1003 	return seconds;
1004 }
1005 EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
1006 
1007 /**
1008  * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds
1009  *
1010  * The same as ktime_get_real_seconds() but without the sequence counter
1011  * protection. This function is used in restricted contexts like the x86 MCE
1012  * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half
1013  * completed modification and only to be used for such critical contexts.
1014  *
1015  * Returns: Racy snapshot of the CLOCK_REALTIME seconds value
1016  */
1017 noinstr time64_t __ktime_get_real_seconds(void)
1018 {
1019 	struct timekeeper *tk = &tk_core.timekeeper;
1020 
1021 	return tk->xtime_sec;
1022 }
1023 
1024 /**
1025  * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
1026  * @systime_snapshot:	pointer to struct receiving the system time snapshot
1027  */
1028 void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
1029 {
1030 	struct timekeeper *tk = &tk_core.timekeeper;
1031 	unsigned int seq;
1032 	ktime_t base_raw;
1033 	ktime_t base_real;
1034 	ktime_t base_boot;
1035 	u64 nsec_raw;
1036 	u64 nsec_real;
1037 	u64 now;
1038 
1039 	WARN_ON_ONCE(timekeeping_suspended);
1040 
1041 	do {
1042 		seq = read_seqcount_begin(&tk_core.seq);
1043 		now = tk_clock_read(&tk->tkr_mono);
1044 		systime_snapshot->cs_id = tk->tkr_mono.clock->id;
1045 		systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
1046 		systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
1047 		base_real = ktime_add(tk->tkr_mono.base,
1048 				      tk_core.timekeeper.offs_real);
1049 		base_boot = ktime_add(tk->tkr_mono.base,
1050 				      tk_core.timekeeper.offs_boot);
1051 		base_raw = tk->tkr_raw.base;
1052 		nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
1053 		nsec_raw  = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
1054 	} while (read_seqcount_retry(&tk_core.seq, seq));
1055 
1056 	systime_snapshot->cycles = now;
1057 	systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
1058 	systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real);
1059 	systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
1060 }
1061 EXPORT_SYMBOL_GPL(ktime_get_snapshot);
1062 
1063 /* Scale base by mult/div checking for overflow */
1064 static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
1065 {
1066 	u64 tmp, rem;
1067 
1068 	tmp = div64_u64_rem(*base, div, &rem);
1069 
1070 	if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) ||
1071 	    ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem)))
1072 		return -EOVERFLOW;
1073 	tmp *= mult;
1074 
1075 	rem = div64_u64(rem * mult, div);
1076 	*base = tmp + rem;
1077 	return 0;
1078 }
1079 
1080 /**
1081  * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
1082  * @history:			Snapshot representing start of history
1083  * @partial_history_cycles:	Cycle offset into history (fractional part)
1084  * @total_history_cycles:	Total history length in cycles
1085  * @discontinuity:		True indicates clock was set on history period
1086  * @ts:				Cross timestamp that should be adjusted using
1087  *	partial/total ratio
1088  *
1089  * Helper function used by get_device_system_crosststamp() to correct the
1090  * crosstimestamp corresponding to the start of the current interval to the
1091  * system counter value (timestamp point) provided by the driver. The
1092  * total_history_* quantities are the total history starting at the provided
1093  * reference point and ending at the start of the current interval. The cycle
1094  * count between the driver timestamp point and the start of the current
1095  * interval is partial_history_cycles.
1096  */
1097 static int adjust_historical_crosststamp(struct system_time_snapshot *history,
1098 					 u64 partial_history_cycles,
1099 					 u64 total_history_cycles,
1100 					 bool discontinuity,
1101 					 struct system_device_crosststamp *ts)
1102 {
1103 	struct timekeeper *tk = &tk_core.timekeeper;
1104 	u64 corr_raw, corr_real;
1105 	bool interp_forward;
1106 	int ret;
1107 
1108 	if (total_history_cycles == 0 || partial_history_cycles == 0)
1109 		return 0;
1110 
1111 	/* Interpolate shortest distance from beginning or end of history */
1112 	interp_forward = partial_history_cycles > total_history_cycles / 2;
1113 	partial_history_cycles = interp_forward ?
1114 		total_history_cycles - partial_history_cycles :
1115 		partial_history_cycles;
1116 
1117 	/*
1118 	 * Scale the monotonic raw time delta by:
1119 	 *	partial_history_cycles / total_history_cycles
1120 	 */
1121 	corr_raw = (u64)ktime_to_ns(
1122 		ktime_sub(ts->sys_monoraw, history->raw));
1123 	ret = scale64_check_overflow(partial_history_cycles,
1124 				     total_history_cycles, &corr_raw);
1125 	if (ret)
1126 		return ret;
1127 
1128 	/*
1129 	 * If there is a discontinuity in the history, scale monotonic raw
1130 	 *	correction by:
1131 	 *	mult(real)/mult(raw) yielding the realtime correction
1132 	 * Otherwise, calculate the realtime correction similar to monotonic
1133 	 *	raw calculation
1134 	 */
1135 	if (discontinuity) {
1136 		corr_real = mul_u64_u32_div
1137 			(corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult);
1138 	} else {
1139 		corr_real = (u64)ktime_to_ns(
1140 			ktime_sub(ts->sys_realtime, history->real));
1141 		ret = scale64_check_overflow(partial_history_cycles,
1142 					     total_history_cycles, &corr_real);
1143 		if (ret)
1144 			return ret;
1145 	}
1146 
1147 	/* Fixup monotonic raw and real time time values */
1148 	if (interp_forward) {
1149 		ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw);
1150 		ts->sys_realtime = ktime_add_ns(history->real, corr_real);
1151 	} else {
1152 		ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw);
1153 		ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real);
1154 	}
1155 
1156 	return 0;
1157 }
1158 
1159 /*
1160  * timestamp_in_interval - true if ts is chronologically in [start, end]
1161  *
1162  * True if ts occurs chronologically at or after start, and before or at end.
1163  */
1164 static bool timestamp_in_interval(u64 start, u64 end, u64 ts)
1165 {
1166 	if (ts >= start && ts <= end)
1167 		return true;
1168 	if (start > end && (ts >= start || ts <= end))
1169 		return true;
1170 	return false;
1171 }
1172 
1173 static bool convert_clock(u64 *val, u32 numerator, u32 denominator)
1174 {
1175 	u64 rem, res;
1176 
1177 	if (!numerator || !denominator)
1178 		return false;
1179 
1180 	res = div64_u64_rem(*val, denominator, &rem) * numerator;
1181 	*val = res + div_u64(rem * numerator, denominator);
1182 	return true;
1183 }
1184 
/*
 * Convert the counter value in @scv->cycles from the clock identified by
 * @scv->cs_id to the current timekeeper clocksource. Returns false when
 * the id neither matches the timekeeper clocksource nor its base clock,
 * or when the conversion fails.
 */
static bool convert_base_to_cs(struct system_counterval_t *scv)
{
	struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
	struct clocksource_base *base;
	u32 num, den;

	/* The timestamp was taken from the time keeper clock source */
	if (cs->id == scv->cs_id)
		return true;

	/*
	 * Check whether cs_id matches the base clock. Prevent the compiler from
	 * re-evaluating @base as the clocksource might change concurrently.
	 */
	base = READ_ONCE(cs->base);
	if (!base || base->id != scv->cs_id)
		return false;

	num = scv->use_nsecs ? cs->freq_khz : base->numerator;
	den = scv->use_nsecs ? USEC_PER_SEC : base->denominator;

	if (!convert_clock(&scv->cycles, num, den))
		return false;

	scv->cycles += base->offset;
	return true;
}
1212 
1213 static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id)
1214 {
1215 	struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
1216 	struct clocksource_base *base;
1217 
1218 	/*
1219 	 * Check whether base_id matches the base clock. Prevent the compiler from
1220 	 * re-evaluating @base as the clocksource might change concurrently.
1221 	 */
1222 	base = READ_ONCE(cs->base);
1223 	if (!base || base->id != base_id)
1224 		return false;
1225 
1226 	*cycles -= base->offset;
1227 	if (!convert_clock(cycles, base->denominator, base->numerator))
1228 		return false;
1229 	return true;
1230 }
1231 
/*
 * Convert a CLOCK_MONOTONIC nanosecond delta in *delta to timekeeper
 * clocksource cycles. Fails when the left shift by tkr->shift would
 * overflow 64bit.
 */
static bool convert_ns_to_cs(u64 *delta)
{
	struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;

	/* The shift below must not lose bits beyond the 64bit boundary */
	if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta))
		return false;

	/* Inverse of the (cycles * mult + xtime_nsec) >> shift readout */
	*delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult);
	return true;
}
1242 
1243 /**
1244  * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp
1245  * @treal:	CLOCK_REALTIME timestamp to convert
1246  * @base_id:	base clocksource id
1247  * @cycles:	pointer to store the converted base clock timestamp
1248  *
1249  * Converts a supplied, future realtime clock value to the corresponding base clock value.
1250  *
1251  * Return:  true if the conversion is successful, false otherwise.
1252  */
1253 bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles)
1254 {
1255 	struct timekeeper *tk = &tk_core.timekeeper;
1256 	unsigned int seq;
1257 	u64 delta;
1258 
1259 	do {
1260 		seq = read_seqcount_begin(&tk_core.seq);
1261 		if ((u64)treal < tk->tkr_mono.base_real)
1262 			return false;
1263 		delta = (u64)treal - tk->tkr_mono.base_real;
1264 		if (!convert_ns_to_cs(&delta))
1265 			return false;
1266 		*cycles = tk->tkr_mono.cycle_last + delta;
1267 		if (!convert_cs_to_base(cycles, base_id))
1268 			return false;
1269 	} while (read_seqcount_retry(&tk_core.seq, seq));
1270 
1271 	return true;
1272 }
1273 EXPORT_SYMBOL_GPL(ktime_real_to_base_clock);
1274 
1275 /**
1276  * get_device_system_crosststamp - Synchronously capture system/device timestamp
1277  * @get_time_fn:	Callback to get simultaneous device time and
1278  *	system counter from the device driver
1279  * @ctx:		Context passed to get_time_fn()
1280  * @history_begin:	Historical reference point used to interpolate system
1281  *	time when counter provided by the driver is before the current interval
1282  * @xtstamp:		Receives simultaneously captured system and device time
1283  *
1284  * Reads a timestamp from a device and correlates it to system time
1285  */
1286 int get_device_system_crosststamp(int (*get_time_fn)
1287 				  (ktime_t *device_time,
1288 				   struct system_counterval_t *sys_counterval,
1289 				   void *ctx),
1290 				  void *ctx,
1291 				  struct system_time_snapshot *history_begin,
1292 				  struct system_device_crosststamp *xtstamp)
1293 {
1294 	struct system_counterval_t system_counterval;
1295 	struct timekeeper *tk = &tk_core.timekeeper;
1296 	u64 cycles, now, interval_start;
1297 	unsigned int clock_was_set_seq = 0;
1298 	ktime_t base_real, base_raw;
1299 	u64 nsec_real, nsec_raw;
1300 	u8 cs_was_changed_seq;
1301 	unsigned int seq;
1302 	bool do_interp;
1303 	int ret;
1304 
1305 	do {
1306 		seq = read_seqcount_begin(&tk_core.seq);
1307 		/*
1308 		 * Try to synchronously capture device time and a system
1309 		 * counter value calling back into the device driver
1310 		 */
1311 		ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
1312 		if (ret)
1313 			return ret;
1314 
1315 		/*
1316 		 * Verify that the clocksource ID associated with the captured
1317 		 * system counter value is the same as for the currently
1318 		 * installed timekeeper clocksource
1319 		 */
1320 		if (system_counterval.cs_id == CSID_GENERIC ||
1321 		    !convert_base_to_cs(&system_counterval))
1322 			return -ENODEV;
1323 		cycles = system_counterval.cycles;
1324 
1325 		/*
1326 		 * Check whether the system counter value provided by the
1327 		 * device driver is on the current timekeeping interval.
1328 		 */
1329 		now = tk_clock_read(&tk->tkr_mono);
1330 		interval_start = tk->tkr_mono.cycle_last;
1331 		if (!timestamp_in_interval(interval_start, now, cycles)) {
1332 			clock_was_set_seq = tk->clock_was_set_seq;
1333 			cs_was_changed_seq = tk->cs_was_changed_seq;
1334 			cycles = interval_start;
1335 			do_interp = true;
1336 		} else {
1337 			do_interp = false;
1338 		}
1339 
1340 		base_real = ktime_add(tk->tkr_mono.base,
1341 				      tk_core.timekeeper.offs_real);
1342 		base_raw = tk->tkr_raw.base;
1343 
1344 		nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
1345 		nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
1346 	} while (read_seqcount_retry(&tk_core.seq, seq));
1347 
1348 	xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
1349 	xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);
1350 
1351 	/*
1352 	 * Interpolate if necessary, adjusting back from the start of the
1353 	 * current interval
1354 	 */
1355 	if (do_interp) {
1356 		u64 partial_history_cycles, total_history_cycles;
1357 		bool discontinuity;
1358 
1359 		/*
1360 		 * Check that the counter value is not before the provided
1361 		 * history reference and that the history doesn't cross a
1362 		 * clocksource change
1363 		 */
1364 		if (!history_begin ||
1365 		    !timestamp_in_interval(history_begin->cycles,
1366 					   cycles, system_counterval.cycles) ||
1367 		    history_begin->cs_was_changed_seq != cs_was_changed_seq)
1368 			return -EINVAL;
1369 		partial_history_cycles = cycles - system_counterval.cycles;
1370 		total_history_cycles = cycles - history_begin->cycles;
1371 		discontinuity =
1372 			history_begin->clock_was_set_seq != clock_was_set_seq;
1373 
1374 		ret = adjust_historical_crosststamp(history_begin,
1375 						    partial_history_cycles,
1376 						    total_history_cycles,
1377 						    discontinuity, xtstamp);
1378 		if (ret)
1379 			return ret;
1380 	}
1381 
1382 	return 0;
1383 }
1384 EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
1385 
1386 /**
1387  * timekeeping_clocksource_has_base - Check whether the current clocksource
1388  *				      is based on given a base clock
1389  * @id:		base clocksource ID
1390  *
1391  * Note:	The return value is a snapshot which can become invalid right
1392  *		after the function returns.
1393  *
1394  * Return:	true if the timekeeper clocksource has a base clock with @id,
1395  *		false otherwise
1396  */
1397 bool timekeeping_clocksource_has_base(enum clocksource_ids id)
1398 {
1399 	/*
1400 	 * This is a snapshot, so no point in using the sequence
1401 	 * count. Just prevent the compiler from re-evaluating @base as the
1402 	 * clocksource might change concurrently.
1403 	 */
1404 	struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base);
1405 
1406 	return base ? base->id == id : false;
1407 }
1408 EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base);
1409 
1410 /**
1411  * do_settimeofday64 - Sets the time of day.
1412  * @ts:     pointer to the timespec64 variable containing the new time
1413  *
1414  * Sets the time of day to the new time and update NTP and notify hrtimers
1415  */
1416 int do_settimeofday64(const struct timespec64 *ts)
1417 {
1418 	struct timespec64 ts_delta, xt;
1419 
1420 	if (!timespec64_valid_settod(ts))
1421 		return -EINVAL;
1422 
1423 	scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
1424 		struct timekeeper *tks = &tk_core.shadow_timekeeper;
1425 
1426 		timekeeping_forward_now(tks);
1427 
1428 		xt = tk_xtime(tks);
1429 		ts_delta = timespec64_sub(*ts, xt);
1430 
1431 		if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) {
1432 			timekeeping_restore_shadow(&tk_core);
1433 			return -EINVAL;
1434 		}
1435 
1436 		tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta));
1437 		tk_set_xtime(tks, ts);
1438 		timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
1439 	}
1440 
1441 	/* Signal hrtimers about time change */
1442 	clock_was_set(CLOCK_SET_WALL);
1443 
1444 	audit_tk_injoffset(ts_delta);
1445 	add_device_randomness(ts, sizeof(*ts));
1446 	return 0;
1447 }
1448 EXPORT_SYMBOL(do_settimeofday64);
1449 
1450 static inline bool timekeeper_is_core_tk(struct timekeeper *tk)
1451 {
1452 	return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE;
1453 }
1454 
1455 /**
1456  * __timekeeping_inject_offset - Adds or subtracts from the current time.
1457  * @tkd:	Pointer to the timekeeper to modify
1458  * @ts:		Pointer to the timespec variable containing the offset
1459  *
1460  * Adds or subtracts an offset value from the current time.
1461  */
1462 static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts)
1463 {
1464 	struct timekeeper *tks = &tkd->shadow_timekeeper;
1465 	struct timespec64 tmp;
1466 
1467 	if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
1468 		return -EINVAL;
1469 
1470 	timekeeping_forward_now(tks);
1471 
1472 	if (timekeeper_is_core_tk(tks)) {
1473 		/* Make sure the proposed value is valid */
1474 		tmp = timespec64_add(tk_xtime(tks), *ts);
1475 		if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 ||
1476 		    !timespec64_valid_settod(&tmp)) {
1477 			timekeeping_restore_shadow(tkd);
1478 			return -EINVAL;
1479 		}
1480 
1481 		tk_xtime_add(tks, ts);
1482 		tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts));
1483 	} else {
1484 		struct tk_read_base *tkr_mono = &tks->tkr_mono;
1485 		ktime_t now, offs;
1486 
1487 		/* Get the current time */
1488 		now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono));
1489 		/* Add the relative offset change */
1490 		offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts));
1491 
1492 		/* Prevent that the resulting time becomes negative */
1493 		if (ktime_add(now, offs) < 0) {
1494 			timekeeping_restore_shadow(tkd);
1495 			return -EINVAL;
1496 		}
1497 		tks->offs_aux = offs;
1498 	}
1499 
1500 	timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
1501 	return 0;
1502 }
1503 
/*
 * Inject an offset into the core timekeeper under the timekeeper lock
 * and, on success, notify hrtimers about the wall clock change.
 */
static int timekeeping_inject_offset(const struct timespec64 *ts)
{
	int ret;

	scoped_guard (raw_spinlock_irqsave, &tk_core.lock)
		ret = __timekeeping_inject_offset(&tk_core, ts);

	/* Signal hrtimers about time change */
	if (!ret)
		clock_was_set(CLOCK_SET_WALL);
	return ret;
}
1516 
1517 /*
1518  * Indicates if there is an offset between the system clock and the hardware
1519  * clock/persistent clock/rtc.
1520  */
1521 int persistent_clock_is_local;
1522 
1523 /*
1524  * Adjust the time obtained from the CMOS to be UTC time instead of
1525  * local time.
1526  *
1527  * This is ugly, but preferable to the alternatives.  Otherwise we
1528  * would either need to write a program to do it in /etc/rc (and risk
1529  * confusion if the program gets run more than once; it would also be
1530  * hard to make the program warp the clock precisely n hours)  or
1531  * compile in the timezone information into the kernel.  Bad, bad....
1532  *
1533  *						- TYT, 1992-01-01
1534  *
1535  * The best thing to do is to keep the CMOS clock in universal time (UTC)
1536  * as real UNIX machines always do it. This avoids all headaches about
1537  * daylight saving times and warping kernel clocks.
1538  */
1539 void timekeeping_warp_clock(void)
1540 {
1541 	if (sys_tz.tz_minuteswest != 0) {
1542 		struct timespec64 adjust;
1543 
1544 		persistent_clock_is_local = 1;
1545 		adjust.tv_sec = sys_tz.tz_minuteswest * 60;
1546 		adjust.tv_nsec = 0;
1547 		timekeeping_inject_offset(&adjust);
1548 	}
1549 }
1550 
1551 /*
1552  * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
1553  */
1554 static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
1555 {
1556 	tk->tai_offset = tai_offset;
1557 	tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0));
1558 }
1559 
1560 /*
1561  * change_clocksource - Swaps clocksources if a new one is available
1562  *
1563  * Accumulates current time interval and initializes new clocksource
1564  */
1565 static int change_clocksource(void *data)
1566 {
1567 	struct clocksource *new = data, *old = NULL;
1568 
1569 	/*
1570 	 * If the clocksource is in a module, get a module reference.
1571 	 * Succeeds for built-in code (owner == NULL) as well. Abort if the
1572 	 * reference can't be acquired.
1573 	 */
1574 	if (!try_module_get(new->owner))
1575 		return 0;
1576 
1577 	/* Abort if the device can't be enabled */
1578 	if (new->enable && new->enable(new) != 0) {
1579 		module_put(new->owner);
1580 		return 0;
1581 	}
1582 
1583 	scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
1584 		struct timekeeper *tks = &tk_core.shadow_timekeeper;
1585 
1586 		timekeeping_forward_now(tks);
1587 		old = tks->tkr_mono.clock;
1588 		tk_setup_internals(tks, new);
1589 		timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
1590 	}
1591 
1592 	tk_aux_update_clocksource();
1593 
1594 	if (old) {
1595 		if (old->disable)
1596 			old->disable(old);
1597 		module_put(old->owner);
1598 	}
1599 
1600 	return 0;
1601 }
1602 
1603 /**
1604  * timekeeping_notify - Install a new clock source
1605  * @clock:		pointer to the clock source
1606  *
1607  * This function is called from clocksource.c after a new, better clock
1608  * source has been registered. The caller holds the clocksource_mutex.
1609  */
1610 int timekeeping_notify(struct clocksource *clock)
1611 {
1612 	struct timekeeper *tk = &tk_core.timekeeper;
1613 
1614 	if (tk->tkr_mono.clock == clock)
1615 		return 0;
1616 	stop_machine(change_clocksource, clock, NULL);
1617 	tick_clock_notify();
1618 	return tk->tkr_mono.clock == clock ? 0 : -1;
1619 }
1620 
1621 /**
1622  * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec
1623  * @ts:		pointer to the timespec64 to be set
1624  *
1625  * Returns the raw monotonic time (completely un-modified by ntp)
1626  */
1627 void ktime_get_raw_ts64(struct timespec64 *ts)
1628 {
1629 	struct timekeeper *tk = &tk_core.timekeeper;
1630 	unsigned int seq;
1631 	u64 nsecs;
1632 
1633 	do {
1634 		seq = read_seqcount_begin(&tk_core.seq);
1635 		ts->tv_sec = tk->raw_sec;
1636 		nsecs = timekeeping_get_ns(&tk->tkr_raw);
1637 
1638 	} while (read_seqcount_retry(&tk_core.seq, seq));
1639 
1640 	ts->tv_nsec = 0;
1641 	timespec64_add_ns(ts, nsecs);
1642 }
1643 EXPORT_SYMBOL(ktime_get_raw_ts64);
1644 
1645 /**
1646  * ktime_get_clock_ts64 - Returns time of a clock in a timespec
1647  * @id:		POSIX clock ID of the clock to read
1648  * @ts:		Pointer to the timespec64 to be set
1649  *
1650  * The timestamp is invalidated (@ts->sec is set to -1) if the
1651  * clock @id is not available.
1652  */
1653 void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts)
1654 {
1655 	/* Invalidate time stamp */
1656 	ts->tv_sec = -1;
1657 	ts->tv_nsec = 0;
1658 
1659 	switch (id) {
1660 	case CLOCK_REALTIME:
1661 		ktime_get_real_ts64(ts);
1662 		return;
1663 	case CLOCK_MONOTONIC:
1664 		ktime_get_ts64(ts);
1665 		return;
1666 	case CLOCK_MONOTONIC_RAW:
1667 		ktime_get_raw_ts64(ts);
1668 		return;
1669 	case CLOCK_AUX ... CLOCK_AUX_LAST:
1670 		if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS))
1671 			ktime_get_aux_ts64(id, ts);
1672 		return;
1673 	default:
1674 		WARN_ON_ONCE(1);
1675 	}
1676 }
1677 EXPORT_SYMBOL_GPL(ktime_get_clock_ts64);
1678 
1679 /**
1680  * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
1681  */
1682 int timekeeping_valid_for_hres(void)
1683 {
1684 	struct timekeeper *tk = &tk_core.timekeeper;
1685 	unsigned int seq;
1686 	int ret;
1687 
1688 	do {
1689 		seq = read_seqcount_begin(&tk_core.seq);
1690 
1691 		ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
1692 
1693 	} while (read_seqcount_retry(&tk_core.seq, seq));
1694 
1695 	return ret;
1696 }
1697 
1698 /**
1699  * timekeeping_max_deferment - Returns max time the clocksource can be deferred
1700  */
1701 u64 timekeeping_max_deferment(void)
1702 {
1703 	struct timekeeper *tk = &tk_core.timekeeper;
1704 	unsigned int seq;
1705 	u64 ret;
1706 
1707 	do {
1708 		seq = read_seqcount_begin(&tk_core.seq);
1709 
1710 		ret = tk->tkr_mono.clock->max_idle_ns;
1711 
1712 	} while (read_seqcount_retry(&tk_core.seq, seq));
1713 
1714 	return ret;
1715 }
1716 
1717 /**
1718  * read_persistent_clock64 -  Return time from the persistent clock.
1719  * @ts: Pointer to the storage for the readout value
1720  *
1721  * Weak dummy function for arches that do not yet support it.
1722  * Reads the time from the battery backed persistent clock.
1723  * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
1724  *
1725  *  XXX - Do be sure to remove it once all arches implement it.
1726  */
1727 void __weak read_persistent_clock64(struct timespec64 *ts)
1728 {
1729 	ts->tv_sec = 0;
1730 	ts->tv_nsec = 0;
1731 }
1732 
1733 /**
1734  * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
1735  *                                        from the boot.
1736  * @wall_time:	  current time as returned by persistent clock
1737  * @boot_offset:  offset that is defined as wall_time - boot_time
1738  *
1739  * Weak dummy function for arches that do not yet support it.
1740  *
1741  * The default function calculates offset based on the current value of
1742  * local_clock(). This way architectures that support sched_clock() but don't
1743  * support dedicated boot time clock will provide the best estimate of the
1744  * boot time.
1745  */
1746 void __weak __init
1747 read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
1748 				     struct timespec64 *boot_offset)
1749 {
1750 	read_persistent_clock64(wall_time);
1751 	*boot_offset = ns_to_timespec64(local_clock());
1752 }
1753 
1754 static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid)
1755 {
1756 	raw_spin_lock_init(&tkd->lock);
1757 	seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock);
1758 	tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id;
1759 	tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid;
1760 }
1761 
1762 /*
1763  * Flag reflecting whether timekeeping_resume() has injected sleeptime.
1764  *
 * The flag starts off false and is only set when a suspend reaches
1766  * timekeeping_suspend(), timekeeping_resume() sets it to false when the
1767  * timekeeper clocksource is not stopping across suspend and has been
1768  * used to update sleep time. If the timekeeper clocksource has stopped
1769  * then the flag stays true and is used by the RTC resume code to decide
 * whether sleeptime must be injected; if so, the flag is cleared there.
1771  *
1772  * If a suspend fails before reaching timekeeping_resume() then the flag
1773  * stays false and prevents erroneous sleeptime injection.
1774  */
1775 static bool suspend_timing_needed;
1776 
1777 /* Flag for if there is a persistent clock on this platform */
1778 static bool persistent_clock_exists;
1779 
1780 /*
1781  * timekeeping_init - Initializes the clocksource and common timekeeping values
1782  */
1783 void __init timekeeping_init(void)
1784 {
1785 	struct timespec64 wall_time, boot_offset, wall_to_mono;
1786 	struct timekeeper *tks = &tk_core.shadow_timekeeper;
1787 	struct clocksource *clock;
1788 
1789 	tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true);
1790 	tk_aux_setup();
1791 
1792 	read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
1793 	if (timespec64_valid_settod(&wall_time) &&
1794 	    timespec64_to_ns(&wall_time) > 0) {
1795 		persistent_clock_exists = true;
1796 	} else if (timespec64_to_ns(&wall_time) != 0) {
1797 		pr_warn("Persistent clock returned invalid value");
1798 		wall_time = (struct timespec64){0};
1799 	}
1800 
1801 	if (timespec64_compare(&wall_time, &boot_offset) < 0)
1802 		boot_offset = (struct timespec64){0};
1803 
1804 	/*
1805 	 * We want set wall_to_mono, so the following is true:
1806 	 * wall time + wall_to_mono = boot time
1807 	 */
1808 	wall_to_mono = timespec64_sub(boot_offset, wall_time);
1809 
1810 	guard(raw_spinlock_irqsave)(&tk_core.lock);
1811 
1812 	ntp_init();
1813 
1814 	clock = clocksource_default_clock();
1815 	if (clock->enable)
1816 		clock->enable(clock);
1817 	tk_setup_internals(tks, clock);
1818 
1819 	tk_set_xtime(tks, &wall_time);
1820 	tks->raw_sec = 0;
1821 
1822 	tk_set_wall_to_mono(tks, wall_to_mono);
1823 
1824 	timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
1825 }
1826 
1827 /* time in seconds when suspend began for persistent clock */
1828 static struct timespec64 timekeeping_suspend_time;
1829 
1830 /**
1831  * __timekeeping_inject_sleeptime - Internal function to add sleep interval
1832  * @tk:		Pointer to the timekeeper to be updated
1833  * @delta:	Pointer to the delta value in timespec64 format
1834  *
1835  * Takes a timespec offset measuring a suspend interval and properly
1836  * adds the sleep offset to the timekeeping variables.
1837  */
1838 static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
1839 					   const struct timespec64 *delta)
1840 {
1841 	if (!timespec64_valid_strict(delta)) {
1842 		printk_deferred(KERN_WARNING
1843 				"__timekeeping_inject_sleeptime: Invalid "
1844 				"sleep delta value!\n");
1845 		return;
1846 	}
1847 	tk_xtime_add(tk, delta);
1848 	tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
1849 	tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
1850 	tk_debug_account_sleep_time(delta);
1851 }
1852 
1853 #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
1854 /*
1855  * We have three kinds of time sources to use for sleep time
1856  * injection, the preference order is:
1857  * 1) non-stop clocksource
1858  * 2) persistent clock (ie: RTC accessible when irqs are off)
1859  * 3) RTC
1860  *
1861  * 1) and 2) are used by timekeeping, 3) by RTC subsystem.
1862  * If system has neither 1) nor 2), 3) will be used finally.
1863  *
1864  *
1865  * If timekeeping has injected sleeptime via either 1) or 2),
1866  * 3) becomes needless, so in this case we don't need to call
1867  * rtc_resume(), and this is what timekeeping_rtc_skipresume()
1868  * means.
1869  */
1870 bool timekeeping_rtc_skipresume(void)
1871 {
1872 	return !suspend_timing_needed;
1873 }
1874 
1875 /*
1876  * 1) can be determined whether to use or not only when doing
1877  * timekeeping_resume() which is invoked after rtc_suspend(),
1878  * so we can't skip rtc_suspend() surely if system has 1).
1879  *
1880  * But if system has 2), 2) will definitely be used, so in this
1881  * case we don't need to call rtc_suspend(), and this is what
1882  * timekeeping_rtc_skipsuspend() means.
1883  */
1884 bool timekeeping_rtc_skipsuspend(void)
1885 {
1886 	return persistent_clock_exists;
1887 }
1888 
1889 /**
1890  * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
1891  * @delta: pointer to a timespec64 delta value
1892  *
1893  * This hook is for architectures that cannot support read_persistent_clock64
1894  * because their RTC/persistent clock is only accessible when irqs are enabled.
1895  * and also don't have an effective nonstop clocksource.
1896  *
1897  * This function should only be called by rtc_resume(), and allows
1898  * a suspend offset to be injected into the timekeeping values.
1899  */
1900 void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
1901 {
1902 	scoped_guard(raw_spinlock_irqsave, &tk_core.lock) {
1903 		struct timekeeper *tks = &tk_core.shadow_timekeeper;
1904 
1905 		suspend_timing_needed = false;
1906 		timekeeping_forward_now(tks);
1907 		__timekeeping_inject_sleeptime(tks, delta);
1908 		timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
1909 	}
1910 
1911 	/* Signal hrtimers about time change */
1912 	clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT);
1913 }
1914 #endif
1915 
1916 /**
1917  * timekeeping_resume - Resumes the generic timekeeping subsystem.
1918  */
1919 void timekeeping_resume(void)
1920 {
1921 	struct timekeeper *tks = &tk_core.shadow_timekeeper;
1922 	struct clocksource *clock = tks->tkr_mono.clock;
1923 	struct timespec64 ts_new, ts_delta;
1924 	bool inject_sleeptime = false;
1925 	u64 cycle_now, nsec;
1926 	unsigned long flags;
1927 
1928 	read_persistent_clock64(&ts_new);
1929 
1930 	clockevents_resume();
1931 	clocksource_resume();
1932 
1933 	raw_spin_lock_irqsave(&tk_core.lock, flags);
1934 
1935 	/*
1936 	 * After system resumes, we need to calculate the suspended time and
1937 	 * compensate it for the OS time. There are 3 sources that could be
1938 	 * used: Nonstop clocksource during suspend, persistent clock and rtc
1939 	 * device.
1940 	 *
1941 	 * One specific platform may have 1 or 2 or all of them, and the
1942 	 * preference will be:
1943 	 *	suspend-nonstop clocksource -> persistent clock -> rtc
1944 	 * The less preferred source will only be tried if there is no better
1945 	 * usable source. The rtc part is handled separately in rtc core code.
1946 	 */
1947 	cycle_now = tk_clock_read(&tks->tkr_mono);
1948 	nsec = clocksource_stop_suspend_timing(clock, cycle_now);
1949 	if (nsec > 0) {
1950 		ts_delta = ns_to_timespec64(nsec);
1951 		inject_sleeptime = true;
1952 	} else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
1953 		ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
1954 		inject_sleeptime = true;
1955 	}
1956 
1957 	if (inject_sleeptime) {
1958 		suspend_timing_needed = false;
1959 		__timekeeping_inject_sleeptime(tks, &ts_delta);
1960 	}
1961 
1962 	/* Re-base the last cycle value */
1963 	tks->tkr_mono.cycle_last = cycle_now;
1964 	tks->tkr_raw.cycle_last  = cycle_now;
1965 
1966 	tks->ntp_error = 0;
1967 	timekeeping_suspended = 0;
1968 	timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
1969 	raw_spin_unlock_irqrestore(&tk_core.lock, flags);
1970 
1971 	touch_softlockup_watchdog();
1972 
1973 	/* Resume the clockevent device(s) and hrtimers */
1974 	tick_resume();
1975 	/* Notify timerfd as resume is equivalent to clock_was_set() */
1976 	timerfd_resume();
1977 }
1978 
/**
 * timekeeping_suspend - Suspends the generic timekeeping subsystem.
 *
 * Snapshots the persistent clock and the current clocksource cycle value
 * so that timekeeping_resume() can compute the time spent in suspend,
 * then stops the fast timekeepers, the tick and the clock sources.
 *
 * Returns: 0, unconditionally.
 */
int timekeeping_suspend(void)
{
	struct timekeeper *tks = &tk_core.shadow_timekeeper;
	struct timespec64 delta, delta_delta;
	static struct timespec64 old_delta;
	struct clocksource *curr_clock;
	unsigned long flags;
	u64 cycle_now;

	read_persistent_clock64(&timekeeping_suspend_time);

	/*
	 * On some systems the persistent_clock can not be detected at
	 * timekeeping_init by its return value, so if we see a valid
	 * value returned, update the persistent_clock_exists flag.
	 */
	if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
		persistent_clock_exists = true;

	/* Cleared again once any source has injected the sleep time */
	suspend_timing_needed = true;

	raw_spin_lock_irqsave(&tk_core.lock, flags);
	timekeeping_forward_now(tks);
	timekeeping_suspended = 1;

	/*
	 * Since we've called forward_now, cycle_last stores the value
	 * just read from the current clocksource. Save this to potentially
	 * use in suspend timing.
	 */
	curr_clock = tks->tkr_mono.clock;
	cycle_now = tks->tkr_mono.cycle_last;
	clocksource_start_suspend_timing(curr_clock, cycle_now);

	if (persistent_clock_exists) {
		/*
		 * To avoid drift caused by repeated suspend/resumes,
		 * which each can add ~1 second drift error,
		 * try to compensate so the difference in system time
		 * and persistent_clock time stays close to constant.
		 */
		delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time);
		delta_delta = timespec64_sub(delta, old_delta);
		if (abs(delta_delta.tv_sec) >= 2) {
			/*
			 * if delta_delta is too large, assume time correction
			 * has occurred and set old_delta to the current delta.
			 */
			old_delta = delta;
		} else {
			/* Otherwise try to adjust old_system to compensate */
			timekeeping_suspend_time =
				timespec64_add(timekeeping_suspend_time, delta_delta);
		}
	}

	timekeeping_update_from_shadow(&tk_core, 0);
	halt_fast_timekeeper(tks);
	raw_spin_unlock_irqrestore(&tk_core.lock, flags);

	tick_suspend();
	clocksource_suspend();
	clockevents_suspend();

	return 0;
}
2045 
/* sysfs resume/suspend bits for timekeeping */
static struct syscore_ops timekeeping_syscore_ops = {
	.resume		= timekeeping_resume,
	.suspend	= timekeeping_suspend,
};

/* Register the timekeeping suspend/resume callbacks with syscore at boot */
static int __init timekeeping_init_ops(void)
{
	register_syscore_ops(&timekeeping_syscore_ops);
	return 0;
}
device_initcall(timekeeping_init_ops);
2058 
2059 /*
2060  * Apply a multiplier adjustment to the timekeeper
2061  */
2062 static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
2063 							 s64 offset,
2064 							 s32 mult_adj)
2065 {
2066 	s64 interval = tk->cycle_interval;
2067 
2068 	if (mult_adj == 0) {
2069 		return;
2070 	} else if (mult_adj == -1) {
2071 		interval = -interval;
2072 		offset = -offset;
2073 	} else if (mult_adj != 1) {
2074 		interval *= mult_adj;
2075 		offset *= mult_adj;
2076 	}
2077 
2078 	/*
2079 	 * So the following can be confusing.
2080 	 *
2081 	 * To keep things simple, lets assume mult_adj == 1 for now.
2082 	 *
2083 	 * When mult_adj != 1, remember that the interval and offset values
2084 	 * have been appropriately scaled so the math is the same.
2085 	 *
2086 	 * The basic idea here is that we're increasing the multiplier
2087 	 * by one, this causes the xtime_interval to be incremented by
2088 	 * one cycle_interval. This is because:
2089 	 *	xtime_interval = cycle_interval * mult
2090 	 * So if mult is being incremented by one:
2091 	 *	xtime_interval = cycle_interval * (mult + 1)
2092 	 * Its the same as:
2093 	 *	xtime_interval = (cycle_interval * mult) + cycle_interval
2094 	 * Which can be shortened to:
2095 	 *	xtime_interval += cycle_interval
2096 	 *
2097 	 * So offset stores the non-accumulated cycles. Thus the current
2098 	 * time (in shifted nanoseconds) is:
2099 	 *	now = (offset * adj) + xtime_nsec
2100 	 * Now, even though we're adjusting the clock frequency, we have
2101 	 * to keep time consistent. In other words, we can't jump back
2102 	 * in time, and we also want to avoid jumping forward in time.
2103 	 *
2104 	 * So given the same offset value, we need the time to be the same
2105 	 * both before and after the freq adjustment.
2106 	 *	now = (offset * adj_1) + xtime_nsec_1
2107 	 *	now = (offset * adj_2) + xtime_nsec_2
2108 	 * So:
2109 	 *	(offset * adj_1) + xtime_nsec_1 =
2110 	 *		(offset * adj_2) + xtime_nsec_2
2111 	 * And we know:
2112 	 *	adj_2 = adj_1 + 1
2113 	 * So:
2114 	 *	(offset * adj_1) + xtime_nsec_1 =
2115 	 *		(offset * (adj_1+1)) + xtime_nsec_2
2116 	 *	(offset * adj_1) + xtime_nsec_1 =
2117 	 *		(offset * adj_1) + offset + xtime_nsec_2
2118 	 * Canceling the sides:
2119 	 *	xtime_nsec_1 = offset + xtime_nsec_2
2120 	 * Which gives us:
2121 	 *	xtime_nsec_2 = xtime_nsec_1 - offset
2122 	 * Which simplifies to:
2123 	 *	xtime_nsec -= offset
2124 	 */
2125 	if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
2126 		/* NTP adjustment caused clocksource mult overflow */
2127 		WARN_ON_ONCE(1);
2128 		return;
2129 	}
2130 
2131 	tk->tkr_mono.mult += mult_adj;
2132 	tk->xtime_interval += interval;
2133 	tk->tkr_mono.xtime_nsec -= offset;
2134 }
2135 
2136 /*
2137  * Adjust the timekeeper's multiplier to the correct frequency
2138  * and also to reduce the accumulated error value.
2139  */
2140 static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
2141 {
2142 	u64 ntp_tl = ntp_tick_length(tk->id);
2143 	u32 mult;
2144 
2145 	/*
2146 	 * Determine the multiplier from the current NTP tick length.
2147 	 * Avoid expensive division when the tick length doesn't change.
2148 	 */
2149 	if (likely(tk->ntp_tick == ntp_tl)) {
2150 		mult = tk->tkr_mono.mult - tk->ntp_err_mult;
2151 	} else {
2152 		tk->ntp_tick = ntp_tl;
2153 		mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) -
2154 				 tk->xtime_remainder, tk->cycle_interval);
2155 	}
2156 
2157 	/*
2158 	 * If the clock is behind the NTP time, increase the multiplier by 1
2159 	 * to catch up with it. If it's ahead and there was a remainder in the
2160 	 * tick division, the clock will slow down. Otherwise it will stay
2161 	 * ahead until the tick length changes to a non-divisible value.
2162 	 */
2163 	tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0;
2164 	mult += tk->ntp_err_mult;
2165 
2166 	timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult);
2167 
2168 	if (unlikely(tk->tkr_mono.clock->maxadj &&
2169 		(abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
2170 			> tk->tkr_mono.clock->maxadj))) {
2171 		printk_once(KERN_WARNING
2172 			"Adjusting %s more than 11%% (%ld vs %ld)\n",
2173 			tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
2174 			(long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
2175 	}
2176 
2177 	/*
2178 	 * It may be possible that when we entered this function, xtime_nsec
2179 	 * was very small.  Further, if we're slightly speeding the clocksource
2180 	 * in the code above, its possible the required corrective factor to
2181 	 * xtime_nsec could cause it to underflow.
2182 	 *
2183 	 * Now, since we have already accumulated the second and the NTP
2184 	 * subsystem has been notified via second_overflow(), we need to skip
2185 	 * the next update.
2186 	 */
2187 	if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
2188 		tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC <<
2189 							tk->tkr_mono.shift;
2190 		tk->xtime_sec--;
2191 		tk->skip_second_overflow = 1;
2192 	}
2193 }
2194 
2195 /*
2196  * accumulate_nsecs_to_secs - Accumulates nsecs into secs
2197  *
2198  * Helper function that accumulates the nsecs greater than a second
2199  * from the xtime_nsec field to the xtime_secs field.
2200  * It also calls into the NTP code to handle leapsecond processing.
2201  */
2202 static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
2203 {
2204 	u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
2205 	unsigned int clock_set = 0;
2206 
2207 	while (tk->tkr_mono.xtime_nsec >= nsecps) {
2208 		int leap;
2209 
2210 		tk->tkr_mono.xtime_nsec -= nsecps;
2211 		tk->xtime_sec++;
2212 
2213 		/*
2214 		 * Skip NTP update if this second was accumulated before,
2215 		 * i.e. xtime_nsec underflowed in timekeeping_adjust()
2216 		 */
2217 		if (unlikely(tk->skip_second_overflow)) {
2218 			tk->skip_second_overflow = 0;
2219 			continue;
2220 		}
2221 
2222 		/* Figure out if its a leap sec and apply if needed */
2223 		leap = second_overflow(tk->id, tk->xtime_sec);
2224 		if (unlikely(leap)) {
2225 			struct timespec64 ts;
2226 
2227 			tk->xtime_sec += leap;
2228 
2229 			ts.tv_sec = leap;
2230 			ts.tv_nsec = 0;
2231 			tk_set_wall_to_mono(tk,
2232 				timespec64_sub(tk->wall_to_monotonic, ts));
2233 
2234 			__timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
2235 
2236 			clock_set = TK_CLOCK_WAS_SET;
2237 		}
2238 	}
2239 	return clock_set;
2240 }
2241 
2242 /*
2243  * logarithmic_accumulation - shifted accumulation of cycles
2244  *
2245  * This functions accumulates a shifted interval of cycles into
2246  * a shifted interval nanoseconds. Allows for O(log) accumulation
2247  * loop.
2248  *
2249  * Returns the unconsumed cycles.
2250  */
2251 static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
2252 				    u32 shift, unsigned int *clock_set)
2253 {
2254 	u64 interval = tk->cycle_interval << shift;
2255 	u64 snsec_per_sec;
2256 
2257 	/* If the offset is smaller than a shifted interval, do nothing */
2258 	if (offset < interval)
2259 		return offset;
2260 
2261 	/* Accumulate one shifted interval */
2262 	offset -= interval;
2263 	tk->tkr_mono.cycle_last += interval;
2264 	tk->tkr_raw.cycle_last  += interval;
2265 
2266 	tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
2267 	*clock_set |= accumulate_nsecs_to_secs(tk);
2268 
2269 	/* Accumulate raw time */
2270 	tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
2271 	snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
2272 	while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
2273 		tk->tkr_raw.xtime_nsec -= snsec_per_sec;
2274 		tk->raw_sec++;
2275 	}
2276 
2277 	/* Accumulate error between NTP and clock interval */
2278 	tk->ntp_error += tk->ntp_tick << shift;
2279 	tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
2280 						(tk->ntp_error_shift + shift);
2281 
2282 	return offset;
2283 }
2284 
2285 /*
2286  * timekeeping_advance - Updates the timekeeper to the current time and
2287  * current NTP tick length
2288  */
2289 static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode)
2290 {
2291 	struct timekeeper *tk = &tkd->shadow_timekeeper;
2292 	struct timekeeper *real_tk = &tkd->timekeeper;
2293 	unsigned int clock_set = 0;
2294 	int shift = 0, maxshift;
2295 	u64 offset, orig_offset;
2296 
2297 	/* Make sure we're fully resumed: */
2298 	if (unlikely(timekeeping_suspended))
2299 		return false;
2300 
2301 	offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
2302 				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
2303 				   tk->tkr_mono.clock->max_raw_delta);
2304 	orig_offset = offset;
2305 	/* Check if there's really nothing to do */
2306 	if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
2307 		return false;
2308 
2309 	/*
2310 	 * With NO_HZ we may have to accumulate many cycle_intervals
2311 	 * (think "ticks") worth of time at once. To do this efficiently,
2312 	 * we calculate the largest doubling multiple of cycle_intervals
2313 	 * that is smaller than the offset.  We then accumulate that
2314 	 * chunk in one go, and then try to consume the next smaller
2315 	 * doubled multiple.
2316 	 */
2317 	shift = ilog2(offset) - ilog2(tk->cycle_interval);
2318 	shift = max(0, shift);
2319 	/* Bound shift to one less than what overflows tick_length */
2320 	maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1;
2321 	shift = min(shift, maxshift);
2322 	while (offset >= tk->cycle_interval) {
2323 		offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
2324 		if (offset < tk->cycle_interval<<shift)
2325 			shift--;
2326 	}
2327 
2328 	/* Adjust the multiplier to correct NTP error */
2329 	timekeeping_adjust(tk, offset);
2330 
2331 	/*
2332 	 * Finally, make sure that after the rounding
2333 	 * xtime_nsec isn't larger than NSEC_PER_SEC
2334 	 */
2335 	clock_set |= accumulate_nsecs_to_secs(tk);
2336 
2337 	/*
2338 	 * To avoid inconsistencies caused adjtimex TK_ADV_FREQ calls
2339 	 * making small negative adjustments to the base xtime_nsec
2340 	 * value, only update the coarse clocks if we accumulated time
2341 	 */
2342 	if (orig_offset != offset)
2343 		tk_update_coarse_nsecs(tk);
2344 
2345 	timekeeping_update_from_shadow(tkd, clock_set);
2346 
2347 	return !!clock_set;
2348 }
2349 
/* Advance the core timekeeper with tk_core.lock held */
static bool timekeeping_advance(enum timekeeping_adv_mode mode)
{
	guard(raw_spinlock_irqsave)(&tk_core.lock);
	return __timekeeping_advance(&tk_core, mode);
}
2355 
2356 /**
2357  * update_wall_time - Uses the current clocksource to increment the wall time
2358  *
2359  * It also updates the enabled auxiliary clock timekeepers
2360  */
2361 void update_wall_time(void)
2362 {
2363 	if (timekeeping_advance(TK_ADV_TICK))
2364 		clock_was_set_delayed();
2365 	tk_aux_advance();
2366 }
2367 
2368 /**
2369  * getboottime64 - Return the real time of system boot.
2370  * @ts:		pointer to the timespec64 to be set
2371  *
2372  * Returns the wall-time of boot in a timespec64.
2373  *
2374  * This is based on the wall_to_monotonic offset and the total suspend
2375  * time. Calls to settimeofday will affect the value returned (which
2376  * basically means that however wrong your real time clock is at boot time,
2377  * you get the right time here).
2378  */
2379 void getboottime64(struct timespec64 *ts)
2380 {
2381 	struct timekeeper *tk = &tk_core.timekeeper;
2382 	ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
2383 
2384 	*ts = ktime_to_timespec64(t);
2385 }
2386 EXPORT_SYMBOL_GPL(getboottime64);
2387 
/* Read coarse (tick-granular) CLOCK_REALTIME under the seqcount */
void ktime_get_coarse_real_ts64(struct timespec64 *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;

	do {
		/* Retry if a timekeeper update races with this read */
		seq = read_seqcount_begin(&tk_core.seq);

		*ts = tk_xtime_coarse(tk);
	} while (read_seqcount_retry(&tk_core.seq, seq));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
2400 
2401 /**
2402  * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor
2403  * @ts:		timespec64 to be filled
2404  *
2405  * Fetch the global mg_floor value, convert it to realtime and compare it
2406  * to the current coarse-grained time. Fill @ts with whichever is
2407  * latest. Note that this is a filesystem-specific interface and should be
2408  * avoided outside of that context.
2409  */
2410 void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
2411 {
2412 	struct timekeeper *tk = &tk_core.timekeeper;
2413 	u64 floor = atomic64_read(&mg_floor);
2414 	ktime_t f_real, offset, coarse;
2415 	unsigned int seq;
2416 
2417 	do {
2418 		seq = read_seqcount_begin(&tk_core.seq);
2419 		*ts = tk_xtime_coarse(tk);
2420 		offset = tk_core.timekeeper.offs_real;
2421 	} while (read_seqcount_retry(&tk_core.seq, seq));
2422 
2423 	coarse = timespec64_to_ktime(*ts);
2424 	f_real = ktime_add(floor, offset);
2425 	if (ktime_after(f_real, coarse))
2426 		*ts = ktime_to_timespec64(f_real);
2427 }
2428 
2429 /**
2430  * ktime_get_real_ts64_mg - attempt to update floor value and return result
2431  * @ts:		pointer to the timespec to be set
2432  *
2433  * Get a monotonic fine-grained time value and attempt to swap it into
2434  * mg_floor. If that succeeds then accept the new floor value. If it fails
2435  * then another task raced in during the interim time and updated the
2436  * floor.  Since any update to the floor must be later than the previous
2437  * floor, either outcome is acceptable.
2438  *
2439  * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(),
2440  * and determining that the resulting coarse-grained timestamp did not effect
2441  * a change in ctime. Any more recent floor value would effect a change to
2442  * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure.
2443  *
2444  * @ts will be filled with the latest floor value, regardless of the outcome of
2445  * the cmpxchg. Note that this is a filesystem specific interface and should be
2446  * avoided outside of that context.
2447  */
2448 void ktime_get_real_ts64_mg(struct timespec64 *ts)
2449 {
2450 	struct timekeeper *tk = &tk_core.timekeeper;
2451 	ktime_t old = atomic64_read(&mg_floor);
2452 	ktime_t offset, mono;
2453 	unsigned int seq;
2454 	u64 nsecs;
2455 
2456 	do {
2457 		seq = read_seqcount_begin(&tk_core.seq);
2458 
2459 		ts->tv_sec = tk->xtime_sec;
2460 		mono = tk->tkr_mono.base;
2461 		nsecs = timekeeping_get_ns(&tk->tkr_mono);
2462 		offset = tk_core.timekeeper.offs_real;
2463 	} while (read_seqcount_retry(&tk_core.seq, seq));
2464 
2465 	mono = ktime_add_ns(mono, nsecs);
2466 
2467 	/*
2468 	 * Attempt to update the floor with the new time value. As any
2469 	 * update must be later then the existing floor, and would effect
2470 	 * a change to ctime from the perspective of the current task,
2471 	 * accept the resulting floor value regardless of the outcome of
2472 	 * the swap.
2473 	 */
2474 	if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
2475 		ts->tv_nsec = 0;
2476 		timespec64_add_ns(ts, nsecs);
2477 		timekeeping_inc_mg_floor_swaps();
2478 	} else {
2479 		/*
2480 		 * Another task changed mg_floor since "old" was fetched.
2481 		 * "old" has been updated with the latest value of "mg_floor".
2482 		 * That value is newer than the previous floor value, which
2483 		 * is enough to effect a change to ctime. Accept it.
2484 		 */
2485 		*ts = ktime_to_timespec64(ktime_add(old, offset));
2486 	}
2487 }
2488 
/* Read coarse (tick-granular) CLOCK_MONOTONIC under the seqcount */
void ktime_get_coarse_ts64(struct timespec64 *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	struct timespec64 now, mono;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&tk_core.seq);

		now = tk_xtime_coarse(tk);
		mono = tk->wall_to_monotonic;
	} while (read_seqcount_retry(&tk_core.seq, seq));

	/* coarse monotonic = coarse realtime + wall_to_monotonic */
	set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
				  now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);
2506 
2507 /*
2508  * Must hold jiffies_lock
2509  */
2510 void do_timer(unsigned long ticks)
2511 {
2512 	jiffies_64 += ticks;
2513 	calc_global_load();
2514 }
2515 
2516 /**
2517  * ktime_get_update_offsets_now - hrtimer helper
2518  * @cwsseq:	pointer to check and store the clock was set sequence number
2519  * @offs_real:	pointer to storage for monotonic -> realtime offset
2520  * @offs_boot:	pointer to storage for monotonic -> boottime offset
2521  * @offs_tai:	pointer to storage for monotonic -> clock tai offset
2522  *
2523  * Returns current monotonic time and updates the offsets if the
2524  * sequence number in @cwsseq and timekeeper.clock_was_set_seq are
2525  * different.
2526  *
2527  * Called from hrtimer_interrupt() or retrigger_next_event()
2528  */
2529 ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
2530 				     ktime_t *offs_boot, ktime_t *offs_tai)
2531 {
2532 	struct timekeeper *tk = &tk_core.timekeeper;
2533 	unsigned int seq;
2534 	ktime_t base;
2535 	u64 nsecs;
2536 
2537 	do {
2538 		seq = read_seqcount_begin(&tk_core.seq);
2539 
2540 		base = tk->tkr_mono.base;
2541 		nsecs = timekeeping_get_ns(&tk->tkr_mono);
2542 		base = ktime_add_ns(base, nsecs);
2543 
2544 		if (*cwsseq != tk->clock_was_set_seq) {
2545 			*cwsseq = tk->clock_was_set_seq;
2546 			*offs_real = tk->offs_real;
2547 			*offs_boot = tk->offs_boot;
2548 			*offs_tai = tk->offs_tai;
2549 		}
2550 
2551 		/* Handle leapsecond insertion adjustments */
2552 		if (unlikely(base >= tk->next_leap_ktime))
2553 			*offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));
2554 
2555 	} while (read_seqcount_retry(&tk_core.seq, seq));
2556 
2557 	return base;
2558 }
2559 
2560 /*
2561  * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
2562  */
2563 static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock)
2564 {
2565 	if (txc->modes & ADJ_ADJTIME) {
2566 		/* singleshot must not be used with any other mode bits */
2567 		if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
2568 			return -EINVAL;
2569 		if (!(txc->modes & ADJ_OFFSET_READONLY) &&
2570 		    !capable(CAP_SYS_TIME))
2571 			return -EPERM;
2572 	} else {
2573 		/* In order to modify anything, you gotta be super-user! */
2574 		if (txc->modes && !capable(CAP_SYS_TIME))
2575 			return -EPERM;
2576 		/*
2577 		 * if the quartz is off by more than 10% then
2578 		 * something is VERY wrong!
2579 		 */
2580 		if (txc->modes & ADJ_TICK &&
2581 		    (txc->tick <  900000/USER_HZ ||
2582 		     txc->tick > 1100000/USER_HZ))
2583 			return -EINVAL;
2584 	}
2585 
2586 	if (txc->modes & ADJ_SETOFFSET) {
2587 		/* In order to inject time, you gotta be super-user! */
2588 		if (!capable(CAP_SYS_TIME))
2589 			return -EPERM;
2590 
2591 		/*
2592 		 * Validate if a timespec/timeval used to inject a time
2593 		 * offset is valid.  Offsets can be positive or negative, so
2594 		 * we don't check tv_sec. The value of the timeval/timespec
2595 		 * is the sum of its fields,but *NOTE*:
2596 		 * The field tv_usec/tv_nsec must always be non-negative and
2597 		 * we can't have more nanoseconds/microseconds than a second.
2598 		 */
2599 		if (txc->time.tv_usec < 0)
2600 			return -EINVAL;
2601 
2602 		if (txc->modes & ADJ_NANO) {
2603 			if (txc->time.tv_usec >= NSEC_PER_SEC)
2604 				return -EINVAL;
2605 		} else {
2606 			if (txc->time.tv_usec >= USEC_PER_SEC)
2607 				return -EINVAL;
2608 		}
2609 	}
2610 
2611 	/*
2612 	 * Check for potential multiplication overflows that can
2613 	 * only happen on 64-bit systems:
2614 	 */
2615 	if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
2616 		if (LLONG_MIN / PPM_SCALE > txc->freq)
2617 			return -EINVAL;
2618 		if (LLONG_MAX / PPM_SCALE < txc->freq)
2619 			return -EINVAL;
2620 	}
2621 
2622 	if (aux_clock) {
2623 		/* Auxiliary clocks are similar to TAI and do not have leap seconds */
2624 		if (txc->status & (STA_INS | STA_DEL))
2625 			return -EINVAL;
2626 
2627 		/* No TAI offset setting */
2628 		if (txc->modes & ADJ_TAI)
2629 			return -EINVAL;
2630 
2631 		/* No PPS support either */
2632 		if (txc->status & (STA_PPSFREQ | STA_PPSTIME))
2633 			return -EINVAL;
2634 	}
2635 
2636 	return 0;
2637 }
2638 
2639 /**
2640  * random_get_entropy_fallback - Returns the raw clock source value,
2641  * used by random.c for platforms with no valid random_get_entropy().
2642  */
2643 unsigned long random_get_entropy_fallback(void)
2644 {
2645 	struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;
2646 	struct clocksource *clock = READ_ONCE(tkr->clock);
2647 
2648 	if (unlikely(timekeeping_suspended || !clock))
2649 		return 0;
2650 	return clock->read(clock);
2651 }
2652 EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
2653 
/* Side effects collected by __do_adjtimex() for the caller to act upon */
struct adjtimex_result {
	struct audit_ntp_data	ad;		/* audit data filled by ntp_adjtimex() */
	struct timespec64	delta;		/* offset injected via ADJ_SETOFFSET */
	bool			clock_set;	/* clock_was_set() notification required */
};
2659 
/*
 * __do_adjtimex - Perform an adjtimex operation on a timekeeper
 * @tkd:    timekeeper data (core or auxiliary clock)
 * @txc:    NTP parameters from/to user space
 * @result: collects injected offset, audit data and clock-set state
 *
 * Returns: a negative error code on failure, otherwise the value
 * returned by ntp_adjtimex().
 */
static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc,
			 struct adjtimex_result *result)
{
	struct timekeeper *tks = &tkd->shadow_timekeeper;
	bool aux_clock = !timekeeper_is_core_tk(tks);
	struct timespec64 ts;
	s32 orig_tai, tai;
	int ret;

	/* Validate the data before disabling interrupts */
	ret = timekeeping_validate_timex(txc, aux_clock);
	if (ret)
		return ret;
	add_device_randomness(txc, sizeof(*txc));

	/* Snapshot the clock's current time before taking the lock */
	if (!aux_clock)
		ktime_get_real_ts64(&ts);
	else
		tk_get_aux_ts64(tkd->timekeeper.id, &ts);

	add_device_randomness(&ts, sizeof(ts));

	guard(raw_spinlock_irqsave)(&tkd->lock);

	/* Bail out if the timekeeper's clock is not valid (e.g. aux disabled) */
	if (!tks->clock_valid)
		return -ENODEV;

	if (txc->modes & ADJ_SETOFFSET) {
		result->delta.tv_sec  = txc->time.tv_sec;
		result->delta.tv_nsec = txc->time.tv_usec;
		/* Without ADJ_NANO the tv_usec field holds microseconds */
		if (!(txc->modes & ADJ_NANO))
			result->delta.tv_nsec *= 1000;
		ret = __timekeeping_inject_offset(tkd, &result->delta);
		if (ret)
			return ret;
		result->clock_set = true;
	}

	orig_tai = tai = tks->tai_offset;
	ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad);

	if (tai != orig_tai) {
		__timekeeping_set_tai_offset(tks, tai);
		timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET);
		result->clock_set = true;
	} else {
		tk_update_leap_state_all(&tk_core);
	}

	/* Update the multiplier immediately if frequency was set directly */
	if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
		result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ);

	return ret;
}
2715 
2716 /**
2717  * do_adjtimex() - Accessor function to NTP __do_adjtimex function
2718  * @txc:	Pointer to kernel_timex structure containing NTP parameters
2719  */
2720 int do_adjtimex(struct __kernel_timex *txc)
2721 {
2722 	struct adjtimex_result result = { };
2723 	int ret;
2724 
2725 	ret = __do_adjtimex(&tk_core, txc, &result);
2726 	if (ret < 0)
2727 		return ret;
2728 
2729 	if (txc->modes & ADJ_SETOFFSET)
2730 		audit_tk_injoffset(result.delta);
2731 
2732 	audit_ntp_log(&result.ad);
2733 
2734 	if (result.clock_set)
2735 		clock_was_set(CLOCK_SET_WALL);
2736 
2737 	ntp_notify_cmos_timer(result.delta.tv_sec != 0);
2738 
2739 	return ret;
2740 }
2741 
2742 /*
2743  * Invoked from NTP with the time keeper lock held, so lockless access is
2744  * fine.
2745  */
2746 long ktime_get_ntp_seconds(unsigned int id)
2747 {
2748 	return timekeeper_data[id].timekeeper.xtime_sec;
2749 }
2750 
2751 #ifdef CONFIG_NTP_PPS
2752 /**
2753  * hardpps() - Accessor function to NTP __hardpps function
2754  * @phase_ts:	Pointer to timespec64 structure representing phase timestamp
2755  * @raw_ts:	Pointer to timespec64 structure representing raw timestamp
2756  */
2757 void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
2758 {
2759 	guard(raw_spinlock_irqsave)(&tk_core.lock);
2760 	__hardpps(phase_ts, raw_ts);
2761 }
2762 EXPORT_SYMBOL(hardpps);
2763 #endif /* CONFIG_NTP_PPS */
2764 
2765 #ifdef CONFIG_POSIX_AUX_CLOCKS
2766 #include "posix-timers.h"
2767 
2768 /*
2769  * Bitmap for the activated auxiliary timekeepers to allow lockless quick
2770  * checks in the hot paths without touching extra cache lines. If set, then
2771  * the state of the corresponding timekeeper has to be re-checked under
2772  * timekeeper::lock.
2773  */
2774 static unsigned long aux_timekeepers;
2775 
2776 static inline unsigned int clockid_to_tkid(unsigned int id)
2777 {
2778 	return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX;
2779 }
2780 
2781 static inline struct tk_data *aux_get_tk_data(clockid_t id)
2782 {
2783 	if (!clockid_aux_valid(id))
2784 		return NULL;
2785 	return &timekeeper_data[clockid_to_tkid(id)];
2786 }
2787 
/* Invoked from timekeeping after a clocksource change */
static void tk_aux_update_clocksource(void)
{
	unsigned long active = READ_ONCE(aux_timekeepers);
	unsigned int id;

	for_each_set_bit(id, &active, BITS_PER_LONG) {
		struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];
		struct timekeeper *tks = &tkd->shadow_timekeeper;

		/* Re-check activation state under the per-timekeeper lock */
		guard(raw_spinlock_irqsave)(&tkd->lock);
		if (!tks->clock_valid)
			continue;

		/* Switch the aux timekeeper over to the core clocksource */
		timekeeping_forward_now(tks);
		tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock);
		timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
	}
}
2807 
/* Advance the active auxiliary timekeepers, called from update_wall_time() */
static void tk_aux_advance(void)
{
	unsigned long active = READ_ONCE(aux_timekeepers);
	unsigned int id;

	/* Lockless quick check to avoid extra cache lines */
	for_each_set_bit(id, &active, BITS_PER_LONG) {
		struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];

		guard(raw_spinlock)(&aux_tkd->lock);
		if (aux_tkd->shadow_timekeeper.clock_valid)
			__timekeeping_advance(aux_tkd, TK_ADV_TICK);
	}
}
2822 
2823 /**
2824  * ktime_get_aux - Get time for a AUX clock
2825  * @id:	ID of the clock to read (CLOCK_AUX...)
2826  * @kt:	Pointer to ktime_t to store the time stamp
2827  *
2828  * Returns: True if the timestamp is valid, false otherwise
2829  */
2830 bool ktime_get_aux(clockid_t id, ktime_t *kt)
2831 {
2832 	struct tk_data *aux_tkd = aux_get_tk_data(id);
2833 	struct timekeeper *aux_tk;
2834 	unsigned int seq;
2835 	ktime_t base;
2836 	u64 nsecs;
2837 
2838 	WARN_ON(timekeeping_suspended);
2839 
2840 	if (!aux_tkd)
2841 		return false;
2842 
2843 	aux_tk = &aux_tkd->timekeeper;
2844 	do {
2845 		seq = read_seqcount_begin(&aux_tkd->seq);
2846 		if (!aux_tk->clock_valid)
2847 			return false;
2848 
2849 		base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux);
2850 		nsecs = timekeeping_get_ns(&aux_tk->tkr_mono);
2851 	} while (read_seqcount_retry(&aux_tkd->seq, seq));
2852 
2853 	*kt = ktime_add_ns(base, nsecs);
2854 	return true;
2855 }
2856 EXPORT_SYMBOL_GPL(ktime_get_aux);
2857 
2858 /**
2859  * ktime_get_aux_ts64 - Get time for a AUX clock
2860  * @id:	ID of the clock to read (CLOCK_AUX...)
2861  * @ts:	Pointer to timespec64 to store the time stamp
2862  *
2863  * Returns: True if the timestamp is valid, false otherwise
2864  */
2865 bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts)
2866 {
2867 	ktime_t now;
2868 
2869 	if (!ktime_get_aux(id, &now))
2870 		return false;
2871 	*ts = ktime_to_timespec64(now);
2872 	return true;
2873 }
2874 EXPORT_SYMBOL_GPL(ktime_get_aux_ts64);
2875 
2876 static int aux_get_res(clockid_t id, struct timespec64 *tp)
2877 {
2878 	if (!clockid_aux_valid(id))
2879 		return -ENODEV;
2880 
2881 	tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC;
2882 	tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC;
2883 	return 0;
2884 }
2885 
2886 static int aux_get_timespec(clockid_t id, struct timespec64 *tp)
2887 {
2888 	return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV;
2889 }
2890 
/*
 * clock_settime(2) for auxiliary clocks: sets the clock by recomputing its
 * offset against the forwarded timekeeper base time.
 */
static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew)
{
	struct tk_data *aux_tkd = aux_get_tk_data(id);
	struct timekeeper *aux_tks;
	ktime_t tnow, nsecs;

	if (!timespec64_valid_settod(tnew))
		return -EINVAL;
	/* Invalid clock ID */
	if (!aux_tkd)
		return -ENODEV;

	aux_tks = &aux_tkd->shadow_timekeeper;

	guard(raw_spinlock_irq)(&aux_tkd->lock);
	/* Setting a disabled clock is not possible */
	if (!aux_tks->clock_valid)
		return -ENODEV;

	/* Forward the timekeeper base time */
	timekeeping_forward_now(aux_tks);
	/*
	 * Get the updated base time. tkr_mono.base has not been
	 * updated yet, so do that first. That makes the update
	 * in timekeeping_update_from_shadow() redundant, but
	 * that's harmless. After that @tnow can be calculated
	 * by using tkr_mono::cycle_last, which has been set
	 * by timekeeping_forward_now().
	 */
	tk_update_ktime_data(aux_tks);
	nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last);
	tnow = ktime_add(aux_tks->tkr_mono.base, nsecs);

	/*
	 * Calculate the new AUX offset as delta to @tnow ("monotonic").
	 * That avoids all the tk::xtime back and forth conversions as
	 * xtime ("realtime") is not applicable for auxiliary clocks and
	 * kept in sync with "monotonic".
	 */
	aux_tks->offs_aux = ktime_sub(timespec64_to_ktime(*tnew), tnow);

	timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
	return 0;
}
2933 
2934 static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc)
2935 {
2936 	struct tk_data *aux_tkd = aux_get_tk_data(id);
2937 	struct adjtimex_result result = { };
2938 
2939 	if (!aux_tkd)
2940 		return -ENODEV;
2941 
2942 	/*
2943 	 * @result is ignored for now as there are neither hrtimers nor a
2944 	 * RTC related to auxiliary clocks for now.
2945 	 */
2946 	return __do_adjtimex(aux_tkd, txc, &result);
2947 }
2948 
/* POSIX clock operations for the auxiliary clocks (CLOCK_AUX...) */
const struct k_clock clock_aux = {
	.clock_getres		= aux_get_res,
	.clock_get_timespec	= aux_get_timespec,
	.clock_set		= aux_clock_set,
	.clock_adj		= aux_clock_adj,
};
2955 
/*
 * Enable the auxiliary clock @id: (re)initialize its shadow timekeeper
 * from the current system clocksource and mark it valid.
 *
 * Called with aux_clock_mutex held, see aux_clock_enable_store().
 */
static void aux_clock_enable(clockid_t id)
{
	struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw;
	struct tk_data *aux_tkd = aux_get_tk_data(id);
	struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper;

	/* Prevent the core timekeeper from changing. */
	guard(raw_spinlock_irq)(&tk_core.lock);

	/*
	 * Setup the auxiliary clock assuming that the raw core timekeeper
	 * clock frequency conversion is close enough. Userspace has to
	 * adjust for the deviation via clock_adjtime(2).
	 */
	guard(raw_spinlock_nested)(&aux_tkd->lock);

	/* Remove leftovers of a previous registration */
	memset(aux_tks, 0, sizeof(*aux_tks));
	/* Restore the timekeeper id */
	aux_tks->id = aux_tkd->timekeeper.id;
	/* Setup the timekeeper based on the current system clocksource */
	tk_setup_internals(aux_tks, tkr_raw->clock);

	/* Mark it valid and set it live */
	aux_tks->clock_valid = true;
	timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
}
2983 
/*
 * Disable the auxiliary clock @id by invalidating its timekeeper. Readers
 * observe clock_valid == false afterwards.
 *
 * Called with aux_clock_mutex held, see aux_clock_enable_store().
 */
static void aux_clock_disable(clockid_t id)
{
	struct tk_data *aux_tkd = aux_get_tk_data(id);

	guard(raw_spinlock_irq)(&aux_tkd->lock);
	aux_tkd->shadow_timekeeper.clock_valid = false;
	/* Propagate the invalid state to the live timekeeper */
	timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
}
2992 
/* Serializes enable/disable operations and the aux_timekeepers bitmap */
static DEFINE_MUTEX(aux_clock_mutex);
2994 
/*
 * sysfs store handler to enable or disable an auxiliary clock. The name of
 * the enclosing kobject directory ("0".."7") selects the clock.
 */
static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	/* Lazy atoi() as name is "0..7" */
	int id = kobj->name[0] & 0x7;
	bool enable;

	/* Requires the same privilege as setting the system clock */
	if (!capable(CAP_SYS_TIME))
		return -EPERM;

	if (kstrtobool(buf, &enable) < 0)
		return -EINVAL;

	guard(mutex)(&aux_clock_mutex);
	/* Nothing to do when the requested state is already set */
	if (enable == test_bit(id, &aux_timekeepers))
		return count;

	if (enable) {
		aux_clock_enable(CLOCK_AUX + id);
		set_bit(id, &aux_timekeepers);
	} else {
		aux_clock_disable(CLOCK_AUX + id);
		clear_bit(id, &aux_timekeepers);
	}
	return count;
}
3021 
/* sysfs show handler: report whether the auxiliary clock is enabled */
static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
	unsigned long active = READ_ONCE(aux_timekeepers);
	/* Lazy atoi() as name is "0..7" */
	int id = kobj->name[0] & 0x7;

	return sysfs_emit(buf, "%d\n", test_bit(id, &active));
}
3030 
/* Per-clock "aux_clock_enable" attribute; writes require CAP_SYS_TIME */
static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable);

static struct attribute *aux_clock_enable_attrs[] = {
	&aux_clock_enable_attr.attr,
	NULL
};

static const struct attribute_group aux_clock_enable_attr_group = {
	.attrs = aux_clock_enable_attrs,
};
3041 
3042 static int __init tk_aux_sysfs_init(void)
3043 {
3044 	struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj);
3045 
3046 	if (!tko)
3047 		return -ENOMEM;
3048 
3049 	auxo = kobject_create_and_add("aux_clocks", tko);
3050 	if (!auxo) {
3051 		kobject_put(tko);
3052 		return -ENOMEM;
3053 	}
3054 
3055 	for (int i = 0; i <= MAX_AUX_CLOCKS; i++) {
3056 		char id[2] = { [0] = '0' + i, };
3057 		struct kobject *clk = kobject_create_and_add(id, auxo);
3058 
3059 		if (!clk)
3060 			return -ENOMEM;
3061 
3062 		int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group);
3063 
3064 		if (ret)
3065 			return ret;
3066 	}
3067 	return 0;
3068 }
3069 late_initcall(tk_aux_sysfs_init);
3070 
/* Boot time initialization of the auxiliary timekeepers' tk_data */
static __init void tk_aux_setup(void)
{
	for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++)
		tkd_basic_setup(&timekeeper_data[i], i, false);
}
3076 #endif /* CONFIG_POSIX_AUX_CLOCKS */
3077