xref: /linux/kernel/time/hrtimer.c (revision 5d330d652d7a455b2215c38e7b0c6149c6f8225d)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
4  *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
5  *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
6  *
7  *  High-resolution kernel timers
8  *
9  *  In contrast to the low-resolution timeout API, aka timer wheel,
10  *  hrtimers provide finer resolution and accuracy depending on system
11  *  configuration and capabilities.
12  *
13  *  Started by: Thomas Gleixner and Ingo Molnar
14  *
15  *  Credits:
16  *	Based on the original timer wheel code
17  *
18  *	Help, testing, suggestions, bugfixes, improvements were
19  *	provided by:
20  *
21  *	George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
22  *	et. al.
23  */
24 
25 #include <linux/cpu.h>
26 #include <linux/export.h>
27 #include <linux/percpu.h>
28 #include <linux/hrtimer.h>
29 #include <linux/notifier.h>
30 #include <linux/syscalls.h>
31 #include <linux/interrupt.h>
32 #include <linux/tick.h>
33 #include <linux/err.h>
34 #include <linux/debugobjects.h>
35 #include <linux/sched/signal.h>
36 #include <linux/sched/sysctl.h>
37 #include <linux/sched/rt.h>
38 #include <linux/sched/deadline.h>
39 #include <linux/sched/nohz.h>
40 #include <linux/sched/debug.h>
41 #include <linux/sched/isolation.h>
42 #include <linux/timer.h>
43 #include <linux/freezer.h>
44 #include <linux/compat.h>
45 
46 #include <linux/uaccess.h>
47 
48 #include <trace/events/timer.h>
49 
50 #include "tick-internal.h"
51 
52 /*
53  * Constants to set the queued state of the timer (INACTIVE, ENQUEUED)
54  *
55  * The callback state is kept separate in the CPU base because having it in
 * the timer would require touching the timer after the callback, which
57  * makes it impossible to free the timer from the callback function.
58  *
59  * Therefore we track the callback state in:
60  *
61  *	timer->base->cpu_base->running == timer
62  *
63  * On SMP it is possible to have a "callback function running and enqueued"
64  * status. It happens for example when a posix timer expired and the callback
65  * queued a signal. Between dropping the lock which protects the posix timer
66  * and reacquiring the base lock of the hrtimer, another CPU can deliver the
67  * signal and rearm the timer.
68  *
69  * All state transitions are protected by cpu_base->lock.
70  */
71 #define HRTIMER_STATE_INACTIVE	false
72 #define HRTIMER_STATE_ENQUEUED	true
73 
74 /*
75  * The resolution of the clocks. The resolution value is returned in
76  * the clock_getres() system call to give application programmers an
77  * idea of the (in)accuracy of timers. Timer values are rounded up to
 * this resolution value.
79  */
80 #define HIGH_RES_NSEC		1
81 
82 /*
83  * Masks for selecting the soft and hard context timers from
84  * cpu_base->active
85  */
86 #define MASK_SHIFT		(HRTIMER_BASE_MONOTONIC_SOFT)
87 #define HRTIMER_ACTIVE_HARD	((1U << MASK_SHIFT) - 1)
88 #define HRTIMER_ACTIVE_SOFT	(HRTIMER_ACTIVE_HARD << MASK_SHIFT)
89 #define HRTIMER_ACTIVE_ALL	(HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
90 
91 static void retrigger_next_event(void *arg);
92 static ktime_t __hrtimer_cb_get_time(clockid_t clock_id);
93 
94 /*
95  * The timer bases:
96  *
97  * There are more clockids than hrtimer bases. Thus, we index
98  * into the timer bases by the hrtimer_base_type enum. When trying
99  * to reach a base using a clockid, hrtimer_clockid_to_base()
100  * is used to convert from clockid to the proper hrtimer_base_type.
101  */
102 
103 #define BASE_INIT(idx, cid)			\
104 	[idx] = { .index = idx, .clockid = cid }
105 
106 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
107 {
108 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
109 	.clock_base = {
110 		BASE_INIT(HRTIMER_BASE_MONOTONIC,	CLOCK_MONOTONIC),
111 		BASE_INIT(HRTIMER_BASE_REALTIME,	CLOCK_REALTIME),
112 		BASE_INIT(HRTIMER_BASE_BOOTTIME,	CLOCK_BOOTTIME),
113 		BASE_INIT(HRTIMER_BASE_TAI,		CLOCK_TAI),
114 		BASE_INIT(HRTIMER_BASE_MONOTONIC_SOFT,	CLOCK_MONOTONIC),
115 		BASE_INIT(HRTIMER_BASE_REALTIME_SOFT,	CLOCK_REALTIME),
116 		BASE_INIT(HRTIMER_BASE_BOOTTIME_SOFT,	CLOCK_BOOTTIME),
117 		BASE_INIT(HRTIMER_BASE_TAI_SOFT,	CLOCK_TAI),
118 	},
119 	.csd = CSD_INIT(retrigger_next_event, NULL)
120 };
121 
122 static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
123 {
124 	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
125 		return true;
126 	else
127 		return likely(base->online);
128 }
129 
#ifdef CONFIG_HIGH_RES_TIMERS
DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key);

/* Work callback: enable the high resolution static key */
static void hrtimer_hres_workfn(struct work_struct *work)
{
	static_branch_enable(&hrtimer_highres_enabled_key);
}

static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn);

/*
 * Queue the work which enables the static key, unless
 * hrtimer_highres_enabled() already reports true.
 */
static inline void hrtimer_schedule_hres_work(void)
{
	if (hrtimer_highres_enabled())
		return;

	schedule_work(&hrtimer_hres_work);
}
#else
/* No high resolution support configured: nothing to schedule */
static inline void hrtimer_schedule_hres_work(void) { }
#endif
148 
149 /*
150  * Functions and macros which are different for UP/SMP systems are kept in a
151  * single place
152  */
153 #ifdef CONFIG_SMP
154 /*
155  * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
156  * such that hrtimer_callback_running() can unconditionally dereference
157  * timer->base->cpu_base
158  */
159 static struct hrtimer_cpu_base migration_cpu_base = {
160 	.clock_base = {
161 		[0] = {
162 			.cpu_base = &migration_cpu_base,
163 			.seq      = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
164 							     &migration_cpu_base.lock),
165 		},
166 	},
167 };
168 
169 #define migration_base	migration_cpu_base.clock_base[0]
170 
171 /*
172  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
173  * means that all timers which are tied to this base via timer->base are
174  * locked, and the base itself is locked too.
175  *
176  * So __run_timers/migrate_timers can safely modify all timers which could
177  * be found on the lists/queues.
178  *
179  * When the timer's base is locked, and the timer removed from list, it is
180  * possible to set timer->base = &migration_base and drop the lock: the timer
181  * remains locked.
182  */
183 static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
184 						    unsigned long *flags)
185 	__acquires(&timer->base->lock)
186 {
187 	for (;;) {
188 		struct hrtimer_clock_base *base = READ_ONCE(timer->base);
189 
190 		if (likely(base != &migration_base)) {
191 			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
192 			if (likely(base == timer->base))
193 				return base;
194 			/* The timer has migrated to another CPU: */
195 			raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
196 		}
197 		cpu_relax();
198 	}
199 }
200 
201 /*
202  * Check if the elected target is suitable considering its next
203  * event and the hotplug state of the current CPU.
204  *
205  * If the elected target is remote and its next event is after the timer
206  * to queue, then a remote reprogram is necessary. However there is no
207  * guarantee the IPI handling the operation would arrive in time to meet
208  * the high resolution deadline. In this case the local CPU becomes a
209  * preferred target, unless it is offline.
210  *
211  * High and low resolution modes are handled the same way for simplicity.
212  *
213  * Called with cpu_base->lock of target cpu held.
214  */
215 static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
216 				    struct hrtimer_cpu_base *new_cpu_base,
217 				    struct hrtimer_cpu_base *this_cpu_base)
218 {
219 	ktime_t expires;
220 
221 	/*
222 	 * The local CPU clockevent can be reprogrammed. Also get_target_base()
223 	 * guarantees it is online.
224 	 */
225 	if (new_cpu_base == this_cpu_base)
226 		return true;
227 
228 	/*
229 	 * The offline local CPU can't be the default target if the
230 	 * next remote target event is after this timer. Keep the
231 	 * elected new base. An IPI will be issued to reprogram
232 	 * it as a last resort.
233 	 */
234 	if (!hrtimer_base_is_online(this_cpu_base))
235 		return true;
236 
237 	expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
238 
239 	return expires >= new_base->cpu_base->expires_next;
240 }
241 
242 static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, bool pinned)
243 {
244 	if (!hrtimer_base_is_online(base)) {
245 		int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
246 
247 		return &per_cpu(hrtimer_bases, cpu);
248 	}
249 
250 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
251 	if (static_branch_likely(&timers_migration_enabled) && !pinned)
252 		return &per_cpu(hrtimer_bases, get_nohz_timer_target());
253 #endif
254 	return base;
255 }
256 
257 /*
258  * We switch the timer base to a power-optimized selected CPU target,
259  * if:
260  *	- NO_HZ_COMMON is enabled
261  *	- timer migration is enabled
262  *	- the timer callback is not running
263  *	- the timer is not the first expiring timer on the new target
264  *
265  * If one of the above requirements is not fulfilled we move the timer
266  * to the current CPU or leave it on the previously assigned CPU if
267  * the timer callback is currently running.
268  */
269 static inline struct hrtimer_clock_base *
270 switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, bool pinned)
271 {
272 	struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
273 	struct hrtimer_clock_base *new_base;
274 	int basenum = base->index;
275 
276 	this_cpu_base = this_cpu_ptr(&hrtimer_bases);
277 	new_cpu_base = get_target_base(this_cpu_base, pinned);
278 again:
279 	new_base = &new_cpu_base->clock_base[basenum];
280 
281 	if (base != new_base) {
282 		/*
283 		 * We are trying to move timer to new_base. However we can't
284 		 * change timer's base while it is running, so we keep it on
285 		 * the same CPU. No hassle vs. reprogramming the event source
286 		 * in the high resolution case. The remote CPU will take care
287 		 * of this when the timer function has completed. There is no
288 		 * conflict as we hold the lock until the timer is enqueued.
289 		 */
290 		if (unlikely(hrtimer_callback_running(timer)))
291 			return base;
292 
293 		/* See the comment in lock_hrtimer_base() */
294 		WRITE_ONCE(timer->base, &migration_base);
295 		raw_spin_unlock(&base->cpu_base->lock);
296 		raw_spin_lock(&new_base->cpu_base->lock);
297 
298 		if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
299 			raw_spin_unlock(&new_base->cpu_base->lock);
300 			raw_spin_lock(&base->cpu_base->lock);
301 			new_cpu_base = this_cpu_base;
302 			WRITE_ONCE(timer->base, base);
303 			goto again;
304 		}
305 		WRITE_ONCE(timer->base, new_base);
306 	} else {
307 		if (!hrtimer_suitable_target(timer, new_base,  new_cpu_base, this_cpu_base)) {
308 			new_cpu_base = this_cpu_base;
309 			goto again;
310 		}
311 	}
312 	return new_base;
313 }
314 
315 #else /* CONFIG_SMP */
316 
317 static inline struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
318 							   unsigned long *flags)
319 	__acquires(&timer->base->cpu_base->lock)
320 {
321 	struct hrtimer_clock_base *base = timer->base;
322 
323 	raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
324 	return base;
325 }
326 
327 # define switch_hrtimer_base(t, b, p)	(b)
328 
329 #endif	/* !CONFIG_SMP */
330 
331 /*
332  * Functions for the union type storage format of ktime_t which are
333  * too large for inlining:
334  */
335 #if BITS_PER_LONG < 64
336 /*
337  * Divide a ktime value by a nanosecond value
338  */
339 s64 __ktime_divns(const ktime_t kt, s64 div)
340 {
341 	int sft = 0;
342 	s64 dclc;
343 	u64 tmp;
344 
345 	dclc = ktime_to_ns(kt);
346 	tmp = dclc < 0 ? -dclc : dclc;
347 
348 	/* Make sure the divisor is less than 2^32: */
349 	while (div >> 32) {
350 		sft++;
351 		div >>= 1;
352 	}
353 	tmp >>= sft;
354 	do_div(tmp, (u32) div);
355 	return dclc < 0 ? -tmp : tmp;
356 }
357 EXPORT_SYMBOL_GPL(__ktime_divns);
358 #endif /* BITS_PER_LONG < 64 */
359 
360 /*
361  * Add two ktime values and do a safety check for overflow:
362  */
363 ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
364 {
365 	ktime_t res = ktime_add_unsafe(lhs, rhs);
366 
367 	/*
368 	 * We use KTIME_SEC_MAX here, the maximum timeout which we can
369 	 * return to user space in a timespec:
370 	 */
371 	if (res < 0 || res < lhs || res < rhs)
372 		res = ktime_set(KTIME_SEC_MAX, 0);
373 
374 	return res;
375 }
376 
377 EXPORT_SYMBOL_GPL(ktime_add_safe);
378 
379 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
380 
381 static const struct debug_obj_descr hrtimer_debug_descr;
382 
383 static void *hrtimer_debug_hint(void *addr)
384 {
385 	return ACCESS_PRIVATE((struct hrtimer *)addr, function);
386 }
387 
388 /*
389  * fixup_init is called when:
390  * - an active object is initialized
391  */
392 static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
393 {
394 	struct hrtimer *timer = addr;
395 
396 	switch (state) {
397 	case ODEBUG_STATE_ACTIVE:
398 		hrtimer_cancel(timer);
399 		debug_object_init(timer, &hrtimer_debug_descr);
400 		return true;
401 	default:
402 		return false;
403 	}
404 }
405 
406 /*
407  * fixup_activate is called when:
408  * - an active object is activated
409  * - an unknown non-static object is activated
410  */
411 static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
412 {
413 	switch (state) {
414 	case ODEBUG_STATE_ACTIVE:
415 		WARN_ON(1);
416 		fallthrough;
417 	default:
418 		return false;
419 	}
420 }
421 
422 /*
423  * fixup_free is called when:
424  * - an active object is freed
425  */
426 static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
427 {
428 	struct hrtimer *timer = addr;
429 
430 	switch (state) {
431 	case ODEBUG_STATE_ACTIVE:
432 		hrtimer_cancel(timer);
433 		debug_object_free(timer, &hrtimer_debug_descr);
434 		return true;
435 	default:
436 		return false;
437 	}
438 }
439 
440 /* Stub timer callback for improperly used timers. */
441 static enum hrtimer_restart stub_timer(struct hrtimer *unused)
442 {
443 	WARN_ON_ONCE(1);
444 	return HRTIMER_NORESTART;
445 }
446 
447 /*
448  * hrtimer_fixup_assert_init is called when:
449  * - an untracked/uninit-ed object is found
450  */
451 static bool hrtimer_fixup_assert_init(void *addr, enum debug_obj_state state)
452 {
453 	struct hrtimer *timer = addr;
454 
455 	switch (state) {
456 	case ODEBUG_STATE_NOTAVAILABLE:
457 		hrtimer_setup(timer, stub_timer, CLOCK_MONOTONIC, 0);
458 		return true;
459 	default:
460 		return false;
461 	}
462 }
463 
464 static const struct debug_obj_descr hrtimer_debug_descr = {
465 	.name			= "hrtimer",
466 	.debug_hint		= hrtimer_debug_hint,
467 	.fixup_init		= hrtimer_fixup_init,
468 	.fixup_activate		= hrtimer_fixup_activate,
469 	.fixup_free		= hrtimer_fixup_free,
470 	.fixup_assert_init	= hrtimer_fixup_assert_init,
471 };
472 
473 static inline void debug_hrtimer_init(struct hrtimer *timer)
474 {
475 	debug_object_init(timer, &hrtimer_debug_descr);
476 }
477 
478 static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer)
479 {
480 	debug_object_init_on_stack(timer, &hrtimer_debug_descr);
481 }
482 
483 static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode)
484 {
485 	debug_object_activate(timer, &hrtimer_debug_descr);
486 }
487 
488 static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
489 {
490 	debug_object_deactivate(timer, &hrtimer_debug_descr);
491 }
492 
493 static inline void debug_hrtimer_assert_init(struct hrtimer *timer)
494 {
495 	debug_object_assert_init(timer, &hrtimer_debug_descr);
496 }
497 
498 void destroy_hrtimer_on_stack(struct hrtimer *timer)
499 {
500 	debug_object_free(timer, &hrtimer_debug_descr);
501 }
502 EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
503 
504 #else
505 
506 static inline void debug_hrtimer_init(struct hrtimer *timer) { }
507 static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { }
508 static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { }
509 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
510 static inline void debug_hrtimer_assert_init(struct hrtimer *timer) { }
511 #endif
512 
513 static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode)
514 {
515 	debug_hrtimer_init(timer);
516 	trace_hrtimer_setup(timer, clockid, mode);
517 }
518 
519 static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid,
520 					enum hrtimer_mode mode)
521 {
522 	debug_hrtimer_init_on_stack(timer);
523 	trace_hrtimer_setup(timer, clockid, mode);
524 }
525 
526 static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode, bool was_armed)
527 {
528 	debug_hrtimer_activate(timer, mode);
529 	trace_hrtimer_start(timer, mode, was_armed);
530 }
531 
/*
 * Iterate over all clock bases whose bit is set in @active.
 *
 * @base:	iterator, assigned a pointer to the current clock base
 * @cpu_base:	pointer to the object holding the clock_base[] array
 * @active:	bitmask lvalue; each visited bit is cleared, so it is 0 once
 *		the loop has run to completion
 *
 * The outer loop picks the lowest set bit, the two inner loops execute the
 * body exactly once for it and clear the bit afterwards. A 'continue' in
 * the body therefore advances to the next set bit. The macro declares the
 * loop locals 'idx' and 'done'; the body must not use those names for its
 * own variables.
 *
 * All parameters are parenthesized so that expressions like '&obj' can be
 * passed as arguments.
 */
#define for_each_active_base(base, cpu_base, active)					\
	for (unsigned int idx = ffs(active); idx--; idx = ffs((active)))		\
		for (bool done = false; !done; (active) &= ~(1U << idx))		\
			for ((base) = &(cpu_base)->clock_base[idx]; !done; done = true)
536 
537 #define hrtimer_from_timerqueue_node(_n) container_of_const(_n, struct hrtimer, node)
538 
539 #if defined(CONFIG_NO_HZ_COMMON)
540 /*
541  * Same as hrtimer_bases_next_event() below, but skips the excluded timer and
542  * does not update cpu_base->next_timer/expires.
543  */
544 static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_base,
545 						const struct hrtimer *exclude,
546 						unsigned int active, ktime_t expires_next)
547 {
548 	struct hrtimer_clock_base *base;
549 	ktime_t expires;
550 
551 	lockdep_assert_held(&cpu_base->lock);
552 
553 	for_each_active_base(base, cpu_base, active) {
554 		expires = ktime_sub(base->expires_next, base->offset);
555 		if (expires >= expires_next)
556 			continue;
557 
558 		/*
559 		 * If the excluded timer is the first on this base evaluate the
560 		 * next timer.
561 		 */
562 		struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active);
563 
564 		if (unlikely(&exclude->node == node)) {
565 			node = timerqueue_linked_next(node);
566 			if (!node)
567 				continue;
568 			expires = ktime_sub(node->expires, base->offset);
569 			if (expires >= expires_next)
570 				continue;
571 		}
572 		expires_next = expires;
573 	}
574 	/* If base->offset changed, the result might be negative */
575 	return max(expires_next, 0);
576 }
577 #endif
578 
579 static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base)
580 {
581 	struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
582 
583 	return hrtimer_from_timerqueue_node(next);
584 }
585 
586 /* Find the base with the earliest expiry */
587 static void hrtimer_bases_first(struct hrtimer_cpu_base *cpu_base,unsigned int active,
588 				ktime_t *expires_next, struct hrtimer **next_timer)
589 {
590 	struct hrtimer_clock_base *base;
591 	ktime_t expires;
592 
593 	for_each_active_base(base, cpu_base, active) {
594 		expires = ktime_sub(base->expires_next, base->offset);
595 		if (expires < *expires_next) {
596 			*expires_next = expires;
597 			*next_timer = clock_base_next_timer(base);
598 		}
599 	}
600 }
601 
602 /*
603  * Recomputes cpu_base::*next_timer and returns the earliest expires_next
604  * but does not set cpu_base::*expires_next, that is done by
605  * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
606  * cpu_base::*expires_next right away, reprogramming logic would no longer
607  * work.
608  *
609  * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
610  * those timers will get run whenever the softirq gets handled, at the end of
611  * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
612  *
613  * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
614  * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
615  * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
616  *
617  * @active_mask must be one of:
618  *  - HRTIMER_ACTIVE_ALL,
619  *  - HRTIMER_ACTIVE_SOFT, or
620  *  - HRTIMER_ACTIVE_HARD.
621  */
622 static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
623 {
624 	struct hrtimer *next_timer = NULL;
625 	ktime_t expires_next = KTIME_MAX;
626 	unsigned int active;
627 
628 	lockdep_assert_held(&cpu_base->lock);
629 
630 	if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
631 		active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
632 		if (active)
633 			hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
634 		cpu_base->softirq_next_timer = next_timer;
635 	}
636 
637 	if (active_mask & HRTIMER_ACTIVE_HARD) {
638 		active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
639 		if (active)
640 			hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
641 		cpu_base->next_timer = next_timer;
642 	}
643 	return max(expires_next, 0);
644 }
645 
646 static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
647 {
648 	ktime_t expires_next, soft = KTIME_MAX;
649 
650 	/*
651 	 * If the soft interrupt has already been activated, ignore the
652 	 * soft bases. They will be handled in the already raised soft
653 	 * interrupt.
654 	 */
655 	if (!cpu_base->softirq_activated) {
656 		soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
657 		/*
658 		 * Update the soft expiry time. clock_settime() might have
659 		 * affected it.
660 		 */
661 		cpu_base->softirq_expires_next = soft;
662 	}
663 
664 	expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
665 	/*
666 	 * If a softirq timer is expiring first, update cpu_base->next_timer
667 	 * and program the hardware with the soft expiry time.
668 	 */
669 	if (expires_next > soft) {
670 		cpu_base->next_timer = cpu_base->softirq_next_timer;
671 		expires_next = soft;
672 	}
673 
674 	return expires_next;
675 }
676 
677 static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
678 {
679 	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
680 	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
681 	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
682 
683 	ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real,
684 						   offs_boot, offs_tai);
685 
686 	base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
687 	base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
688 	base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
689 
690 	return now;
691 }
692 
693 /*
694  * Is the high resolution mode active in the CPU base. This cannot use the
695  * static key as the CPUs are switched to high resolution mode
696  * asynchronously.
697  */
698 static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
699 {
700 	return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
701 		cpu_base->hres_active : 0;
702 }
703 
704 static inline void hrtimer_rearm_event(ktime_t expires_next, bool deferred)
705 {
706 	trace_hrtimer_rearm(expires_next, deferred);
707 	tick_program_event(expires_next, 1);
708 }
709 
710 static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer,
711 				ktime_t expires_next)
712 {
713 	cpu_base->expires_next = expires_next;
714 
715 	/*
716 	 * If hres is not active, hardware does not have to be
717 	 * reprogrammed yet.
718 	 *
719 	 * If a hang was detected in the last timer interrupt then we
720 	 * leave the hang delay active in the hardware. We want the
721 	 * system to make progress. That also prevents the following
722 	 * scenario:
723 	 * T1 expires 50ms from now
724 	 * T2 expires 5s from now
725 	 *
726 	 * T1 is removed, so this code is called and would reprogram
727 	 * the hardware to 5s from now. Any hrtimer_start after that
728 	 * will not reprogram the hardware due to hang_detected being
729 	 * set. So we'd effectively block all timers until the T2 event
730 	 * fires.
731 	 */
732 	if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
733 		return;
734 
735 	hrtimer_rearm_event(expires_next, false);
736 }
737 
738 /* Reprogram the event source with a evaluation of all clock bases */
739 static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, bool skip_equal)
740 {
741 	ktime_t expires_next = hrtimer_update_next_event(cpu_base);
742 
743 	if (skip_equal && expires_next == cpu_base->expires_next)
744 		return;
745 
746 	__hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next);
747 }
748 
749 /* High resolution timer related functions */
750 #ifdef CONFIG_HIGH_RES_TIMERS
751 
752 /* High resolution timer enabled ? */
753 static bool hrtimer_hres_enabled __read_mostly  = true;
754 unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
755 EXPORT_SYMBOL_GPL(hrtimer_resolution);
756 
757 /* Enable / Disable high resolution mode */
758 static int __init setup_hrtimer_hres(char *str)
759 {
760 	return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
761 }
762 __setup("highres=", setup_hrtimer_hres);
763 
764 /* hrtimer_high_res_enabled - query, if the highres mode is enabled */
765 static inline bool hrtimer_is_hres_enabled(void)
766 {
767 	return hrtimer_hres_enabled;
768 }
769 
770 /* Switch to high resolution mode */
771 static void hrtimer_switch_to_hres(void)
772 {
773 	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
774 
775 	if (tick_init_highres()) {
776 		pr_warn("Could not switch to high resolution mode on CPU %u\n",	base->cpu);
777 		return;
778 	}
779 	base->hres_active = true;
780 	hrtimer_resolution = HIGH_RES_NSEC;
781 
782 	tick_setup_sched_timer(true);
783 	/* "Retrigger" the interrupt to get things going */
784 	retrigger_next_event(NULL);
785 	hrtimer_schedule_hres_work();
786 }
787 
788 #else
789 
790 static inline bool hrtimer_is_hres_enabled(void) { return 0; }
791 static inline void hrtimer_switch_to_hres(void) { }
792 
793 #endif /* CONFIG_HIGH_RES_TIMERS */
794 
795 /*
796  * Retrigger next event is called after clock was set with interrupts
797  * disabled through an SMP function call or directly from low level
798  * resume code.
799  *
800  * This is only invoked when:
801  *	- CONFIG_HIGH_RES_TIMERS is enabled.
802  *	- CONFIG_NOHZ_COMMON is enabled
803  *
804  * For the other cases this function is empty and because the call sites
805  * are optimized out it vanishes as well, i.e. no need for lots of
806  * #ifdeffery.
807  */
808 static void retrigger_next_event(void *arg)
809 {
810 	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
811 
812 	/*
813 	 * When high resolution mode or nohz is active, then the offsets of
814 	 * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the
815 	 * next tick will take care of that.
816 	 *
817 	 * If high resolution mode is active then the next expiring timer
818 	 * must be reevaluated and the clock event device reprogrammed if
819 	 * necessary.
820 	 *
821 	 * In the NOHZ case the update of the offset and the reevaluation
822 	 * of the next expiring timer is enough. The return from the SMP
823 	 * function call will take care of the reprogramming in case the
824 	 * CPU was in a NOHZ idle sleep.
825 	 *
826 	 * In periodic low resolution mode, the next softirq expiration
827 	 * must also be updated.
828 	 */
829 	guard(raw_spinlock)(&base->lock);
830 	hrtimer_update_base(base);
831 	if (hrtimer_hres_active(base))
832 		hrtimer_force_reprogram(base, /* skip_equal */ false);
833 	else
834 		hrtimer_update_next_event(base);
835 }
836 
837 /*
838  * When a timer is enqueued and expires earlier than the already enqueued
839  * timers, we have to check, whether it expires earlier than the timer for
840  * which the clock event device was armed.
841  *
842  * Called with interrupts disabled and base->cpu_base.lock held
843  */
844 static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
845 {
846 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
847 	struct hrtimer_clock_base *base = timer->base;
848 	ktime_t expires = hrtimer_get_expires(timer);
849 
850 	WARN_ON_ONCE(expires < 0);
851 
852 	expires = ktime_sub(expires, base->offset);
853 	/*
854 	 * CLOCK_REALTIME timer might be requested with an absolute
855 	 * expiry time which is less than base->offset. Set it to 0.
856 	 */
857 	if (expires < 0)
858 		expires = 0;
859 
860 	if (timer->is_soft) {
861 		/*
862 		 * soft hrtimer could be started on a remote CPU. In this
863 		 * case softirq_expires_next needs to be updated on the
864 		 * remote CPU. The soft hrtimer will not expire before the
865 		 * first hard hrtimer on the remote CPU -
866 		 * hrtimer_check_target() prevents this case.
867 		 */
868 		struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
869 
870 		if (timer_cpu_base->softirq_activated)
871 			return;
872 
873 		if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
874 			return;
875 
876 		timer_cpu_base->softirq_next_timer = timer;
877 		timer_cpu_base->softirq_expires_next = expires;
878 
879 		if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram)
880 			return;
881 	}
882 
883 	/*
884 	 * If the timer is not on the current cpu, we cannot reprogram
885 	 * the other cpus clock event device.
886 	 */
887 	if (base->cpu_base != cpu_base)
888 		return;
889 
890 	if (expires >= cpu_base->expires_next)
891 		return;
892 
893 	/* If a deferred rearm is pending skip reprogramming the device */
894 	if (cpu_base->deferred_rearm)
895 		return;
896 
897 	cpu_base->next_timer = timer;
898 
899 	__hrtimer_reprogram(cpu_base, timer, expires);
900 }
901 
902 static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active)
903 {
904 	struct hrtimer_clock_base *base;
905 	unsigned int seq;
906 	ktime_t expires;
907 
908 	/*
909 	 * Update the base offsets unconditionally so the following
910 	 * checks whether the SMP function call is required works.
911 	 *
912 	 * The update is safe even when the remote CPU is in the hrtimer
913 	 * interrupt or the hrtimer soft interrupt and expiring affected
914 	 * bases. Either it will see the update before handling a base or
915 	 * it will see it when it finishes the processing and reevaluates
916 	 * the next expiring timer.
917 	 */
918 	seq = cpu_base->clock_was_set_seq;
919 	hrtimer_update_base(cpu_base);
920 
921 	/*
922 	 * If the sequence did not change over the update then the
923 	 * remote CPU already handled it.
924 	 */
925 	if (seq == cpu_base->clock_was_set_seq)
926 		return false;
927 
928 	/* If a deferred rearm is pending the remote CPU will take care of it */
929 	if (cpu_base->deferred_rearm) {
930 		cpu_base->deferred_needs_update = true;
931 		return false;
932 	}
933 
934 	/*
935 	 * Walk the affected clock bases and check whether the first expiring
936 	 * timer in a clock base is moving ahead of the first expiring timer of
937 	 * @cpu_base. If so, the IPI must be invoked because per CPU clock
938 	 * event devices cannot be remotely reprogrammed.
939 	 */
940 	active &= cpu_base->active_bases;
941 
942 	for_each_active_base(base, cpu_base, active) {
943 		struct timerqueue_linked_node *next;
944 
945 		next = timerqueue_linked_first(&base->active);
946 		expires = ktime_sub(next->expires, base->offset);
947 		if (expires < cpu_base->expires_next)
948 			return true;
949 
950 		/* Extra check for softirq clock bases */
951 		if (base->index < HRTIMER_BASE_MONOTONIC_SOFT)
952 			continue;
953 		if (cpu_base->softirq_activated)
954 			continue;
955 		if (expires < cpu_base->softirq_expires_next)
956 			return true;
957 	}
958 	return false;
959 }
960 
961 /*
962  * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and
963  * CLOCK_BOOTTIME (for late sleep time injection).
964  *
965  * This requires to update the offsets for these clocks
966  * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this
967  * also requires to eventually reprogram the per CPU clock event devices
968  * when the change moves an affected timer ahead of the first expiring
969  * timer on that CPU. Obviously remote per CPU clock event devices cannot
970  * be reprogrammed. The other reason why an IPI has to be sent is when the
971  * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets
972  * in the tick, which obviously might be stopped, so this has to bring out
973  * the remote CPU which might sleep in idle to get this sorted.
974  */
975 void clock_was_set(unsigned int bases)
976 {
977 	cpumask_var_t mask;
978 
979 	if (!hrtimer_highres_enabled() && !tick_nohz_is_active())
980 		goto out_timerfd;
981 
982 	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
983 		on_each_cpu(retrigger_next_event, NULL, 1);
984 		goto out_timerfd;
985 	}
986 
987 	/* Avoid interrupting CPUs if possible */
988 	scoped_guard(cpus_read_lock) {
989 		int cpu;
990 
991 		for_each_online_cpu(cpu) {
992 			struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
993 
994 			guard(raw_spinlock_irqsave)(&cpu_base->lock);
995 			if (update_needs_ipi(cpu_base, bases))
996 				cpumask_set_cpu(cpu, mask);
997 		}
998 		scoped_guard(preempt)
999 			smp_call_function_many(mask, retrigger_next_event, NULL, 1);
1000 	}
1001 	free_cpumask_var(mask);
1002 
1003 out_timerfd:
1004 	timerfd_clock_was_set();
1005 }
1006 
1007 static void clock_was_set_work(struct work_struct *work)
1008 {
1009 	clock_was_set(CLOCK_SET_WALL);
1010 }
1011 
1012 static DECLARE_WORK(hrtimer_work, clock_was_set_work);
1013 
1014 /*
1015  * Called from timekeeping code to reprogram the hrtimer interrupt device
1016  * on all cpus and to notify timerfd.
1017  */
1018 void clock_was_set_delayed(void)
1019 {
1020 	schedule_work(&hrtimer_work);
1021 }
1022 
1023 /*
1024  * Called during resume either directly from via timekeeping_resume()
1025  * or in the case of s2idle from tick_unfreeze() to ensure that the
1026  * hrtimers are up to date.
1027  */
1028 void hrtimers_resume_local(void)
1029 {
1030 	lockdep_assert_irqs_disabled();
1031 	/* Retrigger on the local CPU */
1032 	retrigger_next_event(NULL);
1033 }
1034 
1035 /* Counterpart to lock_hrtimer_base above */
1036 static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
1037 	__releases(&timer->base->cpu_base->lock)
1038 {
1039 	raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
1040 }
1041 
1042 /**
1043  * hrtimer_forward() - forward the timer expiry
1044  * @timer:	hrtimer to forward
1045  * @now:	forward past this time
1046  * @interval:	the interval to forward
1047  *
1048  * Forward the timer expiry so it will expire in the future.
1049  *
1050  * .. note::
1051  *  This only updates the timer expiry value and does not requeue the timer.
1052  *
1053  * There is also a variant of this function: hrtimer_forward_now().
1054  *
1055  * Context: Can be safely called from the callback function of @timer. If called
1056  *          from other contexts @timer must neither be enqueued nor running the
1057  *          callback and the caller needs to take care of serialization.
1058  *
1059  * Return: The number of overruns are returned.
1060  */
1061 u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
1062 {
1063 	ktime_t delta;
1064 	u64 orun = 1;
1065 
1066 	delta = ktime_sub(now, hrtimer_get_expires(timer));
1067 
1068 	if (delta < 0)
1069 		return 0;
1070 
1071 	if (WARN_ON(timer->is_queued))
1072 		return 0;
1073 
1074 	if (interval < hrtimer_resolution)
1075 		interval = hrtimer_resolution;
1076 
1077 	if (unlikely(delta >= interval)) {
1078 		s64 incr = ktime_to_ns(interval);
1079 
1080 		orun = ktime_divns(delta, incr);
1081 		hrtimer_add_expires_ns(timer, incr * orun);
1082 		if (hrtimer_get_expires(timer) > now)
1083 			return orun;
1084 		/*
1085 		 * This (and the ktime_add() below) is the
1086 		 * correction for exact:
1087 		 */
1088 		orun++;
1089 	}
1090 	hrtimer_add_expires(timer, interval);
1091 
1092 	return orun;
1093 }
1094 EXPORT_SYMBOL_GPL(hrtimer_forward);
1095 
1096 /*
1097  * enqueue_hrtimer - internal function to (re)start a timer
1098  *
1099  * The timer is inserted in expiry order. Insertion into the
1100  * red black tree is O(log(n)).
1101  *
1102  * Returns true when the new timer is the leftmost timer in the tree.
1103  */
1104 static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
1105 			    enum hrtimer_mode mode, bool was_armed)
1106 {
1107 	lockdep_assert_held(&base->cpu_base->lock);
1108 
1109 	debug_activate(timer, mode, was_armed);
1110 	WARN_ON_ONCE(!base->cpu_base->online);
1111 
1112 	base->cpu_base->active_bases |= 1 << base->index;
1113 
1114 	/* Pairs with the lockless read in hrtimer_is_queued() */
1115 	WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);
1116 
1117 	if (!timerqueue_linked_add(&base->active, &timer->node))
1118 		return false;
1119 
1120 	base->expires_next = hrtimer_get_expires(timer);
1121 	return true;
1122 }
1123 
1124 static inline void base_update_next_timer(struct hrtimer_clock_base *base)
1125 {
1126 	struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
1127 
1128 	base->expires_next = next ? next->expires : KTIME_MAX;
1129 }
1130 
1131 /*
1132  * __remove_hrtimer - internal function to remove a timer
1133  *
1134  * High resolution timer mode reprograms the clock event device when the
1135  * timer is the one which expires next. The caller can disable this by setting
1136  * reprogram to zero. This is useful, when the context does a reprogramming
1137  * anyway (e.g. timer interrupt)
1138  */
1139 static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
1140 			     bool newstate, bool reprogram)
1141 {
1142 	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
1143 	bool was_first;
1144 
1145 	lockdep_assert_held(&cpu_base->lock);
1146 
1147 	if (!timer->is_queued)
1148 		return;
1149 
1150 	/* Pairs with the lockless read in hrtimer_is_queued() */
1151 	WRITE_ONCE(timer->is_queued, newstate);
1152 
1153 	was_first = !timerqueue_linked_prev(&timer->node);
1154 
1155 	if (!timerqueue_linked_del(&base->active, &timer->node))
1156 		cpu_base->active_bases &= ~(1 << base->index);
1157 
1158 	/* Nothing to update if this was not the first timer in the base */
1159 	if (!was_first)
1160 		return;
1161 
1162 	base_update_next_timer(base);
1163 
1164 	/*
1165 	 * If reprogram is false don't update cpu_base->next_timer and do not
1166 	 * touch the clock event device.
1167 	 *
1168 	 * This happens when removing the first timer on a remote CPU, which
1169 	 * will be handled by the remote CPU's interrupt. It also happens when
1170 	 * a local timer is removed to be immediately restarted. That's handled
1171 	 * at the call site.
1172 	 */
1173 	if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy)
1174 		return;
1175 
1176 	if (cpu_base->deferred_rearm)
1177 		cpu_base->deferred_needs_update = true;
1178 	else
1179 		hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
1180 }
1181 
1182 static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
1183 				  bool newstate)
1184 {
1185 	lockdep_assert_held(&base->cpu_base->lock);
1186 
1187 	if (timer->is_queued) {
1188 		bool reprogram;
1189 
1190 		debug_hrtimer_deactivate(timer);
1191 
1192 		/*
1193 		 * Remove the timer and force reprogramming when high
1194 		 * resolution mode is active and the timer is on the current
1195 		 * CPU. If we remove a timer on another CPU, reprogramming is
1196 		 * skipped. The interrupt event on this CPU is fired and
1197 		 * reprogramming happens in the interrupt handler. This is a
1198 		 * rare case and less expensive than a smp call.
1199 		 */
1200 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
1201 
1202 		__remove_hrtimer(timer, base, newstate, reprogram);
1203 		return true;
1204 	}
1205 	return false;
1206 }
1207 
1208 /*
1209  * Update in place has to retrieve the expiry times of the neighbour nodes
1210  * if they exist. That is cache line neutral because the dequeue/enqueue
1211  * operation is going to need the same cache lines. But there is a big win
1212  * when the dequeue/enqueue can be avoided because the RB tree does not
1213  * have to be rebalanced twice.
1214  */
1215 static inline bool
1216 hrtimer_can_update_in_place(struct hrtimer *timer, struct hrtimer_clock_base *base, ktime_t expires)
1217 {
1218 	struct timerqueue_linked_node *next = timerqueue_linked_next(&timer->node);
1219 	struct timerqueue_linked_node *prev = timerqueue_linked_prev(&timer->node);
1220 
1221 	/* If the new expiry goes behind the next timer, requeue is required */
1222 	if (next && expires > next->expires)
1223 		return false;
1224 
1225 	/* If this is the first timer, update in place */
1226 	if (!prev)
1227 		return true;
1228 
1229 	/* Update in place when it does not go ahead of the previous one */
1230 	return expires >= prev->expires;
1231 }
1232 
1233 static inline bool
1234 remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
1235 			     const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns)
1236 {
1237 	bool was_first = false;
1238 
1239 	/* Remove it from the timer queue if active */
1240 	if (timer->is_queued) {
1241 		was_first = !timerqueue_linked_prev(&timer->node);
1242 
1243 		/* Try to update in place to avoid the de/enqueue dance */
1244 		if (hrtimer_can_update_in_place(timer, base, expires)) {
1245 			hrtimer_set_expires_range_ns(timer, expires, delta_ns);
1246 			trace_hrtimer_start(timer, mode, true);
1247 			if (was_first)
1248 				base->expires_next = expires;
1249 			return was_first;
1250 		}
1251 
1252 		debug_hrtimer_deactivate(timer);
1253 		timerqueue_linked_del(&base->active, &timer->node);
1254 	}
1255 
1256 	/* Set the new expiry time */
1257 	hrtimer_set_expires_range_ns(timer, expires, delta_ns);
1258 
1259 	debug_activate(timer, mode, timer->is_queued);
1260 	base->cpu_base->active_bases |= 1 << base->index;
1261 
1262 	/* Pairs with the lockless read in hrtimer_is_queued() */
1263 	WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);
1264 
1265 	/* If it's the first expiring timer now or again, update base */
1266 	if (timerqueue_linked_add(&base->active, &timer->node)) {
1267 		base->expires_next = expires;
1268 		return true;
1269 	}
1270 
1271 	if (was_first)
1272 		base_update_next_timer(base);
1273 
1274 	return false;
1275 }
1276 
1277 static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
1278 					    const enum hrtimer_mode mode)
1279 {
1280 #ifdef CONFIG_TIME_LOW_RES
1281 	/*
1282 	 * CONFIG_TIME_LOW_RES indicates that the system has no way to return
1283 	 * granular time values. For relative timers we add hrtimer_resolution
1284 	 * (i.e. one jiffy) to prevent short timeouts.
1285 	 */
1286 	timer->is_rel = mode & HRTIMER_MODE_REL;
1287 	if (timer->is_rel)
1288 		tim = ktime_add_safe(tim, hrtimer_resolution);
1289 #endif
1290 	return tim;
1291 }
1292 
1293 static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
1294 {
1295 	ktime_t expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
1296 
1297 	/*
1298 	 * Reprogramming needs to be triggered, even if the next soft
1299 	 * hrtimer expires at the same time as the next hard
1300 	 * hrtimer. cpu_base->softirq_expires_next needs to be updated!
1301 	 */
1302 	if (expires == KTIME_MAX)
1303 		return;
1304 
1305 	/*
1306 	 * cpu_base->next_timer is recomputed by __hrtimer_get_next_event()
1307 	 * cpu_base->expires_next is only set by hrtimer_reprogram()
1308 	 */
1309 	hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
1310 }
1311 
1312 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
1313 static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
1314 {
1315 	if (static_branch_likely(&timers_migration_enabled)) {
1316 		/*
1317 		 * If it is local and the first expiring timer keep it on the local
1318 		 * CPU to optimize reprogramming of the clockevent device. Also
1319 		 * avoid switch_hrtimer_base() overhead when local and pinned.
1320 		 */
1321 		if (!is_local)
1322 			return false;
1323 		if (is_first || is_pinned)
1324 			return true;
1325 
1326 		/* Honour the NOHZ full restrictions */
1327 		if (!housekeeping_cpu(smp_processor_id(), HK_TYPE_KERNEL_NOISE))
1328 			return false;
1329 
1330 		/*
1331 		 * If the tick is not stopped or need_resched() is set, then
1332 		 * there is no point in moving the timer somewhere else.
1333 		 */
1334 		return !tick_nohz_tick_stopped() || need_resched();
1335 	}
1336 	return is_local;
1337 }
1338 #else
1339 static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
1340 {
1341 	return is_local;
1342 }
1343 #endif
1344 
1345 static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool is_first,
1346 				     bool is_pinned)
1347 {
1348 	/* If the timer is running the callback it has to stay on its CPU base. */
1349 	if (unlikely(timer->base->running == timer))
1350 		return true;
1351 
1352 	return hrtimer_prefer_local(is_local, is_first, is_pinned);
1353 }
1354 
/*
 * Results of __hrtimer_start_range_ns() which tell the hrtimer_start*()
 * functions what to do about the clock event device after queueing.
 */
enum {
	/* Nothing to do */
	HRTIMER_REPROGRAM_NONE,
	/* Invoke hrtimer_reprogram() for the started timer */
	HRTIMER_REPROGRAM,
	/* Invoke hrtimer_force_reprogram() for the CPU base of the timer */
	HRTIMER_REPROGRAM_FORCE,
};
1360 
1361 static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
1362 				    const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
1363 {
1364 	struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
1365 	bool is_pinned, first, was_first, keep_base = false;
1366 	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
1367 
1368 	was_first = cpu_base->next_timer == timer;
1369 	is_pinned = !!(mode & HRTIMER_MODE_PINNED);
1370 
1371 	/*
1372 	 * Don't keep it local if this enqueue happens on a unplugged CPU
1373 	 * after hrtimer_cpu_dying() has been invoked.
1374 	 */
1375 	if (likely(this_cpu_base->online)) {
1376 		bool is_local = cpu_base == this_cpu_base;
1377 
1378 		keep_base = hrtimer_keep_base(timer, is_local, was_first, is_pinned);
1379 	}
1380 
1381 	/* Calculate absolute expiry time for relative timers */
1382 	if (mode & HRTIMER_MODE_REL)
1383 		tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
1384 	/* Compensate for low resolution granularity */
1385 	tim = hrtimer_update_lowres(timer, tim, mode);
1386 
1387 	/*
1388 	 * Remove an active timer from the queue. In case it is not queued
1389 	 * on the current CPU, make sure that remove_hrtimer() updates the
1390 	 * remote data correctly.
1391 	 *
1392 	 * If it's on the current CPU and the first expiring timer, then
1393 	 * skip reprogramming, keep the timer local and enforce
1394 	 * reprogramming later if it was the first expiring timer.  This
1395 	 * avoids programming the underlying clock event twice (once at
1396 	 * removal and once after enqueue).
1397 	 *
1398 	 * @keep_base is also true if the timer callback is running on a
1399 	 * remote CPU and for local pinned timers.
1400 	 */
1401 	if (likely(keep_base)) {
1402 		first = remove_and_enqueue_same_base(timer, base, mode, tim, delta_ns);
1403 	} else {
1404 		/* Keep the ENQUEUED state in case it is queued */
1405 		bool was_armed = remove_hrtimer(timer, base, HRTIMER_STATE_ENQUEUED);
1406 
1407 		hrtimer_set_expires_range_ns(timer, tim, delta_ns);
1408 
1409 		/* Switch the timer base, if necessary: */
1410 		base = switch_hrtimer_base(timer, base, is_pinned);
1411 		cpu_base = base->cpu_base;
1412 
1413 		first = enqueue_hrtimer(timer, base, mode, was_armed);
1414 	}
1415 
1416 	/* If a deferred rearm is pending skip reprogramming the device */
1417 	if (cpu_base->deferred_rearm) {
1418 		cpu_base->deferred_needs_update = true;
1419 		return HRTIMER_REPROGRAM_NONE;
1420 	}
1421 
1422 	if (!was_first || cpu_base != this_cpu_base) {
1423 		/*
1424 		 * If the current CPU base is online, then the timer is never
1425 		 * queued on a remote CPU if it would be the first expiring
1426 		 * timer there unless the timer callback is currently executed
1427 		 * on the remote CPU. In the latter case the remote CPU will
1428 		 * re-evaluate the first expiring timer after completing the
1429 		 * callbacks.
1430 		 */
1431 		if (likely(hrtimer_base_is_online(this_cpu_base)))
1432 			return first ? HRTIMER_REPROGRAM : HRTIMER_REPROGRAM_NONE;
1433 
1434 		/*
1435 		 * Timer was enqueued remote because the current base is
1436 		 * already offline. If the timer is the first to expire,
1437 		 * kick the remote CPU to reprogram the clock event.
1438 		 */
1439 		if (first)
1440 			smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd);
1441 		return HRTIMER_REPROGRAM_NONE;
1442 	}
1443 
1444 	/*
1445 	 * Special case for the HRTICK timer. It is frequently rearmed and most
1446 	 * of the time moves the expiry into the future. That's expensive in
1447 	 * virtual machines and it's better to take the pointless already armed
1448 	 * interrupt than reprogramming the hardware on every context switch.
1449 	 *
1450 	 * If the new expiry is before the armed time, then reprogramming is
1451 	 * required.
1452 	 */
1453 	if (timer->is_lazy) {
1454 		if (cpu_base->expires_next <= hrtimer_get_expires(timer))
1455 			return HRTIMER_REPROGRAM_NONE;
1456 	}
1457 
1458 	/*
1459 	 * Timer was the first expiring timer and forced to stay on the
1460 	 * current CPU to avoid reprogramming on removal and enqueue. Force
1461 	 * reprogram the hardware by evaluating the new first expiring
1462 	 * timer.
1463 	 */
1464 	return HRTIMER_REPROGRAM_FORCE;
1465 }
1466 
1467 static int hrtimer_start_range_ns_common(struct hrtimer *timer, ktime_t tim,
1468 					 u64 delta_ns, const enum hrtimer_mode mode,
1469 					 struct hrtimer_clock_base *base)
1470 {
1471 	/*
1472 	 * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
1473 	 * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
1474 	 * expiry mode because unmarked timers are moved to softirq expiry.
1475 	 */
1476 	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
1477 		WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
1478 	else
1479 		WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
1480 
1481 	return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, base);
1482 }
1483 
1484 /**
1485  * hrtimer_start_range_ns - (re)start an hrtimer
1486  * @timer:	the timer to be added
1487  * @tim:	expiry time
1488  * @delta_ns:	"slack" range for the timer
1489  * @mode:	timer mode: absolute (HRTIMER_MODE_ABS) or
1490  *		relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
1491  *		softirq based mode is considered for debug purpose only!
1492  */
1493 void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
1494 			    const enum hrtimer_mode mode)
1495 {
1496 	struct hrtimer_clock_base *base;
1497 	unsigned long flags;
1498 
1499 	debug_hrtimer_assert_init(timer);
1500 
1501 	base = lock_hrtimer_base(timer, &flags);
1502 
1503 	switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) {
1504 	case HRTIMER_REPROGRAM:
1505 		hrtimer_reprogram(timer, true);
1506 		break;
1507 	case HRTIMER_REPROGRAM_FORCE:
1508 		hrtimer_force_reprogram(timer->base->cpu_base, 1);
1509 		break;
1510 	case HRTIMER_REPROGRAM_NONE:
1511 		break;
1512 	}
1513 
1514 	unlock_hrtimer_base(timer, &flags);
1515 }
1516 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
1517 
1518 static inline bool hrtimer_check_user_timer(struct hrtimer *timer)
1519 {
1520 	struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base;
1521 	ktime_t expires;
1522 
1523 	/*
1524 	 * This uses soft expires because that's the user provided
1525 	 * expiry time, while expires can be further in the past
1526 	 * due to a slack value added to the user expiry time.
1527 	 */
1528 	expires = hrtimer_get_softexpires(timer);
1529 
1530 	/* Convert to monotonic */
1531 	expires = ktime_sub(expires, timer->base->offset);
1532 
1533 	/*
1534 	 * Check whether this timer will end up as the first expiring timer in
1535 	 * the CPU base. If not, no further checks required as it's then
1536 	 * guaranteed to expire in the future.
1537 	 */
1538 	if (expires >= cpu_base->expires_next)
1539 		return true;
1540 
1541 	/* Validate that the expiry time is in the future. */
1542 	if (expires > ktime_get())
1543 		return true;
1544 
1545 	debug_hrtimer_deactivate(timer);
1546 	__remove_hrtimer(timer, timer->base, HRTIMER_STATE_INACTIVE, false);
1547 	trace_hrtimer_start_expired(timer);
1548 	return false;
1549 }
1550 
1551 /**
1552  * hrtimer_start_range_ns_user - (re)start an user controlled hrtimer
1553  * @timer:	the timer to be added
1554  * @tim:	expiry time
1555  * @delta_ns:	"slack" range for the timer
1556  * @mode:	timer mode: absolute (HRTIMER_MODE_ABS) or
1557  *		relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
1558  *		softirq based mode is considered for debug purpose only!
1559  *
1560  * Returns: True when the timer was queued, false if it was already expired
1561  *
1562  * This function cannot invoke the timer callback for expired timers as it might
1563  * be called under a lock which the timer callback needs to acquire. So the
1564  * caller has to handle that case.
1565  */
1566 bool hrtimer_start_range_ns_user(struct hrtimer *timer, ktime_t tim,
1567 				 u64 delta_ns, const enum hrtimer_mode mode)
1568 {
1569 	struct hrtimer_clock_base *base;
1570 	unsigned long flags;
1571 	bool ret = true;
1572 
1573 	debug_hrtimer_assert_init(timer);
1574 
1575 	base = lock_hrtimer_base(timer, &flags);
1576 
1577 	switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) {
1578 	case HRTIMER_REPROGRAM:
1579 		ret = hrtimer_check_user_timer(timer);
1580 		if (ret)
1581 			hrtimer_reprogram(timer, true);
1582 		break;
1583 	case HRTIMER_REPROGRAM_FORCE:
1584 		ret = hrtimer_check_user_timer(timer);
1585 		/*
1586 		 * The base must always be reevaluated, independent of the
1587 		 * result above because the timer was the first pending timer.
1588 		 */
1589 		hrtimer_force_reprogram(timer->base->cpu_base, 1);
1590 		break;
1591 	case HRTIMER_REPROGRAM_NONE:
1592 		break;
1593 	}
1594 
1595 	unlock_hrtimer_base(timer, &flags);
1596 	return ret;
1597 }
1598 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns_user);
1599 
1600 /**
1601  * hrtimer_try_to_cancel - try to deactivate a timer
1602  * @timer:	hrtimer to stop
1603  *
1604  * Returns:
1605  *
1606  *  *  0 when the timer was not active
1607  *  *  1 when the timer was active
1608  *  * -1 when the timer is currently executing the callback function and
1609  *    cannot be stopped
1610  */
1611 int hrtimer_try_to_cancel(struct hrtimer *timer)
1612 {
1613 	struct hrtimer_clock_base *base;
1614 	unsigned long flags;
1615 	int ret = -1;
1616 
1617 	/*
1618 	 * Check lockless first. If the timer is not active (neither
1619 	 * enqueued nor running the callback, nothing to do here.  The
1620 	 * base lock does not serialize against a concurrent enqueue,
1621 	 * so we can avoid taking it.
1622 	 */
1623 	if (!hrtimer_active(timer))
1624 		return 0;
1625 
1626 	base = lock_hrtimer_base(timer, &flags);
1627 
1628 	if (!hrtimer_callback_running(timer)) {
1629 		ret = remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE);
1630 		if (ret)
1631 			trace_hrtimer_cancel(timer);
1632 	}
1633 
1634 	unlock_hrtimer_base(timer, &flags);
1635 
1636 	return ret;
1637 
1638 }
1639 EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
1640 
1641 #ifdef CONFIG_PREEMPT_RT
1642 static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
1643 {
1644 	spin_lock_init(&base->softirq_expiry_lock);
1645 }
1646 
1647 static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
1648 	__acquires(&base->softirq_expiry_lock)
1649 {
1650 	spin_lock(&base->softirq_expiry_lock);
1651 }
1652 
1653 static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
1654 	__releases(&base->softirq_expiry_lock)
1655 {
1656 	spin_unlock(&base->softirq_expiry_lock);
1657 }
1658 
1659 /*
1660  * The counterpart to hrtimer_cancel_wait_running().
1661  *
1662  * If there is a waiter for cpu_base->expiry_lock, then it was waiting for
1663  * the timer callback to finish. Drop expiry_lock and reacquire it. That
1664  * allows the waiter to acquire the lock and make progress.
1665  */
1666 static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags)
1667 {
1668 	if (atomic_read(&cpu_base->timer_waiters)) {
1669 		raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1670 		spin_unlock(&cpu_base->softirq_expiry_lock);
1671 		spin_lock(&cpu_base->softirq_expiry_lock);
1672 		raw_spin_lock_irq(&cpu_base->lock);
1673 	}
1674 }
1675 
1676 #ifdef CONFIG_SMP
1677 static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
1678 {
1679 	return base == &migration_base;
1680 }
1681 #else
1682 static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
1683 {
1684 	return false;
1685 }
1686 #endif
1687 
1688 /*
1689  * This function is called on PREEMPT_RT kernels when the fast path
1690  * deletion of a timer failed because the timer callback function was
1691  * running.
1692  *
1693  * This prevents priority inversion: if the soft irq thread is preempted
1694  * in the middle of a timer callback, then calling hrtimer_cancel() can
1695  * lead to two issues:
1696  *
1697  *  - If the caller is on a remote CPU then it has to spin wait for the timer
1698  *    handler to complete. This can result in unbound priority inversion.
1699  *
1700  *  - If the caller originates from the task which preempted the timer
1701  *    handler on the same CPU, then spin waiting for the timer handler to
1702  *    complete is never going to end.
1703  */
1704 void hrtimer_cancel_wait_running(const struct hrtimer *timer)
1705 {
1706 	/* Lockless read. Prevent the compiler from reloading it below */
1707 	struct hrtimer_clock_base *base = READ_ONCE(timer->base);
1708 
1709 	/*
1710 	 * Just relax if the timer expires in hard interrupt context or if
1711 	 * it is currently on the migration base.
1712 	 */
1713 	if (!timer->is_soft || is_migration_base(base)) {
1714 		cpu_relax();
1715 		return;
1716 	}
1717 
1718 	/*
1719 	 * Mark the base as contended and grab the expiry lock, which is
1720 	 * held by the softirq across the timer callback. Drop the lock
1721 	 * immediately so the softirq can expire the next timer. In theory
1722 	 * the timer could already be running again, but that's more than
1723 	 * unlikely and just causes another wait loop.
1724 	 */
1725 	atomic_inc(&base->cpu_base->timer_waiters);
1726 	spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
1727 	atomic_dec(&base->cpu_base->timer_waiters);
1728 	spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
1729 }
1730 #else
/* Without CONFIG_PREEMPT_RT the expiry lock helpers are empty stubs */
static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long fl) { }
1735 #endif
1736 
/**
 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
 * @timer:	the timer to be cancelled
 *
 * Returns:
 *  0 when the timer was not active
 *  1 when the timer was active
 */
int hrtimer_cancel(struct hrtimer *timer)
{
	for (;;) {
		int ret = hrtimer_try_to_cancel(timer);

		if (ret >= 0)
			return ret;

		/* The callback is running: wait for it and try again */
		hrtimer_cancel_wait_running(timer);
	}
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);
1758 
1759 /**
1760  * __hrtimer_get_remaining - get remaining time for the timer
1761  * @timer:	the timer to read
1762  * @adjust:	adjust relative timers when CONFIG_TIME_LOW_RES=y
1763  */
1764 ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
1765 {
1766 	unsigned long flags;
1767 	ktime_t rem;
1768 
1769 	lock_hrtimer_base(timer, &flags);
1770 	if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust)
1771 		rem = hrtimer_expires_remaining_adjusted(timer);
1772 	else
1773 		rem = hrtimer_expires_remaining(timer);
1774 	unlock_hrtimer_base(timer, &flags);
1775 
1776 	return rem;
1777 }
1778 EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
1779 
1780 #ifdef CONFIG_NO_HZ_COMMON
1781 /**
1782  * hrtimer_get_next_event - get the time until next expiry event
1783  *
1784  * Returns the next expiry time or KTIME_MAX if no timer is pending.
1785  */
1786 ktime_t hrtimer_get_next_event(void)
1787 {
1788 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1789 	ktime_t expires = KTIME_MAX;
1790 
1791 	guard(raw_spinlock_irqsave)(&cpu_base->lock);
1792 	if (!hrtimer_hres_active(cpu_base))
1793 		expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
1794 
1795 	return expires;
1796 }
1797 
1798 /**
1799  * hrtimer_next_event_without - time until next expiry event w/o one timer
1800  * @exclude:	timer to exclude
1801  *
1802  * Returns the next expiry time over all timers except for the @exclude one or
1803  * KTIME_MAX if none of them is pending.
1804  */
1805 ktime_t hrtimer_next_event_without(const struct hrtimer *exclude)
1806 {
1807 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1808 	ktime_t expires = KTIME_MAX;
1809 	unsigned int active;
1810 
1811 	guard(raw_spinlock_irqsave)(&cpu_base->lock);
1812 	if (!hrtimer_hres_active(cpu_base))
1813 		return expires;
1814 
1815 	active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
1816 	if (active && !cpu_base->softirq_activated)
1817 		expires = hrtimer_bases_next_event_without(cpu_base, exclude, active, KTIME_MAX);
1818 
1819 	active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
1820 	if (!active)
1821 		return expires;
1822 	return hrtimer_bases_next_event_without(cpu_base, exclude, active, expires);
1823 }
1824 #endif
1825 
1826 static inline int hrtimer_clockid_to_base(clockid_t clock_id)
1827 {
1828 	switch (clock_id) {
1829 	case CLOCK_MONOTONIC:
1830 		return HRTIMER_BASE_MONOTONIC;
1831 	case CLOCK_REALTIME:
1832 		return HRTIMER_BASE_REALTIME;
1833 	case CLOCK_BOOTTIME:
1834 		return HRTIMER_BASE_BOOTTIME;
1835 	case CLOCK_TAI:
1836 		return HRTIMER_BASE_TAI;
1837 	default:
1838 		WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
1839 		return HRTIMER_BASE_MONOTONIC;
1840 	}
1841 }
1842 
1843 static ktime_t __hrtimer_cb_get_time(clockid_t clock_id)
1844 {
1845 	switch (clock_id) {
1846 	case CLOCK_MONOTONIC:
1847 		return ktime_get();
1848 	case CLOCK_REALTIME:
1849 		return ktime_get_real();
1850 	case CLOCK_BOOTTIME:
1851 		return ktime_get_boottime();
1852 	case CLOCK_TAI:
1853 		return ktime_get_clocktai();
1854 	default:
1855 		WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
1856 		return ktime_get();
1857 	}
1858 }
1859 
1860 ktime_t hrtimer_cb_get_time(const struct hrtimer *timer)
1861 {
1862 	return __hrtimer_cb_get_time(timer->base->clockid);
1863 }
1864 EXPORT_SYMBOL_GPL(hrtimer_cb_get_time);
1865 
1866 static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(struct hrtimer *),
1867 			    clockid_t clock_id, enum hrtimer_mode mode)
1868 {
1869 	bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
1870 	struct hrtimer_cpu_base *cpu_base;
1871 	int base;
1872 
1873 	/*
1874 	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
1875 	 * marked for hard interrupt expiry mode are moved into soft
1876 	 * interrupt context for latency reasons and because the callbacks
1877 	 * can invoke functions which might sleep on RT, e.g. spin_lock().
1878 	 */
1879 	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD))
1880 		softtimer = true;
1881 
1882 	memset(timer, 0, sizeof(struct hrtimer));
1883 
1884 	cpu_base = raw_cpu_ptr(&hrtimer_bases);
1885 
1886 	/*
1887 	 * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
1888 	 * clock modifications, so they needs to become CLOCK_MONOTONIC to
1889 	 * ensure POSIX compliance.
1890 	 */
1891 	if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
1892 		clock_id = CLOCK_MONOTONIC;
1893 
1894 	base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
1895 	base += hrtimer_clockid_to_base(clock_id);
1896 	timer->is_soft = softtimer;
1897 	timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
1898 	timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM);
1899 	timer->base = &cpu_base->clock_base[base];
1900 	timerqueue_linked_init(&timer->node);
1901 
1902 	if (WARN_ON_ONCE(!fn))
1903 		ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout;
1904 	else
1905 		ACCESS_PRIVATE(timer, function) = fn;
1906 }
1907 
1908 /**
1909  * hrtimer_setup - initialize a timer to the given clock
1910  * @timer:	the timer to be initialized
1911  * @function:	the callback function
1912  * @clock_id:	the clock to be used
1913  * @mode:       The modes which are relevant for initialization:
1914  *              HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
1915  *              HRTIMER_MODE_REL_SOFT
1916  *
1917  *              The PINNED variants of the above can be handed in,
1918  *              but the PINNED bit is ignored as pinning happens
1919  *              when the hrtimer is started
1920  */
1921 void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *),
1922 		   clockid_t clock_id, enum hrtimer_mode mode)
1923 {
1924 	debug_setup(timer, clock_id, mode);
1925 	__hrtimer_setup(timer, function, clock_id, mode);
1926 }
1927 EXPORT_SYMBOL_GPL(hrtimer_setup);
1928 
1929 /**
1930  * hrtimer_setup_on_stack - initialize a timer on stack memory
1931  * @timer:	The timer to be initialized
1932  * @function:	the callback function
1933  * @clock_id:	The clock to be used
1934  * @mode:       The timer mode
1935  *
1936  * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack
1937  * memory.
1938  */
1939 void hrtimer_setup_on_stack(struct hrtimer *timer,
1940 			    enum hrtimer_restart (*function)(struct hrtimer *),
1941 			    clockid_t clock_id, enum hrtimer_mode mode)
1942 {
1943 	debug_setup_on_stack(timer, clock_id, mode);
1944 	__hrtimer_setup(timer, function, clock_id, mode);
1945 }
1946 EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack);
1947 
1948 /*
1949  * A timer is active, when it is enqueued into the rbtree or the
1950  * callback function is running or it's in the state of being migrated
1951  * to another cpu.
1952  *
1953  * It is important for this function to not return a false negative.
1954  */
1955 bool hrtimer_active(const struct hrtimer *timer)
1956 {
1957 	struct hrtimer_clock_base *base;
1958 	unsigned int seq;
1959 
1960 	do {
1961 		base = READ_ONCE(timer->base);
1962 		seq = raw_read_seqcount_begin(&base->seq);
1963 
1964 		if (timer->is_queued || base->running == timer)
1965 			return true;
1966 
1967 	} while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base));
1968 
1969 	return false;
1970 }
1971 EXPORT_SYMBOL_GPL(hrtimer_active);
1972 
/*
 * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
 * distinct sections:
 *
 *  - queued:	the timer is queued
 *  - callback:	the timer is being run
 *  - post:	the timer is inactive or (re)queued
 *
 * On the read side we ensure we observe timer->is_queued and base->running
 * from the same section, if anything changed while we looked at it, we retry.
 * This includes timer->base changing because sequence numbers alone are
 * insufficient for that.
 *
 * The sequence numbers are required because otherwise we could still observe
 * a false negative if the read side got smeared over multiple consecutive
 * __run_hrtimer() invocations.
 *
 * Expire a single timer: mark it as running in @base, dequeue it, invoke
 * its callback and requeue it when the callback asks for a restart.
 *
 * Must be called with cpu_base->lock held. The lock is dropped around the
 * callback invocation (restoring @flags) and is reacquired with
 * raw_spin_lock_irq() before returning.
 */
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base,
			  struct hrtimer *timer, ktime_t now, unsigned long flags)
	__must_hold(&cpu_base->lock)
{
	enum hrtimer_restart (*fn)(struct hrtimer *);
	bool expires_in_hardirq;
	int restart;

	lockdep_assert_held(&cpu_base->lock);

	debug_hrtimer_deactivate(timer);
	base->running = timer;

	/*
	 * Separate the ->running assignment from the ->is_queued assignment.
	 *
	 * As with a regular write barrier, this ensures the read side in
	 * hrtimer_active() cannot observe base->running == NULL &&
	 * timer->is_queued == INACTIVE.
	 */
	raw_write_seqcount_barrier(&base->seq);

	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, false);
	/* Fetch the callback while the lock is still held */
	fn = ACCESS_PRIVATE(timer, function);

	/*
	 * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
	 * timer is restarted with a period then it becomes an absolute
	 * timer. If it's not restarted it does not matter.
	 */
	if (IS_ENABLED(CONFIG_TIME_LOW_RES))
		timer->is_rel = false;

	/*
	 * The timer is marked as running in the CPU base, so it is
	 * protected against migration to a different CPU even if the lock
	 * is dropped.
	 */
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
	trace_hrtimer_expire_entry(timer, now);
	expires_in_hardirq = lockdep_hrtimer_enter(timer);

	restart = fn(timer);

	lockdep_hrtimer_exit(expires_in_hardirq);
	trace_hrtimer_expire_exit(timer);
	raw_spin_lock_irq(&cpu_base->lock);

	/*
	 * Note: We clear the running state after enqueue_hrtimer and
	 * we do not reprogram the event hardware. Happens either in
	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
	 *
	 * Note: Because we dropped the cpu_base->lock above,
	 * hrtimer_start_range_ns() can have popped in and enqueued the timer
	 * for us already.
	 */
	if (restart == HRTIMER_RESTART && !timer->is_queued)
		enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false);

	/*
	 * Separate the ->running assignment from the ->is_queued assignment.
	 *
	 * As with a regular write barrier, this ensures the read side in
	 * hrtimer_active() cannot observe base->running == NULL &&
	 * timer->is_queued == INACTIVE.
	 */
	raw_write_seqcount_barrier(&base->seq);

	/* Nothing else may have changed the running timer of this base */
	WARN_ON_ONCE(base->running != timer);
	base->running = NULL;
}
2062 
2063 static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base)
2064 {
2065 	struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
2066 
2067 	return next ? hrtimer_from_timerqueue_node(next) : NULL;
2068 }
2069 
2070 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
2071 				 unsigned long flags, unsigned int active_mask)
2072 {
2073 	unsigned int active = cpu_base->active_bases & active_mask;
2074 	struct hrtimer_clock_base *base;
2075 
2076 	for_each_active_base(base, cpu_base, active) {
2077 		ktime_t basenow = ktime_add(now, base->offset);
2078 		struct hrtimer *timer;
2079 
2080 		while ((timer = clock_base_next_timer(base))) {
2081 			/*
2082 			 * The immediate goal for using the softexpires is
2083 			 * minimizing wakeups, not running timers at the
2084 			 * earliest interrupt after their soft expiration.
2085 			 * This allows us to avoid using a Priority Search
2086 			 * Tree, which can answer a stabbing query for
2087 			 * overlapping intervals and instead use the simple
2088 			 * BST we already have.
2089 			 * We don't add extra wakeups by delaying timers that
2090 			 * are right-of a not yet expired timer, because that
2091 			 * timer will have to trigger a wakeup anyway.
2092 			 */
2093 			if (basenow < hrtimer_get_softexpires(timer))
2094 				break;
2095 
2096 			__run_hrtimer(cpu_base, base, timer, basenow, flags);
2097 			if (active_mask == HRTIMER_ACTIVE_SOFT)
2098 				hrtimer_sync_wait_running(cpu_base, flags);
2099 		}
2100 	}
2101 }
2102 
2103 static __latent_entropy void hrtimer_run_softirq(void)
2104 {
2105 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
2106 	unsigned long flags;
2107 	ktime_t now;
2108 
2109 	hrtimer_cpu_base_lock_expiry(cpu_base);
2110 	raw_spin_lock_irqsave(&cpu_base->lock, flags);
2111 
2112 	now = hrtimer_update_base(cpu_base);
2113 	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
2114 
2115 	cpu_base->softirq_activated = false;
2116 	hrtimer_update_softirq_timer(cpu_base, true);
2117 
2118 	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
2119 	hrtimer_cpu_base_unlock_expiry(cpu_base);
2120 }
2121 
2122 #ifdef CONFIG_HIGH_RES_TIMERS
2123 
2124 /*
2125  * Very similar to hrtimer_force_reprogram(), except it deals with
2126  * deferred_rearm and hang_detected.
2127  */
2128 static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred)
2129 {
2130 	cpu_base->expires_next = expires_next;
2131 	cpu_base->deferred_rearm = false;
2132 
2133 	if (unlikely(cpu_base->hang_detected)) {
2134 		/*
2135 		 * Give the system a chance to do something else than looping
2136 		 * on hrtimer interrupts.
2137 		 */
2138 		expires_next = ktime_add_ns(ktime_get(),
2139 					    min(100 * NSEC_PER_MSEC, cpu_base->max_hang_time));
2140 	}
2141 	hrtimer_rearm_event(expires_next, deferred);
2142 }
2143 
2144 #ifdef CONFIG_HRTIMER_REARM_DEFERRED
2145 void __hrtimer_rearm_deferred(void)
2146 {
2147 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
2148 	ktime_t expires_next;
2149 
2150 	if (!cpu_base->deferred_rearm)
2151 		return;
2152 
2153 	guard(raw_spinlock)(&cpu_base->lock);
2154 	if (cpu_base->deferred_needs_update) {
2155 		hrtimer_update_base(cpu_base);
2156 		expires_next = hrtimer_update_next_event(cpu_base);
2157 	} else {
2158 		/* No timer added/removed. Use the cached value */
2159 		expires_next = cpu_base->deferred_expires_next;
2160 	}
2161 	hrtimer_rearm(cpu_base, expires_next, true);
2162 }
2163 
2164 static __always_inline void
2165 hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next)
2166 {
2167 	/* hrtimer_interrupt() just re-evaluated the first expiring timer */
2168 	cpu_base->deferred_needs_update = false;
2169 	/* Cache the expiry time */
2170 	cpu_base->deferred_expires_next = expires_next;
2171 	set_thread_flag(TIF_HRTIMER_REARM);
2172 }
2173 #else  /* CONFIG_HRTIMER_REARM_DEFERRED */
2174 static __always_inline void
2175 hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next)
2176 {
2177 	hrtimer_rearm(cpu_base, expires_next, false);
2178 }
2179 #endif  /* !CONFIG_HRTIMER_REARM_DEFERRED */
2180 
/*
 * High resolution timer interrupt
 * Called with interrupts disabled
 *
 * Expires the hard timers of this CPU, raises HRTIMER_SOFTIRQ when soft
 * timers are due and hands the next expiry time to
 * hrtimer_interrupt_rearm().
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	ktime_t expires_next, now, entry_time, delta;
	unsigned long flags;
	int retries = 0;

	BUG_ON(!cpu_base->hres_active);
	cpu_base->nr_events++;
	dev->next_event = KTIME_MAX;
	dev->next_event_forced = 0;

	raw_spin_lock_irqsave(&cpu_base->lock, flags);
	/* entry_time is kept for the hang time calculation below */
	entry_time = now = hrtimer_update_base(cpu_base);
retry:
	cpu_base->deferred_rearm = true;
	/*
	 * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue
	 * timers while __hrtimer_run_queues() is expiring the clock bases.
	 * Timers which are re/enqueued on the local CPU are not affected by
	 * this.
	 */
	cpu_base->expires_next = KTIME_MAX;

	/* Soft timers are due: let the softirq expire them */
	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
		cpu_base->softirq_expires_next = KTIME_MAX;
		cpu_base->softirq_activated = true;
		raise_timer_softirq(HRTIMER_SOFTIRQ);
	}

	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);

	/*
	 * The next timer was already expired due to:
	 * - tracing
	 * - long lasting callbacks
	 * - being scheduled away when running in a VM
	 *
	 * We need to prevent that we loop forever in the hrtimer interrupt
	 * routine. We give it 3 attempts to avoid overreacting on some
	 * spurious event.
	 */
	now = hrtimer_update_base(cpu_base);
	expires_next = hrtimer_update_next_event(cpu_base);
	cpu_base->hang_detected = false;
	if (expires_next < now) {
		if (++retries < 3)
			goto retry;

		/*
		 * Still behind after the last attempt: account the hang.
		 * Note that max_t() converts delta to unsigned int.
		 */
		delta = ktime_sub(now, entry_time);
		cpu_base->max_hang_time = max_t(unsigned int, cpu_base->max_hang_time, delta);
		cpu_base->nr_hangs++;
		cpu_base->hang_detected = true;
	}

	hrtimer_interrupt_rearm(cpu_base, expires_next);
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
2243 
2244 #endif /* !CONFIG_HIGH_RES_TIMERS */
2245 
2246 /*
2247  * Called from run_local_timers in hardirq context every jiffy
2248  */
2249 void hrtimer_run_queues(void)
2250 {
2251 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
2252 	unsigned long flags;
2253 	ktime_t now;
2254 
2255 	if (hrtimer_hres_active(cpu_base))
2256 		return;
2257 
2258 	/*
2259 	 * This _is_ ugly: We have to check periodically, whether we
2260 	 * can switch to highres and / or nohz mode. The clocksource
2261 	 * switch happens with xtime_lock held. Notification from
2262 	 * there only sets the check bit in the tick_oneshot code,
2263 	 * otherwise we might deadlock vs. xtime_lock.
2264 	 */
2265 	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
2266 		hrtimer_switch_to_hres();
2267 		return;
2268 	}
2269 
2270 	raw_spin_lock_irqsave(&cpu_base->lock, flags);
2271 	now = hrtimer_update_base(cpu_base);
2272 
2273 	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
2274 		cpu_base->softirq_expires_next = KTIME_MAX;
2275 		cpu_base->softirq_activated = true;
2276 		raise_timer_softirq(HRTIMER_SOFTIRQ);
2277 	}
2278 
2279 	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
2280 	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
2281 }
2282 
2283 /*
2284  * Sleep related functions:
2285  */
2286 static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
2287 {
2288 	struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer);
2289 	struct task_struct *task = t->task;
2290 
2291 	t->task = NULL;
2292 	if (task)
2293 		wake_up_process(task);
2294 
2295 	return HRTIMER_NORESTART;
2296 }
2297 
2298 /**
2299  * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
2300  * @sl:		sleeper to be started
2301  * @mode:	timer mode abs/rel
2302  *
2303  * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
2304  * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
2305  */
2306 void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode)
2307 {
2308 	/*
2309 	 * Make the enqueue delivery mode check work on RT. If the sleeper
2310 	 * was initialized for hard interrupt delivery, force the mode bit.
2311 	 * This is a special case for hrtimer_sleepers because
2312 	 * __hrtimer_setup_sleeper() determines the delivery mode on RT so the
2313 	 * fiddling with this decision is avoided at the call sites.
2314 	 */
2315 	if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
2316 		mode |= HRTIMER_MODE_HARD;
2317 
2318 	/* If already expired, clear the task pointer and set current state to running */
2319 	if (!hrtimer_start_expires_user(&sl->timer, mode)) {
2320 		sl->task = NULL;
2321 		__set_current_state(TASK_RUNNING);
2322 	}
2323 }
2324 EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
2325 
2326 static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
2327 				    enum hrtimer_mode mode)
2328 {
2329 	/*
2330 	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
2331 	 * marked for hard interrupt expiry mode are moved into soft
2332 	 * interrupt context either for latency reasons or because the
2333 	 * hrtimer callback takes regular spinlocks or invokes other
2334 	 * functions which are not suitable for hard interrupt context on
2335 	 * PREEMPT_RT.
2336 	 *
2337 	 * The hrtimer_sleeper callback is RT compatible in hard interrupt
2338 	 * context, but there is a latency concern: Untrusted userspace can
2339 	 * spawn many threads which arm timers for the same expiry time on
2340 	 * the same CPU. That causes a latency spike due to the wakeup of
2341 	 * a gazillion threads.
2342 	 *
2343 	 * OTOH, privileged real-time user space applications rely on the
2344 	 * low latency of hard interrupt wakeups. If the current task is in
2345 	 * a real-time scheduling class, mark the mode for hard interrupt
2346 	 * expiry.
2347 	 */
2348 	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
2349 		if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT))
2350 			mode |= HRTIMER_MODE_HARD;
2351 	}
2352 
2353 	__hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode);
2354 	sl->task = current;
2355 }
2356 
2357 /**
2358  * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory
2359  * @sl:		sleeper to be initialized
2360  * @clock_id:	the clock to be used
2361  * @mode:	timer mode abs/rel
2362  */
2363 void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id,
2364 				    enum hrtimer_mode mode)
2365 {
2366 	debug_setup_on_stack(&sl->timer, clock_id, mode);
2367 	__hrtimer_setup_sleeper(sl, clock_id, mode);
2368 }
2369 EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack);
2370 
2371 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
2372 {
2373 	switch(restart->nanosleep.type) {
2374 #ifdef CONFIG_COMPAT_32BIT_TIME
2375 	case TT_COMPAT:
2376 		if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp))
2377 			return -EFAULT;
2378 		break;
2379 #endif
2380 	case TT_NATIVE:
2381 		if (put_timespec64(ts, restart->nanosleep.rmtp))
2382 			return -EFAULT;
2383 		break;
2384 	default:
2385 		BUG();
2386 	}
2387 	return -ERESTART_RESTARTBLOCK;
2388 }
2389 
2390 static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
2391 {
2392 	struct restart_block *restart;
2393 
2394 	do {
2395 		set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
2396 		hrtimer_sleeper_start_expires(t, mode);
2397 
2398 		if (likely(t->task))
2399 			schedule();
2400 
2401 		hrtimer_cancel(&t->timer);
2402 		mode = HRTIMER_MODE_ABS;
2403 
2404 	} while (t->task && !signal_pending(current));
2405 
2406 	__set_current_state(TASK_RUNNING);
2407 
2408 	if (!t->task)
2409 		return 0;
2410 
2411 	restart = &current->restart_block;
2412 	if (restart->nanosleep.type != TT_NONE) {
2413 		ktime_t rem = hrtimer_expires_remaining(&t->timer);
2414 		struct timespec64 rmt;
2415 
2416 		if (rem <= 0)
2417 			return 0;
2418 		rmt = ktime_to_timespec64(rem);
2419 
2420 		return nanosleep_copyout(restart, &rmt);
2421 	}
2422 	return -ERESTART_RESTARTBLOCK;
2423 }
2424 
2425 static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
2426 {
2427 	struct hrtimer_sleeper t;
2428 	int ret;
2429 
2430 	hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS);
2431 	hrtimer_set_expires(&t.timer, restart->nanosleep.expires);
2432 	ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
2433 	destroy_hrtimer_on_stack(&t.timer);
2434 	return ret;
2435 }
2436 
2437 long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid)
2438 {
2439 	struct restart_block *restart;
2440 	struct hrtimer_sleeper t;
2441 	int ret;
2442 
2443 	hrtimer_setup_sleeper_on_stack(&t, clockid, mode);
2444 	hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns);
2445 	ret = do_nanosleep(&t, mode);
2446 	if (ret != -ERESTART_RESTARTBLOCK)
2447 		goto out;
2448 
2449 	/* Absolute timers do not update the rmtp value and restart: */
2450 	if (mode == HRTIMER_MODE_ABS) {
2451 		ret = -ERESTARTNOHAND;
2452 		goto out;
2453 	}
2454 
2455 	restart = &current->restart_block;
2456 	restart->nanosleep.clockid = t.timer.base->clockid;
2457 	restart->nanosleep.expires = hrtimer_get_expires(&t.timer);
2458 	set_restart_fn(restart, hrtimer_nanosleep_restart);
2459 out:
2460 	destroy_hrtimer_on_stack(&t.timer);
2461 	return ret;
2462 }
2463 
2464 #ifdef CONFIG_64BIT
2465 
2466 SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
2467 		struct __kernel_timespec __user *, rmtp)
2468 {
2469 	struct timespec64 tu;
2470 
2471 	if (get_timespec64(&tu, rqtp))
2472 		return -EFAULT;
2473 
2474 	if (!timespec64_valid(&tu))
2475 		return -EINVAL;
2476 
2477 	current->restart_block.fn = do_no_restart_syscall;
2478 	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
2479 	current->restart_block.nanosleep.rmtp = rmtp;
2480 	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
2481 }
2482 
2483 #endif
2484 
2485 #ifdef CONFIG_COMPAT_32BIT_TIME
2486 
2487 SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
2488 		struct old_timespec32 __user *, rmtp)
2489 {
2490 	struct timespec64 tu;
2491 
2492 	if (get_old_timespec32(&tu, rqtp))
2493 		return -EFAULT;
2494 
2495 	if (!timespec64_valid(&tu))
2496 		return -EINVAL;
2497 
2498 	current->restart_block.fn = do_no_restart_syscall;
2499 	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
2500 	current->restart_block.nanosleep.compat_rmtp = rmtp;
2501 	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
2502 }
2503 #endif
2504 
2505 /*
2506  * Functions related to boot-time initialization:
2507  */
2508 int hrtimers_prepare_cpu(unsigned int cpu)
2509 {
2510 	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
2511 
2512 	for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
2513 		struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
2514 
2515 		clock_b->cpu_base = cpu_base;
2516 		seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
2517 		timerqueue_linked_init_head(&clock_b->active);
2518 	}
2519 
2520 	cpu_base->cpu = cpu;
2521 	hrtimer_cpu_base_init_expiry_lock(cpu_base);
2522 	return 0;
2523 }
2524 
2525 int hrtimers_cpu_starting(unsigned int cpu)
2526 {
2527 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
2528 
2529 	/* Clear out any left over state from a CPU down operation */
2530 	cpu_base->active_bases = 0;
2531 	cpu_base->hres_active = false;
2532 	cpu_base->hang_detected = false;
2533 	cpu_base->next_timer = NULL;
2534 	cpu_base->softirq_next_timer = NULL;
2535 	cpu_base->expires_next = KTIME_MAX;
2536 	cpu_base->softirq_expires_next = KTIME_MAX;
2537 	cpu_base->softirq_activated = false;
2538 	cpu_base->online = true;
2539 	return 0;
2540 }
2541 
2542 #ifdef CONFIG_HOTPLUG_CPU
2543 
2544 static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
2545 				struct hrtimer_clock_base *new_base)
2546 {
2547 	struct timerqueue_linked_node *node;
2548 	struct hrtimer *timer;
2549 
2550 	while ((node = timerqueue_linked_first(&old_base->active))) {
2551 		timer = hrtimer_from_timerqueue_node(node);
2552 		BUG_ON(hrtimer_callback_running(timer));
2553 		debug_hrtimer_deactivate(timer);
2554 
2555 		/*
2556 		 * Mark it as ENQUEUED not INACTIVE otherwise the
2557 		 * timer could be seen as !active and just vanish away
2558 		 * under us on another CPU
2559 		 */
2560 		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, false);
2561 		timer->base = new_base;
2562 		/*
2563 		 * Enqueue the timers on the new cpu. This does not
2564 		 * reprogram the event device in case the timer
2565 		 * expires before the earliest on this CPU, but we run
2566 		 * hrtimer_interrupt after we migrated everything to
2567 		 * sort out already expired timers and reprogram the
2568 		 * event device.
2569 		 */
2570 		enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS, true);
2571 	}
2572 }
2573 
2574 int hrtimers_cpu_dying(unsigned int dying_cpu)
2575 {
2576 	int ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
2577 	struct hrtimer_cpu_base *old_base, *new_base;
2578 
2579 	old_base = this_cpu_ptr(&hrtimer_bases);
2580 	new_base = &per_cpu(hrtimer_bases, ncpu);
2581 
2582 	/*
2583 	 * The caller is globally serialized and nobody else
2584 	 * takes two locks at once, deadlock is not possible.
2585 	 */
2586 	raw_spin_lock(&old_base->lock);
2587 	raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
2588 
2589 	for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
2590 		migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]);
2591 
2592 	/* Tell the other CPU to retrigger the next event */
2593 	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
2594 
2595 	raw_spin_unlock(&new_base->lock);
2596 	old_base->online = false;
2597 	raw_spin_unlock(&old_base->lock);
2598 
2599 	return 0;
2600 }
2601 
2602 #endif /* CONFIG_HOTPLUG_CPU */
2603 
2604 void __init hrtimers_init(void)
2605 {
2606 	hrtimers_prepare_cpu(smp_processor_id());
2607 	hrtimers_cpu_starting(smp_processor_id());
2608 	open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
2609 }
2610