xref: /linux/kernel/time/hrtimer.c (revision ff1c0c5d07028a84837950b619d30da623f8ddb2)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
4  *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
5  *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
6  *
7  *  High-resolution kernel timers
8  *
9  *  In contrast to the low-resolution timeout API, aka timer wheel,
10  *  hrtimers provide finer resolution and accuracy depending on system
11  *  configuration and capabilities.
12  *
13  *  Started by: Thomas Gleixner and Ingo Molnar
14  *
15  *  Credits:
16  *	Based on the original timer wheel code
17  *
18  *	Help, testing, suggestions, bugfixes, improvements were
19  *	provided by:
20  *
21  *	George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
 *	et al.
23  */
24 
25 #include <linux/cpu.h>
26 #include <linux/export.h>
27 #include <linux/percpu.h>
28 #include <linux/hrtimer.h>
29 #include <linux/notifier.h>
30 #include <linux/syscalls.h>
31 #include <linux/interrupt.h>
32 #include <linux/tick.h>
33 #include <linux/err.h>
34 #include <linux/debugobjects.h>
35 #include <linux/sched/signal.h>
36 #include <linux/sched/sysctl.h>
37 #include <linux/sched/rt.h>
38 #include <linux/sched/deadline.h>
39 #include <linux/sched/nohz.h>
40 #include <linux/sched/debug.h>
41 #include <linux/sched/isolation.h>
42 #include <linux/timer.h>
43 #include <linux/freezer.h>
44 #include <linux/compat.h>
45 
46 #include <linux/uaccess.h>
47 
48 #include <trace/events/timer.h>
49 
50 #include "tick-internal.h"
51 
52 /*
53  * Constants to set the queued state of the timer (INACTIVE, ENQUEUED)
54  *
55  * The callback state is kept separate in the CPU base because having it in
 * the timer would require touching the timer after the callback, which
57  * makes it impossible to free the timer from the callback function.
58  *
59  * Therefore we track the callback state in:
60  *
61  *	timer->base->cpu_base->running == timer
62  *
63  * On SMP it is possible to have a "callback function running and enqueued"
64  * status. It happens for example when a posix timer expired and the callback
65  * queued a signal. Between dropping the lock which protects the posix timer
66  * and reacquiring the base lock of the hrtimer, another CPU can deliver the
67  * signal and rearm the timer.
68  *
69  * All state transitions are protected by cpu_base->lock.
70  */
71 #define HRTIMER_STATE_INACTIVE	false
72 #define HRTIMER_STATE_ENQUEUED	true
73 
74 /*
75  * The resolution of the clocks. The resolution value is returned in
76  * the clock_getres() system call to give application programmers an
77  * idea of the (in)accuracy of timers. Timer values are rounded up to
78  * this resolution values.
79  */
80 #define HIGH_RES_NSEC		1
81 
82 /*
83  * Masks for selecting the soft and hard context timers from
84  * cpu_base->active
85  */
86 #define MASK_SHIFT		(HRTIMER_BASE_MONOTONIC_SOFT)
87 #define HRTIMER_ACTIVE_HARD	((1U << MASK_SHIFT) - 1)
88 #define HRTIMER_ACTIVE_SOFT	(HRTIMER_ACTIVE_HARD << MASK_SHIFT)
89 #define HRTIMER_ACTIVE_ALL	(HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
90 
91 static void retrigger_next_event(void *arg);
92 static ktime_t __hrtimer_cb_get_time(clockid_t clock_id);
93 
94 /*
95  * The timer bases:
96  *
97  * There are more clockids than hrtimer bases. Thus, we index
98  * into the timer bases by the hrtimer_base_type enum. When trying
99  * to reach a base using a clockid, hrtimer_clockid_to_base()
100  * is used to convert from clockid to the proper hrtimer_base_type.
101  */
102 
/* Initializer for one clock base: index and clockid are fixed at build time */
#define BASE_INIT(idx, cid)			\
	[idx] = { .index = idx, .clockid = cid }

/*
 * Per CPU timer base container. The hard interrupt expiring bases come
 * first, their soft (softirq expiring) counterparts follow in the same
 * clockid order. The CSD is set up to invoke retrigger_next_event() on
 * this CPU from a remote one.
 */
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
	.clock_base = {
		BASE_INIT(HRTIMER_BASE_MONOTONIC,	CLOCK_MONOTONIC),
		BASE_INIT(HRTIMER_BASE_REALTIME,	CLOCK_REALTIME),
		BASE_INIT(HRTIMER_BASE_BOOTTIME,	CLOCK_BOOTTIME),
		BASE_INIT(HRTIMER_BASE_TAI,		CLOCK_TAI),
		BASE_INIT(HRTIMER_BASE_MONOTONIC_SOFT,	CLOCK_MONOTONIC),
		BASE_INIT(HRTIMER_BASE_REALTIME_SOFT,	CLOCK_REALTIME),
		BASE_INIT(HRTIMER_BASE_BOOTTIME_SOFT,	CLOCK_BOOTTIME),
		BASE_INIT(HRTIMER_BASE_TAI_SOFT,	CLOCK_TAI),
	},
	.csd = CSD_INIT(retrigger_next_event, NULL)
};
121 
122 static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
123 {
124 	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
125 		return true;
126 	else
127 		return likely(base->online);
128 }
129 
#ifdef CONFIG_HIGH_RES_TIMERS
/* Static key: flipped on once the first CPU switched to high resolution mode */
DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key);

/* Enable the static key from workqueue (process) context */
static void hrtimer_hres_workfn(struct work_struct *work)
{
	static_branch_enable(&hrtimer_highres_enabled_key);
}

static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn);

/* Schedule the static key enablement unless it is already enabled */
static inline void hrtimer_schedule_hres_work(void)
{
	if (!hrtimer_highres_enabled())
		schedule_work(&hrtimer_hres_work);
}
#else
/* !CONFIG_HIGH_RES_TIMERS: nothing to enable */
static inline void hrtimer_schedule_hres_work(void) { }
#endif
148 
149 /*
150  * Functions and macros which are different for UP/SMP systems are kept in a
151  * single place
152  */
153 #ifdef CONFIG_SMP
154 /*
155  * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
156  * such that hrtimer_callback_running() can unconditionally dereference
157  * timer->base->cpu_base
158  */
static struct hrtimer_cpu_base migration_cpu_base = {
	.clock_base = {
		[0] = {
			/* Point back at the container so dereferences stay valid */
			.cpu_base = &migration_cpu_base,
			/* Initialize the seqcount so readers see a valid state */
			.seq      = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
							     &migration_cpu_base.lock),
		},
	},
};

/* Shorthand for the placeholder clock base used during migration */
#define migration_base	migration_cpu_base.clock_base[0]
170 
171 /*
172  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
173  * means that all timers which are tied to this base via timer->base are
174  * locked, and the base itself is locked too.
175  *
176  * So __run_timers/migrate_timers can safely modify all timers which could
177  * be found on the lists/queues.
178  *
179  * When the timer's base is locked, and the timer removed from list, it is
180  * possible to set timer->base = &migration_base and drop the lock: the timer
181  * remains locked.
182  */
/*
 * Lock the cpu base which @timer is currently tied to and return the
 * clock base. Spins while the timer is transiently parked on
 * &migration_base by switch_hrtimer_base().
 */
static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
						    unsigned long *flags)
	__acquires(&timer->base->lock)
{
	for (;;) {
		struct hrtimer_clock_base *base = READ_ONCE(timer->base);

		if (likely(base != &migration_base)) {
			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
			/* Recheck under the lock: a migration may have raced us */
			if (likely(base == timer->base))
				return base;
			/* The timer has migrated to another CPU: */
			raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
		}
		cpu_relax();
	}
}
200 
201 /*
202  * Check if the elected target is suitable considering its next
203  * event and the hotplug state of the current CPU.
204  *
205  * If the elected target is remote and its next event is after the timer
206  * to queue, then a remote reprogram is necessary. However there is no
207  * guarantee the IPI handling the operation would arrive in time to meet
208  * the high resolution deadline. In this case the local CPU becomes a
209  * preferred target, unless it is offline.
210  *
211  * High and low resolution modes are handled the same way for simplicity.
212  *
213  * Called with cpu_base->lock of target cpu held.
214  */
static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
				    struct hrtimer_cpu_base *new_cpu_base,
				    struct hrtimer_cpu_base *this_cpu_base)
{
	ktime_t expires;

	/*
	 * The local CPU clockevent can be reprogrammed. Also get_target_base()
	 * guarantees it is online.
	 */
	if (new_cpu_base == this_cpu_base)
		return true;

	/*
	 * The offline local CPU can't be the default target if the
	 * next remote target event is after this timer. Keep the
	 * elected new base. An IPI will be issued to reprogram
	 * it as a last resort.
	 */
	if (!hrtimer_base_is_online(this_cpu_base))
		return true;

	/* Compare in CLOCK_MONOTONIC based device time */
	expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);

	/* A remote target is only suitable if this timer is not its next event */
	return expires >= new_base->cpu_base->expires_next;
}
241 
/*
 * Select the CPU base to enqueue on. If the local base is offline, pick
 * any online housekeeping CPU. Otherwise, when timer migration is
 * enabled and the timer is not pinned, let get_nohz_timer_target()
 * choose a power-optimized target; else stay on @base.
 */
static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, bool pinned)
{
	if (!hrtimer_base_is_online(base)) {
		int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));

		return &per_cpu(hrtimer_bases, cpu);
	}

#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
	if (static_branch_likely(&timers_migration_enabled) && !pinned)
		return &per_cpu(hrtimer_bases, get_nohz_timer_target());
#endif
	return base;
}
256 
257 /*
258  * We switch the timer base to a power-optimized selected CPU target,
259  * if:
260  *	- NO_HZ_COMMON is enabled
261  *	- timer migration is enabled
262  *	- the timer callback is not running
263  *	- the timer is not the first expiring timer on the new target
264  *
265  * If one of the above requirements is not fulfilled we move the timer
266  * to the current CPU or leave it on the previously assigned CPU if
267  * the timer callback is currently running.
268  */
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, bool pinned)
{
	struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
	struct hrtimer_clock_base *new_base;
	int basenum = base->index;

	this_cpu_base = this_cpu_ptr(&hrtimer_bases);
	new_cpu_base = get_target_base(this_cpu_base, pinned);
again:
	/* Same clock base type (index) on the elected target CPU base */
	new_base = &new_cpu_base->clock_base[basenum];

	if (base != new_base) {
		/*
		 * We are trying to move timer to new_base. However we can't
		 * change timer's base while it is running, so we keep it on
		 * the same CPU. No hassle vs. reprogramming the event source
		 * in the high resolution case. The remote CPU will take care
		 * of this when the timer function has completed. There is no
		 * conflict as we hold the lock until the timer is enqueued.
		 */
		if (unlikely(hrtimer_callback_running(timer)))
			return base;

		/* See the comment in lock_hrtimer_base() */
		WRITE_ONCE(timer->base, &migration_base);
		raw_spin_unlock(&base->cpu_base->lock);
		raw_spin_lock(&new_base->cpu_base->lock);

		if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
			/* Target unsuitable: fall back to the local CPU and retry */
			raw_spin_unlock(&new_base->cpu_base->lock);
			raw_spin_lock(&base->cpu_base->lock);
			new_cpu_base = this_cpu_base;
			WRITE_ONCE(timer->base, base);
			goto again;
		}
		WRITE_ONCE(timer->base, new_base);
	} else {
		if (!hrtimer_suitable_target(timer, new_base,  new_cpu_base, this_cpu_base)) {
			new_cpu_base = this_cpu_base;
			goto again;
		}
	}
	return new_base;
}
314 
315 #else /* CONFIG_SMP */
316 
/* UP: there is only one cpu base, no migration loop required */
static inline struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
							   unsigned long *flags)
	__acquires(&timer->base->cpu_base->lock)
{
	struct hrtimer_clock_base *base = timer->base;

	raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
	return base;
}

/* UP: switching the base is pointless, keep the current one */
# define switch_hrtimer_base(t, b, p)	(b)
328 
329 #endif	/* !CONFIG_SMP */
330 
331 /*
332  * Functions for the union type storage format of ktime_t which are
333  * too large for inlining:
334  */
335 #if BITS_PER_LONG < 64
336 /*
337  * Divide a ktime value by a nanosecond value
338  */
339 s64 __ktime_divns(const ktime_t kt, s64 div)
340 {
341 	int sft = 0;
342 	s64 dclc;
343 	u64 tmp;
344 
345 	dclc = ktime_to_ns(kt);
346 	tmp = dclc < 0 ? -dclc : dclc;
347 
348 	/* Make sure the divisor is less than 2^32: */
349 	while (div >> 32) {
350 		sft++;
351 		div >>= 1;
352 	}
353 	tmp >>= sft;
354 	do_div(tmp, (u32) div);
355 	return dclc < 0 ? -tmp : tmp;
356 }
357 EXPORT_SYMBOL_GPL(__ktime_divns);
358 #endif /* BITS_PER_LONG < 64 */
359 
360 /*
361  * Add two ktime values and do a safety check for overflow:
362  */
363 ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
364 {
365 	ktime_t res = ktime_add_unsafe(lhs, rhs);
366 
367 	/*
368 	 * We use KTIME_SEC_MAX here, the maximum timeout which we can
369 	 * return to user space in a timespec:
370 	 */
371 	if (res < 0 || res < lhs || res < rhs)
372 		res = ktime_set(KTIME_SEC_MAX, 0);
373 
374 	return res;
375 }
376 
377 EXPORT_SYMBOL_GPL(ktime_add_safe);
378 
379 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
380 
381 static const struct debug_obj_descr hrtimer_debug_descr;
382 
/* debugobjects hint: identify a timer object by its callback function */
static void *hrtimer_debug_hint(void *addr)
{
	return ACCESS_PRIVATE((struct hrtimer *)addr, function);
}
387 
388 /*
389  * fixup_init is called when:
390  * - an active object is initialized
391  */
392 static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
393 {
394 	struct hrtimer *timer = addr;
395 
396 	switch (state) {
397 	case ODEBUG_STATE_ACTIVE:
398 		hrtimer_cancel(timer);
399 		debug_object_init(timer, &hrtimer_debug_descr);
400 		return true;
401 	default:
402 		return false;
403 	}
404 }
405 
406 /*
407  * fixup_activate is called when:
408  * - an active object is activated
409  * - an unknown non-static object is activated
410  */
411 static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
412 {
413 	switch (state) {
414 	case ODEBUG_STATE_ACTIVE:
415 		WARN_ON(1);
416 		fallthrough;
417 	default:
418 		return false;
419 	}
420 }
421 
422 /*
423  * fixup_free is called when:
424  * - an active object is freed
425  */
426 static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
427 {
428 	struct hrtimer *timer = addr;
429 
430 	switch (state) {
431 	case ODEBUG_STATE_ACTIVE:
432 		hrtimer_cancel(timer);
433 		debug_object_free(timer, &hrtimer_debug_descr);
434 		return true;
435 	default:
436 		return false;
437 	}
438 }
439 
/* Stub timer callback for improperly used timers. */
static enum hrtimer_restart stub_timer(struct hrtimer *unused)
{
	/* Getting here means an untracked timer fired; warn once and stop it */
	WARN_ON_ONCE(1);
	return HRTIMER_NORESTART;
}
446 
447 /*
448  * hrtimer_fixup_assert_init is called when:
449  * - an untracked/uninit-ed object is found
450  */
451 static bool hrtimer_fixup_assert_init(void *addr, enum debug_obj_state state)
452 {
453 	struct hrtimer *timer = addr;
454 
455 	switch (state) {
456 	case ODEBUG_STATE_NOTAVAILABLE:
457 		hrtimer_setup(timer, stub_timer, CLOCK_MONOTONIC, 0);
458 		return true;
459 	default:
460 		return false;
461 	}
462 }
463 
/* Wire up the debugobjects callbacks for hrtimer objects */
static const struct debug_obj_descr hrtimer_debug_descr = {
	.name			= "hrtimer",
	.debug_hint		= hrtimer_debug_hint,
	.fixup_init		= hrtimer_fixup_init,
	.fixup_activate		= hrtimer_fixup_activate,
	.fixup_free		= hrtimer_fixup_free,
	.fixup_assert_init	= hrtimer_fixup_assert_init,
};
472 
/* Start debugobjects tracking for a newly initialized timer */
static inline void debug_hrtimer_init(struct hrtimer *timer)
{
	debug_object_init(timer, &hrtimer_debug_descr);
}
477 
/* Start debugobjects tracking for a timer that lives on the stack */
static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer)
{
	debug_object_init_on_stack(timer, &hrtimer_debug_descr);
}
482 
/* Mark the timer object active in debugobjects */
static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode)
{
	debug_object_activate(timer, &hrtimer_debug_descr);
}
487 
/* Mark the timer object inactive in debugobjects */
static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
{
	debug_object_deactivate(timer, &hrtimer_debug_descr);
}
492 
/* Assert that the timer object has been initialized */
static inline void debug_hrtimer_assert_init(struct hrtimer *timer)
{
	debug_object_assert_init(timer, &hrtimer_debug_descr);
}
497 
/* Release debugobjects tracking of an on-stack timer before it goes out of scope */
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
	debug_object_free(timer, &hrtimer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
503 
504 #else
505 
/* !CONFIG_DEBUG_OBJECTS_TIMERS: all tracking hooks compile away */
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { }
static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
static inline void debug_hrtimer_assert_init(struct hrtimer *timer) { }
511 #endif
512 
/* Combine debugobjects init tracking with the hrtimer_setup tracepoint */
static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode)
{
	debug_hrtimer_init(timer);
	trace_hrtimer_setup(timer, clockid, mode);
}
518 
/* Same as debug_setup(), but for timers which live on the stack */
static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid,
					enum hrtimer_mode mode)
{
	debug_hrtimer_init_on_stack(timer);
	trace_hrtimer_setup(timer, clockid, mode);
}
525 
/* Combine debugobjects activation tracking with the hrtimer_start tracepoint */
static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode, bool was_armed)
{
	debug_hrtimer_activate(timer, mode);
	trace_hrtimer_start(timer, mode, was_armed);
}
531 
/*
 * Iterate @base over every clock base of @cpu_base whose bit is set in
 * @active. ffs() yields a 1-based bit position, so idx-- both converts
 * to a 0-based index and terminates the loop when no bit is left. Each
 * handled bit is cleared from @active after its single inner iteration.
 */
#define for_each_active_base(base, cpu_base, active)					\
	for (unsigned int idx = ffs(active); idx--; idx = ffs((active)))		\
		for (bool done = false; !done; active &= ~(1U << idx))			\
			for (base = &cpu_base->clock_base[idx]; !done; done = true)

/* Resolve a timerqueue node back to its enclosing hrtimer */
#define hrtimer_from_timerqueue_node(_n) container_of_const(_n, struct hrtimer, node)
538 
539 #if defined(CONFIG_NO_HZ_COMMON)
540 /*
541  * Same as hrtimer_bases_next_event() below, but skips the excluded timer and
542  * does not update cpu_base->next_timer/expires.
543  */
static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_base,
						const struct hrtimer *exclude,
						unsigned int active, ktime_t expires_next)
{
	struct hrtimer_clock_base *base;
	ktime_t expires;

	lockdep_assert_held(&cpu_base->lock);

	for_each_active_base(base, cpu_base, active) {
		/* Compare in CLOCK_MONOTONIC based device time */
		expires = ktime_sub(base->expires_next, base->offset);
		if (expires >= expires_next)
			continue;

		/*
		 * If the excluded timer is the first on this base evaluate the
		 * next timer.
		 */
		struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active);

		if (unlikely(&exclude->node == node)) {
			node = timerqueue_linked_next(node);
			if (!node)
				continue;
			expires = ktime_sub(node->expires, base->offset);
			if (expires >= expires_next)
				continue;
		}
		expires_next = expires;
	}
	/* If base->offset changed, the result might be negative */
	return max(expires_next, 0);
}
577 #endif
578 
579 static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base)
580 {
581 	struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
582 
583 	return hrtimer_from_timerqueue_node(next);
584 }
585 
586 /* Find the base with the earliest expiry */
587 static void hrtimer_bases_first(struct hrtimer_cpu_base *cpu_base,unsigned int active,
588 				ktime_t *expires_next, struct hrtimer **next_timer)
589 {
590 	struct hrtimer_clock_base *base;
591 	ktime_t expires;
592 
593 	for_each_active_base(base, cpu_base, active) {
594 		expires = ktime_sub(base->expires_next, base->offset);
595 		if (expires < *expires_next) {
596 			*expires_next = expires;
597 			*next_timer = clock_base_next_timer(base);
598 		}
599 	}
600 }
601 
602 /*
603  * Recomputes cpu_base::*next_timer and returns the earliest expires_next
604  * but does not set cpu_base::*expires_next, that is done by
605  * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
606  * cpu_base::*expires_next right away, reprogramming logic would no longer
607  * work.
608  *
609  * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
610  * those timers will get run whenever the softirq gets handled, at the end of
611  * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
612  *
613  * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
614  * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
615  * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
616  *
617  * @active_mask must be one of:
618  *  - HRTIMER_ACTIVE_ALL,
619  *  - HRTIMER_ACTIVE_SOFT, or
620  *  - HRTIMER_ACTIVE_HARD.
621  */
static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
{
	struct hrtimer *next_timer = NULL;
	ktime_t expires_next = KTIME_MAX;
	unsigned int active;

	lockdep_assert_held(&cpu_base->lock);

	/* Soft bases are only evaluated while no soft interrupt is in flight */
	if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
		active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
		if (active)
			hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
		cpu_base->softirq_next_timer = next_timer;
	}

	if (active_mask & HRTIMER_ACTIVE_HARD) {
		active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
		if (active)
			hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
		cpu_base->next_timer = next_timer;
	}
	/* If base->offset changed, the result might be negative. Clamp to 0. */
	return max(expires_next, 0);
}
645 
/*
 * Compute the next expiry the clock event device has to be armed for,
 * taking the first expiring soft timer into account when no soft
 * interrupt is already in flight.
 */
static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
{
	ktime_t expires_next, soft = KTIME_MAX;

	/*
	 * If the soft interrupt has already been activated, ignore the
	 * soft bases. They will be handled in the already raised soft
	 * interrupt.
	 */
	if (!cpu_base->softirq_activated) {
		soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
		/*
		 * Update the soft expiry time. clock_settime() might have
		 * affected it.
		 */
		cpu_base->softirq_expires_next = soft;
	}

	expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
	/*
	 * If a softirq timer is expiring first, update cpu_base->next_timer
	 * and program the hardware with the soft expiry time.
	 */
	if (expires_next > soft) {
		cpu_base->next_timer = cpu_base->softirq_next_timer;
		expires_next = soft;
	}

	return expires_next;
}
676 
/*
 * Refresh the REALTIME/BOOTTIME/TAI offsets of @base and mirror them
 * into the corresponding soft bases. Returns the time delivered by
 * ktime_get_update_offsets_now().
 */
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;

	ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real,
						   offs_boot, offs_tai);

	/* The soft bases share the offsets of their hard counterparts */
	base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
	base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
	base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;

	return now;
}
692 
693 /*
694  * Is the high resolution mode active in the CPU base. This cannot use the
695  * static key as the CPUs are switched to high resolution mode
696  * asynchronously.
697  */
698 static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
699 {
700 	return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
701 		cpu_base->hres_active : 0;
702 }
703 
/* Trace and program the clock event device for @expires_next */
static inline void hrtimer_rearm_event(ktime_t expires_next, bool deferred)
{
	trace_hrtimer_rearm(expires_next, deferred);
	tick_program_event(expires_next, 1);
}
709 
/*
 * Store the new expiry in @cpu_base and, when in high resolution mode
 * and no hang was detected, program the clock event device.
 * (@next_timer is currently unused here.)
 */
static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer,
				ktime_t expires_next)
{
	cpu_base->expires_next = expires_next;

	/*
	 * If hres is not active, hardware does not have to be
	 * reprogrammed yet.
	 *
	 * If a hang was detected in the last timer interrupt then we
	 * leave the hang delay active in the hardware. We want the
	 * system to make progress. That also prevents the following
	 * scenario:
	 * T1 expires 50ms from now
	 * T2 expires 5s from now
	 *
	 * T1 is removed, so this code is called and would reprogram
	 * the hardware to 5s from now. Any hrtimer_start after that
	 * will not reprogram the hardware due to hang_detected being
	 * set. So we'd effectively block all timers until the T2 event
	 * fires.
	 */
	if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
		return;

	hrtimer_rearm_event(expires_next, false);
}
737 
738 /* Reprogram the event source with a evaluation of all clock bases */
739 static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, bool skip_equal)
740 {
741 	ktime_t expires_next = hrtimer_update_next_event(cpu_base);
742 
743 	if (skip_equal && expires_next == cpu_base->expires_next)
744 		return;
745 
746 	__hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next);
747 }
748 
749 /* High resolution timer related functions */
750 #ifdef CONFIG_HIGH_RES_TIMERS
751 
/* High resolution timer enabled ? Overridable via the "highres=" boot option */
static bool hrtimer_hres_enabled __read_mostly  = true;
/* Current timer resolution in nanoseconds, reported via clock_getres() */
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);
756 
/* Enable / Disable high resolution mode via the "highres=" boot parameter */
static int __init setup_hrtimer_hres(char *str)
{
	/* Report the parameter as handled only when it parsed as a bool */
	return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
}
__setup("highres=", setup_hrtimer_hres);
763 
/* hrtimer_is_hres_enabled - query, if the highres mode is enabled */
static inline bool hrtimer_is_hres_enabled(void)
{
	return hrtimer_hres_enabled;
}
769 
/* Switch the current CPU to high resolution mode */
static void hrtimer_switch_to_hres(void)
{
	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);

	if (tick_init_highres()) {
		pr_warn("Could not switch to high resolution mode on CPU %u\n",	base->cpu);
		return;
	}
	base->hres_active = true;
	hrtimer_resolution = HIGH_RES_NSEC;

	/* Switch the tick handling over to the sched timer */
	tick_setup_sched_timer(true);
	/* "Retrigger" the interrupt to get things going */
	retrigger_next_event(NULL);
	/* Flip the global static key (deferred to workqueue context) */
	hrtimer_schedule_hres_work();
}
787 
788 #else
789 
/* !CONFIG_HIGH_RES_TIMERS: high resolution mode is never available */
static inline bool hrtimer_is_hres_enabled(void) { return false; }
static inline void hrtimer_switch_to_hres(void) { }
792 
793 #endif /* CONFIG_HIGH_RES_TIMERS */
794 
795 /*
796  * Retrigger next event is called after clock was set with interrupts
797  * disabled through an SMP function call or directly from low level
798  * resume code.
799  *
800  * This is only invoked when:
801  *	- CONFIG_HIGH_RES_TIMERS is enabled.
802  *	- CONFIG_NOHZ_COMMON is enabled
803  *
804  * For the other cases this function is empty and because the call sites
805  * are optimized out it vanishes as well, i.e. no need for lots of
806  * #ifdeffery.
807  */
static void retrigger_next_event(void *arg)
{
	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);

	/*
	 * When high resolution mode or nohz is active, then the offsets of
	 * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the
	 * next tick will take care of that.
	 *
	 * If high resolution mode is active then the next expiring timer
	 * must be reevaluated and the clock event device reprogrammed if
	 * necessary.
	 *
	 * In the NOHZ case the update of the offset and the reevaluation
	 * of the next expiring timer is enough. The return from the SMP
	 * function call will take care of the reprogramming in case the
	 * CPU was in a NOHZ idle sleep.
	 *
	 * In periodic low resolution mode, the next softirq expiration
	 * must also be updated.
	 */
	/* Serialize against all other timer operations on this CPU */
	guard(raw_spinlock)(&base->lock);
	hrtimer_update_base(base);
	if (hrtimer_hres_active(base))
		hrtimer_force_reprogram(base, /* skip_equal */ false);
	else
		hrtimer_update_next_event(base);
}
836 
837 /*
838  * When a timer is enqueued and expires earlier than the already enqueued
839  * timers, we have to check, whether it expires earlier than the timer for
840  * which the clock event device was armed.
841  *
842  * Called with interrupts disabled and base->cpu_base.lock held
843  */
static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	struct hrtimer_clock_base *base = timer->base;
	ktime_t expires = hrtimer_get_expires(timer);

	WARN_ON_ONCE(expires < 0);

	/* Convert to CLOCK_MONOTONIC based device time */
	expires = ktime_sub(expires, base->offset);
	/*
	 * CLOCK_REALTIME timer might be requested with an absolute
	 * expiry time which is less than base->offset. Set it to 0.
	 */
	if (expires < 0)
		expires = 0;

	if (timer->is_soft) {
		/*
		 * soft hrtimer could be started on a remote CPU. In this
		 * case softirq_expires_next needs to be updated on the
		 * remote CPU. The soft hrtimer will not expire before the
		 * first hard hrtimer on the remote CPU -
		 * hrtimer_check_target() prevents this case.
		 */
		struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;

		if (timer_cpu_base->softirq_activated)
			return;

		if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
			return;

		timer_cpu_base->softirq_next_timer = timer;
		timer_cpu_base->softirq_expires_next = expires;

		if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram)
			return;
	}

	/*
	 * If the timer is not on the current cpu, we cannot reprogram
	 * the other cpus clock event device.
	 */
	if (base->cpu_base != cpu_base)
		return;

	/* Not earlier than the currently armed event: nothing to do */
	if (expires >= cpu_base->expires_next)
		return;

	/* If a deferred rearm is pending skip reprogramming the device */
	if (cpu_base->deferred_rearm)
		return;

	cpu_base->next_timer = timer;

	__hrtimer_reprogram(cpu_base, timer, expires);
}
901 
/*
 * Decide whether a remote @cpu_base must be kicked with an IPI after a
 * clock was set. Updates the base offsets as a side effect.
 * Called with cpu_base->lock held (see clock_was_set()).
 */
static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active)
{
	struct hrtimer_clock_base *base;
	unsigned int seq;
	ktime_t expires;

	/*
	 * Update the base offsets unconditionally so the following
	 * checks whether the SMP function call is required works.
	 *
	 * The update is safe even when the remote CPU is in the hrtimer
	 * interrupt or the hrtimer soft interrupt and expiring affected
	 * bases. Either it will see the update before handling a base or
	 * it will see it when it finishes the processing and reevaluates
	 * the next expiring timer.
	 */
	seq = cpu_base->clock_was_set_seq;
	hrtimer_update_base(cpu_base);

	/*
	 * If the sequence did not change over the update then the
	 * remote CPU already handled it.
	 */
	if (seq == cpu_base->clock_was_set_seq)
		return false;

	/* If a deferred rearm is pending the remote CPU will take care of it */
	if (cpu_base->deferred_rearm) {
		cpu_base->deferred_needs_update = true;
		return false;
	}

	/*
	 * Walk the affected clock bases and check whether the first expiring
	 * timer in a clock base is moving ahead of the first expiring timer of
	 * @cpu_base. If so, the IPI must be invoked because per CPU clock
	 * event devices cannot be remotely reprogrammed.
	 */
	active &= cpu_base->active_bases;

	for_each_active_base(base, cpu_base, active) {
		struct timerqueue_linked_node *next;

		next = timerqueue_linked_first(&base->active);
		expires = ktime_sub(next->expires, base->offset);
		if (expires < cpu_base->expires_next)
			return true;

		/* Extra check for softirq clock bases */
		if (base->index < HRTIMER_BASE_MONOTONIC_SOFT)
			continue;
		if (cpu_base->softirq_activated)
			continue;
		if (expires < cpu_base->softirq_expires_next)
			return true;
	}
	return false;
}
960 
961 /*
962  * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and
963  * CLOCK_BOOTTIME (for late sleep time injection).
964  *
965  * This requires to update the offsets for these clocks
966  * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this
967  * also requires to eventually reprogram the per CPU clock event devices
968  * when the change moves an affected timer ahead of the first expiring
969  * timer on that CPU. Obviously remote per CPU clock event devices cannot
970  * be reprogrammed. The other reason why an IPI has to be sent is when the
971  * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets
972  * in the tick, which obviously might be stopped, so this has to bring out
973  * the remote CPU which might sleep in idle to get this sorted.
974  */
975 void clock_was_set(unsigned int bases)
976 {
977 	cpumask_var_t mask;
978 
979 	if (!hrtimer_highres_enabled() && !tick_nohz_is_active())
980 		goto out_timerfd;
981 
982 	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
983 		on_each_cpu(retrigger_next_event, NULL, 1);
984 		goto out_timerfd;
985 	}
986 
987 	/* Avoid interrupting CPUs if possible */
988 	scoped_guard(cpus_read_lock) {
989 		int cpu;
990 
991 		for_each_online_cpu(cpu) {
992 			struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
993 
994 			guard(raw_spinlock_irqsave)(&cpu_base->lock);
995 			if (update_needs_ipi(cpu_base, bases))
996 				cpumask_set_cpu(cpu, mask);
997 		}
998 		scoped_guard(preempt)
999 			smp_call_function_many(mask, retrigger_next_event, NULL, 1);
1000 	}
1001 	free_cpumask_var(mask);
1002 
1003 out_timerfd:
1004 	timerfd_clock_was_set();
1005 }
1006 
/* Work callback: propagate a wall clock change to all CPUs and timerfd */
static void clock_was_set_work(struct work_struct *work)
{
	clock_was_set(CLOCK_SET_WALL);
}

static DECLARE_WORK(hrtimer_work, clock_was_set_work);

/*
 * Called from timekeeping code to reprogram the hrtimer interrupt device
 * on all cpus and to notify timerfd.
 *
 * Deferred to workqueue context because clock_was_set() can sleep
 * (cpumask allocation, cpus_read_lock()).
 */
void clock_was_set_delayed(void)
{
	schedule_work(&hrtimer_work);
}
1022 
1023 /*
1024  * Called during resume either directly from via timekeeping_resume()
1025  * or in the case of s2idle from tick_unfreeze() to ensure that the
1026  * hrtimers are up to date.
1027  */
1028 void hrtimers_resume_local(void)
1029 {
1030 	lockdep_assert_irqs_disabled();
1031 	/* Retrigger on the local CPU */
1032 	retrigger_next_event(NULL);
1033 }
1034 
/*
 * Counterpart to lock_hrtimer_base above: drop the CPU base lock and
 * restore the interrupt state saved in @flags.
 */
static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
	__releases(&timer->base->cpu_base->lock)
{
	raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}
1041 
1042 /**
1043  * hrtimer_forward() - forward the timer expiry
1044  * @timer:	hrtimer to forward
1045  * @now:	forward past this time
1046  * @interval:	the interval to forward
1047  *
1048  * Forward the timer expiry so it will expire in the future.
1049  *
1050  * .. note::
1051  *  This only updates the timer expiry value and does not requeue the timer.
1052  *
1053  * There is also a variant of this function: hrtimer_forward_now().
1054  *
1055  * Context: Can be safely called from the callback function of @timer. If called
1056  *          from other contexts @timer must neither be enqueued nor running the
1057  *          callback and the caller needs to take care of serialization.
1058  *
1059  * Return: The number of overruns are returned.
1060  */
1061 u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
1062 {
1063 	ktime_t delta;
1064 	u64 orun = 1;
1065 
1066 	delta = ktime_sub(now, hrtimer_get_expires(timer));
1067 
1068 	if (delta < 0)
1069 		return 0;
1070 
1071 	if (WARN_ON(timer->is_queued))
1072 		return 0;
1073 
1074 	if (interval < hrtimer_resolution)
1075 		interval = hrtimer_resolution;
1076 
1077 	if (unlikely(delta >= interval)) {
1078 		s64 incr = ktime_to_ns(interval);
1079 
1080 		orun = ktime_divns(delta, incr);
1081 		hrtimer_add_expires_ns(timer, incr * orun);
1082 		if (hrtimer_get_expires(timer) > now)
1083 			return orun;
1084 		/*
1085 		 * This (and the ktime_add() below) is the
1086 		 * correction for exact:
1087 		 */
1088 		orun++;
1089 	}
1090 	hrtimer_add_expires(timer, interval);
1091 
1092 	return orun;
1093 }
1094 EXPORT_SYMBOL_GPL(hrtimer_forward);
1095 
1096 /*
1097  * enqueue_hrtimer - internal function to (re)start a timer
1098  *
1099  * The timer is inserted in expiry order. Insertion into the
1100  * red black tree is O(log(n)).
1101  *
1102  * Returns true when the new timer is the leftmost timer in the tree.
1103  */
1104 static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
1105 			    enum hrtimer_mode mode, bool was_armed)
1106 {
1107 	lockdep_assert_held(&base->cpu_base->lock);
1108 
1109 	debug_activate(timer, mode, was_armed);
1110 	WARN_ON_ONCE(!base->cpu_base->online);
1111 
1112 	base->cpu_base->active_bases |= 1 << base->index;
1113 
1114 	/* Pairs with the lockless read in hrtimer_is_queued() */
1115 	WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);
1116 
1117 	if (!timerqueue_linked_add(&base->active, &timer->node))
1118 		return false;
1119 
1120 	base->expires_next = hrtimer_get_expires(timer);
1121 	return true;
1122 }
1123 
1124 static inline void base_update_next_timer(struct hrtimer_clock_base *base)
1125 {
1126 	struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
1127 
1128 	base->expires_next = next ? next->expires : KTIME_MAX;
1129 }
1130 
1131 /*
1132  * __remove_hrtimer - internal function to remove a timer
1133  *
1134  * High resolution timer mode reprograms the clock event device when the
1135  * timer is the one which expires next. The caller can disable this by setting
1136  * reprogram to zero. This is useful, when the context does a reprogramming
1137  * anyway (e.g. timer interrupt)
1138  */
1139 static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
1140 			     bool newstate, bool reprogram)
1141 {
1142 	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
1143 	bool was_first;
1144 
1145 	lockdep_assert_held(&cpu_base->lock);
1146 
1147 	if (!timer->is_queued)
1148 		return;
1149 
1150 	/* Pairs with the lockless read in hrtimer_is_queued() */
1151 	WRITE_ONCE(timer->is_queued, newstate);
1152 
1153 	was_first = !timerqueue_linked_prev(&timer->node);
1154 
1155 	if (!timerqueue_linked_del(&base->active, &timer->node))
1156 		cpu_base->active_bases &= ~(1 << base->index);
1157 
1158 	/* Nothing to update if this was not the first timer in the base */
1159 	if (!was_first)
1160 		return;
1161 
1162 	base_update_next_timer(base);
1163 
1164 	/*
1165 	 * If reprogram is false don't update cpu_base->next_timer and do not
1166 	 * touch the clock event device.
1167 	 *
1168 	 * This happens when removing the first timer on a remote CPU, which
1169 	 * will be handled by the remote CPU's interrupt. It also happens when
1170 	 * a local timer is removed to be immediately restarted. That's handled
1171 	 * at the call site.
1172 	 */
1173 	if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy)
1174 		return;
1175 
1176 	if (cpu_base->deferred_rearm)
1177 		cpu_base->deferred_needs_update = true;
1178 	else
1179 		hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
1180 }
1181 
1182 static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
1183 				  bool newstate)
1184 {
1185 	lockdep_assert_held(&base->cpu_base->lock);
1186 
1187 	if (timer->is_queued) {
1188 		bool reprogram;
1189 
1190 		debug_hrtimer_deactivate(timer);
1191 
1192 		/*
1193 		 * Remove the timer and force reprogramming when high
1194 		 * resolution mode is active and the timer is on the current
1195 		 * CPU. If we remove a timer on another CPU, reprogramming is
1196 		 * skipped. The interrupt event on this CPU is fired and
1197 		 * reprogramming happens in the interrupt handler. This is a
1198 		 * rare case and less expensive than a smp call.
1199 		 */
1200 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
1201 
1202 		__remove_hrtimer(timer, base, newstate, reprogram);
1203 		return true;
1204 	}
1205 	return false;
1206 }
1207 
1208 /*
1209  * Update in place has to retrieve the expiry times of the neighbour nodes
1210  * if they exist. That is cache line neutral because the dequeue/enqueue
1211  * operation is going to need the same cache lines. But there is a big win
1212  * when the dequeue/enqueue can be avoided because the RB tree does not
1213  * have to be rebalanced twice.
1214  */
1215 static inline bool
1216 hrtimer_can_update_in_place(struct hrtimer *timer, struct hrtimer_clock_base *base, ktime_t expires)
1217 {
1218 	struct timerqueue_linked_node *next = timerqueue_linked_next(&timer->node);
1219 	struct timerqueue_linked_node *prev = timerqueue_linked_prev(&timer->node);
1220 
1221 	/* If the new expiry goes behind the next timer, requeue is required */
1222 	if (next && expires > next->expires)
1223 		return false;
1224 
1225 	/* If this is the first timer, update in place */
1226 	if (!prev)
1227 		return true;
1228 
1229 	/* Update in place when it does not go ahead of the previous one */
1230 	return expires >= prev->expires;
1231 }
1232 
/*
 * Move @timer to the new expiry (@expires, @delta_ns) on the clock base
 * it is already associated with. Tries an in-place expiry update first
 * to avoid rebalancing the timer queue twice.
 *
 * Returns true when @timer is now the first expiring timer of @base.
 */
static inline bool
remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
			     const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns)
{
	bool was_first = false;

	/* Remove it from the timer queue if active */
	if (timer->is_queued) {
		was_first = !timerqueue_linked_prev(&timer->node);

		/* Try to update in place to avoid the de/enqueue dance */
		if (hrtimer_can_update_in_place(timer, base, expires)) {
			hrtimer_set_expires_range_ns(timer, expires, delta_ns);
			trace_hrtimer_start(timer, mode, true);
			if (was_first)
				base->expires_next = expires;
			return was_first;
		}

		/* In-place update not possible: dequeue and start over */
		debug_hrtimer_deactivate(timer);
		timerqueue_linked_del(&base->active, &timer->node);
	}

	/* Set the new expiry time */
	hrtimer_set_expires_range_ns(timer, expires, delta_ns);

	debug_activate(timer, mode, timer->is_queued);
	base->cpu_base->active_bases |= 1 << base->index;

	/* Pairs with the lockless read in hrtimer_is_queued() */
	WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);

	/* If it's the first expiring timer now or again, update base */
	if (timerqueue_linked_add(&base->active, &timer->node)) {
		base->expires_next = expires;
		return true;
	}

	/* The old first timer moved back: recompute the cached expiry */
	if (was_first)
		base_update_next_timer(base);

	return false;
}
1276 
/*
 * Compensate for the low resolution clock granularity. No-op unless
 * CONFIG_TIME_LOW_RES is enabled.
 */
static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
					    const enum hrtimer_mode mode)
{
#ifdef CONFIG_TIME_LOW_RES
	/*
	 * CONFIG_TIME_LOW_RES indicates that the system has no way to return
	 * granular time values. For relative timers we add hrtimer_resolution
	 * (i.e. one jiffy) to prevent short timeouts.
	 */
	timer->is_rel = mode & HRTIMER_MODE_REL;
	if (timer->is_rel)
		tim = ktime_add_safe(tim, hrtimer_resolution);
#endif
	return tim;
}
1292 
/*
 * Re-evaluate the next expiring soft timer of @cpu_base and, when
 * @reprogram is true, reprogram the clock event device for it.
 */
static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
	ktime_t expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);

	/*
	 * Reprogramming needs to be triggered, even if the next soft
	 * hrtimer expires at the same time as the next hard
	 * hrtimer. cpu_base->softirq_expires_next needs to be updated!
	 *
	 * NOTE(review): the early return assumes __hrtimer_get_next_event()
	 * already left softirq_expires_next consistent when no soft timer
	 * is pending — confirm against its implementation.
	 */
	if (expires == KTIME_MAX)
		return;

	/*
	 * cpu_base->next_timer is recomputed by __hrtimer_get_next_event()
	 * cpu_base->expires_next is only set by hrtimer_reprogram()
	 */
	hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
}
1311 
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
/*
 * Placement policy: should a timer being started stay on the local CPU
 * base? Only relevant when timer migration is enabled; otherwise timers
 * simply stay where they are (is_local).
 */
static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
{
	if (static_branch_likely(&timers_migration_enabled)) {
		/*
		 * If it is local and the first expiring timer keep it on the local
		 * CPU to optimize reprogramming of the clockevent device. Also
		 * avoid switch_hrtimer_base() overhead when local and pinned.
		 */
		if (!is_local)
			return false;
		if (is_first || is_pinned)
			return true;

		/* Honour the NOHZ full restrictions */
		if (!housekeeping_cpu(smp_processor_id(), HK_TYPE_KERNEL_NOISE))
			return false;

		/*
		 * If the tick is not stopped or need_resched() is set, then
		 * there is no point in moving the timer somewhere else.
		 */
		return !tick_nohz_tick_stopped() || need_resched();
	}
	return is_local;
}
#else
/* Without SMP + NOHZ there is no migration target: keep local timers local */
static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
{
	return is_local;
}
#endif
1344 
1345 static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool is_first,
1346 				     bool is_pinned)
1347 {
1348 	/* If the timer is running the callback it has to stay on its CPU base. */
1349 	if (unlikely(timer->base->running == timer))
1350 		return true;
1351 
1352 	return hrtimer_prefer_local(is_local, is_first, is_pinned);
1353 }
1354 
/*
 * Internal (re)start helper. Called with the base lock held via
 * lock_hrtimer_base().
 *
 * Returns true when the caller has to reprogram the clock event device
 * because @timer became the first expiring timer; all other
 * reprogramming cases are handled here or by the remote CPU.
 */
static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
				     const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
{
	struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
	bool is_pinned, first, was_first, keep_base = false;
	struct hrtimer_cpu_base *cpu_base = base->cpu_base;

	was_first = cpu_base->next_timer == timer;
	is_pinned = !!(mode & HRTIMER_MODE_PINNED);

	/*
	 * Don't keep it local if this enqueue happens on a unplugged CPU
	 * after hrtimer_cpu_dying() has been invoked.
	 */
	if (likely(this_cpu_base->online)) {
		bool is_local = cpu_base == this_cpu_base;

		keep_base = hrtimer_keep_base(timer, is_local, was_first, is_pinned);
	}

	/* Calculate absolute expiry time for relative timers */
	if (mode & HRTIMER_MODE_REL)
		tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
	/* Compensate for low resolution granularity */
	tim = hrtimer_update_lowres(timer, tim, mode);

	/*
	 * Remove an active timer from the queue. In case it is not queued
	 * on the current CPU, make sure that remove_hrtimer() updates the
	 * remote data correctly.
	 *
	 * If it's on the current CPU and the first expiring timer, then
	 * skip reprogramming, keep the timer local and enforce
	 * reprogramming later if it was the first expiring timer.  This
	 * avoids programming the underlying clock event twice (once at
	 * removal and once after enqueue).
	 *
	 * @keep_base is also true if the timer callback is running on a
	 * remote CPU and for local pinned timers.
	 */
	if (likely(keep_base)) {
		first = remove_and_enqueue_same_base(timer, base, mode, tim, delta_ns);
	} else {
		/* Keep the ENQUEUED state in case it is queued */
		bool was_armed = remove_hrtimer(timer, base, HRTIMER_STATE_ENQUEUED);

		hrtimer_set_expires_range_ns(timer, tim, delta_ns);

		/* Switch the timer base, if necessary: */
		base = switch_hrtimer_base(timer, base, is_pinned);
		cpu_base = base->cpu_base;

		first = enqueue_hrtimer(timer, base, mode, was_armed);
	}

	/* If a deferred rearm is pending skip reprogramming the device */
	if (cpu_base->deferred_rearm) {
		cpu_base->deferred_needs_update = true;
		return false;
	}

	if (!was_first || cpu_base != this_cpu_base) {
		/*
		 * If the current CPU base is online, then the timer is never
		 * queued on a remote CPU if it would be the first expiring
		 * timer there unless the timer callback is currently executed
		 * on the remote CPU. In the latter case the remote CPU will
		 * re-evaluate the first expiring timer after completing the
		 * callbacks.
		 */
		if (likely(hrtimer_base_is_online(this_cpu_base)))
			return first;

		/*
		 * Timer was enqueued remote because the current base is
		 * already offline. If the timer is the first to expire,
		 * kick the remote CPU to reprogram the clock event.
		 */
		if (first)
			smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd);
		return false;
	}

	/*
	 * Special case for the HRTICK timer. It is frequently rearmed and most
	 * of the time moves the expiry into the future. That's expensive in
	 * virtual machines and it's better to take the pointless already armed
	 * interrupt than reprogramming the hardware on every context switch.
	 *
	 * If the new expiry is before the armed time, then reprogramming is
	 * required.
	 */
	if (timer->is_lazy) {
		if (cpu_base->expires_next <= hrtimer_get_expires(timer))
			return false;
	}

	/*
	 * Timer was the first expiring timer and forced to stay on the
	 * current CPU to avoid reprogramming on removal and enqueue. Force
	 * reprogram the hardware by evaluating the new first expiring
	 * timer.
	 */
	hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
	return false;
}
1461 
1462 /**
1463  * hrtimer_start_range_ns - (re)start an hrtimer
1464  * @timer:	the timer to be added
1465  * @tim:	expiry time
1466  * @delta_ns:	"slack" range for the timer
1467  * @mode:	timer mode: absolute (HRTIMER_MODE_ABS) or
1468  *		relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
1469  *		softirq based mode is considered for debug purpose only!
1470  */
1471 void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
1472 			    const enum hrtimer_mode mode)
1473 {
1474 	struct hrtimer_clock_base *base;
1475 	unsigned long flags;
1476 
1477 	debug_hrtimer_assert_init(timer);
1478 
1479 	/*
1480 	 * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
1481 	 * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
1482 	 * expiry mode because unmarked timers are moved to softirq expiry.
1483 	 */
1484 	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
1485 		WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
1486 	else
1487 		WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
1488 
1489 	base = lock_hrtimer_base(timer, &flags);
1490 
1491 	if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
1492 		hrtimer_reprogram(timer, true);
1493 
1494 	unlock_hrtimer_base(timer, &flags);
1495 }
1496 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
1497 
1498 /**
1499  * hrtimer_try_to_cancel - try to deactivate a timer
1500  * @timer:	hrtimer to stop
1501  *
1502  * Returns:
1503  *
1504  *  *  0 when the timer was not active
1505  *  *  1 when the timer was active
1506  *  * -1 when the timer is currently executing the callback function and
1507  *    cannot be stopped
1508  */
1509 int hrtimer_try_to_cancel(struct hrtimer *timer)
1510 {
1511 	struct hrtimer_clock_base *base;
1512 	unsigned long flags;
1513 	int ret = -1;
1514 
1515 	/*
1516 	 * Check lockless first. If the timer is not active (neither
1517 	 * enqueued nor running the callback, nothing to do here.  The
1518 	 * base lock does not serialize against a concurrent enqueue,
1519 	 * so we can avoid taking it.
1520 	 */
1521 	if (!hrtimer_active(timer))
1522 		return 0;
1523 
1524 	base = lock_hrtimer_base(timer, &flags);
1525 
1526 	if (!hrtimer_callback_running(timer)) {
1527 		ret = remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE);
1528 		if (ret)
1529 			trace_hrtimer_cancel(timer);
1530 	}
1531 
1532 	unlock_hrtimer_base(timer, &flags);
1533 
1534 	return ret;
1535 
1536 }
1537 EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
1538 
#ifdef CONFIG_PREEMPT_RT
/* Initialize the per CPU base lock serializing softirq timer expiry */
static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
{
	spin_lock_init(&base->softirq_expiry_lock);
}

/* Held by the softirq across the timer callback execution */
static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
	__acquires(&base->softirq_expiry_lock)
{
	spin_lock(&base->softirq_expiry_lock);
}

static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
	__releases(&base->softirq_expiry_lock)
{
	spin_unlock(&base->softirq_expiry_lock);
}
1556 
1557 /*
1558  * The counterpart to hrtimer_cancel_wait_running().
1559  *
1560  * If there is a waiter for cpu_base->expiry_lock, then it was waiting for
1561  * the timer callback to finish. Drop expiry_lock and reacquire it. That
1562  * allows the waiter to acquire the lock and make progress.
1563  */
1564 static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags)
1565 {
1566 	if (atomic_read(&cpu_base->timer_waiters)) {
1567 		raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1568 		spin_unlock(&cpu_base->softirq_expiry_lock);
1569 		spin_lock(&cpu_base->softirq_expiry_lock);
1570 		raw_spin_lock_irq(&cpu_base->lock);
1571 	}
1572 }
1573 
#ifdef CONFIG_SMP
/* True when @base is the transient base used while a timer migrates */
static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
{
	return base == &migration_base;
}
#else
/* Timers never migrate on UP */
static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
{
	return false;
}
#endif
1585 
1586 /*
1587  * This function is called on PREEMPT_RT kernels when the fast path
1588  * deletion of a timer failed because the timer callback function was
1589  * running.
1590  *
1591  * This prevents priority inversion: if the soft irq thread is preempted
1592  * in the middle of a timer callback, then calling hrtimer_cancel() can
1593  * lead to two issues:
1594  *
1595  *  - If the caller is on a remote CPU then it has to spin wait for the timer
1596  *    handler to complete. This can result in unbound priority inversion.
1597  *
1598  *  - If the caller originates from the task which preempted the timer
1599  *    handler on the same CPU, then spin waiting for the timer handler to
1600  *    complete is never going to end.
1601  */
1602 void hrtimer_cancel_wait_running(const struct hrtimer *timer)
1603 {
1604 	/* Lockless read. Prevent the compiler from reloading it below */
1605 	struct hrtimer_clock_base *base = READ_ONCE(timer->base);
1606 
1607 	/*
1608 	 * Just relax if the timer expires in hard interrupt context or if
1609 	 * it is currently on the migration base.
1610 	 */
1611 	if (!timer->is_soft || is_migration_base(base)) {
1612 		cpu_relax();
1613 		return;
1614 	}
1615 
1616 	/*
1617 	 * Mark the base as contended and grab the expiry lock, which is
1618 	 * held by the softirq across the timer callback. Drop the lock
1619 	 * immediately so the softirq can expire the next timer. In theory
1620 	 * the timer could already be running again, but that's more than
1621 	 * unlikely and just causes another wait loop.
1622 	 */
1623 	atomic_inc(&base->cpu_base->timer_waiters);
1624 	spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
1625 	atomic_dec(&base->cpu_base->timer_waiters);
1626 	spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
1627 }
#else
/* !PREEMPT_RT: no expiry lock needed, cancel waits by spinning */
static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long fl) { }
#endif
1634 
1635 /**
1636  * hrtimer_cancel - cancel a timer and wait for the handler to finish.
1637  * @timer:	the timer to be cancelled
1638  *
1639  * Returns:
1640  *  0 when the timer was not active
1641  *  1 when the timer was active
1642  */
1643 int hrtimer_cancel(struct hrtimer *timer)
1644 {
1645 	int ret;
1646 
1647 	do {
1648 		ret = hrtimer_try_to_cancel(timer);
1649 
1650 		if (ret < 0)
1651 			hrtimer_cancel_wait_running(timer);
1652 	} while (ret < 0);
1653 	return ret;
1654 }
1655 EXPORT_SYMBOL_GPL(hrtimer_cancel);
1656 
1657 /**
1658  * __hrtimer_get_remaining - get remaining time for the timer
1659  * @timer:	the timer to read
1660  * @adjust:	adjust relative timers when CONFIG_TIME_LOW_RES=y
1661  */
1662 ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
1663 {
1664 	unsigned long flags;
1665 	ktime_t rem;
1666 
1667 	lock_hrtimer_base(timer, &flags);
1668 	if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust)
1669 		rem = hrtimer_expires_remaining_adjusted(timer);
1670 	else
1671 		rem = hrtimer_expires_remaining(timer);
1672 	unlock_hrtimer_base(timer, &flags);
1673 
1674 	return rem;
1675 }
1676 EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
1677 
#ifdef CONFIG_NO_HZ_COMMON
/**
 * hrtimer_get_next_event - get the time until next expiry event
 *
 * Returns the next expiry time or KTIME_MAX if no timer is pending.
 */
u64 hrtimer_get_next_event(void)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	u64 expires = KTIME_MAX;

	guard(raw_spinlock_irqsave)(&cpu_base->lock);
	/* Only evaluated when not in high resolution mode */
	if (!hrtimer_hres_active(cpu_base))
		expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);

	return expires;
}
1695 
1696 /**
1697  * hrtimer_next_event_without - time until next expiry event w/o one timer
1698  * @exclude:	timer to exclude
1699  *
1700  * Returns the next expiry time over all timers except for the @exclude one or
1701  * KTIME_MAX if none of them is pending.
1702  */
1703 u64 hrtimer_next_event_without(const struct hrtimer *exclude)
1704 {
1705 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1706 	u64 expires = KTIME_MAX;
1707 	unsigned int active;
1708 
1709 	guard(raw_spinlock_irqsave)(&cpu_base->lock);
1710 	if (!hrtimer_hres_active(cpu_base))
1711 		return expires;
1712 
1713 	active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
1714 	if (active && !cpu_base->softirq_activated)
1715 		expires = hrtimer_bases_next_event_without(cpu_base, exclude, active, KTIME_MAX);
1716 
1717 	active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
1718 	if (!active)
1719 		return expires;
1720 	return hrtimer_bases_next_event_without(cpu_base, exclude, active, expires);
1721 }
1722 #endif
1723 
1724 static inline int hrtimer_clockid_to_base(clockid_t clock_id)
1725 {
1726 	switch (clock_id) {
1727 	case CLOCK_MONOTONIC:
1728 		return HRTIMER_BASE_MONOTONIC;
1729 	case CLOCK_REALTIME:
1730 		return HRTIMER_BASE_REALTIME;
1731 	case CLOCK_BOOTTIME:
1732 		return HRTIMER_BASE_BOOTTIME;
1733 	case CLOCK_TAI:
1734 		return HRTIMER_BASE_TAI;
1735 	default:
1736 		WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
1737 		return HRTIMER_BASE_MONOTONIC;
1738 	}
1739 }
1740 
1741 static ktime_t __hrtimer_cb_get_time(clockid_t clock_id)
1742 {
1743 	switch (clock_id) {
1744 	case CLOCK_MONOTONIC:
1745 		return ktime_get();
1746 	case CLOCK_REALTIME:
1747 		return ktime_get_real();
1748 	case CLOCK_BOOTTIME:
1749 		return ktime_get_boottime();
1750 	case CLOCK_TAI:
1751 		return ktime_get_clocktai();
1752 	default:
1753 		WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
1754 		return ktime_get();
1755 	}
1756 }
1757 
/* Return the current time of the clock which @timer is based on */
ktime_t hrtimer_cb_get_time(const struct hrtimer *timer)
{
	return __hrtimer_cb_get_time(timer->base->clockid);
}
EXPORT_SYMBOL_GPL(hrtimer_cb_get_time);
1763 
/*
 * Common initialization for the hrtimer_setup*() variants: zero the
 * timer, select the clock base and store the callback.
 */
static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(struct hrtimer *),
			    clockid_t clock_id, enum hrtimer_mode mode)
{
	bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
	struct hrtimer_cpu_base *cpu_base;
	int base;

	/*
	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
	 * marked for hard interrupt expiry mode are moved into soft
	 * interrupt context for latency reasons and because the callbacks
	 * can invoke functions which might sleep on RT, e.g. spin_lock().
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD))
		softtimer = true;

	memset(timer, 0, sizeof(struct hrtimer));

	cpu_base = raw_cpu_ptr(&hrtimer_bases);

	/*
	 * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
	 * clock modifications, so they needs to become CLOCK_MONOTONIC to
	 * ensure POSIX compliance.
	 */
	if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
		clock_id = CLOCK_MONOTONIC;

	/* Soft clock bases occupy the upper half of the clock base array */
	base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
	base += hrtimer_clockid_to_base(clock_id);
	timer->is_soft = softtimer;
	timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
	timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM);
	timer->base = &cpu_base->clock_base[base];
	timerqueue_linked_init(&timer->node);

	/* A NULL callback is rejected; install a warning dummy instead */
	if (WARN_ON_ONCE(!fn))
		ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout;
	else
		ACCESS_PRIVATE(timer, function) = fn;
}
1805 
1806 /**
1807  * hrtimer_setup - initialize a timer to the given clock
1808  * @timer:	the timer to be initialized
1809  * @function:	the callback function
1810  * @clock_id:	the clock to be used
1811  * @mode:       The modes which are relevant for initialization:
1812  *              HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
1813  *              HRTIMER_MODE_REL_SOFT
1814  *
1815  *              The PINNED variants of the above can be handed in,
1816  *              but the PINNED bit is ignored as pinning happens
1817  *              when the hrtimer is started
1818  */
1819 void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *),
1820 		   clockid_t clock_id, enum hrtimer_mode mode)
1821 {
1822 	debug_setup(timer, clock_id, mode);
1823 	__hrtimer_setup(timer, function, clock_id, mode);
1824 }
1825 EXPORT_SYMBOL_GPL(hrtimer_setup);
1826 
1827 /**
1828  * hrtimer_setup_on_stack - initialize a timer on stack memory
1829  * @timer:	The timer to be initialized
1830  * @function:	the callback function
1831  * @clock_id:	The clock to be used
1832  * @mode:       The timer mode
1833  *
1834  * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack
1835  * memory.
1836  */
1837 void hrtimer_setup_on_stack(struct hrtimer *timer,
1838 			    enum hrtimer_restart (*function)(struct hrtimer *),
1839 			    clockid_t clock_id, enum hrtimer_mode mode)
1840 {
1841 	debug_setup_on_stack(timer, clock_id, mode);
1842 	__hrtimer_setup(timer, function, clock_id, mode);
1843 }
1844 EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack);
1845 
1846 /*
1847  * A timer is active, when it is enqueued into the rbtree or the
1848  * callback function is running or it's in the state of being migrated
1849  * to another cpu.
1850  *
1851  * It is important for this function to not return a false negative.
1852  */
1853 bool hrtimer_active(const struct hrtimer *timer)
1854 {
1855 	struct hrtimer_clock_base *base;
1856 	unsigned int seq;
1857 
1858 	do {
1859 		base = READ_ONCE(timer->base);
1860 		seq = raw_read_seqcount_begin(&base->seq);
1861 
1862 		if (timer->is_queued || base->running == timer)
1863 			return true;
1864 
1865 	} while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base));
1866 
1867 	return false;
1868 }
1869 EXPORT_SYMBOL_GPL(hrtimer_active);
1870 
1871 /*
1872  * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
1873  * distinct sections:
1874  *
1875  *  - queued:	the timer is queued
 *  - callback:	the timer is being run
1877  *  - post:	the timer is inactive or (re)queued
1878  *
1879  * On the read side we ensure we observe timer->is_queued and cpu_base->running
1880  * from the same section, if anything changed while we looked at it, we retry.
1881  * This includes timer->base changing because sequence numbers alone are
1882  * insufficient for that.
1883  *
1884  * The sequence numbers are required because otherwise we could still observe
1885  * a false negative if the read side got smeared over multiple consecutive
1886  * __run_hrtimer() invocations.
1887  */
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base,
			  struct hrtimer *timer, ktime_t now, unsigned long flags)
	__must_hold(&cpu_base->lock)
{
	enum hrtimer_restart (*fn)(struct hrtimer *);
	bool expires_in_hardirq;
	int restart;

	lockdep_assert_held(&cpu_base->lock);

	debug_hrtimer_deactivate(timer);
	base->running = timer;

	/*
	 * Separate the ->running assignment from the ->is_queued assignment.
	 *
	 * As with a regular write barrier, this ensures the read side in
	 * hrtimer_active() cannot observe base->running == NULL &&
	 * timer->is_queued == INACTIVE.
	 */
	raw_write_seqcount_barrier(&base->seq);

	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, false);
	fn = ACCESS_PRIVATE(timer, function);

	/*
	 * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
	 * timer is restarted with a period then it becomes an absolute
	 * timer. If it's not restarted it does not matter.
	 */
	if (IS_ENABLED(CONFIG_TIME_LOW_RES))
		timer->is_rel = false;

	/*
	 * The timer is marked as running in the CPU base, so it is
	 * protected against migration to a different CPU even if the lock
	 * is dropped.
	 */
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
	trace_hrtimer_expire_entry(timer, now);
	expires_in_hardirq = lockdep_hrtimer_enter(timer);

	/* The callback runs without cpu_base->lock held */
	restart = fn(timer);

	lockdep_hrtimer_exit(expires_in_hardirq);
	trace_hrtimer_expire_exit(timer);
	raw_spin_lock_irq(&cpu_base->lock);

	/*
	 * Note: We clear the running state after enqueue_hrtimer and
	 * we do not reprogram the event hardware. Happens either in
	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
	 *
	 * Note: Because we dropped the cpu_base->lock above,
	 * hrtimer_start_range_ns() can have popped in and enqueued the timer
	 * for us already.
	 */
	if (restart == HRTIMER_RESTART && !timer->is_queued)
		enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false);

	/*
	 * Separate the ->running assignment from the ->is_queued assignment.
	 *
	 * As with a regular write barrier, this ensures the read side in
	 * hrtimer_active() cannot observe base->running == NULL &&
	 * timer->is_queued == INACTIVE.
	 */
	raw_write_seqcount_barrier(&base->seq);

	WARN_ON_ONCE(base->running != timer);
	base->running = NULL;
}
1960 
1961 static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base)
1962 {
1963 	struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
1964 
1965 	return next ? hrtimer_from_timerqueue_node(next) : NULL;
1966 }
1967 
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
				 unsigned long flags, unsigned int active_mask)
{
	/* Only bases which are active AND selected by @active_mask */
	unsigned int active = cpu_base->active_bases & active_mask;
	struct hrtimer_clock_base *base;

	for_each_active_base(base, cpu_base, active) {
		/* Translate @now into the time domain of this clock base */
		ktime_t basenow = ktime_add(now, base->offset);
		struct hrtimer *timer;

		while ((timer = clock_base_next_timer(base))) {
			/*
			 * The immediate goal for using the softexpires is
			 * minimizing wakeups, not running timers at the
			 * earliest interrupt after their soft expiration.
			 * This allows us to avoid using a Priority Search
			 * Tree, which can answer a stabbing query for
			 * overlapping intervals and instead use the simple
			 * BST we already have.
			 * We don't add extra wakeups by delaying timers that
			 * are right-of a not yet expired timer, because that
			 * timer will have to trigger a wakeup anyway.
			 */
			if (basenow < hrtimer_get_softexpires(timer))
				break;

			__run_hrtimer(cpu_base, base, timer, basenow, flags);
			if (active_mask == HRTIMER_ACTIVE_SOFT)
				hrtimer_sync_wait_running(cpu_base, flags);
		}
	}
}
2000 
/*
 * Softirq handler: expire all pending timers on the SOFT clock bases and
 * re-evaluate the next soft expiry afterwards.
 */
static __latent_entropy void hrtimer_run_softirq(void)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	unsigned long flags;
	ktime_t now;

	hrtimer_cpu_base_lock_expiry(cpu_base);
	raw_spin_lock_irqsave(&cpu_base->lock, flags);

	now = hrtimer_update_base(cpu_base);
	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);

	/* Allow the hardirq paths to raise HRTIMER_SOFTIRQ again */
	cpu_base->softirq_activated = false;
	hrtimer_update_softirq_timer(cpu_base, true);

	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
	hrtimer_cpu_base_unlock_expiry(cpu_base);
}
2019 
2020 #ifdef CONFIG_HIGH_RES_TIMERS
2021 
/*
 * Very similar to hrtimer_force_reprogram(), except it deals with
 * deferred_rearm and hang_detected.
 */
static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred)
{
	/* Cache the next expiry and consume a pending deferred rearm request */
	cpu_base->expires_next = expires_next;
	cpu_base->deferred_rearm = false;

	if (unlikely(cpu_base->hang_detected)) {
		/*
		 * Give the system a chance to do something else than looping
		 * on hrtimer interrupts.
		 *
		 * Push the programmed event out by the observed maximum hang
		 * time, capped at 100ms.
		 */
		expires_next = ktime_add_ns(ktime_get(),
					    min(100 * NSEC_PER_MSEC, cpu_base->max_hang_time));
	}
	hrtimer_rearm_event(expires_next, deferred);
}
2041 
2042 #ifdef CONFIG_HRTIMER_REARM_DEFERRED
void __hrtimer_rearm_deferred(void)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	ktime_t expires_next;

	/* Nothing to do unless hrtimer_interrupt() deferred a rearm */
	if (!cpu_base->deferred_rearm)
		return;

	guard(raw_spinlock)(&cpu_base->lock);
	if (cpu_base->deferred_needs_update) {
		/* Timers were added/removed since the deferral: re-evaluate */
		hrtimer_update_base(cpu_base);
		expires_next = hrtimer_update_next_event(cpu_base);
	} else {
		/* No timer added/removed. Use the cached value */
		expires_next = cpu_base->deferred_expires_next;
	}
	hrtimer_rearm(cpu_base, expires_next, true);
}
2061 
static __always_inline void
hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next)
{
	/* hrtimer_interrupt() just re-evaluated the first expiring timer */
	cpu_base->deferred_needs_update = false;
	/* Cache the expiry time */
	cpu_base->deferred_expires_next = expires_next;
	/*
	 * NOTE(review): defers the hardware reprogramming; presumably the
	 * TIF flag handler invokes __hrtimer_rearm_deferred() later -- confirm
	 * against the TIF_HRTIMER_REARM processing site.
	 */
	set_thread_flag(TIF_HRTIMER_REARM);
}
2071 #else  /* CONFIG_HRTIMER_REARM_DEFERRED */
static __always_inline void
hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next)
{
	/* No deferral support: reprogram the event device immediately */
	hrtimer_rearm(cpu_base, expires_next, false);
}
2077 #endif  /* !CONFIG_HRTIMER_REARM_DEFERRED */
2078 
/*
 * High resolution timer interrupt
 * Called with interrupts disabled
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	ktime_t expires_next, now, entry_time, delta;
	unsigned long flags;
	int retries = 0;

	BUG_ON(!cpu_base->hres_active);
	cpu_base->nr_events++;
	dev->next_event = KTIME_MAX;
	dev->next_event_forced = 0;

	raw_spin_lock_irqsave(&cpu_base->lock, flags);
	entry_time = now = hrtimer_update_base(cpu_base);
retry:
	cpu_base->deferred_rearm = true;
	/*
	 * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue
	 * timers while __hrtimer_run_queues() is expiring the clock bases.
	 * Timers which are re/enqueued on the local CPU are not affected by
	 * this.
	 */
	cpu_base->expires_next = KTIME_MAX;

	/* Soft timers due? Hand them off to the HRTIMER_SOFTIRQ handler. */
	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
		cpu_base->softirq_expires_next = KTIME_MAX;
		cpu_base->softirq_activated = true;
		raise_timer_softirq(HRTIMER_SOFTIRQ);
	}

	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);

	/*
	 * The next timer was already expired due to:
	 * - tracing
	 * - long lasting callbacks
	 * - being scheduled away when running in a VM
	 *
	 * We need to prevent that we loop forever in the hrtimer interrupt
	 * routine. We give it 3 attempts to avoid overreacting on some
	 * spurious event.
	 */
	now = hrtimer_update_base(cpu_base);
	expires_next = hrtimer_update_next_event(cpu_base);
	cpu_base->hang_detected = false;
	if (expires_next < now) {
		if (++retries < 3)
			goto retry;

		/* Hang: record the worst case so hrtimer_rearm() can back off */
		delta = ktime_sub(now, entry_time);
		cpu_base->max_hang_time = max_t(unsigned int, cpu_base->max_hang_time, delta);
		cpu_base->nr_hangs++;
		cpu_base->hang_detected = true;
	}

	hrtimer_interrupt_rearm(cpu_base, expires_next);
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
2141 
#endif /* CONFIG_HIGH_RES_TIMERS */
2143 
/*
 * Called from run_local_timers in hardirq context every jiffy
 *
 * Only relevant in low resolution mode; when high resolution mode is
 * active the work is done by hrtimer_interrupt() instead.
 */
void hrtimer_run_queues(void)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	unsigned long flags;
	ktime_t now;

	if (hrtimer_hres_active(cpu_base))
		return;

	/*
	 * This _is_ ugly: We have to check periodically, whether we
	 * can switch to highres and / or nohz mode. The clocksource
	 * switch happens with xtime_lock held. Notification from
	 * there only sets the check bit in the tick_oneshot code,
	 * otherwise we might deadlock vs. xtime_lock.
	 */
	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
		hrtimer_switch_to_hres();
		return;
	}

	raw_spin_lock_irqsave(&cpu_base->lock, flags);
	now = hrtimer_update_base(cpu_base);

	/* Soft timers due? Hand them off to the HRTIMER_SOFTIRQ handler. */
	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
		cpu_base->softirq_expires_next = KTIME_MAX;
		cpu_base->softirq_activated = true;
		raise_timer_softirq(HRTIMER_SOFTIRQ);
	}

	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
2180 
2181 /*
2182  * Sleep related functions:
2183  */
2184 static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
2185 {
2186 	struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer);
2187 	struct task_struct *task = t->task;
2188 
2189 	t->task = NULL;
2190 	if (task)
2191 		wake_up_process(task);
2192 
2193 	return HRTIMER_NORESTART;
2194 }
2195 
/**
 * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
 * @sl:		sleeper to be started
 * @mode:	timer mode abs/rel
 *
 * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
 * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
 */
void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode)
{
	/*
	 * Make the enqueue delivery mode check work on RT. If the sleeper
	 * was initialized for hard interrupt delivery, force the mode bit.
	 * This is a special case for hrtimer_sleepers because
	 * __hrtimer_setup_sleeper() determines the delivery mode on RT so the
	 * fiddling with this decision is avoided at the call sites.
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
		mode |= HRTIMER_MODE_HARD;

	/* Expiry time must have been set beforehand, e.g. hrtimer_set_expires() */
	hrtimer_start_expires(&sl->timer, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
2219 
static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
				    enum hrtimer_mode mode)
{
	/*
	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
	 * marked for hard interrupt expiry mode are moved into soft
	 * interrupt context either for latency reasons or because the
	 * hrtimer callback takes regular spinlocks or invokes other
	 * functions which are not suitable for hard interrupt context on
	 * PREEMPT_RT.
	 *
	 * The hrtimer_sleeper callback is RT compatible in hard interrupt
	 * context, but there is a latency concern: Untrusted userspace can
	 * spawn many threads which arm timers for the same expiry time on
	 * the same CPU. That causes a latency spike due to the wakeup of
	 * a gazillion threads.
	 *
	 * OTOH, privileged real-time user space applications rely on the
	 * low latency of hard interrupt wakeups. If the current task is in
	 * a real-time scheduling class, mark the mode for hard interrupt
	 * expiry.
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
		if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT))
			mode |= HRTIMER_MODE_HARD;
	}

	__hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode);
	/* Wakeup target; cleared by hrtimer_wakeup() when the timer expires */
	sl->task = current;
}
2250 
/**
 * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory
 * @sl:		sleeper to be initialized
 * @clock_id:	the clock to be used
 * @mode:	timer mode abs/rel
 *
 * The sleeper must be torn down with destroy_hrtimer_on_stack() before the
 * stack memory goes out of scope (see hrtimer_nanosleep()).
 */
void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id,
				    enum hrtimer_mode mode)
{
	debug_setup_on_stack(&sl->timer, clock_id, mode);
	__hrtimer_setup_sleeper(sl, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack);
2264 
/*
 * Copy the remaining time of an interrupted nanosleep to user space, in the
 * native or 32bit compat format recorded in the restart block.
 *
 * Returns -EFAULT when the copy out fails, otherwise -ERESTART_RESTARTBLOCK
 * so the syscall is restarted via the restart block.
 */
int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
	switch(restart->nanosleep.type) {
#ifdef CONFIG_COMPAT_32BIT_TIME
	case TT_COMPAT:
		if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp))
			return -EFAULT;
		break;
#endif
	case TT_NATIVE:
		if (put_timespec64(ts, restart->nanosleep.rmtp))
			return -EFAULT;
		break;
	default:
		BUG();
	}
	return -ERESTART_RESTARTBLOCK;
}
2283 
/*
 * Sleep until the sleeper's timer expires or a signal arrives.
 *
 * Returns 0 when the sleep completed, otherwise -ERESTART_RESTARTBLOCK or
 * the result of nanosleep_copyout() for a signal interruption.
 */
static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
{
	struct restart_block *restart;

	do {
		set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
		hrtimer_sleeper_start_expires(t, mode);

		/* t->task is cleared by hrtimer_wakeup() when the timer fired */
		if (likely(t->task))
			schedule();

		hrtimer_cancel(&t->timer);
		/* Further iterations reuse the already set absolute expiry */
		mode = HRTIMER_MODE_ABS;

	} while (t->task && !signal_pending(current));

	__set_current_state(TASK_RUNNING);

	/* Timer expired: the sleep is complete */
	if (!t->task)
		return 0;

	/* Interrupted by a signal: hand the remaining time to user space */
	restart = &current->restart_block;
	if (restart->nanosleep.type != TT_NONE) {
		ktime_t rem = hrtimer_expires_remaining(&t->timer);
		struct timespec64 rmt;

		if (rem <= 0)
			return 0;
		rmt = ktime_to_timespec64(rem);

		return nanosleep_copyout(restart, &rmt);
	}
	return -ERESTART_RESTARTBLOCK;
}
2318 
/*
 * Restart function for a nanosleep which was interrupted by a signal:
 * resume sleeping until the absolute expiry saved in the restart block.
 */
static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
	struct hrtimer_sleeper t;
	int ret;

	hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS);
	hrtimer_set_expires(&t.timer, restart->nanosleep.expires);
	ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
	destroy_hrtimer_on_stack(&t.timer);
	return ret;
}
2330 
/*
 * Common nanosleep implementation: sleep on @clockid until @rqtp
 * (absolute or relative depending on @mode), applying the task's
 * timer slack. Sets up the restart block for interrupted relative sleeps.
 */
long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid)
{
	struct restart_block *restart;
	struct hrtimer_sleeper t;
	int ret;

	hrtimer_setup_sleeper_on_stack(&t, clockid, mode);
	hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns);
	ret = do_nanosleep(&t, mode);
	if (ret != -ERESTART_RESTARTBLOCK)
		goto out;

	/* Absolute timers do not update the rmtp value and restart: */
	if (mode == HRTIMER_MODE_ABS) {
		ret = -ERESTARTNOHAND;
		goto out;
	}

	restart = &current->restart_block;
	restart->nanosleep.clockid = t.timer.base->clockid;
	restart->nanosleep.expires = hrtimer_get_expires(&t.timer);
	set_restart_fn(restart, hrtimer_nanosleep_restart);
out:
	destroy_hrtimer_on_stack(&t.timer);
	return ret;
}
2357 
2358 #ifdef CONFIG_64BIT
2359 
SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
		struct __kernel_timespec __user *, rmtp)
{
	struct timespec64 tu;

	if (get_timespec64(&tu, rqtp))
		return -EFAULT;

	if (!timespec64_valid(&tu))
		return -EINVAL;

	/* A NULL rmtp means the caller does not want the remaining time */
	current->restart_block.fn = do_no_restart_syscall;
	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
	current->restart_block.nanosleep.rmtp = rmtp;
	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
2376 
2377 #endif
2378 
2379 #ifdef CONFIG_COMPAT_32BIT_TIME
2380 
SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
		struct old_timespec32 __user *, rmtp)
{
	struct timespec64 tu;

	if (get_old_timespec32(&tu, rqtp))
		return -EFAULT;

	if (!timespec64_valid(&tu))
		return -EINVAL;

	/* A NULL rmtp means the caller does not want the remaining time */
	current->restart_block.fn = do_no_restart_syscall;
	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
	current->restart_block.nanosleep.compat_rmtp = rmtp;
	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
2397 #endif
2398 
2399 /*
2400  * Functions related to boot-time initialization:
2401  */
2402 int hrtimers_prepare_cpu(unsigned int cpu)
2403 {
2404 	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
2405 
2406 	for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
2407 		struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
2408 
2409 		clock_b->cpu_base = cpu_base;
2410 		seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
2411 		timerqueue_linked_init_head(&clock_b->active);
2412 	}
2413 
2414 	cpu_base->cpu = cpu;
2415 	hrtimer_cpu_base_init_expiry_lock(cpu_base);
2416 	return 0;
2417 }
2418 
2419 int hrtimers_cpu_starting(unsigned int cpu)
2420 {
2421 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
2422 
2423 	/* Clear out any left over state from a CPU down operation */
2424 	cpu_base->active_bases = 0;
2425 	cpu_base->hres_active = false;
2426 	cpu_base->hang_detected = false;
2427 	cpu_base->next_timer = NULL;
2428 	cpu_base->softirq_next_timer = NULL;
2429 	cpu_base->expires_next = KTIME_MAX;
2430 	cpu_base->softirq_expires_next = KTIME_MAX;
2431 	cpu_base->softirq_activated = false;
2432 	cpu_base->online = true;
2433 	return 0;
2434 }
2435 
2436 #ifdef CONFIG_HOTPLUG_CPU
2437 
/*
 * Move every timer queued on @old_base over to @new_base, which belongs
 * to a different CPU. Both base locks are held by the caller.
 */
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
				struct hrtimer_clock_base *new_base)
{
	struct timerqueue_linked_node *node;
	struct hrtimer *timer;

	while ((node = timerqueue_linked_first(&old_base->active))) {
		timer = hrtimer_from_timerqueue_node(node);
		BUG_ON(hrtimer_callback_running(timer));
		debug_hrtimer_deactivate(timer);

		/*
		 * Mark it as ENQUEUED not INACTIVE otherwise the
		 * timer could be seen as !active and just vanish away
		 * under us on another CPU
		 */
		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, false);
		timer->base = new_base;
		/*
		 * Enqueue the timers on the new cpu. This does not
		 * reprogram the event device in case the timer
		 * expires before the earliest on this CPU, but we run
		 * hrtimer_interrupt after we migrated everything to
		 * sort out already expired timers and reprogram the
		 * event device.
		 */
		enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS, true);
	}
}
2467 
int hrtimers_cpu_dying(unsigned int dying_cpu)
{
	/* Pick any active housekeeping CPU as the migration target */
	int ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
	struct hrtimer_cpu_base *old_base, *new_base;

	old_base = this_cpu_ptr(&hrtimer_bases);
	new_base = &per_cpu(hrtimer_bases, ncpu);

	/*
	 * The caller is globally serialized and nobody else
	 * takes two locks at once, deadlock is not possible.
	 */
	raw_spin_lock(&old_base->lock);
	raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);

	for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
		migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]);

	/* Tell the other CPU to retrigger the next event */
	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);

	raw_spin_unlock(&new_base->lock);
	/* Marked offline under old_base->lock; presumably checked by enqueue paths */
	old_base->online = false;
	raw_spin_unlock(&old_base->lock);

	return 0;
}
2495 
2496 #endif /* CONFIG_HOTPLUG_CPU */
2497 
2498 void __init hrtimers_init(void)
2499 {
2500 	hrtimers_prepare_cpu(smp_processor_id());
2501 	hrtimers_cpu_starting(smp_processor_id());
2502 	open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
2503 }
2504