1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> 4 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar 5 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner 6 * 7 * High-resolution kernel timers 8 * 9 * In contrast to the low-resolution timeout API, aka timer wheel, 10 * hrtimers provide finer resolution and accuracy depending on system 11 * configuration and capabilities. 12 * 13 * Started by: Thomas Gleixner and Ingo Molnar 14 * 15 * Credits: 16 * Based on the original timer wheel code 17 * 18 * Help, testing, suggestions, bugfixes, improvements were 19 * provided by: 20 * 21 * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel 22 * et. al. 23 */ 24 25 #include <linux/cpu.h> 26 #include <linux/export.h> 27 #include <linux/percpu.h> 28 #include <linux/hrtimer.h> 29 #include <linux/notifier.h> 30 #include <linux/syscalls.h> 31 #include <linux/interrupt.h> 32 #include <linux/tick.h> 33 #include <linux/err.h> 34 #include <linux/debugobjects.h> 35 #include <linux/sched/signal.h> 36 #include <linux/sched/sysctl.h> 37 #include <linux/sched/rt.h> 38 #include <linux/sched/deadline.h> 39 #include <linux/sched/nohz.h> 40 #include <linux/sched/debug.h> 41 #include <linux/sched/isolation.h> 42 #include <linux/timer.h> 43 #include <linux/freezer.h> 44 #include <linux/compat.h> 45 46 #include <linux/uaccess.h> 47 48 #include <trace/events/timer.h> 49 50 #include "tick-internal.h" 51 52 /* 53 * Constants to set the queued state of the timer (INACTIVE, ENQUEUED) 54 * 55 * The callback state is kept separate in the CPU base because having it in 56 * the timer would required touching the timer after the callback, which 57 * makes it impossible to free the timer from the callback function. 58 * 59 * Therefore we track the callback state in: 60 * 61 * timer->base->cpu_base->running == timer 62 * 63 * On SMP it is possible to have a "callback function running and enqueued" 64 * status. 
It happens for example when a posix timer expired and the callback 65 * queued a signal. Between dropping the lock which protects the posix timer 66 * and reacquiring the base lock of the hrtimer, another CPU can deliver the 67 * signal and rearm the timer. 68 * 69 * All state transitions are protected by cpu_base->lock. 70 */ 71 #define HRTIMER_STATE_INACTIVE false 72 #define HRTIMER_STATE_ENQUEUED true 73 74 /* 75 * The resolution of the clocks. The resolution value is returned in 76 * the clock_getres() system call to give application programmers an 77 * idea of the (in)accuracy of timers. Timer values are rounded up to 78 * this resolution values. 79 */ 80 #define HIGH_RES_NSEC 1 81 82 /* 83 * Masks for selecting the soft and hard context timers from 84 * cpu_base->active 85 */ 86 #define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) 87 #define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) 88 #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) 89 #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) 90 91 static void retrigger_next_event(void *arg); 92 static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); 93 94 /* 95 * The timer bases: 96 * 97 * There are more clockids than hrtimer bases. Thus, we index 98 * into the timer bases by the hrtimer_base_type enum. When trying 99 * to reach a base using a clockid, hrtimer_clockid_to_base() 100 * is used to convert from clockid to the proper hrtimer_base_type. 
101 */ 102 103 #define BASE_INIT(idx, cid) \ 104 [idx] = { .index = idx, .clockid = cid } 105 106 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 107 { 108 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), 109 .clock_base = { 110 BASE_INIT(HRTIMER_BASE_MONOTONIC, CLOCK_MONOTONIC), 111 BASE_INIT(HRTIMER_BASE_REALTIME, CLOCK_REALTIME), 112 BASE_INIT(HRTIMER_BASE_BOOTTIME, CLOCK_BOOTTIME), 113 BASE_INIT(HRTIMER_BASE_TAI, CLOCK_TAI), 114 BASE_INIT(HRTIMER_BASE_MONOTONIC_SOFT, CLOCK_MONOTONIC), 115 BASE_INIT(HRTIMER_BASE_REALTIME_SOFT, CLOCK_REALTIME), 116 BASE_INIT(HRTIMER_BASE_BOOTTIME_SOFT, CLOCK_BOOTTIME), 117 BASE_INIT(HRTIMER_BASE_TAI_SOFT, CLOCK_TAI), 118 }, 119 .csd = CSD_INIT(retrigger_next_event, NULL) 120 }; 121 122 static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) 123 { 124 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 125 return true; 126 else 127 return likely(base->online); 128 } 129 130 #ifdef CONFIG_HIGH_RES_TIMERS 131 DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key); 132 133 static void hrtimer_hres_workfn(struct work_struct *work) 134 { 135 static_branch_enable(&hrtimer_highres_enabled_key); 136 } 137 138 static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn); 139 140 static inline void hrtimer_schedule_hres_work(void) 141 { 142 if (!hrtimer_highres_enabled()) 143 schedule_work(&hrtimer_hres_work); 144 } 145 #else 146 static inline void hrtimer_schedule_hres_work(void) { } 147 #endif 148 149 /* 150 * Functions and macros which are different for UP/SMP systems are kept in a 151 * single place 152 */ 153 #ifdef CONFIG_SMP 154 /* 155 * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() 156 * such that hrtimer_callback_running() can unconditionally dereference 157 * timer->base->cpu_base 158 */ 159 static struct hrtimer_cpu_base migration_cpu_base = { 160 .clock_base = { 161 [0] = { 162 .cpu_base = &migration_cpu_base, 163 .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, 164 
&migration_cpu_base.lock), 165 }, 166 }, 167 }; 168 169 #define migration_base migration_cpu_base.clock_base[0] 170 171 /* 172 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock 173 * means that all timers which are tied to this base via timer->base are 174 * locked, and the base itself is locked too. 175 * 176 * So __run_timers/migrate_timers can safely modify all timers which could 177 * be found on the lists/queues. 178 * 179 * When the timer's base is locked, and the timer removed from list, it is 180 * possible to set timer->base = &migration_base and drop the lock: the timer 181 * remains locked. 182 */ 183 static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, 184 unsigned long *flags) 185 __acquires(&timer->base->lock) 186 { 187 for (;;) { 188 struct hrtimer_clock_base *base = READ_ONCE(timer->base); 189 190 if (likely(base != &migration_base)) { 191 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); 192 if (likely(base == timer->base)) 193 return base; 194 /* The timer has migrated to another CPU: */ 195 raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); 196 } 197 cpu_relax(); 198 } 199 } 200 201 /* 202 * Check if the elected target is suitable considering its next 203 * event and the hotplug state of the current CPU. 204 * 205 * If the elected target is remote and its next event is after the timer 206 * to queue, then a remote reprogram is necessary. However there is no 207 * guarantee the IPI handling the operation would arrive in time to meet 208 * the high resolution deadline. In this case the local CPU becomes a 209 * preferred target, unless it is offline. 210 * 211 * High and low resolution modes are handled the same way for simplicity. 212 * 213 * Called with cpu_base->lock of target cpu held. 
 */
static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
				    struct hrtimer_cpu_base *new_cpu_base,
				    struct hrtimer_cpu_base *this_cpu_base)
{
	ktime_t expires;

	/*
	 * The local CPU clockevent can be reprogrammed. Also get_target_base()
	 * guarantees it is online.
	 */
	if (new_cpu_base == this_cpu_base)
		return true;

	/*
	 * The offline local CPU can't be the default target if the
	 * next remote target event is after this timer. Keep the
	 * elected new base. An IPI will be issued to reprogram
	 * it as a last resort.
	 */
	if (!hrtimer_base_is_online(this_cpu_base))
		return true;

	/* Compare against the remote base's clock by removing its offset */
	expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);

	return expires >= new_base->cpu_base->expires_next;
}

/*
 * Select the CPU base on which to enqueue a timer:
 *
 * If the local base is offline, fall back to any online housekeeping
 * CPU. Otherwise, when timer migration is enabled and the timer is not
 * pinned, let get_nohz_timer_target() pick a power-optimized target;
 * else stay on the local base.
 */
static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, bool pinned)
{
	if (!hrtimer_base_is_online(base)) {
		int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));

		return &per_cpu(hrtimer_bases, cpu);
	}

#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
	if (static_branch_likely(&timers_migration_enabled) && !pinned)
		return &per_cpu(hrtimer_bases, get_nohz_timer_target());
#endif
	return base;
}

/*
 * We switch the timer base to a power-optimized selected CPU target,
 * if:
 *	- NO_HZ_COMMON is enabled
 *	- timer migration is enabled
 *	- the timer callback is not running
 *	- the timer is not the first expiring timer on the new target
 *
 * If one of the above requirements is not fulfilled we move the timer
 * to the current CPU or leave it on the previously assigned CPU if
 * the timer callback is currently running.
268 */ 269 static inline struct hrtimer_clock_base * 270 switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, bool pinned) 271 { 272 struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; 273 struct hrtimer_clock_base *new_base; 274 int basenum = base->index; 275 276 this_cpu_base = this_cpu_ptr(&hrtimer_bases); 277 new_cpu_base = get_target_base(this_cpu_base, pinned); 278 again: 279 new_base = &new_cpu_base->clock_base[basenum]; 280 281 if (base != new_base) { 282 /* 283 * We are trying to move timer to new_base. However we can't 284 * change timer's base while it is running, so we keep it on 285 * the same CPU. No hassle vs. reprogramming the event source 286 * in the high resolution case. The remote CPU will take care 287 * of this when the timer function has completed. There is no 288 * conflict as we hold the lock until the timer is enqueued. 289 */ 290 if (unlikely(hrtimer_callback_running(timer))) 291 return base; 292 293 /* See the comment in lock_hrtimer_base() */ 294 WRITE_ONCE(timer->base, &migration_base); 295 raw_spin_unlock(&base->cpu_base->lock); 296 raw_spin_lock(&new_base->cpu_base->lock); 297 298 if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { 299 raw_spin_unlock(&new_base->cpu_base->lock); 300 raw_spin_lock(&base->cpu_base->lock); 301 new_cpu_base = this_cpu_base; 302 WRITE_ONCE(timer->base, base); 303 goto again; 304 } 305 WRITE_ONCE(timer->base, new_base); 306 } else { 307 if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { 308 new_cpu_base = this_cpu_base; 309 goto again; 310 } 311 } 312 return new_base; 313 } 314 315 #else /* CONFIG_SMP */ 316 317 static inline struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, 318 unsigned long *flags) 319 __acquires(&timer->base->cpu_base->lock) 320 { 321 struct hrtimer_clock_base *base = timer->base; 322 323 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); 324 return base; 325 } 326 327 # define 
switch_hrtimer_base(t, b, p) (b) 328 329 #endif /* !CONFIG_SMP */ 330 331 /* 332 * Functions for the union type storage format of ktime_t which are 333 * too large for inlining: 334 */ 335 #if BITS_PER_LONG < 64 336 /* 337 * Divide a ktime value by a nanosecond value 338 */ 339 s64 __ktime_divns(const ktime_t kt, s64 div) 340 { 341 int sft = 0; 342 s64 dclc; 343 u64 tmp; 344 345 dclc = ktime_to_ns(kt); 346 tmp = dclc < 0 ? -dclc : dclc; 347 348 /* Make sure the divisor is less than 2^32: */ 349 while (div >> 32) { 350 sft++; 351 div >>= 1; 352 } 353 tmp >>= sft; 354 do_div(tmp, (u32) div); 355 return dclc < 0 ? -tmp : tmp; 356 } 357 EXPORT_SYMBOL_GPL(__ktime_divns); 358 #endif /* BITS_PER_LONG < 64 */ 359 360 /* 361 * Add two ktime values and do a safety check for overflow: 362 */ 363 ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) 364 { 365 ktime_t res = ktime_add_unsafe(lhs, rhs); 366 367 /* 368 * We use KTIME_SEC_MAX here, the maximum timeout which we can 369 * return to user space in a timespec: 370 */ 371 if (res < 0 || res < lhs || res < rhs) 372 res = ktime_set(KTIME_SEC_MAX, 0); 373 374 return res; 375 } 376 377 EXPORT_SYMBOL_GPL(ktime_add_safe); 378 379 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS 380 381 static const struct debug_obj_descr hrtimer_debug_descr; 382 383 static void *hrtimer_debug_hint(void *addr) 384 { 385 return ACCESS_PRIVATE((struct hrtimer *)addr, function); 386 } 387 388 /* 389 * fixup_init is called when: 390 * - an active object is initialized 391 */ 392 static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) 393 { 394 struct hrtimer *timer = addr; 395 396 switch (state) { 397 case ODEBUG_STATE_ACTIVE: 398 hrtimer_cancel(timer); 399 debug_object_init(timer, &hrtimer_debug_descr); 400 return true; 401 default: 402 return false; 403 } 404 } 405 406 /* 407 * fixup_activate is called when: 408 * - an active object is activated 409 * - an unknown non-static object is activated 410 */ 411 static bool 
hrtimer_fixup_activate(void *addr, enum debug_obj_state state) 412 { 413 switch (state) { 414 case ODEBUG_STATE_ACTIVE: 415 WARN_ON(1); 416 fallthrough; 417 default: 418 return false; 419 } 420 } 421 422 /* 423 * fixup_free is called when: 424 * - an active object is freed 425 */ 426 static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) 427 { 428 struct hrtimer *timer = addr; 429 430 switch (state) { 431 case ODEBUG_STATE_ACTIVE: 432 hrtimer_cancel(timer); 433 debug_object_free(timer, &hrtimer_debug_descr); 434 return true; 435 default: 436 return false; 437 } 438 } 439 440 /* Stub timer callback for improperly used timers. */ 441 static enum hrtimer_restart stub_timer(struct hrtimer *unused) 442 { 443 WARN_ON_ONCE(1); 444 return HRTIMER_NORESTART; 445 } 446 447 /* 448 * hrtimer_fixup_assert_init is called when: 449 * - an untracked/uninit-ed object is found 450 */ 451 static bool hrtimer_fixup_assert_init(void *addr, enum debug_obj_state state) 452 { 453 struct hrtimer *timer = addr; 454 455 switch (state) { 456 case ODEBUG_STATE_NOTAVAILABLE: 457 hrtimer_setup(timer, stub_timer, CLOCK_MONOTONIC, 0); 458 return true; 459 default: 460 return false; 461 } 462 } 463 464 static const struct debug_obj_descr hrtimer_debug_descr = { 465 .name = "hrtimer", 466 .debug_hint = hrtimer_debug_hint, 467 .fixup_init = hrtimer_fixup_init, 468 .fixup_activate = hrtimer_fixup_activate, 469 .fixup_free = hrtimer_fixup_free, 470 .fixup_assert_init = hrtimer_fixup_assert_init, 471 }; 472 473 static inline void debug_hrtimer_init(struct hrtimer *timer) 474 { 475 debug_object_init(timer, &hrtimer_debug_descr); 476 } 477 478 static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) 479 { 480 debug_object_init_on_stack(timer, &hrtimer_debug_descr); 481 } 482 483 static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) 484 { 485 debug_object_activate(timer, &hrtimer_debug_descr); 486 } 487 488 static inline void 
debug_hrtimer_deactivate(struct hrtimer *timer) 489 { 490 debug_object_deactivate(timer, &hrtimer_debug_descr); 491 } 492 493 static inline void debug_hrtimer_assert_init(struct hrtimer *timer) 494 { 495 debug_object_assert_init(timer, &hrtimer_debug_descr); 496 } 497 498 void destroy_hrtimer_on_stack(struct hrtimer *timer) 499 { 500 debug_object_free(timer, &hrtimer_debug_descr); 501 } 502 EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); 503 504 #else 505 506 static inline void debug_hrtimer_init(struct hrtimer *timer) { } 507 static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } 508 static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } 509 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 510 static inline void debug_hrtimer_assert_init(struct hrtimer *timer) { } 511 #endif 512 513 static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) 514 { 515 debug_hrtimer_init(timer); 516 trace_hrtimer_setup(timer, clockid, mode); 517 } 518 519 static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid, 520 enum hrtimer_mode mode) 521 { 522 debug_hrtimer_init_on_stack(timer); 523 trace_hrtimer_setup(timer, clockid, mode); 524 } 525 526 static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode, bool was_armed) 527 { 528 debug_hrtimer_activate(timer, mode); 529 trace_hrtimer_start(timer, mode, was_armed); 530 } 531 532 #define for_each_active_base(base, cpu_base, active) \ 533 for (unsigned int idx = ffs(active); idx--; idx = ffs((active))) \ 534 for (bool done = false; !done; active &= ~(1U << idx)) \ 535 for (base = &cpu_base->clock_base[idx]; !done; done = true) 536 537 #define hrtimer_from_timerqueue_node(_n) container_of_const(_n, struct hrtimer, node) 538 539 #if defined(CONFIG_NO_HZ_COMMON) 540 /* 541 * Same as hrtimer_bases_next_event() below, but skips the excluded timer and 542 * does not update 
 * cpu_base->next_timer/expires.
 */
static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_base,
						const struct hrtimer *exclude,
						unsigned int active, ktime_t expires_next)
{
	struct hrtimer_clock_base *base;
	ktime_t expires;

	lockdep_assert_held(&cpu_base->lock);

	for_each_active_base(base, cpu_base, active) {
		/* Expiry of the base's first timer, in CLOCK_MONOTONIC terms */
		expires = ktime_sub(base->expires_next, base->offset);
		if (expires >= expires_next)
			continue;

		/*
		 * If the excluded timer is the first on this base evaluate the
		 * next timer.
		 */
		struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active);

		if (unlikely(&exclude->node == node)) {
			node = timerqueue_linked_next(node);
			if (!node)
				continue;
			expires = ktime_sub(node->expires, base->offset);
			if (expires >= expires_next)
				continue;
		}
		expires_next = expires;
	}
	/* If base->offset changed, the result might be negative */
	return max(expires_next, 0);
}
#endif

/*
 * Return the first (earliest expiring) timer queued on @base.
 *
 * NOTE(review): assumes @base->active is non-empty — with an empty queue
 * timerqueue_linked_first() would yield NULL. Callers appear to guard via
 * the active_bases bitmask; confirm for new call sites.
 */
static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base)
{
	struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);

	return hrtimer_from_timerqueue_node(next);
}

/* Find the base with the earliest expiry */
static void hrtimer_bases_first(struct hrtimer_cpu_base *cpu_base,unsigned int active,
				ktime_t *expires_next, struct hrtimer **next_timer)
{
	struct hrtimer_clock_base *base;
	ktime_t expires;

	for_each_active_base(base, cpu_base, active) {
		expires = ktime_sub(base->expires_next, base->offset);
		if (expires < *expires_next) {
			*expires_next = expires;
			*next_timer = clock_base_next_timer(base);
		}
	}
}

/*
 * Recomputes cpu_base::*next_timer and returns the earliest expires_next
 * but does not set cpu_base::*expires_next, that is done by
 * hrtimer[_force]_reprogram and hrtimer_interrupt
 * only. When updating
 * cpu_base::*expires_next right away, reprogramming logic would no longer
 * work.
 *
 * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
 * those timers will get run whenever the softirq gets handled, at the end of
 * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
 *
 * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
 * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
 * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
 *
 * @active_mask must be one of:
 *  - HRTIMER_ACTIVE_ALL,
 *  - HRTIMER_ACTIVE_SOFT, or
 *  - HRTIMER_ACTIVE_HARD.
 */
static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
{
	struct hrtimer *next_timer = NULL;
	ktime_t expires_next = KTIME_MAX;
	unsigned int active;

	lockdep_assert_held(&cpu_base->lock);

	if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
		active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
		if (active)
			hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
		/* May be NULL when no soft base is active */
		cpu_base->softirq_next_timer = next_timer;
	}

	if (active_mask & HRTIMER_ACTIVE_HARD) {
		active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
		if (active)
			hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
		cpu_base->next_timer = next_timer;
	}
	/* If a base->offset changed, the result might be negative */
	return max(expires_next, 0);
}

static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
{
	ktime_t expires_next, soft = KTIME_MAX;

	/*
	 * If the soft interrupt has already been activated, ignore the
	 * soft bases. They will be handled in the already raised soft
	 * interrupt.
654 */ 655 if (!cpu_base->softirq_activated) { 656 soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); 657 /* 658 * Update the soft expiry time. clock_settime() might have 659 * affected it. 660 */ 661 cpu_base->softirq_expires_next = soft; 662 } 663 664 expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); 665 /* 666 * If a softirq timer is expiring first, update cpu_base->next_timer 667 * and program the hardware with the soft expiry time. 668 */ 669 if (expires_next > soft) { 670 cpu_base->next_timer = cpu_base->softirq_next_timer; 671 expires_next = soft; 672 } 673 674 return expires_next; 675 } 676 677 static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) 678 { 679 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; 680 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; 681 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; 682 683 ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, 684 offs_boot, offs_tai); 685 686 base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; 687 base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; 688 base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; 689 690 return now; 691 } 692 693 /* 694 * Is the high resolution mode active in the CPU base. This cannot use the 695 * static key as the CPUs are switched to high resolution mode 696 * asynchronously. 697 */ 698 static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) 699 { 700 return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? 
		cpu_base->hres_active : 0;
}

/* Trace the rearm and program the clockevent device for the next expiry */
static inline void hrtimer_rearm_event(ktime_t expires_next, bool deferred)
{
	trace_hrtimer_rearm(expires_next, deferred);
	tick_program_event(expires_next, 1);
}

/*
 * Record @expires_next in @cpu_base and, when high resolution mode is
 * active and no interrupt hang was detected, reprogram the clockevent
 * device. Caller holds cpu_base->lock.
 */
static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer,
				ktime_t expires_next)
{
	cpu_base->expires_next = expires_next;

	/*
	 * If hres is not active, hardware does not have to be
	 * reprogrammed yet.
	 *
	 * If a hang was detected in the last timer interrupt then we
	 * leave the hang delay active in the hardware. We want the
	 * system to make progress. That also prevents the following
	 * scenario:
	 * T1 expires 50ms from now
	 * T2 expires 5s from now
	 *
	 * T1 is removed, so this code is called and would reprogram
	 * the hardware to 5s from now. Any hrtimer_start after that
	 * will not reprogram the hardware due to hang_detected being
	 * set. So we'd effectively block all timers until the T2 event
	 * fires.
	 */
	if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
		return;

	hrtimer_rearm_event(expires_next, false);
}

/* Reprogram the event source with an evaluation of all clock bases */
static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, bool skip_equal)
{
	ktime_t expires_next = hrtimer_update_next_event(cpu_base);

	/* Nothing changed: avoid touching the hardware */
	if (skip_equal && expires_next == cpu_base->expires_next)
		return;

	__hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next);
}

/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS

/* High resolution timer enabled ?
*/ 753 static bool hrtimer_hres_enabled __read_mostly = true; 754 unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; 755 EXPORT_SYMBOL_GPL(hrtimer_resolution); 756 757 /* Enable / Disable high resolution mode */ 758 static int __init setup_hrtimer_hres(char *str) 759 { 760 return (kstrtobool(str, &hrtimer_hres_enabled) == 0); 761 } 762 __setup("highres=", setup_hrtimer_hres); 763 764 /* hrtimer_high_res_enabled - query, if the highres mode is enabled */ 765 static inline bool hrtimer_is_hres_enabled(void) 766 { 767 return hrtimer_hres_enabled; 768 } 769 770 /* Switch to high resolution mode */ 771 static void hrtimer_switch_to_hres(void) 772 { 773 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); 774 775 if (tick_init_highres()) { 776 pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); 777 return; 778 } 779 base->hres_active = true; 780 hrtimer_resolution = HIGH_RES_NSEC; 781 782 tick_setup_sched_timer(true); 783 /* "Retrigger" the interrupt to get things going */ 784 retrigger_next_event(NULL); 785 hrtimer_schedule_hres_work(); 786 } 787 788 #else 789 790 static inline bool hrtimer_is_hres_enabled(void) { return 0; } 791 static inline void hrtimer_switch_to_hres(void) { } 792 793 #endif /* CONFIG_HIGH_RES_TIMERS */ 794 795 /* 796 * Retrigger next event is called after clock was set with interrupts 797 * disabled through an SMP function call or directly from low level 798 * resume code. 799 * 800 * This is only invoked when: 801 * - CONFIG_HIGH_RES_TIMERS is enabled. 802 * - CONFIG_NOHZ_COMMON is enabled 803 * 804 * For the other cases this function is empty and because the call sites 805 * are optimized out it vanishes as well, i.e. no need for lots of 806 * #ifdeffery. 
807 */ 808 static void retrigger_next_event(void *arg) 809 { 810 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); 811 812 /* 813 * When high resolution mode or nohz is active, then the offsets of 814 * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the 815 * next tick will take care of that. 816 * 817 * If high resolution mode is active then the next expiring timer 818 * must be reevaluated and the clock event device reprogrammed if 819 * necessary. 820 * 821 * In the NOHZ case the update of the offset and the reevaluation 822 * of the next expiring timer is enough. The return from the SMP 823 * function call will take care of the reprogramming in case the 824 * CPU was in a NOHZ idle sleep. 825 * 826 * In periodic low resolution mode, the next softirq expiration 827 * must also be updated. 828 */ 829 guard(raw_spinlock)(&base->lock); 830 hrtimer_update_base(base); 831 if (hrtimer_hres_active(base)) 832 hrtimer_force_reprogram(base, /* skip_equal */ false); 833 else 834 hrtimer_update_next_event(base); 835 } 836 837 /* 838 * When a timer is enqueued and expires earlier than the already enqueued 839 * timers, we have to check, whether it expires earlier than the timer for 840 * which the clock event device was armed. 841 * 842 * Called with interrupts disabled and base->cpu_base.lock held 843 */ 844 static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) 845 { 846 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 847 struct hrtimer_clock_base *base = timer->base; 848 ktime_t expires = hrtimer_get_expires(timer); 849 850 WARN_ON_ONCE(expires < 0); 851 852 expires = ktime_sub(expires, base->offset); 853 /* 854 * CLOCK_REALTIME timer might be requested with an absolute 855 * expiry time which is less than base->offset. Set it to 0. 856 */ 857 if (expires < 0) 858 expires = 0; 859 860 if (timer->is_soft) { 861 /* 862 * soft hrtimer could be started on a remote CPU. 
In this 863 * case softirq_expires_next needs to be updated on the 864 * remote CPU. The soft hrtimer will not expire before the 865 * first hard hrtimer on the remote CPU - 866 * hrtimer_check_target() prevents this case. 867 */ 868 struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; 869 870 if (timer_cpu_base->softirq_activated) 871 return; 872 873 if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) 874 return; 875 876 timer_cpu_base->softirq_next_timer = timer; 877 timer_cpu_base->softirq_expires_next = expires; 878 879 if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) 880 return; 881 } 882 883 /* 884 * If the timer is not on the current cpu, we cannot reprogram 885 * the other cpus clock event device. 886 */ 887 if (base->cpu_base != cpu_base) 888 return; 889 890 if (expires >= cpu_base->expires_next) 891 return; 892 893 /* If a deferred rearm is pending skip reprogramming the device */ 894 if (cpu_base->deferred_rearm) 895 return; 896 897 cpu_base->next_timer = timer; 898 899 __hrtimer_reprogram(cpu_base, timer, expires); 900 } 901 902 static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active) 903 { 904 struct hrtimer_clock_base *base; 905 unsigned int seq; 906 ktime_t expires; 907 908 /* 909 * Update the base offsets unconditionally so the following 910 * checks whether the SMP function call is required works. 911 * 912 * The update is safe even when the remote CPU is in the hrtimer 913 * interrupt or the hrtimer soft interrupt and expiring affected 914 * bases. Either it will see the update before handling a base or 915 * it will see it when it finishes the processing and reevaluates 916 * the next expiring timer. 917 */ 918 seq = cpu_base->clock_was_set_seq; 919 hrtimer_update_base(cpu_base); 920 921 /* 922 * If the sequence did not change over the update then the 923 * remote CPU already handled it. 
924 */ 925 if (seq == cpu_base->clock_was_set_seq) 926 return false; 927 928 /* If a deferred rearm is pending the remote CPU will take care of it */ 929 if (cpu_base->deferred_rearm) { 930 cpu_base->deferred_needs_update = true; 931 return false; 932 } 933 934 /* 935 * Walk the affected clock bases and check whether the first expiring 936 * timer in a clock base is moving ahead of the first expiring timer of 937 * @cpu_base. If so, the IPI must be invoked because per CPU clock 938 * event devices cannot be remotely reprogrammed. 939 */ 940 active &= cpu_base->active_bases; 941 942 for_each_active_base(base, cpu_base, active) { 943 struct timerqueue_linked_node *next; 944 945 next = timerqueue_linked_first(&base->active); 946 expires = ktime_sub(next->expires, base->offset); 947 if (expires < cpu_base->expires_next) 948 return true; 949 950 /* Extra check for softirq clock bases */ 951 if (base->index < HRTIMER_BASE_MONOTONIC_SOFT) 952 continue; 953 if (cpu_base->softirq_activated) 954 continue; 955 if (expires < cpu_base->softirq_expires_next) 956 return true; 957 } 958 return false; 959 } 960 961 /* 962 * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and 963 * CLOCK_BOOTTIME (for late sleep time injection). 964 * 965 * This requires to update the offsets for these clocks 966 * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this 967 * also requires to eventually reprogram the per CPU clock event devices 968 * when the change moves an affected timer ahead of the first expiring 969 * timer on that CPU. Obviously remote per CPU clock event devices cannot 970 * be reprogrammed. The other reason why an IPI has to be sent is when the 971 * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets 972 * in the tick, which obviously might be stopped, so this has to bring out 973 * the remote CPU which might sleep in idle to get this sorted. 
974 */ 975 void clock_was_set(unsigned int bases) 976 { 977 cpumask_var_t mask; 978 979 if (!hrtimer_highres_enabled() && !tick_nohz_is_active()) 980 goto out_timerfd; 981 982 if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { 983 on_each_cpu(retrigger_next_event, NULL, 1); 984 goto out_timerfd; 985 } 986 987 /* Avoid interrupting CPUs if possible */ 988 scoped_guard(cpus_read_lock) { 989 int cpu; 990 991 for_each_online_cpu(cpu) { 992 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 993 994 guard(raw_spinlock_irqsave)(&cpu_base->lock); 995 if (update_needs_ipi(cpu_base, bases)) 996 cpumask_set_cpu(cpu, mask); 997 } 998 scoped_guard(preempt) 999 smp_call_function_many(mask, retrigger_next_event, NULL, 1); 1000 } 1001 free_cpumask_var(mask); 1002 1003 out_timerfd: 1004 timerfd_clock_was_set(); 1005 } 1006 1007 static void clock_was_set_work(struct work_struct *work) 1008 { 1009 clock_was_set(CLOCK_SET_WALL); 1010 } 1011 1012 static DECLARE_WORK(hrtimer_work, clock_was_set_work); 1013 1014 /* 1015 * Called from timekeeping code to reprogram the hrtimer interrupt device 1016 * on all cpus and to notify timerfd. 1017 */ 1018 void clock_was_set_delayed(void) 1019 { 1020 schedule_work(&hrtimer_work); 1021 } 1022 1023 /* 1024 * Called during resume either directly from via timekeeping_resume() 1025 * or in the case of s2idle from tick_unfreeze() to ensure that the 1026 * hrtimers are up to date. 
 */
void hrtimers_resume_local(void)
{
	lockdep_assert_irqs_disabled();
	/* Retrigger on the local CPU */
	retrigger_next_event(NULL);
}

/* Counterpart to lock_hrtimer_base above */
static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
	__releases(&timer->base->cpu_base->lock)
{
	raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}

/**
 * hrtimer_forward() - forward the timer expiry
 * @timer:	hrtimer to forward
 * @now:	forward past this time
 * @interval:	the interval to forward
 *
 * Forward the timer expiry so it will expire in the future.
 *
 * .. note::
 *  This only updates the timer expiry value and does not requeue the timer.
 *
 * There is also a variant of this function: hrtimer_forward_now().
 *
 * Context: Can be safely called from the callback function of @timer. If called
 *          from other contexts @timer must neither be enqueued nor running the
 *          callback and the caller needs to take care of serialization.
 *
 * Return: The number of overruns are returned.
 */
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
	ktime_t delta;
	u64 orun = 1;

	delta = ktime_sub(now, hrtimer_get_expires(timer));

	/* Nothing to forward when the expiry is already in the future */
	if (delta < 0)
		return 0;

	/* Forwarding a queued timer would corrupt the ordered timer queue */
	if (WARN_ON(timer->is_queued))
		return 0;

	/* Clamp the interval to the minimum the clock can resolve */
	if (interval < hrtimer_resolution)
		interval = hrtimer_resolution;

	if (unlikely(delta >= interval)) {
		s64 incr = ktime_to_ns(interval);

		orun = ktime_divns(delta, incr);
		hrtimer_add_expires_ns(timer, incr * orun);
		if (hrtimer_get_expires(timer) > now)
			return orun;
		/*
		 * This (and the ktime_add() below) is the
		 * correction for exact:
		 */
		orun++;
	}
	hrtimer_add_expires(timer, interval);

	return orun;
}
EXPORT_SYMBOL_GPL(hrtimer_forward);

/*
 * enqueue_hrtimer - internal function to (re)start a timer
 *
 * The timer is inserted in expiry order. Insertion into the
 * red black tree is O(log(n)).
 *
 * Returns true when the new timer is the leftmost timer in the tree.
 */
static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
			    enum hrtimer_mode mode, bool was_armed)
{
	lockdep_assert_held(&base->cpu_base->lock);

	debug_activate(timer, mode, was_armed);
	WARN_ON_ONCE(!base->cpu_base->online);

	/* Mark the clock base as having queued timers */
	base->cpu_base->active_bases |= 1 << base->index;

	/* Pairs with the lockless read in hrtimer_is_queued() */
	WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);

	if (!timerqueue_linked_add(&base->active, &timer->node))
		return false;

	/* The new timer is the first expiring timer of this clock base */
	base->expires_next = hrtimer_get_expires(timer);
	return true;
}

/* Cache the expiry of the first queued timer, KTIME_MAX when the base is empty */
static inline void base_update_next_timer(struct hrtimer_clock_base *base)
{
	struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);

	base->expires_next = next ? next->expires : KTIME_MAX;
}

/*
 * __remove_hrtimer - internal function to remove a timer
 *
 * High resolution timer mode reprograms the clock event device when the
 * timer is the one which expires next. The caller can disable this by setting
 * reprogram to zero. This is useful, when the context does a reprogramming
 * anyway (e.g. timer interrupt)
 */
static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
			     bool newstate, bool reprogram)
{
	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
	bool was_first;

	lockdep_assert_held(&cpu_base->lock);

	if (!timer->is_queued)
		return;

	/* Pairs with the lockless read in hrtimer_is_queued() */
	WRITE_ONCE(timer->is_queued, newstate);

	was_first = !timerqueue_linked_prev(&timer->node);

	/* Clear the base from active_bases when its last timer was removed */
	if (!timerqueue_linked_del(&base->active, &timer->node))
		cpu_base->active_bases &= ~(1 << base->index);

	/* Nothing to update if this was not the first timer in the base */
	if (!was_first)
		return;

	base_update_next_timer(base);

	/*
	 * If reprogram is false don't update cpu_base->next_timer and do not
	 * touch the clock event device.
	 *
	 * This happens when removing the first timer on a remote CPU, which
	 * will be handled by the remote CPU's interrupt. It also happens when
	 * a local timer is removed to be immediately restarted. That's handled
	 * at the call site.
	 */
	if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy)
		return;

	/* Defer the hardware reprogram when a deferred rearm is pending */
	if (cpu_base->deferred_rearm)
		cpu_base->deferred_needs_update = true;
	else
		hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
}

static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
				  bool newstate)
{
	lockdep_assert_held(&base->cpu_base->lock);

	if (timer->is_queued) {
		bool reprogram;

		debug_hrtimer_deactivate(timer);

		/*
		 * Remove the timer and force reprogramming when high
		 * resolution mode is active and the timer is on the current
		 * CPU. If we remove a timer on another CPU, reprogramming is
		 * skipped. The interrupt event on this CPU is fired and
		 * reprogramming happens in the interrupt handler. This is a
		 * rare case and less expensive than a smp call.
		 */
		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);

		__remove_hrtimer(timer, base, newstate, reprogram);
		return true;
	}
	return false;
}

/*
 * Update in place has to retrieve the expiry times of the neighbour nodes
 * if they exist. That is cache line neutral because the dequeue/enqueue
 * operation is going to need the same cache lines. But there is a big win
 * when the dequeue/enqueue can be avoided because the RB tree does not
 * have to be rebalanced twice.
 */
static inline bool
hrtimer_can_update_in_place(struct hrtimer *timer, struct hrtimer_clock_base *base, ktime_t expires)
{
	struct timerqueue_linked_node *next = timerqueue_linked_next(&timer->node);
	struct timerqueue_linked_node *prev = timerqueue_linked_prev(&timer->node);

	/* If the new expiry goes behind the next timer, requeue is required */
	if (next && expires > next->expires)
		return false;

	/* If this is the first timer, update in place */
	if (!prev)
		return true;

	/* Update in place when it does not go ahead of the previous one */
	return expires >= prev->expires;
}

/*
 * Requeue @timer on @base with a new expiry time and slack, avoiding the
 * full dequeue/enqueue dance when the queue position does not change.
 *
 * Returns true when the timer is (still or again) the first expiring
 * timer of @base.
 */
static inline bool
remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
			     const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns)
{
	bool was_first = false;

	/* Remove it from the timer queue if active */
	if (timer->is_queued) {
		was_first = !timerqueue_linked_prev(&timer->node);

		/* Try to update in place to avoid the de/enqueue dance */
		if (hrtimer_can_update_in_place(timer, base, expires)) {
			hrtimer_set_expires_range_ns(timer, expires, delta_ns);
			trace_hrtimer_start(timer, mode, true);
			if (was_first)
				base->expires_next = expires;
			return was_first;
		}

		debug_hrtimer_deactivate(timer);
		timerqueue_linked_del(&base->active, &timer->node);
	}

	/* Set the new expiry time */
	hrtimer_set_expires_range_ns(timer, expires, delta_ns);

	debug_activate(timer, mode, timer->is_queued);
	base->cpu_base->active_bases |= 1 << base->index;

	/* Pairs with the lockless read in hrtimer_is_queued() */
	WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);

	/* If it's the first expiring timer now or again, update base */
	if (timerqueue_linked_add(&base->active, &timer->node)) {
		base->expires_next = expires;
		return true;
	}

	/* The old first timer was requeued further out. Refresh the cache */
	if (was_first)
		base_update_next_timer(base);

	return false;
}

static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
					    const enum hrtimer_mode mode)
{
#ifdef CONFIG_TIME_LOW_RES
	/*
	 * CONFIG_TIME_LOW_RES indicates that the system has no way to return
	 * granular time values. For relative timers we add hrtimer_resolution
	 * (i.e. one jiffy) to prevent short timeouts.
	 */
	timer->is_rel = mode & HRTIMER_MODE_REL;
	if (timer->is_rel)
		tim = ktime_add_safe(tim, hrtimer_resolution);
#endif
	return tim;
}

static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
	ktime_t expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);

	/*
	 * Reprogramming needs to be triggered, even if the next soft
	 * hrtimer expires at the same time as the next hard
	 * hrtimer. cpu_base->softirq_expires_next needs to be updated!
	 */
	if (expires == KTIME_MAX)
		return;

	/*
	 * cpu_base->next_timer is recomputed by __hrtimer_get_next_event()
	 * cpu_base->expires_next is only set by hrtimer_reprogram()
	 */
	hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
}

#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
{
	if (static_branch_likely(&timers_migration_enabled)) {
		/*
		 * If it is local and the first expiring timer keep it on the local
		 * CPU to optimize reprogramming of the clockevent device. Also
		 * avoid switch_hrtimer_base() overhead when local and pinned.
		 */
		if (!is_local)
			return false;
		if (is_first || is_pinned)
			return true;

		/* Honour the NOHZ full restrictions */
		if (!housekeeping_cpu(smp_processor_id(), HK_TYPE_KERNEL_NOISE))
			return false;

		/*
		 * If the tick is not stopped or need_resched() is set, then
		 * there is no point in moving the timer somewhere else.
		 */
		return !tick_nohz_tick_stopped() || need_resched();
	}
	return is_local;
}
#else
static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
{
	return is_local;
}
#endif

static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool is_first,
				     bool is_pinned)
{
	/* If the timer is running the callback it has to stay on its CPU base. */
	if (unlikely(timer->base->running == timer))
		return true;

	return hrtimer_prefer_local(is_local, is_first, is_pinned);
}

/*
 * (Re)start @timer with the given expiry time and slack.
 *
 * Returns true when the caller has to reprogram the clock event device,
 * i.e. when the timer became the first expiring timer of the current CPU.
 */
static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
				     const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
{
	struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
	bool is_pinned, first, was_first, keep_base = false;
	struct hrtimer_cpu_base *cpu_base = base->cpu_base;

	was_first = cpu_base->next_timer == timer;
	is_pinned = !!(mode & HRTIMER_MODE_PINNED);

	/*
	 * Don't keep it local if this enqueue happens on a unplugged CPU
	 * after hrtimer_cpu_dying() has been invoked.
	 */
	if (likely(this_cpu_base->online)) {
		bool is_local = cpu_base == this_cpu_base;

		keep_base = hrtimer_keep_base(timer, is_local, was_first, is_pinned);
	}

	/* Calculate absolute expiry time for relative timers */
	if (mode & HRTIMER_MODE_REL)
		tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
	/* Compensate for low resolution granularity */
	tim = hrtimer_update_lowres(timer, tim, mode);

	/*
	 * Remove an active timer from the queue. In case it is not queued
	 * on the current CPU, make sure that remove_hrtimer() updates the
	 * remote data correctly.
	 *
	 * If it's on the current CPU and the first expiring timer, then
	 * skip reprogramming, keep the timer local and enforce
	 * reprogramming later if it was the first expiring timer. This
	 * avoids programming the underlying clock event twice (once at
	 * removal and once after enqueue).
	 *
	 * @keep_base is also true if the timer callback is running on a
	 * remote CPU and for local pinned timers.
	 */
	if (likely(keep_base)) {
		first = remove_and_enqueue_same_base(timer, base, mode, tim, delta_ns);
	} else {
		/* Keep the ENQUEUED state in case it is queued */
		bool was_armed = remove_hrtimer(timer, base, HRTIMER_STATE_ENQUEUED);

		hrtimer_set_expires_range_ns(timer, tim, delta_ns);

		/* Switch the timer base, if necessary: */
		base = switch_hrtimer_base(timer, base, is_pinned);
		cpu_base = base->cpu_base;

		first = enqueue_hrtimer(timer, base, mode, was_armed);
	}

	/* If a deferred rearm is pending skip reprogramming the device */
	if (cpu_base->deferred_rearm) {
		cpu_base->deferred_needs_update = true;
		return false;
	}

	if (!was_first || cpu_base != this_cpu_base) {
		/*
		 * If the current CPU base is online, then the timer is never
		 * queued on a remote CPU if it would be the first expiring
		 * timer there unless the timer callback is currently executed
		 * on the remote CPU. In the latter case the remote CPU will
		 * re-evaluate the first expiring timer after completing the
		 * callbacks.
		 */
		if (likely(hrtimer_base_is_online(this_cpu_base)))
			return first;

		/*
		 * Timer was enqueued remote because the current base is
		 * already offline. If the timer is the first to expire,
		 * kick the remote CPU to reprogram the clock event.
		 */
		if (first)
			smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd);
		return false;
	}

	/*
	 * Special case for the HRTICK timer. It is frequently rearmed and most
	 * of the time moves the expiry into the future. That's expensive in
	 * virtual machines and it's better to take the pointless already armed
	 * interrupt than reprogramming the hardware on every context switch.
	 *
	 * If the new expiry is before the armed time, then reprogramming is
	 * required.
	 */
	if (timer->is_lazy) {
		if (cpu_base->expires_next <= hrtimer_get_expires(timer))
			return false;
	}

	/*
	 * Timer was the first expiring timer and forced to stay on the
	 * current CPU to avoid reprogramming on removal and enqueue. Force
	 * reprogram the hardware by evaluating the new first expiring
	 * timer.
	 */
	hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
	return false;
}

/**
 * hrtimer_start_range_ns - (re)start an hrtimer
 * @timer:	the timer to be added
 * @tim:	expiry time
 * @delta_ns:	"slack" range for the timer
 * @mode:	timer mode: absolute (HRTIMER_MODE_ABS) or
 *		relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
 *		softirq based mode is considered for debug purpose only!
 */
void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
			    const enum hrtimer_mode mode)
{
	struct hrtimer_clock_base *base;
	unsigned long flags;

	debug_hrtimer_assert_init(timer);

	/*
	 * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
	 * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
	 * expiry mode because unmarked timers are moved to softirq expiry.
	 */
	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
	else
		WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);

	base = lock_hrtimer_base(timer, &flags);

	/* Reprogram the clock event device when the timer became the first one */
	if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
		hrtimer_reprogram(timer, true);

	unlock_hrtimer_base(timer, &flags);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);

/**
 * hrtimer_try_to_cancel - try to deactivate a timer
 * @timer:	hrtimer to stop
 *
 * Returns:
 *
 *  *  0 when the timer was not active
 *  *  1 when the timer was active
 *  * -1 when the timer is currently executing the callback function and
 *    cannot be stopped
 */
int hrtimer_try_to_cancel(struct hrtimer *timer)
{
	struct hrtimer_clock_base *base;
	unsigned long flags;
	int ret = -1;

	/*
	 * Check lockless first. If the timer is not active (neither
	 * enqueued nor running the callback), nothing to do here. The
	 * base lock does not serialize against a concurrent enqueue,
	 * so we can avoid taking it.
	 */
	if (!hrtimer_active(timer))
		return 0;

	base = lock_hrtimer_base(timer, &flags);

	/* A running callback cannot be stopped, keep ret == -1 in that case */
	if (!hrtimer_callback_running(timer)) {
		ret = remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE);
		if (ret)
			trace_hrtimer_cancel(timer);
	}

	unlock_hrtimer_base(timer, &flags);

	return ret;
}
EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);

#ifdef CONFIG_PREEMPT_RT
static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
{
	spin_lock_init(&base->softirq_expiry_lock);
}

static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
	__acquires(&base->softirq_expiry_lock)
{
	spin_lock(&base->softirq_expiry_lock);
}

static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
	__releases(&base->softirq_expiry_lock)
{
	spin_unlock(&base->softirq_expiry_lock);
}

/*
 * The counterpart to hrtimer_cancel_wait_running().
 *
 * If there is a waiter for cpu_base->expiry_lock, then it was waiting for
 * the timer callback to finish. Drop expiry_lock and reacquire it. That
 * allows the waiter to acquire the lock and make progress.
 */
static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags)
{
	if (atomic_read(&cpu_base->timer_waiters)) {
		raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
		spin_unlock(&cpu_base->softirq_expiry_lock);
		spin_lock(&cpu_base->softirq_expiry_lock);
		raw_spin_lock_irq(&cpu_base->lock);
	}
}

#ifdef CONFIG_SMP
static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
{
	return base == &migration_base;
}
#else
/* UP has no migration base, so a timer can never be on it */
static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
{
	return false;
}
#endif

/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running.
 *
 * This prevents priority inversion: if the soft irq thread is preempted
 * in the middle of a timer callback, then calling hrtimer_cancel() can
 * lead to two issues:
 *
 *  - If the caller is on a remote CPU then it has to spin wait for the timer
 *    handler to complete. This can result in unbound priority inversion.
 *
 *  - If the caller originates from the task which preempted the timer
 *    handler on the same CPU, then spin waiting for the timer handler to
 *    complete is never going to end.
 */
void hrtimer_cancel_wait_running(const struct hrtimer *timer)
{
	/* Lockless read. Prevent the compiler from reloading it below */
	struct hrtimer_clock_base *base = READ_ONCE(timer->base);

	/*
	 * Just relax if the timer expires in hard interrupt context or if
	 * it is currently on the migration base.
	 */
	if (!timer->is_soft || is_migration_base(base)) {
		cpu_relax();
		return;
	}

	/*
	 * Mark the base as contended and grab the expiry lock, which is
	 * held by the softirq across the timer callback. Drop the lock
	 * immediately so the softirq can expire the next timer. In theory
	 * the timer could already be running again, but that's more than
	 * unlikely and just causes another wait loop.
	 */
	atomic_inc(&base->cpu_base->timer_waiters);
	spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
	atomic_dec(&base->cpu_base->timer_waiters);
	spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
}
#else
static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long fl) { }
#endif

/**
 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
 * @timer:	the timer to be cancelled
 *
 * Returns:
 *  0 when the timer was not active
 *  1 when the timer was active
 */
int hrtimer_cancel(struct hrtimer *timer)
{
	int ret;

	do {
		ret = hrtimer_try_to_cancel(timer);

		/* Wait for the running callback without busy spinning */
		if (ret < 0)
			hrtimer_cancel_wait_running(timer);
	} while (ret < 0);
	return ret;
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);

/**
 * __hrtimer_get_remaining - get remaining time for the timer
 * @timer:	the timer to read
 * @adjust:	adjust relative timers when CONFIG_TIME_LOW_RES=y
 */
ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
{
	unsigned long flags;
	ktime_t rem;

	lock_hrtimer_base(timer, &flags);
	if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust)
		rem = hrtimer_expires_remaining_adjusted(timer);
	else
		rem = hrtimer_expires_remaining(timer);
	unlock_hrtimer_base(timer, &flags);

	return rem;
}
EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);

#ifdef CONFIG_NO_HZ_COMMON
/**
 * hrtimer_get_next_event - get the time until next expiry event
 *
 * Returns the next expiry time or KTIME_MAX if no timer is pending.
 */
u64 hrtimer_get_next_event(void)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	u64 expires = KTIME_MAX;

	guard(raw_spinlock_irqsave)(&cpu_base->lock);
	if (!hrtimer_hres_active(cpu_base))
		expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);

	return expires;
}

/**
 * hrtimer_next_event_without - time until next expiry event w/o one timer
 * @exclude:	timer to exclude
 *
 * Returns the next expiry time over all timers except for the @exclude one or
 * KTIME_MAX if none of them is pending.
 */
u64 hrtimer_next_event_without(const struct hrtimer *exclude)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	u64 expires = KTIME_MAX;
	unsigned int active;

	guard(raw_spinlock_irqsave)(&cpu_base->lock);
	if (!hrtimer_hres_active(cpu_base))
		return expires;

	/* Soft bases only matter when the softirq is not already in flight */
	active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
	if (active && !cpu_base->softirq_activated)
		expires = hrtimer_bases_next_event_without(cpu_base, exclude, active, KTIME_MAX);

	active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
	if (!active)
		return expires;
	return hrtimer_bases_next_event_without(cpu_base, exclude, active, expires);
}
#endif

/* Map a clockid to the corresponding hard clock base index */
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
	switch (clock_id) {
	case CLOCK_MONOTONIC:
		return HRTIMER_BASE_MONOTONIC;
	case CLOCK_REALTIME:
		return HRTIMER_BASE_REALTIME;
	case CLOCK_BOOTTIME:
		return HRTIMER_BASE_BOOTTIME;
	case CLOCK_TAI:
		return HRTIMER_BASE_TAI;
	default:
		WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
		return HRTIMER_BASE_MONOTONIC;
	}
}

/* Read the current time of the clock backing @clock_id */
static ktime_t __hrtimer_cb_get_time(clockid_t clock_id)
{
	switch (clock_id) {
	case CLOCK_MONOTONIC:
		return ktime_get();
	case CLOCK_REALTIME:
		return ktime_get_real();
	case CLOCK_BOOTTIME:
		return ktime_get_boottime();
	case CLOCK_TAI:
		return ktime_get_clocktai();
	default:
		WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
		return ktime_get();
	}
}

ktime_t hrtimer_cb_get_time(const struct hrtimer *timer)
{
	return __hrtimer_cb_get_time(timer->base->clockid);
}
EXPORT_SYMBOL_GPL(hrtimer_cb_get_time);

static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(struct hrtimer *),
			    clockid_t clock_id, enum hrtimer_mode mode)
{
	bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
	struct hrtimer_cpu_base *cpu_base;
	int base;

	/*
	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
	 * marked for hard interrupt expiry mode are moved into soft
	 * interrupt context for latency reasons and because the callbacks
	 * can invoke functions which might sleep on RT, e.g. spin_lock().
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD))
		softtimer = true;

	memset(timer, 0, sizeof(struct hrtimer));

	cpu_base = raw_cpu_ptr(&hrtimer_bases);

	/*
	 * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
	 * clock modifications, so they need to become CLOCK_MONOTONIC to
	 * ensure POSIX compliance.
	 */
	if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
		clock_id = CLOCK_MONOTONIC;

	/* The soft clock bases occupy the upper half of clock_base[] */
	base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
	base += hrtimer_clockid_to_base(clock_id);
	timer->is_soft = softtimer;
	timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
	timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM);
	timer->base = &cpu_base->clock_base[base];
	timerqueue_linked_init(&timer->node);

	/* A missing callback is a bug; install a dummy to limit the damage */
	if (WARN_ON_ONCE(!fn))
		ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout;
	else
		ACCESS_PRIVATE(timer, function) = fn;
}

/**
 * hrtimer_setup - initialize a timer to the given clock
 * @timer:	the timer to be initialized
 * @function:	the callback function
 * @clock_id:	the clock to be used
 * @mode:	The modes which are relevant for initialization:
 *		HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
 *		HRTIMER_MODE_REL_SOFT
 *
 *		The PINNED variants of the above can be handed in,
 *		but the PINNED bit is ignored as pinning happens
 *		when the hrtimer is started
 */
void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *),
		   clockid_t clock_id, enum hrtimer_mode mode)
{
	debug_setup(timer, clock_id, mode);
	__hrtimer_setup(timer, function, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_setup);

/**
 * hrtimer_setup_on_stack - initialize a timer on stack memory
 * @timer:	The timer to be initialized
 * @function:	the callback function
 * @clock_id:	The clock to be used
 * @mode:	The timer mode
 *
 * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack
 * memory.
 */
void hrtimer_setup_on_stack(struct hrtimer *timer,
			    enum hrtimer_restart (*function)(struct hrtimer *),
			    clockid_t clock_id, enum hrtimer_mode mode)
{
	debug_setup_on_stack(timer, clock_id, mode);
	__hrtimer_setup(timer, function, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack);

/*
 * A timer is active, when it is enqueued into the rbtree or the
 * callback function is running or it's in the state of being migrated
 * to another cpu.
 *
 * It is important for this function to not return a false negative.
 */
bool hrtimer_active(const struct hrtimer *timer)
{
	struct hrtimer_clock_base *base;
	unsigned int seq;

	do {
		base = READ_ONCE(timer->base);
		seq = raw_read_seqcount_begin(&base->seq);

		if (timer->is_queued || base->running == timer)
			return true;

	/* Retry when the state or the base changed while being sampled */
	} while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base));

	return false;
}
EXPORT_SYMBOL_GPL(hrtimer_active);

/*
 * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
 * distinct sections:
 *
 *  - queued:	the timer is queued
 *  - callback:	the timer is being run
 *  - post:	the timer is inactive or (re)queued
 *
 * On the read side we ensure we observe timer->is_queued and cpu_base->running
 * from the same section, if anything changed while we looked at it, we retry.
 * This includes timer->base changing because sequence numbers alone are
 * insufficient for that.
 *
 * The sequence numbers are required because otherwise we could still observe
 * a false negative if the read side got smeared over multiple consecutive
 * __run_hrtimer() invocations.
 */
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base,
			  struct hrtimer *timer, ktime_t now, unsigned long flags)
	__must_hold(&cpu_base->lock)
{
	enum hrtimer_restart (*fn)(struct hrtimer *);
	bool expires_in_hardirq;
	int restart;

	lockdep_assert_held(&cpu_base->lock);

	debug_hrtimer_deactivate(timer);
	base->running = timer;

	/*
	 * Separate the ->running assignment from the ->is_queued assignment.
	 *
	 * As with a regular write barrier, this ensures the read side in
	 * hrtimer_active() cannot observe base->running == NULL &&
	 * timer->is_queued == INACTIVE.
	 */
	raw_write_seqcount_barrier(&base->seq);

	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, false);
	fn = ACCESS_PRIVATE(timer, function);

	/*
	 * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
	 * timer is restarted with a period then it becomes an absolute
	 * timer. If it's not restarted it does not matter.
	 */
	if (IS_ENABLED(CONFIG_TIME_LOW_RES))
		timer->is_rel = false;

	/*
	 * The timer is marked as running in the CPU base, so it is
	 * protected against migration to a different CPU even if the lock
	 * is dropped.
	 */
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
	trace_hrtimer_expire_entry(timer, now);
	expires_in_hardirq = lockdep_hrtimer_enter(timer);

	restart = fn(timer);

	lockdep_hrtimer_exit(expires_in_hardirq);
	trace_hrtimer_expire_exit(timer);
	raw_spin_lock_irq(&cpu_base->lock);

	/*
	 * Note: We clear the running state after enqueue_hrtimer and
	 * we do not reprogram the event hardware. Happens either in
	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
	 *
	 * Note: Because we dropped the cpu_base->lock above,
	 * hrtimer_start_range_ns() can have popped in and enqueued the timer
	 * for us already.
	 */
	if (restart == HRTIMER_RESTART && !timer->is_queued)
		enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false);

	/*
	 * Separate the ->running assignment from the ->is_queued assignment.
	 *
	 * As with a regular write barrier, this ensures the read side in
	 * hrtimer_active() cannot observe base->running == NULL &&
	 * timer->is_queued == INACTIVE.
	 */
	raw_write_seqcount_barrier(&base->seq);

	WARN_ON_ONCE(base->running != timer);
	base->running = NULL;
}

/* First queued timer of @base or NULL when the base is empty */
static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base)
{
	struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);

	return next ? hrtimer_from_timerqueue_node(next) : NULL;
}

/* Expire all due timers in the clock bases selected by @active_mask */
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
				 unsigned long flags, unsigned int active_mask)
{
	unsigned int active = cpu_base->active_bases & active_mask;
	struct hrtimer_clock_base *base;

	for_each_active_base(base, cpu_base, active) {
		ktime_t basenow = ktime_add(now, base->offset);
		struct hrtimer *timer;

		while ((timer = clock_base_next_timer(base))) {
			/*
			 * The immediate goal for using the softexpires is
			 * minimizing wakeups, not running timers at the
			 * earliest interrupt after their soft expiration.
			 * This allows us to avoid using a Priority Search
			 * Tree, which can answer a stabbing query for
			 * overlapping intervals and instead use the simple
			 * BST we already have.
			 * We don't add extra wakeups by delaying timers that
			 * are right-of a not yet expired timer, because that
			 * timer will have to trigger a wakeup anyway.
			 */
			if (basenow < hrtimer_get_softexpires(timer))
				break;

			__run_hrtimer(cpu_base, base, timer, basenow, flags);
			/* Let a possibly waiting canceller make progress */
			if (active_mask == HRTIMER_ACTIVE_SOFT)
				hrtimer_sync_wait_running(cpu_base, flags);
		}
	}
}

/* Softirq handler: expire the soft hrtimers of this CPU */
static __latent_entropy void hrtimer_run_softirq(void)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	unsigned long flags;
	ktime_t now;

	hrtimer_cpu_base_lock_expiry(cpu_base);
	raw_spin_lock_irqsave(&cpu_base->lock, flags);

	now = hrtimer_update_base(cpu_base);
	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);

	cpu_base->softirq_activated = false;
	hrtimer_update_softirq_timer(cpu_base, true);

	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
	hrtimer_cpu_base_unlock_expiry(cpu_base);
}

#ifdef CONFIG_HIGH_RES_TIMERS

/*
 * Very similar to hrtimer_force_reprogram(), except it deals with
 * deferred_rearm and hang_detected.
 */
static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred)
{
	cpu_base->expires_next = expires_next;
	cpu_base->deferred_rearm = false;

	if (unlikely(cpu_base->hang_detected)) {
		/*
		 * Give the system a chance to do something else than looping
		 * on hrtimer interrupts.
2035 */ 2036 expires_next = ktime_add_ns(ktime_get(), 2037 min(100 * NSEC_PER_MSEC, cpu_base->max_hang_time)); 2038 } 2039 hrtimer_rearm_event(expires_next, deferred); 2040 } 2041 2042 #ifdef CONFIG_HRTIMER_REARM_DEFERRED 2043 void __hrtimer_rearm_deferred(void) 2044 { 2045 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 2046 ktime_t expires_next; 2047 2048 if (!cpu_base->deferred_rearm) 2049 return; 2050 2051 guard(raw_spinlock)(&cpu_base->lock); 2052 if (cpu_base->deferred_needs_update) { 2053 hrtimer_update_base(cpu_base); 2054 expires_next = hrtimer_update_next_event(cpu_base); 2055 } else { 2056 /* No timer added/removed. Use the cached value */ 2057 expires_next = cpu_base->deferred_expires_next; 2058 } 2059 hrtimer_rearm(cpu_base, expires_next, true); 2060 } 2061 2062 static __always_inline void 2063 hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) 2064 { 2065 /* hrtimer_interrupt() just re-evaluated the first expiring timer */ 2066 cpu_base->deferred_needs_update = false; 2067 /* Cache the expiry time */ 2068 cpu_base->deferred_expires_next = expires_next; 2069 set_thread_flag(TIF_HRTIMER_REARM); 2070 } 2071 #else /* CONFIG_HRTIMER_REARM_DEFERRED */ 2072 static __always_inline void 2073 hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) 2074 { 2075 hrtimer_rearm(cpu_base, expires_next, false); 2076 } 2077 #endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ 2078 2079 /* 2080 * High resolution timer interrupt 2081 * Called with interrupts disabled 2082 */ 2083 void hrtimer_interrupt(struct clock_event_device *dev) 2084 { 2085 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 2086 ktime_t expires_next, now, entry_time, delta; 2087 unsigned long flags; 2088 int retries = 0; 2089 2090 BUG_ON(!cpu_base->hres_active); 2091 cpu_base->nr_events++; 2092 dev->next_event = KTIME_MAX; 2093 dev->next_event_forced = 0; 2094 2095 raw_spin_lock_irqsave(&cpu_base->lock, flags); 2096 
entry_time = now = hrtimer_update_base(cpu_base); 2097 retry: 2098 cpu_base->deferred_rearm = true; 2099 /* 2100 * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue 2101 * timers while __hrtimer_run_queues() is expiring the clock bases. 2102 * Timers which are re/enqueued on the local CPU are not affected by 2103 * this. 2104 */ 2105 cpu_base->expires_next = KTIME_MAX; 2106 2107 if (!ktime_before(now, cpu_base->softirq_expires_next)) { 2108 cpu_base->softirq_expires_next = KTIME_MAX; 2109 cpu_base->softirq_activated = true; 2110 raise_timer_softirq(HRTIMER_SOFTIRQ); 2111 } 2112 2113 __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); 2114 2115 /* 2116 * The next timer was already expired due to: 2117 * - tracing 2118 * - long lasting callbacks 2119 * - being scheduled away when running in a VM 2120 * 2121 * We need to prevent that we loop forever in the hrtiner interrupt 2122 * routine. We give it 3 attempts to avoid overreacting on some 2123 * spurious event. 
2124 */ 2125 now = hrtimer_update_base(cpu_base); 2126 expires_next = hrtimer_update_next_event(cpu_base); 2127 cpu_base->hang_detected = false; 2128 if (expires_next < now) { 2129 if (++retries < 3) 2130 goto retry; 2131 2132 delta = ktime_sub(now, entry_time); 2133 cpu_base->max_hang_time = max_t(unsigned int, cpu_base->max_hang_time, delta); 2134 cpu_base->nr_hangs++; 2135 cpu_base->hang_detected = true; 2136 } 2137 2138 hrtimer_interrupt_rearm(cpu_base, expires_next); 2139 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 2140 } 2141 2142 #endif /* !CONFIG_HIGH_RES_TIMERS */ 2143 2144 /* 2145 * Called from run_local_timers in hardirq context every jiffy 2146 */ 2147 void hrtimer_run_queues(void) 2148 { 2149 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 2150 unsigned long flags; 2151 ktime_t now; 2152 2153 if (hrtimer_hres_active(cpu_base)) 2154 return; 2155 2156 /* 2157 * This _is_ ugly: We have to check periodically, whether we 2158 * can switch to highres and / or nohz mode. The clocksource 2159 * switch happens with xtime_lock held. Notification from 2160 * there only sets the check bit in the tick_oneshot code, 2161 * otherwise we might deadlock vs. xtime_lock. 
2162 */ 2163 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { 2164 hrtimer_switch_to_hres(); 2165 return; 2166 } 2167 2168 raw_spin_lock_irqsave(&cpu_base->lock, flags); 2169 now = hrtimer_update_base(cpu_base); 2170 2171 if (!ktime_before(now, cpu_base->softirq_expires_next)) { 2172 cpu_base->softirq_expires_next = KTIME_MAX; 2173 cpu_base->softirq_activated = true; 2174 raise_timer_softirq(HRTIMER_SOFTIRQ); 2175 } 2176 2177 __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); 2178 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 2179 } 2180 2181 /* 2182 * Sleep related functions: 2183 */ 2184 static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) 2185 { 2186 struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); 2187 struct task_struct *task = t->task; 2188 2189 t->task = NULL; 2190 if (task) 2191 wake_up_process(task); 2192 2193 return HRTIMER_NORESTART; 2194 } 2195 2196 /** 2197 * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer 2198 * @sl: sleeper to be started 2199 * @mode: timer mode abs/rel 2200 * 2201 * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers 2202 * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) 2203 */ 2204 void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) 2205 { 2206 /* 2207 * Make the enqueue delivery mode check work on RT. If the sleeper 2208 * was initialized for hard interrupt delivery, force the mode bit. 2209 * This is a special case for hrtimer_sleepers because 2210 * __hrtimer_setup_sleeper() determines the delivery mode on RT so the 2211 * fiddling with this decision is avoided at the call sites. 
2212 */ 2213 if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) 2214 mode |= HRTIMER_MODE_HARD; 2215 2216 hrtimer_start_expires(&sl->timer, mode); 2217 } 2218 EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); 2219 2220 static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, 2221 enum hrtimer_mode mode) 2222 { 2223 /* 2224 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly 2225 * marked for hard interrupt expiry mode are moved into soft 2226 * interrupt context either for latency reasons or because the 2227 * hrtimer callback takes regular spinlocks or invokes other 2228 * functions which are not suitable for hard interrupt context on 2229 * PREEMPT_RT. 2230 * 2231 * The hrtimer_sleeper callback is RT compatible in hard interrupt 2232 * context, but there is a latency concern: Untrusted userspace can 2233 * spawn many threads which arm timers for the same expiry time on 2234 * the same CPU. That causes a latency spike due to the wakeup of 2235 * a gazillion threads. 2236 * 2237 * OTOH, privileged real-time user space applications rely on the 2238 * low latency of hard interrupt wakeups. If the current task is in 2239 * a real-time scheduling class, mark the mode for hard interrupt 2240 * expiry. 
2241 */ 2242 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 2243 if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) 2244 mode |= HRTIMER_MODE_HARD; 2245 } 2246 2247 __hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode); 2248 sl->task = current; 2249 } 2250 2251 /** 2252 * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory 2253 * @sl: sleeper to be initialized 2254 * @clock_id: the clock to be used 2255 * @mode: timer mode abs/rel 2256 */ 2257 void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, 2258 enum hrtimer_mode mode) 2259 { 2260 debug_setup_on_stack(&sl->timer, clock_id, mode); 2261 __hrtimer_setup_sleeper(sl, clock_id, mode); 2262 } 2263 EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); 2264 2265 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) 2266 { 2267 switch(restart->nanosleep.type) { 2268 #ifdef CONFIG_COMPAT_32BIT_TIME 2269 case TT_COMPAT: 2270 if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) 2271 return -EFAULT; 2272 break; 2273 #endif 2274 case TT_NATIVE: 2275 if (put_timespec64(ts, restart->nanosleep.rmtp)) 2276 return -EFAULT; 2277 break; 2278 default: 2279 BUG(); 2280 } 2281 return -ERESTART_RESTARTBLOCK; 2282 } 2283 2284 static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 2285 { 2286 struct restart_block *restart; 2287 2288 do { 2289 set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 2290 hrtimer_sleeper_start_expires(t, mode); 2291 2292 if (likely(t->task)) 2293 schedule(); 2294 2295 hrtimer_cancel(&t->timer); 2296 mode = HRTIMER_MODE_ABS; 2297 2298 } while (t->task && !signal_pending(current)); 2299 2300 __set_current_state(TASK_RUNNING); 2301 2302 if (!t->task) 2303 return 0; 2304 2305 restart = ¤t->restart_block; 2306 if (restart->nanosleep.type != TT_NONE) { 2307 ktime_t rem = hrtimer_expires_remaining(&t->timer); 2308 struct timespec64 rmt; 2309 2310 if (rem <= 0) 2311 return 0; 2312 rmt = 
ktime_to_timespec64(rem); 2313 2314 return nanosleep_copyout(restart, &rmt); 2315 } 2316 return -ERESTART_RESTARTBLOCK; 2317 } 2318 2319 static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) 2320 { 2321 struct hrtimer_sleeper t; 2322 int ret; 2323 2324 hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); 2325 hrtimer_set_expires(&t.timer, restart->nanosleep.expires); 2326 ret = do_nanosleep(&t, HRTIMER_MODE_ABS); 2327 destroy_hrtimer_on_stack(&t.timer); 2328 return ret; 2329 } 2330 2331 long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) 2332 { 2333 struct restart_block *restart; 2334 struct hrtimer_sleeper t; 2335 int ret; 2336 2337 hrtimer_setup_sleeper_on_stack(&t, clockid, mode); 2338 hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); 2339 ret = do_nanosleep(&t, mode); 2340 if (ret != -ERESTART_RESTARTBLOCK) 2341 goto out; 2342 2343 /* Absolute timers do not update the rmtp value and restart: */ 2344 if (mode == HRTIMER_MODE_ABS) { 2345 ret = -ERESTARTNOHAND; 2346 goto out; 2347 } 2348 2349 restart = ¤t->restart_block; 2350 restart->nanosleep.clockid = t.timer.base->clockid; 2351 restart->nanosleep.expires = hrtimer_get_expires(&t.timer); 2352 set_restart_fn(restart, hrtimer_nanosleep_restart); 2353 out: 2354 destroy_hrtimer_on_stack(&t.timer); 2355 return ret; 2356 } 2357 2358 #ifdef CONFIG_64BIT 2359 2360 SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, 2361 struct __kernel_timespec __user *, rmtp) 2362 { 2363 struct timespec64 tu; 2364 2365 if (get_timespec64(&tu, rqtp)) 2366 return -EFAULT; 2367 2368 if (!timespec64_valid(&tu)) 2369 return -EINVAL; 2370 2371 current->restart_block.fn = do_no_restart_syscall; 2372 current->restart_block.nanosleep.type = rmtp ? 
TT_NATIVE : TT_NONE; 2373 current->restart_block.nanosleep.rmtp = rmtp; 2374 return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); 2375 } 2376 2377 #endif 2378 2379 #ifdef CONFIG_COMPAT_32BIT_TIME 2380 2381 SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, 2382 struct old_timespec32 __user *, rmtp) 2383 { 2384 struct timespec64 tu; 2385 2386 if (get_old_timespec32(&tu, rqtp)) 2387 return -EFAULT; 2388 2389 if (!timespec64_valid(&tu)) 2390 return -EINVAL; 2391 2392 current->restart_block.fn = do_no_restart_syscall; 2393 current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; 2394 current->restart_block.nanosleep.compat_rmtp = rmtp; 2395 return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); 2396 } 2397 #endif 2398 2399 /* 2400 * Functions related to boot-time initialization: 2401 */ 2402 int hrtimers_prepare_cpu(unsigned int cpu) 2403 { 2404 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 2405 2406 for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 2407 struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; 2408 2409 clock_b->cpu_base = cpu_base; 2410 seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); 2411 timerqueue_linked_init_head(&clock_b->active); 2412 } 2413 2414 cpu_base->cpu = cpu; 2415 hrtimer_cpu_base_init_expiry_lock(cpu_base); 2416 return 0; 2417 } 2418 2419 int hrtimers_cpu_starting(unsigned int cpu) 2420 { 2421 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 2422 2423 /* Clear out any left over state from a CPU down operation */ 2424 cpu_base->active_bases = 0; 2425 cpu_base->hres_active = false; 2426 cpu_base->hang_detected = false; 2427 cpu_base->next_timer = NULL; 2428 cpu_base->softirq_next_timer = NULL; 2429 cpu_base->expires_next = KTIME_MAX; 2430 cpu_base->softirq_expires_next = KTIME_MAX; 2431 cpu_base->softirq_activated = false; 2432 cpu_base->online = true; 2433 return 0; 2434 } 2435 2436 
#ifdef CONFIG_HOTPLUG_CPU

/*
 * Move every timer still queued on @old_base (a dying CPU's clock base)
 * over to @new_base on a surviving CPU.
 */
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
				 struct hrtimer_clock_base *new_base)
{
	struct timerqueue_linked_node *node;
	struct hrtimer *timer;

	while ((node = timerqueue_linked_first(&old_base->active))) {
		timer = hrtimer_from_timerqueue_node(node);
		BUG_ON(hrtimer_callback_running(timer));
		debug_hrtimer_deactivate(timer);

		/*
		 * Mark it as ENQUEUED not INACTIVE otherwise the
		 * timer could be seen as !active and just vanish away
		 * under us on another CPU
		 */
		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, false);
		timer->base = new_base;
		/*
		 * Enqueue the timers on the new cpu. This does not
		 * reprogram the event device in case the timer
		 * expires before the earliest on this CPU, but we run
		 * hrtimer_interrupt after we migrated everything to
		 * sort out already expired timers and reprogram the
		 * event device.
		 */
		enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS, true);
	}
}

/*
 * CPU hotplug teardown callback: migrate all hrtimers of the dying CPU to a
 * housekeeping CPU and mark the local base offline so no new timers land on
 * it.
 */
int hrtimers_cpu_dying(unsigned int dying_cpu)
{
	/* Pick any active housekeeping CPU as the migration target. */
	int ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
	struct hrtimer_cpu_base *old_base, *new_base;

	old_base = this_cpu_ptr(&hrtimer_bases);
	new_base = &per_cpu(hrtimer_bases, ncpu);

	/*
	 * The caller is globally serialized and nobody else
	 * takes two locks at once, deadlock is not possible.
	 */
	raw_spin_lock(&old_base->lock);
	raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);

	for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
		migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]);

	/* Tell the other CPU to retrigger the next event */
	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);

	raw_spin_unlock(&new_base->lock);
	/* Flip offline while still holding old_base->lock. */
	old_base->online = false;
	raw_spin_unlock(&old_base->lock);

	return 0;
}

#endif /* CONFIG_HOTPLUG_CPU */

/* Boot-time setup: initialize the boot CPU's bases and register the softirq. */
void __init hrtimers_init(void)
{
	hrtimers_prepare_cpu(smp_processor_id());
	hrtimers_cpu_starting(smp_processor_id());
	open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}