1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> 4 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar 5 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner 6 * 7 * High-resolution kernel timers 8 * 9 * In contrast to the low-resolution timeout API, aka timer wheel, 10 * hrtimers provide finer resolution and accuracy depending on system 11 * configuration and capabilities. 12 * 13 * Started by: Thomas Gleixner and Ingo Molnar 14 * 15 * Credits: 16 * Based on the original timer wheel code 17 * 18 * Help, testing, suggestions, bugfixes, improvements were 19 * provided by: 20 * 21 * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel 22 * et. al. 23 */ 24 25 #include <linux/cpu.h> 26 #include <linux/export.h> 27 #include <linux/percpu.h> 28 #include <linux/hrtimer.h> 29 #include <linux/notifier.h> 30 #include <linux/syscalls.h> 31 #include <linux/interrupt.h> 32 #include <linux/tick.h> 33 #include <linux/err.h> 34 #include <linux/debugobjects.h> 35 #include <linux/sched/signal.h> 36 #include <linux/sched/sysctl.h> 37 #include <linux/sched/rt.h> 38 #include <linux/sched/deadline.h> 39 #include <linux/sched/nohz.h> 40 #include <linux/sched/debug.h> 41 #include <linux/sched/isolation.h> 42 #include <linux/timer.h> 43 #include <linux/freezer.h> 44 #include <linux/compat.h> 45 46 #include <linux/uaccess.h> 47 48 #include <trace/events/timer.h> 49 50 #include "tick-internal.h" 51 52 /* 53 * Masks for selecting the soft and hard context timers from 54 * cpu_base->active 55 */ 56 #define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) 57 #define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) 58 #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) 59 #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) 60 61 /* 62 * The timer bases: 63 * 64 * There are more clockids than hrtimer bases. Thus, we index 65 * into the timer bases by the hrtimer_base_type enum. When trying 66 * to reach a base using a clockid, hrtimer_clockid_to_base() 67 * is used to convert from clockid to the proper hrtimer_base_type. 68 */ 69 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 70 { 71 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), 72 .clock_base = 73 { 74 { 75 .index = HRTIMER_BASE_MONOTONIC, 76 .clockid = CLOCK_MONOTONIC, 77 .get_time = &ktime_get, 78 }, 79 { 80 .index = HRTIMER_BASE_REALTIME, 81 .clockid = CLOCK_REALTIME, 82 .get_time = &ktime_get_real, 83 }, 84 { 85 .index = HRTIMER_BASE_BOOTTIME, 86 .clockid = CLOCK_BOOTTIME, 87 .get_time = &ktime_get_boottime, 88 }, 89 { 90 .index = HRTIMER_BASE_TAI, 91 .clockid = CLOCK_TAI, 92 .get_time = &ktime_get_clocktai, 93 }, 94 { 95 .index = HRTIMER_BASE_MONOTONIC_SOFT, 96 .clockid = CLOCK_MONOTONIC, 97 .get_time = &ktime_get, 98 }, 99 { 100 .index = HRTIMER_BASE_REALTIME_SOFT, 101 .clockid = CLOCK_REALTIME, 102 .get_time = &ktime_get_real, 103 }, 104 { 105 .index = HRTIMER_BASE_BOOTTIME_SOFT, 106 .clockid = CLOCK_BOOTTIME, 107 .get_time = &ktime_get_boottime, 108 }, 109 { 110 .index = HRTIMER_BASE_TAI_SOFT, 111 .clockid = CLOCK_TAI, 112 .get_time = &ktime_get_clocktai, 113 }, 114 } 115 }; 116 117 static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { 118 /* Make sure we catch unsupported clockids */ 119 [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, 120 121 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, 122 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, 123 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, 124 [CLOCK_TAI] = HRTIMER_BASE_TAI, 125 }; 126 127 /* 128 * Functions and macros which are different for UP/SMP systems are kept in a 129 * single place 130 */ 131 #ifdef CONFIG_SMP 132 133 /* 134 * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() 135 * such that hrtimer_callback_running() can unconditionally dereference 136 * timer->base->cpu_base 137 */ 138 static struct hrtimer_cpu_base migration_cpu_base = { 139 .clock_base = { { 140 .cpu_base = &migration_cpu_base, 141 .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, 142 &migration_cpu_base.lock), 143 }, }, 144 }; 145 146 #define migration_base migration_cpu_base.clock_base[0] 147 148 static inline bool is_migration_base(struct hrtimer_clock_base *base) 149 { 150 return base == &migration_base; 151 } 152 153 /* 154 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock 155 * means that all timers which are tied to this base via timer->base are 156 * locked, and the base itself is locked too. 157 * 158 * So __run_timers/migrate_timers can safely modify all timers which could 159 * be found on the lists/queues. 160 * 161 * When the timer's base is locked, and the timer removed from list, it is 162 * possible to set timer->base = &migration_base and drop the lock: the timer 163 * remains locked. 164 */ 165 static 166 struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, 167 unsigned long *flags) 168 __acquires(&timer->base->lock) 169 { 170 struct hrtimer_clock_base *base; 171 172 for (;;) { 173 base = READ_ONCE(timer->base); 174 if (likely(base != &migration_base)) { 175 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); 176 if (likely(base == timer->base)) 177 return base; 178 /* The timer has migrated to another CPU: */ 179 raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); 180 } 181 cpu_relax(); 182 } 183 } 184 185 /* 186 * We do not migrate the timer when it is expiring before the next 187 * event on the target cpu. When high resolution is enabled, we cannot 188 * reprogram the target cpu hardware and we would cause it to fire 189 * late. To keep it simple, we handle the high resolution enabled and 190 * disabled case similar. 191 * 192 * Called with cpu_base->lock of target cpu held. 193 */ 194 static int 195 hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) 196 { 197 ktime_t expires; 198 199 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); 200 return expires < new_base->cpu_base->expires_next; 201 } 202 203 static inline 204 struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, 205 int pinned) 206 { 207 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 208 if (static_branch_likely(&timers_migration_enabled) && !pinned) 209 return &per_cpu(hrtimer_bases, get_nohz_timer_target()); 210 #endif 211 return base; 212 } 213 214 /* 215 * We switch the timer base to a power-optimized selected CPU target, 216 * if: 217 * - NO_HZ_COMMON is enabled 218 * - timer migration is enabled 219 * - the timer callback is not running 220 * - the timer is not the first expiring timer on the new target 221 * 222 * If one of the above requirements is not fulfilled we move the timer 223 * to the current CPU or leave it on the previously assigned CPU if 224 * the timer callback is currently running. 225 */ 226 static inline struct hrtimer_clock_base * 227 switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, 228 int pinned) 229 { 230 struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; 231 struct hrtimer_clock_base *new_base; 232 int basenum = base->index; 233 234 this_cpu_base = this_cpu_ptr(&hrtimer_bases); 235 new_cpu_base = get_target_base(this_cpu_base, pinned); 236 again: 237 new_base = &new_cpu_base->clock_base[basenum]; 238 239 if (base != new_base) { 240 /* 241 * We are trying to move timer to new_base. 242 * However we can't change timer's base while it is running, 243 * so we keep it on the same CPU. No hassle vs. reprogramming 244 * the event source in the high resolution case. The softirq 245 * code will take care of this when the timer function has 246 * completed. There is no conflict as we hold the lock until 247 * the timer is enqueued. 248 */ 249 if (unlikely(hrtimer_callback_running(timer))) 250 return base; 251 252 /* See the comment in lock_hrtimer_base() */ 253 WRITE_ONCE(timer->base, &migration_base); 254 raw_spin_unlock(&base->cpu_base->lock); 255 raw_spin_lock(&new_base->cpu_base->lock); 256 257 if (new_cpu_base != this_cpu_base && 258 hrtimer_check_target(timer, new_base)) { 259 raw_spin_unlock(&new_base->cpu_base->lock); 260 raw_spin_lock(&base->cpu_base->lock); 261 new_cpu_base = this_cpu_base; 262 WRITE_ONCE(timer->base, base); 263 goto again; 264 } 265 WRITE_ONCE(timer->base, new_base); 266 } else { 267 if (new_cpu_base != this_cpu_base && 268 hrtimer_check_target(timer, new_base)) { 269 new_cpu_base = this_cpu_base; 270 goto again; 271 } 272 } 273 return new_base; 274 } 275 276 #else /* CONFIG_SMP */ 277 278 static inline bool is_migration_base(struct hrtimer_clock_base *base) 279 { 280 return false; 281 } 282 283 static inline struct hrtimer_clock_base * 284 lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 285 __acquires(&timer->base->cpu_base->lock) 286 { 287 struct hrtimer_clock_base *base = timer->base; 288 289 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); 290 291 return base; 292 } 293 294 # define switch_hrtimer_base(t, b, p) (b) 295 296 #endif /* !CONFIG_SMP */ 297 298 /* 299 * Functions for the union type storage format of ktime_t which are 300 * too large for inlining: 301 */ 302 #if BITS_PER_LONG < 64 303 /* 304 * Divide a ktime value by a nanosecond value 305 */ 306 s64 __ktime_divns(const ktime_t kt, s64 div) 307 { 308 int sft = 0; 309 s64 dclc; 310 u64 tmp; 311 312 dclc = ktime_to_ns(kt); 313 tmp = dclc < 0 ? -dclc : dclc; 314 315 /* Make sure the divisor is less than 2^32: */ 316 while (div >> 32) { 317 sft++; 318 div >>= 1; 319 } 320 tmp >>= sft; 321 do_div(tmp, (u32) div); 322 return dclc < 0 ? -tmp : tmp; 323 } 324 EXPORT_SYMBOL_GPL(__ktime_divns); 325 #endif /* BITS_PER_LONG >= 64 */ 326 327 /* 328 * Add two ktime values and do a safety check for overflow: 329 */ 330 ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) 331 { 332 ktime_t res = ktime_add_unsafe(lhs, rhs); 333 334 /* 335 * We use KTIME_SEC_MAX here, the maximum timeout which we can 336 * return to user space in a timespec: 337 */ 338 if (res < 0 || res < lhs || res < rhs) 339 res = ktime_set(KTIME_SEC_MAX, 0); 340 341 return res; 342 } 343 344 EXPORT_SYMBOL_GPL(ktime_add_safe); 345 346 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS 347 348 static const struct debug_obj_descr hrtimer_debug_descr; 349 350 static void *hrtimer_debug_hint(void *addr) 351 { 352 return ((struct hrtimer *) addr)->function; 353 } 354 355 /* 356 * fixup_init is called when: 357 * - an active object is initialized 358 */ 359 static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) 360 { 361 struct hrtimer *timer = addr; 362 363 switch (state) { 364 case ODEBUG_STATE_ACTIVE: 365 hrtimer_cancel(timer); 366 debug_object_init(timer, &hrtimer_debug_descr); 367 return true; 368 default: 369 return false; 370 } 371 } 372 373 /* 374 * fixup_activate is called when: 375 * - an active object is activated 376 * - an unknown non-static object is activated 377 */ 378 static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) 379 { 380 switch (state) { 381 case ODEBUG_STATE_ACTIVE: 382 WARN_ON(1); 383 fallthrough; 384 default: 385 return false; 386 } 387 } 388 389 /* 390 * fixup_free is called when: 391 * - an active object is freed 392 */ 393 static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) 394 { 395 struct hrtimer *timer = addr; 396 397 switch (state) { 398 case ODEBUG_STATE_ACTIVE: 399 hrtimer_cancel(timer); 400 debug_object_free(timer, &hrtimer_debug_descr); 401 return true; 402 default: 403 return false; 404 } 405 } 406 407 static const struct debug_obj_descr hrtimer_debug_descr = { 408 .name = "hrtimer", 409 .debug_hint = hrtimer_debug_hint, 410 .fixup_init = hrtimer_fixup_init, 411 .fixup_activate = hrtimer_fixup_activate, 412 .fixup_free = hrtimer_fixup_free, 413 }; 414 415 static inline void debug_hrtimer_init(struct hrtimer *timer) 416 { 417 debug_object_init(timer, &hrtimer_debug_descr); 418 } 419 420 static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) 421 { 422 debug_object_init_on_stack(timer, &hrtimer_debug_descr); 423 } 424 425 static inline void debug_hrtimer_activate(struct hrtimer *timer, 426 enum hrtimer_mode mode) 427 { 428 debug_object_activate(timer, &hrtimer_debug_descr); 429 } 430 431 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) 432 { 433 debug_object_deactivate(timer, &hrtimer_debug_descr); 434 } 435 436 void destroy_hrtimer_on_stack(struct hrtimer *timer) 437 { 438 debug_object_free(timer, &hrtimer_debug_descr); 439 } 440 EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); 441 442 #else 443 444 static inline void debug_hrtimer_init(struct hrtimer *timer) { } 445 static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } 446 static inline void debug_hrtimer_activate(struct hrtimer *timer, 447 enum hrtimer_mode mode) { } 448 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 449 #endif 450 451 static inline void 452 debug_init(struct hrtimer *timer, clockid_t clockid, 453 enum hrtimer_mode mode) 454 { 455 debug_hrtimer_init(timer); 456 trace_hrtimer_init(timer, clockid, mode); 457 } 458 459 static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid, 460 enum hrtimer_mode mode) 461 { 462 debug_hrtimer_init_on_stack(timer); 463 trace_hrtimer_init(timer, clockid, mode); 464 } 465 466 static inline void debug_activate(struct hrtimer *timer, 467 enum hrtimer_mode mode) 468 { 469 debug_hrtimer_activate(timer, mode); 470 trace_hrtimer_start(timer, mode); 471 } 472 473 static inline void debug_deactivate(struct hrtimer *timer) 474 { 475 debug_hrtimer_deactivate(timer); 476 trace_hrtimer_cancel(timer); 477 } 478 479 static struct hrtimer_clock_base * 480 __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) 481 { 482 unsigned int idx; 483 484 if (!*active) 485 return NULL; 486 487 idx = __ffs(*active); 488 *active &= ~(1U << idx); 489 490 return &cpu_base->clock_base[idx]; 491 } 492 493 #define for_each_active_base(base, cpu_base, active) \ 494 while ((base = __next_base((cpu_base), &(active)))) 495 496 static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, 497 const struct hrtimer *exclude, 498 unsigned int active, 499 ktime_t expires_next) 500 { 501 struct hrtimer_clock_base *base; 502 ktime_t expires; 503 504 for_each_active_base(base, cpu_base, active) { 505 struct timerqueue_node *next; 506 struct hrtimer *timer; 507 508 next = timerqueue_getnext(&base->active); 509 timer = container_of(next, struct hrtimer, node); 510 if (timer == exclude) { 511 /* Get to the next timer in the queue. */ 512 next = timerqueue_iterate_next(next); 513 if (!next) 514 continue; 515 516 timer = container_of(next, struct hrtimer, node); 517 } 518 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 519 if (expires < expires_next) { 520 expires_next = expires; 521 522 /* Skip cpu_base update if a timer is being excluded. */ 523 if (exclude) 524 continue; 525 526 if (timer->is_soft) 527 cpu_base->softirq_next_timer = timer; 528 else 529 cpu_base->next_timer = timer; 530 } 531 } 532 /* 533 * clock_was_set() might have changed base->offset of any of 534 * the clock bases so the result might be negative. Fix it up 535 * to prevent a false positive in clockevents_program_event(). 536 */ 537 if (expires_next < 0) 538 expires_next = 0; 539 return expires_next; 540 } 541 542 /* 543 * Recomputes cpu_base::*next_timer and returns the earliest expires_next 544 * but does not set cpu_base::*expires_next, that is done by 545 * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating 546 * cpu_base::*expires_next right away, reprogramming logic would no longer 547 * work. 548 * 549 * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, 550 * those timers will get run whenever the softirq gets handled, at the end of 551 * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. 552 * 553 * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. 554 * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual 555 * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. 556 * 557 * @active_mask must be one of: 558 * - HRTIMER_ACTIVE_ALL, 559 * - HRTIMER_ACTIVE_SOFT, or 560 * - HRTIMER_ACTIVE_HARD. 561 */ 562 static ktime_t 563 __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) 564 { 565 unsigned int active; 566 struct hrtimer *next_timer = NULL; 567 ktime_t expires_next = KTIME_MAX; 568 569 if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { 570 active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; 571 cpu_base->softirq_next_timer = NULL; 572 expires_next = __hrtimer_next_event_base(cpu_base, NULL, 573 active, KTIME_MAX); 574 575 next_timer = cpu_base->softirq_next_timer; 576 } 577 578 if (active_mask & HRTIMER_ACTIVE_HARD) { 579 active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; 580 cpu_base->next_timer = next_timer; 581 expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, 582 expires_next); 583 } 584 585 return expires_next; 586 } 587 588 static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) 589 { 590 ktime_t expires_next, soft = KTIME_MAX; 591 592 /* 593 * If the soft interrupt has already been activated, ignore the 594 * soft bases. They will be handled in the already raised soft 595 * interrupt. 596 */ 597 if (!cpu_base->softirq_activated) { 598 soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); 599 /* 600 * Update the soft expiry time. clock_settime() might have 601 * affected it. 602 */ 603 cpu_base->softirq_expires_next = soft; 604 } 605 606 expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); 607 /* 608 * If a softirq timer is expiring first, update cpu_base->next_timer 609 * and program the hardware with the soft expiry time. 610 */ 611 if (expires_next > soft) { 612 cpu_base->next_timer = cpu_base->softirq_next_timer; 613 expires_next = soft; 614 } 615 616 return expires_next; 617 } 618 619 static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) 620 { 621 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; 622 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; 623 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; 624 625 ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, 626 offs_real, offs_boot, offs_tai); 627 628 base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; 629 base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; 630 base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; 631 632 return now; 633 } 634 635 /* 636 * Is the high resolution mode active ? 637 */ 638 static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) 639 { 640 return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? 641 cpu_base->hres_active : 0; 642 } 643 644 static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, 645 struct hrtimer *next_timer, 646 ktime_t expires_next) 647 { 648 cpu_base->expires_next = expires_next; 649 650 /* 651 * If hres is not active, hardware does not have to be 652 * reprogrammed yet. 653 * 654 * If a hang was detected in the last timer interrupt then we 655 * leave the hang delay active in the hardware. We want the 656 * system to make progress. That also prevents the following 657 * scenario: 658 * T1 expires 50ms from now 659 * T2 expires 5s from now 660 * 661 * T1 is removed, so this code is called and would reprogram 662 * the hardware to 5s from now. Any hrtimer_start after that 663 * will not reprogram the hardware due to hang_detected being 664 * set. So we'd effectively block all timers until the T2 event 665 * fires. 666 */ 667 if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) 668 return; 669 670 tick_program_event(expires_next, 1); 671 } 672 673 /* 674 * Reprogram the event source with checking both queues for the 675 * next event 676 * Called with interrupts disabled and base->lock held 677 */ 678 static void 679 hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) 680 { 681 ktime_t expires_next; 682 683 expires_next = hrtimer_update_next_event(cpu_base); 684 685 if (skip_equal && expires_next == cpu_base->expires_next) 686 return; 687 688 __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); 689 } 690 691 /* High resolution timer related functions */ 692 #ifdef CONFIG_HIGH_RES_TIMERS 693 694 /* 695 * High resolution timer enabled ? 696 */ 697 static bool hrtimer_hres_enabled __read_mostly = true; 698 unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; 699 EXPORT_SYMBOL_GPL(hrtimer_resolution); 700 701 /* 702 * Enable / Disable high resolution mode 703 */ 704 static int __init setup_hrtimer_hres(char *str) 705 { 706 return (kstrtobool(str, &hrtimer_hres_enabled) == 0); 707 } 708 709 __setup("highres=", setup_hrtimer_hres); 710 711 /* 712 * hrtimer_high_res_enabled - query, if the highres mode is enabled 713 */ 714 static inline int hrtimer_is_hres_enabled(void) 715 { 716 return hrtimer_hres_enabled; 717 } 718 719 static void retrigger_next_event(void *arg); 720 721 /* 722 * Switch to high resolution mode 723 */ 724 static void hrtimer_switch_to_hres(void) 725 { 726 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); 727 728 if (tick_init_highres()) { 729 pr_warn("Could not switch to high resolution mode on CPU %u\n", 730 base->cpu); 731 return; 732 } 733 base->hres_active = 1; 734 hrtimer_resolution = HIGH_RES_NSEC; 735 736 tick_setup_sched_timer(true); 737 /* "Retrigger" the interrupt to get things going */ 738 retrigger_next_event(NULL); 739 } 740 741 #else 742 743 static inline int hrtimer_is_hres_enabled(void) { return 0; } 744 static inline void hrtimer_switch_to_hres(void) { } 745 746 #endif /* CONFIG_HIGH_RES_TIMERS */ 747 /* 748 * Retrigger next event is called after clock was set with interrupts 749 * disabled through an SMP function call or directly from low level 750 * resume code. 751 * 752 * This is only invoked when: 753 * - CONFIG_HIGH_RES_TIMERS is enabled. 754 * - CONFIG_NOHZ_COMMON is enabled 755 * 756 * For the other cases this function is empty and because the call sites 757 * are optimized out it vanishes as well, i.e. no need for lots of 758 * #ifdeffery. 759 */ 760 static void retrigger_next_event(void *arg) 761 { 762 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); 763 764 /* 765 * When high resolution mode or nohz is active, then the offsets of 766 * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the 767 * next tick will take care of that. 768 * 769 * If high resolution mode is active then the next expiring timer 770 * must be reevaluated and the clock event device reprogrammed if 771 * necessary. 772 * 773 * In the NOHZ case the update of the offset and the reevaluation 774 * of the next expiring timer is enough. The return from the SMP 775 * function call will take care of the reprogramming in case the 776 * CPU was in a NOHZ idle sleep. 777 */ 778 if (!hrtimer_hres_active(base) && !tick_nohz_active) 779 return; 780 781 raw_spin_lock(&base->lock); 782 hrtimer_update_base(base); 783 if (hrtimer_hres_active(base)) 784 hrtimer_force_reprogram(base, 0); 785 else 786 hrtimer_update_next_event(base); 787 raw_spin_unlock(&base->lock); 788 } 789 790 /* 791 * When a timer is enqueued and expires earlier than the already enqueued 792 * timers, we have to check, whether it expires earlier than the timer for 793 * which the clock event device was armed. 794 * 795 * Called with interrupts disabled and base->cpu_base.lock held 796 */ 797 static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) 798 { 799 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 800 struct hrtimer_clock_base *base = timer->base; 801 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 802 803 WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); 804 805 /* 806 * CLOCK_REALTIME timer might be requested with an absolute 807 * expiry time which is less than base->offset. Set it to 0. 808 */ 809 if (expires < 0) 810 expires = 0; 811 812 if (timer->is_soft) { 813 /* 814 * soft hrtimer could be started on a remote CPU. In this 815 * case softirq_expires_next needs to be updated on the 816 * remote CPU. The soft hrtimer will not expire before the 817 * first hard hrtimer on the remote CPU - 818 * hrtimer_check_target() prevents this case. 819 */ 820 struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; 821 822 if (timer_cpu_base->softirq_activated) 823 return; 824 825 if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) 826 return; 827 828 timer_cpu_base->softirq_next_timer = timer; 829 timer_cpu_base->softirq_expires_next = expires; 830 831 if (!ktime_before(expires, timer_cpu_base->expires_next) || 832 !reprogram) 833 return; 834 } 835 836 /* 837 * If the timer is not on the current cpu, we cannot reprogram 838 * the other cpus clock event device. 839 */ 840 if (base->cpu_base != cpu_base) 841 return; 842 843 if (expires >= cpu_base->expires_next) 844 return; 845 846 /* 847 * If the hrtimer interrupt is running, then it will reevaluate the 848 * clock bases and reprogram the clock event device. 849 */ 850 if (cpu_base->in_hrtirq) 851 return; 852 853 cpu_base->next_timer = timer; 854 855 __hrtimer_reprogram(cpu_base, timer, expires); 856 } 857 858 static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, 859 unsigned int active) 860 { 861 struct hrtimer_clock_base *base; 862 unsigned int seq; 863 ktime_t expires; 864 865 /* 866 * Update the base offsets unconditionally so the following 867 * checks whether the SMP function call is required works. 868 * 869 * The update is safe even when the remote CPU is in the hrtimer 870 * interrupt or the hrtimer soft interrupt and expiring affected 871 * bases. Either it will see the update before handling a base or 872 * it will see it when it finishes the processing and reevaluates 873 * the next expiring timer. 874 */ 875 seq = cpu_base->clock_was_set_seq; 876 hrtimer_update_base(cpu_base); 877 878 /* 879 * If the sequence did not change over the update then the 880 * remote CPU already handled it. 881 */ 882 if (seq == cpu_base->clock_was_set_seq) 883 return false; 884 885 /* 886 * If the remote CPU is currently handling an hrtimer interrupt, it 887 * will reevaluate the first expiring timer of all clock bases 888 * before reprogramming. Nothing to do here. 889 */ 890 if (cpu_base->in_hrtirq) 891 return false; 892 893 /* 894 * Walk the affected clock bases and check whether the first expiring 895 * timer in a clock base is moving ahead of the first expiring timer of 896 * @cpu_base. If so, the IPI must be invoked because per CPU clock 897 * event devices cannot be remotely reprogrammed. 898 */ 899 active &= cpu_base->active_bases; 900 901 for_each_active_base(base, cpu_base, active) { 902 struct timerqueue_node *next; 903 904 next = timerqueue_getnext(&base->active); 905 expires = ktime_sub(next->expires, base->offset); 906 if (expires < cpu_base->expires_next) 907 return true; 908 909 /* Extra check for softirq clock bases */ 910 if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) 911 continue; 912 if (cpu_base->softirq_activated) 913 continue; 914 if (expires < cpu_base->softirq_expires_next) 915 return true; 916 } 917 return false; 918 } 919 920 /* 921 * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and 922 * CLOCK_BOOTTIME (for late sleep time injection). 923 * 924 * This requires to update the offsets for these clocks 925 * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this 926 * also requires to eventually reprogram the per CPU clock event devices 927 * when the change moves an affected timer ahead of the first expiring 928 * timer on that CPU. Obviously remote per CPU clock event devices cannot 929 * be reprogrammed. The other reason why an IPI has to be sent is when the 930 * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets 931 * in the tick, which obviously might be stopped, so this has to bring out 932 * the remote CPU which might sleep in idle to get this sorted. 933 */ 934 void clock_was_set(unsigned int bases) 935 { 936 struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); 937 cpumask_var_t mask; 938 int cpu; 939 940 if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active) 941 goto out_timerfd; 942 943 if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { 944 on_each_cpu(retrigger_next_event, NULL, 1); 945 goto out_timerfd; 946 } 947 948 /* Avoid interrupting CPUs if possible */ 949 cpus_read_lock(); 950 for_each_online_cpu(cpu) { 951 unsigned long flags; 952 953 cpu_base = &per_cpu(hrtimer_bases, cpu); 954 raw_spin_lock_irqsave(&cpu_base->lock, flags); 955 956 if (update_needs_ipi(cpu_base, bases)) 957 cpumask_set_cpu(cpu, mask); 958 959 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 960 } 961 962 preempt_disable(); 963 smp_call_function_many(mask, retrigger_next_event, NULL, 1); 964 preempt_enable(); 965 cpus_read_unlock(); 966 free_cpumask_var(mask); 967 968 out_timerfd: 969 timerfd_clock_was_set(); 970 } 971 972 static void clock_was_set_work(struct work_struct *work) 973 { 974 clock_was_set(CLOCK_SET_WALL); 975 } 976 977 static DECLARE_WORK(hrtimer_work, clock_was_set_work); 978 979 /* 980 * Called from timekeeping code to reprogram the hrtimer interrupt device 981 * on all cpus and to notify timerfd. 982 */ 983 void clock_was_set_delayed(void) 984 { 985 schedule_work(&hrtimer_work); 986 } 987 988 /* 989 * Called during resume either directly from via timekeeping_resume() 990 * or in the case of s2idle from tick_unfreeze() to ensure that the 991 * hrtimers are up to date. 992 */ 993 void hrtimers_resume_local(void) 994 { 995 lockdep_assert_irqs_disabled(); 996 /* Retrigger on the local CPU */ 997 retrigger_next_event(NULL); 998 } 999 1000 /* 1001 * Counterpart to lock_hrtimer_base above: 1002 */ 1003 static inline 1004 void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 1005 __releases(&timer->base->cpu_base->lock) 1006 { 1007 raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); 1008 } 1009 1010 /** 1011 * hrtimer_forward() - forward the timer expiry 1012 * @timer: hrtimer to forward 1013 * @now: forward past this time 1014 * @interval: the interval to forward 1015 * 1016 * Forward the timer expiry so it will expire in the future. 1017 * 1018 * .. note:: 1019 * This only updates the timer expiry value and does not requeue the timer. 1020 * 1021 * There is also a variant of the function hrtimer_forward_now(). 1022 * 1023 * Context: Can be safely called from the callback function of @timer. If called 1024 * from other contexts @timer must neither be enqueued nor running the 1025 * callback and the caller needs to take care of serialization. 1026 * 1027 * Return: The number of overruns are returned. 1028 */ 1029 u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) 1030 { 1031 u64 orun = 1; 1032 ktime_t delta; 1033 1034 delta = ktime_sub(now, hrtimer_get_expires(timer)); 1035 1036 if (delta < 0) 1037 return 0; 1038 1039 if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) 1040 return 0; 1041 1042 if (interval < hrtimer_resolution) 1043 interval = hrtimer_resolution; 1044 1045 if (unlikely(delta >= interval)) { 1046 s64 incr = ktime_to_ns(interval); 1047 1048 orun = ktime_divns(delta, incr); 1049 hrtimer_add_expires_ns(timer, incr * orun); 1050 if (hrtimer_get_expires_tv64(timer) > now) 1051 return orun; 1052 /* 1053 * This (and the ktime_add() below) is the 1054 * correction for exact: 1055 */ 1056 orun++; 1057 } 1058 hrtimer_add_expires(timer, interval); 1059 1060 return orun; 1061 } 1062 EXPORT_SYMBOL_GPL(hrtimer_forward); 1063 1064 /* 1065 * enqueue_hrtimer - internal function to (re)start a timer 1066 * 1067 * The timer is inserted in expiry order. Insertion into the 1068 * red black tree is O(log(n)). Must hold the base lock. 1069 * 1070 * Returns true when the new timer is the leftmost timer in the tree. 1071 */ 1072 static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, 1073 enum hrtimer_mode mode) 1074 { 1075 debug_activate(timer, mode); 1076 WARN_ON_ONCE(!base->cpu_base->online); 1077 1078 base->cpu_base->active_bases |= 1 << base->index; 1079 1080 /* Pairs with the lockless read in hrtimer_is_queued() */ 1081 WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); 1082 1083 return timerqueue_add(&base->active, &timer->node); 1084 } 1085 1086 /* 1087 * __remove_hrtimer - internal function to remove a timer 1088 * 1089 * Caller must hold the base lock. 1090 * 1091 * High resolution timer mode reprograms the clock event device when the 1092 * timer is the one which expires next. The caller can disable this by setting 1093 * reprogram to zero. This is useful, when the context does a reprogramming 1094 * anyway (e.g. timer interrupt) 1095 */ 1096 static void __remove_hrtimer(struct hrtimer *timer, 1097 struct hrtimer_clock_base *base, 1098 u8 newstate, int reprogram) 1099 { 1100 struct hrtimer_cpu_base *cpu_base = base->cpu_base; 1101 u8 state = timer->state; 1102 1103 /* Pairs with the lockless read in hrtimer_is_queued() */ 1104 WRITE_ONCE(timer->state, newstate); 1105 if (!(state & HRTIMER_STATE_ENQUEUED)) 1106 return; 1107 1108 if (!timerqueue_del(&base->active, &timer->node)) 1109 cpu_base->active_bases &= ~(1 << base->index); 1110 1111 /* 1112 * Note: If reprogram is false we do not update 1113 * cpu_base->next_timer. This happens when we remove the first 1114 * timer on a remote cpu. No harm as we never dereference 1115 * cpu_base->next_timer. So the worst thing what can happen is 1116 * an superfluous call to hrtimer_force_reprogram() on the 1117 * remote cpu later on if the same timer gets enqueued again. 1118 */ 1119 if (reprogram && timer == cpu_base->next_timer) 1120 hrtimer_force_reprogram(cpu_base, 1); 1121 } 1122 1123 /* 1124 * remove hrtimer, called with base lock held 1125 */ 1126 static inline int 1127 remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, 1128 bool restart, bool keep_local) 1129 { 1130 u8 state = timer->state; 1131 1132 if (state & HRTIMER_STATE_ENQUEUED) { 1133 bool reprogram; 1134 1135 /* 1136 * Remove the timer and force reprogramming when high 1137 * resolution mode is active and the timer is on the current 1138 * CPU. If we remove a timer on another CPU, reprogramming is 1139 * skipped. The interrupt event on this CPU is fired and 1140 * reprogramming happens in the interrupt handler. This is a 1141 * rare case and less expensive than a smp call. 1142 */ 1143 debug_deactivate(timer); 1144 reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); 1145 1146 /* 1147 * If the timer is not restarted then reprogramming is 1148 * required if the timer is local. If it is local and about 1149 * to be restarted, avoid programming it twice (on removal 1150 * and a moment later when it's requeued). 1151 */ 1152 if (!restart) 1153 state = HRTIMER_STATE_INACTIVE; 1154 else 1155 reprogram &= !keep_local; 1156 1157 __remove_hrtimer(timer, base, state, reprogram); 1158 return 1; 1159 } 1160 return 0; 1161 } 1162 1163 static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, 1164 const enum hrtimer_mode mode) 1165 { 1166 #ifdef CONFIG_TIME_LOW_RES 1167 /* 1168 * CONFIG_TIME_LOW_RES indicates that the system has no way to return 1169 * granular time values. For relative timers we add hrtimer_resolution 1170 * (i.e. one jiffy) to prevent short timeouts. 1171 */ 1172 timer->is_rel = mode & HRTIMER_MODE_REL; 1173 if (timer->is_rel) 1174 tim = ktime_add_safe(tim, hrtimer_resolution); 1175 #endif 1176 return tim; 1177 } 1178 1179 static void 1180 hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) 1181 { 1182 ktime_t expires; 1183 1184 /* 1185 * Find the next SOFT expiration. 1186 */ 1187 expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); 1188 1189 /* 1190 * reprogramming needs to be triggered, even if the next soft 1191 * hrtimer expires at the same time than the next hard 1192 * hrtimer. cpu_base->softirq_expires_next needs to be updated! 1193 */ 1194 if (expires == KTIME_MAX) 1195 return; 1196 1197 /* 1198 * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() 1199 * cpu_base->*expires_next is only set by hrtimer_reprogram() 1200 */ 1201 hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); 1202 } 1203 1204 static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, 1205 u64 delta_ns, const enum hrtimer_mode mode, 1206 struct hrtimer_clock_base *base) 1207 { 1208 struct hrtimer_clock_base *new_base; 1209 bool force_local, first; 1210 1211 /* 1212 * If the timer is on the local cpu base and is the first expiring 1213 * timer then this might end up reprogramming the hardware twice 1214 * (on removal and on enqueue). To avoid that by prevent the 1215 * reprogram on removal, keep the timer local to the current CPU 1216 * and enforce reprogramming after it is queued no matter whether 1217 * it is the new first expiring timer again or not. 1218 */ 1219 force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); 1220 force_local &= base->cpu_base->next_timer == timer; 1221 1222 /* 1223 * Remove an active timer from the queue. In case it is not queued 1224 * on the current CPU, make sure that remove_hrtimer() updates the 1225 * remote data correctly. 1226 * 1227 * If it's on the current CPU and the first expiring timer, then 1228 * skip reprogramming, keep the timer local and enforce 1229 * reprogramming later if it was the first expiring timer. This 1230 * avoids programming the underlying clock event twice (once at 1231 * removal and once after enqueue). 1232 */ 1233 remove_hrtimer(timer, base, true, force_local); 1234 1235 if (mode & HRTIMER_MODE_REL) 1236 tim = ktime_add_safe(tim, base->get_time()); 1237 1238 tim = hrtimer_update_lowres(timer, tim, mode); 1239 1240 hrtimer_set_expires_range_ns(timer, tim, delta_ns); 1241 1242 /* Switch the timer base, if necessary: */ 1243 if (!force_local) { 1244 new_base = switch_hrtimer_base(timer, base, 1245 mode & HRTIMER_MODE_PINNED); 1246 } else { 1247 new_base = base; 1248 } 1249 1250 first = enqueue_hrtimer(timer, new_base, mode); 1251 if (!force_local) 1252 return first; 1253 1254 /* 1255 * Timer was forced to stay on the current CPU to avoid 1256 * reprogramming on removal and enqueue. Force reprogram the 1257 * hardware by evaluating the new first expiring timer. 1258 */ 1259 hrtimer_force_reprogram(new_base->cpu_base, 1); 1260 return 0; 1261 } 1262 1263 /** 1264 * hrtimer_start_range_ns - (re)start an hrtimer 1265 * @timer: the timer to be added 1266 * @tim: expiry time 1267 * @delta_ns: "slack" range for the timer 1268 * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or 1269 * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); 1270 * softirq based mode is considered for debug purpose only! 1271 */ 1272 void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, 1273 u64 delta_ns, const enum hrtimer_mode mode) 1274 { 1275 struct hrtimer_clock_base *base; 1276 unsigned long flags; 1277 1278 if (WARN_ON_ONCE(!timer->function)) 1279 return; 1280 /* 1281 * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft 1282 * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard 1283 * expiry mode because unmarked timers are moved to softirq expiry. 1284 */ 1285 if (!IS_ENABLED(CONFIG_PREEMPT_RT)) 1286 WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); 1287 else 1288 WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); 1289 1290 base = lock_hrtimer_base(timer, &flags); 1291 1292 if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) 1293 hrtimer_reprogram(timer, true); 1294 1295 unlock_hrtimer_base(timer, &flags); 1296 } 1297 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); 1298 1299 /** 1300 * hrtimer_try_to_cancel - try to deactivate a timer 1301 * @timer: hrtimer to stop 1302 * 1303 * Returns: 1304 * 1305 * * 0 when the timer was not active 1306 * * 1 when the timer was active 1307 * * -1 when the timer is currently executing the callback function and 1308 * cannot be stopped 1309 */ 1310 int hrtimer_try_to_cancel(struct hrtimer *timer) 1311 { 1312 struct hrtimer_clock_base *base; 1313 unsigned long flags; 1314 int ret = -1; 1315 1316 /* 1317 * Check lockless first. If the timer is not active (neither 1318 * enqueued nor running the callback, nothing to do here. The 1319 * base lock does not serialize against a concurrent enqueue, 1320 * so we can avoid taking it. 1321 */ 1322 if (!hrtimer_active(timer)) 1323 return 0; 1324 1325 base = lock_hrtimer_base(timer, &flags); 1326 1327 if (!hrtimer_callback_running(timer)) 1328 ret = remove_hrtimer(timer, base, false, false); 1329 1330 unlock_hrtimer_base(timer, &flags); 1331 1332 return ret; 1333 1334 } 1335 EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); 1336 1337 #ifdef CONFIG_PREEMPT_RT 1338 static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) 1339 { 1340 spin_lock_init(&base->softirq_expiry_lock); 1341 } 1342 1343 static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) 1344 __acquires(&base->softirq_expiry_lock) 1345 { 1346 spin_lock(&base->softirq_expiry_lock); 1347 } 1348 1349 static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) 1350 __releases(&base->softirq_expiry_lock) 1351 { 1352 spin_unlock(&base->softirq_expiry_lock); 1353 } 1354 1355 /* 1356 * The counterpart to hrtimer_cancel_wait_running(). 1357 * 1358 * If there is a waiter for cpu_base->expiry_lock, then it was waiting for 1359 * the timer callback to finish. Drop expiry_lock and reacquire it. That 1360 * allows the waiter to acquire the lock and make progress. 1361 */ 1362 static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, 1363 unsigned long flags) 1364 { 1365 if (atomic_read(&cpu_base->timer_waiters)) { 1366 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1367 spin_unlock(&cpu_base->softirq_expiry_lock); 1368 spin_lock(&cpu_base->softirq_expiry_lock); 1369 raw_spin_lock_irq(&cpu_base->lock); 1370 } 1371 } 1372 1373 /* 1374 * This function is called on PREEMPT_RT kernels when the fast path 1375 * deletion of a timer failed because the timer callback function was 1376 * running. 1377 * 1378 * This prevents priority inversion: if the soft irq thread is preempted 1379 * in the middle of a timer callback, then calling del_timer_sync() can 1380 * lead to two issues: 1381 * 1382 * - If the caller is on a remote CPU then it has to spin wait for the timer 1383 * handler to complete. This can result in unbound priority inversion. 1384 * 1385 * - If the caller originates from the task which preempted the timer 1386 * handler on the same CPU, then spin waiting for the timer handler to 1387 * complete is never going to end. 1388 */ 1389 void hrtimer_cancel_wait_running(const struct hrtimer *timer) 1390 { 1391 /* Lockless read. Prevent the compiler from reloading it below */ 1392 struct hrtimer_clock_base *base = READ_ONCE(timer->base); 1393 1394 /* 1395 * Just relax if the timer expires in hard interrupt context or if 1396 * it is currently on the migration base. 1397 */ 1398 if (!timer->is_soft || is_migration_base(base)) { 1399 cpu_relax(); 1400 return; 1401 } 1402 1403 /* 1404 * Mark the base as contended and grab the expiry lock, which is 1405 * held by the softirq across the timer callback. Drop the lock 1406 * immediately so the softirq can expire the next timer. In theory 1407 * the timer could already be running again, but that's more than 1408 * unlikely and just causes another wait loop. 1409 */ 1410 atomic_inc(&base->cpu_base->timer_waiters); 1411 spin_lock_bh(&base->cpu_base->softirq_expiry_lock); 1412 atomic_dec(&base->cpu_base->timer_waiters); 1413 spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); 1414 } 1415 #else 1416 static inline void 1417 hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } 1418 static inline void 1419 hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } 1420 static inline void 1421 hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } 1422 static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, 1423 unsigned long flags) { } 1424 #endif 1425 1426 /** 1427 * hrtimer_cancel - cancel a timer and wait for the handler to finish. 1428 * @timer: the timer to be cancelled 1429 * 1430 * Returns: 1431 * 0 when the timer was not active 1432 * 1 when the timer was active 1433 */ 1434 int hrtimer_cancel(struct hrtimer *timer) 1435 { 1436 int ret; 1437 1438 do { 1439 ret = hrtimer_try_to_cancel(timer); 1440 1441 if (ret < 0) 1442 hrtimer_cancel_wait_running(timer); 1443 } while (ret < 0); 1444 return ret; 1445 } 1446 EXPORT_SYMBOL_GPL(hrtimer_cancel); 1447 1448 /** 1449 * __hrtimer_get_remaining - get remaining time for the timer 1450 * @timer: the timer to read 1451 * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y 1452 */ 1453 ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) 1454 { 1455 unsigned long flags; 1456 ktime_t rem; 1457 1458 lock_hrtimer_base(timer, &flags); 1459 if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) 1460 rem = hrtimer_expires_remaining_adjusted(timer); 1461 else 1462 rem = hrtimer_expires_remaining(timer); 1463 unlock_hrtimer_base(timer, &flags); 1464 1465 return rem; 1466 } 1467 EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); 1468 1469 #ifdef CONFIG_NO_HZ_COMMON 1470 /** 1471 * hrtimer_get_next_event - get the time until next expiry event 1472 * 1473 * Returns the next expiry time or KTIME_MAX if no timer is pending. 1474 */ 1475 u64 hrtimer_get_next_event(void) 1476 { 1477 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1478 u64 expires = KTIME_MAX; 1479 unsigned long flags; 1480 1481 raw_spin_lock_irqsave(&cpu_base->lock, flags); 1482 1483 if (!hrtimer_hres_active(cpu_base)) 1484 expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); 1485 1486 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1487 1488 return expires; 1489 } 1490 1491 /** 1492 * hrtimer_next_event_without - time until next expiry event w/o one timer 1493 * @exclude: timer to exclude 1494 * 1495 * Returns the next expiry time over all timers except for the @exclude one or 1496 * KTIME_MAX if none of them is pending. 1497 */ 1498 u64 hrtimer_next_event_without(const struct hrtimer *exclude) 1499 { 1500 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1501 u64 expires = KTIME_MAX; 1502 unsigned long flags; 1503 1504 raw_spin_lock_irqsave(&cpu_base->lock, flags); 1505 1506 if (hrtimer_hres_active(cpu_base)) { 1507 unsigned int active; 1508 1509 if (!cpu_base->softirq_activated) { 1510 active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; 1511 expires = __hrtimer_next_event_base(cpu_base, exclude, 1512 active, KTIME_MAX); 1513 } 1514 active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; 1515 expires = __hrtimer_next_event_base(cpu_base, exclude, active, 1516 expires); 1517 } 1518 1519 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1520 1521 return expires; 1522 } 1523 #endif 1524 1525 static inline int hrtimer_clockid_to_base(clockid_t clock_id) 1526 { 1527 if (likely(clock_id < MAX_CLOCKS)) { 1528 int base = hrtimer_clock_to_base_table[clock_id]; 1529 1530 if (likely(base != HRTIMER_MAX_CLOCK_BASES)) 1531 return base; 1532 } 1533 WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); 1534 return HRTIMER_BASE_MONOTONIC; 1535 } 1536 1537 static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) 1538 { 1539 return HRTIMER_NORESTART; 1540 } 1541 1542 static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, 1543 enum hrtimer_mode mode) 1544 { 1545 bool softtimer = !!(mode & HRTIMER_MODE_SOFT); 1546 struct hrtimer_cpu_base *cpu_base; 1547 int base; 1548 1549 /* 1550 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly 1551 * marked for hard interrupt expiry mode are moved into soft 1552 * interrupt context for latency reasons and because the callbacks 1553 * can invoke functions which might sleep on RT, e.g. spin_lock(). 1554 */ 1555 if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) 1556 softtimer = true; 1557 1558 memset(timer, 0, sizeof(struct hrtimer)); 1559 1560 cpu_base = raw_cpu_ptr(&hrtimer_bases); 1561 1562 /* 1563 * POSIX magic: Relative CLOCK_REALTIME timers are not affected by 1564 * clock modifications, so they needs to become CLOCK_MONOTONIC to 1565 * ensure POSIX compliance. 1566 */ 1567 if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) 1568 clock_id = CLOCK_MONOTONIC; 1569 1570 base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; 1571 base += hrtimer_clockid_to_base(clock_id); 1572 timer->is_soft = softtimer; 1573 timer->is_hard = !!(mode & HRTIMER_MODE_HARD); 1574 timer->base = &cpu_base->clock_base[base]; 1575 timerqueue_init(&timer->node); 1576 } 1577 1578 static void __hrtimer_setup(struct hrtimer *timer, 1579 enum hrtimer_restart (*function)(struct hrtimer *), 1580 clockid_t clock_id, enum hrtimer_mode mode) 1581 { 1582 __hrtimer_init(timer, clock_id, mode); 1583 1584 if (WARN_ON_ONCE(!function)) 1585 timer->function = hrtimer_dummy_timeout; 1586 else 1587 timer->function = function; 1588 } 1589 1590 /** 1591 * hrtimer_init - initialize a timer to the given clock 1592 * @timer: the timer to be initialized 1593 * @clock_id: the clock to be used 1594 * @mode: The modes which are relevant for initialization: 1595 * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, 1596 * HRTIMER_MODE_REL_SOFT 1597 * 1598 * The PINNED variants of the above can be handed in, 1599 * but the PINNED bit is ignored as pinning happens 1600 * when the hrtimer is started 1601 */ 1602 void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, 1603 enum hrtimer_mode mode) 1604 { 1605 debug_init(timer, clock_id, mode); 1606 __hrtimer_init(timer, clock_id, mode); 1607 } 1608 EXPORT_SYMBOL_GPL(hrtimer_init); 1609 1610 /** 1611 * hrtimer_setup - initialize a timer to the given clock 1612 * @timer: the timer to be initialized 1613 * @function: the callback function 1614 * @clock_id: the clock to be used 1615 * @mode: The modes which are relevant for initialization: 1616 * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, 1617 * HRTIMER_MODE_REL_SOFT 1618 * 1619 * The PINNED variants of the above can be handed in, 1620 * but the PINNED bit is ignored as pinning happens 1621 * when the hrtimer is started 1622 */ 1623 void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), 1624 clockid_t clock_id, enum hrtimer_mode mode) 1625 { 1626 debug_init(timer, clock_id, mode); 1627 __hrtimer_setup(timer, function, clock_id, mode); 1628 } 1629 EXPORT_SYMBOL_GPL(hrtimer_setup); 1630 1631 /** 1632 * hrtimer_setup_on_stack - initialize a timer on stack memory 1633 * @timer: The timer to be initialized 1634 * @function: the callback function 1635 * @clock_id: The clock to be used 1636 * @mode: The timer mode 1637 * 1638 * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack 1639 * memory. 1640 */ 1641 void hrtimer_setup_on_stack(struct hrtimer *timer, 1642 enum hrtimer_restart (*function)(struct hrtimer *), 1643 clockid_t clock_id, enum hrtimer_mode mode) 1644 { 1645 debug_init_on_stack(timer, clock_id, mode); 1646 __hrtimer_setup(timer, function, clock_id, mode); 1647 } 1648 EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); 1649 1650 /* 1651 * A timer is active, when it is enqueued into the rbtree or the 1652 * callback function is running or it's in the state of being migrated 1653 * to another cpu. 1654 * 1655 * It is important for this function to not return a false negative. 1656 */ 1657 bool hrtimer_active(const struct hrtimer *timer) 1658 { 1659 struct hrtimer_clock_base *base; 1660 unsigned int seq; 1661 1662 do { 1663 base = READ_ONCE(timer->base); 1664 seq = raw_read_seqcount_begin(&base->seq); 1665 1666 if (timer->state != HRTIMER_STATE_INACTIVE || 1667 base->running == timer) 1668 return true; 1669 1670 } while (read_seqcount_retry(&base->seq, seq) || 1671 base != READ_ONCE(timer->base)); 1672 1673 return false; 1674 } 1675 EXPORT_SYMBOL_GPL(hrtimer_active); 1676 1677 /* 1678 * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 1679 * distinct sections: 1680 * 1681 * - queued: the timer is queued 1682 * - callback: the timer is being ran 1683 * - post: the timer is inactive or (re)queued 1684 * 1685 * On the read side we ensure we observe timer->state and cpu_base->running 1686 * from the same section, if anything changed while we looked at it, we retry. 1687 * This includes timer->base changing because sequence numbers alone are 1688 * insufficient for that. 1689 * 1690 * The sequence numbers are required because otherwise we could still observe 1691 * a false negative if the read side got smeared over multiple consecutive 1692 * __run_hrtimer() invocations. 1693 */ 1694 1695 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, 1696 struct hrtimer_clock_base *base, 1697 struct hrtimer *timer, ktime_t *now, 1698 unsigned long flags) __must_hold(&cpu_base->lock) 1699 { 1700 enum hrtimer_restart (*fn)(struct hrtimer *); 1701 bool expires_in_hardirq; 1702 int restart; 1703 1704 lockdep_assert_held(&cpu_base->lock); 1705 1706 debug_deactivate(timer); 1707 base->running = timer; 1708 1709 /* 1710 * Separate the ->running assignment from the ->state assignment. 1711 * 1712 * As with a regular write barrier, this ensures the read side in 1713 * hrtimer_active() cannot observe base->running == NULL && 1714 * timer->state == INACTIVE. 1715 */ 1716 raw_write_seqcount_barrier(&base->seq); 1717 1718 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); 1719 fn = timer->function; 1720 1721 /* 1722 * Clear the 'is relative' flag for the TIME_LOW_RES case. If the 1723 * timer is restarted with a period then it becomes an absolute 1724 * timer. If its not restarted it does not matter. 1725 */ 1726 if (IS_ENABLED(CONFIG_TIME_LOW_RES)) 1727 timer->is_rel = false; 1728 1729 /* 1730 * The timer is marked as running in the CPU base, so it is 1731 * protected against migration to a different CPU even if the lock 1732 * is dropped. 1733 */ 1734 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1735 trace_hrtimer_expire_entry(timer, now); 1736 expires_in_hardirq = lockdep_hrtimer_enter(timer); 1737 1738 restart = fn(timer); 1739 1740 lockdep_hrtimer_exit(expires_in_hardirq); 1741 trace_hrtimer_expire_exit(timer); 1742 raw_spin_lock_irq(&cpu_base->lock); 1743 1744 /* 1745 * Note: We clear the running state after enqueue_hrtimer and 1746 * we do not reprogram the event hardware. Happens either in 1747 * hrtimer_start_range_ns() or in hrtimer_interrupt() 1748 * 1749 * Note: Because we dropped the cpu_base->lock above, 1750 * hrtimer_start_range_ns() can have popped in and enqueued the timer 1751 * for us already. 1752 */ 1753 if (restart != HRTIMER_NORESTART && 1754 !(timer->state & HRTIMER_STATE_ENQUEUED)) 1755 enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); 1756 1757 /* 1758 * Separate the ->running assignment from the ->state assignment. 1759 * 1760 * As with a regular write barrier, this ensures the read side in 1761 * hrtimer_active() cannot observe base->running.timer == NULL && 1762 * timer->state == INACTIVE. 1763 */ 1764 raw_write_seqcount_barrier(&base->seq); 1765 1766 WARN_ON_ONCE(base->running != timer); 1767 base->running = NULL; 1768 } 1769 1770 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, 1771 unsigned long flags, unsigned int active_mask) 1772 { 1773 struct hrtimer_clock_base *base; 1774 unsigned int active = cpu_base->active_bases & active_mask; 1775 1776 for_each_active_base(base, cpu_base, active) { 1777 struct timerqueue_node *node; 1778 ktime_t basenow; 1779 1780 basenow = ktime_add(now, base->offset); 1781 1782 while ((node = timerqueue_getnext(&base->active))) { 1783 struct hrtimer *timer; 1784 1785 timer = container_of(node, struct hrtimer, node); 1786 1787 /* 1788 * The immediate goal for using the softexpires is 1789 * minimizing wakeups, not running timers at the 1790 * earliest interrupt after their soft expiration. 1791 * This allows us to avoid using a Priority Search 1792 * Tree, which can answer a stabbing query for 1793 * overlapping intervals and instead use the simple 1794 * BST we already have. 1795 * We don't add extra wakeups by delaying timers that 1796 * are right-of a not yet expired timer, because that 1797 * timer will have to trigger a wakeup anyway. 1798 */ 1799 if (basenow < hrtimer_get_softexpires_tv64(timer)) 1800 break; 1801 1802 __run_hrtimer(cpu_base, base, timer, &basenow, flags); 1803 if (active_mask == HRTIMER_ACTIVE_SOFT) 1804 hrtimer_sync_wait_running(cpu_base, flags); 1805 } 1806 } 1807 } 1808 1809 static __latent_entropy void hrtimer_run_softirq(void) 1810 { 1811 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1812 unsigned long flags; 1813 ktime_t now; 1814 1815 hrtimer_cpu_base_lock_expiry(cpu_base); 1816 raw_spin_lock_irqsave(&cpu_base->lock, flags); 1817 1818 now = hrtimer_update_base(cpu_base); 1819 __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); 1820 1821 cpu_base->softirq_activated = 0; 1822 hrtimer_update_softirq_timer(cpu_base, true); 1823 1824 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1825 hrtimer_cpu_base_unlock_expiry(cpu_base); 1826 } 1827 1828 #ifdef CONFIG_HIGH_RES_TIMERS 1829 1830 /* 1831 * High resolution timer interrupt 1832 * Called with interrupts disabled 1833 */ 1834 void hrtimer_interrupt(struct clock_event_device *dev) 1835 { 1836 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1837 ktime_t expires_next, now, entry_time, delta; 1838 unsigned long flags; 1839 int retries = 0; 1840 1841 BUG_ON(!cpu_base->hres_active); 1842 cpu_base->nr_events++; 1843 dev->next_event = KTIME_MAX; 1844 1845 raw_spin_lock_irqsave(&cpu_base->lock, flags); 1846 entry_time = now = hrtimer_update_base(cpu_base); 1847 retry: 1848 cpu_base->in_hrtirq = 1; 1849 /* 1850 * We set expires_next to KTIME_MAX here with cpu_base->lock 1851 * held to prevent that a timer is enqueued in our queue via 1852 * the migration code. This does not affect enqueueing of 1853 * timers which run their callback and need to be requeued on 1854 * this CPU. 1855 */ 1856 cpu_base->expires_next = KTIME_MAX; 1857 1858 if (!ktime_before(now, cpu_base->softirq_expires_next)) { 1859 cpu_base->softirq_expires_next = KTIME_MAX; 1860 cpu_base->softirq_activated = 1; 1861 raise_timer_softirq(HRTIMER_SOFTIRQ); 1862 } 1863 1864 __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); 1865 1866 /* Reevaluate the clock bases for the [soft] next expiry */ 1867 expires_next = hrtimer_update_next_event(cpu_base); 1868 /* 1869 * Store the new expiry value so the migration code can verify 1870 * against it. 1871 */ 1872 cpu_base->expires_next = expires_next; 1873 cpu_base->in_hrtirq = 0; 1874 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1875 1876 /* Reprogramming necessary ? */ 1877 if (!tick_program_event(expires_next, 0)) { 1878 cpu_base->hang_detected = 0; 1879 return; 1880 } 1881 1882 /* 1883 * The next timer was already expired due to: 1884 * - tracing 1885 * - long lasting callbacks 1886 * - being scheduled away when running in a VM 1887 * 1888 * We need to prevent that we loop forever in the hrtimer 1889 * interrupt routine. We give it 3 attempts to avoid 1890 * overreacting on some spurious event. 1891 * 1892 * Acquire base lock for updating the offsets and retrieving 1893 * the current time. 1894 */ 1895 raw_spin_lock_irqsave(&cpu_base->lock, flags); 1896 now = hrtimer_update_base(cpu_base); 1897 cpu_base->nr_retries++; 1898 if (++retries < 3) 1899 goto retry; 1900 /* 1901 * Give the system a chance to do something else than looping 1902 * here. We stored the entry time, so we know exactly how long 1903 * we spent here. We schedule the next event this amount of 1904 * time away. 1905 */ 1906 cpu_base->nr_hangs++; 1907 cpu_base->hang_detected = 1; 1908 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1909 1910 delta = ktime_sub(now, entry_time); 1911 if ((unsigned int)delta > cpu_base->max_hang_time) 1912 cpu_base->max_hang_time = (unsigned int) delta; 1913 /* 1914 * Limit it to a sensible value as we enforce a longer 1915 * delay. Give the CPU at least 100ms to catch up. 1916 */ 1917 if (delta > 100 * NSEC_PER_MSEC) 1918 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); 1919 else 1920 expires_next = ktime_add(now, delta); 1921 tick_program_event(expires_next, 1); 1922 pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); 1923 } 1924 #endif /* !CONFIG_HIGH_RES_TIMERS */ 1925 1926 /* 1927 * Called from run_local_timers in hardirq context every jiffy 1928 */ 1929 void hrtimer_run_queues(void) 1930 { 1931 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1932 unsigned long flags; 1933 ktime_t now; 1934 1935 if (hrtimer_hres_active(cpu_base)) 1936 return; 1937 1938 /* 1939 * This _is_ ugly: We have to check periodically, whether we 1940 * can switch to highres and / or nohz mode. The clocksource 1941 * switch happens with xtime_lock held. Notification from 1942 * there only sets the check bit in the tick_oneshot code, 1943 * otherwise we might deadlock vs. xtime_lock. 1944 */ 1945 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { 1946 hrtimer_switch_to_hres(); 1947 return; 1948 } 1949 1950 raw_spin_lock_irqsave(&cpu_base->lock, flags); 1951 now = hrtimer_update_base(cpu_base); 1952 1953 if (!ktime_before(now, cpu_base->softirq_expires_next)) { 1954 cpu_base->softirq_expires_next = KTIME_MAX; 1955 cpu_base->softirq_activated = 1; 1956 raise_timer_softirq(HRTIMER_SOFTIRQ); 1957 } 1958 1959 __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); 1960 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1961 } 1962 1963 /* 1964 * Sleep related functions: 1965 */ 1966 static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) 1967 { 1968 struct hrtimer_sleeper *t = 1969 container_of(timer, struct hrtimer_sleeper, timer); 1970 struct task_struct *task = t->task; 1971 1972 t->task = NULL; 1973 if (task) 1974 wake_up_process(task); 1975 1976 return HRTIMER_NORESTART; 1977 } 1978 1979 /** 1980 * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer 1981 * @sl: sleeper to be started 1982 * @mode: timer mode abs/rel 1983 * 1984 * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers 1985 * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) 1986 */ 1987 void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, 1988 enum hrtimer_mode mode) 1989 { 1990 /* 1991 * Make the enqueue delivery mode check work on RT. If the sleeper 1992 * was initialized for hard interrupt delivery, force the mode bit. 1993 * This is a special case for hrtimer_sleepers because 1994 * __hrtimer_init_sleeper() determines the delivery mode on RT so the 1995 * fiddling with this decision is avoided at the call sites. 1996 */ 1997 if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) 1998 mode |= HRTIMER_MODE_HARD; 1999 2000 hrtimer_start_expires(&sl->timer, mode); 2001 } 2002 EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); 2003 2004 static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, 2005 clockid_t clock_id, enum hrtimer_mode mode) 2006 { 2007 /* 2008 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly 2009 * marked for hard interrupt expiry mode are moved into soft 2010 * interrupt context either for latency reasons or because the 2011 * hrtimer callback takes regular spinlocks or invokes other 2012 * functions which are not suitable for hard interrupt context on 2013 * PREEMPT_RT. 2014 * 2015 * The hrtimer_sleeper callback is RT compatible in hard interrupt 2016 * context, but there is a latency concern: Untrusted userspace can 2017 * spawn many threads which arm timers for the same expiry time on 2018 * the same CPU. That causes a latency spike due to the wakeup of 2019 * a gazillion threads. 2020 * 2021 * OTOH, privileged real-time user space applications rely on the 2022 * low latency of hard interrupt wakeups. If the current task is in 2023 * a real-time scheduling class, mark the mode for hard interrupt 2024 * expiry. 2025 */ 2026 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 2027 if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) 2028 mode |= HRTIMER_MODE_HARD; 2029 } 2030 2031 __hrtimer_init(&sl->timer, clock_id, mode); 2032 sl->timer.function = hrtimer_wakeup; 2033 sl->task = current; 2034 } 2035 2036 /** 2037 * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory 2038 * @sl: sleeper to be initialized 2039 * @clock_id: the clock to be used 2040 * @mode: timer mode abs/rel 2041 */ 2042 void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, 2043 clockid_t clock_id, enum hrtimer_mode mode) 2044 { 2045 debug_init_on_stack(&sl->timer, clock_id, mode); 2046 __hrtimer_init_sleeper(sl, clock_id, mode); 2047 } 2048 EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); 2049 2050 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) 2051 { 2052 switch(restart->nanosleep.type) { 2053 #ifdef CONFIG_COMPAT_32BIT_TIME 2054 case TT_COMPAT: 2055 if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) 2056 return -EFAULT; 2057 break; 2058 #endif 2059 case TT_NATIVE: 2060 if (put_timespec64(ts, restart->nanosleep.rmtp)) 2061 return -EFAULT; 2062 break; 2063 default: 2064 BUG(); 2065 } 2066 return -ERESTART_RESTARTBLOCK; 2067 } 2068 2069 static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 2070 { 2071 struct restart_block *restart; 2072 2073 do { 2074 set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 2075 hrtimer_sleeper_start_expires(t, mode); 2076 2077 if (likely(t->task)) 2078 schedule(); 2079 2080 hrtimer_cancel(&t->timer); 2081 mode = HRTIMER_MODE_ABS; 2082 2083 } while (t->task && !signal_pending(current)); 2084 2085 __set_current_state(TASK_RUNNING); 2086 2087 if (!t->task) 2088 return 0; 2089 2090 restart = ¤t->restart_block; 2091 if (restart->nanosleep.type != TT_NONE) { 2092 ktime_t rem = hrtimer_expires_remaining(&t->timer); 2093 struct timespec64 rmt; 2094 2095 if (rem <= 0) 2096 return 0; 2097 rmt = ktime_to_timespec64(rem); 2098 2099 return nanosleep_copyout(restart, &rmt); 2100 } 2101 return -ERESTART_RESTARTBLOCK; 2102 } 2103 2104 static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) 2105 { 2106 struct hrtimer_sleeper t; 2107 int ret; 2108 2109 hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); 2110 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); 2111 ret = do_nanosleep(&t, HRTIMER_MODE_ABS); 2112 destroy_hrtimer_on_stack(&t.timer); 2113 return ret; 2114 } 2115 2116 long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, 2117 const clockid_t clockid) 2118 { 2119 struct restart_block *restart; 2120 struct hrtimer_sleeper t; 2121 int ret = 0; 2122 2123 hrtimer_setup_sleeper_on_stack(&t, clockid, mode); 2124 hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); 2125 ret = do_nanosleep(&t, mode); 2126 if (ret != -ERESTART_RESTARTBLOCK) 2127 goto out; 2128 2129 /* Absolute timers do not update the rmtp value and restart: */ 2130 if (mode == HRTIMER_MODE_ABS) { 2131 ret = -ERESTARTNOHAND; 2132 goto out; 2133 } 2134 2135 restart = ¤t->restart_block; 2136 restart->nanosleep.clockid = t.timer.base->clockid; 2137 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); 2138 set_restart_fn(restart, hrtimer_nanosleep_restart); 2139 out: 2140 destroy_hrtimer_on_stack(&t.timer); 2141 return ret; 2142 } 2143 2144 #ifdef CONFIG_64BIT 2145 2146 SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, 2147 struct __kernel_timespec __user *, rmtp) 2148 { 2149 struct timespec64 tu; 2150 2151 if (get_timespec64(&tu, rqtp)) 2152 return -EFAULT; 2153 2154 if (!timespec64_valid(&tu)) 2155 return -EINVAL; 2156 2157 current->restart_block.fn = do_no_restart_syscall; 2158 current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; 2159 current->restart_block.nanosleep.rmtp = rmtp; 2160 return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, 2161 CLOCK_MONOTONIC); 2162 } 2163 2164 #endif 2165 2166 #ifdef CONFIG_COMPAT_32BIT_TIME 2167 2168 SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, 2169 struct old_timespec32 __user *, rmtp) 2170 { 2171 struct timespec64 tu; 2172 2173 if (get_old_timespec32(&tu, rqtp)) 2174 return -EFAULT; 2175 2176 if (!timespec64_valid(&tu)) 2177 return -EINVAL; 2178 2179 current->restart_block.fn = do_no_restart_syscall; 2180 current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; 2181 current->restart_block.nanosleep.compat_rmtp = rmtp; 2182 return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, 2183 CLOCK_MONOTONIC); 2184 } 2185 #endif 2186 2187 /* 2188 * Functions related to boot-time initialization: 2189 */ 2190 int hrtimers_prepare_cpu(unsigned int cpu) 2191 { 2192 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 2193 int i; 2194 2195 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 2196 struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; 2197 2198 clock_b->cpu_base = cpu_base; 2199 seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); 2200 timerqueue_init_head(&clock_b->active); 2201 } 2202 2203 cpu_base->cpu = cpu; 2204 hrtimer_cpu_base_init_expiry_lock(cpu_base); 2205 return 0; 2206 } 2207 2208 int hrtimers_cpu_starting(unsigned int cpu) 2209 { 2210 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 2211 2212 /* Clear out any left over state from a CPU down operation */ 2213 cpu_base->active_bases = 0; 2214 cpu_base->hres_active = 0; 2215 cpu_base->hang_detected = 0; 2216 cpu_base->next_timer = NULL; 2217 cpu_base->softirq_next_timer = NULL; 2218 cpu_base->expires_next = KTIME_MAX; 2219 cpu_base->softirq_expires_next = KTIME_MAX; 2220 cpu_base->online = 1; 2221 return 0; 2222 } 2223 2224 #ifdef CONFIG_HOTPLUG_CPU 2225 2226 static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 2227 struct hrtimer_clock_base *new_base) 2228 { 2229 struct hrtimer *timer; 2230 struct timerqueue_node *node; 2231 2232 while ((node = timerqueue_getnext(&old_base->active))) { 2233 timer = container_of(node, struct hrtimer, node); 2234 BUG_ON(hrtimer_callback_running(timer)); 2235 debug_deactivate(timer); 2236 2237 /* 2238 * Mark it as ENQUEUED not INACTIVE otherwise the 2239 * timer could be seen as !active and just vanish away 2240 * under us on another CPU 2241 */ 2242 __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); 2243 timer->base = new_base; 2244 /* 2245 * Enqueue the timers on the new cpu. This does not 2246 * reprogram the event device in case the timer 2247 * expires before the earliest on this CPU, but we run 2248 * hrtimer_interrupt after we migrated everything to 2249 * sort out already expired timers and reprogram the 2250 * event device. 2251 */ 2252 enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); 2253 } 2254 } 2255 2256 int hrtimers_cpu_dying(unsigned int dying_cpu) 2257 { 2258 int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); 2259 struct hrtimer_cpu_base *old_base, *new_base; 2260 2261 old_base = this_cpu_ptr(&hrtimer_bases); 2262 new_base = &per_cpu(hrtimer_bases, ncpu); 2263 2264 /* 2265 * The caller is globally serialized and nobody else 2266 * takes two locks at once, deadlock is not possible. 2267 */ 2268 raw_spin_lock(&old_base->lock); 2269 raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); 2270 2271 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 2272 migrate_hrtimer_list(&old_base->clock_base[i], 2273 &new_base->clock_base[i]); 2274 } 2275 2276 /* 2277 * The migration might have changed the first expiring softirq 2278 * timer on this CPU. Update it. 2279 */ 2280 __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); 2281 /* Tell the other CPU to retrigger the next event */ 2282 smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); 2283 2284 raw_spin_unlock(&new_base->lock); 2285 old_base->online = 0; 2286 raw_spin_unlock(&old_base->lock); 2287 2288 return 0; 2289 } 2290 2291 #endif /* CONFIG_HOTPLUG_CPU */ 2292 2293 void __init hrtimers_init(void) 2294 { 2295 hrtimers_prepare_cpu(smp_processor_id()); 2296 hrtimers_cpu_starting(smp_processor_id()); 2297 open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); 2298 } 2299