// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
 *
 * NOHZ implementation for low and high resolution timers
 *
 * Started by: Thomas Gleixner and Ingo Molnar
 */
#include <linux/compiler.h>
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
#include <linux/nmi.h>
#include <linux/profile.h>
#include <linux/sched/signal.h>
#include <linux/sched/clock.h>
#include <linux/sched/stat.h>
#include <linux/sched/nohz.h>
#include <linux/sched/loadavg.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/context_tracking.h>
#include <linux/mm.h>

#include <asm/irq_regs.h>

#include "tick-internal.h"

#include <trace/events/timer.h>

/*
 * Per-CPU nohz control structure
 */
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);

struct tick_sched *tick_get_tick_sched(int cpu)
{
	return &per_cpu(tick_cpu_sched, cpu);
}

/*
 * The time when the last jiffy update happened. Write access must hold
 * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
 * consistent view of jiffies and last_jiffies_update.
 */
static ktime_t last_jiffies_update;

/*
 * Must be called with interrupts disabled!
 */
static void tick_do_update_jiffies64(ktime_t now)
{
	unsigned long ticks = 1;
	ktime_t delta, nextp;

	/*
	 * 64-bit can do a quick check without holding the jiffies lock and
	 * without looking at the sequence count. The smp_load_acquire()
	 * pairs with the update done later in this function.
	 *
	 * 32-bit cannot do that because the store of 'tick_next_period'
	 * consists of two 32-bit stores, and the first store could be
	 * moved by the CPU to a random point in the future.
	 */
	if (IS_ENABLED(CONFIG_64BIT)) {
		if (ktime_before(now, smp_load_acquire(&tick_next_period)))
			return;
	} else {
		unsigned int seq;

		/*
		 * Avoid contention on 'jiffies_lock' and protect the quick
		 * check with the sequence count.
		 */
		do {
			seq = read_seqcount_begin(&jiffies_seq);
			nextp = tick_next_period;
		} while (read_seqcount_retry(&jiffies_seq, seq));

		if (ktime_before(now, nextp))
			return;
	}

	/* Quick check failed, i.e. update is required. */
	raw_spin_lock(&jiffies_lock);
	/*
	 * Re-evaluate with the lock held. Another CPU might have done the
	 * update already.
	 */
	if (ktime_before(now, tick_next_period)) {
		raw_spin_unlock(&jiffies_lock);
		return;
	}

	write_seqcount_begin(&jiffies_seq);

	delta = ktime_sub(now, tick_next_period);
	if (unlikely(delta >= TICK_NSEC)) {
		/* Slow path for long idle sleep times */
		s64 incr = TICK_NSEC;

		ticks += ktime_divns(delta, incr);

		last_jiffies_update = ktime_add_ns(last_jiffies_update,
						   incr * ticks);
	} else {
		last_jiffies_update = ktime_add_ns(last_jiffies_update,
						   TICK_NSEC);
	}

	/* Advance jiffies to complete the 'jiffies_seq' protected job */
	jiffies_64 += ticks;

	/* Keep the tick_next_period variable up to date */
	nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);

	if (IS_ENABLED(CONFIG_64BIT)) {
		/*
		 * Pairs with smp_load_acquire() in the lockless quick
		 * check above, and ensures that the update to 'jiffies_64' is
		 * not reordered vs. the store to 'tick_next_period', neither
		 * by the compiler nor by the CPU.
		 */
		smp_store_release(&tick_next_period, nextp);
	} else {
		/*
		 * A plain store is good enough on 32-bit, as the quick check
		 * above is protected by the sequence count.
		 */
		tick_next_period = nextp;
	}

	/*
	 * Release the sequence count. calc_global_load() below is not
	 * protected by it, but 'jiffies_lock' needs to be held to prevent
	 * concurrent invocations.
	 */
	write_seqcount_end(&jiffies_seq);

	calc_global_load();

	raw_spin_unlock(&jiffies_lock);
	update_wall_time();
}

/*
 * Initialize and return the jiffies update.
 */
static ktime_t tick_init_jiffy_update(void)
{
	ktime_t period;

	raw_spin_lock(&jiffies_lock);
	write_seqcount_begin(&jiffies_seq);

	/* Have we started the jiffies update yet? */
	if (last_jiffies_update == 0) {
		u32 rem;

		/*
		 * Ensure that the tick is aligned to a multiple of
		 * TICK_NSEC.
		 */
		div_u64_rem(tick_next_period, TICK_NSEC, &rem);
		if (rem)
			tick_next_period += TICK_NSEC - rem;

		last_jiffies_update = tick_next_period;
	}
	period = last_jiffies_update;

	write_seqcount_end(&jiffies_seq);
	raw_spin_unlock(&jiffies_lock);

	return period;
}

static inline int tick_sched_flag_test(struct tick_sched *ts,
				       unsigned long flag)
{
	return !!(ts->flags & flag);
}

static inline void tick_sched_flag_set(struct tick_sched *ts,
				       unsigned long flag)
{
	lockdep_assert_irqs_disabled();
	ts->flags |= flag;
}

static inline void tick_sched_flag_clear(struct tick_sched *ts,
					 unsigned long flag)
{
	lockdep_assert_irqs_disabled();
	ts->flags &= ~flag;
}

#define MAX_STALLED_JIFFIES 5

static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
	int tick_cpu, cpu = smp_processor_id();

	/*
	 * Check if the do_timer duty was dropped. We don't care about
	 * concurrency: This happens only when the CPU in charge went
	 * into a long sleep. If two CPUs happen to assign themselves to
	 * this duty, then the jiffies update is still serialized by
	 * 'jiffies_lock'.
	 *
	 * If nohz_full is enabled, this should not happen because the
	 * 'tick_do_timer_cpu' CPU never relinquishes.
	 */
	tick_cpu = READ_ONCE(tick_do_timer_cpu);

	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
		WARN_ON_ONCE(tick_nohz_full_running);
#endif
		WRITE_ONCE(tick_do_timer_cpu, cpu);
		tick_cpu = cpu;
	}

	/* Check if jiffies need an update */
	if (tick_cpu == cpu)
		tick_do_update_jiffies64(now);

	/*
	 * If the jiffies update stalled for too long (timekeeper in stop_machine()
	 * or VMEXIT'ed for several msecs), force an update.
	 */
	if (ts->last_tick_jiffies != jiffies) {
		ts->stalled_jiffies = 0;
		ts->last_tick_jiffies = READ_ONCE(jiffies);
	} else {
		if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
			tick_do_update_jiffies64(now);
			ts->stalled_jiffies = 0;
			ts->last_tick_jiffies = READ_ONCE(jiffies);
		}
	}

	if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
		ts->got_idle_tick = 1;
}

static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
{
	/*
	 * When we are idle and the tick is stopped, we have to touch
	 * the watchdog as we might not schedule for a really long
	 * time. This happens on completely idle SMP systems while
	 * waiting on the login prompt. We also increment the "start of
	 * idle" jiffy stamp so the idle accounting adjustment we do
	 * when we go busy again does not account too many ticks.
	 */
	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
	    tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
		touch_softlockup_watchdog_sched();
		if (is_idle_task(current))
			ts->idle_jiffies++;
		/*
		 * In case the current tick fired too early past its expected
		 * expiration, make sure we don't bypass the next clock reprogramming
		 * to the same deadline.
		 */
		ts->next_tick = 0;
	}

	update_process_times(user_mode(regs));
	profile_tick(CPU_PROFILING);
}

/*
 * We rearm the timer until we get disabled by the idle code.
 * Called with interrupts disabled.
 */
static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer)
{
	struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer);
	struct pt_regs *regs = get_irq_regs();
	ktime_t now = ktime_get();

	tick_sched_do_timer(ts, now);

	/*
	 * Do not call when we are not in IRQ context and have
	 * no valid 'regs' pointer
	 */
	if (regs)
		tick_sched_handle(ts, regs);
	else
		ts->next_tick = 0;

	/*
	 * In dynticks mode, tick reprogram is deferred:
	 * - to the idle task if in dynticks-idle
	 * - to IRQ exit if in full-dynticks.
	 */
	if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED)))
		return HRTIMER_NORESTART;

	hrtimer_forward(timer, now, TICK_NSEC);

	return HRTIMER_RESTART;
}

#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
bool tick_nohz_full_running;
EXPORT_SYMBOL_GPL(tick_nohz_full_running);
static atomic_t tick_dep_mask;

static bool check_tick_dependency(atomic_t *dep)
{
	int val = atomic_read(dep);

	if (val & TICK_DEP_MASK_POSIX_TIMER) {
		trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
		return true;
	}

	if (val & TICK_DEP_MASK_PERF_EVENTS) {
		trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
		return true;
	}

	if (val & TICK_DEP_MASK_SCHED) {
		trace_tick_stop(0, TICK_DEP_MASK_SCHED);
		return true;
	}

	if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) {
		trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
		return true;
	}

	if (val & TICK_DEP_MASK_RCU) {
		trace_tick_stop(0, TICK_DEP_MASK_RCU);
		return true;
	}

	if (val & TICK_DEP_MASK_RCU_EXP) {
		trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP);
		return true;
	}

	return false;
}

static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
{
	lockdep_assert_irqs_disabled();

	if (unlikely(!cpu_online(cpu)))
		return false;

	if (check_tick_dependency(&tick_dep_mask))
		return false;

	if (check_tick_dependency(&ts->tick_dep_mask))
		return false;

	if (check_tick_dependency(&current->tick_dep_mask))
		return false;

	if (check_tick_dependency(&current->signal->tick_dep_mask))
		return false;

	return true;
}

static void nohz_full_kick_func(struct irq_work *work)
{
	/* Empty, the tick restart happens on tick_nohz_irq_exit() */
}

static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
	IRQ_WORK_INIT_HARD(nohz_full_kick_func);

/*
 * Kick this CPU if it's full dynticks in order to force it to
 * re-evaluate its dependency on the tick and restart it if necessary.
 * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
 * is NMI safe.
 */
static void tick_nohz_full_kick(void)
{
	if (!tick_nohz_full_cpu(smp_processor_id()))
		return;

	irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
}

/*
 * Kick the CPU if it's full dynticks in order to force it to
 * re-evaluate its dependency on the tick and restart it if necessary.
 */
void tick_nohz_full_kick_cpu(int cpu)
{
	if (!tick_nohz_full_cpu(cpu))
		return;

	irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}

static void tick_nohz_kick_task(struct task_struct *tsk)
{
	int cpu;

	/*
	 * If the task is not running, run_posix_cpu_timers()
	 * has nothing to elapse, and an IPI can then be optimized out.
	 *
	 * activate_task()                        STORE p->tick_dep_mask
	 *   STORE p->on_rq
	 * __schedule() (switch to task 'p')      smp_mb() (atomic_fetch_or())
	 *   LOCK rq->lock                        LOAD p->on_rq
	 *   smp_mb__after_spin_lock()
	 *   tick_nohz_task_switch()
	 *     LOAD p->tick_dep_mask
	 */
	if (!sched_task_on_rq(tsk))
		return;

	/*
	 * If the task concurrently migrates to another CPU,
	 * we guarantee it sees the new tick dependency upon
	 * schedule.
	 *
	 * set_task_cpu(p, cpu);
	 *   STORE p->cpu = @cpu
	 * __schedule() (switch to task 'p')
	 *   LOCK rq->lock
	 *   smp_mb__after_spin_lock()           STORE p->tick_dep_mask
	 *   tick_nohz_task_switch()             smp_mb() (atomic_fetch_or())
	 *     LOAD p->tick_dep_mask             LOAD p->cpu
	 */
	cpu = task_cpu(tsk);

	preempt_disable();
	if (cpu_online(cpu))
		tick_nohz_full_kick_cpu(cpu);
	preempt_enable();
}

/*
 * Kick all full dynticks CPUs in order to force these to re-evaluate
 * their dependency on the tick and restart it if necessary.
 */
static void tick_nohz_full_kick_all(void)
{
	int cpu;

	if (!tick_nohz_full_running)
		return;

	preempt_disable();
	for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
		tick_nohz_full_kick_cpu(cpu);
	preempt_enable();
}

static void tick_nohz_dep_set_all(atomic_t *dep,
				  enum tick_dep_bits bit)
{
	int prev;

	prev = atomic_fetch_or(BIT(bit), dep);
	if (!prev)
		tick_nohz_full_kick_all();
}

/*
 * Set a global tick dependency. Used by perf events that rely on freq and
 * unstable clocks.
 */
void tick_nohz_dep_set(enum tick_dep_bits bit)
{
	tick_nohz_dep_set_all(&tick_dep_mask, bit);
}

void tick_nohz_dep_clear(enum tick_dep_bits bit)
{
	atomic_andnot(BIT(bit), &tick_dep_mask);
}

/*
 * Set per-CPU tick dependency. Used by scheduler and perf events in order to
 * manage event throttling.
 */
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
	int prev;
	struct tick_sched *ts;

	ts = per_cpu_ptr(&tick_cpu_sched, cpu);

	prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask);
	if (!prev) {
		preempt_disable();
		/* Perf needs local kick that is NMI safe */
		if (cpu == smp_processor_id()) {
			tick_nohz_full_kick();
		} else {
			/* Remote IRQ work not NMI-safe */
			if (!WARN_ON_ONCE(in_nmi()))
				tick_nohz_full_kick_cpu(cpu);
		}
		preempt_enable();
	}
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);

void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

	atomic_andnot(BIT(bit), &ts->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);

/*
 * Set a per-task tick dependency. RCU needs this. Also posix CPU timers
 * in order to elapse per task timers.
 */
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
	if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
		tick_nohz_kick_task(tsk);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);

void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
	atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);

/*
 * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
 * per process timers.
 */
void tick_nohz_dep_set_signal(struct task_struct *tsk,
			      enum tick_dep_bits bit)
{
	int prev;
	struct signal_struct *sig = tsk->signal;

	prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
	if (!prev) {
		struct task_struct *t;

		lockdep_assert_held(&tsk->sighand->siglock);
		__for_each_thread(sig, t)
			tick_nohz_kick_task(t);
	}
}

void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
	atomic_andnot(BIT(bit), &sig->tick_dep_mask);
}

/*
 * Re-evaluate the need for the tick as we switch the current task.
 * It might need the tick due to per task/process properties:
 * perf events, posix CPU timers, ...
 */
void __tick_nohz_task_switch(void)
{
	struct tick_sched *ts;

	if (!tick_nohz_full_cpu(smp_processor_id()))
		return;

	ts = this_cpu_ptr(&tick_cpu_sched);

	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
		if (atomic_read(&current->tick_dep_mask) ||
		    atomic_read(&current->signal->tick_dep_mask))
			tick_nohz_full_kick();
	}
}

/* Get the boot-time nohz CPU list from the kernel parameters. */
void __init tick_nohz_full_setup(cpumask_var_t cpumask)
{
	alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
	cpumask_copy(tick_nohz_full_mask, cpumask);
	tick_nohz_full_running = true;
}

bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
{
	/*
	 * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound
	 * timers, workqueues, timekeeping, ...) on behalf of full dynticks
	 * CPUs. It must remain online when nohz full is enabled.
	 */
	if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu)
		return false;
	return true;
}

static int tick_nohz_cpu_down(unsigned int cpu)
{
	return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY;
}

void __init tick_nohz_init(void)
{
	int cpu, ret;

	if (!tick_nohz_full_running)
		return;

	/*
	 * Full dynticks uses IRQ work to drive the tick rescheduling on safe
	 * locking contexts. But then we need IRQ work to raise its own
	 * interrupts to avoid circular dependency on the tick.
	 */
	if (!arch_irq_work_has_interrupt()) {
		pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
		cpumask_clear(tick_nohz_full_mask);
		tick_nohz_full_running = false;
		return;
	}

	if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
	    !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
		cpu = smp_processor_id();

		if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
			pr_warn("NO_HZ: Clearing %d from nohz_full range "
				"for timekeeping\n", cpu);
			cpumask_clear_cpu(cpu, tick_nohz_full_mask);
		}
	}

	for_each_cpu(cpu, tick_nohz_full_mask)
		ct_cpu_track_user(cpu);

	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"kernel/nohz:predown", NULL,
					tick_nohz_cpu_down);
	WARN_ON(ret < 0);
	pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
		cpumask_pr_args(tick_nohz_full_mask));
}
#endif /* #ifdef CONFIG_NO_HZ_FULL */

/*
 * NOHZ - aka dynamic tick functionality
 */
#ifdef CONFIG_NO_HZ_COMMON
/*
 * NO HZ enabled?
 */
bool tick_nohz_enabled __read_mostly = true;
unsigned long tick_nohz_active __read_mostly;
/*
 * Enable / Disable tickless mode
 */
static int __init setup_tick_nohz(char *str)
{
	return (kstrtobool(str, &tick_nohz_enabled) == 0);
}

__setup("nohz=", setup_tick_nohz);

bool tick_nohz_tick_stopped(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
}

bool tick_nohz_tick_stopped_cpu(int cpu)
{
	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

	return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
}

/**
 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
 * @now: current ktime_t
 *
 * Called from interrupt entry when the CPU was idle
 *
 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
 * must be updated. Otherwise an interrupt handler could use a stale jiffy
 * value. We do this unconditionally on any CPU, as we don't know whether the
 * CPU, which has the update task assigned, is in a long sleep.
 */
static void tick_nohz_update_jiffies(ktime_t now)
{
	unsigned long flags;

	__this_cpu_write(tick_cpu_sched.idle_waketime, now);

	local_irq_save(flags);
	tick_do_update_jiffies64(now);
	local_irq_restore(flags);

	touch_softlockup_watchdog_sched();
}

static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
	ktime_t delta;

	if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)))
		return;

	delta = ktime_sub(now, ts->idle_entrytime);

	write_seqcount_begin(&ts->idle_sleeptime_seq);
	if (nr_iowait_cpu(smp_processor_id()) > 0)
		ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
	else
		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);

	ts->idle_entrytime = now;
	tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
	write_seqcount_end(&ts->idle_sleeptime_seq);

	sched_clock_idle_wakeup_event();
}

static void tick_nohz_start_idle(struct tick_sched *ts)
{
	write_seqcount_begin(&ts->idle_sleeptime_seq);
	ts->idle_entrytime = ktime_get();
	tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
	write_seqcount_end(&ts->idle_sleeptime_seq);

	sched_clock_idle_sleep_event();
}

static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
				 bool compute_delta, u64 *last_update_time)
{
	ktime_t now, idle;
	unsigned int seq;

	if (!tick_nohz_active)
		return -1;

	now = ktime_get();
	if (last_update_time)
		*last_update_time = ktime_to_us(now);

	do {
		seq = read_seqcount_begin(&ts->idle_sleeptime_seq);

		if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) {
			ktime_t delta = ktime_sub(now, ts->idle_entrytime);

			idle = ktime_add(*sleeptime, delta);
		} else {
			idle = *sleeptime;
		}
	} while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));

	return ktime_to_us(idle);
}

/**
 * get_cpu_idle_time_us - get the total idle time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative idle time (since boot) for a given
 * CPU, in microseconds. Note that this is partially broken due to
 * the counter of iowait tasks that can be remotely updated without
 * any synchronization. Therefore it is possible to observe backward
 * values within two consecutive reads.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu
 */
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);

	return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
				     !nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);

/**
 * get_cpu_iowait_time_us - get the total iowait time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative iowait time (since boot) for a given
 * CPU, in microseconds. Note this is partially broken due to
 * the counter of iowait tasks that can be remotely updated without
 * any synchronization. Therefore it is possible to observe backward
 * values within two consecutive reads.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu
 */
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);

	return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
				     nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);

static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
	hrtimer_cancel(&ts->sched_timer);
	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);

	/* Forward the time to expire in the future */
	hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);

	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
		hrtimer_start_expires(&ts->sched_timer,
				      HRTIMER_MODE_ABS_PINNED_HARD);
	} else {
		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
	}

	/*
	 * Reset to make sure the next tick stop doesn't get fooled by past
	 * cached clock deadline.
	 */
	ts->next_tick = 0;
}

static inline bool local_timer_softirq_pending(void)
{
	return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
}

/*
 * Read jiffies and the time when jiffies were updated last
 */
u64 get_jiffies_update(unsigned long *basej)
{
	unsigned long basejiff;
	unsigned int seq;
	u64 basemono;

	do {
		seq = read_seqcount_begin(&jiffies_seq);
		basemono = last_jiffies_update;
		basejiff = jiffies;
	} while (read_seqcount_retry(&jiffies_seq, seq));
	*basej = basejiff;
	return basemono;
}

/**
 * tick_nohz_next_event() - return the clock monotonic based next event
 * @ts: pointer to tick_sched struct
 * @cpu: CPU number
 *
 * Return:
 * *%0		- When the next event is a maximum of TICK_NSEC in the future
 *		  and the tick is not stopped yet
 * *%next_event	- Next event based on clock monotonic
 */
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
	u64 basemono, next_tick, delta, expires;
	unsigned long basejiff;
	int tick_cpu;

	basemono = get_jiffies_update(&basejiff);
	ts->last_jiffies = basejiff;
	ts->timer_expires_base = basemono;

	/*
	 * Keep the periodic tick, when RCU, architecture or irq_work
	 * requests it.
	 * Aside of that, check whether the local timer softirq is
	 * pending. If so, it's a bad idea to call get_next_timer_interrupt(),
	 * because there is an already expired timer, so it will request
	 * immediate expiry, which rearms the hardware timer with a
	 * minimal delta, which brings us back to this place
	 * immediately. Lather, rinse and repeat...
	 */
	if (rcu_needs_cpu() || arch_needs_cpu() ||
	    irq_work_needs_cpu() || local_timer_softirq_pending()) {
		next_tick = basemono + TICK_NSEC;
	} else {
		/*
		 * Get the next pending timer. If high resolution
		 * timers are enabled this only takes the timer wheel
		 * timers into account. If high resolution timers are
		 * disabled this also looks at the next expiring
		 * hrtimer.
		 */
		next_tick = get_next_timer_interrupt(basejiff, basemono);
		ts->next_timer = next_tick;
	}

	/* Make sure next_tick is never before basemono! */
	if (WARN_ON_ONCE(basemono > next_tick))
		next_tick = basemono;

	/*
	 * If the tick is due in the next period, keep it ticking or
	 * force prod the timer.
	 */
	delta = next_tick - basemono;
	if (delta <= (u64)TICK_NSEC) {
		/*
		 * We've not stopped the tick yet, and there's a timer in the
		 * next period, so no point in stopping it either, bail.
		 */
		if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
			ts->timer_expires = 0;
			goto out;
		}
	}

	/*
	 * If this CPU is the one which had the do_timer() duty last, we limit
	 * the sleep time to the timekeeping 'max_deferment' value.
	 * Otherwise we can sleep as long as we want.
	 */
	delta = timekeeping_max_deferment();
	tick_cpu = READ_ONCE(tick_do_timer_cpu);
	if (tick_cpu != cpu &&
	    (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST)))
		delta = KTIME_MAX;

	/* Calculate the next expiry time */
	if (delta < (KTIME_MAX - basemono))
		expires = basemono + delta;
	else
		expires = KTIME_MAX;

	ts->timer_expires = min_t(u64, expires, next_tick);

out:
	return ts->timer_expires;
}

static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
{
	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
	unsigned long basejiff = ts->last_jiffies;
	u64 basemono = ts->timer_expires_base;
	bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
	int tick_cpu;
	u64 expires;

	/* Make sure we won't be trying to stop it twice in a row. */
	ts->timer_expires_base = 0;

	/*
	 * Now the tick should be stopped definitely - so the timer base needs
	 * to be marked idle as well to not miss a newly queued timer.
	 */
	expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle);
	if (expires > ts->timer_expires) {
		/*
		 * This path could only happen when the first timer was removed
		 * between calculating the possible sleep length and now (when
		 * high resolution mode is not active, timer could also be a
		 * hrtimer).
		 *
		 * We have to stick to the original calculated expiry value to
		 * not stop the tick for too long with a shallow C-state (which
		 * was programmed by cpuidle because of an early next expiration
		 * value).
		 */
		expires = ts->timer_expires;
	}

	/* If the timer base is not idle, retain the not yet stopped tick. */
	if (!timer_idle)
		return;

	/*
	 * If this CPU is the one which updates jiffies, then give up
	 * the assignment and let it be taken by the CPU which runs
	 * the tick timer next, which might be this CPU as well. If we
	 * don't drop this here, the jiffies might be stale and
	 * do_timer() never gets invoked. Keep track of the fact that it
	 * was the one which had the do_timer() duty last.
	 */
	tick_cpu = READ_ONCE(tick_do_timer_cpu);
	if (tick_cpu == cpu) {
		WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE);
		tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST);
	} else if (tick_cpu != TICK_DO_TIMER_NONE) {
		tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST);
	}

	/* Skip reprogram of event if it's not changed */
	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) {
		/* Sanity check: make sure clockevent is actually programmed */
		if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
			return;

		WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu "
			  "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick,
			  dev->next_event, hrtimer_active(&ts->sched_timer),
			  hrtimer_get_expires(&ts->sched_timer));
	}

	/*
	 * tick_nohz_stop_tick() can be called several times before
	 * tick_nohz_restart_sched_tick() is called. This happens when
	 * interrupts arrive which do not cause a reschedule. In the first
	 * call we save the current tick time, so we can restart the
	 * scheduler tick in tick_nohz_restart_sched_tick().
	 */
	if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
		calc_load_nohz_start();
		quiet_vmstat();

		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
		tick_sched_flag_set(ts, TS_FLAG_STOPPED);
		trace_tick_stop(1, TICK_DEP_MASK_NONE);
	}

	ts->next_tick = expires;

	/*
	 * If the expiration time == KTIME_MAX, then we simply stop
	 * the tick timer.
	 */
	if (unlikely(expires == KTIME_MAX)) {
		if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
			hrtimer_cancel(&ts->sched_timer);
		else
			tick_program_event(KTIME_MAX, 1);
		return;
	}

	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
		hrtimer_start(&ts->sched_timer, expires,
			      HRTIMER_MODE_ABS_PINNED_HARD);
	} else {
		hrtimer_set_expires(&ts->sched_timer, expires);
		tick_program_event(expires, 1);
	}
}

static void tick_nohz_retain_tick(struct tick_sched *ts)
{
	ts->timer_expires_base = 0;
}

#ifdef CONFIG_NO_HZ_FULL
static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu)
{
	if (tick_nohz_next_event(ts, cpu))
		tick_nohz_stop_tick(ts, cpu);
	else
		tick_nohz_retain_tick(ts);
}
#endif /* CONFIG_NO_HZ_FULL */

static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
	/* Update jiffies first */
	tick_do_update_jiffies64(now);

	/*
	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
	 * the clock forward checks in the enqueue path:
	 */
	timer_clear_idle();

	calc_load_nohz_stop();
	touch_softlockup_watchdog_sched();

	/* Cancel the scheduled timer and restore the tick: */
	tick_sched_flag_clear(ts, TS_FLAG_STOPPED);
	tick_nohz_restart(ts, now);
}

static void __tick_nohz_full_update_tick(struct tick_sched *ts,
					 ktime_t now)
{
#ifdef CONFIG_NO_HZ_FULL
	int cpu = smp_processor_id();

	if (can_stop_full_tick(cpu, ts))
		tick_nohz_full_stop_tick(ts, cpu);
	else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
		tick_nohz_restart_sched_tick(ts, now);
#endif
}

static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
	if (!tick_nohz_full_cpu(smp_processor_id()))
		return;

	if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ))
		return;

	__tick_nohz_full_update_tick(ts, ktime_get());
}

/*
 * A pending softirq outside an IRQ (or softirq disabled section) context
 * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
 * reach this code due to the need_resched() early check in can_stop_idle_tick().
 *
 * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the
 * cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
 * triggering the code below, since wakeup_softirqd() is ignored.
 */
static bool report_idle_softirq(void)
{
	static int ratelimit;
	unsigned int pending = local_softirq_pending();

	if (likely(!pending))
		return false;

	/* Some softirqs claim to be safe against hotplug and ksoftirqd parking */
	if (!cpu_active(smp_processor_id())) {
		pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK;
		if (!pending)
			return false;
	}

	if (ratelimit >= 10)
		return false;

	/* On RT, softirq handling may be waiting on some lock */
	if (local_bh_blocked())
		return false;

	pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
		pending);
	ratelimit++;

	return true;
}

static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
	WARN_ON_ONCE(cpu_is_offline(cpu));

	if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ)))
		return false;

	if (need_resched())
		return false;

	if (unlikely(report_idle_softirq()))
		return false;

	if (tick_nohz_full_enabled()) {
		int tick_cpu = READ_ONCE(tick_do_timer_cpu);

		/*
		 * Keep the tick alive to guarantee timekeeping progression
		 * if there are full dynticks CPUs around
		 */
		if (tick_cpu == cpu)
			return false;

		/* Should not happen for nohz-full */
		if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE))
			return false;
	}

	return true;
}

/**
 * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick
 */
void tick_nohz_idle_stop_tick(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	int cpu = smp_processor_id();
	ktime_t expires;

	/*
	 * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
	 * tick timer expiration time is known already.
	 */
	if (ts->timer_expires_base)
		expires = ts->timer_expires;
	else if (can_stop_idle_tick(cpu, ts))
		expires = tick_nohz_next_event(ts, cpu);
	else
		return;

	ts->idle_calls++;

	if (expires > 0LL) {
		int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);

		tick_nohz_stop_tick(ts, cpu);

		ts->idle_sleeps++;
		ts->idle_expires = expires;

		if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
			ts->idle_jiffies = ts->last_jiffies;
			nohz_balance_enter_idle(cpu);
		}
	} else {
		tick_nohz_retain_tick(ts);
	}
}

void tick_nohz_idle_retain_tick(void)
{
	tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
}

/**
 * tick_nohz_idle_enter - prepare for entering idle on the current CPU
 *
 * Called when we start the idle loop.
 */
void tick_nohz_idle_enter(void)
{
	struct tick_sched *ts;

	lockdep_assert_irqs_enabled();

	local_irq_disable();

	ts = this_cpu_ptr(&tick_cpu_sched);

	WARN_ON_ONCE(ts->timer_expires_base);

	tick_sched_flag_set(ts, TS_FLAG_INIDLE);
	tick_nohz_start_idle(ts);

	local_irq_enable();
}

/**
 * tick_nohz_irq_exit - Notify the tick about IRQ exit
 *
 * A timer may have been added/modified/deleted either by the current IRQ,
 * or by another place using this IRQ as a notification. This IRQ may have
 * also updated the RCU callback list. These events may require a
 * re-evaluation of the next tick. Depending on the context:
 *
 * 1) If the CPU is idle and no resched is pending, just proceed with idle
 *    time accounting. The next tick will be re-evaluated on the next idle
 *    loop iteration.
 *
 * 2) If the CPU is nohz_full:
 *
 *    2.1) If there is any tick dependency, restart the tick if stopped.
 *
 *    2.2) If there is no tick dependency, (re-)evaluate the next tick and
 *         stop/update it accordingly.
 */
void tick_nohz_irq_exit(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
		tick_nohz_start_idle(ts);
	else
		tick_nohz_full_update_tick(ts);
}

/**
 * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
 *
 * Return: %true if the tick handler has run, otherwise %false
 */
bool tick_nohz_idle_got_tick(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	if (ts->got_idle_tick) {
		ts->got_idle_tick = 0;
		return true;
	}
	return false;
}

/**
 * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
 * or the tick, whichever expires first. Note that, if the tick has been
 * stopped, it returns the next hrtimer.
 *
 * Called from power state control code with interrupts disabled
 *
 * Return: the next expiration time
 */
ktime_t tick_nohz_get_next_hrtimer(void)
{
	return __this_cpu_read(tick_cpu_device.evtdev)->next_event;
}

/**
 * tick_nohz_get_sleep_length - return the expected length of the current sleep
 * @delta_next: duration until the next event if the tick cannot be stopped
 *
 * Called from power state control code with interrupts disabled.
 *
 * The return value of this function and/or the value returned by it through the
 * @delta_next pointer can be negative which must be taken into account by its
 * callers.
 *
 * Return: the expected length of the current sleep
 */
ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	int cpu = smp_processor_id();
	/*
	 * The idle entry time is expected to be a sufficient approximation of
	 * the current time at this point.
	 */
	ktime_t now = ts->idle_entrytime;
	ktime_t next_event;

	WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));

	*delta_next = ktime_sub(dev->next_event, now);

	if (!can_stop_idle_tick(cpu, ts))
		return *delta_next;

	next_event = tick_nohz_next_event(ts, cpu);
	if (!next_event)
		return *delta_next;

	/*
	 * If the next highres timer to expire is earlier than 'next_event', the
	 * idle governor needs to know that.
	 */
	next_event = min_t(u64, next_event,
			   hrtimer_next_event_without(&ts->sched_timer));

	return ktime_sub(next_event, now);
}

/**
 * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
 * for a particular CPU.
 * @cpu: target CPU number
 *
 * Called from the schedutil frequency scaling governor in scheduler context.
 *
 * Return: the current idle calls counter value for @cpu
 */
unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
{
	struct tick_sched *ts = tick_get_tick_sched(cpu);

	return ts->idle_calls;
}

static void tick_nohz_account_idle_time(struct tick_sched *ts,
					ktime_t now)
{
	unsigned long ticks;

	ts->idle_exittime = now;

	if (vtime_accounting_enabled_this_cpu())
		return;
	/*
	 * We stopped the tick in idle. update_process_times() would miss the
	 * time we slept, as it does only a 1 tick accounting.
	 * Enforce that this is accounted to idle!
	 */
	ticks = jiffies - ts->idle_jiffies;
	/*
	 * We might be one off. Do not randomly account a huge number of ticks!
	 */
	if (ticks && ticks < LONG_MAX)
		account_idle_ticks(ticks);
}

void tick_nohz_idle_restart_tick(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
		ktime_t now = ktime_get();

		tick_nohz_restart_sched_tick(ts, now);
		tick_nohz_account_idle_time(ts, now);
	}
}

static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
{
	if (tick_nohz_full_cpu(smp_processor_id()))
		__tick_nohz_full_update_tick(ts, now);
	else
		tick_nohz_restart_sched_tick(ts, now);

	tick_nohz_account_idle_time(ts, now);
}

/**
 * tick_nohz_idle_exit - Update the tick upon idle task exit
 *
 * When the idle task exits, update the tick depending on the
 * following situations:
 *
 * 1) If the CPU is not in nohz_full mode (most cases), then
 *    restart the tick.
 *
 * 2) If the CPU is in nohz_full mode (corner case):
 *    2.1) If the tick can be kept stopped (no tick dependencies),
 *         then re-evaluate the next tick and try to keep it stopped
 *         as long as possible.
 *    2.2) If the tick has dependencies, restart the tick.
 */
void tick_nohz_idle_exit(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	bool idle_active, tick_stopped;
	ktime_t now;

	local_irq_disable();

	WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
	WARN_ON_ONCE(ts->timer_expires_base);

	tick_sched_flag_clear(ts, TS_FLAG_INIDLE);
	idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
	tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);

	if (idle_active || tick_stopped)
		now = ktime_get();

	if (idle_active)
		tick_nohz_stop_idle(ts, now);

	if (tick_stopped)
		tick_nohz_idle_update_tick(ts, now);

	local_irq_enable();
}

/*
 * In low-resolution mode, the tick handler must be implemented directly
 * at the clockevent level. hrtimer can't be used instead, because its
 * infrastructure actually relies on the tick itself as a backend in
 * low-resolution mode (see hrtimer_run_queues()).
 */
static void tick_nohz_lowres_handler(struct clock_event_device *dev)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	dev->next_event = KTIME_MAX;

	if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART))
		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}

static inline void tick_nohz_activate(struct tick_sched *ts)
{
	if (!tick_nohz_enabled)
		return;
	tick_sched_flag_set(ts, TS_FLAG_NOHZ);
	/* One update is enough */
	if (!test_and_set_bit(0, &tick_nohz_active))
		timers_update_nohz();
}

/**
 * tick_nohz_switch_to_nohz - switch to NOHZ mode
 */
static void tick_nohz_switch_to_nohz(void)
{
	if (!tick_nohz_enabled)
		return;

	if (tick_switch_to_oneshot(tick_nohz_lowres_handler))
		return;

	/*
	 * Recycle the hrtimer in 'ts', so we can share the
	 * highres code.
	 */
	tick_setup_sched_timer(false);
}

static inline void tick_nohz_irq_enter(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	ktime_t now;

	if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE))
		return;
	now = ktime_get();
	if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))
		tick_nohz_stop_idle(ts, now);
	/*
	 * If all CPUs are idle we may need to update a stale jiffies value.
	 * Note nohz_full is a special case: a timekeeper is guaranteed to stay
	 * alive but it might be busy looping with interrupts disabled in some
	 * rare case (typically stop machine). So we must make sure we have a
	 * last resort.
	 */
	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
		tick_nohz_update_jiffies(now);
}

#else

static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }
static inline void tick_nohz_activate(struct tick_sched *ts) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Called from irq_enter() to notify about the possible interruption of idle()
 */
void tick_irq_enter(void)
{
	tick_check_oneshot_broadcast_this_cpu();
	tick_nohz_irq_enter();
}

static int sched_skew_tick;

static int __init skew_tick(char *str)
{
	get_option(&str, &sched_skew_tick);

	return 0;
}
early_param("skew_tick", skew_tick);

/**
 * tick_setup_sched_timer - setup the tick emulation timer
 * @hrtimer: whether to use the hrtimer or not
 */
void tick_setup_sched_timer(bool hrtimer)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	/* Emulate tick processing via per-CPU hrtimers: */
	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);

	if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) {
		tick_sched_flag_set(ts, TS_FLAG_HIGHRES);
		ts->sched_timer.function = tick_nohz_handler;
	}

	/* Get the next period (per-CPU) */
	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());

	/* Offset the tick to avert 'jiffies_lock' contention. */
	if (sched_skew_tick) {
		u64 offset = TICK_NSEC >> 1;
		do_div(offset, num_possible_cpus());
		offset *= smp_processor_id();
		hrtimer_add_expires_ns(&ts->sched_timer, offset);
	}

	hrtimer_forward_now(&ts->sched_timer, TICK_NSEC);
	if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
	else
		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
	tick_nohz_activate(ts);
}

/*
 * Shut down the tick and make sure the CPU won't try to retake the timekeeping
 * duty before disabling IRQs in idle for the last time.
 */
void tick_sched_timer_dying(int cpu)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
	ktime_t idle_sleeptime, iowait_sleeptime;
	unsigned long idle_calls, idle_sleeps;

	/* This must happen before hrtimers are migrated! */
	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
		hrtimer_cancel(&ts->sched_timer);

	idle_sleeptime = ts->idle_sleeptime;
	iowait_sleeptime = ts->iowait_sleeptime;
	idle_calls = ts->idle_calls;
	idle_sleeps = ts->idle_sleeps;
	memset(ts, 0, sizeof(*ts));
	ts->idle_sleeptime = idle_sleeptime;
	ts->iowait_sleeptime = iowait_sleeptime;
	ts->idle_calls = idle_calls;
	ts->idle_sleeps = idle_sleeps;
}

/*
 * Async notification about clocksource changes
 */
void tick_clock_notify(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}

/*
 * Async notification about clock event changes
 */
void tick_oneshot_notify(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	set_bit(0, &ts->check_clocks);
}

/*
 * Check if a change happened, which makes oneshot possible.
 *
 * Called cyclically from the hrtimer softirq (driven by the timer
 * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ
 * mode, because high resolution timers are disabled (either compile
 * or runtime). Called with interrupts disabled.
 */
int tick_check_oneshot_change(int allow_nohz)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	if (!test_and_clear_bit(0, &ts->check_clocks))
		return 0;

	if (tick_sched_flag_test(ts, TS_FLAG_NOHZ))
		return 0;

	if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
		return 0;

	if (!allow_nohz)
		return 1;

	tick_nohz_switch_to_nohz();
	return 0;
}