1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> 4 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar 5 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner 6 * 7 * NOHZ implementation for low and high resolution timers 8 * 9 * Started by: Thomas Gleixner and Ingo Molnar 10 */ 11 #include <linux/compiler.h> 12 #include <linux/cpu.h> 13 #include <linux/err.h> 14 #include <linux/hrtimer.h> 15 #include <linux/interrupt.h> 16 #include <linux/kernel_stat.h> 17 #include <linux/percpu.h> 18 #include <linux/nmi.h> 19 #include <linux/profile.h> 20 #include <linux/sched/signal.h> 21 #include <linux/sched/clock.h> 22 #include <linux/sched/stat.h> 23 #include <linux/sched/nohz.h> 24 #include <linux/sched/loadavg.h> 25 #include <linux/module.h> 26 #include <linux/irq_work.h> 27 #include <linux/posix-timers.h> 28 #include <linux/context_tracking.h> 29 #include <linux/mm.h> 30 31 #include <asm/irq_regs.h> 32 33 #include "tick-internal.h" 34 35 #include <trace/events/timer.h> 36 37 /* 38 * Per-CPU nohz control structure 39 */ 40 static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 41 42 struct tick_sched *tick_get_tick_sched(int cpu) 43 { 44 return &per_cpu(tick_cpu_sched, cpu); 45 } 46 47 /* 48 * The time when the last jiffy update happened. Write access must hold 49 * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a 50 * consistent view of jiffies and last_jiffies_update. 51 */ 52 static ktime_t last_jiffies_update; 53 54 /* 55 * Must be called with interrupts disabled ! 56 */ 57 static void tick_do_update_jiffies64(ktime_t now) 58 { 59 unsigned long ticks = 1; 60 ktime_t delta, nextp; 61 62 /* 63 * 64-bit can do a quick check without holding the jiffies lock and 64 * without looking at the sequence count. The smp_load_acquire() 65 * pairs with the update done later in this function. 
66 * 67 * 32-bit cannot do that because the store of 'tick_next_period' 68 * consists of two 32-bit stores, and the first store could be 69 * moved by the CPU to a random point in the future. 70 */ 71 if (IS_ENABLED(CONFIG_64BIT)) { 72 if (ktime_before(now, smp_load_acquire(&tick_next_period))) 73 return; 74 } else { 75 unsigned int seq; 76 77 /* 78 * Avoid contention on 'jiffies_lock' and protect the quick 79 * check with the sequence count. 80 */ 81 do { 82 seq = read_seqcount_begin(&jiffies_seq); 83 nextp = tick_next_period; 84 } while (read_seqcount_retry(&jiffies_seq, seq)); 85 86 if (ktime_before(now, nextp)) 87 return; 88 } 89 90 /* Quick check failed, i.e. update is required. */ 91 raw_spin_lock(&jiffies_lock); 92 /* 93 * Re-evaluate with the lock held. Another CPU might have done the 94 * update already. 95 */ 96 if (ktime_before(now, tick_next_period)) { 97 raw_spin_unlock(&jiffies_lock); 98 return; 99 } 100 101 write_seqcount_begin(&jiffies_seq); 102 103 delta = ktime_sub(now, tick_next_period); 104 if (unlikely(delta >= TICK_NSEC)) { 105 /* Slow path for long idle sleep times */ 106 s64 incr = TICK_NSEC; 107 108 ticks += ktime_divns(delta, incr); 109 110 last_jiffies_update = ktime_add_ns(last_jiffies_update, 111 incr * ticks); 112 } else { 113 last_jiffies_update = ktime_add_ns(last_jiffies_update, 114 TICK_NSEC); 115 } 116 117 /* Advance jiffies to complete the 'jiffies_seq' protected job */ 118 jiffies_64 += ticks; 119 120 /* Keep the tick_next_period variable up to date */ 121 nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); 122 123 if (IS_ENABLED(CONFIG_64BIT)) { 124 /* 125 * Pairs with smp_load_acquire() in the lockless quick 126 * check above, and ensures that the update to 'jiffies_64' is 127 * not reordered vs. the store to 'tick_next_period', neither 128 * by the compiler nor by the CPU. 
129 */ 130 smp_store_release(&tick_next_period, nextp); 131 } else { 132 /* 133 * A plain store is good enough on 32-bit, as the quick check 134 * above is protected by the sequence count. 135 */ 136 tick_next_period = nextp; 137 } 138 139 /* 140 * Release the sequence count. calc_global_load() below is not 141 * protected by it, but 'jiffies_lock' needs to be held to prevent 142 * concurrent invocations. 143 */ 144 write_seqcount_end(&jiffies_seq); 145 146 calc_global_load(); 147 148 raw_spin_unlock(&jiffies_lock); 149 update_wall_time(); 150 } 151 152 /* 153 * Initialize and return retrieve the jiffies update. 154 */ 155 static ktime_t tick_init_jiffy_update(void) 156 { 157 ktime_t period; 158 159 raw_spin_lock(&jiffies_lock); 160 write_seqcount_begin(&jiffies_seq); 161 162 /* Have we started the jiffies update yet ? */ 163 if (last_jiffies_update == 0) { 164 u32 rem; 165 166 /* 167 * Ensure that the tick is aligned to a multiple of 168 * TICK_NSEC. 169 */ 170 div_u64_rem(tick_next_period, TICK_NSEC, &rem); 171 if (rem) 172 tick_next_period += TICK_NSEC - rem; 173 174 last_jiffies_update = tick_next_period; 175 } 176 period = last_jiffies_update; 177 178 write_seqcount_end(&jiffies_seq); 179 raw_spin_unlock(&jiffies_lock); 180 181 return period; 182 } 183 184 static inline int tick_sched_flag_test(struct tick_sched *ts, 185 unsigned long flag) 186 { 187 return !!(ts->flags & flag); 188 } 189 190 static inline void tick_sched_flag_set(struct tick_sched *ts, 191 unsigned long flag) 192 { 193 lockdep_assert_irqs_disabled(); 194 ts->flags |= flag; 195 } 196 197 static inline void tick_sched_flag_clear(struct tick_sched *ts, 198 unsigned long flag) 199 { 200 lockdep_assert_irqs_disabled(); 201 ts->flags &= ~flag; 202 } 203 204 /* 205 * Allow only one non-timekeeper CPU at a time update jiffies from 206 * the timer tick. 207 * 208 * Returns true if update was run. 
209 */ 210 static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now) 211 { 212 static atomic_t in_progress; 213 int inp; 214 215 inp = atomic_read(&in_progress); 216 if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1)) 217 return false; 218 219 if (ts->last_tick_jiffies == jiffies) 220 tick_do_update_jiffies64(now); 221 atomic_set(&in_progress, 0); 222 return true; 223 } 224 225 #define MAX_STALLED_JIFFIES 5 226 227 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) 228 { 229 int tick_cpu, cpu = smp_processor_id(); 230 231 /* 232 * Check if the do_timer duty was dropped. We don't care about 233 * concurrency: This happens only when the CPU in charge went 234 * into a long sleep. If two CPUs happen to assign themselves to 235 * this duty, then the jiffies update is still serialized by 236 * 'jiffies_lock'. 237 * 238 * If nohz_full is enabled, this should not happen because the 239 * 'tick_do_timer_cpu' CPU never relinquishes. 240 */ 241 tick_cpu = READ_ONCE(tick_do_timer_cpu); 242 243 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) { 244 #ifdef CONFIG_NO_HZ_FULL 245 WARN_ON_ONCE(tick_nohz_full_running); 246 #endif 247 WRITE_ONCE(tick_do_timer_cpu, cpu); 248 tick_cpu = cpu; 249 } 250 251 /* Check if jiffies need an update */ 252 if (tick_cpu == cpu) 253 tick_do_update_jiffies64(now); 254 255 /* 256 * If the jiffies update stalled for too long (timekeeper in stop_machine() 257 * or VMEXIT'ed for several msecs), force an update. 
258 */ 259 if (ts->last_tick_jiffies != jiffies) { 260 ts->stalled_jiffies = 0; 261 ts->last_tick_jiffies = READ_ONCE(jiffies); 262 } else { 263 if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) { 264 if (tick_limited_update_jiffies64(ts, now)) { 265 ts->stalled_jiffies = 0; 266 ts->last_tick_jiffies = READ_ONCE(jiffies); 267 } 268 } 269 } 270 271 if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) 272 ts->got_idle_tick = 1; 273 } 274 275 static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) 276 { 277 /* 278 * When we are idle and the tick is stopped, we have to touch 279 * the watchdog as we might not schedule for a really long 280 * time. This happens on completely idle SMP systems while 281 * waiting on the login prompt. We also increment the "start of 282 * idle" jiffy stamp so the idle accounting adjustment we do 283 * when we go busy again does not account too many ticks. 284 */ 285 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && 286 tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { 287 touch_softlockup_watchdog_sched(); 288 if (is_idle_task(current)) 289 ts->idle_jiffies++; 290 /* 291 * In case the current tick fired too early past its expected 292 * expiration, make sure we don't bypass the next clock reprogramming 293 * to the same deadline. 294 */ 295 ts->next_tick = 0; 296 } 297 298 update_process_times(user_mode(regs)); 299 profile_tick(CPU_PROFILING); 300 } 301 302 /* 303 * We rearm the timer until we get disabled by the idle code. 304 * Called with interrupts disabled. 
305 */ 306 static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer) 307 { 308 struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); 309 struct pt_regs *regs = get_irq_regs(); 310 ktime_t now = ktime_get(); 311 312 tick_sched_do_timer(ts, now); 313 314 /* 315 * Do not call when we are not in IRQ context and have 316 * no valid 'regs' pointer 317 */ 318 if (regs) 319 tick_sched_handle(ts, regs); 320 else 321 ts->next_tick = 0; 322 323 /* 324 * In dynticks mode, tick reprogram is deferred: 325 * - to the idle task if in dynticks-idle 326 * - to IRQ exit if in full-dynticks. 327 */ 328 if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED))) 329 return HRTIMER_NORESTART; 330 331 hrtimer_forward(timer, now, TICK_NSEC); 332 333 return HRTIMER_RESTART; 334 } 335 336 #ifdef CONFIG_NO_HZ_FULL 337 cpumask_var_t tick_nohz_full_mask; 338 EXPORT_SYMBOL_GPL(tick_nohz_full_mask); 339 bool tick_nohz_full_running; 340 EXPORT_SYMBOL_GPL(tick_nohz_full_running); 341 static atomic_t tick_dep_mask; 342 343 static bool check_tick_dependency(atomic_t *dep) 344 { 345 int val = atomic_read(dep); 346 347 if (val & TICK_DEP_MASK_POSIX_TIMER) { 348 trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); 349 return true; 350 } 351 352 if (val & TICK_DEP_MASK_PERF_EVENTS) { 353 trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); 354 return true; 355 } 356 357 if (val & TICK_DEP_MASK_SCHED) { 358 trace_tick_stop(0, TICK_DEP_MASK_SCHED); 359 return true; 360 } 361 362 if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { 363 trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); 364 return true; 365 } 366 367 if (val & TICK_DEP_MASK_RCU) { 368 trace_tick_stop(0, TICK_DEP_MASK_RCU); 369 return true; 370 } 371 372 if (val & TICK_DEP_MASK_RCU_EXP) { 373 trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); 374 return true; 375 } 376 377 return false; 378 } 379 380 static bool can_stop_full_tick(int cpu, struct tick_sched *ts) 381 { 382 lockdep_assert_irqs_disabled(); 383 384 if (unlikely(!cpu_online(cpu))) 
385 return false; 386 387 if (check_tick_dependency(&tick_dep_mask)) 388 return false; 389 390 if (check_tick_dependency(&ts->tick_dep_mask)) 391 return false; 392 393 if (check_tick_dependency(¤t->tick_dep_mask)) 394 return false; 395 396 if (check_tick_dependency(¤t->signal->tick_dep_mask)) 397 return false; 398 399 return true; 400 } 401 402 static void nohz_full_kick_func(struct irq_work *work) 403 { 404 /* Empty, the tick restart happens on tick_nohz_irq_exit() */ 405 } 406 407 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = 408 IRQ_WORK_INIT_HARD(nohz_full_kick_func); 409 410 /* 411 * Kick this CPU if it's full dynticks in order to force it to 412 * re-evaluate its dependency on the tick and restart it if necessary. 413 * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), 414 * is NMI safe. 415 */ 416 static void tick_nohz_full_kick(void) 417 { 418 if (!tick_nohz_full_cpu(smp_processor_id())) 419 return; 420 421 irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); 422 } 423 424 /* 425 * Kick the CPU if it's full dynticks in order to force it to 426 * re-evaluate its dependency on the tick and restart it if necessary. 427 */ 428 void tick_nohz_full_kick_cpu(int cpu) 429 { 430 if (!tick_nohz_full_cpu(cpu)) 431 return; 432 433 irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); 434 } 435 436 static void tick_nohz_kick_task(struct task_struct *tsk) 437 { 438 int cpu; 439 440 /* 441 * If the task is not running, run_posix_cpu_timers() 442 * has nothing to elapse, and an IPI can then be optimized out. 
443 * 444 * activate_task() STORE p->tick_dep_mask 445 * STORE p->on_rq 446 * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or()) 447 * LOCK rq->lock LOAD p->on_rq 448 * smp_mb__after_spin_lock() 449 * tick_nohz_task_switch() 450 * LOAD p->tick_dep_mask 451 * 452 * XXX given a task picks up the dependency on schedule(), should we 453 * only care about tasks that are currently on the CPU instead of all 454 * that are on the runqueue? 455 * 456 * That is, does this want to be: task_on_cpu() / task_curr()? 457 */ 458 if (!sched_task_on_rq(tsk)) 459 return; 460 461 /* 462 * If the task concurrently migrates to another CPU, 463 * we guarantee it sees the new tick dependency upon 464 * schedule. 465 * 466 * set_task_cpu(p, cpu); 467 * STORE p->cpu = @cpu 468 * __schedule() (switch to task 'p') 469 * LOCK rq->lock 470 * smp_mb__after_spin_lock() STORE p->tick_dep_mask 471 * tick_nohz_task_switch() smp_mb() (atomic_fetch_or()) 472 * LOAD p->tick_dep_mask LOAD p->cpu 473 */ 474 cpu = task_cpu(tsk); 475 476 preempt_disable(); 477 if (cpu_online(cpu)) 478 tick_nohz_full_kick_cpu(cpu); 479 preempt_enable(); 480 } 481 482 /* 483 * Kick all full dynticks CPUs in order to force these to re-evaluate 484 * their dependency on the tick and restart it if necessary. 485 */ 486 static void tick_nohz_full_kick_all(void) 487 { 488 int cpu; 489 490 if (!tick_nohz_full_running) 491 return; 492 493 preempt_disable(); 494 for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) 495 tick_nohz_full_kick_cpu(cpu); 496 preempt_enable(); 497 } 498 499 static void tick_nohz_dep_set_all(atomic_t *dep, 500 enum tick_dep_bits bit) 501 { 502 int prev; 503 504 prev = atomic_fetch_or(BIT(bit), dep); 505 if (!prev) 506 tick_nohz_full_kick_all(); 507 } 508 509 /* 510 * Set a global tick dependency. Used by perf events that rely on freq and 511 * unstable clocks. 
512 */ 513 void tick_nohz_dep_set(enum tick_dep_bits bit) 514 { 515 tick_nohz_dep_set_all(&tick_dep_mask, bit); 516 } 517 518 void tick_nohz_dep_clear(enum tick_dep_bits bit) 519 { 520 atomic_andnot(BIT(bit), &tick_dep_mask); 521 } 522 523 /* 524 * Set per-CPU tick dependency. Used by scheduler and perf events in order to 525 * manage event-throttling. 526 */ 527 void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) 528 { 529 int prev; 530 struct tick_sched *ts; 531 532 ts = per_cpu_ptr(&tick_cpu_sched, cpu); 533 534 prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); 535 if (!prev) { 536 preempt_disable(); 537 /* Perf needs local kick that is NMI safe */ 538 if (cpu == smp_processor_id()) { 539 tick_nohz_full_kick(); 540 } else { 541 /* Remote IRQ work not NMI-safe */ 542 if (!WARN_ON_ONCE(in_nmi())) 543 tick_nohz_full_kick_cpu(cpu); 544 } 545 preempt_enable(); 546 } 547 } 548 EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); 549 550 void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) 551 { 552 struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); 553 554 atomic_andnot(BIT(bit), &ts->tick_dep_mask); 555 } 556 EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); 557 558 /* 559 * Set a per-task tick dependency. RCU needs this. Also posix CPU timers 560 * in order to elapse per task timers. 561 */ 562 void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) 563 { 564 if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) 565 tick_nohz_kick_task(tsk); 566 } 567 EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); 568 569 void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) 570 { 571 atomic_andnot(BIT(bit), &tsk->tick_dep_mask); 572 } 573 EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); 574 575 /* 576 * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse 577 * per process timers. 
578 */ 579 void tick_nohz_dep_set_signal(struct task_struct *tsk, 580 enum tick_dep_bits bit) 581 { 582 int prev; 583 struct signal_struct *sig = tsk->signal; 584 585 prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask); 586 if (!prev) { 587 struct task_struct *t; 588 589 lockdep_assert_held(&tsk->sighand->siglock); 590 __for_each_thread(sig, t) 591 tick_nohz_kick_task(t); 592 } 593 } 594 595 void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) 596 { 597 atomic_andnot(BIT(bit), &sig->tick_dep_mask); 598 } 599 600 /* 601 * Re-evaluate the need for the tick as we switch the current task. 602 * It might need the tick due to per task/process properties: 603 * perf events, posix CPU timers, ... 604 */ 605 void __tick_nohz_task_switch(void) 606 { 607 struct tick_sched *ts; 608 609 if (!tick_nohz_full_cpu(smp_processor_id())) 610 return; 611 612 ts = this_cpu_ptr(&tick_cpu_sched); 613 614 if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { 615 if (atomic_read(¤t->tick_dep_mask) || 616 atomic_read(¤t->signal->tick_dep_mask)) 617 tick_nohz_full_kick(); 618 } 619 } 620 621 /* Get the boot-time nohz CPU list from the kernel parameters. */ 622 void __init tick_nohz_full_setup(cpumask_var_t cpumask) 623 { 624 alloc_bootmem_cpumask_var(&tick_nohz_full_mask); 625 cpumask_copy(tick_nohz_full_mask, cpumask); 626 tick_nohz_full_running = true; 627 } 628 629 bool tick_nohz_cpu_hotpluggable(unsigned int cpu) 630 { 631 /* 632 * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound 633 * timers, workqueues, timekeeping, ...) on behalf of full dynticks 634 * CPUs. It must remain online when nohz full is enabled. 635 */ 636 if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu) 637 return false; 638 return true; 639 } 640 641 static int tick_nohz_cpu_down(unsigned int cpu) 642 { 643 return tick_nohz_cpu_hotpluggable(cpu) ? 
0 : -EBUSY; 644 } 645 646 void __init tick_nohz_init(void) 647 { 648 int cpu, ret; 649 650 if (!tick_nohz_full_running) 651 return; 652 653 /* 654 * Full dynticks uses IRQ work to drive the tick rescheduling on safe 655 * locking contexts. But then we need IRQ work to raise its own 656 * interrupts to avoid circular dependency on the tick. 657 */ 658 if (!arch_irq_work_has_interrupt()) { 659 pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n"); 660 cpumask_clear(tick_nohz_full_mask); 661 tick_nohz_full_running = false; 662 return; 663 } 664 665 if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && 666 !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { 667 cpu = smp_processor_id(); 668 669 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { 670 pr_warn("NO_HZ: Clearing %d from nohz_full range " 671 "for timekeeping\n", cpu); 672 cpumask_clear_cpu(cpu, tick_nohz_full_mask); 673 } 674 } 675 676 for_each_cpu(cpu, tick_nohz_full_mask) 677 ct_cpu_track_user(cpu); 678 679 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, 680 "kernel/nohz:predown", NULL, 681 tick_nohz_cpu_down); 682 WARN_ON(ret < 0); 683 pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", 684 cpumask_pr_args(tick_nohz_full_mask)); 685 } 686 #endif /* #ifdef CONFIG_NO_HZ_FULL */ 687 688 /* 689 * NOHZ - aka dynamic tick functionality 690 */ 691 #ifdef CONFIG_NO_HZ_COMMON 692 /* 693 * NO HZ enabled ? 
694 */ 695 bool tick_nohz_enabled __read_mostly = true; 696 static unsigned long tick_nohz_active __read_mostly; 697 /* 698 * Enable / Disable tickless mode 699 */ 700 static int __init setup_tick_nohz(char *str) 701 { 702 return (kstrtobool(str, &tick_nohz_enabled) == 0); 703 } 704 705 __setup("nohz=", setup_tick_nohz); 706 707 bool tick_nohz_is_active(void) 708 { 709 return tick_nohz_active; 710 } 711 EXPORT_SYMBOL_GPL(tick_nohz_is_active); 712 713 bool tick_nohz_tick_stopped(void) 714 { 715 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 716 717 return tick_sched_flag_test(ts, TS_FLAG_STOPPED); 718 } 719 720 bool tick_nohz_tick_stopped_cpu(int cpu) 721 { 722 struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); 723 724 return tick_sched_flag_test(ts, TS_FLAG_STOPPED); 725 } 726 727 /** 728 * tick_nohz_update_jiffies - update jiffies when idle was interrupted 729 * @now: current ktime_t 730 * 731 * Called from interrupt entry when the CPU was idle 732 * 733 * In case the sched_tick was stopped on this CPU, we have to check if jiffies 734 * must be updated. Otherwise an interrupt handler could use a stale jiffy 735 * value. We do this unconditionally on any CPU, as we don't know whether the 736 * CPU, which has the update task assigned, is in a long sleep. 
737 */ 738 static void tick_nohz_update_jiffies(ktime_t now) 739 { 740 unsigned long flags; 741 742 __this_cpu_write(tick_cpu_sched.idle_waketime, now); 743 744 local_irq_save(flags); 745 tick_do_update_jiffies64(now); 746 local_irq_restore(flags); 747 748 touch_softlockup_watchdog_sched(); 749 } 750 751 static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) 752 { 753 ktime_t delta; 754 755 if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))) 756 return; 757 758 delta = ktime_sub(now, ts->idle_entrytime); 759 760 write_seqcount_begin(&ts->idle_sleeptime_seq); 761 if (nr_iowait_cpu(smp_processor_id()) > 0) 762 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); 763 else 764 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 765 766 ts->idle_entrytime = now; 767 tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE); 768 write_seqcount_end(&ts->idle_sleeptime_seq); 769 770 sched_clock_idle_wakeup_event(); 771 } 772 773 static void tick_nohz_start_idle(struct tick_sched *ts) 774 { 775 write_seqcount_begin(&ts->idle_sleeptime_seq); 776 ts->idle_entrytime = ktime_get(); 777 tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE); 778 write_seqcount_end(&ts->idle_sleeptime_seq); 779 780 sched_clock_idle_sleep_event(); 781 } 782 783 static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, 784 bool compute_delta, u64 *last_update_time) 785 { 786 ktime_t now, idle; 787 unsigned int seq; 788 789 if (!tick_nohz_active) 790 return -1; 791 792 now = ktime_get(); 793 if (last_update_time) 794 *last_update_time = ktime_to_us(now); 795 796 do { 797 seq = read_seqcount_begin(&ts->idle_sleeptime_seq); 798 799 if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) { 800 ktime_t delta = ktime_sub(now, ts->idle_entrytime); 801 802 idle = ktime_add(*sleeptime, delta); 803 } else { 804 idle = *sleeptime; 805 } 806 } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); 807 808 return ktime_to_us(idle); 809 810 } 811 812 /** 
813 * get_cpu_idle_time_us - get the total idle time of a CPU 814 * @cpu: CPU number to query 815 * @last_update_time: variable to store update time in. Do not update 816 * counters if NULL. 817 * 818 * Return the cumulative idle time (since boot) for a given 819 * CPU, in microseconds. Note that this is partially broken due to 820 * the counter of iowait tasks that can be remotely updated without 821 * any synchronization. Therefore it is possible to observe backward 822 * values within two consecutive reads. 823 * 824 * This time is measured via accounting rather than sampling, 825 * and is as accurate as ktime_get() is. 826 * 827 * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu 828 */ 829 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 830 { 831 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 832 833 return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, 834 !nr_iowait_cpu(cpu), last_update_time); 835 } 836 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 837 838 /** 839 * get_cpu_iowait_time_us - get the total iowait time of a CPU 840 * @cpu: CPU number to query 841 * @last_update_time: variable to store update time in. Do not update 842 * counters if NULL. 843 * 844 * Return the cumulative iowait time (since boot) for a given 845 * CPU, in microseconds. Note this is partially broken due to 846 * the counter of iowait tasks that can be remotely updated without 847 * any synchronization. Therefore it is possible to observe backward 848 * values within two consecutive reads. 849 * 850 * This time is measured via accounting rather than sampling, 851 * and is as accurate as ktime_get() is. 
852 * 853 * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu 854 */ 855 u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) 856 { 857 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 858 859 return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, 860 nr_iowait_cpu(cpu), last_update_time); 861 } 862 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 863 864 static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) 865 { 866 hrtimer_cancel(&ts->sched_timer); 867 hrtimer_set_expires(&ts->sched_timer, ts->last_tick); 868 869 /* Forward the time to expire in the future */ 870 hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); 871 872 if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { 873 hrtimer_start_expires(&ts->sched_timer, 874 HRTIMER_MODE_ABS_PINNED_HARD); 875 } else { 876 tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); 877 } 878 879 /* 880 * Reset to make sure the next tick stop doesn't get fooled by past 881 * cached clock deadline. 882 */ 883 ts->next_tick = 0; 884 } 885 886 static inline bool local_timer_softirq_pending(void) 887 { 888 return local_timers_pending() & BIT(TIMER_SOFTIRQ); 889 } 890 891 /* 892 * Read jiffies and the time when jiffies were updated last 893 */ 894 u64 get_jiffies_update(unsigned long *basej) 895 { 896 unsigned long basejiff; 897 unsigned int seq; 898 u64 basemono; 899 900 do { 901 seq = read_seqcount_begin(&jiffies_seq); 902 basemono = last_jiffies_update; 903 basejiff = jiffies; 904 } while (read_seqcount_retry(&jiffies_seq, seq)); 905 *basej = basejiff; 906 return basemono; 907 } 908 909 /** 910 * tick_nohz_next_event() - return the clock monotonic based next event 911 * @ts: pointer to tick_sched struct 912 * @cpu: CPU number 913 * 914 * Return: 915 * *%0 - When the next event is a maximum of TICK_NSEC in the future 916 * and the tick is not stopped yet 917 * *%next_event - Next event based on clock monotonic 918 */ 919 static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) 
920 { 921 u64 basemono, next_tick, delta, expires; 922 unsigned long basejiff; 923 int tick_cpu; 924 925 basemono = get_jiffies_update(&basejiff); 926 ts->last_jiffies = basejiff; 927 ts->timer_expires_base = basemono; 928 929 /* 930 * Keep the periodic tick, when RCU, architecture or irq_work 931 * requests it. 932 * Aside of that, check whether the local timer softirq is 933 * pending. If so, its a bad idea to call get_next_timer_interrupt(), 934 * because there is an already expired timer, so it will request 935 * immediate expiry, which rearms the hardware timer with a 936 * minimal delta, which brings us back to this place 937 * immediately. Lather, rinse and repeat... 938 */ 939 if (rcu_needs_cpu() || arch_needs_cpu() || 940 irq_work_needs_cpu() || local_timer_softirq_pending()) { 941 next_tick = basemono + TICK_NSEC; 942 } else { 943 /* 944 * Get the next pending timer. If high resolution 945 * timers are enabled this only takes the timer wheel 946 * timers into account. If high resolution timers are 947 * disabled this also looks at the next expiring 948 * hrtimer. 949 */ 950 next_tick = get_next_timer_interrupt(basejiff, basemono); 951 ts->next_timer = next_tick; 952 } 953 954 /* Make sure next_tick is never before basemono! */ 955 if (WARN_ON_ONCE(basemono > next_tick)) 956 next_tick = basemono; 957 958 /* 959 * If the tick is due in the next period, keep it ticking or 960 * force prod the timer. 961 */ 962 delta = next_tick - basemono; 963 if (delta <= (u64)TICK_NSEC) { 964 /* 965 * We've not stopped the tick yet, and there's a timer in the 966 * next period, so no point in stopping it either, bail. 967 */ 968 if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { 969 ts->timer_expires = 0; 970 goto out; 971 } 972 } 973 974 /* 975 * If this CPU is the one which had the do_timer() duty last, we limit 976 * the sleep time to the timekeeping 'max_deferment' value. 977 * Otherwise we can sleep as long as we want. 
978 */ 979 delta = timekeeping_max_deferment(); 980 tick_cpu = READ_ONCE(tick_do_timer_cpu); 981 if (tick_cpu != cpu && 982 (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) 983 delta = KTIME_MAX; 984 985 /* Calculate the next expiry time */ 986 if (delta < (KTIME_MAX - basemono)) 987 expires = basemono + delta; 988 else 989 expires = KTIME_MAX; 990 991 ts->timer_expires = min_t(u64, expires, next_tick); 992 993 out: 994 return ts->timer_expires; 995 } 996 997 static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) 998 { 999 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 1000 unsigned long basejiff = ts->last_jiffies; 1001 u64 basemono = ts->timer_expires_base; 1002 bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED); 1003 int tick_cpu; 1004 u64 expires; 1005 1006 /* Make sure we won't be trying to stop it twice in a row. */ 1007 ts->timer_expires_base = 0; 1008 1009 /* 1010 * Now the tick should be stopped definitely - so the timer base needs 1011 * to be marked idle as well to not miss a newly queued timer. 1012 */ 1013 expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle); 1014 if (expires > ts->timer_expires) { 1015 /* 1016 * This path could only happen when the first timer was removed 1017 * between calculating the possible sleep length and now (when 1018 * high resolution mode is not active, timer could also be a 1019 * hrtimer). 1020 * 1021 * We have to stick to the original calculated expiry value to 1022 * not stop the tick for too long with a shallow C-state (which 1023 * was programmed by cpuidle because of an early next expiration 1024 * value). 1025 */ 1026 expires = ts->timer_expires; 1027 } 1028 1029 /* If the timer base is not idle, retain the not yet stopped tick. 
*/ 1030 if (!timer_idle) 1031 return; 1032 1033 /* 1034 * If this CPU is the one which updates jiffies, then give up 1035 * the assignment and let it be taken by the CPU which runs 1036 * the tick timer next, which might be this CPU as well. If we 1037 * don't drop this here, the jiffies might be stale and 1038 * do_timer() never gets invoked. Keep track of the fact that it 1039 * was the one which had the do_timer() duty last. 1040 */ 1041 tick_cpu = READ_ONCE(tick_do_timer_cpu); 1042 if (tick_cpu == cpu) { 1043 WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE); 1044 tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST); 1045 } else if (tick_cpu != TICK_DO_TIMER_NONE) { 1046 tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST); 1047 } 1048 1049 /* Skip reprogram of event if it's not changed */ 1050 if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) { 1051 /* Sanity check: make sure clockevent is actually programmed */ 1052 if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) 1053 return; 1054 1055 WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu " 1056 "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, 1057 dev->next_event, hrtimer_active(&ts->sched_timer), 1058 hrtimer_get_expires(&ts->sched_timer)); 1059 } 1060 1061 /* 1062 * tick_nohz_stop_tick() can be called several times before 1063 * tick_nohz_restart_sched_tick() is called. This happens when 1064 * interrupts arrive which do not cause a reschedule. In the first 1065 * call we save the current tick time, so we can restart the 1066 * scheduler tick in tick_nohz_restart_sched_tick(). 
*/
	if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
		calc_load_nohz_start();
		quiet_vmstat();

		/* Remember the expiry of the tick which is about to be stopped */
		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
		tick_sched_flag_set(ts, TS_FLAG_STOPPED);
		trace_tick_stop(1, TICK_DEP_MASK_NONE);
	}

	ts->next_tick = expires;

	/*
	 * If the expiration time == KTIME_MAX, then we simply stop
	 * the tick timer.
	 */
	if (unlikely(expires == KTIME_MAX)) {
		if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
			hrtimer_cancel(&ts->sched_timer);
		else
			tick_program_event(KTIME_MAX, 1);
		return;
	}

	/*
	 * Arm the tick for 'expires': via the hrtimer in highres mode,
	 * otherwise by storing the expiry in the hrtimer and programming
	 * the event directly.
	 */
	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
		hrtimer_start(&ts->sched_timer, expires,
			      HRTIMER_MODE_ABS_PINNED_HARD);
	} else {
		hrtimer_set_expires(&ts->sched_timer, expires);
		tick_program_event(expires, 1);
	}
}

/*
 * Keep the tick running: drop the cached expiry base, which is the marker
 * tick_nohz_idle_stop_tick() tests to decide whether ts->timer_expires
 * can be reused.
 */
static void tick_nohz_retain_tick(struct tick_sched *ts)
{
	ts->timer_expires_base = 0;
}

#ifdef CONFIG_NO_HZ_FULL
/*
 * Stop the tick if tick_nohz_next_event() reports a non-zero next event,
 * otherwise retain it.
 */
static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu)
{
	if (tick_nohz_next_event(ts, cpu))
		tick_nohz_stop_tick(ts, cpu);
	else
		tick_nohz_retain_tick(ts);
}
#endif /* CONFIG_NO_HZ_FULL */

/*
 * Restart the tick: bring jiffies up to date for @now, leave the timer
 * idle state, undo the NOHZ load/watchdog bookkeeping, clear
 * TS_FLAG_STOPPED and hand over to tick_nohz_restart().
 */
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
	/* Update jiffies first */
	tick_do_update_jiffies64(now);

	/*
	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
	 * the clock forward checks in the enqueue path:
	 */
	timer_clear_idle();

	calc_load_nohz_stop();
	touch_softlockup_watchdog_sched();

	/* Cancel the scheduled timer and restore the tick: */
	tick_sched_flag_clear(ts, TS_FLAG_STOPPED);
	tick_nohz_restart(ts, now);
}

/*
 * With CONFIG_NO_HZ_FULL: stop the tick if can_stop_full_tick() permits it,
 * otherwise restart it in case it is currently stopped. Without
 * CONFIG_NO_HZ_FULL the body is empty.
 */
static void __tick_nohz_full_update_tick(struct tick_sched *ts,
					 ktime_t now)
{
#ifdef CONFIG_NO_HZ_FULL
	int cpu = smp_processor_id();

	if (can_stop_full_tick(cpu, ts))
		tick_nohz_full_stop_tick(ts, cpu);
	else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
		tick_nohz_restart_sched_tick(ts, now);
#endif
}

/*
 * Re-evaluate the tick on the current CPU. Does nothing unless the CPU
 * is a nohz_full CPU and TS_FLAG_NOHZ is set in @ts.
 */
static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
	if (!tick_nohz_full_cpu(smp_processor_id()))
		return;

	if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ))
		return;

	__tick_nohz_full_update_tick(ts, ktime_get());
}

/*
 * A pending softirq outside an IRQ (or softirq disabled section) context
 * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
 * reach this code due to the need_resched() early check in can_stop_idle_tick().
 *
 * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPUHP_TEARDOWN_CPU on the
 * cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
 * triggering the code below, since wakeup_softirqd() is ignored.
 *
 * Return: %true if softirqs are pending which are neither hotplug safe on
 * an inactive CPU nor covered by local_bh_blocked(), %false otherwise. The
 * warning is printed for the first ten such occurrences only.
 */
static bool report_idle_softirq(void)
{
	static int ratelimit;
	unsigned int pending = local_softirq_pending();

	if (likely(!pending))
		return false;

	/* Some softirqs claim to be safe against hotplug and ksoftirqd parking */
	if (!cpu_active(smp_processor_id())) {
		pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK;
		if (!pending)
			return false;
	}

	/* On RT, softirq handling may be waiting on some lock */
	if (local_bh_blocked())
		return false;

	/* Only the message is rate limited, the return value is not */
	if (ratelimit < 10) {
		pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
			pending);
		ratelimit++;
	}

	return true;
}

/*
 * Check whether the idle tick may be stopped on @cpu.
 *
 * Return: %false if NOHZ is not active for @ts, a reschedule is pending,
 * report_idle_softirq() reports pending softirq work, or - with nohz_full
 * enabled - @cpu currently holds the do_timer duty or nobody holds it.
 * %true otherwise.
 */
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
	WARN_ON_ONCE(cpu_is_offline(cpu));

	if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ)))
		return false;

	if (need_resched())
		return false;

	if (unlikely(report_idle_softirq()))
		return false;

	if (tick_nohz_full_enabled()) {
		int tick_cpu = READ_ONCE(tick_do_timer_cpu);

		/*
		 * Keep the tick alive to guarantee timekeeping progression
		 * if there are full dynticks CPUs around
		 */
		if (tick_cpu == cpu)
			return false;

		/* Should not happen for nohz-full */
		if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE))
			return false;
	}

	return true;
}

/**
 * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick
 */
void tick_nohz_idle_stop_tick(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	int cpu = smp_processor_id();
	ktime_t expires;

	/*
	 * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
	 * tick timer expiration time is known already.
	 */
	if (ts->timer_expires_base)
		expires = ts->timer_expires;
	else if (can_stop_idle_tick(cpu, ts))
		expires = tick_nohz_next_event(ts, cpu);
	else
		return;

	ts->idle_calls++;

	/* A zero expiry means that the tick has to be retained */
	if (expires > 0LL) {
		int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);

		tick_nohz_stop_tick(ts, cpu);

		ts->idle_sleeps++;
		ts->idle_expires = expires;

		/*
		 * Do the first-stop bookkeeping only when this call actually
		 * switched TS_FLAG_STOPPED from clear to set.
		 */
		if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
			ts->idle_jiffies = ts->last_jiffies;
			nohz_balance_enter_idle(cpu);
		}
	} else {
		tick_nohz_retain_tick(ts);
	}
}

/* Retain the tick on the current CPU, see tick_nohz_retain_tick() */
void tick_nohz_idle_retain_tick(void)
{
	tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
}

/**
 * tick_nohz_idle_enter - prepare for entering idle on the current CPU
 *
 * Called when we start the idle loop.
1277 */ 1278 void tick_nohz_idle_enter(void) 1279 { 1280 struct tick_sched *ts; 1281 1282 lockdep_assert_irqs_enabled(); 1283 1284 local_irq_disable(); 1285 1286 ts = this_cpu_ptr(&tick_cpu_sched); 1287 1288 WARN_ON_ONCE(ts->timer_expires_base); 1289 1290 tick_sched_flag_set(ts, TS_FLAG_INIDLE); 1291 tick_nohz_start_idle(ts); 1292 1293 local_irq_enable(); 1294 } 1295 1296 /** 1297 * tick_nohz_irq_exit - Notify the tick about IRQ exit 1298 * 1299 * A timer may have been added/modified/deleted either by the current IRQ, 1300 * or by another place using this IRQ as a notification. This IRQ may have 1301 * also updated the RCU callback list. These events may require a 1302 * re-evaluation of the next tick. Depending on the context: 1303 * 1304 * 1) If the CPU is idle and no resched is pending, just proceed with idle 1305 * time accounting. The next tick will be re-evaluated on the next idle 1306 * loop iteration. 1307 * 1308 * 2) If the CPU is nohz_full: 1309 * 1310 * 2.1) If there is any tick dependency, restart the tick if stopped. 1311 * 1312 * 2.2) If there is no tick dependency, (re-)evaluate the next tick and 1313 * stop/update it accordingly. 1314 */ 1315 void tick_nohz_irq_exit(void) 1316 { 1317 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1318 1319 if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) 1320 tick_nohz_start_idle(ts); 1321 else 1322 tick_nohz_full_update_tick(ts); 1323 } 1324 1325 /** 1326 * tick_nohz_idle_got_tick - Check whether or not the tick handler has run 1327 * 1328 * Return: %true if the tick handler has run, otherwise %false 1329 */ 1330 bool tick_nohz_idle_got_tick(void) 1331 { 1332 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1333 1334 if (ts->got_idle_tick) { 1335 ts->got_idle_tick = 0; 1336 return true; 1337 } 1338 return false; 1339 } 1340 1341 /** 1342 * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer 1343 * or the tick, whichever expires first. 
Note that, if the tick has been 1344 * stopped, it returns the next hrtimer. 1345 * 1346 * Called from power state control code with interrupts disabled 1347 * 1348 * Return: the next expiration time 1349 */ 1350 ktime_t tick_nohz_get_next_hrtimer(void) 1351 { 1352 return __this_cpu_read(tick_cpu_device.evtdev)->next_event; 1353 } 1354 1355 /** 1356 * tick_nohz_get_sleep_length - return the expected length of the current sleep 1357 * @delta_next: duration until the next event if the tick cannot be stopped 1358 * 1359 * Called from power state control code with interrupts disabled. 1360 * 1361 * The return value of this function and/or the value returned by it through the 1362 * @delta_next pointer can be negative which must be taken into account by its 1363 * callers. 1364 * 1365 * Return: the expected length of the current sleep 1366 */ 1367 ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) 1368 { 1369 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 1370 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1371 int cpu = smp_processor_id(); 1372 /* 1373 * The idle entry time is expected to be a sufficient approximation of 1374 * the current time at this point. 1375 */ 1376 ktime_t now = ts->idle_entrytime; 1377 ktime_t next_event; 1378 1379 WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); 1380 1381 *delta_next = ktime_sub(dev->next_event, now); 1382 1383 if (!can_stop_idle_tick(cpu, ts)) 1384 return *delta_next; 1385 1386 next_event = tick_nohz_next_event(ts, cpu); 1387 if (!next_event) 1388 return *delta_next; 1389 1390 /* 1391 * If the next highres timer to expire is earlier than 'next_event', the 1392 * idle governor needs to know that. 1393 */ 1394 next_event = min_t(u64, next_event, 1395 hrtimer_next_event_without(&ts->sched_timer)); 1396 1397 return ktime_sub(next_event, now); 1398 } 1399 1400 /** 1401 * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value 1402 * for a particular CPU. 
1403 * @cpu: target CPU number 1404 * 1405 * Called from the schedutil frequency scaling governor in scheduler context. 1406 * 1407 * Return: the current idle calls counter value for @cpu 1408 */ 1409 unsigned long tick_nohz_get_idle_calls_cpu(int cpu) 1410 { 1411 struct tick_sched *ts = tick_get_tick_sched(cpu); 1412 1413 return ts->idle_calls; 1414 } 1415 1416 static void tick_nohz_account_idle_time(struct tick_sched *ts, 1417 ktime_t now) 1418 { 1419 unsigned long ticks; 1420 1421 ts->idle_exittime = now; 1422 1423 if (vtime_accounting_enabled_this_cpu()) 1424 return; 1425 /* 1426 * We stopped the tick in idle. update_process_times() would miss the 1427 * time we slept, as it does only a 1 tick accounting. 1428 * Enforce that this is accounted to idle ! 1429 */ 1430 ticks = jiffies - ts->idle_jiffies; 1431 /* 1432 * We might be one off. Do not randomly account a huge number of ticks! 1433 */ 1434 if (ticks && ticks < LONG_MAX) 1435 account_idle_ticks(ticks); 1436 } 1437 1438 void tick_nohz_idle_restart_tick(void) 1439 { 1440 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1441 1442 if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { 1443 ktime_t now = ktime_get(); 1444 tick_nohz_restart_sched_tick(ts, now); 1445 tick_nohz_account_idle_time(ts, now); 1446 } 1447 } 1448 1449 static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) 1450 { 1451 if (tick_nohz_full_cpu(smp_processor_id())) 1452 __tick_nohz_full_update_tick(ts, now); 1453 else 1454 tick_nohz_restart_sched_tick(ts, now); 1455 1456 tick_nohz_account_idle_time(ts, now); 1457 } 1458 1459 /** 1460 * tick_nohz_idle_exit - Update the tick upon idle task exit 1461 * 1462 * When the idle task exits, update the tick depending on the 1463 * following situations: 1464 * 1465 * 1) If the CPU is not in nohz_full mode (most cases), then 1466 * restart the tick. 
*
 * 2) If the CPU is in nohz_full mode (corner case):
 *   2.1) If the tick can be kept stopped (no tick dependencies)
 *        then re-evaluate the next tick and try to keep it stopped
 *        as long as possible.
 *   2.2) If the tick has dependencies, restart the tick.
 *
 */
void tick_nohz_idle_exit(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	bool was_idle, was_stopped;

	local_irq_disable();

	WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
	WARN_ON_ONCE(ts->timer_expires_base);

	tick_sched_flag_clear(ts, TS_FLAG_INIDLE);

	/* Snapshot both flags before any of the handlers below runs */
	was_idle = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
	was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);

	/* The clock is read once, and only if somebody needs the timestamp */
	if (was_idle || was_stopped) {
		ktime_t now = ktime_get();

		if (was_idle)
			tick_nohz_stop_idle(ts, now);

		if (was_stopped)
			tick_nohz_idle_update_tick(ts, now);
	}

	local_irq_enable();
}

/*
 * In low-resolution mode, the tick handler must be implemented directly
 * at the clockevent level. hrtimer can't be used instead, because its
 * infrastructure actually relies on the tick itself as a backend in
 * low-resolution mode (see hrtimer_run_queues()).
1507 */ 1508 static void tick_nohz_lowres_handler(struct clock_event_device *dev) 1509 { 1510 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1511 1512 dev->next_event = KTIME_MAX; 1513 1514 if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART)) 1515 tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); 1516 } 1517 1518 static inline void tick_nohz_activate(struct tick_sched *ts) 1519 { 1520 if (!tick_nohz_enabled) 1521 return; 1522 tick_sched_flag_set(ts, TS_FLAG_NOHZ); 1523 /* One update is enough */ 1524 if (!test_and_set_bit(0, &tick_nohz_active)) 1525 timers_update_nohz(); 1526 } 1527 1528 /** 1529 * tick_nohz_switch_to_nohz - switch to NOHZ mode 1530 */ 1531 static void tick_nohz_switch_to_nohz(void) 1532 { 1533 if (!tick_nohz_enabled) 1534 return; 1535 1536 if (tick_switch_to_oneshot(tick_nohz_lowres_handler)) 1537 return; 1538 1539 /* 1540 * Recycle the hrtimer in 'ts', so we can share the 1541 * highres code. 1542 */ 1543 tick_setup_sched_timer(false); 1544 } 1545 1546 static inline void tick_nohz_irq_enter(void) 1547 { 1548 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1549 ktime_t now; 1550 1551 if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE)) 1552 return; 1553 now = ktime_get(); 1554 if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) 1555 tick_nohz_stop_idle(ts, now); 1556 /* 1557 * If all CPUs are idle we may need to update a stale jiffies value. 1558 * Note nohz_full is a special case: a timekeeper is guaranteed to stay 1559 * alive but it might be busy looping with interrupts disabled in some 1560 * rare case (typically stop machine). So we must make sure we have a 1561 * last resort. 
1562 */ 1563 if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) 1564 tick_nohz_update_jiffies(now); 1565 } 1566 1567 #else 1568 1569 static inline void tick_nohz_switch_to_nohz(void) { } 1570 static inline void tick_nohz_irq_enter(void) { } 1571 static inline void tick_nohz_activate(struct tick_sched *ts) { } 1572 1573 #endif /* CONFIG_NO_HZ_COMMON */ 1574 1575 /* 1576 * Called from irq_enter() to notify about the possible interruption of idle() 1577 */ 1578 void tick_irq_enter(void) 1579 { 1580 tick_check_oneshot_broadcast_this_cpu(); 1581 tick_nohz_irq_enter(); 1582 } 1583 1584 static int sched_skew_tick; 1585 1586 static int __init skew_tick(char *str) 1587 { 1588 get_option(&str, &sched_skew_tick); 1589 1590 return 0; 1591 } 1592 early_param("skew_tick", skew_tick); 1593 1594 /** 1595 * tick_setup_sched_timer - setup the tick emulation timer 1596 * @hrtimer: whether to use the hrtimer or not 1597 */ 1598 void tick_setup_sched_timer(bool hrtimer) 1599 { 1600 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1601 1602 /* Emulate tick processing via per-CPU hrtimers: */ 1603 hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); 1604 1605 if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) 1606 tick_sched_flag_set(ts, TS_FLAG_HIGHRES); 1607 1608 /* Get the next period (per-CPU) */ 1609 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 1610 1611 /* Offset the tick to avert 'jiffies_lock' contention. 
*/ 1612 if (sched_skew_tick) { 1613 u64 offset = TICK_NSEC >> 1; 1614 do_div(offset, num_possible_cpus()); 1615 offset *= smp_processor_id(); 1616 hrtimer_add_expires_ns(&ts->sched_timer, offset); 1617 } 1618 1619 hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); 1620 if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) 1621 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); 1622 else 1623 tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); 1624 tick_nohz_activate(ts); 1625 } 1626 1627 /* 1628 * Shut down the tick and make sure the CPU won't try to retake the timekeeping 1629 * duty before disabling IRQs in idle for the last time. 1630 */ 1631 void tick_sched_timer_dying(int cpu) 1632 { 1633 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 1634 ktime_t idle_sleeptime, iowait_sleeptime; 1635 unsigned long idle_calls, idle_sleeps; 1636 1637 /* This must happen before hrtimers are migrated! */ 1638 if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) 1639 hrtimer_cancel(&ts->sched_timer); 1640 1641 idle_sleeptime = ts->idle_sleeptime; 1642 iowait_sleeptime = ts->iowait_sleeptime; 1643 idle_calls = ts->idle_calls; 1644 idle_sleeps = ts->idle_sleeps; 1645 memset(ts, 0, sizeof(*ts)); 1646 ts->idle_sleeptime = idle_sleeptime; 1647 ts->iowait_sleeptime = iowait_sleeptime; 1648 ts->idle_calls = idle_calls; 1649 ts->idle_sleeps = idle_sleeps; 1650 } 1651 1652 /* 1653 * Async notification about clocksource changes 1654 */ 1655 void tick_clock_notify(void) 1656 { 1657 int cpu; 1658 1659 for_each_possible_cpu(cpu) 1660 set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); 1661 } 1662 1663 /* 1664 * Async notification about clock event changes 1665 */ 1666 void tick_oneshot_notify(void) 1667 { 1668 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1669 1670 set_bit(0, &ts->check_clocks); 1671 } 1672 1673 /* 1674 * Check if a change happened, which makes oneshot possible. 
1675 * 1676 * Called cyclically from the hrtimer softirq (driven by the timer 1677 * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ 1678 * mode, because high resolution timers are disabled (either compile 1679 * or runtime). Called with interrupts disabled. 1680 */ 1681 int tick_check_oneshot_change(int allow_nohz) 1682 { 1683 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1684 1685 if (!test_and_clear_bit(0, &ts->check_clocks)) 1686 return 0; 1687 1688 if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) 1689 return 0; 1690 1691 if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) 1692 return 0; 1693 1694 if (!allow_nohz) 1695 return 1; 1696 1697 tick_nohz_switch_to_nohz(); 1698 return 0; 1699 } 1700