// SPDX-License-Identifier: GPL-2.0
/*
 * Watchdog support on powerpc systems.
 *
 * Copyright 2017, IBM Corporation.
 *
 * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c
 */

#define pr_fmt(fmt) "watchdog: " fmt

#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#include <linux/hardirq.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/sched/debug.h>
#include <linux/delay.h>
#include <linux/processor.h>
#include <linux/smp.h>

#include <asm/interrupt.h>
#include <asm/paca.h>
#include <asm/nmi.h>

/*
 * The powerpc watchdog ensures that each CPU is able to service timers.
 * The watchdog sets up a simple timer on each CPU to run once per timer
 * period, and updates a per-cpu timestamp and a "pending" cpumask. This is
 * the heartbeat.
 *
 * Then there are two systems to check that the heartbeat is still running:
 * the local soft-NMI checker and the SMP checker.
 *
 * The soft-NMI checker can detect lockups on the local CPU. When interrupts
 * are disabled with local_irq_disable(), platforms that use soft-masking
 * can leave hardware interrupts enabled and handle them with a masked
 * interrupt handler. The masked handler can send the timer interrupt to the
 * watchdog's soft_nmi_interrupt(), which appears to Linux as an NMI
 * interrupt, and can be used to detect CPUs stuck with IRQs disabled.
 *
 * The soft-NMI checker compares the heartbeat timestamp for this CPU
 * with the current time, and takes action if the difference exceeds the
 * watchdog threshold.
 *
 * The limitation of the soft-NMI watchdog is that it does not work when
 * interrupts are hard disabled or otherwise not being serviced. This is
 * solved by also having an SMP watchdog where all CPUs check all other
 * CPUs' heartbeats.
 *
 * The SMP checker can detect lockups on other CPUs. A global "pending"
 * cpumask is kept, containing all CPUs which enable the watchdog. Each
 * CPU clears its pending bit in its heartbeat timer. When the bitmask
 * becomes empty, the last CPU to clear its pending bit updates a global
 * timestamp and refills the pending bitmask.
 *
 * In the heartbeat timer, if any CPU notices that the global timestamp has
 * not been updated for a period exceeding the watchdog threshold, then it
 * means the CPU(s) with their bit still set in the pending mask have had
 * their heartbeats stop, and action is taken.
 *
 * Some platforms implement true NMI IPIs, which can be used by the SMP
 * watchdog to detect an unresponsive CPU and pull it out of its stuck
 * state with the NMI IPI, to get crash/debug data from it. This way the
 * SMP watchdog can also detect lockups where hardware interrupts are off.
 */
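
/*
 * Overview of the watchdog state below:
 * - wd_cpus_enabled:      CPUs that have started their watchdog timer.
 * - wd_smp_cpus_pending:  CPUs that have not yet cleared their pending bit
 *                         in the current SMP checker period.
 * - wd_smp_cpus_stuck:    CPUs that have been declared locked up.
 * - wd_smp_last_reset_tb: timebase value at the last refill of the pending
 *                         mask; the SMP checker acts when this grows older
 *                         than wd_smp_panic_timeout_tb.
 */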

static cpumask_t wd_cpus_enabled __read_mostly;

static u64 wd_panic_timeout_tb __read_mostly; /* timebase ticks until panic */
static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */

static u64 wd_timer_period_ms __read_mostly; /* interval between heartbeats */

static DEFINE_PER_CPU(struct hrtimer, wd_hrtimer);
static DEFINE_PER_CPU(u64, wd_timer_tb);

/* SMP checker bits */
static unsigned long __wd_smp_lock;
static unsigned long __wd_reporting;
static cpumask_t wd_smp_cpus_pending;
static cpumask_t wd_smp_cpus_stuck;
static u64 wd_smp_last_reset_tb;

/*
 * Try to take the exclusive watchdog action / NMI IPI / printing lock.
 * wd_smp_lock must be held. If this fails, we should return and wait
 * for the watchdog to kick in again (or another CPU to trigger it).
 *
 * Importantly, if hardlockup_panic is set, wd_try_report failure should
 * not delay the panic, because whichever other CPU is reporting will
 * call panic.
 */
static bool wd_try_report(void)
{
	if (__wd_reporting)
		return false;
	__wd_reporting = 1;
	return true;
}

/* End printing after successful wd_try_report. wd_smp_lock not required. */
static void wd_end_reporting(void)
{
	smp_mb(); /* End printing "critical section" */
	WARN_ON_ONCE(__wd_reporting == 0);
	WRITE_ONCE(__wd_reporting, 0);
}

static inline void wd_smp_lock(unsigned long *flags)
{
	/*
	 * Avoid locking layers if possible.
	 * This may be called from low level interrupt handlers at some
	 * point in the future.
	 */
	raw_local_irq_save(*flags);
	hard_irq_disable(); /* Make it soft-NMI safe */
	while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) {
		raw_local_irq_restore(*flags);
		spin_until_cond(!test_bit(0, &__wd_smp_lock));
		raw_local_irq_save(*flags);
		hard_irq_disable();
	}
}

static inline void wd_smp_unlock(unsigned long *flags)
{
	clear_bit_unlock(0, &__wd_smp_lock);
	raw_local_irq_restore(*flags);
}

static void wd_lockup_ipi(struct pt_regs *regs)
{
	int cpu = raw_smp_processor_id();
	u64 tb = get_tb();

	pr_emerg("CPU %d Hard LOCKUP\n", cpu);
	pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
		 cpu, tb, per_cpu(wd_timer_tb, cpu),
		 tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);
	print_modules();
	print_irqtrace_events(current);
	if (regs)
		show_regs(regs);
	else
		dump_stack();

	/* Do not panic from here because that can recurse into NMI IPI layer */
}

/* Mark a CPU stuck; returns true if this emptied and refilled the pending mask */
static bool set_cpu_stuck(int cpu)
{
	cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
	/*
	 * See wd_smp_clear_cpu_pending()
	 */
	smp_mb();
	if (cpumask_empty(&wd_smp_cpus_pending)) {
		wd_smp_last_reset_tb = get_tb();
		cpumask_andnot(&wd_smp_cpus_pending,
			       &wd_cpus_enabled,
			       &wd_smp_cpus_stuck);
		return true;
	}
	return false;
}
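
/*
 * SMP checker action, run from the heartbeat timer of a CPU that is still
 * alive: if wd_smp_last_reset_tb has not been updated within
 * wd_smp_panic_timeout_tb, mark the CPUs still left in the pending mask as
 * stuck, send them an NMI IPI (or trigger a backtrace on all CPUs), and
 * panic if hardlockup_panic is set.
 */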
static void watchdog_smp_panic(int cpu)
{
	static cpumask_t wd_smp_cpus_ipi; // protected by reporting
	unsigned long flags;
	u64 tb;
	int c;

	wd_smp_lock(&flags);
	/* Double check some things under lock */
	tb = get_tb();
	if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb)
		goto out;
	if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
		goto out;
	if (!wd_try_report())
		goto out;
	for_each_online_cpu(c) {
		if (!cpumask_test_cpu(c, &wd_smp_cpus_pending))
			continue;
		if (c == cpu)
			continue; // should not happen

		__cpumask_set_cpu(c, &wd_smp_cpus_ipi);
		if (set_cpu_stuck(c))
			break;
	}
	if (cpumask_empty(&wd_smp_cpus_ipi)) {
		wd_end_reporting();
		goto out;
	}
	wd_smp_unlock(&flags);

	pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
		 cpu, cpumask_pr_args(&wd_smp_cpus_ipi));
	pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n",
		 cpu, tb, wd_smp_last_reset_tb,
		 tb_to_ns(tb - wd_smp_last_reset_tb) / 1000000);

	if (!sysctl_hardlockup_all_cpu_backtrace) {
		/*
		 * Try to trigger the stuck CPUs, unless we are going to
		 * get a backtrace on all of them anyway.
		 */
		for_each_cpu(c, &wd_smp_cpus_ipi) {
			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
			__cpumask_clear_cpu(c, &wd_smp_cpus_ipi);
		}
	} else {
		trigger_allbutself_cpu_backtrace();
		cpumask_clear(&wd_smp_cpus_ipi);
	}

	/*
	 * Force flush any remote buffers that might be stuck in IRQ context
	 * and therefore could not run their irq_work.
	 */
	printk_trigger_flush();

	if (hardlockup_panic)
		nmi_panic(NULL, "Hard LOCKUP");

	wd_end_reporting();

	return;

out:
	wd_smp_unlock(&flags);
}
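
/*
 * Heartbeat path: clear this CPU's bit in the pending mask, mostly without
 * taking wd_smp_lock. The last CPU to clear its bit takes the lock, updates
 * wd_smp_last_reset_tb and refills the mask from wd_cpus_enabled minus the
 * stuck CPUs. This also handles the case where this CPU had been marked
 * stuck by another CPU and has since become responsive again.
 */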
static void wd_smp_clear_cpu_pending(int cpu)
{
	if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
		if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
			struct pt_regs *regs = get_irq_regs();
			unsigned long flags;

			pr_emerg("CPU %d became unstuck TB:%lld\n",
				 cpu, get_tb());
			print_irqtrace_events(current);
			if (regs)
				show_regs(regs);
			else
				dump_stack();

			wd_smp_lock(&flags);
			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
			wd_smp_unlock(&flags);
		} else {
			/*
			 * The last CPU to clear pending should have reset the
			 * watchdog so we generally should not find it empty
			 * here if our CPU was clear. However it could happen
			 * due to a rare race with another CPU taking the
			 * last CPU out of the mask concurrently.
			 *
			 * We can't add a warning for it. But just in case
			 * there is a problem with the watchdog that is causing
			 * the mask to not be reset, try to kick it along here.
			 */
			if (unlikely(cpumask_empty(&wd_smp_cpus_pending)))
				goto none_pending;
		}
		return;
	}

	/*
	 * All other updates to wd_smp_cpus_pending are performed under
	 * wd_smp_lock. All of them are atomic except the case where the
	 * mask becomes empty and is reset. This will not happen here because
	 * cpu was tested to be in the bitmap (above), and a CPU only clears
	 * its own bit. _Except_ in the case where another CPU has detected a
	 * hard lockup on our CPU and takes us out of the pending mask. So in
	 * normal operation there will be no race here, no problem.
	 *
	 * In the lockup case, this atomic clear-bit vs a store that refills
	 * other bits in the accessed word will not be a problem. The bit clear
	 * is atomic so it will not cause the store to get lost, and the store
	 * will never set this bit so it will not overwrite the bit clear. The
	 * only way for a stuck CPU to return to the pending bitmap is to
	 * become unstuck itself.
	 */
	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);

	/*
	 * Order the store to clear pending with the load(s) that check all
	 * words in the pending mask for emptiness. This orders with the same
	 * barrier on another CPU. This prevents two CPUs clearing the last 2
	 * pending bits, but neither seeing the other's store when checking if
	 * the mask is empty, and missing an empty mask, which ends with a
	 * false positive.
	 */
	smp_mb();
	if (cpumask_empty(&wd_smp_cpus_pending)) {
		unsigned long flags;

none_pending:
		/*
		 * Double check under lock because more than one CPU could see
		 * a clear mask with the lockless check after clearing their
		 * pending bits.
		 */
		wd_smp_lock(&flags);
		if (cpumask_empty(&wd_smp_cpus_pending)) {
			wd_smp_last_reset_tb = get_tb();
			cpumask_andnot(&wd_smp_cpus_pending,
				       &wd_cpus_enabled,
				       &wd_smp_cpus_stuck);
		}
		wd_smp_unlock(&flags);
	}
}

/* Heartbeat: runs from each CPU's hrtimer once per wd_timer_period_ms */
static void watchdog_timer_interrupt(int cpu)
{
	u64 tb = get_tb();

	per_cpu(wd_timer_tb, cpu) = tb;

	wd_smp_clear_cpu_pending(cpu);

	if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
		watchdog_smp_panic(cpu);
}
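
/*
 * Soft-NMI checker: reached via the soft-masked timer interrupt path when
 * this CPU has had interrupts soft-disabled (see the comment at the top of
 * this file). If the local heartbeat timestamp is older than
 * wd_panic_timeout_tb, report a self-detected hard lockup on this CPU.
 */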
DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
{
	unsigned long flags;
	int cpu = raw_smp_processor_id();
	u64 tb;

	/* should only arrive from kernel, with irqs disabled */
	WARN_ON_ONCE(!arch_irq_disabled_regs(regs));

	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
		return 0;

	__this_cpu_inc(irq_stat.soft_nmi_irqs);

	tb = get_tb();
	if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) {
		/*
		 * Taking wd_smp_lock here means it is a soft-NMI lock, which
		 * means we can't take any regular or irqsafe spin locks while
		 * holding this lock. This is why timers can't printk while
		 * holding the lock.
		 */
		wd_smp_lock(&flags);
		if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
			wd_smp_unlock(&flags);
			return 0;
		}
		if (!wd_try_report()) {
			wd_smp_unlock(&flags);
			/* Couldn't report, try again in 100ms */
			mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000);
			return 0;
		}

		set_cpu_stuck(cpu);

		wd_smp_unlock(&flags);

		pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
			 cpu, (void *)regs->nip);
		pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
			 cpu, tb, per_cpu(wd_timer_tb, cpu),
			 tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);
		print_modules();
		print_irqtrace_events(current);
		show_regs(regs);

		if (sysctl_hardlockup_all_cpu_backtrace)
			trigger_allbutself_cpu_backtrace();

		if (hardlockup_panic)
			nmi_panic(regs, "Hard LOCKUP");

		wd_end_reporting();
	}
	/*
	 * We are okay to change DEC in soft_nmi_interrupt because the masked
	 * handler has marked a DEC as pending, so the timer interrupt will be
	 * replayed as soon as local irqs are enabled again.
	 */
	if (wd_panic_timeout_tb < 0x7fffffff)
		mtspr(SPRN_DEC, wd_panic_timeout_tb);

	return 0;
}

static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
	int cpu = smp_processor_id();

	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
		return HRTIMER_NORESTART;

	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
		return HRTIMER_NORESTART;

	watchdog_timer_interrupt(cpu);

	hrtimer_forward_now(hrtimer, ms_to_ktime(wd_timer_period_ms));

	return HRTIMER_RESTART;
}

void arch_touch_nmi_watchdog(void)
{
	unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
	int cpu = smp_processor_id();
	u64 tb;

	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
		return;

	tb = get_tb();
	if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
		per_cpu(wd_timer_tb, cpu) = tb;
		wd_smp_clear_cpu_pending(cpu);
	}
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);

static void start_watchdog(void *arg)
{
	struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
	int cpu = smp_processor_id();
	unsigned long flags;

	if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
		WARN_ON(1);
		return;
	}

	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
		return;

	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
		return;

	wd_smp_lock(&flags);
	cpumask_set_cpu(cpu, &wd_cpus_enabled);
	if (cpumask_weight(&wd_cpus_enabled) == 1) {
		cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
		wd_smp_last_reset_tb = get_tb();
	}
	wd_smp_unlock(&flags);

	*this_cpu_ptr(&wd_timer_tb) = get_tb();

	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer->function = watchdog_timer_fn;
	hrtimer_start(hrtimer, ms_to_ktime(wd_timer_period_ms),
		      HRTIMER_MODE_REL_PINNED);
}

static int start_watchdog_on_cpu(unsigned int cpu)
{
	return smp_call_function_single(cpu, start_watchdog, NULL, true);
}

static void stop_watchdog(void *arg)
{
	struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
	int cpu = smp_processor_id();
	unsigned long flags;

	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
		return; /* Can happen in CPU unplug case */

	hrtimer_cancel(hrtimer);

	wd_smp_lock(&flags);
	cpumask_clear_cpu(cpu, &wd_cpus_enabled);
	wd_smp_unlock(&flags);

	wd_smp_clear_cpu_pending(cpu);
}

static int stop_watchdog_on_cpu(unsigned int cpu)
{
	return smp_call_function_single(cpu, stop_watchdog, NULL, true);
}
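
/*
 * Derive the powerpc timeouts from watchdog_thresh (seconds). For example,
 * with watchdog_thresh at its usual default of 10 seconds, the heartbeat
 * timer runs every 4 seconds, the soft-NMI checker fires after 10 seconds
 * without a local heartbeat, and the SMP checker fires after 15 seconds
 * without a global reset.
 */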
static void watchdog_calc_timeouts(void)
{
	wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq;

	/* Have the SMP detector trigger a bit later */
	wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2;

	/* 2/5 is the factor that the perf based detector uses */
	wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
}

void watchdog_nmi_stop(void)
{
	int cpu;

	for_each_cpu(cpu, &wd_cpus_enabled)
		stop_watchdog_on_cpu(cpu);
}

void watchdog_nmi_start(void)
{
	int cpu;

	watchdog_calc_timeouts();
	for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
		start_watchdog_on_cpu(cpu);
}

/*
 * Invoked from core watchdog init.
 */
int __init watchdog_nmi_probe(void)
{
	int err;

	err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"powerpc/watchdog:online",
					start_watchdog_on_cpu,
					stop_watchdog_on_cpu);
	if (err < 0) {
		pr_warn("could not be initialized");
		return err;
	}
	return 0;
}