// SPDX-License-Identifier: GPL-2.0
/*
 * Detect hard and soft lockups on a system
 *
 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
 *
 * Note: Most of this code is borrowed heavily from the original softlockup
 * detector, so thanks to Ingo for the initial implementation.
 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
 * to those contributors as well.
 */

#define pr_fmt(fmt) "watchdog: " fmt

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/kernel_stat.h>
#include <linux/kvm_para.h>
#include <linux/math64.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/tick.h>

#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>

#include <asm/irq_regs.h>

static DEFINE_MUTEX(watchdog_mutex);

#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HARDLOCKUP_DETECTOR_SPARC64)
# define WATCHDOG_HARDLOCKUP_DEFAULT	1
#else
# define WATCHDOG_HARDLOCKUP_DEFAULT	0
#endif

#define NUM_SAMPLE_PERIODS	5

unsigned long __read_mostly watchdog_enabled;
int __read_mostly watchdog_user_enabled = 1;
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
static int __read_mostly watchdog_softlockup_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
static int __read_mostly watchdog_hardlockup_available;

struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);

#ifdef CONFIG_HARDLOCKUP_DETECTOR

# ifdef CONFIG_SMP
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
# endif /* CONFIG_SMP */

/*
 * Should we panic when a soft-lockup or hard-lockup occurs:
 */
unsigned int __read_mostly hardlockup_panic =
			IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
/*
 * We may not want to enable hard lockup detection by default in all cases,
 * for example when running the kernel as a guest on a hypervisor. In these
 * cases this function can be called to disable hard lockup detection. This
 * function should only be executed once by the boot processor before the
 * kernel command line parameters are parsed, because otherwise it is not
 * possible to override this in hardlockup_panic_setup().
 */
void __init hardlockup_detector_disable(void)
{
	watchdog_hardlockup_user_enabled = 0;
}

static int __init hardlockup_panic_setup(char *str)
{
	if (!strncmp(str, "panic", 5))
		hardlockup_panic = 1;
	else if (!strncmp(str, "nopanic", 7))
		hardlockup_panic = 0;
	else if (!strncmp(str, "0", 1))
		watchdog_hardlockup_user_enabled = 0;
	else if (!strncmp(str, "1", 1))
		watchdog_hardlockup_user_enabled = 1;
	return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
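
/*
 * Example uses of the "nmi_watchdog=" boot parameter handled above:
 *
 *   nmi_watchdog=panic    - keep the detector state, panic on a hard lockup
 *   nmi_watchdog=nopanic  - report a hard lockup but do not panic
 *   nmi_watchdog=0        - disable the hard lockup detector
 *   nmi_watchdog=1        - enable the hard lockup detector
 *
 * Only the prefixes are matched (strncmp), and the value is applied on top
 * of WATCHDOG_HARDLOCKUP_DEFAULT and CONFIG_BOOTPARAM_HARDLOCKUP_PANIC.
 */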

#endif /* CONFIG_HARDLOCKUP_DETECTOR */

#if defined(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER)

static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched);
static unsigned long hard_lockup_nmi_warn;

notrace void arch_touch_nmi_watchdog(void)
{
	/*
	 * Using __raw here because some code paths have
	 * preemption enabled.  If preemption is enabled
	 * then interrupts should be enabled too, in which
	 * case we shouldn't have to worry about the watchdog
	 * going off.
	 */
	raw_cpu_write(watchdog_hardlockup_touched, true);
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);

void watchdog_hardlockup_touch_cpu(unsigned int cpu)
{
	per_cpu(watchdog_hardlockup_touched, cpu) = true;
}

static bool is_hardlockup(unsigned int cpu)
{
	int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));

	if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
		return true;

	/*
	 * NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE
	 * for hrtimer_interrupts_saved. hrtimer_interrupts_saved is
	 * written/read by a single CPU.
	 */
	per_cpu(hrtimer_interrupts_saved, cpu) = hrint;

	return false;
}
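
/*
 * How the counting scheme above detects a hard lockup, sketched with the
 * defaults in this file (watchdog_thresh = 10): the per-CPU hrtimer below
 * increments hrtimer_interrupts every sample_period (4s by default), while
 * the hardlockup check runs from a context that keeps working even when
 * interrupts are stuck on the CPU (an NMI or the buddy CPU, depending on
 * the backend). If two consecutive checks see the same count, the hrtimer
 * has not fired for a whole check interval - several sample periods - and
 * the CPU is reported as hard locked up.
 */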

static void watchdog_hardlockup_kick(void)
{
	int new_interrupts;

	new_interrupts = atomic_inc_return(this_cpu_ptr(&hrtimer_interrupts));
	watchdog_buddy_check_hardlockup(new_interrupts);
}

void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
{
	if (per_cpu(watchdog_hardlockup_touched, cpu)) {
		per_cpu(watchdog_hardlockup_touched, cpu) = false;
		return;
	}

	/*
	 * Check for a hardlockup by making sure the CPU's timer
	 * interrupt is incrementing. The timer interrupt should have
	 * fired multiple times before we overflowed. If it hasn't,
	 * then this is a good indication the CPU is stuck.
	 */
	if (is_hardlockup(cpu)) {
		unsigned int this_cpu = smp_processor_id();
		unsigned long flags;

		/* Only print hardlockups once. */
		if (per_cpu(watchdog_hardlockup_warned, cpu))
			return;

		/*
		 * Prevent multiple hard-lockup reports if one cpu is already
		 * engaged in dumping all cpu back traces.
		 */
		if (sysctl_hardlockup_all_cpu_backtrace) {
			if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
				return;
		}

		/*
		 * NOTE: we call printk_cpu_sync_get_irqsave() after printing
		 * the lockup message. While it would be nice to serialize
		 * that printout, we really want to make sure that if some
		 * other CPU somehow locked up while holding the lock associated
		 * with printk_cpu_sync_get_irqsave() that we can still at least
		 * get the message about the lockup out.
		 */
		pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
		printk_cpu_sync_get_irqsave(flags);

		print_modules();
		print_irqtrace_events(current);
		if (cpu == this_cpu) {
			if (regs)
				show_regs(regs);
			else
				dump_stack();
			printk_cpu_sync_put_irqrestore(flags);
		} else {
			printk_cpu_sync_put_irqrestore(flags);
			trigger_single_cpu_backtrace(cpu);
		}

		if (sysctl_hardlockup_all_cpu_backtrace) {
			trigger_allbutcpu_cpu_backtrace(cpu);
			if (!hardlockup_panic)
				clear_bit_unlock(0, &hard_lockup_nmi_warn);
		}

		if (hardlockup_panic)
			nmi_panic(regs, "Hard LOCKUP");

		per_cpu(watchdog_hardlockup_warned, cpu) = true;
	} else {
		per_cpu(watchdog_hardlockup_warned, cpu) = false;
	}
}

#else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */

static inline void watchdog_hardlockup_kick(void) { }

#endif /* !CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */

/*
 * These functions can be overridden based on the configured hardlockup detector.
 *
 * watchdog_hardlockup_enable/disable can be implemented to start and stop
 * together with the softlockup watchdog. The detector must select the
 * SOFTLOCKUP_DETECTOR Kconfig.
 */
void __weak watchdog_hardlockup_enable(unsigned int cpu) { }

void __weak watchdog_hardlockup_disable(unsigned int cpu) { }

/*
 * Watchdog-detector specific API.
 *
 * Return 0 when hardlockup watchdog is available, negative value otherwise.
 * Note that the negative value means that a delayed probe might
 * succeed later.
 */
int __weak __init watchdog_hardlockup_probe(void)
{
	return -ENODEV;
}

/**
 * watchdog_hardlockup_stop - Stop the watchdog for reconfiguration
 *
 * The reconfiguration steps are:
 * watchdog_hardlockup_stop();
 * update_variables();
 * watchdog_hardlockup_start();
 */
void __weak watchdog_hardlockup_stop(void) { }

/**
 * watchdog_hardlockup_start - Start the watchdog after reconfiguration
 *
 * Counterpart to watchdog_hardlockup_stop().
 *
 * The following variables have been updated in update_variables() and
 * contain the currently valid configuration:
 * - watchdog_enabled
 * - watchdog_thresh
 * - watchdog_cpumask
 */
void __weak watchdog_hardlockup_start(void) { }

/**
 * lockup_detector_update_enable - Update the sysctl enable bit
 *
 * Caller needs to make sure that the hard watchdogs are off, so this
 * can't race with watchdog_hardlockup_disable().
 */
static void lockup_detector_update_enable(void)
{
	watchdog_enabled = 0;
	if (!watchdog_user_enabled)
		return;
	if (watchdog_hardlockup_available && watchdog_hardlockup_user_enabled)
		watchdog_enabled |= WATCHDOG_HARDLOCKUP_ENABLED;
	if (watchdog_softlockup_user_enabled)
		watchdog_enabled |= WATCHDOG_SOFTOCKUP_ENABLED;
}
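
/*
 * The resulting watchdog_enabled value, derived from the logic above
 * (WATCHDOG_HARDLOCKUP_ENABLED and WATCHDOG_SOFTOCKUP_ENABLED are
 * independent flag bits defined in <linux/nmi.h>):
 *
 *   watchdog_user_enabled == 0              -> 0, everything off
 *   hardlockup available and user enabled   -> WATCHDOG_HARDLOCKUP_ENABLED set
 *   softlockup user enabled                 -> WATCHDOG_SOFTOCKUP_ENABLED set
 *
 * A hardlockup detector that was requested but never probed successfully
 * (watchdog_hardlockup_available == 0) therefore stays off even with
 * watchdog_hardlockup_user_enabled == 1.
 */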

#ifdef CONFIG_SOFTLOCKUP_DETECTOR

/*
 * Delay the softlockup report when running known slow code.
 * It does _not_ affect the timestamp of the last successful reschedule.
 */
#define SOFTLOCKUP_DELAY_REPORT	ULONG_MAX

#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#endif

static struct cpumask watchdog_allowed_mask __read_mostly;

/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
			IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);

static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;

/* Timestamp taken after the last successful reschedule. */
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
/* Timestamp of the last softlockup report. */
static DEFINE_PER_CPU(unsigned long, watchdog_report_ts);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static unsigned long soft_lockup_nmi_warn;

static int __init softlockup_panic_setup(char *str)
{
	softlockup_panic = simple_strtoul(str, NULL, 0);
	return 1;
}
__setup("softlockup_panic=", softlockup_panic_setup);

static int __init nowatchdog_setup(char *str)
{
	watchdog_user_enabled = 0;
	return 1;
}
__setup("nowatchdog", nowatchdog_setup);

static int __init nosoftlockup_setup(char *str)
{
	watchdog_softlockup_user_enabled = 0;
	return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);

static int __init watchdog_thresh_setup(char *str)
{
	get_option(&str, &watchdog_thresh);
	return 1;
}
__setup("watchdog_thresh=", watchdog_thresh_setup);

static void __lockup_detector_cleanup(void);

#ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
enum stats_per_group {
	STATS_SYSTEM,
	STATS_SOFTIRQ,
	STATS_HARDIRQ,
	STATS_IDLE,
	NUM_STATS_PER_GROUP,
};

static const enum cpu_usage_stat tracked_stats[NUM_STATS_PER_GROUP] = {
	CPUTIME_SYSTEM,
	CPUTIME_SOFTIRQ,
	CPUTIME_IRQ,
	CPUTIME_IDLE,
};

static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]);
static DEFINE_PER_CPU(u8, cpustat_util[NUM_SAMPLE_PERIODS][NUM_STATS_PER_GROUP]);
static DEFINE_PER_CPU(u8, cpustat_tail);

/*
 * We don't need nanosecond resolution. A granularity of 16ms is
 * sufficient for our precision, allowing us to use u16 to store
 * cpustats, which will roll over roughly every ~1000 seconds.
 * 2^24 ~= 16 * 10^6
 */
static u16 get_16bit_precision(u64 data_ns)
{
	return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */
}

static void update_cpustat(void)
{
	int i;
	u8 util;
	u16 old_stat, new_stat;
	struct kernel_cpustat kcpustat;
	u64 *cpustat = kcpustat.cpustat;
	u8 tail = __this_cpu_read(cpustat_tail);
	u16 sample_period_16 = get_16bit_precision(sample_period);

	kcpustat_cpu_fetch(&kcpustat, smp_processor_id());

	for (i = 0; i < NUM_STATS_PER_GROUP; i++) {
		old_stat = __this_cpu_read(cpustat_old[i]);
		new_stat = get_16bit_precision(cpustat[tracked_stats[i]]);
		util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16);
		__this_cpu_write(cpustat_util[tail][i], util);
		__this_cpu_write(cpustat_old[i], new_stat);
	}

	__this_cpu_write(cpustat_tail, (tail + 1) % NUM_SAMPLE_PERIODS);
}
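
/*
 * A worked example of the fixed-point math above, assuming the default
 * watchdog_thresh of 10: sample_period is 4s = 4,000,000,000 ns, so
 * get_16bit_precision(sample_period) is 4e9 >> 24 ~= 238 units of ~16.8ms.
 * If a CPU spent 2s of the last sample period in hardirq context, the
 * CPUTIME_IRQ delta is ~119 units and the recorded utilization is
 * DIV_ROUND_UP(100 * 119, 238) = 50(%). The u16 counters wrap after
 * 2^16 * 2^24 ns = 2^40 ns (~18 minutes), matching the "roughly every
 * ~1000 seconds" note above; the unsigned subtraction still produces the
 * right delta across a single wrap.
 */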

static void print_cpustat(void)
{
	int i, group;
	u8 tail = __this_cpu_read(cpustat_tail);
	u64 sample_period_second = sample_period;

	do_div(sample_period_second, NSEC_PER_SEC);

	/*
	 * Outputting the "watchdog" prefix on every line is redundant and not
	 * concise, and the original alarm information is sufficient for
	 * positioning in logs, hence here printk() is used instead of pr_crit().
	 */
	printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n",
	       smp_processor_id(), sample_period_second);

	for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
		group = (tail + i) % NUM_SAMPLE_PERIODS;
		printk(KERN_CRIT "\t#%d: %3u%% system,\t%3u%% softirq,\t"
			"%3u%% hardirq,\t%3u%% idle\n", i + 1,
			__this_cpu_read(cpustat_util[group][STATS_SYSTEM]),
			__this_cpu_read(cpustat_util[group][STATS_SOFTIRQ]),
			__this_cpu_read(cpustat_util[group][STATS_HARDIRQ]),
			__this_cpu_read(cpustat_util[group][STATS_IDLE]));
	}
}

#define HARDIRQ_PERCENT_THRESH		50
#define NUM_HARDIRQ_REPORT		5
struct irq_counts {
	int irq;
	u32 counts;
};

static DEFINE_PER_CPU(bool, snapshot_taken);

/* Tabulate the most frequent interrupts. */
static void tabulate_irq_count(struct irq_counts *irq_counts, int irq, u32 counts, int rank)
{
	int i;
	struct irq_counts new_count = {irq, counts};

	for (i = 0; i < rank; i++) {
		if (counts > irq_counts[i].counts)
			swap(new_count, irq_counts[i]);
	}
}

/*
 * If the hardirq time exceeds HARDIRQ_PERCENT_THRESH% of the sample_period,
 * then the cause of softlockup might be interrupt storm. In this case, it
 * would be useful to start interrupt counting.
 */
static bool need_counting_irqs(void)
{
	u8 util;
	int tail = __this_cpu_read(cpustat_tail);

	tail = (tail + NUM_SAMPLE_PERIODS - 1) % NUM_SAMPLE_PERIODS;
	util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]);
	return util > HARDIRQ_PERCENT_THRESH;
}
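
/*
 * tabulate_irq_count() keeps irq_counts[] as a descending top-"rank" list:
 * because the array is already sorted, the first slot whose count is below
 * the new value starts a chain of swaps that inserts the new entry and
 * shifts the smaller ones down, dropping the smallest. For example, with
 * rank = 3 and a current list of {30, 20, 10}, inserting a count of 25
 * yields {30, 25, 20}. need_counting_irqs() looks at cpustat_util one slot
 * behind cpustat_tail, i.e. the most recently completed sample period.
 */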

static void start_counting_irqs(void)
{
	if (!__this_cpu_read(snapshot_taken)) {
		kstat_snapshot_irqs();
		__this_cpu_write(snapshot_taken, true);
	}
}

static void stop_counting_irqs(void)
{
	__this_cpu_write(snapshot_taken, false);
}

static void print_irq_counts(void)
{
	unsigned int i, count;
	struct irq_counts irq_counts_sorted[NUM_HARDIRQ_REPORT] = {
		{-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}
	};

	if (__this_cpu_read(snapshot_taken)) {
		for_each_active_irq(i) {
			count = kstat_get_irq_since_snapshot(i);
			tabulate_irq_count(irq_counts_sorted, i, count, NUM_HARDIRQ_REPORT);
		}

		/*
		 * Outputting the "watchdog" prefix on every line is redundant and not
		 * concise, and the original alarm information is sufficient for
		 * positioning in logs, hence here printk() is used instead of pr_crit().
		 */
		printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n",
		       smp_processor_id(), HARDIRQ_PERCENT_THRESH);

		for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
			if (irq_counts_sorted[i].irq == -1)
				break;

			printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n",
			       i + 1, irq_counts_sorted[i].counts,
			       irq_counts_sorted[i].irq);
		}

		/*
		 * If the hardirq time is less than HARDIRQ_PERCENT_THRESH% in the last
		 * sample_period, then we suspect the interrupt storm might be subsiding.
		 */
		if (!need_counting_irqs())
			stop_counting_irqs();
	}
}

static void report_cpu_status(void)
{
	print_cpustat();
	print_irq_counts();
}
#else
static inline void update_cpustat(void) { }
static inline void report_cpu_status(void) { }
static inline bool need_counting_irqs(void) { return false; }
static inline void start_counting_irqs(void) { }
static inline void stop_counting_irqs(void) { }
#endif

/*
 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
 * lockups can have false positives under extreme conditions, so we generally
 * want a higher threshold for soft lockups than for hard lockups. Therefore
 * we couple the thresholds with a factor: the soft threshold is twice the
 * hard threshold.
 */
static int get_softlockup_thresh(void)
{
	return watchdog_thresh * 2;
}

/*
 * Returns seconds, approximately.  We don't need nanosecond
 * resolution, and we don't need to waste time with a big divide when
 * 2^30ns == 1.074s.
 */
static unsigned long get_timestamp(void)
{
	return running_clock() >> 30LL;  /* 2^30 ~= 10^9 */
}

static void set_sample_period(void)
{
	/*
	 * Convert watchdog_thresh from seconds to ns.
	 * The divide by NUM_SAMPLE_PERIODS is to give the hrtimer several
	 * chances (two or three with the current relation between the soft
	 * and hard thresholds) to increment before the hardlockup detector
	 * generates a warning.
	 */
	sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / NUM_SAMPLE_PERIODS);
	watchdog_update_hrtimer_threshold(sample_period);
}
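
/*
 * Putting the numbers above together for the default watchdog_thresh of 10:
 * get_softlockup_thresh() returns 20 (seconds) and sample_period becomes
 * 20 * (1e9 / 5) ns = 4e9 ns, i.e. the per-CPU hrtimer fires every 4 seconds
 * and five such samples make up one softlockup window. get_timestamp()
 * shifts the ns clock right by 30 bits, so one "timestamp unit" is about
 * 1.074s and all later comparisons are done in (approximate) seconds.
 */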

static void update_report_ts(void)
{
	__this_cpu_write(watchdog_report_ts, get_timestamp());
}

/* Commands for resetting the watchdog */
static void update_touch_ts(void)
{
	__this_cpu_write(watchdog_touch_ts, get_timestamp());
	update_report_ts();
}

/**
 * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls
 *
 * Call when the scheduler may have stalled for legitimate reasons
 * preventing the watchdog task from executing - e.g. the scheduler
 * entering idle state.  This should only be used for scheduler events.
 * Use touch_softlockup_watchdog() for everything else.
 */
notrace void touch_softlockup_watchdog_sched(void)
{
	/*
	 * Preemption can be enabled.  It doesn't matter which CPU's watchdog
	 * report period gets restarted here, so use the raw_ operation.
	 */
	raw_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
}

notrace void touch_softlockup_watchdog(void)
{
	touch_softlockup_watchdog_sched();
	wq_watchdog_touch(raw_smp_processor_id());
}
EXPORT_SYMBOL(touch_softlockup_watchdog);

void touch_all_softlockup_watchdogs(void)
{
	int cpu;

	/*
	 * watchdog_mutex cannot be taken here, as this might be called
	 * from (soft)interrupt context, so the access to
	 * watchdog_allowed_cpumask might race with a concurrent update.
	 *
	 * The watchdog time stamp can race against a concurrent real
	 * update as well, the only side effect might be a cycle delay for
	 * the softlockup check.
	 */
	for_each_cpu(cpu, &watchdog_allowed_mask) {
		per_cpu(watchdog_report_ts, cpu) = SOFTLOCKUP_DELAY_REPORT;
		wq_watchdog_touch(cpu);
	}
}

void touch_softlockup_watchdog_sync(void)
{
	__this_cpu_write(softlockup_touch_sync, true);
	__this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
}

static int is_softlockup(unsigned long touch_ts,
			 unsigned long period_ts,
			 unsigned long now)
{
	if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) {
		/*
		 * If period_ts has not been updated during a sample_period, then
		 * in the subsequent few sample_periods, period_ts might also not
		 * be updated, which could indicate a potential softlockup. In
		 * this case, if we suspect the cause of the potential softlockup
		 * might be interrupt storm, then we need to count the interrupts
		 * to find which interrupt is storming.
		 */
		if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS) &&
		    need_counting_irqs())
			start_counting_irqs();

		/* Warn about unreasonable delays. */
		if (time_after(now, period_ts + get_softlockup_thresh()))
			return now - touch_ts;
	}
	return 0;
}
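
/*
 * A rough timeline with the defaults (watchdog_thresh = 10, so a 20s soft
 * threshold and 4s sample periods), assuming period_ts was last refreshed
 * at t = 0 and the CPU then stops scheduling the stopper task:
 *
 *   t >= 4s  - if the last completed sample shows more than 50% hardirq
 *              time, an interrupt snapshot is taken so the storming IRQ
 *              can be named in the eventual report;
 *   t > 20s  - is_softlockup() returns the stall length (now - touch_ts)
 *              and the timer function below emits the "soft lockup" report.
 *
 * The timestamps come from get_timestamp(), so these comparisons are made
 * in ~1.07s units rather than exact seconds.
 */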

/* watchdog detector functions */
static DEFINE_PER_CPU(struct completion, softlockup_completion);
static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);

/*
 * The watchdog feed function - touches the timestamp.
 *
 * It only runs once every sample_period seconds (4 seconds by
 * default) to reset the softlockup timestamp. If this gets delayed
 * for more than 2*watchdog_thresh seconds then the debug-printout
 * triggers in watchdog_timer_fn().
 */
static int softlockup_fn(void *data)
{
	update_touch_ts();
	stop_counting_irqs();
	complete(this_cpu_ptr(&softlockup_completion));

	return 0;
}

/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
	unsigned long touch_ts, period_ts, now;
	struct pt_regs *regs = get_irq_regs();
	int duration;
	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
	unsigned long flags;

	if (!watchdog_enabled)
		return HRTIMER_NORESTART;

	watchdog_hardlockup_kick();

	/* kick the softlockup detector */
	if (completion_done(this_cpu_ptr(&softlockup_completion))) {
		reinit_completion(this_cpu_ptr(&softlockup_completion));
		stop_one_cpu_nowait(smp_processor_id(),
				softlockup_fn, NULL,
				this_cpu_ptr(&softlockup_stop_work));
	}

	/* .. and repeat */
	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));

	/*
	 * Read the current timestamp first. It might become invalid anytime
	 * when a virtual machine is stopped by the host or when the watchdog
	 * is touched from NMI.
	 */
	now = get_timestamp();
	/*
	 * If a virtual machine is stopped by the host it can look to
	 * the watchdog like a soft lockup. This function touches the watchdog.
	 */
	kvm_check_and_clear_guest_paused();
	/*
	 * The stored timestamp is comparable with @now only when not touched.
	 * It might get touched anytime from NMI. Make sure that is_softlockup()
	 * uses the same (valid) value.
	 */
	period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));

	update_cpustat();

	/* Reset the interval when touched by known problematic code. */
	if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
		if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
			/*
			 * If the time stamp was touched atomically
			 * make sure the scheduler tick is up to date.
			 */
			__this_cpu_write(softlockup_touch_sync, false);
			sched_clock_tick();
		}

		update_report_ts();
		return HRTIMER_RESTART;
	}

	/* Check for a softlockup. */
	touch_ts = __this_cpu_read(watchdog_touch_ts);
	duration = is_softlockup(touch_ts, period_ts, now);
	if (unlikely(duration)) {
		/*
		 * Prevent multiple soft-lockup reports if one cpu is already
		 * engaged in dumping all cpu back traces.
		 */
		if (softlockup_all_cpu_backtrace) {
			if (test_and_set_bit_lock(0, &soft_lockup_nmi_warn))
				return HRTIMER_RESTART;
		}

		/* Start period for the next softlockup warning. */
		update_report_ts();

		printk_cpu_sync_get_irqsave(flags);
		pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
			smp_processor_id(), duration,
			current->comm, task_pid_nr(current));
		report_cpu_status();
		print_modules();
		print_irqtrace_events(current);
		if (regs)
			show_regs(regs);
		else
			dump_stack();
		printk_cpu_sync_put_irqrestore(flags);

		if (softlockup_all_cpu_backtrace) {
			trigger_allbutcpu_cpu_backtrace(smp_processor_id());
			if (!softlockup_panic)
				clear_bit_unlock(0, &soft_lockup_nmi_warn);
		}

		add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
		if (softlockup_panic)
			panic("softlockup: hung tasks");
	}

	return HRTIMER_RESTART;
}
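
/*
 * The hrtimer/stopper handshake above is what actually proves the CPU can
 * still schedule: the hard interrupt context only queues softlockup_fn()
 * through stop_one_cpu_nowait(), and softlockup_fn() runs as the stopper
 * task, i.e. at the highest scheduling priority. Only when that task gets
 * to run is watchdog_touch_ts refreshed and the completion marked done,
 * which re-arms the next kick. If anything below the stopper class hogs
 * the CPU, the completion stays pending, the timestamps age, and the
 * report above fires.
 */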

static void watchdog_enable(unsigned int cpu)
{
	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
	struct completion *done = this_cpu_ptr(&softlockup_completion);

	WARN_ON_ONCE(cpu != smp_processor_id());

	init_completion(done);
	complete(done);

	/*
	 * Start the timer first to prevent the hardlockup watchdog triggering
	 * before the timer has a chance to fire.
	 */
	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	hrtimer->function = watchdog_timer_fn;
	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
		      HRTIMER_MODE_REL_PINNED_HARD);

	/* Initialize timestamp */
	update_touch_ts();
	/* Enable the hardlockup detector */
	if (watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)
		watchdog_hardlockup_enable(cpu);
}

static void watchdog_disable(unsigned int cpu)
{
	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);

	WARN_ON_ONCE(cpu != smp_processor_id());

	/*
	 * Disable the hardlockup detector first. That prevents a false
	 * positive when a large delay occurs between disabling the timer
	 * and disabling the hardlockup detector.
	 */
	watchdog_hardlockup_disable(cpu);
	hrtimer_cancel(hrtimer);
	wait_for_completion(this_cpu_ptr(&softlockup_completion));
}

static int softlockup_stop_fn(void *data)
{
	watchdog_disable(smp_processor_id());
	return 0;
}

static void softlockup_stop_all(void)
{
	int cpu;

	if (!softlockup_initialized)
		return;

	for_each_cpu(cpu, &watchdog_allowed_mask)
		smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);

	cpumask_clear(&watchdog_allowed_mask);
}

static int softlockup_start_fn(void *data)
{
	watchdog_enable(smp_processor_id());
	return 0;
}

static void softlockup_start_all(void)
{
	int cpu;

	cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
	for_each_cpu(cpu, &watchdog_allowed_mask)
		smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
}

int lockup_detector_online_cpu(unsigned int cpu)
{
	if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
		watchdog_enable(cpu);
	return 0;
}

int lockup_detector_offline_cpu(unsigned int cpu)
{
	if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
		watchdog_disable(cpu);
	return 0;
}

static void __lockup_detector_reconfigure(void)
{
	cpus_read_lock();
	watchdog_hardlockup_stop();

	softlockup_stop_all();
	set_sample_period();
	lockup_detector_update_enable();
	if (watchdog_enabled && watchdog_thresh)
		softlockup_start_all();

	watchdog_hardlockup_start();
	cpus_read_unlock();
	/*
	 * Must be called outside the cpus locked section to prevent
	 * recursive locking in the perf code.
	 */
	__lockup_detector_cleanup();
}

void lockup_detector_reconfigure(void)
{
	mutex_lock(&watchdog_mutex);
	__lockup_detector_reconfigure();
	mutex_unlock(&watchdog_mutex);
}

/*
 * Create the watchdog infrastructure and configure the detector(s).
 */
static __init void lockup_detector_setup(void)
{
	/*
	 * If sysctl is off and watchdog got disabled on the command line,
	 * nothing to do here.
	 */
	lockup_detector_update_enable();

	if (!IS_ENABLED(CONFIG_SYSCTL) &&
	    !(watchdog_enabled && watchdog_thresh))
		return;

	mutex_lock(&watchdog_mutex);
	__lockup_detector_reconfigure();
	softlockup_initialized = true;
	mutex_unlock(&watchdog_mutex);
}

#else /* CONFIG_SOFTLOCKUP_DETECTOR */
static void __lockup_detector_reconfigure(void)
{
	cpus_read_lock();
	watchdog_hardlockup_stop();
	lockup_detector_update_enable();
	watchdog_hardlockup_start();
	cpus_read_unlock();
}
void lockup_detector_reconfigure(void)
{
	__lockup_detector_reconfigure();
}
static inline void lockup_detector_setup(void)
{
	__lockup_detector_reconfigure();
}
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */

static void __lockup_detector_cleanup(void)
{
	lockdep_assert_held(&watchdog_mutex);
	hardlockup_detector_perf_cleanup();
}

/**
 * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
 *
 * Caller must not hold the cpu hotplug rwsem.
 */
void lockup_detector_cleanup(void)
{
	mutex_lock(&watchdog_mutex);
	__lockup_detector_cleanup();
	mutex_unlock(&watchdog_mutex);
}

/**
 * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
 *
 * Special interface for parisc. It prevents lockup detector warnings from
 * the default pm_poweroff() function which busy loops forever.
 */
void lockup_detector_soft_poweroff(void)
{
	watchdog_enabled = 0;
}

#ifdef CONFIG_SYSCTL

/* Propagate any changes to the watchdog infrastructure */
static void proc_watchdog_update(void)
{
	/* Remove impossible cpus to keep sysctl output clean. */
	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
	__lockup_detector_reconfigure();
}

/*
 * common function for watchdog, nmi_watchdog and soft_watchdog parameter
 *
 * caller             | table->data points to            | 'which'
 * -------------------|----------------------------------|-------------------------------
 * proc_watchdog      | watchdog_user_enabled            | WATCHDOG_HARDLOCKUP_ENABLED |
 *                    |                                  | WATCHDOG_SOFTOCKUP_ENABLED
 * -------------------|----------------------------------|-------------------------------
 * proc_nmi_watchdog  | watchdog_hardlockup_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED
 * -------------------|----------------------------------|-------------------------------
 * proc_soft_watchdog | watchdog_softlockup_user_enabled | WATCHDOG_SOFTOCKUP_ENABLED
 */
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
				void *buffer, size_t *lenp, loff_t *ppos)
{
	int err, old, *param = table->data;

	mutex_lock(&watchdog_mutex);

	if (!write) {
		/*
		 * On read synchronize the userspace interface. This is a
		 * racy snapshot.
		 */
		*param = (watchdog_enabled & which) != 0;
		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	} else {
		old = READ_ONCE(*param);
		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
		if (!err && old != READ_ONCE(*param))
			proc_watchdog_update();
	}
	mutex_unlock(&watchdog_mutex);
	return err;
}

/*
 * /proc/sys/kernel/watchdog
 */
static int proc_watchdog(struct ctl_table *table, int write,
			 void *buffer, size_t *lenp, loff_t *ppos)
{
	return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED |
				    WATCHDOG_SOFTOCKUP_ENABLED,
				    table, write, buffer, lenp, ppos);
}

/*
 * /proc/sys/kernel/nmi_watchdog
 */
static int proc_nmi_watchdog(struct ctl_table *table, int write,
			     void *buffer, size_t *lenp, loff_t *ppos)
{
	if (!watchdog_hardlockup_available && write)
		return -ENOTSUPP;
	return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED,
				    table, write, buffer, lenp, ppos);
}

#ifdef CONFIG_SOFTLOCKUP_DETECTOR
/*
 * /proc/sys/kernel/soft_watchdog
 */
static int proc_soft_watchdog(struct ctl_table *table, int write,
			      void *buffer, size_t *lenp, loff_t *ppos)
{
	return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED,
				    table, write, buffer, lenp, ppos);
}
#endif

/*
 * /proc/sys/kernel/watchdog_thresh
 */
static int proc_watchdog_thresh(struct ctl_table *table, int write,
				void *buffer, size_t *lenp, loff_t *ppos)
{
	int err, old;

	mutex_lock(&watchdog_mutex);

	old = READ_ONCE(watchdog_thresh);
	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (!err && write && old != READ_ONCE(watchdog_thresh))
		proc_watchdog_update();

	mutex_unlock(&watchdog_mutex);
	return err;
}
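
/*
 * Typical runtime interaction with the handlers above, for example from a
 * shell (the exact set of files depends on the Kconfig options selected):
 *
 *   echo 0  > /proc/sys/kernel/watchdog        # stop both detectors
 *   echo 0  > /proc/sys/kernel/soft_watchdog   # stop only the softlockup detector
 *   cat       /proc/sys/kernel/nmi_watchdog    # 1 if the hard detector is enabled
 *   echo 30 > /proc/sys/kernel/watchdog_thresh # 30s hard / 60s soft threshold
 *
 * watchdog_thresh accepts 0..60, enforced by the sysctl table below; the
 * defaults correspond to "echo 10".
 */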

/*
 * The cpumask is the mask of possible cpus that the watchdog can run
 * on, not the mask of cpus it is actually running on.  This allows the
 * user to specify a mask that will include cpus that have not yet
 * been brought online, if desired.
 */
static int proc_watchdog_cpumask(struct ctl_table *table, int write,
				 void *buffer, size_t *lenp, loff_t *ppos)
{
	int err;

	mutex_lock(&watchdog_mutex);

	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
	if (!err && write)
		proc_watchdog_update();

	mutex_unlock(&watchdog_mutex);
	return err;
}
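
/*
 * proc_do_large_bitmap() parses a comma-separated list of CPU ranges, so
 * for example (assuming CPUs 0-7 are possible):
 *
 *   echo 0-3,6 > /proc/sys/kernel/watchdog_cpumask
 *
 * restricts the per-CPU watchdog timers to CPUs 0, 1, 2, 3 and 6 on the
 * next reconfiguration; impossible CPUs are masked out again by
 * proc_watchdog_update() before the new mask is applied.
 */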

static const int sixty = 60;

static struct ctl_table watchdog_sysctls[] = {
	{
		.procname       = "watchdog",
		.data		= &watchdog_user_enabled,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler   = proc_watchdog,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "watchdog_thresh",
		.data		= &watchdog_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler   = proc_watchdog_thresh,
		.extra1		= SYSCTL_ZERO,
		.extra2		= (void *)&sixty,
	},
	{
		.procname	= "watchdog_cpumask",
		.data		= &watchdog_cpumask_bits,
		.maxlen		= NR_CPUS,
		.mode		= 0644,
		.proc_handler	= proc_watchdog_cpumask,
	},
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
	{
		.procname       = "soft_watchdog",
		.data		= &watchdog_softlockup_user_enabled,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler   = proc_soft_watchdog,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "softlockup_panic",
		.data		= &softlockup_panic,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
#ifdef CONFIG_SMP
	{
		.procname	= "softlockup_all_cpu_backtrace",
		.data		= &sysctl_softlockup_all_cpu_backtrace,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
#endif /* CONFIG_SMP */
#endif
#ifdef CONFIG_HARDLOCKUP_DETECTOR
	{
		.procname	= "hardlockup_panic",
		.data		= &hardlockup_panic,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
#ifdef CONFIG_SMP
	{
		.procname	= "hardlockup_all_cpu_backtrace",
		.data		= &sysctl_hardlockup_all_cpu_backtrace,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
#endif /* CONFIG_SMP */
#endif
};

static struct ctl_table watchdog_hardlockup_sysctl[] = {
	{
		.procname	= "nmi_watchdog",
		.data		= &watchdog_hardlockup_user_enabled,
		.maxlen		= sizeof(int),
		.mode		= 0444,
		.proc_handler	= proc_nmi_watchdog,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
};

static void __init watchdog_sysctl_init(void)
{
	register_sysctl_init("kernel", watchdog_sysctls);

	if (watchdog_hardlockup_available)
		watchdog_hardlockup_sysctl[0].mode = 0644;
	register_sysctl_init("kernel", watchdog_hardlockup_sysctl);
}

#else
#define watchdog_sysctl_init() do { } while (0)
#endif /* CONFIG_SYSCTL */

static void __init lockup_detector_delay_init(struct work_struct *work);
static bool allow_lockup_detector_init_retry __initdata;

static struct work_struct detector_work __initdata =
		__WORK_INITIALIZER(detector_work, lockup_detector_delay_init);

static void __init lockup_detector_delay_init(struct work_struct *work)
{
	int ret;

	ret = watchdog_hardlockup_probe();
	if (ret) {
		pr_info("Delayed init of the lockup detector failed: %d\n", ret);
		pr_info("Hard watchdog permanently disabled\n");
		return;
	}

	allow_lockup_detector_init_retry = false;

	watchdog_hardlockup_available = true;
	lockup_detector_setup();
}

/*
 * lockup_detector_retry_init - retry lockup detector init if possible.
 *
 * Retry the hardlockup detector init when it depends on functionality
 * that is only initialized later on a particular platform.
 */
void __init lockup_detector_retry_init(void)
{
	/* Must be called before late init calls */
	if (!allow_lockup_detector_init_retry)
		return;

	schedule_work(&detector_work);
}

/*
 * Ensure that the optional delayed hardlockup init has been processed
 * before the init code and memory are freed.
 */
static int __init lockup_detector_check(void)
{
	/* Prevent any later retry. */
	allow_lockup_detector_init_retry = false;

	/* Make sure no work is pending. */
	flush_work(&detector_work);

	watchdog_sysctl_init();

	return 0;
}
late_initcall_sync(lockup_detector_check);

void __init lockup_detector_init(void)
{
	if (tick_nohz_full_enabled())
		pr_info("Disabling watchdog on nohz_full cores by default\n");

	cpumask_copy(&watchdog_cpumask,
		     housekeeping_cpumask(HK_TYPE_TIMER));

	if (!watchdog_hardlockup_probe())
		watchdog_hardlockup_available = true;
	else
		allow_lockup_detector_init_retry = true;

	lockup_detector_setup();
}