xref: /linux/kernel/watchdog.c (revision f11c1efe46ad84555a0948401c7bdb63d711088d)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Detect hard and soft lockups on a system
4  *
5  * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
6  *
7  * Note: Most of this code is borrowed heavily from the original softlockup
8  * detector, so thanks to Ingo for the initial implementation.
9  * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
10  * to those contributors as well.
11  */
12 
13 #define pr_fmt(fmt) "watchdog: " fmt
14 
15 #include <linux/cpu.h>
16 #include <linux/init.h>
17 #include <linux/irq.h>
18 #include <linux/irqdesc.h>
19 #include <linux/kernel_stat.h>
20 #include <linux/kvm_para.h>
21 #include <linux/math64.h>
22 #include <linux/mm.h>
23 #include <linux/module.h>
24 #include <linux/nmi.h>
25 #include <linux/stop_machine.h>
26 #include <linux/sysctl.h>
27 #include <linux/tick.h>
28 
29 #include <linux/sched/clock.h>
30 #include <linux/sched/debug.h>
31 #include <linux/sched/isolation.h>
32 
33 #include <asm/irq_regs.h>
34 
35 static DEFINE_MUTEX(watchdog_mutex);
36 
37 #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HARDLOCKUP_DETECTOR_SPARC64)
38 # define WATCHDOG_HARDLOCKUP_DEFAULT	1
39 #else
40 # define WATCHDOG_HARDLOCKUP_DEFAULT	0
41 #endif
42 
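/*
 * The soft lockup detection window (2 * watchdog_thresh seconds) is split
 * into this many hrtimer sample periods; see set_sample_period() below.
 */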
43 #define NUM_SAMPLE_PERIODS	5
44 
45 unsigned long __read_mostly watchdog_enabled;
46 int __read_mostly watchdog_user_enabled = 1;
47 static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
48 static int __read_mostly watchdog_softlockup_user_enabled = 1;
49 int __read_mostly watchdog_thresh = 10;
50 static int __read_mostly watchdog_thresh_next;
51 static int __read_mostly watchdog_hardlockup_available;
52 
53 struct cpumask watchdog_cpumask __read_mostly;
54 unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
55 
56 #ifdef CONFIG_HARDLOCKUP_DETECTOR
57 
58 # ifdef CONFIG_SMP
59 int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
60 # endif /* CONFIG_SMP */
61 
62 /*
63  * Should we panic when a hard-lockup occurs:
64  */
65 unsigned int __read_mostly hardlockup_panic =
66 			IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
67 /*
68  * We may not want to enable hard lockup detection by default in all cases,
69  * for example when running the kernel as a guest on a hypervisor. In these
70  * cases this function can be called to disable hard lockup detection. This
71  * function should only be executed once by the boot processor before the
72  * kernel command line parameters are parsed, because otherwise it is not
73  * possible to override this in hardlockup_panic_setup().
74  */
75 void __init hardlockup_detector_disable(void)
76 {
77 	watchdog_hardlockup_user_enabled = 0;
78 }
79 
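/*
 * Parse the "nmi_watchdog=" boot parameter. The value is a comma-separated
 * list, e.g. "nmi_watchdog=panic,1" enables the hard lockup detector and
 * panics on detection, while a token starting with "r" hands the remainder
 * to hardlockup_config_perf_event() (raw perf event configuration).
 */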
80 static int __init hardlockup_panic_setup(char *str)
81 {
82 next:
83 	if (!strncmp(str, "panic", 5))
84 		hardlockup_panic = 1;
85 	else if (!strncmp(str, "nopanic", 7))
86 		hardlockup_panic = 0;
87 	else if (!strncmp(str, "0", 1))
88 		watchdog_hardlockup_user_enabled = 0;
89 	else if (!strncmp(str, "1", 1))
90 		watchdog_hardlockup_user_enabled = 1;
91 	else if (!strncmp(str, "r", 1))
92 		hardlockup_config_perf_event(str + 1);
93 	while (*(str++)) {
94 		if (*str == ',') {
95 			str++;
96 			goto next;
97 		}
98 	}
99 	return 1;
100 }
101 __setup("nmi_watchdog=", hardlockup_panic_setup);
102 
103 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
104 
105 #if defined(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER)
106 
107 static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
108 static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
109 static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
110 static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched);
111 static unsigned long hard_lockup_nmi_warn;
112 
113 notrace void arch_touch_nmi_watchdog(void)
114 {
115 	/*
116 	 * Using __raw here because some code paths have
117 	 * preemption enabled.  If preemption is enabled
118 	 * then interrupts should be enabled too, in which
119 	 * case we shouldn't have to worry about the watchdog
120 	 * going off.
121 	 */
122 	raw_cpu_write(watchdog_hardlockup_touched, true);
123 }
124 EXPORT_SYMBOL(arch_touch_nmi_watchdog);
125 
126 void watchdog_hardlockup_touch_cpu(unsigned int cpu)
127 {
128 	per_cpu(watchdog_hardlockup_touched, cpu) = true;
129 }
130 
131 static bool is_hardlockup(unsigned int cpu)
132 {
133 	int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
134 
135 	if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
136 		return true;
137 
138 	/*
139 	 * NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE
140 	 * for hrtimer_interrupts_saved. hrtimer_interrupts_saved is
141 	 * written/read by a single CPU.
142 	 */
143 	per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
144 
145 	return false;
146 }
147 
148 static void watchdog_hardlockup_kick(void)
149 {
150 	int new_interrupts;
151 
152 	new_interrupts = atomic_inc_return(this_cpu_ptr(&hrtimer_interrupts));
153 	watchdog_buddy_check_hardlockup(new_interrupts);
154 }
155 
156 void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
157 {
158 	if (per_cpu(watchdog_hardlockup_touched, cpu)) {
159 		per_cpu(watchdog_hardlockup_touched, cpu) = false;
160 		return;
161 	}
162 
163 	/*
164 	 * Check for a hardlockup by making sure the CPU's timer
165 	 * interrupt is incrementing. The timer interrupt should have
166 	 * fired multiple times since we last checked. If it hasn't,
167 	 * that is a good indication the CPU is stuck.
168 	 */
169 	if (is_hardlockup(cpu)) {
170 		unsigned int this_cpu = smp_processor_id();
171 		unsigned long flags;
172 
173 		/* Only print hardlockups once. */
174 		if (per_cpu(watchdog_hardlockup_warned, cpu))
175 			return;
176 
177 		/*
178 		 * Prevent multiple hard-lockup reports if one cpu is already
179 		 * engaged in dumping all cpu back traces.
180 		 */
181 		if (sysctl_hardlockup_all_cpu_backtrace) {
182 			if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
183 				return;
184 		}
185 
186 		/*
187 		 * NOTE: we call printk_cpu_sync_get_irqsave() after printing
188 		 * the lockup message. While it would be nice to serialize
189 		 * that printout, we really want to make sure that if some
190 		 * other CPU somehow locked up while holding the lock associated
191 		 * with printk_cpu_sync_get_irqsave() that we can still at least
192 		 * get the message about the lockup out.
193 		 */
194 		pr_emerg("CPU%u: Watchdog detected hard LOCKUP on cpu %u\n", this_cpu, cpu);
195 		printk_cpu_sync_get_irqsave(flags);
196 
197 		print_modules();
198 		print_irqtrace_events(current);
199 		if (cpu == this_cpu) {
200 			if (regs)
201 				show_regs(regs);
202 			else
203 				dump_stack();
204 			printk_cpu_sync_put_irqrestore(flags);
205 		} else {
206 			printk_cpu_sync_put_irqrestore(flags);
207 			trigger_single_cpu_backtrace(cpu);
208 		}
209 
210 		if (sysctl_hardlockup_all_cpu_backtrace) {
211 			trigger_allbutcpu_cpu_backtrace(cpu);
212 			if (!hardlockup_panic)
213 				clear_bit_unlock(0, &hard_lockup_nmi_warn);
214 		}
215 
216 		if (hardlockup_panic)
217 			nmi_panic(regs, "Hard LOCKUP");
218 
219 		per_cpu(watchdog_hardlockup_warned, cpu) = true;
220 	} else {
221 		per_cpu(watchdog_hardlockup_warned, cpu) = false;
222 	}
223 }
224 
225 #else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */
226 
227 static inline void watchdog_hardlockup_kick(void) { }
228 
229 #endif /* !CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */
230 
231 /*
232  * These functions can be overridden based on the configured hardlockup detector.
233  *
234  * watchdog_hardlockup_enable/disable can be implemented to start and stop when
235  * the softlockup watchdog starts and stops. The detector must select the
236  * SOFTLOCKUP_DETECTOR Kconfig.
237  */
238 void __weak watchdog_hardlockup_enable(unsigned int cpu) { }
239 
240 void __weak watchdog_hardlockup_disable(unsigned int cpu) { }
241 
242 /*
243  * Watchdog-detector specific API.
244  *
245  * Return 0 when the hardlockup watchdog is available, a negative value otherwise.
246  * Note that the negative value means that a delayed probe might
247  * succeed later.
248  */
249 int __weak __init watchdog_hardlockup_probe(void)
250 {
251 	return -ENODEV;
252 }
253 
254 /**
255  * watchdog_hardlockup_stop - Stop the watchdog for reconfiguration
256  *
257  * The reconfiguration steps are:
258  * watchdog_hardlockup_stop();
259  * update_variables();
260  * watchdog_hardlockup_start();
261  */
262 void __weak watchdog_hardlockup_stop(void) { }
263 
264 /**
265  * watchdog_hardlockup_start - Start the watchdog after reconfiguration
266  *
267  * Counterpart to watchdog_hardlockup_stop().
268  *
269  * The following variables have been updated in update_variables() and
270  * contain the currently valid configuration:
271  * - watchdog_enabled
272  * - watchdog_thresh
273  * - watchdog_cpumask
274  */
275 void __weak watchdog_hardlockup_start(void) { }
276 
277 /**
278  * lockup_detector_update_enable - Update the sysctl enable bit
279  *
280  * Caller needs to make sure that the hard watchdogs are off, so this
281  * can't race with watchdog_hardlockup_disable().
282  */
283 static void lockup_detector_update_enable(void)
284 {
285 	watchdog_enabled = 0;
286 	if (!watchdog_user_enabled)
287 		return;
288 	if (watchdog_hardlockup_available && watchdog_hardlockup_user_enabled)
289 		watchdog_enabled |= WATCHDOG_HARDLOCKUP_ENABLED;
290 	if (watchdog_softlockup_user_enabled)
291 		watchdog_enabled |= WATCHDOG_SOFTOCKUP_ENABLED;
292 }
293 
294 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
295 
296 /*
297  * Delay the softlockup report when running known slow code.
298  * It does _not_ affect the timestamp of the last successful reschedule.
299  */
300 #define SOFTLOCKUP_DELAY_REPORT	ULONG_MAX
301 
302 #ifdef CONFIG_SMP
303 int __read_mostly sysctl_softlockup_all_cpu_backtrace;
304 #endif
305 
306 static struct cpumask watchdog_allowed_mask __read_mostly;
307 
308 /* Global variables, exported for sysctl */
309 unsigned int __read_mostly softlockup_panic =
310 			IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);
311 
312 static bool softlockup_initialized __read_mostly;
313 static u64 __read_mostly sample_period;
314 
315 /* Timestamp taken after the last successful reschedule. */
316 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
317 /* Timestamp of the last softlockup report. */
318 static DEFINE_PER_CPU(unsigned long, watchdog_report_ts);
319 static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
320 static DEFINE_PER_CPU(bool, softlockup_touch_sync);
321 static unsigned long soft_lockup_nmi_warn;
322 
323 static int __init softlockup_panic_setup(char *str)
324 {
325 	softlockup_panic = simple_strtoul(str, NULL, 0);
326 	return 1;
327 }
328 __setup("softlockup_panic=", softlockup_panic_setup);
329 
330 static int __init nowatchdog_setup(char *str)
331 {
332 	watchdog_user_enabled = 0;
333 	return 1;
334 }
335 __setup("nowatchdog", nowatchdog_setup);
336 
337 static int __init nosoftlockup_setup(char *str)
338 {
339 	watchdog_softlockup_user_enabled = 0;
340 	return 1;
341 }
342 __setup("nosoftlockup", nosoftlockup_setup);
343 
344 static int __init watchdog_thresh_setup(char *str)
345 {
346 	get_option(&str, &watchdog_thresh);
347 	return 1;
348 }
349 __setup("watchdog_thresh=", watchdog_thresh_setup);
350 
351 #ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
352 enum stats_per_group {
353 	STATS_SYSTEM,
354 	STATS_SOFTIRQ,
355 	STATS_HARDIRQ,
356 	STATS_IDLE,
357 	NUM_STATS_PER_GROUP,
358 };
359 
360 static const enum cpu_usage_stat tracked_stats[NUM_STATS_PER_GROUP] = {
361 	CPUTIME_SYSTEM,
362 	CPUTIME_SOFTIRQ,
363 	CPUTIME_IRQ,
364 	CPUTIME_IDLE,
365 };
366 
367 static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]);
368 static DEFINE_PER_CPU(u8, cpustat_util[NUM_SAMPLE_PERIODS][NUM_STATS_PER_GROUP]);
369 static DEFINE_PER_CPU(u8, cpustat_tail);
370 
371 /*
372  * We don't need nanosecond resolution. A granularity of 16ms is
373  * sufficient for our precision, allowing us to use u16 to store
374  * cpustats, which will roll over roughly every 1000 seconds.
375  * 2^24 ~= 16 * 10^6
376  */
377 static u16 get_16bit_precision(u64 data_ns)
378 {
379 	return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */
380 }
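/* E.g. the default 4 s sample period (4e9 ns) becomes 4e9 >> 24 = 238 units. */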
381 
382 static void update_cpustat(void)
383 {
384 	int i;
385 	u8 util;
386 	u16 old_stat, new_stat;
387 	struct kernel_cpustat kcpustat;
388 	u64 *cpustat = kcpustat.cpustat;
389 	u8 tail = __this_cpu_read(cpustat_tail);
390 	u16 sample_period_16 = get_16bit_precision(sample_period);
391 
392 	kcpustat_cpu_fetch(&kcpustat, smp_processor_id());
393 
394 	for (i = 0; i < NUM_STATS_PER_GROUP; i++) {
395 		old_stat = __this_cpu_read(cpustat_old[i]);
396 		new_stat = get_16bit_precision(cpustat[tracked_stats[i]]);
397 		util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16);
398 		__this_cpu_write(cpustat_util[tail][i], util);
399 		__this_cpu_write(cpustat_old[i], new_stat);
400 	}
401 
402 	__this_cpu_write(cpustat_tail, (tail + 1) % NUM_SAMPLE_PERIODS);
403 }
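/*
 * With the default 4 s sample period, the cpustat_util ring buffer thus holds
 * one per-type utilization snapshot for each of the last NUM_SAMPLE_PERIODS
 * (i.e. roughly the 20 s) leading up to a lockup report.
 */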
404 
405 static void print_cpustat(void)
406 {
407 	int i, group;
408 	u8 tail = __this_cpu_read(cpustat_tail);
409 	u64 sample_period_second = sample_period;
410 
411 	do_div(sample_period_second, NSEC_PER_SEC);
412 
413 	/*
414 	 * Printing the "watchdog:" prefix on every line would be redundant;
415 	 * the initial lockup message is enough to locate these lines in the
416 	 * log, so printk() is used here instead of pr_crit().
417 	 */
418 	printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n",
419 	       smp_processor_id(), sample_period_second);
420 
421 	for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
422 		group = (tail + i) % NUM_SAMPLE_PERIODS;
423 		printk(KERN_CRIT "\t#%d: %3u%% system,\t%3u%% softirq,\t"
424 			"%3u%% hardirq,\t%3u%% idle\n", i + 1,
425 			__this_cpu_read(cpustat_util[group][STATS_SYSTEM]),
426 			__this_cpu_read(cpustat_util[group][STATS_SOFTIRQ]),
427 			__this_cpu_read(cpustat_util[group][STATS_HARDIRQ]),
428 			__this_cpu_read(cpustat_util[group][STATS_IDLE]));
429 	}
430 }
431 
432 #define HARDIRQ_PERCENT_THRESH          50
433 #define NUM_HARDIRQ_REPORT              5
434 struct irq_counts {
435 	int irq;
436 	u32 counts;
437 };
438 
439 static DEFINE_PER_CPU(bool, snapshot_taken);
440 
441 /* Tabulate the most frequent interrupts. */
442 static void tabulate_irq_count(struct irq_counts *irq_counts, int irq, u32 counts, int rank)
443 {
444 	int i;
445 	struct irq_counts new_count = {irq, counts};
446 
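	/*
	 * Insertion into the descending top-'rank' array: once a slot smaller
	 * than 'counts' is found, the swaps cascade so the new entry takes
	 * that slot and the displaced entries shift down one position,
	 * dropping the smallest one.
	 */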
447 	for (i = 0; i < rank; i++) {
448 		if (counts > irq_counts[i].counts)
449 			swap(new_count, irq_counts[i]);
450 	}
451 }
452 
453 /*
454  * If the hardirq time exceeds HARDIRQ_PERCENT_THRESH% of the sample_period,
455  * then the cause of the softlockup might be an interrupt storm. In this case, it
456  * would be useful to start interrupt counting.
457  */
458 static bool need_counting_irqs(void)
459 {
460 	u8 util;
461 	int tail = __this_cpu_read(cpustat_tail);
462 
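	/* Step back one slot to the most recently completed sample period. */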
463 	tail = (tail + NUM_SAMPLE_PERIODS - 1) % NUM_SAMPLE_PERIODS;
464 	util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]);
465 	return util > HARDIRQ_PERCENT_THRESH;
466 }
467 
468 static void start_counting_irqs(void)
469 {
470 	if (!__this_cpu_read(snapshot_taken)) {
471 		kstat_snapshot_irqs();
472 		__this_cpu_write(snapshot_taken, true);
473 	}
474 }
475 
476 static void stop_counting_irqs(void)
477 {
478 	__this_cpu_write(snapshot_taken, false);
479 }
480 
481 static void print_irq_counts(void)
482 {
483 	unsigned int i, count;
484 	struct irq_counts irq_counts_sorted[NUM_HARDIRQ_REPORT] = {
485 		{-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}
486 	};
487 
488 	if (__this_cpu_read(snapshot_taken)) {
489 		for_each_active_irq(i) {
490 			count = kstat_get_irq_since_snapshot(i);
491 			tabulate_irq_count(irq_counts_sorted, i, count, NUM_HARDIRQ_REPORT);
492 		}
493 
494 		/*
495 		 * Printing the "watchdog:" prefix on every line would be redundant;
496 		 * the initial lockup message is enough to locate these lines in the
497 		 * log, so printk() is used here instead of pr_crit().
498 		 */
499 		printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n",
500 		       smp_processor_id(), HARDIRQ_PERCENT_THRESH);
501 
502 		for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
503 			if (irq_counts_sorted[i].irq == -1)
504 				break;
505 
506 			printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n",
507 			       i + 1, irq_counts_sorted[i].counts,
508 			       irq_counts_sorted[i].irq);
509 		}
510 
511 		/*
512 		 * If the hardirq time is less than HARDIRQ_PERCENT_THRESH% in the last
513 		 * sample_period, then we suspect the interrupt storm might be subsiding.
514 		 */
515 		if (!need_counting_irqs())
516 			stop_counting_irqs();
517 	}
518 }
519 
520 static void report_cpu_status(void)
521 {
522 	print_cpustat();
523 	print_irq_counts();
524 }
525 #else
526 static inline void update_cpustat(void) { }
527 static inline void report_cpu_status(void) { }
528 static inline bool need_counting_irqs(void) { return false; }
529 static inline void start_counting_irqs(void) { }
530 static inline void stop_counting_irqs(void) { }
531 #endif
532 
533 /*
534  * Hard-lockup warnings should be triggered after just a few seconds, while
535  * soft lockups can have false positives under extreme conditions, so we
536  * generally want a higher threshold for soft lockups than for hard lockups.
537  * We couple the thresholds with a fixed factor: the soft threshold is twice
538  * the hard threshold.
539  */
540 static int get_softlockup_thresh(void)
541 {
542 	return watchdog_thresh * 2;
543 }
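/* With the default watchdog_thresh of 10 s the soft lockup threshold is 20 s. */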
544 
545 /*
546  * Returns seconds, approximately.  We don't need nanosecond
547  * resolution, and we don't need to waste time with a big divide when
548  * 2^30ns == 1.074s.
549  */
550 static unsigned long get_timestamp(void)
551 {
552 	return running_clock() >> 30LL;  /* 2^30 ~= 10^9 */
553 }
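/* E.g. after 60 s of running_clock() (6e10 ns) this returns 6e10 >> 30 = 55. */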
554 
555 static void set_sample_period(void)
556 {
557 	/*
558 	 * Convert watchdog_thresh from seconds to ns.
559 	 * The divide by NUM_SAMPLE_PERIODS gives the hrtimer several chances
560 	 * (two or three with the current relation between the soft
561 	 * and hard thresholds) to increment before the
562 	 * hardlockup detector generates a warning.
563 	 */
564 	sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / NUM_SAMPLE_PERIODS);
565 	watchdog_update_hrtimer_threshold(sample_period);
566 }
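/*
 * Worked example with the default watchdog_thresh = 10: get_softlockup_thresh()
 * returns 20, so sample_period = 20 * (1e9 / 5) ns = 4 s, i.e. the hrtimer
 * fires five times within one soft lockup window.
 */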
567 
568 static void update_report_ts(void)
569 {
570 	__this_cpu_write(watchdog_report_ts, get_timestamp());
571 }
572 
573 /* Commands for resetting the watchdog */
574 static void update_touch_ts(void)
575 {
576 	__this_cpu_write(watchdog_touch_ts, get_timestamp());
577 	update_report_ts();
578 }
579 
580 /**
581  * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls
582  *
583  * Call when the scheduler may have stalled for legitimate reasons
584  * preventing the watchdog task from executing - e.g. the scheduler
585  * entering idle state.  This should only be used for scheduler events.
586  * Use touch_softlockup_watchdog() for everything else.
587  */
588 notrace void touch_softlockup_watchdog_sched(void)
589 {
590 	/*
591 	 * Preemption can be enabled.  It doesn't matter which CPU's watchdog
592 	 * report period gets restarted here, so use the raw_ operation.
593 	 */
594 	raw_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
595 }
596 
597 notrace void touch_softlockup_watchdog(void)
598 {
599 	touch_softlockup_watchdog_sched();
600 	wq_watchdog_touch(raw_smp_processor_id());
601 }
602 EXPORT_SYMBOL(touch_softlockup_watchdog);
603 
604 void touch_all_softlockup_watchdogs(void)
605 {
606 	int cpu;
607 
608 	/*
609 	 * watchdog_mutex cannot be taken here, as this might be called
610 	 * from (soft)interrupt context, so the access to
611 	 * watchdog_allowed_cpumask might race with a concurrent update.
612 	 *
613 	 * The watchdog time stamp can race against a concurrent real
614 	 * update as well, the only side effect might be a cycle delay for
615 	 * the softlockup check.
616 	 */
617 	for_each_cpu(cpu, &watchdog_allowed_mask) {
618 		per_cpu(watchdog_report_ts, cpu) = SOFTLOCKUP_DELAY_REPORT;
619 		wq_watchdog_touch(cpu);
620 	}
621 }
622 
623 void touch_softlockup_watchdog_sync(void)
624 {
625 	__this_cpu_write(softlockup_touch_sync, true);
626 	__this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
627 }
628 
629 static int is_softlockup(unsigned long touch_ts,
630 			 unsigned long period_ts,
631 			 unsigned long now)
632 {
633 	if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) {
634 		/*
635 		 * If period_ts has not been updated during a sample_period, then
636 		 * in the subsequent few sample_periods, period_ts might also not
637 		 * be updated, which could indicate a potential softlockup. In
638 		 * this case, if we suspect the cause of the potential softlockup
639 		 * might be an interrupt storm, then we need to count the interrupts
640 		 * to find which interrupt is storming.
641 		 */
642 		if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS) &&
643 		    need_counting_irqs())
644 			start_counting_irqs();
645 
646 		/*
647 		 * A poorly behaving BPF scheduler can live-lock the system into
648 		 * soft lockups. Tell sched_ext to try ejecting the BPF
649 		 * scheduler when close to a soft lockup.
650 		 */
651 		if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
652 			scx_softlockup(now - touch_ts);
653 
654 		/* Warn about unreasonable delays. */
655 		if (time_after(now, period_ts + get_softlockup_thresh()))
656 			return now - touch_ts;
657 	}
658 	return 0;
659 }
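/*
 * Timeline with the defaults (thresh = 10 s, soft thresh = 20 s, 4 s samples):
 * interrupt counting may start once period_ts is 4 s stale, sched_ext is
 * notified at 15 s, and the soft lockup report fires after more than 20 s.
 */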
660 
661 /* watchdog detector functions */
662 static DEFINE_PER_CPU(struct completion, softlockup_completion);
663 static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
664 
665 /*
666  * The watchdog feed function - touches the timestamp.
667  *
668  * It only runs once every sample_period (4 seconds by
669  * default) to reset the softlockup timestamp. If this gets delayed
670  * for more than 2*watchdog_thresh seconds then the debug-printout
671  * triggers in watchdog_timer_fn().
672  */
673 static int softlockup_fn(void *data)
674 {
675 	update_touch_ts();
676 	stop_counting_irqs();
677 	complete(this_cpu_ptr(&softlockup_completion));
678 
679 	return 0;
680 }
681 
682 /* watchdog kicker functions */
683 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
684 {
685 	unsigned long touch_ts, period_ts, now;
686 	struct pt_regs *regs = get_irq_regs();
687 	int duration;
688 	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
689 	unsigned long flags;
690 
691 	if (!watchdog_enabled)
692 		return HRTIMER_NORESTART;
693 
694 	watchdog_hardlockup_kick();
695 
696 	/* kick the softlockup detector */
697 	if (completion_done(this_cpu_ptr(&softlockup_completion))) {
698 		reinit_completion(this_cpu_ptr(&softlockup_completion));
699 		stop_one_cpu_nowait(smp_processor_id(),
700 				softlockup_fn, NULL,
701 				this_cpu_ptr(&softlockup_stop_work));
702 	}
703 
704 	/* .. and repeat */
705 	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
706 
707 	/*
708 	 * Read the current timestamp first. It might become invalid anytime
709 	 * when a virtual machine is stopped by the host or when the watchdog
710 	 * is touched from NMI.
711 	 */
712 	now = get_timestamp();
713 	/*
714 	 * If a virtual machine is stopped by the host it can look to
715 	 * the watchdog like a soft lockup. This function touches the watchdog.
716 	 */
717 	kvm_check_and_clear_guest_paused();
718 	/*
719 	 * The stored timestamp is comparable with @now only when not touched.
720 	 * It might get touched anytime from NMI. Make sure that is_softlockup()
721 	 * uses the same (valid) value.
722 	 */
723 	period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));
724 
725 	update_cpustat();
726 
727 	/* Reset the interval when touched by known problematic code. */
728 	if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
729 		if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
730 			/*
731 			 * If the time stamp was touched atomically
732 			 * make sure the scheduler tick is up to date.
733 			 */
734 			__this_cpu_write(softlockup_touch_sync, false);
735 			sched_clock_tick();
736 		}
737 
738 		update_report_ts();
739 		return HRTIMER_RESTART;
740 	}
741 
742 	/* Check for a softlockup. */
743 	touch_ts = __this_cpu_read(watchdog_touch_ts);
744 	duration = is_softlockup(touch_ts, period_ts, now);
745 	if (unlikely(duration)) {
746 		/*
747 		 * Prevent multiple soft-lockup reports if one cpu is already
748 		 * engaged in dumping all cpu back traces.
749 		 */
750 		if (softlockup_all_cpu_backtrace) {
751 			if (test_and_set_bit_lock(0, &soft_lockup_nmi_warn))
752 				return HRTIMER_RESTART;
753 		}
754 
755 		/* Start period for the next softlockup warning. */
756 		update_report_ts();
757 
758 		printk_cpu_sync_get_irqsave(flags);
759 		pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
760 			smp_processor_id(), duration,
761 			current->comm, task_pid_nr(current));
762 		report_cpu_status();
763 		print_modules();
764 		print_irqtrace_events(current);
765 		if (regs)
766 			show_regs(regs);
767 		else
768 			dump_stack();
769 		printk_cpu_sync_put_irqrestore(flags);
770 
771 		if (softlockup_all_cpu_backtrace) {
772 			trigger_allbutcpu_cpu_backtrace(smp_processor_id());
773 			if (!softlockup_panic)
774 				clear_bit_unlock(0, &soft_lockup_nmi_warn);
775 		}
776 
777 		add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
778 		if (softlockup_panic)
779 			panic("softlockup: hung tasks");
780 	}
781 
782 	return HRTIMER_RESTART;
783 }
784 
785 static void watchdog_enable(unsigned int cpu)
786 {
787 	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
788 	struct completion *done = this_cpu_ptr(&softlockup_completion);
789 
790 	WARN_ON_ONCE(cpu != smp_processor_id());
791 
792 	init_completion(done);
793 	complete(done);
794 
795 	/*
796 	 * Start the timer first to prevent the hardlockup watchdog triggering
797 	 * before the timer has a chance to fire.
798 	 */
799 	hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
800 	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
801 		      HRTIMER_MODE_REL_PINNED_HARD);
802 
803 	/* Initialize timestamp */
804 	update_touch_ts();
805 	/* Enable the hardlockup detector */
806 	if (watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)
807 		watchdog_hardlockup_enable(cpu);
808 }
809 
810 static void watchdog_disable(unsigned int cpu)
811 {
812 	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
813 
814 	WARN_ON_ONCE(cpu != smp_processor_id());
815 
816 	/*
817 	 * Disable the hardlockup detector first. That prevents a large
818 	 * delay between disabling the timer and disabling the hardlockup
819 	 * detector from causing a false positive.
820 	 */
821 	watchdog_hardlockup_disable(cpu);
822 	hrtimer_cancel(hrtimer);
823 	wait_for_completion(this_cpu_ptr(&softlockup_completion));
824 }
825 
826 static int softlockup_stop_fn(void *data)
827 {
828 	watchdog_disable(smp_processor_id());
829 	return 0;
830 }
831 
832 static void softlockup_stop_all(void)
833 {
834 	int cpu;
835 
836 	if (!softlockup_initialized)
837 		return;
838 
839 	for_each_cpu(cpu, &watchdog_allowed_mask)
840 		smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);
841 
842 	cpumask_clear(&watchdog_allowed_mask);
843 }
844 
845 static int softlockup_start_fn(void *data)
846 {
847 	watchdog_enable(smp_processor_id());
848 	return 0;
849 }
850 
851 static void softlockup_start_all(void)
852 {
853 	int cpu;
854 
855 	cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
856 	for_each_cpu(cpu, &watchdog_allowed_mask)
857 		smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
858 }
859 
860 int lockup_detector_online_cpu(unsigned int cpu)
861 {
862 	if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
863 		watchdog_enable(cpu);
864 	return 0;
865 }
866 
867 int lockup_detector_offline_cpu(unsigned int cpu)
868 {
869 	if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
870 		watchdog_disable(cpu);
871 	return 0;
872 }
873 
874 static void __lockup_detector_reconfigure(bool thresh_changed)
875 {
876 	cpus_read_lock();
877 	watchdog_hardlockup_stop();
878 
879 	softlockup_stop_all();
880 	/*
881 	 * To prevent watchdog_timer_fn from using the old interval and
882 	 * the new watchdog_thresh at the same time, which could lead to
883 	 * false softlockup reports, it is necessary to update the
884 	 * watchdog_thresh only after softlockup_stop_all() has completed.
885 	 */
886 	if (thresh_changed)
887 		watchdog_thresh = READ_ONCE(watchdog_thresh_next);
888 	set_sample_period();
889 	lockup_detector_update_enable();
890 	if (watchdog_enabled && watchdog_thresh)
891 		softlockup_start_all();
892 
893 	watchdog_hardlockup_start();
894 	cpus_read_unlock();
895 }
896 
897 void lockup_detector_reconfigure(void)
898 {
899 	mutex_lock(&watchdog_mutex);
900 	__lockup_detector_reconfigure(false);
901 	mutex_unlock(&watchdog_mutex);
902 }
903 
904 /*
905  * Create the watchdog infrastructure and configure the detector(s).
906  */
907 static __init void lockup_detector_setup(void)
908 {
909 	/*
910 	 * If sysctl is off and watchdog got disabled on the command line,
911 	 * nothing to do here.
912 	 */
913 	lockup_detector_update_enable();
914 
915 	if (!IS_ENABLED(CONFIG_SYSCTL) &&
916 	    !(watchdog_enabled && watchdog_thresh))
917 		return;
918 
919 	mutex_lock(&watchdog_mutex);
920 	__lockup_detector_reconfigure(false);
921 	softlockup_initialized = true;
922 	mutex_unlock(&watchdog_mutex);
923 }
924 
925 #else /* CONFIG_SOFTLOCKUP_DETECTOR */
926 static void __lockup_detector_reconfigure(bool thresh_changed)
927 {
928 	cpus_read_lock();
929 	watchdog_hardlockup_stop();
930 	if (thresh_changed)
931 		watchdog_thresh = READ_ONCE(watchdog_thresh_next);
932 	lockup_detector_update_enable();
933 	watchdog_hardlockup_start();
934 	cpus_read_unlock();
935 }
936 void lockup_detector_reconfigure(void)
937 {
938 	__lockup_detector_reconfigure(false);
939 }
940 static inline void lockup_detector_setup(void)
941 {
942 	__lockup_detector_reconfigure(false);
943 }
944 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
945 
946 /**
947  * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
948  *
949  * Special interface for parisc. It prevents lockup detector warnings from
950  * the default pm_poweroff() function which busy loops forever.
951  */
952 void lockup_detector_soft_poweroff(void)
953 {
954 	watchdog_enabled = 0;
955 }
956 
957 #ifdef CONFIG_SYSCTL
958 
959 /* Propagate any changes to the watchdog infrastructure */
960 static void proc_watchdog_update(bool thresh_changed)
961 {
962 	/* Remove impossible cpus to keep sysctl output clean. */
963 	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
964 	__lockup_detector_reconfigure(thresh_changed);
965 }
966 
967 /*
968  * common function for watchdog, nmi_watchdog and soft_watchdog parameter
969  *
970  * caller             | table->data points to            | 'which'
971  * -------------------|----------------------------------|-------------------------------
972  * proc_watchdog      | watchdog_user_enabled            | WATCHDOG_HARDLOCKUP_ENABLED |
973  *                    |                                  | WATCHDOG_SOFTOCKUP_ENABLED
974  * -------------------|----------------------------------|-------------------------------
975  * proc_nmi_watchdog  | watchdog_hardlockup_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED
976  * -------------------|----------------------------------|-------------------------------
977  * proc_soft_watchdog | watchdog_softlockup_user_enabled | WATCHDOG_SOFTOCKUP_ENABLED
978  */
979 static int proc_watchdog_common(int which, const struct ctl_table *table, int write,
980 				void *buffer, size_t *lenp, loff_t *ppos)
981 {
982 	int err, old, *param = table->data;
983 
984 	mutex_lock(&watchdog_mutex);
985 
986 	old = *param;
987 	if (!write) {
988 		/*
989 		 * On read synchronize the userspace interface. This is a
990 		 * racy snapshot.
991 		 */
992 		*param = (watchdog_enabled & which) != 0;
993 		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
994 		*param = old;
995 	} else {
996 		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
997 		if (!err && old != READ_ONCE(*param))
998 			proc_watchdog_update(false);
999 	}
1000 	mutex_unlock(&watchdog_mutex);
1001 	return err;
1002 }
1003 
1004 /*
1005  * /proc/sys/kernel/watchdog
1006  */
1007 static int proc_watchdog(const struct ctl_table *table, int write,
1008 			 void *buffer, size_t *lenp, loff_t *ppos)
1009 {
1010 	return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED |
1011 				    WATCHDOG_SOFTOCKUP_ENABLED,
1012 				    table, write, buffer, lenp, ppos);
1013 }
1014 
1015 /*
1016  * /proc/sys/kernel/nmi_watchdog
1017  */
1018 static int proc_nmi_watchdog(const struct ctl_table *table, int write,
1019 			     void *buffer, size_t *lenp, loff_t *ppos)
1020 {
1021 	if (!watchdog_hardlockup_available && write)
1022 		return -ENOTSUPP;
1023 	return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED,
1024 				    table, write, buffer, lenp, ppos);
1025 }
1026 
1027 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
1028 /*
1029  * /proc/sys/kernel/soft_watchdog
1030  */
1031 static int proc_soft_watchdog(const struct ctl_table *table, int write,
1032 			      void *buffer, size_t *lenp, loff_t *ppos)
1033 {
1034 	return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED,
1035 				    table, write, buffer, lenp, ppos);
1036 }
1037 #endif
1038 
1039 /*
1040  * /proc/sys/kernel/watchdog_thresh
1041  */
1042 static int proc_watchdog_thresh(const struct ctl_table *table, int write,
1043 				void *buffer, size_t *lenp, loff_t *ppos)
1044 {
1045 	int err, old;
1046 
1047 	mutex_lock(&watchdog_mutex);
1048 
1049 	watchdog_thresh_next = READ_ONCE(watchdog_thresh);
1050 
1051 	old = watchdog_thresh_next;
1052 	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
1053 
1054 	if (!err && write && old != READ_ONCE(watchdog_thresh_next))
1055 		proc_watchdog_update(true);
1056 
1057 	mutex_unlock(&watchdog_mutex);
1058 	return err;
1059 }
1060 
1061 /*
1062  * The cpumask is the mask of possible cpus that the watchdog can run
1063  * on, not the mask of cpus it is actually running on.  This allows the
1064  * user to specify a mask that will include cpus that have not yet
1065  * been brought online, if desired.
1066  */
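/*
 * The mask is read and written as a cpulist, e.g.
 * "echo 0,2-5 > /proc/sys/kernel/watchdog_cpumask" restricts the watchdog
 * to CPUs 0 and 2-5 (proc_do_large_bitmap() parses range lists).
 */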
1067 static int proc_watchdog_cpumask(const struct ctl_table *table, int write,
1068 				 void *buffer, size_t *lenp, loff_t *ppos)
1069 {
1070 	int err;
1071 
1072 	mutex_lock(&watchdog_mutex);
1073 
1074 	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
1075 	if (!err && write)
1076 		proc_watchdog_update(false);
1077 
1078 	mutex_unlock(&watchdog_mutex);
1079 	return err;
1080 }
1081 
1082 static const int sixty = 60;
1083 
1084 static const struct ctl_table watchdog_sysctls[] = {
1085 	{
1086 		.procname       = "watchdog",
1087 		.data		= &watchdog_user_enabled,
1088 		.maxlen		= sizeof(int),
1089 		.mode		= 0644,
1090 		.proc_handler   = proc_watchdog,
1091 		.extra1		= SYSCTL_ZERO,
1092 		.extra2		= SYSCTL_ONE,
1093 	},
1094 	{
1095 		.procname	= "watchdog_thresh",
1096 		.data		= &watchdog_thresh_next,
1097 		.maxlen		= sizeof(int),
1098 		.mode		= 0644,
1099 		.proc_handler	= proc_watchdog_thresh,
1100 		.extra1		= SYSCTL_ZERO,
1101 		.extra2		= (void *)&sixty,
1102 	},
1103 	{
1104 		.procname	= "watchdog_cpumask",
1105 		.data		= &watchdog_cpumask_bits,
1106 		.maxlen		= NR_CPUS,
1107 		.mode		= 0644,
1108 		.proc_handler	= proc_watchdog_cpumask,
1109 	},
1110 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
1111 	{
1112 		.procname       = "soft_watchdog",
1113 		.data		= &watchdog_softlockup_user_enabled,
1114 		.maxlen		= sizeof(int),
1115 		.mode		= 0644,
1116 		.proc_handler   = proc_soft_watchdog,
1117 		.extra1		= SYSCTL_ZERO,
1118 		.extra2		= SYSCTL_ONE,
1119 	},
1120 	{
1121 		.procname	= "softlockup_panic",
1122 		.data		= &softlockup_panic,
1123 		.maxlen		= sizeof(int),
1124 		.mode		= 0644,
1125 		.proc_handler	= proc_dointvec_minmax,
1126 		.extra1		= SYSCTL_ZERO,
1127 		.extra2		= SYSCTL_ONE,
1128 	},
1129 #ifdef CONFIG_SMP
1130 	{
1131 		.procname	= "softlockup_all_cpu_backtrace",
1132 		.data		= &sysctl_softlockup_all_cpu_backtrace,
1133 		.maxlen		= sizeof(int),
1134 		.mode		= 0644,
1135 		.proc_handler	= proc_dointvec_minmax,
1136 		.extra1		= SYSCTL_ZERO,
1137 		.extra2		= SYSCTL_ONE,
1138 	},
1139 #endif /* CONFIG_SMP */
1140 #endif
1141 #ifdef CONFIG_HARDLOCKUP_DETECTOR
1142 	{
1143 		.procname	= "hardlockup_panic",
1144 		.data		= &hardlockup_panic,
1145 		.maxlen		= sizeof(int),
1146 		.mode		= 0644,
1147 		.proc_handler	= proc_dointvec_minmax,
1148 		.extra1		= SYSCTL_ZERO,
1149 		.extra2		= SYSCTL_ONE,
1150 	},
1151 #ifdef CONFIG_SMP
1152 	{
1153 		.procname	= "hardlockup_all_cpu_backtrace",
1154 		.data		= &sysctl_hardlockup_all_cpu_backtrace,
1155 		.maxlen		= sizeof(int),
1156 		.mode		= 0644,
1157 		.proc_handler	= proc_dointvec_minmax,
1158 		.extra1		= SYSCTL_ZERO,
1159 		.extra2		= SYSCTL_ONE,
1160 	},
1161 #endif /* CONFIG_SMP */
1162 #endif
1163 };
1164 
1165 static struct ctl_table watchdog_hardlockup_sysctl[] = {
1166 	{
1167 		.procname       = "nmi_watchdog",
1168 		.data		= &watchdog_hardlockup_user_enabled,
1169 		.maxlen		= sizeof(int),
1170 		.mode		= 0444,
1171 		.proc_handler   = proc_nmi_watchdog,
1172 		.extra1		= SYSCTL_ZERO,
1173 		.extra2		= SYSCTL_ONE,
1174 	},
1175 };
1176 
1177 static void __init watchdog_sysctl_init(void)
1178 {
1179 	register_sysctl_init("kernel", watchdog_sysctls);
1180 
1181 	if (watchdog_hardlockup_available)
1182 		watchdog_hardlockup_sysctl[0].mode = 0644;
1183 	register_sysctl_init("kernel", watchdog_hardlockup_sysctl);
1184 }
1185 
1186 #else
1187 #define watchdog_sysctl_init() do { } while (0)
1188 #endif /* CONFIG_SYSCTL */
1189 
1190 static void __init lockup_detector_delay_init(struct work_struct *work);
1191 static bool allow_lockup_detector_init_retry __initdata;
1192 
1193 static struct work_struct detector_work __initdata =
1194 		__WORK_INITIALIZER(detector_work, lockup_detector_delay_init);
1195 
1196 static void __init lockup_detector_delay_init(struct work_struct *work)
1197 {
1198 	int ret;
1199 
1200 	ret = watchdog_hardlockup_probe();
1201 	if (ret) {
1202 		if (ret == -ENODEV)
1203 			pr_info("NMI not fully supported\n");
1204 		else
1205 			pr_info("Delayed init of the lockup detector failed: %d\n", ret);
1206 		pr_info("Hard watchdog permanently disabled\n");
1207 		return;
1208 	}
1209 
1210 	allow_lockup_detector_init_retry = false;
1211 
1212 	watchdog_hardlockup_available = true;
1213 	lockup_detector_setup();
1214 }
1215 
1216 /*
1217  * lockup_detector_retry_init - retry init lockup detector if possible.
1218  *
1219  * Retry hardlockup detector init. It is useful when it requires some
1220  * functionality that has to be initialized later on a particular
1221  * platform.
1222  */
1223 void __init lockup_detector_retry_init(void)
1224 {
1225 	/* Must be called before late init calls */
1226 	if (!allow_lockup_detector_init_retry)
1227 		return;
1228 
1229 	schedule_work(&detector_work);
1230 }
1231 
1232 /*
1233  * Ensure that the optional delayed hardlockup init has run before
1234  * the init code and memory are freed.
1235  */
1236 static int __init lockup_detector_check(void)
1237 {
1238 	/* Prevent any later retry. */
1239 	allow_lockup_detector_init_retry = false;
1240 
1241 	/* Make sure no work is pending. */
1242 	flush_work(&detector_work);
1243 
1244 	watchdog_sysctl_init();
1245 
1246 	return 0;
1247 
1248 }
1249 late_initcall_sync(lockup_detector_check);
1250 
1251 void __init lockup_detector_init(void)
1252 {
1253 	if (tick_nohz_full_enabled())
1254 		pr_info("Disabling watchdog on nohz_full cores by default\n");
1255 
1256 	cpumask_copy(&watchdog_cpumask,
1257 		     housekeeping_cpumask(HK_TYPE_TIMER));
1258 
1259 	if (!watchdog_hardlockup_probe())
1260 		watchdog_hardlockup_available = true;
1261 	else
1262 		allow_lockup_detector_init_retry = true;
1263 
1264 	lockup_detector_setup();
1265 }
1266