1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * hangcheck-timer.c 4 * 5 * Driver for a little io fencing timer. 6 * 7 * Copyright (C) 2002, 2003 Oracle. All rights reserved. 8 * 9 * Author: Joel Becker <joel.becker@oracle.com> 10 */ 11 12 /* 13 * The hangcheck-timer driver uses the TSC to catch delays that 14 * jiffies does not notice. A timer is set. When the timer fires, it 15 * checks whether it was delayed and if that delay exceeds a given 16 * margin of error. The hangcheck_tick module parameter takes the timer 17 * duration in seconds. The hangcheck_margin parameter defines the 18 * margin of error, in seconds. The defaults are 60 seconds for the 19 * timer and 180 seconds for the margin of error. IOW, a timer is set 20 * for 60 seconds. When the timer fires, the callback checks the 21 * actual duration that the timer waited. If the duration exceeds the 22 * allotted time and margin (here 60 + 180, or 240 seconds), the machine 23 * is restarted. A healthy machine will have the duration match the 24 * expected timeout very closely. 25 */ 26 27 #include <linux/module.h> 28 #include <linux/moduleparam.h> 29 #include <linux/types.h> 30 #include <linux/kernel.h> 31 #include <linux/fs.h> 32 #include <linux/mm.h> 33 #include <linux/reboot.h> 34 #include <linux/init.h> 35 #include <linux/delay.h> 36 #include <linux/uaccess.h> 37 #include <linux/sysrq.h> 38 #include <linux/timer.h> 39 #include <linux/hrtimer.h> 40 41 #define VERSION_STR "0.9.1" 42 43 #define DEFAULT_IOFENCE_MARGIN 60 /* Default fudge factor, in seconds */ 44 #define DEFAULT_IOFENCE_TICK 180 /* Default timer timeout, in seconds */ 45 46 static int hangcheck_tick = DEFAULT_IOFENCE_TICK; 47 static int hangcheck_margin = DEFAULT_IOFENCE_MARGIN; 48 static int hangcheck_reboot; /* Defaults to not reboot */ 49 static int hangcheck_dump_tasks; /* Defaults to not dumping SysRQ T */ 50 51 /* options - modular */ 52 module_param(hangcheck_tick, int, 0); 53 MODULE_PARM_DESC(hangcheck_tick, "Timer delay."); 54 module_param(hangcheck_margin, int, 0); 55 MODULE_PARM_DESC(hangcheck_margin, "If the hangcheck timer has been delayed more than hangcheck_margin seconds, the driver will fire."); 56 module_param(hangcheck_reboot, int, 0); 57 MODULE_PARM_DESC(hangcheck_reboot, "If nonzero, the machine will reboot when the timer margin is exceeded."); 58 module_param(hangcheck_dump_tasks, int, 0); 59 MODULE_PARM_DESC(hangcheck_dump_tasks, "If nonzero, the machine will dump the system task state when the timer margin is exceeded."); 60 61 MODULE_AUTHOR("Oracle"); 62 MODULE_DESCRIPTION("Hangcheck-timer detects when the system has gone out to lunch past a certain margin."); 63 MODULE_LICENSE("GPL"); 64 MODULE_VERSION(VERSION_STR); 65 66 /* options - nonmodular */ 67 #ifndef MODULE 68 69 static int __init hangcheck_parse_tick(char *str) 70 { 71 int par; 72 if (get_option(&str,&par)) 73 hangcheck_tick = par; 74 return 1; 75 } 76 77 static int __init hangcheck_parse_margin(char *str) 78 { 79 int par; 80 if (get_option(&str,&par)) 81 hangcheck_margin = par; 82 return 1; 83 } 84 85 static int __init hangcheck_parse_reboot(char *str) 86 { 87 int par; 88 if (get_option(&str,&par)) 89 hangcheck_reboot = par; 90 return 1; 91 } 92 93 static int __init hangcheck_parse_dump_tasks(char *str) 94 { 95 int par; 96 if (get_option(&str,&par)) 97 hangcheck_dump_tasks = par; 98 return 1; 99 } 100 101 __setup("hcheck_tick", hangcheck_parse_tick); 102 __setup("hcheck_margin", hangcheck_parse_margin); 103 __setup("hcheck_reboot", hangcheck_parse_reboot); 104 __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); 105 #endif /* not MODULE */ 106 107 #define TIMER_FREQ 1000000000ULL 108 109 /* Last time scheduled */ 110 static unsigned long long hangcheck_tsc, hangcheck_tsc_margin; 111 112 static void hangcheck_fire(struct timer_list *); 113 114 static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire); 115 116 static void hangcheck_fire(struct timer_list *unused) 117 { 118 unsigned long long cur_tsc, tsc_diff; 119 120 cur_tsc = ktime_get_ns(); 121 122 if (cur_tsc > hangcheck_tsc) 123 tsc_diff = cur_tsc - hangcheck_tsc; 124 else 125 tsc_diff = (cur_tsc + (~0ULL - hangcheck_tsc)); /* or something */ 126 127 if (tsc_diff > hangcheck_tsc_margin) { 128 if (hangcheck_dump_tasks) { 129 printk(KERN_CRIT "Hangcheck: Task state:\n"); 130 #ifdef CONFIG_MAGIC_SYSRQ 131 handle_sysrq('t'); 132 #endif /* CONFIG_MAGIC_SYSRQ */ 133 } 134 if (hangcheck_reboot) { 135 printk(KERN_CRIT "Hangcheck: hangcheck is restarting the machine.\n"); 136 emergency_restart(); 137 } else { 138 printk(KERN_CRIT "Hangcheck: hangcheck value past margin!\n"); 139 } 140 } 141 #if 0 142 /* 143 * Enable to investigate delays in detail 144 */ 145 printk("Hangcheck: called %Ld ns since last time (%Ld ns overshoot)\n", 146 tsc_diff, tsc_diff - hangcheck_tick*TIMER_FREQ); 147 #endif 148 mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 149 hangcheck_tsc = ktime_get_ns(); 150 } 151 152 153 static int __init hangcheck_init(void) 154 { 155 printk("Hangcheck: starting hangcheck timer %s (tick is %d seconds, margin is %d seconds).\n", 156 VERSION_STR, hangcheck_tick, hangcheck_margin); 157 hangcheck_tsc_margin = 158 (unsigned long long)hangcheck_margin + hangcheck_tick; 159 hangcheck_tsc_margin *= TIMER_FREQ; 160 161 hangcheck_tsc = ktime_get_ns(); 162 mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 163 164 return 0; 165 } 166 167 168 static void __exit hangcheck_exit(void) 169 { 170 del_timer_sync(&hangcheck_ticktock); 171 printk("Hangcheck: Stopped hangcheck timer.\n"); 172 } 173 174 module_init(hangcheck_init); 175 module_exit(hangcheck_exit); 176