1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * hangcheck-timer.c 4 * 5 * Driver for a little io fencing timer. 6 * 7 * Copyright (C) 2002, 2003 Oracle. All rights reserved. 8 * 9 * Author: Joel Becker <joel.becker@oracle.com> 10 */ 11 12 /* 13 * The hangcheck-timer driver uses the TSC to catch delays that 14 * jiffies does not notice. A timer is set. When the timer fires, it 15 * checks whether it was delayed and if that delay exceeds a given 16 * margin of error. The hangcheck_tick module parameter takes the timer 17 * duration in seconds. The hangcheck_margin parameter defines the 18 * margin of error, in seconds. The defaults are 60 seconds for the 19 * timer and 180 seconds for the margin of error. IOW, a timer is set 20 * for 60 seconds. When the timer fires, the callback checks the 21 * actual duration that the timer waited. If the duration exceeds the 22 * allotted time and margin (here 60 + 180, or 240 seconds), the machine 23 * is restarted. A healthy machine will have the duration match the 24 * expected timeout very closely. 25 */ 26 27 #include <linux/module.h> 28 #include <linux/moduleparam.h> 29 #include <linux/types.h> 30 #include <linux/kernel.h> 31 #include <linux/fs.h> 32 #include <linux/mm.h> 33 #include <linux/reboot.h> 34 #include <linux/init.h> 35 #include <linux/delay.h> 36 #include <linux/uaccess.h> 37 #include <linux/sysrq.h> 38 #include <linux/timer.h> 39 #include <linux/hrtimer.h> 40 41 #define VERSION_STR "0.9.1" 42 43 #define DEFAULT_IOFENCE_MARGIN 60 /* Default fudge factor, in seconds */ 44 #define DEFAULT_IOFENCE_TICK 180 /* Default timer timeout, in seconds */ 45 46 static int hangcheck_tick = DEFAULT_IOFENCE_TICK; 47 static int hangcheck_margin = DEFAULT_IOFENCE_MARGIN; 48 static int hangcheck_reboot; /* Defaults to not reboot */ 49 static int hangcheck_dump_tasks; /* Defaults to not dumping SysRQ T */ 50 51 /* options - modular */ 52 module_param(hangcheck_tick, int, 0); 53 MODULE_PARM_DESC(hangcheck_tick, "Timer delay."); 54 module_param(hangcheck_margin, int, 0); 55 MODULE_PARM_DESC(hangcheck_margin, "If the hangcheck timer has been delayed more than hangcheck_margin seconds, the driver will fire."); 56 module_param(hangcheck_reboot, int, 0); 57 MODULE_PARM_DESC(hangcheck_reboot, "If nonzero, the machine will reboot when the timer margin is exceeded."); 58 module_param(hangcheck_dump_tasks, int, 0); 59 MODULE_PARM_DESC(hangcheck_dump_tasks, "If nonzero, the machine will dump the system task state when the timer margin is exceeded."); 60 61 MODULE_AUTHOR("Oracle"); 62 MODULE_DESCRIPTION("Hangcheck-timer detects when the system has gone out to lunch past a certain margin."); 63 MODULE_LICENSE("GPL"); 64 MODULE_VERSION(VERSION_STR); 65 66 /* options - nonmodular */ 67 #ifndef MODULE 68 69 static int __init hangcheck_parse_tick(char *str) 70 { 71 int par; 72 73 if (get_option(&str, &par)) 74 hangcheck_tick = par; 75 return 1; 76 } 77 78 static int __init hangcheck_parse_margin(char *str) 79 { 80 int par; 81 82 if (get_option(&str, &par)) 83 hangcheck_margin = par; 84 return 1; 85 } 86 87 static int __init hangcheck_parse_reboot(char *str) 88 { 89 int par; 90 91 if (get_option(&str, &par)) 92 hangcheck_reboot = par; 93 return 1; 94 } 95 96 static int __init hangcheck_parse_dump_tasks(char *str) 97 { 98 int par; 99 100 if (get_option(&str, &par)) 101 hangcheck_dump_tasks = par; 102 return 1; 103 } 104 105 __setup("hcheck_tick", hangcheck_parse_tick); 106 __setup("hcheck_margin", hangcheck_parse_margin); 107 __setup("hcheck_reboot", hangcheck_parse_reboot); 108 __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); 109 #endif /* not MODULE */ 110 111 #define TIMER_FREQ 1000000000ULL 112 113 /* Last time scheduled */ 114 static unsigned long long hangcheck_tsc, hangcheck_tsc_margin; 115 116 static void hangcheck_fire(struct timer_list *); 117 118 static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire); 119 120 static void hangcheck_fire(struct timer_list *unused) 121 { 122 unsigned long long cur_tsc, tsc_diff; 123 124 cur_tsc = ktime_get_ns(); 125 126 if (cur_tsc > hangcheck_tsc) 127 tsc_diff = cur_tsc - hangcheck_tsc; 128 else 129 tsc_diff = (cur_tsc + (~0ULL - hangcheck_tsc)); /* or something */ 130 131 if (tsc_diff > hangcheck_tsc_margin) { 132 if (hangcheck_dump_tasks) { 133 pr_crit("Hangcheck: Task state:\n"); 134 #ifdef CONFIG_MAGIC_SYSRQ 135 handle_sysrq('t'); 136 #endif /* CONFIG_MAGIC_SYSRQ */ 137 } 138 if (hangcheck_reboot) { 139 pr_crit("Hangcheck: hangcheck is restarting the machine.\n"); 140 emergency_restart(); 141 } else { 142 pr_crit("Hangcheck: hangcheck value past margin!\n"); 143 } 144 } 145 #if 0 146 /* 147 * Enable to investigate delays in detail 148 */ 149 pr_debug("Hangcheck: called %lld ns since last time (%lld ns overshoot)\n", 150 tsc_diff, tsc_diff - hangcheck_tick*TIMER_FREQ); 151 #endif 152 mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 153 hangcheck_tsc = ktime_get_ns(); 154 } 155 156 157 static int __init hangcheck_init(void) 158 { 159 pr_debug("Hangcheck: starting hangcheck timer %s (tick is %d seconds, margin is %d seconds).\n", 160 VERSION_STR, hangcheck_tick, hangcheck_margin); 161 hangcheck_tsc_margin = 162 (unsigned long long)hangcheck_margin + hangcheck_tick; 163 hangcheck_tsc_margin *= TIMER_FREQ; 164 165 hangcheck_tsc = ktime_get_ns(); 166 mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 167 168 return 0; 169 } 170 171 172 static void __exit hangcheck_exit(void) 173 { 174 timer_delete_sync(&hangcheck_ticktock); 175 pr_debug("Hangcheck: Stopped hangcheck timer.\n"); 176 } 177 178 module_init(hangcheck_init); 179 module_exit(hangcheck_exit); 180