1 /* Pseudo NMI support on sparc64 systems. 2 * 3 * Copyright (C) 2009 David S. Miller <davem@davemloft.net> 4 * 5 * The NMI watchdog support and infrastructure is based almost 6 * entirely upon the x86 NMI support code. 7 */ 8 #include <linux/kernel.h> 9 #include <linux/param.h> 10 #include <linux/init.h> 11 #include <linux/percpu.h> 12 #include <linux/nmi.h> 13 #include <linux/module.h> 14 #include <linux/kprobes.h> 15 #include <linux/kernel_stat.h> 16 #include <linux/reboot.h> 17 #include <linux/slab.h> 18 #include <linux/kdebug.h> 19 #include <linux/delay.h> 20 #include <linux/smp.h> 21 22 #include <asm/perf_event.h> 23 #include <asm/ptrace.h> 24 #include <asm/pcr.h> 25 26 #include "kstack.h" 27 28 /* We don't have a real NMI on sparc64, but we can fake one 29 * up using profiling counter overflow interrupts and interrupt 30 * levels. 31 * 32 * The profile overflow interrupts at level 15, so we use 33 * level 14 as our IRQ off level. 34 */ 35 36 static int panic_on_timeout; 37 38 /* nmi_active: 39 * >0: the NMI watchdog is active, but can be disabled 40 * <0: the NMI watchdog has not been set up, and cannot be enabled 41 * 0: the NMI watchdog is disabled, but can be enabled 42 */ 43 atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ 44 EXPORT_SYMBOL(nmi_active); 45 46 static unsigned int nmi_hz = HZ; 47 static DEFINE_PER_CPU(short, wd_enabled); 48 static int endflag __initdata; 49 50 static DEFINE_PER_CPU(unsigned int, last_irq_sum); 51 static DEFINE_PER_CPU(long, alert_counter); 52 static DEFINE_PER_CPU(int, nmi_touch); 53 54 void touch_nmi_watchdog(void) 55 { 56 if (atomic_read(&nmi_active)) { 57 int cpu; 58 59 for_each_present_cpu(cpu) { 60 if (per_cpu(nmi_touch, cpu) != 1) 61 per_cpu(nmi_touch, cpu) = 1; 62 } 63 } 64 65 touch_softlockup_watchdog(); 66 } 67 EXPORT_SYMBOL(touch_nmi_watchdog); 68 69 static void die_nmi(const char *str, struct pt_regs *regs, int do_panic) 70 { 71 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 72 pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP) 73 return; 74 75 console_verbose(); 76 bust_spinlocks(1); 77 78 printk(KERN_EMERG "%s", str); 79 printk(" on CPU%d, ip %08lx, registers:\n", 80 smp_processor_id(), regs->tpc); 81 show_regs(regs); 82 dump_stack(); 83 84 bust_spinlocks(0); 85 86 if (do_panic || panic_on_oops) 87 panic("Non maskable interrupt"); 88 89 nmi_exit(); 90 local_irq_enable(); 91 do_exit(SIGBUS); 92 } 93 94 notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs) 95 { 96 unsigned int sum, touched = 0; 97 void *orig_sp; 98 99 clear_softint(1 << irq); 100 101 local_cpu_data().__nmi_count++; 102 103 nmi_enter(); 104 105 orig_sp = set_hardirq_stack(); 106 107 if (notify_die(DIE_NMI, "nmi", regs, 0, 108 pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP) 109 touched = 1; 110 else 111 pcr_ops->write(PCR_PIC_PRIV); 112 113 sum = local_cpu_data().irq0_irqs; 114 if (__get_cpu_var(nmi_touch)) { 115 __get_cpu_var(nmi_touch) = 0; 116 touched = 1; 117 } 118 if (!touched && __get_cpu_var(last_irq_sum) == sum) { 119 __this_cpu_inc(alert_counter); 120 if (__this_cpu_read(alert_counter) == 30 * nmi_hz) 121 die_nmi("BUG: NMI Watchdog detected LOCKUP", 122 regs, panic_on_timeout); 123 } else { 124 __get_cpu_var(last_irq_sum) = sum; 125 __this_cpu_write(alert_counter, 0); 126 } 127 if (__get_cpu_var(wd_enabled)) { 128 write_pic(picl_value(nmi_hz)); 129 pcr_ops->write(pcr_enable); 130 } 131 132 restore_hardirq_stack(orig_sp); 133 134 nmi_exit(); 135 } 136 137 static inline unsigned int get_nmi_count(int cpu) 138 { 139 return cpu_data(cpu).__nmi_count; 140 } 141 142 static __init void nmi_cpu_busy(void *data) 143 { 144 local_irq_enable_in_hardirq(); 145 while (endflag == 0) 146 mb(); 147 } 148 149 static void report_broken_nmi(int cpu, int *prev_nmi_count) 150 { 151 printk(KERN_CONT "\n"); 152 153 printk(KERN_WARNING 154 "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n", 155 cpu, prev_nmi_count[cpu], get_nmi_count(cpu)); 156 157 printk(KERN_WARNING 158 "Please report this to bugzilla.kernel.org,\n"); 159 printk(KERN_WARNING 160 "and attach the output of the 'dmesg' command.\n"); 161 162 per_cpu(wd_enabled, cpu) = 0; 163 atomic_dec(&nmi_active); 164 } 165 166 void stop_nmi_watchdog(void *unused) 167 { 168 pcr_ops->write(PCR_PIC_PRIV); 169 __get_cpu_var(wd_enabled) = 0; 170 atomic_dec(&nmi_active); 171 } 172 173 static int __init check_nmi_watchdog(void) 174 { 175 unsigned int *prev_nmi_count; 176 int cpu, err; 177 178 if (!atomic_read(&nmi_active)) 179 return 0; 180 181 prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(unsigned int), GFP_KERNEL); 182 if (!prev_nmi_count) { 183 err = -ENOMEM; 184 goto error; 185 } 186 187 printk(KERN_INFO "Testing NMI watchdog ... "); 188 189 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); 190 191 for_each_possible_cpu(cpu) 192 prev_nmi_count[cpu] = get_nmi_count(cpu); 193 local_irq_enable(); 194 mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ 195 196 for_each_online_cpu(cpu) { 197 if (!per_cpu(wd_enabled, cpu)) 198 continue; 199 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) 200 report_broken_nmi(cpu, prev_nmi_count); 201 } 202 endflag = 1; 203 if (!atomic_read(&nmi_active)) { 204 kfree(prev_nmi_count); 205 atomic_set(&nmi_active, -1); 206 err = -ENODEV; 207 goto error; 208 } 209 printk("OK.\n"); 210 211 nmi_hz = 1; 212 213 kfree(prev_nmi_count); 214 return 0; 215 error: 216 on_each_cpu(stop_nmi_watchdog, NULL, 1); 217 return err; 218 } 219 220 void start_nmi_watchdog(void *unused) 221 { 222 __get_cpu_var(wd_enabled) = 1; 223 atomic_inc(&nmi_active); 224 225 pcr_ops->write(PCR_PIC_PRIV); 226 write_pic(picl_value(nmi_hz)); 227 228 pcr_ops->write(pcr_enable); 229 } 230 231 static void nmi_adjust_hz_one(void *unused) 232 { 233 if (!__get_cpu_var(wd_enabled)) 234 return; 235 236 pcr_ops->write(PCR_PIC_PRIV); 237 write_pic(picl_value(nmi_hz)); 238 239 pcr_ops->write(pcr_enable); 240 } 241 242 void nmi_adjust_hz(unsigned int new_hz) 243 { 244 nmi_hz = new_hz; 245 on_each_cpu(nmi_adjust_hz_one, NULL, 1); 246 } 247 EXPORT_SYMBOL_GPL(nmi_adjust_hz); 248 249 static int nmi_shutdown(struct notifier_block *nb, unsigned long cmd, void *p) 250 { 251 on_each_cpu(stop_nmi_watchdog, NULL, 1); 252 return 0; 253 } 254 255 static struct notifier_block nmi_reboot_notifier = { 256 .notifier_call = nmi_shutdown, 257 }; 258 259 int __init nmi_init(void) 260 { 261 int err; 262 263 on_each_cpu(start_nmi_watchdog, NULL, 1); 264 265 err = check_nmi_watchdog(); 266 if (!err) { 267 err = register_reboot_notifier(&nmi_reboot_notifier); 268 if (err) { 269 on_each_cpu(stop_nmi_watchdog, NULL, 1); 270 atomic_set(&nmi_active, -1); 271 } 272 } 273 if (!err) 274 init_hw_perf_events(); 275 276 return err; 277 } 278 279 static int __init setup_nmi_watchdog(char *str) 280 { 281 if (!strncmp(str, "panic", 5)) 282 panic_on_timeout = 1; 283 284 return 0; 285 } 286 __setup("nmi_watchdog=", setup_nmi_watchdog); 287