/*
 * Machine check handler
 *
 * Copyright IBM Corp. 2000, 2009
 * Author(s): Ingo Adlung <adlung@de.ibm.com>,
 *	      Martin Schwidefsky <schwidefsky@de.ibm.com>,
 *	      Cornelia Huck <cornelia.huck@de.ibm.com>,
 *	      Heiko Carstens <heiko.carstens@de.ibm.com>,
 */

#include <linux/kernel_stat.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/hardirq.h>
#include <linux/time.h>
#include <linux/module.h>
#include <asm/lowcore.h>
#include <asm/smp.h>
#include <asm/etr.h>
#include <asm/cputime.h>
#include <asm/nmi.h>
#include <asm/crw.h>
#include <asm/switch_to.h>

struct mcck_struct {
	int kill_task;
	int channel_report;
	int warning;
	unsigned long long mcck_code;
};

static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);

static void s390_handle_damage(char *msg)
{
	smp_send_stop();
	disabled_wait((unsigned long) __builtin_return_address(0));
	while (1);
}

/*
 * Main machine check handler function. Will be called with interrupts enabled
 * or disabled and machine checks enabled or disabled.
 */
void s390_handle_mcck(void)
{
	unsigned long flags;
	struct mcck_struct mcck;

	/*
	 * Disable machine checks and get the current state of accumulated
	 * machine checks. Afterwards delete the old state and enable machine
	 * checks again.
	 */
	local_irq_save(flags);
	local_mcck_disable();
	mcck = *this_cpu_ptr(&cpu_mcck);
	memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck));
	clear_cpu_flag(CIF_MCCK_PENDING);
	local_mcck_enable();
	local_irq_restore(flags);

	if (mcck.channel_report)
		crw_handle_channel_report();
	/*
	 * A warning may remain for a prolonged period on the bare iron
	 * (actually until the machine is powered off, or the problem is
	 * gone), so we just stop listening for the WARNING MCH and avoid
	 * continuously being interrupted. One caveat, however, is that we
	 * must do this per processor and cannot use the smp version of
	 * ctl_clear_bit().
	 * On VM we only get one interrupt per virtually presented machine
	 * check. Although one suffices, we may get one interrupt per
	 * (virtual) cpu.
	 */
	if (mcck.warning) {	/* WARNING pending ? */
		static int mchchk_wng_posted = 0;

		/* Use single cpu clear, as we cannot handle smp here. */
		__ctl_clear_bit(14, 24);	/* Disable WARNING MCH */
		if (xchg(&mchchk_wng_posted, 1) == 0)
			kill_cad_pid(SIGPWR, 1);
	}
	if (mcck.kill_task) {
		local_irq_enable();
		printk(KERN_EMERG "mcck: Terminating task because of machine "
		       "malfunction (code 0x%016llx).\n", mcck.mcck_code);
		printk(KERN_EMERG "mcck: task: %s, pid: %d.\n",
		       current->comm, current->pid);
		do_exit(SIGSEGV);
	}
}
EXPORT_SYMBOL_GPL(s390_handle_mcck);

/*
 * returns 0 if all registers could be validated
 * returns 1 otherwise
 */
static int notrace s390_revalidate_registers(struct mci *mci)
{
	int kill_task;
	u64 zero;
	void *fpt_save_area, *fpt_creg_save_area;

	kill_task = 0;
	zero = 0;

	if (!mci->gr) {
		/*
		 * General purpose registers couldn't be restored and have
		 * unknown contents. Process needs to be terminated.
		 */
		kill_task = 1;
	}
	if (!mci->fp) {
		/*
		 * Floating point registers can't be restored and
		 * therefore the process needs to be terminated.
		 */
		kill_task = 1;
	}
	fpt_save_area = &S390_lowcore.floating_pt_save_area;
	fpt_creg_save_area = &S390_lowcore.fpt_creg_save_area;
	if (!mci->fc) {
		/*
		 * Floating point control register can't be restored.
		 * Task will be terminated.
		 */
		asm volatile("lfpc 0(%0)" : : "a" (&zero), "m" (zero));
		kill_task = 1;
	} else
		asm volatile("lfpc 0(%0)" : : "a" (fpt_creg_save_area));

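	/* Revalidate floating point registers from the lowcore save area */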
	asm volatile(
		"	ld	0,0(%0)\n"
		"	ld	1,8(%0)\n"
		"	ld	2,16(%0)\n"
		"	ld	3,24(%0)\n"
		"	ld	4,32(%0)\n"
		"	ld	5,40(%0)\n"
		"	ld	6,48(%0)\n"
		"	ld	7,56(%0)\n"
		"	ld	8,64(%0)\n"
		"	ld	9,72(%0)\n"
		"	ld	10,80(%0)\n"
		"	ld	11,88(%0)\n"
		"	ld	12,96(%0)\n"
		"	ld	13,104(%0)\n"
		"	ld	14,112(%0)\n"
		"	ld	15,120(%0)\n"
		: : "a" (fpt_save_area));
	/* Revalidate vector registers */
	if (MACHINE_HAS_VX && current->thread.vxrs) {
		if (!mci->vr) {
			/*
			 * Vector registers can't be restored and therefore
			 * the process needs to be terminated.
			 */
			kill_task = 1;
		}
		restore_vx_regs((__vector128 *)
				S390_lowcore.vector_save_area_addr);
	}
	/* Revalidate access registers */
	asm volatile(
		"	lam	0,15,0(%0)"
		: : "a" (&S390_lowcore.access_regs_save_area));
	if (!mci->ar) {
		/*
		 * Access registers have unknown contents.
		 * Terminating task.
		 */
		kill_task = 1;
	}
	/* Revalidate control registers */
	if (!mci->cr) {
		/*
		 * Control registers have unknown contents.
		 * Can't recover and therefore stopping machine.
		 */
		s390_handle_damage("invalid control registers.");
	} else {
		asm volatile(
			"	lctlg	0,15,0(%0)"
			: : "a" (&S390_lowcore.cregs_save_area));
	}
	/*
	 * We don't even try to revalidate the TOD register, since we simply
	 * can't write something sensible into that register.
	 */
	/*
	 * See if we can revalidate the TOD programmable register with its
	 * old contents (should be zero) otherwise set it to zero.
	 */
	if (!mci->pr)
		asm volatile(
			"	sr	0,0\n"
			"	sckpf"
			: : : "0", "cc");
	else
		asm volatile(
			"	l	0,0(%0)\n"
			"	sckpf"
			: : "a" (&S390_lowcore.tod_progreg_save_area)
			: "0", "cc");
	/* Revalidate clock comparator register */
	set_clock_comparator(S390_lowcore.clock_comparator);
	/* Check if old PSW is valid */
	if (!mci->wp)
		/*
		 * Can't tell if we come from user or kernel mode
		 * -> stopping machine.
		 */
		s390_handle_damage("old psw invalid.");

	if (!mci->ms || !mci->pm || !mci->ia)
		kill_task = 1;

	return kill_task;
}

#define MAX_IPD_COUNT	29
#define MAX_IPD_TIME	(5 * 60 * USEC_PER_SEC) /* 5 minutes */

#define ED_STP_ISLAND	6	/* External damage STP island check */
#define ED_STP_SYNC	7	/* External damage STP sync check */
#define ED_ETR_SYNC	12	/* External damage ETR sync check */
#define ED_ETR_SWITCH	13	/* External damage ETR switch to local */

/*
 * machine check handler.
 */
void notrace s390_do_machine_check(struct pt_regs *regs)
{
	static int ipd_count;
	static DEFINE_SPINLOCK(ipd_lock);
	static unsigned long long last_ipd;
	struct mcck_struct *mcck;
	unsigned long long tmp;
	struct mci *mci;
	int umode;

	nmi_enter();
	inc_irq_stat(NMI_NMI);
	mci = (struct mci *) &S390_lowcore.mcck_interruption_code;
	mcck = this_cpu_ptr(&cpu_mcck);
	umode = user_mode(regs);

	if (mci->sd) {
		/* System damage -> stopping machine */
		s390_handle_damage("received system damage machine check.");
	}
	if (mci->pd) {
		if (mci->b) {
			/* Processing backup -> verify if we can survive this */
			u64 z_mcic, o_mcic, t_mcic;
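			/*
			 * z_mcic: MCIC bits that must be zero and o_mcic:
			 * MCIC bits that must all be one for this machine
			 * check to be considered survivable.
			 */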
			z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29);
			o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 |
				  1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 |
				  1ULL<<30 | 1ULL<<21 | 1ULL<<20 | 1ULL<<17 |
				  1ULL<<16);
			t_mcic = *(u64 *)mci;

			if (((t_mcic & z_mcic) != 0) ||
			    ((t_mcic & o_mcic) != o_mcic)) {
				s390_handle_damage("processing backup machine "
						   "check with damage.");
			}

			/*
			 * Nullifying exigent condition, therefore we might
			 * retry this instruction.
			 */
			spin_lock(&ipd_lock);
			tmp = get_tod_clock();
			if (((tmp - last_ipd) >> 12) < MAX_IPD_TIME)
				ipd_count++;
			else
				ipd_count = 1;
			last_ipd = tmp;
			if (ipd_count == MAX_IPD_COUNT)
				s390_handle_damage("too many ipd retries.");
			spin_unlock(&ipd_lock);
		} else {
			/* Processing damage -> stopping machine */
			s390_handle_damage("received instruction processing "
					   "damage machine check.");
		}
	}
	if (s390_revalidate_registers(mci)) {
		if (umode) {
			/*
			 * Couldn't restore all register contents while in
			 * user mode -> mark task for termination.
			 */
			mcck->kill_task = 1;
			mcck->mcck_code = *(unsigned long long *) mci;
			set_cpu_flag(CIF_MCCK_PENDING);
		} else {
			/*
			 * Couldn't restore all register contents while in
			 * kernel mode -> stopping machine.
			 */
			s390_handle_damage("unable to revalidate registers.");
		}
	}
	if (mci->cd) {
		/* Timing facility damage */
		s390_handle_damage("TOD clock damaged");
	}
	if (mci->ed && mci->ec) {
		/* External damage */
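		/*
		 * The external damage code bits checked below indicate which
		 * clock synchronization facility (ETR or STP) reported the
		 * damage.
		 */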
		if (S390_lowcore.external_damage_code & (1U << ED_ETR_SYNC))
			etr_sync_check();
		if (S390_lowcore.external_damage_code & (1U << ED_ETR_SWITCH))
			etr_switch_to_local();
		if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC))
			stp_sync_check();
		if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND))
			stp_island_check();
	}
	if (mci->se)
		/* Storage error uncorrected */
		s390_handle_damage("received storage error uncorrected "
				   "machine check.");
	if (mci->ke)
		/* Storage key-error uncorrected */
		s390_handle_damage("received storage key-error uncorrected "
				   "machine check.");
	if (mci->ds && mci->fa)
		/* Storage degradation */
		s390_handle_damage("received storage degradation machine "
				   "check.");
	if (mci->cp) {
		/* Channel report word pending */
		mcck->channel_report = 1;
		set_cpu_flag(CIF_MCCK_PENDING);
	}
	if (mci->w) {
		/* Warning pending */
		mcck->warning = 1;
		set_cpu_flag(CIF_MCCK_PENDING);
	}
	nmi_exit();
}

static int __init machine_check_init(void)
{
	ctl_set_bit(14, 25);	/* enable external damage MCH */
	ctl_set_bit(14, 27);	/* enable system recovery MCH */
	ctl_set_bit(14, 24);	/* enable warning MCH */
	return 0;
}
arch_initcall(machine_check_init);