/*
 * Machine check exception handling.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright 2013 IBM Corporation
 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
 */

#undef DEBUG
#define pr_fmt(fmt) "mce: " fmt

#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
#include <linux/export.h>
#include <linux/irq_work.h>
#include <asm/mce.h>

/*
 * Per-cpu stack of in-flight MCE events.  mce_nest_count tracks how deep
 * we are nested (MCEs can nest up to MAX_MC_EVT deep); mce_event[] holds
 * one machine_check_event per nesting level.
 */
static DEFINE_PER_CPU(int, mce_nest_count);
static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);

/* Queue for delayed MCE events. */
static DEFINE_PER_CPU(int, mce_queue_count);
static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);

static void machine_check_process_queued_event(struct irq_work *work);
static struct irq_work mce_event_process_work = {
	.func = machine_check_process_queued_event,
};

/*
 * Copy the type-specific error information from the decoded mce_err
 * into the event structure, selected by error_type.
 */
static void mce_set_error_info(struct machine_check_event *mce,
			       struct mce_error_info *mce_err)
{
	mce->error_type = mce_err->error_type;
	switch (mce_err->error_type) {
	case MCE_ERROR_TYPE_UE:
		mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type;
		break;
	case MCE_ERROR_TYPE_SLB:
		mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type;
		break;
	case MCE_ERROR_TYPE_ERAT:
		mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type;
		break;
	case MCE_ERROR_TYPE_TLB:
		mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type;
		break;
	case MCE_ERROR_TYPE_USER:
		mce->u.user_error.user_error_type = mce_err->u.user_error_type;
		break;
	case MCE_ERROR_TYPE_RA:
		mce->u.ra_error.ra_error_type = mce_err->u.ra_error_type;
		break;
	case MCE_ERROR_TYPE_LINK:
		mce->u.link_error.link_error_type = mce_err->u.link_error_type;
		break;
	case MCE_ERROR_TYPE_UNKNOWN:
	default:
		break;
	}
}

/*
 * Decode and save high level MCE information into per cpu buffer which
 * is an array of machine_check_event structure.
 */
void save_mce_event(struct pt_regs *regs, long handled,
		    struct mce_error_info *mce_err,
		    uint64_t nip, uint64_t addr)
{
	/* Claim the next slot on this cpu's event stack. */
	int index = __this_cpu_inc_return(mce_nest_count) - 1;
	struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);

	/*
	 * Return if we don't have enough space to log mce event.
	 * mce_nest_count may go beyond MAX_MC_EVT but that's ok,
	 * the check below will stop buffer overrun.
	 */
	if (index >= MAX_MC_EVT)
		return;

	/* Populate generic machine check info */
	mce->version = MCE_V1;
	mce->srr0 = nip;
	mce->srr1 = regs->msr;
	mce->gpr3 = regs->gpr[3];
	mce->in_use = 1;

	/* Mark it recovered if we have handled it and MSR(RI=1). */
	if (handled && (regs->msr & MSR_RI))
		mce->disposition = MCE_DISPOSITION_RECOVERED;
	else
		mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;

	mce->initiator = mce_err->initiator;
	mce->severity = mce_err->severity;

	/*
	 * Populate the mce error_type and type-specific error_type.
	 */
	mce_set_error_info(mce, mce_err);

	/* addr == 0 means no effective address was captured. */
	if (!addr)
		return;

	if (mce->error_type == MCE_ERROR_TYPE_TLB) {
		mce->u.tlb_error.effective_address_provided = true;
		mce->u.tlb_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_SLB) {
		mce->u.slb_error.effective_address_provided = true;
		mce->u.slb_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_ERAT) {
		mce->u.erat_error.effective_address_provided = true;
		mce->u.erat_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_USER) {
		mce->u.user_error.effective_address_provided = true;
		mce->u.user_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_RA) {
		mce->u.ra_error.effective_address_provided = true;
		mce->u.ra_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_LINK) {
		mce->u.link_error.effective_address_provided = true;
		mce->u.link_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
		mce->u.ue_error.effective_address_provided = true;
		mce->u.ue_error.effective_address = addr;
	}
	return;
}

/*
 * get_mce_event:
 *	mce	Pointer to machine_check_event structure to be filled.
 *	release	Flag to indicate whether to free the event slot or not.
 *		0 <= do not release the mce event. Caller will invoke
 *		     release_mce_event() once event has been consumed.
 *		1 <= release the slot.
 *
 *	return	1 = success
 *		0 = failure
 *
 * get_mce_event() will be called by platform specific machine check
 * handle routine and in KVM.
 * When we call get_mce_event(), we are still in interrupt context and
 * preemption will not be scheduled until ret_from_except() routine
 * is called.
 */
int get_mce_event(struct machine_check_event *mce, bool release)
{
	/* Top of this cpu's event stack. */
	int index = __this_cpu_read(mce_nest_count) - 1;
	struct machine_check_event *mc_evt;
	int ret = 0;

	/* Sanity check */
	if (index < 0)
		return ret;

	/* Check if we have MCE info to process. */
	if (index < MAX_MC_EVT) {
		mc_evt = this_cpu_ptr(&mce_event[index]);
		/* Copy the event structure and release the original */
		if (mce)
			*mce = *mc_evt;
		if (release)
			mc_evt->in_use = 0;
		ret = 1;
	}
	/* Decrement the count to free the slot. */
	if (release)
		__this_cpu_dec(mce_nest_count);

	return ret;
}

/* Pop the top event off this cpu's stack without copying it out. */
void release_mce_event(void)
{
	get_mce_event(NULL, true);
}

/*
 * Queue up the MCE event which then can be handled later.
 */
void machine_check_queue_event(void)
{
	int index;
	struct machine_check_event evt;

	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
		return;

	index = __this_cpu_inc_return(mce_queue_count) - 1;
	/* If queue is full, just return for now. */
	if (index >= MAX_MC_EVT) {
		__this_cpu_dec(mce_queue_count);
		return;
	}
	memcpy(this_cpu_ptr(&mce_event_queue[index]), &evt, sizeof(evt));

	/* Queue irq work to process this event later. */
	irq_work_queue(&mce_event_process_work);
}

/*
 * process pending MCE event from the mce event queue. This function will be
 * called during syscall exit.
 */
static void machine_check_process_queued_event(struct irq_work *work)
{
	int index;

	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

	/*
	 * For now just print it to console.
	 * TODO: log this error event to FSP or nvram.
	 */
	while (__this_cpu_read(mce_queue_count) > 0) {
		index = __this_cpu_read(mce_queue_count) - 1;
		machine_check_print_event_info(
				this_cpu_ptr(&mce_event_queue[index]), false);
		__this_cpu_dec(mce_queue_count);
	}
}

/*
 * Print a human-readable summary of the given MCE event to the console.
 * user_mode selects whether the NIP is reported with PID/comm (user) or
 * resolved as a kernel symbol (%pS).
 */
void machine_check_print_event_info(struct machine_check_event *evt,
				    bool user_mode)
{
	const char *level, *sevstr, *subtype;
	/* Subtype name tables, indexed by the per-type *_error_type value. */
	static const char *mc_ue_types[] = {
		"Indeterminate",
		"Instruction fetch",
		"Page table walk ifetch",
		"Load/Store",
		"Page table walk Load/Store",
	};
	static const char *mc_slb_types[] = {
		"Indeterminate",
		"Parity",
		"Multihit",
	};
	static const char *mc_erat_types[] = {
		"Indeterminate",
		"Parity",
		"Multihit",
	};
	static const char *mc_tlb_types[] = {
		"Indeterminate",
		"Parity",
		"Multihit",
	};
	static const char *mc_user_types[] = {
		"Indeterminate",
		"tlbie(l) invalid",
	};
	static const char *mc_ra_types[] = {
		"Indeterminate",
		"Instruction fetch (bad)",
		"Instruction fetch (foreign)",
		"Page table walk ifetch (bad)",
		"Page table walk ifetch (foreign)",
		"Load (bad)",
		"Store (bad)",
		"Page table walk Load/Store (bad)",
		"Page table walk Load/Store (foreign)",
		"Load/Store (foreign)",
	};
	static const char *mc_link_types[] = {
		"Indeterminate",
		"Instruction fetch (timeout)",
		"Page table walk ifetch (timeout)",
		"Load (timeout)",
		"Store (timeout)",
		"Page table walk Load/Store (timeout)",
	};

	/* Print things out */
	if (evt->version != MCE_V1) {
		pr_err("Machine Check Exception, Unknown event version %d !\n",
		       evt->version);
		return;
	}

	/* Map severity to a printk level and a human-readable label. */
	switch (evt->severity) {
	case MCE_SEV_NO_ERROR:
		level = KERN_INFO;
		sevstr = "Harmless";
		break;
	case MCE_SEV_WARNING:
		level = KERN_WARNING;
		sevstr = "";
		break;
	case MCE_SEV_ERROR_SYNC:
		level = KERN_ERR;
		sevstr = "Severe";
		break;
	case MCE_SEV_FATAL:
	default:
		level = KERN_ERR;
		sevstr = "Fatal";
		break;
	}

	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
	       "Recovered" : "Not recovered");

	if (user_mode) {
		printk("%s  NIP: [%016llx] PID: %d Comm: %s\n", level,
		       evt->srr0, current->pid, current->comm);
	} else {
		printk("%s  NIP [%016llx]: %pS\n", level, evt->srr0,
		       (void *)evt->srr0);
	}

	printk("%s  Initiator: %s\n", level,
	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
	switch (evt->error_type) {
	case MCE_ERROR_TYPE_UE:
		/* Out-of-range subtype values are reported as "Unknown". */
		subtype = evt->u.ue_error.ue_error_type <
			ARRAY_SIZE(mc_ue_types) ?
			mc_ue_types[evt->u.ue_error.ue_error_type]
			: "Unknown";
		printk("%s  Error type: UE [%s]\n", level, subtype);
		if (evt->u.ue_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.ue_error.effective_address);
		if (evt->u.ue_error.physical_address_provided)
			printk("%s    Physical address:  %016llx\n",
			       level, evt->u.ue_error.physical_address);
		break;
	case MCE_ERROR_TYPE_SLB:
		subtype = evt->u.slb_error.slb_error_type <
			ARRAY_SIZE(mc_slb_types) ?
			mc_slb_types[evt->u.slb_error.slb_error_type]
			: "Unknown";
		printk("%s  Error type: SLB [%s]\n", level, subtype);
		if (evt->u.slb_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.slb_error.effective_address);
		break;
	case MCE_ERROR_TYPE_ERAT:
		subtype = evt->u.erat_error.erat_error_type <
			ARRAY_SIZE(mc_erat_types) ?
			mc_erat_types[evt->u.erat_error.erat_error_type]
			: "Unknown";
		printk("%s  Error type: ERAT [%s]\n", level, subtype);
		if (evt->u.erat_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.erat_error.effective_address);
		break;
	case MCE_ERROR_TYPE_TLB:
		subtype = evt->u.tlb_error.tlb_error_type <
			ARRAY_SIZE(mc_tlb_types) ?
			mc_tlb_types[evt->u.tlb_error.tlb_error_type]
			: "Unknown";
		printk("%s  Error type: TLB [%s]\n", level, subtype);
		if (evt->u.tlb_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.tlb_error.effective_address);
		break;
	case MCE_ERROR_TYPE_USER:
		subtype = evt->u.user_error.user_error_type <
			ARRAY_SIZE(mc_user_types) ?
			mc_user_types[evt->u.user_error.user_error_type]
			: "Unknown";
		printk("%s  Error type: User [%s]\n", level, subtype);
		if (evt->u.user_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.user_error.effective_address);
		break;
	case MCE_ERROR_TYPE_RA:
		subtype = evt->u.ra_error.ra_error_type <
			ARRAY_SIZE(mc_ra_types) ?
			mc_ra_types[evt->u.ra_error.ra_error_type]
			: "Unknown";
		printk("%s  Error type: Real address [%s]\n", level, subtype);
		if (evt->u.ra_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.ra_error.effective_address);
		break;
	case MCE_ERROR_TYPE_LINK:
		subtype = evt->u.link_error.link_error_type <
			ARRAY_SIZE(mc_link_types) ?
			mc_link_types[evt->u.link_error.link_error_type]
			: "Unknown";
		printk("%s  Error type: Link [%s]\n", level, subtype);
		if (evt->u.link_error.effective_address_provided)
			printk("%s    Effective address: %016llx\n",
			       level, evt->u.link_error.effective_address);
		break;
	default:
	case MCE_ERROR_TYPE_UNKNOWN:
		printk("%s  Error type: Unknown\n", level);
		break;
	}
}

/*
 * Return the effective address recorded in the event, if one was
 * provided for its error type; 0 otherwise.
 */
uint64_t get_mce_fault_addr(struct machine_check_event *evt)
{
	switch (evt->error_type) {
	case MCE_ERROR_TYPE_UE:
		if (evt->u.ue_error.effective_address_provided)
			return evt->u.ue_error.effective_address;
		break;
	case MCE_ERROR_TYPE_SLB:
		if (evt->u.slb_error.effective_address_provided)
			return evt->u.slb_error.effective_address;
		break;
	case MCE_ERROR_TYPE_ERAT:
		if (evt->u.erat_error.effective_address_provided)
			return evt->u.erat_error.effective_address;
		break;
	case MCE_ERROR_TYPE_TLB:
		if (evt->u.tlb_error.effective_address_provided)
			return evt->u.tlb_error.effective_address;
		break;
	case MCE_ERROR_TYPE_USER:
		if (evt->u.user_error.effective_address_provided)
			return evt->u.user_error.effective_address;
		break;
	case MCE_ERROR_TYPE_RA:
		if (evt->u.ra_error.effective_address_provided)
			return evt->u.ra_error.effective_address;
		break;
	case MCE_ERROR_TYPE_LINK:
		if (evt->u.link_error.effective_address_provided)
			return evt->u.link_error.effective_address;
		break;
	default:
	case MCE_ERROR_TYPE_UNKNOWN:
		break;
	}
	return 0;
}
EXPORT_SYMBOL(get_mce_fault_addr);