xref: /linux/arch/powerpc/platforms/pseries/ras.c (revision 8afecfb0ec961e37e61b2d19c4fa71617a9482de)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2001 Dave Engebretsen IBM Corporation
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/interrupt.h>
8 #include <linux/irq.h>
9 #include <linux/of.h>
10 #include <linux/fs.h>
11 #include <linux/reboot.h>
12 #include <linux/irq_work.h>
13 
14 #include <asm/machdep.h>
15 #include <asm/rtas.h>
16 #include <asm/firmware.h>
17 #include <asm/mce.h>
18 
19 #include "pseries.h"
20 
/*
 * Buffer handed to the RTAS check-exception call by the interrupt handlers
 * in this file. Each of those handlers takes ras_log_buf_lock around the
 * call and the subsequent parsing/logging of the buffer.
 */
21 static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
22 static DEFINE_SPINLOCK(ras_log_buf_lock);
23 
/* RTAS token for "check-exception"; looked up in init_ras_IRQ(). */
24 static int ras_check_exception_token;
25 
/* irq_work queued by recover_mce() so the MCE error log is logged later. */
26 static void mce_process_errlog_event(struct irq_work *work);
27 static struct irq_work mce_errlog_process_work = {
28 	.func = mce_process_errlog_event,
29 };
30 
/* Sensor token/index passed to rtas_get_sensor_fast() by ras_epow_interrupt(). */
31 #define EPOW_SENSOR_TOKEN	9
32 #define EPOW_SENSOR_INDEX	0
33 
34 /* EPOW events counter variable */
35 static int num_epow_events;
36 
/* Interrupt handlers registered by the init functions; defined further down. */
37 static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id);
38 static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
39 static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
40 
41 /* RTAS pseries MCE errorlog section. */
42 struct pseries_mc_errorlog {
43 	__be32	fru_id;
44 	__be32	proc_id;
	/* One of the MC_ERROR_TYPE_* values. */
45 	u8	error_type;
46 	/*
47 	 * sub_err_type (1 byte). Bit fields depend on error_type
48 	 *
49 	 *   MSB0
50 	 *   |
51 	 *   V
52 	 *   01234567
53 	 *   XXXXXXXX
54 	 *
55 	 * For error_type == MC_ERROR_TYPE_UE
56 	 *   XXXXXXXX
57 	 *   X		1: Permanent or Transient UE.
58 	 *    X		1: Effective address provided.
59 	 *     X	1: Logical address provided.
60 	 *      XX	2: Reserved.
61 	 *        XXX	3: Type of UE error.
62 	 *
63 	 * For error_type != MC_ERROR_TYPE_UE
64 	 *   XXXXXXXX
65 	 *   X		1: Effective address provided.
66 	 *    XXXXX	5: Reserved.
67 	 *         XX	2: Type of SLB/ERAT/TLB error.
	 *
	 * With MSB0 numbering the "effective address provided" bit is mask
	 * 0x40 for UE and 0x80 otherwise, "logical address provided" is 0x20,
	 * and the type field is the low 3 (UE) or low 2 (other) bits; these
	 * are the masks the helpers in this file apply.
68 	 */
69 	u8	sub_err_type;
70 	u8	reserved_1[6];
	/*
	 * Big-endian addresses. The code in this file only reads them after
	 * checking the matching "provided" bit in sub_err_type.
	 */
71 	__be64	effective_address;
72 	__be64	logical_address;
73 } __packed;
74 
75 /* RTAS pseries MCE error types */
/*
 * Compared against pseries_mc_errorlog.error_type. Values 0x03 and 0x06 have
 * no name here; pseries_print_mce_info() prints them as "Unknown".
 */
76 #define MC_ERROR_TYPE_UE		0x00
77 #define MC_ERROR_TYPE_SLB		0x01
78 #define MC_ERROR_TYPE_ERAT		0x02
79 #define MC_ERROR_TYPE_TLB		0x04
80 #define MC_ERROR_TYPE_D_CACHE		0x05
81 #define MC_ERROR_TYPE_I_CACHE		0x07
82 
83 /* RTAS pseries MCE error sub types */
/*
 * These are the values of the type field extracted from sub_err_type by
 * rtas_mc_error_sub_type(). Note that the SLB sub types are numbered from 0
 * while the ERAT and TLB sub types are numbered from 1.
 */
84 #define MC_ERROR_UE_INDETERMINATE		0
85 #define MC_ERROR_UE_IFETCH			1
86 #define MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH	2
87 #define MC_ERROR_UE_LOAD_STORE			3
88 #define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE	4
89 
90 #define MC_ERROR_SLB_PARITY		0
91 #define MC_ERROR_SLB_MULTIHIT		1
92 #define MC_ERROR_SLB_INDETERMINATE	2
93 
94 #define MC_ERROR_ERAT_PARITY		1
95 #define MC_ERROR_ERAT_MULTIHIT		2
96 #define MC_ERROR_ERAT_INDETERMINATE	3
97 
98 #define MC_ERROR_TLB_PARITY		1
99 #define MC_ERROR_TLB_MULTIHIT		2
100 #define MC_ERROR_TLB_INDETERMINATE	3
101 
102 static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog)
103 {
104 	switch (mlog->error_type) {
105 	case	MC_ERROR_TYPE_UE:
106 		return (mlog->sub_err_type & 0x07);
107 	case	MC_ERROR_TYPE_SLB:
108 	case	MC_ERROR_TYPE_ERAT:
109 	case	MC_ERROR_TYPE_TLB:
110 		return (mlog->sub_err_type & 0x03);
111 	default:
112 		return 0;
113 	}
114 }
115 
116 static
117 inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog)
118 {
119 	__be64 addr = 0;
120 
121 	switch (mlog->error_type) {
122 	case	MC_ERROR_TYPE_UE:
123 		if (mlog->sub_err_type & 0x40)
124 			addr = mlog->effective_address;
125 		break;
126 	case	MC_ERROR_TYPE_SLB:
127 	case	MC_ERROR_TYPE_ERAT:
128 	case	MC_ERROR_TYPE_TLB:
129 		if (mlog->sub_err_type & 0x80)
130 			addr = mlog->effective_address;
131 	default:
132 		break;
133 	}
134 	return be64_to_cpu(addr);
135 }
136 
137 /*
138  * Enable the hotplug interrupt late because processing them may touch other
139  * devices or systems (e.g. hugepages) that have not been initialized at the
140  * subsys stage.
141  */
142 int __init init_ras_hotplug_IRQ(void)
143 {
144 	struct device_node *np;
145 
146 	/* Hotplug Events */
147 	np = of_find_node_by_path("/event-sources/hot-plug-events");
148 	if (np != NULL) {
149 		if (dlpar_workqueue_init() == 0)
150 			request_event_sources_irqs(np, ras_hotplug_interrupt,
151 						   "RAS_HOTPLUG");
152 		of_node_put(np);
153 	}
154 
155 	return 0;
156 }
157 machine_late_initcall(pseries, init_ras_hotplug_IRQ);
158 
159 /*
160  * Initialize handlers for the set of interrupts caused by hardware errors
161  * and power system events.
162  */
163 static int __init init_ras_IRQ(void)
164 {
165 	struct device_node *np;
166 
167 	ras_check_exception_token = rtas_token("check-exception");
168 
169 	/* Internal Errors */
170 	np = of_find_node_by_path("/event-sources/internal-errors");
171 	if (np != NULL) {
172 		request_event_sources_irqs(np, ras_error_interrupt,
173 					   "RAS_ERROR");
174 		of_node_put(np);
175 	}
176 
177 	/* EPOW Events */
178 	np = of_find_node_by_path("/event-sources/epow-events");
179 	if (np != NULL) {
180 		request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW");
181 		of_node_put(np);
182 	}
183 
184 	return 0;
185 }
186 machine_subsys_initcall(pseries, init_ras_IRQ);
187 
188 #define EPOW_SHUTDOWN_NORMAL				1
189 #define EPOW_SHUTDOWN_ON_UPS				2
190 #define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS	3
191 #define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH	4
192 
193 static void handle_system_shutdown(char event_modifier)
194 {
195 	switch (event_modifier) {
196 	case EPOW_SHUTDOWN_NORMAL:
197 		pr_emerg("Power off requested\n");
198 		orderly_poweroff(true);
199 		break;
200 
201 	case EPOW_SHUTDOWN_ON_UPS:
202 		pr_emerg("Loss of system power detected. System is running on"
203 			 " UPS/battery. Check RTAS error log for details\n");
204 		orderly_poweroff(true);
205 		break;
206 
207 	case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:
208 		pr_emerg("Loss of system critical functions detected. Check"
209 			 " RTAS error log for details\n");
210 		orderly_poweroff(true);
211 		break;
212 
213 	case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH:
214 		pr_emerg("High ambient temperature detected. Check RTAS"
215 			 " error log for details\n");
216 		orderly_poweroff(true);
217 		break;
218 
219 	default:
220 		pr_err("Unknown power/cooling shutdown event (modifier = %d)\n",
221 			event_modifier);
222 	}
223 }
224 
/*
 * Payload of the EPOW section of an RTAS error log, as overlaid on the
 * section data by rtas_parse_epow_errlog(). That function uses the low four
 * bits of sensor_value as the action code; the remaining fields other than
 * event_modifier are not read in this file.
 */
225 struct epow_errorlog {
226 	unsigned char sensor_value;
227 	unsigned char event_modifier;
228 	unsigned char extended_modifier;
229 	unsigned char reserved;
230 	unsigned char platform_reason;
231 };
232 
/*
 * EPOW action codes switched on by rtas_parse_epow_errlog(). Value 6 has no
 * name here and is reported there as an unknown event.
 */
233 #define EPOW_RESET			0
234 #define EPOW_WARN_COOLING		1
235 #define EPOW_WARN_POWER			2
236 #define EPOW_SYSTEM_SHUTDOWN		3
237 #define EPOW_SYSTEM_HALT		4
238 #define EPOW_MAIN_ENCLOSURE		5
239 #define EPOW_POWER_OFF			7
240 
241 static void rtas_parse_epow_errlog(struct rtas_error_log *log)
242 {
243 	struct pseries_errorlog *pseries_log;
244 	struct epow_errorlog *epow_log;
245 	char action_code;
246 	char modifier;
247 
248 	pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW);
249 	if (pseries_log == NULL)
250 		return;
251 
252 	epow_log = (struct epow_errorlog *)pseries_log->data;
253 	action_code = epow_log->sensor_value & 0xF;	/* bottom 4 bits */
254 	modifier = epow_log->event_modifier & 0xF;	/* bottom 4 bits */
255 
256 	switch (action_code) {
257 	case EPOW_RESET:
258 		if (num_epow_events) {
259 			pr_info("Non critical power/cooling issue cleared\n");
260 			num_epow_events--;
261 		}
262 		break;
263 
264 	case EPOW_WARN_COOLING:
265 		pr_info("Non-critical cooling issue detected. Check RTAS error"
266 			" log for details\n");
267 		break;
268 
269 	case EPOW_WARN_POWER:
270 		pr_info("Non-critical power issue detected. Check RTAS error"
271 			" log for details\n");
272 		break;
273 
274 	case EPOW_SYSTEM_SHUTDOWN:
275 		handle_system_shutdown(epow_log->event_modifier);
276 		break;
277 
278 	case EPOW_SYSTEM_HALT:
279 		pr_emerg("Critical power/cooling issue detected. Check RTAS"
280 			 " error log for details. Powering off.\n");
281 		orderly_poweroff(true);
282 		break;
283 
284 	case EPOW_MAIN_ENCLOSURE:
285 	case EPOW_POWER_OFF:
286 		pr_emerg("System about to lose power. Check RTAS error log "
287 			 " for details. Powering off immediately.\n");
288 		emergency_sync();
289 		kernel_power_off();
290 		break;
291 
292 	default:
293 		pr_err("Unknown power/cooling event (action code  = %d)\n",
294 			action_code);
295 	}
296 
297 	/* Increment epow events counter variable */
298 	if (action_code != EPOW_RESET)
299 		num_epow_events++;
300 }
301 
302 static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id)
303 {
304 	struct pseries_errorlog *pseries_log;
305 	struct pseries_hp_errorlog *hp_elog;
306 
307 	spin_lock(&ras_log_buf_lock);
308 
309 	rtas_call(ras_check_exception_token, 6, 1, NULL,
310 		  RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq),
311 		  RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf),
312 		  rtas_get_error_log_max());
313 
314 	pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf,
315 					   PSERIES_ELOG_SECT_ID_HOTPLUG);
316 	hp_elog = (struct pseries_hp_errorlog *)pseries_log->data;
317 
318 	/*
319 	 * Since PCI hotplug is not currently supported on pseries, put PCI
320 	 * hotplug events on the ras_log_buf to be handled by rtas_errd.
321 	 */
322 	if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM ||
323 	    hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU ||
324 	    hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_PMEM)
325 		queue_hotplug_event(hp_elog);
326 	else
327 		log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
328 
329 	spin_unlock(&ras_log_buf_lock);
330 	return IRQ_HANDLED;
331 }
332 
333 /* Handle environmental and power warning (EPOW) interrupts. */
334 static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
335 {
336 	int status;
337 	int state;
338 	int critical;
339 
340 	status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX,
341 				      &state);
342 
343 	if (state > 3)
344 		critical = 1;		/* Time Critical */
345 	else
346 		critical = 0;
347 
348 	spin_lock(&ras_log_buf_lock);
349 
350 	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
351 			   RTAS_VECTOR_EXTERNAL_INTERRUPT,
352 			   virq_to_hw(irq),
353 			   RTAS_EPOW_WARNING,
354 			   critical, __pa(&ras_log_buf),
355 				rtas_get_error_log_max());
356 
357 	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
358 
359 	rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf);
360 
361 	spin_unlock(&ras_log_buf_lock);
362 	return IRQ_HANDLED;
363 }
364 
365 /*
366  * Handle hardware error interrupts.
367  *
368  * RTAS check-exception is called to collect data on the exception.  If
369  * the error is deemed recoverable, we log a warning and return.
370  * For nonrecoverable errors, an error is logged and we stop all processing
371  * as quickly as possible in order to prevent propagation of the failure.
372  */
373 static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
374 {
375 	struct rtas_error_log *rtas_elog;
376 	int status;
377 	int fatal;
378 
379 	spin_lock(&ras_log_buf_lock);
380 
381 	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
382 			   RTAS_VECTOR_EXTERNAL_INTERRUPT,
383 			   virq_to_hw(irq),
384 			   RTAS_INTERNAL_ERROR, 1 /* Time Critical */,
385 			   __pa(&ras_log_buf),
386 				rtas_get_error_log_max());
387 
388 	rtas_elog = (struct rtas_error_log *)ras_log_buf;
389 
390 	if (status == 0 &&
391 	    rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC)
392 		fatal = 1;
393 	else
394 		fatal = 0;
395 
396 	/* format and print the extended information */
397 	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);
398 
399 	if (fatal) {
400 		pr_emerg("Fatal hardware error detected. Check RTAS error"
401 			 " log for details. Powering off immediately\n");
402 		emergency_sync();
403 		kernel_power_off();
404 	} else {
405 		pr_err("Recoverable hardware error detected\n");
406 	}
407 
408 	spin_unlock(&ras_log_buf_lock);
409 	return IRQ_HANDLED;
410 }
411 
412 /*
413  * Some versions of FWNMI place the buffer inside the 4kB page starting at
414  * 0x7000. Other versions place it inside the rtas buffer. We check both.
 *
 * The macro is true when A lies in [0x7000, 0x7ff0) or in
 * [rtas.base, rtas.base + rtas.size - 16). A is expanded several times, so
 * callers should pass an expression without side effects.
415  */
416 #define VALID_FWNMI_BUFFER(A) \
417 	((((A) >= 0x7000) && ((A) < 0x7ff0)) || \
418 	(((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16))))
419 
420 static inline struct rtas_error_log *fwnmi_get_errlog(void)
421 {
422 	return (struct rtas_error_log *)local_paca->mce_data_buf;
423 }
424 
425 /*
426  * Get the error information for errors coming through the
427  * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
428  * the actual r3 if possible, and a ptr to the error log entry
429  * will be returned if found.
430  *
431  * Use one buffer mce_data_buf per cpu to store RTAS error.
432  *
433  * The mce_data_buf does not have any locks or protection around it,
434  * if a second machine check comes in, or a system reset is done
435  * before we have logged the error, then we will get corruption in the
436  * error log.  This is preferable over holding off on calling
437  * ibm,nmi-interlock which would result in us checkstopping if a
438  * second machine check did come in.
439  */
440 static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
441 {
442 	unsigned long *savep;
443 	struct rtas_error_log *h;
444 
445 	/* Mask top two bits */
446 	regs->gpr[3] &= ~(0x3UL << 62);
447 
448 	if (!VALID_FWNMI_BUFFER(regs->gpr[3])) {
449 		printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
450 		return NULL;
451 	}
452 
453 	savep = __va(regs->gpr[3]);
454 	regs->gpr[3] = be64_to_cpu(savep[0]);	/* restore original r3 */
455 
456 	h = (struct rtas_error_log *)&savep[1];
457 	/* Use the per cpu buffer from paca to store rtas error log */
458 	memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
459 	if (!rtas_error_extended(h)) {
460 		memcpy(local_paca->mce_data_buf, h, sizeof(__u64));
461 	} else {
462 		int len, error_log_length;
463 
464 		error_log_length = 8 + rtas_error_extended_log_length(h);
465 		len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
466 		memcpy(local_paca->mce_data_buf, h, len);
467 	}
468 
469 	return (struct rtas_error_log *)local_paca->mce_data_buf;
470 }
471 
472 /* Call this when done with the data returned by FWNMI_get_errinfo.
473  * It will release the saved data area for other CPUs in the
474  * partition to receive FWNMI errors.
475  */
476 static void fwnmi_release_errinfo(void)
477 {
478 	int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
479 	if (ret != 0)
480 		printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
481 }
482 
483 int pSeries_system_reset_exception(struct pt_regs *regs)
484 {
485 #ifdef __LITTLE_ENDIAN__
486 	/*
487 	 * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try
488 	 * to detect the bad SRR1 pattern here. Flip the NIP back to correct
489 	 * endian for reporting purposes. Unfortunately the MSR can't be fixed,
490 	 * so clear it. It will be missing MSR_RI so we won't try to recover.
491 	 */
492 	if ((be64_to_cpu(regs->msr) &
493 			(MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR|
494 			 MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) {
495 		regs->nip = be64_to_cpu((__be64)regs->nip);
496 		regs->msr = 0;
497 	}
498 #endif
499 
500 	if (fwnmi_active) {
501 		struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs);
502 		if (errhdr) {
503 			/* XXX Should look at FWNMI information */
504 		}
505 		fwnmi_release_errinfo();
506 	}
507 
508 	if (smp_handle_nmi_ipi(regs))
509 		return 1;
510 
511 	return 0; /* need to perform reset */
512 }
513 
/*
 * Look up val in the string table ar, yielding "Unknown" when val is past
 * the end of the table. ar must be a real array (ARRAY_SIZE is applied to
 * it); both arguments are expanded more than once.
 */
#define VAL_TO_STRING(ar, val)	\
	(((val) < ARRAY_SIZE(ar)) ? (ar)[(val)] : "Unknown")
516 
517 static void pseries_print_mce_info(struct pt_regs *regs,
518 				   struct rtas_error_log *errp)
519 {
520 	const char *level, *sevstr;
521 	struct pseries_errorlog *pseries_log;
522 	struct pseries_mc_errorlog *mce_log;
523 	u8 error_type, err_sub_type;
524 	u64 addr;
525 	u8 initiator = rtas_error_initiator(errp);
526 	int disposition = rtas_error_disposition(errp);
527 
528 	static const char * const initiators[] = {
529 		[0] = "Unknown",
530 		[1] = "CPU",
531 		[2] = "PCI",
532 		[3] = "ISA",
533 		[4] = "Memory",
534 		[5] = "Power Mgmt",
535 	};
536 	static const char * const mc_err_types[] = {
537 		[0] = "UE",
538 		[1] = "SLB",
539 		[2] = "ERAT",
540 		[3] = "Unknown",
541 		[4] = "TLB",
542 		[5] = "D-Cache",
543 		[6] = "Unknown",
544 		[7] = "I-Cache",
545 	};
546 	static const char * const mc_ue_types[] = {
547 		[0] = "Indeterminate",
548 		[1] = "Instruction fetch",
549 		[2] = "Page table walk ifetch",
550 		[3] = "Load/Store",
551 		[4] = "Page table walk Load/Store",
552 	};
553 
554 	/* SLB sub errors valid values are 0x0, 0x1, 0x2 */
555 	static const char * const mc_slb_types[] = {
556 		[0] = "Parity",
557 		[1] = "Multihit",
558 		[2] = "Indeterminate",
559 	};
560 
561 	/* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */
562 	static const char * const mc_soft_types[] = {
563 		[0] = "Unknown",
564 		[1] = "Parity",
565 		[2] = "Multihit",
566 		[3] = "Indeterminate",
567 	};
568 
569 	if (!rtas_error_extended(errp)) {
570 		pr_err("Machine check interrupt: Missing extended error log\n");
571 		return;
572 	}
573 
574 	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
575 	if (pseries_log == NULL)
576 		return;
577 
578 	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
579 
580 	error_type = mce_log->error_type;
581 	err_sub_type = rtas_mc_error_sub_type(mce_log);
582 
583 	switch (rtas_error_severity(errp)) {
584 	case RTAS_SEVERITY_NO_ERROR:
585 		level = KERN_INFO;
586 		sevstr = "Harmless";
587 		break;
588 	case RTAS_SEVERITY_WARNING:
589 		level = KERN_WARNING;
590 		sevstr = "";
591 		break;
592 	case RTAS_SEVERITY_ERROR:
593 	case RTAS_SEVERITY_ERROR_SYNC:
594 		level = KERN_ERR;
595 		sevstr = "Severe";
596 		break;
597 	case RTAS_SEVERITY_FATAL:
598 	default:
599 		level = KERN_ERR;
600 		sevstr = "Fatal";
601 		break;
602 	}
603 
604 #ifdef CONFIG_PPC_BOOK3S_64
605 	/* Display faulty slb contents for SLB errors. */
606 	if (error_type == MC_ERROR_TYPE_SLB)
607 		slb_dump_contents(local_paca->mce_faulty_slbs);
608 #endif
609 
610 	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
611 	       disposition == RTAS_DISP_FULLY_RECOVERED ?
612 	       "Recovered" : "Not recovered");
613 	if (user_mode(regs)) {
614 		printk("%s  NIP: [%016lx] PID: %d Comm: %s\n", level,
615 		       regs->nip, current->pid, current->comm);
616 	} else {
617 		printk("%s  NIP [%016lx]: %pS\n", level, regs->nip,
618 		       (void *)regs->nip);
619 	}
620 	printk("%s  Initiator: %s\n", level,
621 	       VAL_TO_STRING(initiators, initiator));
622 
623 	switch (error_type) {
624 	case MC_ERROR_TYPE_UE:
625 		printk("%s  Error type: %s [%s]\n", level,
626 		       VAL_TO_STRING(mc_err_types, error_type),
627 		       VAL_TO_STRING(mc_ue_types, err_sub_type));
628 		break;
629 	case MC_ERROR_TYPE_SLB:
630 		printk("%s  Error type: %s [%s]\n", level,
631 		       VAL_TO_STRING(mc_err_types, error_type),
632 		       VAL_TO_STRING(mc_slb_types, err_sub_type));
633 		break;
634 	case MC_ERROR_TYPE_ERAT:
635 	case MC_ERROR_TYPE_TLB:
636 		printk("%s  Error type: %s [%s]\n", level,
637 		       VAL_TO_STRING(mc_err_types, error_type),
638 		       VAL_TO_STRING(mc_soft_types, err_sub_type));
639 		break;
640 	default:
641 		printk("%s  Error type: %s\n", level,
642 		       VAL_TO_STRING(mc_err_types, error_type));
643 		break;
644 	}
645 
646 	addr = rtas_mc_get_effective_addr(mce_log);
647 	if (addr)
648 		printk("%s    Effective address: %016llx\n", level, addr);
649 }
650 
651 static int mce_handle_error(struct rtas_error_log *errp)
652 {
653 	struct pseries_errorlog *pseries_log;
654 	struct pseries_mc_errorlog *mce_log;
655 	int disposition = rtas_error_disposition(errp);
656 	u8 error_type;
657 
658 	if (!rtas_error_extended(errp))
659 		goto out;
660 
661 	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
662 	if (pseries_log == NULL)
663 		goto out;
664 
665 	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
666 	error_type = mce_log->error_type;
667 
668 #ifdef CONFIG_PPC_BOOK3S_64
669 	if (disposition == RTAS_DISP_NOT_RECOVERED) {
670 		switch (error_type) {
671 		case	MC_ERROR_TYPE_SLB:
672 		case	MC_ERROR_TYPE_ERAT:
673 			/*
674 			 * Store the old slb content in paca before flushing.
675 			 * Print this when we go to virtual mode.
676 			 * There are chances that we may hit MCE again if there
677 			 * is a parity error on the SLB entry we trying to read
678 			 * for saving. Hence limit the slb saving to single
679 			 * level of recursion.
680 			 */
681 			if (local_paca->in_mce == 1)
682 				slb_save_contents(local_paca->mce_faulty_slbs);
683 			flush_and_reload_slb();
684 			disposition = RTAS_DISP_FULLY_RECOVERED;
685 			rtas_set_disposition_recovered(errp);
686 			break;
687 		default:
688 			break;
689 		}
690 	}
691 #endif
692 
693 out:
694 	return disposition;
695 }
696 
697 #ifdef CONFIG_MEMORY_FAILURE
698 
/*
 * Per-cpu stack of physical addresses hit by uncorrectable errors:
 * queue_ue_paddr() pushes entries and pseries_hwpoison_work_fn() pops them.
 * rtas_ue_count is the number of valid entries in rtas_ue_paddr.
 */
699 static DEFINE_PER_CPU(int, rtas_ue_count);
700 static DEFINE_PER_CPU(unsigned long, rtas_ue_paddr[MAX_MC_EVT]);
701 
/* Bits of pseries_mc_errorlog.sub_err_type tested for UE errors. */
702 #define UE_EFFECTIVE_ADDR_PROVIDED	0x40
703 #define UE_LOGICAL_ADDR_PROVIDED	0x20
704 
705 
706 static void pseries_hwpoison_work_fn(struct work_struct *work)
707 {
708 	unsigned long paddr;
709 	int index;
710 
711 	while (__this_cpu_read(rtas_ue_count) > 0) {
712 		index = __this_cpu_read(rtas_ue_count) - 1;
713 		paddr = __this_cpu_read(rtas_ue_paddr[index]);
714 		memory_failure(paddr >> PAGE_SHIFT, 0);
715 		__this_cpu_dec(rtas_ue_count);
716 	}
717 }
718 
719 static DECLARE_WORK(hwpoison_work, pseries_hwpoison_work_fn);
720 
721 static void queue_ue_paddr(unsigned long paddr)
722 {
723 	int index;
724 
725 	index = __this_cpu_inc_return(rtas_ue_count) - 1;
726 	if (index >= MAX_MC_EVT) {
727 		__this_cpu_dec(rtas_ue_count);
728 		return;
729 	}
730 	this_cpu_write(rtas_ue_paddr[index], paddr);
731 	schedule_work(&hwpoison_work);
732 }
733 
734 static void pseries_do_memory_failure(struct pt_regs *regs,
735 				      struct pseries_mc_errorlog *mce_log)
736 {
737 	unsigned long paddr;
738 
739 	if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
740 		paddr = be64_to_cpu(mce_log->logical_address);
741 	} else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
742 		unsigned long pfn;
743 
744 		pfn = addr_to_pfn(regs,
745 				  be64_to_cpu(mce_log->effective_address));
746 		if (pfn == ULONG_MAX)
747 			return;
748 		paddr = pfn << PAGE_SHIFT;
749 	} else {
750 		return;
751 	}
752 	queue_ue_paddr(paddr);
753 }
754 
755 static void pseries_process_ue(struct pt_regs *regs,
756 			       struct rtas_error_log *errp)
757 {
758 	struct pseries_errorlog *pseries_log;
759 	struct pseries_mc_errorlog *mce_log;
760 
761 	if (!rtas_error_extended(errp))
762 		return;
763 
764 	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
765 	if (!pseries_log)
766 		return;
767 
768 	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
769 
770 	if (mce_log->error_type == MC_ERROR_TYPE_UE)
771 		pseries_do_memory_failure(regs, mce_log);
772 }
773 #else
/* Without CONFIG_MEMORY_FAILURE there is nothing to do for a UE. */
static inline void pseries_process_ue(struct pt_regs *regs,
				      struct rtas_error_log *errp)
{
}
776 #endif /*CONFIG_MEMORY_FAILURE */
777 
778 /*
779  * Process MCE rtas errlog event.
780  */
781 static void mce_process_errlog_event(struct irq_work *work)
782 {
783 	struct rtas_error_log *err;
784 
785 	err = fwnmi_get_errlog();
786 	log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
787 }
788 
789 /*
790  * See if we can recover from a machine check exception.
791  * This is only called on power4 (or above) and only via
792  * the Firmware Non-Maskable Interrupts (fwnmi) handler
793  * which provides the error analysis for us.
794  *
795  * Return 1 if corrected (or delivered a signal).
796  * Return 0 if there is nothing we can do.
797  */
798 static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
799 {
800 	int recovered = 0;
801 	int disposition = rtas_error_disposition(err);
802 
803 	pseries_print_mce_info(regs, err);
804 
805 	if (!(regs->msr & MSR_RI)) {
806 		/* If MSR_RI isn't set, we cannot recover */
807 		pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
808 		recovered = 0;
809 
810 	} else if (disposition == RTAS_DISP_FULLY_RECOVERED) {
811 		/* Platform corrected itself */
812 		recovered = 1;
813 
814 	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
815 		/* Platform corrected itself but could be degraded */
816 		printk(KERN_ERR "MCE: limited recovery, system may "
817 		       "be degraded\n");
818 		recovered = 1;
819 
820 	} else if (user_mode(regs) && !is_global_init(current) &&
821 		   rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) {
822 
823 		/*
824 		 * If we received a synchronous error when in userspace
825 		 * kill the task. Firmware may report details of the fail
826 		 * asynchronously, so we can't rely on the target and type
827 		 * fields being valid here.
828 		 */
829 		printk(KERN_ERR "MCE: uncorrectable error, killing task "
830 		       "%s:%d\n", current->comm, current->pid);
831 
832 		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
833 		recovered = 1;
834 	}
835 
836 	pseries_process_ue(regs, err);
837 
838 	/* Queue irq work to log this rtas event later. */
839 	irq_work_queue(&mce_errlog_process_work);
840 
841 	return recovered;
842 }
843 
844 /*
845  * Handle a machine check.
846  *
847  * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi)
848  * should be present.  If so the handler which called us tells us if the
849  * error was recovered (never true if RI=0).
850  *
851  * On hardware prior to Power 4 these exceptions were asynchronous which
852  * means we can't tell exactly where it occurred and so we can't recover.
853  */
854 int pSeries_machine_check_exception(struct pt_regs *regs)
855 {
856 	struct rtas_error_log *errp;
857 
858 	if (fwnmi_active) {
859 		fwnmi_release_errinfo();
860 		errp = fwnmi_get_errlog();
861 		if (errp && recover_mce(regs, errp))
862 			return 1;
863 	}
864 
865 	return 0;
866 }
867 
868 long pseries_machine_check_realmode(struct pt_regs *regs)
869 {
870 	struct rtas_error_log *errp;
871 	int disposition;
872 
873 	if (fwnmi_active) {
874 		errp = fwnmi_get_errinfo(regs);
875 		/*
876 		 * Call to fwnmi_release_errinfo() in real mode causes kernel
877 		 * to panic. Hence we will call it as soon as we go into
878 		 * virtual mode.
879 		 */
880 		disposition = mce_handle_error(errp);
881 		if (disposition == RTAS_DISP_FULLY_RECOVERED)
882 			return 1;
883 	}
884 
885 	return 0;
886 }
887