// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2001 Dave Engebretsen IBM Corporation
 */

#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/of.h>
#include <linux/fs.h>
#include <linux/reboot.h>
#include <linux/irq_work.h>

#include <asm/machdep.h>
#include <asm/rtas.h>
#include <asm/firmware.h>
#include <asm/mce.h>

#include "pseries.h"

static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
static DEFINE_SPINLOCK(ras_log_buf_lock);

static int ras_check_exception_token;

static void mce_process_errlog_event(struct irq_work *work);
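/*
 * The MCE path runs partly in real mode, where the RTAS error log
 * cannot be written out directly; mce_handle_error() queues this
 * irq_work so that mce_process_errlog_event() can log the event later
 * from a safe context.
 */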
static struct irq_work mce_errlog_process_work = {
	.func = mce_process_errlog_event,
};

#define EPOW_SENSOR_TOKEN	9
#define EPOW_SENSOR_INDEX	0

/* EPOW events counter */
static int num_epow_events;

static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id);
static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
static irqreturn_t ras_error_interrupt(int irq, void *dev_id);

/* RTAS pseries MCE errorlog section. */
struct pseries_mc_errorlog {
	__be32	fru_id;
	__be32	proc_id;
	u8	error_type;
	/*
	 * sub_err_type (1 byte). Bit fields depend on error_type
	 *
	 *   MSB0
	 *   |
	 *   V
	 *   01234567
	 *   XXXXXXXX
	 *
	 * For error_type == MC_ERROR_TYPE_UE
	 *   XXXXXXXX
	 *   X		1: Permanent or Transient UE.
	 *    X		1: Effective address provided.
	 *     X	1: Logical address provided.
	 *      XX	2: Reserved.
	 *        XXX	3: Type of UE error.
	 *
	 * For error_type == MC_ERROR_TYPE_SLB/ERAT/TLB
	 *   XXXXXXXX
	 *   X		1: Effective address provided.
	 *    XXXXX	5: Reserved.
	 *         XX	2: Type of SLB/ERAT/TLB error.
	 *
	 * For error_type == MC_ERROR_TYPE_CTRL_MEM_ACCESS
	 *   XXXXXXXX
	 *   X		1: Error causing address provided.
	 *    XXX	3: Type of error.
	 *       XXXX	4: Reserved.
	 */
	u8	sub_err_type;
	u8	reserved_1[6];
	__be64	effective_address;
	__be64	logical_address;
} __packed;

/* RTAS pseries MCE error types */
#define MC_ERROR_TYPE_UE		0x00
#define MC_ERROR_TYPE_SLB		0x01
#define MC_ERROR_TYPE_ERAT		0x02
#define MC_ERROR_TYPE_UNKNOWN		0x03
#define MC_ERROR_TYPE_TLB		0x04
#define MC_ERROR_TYPE_D_CACHE		0x05
#define MC_ERROR_TYPE_I_CACHE		0x07
#define MC_ERROR_TYPE_CTRL_MEM_ACCESS	0x08

/* RTAS pseries MCE error sub types */
#define MC_ERROR_UE_INDETERMINATE		0
#define MC_ERROR_UE_IFETCH			1
#define MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH	2
#define MC_ERROR_UE_LOAD_STORE			3
#define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE	4

#define UE_EFFECTIVE_ADDR_PROVIDED		0x40
#define UE_LOGICAL_ADDR_PROVIDED		0x20
#define MC_EFFECTIVE_ADDR_PROVIDED		0x80
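
/*
 * Note: the bit positions in the sub_err_type layout above use MSB0
 * numbering, so "bit 0" corresponds to the 0x80 mask; hence the
 * SLB/ERAT/TLB "effective address provided" bit (bit 0) is 0x80,
 * while the UE "effective address provided" bit (bit 1) is 0x40.
 */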

#define MC_ERROR_SLB_PARITY		0
#define MC_ERROR_SLB_MULTIHIT		1
#define MC_ERROR_SLB_INDETERMINATE	2

#define MC_ERROR_ERAT_PARITY		1
#define MC_ERROR_ERAT_MULTIHIT		2
#define MC_ERROR_ERAT_INDETERMINATE	3

#define MC_ERROR_TLB_PARITY		1
#define MC_ERROR_TLB_MULTIHIT		2
#define MC_ERROR_TLB_INDETERMINATE	3

#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK	0
#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS	1

static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog)
{
	switch (mlog->error_type) {
	case	MC_ERROR_TYPE_UE:
		return (mlog->sub_err_type & 0x07);
	case	MC_ERROR_TYPE_SLB:
	case	MC_ERROR_TYPE_ERAT:
	case	MC_ERROR_TYPE_TLB:
		return (mlog->sub_err_type & 0x03);
	case	MC_ERROR_TYPE_CTRL_MEM_ACCESS:
		return (mlog->sub_err_type & 0x70) >> 4;
	default:
		return 0;
	}
}
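
/*
 * Worked example (illustrative value, not from a real log): for
 * error_type == MC_ERROR_TYPE_SLB with sub_err_type == 0x81, MSB0
 * bit 0 (0x80, MC_EFFECTIVE_ADDR_PROVIDED) indicates an effective
 * address is supplied, and rtas_mc_error_sub_type() masks the low two
 * bits to return MC_ERROR_SLB_MULTIHIT.
 */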

/*
 * Enable hotplug interrupts late because processing them may touch other
 * devices or systems (e.g. hugepages) that have not been initialized at
 * the subsys stage.
 */
static int __init init_ras_hotplug_IRQ(void)
{
	struct device_node *np;

	/* Hotplug Events */
	np = of_find_node_by_path("/event-sources/hot-plug-events");
	if (np != NULL) {
		if (dlpar_workqueue_init() == 0)
			request_event_sources_irqs(np, ras_hotplug_interrupt,
						   "RAS_HOTPLUG");
		of_node_put(np);
	}

	return 0;
}
machine_late_initcall(pseries, init_ras_hotplug_IRQ);

/*
 * Initialize handlers for the set of interrupts caused by hardware errors
 * and power system events.
 */
static int __init init_ras_IRQ(void)
{
	struct device_node *np;

	ras_check_exception_token = rtas_token("check-exception");

	/* Internal Errors */
	np = of_find_node_by_path("/event-sources/internal-errors");
	if (np != NULL) {
		request_event_sources_irqs(np, ras_error_interrupt,
					   "RAS_ERROR");
		of_node_put(np);
	}

	/* EPOW Events */
	np = of_find_node_by_path("/event-sources/epow-events");
	if (np != NULL) {
		request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW");
		of_node_put(np);
	}

	return 0;
}
machine_subsys_initcall(pseries, init_ras_IRQ);

#define EPOW_SHUTDOWN_NORMAL				1
#define EPOW_SHUTDOWN_ON_UPS				2
#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS	3
#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH	4

static void handle_system_shutdown(char event_modifier)
{
	switch (event_modifier) {
	case EPOW_SHUTDOWN_NORMAL:
		pr_emerg("Power off requested\n");
		orderly_poweroff(true);
		break;

	case EPOW_SHUTDOWN_ON_UPS:
		pr_emerg("Loss of system power detected. System is running on"
			 " UPS/battery. Check RTAS error log for details\n");
		break;

	case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:
		pr_emerg("Loss of system critical functions detected. Check"
			 " RTAS error log for details\n");
		orderly_poweroff(true);
		break;

	case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH:
		pr_emerg("High ambient temperature detected. Check RTAS"
			 " error log for details\n");
		orderly_poweroff(true);
		break;

	default:
		pr_err("Unknown power/cooling shutdown event (modifier = %d)\n",
			event_modifier);
	}
}

struct epow_errorlog {
	unsigned char sensor_value;
	unsigned char event_modifier;
	unsigned char extended_modifier;
	unsigned char reserved;
	unsigned char platform_reason;
};

#define EPOW_RESET			0
#define EPOW_WARN_COOLING		1
#define EPOW_WARN_POWER			2
#define EPOW_SYSTEM_SHUTDOWN		3
#define EPOW_SYSTEM_HALT		4
#define EPOW_MAIN_ENCLOSURE		5
#define EPOW_POWER_OFF			7
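
/*
 * Example (hypothetical log contents): a sensor_value of 0x53 has a
 * bottom nibble of 3 (EPOW_SYSTEM_SHUTDOWN); if the event_modifier's
 * bottom nibble is 1 (EPOW_SHUTDOWN_NORMAL), handle_system_shutdown()
 * requests an orderly power off.
 */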

static void rtas_parse_epow_errlog(struct rtas_error_log *log)
{
	struct pseries_errorlog *pseries_log;
	struct epow_errorlog *epow_log;
	char action_code;
	char modifier;

	pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW);
	if (pseries_log == NULL)
		return;

	epow_log = (struct epow_errorlog *)pseries_log->data;
	action_code = epow_log->sensor_value & 0xF;	/* bottom 4 bits */
	modifier = epow_log->event_modifier & 0xF;	/* bottom 4 bits */

	switch (action_code) {
	case EPOW_RESET:
		if (num_epow_events) {
			pr_info("Non-critical power/cooling issue cleared\n");
			num_epow_events--;
		}
		break;

	case EPOW_WARN_COOLING:
		pr_info("Non-critical cooling issue detected. Check RTAS error"
			" log for details\n");
		break;

	case EPOW_WARN_POWER:
		pr_info("Non-critical power issue detected. Check RTAS error"
			" log for details\n");
		break;

	case EPOW_SYSTEM_SHUTDOWN:
		handle_system_shutdown(modifier);
		break;

	case EPOW_SYSTEM_HALT:
		pr_emerg("Critical power/cooling issue detected. Check RTAS"
			 " error log for details. Powering off.\n");
		orderly_poweroff(true);
		break;

	case EPOW_MAIN_ENCLOSURE:
	case EPOW_POWER_OFF:
		pr_emerg("System about to lose power. Check RTAS error log"
			 " for details. Powering off immediately.\n");
		emergency_sync();
		kernel_power_off();
		break;

	default:
		pr_err("Unknown power/cooling event (action code = %d)\n",
			action_code);
	}

	/* Increment the EPOW events counter */
	if (action_code != EPOW_RESET)
		num_epow_events++;
}

static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id)
{
	struct pseries_errorlog *pseries_log;
	struct pseries_hp_errorlog *hp_elog;

	spin_lock(&ras_log_buf_lock);

	rtas_call(ras_check_exception_token, 6, 1, NULL,
		  RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq),
		  RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf),
		  rtas_get_error_log_max());

	pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf,
					   PSERIES_ELOG_SECT_ID_HOTPLUG);
	hp_elog = (struct pseries_hp_errorlog *)pseries_log->data;

	/*
	 * Since PCI hotplug is not currently supported on pseries, put PCI
	 * hotplug events on the ras_log_buf to be handled by rtas_errd.
	 */
	if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM ||
	    hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU ||
	    hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_PMEM)
		queue_hotplug_event(hp_elog);
	else
		log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);

	spin_unlock(&ras_log_buf_lock);
	return IRQ_HANDLED;
}

/* Handle environmental and power warning (EPOW) interrupts. */
static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
{
	int state;
	int critical;

	rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state);

	if (state > 3)
		critical = 1;		/* Time Critical */
	else
		critical = 0;

	spin_lock(&ras_log_buf_lock);

	rtas_call(ras_check_exception_token, 6, 1, NULL, RTAS_VECTOR_EXTERNAL_INTERRUPT,
		  virq_to_hw(irq), RTAS_EPOW_WARNING, critical, __pa(&ras_log_buf),
		  rtas_get_error_log_max());

	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);

	rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf);

	spin_unlock(&ras_log_buf_lock);
	return IRQ_HANDLED;
}

/*
 * Handle hardware error interrupts.
 *
 * RTAS check-exception is called to collect data on the exception. If
 * the error is deemed recoverable, we log it and return. For
 * nonrecoverable errors, we log the error and stop all processing as
 * quickly as possible in order to prevent propagation of the failure.
 */
static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
{
	struct rtas_error_log *rtas_elog;
	int status;
	int fatal;

	spin_lock(&ras_log_buf_lock);

	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
			   RTAS_VECTOR_EXTERNAL_INTERRUPT,
			   virq_to_hw(irq),
			   RTAS_INTERNAL_ERROR, 1 /* Time Critical */,
			   __pa(&ras_log_buf),
			   rtas_get_error_log_max());

	rtas_elog = (struct rtas_error_log *)ras_log_buf;

	if (status == 0 &&
	    rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC)
		fatal = 1;
	else
		fatal = 0;

	/* format and print the extended information */
	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);

	if (fatal) {
		pr_emerg("Fatal hardware error detected. Check RTAS error"
			 " log for details. Powering off immediately\n");
		emergency_sync();
		kernel_power_off();
	} else {
		pr_err("Recoverable hardware error detected\n");
	}

	spin_unlock(&ras_log_buf_lock);
	return IRQ_HANDLED;
}

/*
 * Some versions of FWNMI place the buffer inside the 4kB page starting at
 * 0x7000. Other versions place it inside the rtas buffer. We check both.
 * Minimum size of the buffer is 16 bytes.
 */
#define VALID_FWNMI_BUFFER(A) \
	((((A) >= 0x7000) && ((A) <= 0x8000 - 16)) || \
	(((A) >= rtas.base) && ((A) <= (rtas.base + rtas.size - 16))))
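
/*
 * For example (the fixed-page case; illustrative addresses): 0x7000 and
 * 0x7ff0 (== 0x8000 - 16) are accepted, since a minimum-sized 16-byte
 * buffer still fits below 0x8000, while 0x7ff1 is rejected.
 */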

static inline struct rtas_error_log *fwnmi_get_errlog(void)
{
	return (struct rtas_error_log *)local_paca->mce_data_buf;
}

static __be64 *fwnmi_get_savep(struct pt_regs *regs)
{
	unsigned long savep_ra;

	/* Mask top two bits */
	savep_ra = regs->gpr[3] & ~(0x3UL << 62);
	if (!VALID_FWNMI_BUFFER(savep_ra)) {
		printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
		return NULL;
	}

	return __va(savep_ra);
}

/*
 * Get the error information for errors coming through the
 * FWNMI vectors. The pt_regs' r3 will be updated to reflect
 * the actual r3 if possible, and a pointer to the error log entry
 * will be returned if found.
 *
 * One mce_data_buf buffer per CPU is used to store the RTAS error log.
 *
 * The mce_data_buf has no locks or other protection around it; if a
 * second machine check comes in, or a system reset happens, before we
 * have logged the error, the error log will be corrupted. This is
 * preferable to holding off on calling ibm,nmi-interlock, which would
 * result in us checkstopping if a second machine check did come in.
 */
static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
{
	struct rtas_error_log *h;
	__be64 *savep;

	savep = fwnmi_get_savep(regs);
	if (!savep)
		return NULL;

	regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */

	h = (struct rtas_error_log *)&savep[1];
	/* Use the per cpu buffer from paca to store rtas error log */
	memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
	if (!rtas_error_extended(h)) {
		memcpy(local_paca->mce_data_buf, h, sizeof(__u64));
	} else {
		int len, error_log_length;

		error_log_length = 8 + rtas_error_extended_log_length(h);
		len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
		memcpy(local_paca->mce_data_buf, h, len);
	}

	return (struct rtas_error_log *)local_paca->mce_data_buf;
}
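
/*
 * Note the save area layout consumed above: savep[0] holds the
 * original r3, and the RTAS error log begins at savep[1] with an
 * 8-byte fixed header, optionally followed by an extended log whose
 * length is given by rtas_error_extended_log_length(). (This is a
 * summary of the code above, not a complete PAPR description.)
 */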

/*
 * Call this when done with the data returned by fwnmi_get_errinfo().
 * It will release the saved data area for other CPUs in the
 * partition to receive FWNMI errors.
 */
static void fwnmi_release_errinfo(void)
{
	struct rtas_args rtas_args;
	int ret;

	/*
	 * On pseries, the machine check stack is limited to under 4GB, so
	 * args can be on-stack.
	 */
	rtas_call_unlocked(&rtas_args, ibm_nmi_interlock_token, 0, 1, NULL);
	ret = be32_to_cpu(rtas_args.rets[0]);
	if (ret != 0)
		printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
}

int pSeries_system_reset_exception(struct pt_regs *regs)
{
#ifdef __LITTLE_ENDIAN__
	/*
	 * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try
	 * to detect the bad SRR1 pattern here. Flip the NIP back to correct
	 * endian for reporting purposes. Unfortunately the MSR can't be fixed,
	 * so clear it. It will be missing MSR_RI so we won't try to recover.
	 */
	if ((be64_to_cpu(regs->msr) &
			(MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR|
			 MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) {
		regs_set_return_ip(regs, be64_to_cpu((__be64)regs->nip));
		regs_set_return_msr(regs, 0);
	}
#endif

	if (fwnmi_active) {
		__be64 *savep;

		/*
		 * Firmware (PowerVM and KVM) saves r3 to a save area like
		 * machine check, which is not exactly what PAPR (2.9)
		 * suggests but there is no way to detect otherwise, so this
		 * is the interface now.
		 *
		 * System resets do not save any error log or require an
		 * "ibm,nmi-interlock" rtas call to release.
		 */

		savep = fwnmi_get_savep(regs);
		if (savep)
			regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */
	}

	if (smp_handle_nmi_ipi(regs))
		return 1;

	return 0; /* need to perform reset */
}

static int mce_handle_err_realmode(int disposition, u8 error_type)
{
#ifdef CONFIG_PPC_BOOK3S_64
	if (disposition == RTAS_DISP_NOT_RECOVERED) {
		switch (error_type) {
		case	MC_ERROR_TYPE_ERAT:
			flush_erat();
			disposition = RTAS_DISP_FULLY_RECOVERED;
			break;
		case	MC_ERROR_TYPE_SLB:
#ifdef CONFIG_PPC_64S_HASH_MMU
			/*
			 * Store the old SLB contents in the paca before
			 * flushing, and print them when we go to virtual
			 * mode. We may hit another MCE if there is a parity
			 * error on the SLB entry we are trying to read while
			 * saving it, so limit SLB saving to a single level
			 * of recursion.
			 */
			if (local_paca->in_mce == 1)
				slb_save_contents(local_paca->mce_faulty_slbs);
			flush_and_reload_slb();
			disposition = RTAS_DISP_FULLY_RECOVERED;
#endif
			break;
		default:
			break;
		}
	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
		/* Platform corrected itself but could be degraded */
		pr_err("MCE: limited recovery, system may be degraded\n");
		disposition = RTAS_DISP_FULLY_RECOVERED;
	}
#endif
	return disposition;
}

static int mce_handle_err_virtmode(struct pt_regs *regs,
				   struct rtas_error_log *errp,
				   struct pseries_mc_errorlog *mce_log,
				   int disposition)
{
	struct mce_error_info mce_err = { 0 };
	int initiator = rtas_error_initiator(errp);
	int severity = rtas_error_severity(errp);
	unsigned long eaddr = 0, paddr = 0;
	u8 error_type, err_sub_type;

	if (!mce_log)
		goto out;

	error_type = mce_log->error_type;
	err_sub_type = rtas_mc_error_sub_type(mce_log);

	if (initiator == RTAS_INITIATOR_UNKNOWN)
		mce_err.initiator = MCE_INITIATOR_UNKNOWN;
	else if (initiator == RTAS_INITIATOR_CPU)
		mce_err.initiator = MCE_INITIATOR_CPU;
	else if (initiator == RTAS_INITIATOR_PCI)
		mce_err.initiator = MCE_INITIATOR_PCI;
	else if (initiator == RTAS_INITIATOR_ISA)
		mce_err.initiator = MCE_INITIATOR_ISA;
	else if (initiator == RTAS_INITIATOR_MEMORY)
		mce_err.initiator = MCE_INITIATOR_MEMORY;
	else if (initiator == RTAS_INITIATOR_POWERMGM)
		mce_err.initiator = MCE_INITIATOR_POWERMGM;
	else
		mce_err.initiator = MCE_INITIATOR_UNKNOWN;

	if (severity == RTAS_SEVERITY_NO_ERROR)
		mce_err.severity = MCE_SEV_NO_ERROR;
	else if (severity == RTAS_SEVERITY_EVENT)
		mce_err.severity = MCE_SEV_WARNING;
	else if (severity == RTAS_SEVERITY_WARNING)
		mce_err.severity = MCE_SEV_WARNING;
	else if (severity == RTAS_SEVERITY_ERROR_SYNC)
		mce_err.severity = MCE_SEV_SEVERE;
	else if (severity == RTAS_SEVERITY_ERROR)
		mce_err.severity = MCE_SEV_SEVERE;
	else
		mce_err.severity = MCE_SEV_FATAL;

	if (severity <= RTAS_SEVERITY_ERROR_SYNC)
		mce_err.sync_error = true;
	else
		mce_err.sync_error = false;

	mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
	mce_err.error_class = MCE_ECLASS_UNKNOWN;

	switch (error_type) {
	case MC_ERROR_TYPE_UE:
		mce_err.error_type = MCE_ERROR_TYPE_UE;
		mce_common_process_ue(regs, &mce_err);
		if (mce_err.ignore_event)
			disposition = RTAS_DISP_FULLY_RECOVERED;
		switch (err_sub_type) {
		case MC_ERROR_UE_IFETCH:
			mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH;
			break;
		case MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH:
			mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
			break;
		case MC_ERROR_UE_LOAD_STORE:
			mce_err.u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
			break;
		case MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE:
			mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
			break;
		case MC_ERROR_UE_INDETERMINATE:
		default:
			mce_err.u.ue_error_type = MCE_UE_ERROR_INDETERMINATE;
			break;
		}
		if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED)
			eaddr = be64_to_cpu(mce_log->effective_address);

		if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
			paddr = be64_to_cpu(mce_log->logical_address);
		} else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
			unsigned long pfn;

			pfn = addr_to_pfn(regs, eaddr);
			if (pfn != ULONG_MAX)
				paddr = pfn << PAGE_SHIFT;
		}

		break;
	case MC_ERROR_TYPE_SLB:
		mce_err.error_type = MCE_ERROR_TYPE_SLB;
		switch (err_sub_type) {
		case MC_ERROR_SLB_PARITY:
			mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY;
			break;
		case MC_ERROR_SLB_MULTIHIT:
			mce_err.u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
			break;
		case MC_ERROR_SLB_INDETERMINATE:
		default:
			mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
			break;
		}
		if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED)
			eaddr = be64_to_cpu(mce_log->effective_address);
		break;
	case MC_ERROR_TYPE_ERAT:
		mce_err.error_type = MCE_ERROR_TYPE_ERAT;
		switch (err_sub_type) {
		case MC_ERROR_ERAT_PARITY:
			mce_err.u.erat_error_type = MCE_ERAT_ERROR_PARITY;
			break;
		case MC_ERROR_ERAT_MULTIHIT:
			mce_err.u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
			break;
		case MC_ERROR_ERAT_INDETERMINATE:
		default:
			mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE;
			break;
		}
		if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED)
			eaddr = be64_to_cpu(mce_log->effective_address);
		break;
	case MC_ERROR_TYPE_TLB:
		mce_err.error_type = MCE_ERROR_TYPE_TLB;
		switch (err_sub_type) {
		case MC_ERROR_TLB_PARITY:
			mce_err.u.tlb_error_type = MCE_TLB_ERROR_PARITY;
			break;
		case MC_ERROR_TLB_MULTIHIT:
			mce_err.u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
			break;
		case MC_ERROR_TLB_INDETERMINATE:
		default:
			mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE;
			break;
		}
		if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED)
			eaddr = be64_to_cpu(mce_log->effective_address);
		break;
	case MC_ERROR_TYPE_D_CACHE:
		mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
		break;
	case MC_ERROR_TYPE_I_CACHE:
		mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
		break;
	case MC_ERROR_TYPE_CTRL_MEM_ACCESS:
		mce_err.error_type = MCE_ERROR_TYPE_RA;
		switch (err_sub_type) {
		case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK:
			mce_err.u.ra_error_type =
				MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN;
			break;
		case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS:
			mce_err.u.ra_error_type =
				MCE_RA_ERROR_LOAD_STORE_FOREIGN;
			break;
		}
		if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED)
			eaddr = be64_to_cpu(mce_log->effective_address);
		break;
	case MC_ERROR_TYPE_UNKNOWN:
	default:
		mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
		break;
	}
out:
	save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
		       &mce_err, regs->nip, eaddr, paddr);
	return disposition;
}

static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
{
	struct pseries_errorlog *pseries_log;
	struct pseries_mc_errorlog *mce_log = NULL;
	int disposition = rtas_error_disposition(errp);
	unsigned long msr;
	u8 error_type;

	if (!rtas_error_extended(errp))
		goto out;

	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
	if (!pseries_log)
		goto out;

	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
	error_type = mce_log->error_type;

	disposition = mce_handle_err_realmode(disposition, error_type);

	/*
	 * Enable translation as we will be accessing per-cpu variables
	 * in save_mce_event(), which may fall outside the RMO region.
	 * Leave it enabled afterwards too: we will subsequently queue
	 * work to workqueues, which again accesses per-cpu variables,
	 * and fwnmi_release_errinfo() crashes when called in real mode
	 * on pseries.
	 * Note: all the realmode handling, such as flushing SLB entries
	 *       for SLB multihit, is done by now.
	 */
out:
	msr = mfmsr();
	mtmsr(msr | MSR_IR | MSR_DR);

	disposition = mce_handle_err_virtmode(regs, errp, mce_log,
					      disposition);

	/*
	 * Queue irq work to log this rtas event later.
	 * irq_work_queue uses per-cpu variables, so do this in virt
	 * mode as well.
	 */
	irq_work_queue(&mce_errlog_process_work);

	mtmsr(msr);

	return disposition;
}

/*
 * Process MCE rtas errlog event.
 */
static void mce_process_errlog_event(struct irq_work *work)
{
	struct rtas_error_log *err;

	err = fwnmi_get_errlog();
	log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
}

/*
 * See if we can recover from a machine check exception.
 * This is only called on power4 (or above) and only via
 * the Firmware Non-Maskable Interrupts (fwnmi) handler
 * which provides the error analysis for us.
 *
 * Return 1 if corrected (or delivered a signal).
 * Return 0 if there is nothing we can do.
 */
static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt)
{
	int recovered = 0;

	if (regs_is_unrecoverable(regs)) {
		/* If MSR_RI isn't set, we cannot recover */
		pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
		recovered = 0;
	} else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
		/* Platform corrected itself */
		recovered = 1;
	} else if (evt->severity == MCE_SEV_FATAL) {
		/* Fatal machine check */
		pr_err("Machine check interrupt is fatal\n");
		recovered = 0;
	}

	if (!recovered && evt->sync_error) {
		/*
		 * Try to kill processes if we get a synchronous machine check
		 * (e.g., one caused by execution of this instruction). This
		 * will devolve into a panic if we try to kill init or are in
		 * an interrupt etc.
		 *
		 * TODO: Queue up this address for hwpoisoning later.
		 * TODO: This is not quite right for d-side machine
		 *       checks; ->nip is not necessarily the important
		 *       address.
		 */
		if (user_mode(regs)) {
			_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
			recovered = 1;
		} else if (die_will_crash()) {
			/*
			 * die() would kill the kernel, so better to go via
			 * the platform reboot code that will log the
			 * machine check.
			 */
			recovered = 0;
		} else {
			die_mce("Machine check", regs, SIGBUS);
			recovered = 1;
		}
	}

	return recovered;
}

/*
 * Handle a machine check.
 *
 * Note that on Power4 and beyond, Firmware Non-Maskable Interrupts (fwnmi)
 * should be present. If so, the handler which called us tells us whether
 * the error was recovered (never true if RI=0).
 *
 * On hardware prior to Power4 these exceptions were asynchronous, which
 * means we can't tell exactly where they occurred, so we can't recover.
 */
int pSeries_machine_check_exception(struct pt_regs *regs)
{
	struct machine_check_event evt;

	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
		return 0;

	/* Print things out */
	if (evt.version != MCE_V1) {
		pr_err("Machine Check Exception, Unknown event version %d!\n",
		       evt.version);
		return 0;
	}
	machine_check_print_event_info(&evt, user_mode(regs), false);

	if (recover_mce(regs, &evt))
		return 1;

	return 0;
}

long pseries_machine_check_realmode(struct pt_regs *regs)
{
	struct rtas_error_log *errp;
	int disposition;

	if (fwnmi_active) {
		errp = fwnmi_get_errinfo(regs);
		/*
		 * Calling fwnmi_release_errinfo() in real mode causes the
		 * kernel to panic. Hence we call it as soon as we go into
		 * virtual mode.
		 */
		disposition = mce_handle_error(regs, errp);

		fwnmi_release_errinfo();

		if (disposition == RTAS_DISP_FULLY_RECOVERED)
			return 1;
	}

	return 0;
}
913