xref: /linux/arch/powerpc/platforms/powernv/opal-hmi.c (revision bd628c1bed7902ec1f24ba0fe70758949146abbe)
1 /*
2  * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; If not, see <http://www.gnu.org/licenses/>.
16  *
17  * Copyright 2014 IBM Corporation
18  * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
19  */
20 
21 #undef DEBUG
22 
23 #include <linux/kernel.h>
24 #include <linux/init.h>
25 #include <linux/of.h>
26 #include <linux/mm.h>
27 #include <linux/slab.h>
28 
29 #include <asm/opal.h>
30 #include <asm/cputable.h>
31 #include <asm/machdep.h>
32 
33 #include "powernv.h"
34 
/* Set to 1 by opal_hmi_handler_init() once the HMI notifier is registered. */
static int opal_hmi_handler_nb_init;
/*
 * One queued HMI event.
 *
 * Nodes are allocated in opal_handle_hmi_event(), linked on
 * opal_hmi_evt_list and freed by hmi_event_handler() after logging.
 */
struct OpalHmiEvtNode {
	struct list_head list;		/* linkage on opal_hmi_evt_list */
	struct OpalHMIEvent hmi_evt;	/* private copy of the OPAL event */
};
40 
/*
 * Entry of a checkstop decode table: maps a reason bit mask to the
 * strings printed when that bit is set in an event's xstop_reason word.
 */
struct xstop_reason {
	uint32_t xstop_reason;		/* bit mask tested against the event */
	const char *unit_failed;	/* text printed in the "[Unit: ...]" field */
	const char *description;	/* text printed after the unit field */
};
46 
/* HMI events queued by the notifier, waiting to be logged by the work item. */
static LIST_HEAD(opal_hmi_evt_list);
/* Taken around every access to opal_hmi_evt_list in this file. */
static DEFINE_SPINLOCK(opal_hmi_evt_lock);
49 
50 static void print_core_checkstop_reason(const char *level,
51 					struct OpalHMIEvent *hmi_evt)
52 {
53 	int i;
54 	static const struct xstop_reason xstop_reason[] = {
55 		{ CORE_CHECKSTOP_IFU_REGFILE, "IFU",
56 				"RegFile core check stop" },
57 		{ CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
58 		{ CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
59 				"Core checkstop during recovery" },
60 		{ CORE_CHECKSTOP_ISU_REGFILE, "ISU",
61 				"RegFile core check stop (mapper error)" },
62 		{ CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
63 		{ CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
64 		{ CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
65 		{ CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
66 				"Recovery in maintenance mode" },
67 		{ CORE_CHECKSTOP_LSU_REGFILE, "LSU",
68 				"RegFile core check stop" },
69 		{ CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
70 				"Forward Progress Error" },
71 		{ CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
72 		{ CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
73 		{ CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
74 				"Hypervisor Resource error - core check stop" },
75 		{ CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
76 				"Hang Recovery Failed (core check stop)" },
77 		{ CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
78 				"Ambiguous Hang Detected (unknown source)" },
79 		{ CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
80 				"Debug Trigger Error inject" },
81 		{ CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
82 				"Hypervisor check stop via SPRC/SPRD" },
83 	};
84 
85 	/* Validity check */
86 	if (!hmi_evt->u.xstop_error.xstop_reason) {
87 		printk("%s	Unknown Core check stop.\n", level);
88 		return;
89 	}
90 
91 	printk("%s	CPU PIR: %08x\n", level,
92 			be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
93 	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
94 		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
95 					xstop_reason[i].xstop_reason)
96 			printk("%s	[Unit: %-3s] %s\n", level,
97 					xstop_reason[i].unit_failed,
98 					xstop_reason[i].description);
99 }
100 
101 static void print_nx_checkstop_reason(const char *level,
102 					struct OpalHMIEvent *hmi_evt)
103 {
104 	int i;
105 	static const struct xstop_reason xstop_reason[] = {
106 		{ NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
107 					"SHM invalid state error" },
108 		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
109 					"DMA invalid state error bit 15" },
110 		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
111 					"DMA invalid state error bit 16" },
112 		{ NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
113 					"Channel 0 invalid state error" },
114 		{ NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
115 					"Channel 1 invalid state error" },
116 		{ NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
117 					"Channel 2 invalid state error" },
118 		{ NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
119 					"Channel 3 invalid state error" },
120 		{ NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
121 					"Channel 4 invalid state error" },
122 		{ NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
123 					"Channel 5 invalid state error" },
124 		{ NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
125 					"Channel 6 invalid state error" },
126 		{ NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
127 					"Channel 7 invalid state error" },
128 		{ NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
129 					"UE error on CRB(CSB address, CCB)" },
130 		{ NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
131 					"SUE error on CRB(CSB address, CCB)" },
132 		{ NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
133 		"CRB Kill ISN received while holding ISN with UE error" },
134 	};
135 
136 	/* Validity check */
137 	if (!hmi_evt->u.xstop_error.xstop_reason) {
138 		printk("%s	Unknown NX check stop.\n", level);
139 		return;
140 	}
141 
142 	printk("%s	NX checkstop on CHIP ID: %x\n", level,
143 			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
144 	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
145 		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
146 					xstop_reason[i].xstop_reason)
147 			printk("%s	[Unit: %-3s] %s\n", level,
148 					xstop_reason[i].unit_failed,
149 					xstop_reason[i].description);
150 }
151 
152 static void print_checkstop_reason(const char *level,
153 					struct OpalHMIEvent *hmi_evt)
154 {
155 	uint8_t type = hmi_evt->u.xstop_error.xstop_type;
156 	switch (type) {
157 	case CHECKSTOP_TYPE_CORE:
158 		print_core_checkstop_reason(level, hmi_evt);
159 		break;
160 	case CHECKSTOP_TYPE_NX:
161 		print_nx_checkstop_reason(level, hmi_evt);
162 		break;
163 	default:
164 		printk("%s	Unknown Malfunction Alert of type %d\n",
165 		       level, type);
166 		break;
167 	}
168 }
169 
170 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
171 {
172 	const char *level, *sevstr, *error_info;
173 	static const char *hmi_error_types[] = {
174 		"Malfunction Alert",
175 		"Processor Recovery done",
176 		"Processor recovery occurred again",
177 		"Processor recovery occurred for masked error",
178 		"Timer facility experienced an error",
179 		"TFMR SPR is corrupted",
180 		"UPS (Uninterrupted Power System) Overflow indication",
181 		"An XSCOM operation failure",
182 		"An XSCOM operation completed",
183 		"SCOM has set a reserved FIR bit to cause recovery",
184 		"Debug trigger has set a reserved FIR bit to cause recovery",
185 		"A hypervisor resource error occurred",
186 		"CAPP recovery process is in progress",
187 	};
188 
189 	/* Print things out */
190 	if (hmi_evt->version < OpalHMIEvt_V1) {
191 		pr_err("HMI Interrupt, Unknown event version %d !\n",
192 			hmi_evt->version);
193 		return;
194 	}
195 	switch (hmi_evt->severity) {
196 	case OpalHMI_SEV_NO_ERROR:
197 		level = KERN_INFO;
198 		sevstr = "Harmless";
199 		break;
200 	case OpalHMI_SEV_WARNING:
201 		level = KERN_WARNING;
202 		sevstr = "";
203 		break;
204 	case OpalHMI_SEV_ERROR_SYNC:
205 		level = KERN_ERR;
206 		sevstr = "Severe";
207 		break;
208 	case OpalHMI_SEV_FATAL:
209 	default:
210 		level = KERN_ERR;
211 		sevstr = "Fatal";
212 		break;
213 	}
214 
215 	printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
216 		level, sevstr,
217 		hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
218 		"Recovered" : "Not recovered");
219 	error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
220 			hmi_error_types[hmi_evt->type]
221 			: "Unknown";
222 	printk("%s Error detail: %s\n", level, error_info);
223 	printk("%s	HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
224 	if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
225 		(hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
226 		printk("%s	TFMR: %016llx\n", level,
227 						be64_to_cpu(hmi_evt->tfmr));
228 
229 	if (hmi_evt->version < OpalHMIEvt_V2)
230 		return;
231 
232 	/* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
233 	if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
234 		print_checkstop_reason(level, hmi_evt);
235 }
236 
237 static void hmi_event_handler(struct work_struct *work)
238 {
239 	unsigned long flags;
240 	struct OpalHMIEvent *hmi_evt;
241 	struct OpalHmiEvtNode *msg_node;
242 	uint8_t disposition;
243 	struct opal_msg msg;
244 	int unrecoverable = 0;
245 
246 	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
247 	while (!list_empty(&opal_hmi_evt_list)) {
248 		msg_node = list_entry(opal_hmi_evt_list.next,
249 					   struct OpalHmiEvtNode, list);
250 		list_del(&msg_node->list);
251 		spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
252 
253 		hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
254 		print_hmi_event_info(hmi_evt);
255 		disposition = hmi_evt->disposition;
256 		kfree(msg_node);
257 
258 		/*
259 		 * Check if HMI event has been recovered or not. If not
260 		 * then kernel can't continue, we need to panic.
261 		 * But before we do that, display all the HMI event
262 		 * available on the list and set unrecoverable flag to 1.
263 		 */
264 		if (disposition != OpalHMI_DISPOSITION_RECOVERED)
265 			unrecoverable = 1;
266 
267 		spin_lock_irqsave(&opal_hmi_evt_lock, flags);
268 	}
269 	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
270 
271 	if (unrecoverable) {
272 		/* Pull all HMI events from OPAL before we panic. */
273 		while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
274 			u32 type;
275 
276 			type = be32_to_cpu(msg.msg_type);
277 
278 			/* skip if not HMI event */
279 			if (type != OPAL_MSG_HMI_EVT)
280 				continue;
281 
282 			/* HMI event info starts from param[0] */
283 			hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
284 			print_hmi_event_info(hmi_evt);
285 		}
286 
287 		pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
288 	}
289 }
290 
/* Work item that runs hmi_event_handler(); scheduled by the HMI notifier. */
static DECLARE_WORK(hmi_event_work, hmi_event_handler);
292 /*
293  * opal_handle_hmi_event - notifier handler that queues up HMI events
294  * to be preocessed later.
295  */
296 static int opal_handle_hmi_event(struct notifier_block *nb,
297 			  unsigned long msg_type, void *msg)
298 {
299 	unsigned long flags;
300 	struct OpalHMIEvent *hmi_evt;
301 	struct opal_msg *hmi_msg = msg;
302 	struct OpalHmiEvtNode *msg_node;
303 
304 	/* Sanity Checks */
305 	if (msg_type != OPAL_MSG_HMI_EVT)
306 		return 0;
307 
308 	/* HMI event info starts from param[0] */
309 	hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
310 
311 	/* Delay the logging of HMI events to workqueue. */
312 	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
313 	if (!msg_node) {
314 		pr_err("HMI: out of memory, Opal message event not handled\n");
315 		return -ENOMEM;
316 	}
317 	memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
318 
319 	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
320 	list_add(&msg_node->list, &opal_hmi_evt_list);
321 	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
322 
323 	schedule_work(&hmi_event_work);
324 	return 0;
325 }
326 
327 static struct notifier_block opal_hmi_handler_nb = {
328 	.notifier_call	= opal_handle_hmi_event,
329 	.next		= NULL,
330 	.priority	= 0,
331 };
332 
333 int __init opal_hmi_handler_init(void)
334 {
335 	int ret;
336 
337 	if (!opal_hmi_handler_nb_init) {
338 		ret = opal_message_notifier_register(
339 				OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
340 		if (ret) {
341 			pr_err("%s: Can't register OPAL event notifier (%d)\n",
342 			       __func__, ret);
343 			return ret;
344 		}
345 		opal_hmi_handler_nb_init = 1;
346 	}
347 	return 0;
348 }
349