xref: /linux/arch/powerpc/platforms/powernv/opal-hmi.c (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1 /*
2  * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; If not, see <http://www.gnu.org/licenses/>.
16  *
17  * Copyright 2014 IBM Corporation
18  * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
19  */
20 
21 #undef DEBUG
22 
23 #include <linux/kernel.h>
24 #include <linux/init.h>
25 #include <linux/of.h>
26 #include <linux/mm.h>
27 #include <linux/slab.h>
28 
29 #include <asm/opal.h>
30 #include <asm/cputable.h>
31 #include <asm/machdep.h>
32 
33 static int opal_hmi_handler_nb_init;
34 struct OpalHmiEvtNode {
35 	struct list_head list;
36 	struct OpalHMIEvent hmi_evt;
37 };
38 
39 struct xstop_reason {
40 	uint32_t xstop_reason;
41 	const char *unit_failed;
42 	const char *description;
43 };
44 
45 static LIST_HEAD(opal_hmi_evt_list);
46 static DEFINE_SPINLOCK(opal_hmi_evt_lock);
47 
48 static void print_core_checkstop_reason(const char *level,
49 					struct OpalHMIEvent *hmi_evt)
50 {
51 	int i;
52 	static const struct xstop_reason xstop_reason[] = {
53 		{ CORE_CHECKSTOP_IFU_REGFILE, "IFU",
54 				"RegFile core check stop" },
55 		{ CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
56 		{ CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
57 				"Core checkstop during recovery" },
58 		{ CORE_CHECKSTOP_ISU_REGFILE, "ISU",
59 				"RegFile core check stop (mapper error)" },
60 		{ CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
61 		{ CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
62 		{ CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
63 		{ CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
64 				"Recovery in maintenance mode" },
65 		{ CORE_CHECKSTOP_LSU_REGFILE, "LSU",
66 				"RegFile core check stop" },
67 		{ CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
68 				"Forward Progress Error" },
69 		{ CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
70 		{ CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
71 		{ CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
72 				"Hypervisor Resource error - core check stop" },
73 		{ CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
74 				"Hang Recovery Failed (core check stop)" },
75 		{ CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
76 				"Ambiguous Hang Detected (unknown source)" },
77 		{ CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
78 				"Debug Trigger Error inject" },
79 		{ CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
80 				"Hypervisor check stop via SPRC/SPRD" },
81 	};
82 
83 	/* Validity check */
84 	if (!hmi_evt->u.xstop_error.xstop_reason) {
85 		printk("%s	Unknown Core check stop.\n", level);
86 		return;
87 	}
88 
89 	printk("%s	CPU PIR: %08x\n", level,
90 			be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
91 	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
92 		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
93 					xstop_reason[i].xstop_reason)
94 			printk("%s	[Unit: %-3s] %s\n", level,
95 					xstop_reason[i].unit_failed,
96 					xstop_reason[i].description);
97 }
98 
99 static void print_nx_checkstop_reason(const char *level,
100 					struct OpalHMIEvent *hmi_evt)
101 {
102 	int i;
103 	static const struct xstop_reason xstop_reason[] = {
104 		{ NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
105 					"SHM invalid state error" },
106 		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
107 					"DMA invalid state error bit 15" },
108 		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
109 					"DMA invalid state error bit 16" },
110 		{ NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
111 					"Channel 0 invalid state error" },
112 		{ NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
113 					"Channel 1 invalid state error" },
114 		{ NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
115 					"Channel 2 invalid state error" },
116 		{ NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
117 					"Channel 3 invalid state error" },
118 		{ NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
119 					"Channel 4 invalid state error" },
120 		{ NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
121 					"Channel 5 invalid state error" },
122 		{ NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
123 					"Channel 6 invalid state error" },
124 		{ NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
125 					"Channel 7 invalid state error" },
126 		{ NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
127 					"UE error on CRB(CSB address, CCB)" },
128 		{ NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
129 					"SUE error on CRB(CSB address, CCB)" },
130 		{ NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
131 		"CRB Kill ISN received while holding ISN with UE error" },
132 	};
133 
134 	/* Validity check */
135 	if (!hmi_evt->u.xstop_error.xstop_reason) {
136 		printk("%s	Unknown NX check stop.\n", level);
137 		return;
138 	}
139 
140 	printk("%s	NX checkstop on CHIP ID: %x\n", level,
141 			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
142 	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
143 		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
144 					xstop_reason[i].xstop_reason)
145 			printk("%s	[Unit: %-3s] %s\n", level,
146 					xstop_reason[i].unit_failed,
147 					xstop_reason[i].description);
148 }
149 
150 static void print_checkstop_reason(const char *level,
151 					struct OpalHMIEvent *hmi_evt)
152 {
153 	uint8_t type = hmi_evt->u.xstop_error.xstop_type;
154 	switch (type) {
155 	case CHECKSTOP_TYPE_CORE:
156 		print_core_checkstop_reason(level, hmi_evt);
157 		break;
158 	case CHECKSTOP_TYPE_NX:
159 		print_nx_checkstop_reason(level, hmi_evt);
160 		break;
161 	default:
162 		printk("%s	Unknown Malfunction Alert of type %d\n",
163 		       level, type);
164 		break;
165 	}
166 }
167 
168 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
169 {
170 	const char *level, *sevstr, *error_info;
171 	static const char *hmi_error_types[] = {
172 		"Malfunction Alert",
173 		"Processor Recovery done",
174 		"Processor recovery occurred again",
175 		"Processor recovery occurred for masked error",
176 		"Timer facility experienced an error",
177 		"TFMR SPR is corrupted",
178 		"UPS (Uniterrupted Power System) Overflow indication",
179 		"An XSCOM operation failure",
180 		"An XSCOM operation completed",
181 		"SCOM has set a reserved FIR bit to cause recovery",
182 		"Debug trigger has set a reserved FIR bit to cause recovery",
183 		"A hypervisor resource error occurred"
184 	};
185 
186 	/* Print things out */
187 	if (hmi_evt->version < OpalHMIEvt_V1) {
188 		pr_err("HMI Interrupt, Unknown event version %d !\n",
189 			hmi_evt->version);
190 		return;
191 	}
192 	switch (hmi_evt->severity) {
193 	case OpalHMI_SEV_NO_ERROR:
194 		level = KERN_INFO;
195 		sevstr = "Harmless";
196 		break;
197 	case OpalHMI_SEV_WARNING:
198 		level = KERN_WARNING;
199 		sevstr = "";
200 		break;
201 	case OpalHMI_SEV_ERROR_SYNC:
202 		level = KERN_ERR;
203 		sevstr = "Severe";
204 		break;
205 	case OpalHMI_SEV_FATAL:
206 	default:
207 		level = KERN_ERR;
208 		sevstr = "Fatal";
209 		break;
210 	}
211 
212 	printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
213 		level, sevstr,
214 		hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
215 		"Recovered" : "Not recovered");
216 	error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
217 			hmi_error_types[hmi_evt->type]
218 			: "Unknown";
219 	printk("%s Error detail: %s\n", level, error_info);
220 	printk("%s	HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
221 	if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
222 		(hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
223 		printk("%s	TFMR: %016llx\n", level,
224 						be64_to_cpu(hmi_evt->tfmr));
225 
226 	if (hmi_evt->version < OpalHMIEvt_V2)
227 		return;
228 
229 	/* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
230 	if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
231 		print_checkstop_reason(level, hmi_evt);
232 }
233 
234 static void hmi_event_handler(struct work_struct *work)
235 {
236 	unsigned long flags;
237 	struct OpalHMIEvent *hmi_evt;
238 	struct OpalHmiEvtNode *msg_node;
239 	uint8_t disposition;
240 	struct opal_msg msg;
241 	int unrecoverable = 0;
242 
243 	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
244 	while (!list_empty(&opal_hmi_evt_list)) {
245 		msg_node = list_entry(opal_hmi_evt_list.next,
246 					   struct OpalHmiEvtNode, list);
247 		list_del(&msg_node->list);
248 		spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
249 
250 		hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
251 		print_hmi_event_info(hmi_evt);
252 		disposition = hmi_evt->disposition;
253 		kfree(msg_node);
254 
255 		/*
256 		 * Check if HMI event has been recovered or not. If not
257 		 * then kernel can't continue, we need to panic.
258 		 * But before we do that, display all the HMI event
259 		 * available on the list and set unrecoverable flag to 1.
260 		 */
261 		if (disposition != OpalHMI_DISPOSITION_RECOVERED)
262 			unrecoverable = 1;
263 
264 		spin_lock_irqsave(&opal_hmi_evt_lock, flags);
265 	}
266 	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
267 
268 	if (unrecoverable) {
269 		int ret;
270 
271 		/* Pull all HMI events from OPAL before we panic. */
272 		while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
273 			u32 type;
274 
275 			type = be32_to_cpu(msg.msg_type);
276 
277 			/* skip if not HMI event */
278 			if (type != OPAL_MSG_HMI_EVT)
279 				continue;
280 
281 			/* HMI event info starts from param[0] */
282 			hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
283 			print_hmi_event_info(hmi_evt);
284 		}
285 
286 		/*
287 		 * Unrecoverable HMI exception. We need to inform BMC/OCC
288 		 * about this error so that it can collect relevant data
289 		 * for error analysis before rebooting.
290 		 */
291 		ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR,
292 			"Unrecoverable HMI exception");
293 		if (ret == OPAL_UNSUPPORTED) {
294 			pr_emerg("Reboot type %d not supported\n",
295 						OPAL_REBOOT_PLATFORM_ERROR);
296 		}
297 
298 		/*
299 		 * Fall through and panic if opal_cec_reboot2() returns
300 		 * OPAL_UNSUPPORTED.
301 		 */
302 		panic("Unrecoverable HMI exception");
303 	}
304 }
305 
306 static DECLARE_WORK(hmi_event_work, hmi_event_handler);
307 /*
308  * opal_handle_hmi_event - notifier handler that queues up HMI events
309  * to be preocessed later.
310  */
311 static int opal_handle_hmi_event(struct notifier_block *nb,
312 			  unsigned long msg_type, void *msg)
313 {
314 	unsigned long flags;
315 	struct OpalHMIEvent *hmi_evt;
316 	struct opal_msg *hmi_msg = msg;
317 	struct OpalHmiEvtNode *msg_node;
318 
319 	/* Sanity Checks */
320 	if (msg_type != OPAL_MSG_HMI_EVT)
321 		return 0;
322 
323 	/* HMI event info starts from param[0] */
324 	hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
325 
326 	/* Delay the logging of HMI events to workqueue. */
327 	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
328 	if (!msg_node) {
329 		pr_err("HMI: out of memory, Opal message event not handled\n");
330 		return -ENOMEM;
331 	}
332 	memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(struct OpalHMIEvent));
333 
334 	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
335 	list_add(&msg_node->list, &opal_hmi_evt_list);
336 	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
337 
338 	schedule_work(&hmi_event_work);
339 	return 0;
340 }
341 
342 static struct notifier_block opal_hmi_handler_nb = {
343 	.notifier_call	= opal_handle_hmi_event,
344 	.next		= NULL,
345 	.priority	= 0,
346 };
347 
348 int __init opal_hmi_handler_init(void)
349 {
350 	int ret;
351 
352 	if (!opal_hmi_handler_nb_init) {
353 		ret = opal_message_notifier_register(
354 				OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
355 		if (ret) {
356 			pr_err("%s: Can't register OPAL event notifier (%d)\n",
357 			       __func__, ret);
358 			return ret;
359 		}
360 		opal_hmi_handler_nb_init = 1;
361 	}
362 	return 0;
363 }
364