xref: /linux/arch/powerpc/platforms/powernv/opal-hmi.c (revision 26fbb4c8c7c3ee9a4c3b4de555a8587b5a19154e)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
4  *
5  * Copyright 2014 IBM Corporation
6  * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
7  */
8 
9 #undef DEBUG
10 
11 #include <linux/kernel.h>
12 #include <linux/init.h>
13 #include <linux/of.h>
14 #include <linux/mm.h>
15 #include <linux/slab.h>
16 
17 #include <asm/opal.h>
18 #include <asm/cputable.h>
19 #include <asm/machdep.h>
20 
21 #include "powernv.h"
22 
/*
 * Non-zero once the HMI notifier has been registered; checked by
 * opal_hmi_handler_init() so the notifier is registered only once.
 */
static int opal_hmi_handler_nb_init;
/*
 * One queued HMI event: a private copy of the event taken in the
 * notifier, kept on opal_hmi_evt_list until the work handler logs it.
 */
struct OpalHmiEvtNode {
	struct list_head list;		/* link in opal_hmi_evt_list */
	struct OpalHMIEvent hmi_evt;	/* copy of the event from the OPAL message */
};
28 
/*
 * Table entry used by the checkstop decoders: associates one reason
 * mask with the unit that failed and a human readable description.
 */
struct xstop_reason {
	uint32_t xstop_reason;		/* mask tested against the event's reason word */
	const char *unit_failed;	/* unit name printed in the "[Unit: ...]" field */
	const char *description;	/* text printed after the unit name */
};
34 
/* HMI events queued by the notifier and drained by hmi_event_handler(). */
static LIST_HEAD(opal_hmi_evt_list);
/* Taken around every access to opal_hmi_evt_list. */
static DEFINE_SPINLOCK(opal_hmi_evt_lock);
37 
38 static void print_core_checkstop_reason(const char *level,
39 					struct OpalHMIEvent *hmi_evt)
40 {
41 	int i;
42 	static const struct xstop_reason xstop_reason[] = {
43 		{ CORE_CHECKSTOP_IFU_REGFILE, "IFU",
44 				"RegFile core check stop" },
45 		{ CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
46 		{ CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
47 				"Core checkstop during recovery" },
48 		{ CORE_CHECKSTOP_ISU_REGFILE, "ISU",
49 				"RegFile core check stop (mapper error)" },
50 		{ CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
51 		{ CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
52 		{ CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
53 		{ CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
54 				"Recovery in maintenance mode" },
55 		{ CORE_CHECKSTOP_LSU_REGFILE, "LSU",
56 				"RegFile core check stop" },
57 		{ CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
58 				"Forward Progress Error" },
59 		{ CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
60 		{ CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
61 		{ CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
62 				"Hypervisor Resource error - core check stop" },
63 		{ CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
64 				"Hang Recovery Failed (core check stop)" },
65 		{ CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
66 				"Ambiguous Hang Detected (unknown source)" },
67 		{ CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
68 				"Debug Trigger Error inject" },
69 		{ CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
70 				"Hypervisor check stop via SPRC/SPRD" },
71 	};
72 
73 	/* Validity check */
74 	if (!hmi_evt->u.xstop_error.xstop_reason) {
75 		printk("%s	Unknown Core check stop.\n", level);
76 		return;
77 	}
78 
79 	printk("%s	CPU PIR: %08x\n", level,
80 			be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
81 	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
82 		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
83 					xstop_reason[i].xstop_reason)
84 			printk("%s	[Unit: %-3s] %s\n", level,
85 					xstop_reason[i].unit_failed,
86 					xstop_reason[i].description);
87 }
88 
89 static void print_nx_checkstop_reason(const char *level,
90 					struct OpalHMIEvent *hmi_evt)
91 {
92 	int i;
93 	static const struct xstop_reason xstop_reason[] = {
94 		{ NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
95 					"SHM invalid state error" },
96 		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
97 					"DMA invalid state error bit 15" },
98 		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
99 					"DMA invalid state error bit 16" },
100 		{ NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
101 					"Channel 0 invalid state error" },
102 		{ NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
103 					"Channel 1 invalid state error" },
104 		{ NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
105 					"Channel 2 invalid state error" },
106 		{ NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
107 					"Channel 3 invalid state error" },
108 		{ NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
109 					"Channel 4 invalid state error" },
110 		{ NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
111 					"Channel 5 invalid state error" },
112 		{ NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
113 					"Channel 6 invalid state error" },
114 		{ NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
115 					"Channel 7 invalid state error" },
116 		{ NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
117 					"UE error on CRB(CSB address, CCB)" },
118 		{ NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
119 					"SUE error on CRB(CSB address, CCB)" },
120 		{ NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
121 		"CRB Kill ISN received while holding ISN with UE error" },
122 	};
123 
124 	/* Validity check */
125 	if (!hmi_evt->u.xstop_error.xstop_reason) {
126 		printk("%s	Unknown NX check stop.\n", level);
127 		return;
128 	}
129 
130 	printk("%s	NX checkstop on CHIP ID: %x\n", level,
131 			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
132 	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
133 		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
134 					xstop_reason[i].xstop_reason)
135 			printk("%s	[Unit: %-3s] %s\n", level,
136 					xstop_reason[i].unit_failed,
137 					xstop_reason[i].description);
138 }
139 
140 static void print_npu_checkstop_reason(const char *level,
141 					struct OpalHMIEvent *hmi_evt)
142 {
143 	uint8_t reason, reason_count, i;
144 
145 	/*
146 	 * We may not have a checkstop reason on some combination of
147 	 * hardware and/or skiboot version
148 	 */
149 	if (!hmi_evt->u.xstop_error.xstop_reason) {
150 		printk("%s	NPU checkstop on chip %x\n", level,
151 			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
152 		return;
153 	}
154 
155 	/*
156 	 * NPU2 has 3 FIRs. Reason encoded on a byte as:
157 	 *   2 bits for the FIR number
158 	 *   6 bits for the bit number
159 	 * It may be possible to find several reasons.
160 	 *
161 	 * We don't display a specific message per FIR bit as there
162 	 * are too many and most are meaningless without the workbook
163 	 * and/or hw team help anyway.
164 	 */
165 	reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
166 		sizeof(reason);
167 	for (i = 0; i < reason_count; i++) {
168 		reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
169 		if (reason)
170 			printk("%s	NPU checkstop on chip %x: FIR%d bit %d is set\n",
171 				level,
172 				be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
173 				reason >> 6, reason & 0x3F);
174 	}
175 }
176 
177 static void print_checkstop_reason(const char *level,
178 					struct OpalHMIEvent *hmi_evt)
179 {
180 	uint8_t type = hmi_evt->u.xstop_error.xstop_type;
181 	switch (type) {
182 	case CHECKSTOP_TYPE_CORE:
183 		print_core_checkstop_reason(level, hmi_evt);
184 		break;
185 	case CHECKSTOP_TYPE_NX:
186 		print_nx_checkstop_reason(level, hmi_evt);
187 		break;
188 	case CHECKSTOP_TYPE_NPU:
189 		print_npu_checkstop_reason(level, hmi_evt);
190 		break;
191 	default:
192 		printk("%s	Unknown Malfunction Alert of type %d\n",
193 		       level, type);
194 		break;
195 	}
196 }
197 
198 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
199 {
200 	const char *level, *sevstr, *error_info;
201 	static const char *hmi_error_types[] = {
202 		"Malfunction Alert",
203 		"Processor Recovery done",
204 		"Processor recovery occurred again",
205 		"Processor recovery occurred for masked error",
206 		"Timer facility experienced an error",
207 		"TFMR SPR is corrupted",
208 		"UPS (Uninterrupted Power System) Overflow indication",
209 		"An XSCOM operation failure",
210 		"An XSCOM operation completed",
211 		"SCOM has set a reserved FIR bit to cause recovery",
212 		"Debug trigger has set a reserved FIR bit to cause recovery",
213 		"A hypervisor resource error occurred",
214 		"CAPP recovery process is in progress",
215 	};
216 	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
217 				      DEFAULT_RATELIMIT_BURST);
218 
219 	/* Print things out */
220 	if (hmi_evt->version < OpalHMIEvt_V1) {
221 		pr_err("HMI Interrupt, Unknown event version %d !\n",
222 			hmi_evt->version);
223 		return;
224 	}
225 	switch (hmi_evt->severity) {
226 	case OpalHMI_SEV_NO_ERROR:
227 		level = KERN_INFO;
228 		sevstr = "Harmless";
229 		break;
230 	case OpalHMI_SEV_WARNING:
231 		level = KERN_WARNING;
232 		sevstr = "";
233 		break;
234 	case OpalHMI_SEV_ERROR_SYNC:
235 		level = KERN_ERR;
236 		sevstr = "Severe";
237 		break;
238 	case OpalHMI_SEV_FATAL:
239 	default:
240 		level = KERN_ERR;
241 		sevstr = "Fatal";
242 		break;
243 	}
244 
245 	if (hmi_evt->severity != OpalHMI_SEV_NO_ERROR || __ratelimit(&rs)) {
246 		printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
247 			level, sevstr,
248 			hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
249 			"Recovered" : "Not recovered");
250 		error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
251 				hmi_error_types[hmi_evt->type]
252 				: "Unknown";
253 		printk("%s Error detail: %s\n", level, error_info);
254 		printk("%s	HMER: %016llx\n", level,
255 					be64_to_cpu(hmi_evt->hmer));
256 		if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
257 			(hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
258 			printk("%s	TFMR: %016llx\n", level,
259 						be64_to_cpu(hmi_evt->tfmr));
260 	}
261 
262 	if (hmi_evt->version < OpalHMIEvt_V2)
263 		return;
264 
265 	/* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
266 	if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
267 		print_checkstop_reason(level, hmi_evt);
268 }
269 
270 static void hmi_event_handler(struct work_struct *work)
271 {
272 	unsigned long flags;
273 	struct OpalHMIEvent *hmi_evt;
274 	struct OpalHmiEvtNode *msg_node;
275 	uint8_t disposition;
276 	struct opal_msg msg;
277 	int unrecoverable = 0;
278 
279 	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
280 	while (!list_empty(&opal_hmi_evt_list)) {
281 		msg_node = list_entry(opal_hmi_evt_list.next,
282 					   struct OpalHmiEvtNode, list);
283 		list_del(&msg_node->list);
284 		spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
285 
286 		hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
287 		print_hmi_event_info(hmi_evt);
288 		disposition = hmi_evt->disposition;
289 		kfree(msg_node);
290 
291 		/*
292 		 * Check if HMI event has been recovered or not. If not
293 		 * then kernel can't continue, we need to panic.
294 		 * But before we do that, display all the HMI event
295 		 * available on the list and set unrecoverable flag to 1.
296 		 */
297 		if (disposition != OpalHMI_DISPOSITION_RECOVERED)
298 			unrecoverable = 1;
299 
300 		spin_lock_irqsave(&opal_hmi_evt_lock, flags);
301 	}
302 	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
303 
304 	if (unrecoverable) {
305 		/* Pull all HMI events from OPAL before we panic. */
306 		while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
307 			u32 type;
308 
309 			type = be32_to_cpu(msg.msg_type);
310 
311 			/* skip if not HMI event */
312 			if (type != OPAL_MSG_HMI_EVT)
313 				continue;
314 
315 			/* HMI event info starts from param[0] */
316 			hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
317 			print_hmi_event_info(hmi_evt);
318 		}
319 
320 		pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
321 	}
322 }
323 
/* Work item that runs hmi_event_handler(); scheduled by the HMI notifier. */
static DECLARE_WORK(hmi_event_work, hmi_event_handler);
325 /*
326  * opal_handle_hmi_event - notifier handler that queues up HMI events
327  * to be preocessed later.
328  */
329 static int opal_handle_hmi_event(struct notifier_block *nb,
330 			  unsigned long msg_type, void *msg)
331 {
332 	unsigned long flags;
333 	struct OpalHMIEvent *hmi_evt;
334 	struct opal_msg *hmi_msg = msg;
335 	struct OpalHmiEvtNode *msg_node;
336 
337 	/* Sanity Checks */
338 	if (msg_type != OPAL_MSG_HMI_EVT)
339 		return 0;
340 
341 	/* HMI event info starts from param[0] */
342 	hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
343 
344 	/* Delay the logging of HMI events to workqueue. */
345 	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
346 	if (!msg_node) {
347 		pr_err("HMI: out of memory, Opal message event not handled\n");
348 		return -ENOMEM;
349 	}
350 	memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
351 
352 	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
353 	list_add(&msg_node->list, &opal_hmi_evt_list);
354 	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
355 
356 	schedule_work(&hmi_event_work);
357 	return 0;
358 }
359 
360 static struct notifier_block opal_hmi_handler_nb = {
361 	.notifier_call	= opal_handle_hmi_event,
362 	.next		= NULL,
363 	.priority	= 0,
364 };
365 
366 int __init opal_hmi_handler_init(void)
367 {
368 	int ret;
369 
370 	if (!opal_hmi_handler_nb_init) {
371 		ret = opal_message_notifier_register(
372 				OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
373 		if (ret) {
374 			pr_err("%s: Can't register OPAL event notifier (%d)\n",
375 			       __func__, ret);
376 			return ret;
377 		}
378 		opal_hmi_handler_nb_init = 1;
379 	}
380 	return 0;
381 }
382