1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
4 *
5 * Copyright 2014 IBM Corporation
6 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
7 */
8
9 #undef DEBUG
10
11 #include <linux/kernel.h>
12 #include <linux/init.h>
13 #include <linux/of.h>
14 #include <linux/mm.h>
15 #include <linux/slab.h>
16
17 #include <asm/opal.h>
18 #include <asm/cputable.h>
19 #include <asm/machdep.h>
20
21 #include "powernv.h"
22
/*
 * Non-zero once the HMI message notifier has been registered; checked by
 * opal_hmi_handler_init() so the notifier is registered at most once.
 */
static int opal_hmi_handler_nb_init;
/*
 * Queue entry for one pending HMI event. The notifier callback copies the
 * event into a freshly allocated node and links it on opal_hmi_evt_list;
 * the work handler unlinks, logs and frees it.
 */
struct OpalHmiEvtNode {
	struct list_head list;		/* linkage on opal_hmi_evt_list */
	struct OpalHMIEvent hmi_evt;	/* private copy of the OPAL event */
};
28
/*
 * One row of a checkstop decode table: a reason bit (tested against the
 * CPU-endian reason mask of the event) and the text printed when it is set.
 */
struct xstop_reason {
	uint32_t xstop_reason;		/* bit mask identifying this reason */
	const char *unit_failed;	/* name printed in the "[Unit: ...]" field */
	const char *description;	/* human readable explanation */
};
34
/* Pending HMI events, filled by the notifier and drained by the work handler. */
static LIST_HEAD(opal_hmi_evt_list);
/* Protects opal_hmi_evt_list; always taken with interrupts disabled. */
static DEFINE_SPINLOCK(opal_hmi_evt_lock);
37
print_core_checkstop_reason(const char * level,struct OpalHMIEvent * hmi_evt)38 static void print_core_checkstop_reason(const char *level,
39 struct OpalHMIEvent *hmi_evt)
40 {
41 int i;
42 static const struct xstop_reason xstop_reason[] = {
43 { CORE_CHECKSTOP_IFU_REGFILE, "IFU",
44 "RegFile core check stop" },
45 { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
46 { CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
47 "Core checkstop during recovery" },
48 { CORE_CHECKSTOP_ISU_REGFILE, "ISU",
49 "RegFile core check stop (mapper error)" },
50 { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
51 { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
52 { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
53 { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
54 "Recovery in maintenance mode" },
55 { CORE_CHECKSTOP_LSU_REGFILE, "LSU",
56 "RegFile core check stop" },
57 { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
58 "Forward Progress Error" },
59 { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
60 { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
61 { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
62 "Hypervisor Resource error - core check stop" },
63 { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
64 "Hang Recovery Failed (core check stop)" },
65 { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
66 "Ambiguous Hang Detected (unknown source)" },
67 { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
68 "Debug Trigger Error inject" },
69 { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
70 "Hypervisor check stop via SPRC/SPRD" },
71 };
72
73 /* Validity check */
74 if (!hmi_evt->u.xstop_error.xstop_reason) {
75 printk("%s Unknown Core check stop.\n", level);
76 return;
77 }
78
79 printk("%s CPU PIR: %08x\n", level,
80 be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
81 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
82 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
83 xstop_reason[i].xstop_reason)
84 printk("%s [Unit: %-3s] %s\n", level,
85 xstop_reason[i].unit_failed,
86 xstop_reason[i].description);
87 }
88
print_nx_checkstop_reason(const char * level,struct OpalHMIEvent * hmi_evt)89 static void print_nx_checkstop_reason(const char *level,
90 struct OpalHMIEvent *hmi_evt)
91 {
92 int i;
93 static const struct xstop_reason xstop_reason[] = {
94 { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
95 "SHM invalid state error" },
96 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
97 "DMA invalid state error bit 15" },
98 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
99 "DMA invalid state error bit 16" },
100 { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
101 "Channel 0 invalid state error" },
102 { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
103 "Channel 1 invalid state error" },
104 { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
105 "Channel 2 invalid state error" },
106 { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
107 "Channel 3 invalid state error" },
108 { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
109 "Channel 4 invalid state error" },
110 { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
111 "Channel 5 invalid state error" },
112 { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
113 "Channel 6 invalid state error" },
114 { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
115 "Channel 7 invalid state error" },
116 { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
117 "UE error on CRB(CSB address, CCB)" },
118 { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
119 "SUE error on CRB(CSB address, CCB)" },
120 { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
121 "CRB Kill ISN received while holding ISN with UE error" },
122 };
123
124 /* Validity check */
125 if (!hmi_evt->u.xstop_error.xstop_reason) {
126 printk("%s Unknown NX check stop.\n", level);
127 return;
128 }
129
130 printk("%s NX checkstop on CHIP ID: %x\n", level,
131 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
132 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
133 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
134 xstop_reason[i].xstop_reason)
135 printk("%s [Unit: %-3s] %s\n", level,
136 xstop_reason[i].unit_failed,
137 xstop_reason[i].description);
138 }
139
print_npu_checkstop_reason(const char * level,struct OpalHMIEvent * hmi_evt)140 static void print_npu_checkstop_reason(const char *level,
141 struct OpalHMIEvent *hmi_evt)
142 {
143 uint8_t reason, reason_count, i;
144
145 /*
146 * We may not have a checkstop reason on some combination of
147 * hardware and/or skiboot version
148 */
149 if (!hmi_evt->u.xstop_error.xstop_reason) {
150 printk("%s NPU checkstop on chip %x\n", level,
151 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
152 return;
153 }
154
155 /*
156 * NPU2 has 3 FIRs. Reason encoded on a byte as:
157 * 2 bits for the FIR number
158 * 6 bits for the bit number
159 * It may be possible to find several reasons.
160 *
161 * We don't display a specific message per FIR bit as there
162 * are too many and most are meaningless without the workbook
163 * and/or hw team help anyway.
164 */
165 reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
166 sizeof(reason);
167 for (i = 0; i < reason_count; i++) {
168 reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
169 if (reason)
170 printk("%s NPU checkstop on chip %x: FIR%d bit %d is set\n",
171 level,
172 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
173 reason >> 6, reason & 0x3F);
174 }
175 }
176
print_checkstop_reason(const char * level,struct OpalHMIEvent * hmi_evt)177 static void print_checkstop_reason(const char *level,
178 struct OpalHMIEvent *hmi_evt)
179 {
180 uint8_t type = hmi_evt->u.xstop_error.xstop_type;
181 switch (type) {
182 case CHECKSTOP_TYPE_CORE:
183 print_core_checkstop_reason(level, hmi_evt);
184 break;
185 case CHECKSTOP_TYPE_NX:
186 print_nx_checkstop_reason(level, hmi_evt);
187 break;
188 case CHECKSTOP_TYPE_NPU:
189 print_npu_checkstop_reason(level, hmi_evt);
190 break;
191 default:
192 printk("%s Unknown Malfunction Alert of type %d\n",
193 level, type);
194 break;
195 }
196 }
197
print_hmi_event_info(struct OpalHMIEvent * hmi_evt)198 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
199 {
200 const char *level, *sevstr, *error_info;
201 static const char *hmi_error_types[] = {
202 "Malfunction Alert",
203 "Processor Recovery done",
204 "Processor recovery occurred again",
205 "Processor recovery occurred for masked error",
206 "Timer facility experienced an error",
207 "TFMR SPR is corrupted",
208 "UPS (Uninterrupted Power System) Overflow indication",
209 "An XSCOM operation failure",
210 "An XSCOM operation completed",
211 "SCOM has set a reserved FIR bit to cause recovery",
212 "Debug trigger has set a reserved FIR bit to cause recovery",
213 "A hypervisor resource error occurred",
214 "CAPP recovery process is in progress",
215 };
216 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
217 DEFAULT_RATELIMIT_BURST);
218
219 /* Print things out */
220 if (hmi_evt->version < OpalHMIEvt_V1) {
221 pr_err("HMI Interrupt, Unknown event version %d !\n",
222 hmi_evt->version);
223 return;
224 }
225 switch (hmi_evt->severity) {
226 case OpalHMI_SEV_NO_ERROR:
227 level = KERN_INFO;
228 sevstr = "Harmless";
229 break;
230 case OpalHMI_SEV_WARNING:
231 level = KERN_WARNING;
232 sevstr = "";
233 break;
234 case OpalHMI_SEV_ERROR_SYNC:
235 level = KERN_ERR;
236 sevstr = "Severe";
237 break;
238 case OpalHMI_SEV_FATAL:
239 default:
240 level = KERN_ERR;
241 sevstr = "Fatal";
242 break;
243 }
244
245 if (hmi_evt->severity != OpalHMI_SEV_NO_ERROR || __ratelimit(&rs)) {
246 printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
247 level, sevstr,
248 hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
249 "Recovered" : "Not recovered");
250 error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
251 hmi_error_types[hmi_evt->type]
252 : "Unknown";
253 printk("%s Error detail: %s\n", level, error_info);
254 printk("%s HMER: %016llx\n", level,
255 be64_to_cpu(hmi_evt->hmer));
256 if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
257 (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
258 printk("%s TFMR: %016llx\n", level,
259 be64_to_cpu(hmi_evt->tfmr));
260 }
261
262 if (hmi_evt->version < OpalHMIEvt_V2)
263 return;
264
265 /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
266 if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
267 print_checkstop_reason(level, hmi_evt);
268 }
269
hmi_event_handler(struct work_struct * work)270 static void hmi_event_handler(struct work_struct *work)
271 {
272 unsigned long flags;
273 struct OpalHMIEvent *hmi_evt;
274 struct OpalHmiEvtNode *msg_node;
275 uint8_t disposition;
276 struct opal_msg msg;
277 int unrecoverable = 0;
278
279 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
280 while (!list_empty(&opal_hmi_evt_list)) {
281 msg_node = list_entry(opal_hmi_evt_list.next,
282 struct OpalHmiEvtNode, list);
283 list_del(&msg_node->list);
284 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
285
286 hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
287 print_hmi_event_info(hmi_evt);
288 disposition = hmi_evt->disposition;
289 kfree(msg_node);
290
291 /*
292 * Check if HMI event has been recovered or not. If not
293 * then kernel can't continue, we need to panic.
294 * But before we do that, display all the HMI event
295 * available on the list and set unrecoverable flag to 1.
296 */
297 if (disposition != OpalHMI_DISPOSITION_RECOVERED)
298 unrecoverable = 1;
299
300 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
301 }
302 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
303
304 if (unrecoverable) {
305 /* Pull all HMI events from OPAL before we panic. */
306 while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
307 u32 type;
308
309 type = be32_to_cpu(msg.msg_type);
310
311 /* skip if not HMI event */
312 if (type != OPAL_MSG_HMI_EVT)
313 continue;
314
315 /* HMI event info starts from param[0] */
316 hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
317 print_hmi_event_info(hmi_evt);
318 }
319
320 pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
321 }
322 }
323
/* Work item that runs hmi_event_handler(); scheduled for each queued event. */
static DECLARE_WORK(hmi_event_work, hmi_event_handler);
325 /*
326 * opal_handle_hmi_event - notifier handler that queues up HMI events
327 * to be preocessed later.
328 */
opal_handle_hmi_event(struct notifier_block * nb,unsigned long msg_type,void * msg)329 static int opal_handle_hmi_event(struct notifier_block *nb,
330 unsigned long msg_type, void *msg)
331 {
332 unsigned long flags;
333 struct OpalHMIEvent *hmi_evt;
334 struct opal_msg *hmi_msg = msg;
335 struct OpalHmiEvtNode *msg_node;
336
337 /* Sanity Checks */
338 if (msg_type != OPAL_MSG_HMI_EVT)
339 return 0;
340
341 /* HMI event info starts from param[0] */
342 hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
343
344 /* Delay the logging of HMI events to workqueue. */
345 msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
346 if (!msg_node) {
347 pr_err("HMI: out of memory, Opal message event not handled\n");
348 return -ENOMEM;
349 }
350 memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
351
352 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
353 list_add(&msg_node->list, &opal_hmi_evt_list);
354 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
355
356 schedule_work(&hmi_event_work);
357 return 0;
358 }
359
360 static struct notifier_block opal_hmi_handler_nb = {
361 .notifier_call = opal_handle_hmi_event,
362 .next = NULL,
363 .priority = 0,
364 };
365
opal_hmi_handler_init(void)366 int __init opal_hmi_handler_init(void)
367 {
368 int ret;
369
370 if (!opal_hmi_handler_nb_init) {
371 ret = opal_message_notifier_register(
372 OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
373 if (ret) {
374 pr_err("%s: Can't register OPAL event notifier (%d)\n",
375 __func__, ret);
376 return ret;
377 }
378 opal_hmi_handler_nb_init = 1;
379 }
380 return 0;
381 }
382