1 /* 2 * OPAL hypervisor Maintenance interrupt handling support in PowreNV. 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; If not, see <http://www.gnu.org/licenses/>. 16 * 17 * Copyright 2014 IBM Corporation 18 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> 19 */ 20 21 #undef DEBUG 22 23 #include <linux/kernel.h> 24 #include <linux/init.h> 25 #include <linux/of.h> 26 #include <linux/mm.h> 27 #include <linux/slab.h> 28 29 #include <asm/opal.h> 30 #include <asm/cputable.h> 31 #include <asm/machdep.h> 32 33 static int opal_hmi_handler_nb_init; 34 struct OpalHmiEvtNode { 35 struct list_head list; 36 struct OpalHMIEvent hmi_evt; 37 }; 38 39 struct xstop_reason { 40 uint32_t xstop_reason; 41 const char *unit_failed; 42 const char *description; 43 }; 44 45 static LIST_HEAD(opal_hmi_evt_list); 46 static DEFINE_SPINLOCK(opal_hmi_evt_lock); 47 48 static void print_core_checkstop_reason(const char *level, 49 struct OpalHMIEvent *hmi_evt) 50 { 51 int i; 52 static const struct xstop_reason xstop_reason[] = { 53 { CORE_CHECKSTOP_IFU_REGFILE, "IFU", 54 "RegFile core check stop" }, 55 { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" }, 56 { CORE_CHECKSTOP_PC_DURING_RECOV, "PC", 57 "Core checkstop during recovery" }, 58 { CORE_CHECKSTOP_ISU_REGFILE, "ISU", 59 "RegFile core check stop (mapper error)" }, 60 { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" }, 61 { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" }, 62 { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" }, 63 { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC", 64 "Recovery in maintenance mode" }, 65 { CORE_CHECKSTOP_LSU_REGFILE, "LSU", 66 "RegFile core check stop" }, 67 { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC", 68 "Forward Progress Error" }, 69 { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" }, 70 { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" }, 71 { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC", 72 "Hypervisor Resource error - core check stop" }, 73 { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC", 74 "Hang Recovery Failed (core check stop)" }, 75 { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC", 76 "Ambiguous Hang Detected (unknown source)" }, 77 { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC", 78 "Debug Trigger Error inject" }, 79 { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC", 80 "Hypervisor check stop via SPRC/SPRD" }, 81 }; 82 83 /* Validity check */ 84 if (!hmi_evt->u.xstop_error.xstop_reason) { 85 printk("%s Unknown Core check stop.\n", level); 86 return; 87 } 88 89 printk("%s CPU PIR: %08x\n", level, 90 be32_to_cpu(hmi_evt->u.xstop_error.u.pir)); 91 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++) 92 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) & 93 xstop_reason[i].xstop_reason) 94 printk("%s [Unit: %-3s] %s\n", level, 95 xstop_reason[i].unit_failed, 96 xstop_reason[i].description); 97 } 98 99 static void print_nx_checkstop_reason(const char *level, 100 struct OpalHMIEvent *hmi_evt) 101 { 102 int i; 103 static const struct xstop_reason xstop_reason[] = { 104 { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine", 105 "SHM invalid state error" }, 106 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine", 107 "DMA invalid state error bit 15" }, 108 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine", 109 "DMA invalid state error bit 16" }, 110 { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine", 111 "Channel 0 invalid state error" }, 112 { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine", 113 "Channel 1 invalid state error" }, 114 { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine", 115 "Channel 2 invalid state error" }, 116 { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine", 117 "Channel 3 invalid state error" }, 118 { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine", 119 "Channel 4 invalid state error" }, 120 { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine", 121 "Channel 5 invalid state error" }, 122 { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine", 123 "Channel 6 invalid state error" }, 124 { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine", 125 "Channel 7 invalid state error" }, 126 { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine", 127 "UE error on CRB(CSB address, CCB)" }, 128 { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine", 129 "SUE error on CRB(CSB address, CCB)" }, 130 { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface", 131 "CRB Kill ISN received while holding ISN with UE error" }, 132 }; 133 134 /* Validity check */ 135 if (!hmi_evt->u.xstop_error.xstop_reason) { 136 printk("%s Unknown NX check stop.\n", level); 137 return; 138 } 139 140 printk("%s NX checkstop on CHIP ID: %x\n", level, 141 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id)); 142 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++) 143 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) & 144 xstop_reason[i].xstop_reason) 145 printk("%s [Unit: %-3s] %s\n", level, 146 xstop_reason[i].unit_failed, 147 xstop_reason[i].description); 148 } 149 150 static void print_checkstop_reason(const char *level, 151 struct OpalHMIEvent *hmi_evt) 152 { 153 uint8_t type = hmi_evt->u.xstop_error.xstop_type; 154 switch (type) { 155 case CHECKSTOP_TYPE_CORE: 156 print_core_checkstop_reason(level, hmi_evt); 157 break; 158 case CHECKSTOP_TYPE_NX: 159 print_nx_checkstop_reason(level, hmi_evt); 160 break; 161 default: 162 printk("%s Unknown Malfunction Alert of type %d\n", 163 level, type); 164 break; 165 } 166 } 167 168 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt) 169 { 170 const char *level, *sevstr, *error_info; 171 static const char *hmi_error_types[] = { 172 "Malfunction Alert", 173 "Processor Recovery done", 174 "Processor recovery occurred again", 175 "Processor recovery occurred for masked error", 176 "Timer facility experienced an error", 177 "TFMR SPR is corrupted", 178 "UPS (Uniterrupted Power System) Overflow indication", 179 "An XSCOM operation failure", 180 "An XSCOM operation completed", 181 "SCOM has set a reserved FIR bit to cause recovery", 182 "Debug trigger has set a reserved FIR bit to cause recovery", 183 "A hypervisor resource error occurred" 184 }; 185 186 /* Print things out */ 187 if (hmi_evt->version < OpalHMIEvt_V1) { 188 pr_err("HMI Interrupt, Unknown event version %d !\n", 189 hmi_evt->version); 190 return; 191 } 192 switch (hmi_evt->severity) { 193 case OpalHMI_SEV_NO_ERROR: 194 level = KERN_INFO; 195 sevstr = "Harmless"; 196 break; 197 case OpalHMI_SEV_WARNING: 198 level = KERN_WARNING; 199 sevstr = ""; 200 break; 201 case OpalHMI_SEV_ERROR_SYNC: 202 level = KERN_ERR; 203 sevstr = "Severe"; 204 break; 205 case OpalHMI_SEV_FATAL: 206 default: 207 level = KERN_ERR; 208 sevstr = "Fatal"; 209 break; 210 } 211 212 printk("%s%s Hypervisor Maintenance interrupt [%s]\n", 213 level, sevstr, 214 hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ? 215 "Recovered" : "Not recovered"); 216 error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ? 217 hmi_error_types[hmi_evt->type] 218 : "Unknown"; 219 printk("%s Error detail: %s\n", level, error_info); 220 printk("%s HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer)); 221 if ((hmi_evt->type == OpalHMI_ERROR_TFAC) || 222 (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY)) 223 printk("%s TFMR: %016llx\n", level, 224 be64_to_cpu(hmi_evt->tfmr)); 225 226 if (hmi_evt->version < OpalHMIEvt_V2) 227 return; 228 229 /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */ 230 if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT) 231 print_checkstop_reason(level, hmi_evt); 232 } 233 234 static void hmi_event_handler(struct work_struct *work) 235 { 236 unsigned long flags; 237 struct OpalHMIEvent *hmi_evt; 238 struct OpalHmiEvtNode *msg_node; 239 uint8_t disposition; 240 struct opal_msg msg; 241 int unrecoverable = 0; 242 243 spin_lock_irqsave(&opal_hmi_evt_lock, flags); 244 while (!list_empty(&opal_hmi_evt_list)) { 245 msg_node = list_entry(opal_hmi_evt_list.next, 246 struct OpalHmiEvtNode, list); 247 list_del(&msg_node->list); 248 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); 249 250 hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt; 251 print_hmi_event_info(hmi_evt); 252 disposition = hmi_evt->disposition; 253 kfree(msg_node); 254 255 /* 256 * Check if HMI event has been recovered or not. If not 257 * then kernel can't continue, we need to panic. 258 * But before we do that, display all the HMI event 259 * available on the list and set unrecoverable flag to 1. 260 */ 261 if (disposition != OpalHMI_DISPOSITION_RECOVERED) 262 unrecoverable = 1; 263 264 spin_lock_irqsave(&opal_hmi_evt_lock, flags); 265 } 266 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); 267 268 if (unrecoverable) { 269 int ret; 270 271 /* Pull all HMI events from OPAL before we panic. */ 272 while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) { 273 u32 type; 274 275 type = be32_to_cpu(msg.msg_type); 276 277 /* skip if not HMI event */ 278 if (type != OPAL_MSG_HMI_EVT) 279 continue; 280 281 /* HMI event info starts from param[0] */ 282 hmi_evt = (struct OpalHMIEvent *)&msg.params[0]; 283 print_hmi_event_info(hmi_evt); 284 } 285 286 /* 287 * Unrecoverable HMI exception. We need to inform BMC/OCC 288 * about this error so that it can collect relevant data 289 * for error analysis before rebooting. 290 */ 291 ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, 292 "Unrecoverable HMI exception"); 293 if (ret == OPAL_UNSUPPORTED) { 294 pr_emerg("Reboot type %d not supported\n", 295 OPAL_REBOOT_PLATFORM_ERROR); 296 } 297 298 /* 299 * Fall through and panic if opal_cec_reboot2() returns 300 * OPAL_UNSUPPORTED. 301 */ 302 panic("Unrecoverable HMI exception"); 303 } 304 } 305 306 static DECLARE_WORK(hmi_event_work, hmi_event_handler); 307 /* 308 * opal_handle_hmi_event - notifier handler that queues up HMI events 309 * to be preocessed later. 310 */ 311 static int opal_handle_hmi_event(struct notifier_block *nb, 312 unsigned long msg_type, void *msg) 313 { 314 unsigned long flags; 315 struct OpalHMIEvent *hmi_evt; 316 struct opal_msg *hmi_msg = msg; 317 struct OpalHmiEvtNode *msg_node; 318 319 /* Sanity Checks */ 320 if (msg_type != OPAL_MSG_HMI_EVT) 321 return 0; 322 323 /* HMI event info starts from param[0] */ 324 hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0]; 325 326 /* Delay the logging of HMI events to workqueue. */ 327 msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC); 328 if (!msg_node) { 329 pr_err("HMI: out of memory, Opal message event not handled\n"); 330 return -ENOMEM; 331 } 332 memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(struct OpalHMIEvent)); 333 334 spin_lock_irqsave(&opal_hmi_evt_lock, flags); 335 list_add(&msg_node->list, &opal_hmi_evt_list); 336 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); 337 338 schedule_work(&hmi_event_work); 339 return 0; 340 } 341 342 static struct notifier_block opal_hmi_handler_nb = { 343 .notifier_call = opal_handle_hmi_event, 344 .next = NULL, 345 .priority = 0, 346 }; 347 348 int __init opal_hmi_handler_init(void) 349 { 350 int ret; 351 352 if (!opal_hmi_handler_nb_init) { 353 ret = opal_message_notifier_register( 354 OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb); 355 if (ret) { 356 pr_err("%s: Can't register OPAL event notifier (%d)\n", 357 __func__, ret); 358 return ret; 359 } 360 opal_hmi_handler_nb_init = 1; 361 } 362 return 0; 363 } 364