1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * CMU-CH ECC support 30 */ 31 32 #include <sys/types.h> 33 #include <sys/systm.h> 34 #include <sys/kmem.h> 35 #include <sys/sunddi.h> 36 #include <sys/intr.h> 37 #include <sys/async.h> 38 #include <sys/ddi_impldefs.h> 39 #include <sys/machsystm.h> 40 #include <sys/sysmacros.h> 41 #include <sys/fm/protocol.h> 42 #include <sys/fm/util.h> 43 #include <sys/fm/io/pci.h> 44 #include <sys/fm/io/sun4upci.h> 45 #include <sys/fm/io/ddi.h> 46 #include <sys/pcicmu/pcicmu.h> 47 48 /*LINTLIBRARY*/ 49 50 static void pcmu_ecc_disable(pcmu_ecc_t *, int); 51 static uint64_t pcmu_ecc_read_afsr(pcmu_ecc_intr_info_t *); 52 static void pcmu_ecc_ereport_post(dev_info_t *dip, 53 pcmu_ecc_errstate_t *ecc_err); 54 55 clock_t pcmu_pecc_panic_delay = 200; 56 57 void 58 pcmu_ecc_create(pcmu_t *pcmu_p) 59 { 60 uint64_t pcb_base_pa = pcmu_p->pcmu_cb_p->pcb_base_pa; 61 pcmu_ecc_t *pecc_p; 62 /* LINTED variable */ 63 dev_info_t *dip = pcmu_p->pcmu_dip; 64 65 pecc_p = (pcmu_ecc_t *)kmem_zalloc(sizeof (pcmu_ecc_t), KM_SLEEP); 66 pecc_p->pecc_pcmu_p = pcmu_p; 67 pcmu_p->pcmu_pecc_p = pecc_p; 68 69 pecc_p->pecc_ue.pecc_p = pecc_p; 70 pecc_p->pecc_ue.pecc_type = CBNINTR_UE; 71 72 pcmu_ecc_setup(pecc_p); 73 74 /* 75 * Determine the virtual addresses of the streaming cache 76 * control/status and flush registers. 77 */ 78 pecc_p->pecc_csr_pa = pcb_base_pa + PCMU_ECC_CSR_OFFSET; 79 pecc_p->pecc_ue.pecc_afsr_pa = pcb_base_pa + PCMU_UE_AFSR_OFFSET; 80 pecc_p->pecc_ue.pecc_afar_pa = pcb_base_pa + PCMU_UE_AFAR_OFFSET; 81 82 PCMU_DBG1(PCMU_DBG_ATTACH, dip, "pcmu_ecc_create: csr=%x\n", 83 pecc_p->pecc_csr_pa); 84 PCMU_DBG2(PCMU_DBG_ATTACH, dip, 85 "pcmu_ecc_create: ue_afsr=%x, ue_afar=%x\n", 86 pecc_p->pecc_ue.pecc_afsr_pa, pecc_p->pecc_ue.pecc_afar_pa); 87 88 pcmu_ecc_configure(pcmu_p); 89 90 /* 91 * Register routines to be called from system error handling code. 92 */ 93 bus_func_register(BF_TYPE_ERRDIS, 94 (busfunc_t)pcmu_ecc_disable_nowait, pecc_p); 95 } 96 97 int 98 pcmu_ecc_register_intr(pcmu_t *pcmu_p) 99 { 100 pcmu_ecc_t *pecc_p = pcmu_p->pcmu_pecc_p; 101 int ret; 102 103 /* 104 * Install the UE error interrupt handlers. 105 */ 106 ret = pcmu_ecc_add_intr(pcmu_p, CBNINTR_UE, &pecc_p->pecc_ue); 107 return (ret); 108 } 109 110 void 111 pcmu_ecc_destroy(pcmu_t *pcmu_p) 112 { 113 pcmu_ecc_t *pecc_p = pcmu_p->pcmu_pecc_p; 114 115 PCMU_DBG0(PCMU_DBG_DETACH, pcmu_p->pcmu_dip, "pcmu_ecc_destroy:\n"); 116 117 /* 118 * Disable UE ECC error interrupts. 119 */ 120 pcmu_ecc_disable_wait(pecc_p); 121 122 /* 123 * Remove the ECC interrupt handlers. 124 */ 125 pcmu_ecc_rem_intr(pcmu_p, CBNINTR_UE, &pecc_p->pecc_ue); 126 127 /* 128 * Unregister our error handling functions. 129 */ 130 bus_func_unregister(BF_TYPE_ERRDIS, 131 (busfunc_t)pcmu_ecc_disable_nowait, pecc_p); 132 /* 133 * If a timer has been set, unset it. 134 */ 135 (void) untimeout(pecc_p->pecc_tout_id); 136 kmem_free(pecc_p, sizeof (pcmu_ecc_t)); 137 pcmu_p->pcmu_pecc_p = NULL; 138 } 139 140 void 141 pcmu_ecc_configure(pcmu_t *pcmu_p) 142 { 143 pcmu_ecc_t *pecc_p = pcmu_p->pcmu_pecc_p; 144 uint64_t l; 145 /* LINTED variable */ 146 dev_info_t *dip = pcmu_p->pcmu_dip; 147 148 /* 149 * Clear any pending ECC errors. 150 */ 151 PCMU_DBG0(PCMU_DBG_ATTACH, dip, 152 "pcmu_ecc_configure: clearing UE errors\n"); 153 l = (PCMU_ECC_UE_AFSR_E_MASK << PCMU_ECC_UE_AFSR_PE_SHIFT) | 154 (PCMU_ECC_UE_AFSR_E_MASK << PCMU_ECC_UE_AFSR_SE_SHIFT); 155 stdphysio(pecc_p->pecc_ue.pecc_afsr_pa, l); 156 157 /* 158 * Enable ECC error detections via the control register. 159 */ 160 PCMU_DBG0(PCMU_DBG_ATTACH, dip, 161 "pcmu_ecc_configure: enabling UE detection\n"); 162 l = PCMU_ECC_CTRL_ECC_EN; 163 if (ecc_error_intr_enable) 164 l |= PCMU_ECC_CTRL_UE_INTEN; 165 stdphysio(pecc_p->pecc_csr_pa, l); 166 } 167 168 void 169 pcmu_ecc_enable_intr(pcmu_t *pcmu_p) 170 { 171 pcmu_cb_enable_nintr(pcmu_p, CBNINTR_UE); 172 } 173 174 void 175 pcmu_ecc_disable_wait(pcmu_ecc_t *pecc_p) 176 { 177 pcmu_ecc_disable(pecc_p, PCMU_IB_INTR_WAIT); 178 } 179 180 uint_t 181 pcmu_ecc_disable_nowait(pcmu_ecc_t *pecc_p) 182 { 183 pcmu_ecc_disable(pecc_p, PCMU_IB_INTR_NOWAIT); 184 return (BF_NONE); 185 } 186 187 static void 188 pcmu_ecc_disable(pcmu_ecc_t *pecc_p, int wait) 189 { 190 pcmu_cb_t *pcb_p = pecc_p->pecc_pcmu_p->pcmu_cb_p; 191 uint64_t csr_pa = pecc_p->pecc_csr_pa; 192 uint64_t csr = lddphysio(csr_pa); 193 194 csr &= ~(PCMU_ECC_CTRL_UE_INTEN); 195 stdphysio(csr_pa, csr); 196 pcmu_cb_disable_nintr(pcb_p, CBNINTR_UE, wait); 197 } 198 199 /* 200 * I/O ECC error handling: 201 * 202 * Below are the generic functions that handle detected ECC errors. 203 * 204 * The registered interrupt handler is pcmu_ecc_intr(), it's function 205 * is to receive the error, capture some state, and pass that on to 206 * the pcmu_ecc_err_handler() for reporting purposes. 207 * 208 * pcmu_ecc_err_handler() gathers more state(via pcmu_ecc_errstate_get) 209 * and attempts to handle and report the error. pcmu_ecc_err_handler() 210 * must determine if we need to panic due to this error (via 211 * pcmu_ecc_classify, which also decodes the * ECC afsr), and if any 212 * side effects exist that may have caused or are due * to this error. 213 * PBM errors related to the ECC error may exist, to report 214 * them we call pcmu_pbm_err_handler(). 215 * 216 * To report the error we must also get the syndrome and unum, which can not 217 * be done in high level interrupted context. Therefore we have an error 218 * queue(pcmu_ecc_queue) which we dispatch errors to, to report the errors 219 * (pcmu_ecc_err_drain()). 220 * 221 * pcmu_ecc_err_drain() will be called when either the softint is triggered 222 * or the system is panicing. Either way it will gather more information 223 * about the error from the CPU(via ecc_cpu_call(), ecc.c), attempt to 224 * retire the faulty page(if error is a UE), and report the detected error. 225 * 226 */ 227 228 /* 229 * Function used to get ECC AFSR register 230 */ 231 static uint64_t 232 pcmu_ecc_read_afsr(pcmu_ecc_intr_info_t *ecc_ii_p) 233 { 234 ASSERT(ecc_ii_p->pecc_type == CBNINTR_UE); 235 return (lddphysio(ecc_ii_p->pecc_afsr_pa)); 236 } 237 238 /* 239 * IO detected ECC error interrupt handler, calls pcmu_ecc_err_handler to post 240 * error reports and handle the interrupt. Re-entry into pcmu_ecc_err_handler 241 * is protected by the per-chip mutex pcmu_err_mutex. 242 */ 243 uint_t 244 pcmu_ecc_intr(caddr_t a) 245 { 246 pcmu_ecc_intr_info_t *ecc_ii_p = (pcmu_ecc_intr_info_t *)a; 247 pcmu_ecc_t *pecc_p = ecc_ii_p->pecc_p; 248 pcmu_t *pcmu_p = pecc_p->pecc_pcmu_p; 249 pcmu_ecc_errstate_t ecc_err; 250 int ret = DDI_FM_OK; 251 252 bzero(&ecc_err, sizeof (pcmu_ecc_errstate_t)); 253 ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1); /* RAGS */ 254 ecc_err.ecc_ii_p = *ecc_ii_p; 255 ecc_err.pecc_p = pecc_p; 256 ecc_err.ecc_caller = PCI_ECC_CALL; 257 258 mutex_enter(&pcmu_p->pcmu_err_mutex); 259 ret = pcmu_ecc_err_handler(&ecc_err); 260 mutex_exit(&pcmu_p->pcmu_err_mutex); 261 if (ret == DDI_FM_FATAL) { 262 /* 263 * Need delay here to allow CPUs to handle related traps, 264 * such as FRUs for USIIIi systems. 265 */ 266 DELAY(pcmu_pecc_panic_delay); 267 cmn_err(CE_PANIC, "Fatal PCI UE Error"); 268 } 269 270 return (DDI_INTR_CLAIMED); 271 } 272 273 /* 274 * Function used to gather IO ECC error state. 275 */ 276 static void 277 pcmu_ecc_errstate_get(pcmu_ecc_errstate_t *ecc_err_p) 278 { 279 pcmu_ecc_t *pecc_p; 280 uint_t bus_id; 281 282 ASSERT(ecc_err_p); 283 284 pecc_p = ecc_err_p->ecc_ii_p.pecc_p; 285 bus_id = pecc_p->pecc_pcmu_p->pcmu_id; 286 287 ASSERT(MUTEX_HELD(&pecc_p->pecc_pcmu_p->pcmu_err_mutex)); 288 /* 289 * Read the fault registers. 290 */ 291 ecc_err_p->ecc_afsr = pcmu_ecc_read_afsr(&ecc_err_p->ecc_ii_p); 292 ecc_err_p->ecc_afar = lddphysio(ecc_err_p->ecc_ii_p.pecc_afar_pa); 293 294 ecc_err_p->ecc_offset = ((ecc_err_p->ecc_afsr & 295 ecc_err_p->ecc_ii_p.pecc_offset_mask) >> 296 ecc_err_p->ecc_ii_p.pecc_offset_shift) << 297 ecc_err_p->ecc_ii_p.pecc_size_log2; 298 299 ecc_err_p->ecc_aflt.flt_id = gethrtime(); 300 ecc_err_p->ecc_aflt.flt_stat = ecc_err_p->ecc_afsr; 301 ecc_err_p->ecc_aflt.flt_addr = P2ALIGN(ecc_err_p->ecc_afar, 64) + 302 ecc_err_p->ecc_offset; 303 ecc_err_p->ecc_aflt.flt_bus_id = bus_id; 304 ecc_err_p->ecc_aflt.flt_inst = 0; 305 ecc_err_p->ecc_aflt.flt_status = ECC_IOBUS; 306 ecc_err_p->ecc_aflt.flt_in_memory = 0; 307 ecc_err_p->ecc_aflt.flt_class = BUS_FAULT; 308 } 309 310 /* 311 * pcmu_ecc_check: Called by pcmu_ecc_err_handler() this function is responsible 312 * for calling pcmu_pbm_err_handler() and calling their children error 313 * handlers(via ndi_fm_handler_dispatch()). 314 */ 315 static int 316 pcmu_ecc_check(pcmu_ecc_t *pecc_p, uint64_t fme_ena) 317 { 318 ddi_fm_error_t derr; 319 int ret; 320 pcmu_t *pcmu_p; 321 322 323 ASSERT(MUTEX_HELD(&pecc_p->pecc_pcmu_p->pcmu_err_mutex)); 324 325 bzero(&derr, sizeof (ddi_fm_error_t)); 326 derr.fme_version = DDI_FME_VERSION; 327 derr.fme_ena = fme_ena; 328 ret = DDI_FM_NONFATAL; 329 330 /* 331 * Need to report any PBM errors which may have caused or 332 * resulted from this error. 333 */ 334 pcmu_p = pecc_p->pecc_pcmu_p; 335 if (pcmu_pbm_err_handler(pcmu_p->pcmu_dip, &derr, (void *)pcmu_p, 336 PCI_ECC_CALL) == DDI_FM_FATAL) 337 ret = DDI_FM_FATAL; 338 339 if (ret == DDI_FM_FATAL) 340 return (DDI_FM_FATAL); 341 else 342 return (DDI_FM_NONFATAL); 343 } 344 345 /* 346 * Function used to handle and log IO detected ECC errors, can be called by 347 * pcmu_ecc_intr and pcmu_err_callback(trap callback). Protected by 348 * pcmu_err_mutex. 349 */ 350 int 351 pcmu_ecc_err_handler(pcmu_ecc_errstate_t *ecc_err_p) 352 { 353 /* LINTED variable */ 354 uint64_t pri_err, sec_err; 355 pcmu_ecc_intr_info_t *ecc_ii_p = &ecc_err_p->ecc_ii_p; 356 pcmu_ecc_t *pecc_p = ecc_ii_p->pecc_p; 357 /* LINTED variable */ 358 pcmu_t *pcmu_p; 359 pcmu_cb_t *pcb_p; 360 int fatal = 0; 361 int nonfatal = 0; 362 363 ASSERT(MUTEX_HELD(&pecc_p->pecc_pcmu_p->pcmu_err_mutex)); 364 365 pcmu_p = pecc_p->pecc_pcmu_p; 366 pcb_p = pecc_p->pecc_pcmu_p->pcmu_cb_p; 367 368 pcmu_ecc_errstate_get(ecc_err_p); 369 pri_err = (ecc_err_p->ecc_afsr >> PCMU_ECC_UE_AFSR_PE_SHIFT) & 370 PCMU_ECC_UE_AFSR_E_MASK; 371 372 sec_err = (ecc_err_p->ecc_afsr >> PCMU_ECC_UE_AFSR_SE_SHIFT) & 373 PCMU_ECC_UE_AFSR_E_MASK; 374 375 switch (ecc_ii_p->pecc_type) { 376 case CBNINTR_UE: 377 if (pri_err) { 378 ecc_err_p->ecc_aflt.flt_synd = 0; 379 ecc_err_p->pecc_pri = 1; 380 pcmu_ecc_classify(pri_err, ecc_err_p); 381 errorq_dispatch(pcmu_ecc_queue, (void *)ecc_err_p, 382 sizeof (pcmu_ecc_errstate_t), 383 ecc_err_p->ecc_aflt.flt_panic); 384 } 385 if (sec_err) { 386 pcmu_ecc_errstate_t ecc_sec_err; 387 388 ecc_sec_err = *ecc_err_p; 389 ecc_sec_err.pecc_pri = 0; 390 pcmu_ecc_classify(sec_err, &ecc_sec_err); 391 pcmu_ecc_ereport_post(pcmu_p->pcmu_dip, 392 &ecc_sec_err); 393 } 394 /* 395 * Check for PCI bus errors that may have resulted from or 396 * caused this UE. 397 */ 398 if (ecc_err_p->ecc_caller == PCI_ECC_CALL && 399 pcmu_ecc_check(pecc_p, ecc_err_p->ecc_ena) == DDI_FM_FATAL) 400 ecc_err_p->ecc_aflt.flt_panic = 1; 401 402 if (ecc_err_p->ecc_aflt.flt_panic) { 403 /* 404 * Disable all further errors since this will be 405 * treated as a fatal error. 406 */ 407 (void) pcmu_ecc_disable_nowait(pecc_p); 408 fatal++; 409 } 410 break; 411 412 default: 413 return (DDI_FM_OK); 414 } 415 /* Clear the errors */ 416 stdphysio(ecc_ii_p->pecc_afsr_pa, ecc_err_p->ecc_afsr); 417 /* 418 * Clear the interrupt if called by pcmu_ecc_intr and UE error 419 * or if called by pcmu_ecc_intr and CE error and delayed CE 420 * interrupt handling is turned off. 421 */ 422 if (ecc_err_p->ecc_caller == PCI_ECC_CALL && 423 ecc_ii_p->pecc_type == CBNINTR_UE && !fatal) 424 pcmu_cb_clear_nintr(pcb_p, ecc_ii_p->pecc_type); 425 if (!fatal && !nonfatal) 426 return (DDI_FM_OK); 427 else if (fatal) 428 return (DDI_FM_FATAL); 429 return (DDI_FM_NONFATAL); 430 } 431 432 /* 433 * Function used to drain pcmu_ecc_queue, either during panic or after softint 434 * is generated, to log IO detected ECC errors. 435 */ 436 void 437 pcmu_ecc_err_drain(void *not_used, pcmu_ecc_errstate_t *ecc_err) 438 { 439 struct async_flt *ecc = &ecc_err->ecc_aflt; 440 pcmu_t *pcmu_p = ecc_err->pecc_p->pecc_pcmu_p; 441 442 ecc_cpu_call(ecc, ecc_err->ecc_unum, ECC_IO_UE); 443 ecc_err->ecc_err_type = "U"; 444 pcmu_ecc_ereport_post(pcmu_p->pcmu_dip, ecc_err); 445 } 446 447 /* 448 * Function used to post IO detected ECC ereports. 449 */ 450 static void 451 pcmu_ecc_ereport_post(dev_info_t *dip, pcmu_ecc_errstate_t *ecc_err) 452 { 453 char *aux_msg; 454 pcmu_t *pcmu_p; 455 int instance = ddi_get_instance(dip); 456 457 pcmu_p = get_pcmu_soft_state(instance); 458 if (ecc_err->pecc_pri) { 459 aux_msg = "PIO primary uncorrectable error"; 460 } else { 461 aux_msg = "PIO secondary uncorrectable error"; 462 } 463 cmn_err(CE_WARN, "%s %s: %s %s=0x%lx, %s=0x%lx, %s=0x%x", 464 (pcmu_p->pcmu_pcbm_p)->pcbm_nameinst_str, 465 (pcmu_p->pcmu_pcbm_p)->pcbm_nameaddr_str, 466 aux_msg, PCI_ECC_AFSR, ecc_err->ecc_afsr, 467 PCI_ECC_AFAR, ecc_err->ecc_aflt.flt_addr, 468 "portid", ecc_err->ecc_aflt.flt_bus_id); 469 } 470