/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * PCI ECC support
 */

#include <sys/types.h>
#include <sys/systm.h>		/* for strrchr */
#include <sys/kmem.h>
#include <sys/sunddi.h>
#include <sys/intr.h>
#include <sys/async.h>		/* struct async_flt */
#include <sys/ddi_impldefs.h>
#include <sys/machsystm.h>
#include <sys/sysmacros.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/fm/io/pci.h>
#include <sys/fm/io/sun4upci.h>
#include <sys/fm/io/ddi.h>
#include <sys/pci/pci_obj.h>	/* ld/st physio */
#include <sys/cpuvar.h>
#include <sys/errclassify.h>
#include <sys/cpu_module.h>

/*LINTLIBRARY*/

static void ecc_disable(ecc_t *, int);
static void ecc_delayed_ce(void *);
static uint64_t ecc_read_afsr(ecc_intr_info_t *);
static void ecc_ereport_post(dev_info_t *dip, ecc_errstate_t *ecc_err);

clock_t pci_ecc_panic_delay = 200;
int ecc_ce_delay_secs = 6;	/* number of seconds to delay re-enabling of CEs */
int ecc_ce_delayed = 1;		/* global for enabling/disabling CE delay */

void
ecc_create(pci_t *pci_p)
{
#ifdef DEBUG
	dev_info_t *dip = pci_p->pci_dip;
#endif
	uint64_t cb_base_pa = pci_p->pci_cb_p->cb_base_pa;
	ecc_t *ecc_p;

	ecc_p = (ecc_t *)kmem_zalloc(sizeof (ecc_t), KM_SLEEP);
	ecc_p->ecc_pci_cmn_p = pci_p->pci_common_p;
	pci_p->pci_ecc_p = ecc_p;

	ecc_p->ecc_ue.ecc_p = ecc_p;
	ecc_p->ecc_ue.ecc_type = CBNINTR_UE;
	ecc_p->ecc_ce.ecc_p = ecc_p;
	ecc_p->ecc_ce.ecc_type = CBNINTR_CE;

	pci_ecc_setup(ecc_p);

	/*
	 * Determine the physical addresses of the ECC control/status
	 * register and the UE/CE AFSR and AFAR registers.
	 */
	ecc_p->ecc_csr_pa = cb_base_pa + COMMON_ECC_CSR_OFFSET;
	ecc_p->ecc_ue.ecc_afsr_pa = cb_base_pa + COMMON_UE_AFSR_OFFSET;
	ecc_p->ecc_ue.ecc_afar_pa = cb_base_pa + COMMON_UE_AFAR_OFFSET;
	ecc_p->ecc_ce.ecc_afsr_pa = cb_base_pa + COMMON_CE_AFSR_OFFSET;
	ecc_p->ecc_ce.ecc_afar_pa = cb_base_pa + COMMON_CE_AFAR_OFFSET;

	DEBUG1(DBG_ATTACH, dip, "ecc_create: csr=%x\n", ecc_p->ecc_csr_pa);
	DEBUG2(DBG_ATTACH, dip, "ecc_create: ue_afsr=%x, ue_afar=%x\n",
	    ecc_p->ecc_ue.ecc_afsr_pa, ecc_p->ecc_ue.ecc_afar_pa);
	DEBUG2(DBG_ATTACH, dip, "ecc_create: ce_afsr=%x, ce_afar=%x\n",
	    ecc_p->ecc_ce.ecc_afsr_pa, ecc_p->ecc_ce.ecc_afar_pa);

	ecc_configure(pci_p);

	/*
	 * Register routines to be called from system error handling code.
	 */
	bus_func_register(BF_TYPE_ERRDIS, (busfunc_t)ecc_disable_nowait, ecc_p);
}
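/*
 * ecc_register_intr: install the UE and CE ECC error interrupt handlers
 * for this host bridge.  Returns DDI_SUCCESS, or the failure code from
 * pci_ecc_add_intr() if either registration fails.
 */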
int
ecc_register_intr(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;
	int ret;

	/*
	 * Install the UE and CE error interrupt handlers.
	 */
	if ((ret = pci_ecc_add_intr(pci_p, CBNINTR_UE, &ecc_p->ecc_ue)) !=
	    DDI_SUCCESS)
		return (ret);
	if ((ret = pci_ecc_add_intr(pci_p, CBNINTR_CE, &ecc_p->ecc_ce)) !=
	    DDI_SUCCESS)
		return (ret);

	return (DDI_SUCCESS);
}

void
ecc_destroy(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;

	DEBUG0(DBG_DETACH, pci_p->pci_dip, "ecc_destroy:\n");

	/*
	 * Disable UE and CE ECC error interrupts.
	 */
	ecc_disable_wait(ecc_p);

	/*
	 * Remove the ECC interrupt handlers.
	 */
	pci_ecc_rem_intr(pci_p, CBNINTR_UE, &ecc_p->ecc_ue);
	pci_ecc_rem_intr(pci_p, CBNINTR_CE, &ecc_p->ecc_ce);

	/*
	 * Unregister our error handling functions.
	 */
	bus_func_unregister(BF_TYPE_ERRDIS,
	    (busfunc_t)ecc_disable_nowait, ecc_p);
	/*
	 * If a timer has been set, unset it.
	 */
	(void) untimeout(ecc_p->ecc_to_id);

	kmem_free(ecc_p, sizeof (ecc_t));
	pci_p->pci_ecc_p = NULL;
}

void
ecc_configure(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;
	dev_info_t *dip = pci_p->pci_dip;
	uint64_t l;

	/*
	 * Clear any pending ECC errors.
	 */
	DEBUG0(DBG_ATTACH, dip, "ecc_configure: clearing UE and CE errors\n");
	l = (COMMON_ECC_UE_AFSR_E_MASK << COMMON_ECC_UE_AFSR_PE_SHIFT) |
	    (COMMON_ECC_UE_AFSR_E_MASK << COMMON_ECC_UE_AFSR_SE_SHIFT);
	stdphysio(ecc_p->ecc_ue.ecc_afsr_pa, l);

	l = (COMMON_ECC_CE_AFSR_E_MASK << COMMON_ECC_CE_AFSR_PE_SHIFT) |
	    (COMMON_ECC_CE_AFSR_E_MASK << COMMON_ECC_CE_AFSR_SE_SHIFT);
	stdphysio(ecc_p->ecc_ce.ecc_afsr_pa, l);

	/*
	 * Enable ECC error detection via the control register.
	 */
	DEBUG0(DBG_ATTACH, dip, "ecc_configure: enabling UE CE detection\n");
	l = COMMON_ECC_CTRL_ECC_EN;
	if (ecc_error_intr_enable)
		l |= COMMON_ECC_CTRL_UE_INTEN | COMMON_ECC_CTRL_CE_INTEN;
	stdphysio(ecc_p->ecc_csr_pa, l);
}

void
ecc_enable_intr(pci_t *pci_p)
{
	cb_enable_nintr(pci_p, CBNINTR_UE);
	cb_enable_nintr(pci_p, CBNINTR_CE);
}

void
ecc_disable_wait(ecc_t *ecc_p)
{
	ecc_disable(ecc_p, IB_INTR_WAIT);
}

uint_t
ecc_disable_nowait(ecc_t *ecc_p)
{
	ecc_disable(ecc_p, IB_INTR_NOWAIT);
	return (BF_NONE);
}

static void
ecc_disable(ecc_t *ecc_p, int wait)
{
	cb_t *cb_p = ecc_p->ecc_pci_cmn_p->pci_common_cb_p;
	uint64_t csr_pa = ecc_p->ecc_csr_pa;
	uint64_t csr = lddphysio(csr_pa);

	csr &= ~(COMMON_ECC_CTRL_UE_INTEN | COMMON_ECC_CTRL_CE_INTEN);
	stdphysio(csr_pa, csr);

	cb_disable_nintr(cb_p, CBNINTR_UE, wait);
	cb_disable_nintr(cb_p, CBNINTR_CE, wait);
}

/*
 * I/O ECC error handling:
 *
 * Below are the generic functions that handle ECC errors detected by the
 * PCI host bridges (pcisch, pcipsy).
 *
 * The registered interrupt handler for both pcisch and pcipsy is ecc_intr();
 * its job is to receive the error, capture some state, and pass that on to
 * ecc_err_handler() for reporting purposes.
 *
 * ecc_err_handler() gathers more state (via ecc_errstate_get()) and attempts
 * to handle and report the error.  ecc_err_handler() must determine if we
 * need to panic due to this error (via pci_ecc_classify(), which also
 * decodes the ECC AFSR), and if any side effects exist that may have caused
 * or are due to this error.  PBM errors related to the ECC error may exist;
 * to report them we call pci_pbm_err_handler(), which in turn calls
 * ndi_fm_handler_dispatch() so that the child devices can log their PCI
 * errors.
 *
 * To report the error we must also get the syndrome and unum, which cannot
 * be done in high-level interrupt context.  Therefore we have an error
 * queue (pci_ecc_queue) to which we dispatch errors; the errors are
 * reported when the queue is drained (ecc_err_drain()).
 *
 * ecc_err_drain() will be called when either the softint is triggered
 * or the system is panicking.  Either way it will gather more information
 * about the error from the CPU (via ecc_cpu_call(), ecc.c), attempt to
 * retire the faulty page (if the error is a UE), and report the detected
 * error.
 *
 * ecc_delayed_ce() is called via a timeout set in ecc_err_drain() following
 * the receipt of a CE interrupt.  It runs after ecc_ce_delay_secs seconds
 * (6 by default) and checks whether any new CEs are present; if so, they
 * are sent back through ecc_err_handler() for logging, and another timeout
 * is armed when the queue drains again.  If no CEs are present, it
 * re-enables CE interrupts by clearing the previous interrupt.  This is to
 * keep the system going in the event of a CE storm.
 */
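/*
 * Compact summary of the interrupt-path call flow described above (see
 * the individual functions for details):
 *
 *	ecc_intr()
 *	    ecc_err_handler()
 *		ecc_errstate_get()	- capture AFSR/AFAR state
 *		pci_ecc_classify()	- decode AFSR, decide whether to panic
 *		ecc_pci_check()		- check both PBMs for related errors
 *		errorq_dispatch()	- queue the error for the drain routine
 *	ecc_err_drain()			- softint/panic-time drain: gather
 *					  syndrome/unum, retire page (UE),
 *					  post ereport, arm CE timeout
 */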
/*
 * Function used to read the ECC AFSR register.
 */
static uint64_t
ecc_read_afsr(ecc_intr_info_t *ecc_ii_p)
{
	uint_t i;
	uint64_t afsr = 0ull;

	ASSERT((ecc_ii_p->ecc_type == CBNINTR_UE) ||
	    (ecc_ii_p->ecc_type == CBNINTR_CE));

	if (!ecc_ii_p->ecc_errpndg_mask)
		return (lddphysio(ecc_ii_p->ecc_afsr_pa));

	for (i = 0; i < pci_ecc_afsr_retries; i++) {

		/*
		 * If we timeout, the logging routine will
		 * know because it will see the ERRPNDG bits
		 * set in the AFSR.
		 */
		afsr = lddphysio(ecc_ii_p->ecc_afsr_pa);
		if ((afsr & ecc_ii_p->ecc_errpndg_mask) == 0)
			break;
	}
	return (afsr);
}

/*
 * IO detected ECC error interrupt handler; calls ecc_err_handler() to post
 * error reports and handle the interrupt.  Re-entry into ecc_err_handler()
 * is protected by the per-chip mutex pci_fm_mutex.
 */
uint_t
ecc_intr(caddr_t a)
{
	ecc_intr_info_t *ecc_ii_p = (ecc_intr_info_t *)a;
	ecc_t *ecc_p = ecc_ii_p->ecc_p;
	pci_common_t *cmn_p = ecc_p->ecc_pci_cmn_p;
	ecc_errstate_t ecc_err;
	int ret = DDI_FM_OK;

	bzero(&ecc_err, sizeof (ecc_errstate_t));
	ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1);
	ecc_err.ecc_ii_p = *ecc_ii_p;
	ecc_err.ecc_p = ecc_p;
	ecc_err.ecc_caller = PCI_ECC_CALL;

	mutex_enter(&cmn_p->pci_fm_mutex);
	ret = ecc_err_handler(&ecc_err);
	mutex_exit(&cmn_p->pci_fm_mutex);
	if (ret == DDI_FM_FATAL) {
		/*
		 * Need delay here to allow CPUs to handle related traps,
		 * such as FRUs for USIIIi systems.
		 */
		DELAY(pci_ecc_panic_delay);
		fm_panic("Fatal PCI UE Error");
	}

	return (DDI_INTR_CLAIMED);
}

/*
 * Function used to gather IO ECC error state.
 */
static void
ecc_errstate_get(ecc_errstate_t *ecc_err_p)
{
	ecc_t *ecc_p;
	uint_t bus_id;

	ASSERT(ecc_err_p);

	ecc_p = ecc_err_p->ecc_ii_p.ecc_p;
	bus_id = ecc_p->ecc_pci_cmn_p->pci_common_id;

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));

	/*
	 * Read the fault registers.
	 */
	ecc_err_p->ecc_afsr = ecc_read_afsr(&ecc_err_p->ecc_ii_p);
	ecc_err_p->ecc_afar = lddphysio(ecc_err_p->ecc_ii_p.ecc_afar_pa);

	ecc_err_p->ecc_offset = ((ecc_err_p->ecc_afsr &
	    ecc_err_p->ecc_ii_p.ecc_offset_mask) >>
	    ecc_err_p->ecc_ii_p.ecc_offset_shift) <<
	    ecc_err_p->ecc_ii_p.ecc_size_log2;

	ecc_err_p->ecc_aflt.flt_id = gethrtime();
	ecc_err_p->ecc_aflt.flt_stat = ecc_err_p->ecc_afsr;
	ecc_err_p->ecc_aflt.flt_addr = P2ALIGN(ecc_err_p->ecc_afar, 64) +
	    ecc_err_p->ecc_offset;
	ecc_err_p->ecc_aflt.flt_bus_id = bus_id;
	ecc_err_p->ecc_aflt.flt_inst = CPU->cpu_id;
	ecc_err_p->ecc_aflt.flt_status = ECC_IOBUS;
	ecc_err_p->ecc_aflt.flt_in_memory =
	    pf_is_memory(ecc_err_p->ecc_afar >> MMU_PAGESHIFT) ? 1 : 0;
	ecc_err_p->ecc_aflt.flt_class = BUS_FAULT;
}
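/*
 * Worked example of the flt_addr computation above, using hypothetical
 * field values (the real offset mask/shift and ecc_size_log2 come from
 * pci_ecc_setup() and differ between psycho and schizo): if the AFAR
 * reads 0x10000028 and the AFSR offset field decodes to 3 with
 * ecc_size_log2 == 3, then ecc_offset = 3 << 3 = 0x18, and
 * flt_addr = P2ALIGN(0x10000028, 64) + 0x18 = 0x10000018, i.e. the
 * failing doubleword within the 64-byte line.
 */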
/*
 * ecc_pci_check: Called by ecc_err_handler(), this function is responsible
 * for calling pci_pbm_err_handler() for both sides of the schizo/psycho
 * and calling their children's error handlers (via
 * ndi_fm_handler_dispatch()).
 */
static int
ecc_pci_check(ecc_t *ecc_p, uint64_t fme_ena)
{
	ddi_fm_error_t derr;
	int i;
	int ret;

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));

	bzero(&derr, sizeof (ddi_fm_error_t));
	derr.fme_version = DDI_FME_VERSION;
	derr.fme_ena = fme_ena;
	ret = DDI_FM_NONFATAL;

	/*
	 * Need to report any PBM errors which may have caused or
	 * resulted from this error.
	 *
	 * Each psycho or schizo is represented by a pair of pci nodes
	 * in the device tree.
	 */
	for (i = 0; i < 2; i++) {
		dev_info_t *dip;
		pci_t *pci_p;

		/* Make sure PBM PCI node exists */
		pci_p = ecc_p->ecc_pci_cmn_p->pci_p[i];
		if (pci_p == NULL)
			continue;

		dip = pci_p->pci_dip;
		if (pci_pbm_err_handler(dip, &derr, (void *)pci_p,
		    PCI_ECC_CALL) == DDI_FM_FATAL)
			ret = DDI_FM_FATAL;
	}
	if (ret == DDI_FM_FATAL)
		return (DDI_FM_FATAL);
	else
		return (DDI_FM_NONFATAL);
}

/*
 * Function used to handle and log IO detected ECC errors; can be called
 * by ecc_intr() or pci_err_callback() (trap callback).  Protected by
 * pci_fm_mutex.
 */
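/*
 * Return value summary for ecc_err_handler() (derived from the code
 * below):
 *
 *	DDI_FM_OK	no recognized primary or secondary error
 *	DDI_FM_FATAL	an error severe enough that the caller must panic
 *	DDI_FM_NONFATAL	errors were logged but the system can continue
 */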
int
ecc_err_handler(ecc_errstate_t *ecc_err_p)
{
	uint64_t pri_err, sec_err;
	ecc_intr_info_t *ecc_ii_p = &ecc_err_p->ecc_ii_p;
	ecc_t *ecc_p = ecc_ii_p->ecc_p;
	pci_t *pci_p;
	cb_t *cb_p;
	int fatal = 0;
	int nonfatal = 0;
	ecc_errstate_t ecc_sec_err;
	uint64_t sec_tmp;
	int i;
	uint64_t afsr_err[] = { COMMON_ECC_AFSR_E_PIO,
	    COMMON_ECC_AFSR_E_DRD,
	    COMMON_ECC_AFSR_E_DWR };

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));

	pci_p = ecc_p->ecc_pci_cmn_p->pci_p[0];
	if (pci_p == NULL)
		pci_p = ecc_p->ecc_pci_cmn_p->pci_p[1];

	cb_p = ecc_p->ecc_pci_cmn_p->pci_common_cb_p;

	ecc_errstate_get(ecc_err_p);
	pri_err = (ecc_err_p->ecc_afsr >> COMMON_ECC_UE_AFSR_PE_SHIFT) &
	    COMMON_ECC_UE_AFSR_E_MASK;

	sec_err = (ecc_err_p->ecc_afsr >> COMMON_ECC_UE_AFSR_SE_SHIFT) &
	    COMMON_ECC_UE_AFSR_E_MASK;

	switch (ecc_ii_p->ecc_type) {
	case CBNINTR_UE:
		if (pri_err) {
			ecc_err_p->ecc_aflt.flt_synd =
			    pci_ecc_get_synd(ecc_err_p->ecc_afsr);
			ecc_err_p->ecc_pri = 1;
			pci_ecc_classify(pri_err, ecc_err_p);
			errorq_dispatch(pci_ecc_queue, (void *)ecc_err_p,
			    sizeof (ecc_errstate_t),
			    ecc_err_p->ecc_aflt.flt_panic);
		}
		if (sec_err) {
			ecc_sec_err = *ecc_err_p;
			ecc_sec_err.ecc_pri = 0;
			/*
			 * Secondary errors are cumulative, so we need to
			 * loop through to capture them all.
			 */
			for (i = 0; i < 3; i++) {
				sec_tmp = sec_err & afsr_err[i];
				if (sec_tmp) {
					pci_ecc_classify(sec_tmp, &ecc_sec_err);
					ecc_ereport_post(pci_p->pci_dip,
					    &ecc_sec_err);
				}
			}
		}
		/*
		 * Check for PCI bus errors that may have resulted from or
		 * caused this UE.
		 */
		if (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
		    ecc_pci_check(ecc_p, ecc_err_p->ecc_ena) == DDI_FM_FATAL)
			ecc_err_p->ecc_aflt.flt_panic = 1;

		if (ecc_err_p->ecc_aflt.flt_panic &&
		    ecc_err_p->ecc_aflt.flt_in_memory)
			panic_aflt = ecc_err_p->ecc_aflt;

		if (ecc_err_p->ecc_aflt.flt_panic) {
			/*
			 * Disable all further errors since this will be
			 * treated as a fatal error.
			 */
			(void) ecc_disable_nowait(ecc_p);
			fatal++;
		}
		break;

	case CBNINTR_CE:
		if (pri_err) {
			ecc_err_p->ecc_pri = 1;
			pci_ecc_classify(pri_err, ecc_err_p);
			ecc_err_p->ecc_aflt.flt_synd =
			    pci_ecc_get_synd(ecc_err_p->ecc_afsr);
			ce_scrub(&ecc_err_p->ecc_aflt);
			errorq_dispatch(pci_ecc_queue, (void *)ecc_err_p,
			    sizeof (ecc_errstate_t), ERRORQ_ASYNC);
			nonfatal++;
		}
		if (sec_err) {
			ecc_sec_err = *ecc_err_p;
			ecc_sec_err.ecc_pri = 0;
			/*
			 * Secondary errors are cumulative, so we need to
			 * loop through to capture them all.
			 */
			for (i = 0; i < 3; i++) {
				sec_tmp = sec_err & afsr_err[i];
				if (sec_tmp) {
					pci_ecc_classify(sec_tmp, &ecc_sec_err);
					ecc_ereport_post(pci_p->pci_dip,
					    &ecc_sec_err);
				}
			}
			nonfatal++;
		}
		break;

	default:
		return (DDI_FM_OK);
	}
	/* Clear the errors */
	stdphysio(ecc_ii_p->ecc_afsr_pa, ecc_err_p->ecc_afsr);
	/*
	 * Clear the interrupt if we were called by ecc_intr() for a
	 * non-fatal UE, or by ecc_intr() for a CE with delayed CE
	 * interrupt handling turned off.
	 */
	if ((ecc_err_p->ecc_caller == PCI_ECC_CALL &&
	    ecc_ii_p->ecc_type == CBNINTR_UE && !fatal) ||
	    (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
	    ecc_ii_p->ecc_type == CBNINTR_CE && !ecc_ce_delayed))
		cb_clear_nintr(cb_p, ecc_ii_p->ecc_type);

	if (!fatal && !nonfatal)
		return (DDI_FM_OK);
	else if (fatal)
		return (DDI_FM_FATAL);
	return (DDI_FM_NONFATAL);
}
/*
 * Called from ecc_err_drain() below for the CBNINTR_CE case.
 */
static int
ecc_err_cexdiag(ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
{
	struct async_flt *ecc = &ecc_err->ecc_aflt;
	uint64_t errors;

	if (page_retire_check(ecc->flt_addr, &errors) == EINVAL) {
		CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_NOPP);
		return (0);
	} else if (errors != PR_OK) {
		CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_PAGEDET);
		return (0);
	} else {
		return (ce_scrub_xdiag_recirc(ecc, pci_ecc_queue, eqep,
		    offsetof(ecc_errstate_t, ecc_aflt)));
	}
}

/*
 * Function used to drain pci_ecc_queue, either during panic or after the
 * softint is generated, to log IO detected ECC errors.
 */
/*ARGSUSED*/
void
ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
{
	struct async_flt *ecc = &ecc_err->ecc_aflt;
	pci_t *pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[0];
	int ecc_type = ecc_err->ecc_ii_p.ecc_type;

	if (pci_p == NULL)
		pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[1];

	if (ecc->flt_class == RECIRC_BUS_FAULT) {
		/*
		 * Perform any additional actions that occur after the
		 * ecc_err_cexdiag below and post the ereport.
		 */
		ecc->flt_class = BUS_FAULT;
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		ecc_ereport_post(pci_p->pci_dip, ecc_err);
		return;
	}

	ecc_cpu_call(ecc, ecc_err->ecc_unum, (ecc_type == CBNINTR_UE) ?
	    ECC_IO_UE : ECC_IO_CE);

	switch (ecc_type) {
	case CBNINTR_UE:
		if (ecc_err->ecc_pg_ret == 1) {
			(void) page_retire(ecc->flt_addr, PR_UE);
		}
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		break;

	case CBNINTR_CE:
		/*
		 * Set up a timeout (if the CE was detected via interrupt) to
		 * re-enable CE interrupts if no more CEs are detected.
		 * This is to protect against CE storms.
		 */
		if (ecc_ce_delayed &&
		    ecc_err->ecc_caller == PCI_ECC_CALL &&
		    ecc_err->ecc_p->ecc_to_id == 0) {
			ecc_err->ecc_p->ecc_to_id = timeout(ecc_delayed_ce,
			    (void *)ecc_err->ecc_p,
			    drv_usectohz((clock_t)ecc_ce_delay_secs *
			    MICROSEC));
		}

		/* ecc_err_cexdiag returns nonzero to recirculate */
		if (CE_XDIAG_EXT_ALG_APPLIED(ecc->flt_disp) &&
		    ecc_err_cexdiag(ecc_err, eqep))
			return;
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		break;
	}

	ecc_ereport_post(pci_p->pci_dip, ecc_err);
}
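/*
 * ecc_delayed_ce: timeout handler armed by ecc_err_drain() after a CE
 * interrupt.  If no new CE has been latched in the CE AFSR, it clears the
 * previous interrupt, re-enabling CE interrupts; otherwise it runs the
 * new error back through ecc_err_handler(), which ultimately leads to
 * another timeout being armed.
 */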
static void
ecc_delayed_ce(void *arg)
{
	ecc_t *ecc_p = (ecc_t *)arg;
	pci_common_t *cmn_p;
	cb_t *cb_p;

	ASSERT(ecc_p);

	cmn_p = ecc_p->ecc_pci_cmn_p;
	cb_p = cmn_p->pci_common_cb_p;
	/*
	 * If no more CE errors are found, re-enable interrupts (by
	 * clearing the previous interrupt); otherwise send the error
	 * in for logging, and the timeout will be set again.
	 */
	ecc_p->ecc_to_id = 0;
	if (!((ecc_read_afsr(&ecc_p->ecc_ce) >>
	    COMMON_ECC_UE_AFSR_PE_SHIFT) & COMMON_ECC_UE_AFSR_E_MASK)) {
		cb_clear_nintr(cb_p, ecc_p->ecc_ce.ecc_type);
	} else {
		ecc_errstate_t ecc_err;

		bzero(&ecc_err, sizeof (ecc_errstate_t));
		ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1);
		ecc_err.ecc_ii_p = ecc_p->ecc_ce;
		ecc_err.ecc_p = ecc_p;
		ecc_err.ecc_caller = PCI_ECC_CALL;

		mutex_enter(&cmn_p->pci_fm_mutex);
		(void) ecc_err_handler(&ecc_err);
		mutex_exit(&cmn_p->pci_fm_mutex);
	}
}

/*
 * Function used to post IO detected ECC ereports.
 */
static void
ecc_ereport_post(dev_info_t *dip, ecc_errstate_t *ecc_err)
{
	char buf[FM_MAX_CLASS], dev_path[MAXPATHLEN], *ptr;
	struct i_ddi_fmhdl *fmhdl = DEVI(dip)->devi_fmhdl;
	nvlist_t *ereport, *detector;
	nv_alloc_t *nva;
	errorq_elem_t *eqep;

	/*
	 * We do not use ddi_fm_ereport_post() because we need to set a
	 * special detector here.  Since we do not have a device path for
	 * the bridge chip, we use what we think it should be, to aid in
	 * diagnosis.  This path fmri is created by pci_fmri_create()
	 * during initialization.
	 */
	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s.%s", DDI_IO_CLASS,
	    ecc_err->ecc_bridge_type, ecc_err->ecc_aflt.flt_erpt_class);

	ecc_err->ecc_ena = ecc_err->ecc_ena ? ecc_err->ecc_ena :
	    fm_ena_generate(0, FM_ENA_FMT1);

	eqep = errorq_reserve(fmhdl->fh_errorq);
	if (eqep == NULL)
		return;

	ereport = errorq_elem_nvl(fmhdl->fh_errorq, eqep);
	nva = errorq_elem_nva(fmhdl->fh_errorq, eqep);
	detector = fm_nvlist_create(nva);

	ASSERT(ereport);
	ASSERT(nva);
	ASSERT(detector);

	ddi_pathname(dip, dev_path);
	ptr = strrchr(dev_path, (int)',');

	if (ptr)
		*ptr = '\0';

	fm_fmri_dev_set(detector, FM_DEV_SCHEME_VERSION, NULL, dev_path, NULL);

	if (ecc_err->ecc_pri) {
		if ((ecc_err->ecc_fmri = fm_nvlist_create(nva)) != NULL) {
			char sid[DIMM_SERIAL_ID_LEN] = "";
			uint64_t offset = (uint64_t)-1;
			int len;
			int ret;

			ret = cpu_get_mem_sid(ecc_err->ecc_unum, sid,
			    DIMM_SERIAL_ID_LEN, &len);

			if (ret == 0) {
				(void) cpu_get_mem_offset(
				    ecc_err->ecc_aflt.flt_addr, &offset);
			}

			fm_fmri_mem_set(ecc_err->ecc_fmri,
			    FM_MEM_SCHEME_VERSION, NULL, ecc_err->ecc_unum,
			    (ret == 0) ? sid : NULL, offset);
		}
		fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
		    ecc_err->ecc_ena, detector,
		    PCI_ECC_AFSR, DATA_TYPE_UINT64, ecc_err->ecc_afsr,
		    PCI_ECC_AFAR, DATA_TYPE_UINT64, ecc_err->ecc_aflt.flt_addr,
		    PCI_ECC_CTRL, DATA_TYPE_UINT64, ecc_err->ecc_ctrl,
		    PCI_ECC_SYND, DATA_TYPE_UINT16, ecc_err->ecc_aflt.flt_synd,
		    PCI_ECC_TYPE, DATA_TYPE_STRING, ecc_err->ecc_err_type,
		    PCI_ECC_DISP, DATA_TYPE_UINT64, ecc_err->ecc_aflt.flt_disp,
		    PCI_ECC_RESOURCE, DATA_TYPE_NVLIST, ecc_err->ecc_fmri,
		    NULL);
	} else {
		fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
		    ecc_err->ecc_ena, detector,
		    PCI_ECC_AFSR, DATA_TYPE_UINT64, ecc_err->ecc_afsr,
		    PCI_ECC_CTRL, DATA_TYPE_UINT64, ecc_err->ecc_ctrl,
		    NULL);
	}
	errorq_commit(fmhdl->fh_errorq, eqep, ERRORQ_ASYNC);
}