/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * PCI ECC support
 */

#include <sys/types.h>
#include <sys/systm.h>		/* for strrchr */
#include <sys/kmem.h>
#include <sys/sunddi.h>
#include <sys/intr.h>
#include <sys/async.h>		/* struct async_flt */
#include <sys/ddi_impldefs.h>
#include <sys/machsystm.h>
#include <sys/sysmacros.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/fm/io/pci.h>
#include <sys/fm/io/sun4upci.h>
#include <sys/fm/io/ddi.h>
#include <sys/pci/pci_obj.h>	/* ld/st physio */
#include <sys/cpuvar.h>
#include <sys/errclassify.h>
#include <sys/cpu_module.h>

/*LINTLIBRARY*/

static void ecc_disable(ecc_t *, int);
static void ecc_delayed_ce(void *);
static uint64_t ecc_read_afsr(ecc_intr_info_t *);
static void ecc_ereport_post(dev_info_t *dip, ecc_errstate_t *ecc_err);

clock_t pci_ecc_panic_delay = 200;
int ecc_ce_delay_secs = 6;	/* number of seconds to delay re-enabling CEs */
int ecc_ce_delayed = 1;		/* global for enabling/disabling CE delay */
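
/*
 * Allocate and initialize the ECC support state for this PCI node: wire
 * up the UE and CE interrupt info structures, record the physical
 * addresses of the ECC control/status and AFSR/AFAR registers, enable
 * error detection, and register ecc_disable_nowait() so the system
 * error handling code can shut ECC error interrupts off.
 */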
void
ecc_create(pci_t *pci_p)
{
#ifdef DEBUG
	dev_info_t *dip = pci_p->pci_dip;
#endif
	uint64_t cb_base_pa = pci_p->pci_cb_p->cb_base_pa;
	ecc_t *ecc_p;

	ecc_p = (ecc_t *)kmem_zalloc(sizeof (ecc_t), KM_SLEEP);
	ecc_p->ecc_pci_cmn_p = pci_p->pci_common_p;
	pci_p->pci_ecc_p = ecc_p;

	ecc_p->ecc_ue.ecc_p = ecc_p;
	ecc_p->ecc_ue.ecc_type = CBNINTR_UE;
	ecc_p->ecc_ce.ecc_p = ecc_p;
	ecc_p->ecc_ce.ecc_type = CBNINTR_CE;

	pci_ecc_setup(ecc_p);

	/*
	 * Determine the physical addresses of the ECC control/status
	 * register and the UE and CE AFSR/AFAR registers.
	 */
	ecc_p->ecc_csr_pa = cb_base_pa + COMMON_ECC_CSR_OFFSET;
	ecc_p->ecc_ue.ecc_afsr_pa = cb_base_pa + COMMON_UE_AFSR_OFFSET;
	ecc_p->ecc_ue.ecc_afar_pa = cb_base_pa + COMMON_UE_AFAR_OFFSET;
	ecc_p->ecc_ce.ecc_afsr_pa = cb_base_pa + COMMON_CE_AFSR_OFFSET;
	ecc_p->ecc_ce.ecc_afar_pa = cb_base_pa + COMMON_CE_AFAR_OFFSET;

	DEBUG1(DBG_ATTACH, dip, "ecc_create: csr=%x\n", ecc_p->ecc_csr_pa);
	DEBUG2(DBG_ATTACH, dip, "ecc_create: ue_afsr=%x, ue_afar=%x\n",
	    ecc_p->ecc_ue.ecc_afsr_pa, ecc_p->ecc_ue.ecc_afar_pa);
	DEBUG2(DBG_ATTACH, dip, "ecc_create: ce_afsr=%x, ce_afar=%x\n",
	    ecc_p->ecc_ce.ecc_afsr_pa, ecc_p->ecc_ce.ecc_afar_pa);

	ecc_configure(pci_p);

	/*
	 * Register routines to be called from system error handling code.
	 */
	bus_func_register(BF_TYPE_ERRDIS, (busfunc_t)ecc_disable_nowait, ecc_p);
}

int
ecc_register_intr(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;
	int ret;

	/*
	 * Install the UE and CE error interrupt handlers.
	 */
	if ((ret = pci_ecc_add_intr(pci_p, CBNINTR_UE, &ecc_p->ecc_ue)) !=
	    DDI_SUCCESS)
		return (ret);
	if ((ret = pci_ecc_add_intr(pci_p, CBNINTR_CE, &ecc_p->ecc_ce)) !=
	    DDI_SUCCESS)
		return (ret);

	return (DDI_SUCCESS);
}

void
ecc_destroy(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;

	DEBUG0(DBG_DETACH, pci_p->pci_dip, "ecc_destroy:\n");

	/*
	 * Disable UE and CE ECC error interrupts.
	 */
	ecc_disable_wait(ecc_p);

	/*
	 * Remove the ECC interrupt handlers.
	 */
	pci_ecc_rem_intr(pci_p, CBNINTR_UE, &ecc_p->ecc_ue);
	pci_ecc_rem_intr(pci_p, CBNINTR_CE, &ecc_p->ecc_ce);

	/*
	 * Unregister our error handling functions.
	 */
	bus_func_unregister(BF_TYPE_ERRDIS,
	    (busfunc_t)ecc_disable_nowait, ecc_p);
	/*
	 * If a timer has been set, unset it.
	 */
	(void) untimeout(ecc_p->ecc_to_id);

	kmem_free(ecc_p, sizeof (ecc_t));
	pci_p->pci_ecc_p = NULL;
}

void
ecc_configure(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;
	dev_info_t *dip = pci_p->pci_dip;
	uint64_t l;

	/*
	 * Clear any pending ECC errors.
	 */
	DEBUG0(DBG_ATTACH, dip, "ecc_configure: clearing UE and CE errors\n");
	l = (COMMON_ECC_UE_AFSR_E_MASK << COMMON_ECC_UE_AFSR_PE_SHIFT) |
	    (COMMON_ECC_UE_AFSR_E_MASK << COMMON_ECC_UE_AFSR_SE_SHIFT);
	stdphysio(ecc_p->ecc_ue.ecc_afsr_pa, l);

	l = (COMMON_ECC_CE_AFSR_E_MASK << COMMON_ECC_CE_AFSR_PE_SHIFT) |
	    (COMMON_ECC_CE_AFSR_E_MASK << COMMON_ECC_CE_AFSR_SE_SHIFT);
	stdphysio(ecc_p->ecc_ce.ecc_afsr_pa, l);

	/*
	 * Enable ECC error detection via the control register.
	 */
	DEBUG0(DBG_ATTACH, dip, "ecc_configure: enabling UE CE detection\n");
	l = COMMON_ECC_CTRL_ECC_EN;
	if (ecc_error_intr_enable)
		l |= COMMON_ECC_CTRL_UE_INTEN | COMMON_ECC_CTRL_CE_INTEN;
	stdphysio(ecc_p->ecc_csr_pa, l);
}
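
/*
 * Interrupt enable/disable helpers.  ecc_disable_wait() waits for any
 * pending ECC interrupt to be serviced and is used on the detach path;
 * ecc_disable_nowait() must not block, since it is also run from the
 * BF_TYPE_ERRDIS bus function list while the system is handling an
 * error.
 */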
void
ecc_enable_intr(pci_t *pci_p)
{
	cb_enable_nintr(pci_p, CBNINTR_UE);
	cb_enable_nintr(pci_p, CBNINTR_CE);
}

void
ecc_disable_wait(ecc_t *ecc_p)
{
	ecc_disable(ecc_p, IB_INTR_WAIT);
}

uint_t
ecc_disable_nowait(ecc_t *ecc_p)
{
	ecc_disable(ecc_p, IB_INTR_NOWAIT);
	return (BF_NONE);
}

static void
ecc_disable(ecc_t *ecc_p, int wait)
{
	cb_t *cb_p = ecc_p->ecc_pci_cmn_p->pci_common_cb_p;
	uint64_t csr_pa = ecc_p->ecc_csr_pa;
	uint64_t csr = lddphysio(csr_pa);

	csr &= ~(COMMON_ECC_CTRL_UE_INTEN | COMMON_ECC_CTRL_CE_INTEN);
	stdphysio(csr_pa, csr);

	cb_disable_nintr(cb_p, CBNINTR_UE, wait);
	cb_disable_nintr(cb_p, CBNINTR_CE, wait);
}

/*
 * I/O ECC error handling:
 *
 * Below are the generic functions that handle ECC errors detected by
 * the PCI nexus drivers (pcisch, pcipsy).
 *
 * The registered interrupt handler for both pcisch and pcipsy is
 * ecc_intr(); its function is to receive the error, capture some state,
 * and pass that on to ecc_err_handler() for reporting purposes.
 *
 * ecc_err_handler() gathers more state (via ecc_errstate_get()) and
 * attempts to handle and report the error.  It must determine whether we
 * need to panic due to this error (via pci_ecc_classify(), which also
 * decodes the ECC AFSR), and whether any side effects exist that may have
 * caused or are due to this error.  PBM errors related to the ECC error
 * may exist; to report them we call pci_pbm_err_handler() and, via
 * ndi_fm_handler_dispatch(), the error handlers of the child devices so
 * that they can log their PCI errors.
 *
 * To report the error we must also get the syndrome and unum, which
 * cannot be done in high-level interrupt context.  Therefore we have an
 * error queue (pci_ecc_queue) to which we dispatch errors so that they
 * are reported from ecc_err_drain().
 *
 * ecc_err_drain() will be called either when the softint is triggered
 * or when the system is panicking.  Either way it will gather more
 * information about the error from the CPU (via ecc_cpu_call(), ecc.c),
 * attempt to retire the faulty page (if the error is a UE), and report
 * the detected error.
 *
 * ecc_delayed_ce() is called via timeout(9F), armed by ecc_err_drain(),
 * following the receipt of a CE interrupt.  It runs after
 * ecc_ce_delay_secs (six seconds by default) and checks whether any new
 * CEs are present; if so, the error is logged and another timeout will be
 * armed when it is drained.  If no CEs are present, it re-enables CEs by
 * clearing the previous interrupt.  This is to keep the system going in
 * the event of a CE storm.
 */
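
/*
 * For context, pci_ecc_queue itself is created by the PCI nexus common
 * code rather than in this file.  A minimal sketch of such a creation
 * call follows (the queue length and interrupt level shown here are
 * illustrative placeholders, not the actual values used):
 *
 *	pci_ecc_queue = errorq_create("pci_ecc_queue",
 *	    (errorq_func_t)ecc_err_drain, NULL, qlen,
 *	    sizeof (ecc_errstate_t), ipl, ERRORQ_VITAL);
 *
 * ERRORQ_VITAL marks the queue as one that must also be drained during
 * panic, matching the panic-time behavior described above.
 */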

/*
 * Function used to get ECC AFSR register
 */
static uint64_t
ecc_read_afsr(ecc_intr_info_t *ecc_ii_p)
{
	uint_t i;
	uint64_t afsr = 0ull;

	ASSERT((ecc_ii_p->ecc_type == CBNINTR_UE) ||
	    (ecc_ii_p->ecc_type == CBNINTR_CE));
	if (!ecc_ii_p->ecc_errpndg_mask)
		return (lddphysio(ecc_ii_p->ecc_afsr_pa));

	for (i = 0; i < pci_ecc_afsr_retries; i++) {

		/*
		 * If we timeout, the logging routine will
		 * know because it will see the ERRPNDG bits
		 * set in the AFSR.
		 */
		afsr = lddphysio(ecc_ii_p->ecc_afsr_pa);
		if ((afsr & ecc_ii_p->ecc_errpndg_mask) == 0)
			break;
	}
	return (afsr);
}

/*
 * IO detected ECC error interrupt handler, calls ecc_err_handler() to post
 * error reports and handle the interrupt.  Re-entry into ecc_err_handler()
 * is protected by the per-chip mutex pci_fm_mutex.
 */
uint_t
ecc_intr(caddr_t a)
{
	ecc_intr_info_t *ecc_ii_p = (ecc_intr_info_t *)a;
	ecc_t *ecc_p = ecc_ii_p->ecc_p;
	pci_common_t *cmn_p = ecc_p->ecc_pci_cmn_p;
	ecc_errstate_t ecc_err;
	int ret = DDI_FM_OK;

	bzero(&ecc_err, sizeof (ecc_errstate_t));
	ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1);
	ecc_err.ecc_ii_p = *ecc_ii_p;
	ecc_err.ecc_p = ecc_p;
	ecc_err.ecc_caller = PCI_ECC_CALL;

	mutex_enter(&cmn_p->pci_fm_mutex);
	ret = ecc_err_handler(&ecc_err);
	mutex_exit(&cmn_p->pci_fm_mutex);
	if (ret == DDI_FM_FATAL) {
		/*
		 * Need delay here to allow CPUs to handle related traps,
		 * such as FRUs for USIIIi systems.
		 */
		DELAY(pci_ecc_panic_delay);
		fm_panic("Fatal PCI UE Error");
	}

	return (DDI_INTR_CLAIMED);
}

/*
 * Function used to gather IO ECC error state.
 */
static void
ecc_errstate_get(ecc_errstate_t *ecc_err_p)
{
	ecc_t *ecc_p;
	uint_t bus_id;

	ASSERT(ecc_err_p);

	ecc_p = ecc_err_p->ecc_ii_p.ecc_p;
	bus_id = ecc_p->ecc_pci_cmn_p->pci_common_id;

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));
	/*
	 * Read the fault registers.
	 */
	ecc_err_p->ecc_afsr = ecc_read_afsr(&ecc_err_p->ecc_ii_p);
	ecc_err_p->ecc_afar = lddphysio(ecc_err_p->ecc_ii_p.ecc_afar_pa);

	ecc_err_p->ecc_offset = ((ecc_err_p->ecc_afsr &
	    ecc_err_p->ecc_ii_p.ecc_offset_mask) >>
	    ecc_err_p->ecc_ii_p.ecc_offset_shift) <<
	    ecc_err_p->ecc_ii_p.ecc_size_log2;

	ecc_err_p->ecc_aflt.flt_id = gethrtime();
	ecc_err_p->ecc_aflt.flt_stat = ecc_err_p->ecc_afsr;
	ecc_err_p->ecc_aflt.flt_addr = P2ALIGN(ecc_err_p->ecc_afar, 64) +
	    ecc_err_p->ecc_offset;
	ecc_err_p->ecc_aflt.flt_bus_id = bus_id;
	ecc_err_p->ecc_aflt.flt_inst = CPU->cpu_id;
	ecc_err_p->ecc_aflt.flt_status = ECC_IOBUS;
	ecc_err_p->ecc_aflt.flt_in_memory =
	    (pf_is_memory(ecc_err_p->ecc_afar >> MMU_PAGESHIFT)) ? 1 : 0;
	ecc_err_p->ecc_aflt.flt_class = BUS_FAULT;
}
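
/*
 * A note on the address math above: flt_addr is rebuilt from the
 * 64-byte-aligned AFAR plus ecc_offset, which is decoded from the
 * AFSR's offset field.  The offset mask, shift and word-size values
 * are chip-specific and come from the per-chip setup performed when
 * the ecc_t was created (pci_ecc_setup()).
 */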

/*
 * ecc_pci_check: Called by ecc_err_handler(), this function is responsible
 * for calling pci_pbm_err_handler() for both sides of the schizo/psycho
 * and for calling the children's error handlers (via
 * ndi_fm_handler_dispatch()).
 */
static int
ecc_pci_check(ecc_t *ecc_p, uint64_t fme_ena)
{
	ddi_fm_error_t derr;
	int i;
	int ret;

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));

	bzero(&derr, sizeof (ddi_fm_error_t));
	derr.fme_version = DDI_FME_VERSION;
	derr.fme_ena = fme_ena;
	ret = DDI_FM_NONFATAL;

	/*
	 * Need to report any PBM errors which may have caused or
	 * resulted from this error.
	 *
	 * Each psycho or schizo is represented by a pair of pci nodes
	 * in the device tree.
	 */
	for (i = 0; i < 2; i++) {
		dev_info_t *dip;
		pci_t *pci_p;

		/* Make sure PBM PCI node exists */
		pci_p = ecc_p->ecc_pci_cmn_p->pci_p[i];
		if (pci_p == NULL)
			continue;

		dip = pci_p->pci_dip;
		if (pci_pbm_err_handler(dip, &derr, (void *)pci_p,
		    PCI_ECC_CALL) == DDI_FM_FATAL)
			ret = DDI_FM_FATAL;
	}
	if (ret == DDI_FM_FATAL)
		return (DDI_FM_FATAL);
	else
		return (DDI_FM_NONFATAL);
}

/*
 * Function used to handle and log IO detected ECC errors; can be called by
 * ecc_intr() and pci_err_callback() (trap callback).  Protected by
 * pci_fm_mutex.
 */
int
ecc_err_handler(ecc_errstate_t *ecc_err_p)
{
	uint64_t pri_err, sec_err;
	ecc_intr_info_t *ecc_ii_p = &ecc_err_p->ecc_ii_p;
	ecc_t *ecc_p = ecc_ii_p->ecc_p;
	pci_t *pci_p;
	cb_t *cb_p;
	int fatal = 0;
	int nonfatal = 0;

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));

	pci_p = ecc_p->ecc_pci_cmn_p->pci_p[0];
	if (pci_p == NULL)
		pci_p = ecc_p->ecc_pci_cmn_p->pci_p[1];

	cb_p = ecc_p->ecc_pci_cmn_p->pci_common_cb_p;

	ecc_errstate_get(ecc_err_p);
	pri_err = (ecc_err_p->ecc_afsr >> COMMON_ECC_UE_AFSR_PE_SHIFT) &
	    COMMON_ECC_UE_AFSR_E_MASK;

	sec_err = (ecc_err_p->ecc_afsr >> COMMON_ECC_UE_AFSR_SE_SHIFT) &
	    COMMON_ECC_UE_AFSR_E_MASK;

	switch (ecc_ii_p->ecc_type) {
	case CBNINTR_UE:
		if (pri_err) {
			ecc_err_p->ecc_aflt.flt_synd =
			    pci_ecc_get_synd(ecc_err_p->ecc_afsr);
			ecc_err_p->ecc_pri = 1;
			pci_ecc_classify(pri_err, ecc_err_p);
			errorq_dispatch(pci_ecc_queue, (void *)ecc_err_p,
			    sizeof (ecc_errstate_t),
			    ecc_err_p->ecc_aflt.flt_panic);
		}
		if (sec_err) {
			ecc_errstate_t ecc_sec_err;
			uint64_t sec_tmp;
			int i;
			uint64_t afsr_err[] = { COMMON_ECC_UE_AFSR_E_PIO,
			    COMMON_ECC_UE_AFSR_E_DRD,
			    COMMON_ECC_UE_AFSR_E_DWR };

			ecc_sec_err = *ecc_err_p;
			ecc_sec_err.ecc_pri = 0;
			/*
			 * Secondary errors are cumulative, so we need to loop
			 * through to capture them all.
			 */
			for (i = 0; i < 3; i++) {
				sec_tmp = sec_err & afsr_err[i];
				if (sec_tmp) {
					pci_ecc_classify(sec_tmp, &ecc_sec_err);
					ecc_ereport_post(pci_p->pci_dip,
					    &ecc_sec_err);
				}
			}
		}
		/*
		 * Check for PCI bus errors that may have resulted from or
		 * caused this UE.
		 */
		if (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
		    ecc_pci_check(ecc_p, ecc_err_p->ecc_ena) == DDI_FM_FATAL)
			ecc_err_p->ecc_aflt.flt_panic = 1;

		if (ecc_err_p->ecc_aflt.flt_panic &&
		    ecc_err_p->ecc_aflt.flt_in_memory)
			panic_aflt = ecc_err_p->ecc_aflt;

		if (ecc_err_p->ecc_aflt.flt_panic) {
			/*
			 * Disable all further errors since this will be
			 * treated as a fatal error.
			 */
			(void) ecc_disable_nowait(ecc_p);
			fatal++;
		}
		break;

	case CBNINTR_CE:
		if (pri_err) {
			ecc_err_p->ecc_pri = 1;
			pci_ecc_classify(pri_err, ecc_err_p);
			ecc_err_p->ecc_aflt.flt_synd =
			    pci_ecc_get_synd(ecc_err_p->ecc_afsr);
			ce_scrub(&ecc_err_p->ecc_aflt);
			errorq_dispatch(pci_ecc_queue, (void *)ecc_err_p,
			    sizeof (ecc_errstate_t), ERRORQ_ASYNC);
			nonfatal++;
		}
		if (sec_err) {
			ecc_errstate_t ecc_sec_err;

			ecc_sec_err = *ecc_err_p;
			ecc_sec_err.ecc_pri = 0;
			pci_ecc_classify(sec_err, &ecc_sec_err);
			ecc_ereport_post(pci_p->pci_dip, &ecc_sec_err);
			nonfatal++;
		}
		break;

	default:
		return (DDI_FM_OK);
	}
	/* Clear the errors */
	stdphysio(ecc_ii_p->ecc_afsr_pa, ecc_err_p->ecc_afsr);
	/*
	 * Clear the interrupt if called by ecc_intr() for a non-fatal UE,
	 * or if called by ecc_intr() for a CE when delayed CE interrupt
	 * handling is turned off.
	 */
	if ((ecc_err_p->ecc_caller == PCI_ECC_CALL &&
	    ecc_ii_p->ecc_type == CBNINTR_UE && !fatal) ||
	    (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
	    ecc_ii_p->ecc_type == CBNINTR_CE && !ecc_ce_delayed))
		cb_clear_nintr(cb_p, ecc_ii_p->ecc_type);
	if (!fatal && !nonfatal)
		return (DDI_FM_OK);
	else if (fatal)
		return (DDI_FM_FATAL);
	return (DDI_FM_NONFATAL);
}
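
/*
 * CE extended diagnostics: ecc_err_cexdiag() below feeds correctable
 * errors back through ce_scrub_xdiag_recirc() so the CPU module can
 * classify them further.  A nonzero return means the errorq element was
 * recirculated on pci_ecc_queue and will be drained again later; a zero
 * return means a skip code was recorded in flt_disp and the ereport
 * should be posted now.
 */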
/*
 * Called from ecc_err_drain() below for the CBNINTR_CE case.
 */
static int
ecc_err_cexdiag(ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
{
	struct async_flt *ecc = &ecc_err->ecc_aflt;
	uint64_t errors;

	if (page_retire_check(ecc->flt_addr, &errors) == EINVAL) {
		CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_NOPP);
		return (0);
	} else if (errors != PR_OK) {
		CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_PAGEDET);
		return (0);
	} else {
		return (ce_scrub_xdiag_recirc(ecc, pci_ecc_queue, eqep,
		    offsetof(ecc_errstate_t, ecc_aflt)));
	}
}

/*
 * Function used to drain pci_ecc_queue, either during panic or after the
 * softint is generated, to log IO detected ECC errors.
 */
/*ARGSUSED*/
void
ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
{
	struct async_flt *ecc = &ecc_err->ecc_aflt;
	pci_t *pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[0];
	int ecc_type = ecc_err->ecc_ii_p.ecc_type;

	if (pci_p == NULL)
		pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[1];

	if (ecc->flt_class == RECIRC_BUS_FAULT) {
		/*
		 * Perform any additional actions that occur after the
		 * ecc_err_cexdiag below and post the ereport.
		 */
		ecc->flt_class = BUS_FAULT;
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		ecc_ereport_post(pci_p->pci_dip, ecc_err);
		return;
	}

	ecc_cpu_call(ecc, ecc_err->ecc_unum, (ecc_type == CBNINTR_UE) ?
	    ECC_IO_UE : ECC_IO_CE);

	switch (ecc_type) {
	case CBNINTR_UE:
		if (ecc_err->ecc_pg_ret == 1) {
			(void) page_retire(ecc->flt_addr, PR_UE);
		}
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		break;

	case CBNINTR_CE:
		/*
		 * Set up a timeout (if the CE was detected via interrupt)
		 * to re-enable CE interrupts if no more CEs are detected.
		 * This is to protect against CE storms.
		 */
		if (ecc_ce_delayed &&
		    ecc_err->ecc_caller == PCI_ECC_CALL &&
		    ecc_err->ecc_p->ecc_to_id == 0) {
			ecc_err->ecc_p->ecc_to_id = timeout(ecc_delayed_ce,
			    (void *)ecc_err->ecc_p,
			    drv_usectohz((clock_t)ecc_ce_delay_secs *
			    MICROSEC));
		}

		/* ecc_err_cexdiag returns nonzero to recirculate */
		if (CE_XDIAG_EXT_ALG_APPLIED(ecc->flt_disp) &&
		    ecc_err_cexdiag(ecc_err, eqep))
			return;
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		break;
	}

	ecc_ereport_post(pci_p->pci_dip, ecc_err);
}
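
/*
 * timeout(9F) callback armed from ecc_err_drain() above.  ecc_to_id is
 * cleared first so that, if a new CE is found and sent back through
 * ecc_err_handler()/ecc_err_drain(), a fresh timeout can be armed.
 */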
static void
ecc_delayed_ce(void *arg)
{
	ecc_t *ecc_p = (ecc_t *)arg;
	pci_common_t *cmn_p;
	cb_t *cb_p;

	ASSERT(ecc_p);

	cmn_p = ecc_p->ecc_pci_cmn_p;
	cb_p = cmn_p->pci_common_cb_p;
	/*
	 * If no more CE errors are found, re-enable interrupts (by
	 * clearing the previous interrupt); otherwise send the error in
	 * for logging, and the timeout will be set again.
	 */
	ecc_p->ecc_to_id = 0;
	if (!((ecc_read_afsr(&ecc_p->ecc_ce) >>
	    COMMON_ECC_UE_AFSR_PE_SHIFT) & COMMON_ECC_UE_AFSR_E_MASK)) {
		cb_clear_nintr(cb_p, ecc_p->ecc_ce.ecc_type);
	} else {
		ecc_errstate_t ecc_err;

		bzero(&ecc_err, sizeof (ecc_errstate_t));
		ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1);
		ecc_err.ecc_ii_p = ecc_p->ecc_ce;
		ecc_err.ecc_p = ecc_p;
		ecc_err.ecc_caller = PCI_ECC_CALL;

		mutex_enter(&cmn_p->pci_fm_mutex);
		(void) ecc_err_handler(&ecc_err);
		mutex_exit(&cmn_p->pci_fm_mutex);
	}
}

/*
 * Function used to post IO detected ECC ereports.
 */
static void
ecc_ereport_post(dev_info_t *dip, ecc_errstate_t *ecc_err)
{
	char buf[FM_MAX_CLASS], dev_path[MAXPATHLEN], *ptr;
	struct i_ddi_fmhdl *fmhdl = DEVI(dip)->devi_fmhdl;
	nvlist_t *ereport, *detector;
	nv_alloc_t *nva;
	errorq_elem_t *eqep;

	/*
	 * We do not use ddi_fm_ereport_post because we need to set a
	 * special detector here.  Since we do not have a device path for
	 * the bridge chip we use what we think it should be to aid in
	 * diagnosis.  This path fmri is created by pci_fmri_create()
	 * during initialization.
	 */
	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s.%s", DDI_IO_CLASS,
	    ecc_err->ecc_bridge_type, ecc_err->ecc_aflt.flt_erpt_class);

	ecc_err->ecc_ena = ecc_err->ecc_ena ? ecc_err->ecc_ena :
	    fm_ena_generate(0, FM_ENA_FMT1);

	eqep = errorq_reserve(fmhdl->fh_errorq);
	if (eqep == NULL)
		return;

	ereport = errorq_elem_nvl(fmhdl->fh_errorq, eqep);
	nva = errorq_elem_nva(fmhdl->fh_errorq, eqep);
	detector = fm_nvlist_create(nva);

	ASSERT(ereport);
	ASSERT(nva);
	ASSERT(detector);

	ddi_pathname(dip, dev_path);
	ptr = strrchr(dev_path, (int)',');

	if (ptr)
		*ptr = '\0';

	fm_fmri_dev_set(detector, FM_DEV_SCHEME_VERSION, NULL, dev_path, NULL);

	if (ecc_err->ecc_pri) {
		if ((ecc_err->ecc_fmri = fm_nvlist_create(nva)) != NULL) {
			char sid[DIMM_SERIAL_ID_LEN] = "";
			uint64_t offset = (uint64_t)-1;
			int len;
			int ret;

			ret = cpu_get_mem_sid(ecc_err->ecc_unum, sid,
			    DIMM_SERIAL_ID_LEN, &len);

			if (ret == 0) {
				(void) cpu_get_mem_offset(
				    ecc_err->ecc_aflt.flt_addr, &offset);
			}

			fm_fmri_mem_set(ecc_err->ecc_fmri,
			    FM_MEM_SCHEME_VERSION, NULL, ecc_err->ecc_unum,
			    (ret == 0) ? sid : NULL, offset);
		}
		fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
		    ecc_err->ecc_ena, detector,
		    PCI_ECC_AFSR, DATA_TYPE_UINT64, ecc_err->ecc_afsr,
		    PCI_ECC_AFAR, DATA_TYPE_UINT64, ecc_err->ecc_aflt.flt_addr,
		    PCI_ECC_CTRL, DATA_TYPE_UINT64, ecc_err->ecc_ctrl,
		    PCI_ECC_SYND, DATA_TYPE_UINT16, ecc_err->ecc_aflt.flt_synd,
		    PCI_ECC_TYPE, DATA_TYPE_STRING, ecc_err->ecc_err_type,
		    PCI_ECC_DISP, DATA_TYPE_UINT64, ecc_err->ecc_aflt.flt_disp,
		    PCI_ECC_RESOURCE, DATA_TYPE_NVLIST, ecc_err->ecc_fmri,
		    NULL);
	} else {
		fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
		    ecc_err->ecc_ena, detector,
		    PCI_ECC_AFSR, DATA_TYPE_UINT64, ecc_err->ecc_afsr,
		    PCI_ECC_CTRL, DATA_TYPE_UINT64, ecc_err->ecc_ctrl,
		    NULL);
	}
	errorq_commit(fmhdl->fh_errorq, eqep, ERRORQ_ASYNC);
}