/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * PCI ECC support
 */

#include <sys/types.h>
#include <sys/systm.h>		/* for strrchr */
#include <sys/kmem.h>
#include <sys/sunddi.h>
#include <sys/intr.h>
#include <sys/async.h>		/* struct async_flt */
#include <sys/ddi_impldefs.h>
#include <sys/machsystm.h>
#include <sys/sysmacros.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/fm/io/pci.h>
#include <sys/fm/io/sun4upci.h>
#include <sys/fm/io/ddi.h>
#include <sys/pci/pci_obj.h>	/* ld/st physio */
#include <sys/cpuvar.h>
#include <sys/errclassify.h>

/*LINTLIBRARY*/

static void ecc_disable(ecc_t *, int);
static void ecc_delayed_ce(void *);
static uint64_t ecc_read_afsr(ecc_intr_info_t *);
static void ecc_ereport_post(dev_info_t *dip, ecc_errstate_t *ecc_err);

/* Argument handed to DELAY() in ecc_intr() before panicking on a fatal UE. */
clock_t pci_ecc_panic_delay = 200;
int ecc_ce_delay_secs = 6;	/* number of sec to delay reenabling of CEs */
int ecc_ce_delayed = 1;		/* global for enabling/disabling CE delay */

/*
 * Allocate and initialize the ECC state for the given PCI nexus instance.
 *
 * A zeroed ecc_t is allocated with KM_SLEEP, linked to both pci_p and the
 * common (per-chip) state, and its UE and CE interrupt-info members are
 * tagged with their interrupt type.  After the chip-specific pci_ecc_setup()
 * hook runs, the register addresses are derived from the control block base,
 * the hardware is programmed through ecc_configure(), and
 * ecc_disable_nowait() is registered as a BF_TYPE_ERRDIS bus function.
 */
void
ecc_create(pci_t *pci_p)
{
#ifdef DEBUG
	dev_info_t *dip = pci_p->pci_dip;
#endif
	uint64_t cb_base_pa = pci_p->pci_cb_p->cb_base_pa;
	ecc_t *ecc_p;

	ecc_p = (ecc_t *)kmem_zalloc(sizeof (ecc_t), KM_SLEEP);
	ecc_p->ecc_pci_cmn_p = pci_p->pci_common_p;
	pci_p->pci_ecc_p = ecc_p;

	ecc_p->ecc_ue.ecc_p = ecc_p;
	ecc_p->ecc_ue.ecc_type = CBNINTR_UE;
	ecc_p->ecc_ce.ecc_p = ecc_p;
	ecc_p->ecc_ce.ecc_type = CBNINTR_CE;

	pci_ecc_setup(ecc_p);

	/*
	 * Compute the physical addresses of the ECC control/status
	 * register and of the UE and CE AFSR/AFAR registers as offsets
	 * from the control block base address.
	 */
	ecc_p->ecc_csr_pa = cb_base_pa + COMMON_ECC_CSR_OFFSET;
	ecc_p->ecc_ue.ecc_afsr_pa = cb_base_pa + COMMON_UE_AFSR_OFFSET;
	ecc_p->ecc_ue.ecc_afar_pa = cb_base_pa + COMMON_UE_AFAR_OFFSET;
	ecc_p->ecc_ce.ecc_afsr_pa = cb_base_pa + COMMON_CE_AFSR_OFFSET;
	ecc_p->ecc_ce.ecc_afar_pa = cb_base_pa + COMMON_CE_AFAR_OFFSET;

	DEBUG1(DBG_ATTACH, dip, "ecc_create: csr=%x\n", ecc_p->ecc_csr_pa);
	DEBUG2(DBG_ATTACH, dip, "ecc_create: ue_afsr=%x, ue_afar=%x\n",
	    ecc_p->ecc_ue.ecc_afsr_pa, ecc_p->ecc_ue.ecc_afar_pa);
	DEBUG2(DBG_ATTACH, dip, "ecc_create: ce_afsr=%x, ce_afar=%x\n",
	    ecc_p->ecc_ce.ecc_afsr_pa, ecc_p->ecc_ce.ecc_afar_pa);

	ecc_configure(pci_p);

	/*
	 * Register routines to be called from system error handling code.
	 */
	bus_func_register(BF_TYPE_ERRDIS, (busfunc_t)ecc_disable_nowait, ecc_p);
}

/*
 * Install the UE and CE interrupt handlers for this instance.
 *
 * Returns DDI_SUCCESS when both handlers were added, otherwise the failing
 * return value of pci_ecc_add_intr().
 *
 * NOTE(review): if adding the CE handler fails, the UE handler added just
 * before it is not removed here; presumably the caller tears it down --
 * confirm against the attach error path.
 */
int
ecc_register_intr(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;
	int ret;

	/*
	 * Install the UE and CE error interrupt handlers.
	 */
	if ((ret = pci_ecc_add_intr(pci_p, CBNINTR_UE, &ecc_p->ecc_ue)) !=
	    DDI_SUCCESS)
		return (ret);
	if ((ret = pci_ecc_add_intr(pci_p, CBNINTR_CE, &ecc_p->ecc_ce)) !=
	    DDI_SUCCESS)
		return (ret);

	return (DDI_SUCCESS);
}

/*
 * Tear down the ECC state created by ecc_create(): disable the ECC
 * interrupts (waiting variant), remove both handlers, unregister the
 * BF_TYPE_ERRDIS bus function, cancel the delayed-CE timeout, free the
 * ecc_t and clear pci_p->pci_ecc_p.
 */
void
ecc_destroy(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;

	DEBUG0(DBG_DETACH, pci_p->pci_dip, "ecc_destroy:\n");

	/*
	 * Disable UE and CE ECC error interrupts.
	 */
	ecc_disable_wait(ecc_p);

	/*
	 * Remove the ECC interrupt handlers.
	 */
	pci_ecc_rem_intr(pci_p, CBNINTR_UE, &ecc_p->ecc_ue);
	pci_ecc_rem_intr(pci_p, CBNINTR_CE, &ecc_p->ecc_ce);

	/*
	 * Unregister our error handling functions.
	 */
	bus_func_unregister(BF_TYPE_ERRDIS,
	    (busfunc_t)ecc_disable_nowait, ecc_p);
	/*
	 * If a timer has been set, unset it.  The call is unconditional;
	 * ecc_to_id is 0 whenever no delayed-CE timeout is outstanding
	 * (zeroed at allocation and reset by ecc_delayed_ce()).
	 */
	(void) untimeout(ecc_p->ecc_to_id);

	kmem_free(ecc_p, sizeof (ecc_t));
	pci_p->pci_ecc_p = NULL;
}

/*
 * Program the ECC hardware: write the primary and secondary error bits to
 * the UE and CE AFSRs to clear anything pending, then write the control
 * register with ECC checking enabled.  The UE/CE interrupt-enable bits are
 * only set when the ecc_error_intr_enable tunable is non-zero.
 */
void
ecc_configure(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;
	dev_info_t *dip = pci_p->pci_dip;
	uint64_t l;

	/*
	 * Clear any pending ECC errors.
	 */
	DEBUG0(DBG_ATTACH, dip, "ecc_configure: clearing UE and CE errors\n");
	l = (COMMON_ECC_UE_AFSR_E_MASK << COMMON_ECC_UE_AFSR_PE_SHIFT) |
	    (COMMON_ECC_UE_AFSR_E_MASK << COMMON_ECC_UE_AFSR_SE_SHIFT);
	stdphysio(ecc_p->ecc_ue.ecc_afsr_pa, l);

	l = (COMMON_ECC_CE_AFSR_E_MASK << COMMON_ECC_CE_AFSR_PE_SHIFT) |
	    (COMMON_ECC_CE_AFSR_E_MASK << COMMON_ECC_CE_AFSR_SE_SHIFT);
	stdphysio(ecc_p->ecc_ce.ecc_afsr_pa, l);

	/*
	 * Enable ECC error detections via the control register.
	 */
	DEBUG0(DBG_ATTACH, dip, "ecc_configure: enabling UE CE detection\n");
	l = COMMON_ECC_CTRL_ECC_EN;
	if (ecc_error_intr_enable)
		l |= COMMON_ECC_CTRL_UE_INTEN | COMMON_ECC_CTRL_CE_INTEN;
	stdphysio(ecc_p->ecc_csr_pa, l);
}

/*
 * Enable the UE and CE interrupts through cb_enable_nintr().
 */
void
ecc_enable_intr(pci_t *pci_p)
{
	cb_enable_nintr(pci_p, CBNINTR_UE);
	cb_enable_nintr(pci_p, CBNINTR_CE);
}

/*
 * Disable ECC interrupts, passing IB_INTR_WAIT to the control block code.
 */
void
ecc_disable_wait(ecc_t *ecc_p)
{
	ecc_disable(ecc_p, IB_INTR_WAIT);
}

/*
 * Disable ECC interrupts, passing IB_INTR_NOWAIT to the control block code.
 * Also registered (cast to busfunc_t) as the BF_TYPE_ERRDIS bus function,
 * hence the uint_t return; it always returns BF_NONE.
 */
uint_t
ecc_disable_nowait(ecc_t *ecc_p)
{
	ecc_disable(ecc_p, IB_INTR_NOWAIT);
	return (BF_NONE);
}

/*
 * Common worker for ecc_disable_wait()/ecc_disable_nowait(): clear the UE
 * and CE interrupt-enable bits in the ECC control register (read-modify-
 * write, leaving the other bits untouched) and then disable both interrupts
 * in the control block, forwarding the caller's wait flag.
 */
static void
ecc_disable(ecc_t *ecc_p, int wait)
{
	cb_t *cb_p = ecc_p->ecc_pci_cmn_p->pci_common_cb_p;
	uint64_t csr_pa = ecc_p->ecc_csr_pa;
	uint64_t csr = lddphysio(csr_pa);

	csr &= ~(COMMON_ECC_CTRL_UE_INTEN | COMMON_ECC_CTRL_CE_INTEN);
	stdphysio(csr_pa, csr);

	cb_disable_nintr(cb_p, CBNINTR_UE, wait);
	cb_disable_nintr(cb_p, CBNINTR_CE, wait);
}

/*
 * I/O ECC error handling:
 *
 * Below are the generic functions that handle PCI(pcisch, pcipsy) detected
 * ECC errors.
 *
 * The registered interrupt handler for both pcisch and pcipsy is ecc_intr();
 * its function is to receive the error, capture some state, and pass that on
 * to ecc_err_handler() for reporting purposes.
 *
 * ecc_err_handler() gathers more state (via ecc_errstate_get()) and attempts
 * to handle and report the error.  ecc_err_handler() must determine if we
 * need to panic due to this error (via pci_ecc_classify(), which also
 * decodes the ECC afsr), and if any side effects exist that may have caused
 * or are due to this error.  PBM errors related to the ECC error may exist;
 * to report them ecc_pci_check() calls pci_pbm_err_handler() for each PBM.
 * (Child devices are presumably given the chance to log their PCI errors
 * from within pci_pbm_err_handler(); ndi_fm_handler_dispatch() is not
 * called directly from this file.)
 *
 * To report the error we must also get the syndrome and unum, which can not
 * be done in high-level interrupt context.  Therefore we have an error
 * queue (pci_ecc_queue) to which we dispatch errors so that they can be
 * reported later (ecc_err_drain()).
 *
 * ecc_err_drain() will be called when either the softint is triggered
 * or the system is panicking.  Either way it will gather more information
 * about the error from the CPU (via ecc_cpu_call(), ecc.c), attempt to
 * retire the faulty page (if the error is a UE), and report the detected
 * error.
 *
 * ecc_delayed_ce() is scheduled via timeout() from ecc_err_drain() following
 * the receipt of a CE interrupt.  It will be called after ecc_ce_delay_secs
 * seconds (6 by default) and checks whether any new CEs are present.  If so,
 * it hands them to ecc_err_handler() for logging, and another timeout is set
 * when the queued error is drained.  If no CEs are present then it
 * re-enables CEs by clearing the previous interrupt.  This is to keep the
 * system going in the event of a CE storm.
 */

/*
 * Function used to get ECC AFSR register.
 *
 * When the interrupt info carries no error-pending mask the register is read
 * once.  Otherwise it is re-read up to pci_ecc_afsr_retries times until the
 * pending bits are clear, and the last value read is returned (0 if
 * pci_ecc_afsr_retries is 0, since no read takes place).
 */
static uint64_t
ecc_read_afsr(ecc_intr_info_t *ecc_ii_p)
{
	uint_t i;
	uint64_t afsr = 0ull;

	ASSERT((ecc_ii_p->ecc_type == CBNINTR_UE) ||
	    (ecc_ii_p->ecc_type == CBNINTR_CE));
	if (!ecc_ii_p->ecc_errpndg_mask)
		return (lddphysio(ecc_ii_p->ecc_afsr_pa));

	for (i = 0; i < pci_ecc_afsr_retries; i++) {

		/*
		 * If we timeout, the logging routine will
		 * know because it will see the ERRPNDG bits
		 * set in the AFSR.
		 */
		afsr = lddphysio(ecc_ii_p->ecc_afsr_pa);
		if ((afsr & ecc_ii_p->ecc_errpndg_mask) == 0)
			break;
	}
	return (afsr);
}

/*
 * IO detected ECC error interrupt handler, calls ecc_err_handler to post
 * error reports and handle the interrupt. Re-entry into ecc_err_handler
 * is protected by the per-chip mutex pci_fm_mutex.
 *
 * The argument is the ecc_intr_info_t (UE or CE) the handler was registered
 * with.  A private, zeroed ecc_errstate_t holding a copy of that info and a
 * freshly generated ENA is passed to the handler.  If the handler reports
 * DDI_FM_FATAL the system is panicked after a short delay; otherwise the
 * interrupt is always claimed.
 */
uint_t
ecc_intr(caddr_t a)
{
	ecc_intr_info_t *ecc_ii_p = (ecc_intr_info_t *)a;
	ecc_t *ecc_p = ecc_ii_p->ecc_p;
	pci_common_t *cmn_p = ecc_p->ecc_pci_cmn_p;
	ecc_errstate_t ecc_err;
	int ret = DDI_FM_OK;

	bzero(&ecc_err, sizeof (ecc_errstate_t));
	ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1);
	ecc_err.ecc_ii_p = *ecc_ii_p;
	ecc_err.ecc_p = ecc_p;
	ecc_err.ecc_caller = PCI_ECC_CALL;

	mutex_enter(&cmn_p->pci_fm_mutex);
	ret = ecc_err_handler(&ecc_err);
	mutex_exit(&cmn_p->pci_fm_mutex);
	if (ret == DDI_FM_FATAL) {
		/*
		 * Need delay here to allow CPUs to handle related traps,
		 * such as FRUs for USIIIi systems.
		 */
		DELAY(pci_ecc_panic_delay);
		fm_panic("Fatal PCI UE Error");
	}

	return (DDI_INTR_CLAIMED);
}

/*
 * Function used to gather IO ECC error state.
 *
 * Reads the AFSR and AFAR for the interrupt type recorded in ecc_err_p and
 * fills in the embedded async_flt.  The caller must hold pci_fm_mutex.
 */
static void
ecc_errstate_get(ecc_errstate_t *ecc_err_p)
{
	ecc_t *ecc_p;
	uint_t bus_id;

	ASSERT(ecc_err_p);

	ecc_p = ecc_err_p->ecc_ii_p.ecc_p;
	bus_id = ecc_p->ecc_pci_cmn_p->pci_common_id;

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));
	/*
	 * Read the fault registers.
	 */
	ecc_err_p->ecc_afsr = ecc_read_afsr(&ecc_err_p->ecc_ii_p);
	ecc_err_p->ecc_afar = lddphysio(ecc_err_p->ecc_ii_p.ecc_afar_pa);

	/*
	 * Extract the offset field from the AFSR and scale it by the
	 * per-type size (2^ecc_size_log2).
	 */
	ecc_err_p->ecc_offset = ((ecc_err_p->ecc_afsr &
	    ecc_err_p->ecc_ii_p.ecc_offset_mask) >>
	    ecc_err_p->ecc_ii_p.ecc_offset_shift) <<
	    ecc_err_p->ecc_ii_p.ecc_size_log2;

	ecc_err_p->ecc_aflt.flt_id = gethrtime();
	ecc_err_p->ecc_aflt.flt_stat = ecc_err_p->ecc_afsr;
	/* Fault address: AFAR aligned down to 64 bytes plus the offset. */
	ecc_err_p->ecc_aflt.flt_addr = P2ALIGN(ecc_err_p->ecc_afar, 64) +
	    ecc_err_p->ecc_offset;
	ecc_err_p->ecc_aflt.flt_bus_id = bus_id;
	ecc_err_p->ecc_aflt.flt_inst = CPU->cpu_id;
	ecc_err_p->ecc_aflt.flt_status = ECC_IOBUS;
	ecc_err_p->ecc_aflt.flt_in_memory = (pf_is_memory
	    (ecc_err_p->ecc_afar >> MMU_PAGESHIFT))? 1: 0;
	ecc_err_p->ecc_aflt.flt_class = BUS_FAULT;
}

/*
 * ecc_pci_check: Called by ecc_err_handler(), this function is responsible
 * for calling pci_pbm_err_handler() for both sides of the schizo/psycho.
 *
 * Returns DDI_FM_FATAL if any PBM handler reported a fatal error, otherwise
 * DDI_FM_NONFATAL.  The caller must hold pci_fm_mutex.
 *
 * NOTE(review): child error handlers are not dispatched from here; if they
 * are meant to run (via ndi_fm_handler_dispatch()) that presumably happens
 * inside pci_pbm_err_handler() -- confirm.
 */
static int
ecc_pci_check(ecc_t *ecc_p, uint64_t fme_ena)
{
	ddi_fm_error_t derr;
	int i;
	int ret;

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));

	bzero(&derr, sizeof (ddi_fm_error_t));
	derr.fme_version = DDI_FME_VERSION;
	derr.fme_ena = fme_ena;
	ret = DDI_FM_NONFATAL;

	/*
	 * Need to report any PBM errors which may have caused or
	 * resulted from this error.
	 *
	 * Each psycho or schizo is represented by a pair of pci nodes
	 * in the device tree.
	 */
	for (i = 0; i < 2; i++) {
		dev_info_t *dip;
		pci_t *pci_p;

		/* Make sure PBM PCI node exists */
		pci_p = ecc_p->ecc_pci_cmn_p->pci_p[i];
		if (pci_p == NULL)
			continue;

		dip = pci_p->pci_dip;
		if (pci_pbm_err_handler(dip, &derr, (void *)pci_p,
		    PCI_ECC_CALL) == DDI_FM_FATAL)
			ret = DDI_FM_FATAL;
	}
	if (ret == DDI_FM_FATAL)
		return (DDI_FM_FATAL);
	else
		return (DDI_FM_NONFATAL);
}

/*
 * Function used to handle and log IO detected ECC errors, can be called by
 * ecc_intr and pci_err_callback(trap callback). Protected by pci_fm_mutex.
 *
 * Primary errors are dispatched to pci_ecc_queue for later draining, while
 * secondary errors are posted immediately through ecc_ereport_post().
 *
 * Returns DDI_FM_FATAL if a UE is to be treated as fatal, DDI_FM_NONFATAL if
 * a CE was handled, and DDI_FM_OK otherwise (including an unknown interrupt
 * type, in which case nothing is cleared).
 *
 * NOTE(review): the primary/secondary error fields are extracted with the
 * COMMON_ECC_UE_AFSR_* shift and mask macros for both UE and CE; presumably
 * the CE AFSR uses the same layout -- confirm against the register
 * definitions.
 * NOTE(review): a UE that is not fatal does not bump "nonfatal", so it is
 * reported to the caller as DDI_FM_OK -- confirm that this is intended.
 */
int
ecc_err_handler(ecc_errstate_t *ecc_err_p)
{
	uint64_t pri_err, sec_err;
	ecc_intr_info_t *ecc_ii_p = &ecc_err_p->ecc_ii_p;
	ecc_t *ecc_p = ecc_ii_p->ecc_p;
	pci_t *pci_p;
	cb_t *cb_p;
	int fatal = 0;
	int nonfatal = 0;

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));

	/* Use whichever side of the chip has an attached pci node. */
	pci_p = ecc_p->ecc_pci_cmn_p->pci_p[0];
	if (pci_p == NULL)
		pci_p = ecc_p->ecc_pci_cmn_p->pci_p[1];

	cb_p = ecc_p->ecc_pci_cmn_p->pci_common_cb_p;

	ecc_errstate_get(ecc_err_p);
	pri_err = (ecc_err_p->ecc_afsr >> COMMON_ECC_UE_AFSR_PE_SHIFT) &
	    COMMON_ECC_UE_AFSR_E_MASK;

	sec_err = (ecc_err_p->ecc_afsr >> COMMON_ECC_UE_AFSR_SE_SHIFT) &
	    COMMON_ECC_UE_AFSR_E_MASK;

	switch (ecc_ii_p->ecc_type) {
	case CBNINTR_UE:
		if (pri_err) {
			ecc_err_p->ecc_aflt.flt_synd =
			    pci_ecc_get_synd(ecc_err_p->ecc_afsr);
			ecc_err_p->ecc_pri = 1;
			pci_ecc_classify(pri_err, ecc_err_p);
			/*
			 * The flt_panic value left by pci_ecc_classify()
			 * is passed as the errorq dispatch flag.
			 */
			errorq_dispatch(pci_ecc_queue, (void *)ecc_err_p,
			    sizeof (ecc_errstate_t),
			    ecc_err_p->ecc_aflt.flt_panic);
		}
		if (sec_err) {
			ecc_errstate_t ecc_sec_err;
			uint64_t sec_tmp;
			int i;
			uint64_t afsr_err[] = {COMMON_ECC_UE_AFSR_E_PIO,
			    COMMON_ECC_UE_AFSR_E_DRD,
			    COMMON_ECC_UE_AFSR_E_DWR};

			ecc_sec_err = *ecc_err_p;
			ecc_sec_err.ecc_pri = 0;
			/*
			 * Secondary errors are cumulative so we need to loop
			 * through to capture them all.
			 */
			for (i = 0; i < 3; i++) {
				sec_tmp = sec_err & afsr_err[i];
				if (sec_tmp) {
					pci_ecc_classify(sec_tmp, &ecc_sec_err);
					ecc_ereport_post(pci_p->pci_dip,
					    &ecc_sec_err);
				}
			}
		}
		/*
		 * Check for PCI bus errors that may have resulted from or
		 * caused this UE.
		 */
		if (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
		    ecc_pci_check(ecc_p, ecc_err_p->ecc_ena) == DDI_FM_FATAL)
			ecc_err_p->ecc_aflt.flt_panic = 1;

		/* Record the fault for the panic code if it hit memory. */
		if (ecc_err_p->ecc_aflt.flt_panic &&
		    ecc_err_p->ecc_aflt.flt_in_memory)
			panic_aflt = ecc_err_p->ecc_aflt;

		if (ecc_err_p->ecc_aflt.flt_panic) {
			/*
			 * Disable all further errors since this will be
			 * treated as a fatal error.
			 */
			(void) ecc_disable_nowait(ecc_p);
			fatal++;
		}
		break;

	case CBNINTR_CE:
		if (pri_err) {
			ecc_err_p->ecc_pri = 1;
			pci_ecc_classify(pri_err, ecc_err_p);
			ecc_err_p->ecc_aflt.flt_synd =
			    pci_ecc_get_synd(ecc_err_p->ecc_afsr);
			ce_scrub(&ecc_err_p->ecc_aflt);
			errorq_dispatch(pci_ecc_queue, (void *)ecc_err_p,
			    sizeof (ecc_errstate_t), ERRORQ_ASYNC);
			nonfatal++;
		}
		if (sec_err) {
			ecc_errstate_t ecc_sec_err;

			ecc_sec_err = *ecc_err_p;
			ecc_sec_err.ecc_pri = 0;
			pci_ecc_classify(sec_err, &ecc_sec_err);
			ecc_ereport_post(pci_p->pci_dip, &ecc_sec_err);
			nonfatal++;
		}
		break;

	default:
		return (DDI_FM_OK);
	}
	/* Clear the errors by writing the captured AFSR value back. */
	stdphysio(ecc_ii_p->ecc_afsr_pa, ecc_err_p->ecc_afsr);
	/*
	 * Clear the interrupt if called by ecc_intr and non-fatal UE error,
	 * or if called by ecc_intr and CE error and delayed CE interrupt
	 * handling is turned off.  (With delayed handling on, the CE
	 * interrupt is cleared later by ecc_delayed_ce().)
	 */
	if ((ecc_err_p->ecc_caller == PCI_ECC_CALL &&
	    ecc_ii_p->ecc_type == CBNINTR_UE && !fatal) ||
	    (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
	    ecc_ii_p->ecc_type == CBNINTR_CE && !ecc_ce_delayed))
		cb_clear_nintr(cb_p, ecc_ii_p->ecc_type);
	if (!fatal && !nonfatal)
		return (DDI_FM_OK);
	else if (fatal)
		return (DDI_FM_FATAL);
	return (DDI_FM_NONFATAL);
}

/*
 * Called from ecc_err_drain below for CBNINTR_CE case.
 *
 * Returns 0, after recording a skip code in flt_disp, when there is no page
 * for the fault address or the page is already retired or deteriorating.
 * Otherwise returns the result of ce_scrub_xdiag_recirc(); the caller treats
 * a nonzero value as "the element was recirculated".
 */
static int
ecc_err_cexdiag(page_t *pp, ecc_errstate_t *ecc_err,
    errorq_elem_t *eqep)
{
	struct async_flt *ecc = &ecc_err->ecc_aflt;

	if (!pp) {
		CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_NOPP);
		return (0);
	} else if (page_isretired(pp) || page_deteriorating(pp)) {
		CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_PAGEDET);
		return (0);
	}

	return (ce_scrub_xdiag_recirc(ecc, pci_ecc_queue, eqep,
	    offsetof(ecc_errstate_t, ecc_aflt)));
}

/*
 * Function used to drain pci_ecc_queue, either during panic or after softint
 * is generated, to log IO detected ECC errors.
 *
 * An element whose class is RECIRC_BUS_FAULT has already been through this
 * function once; it is only reset to BUS_FAULT and posted.  For everything
 * else the CPU code is consulted (ecc_cpu_call()), a UE page may be retired,
 * a CE may arm the delayed-CE timeout and be recirculated for extended
 * diagnosis, and finally the ereport is posted.
 */
/*ARGSUSED*/
void
ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
{
	struct async_flt *ecc = &ecc_err->ecc_aflt;
	pci_t *pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[0];
	page_t *pp;
	int ecc_type = ecc_err->ecc_ii_p.ecc_type;

	if (pci_p == NULL)
		pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[1];

	if (ecc->flt_class == RECIRC_BUS_FAULT) {
		/*
		 * Perform any additional actions that occur after the
		 * ecc_err_cexdiag below and post the ereport.
		 */
		ecc->flt_class = BUS_FAULT;
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		ecc_ereport_post(pci_p->pci_dip, ecc_err);
		return;
	}

	ecc_cpu_call(ecc, ecc_err->ecc_unum, (ecc_type == CBNINTR_UE) ?
	    ECC_IO_UE : ECC_IO_CE);

	pp = page_numtopp_nolock(ecc->flt_addr >> MMU_PAGESHIFT);

	switch (ecc_type) {
	case CBNINTR_UE:
		/* Retire the page only when it exists and ecc_pg_ret is set. */
		if (pp && ecc_err->ecc_pg_ret == 1) {
			page_settoxic(pp, PAGE_IS_FAULTY);
			(void) page_retire(pp, PAGE_IS_TOXIC);
		}
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		break;

	case CBNINTR_CE:
		/*
		 * Setup timeout (if CE detected via interrupt) to
		 * re-enable CE interrupts if no more CEs are detected.
		 * This is to protect against CE storms.  A timeout is
		 * only armed when none is outstanding (ecc_to_id == 0).
		 */
		if (ecc_ce_delayed &&
		    ecc_err->ecc_caller == PCI_ECC_CALL &&
		    ecc_err->ecc_p->ecc_to_id == 0) {
			ecc_err->ecc_p->ecc_to_id = timeout(ecc_delayed_ce,
			    (void *)ecc_err->ecc_p,
			    drv_usectohz((clock_t)ecc_ce_delay_secs *
			    MICROSEC));
		}

		/* ecc_err_cexdiag returns nonzero to recirculate */
		if (CE_XDIAG_EXT_ALG_APPLIED(ecc->flt_disp) &&
		    ecc_err_cexdiag(pp, ecc_err, eqep))
			return;
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		break;
	}

	ecc_ereport_post(pci_p->pci_dip, ecc_err);
}

/*
 * timeout() callback armed by ecc_err_drain() after a CE interrupt; arg is
 * the ecc_t.  Marks the timeout as no longer outstanding and then either
 * re-enables CE interrupts or feeds newly latched CEs to ecc_err_handler().
 *
 * NOTE(review): only the primary-error field of the CE AFSR is examined, and
 * it is extracted with the COMMON_ECC_UE_AFSR_* macros; presumably the CE
 * layout is identical -- confirm.
 */
static void
ecc_delayed_ce(void *arg)
{
	ecc_t *ecc_p = (ecc_t *)arg;
	pci_common_t *cmn_p;
	cb_t *cb_p;

	ASSERT(ecc_p);

	cmn_p = ecc_p->ecc_pci_cmn_p;
	cb_p = cmn_p->pci_common_cb_p;
	/*
	 * If no more CE errors are found then enable interrupts(by
	 * clearing the previous interrupt), else send in for logging
	 * and the timeout should be set again.
	 */
	ecc_p->ecc_to_id = 0;
	if (!((ecc_read_afsr(&ecc_p->ecc_ce) >>
	    COMMON_ECC_UE_AFSR_PE_SHIFT) & COMMON_ECC_UE_AFSR_E_MASK)) {
		cb_clear_nintr(cb_p, ecc_p->ecc_ce.ecc_type);
	} else {
		ecc_errstate_t ecc_err;

		/* Build the same error state ecc_intr() would have built. */
		bzero(&ecc_err, sizeof (ecc_errstate_t));
		ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1);
		ecc_err.ecc_ii_p = ecc_p->ecc_ce;
		ecc_err.ecc_p = ecc_p;
		ecc_err.ecc_caller = PCI_ECC_CALL;

		mutex_enter(&cmn_p->pci_fm_mutex);
		(void) ecc_err_handler(&ecc_err);
		mutex_exit(&cmn_p->pci_fm_mutex);
	}
}

/*
 * Function used to post IO detected ECC ereports.
 *
 * The ereport class is "<DDI_IO_CLASS>.<bridge type>.<erpt class>".  The
 * report is built in an element reserved from the FM handle's error queue
 * of dip; if no element can be reserved the report is silently dropped.
 * Primary errors carry the full payload including a mem-scheme resource
 * FMRI; secondary errors carry only the AFSR and control register values.
 */
static void
ecc_ereport_post(dev_info_t *dip, ecc_errstate_t *ecc_err)
{
	char buf[FM_MAX_CLASS], dev_path[MAXPATHLEN], *ptr;
	struct i_ddi_fmhdl *fmhdl = DEVI(dip)->devi_fmhdl;
	nvlist_t *ereport, *detector;
	nv_alloc_t *nva;
	errorq_elem_t *eqep;

	/*
	 * We do not use ddi_fm_ereport_post because we need to set a
	 * special detector here. Since we do not have a device path for
	 * the bridge chip we use what we think it should be to aid in
	 * diagnosis: the pathname of dip truncated at its last comma
	 * (see below).
	 * NOTE(review): an earlier version of this comment said the path
	 * fmri is created by pci_fmri_create() during initialization; no
	 * such call is visible in this file -- confirm.
	 */
	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s.%s", DDI_IO_CLASS,
	    ecc_err->ecc_bridge_type, ecc_err->ecc_aflt.flt_erpt_class);

	/* Generate an ENA only if the caller did not supply one. */
	ecc_err->ecc_ena = ecc_err->ecc_ena ? ecc_err->ecc_ena :
	    fm_ena_generate(0, FM_ENA_FMT1);

	eqep = errorq_reserve(fmhdl->fh_errorq);
	if (eqep == NULL)
		return;

	ereport = errorq_elem_nvl(fmhdl->fh_errorq, eqep);
	nva = errorq_elem_nva(fmhdl->fh_errorq, eqep);
	detector = fm_nvlist_create(nva);

	ASSERT(ereport);
	ASSERT(nva);
	ASSERT(detector);

	/* Strip everything from the last ',' of the device path onwards. */
	ddi_pathname(dip, dev_path);
	ptr = strrchr(dev_path, (int)',');

	if (ptr)
		*ptr = '\0';

	fm_fmri_dev_set(detector, FM_DEV_SCHEME_VERSION, NULL, dev_path, NULL);

	if (ecc_err->ecc_pri) {
		/*
		 * NOTE(review): if fm_nvlist_create() fails, ecc_fmri is
		 * NULL and is still passed below as the PCI_ECC_RESOURCE
		 * nvlist -- confirm fm_ereport_set() tolerates that.
		 */
		if ((ecc_err->ecc_fmri = fm_nvlist_create(nva)) != NULL)
			fm_fmri_mem_set(ecc_err->ecc_fmri,
			    FM_MEM_SCHEME_VERSION, NULL, ecc_err->ecc_unum,
			    NULL);

		fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
		    ecc_err->ecc_ena, detector,
		    PCI_ECC_AFSR, DATA_TYPE_UINT64, ecc_err->ecc_afsr,
		    PCI_ECC_AFAR, DATA_TYPE_UINT64, ecc_err->ecc_aflt.flt_addr,
		    PCI_ECC_CTRL, DATA_TYPE_UINT64, ecc_err->ecc_ctrl,
		    PCI_ECC_SYND, DATA_TYPE_UINT16, ecc_err->ecc_aflt.flt_synd,
		    PCI_ECC_TYPE, DATA_TYPE_STRING, ecc_err->ecc_err_type,
		    PCI_ECC_DISP, DATA_TYPE_UINT64, ecc_err->ecc_aflt.flt_disp,
		    PCI_ECC_RESOURCE, DATA_TYPE_NVLIST, ecc_err->ecc_fmri,
		    NULL);
	} else {
		fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
		    ecc_err->ecc_ena, detector,
		    PCI_ECC_AFSR, DATA_TYPE_UINT64, ecc_err->ecc_afsr,
		    PCI_ECC_CTRL, DATA_TYPE_UINT64, ecc_err->ecc_ctrl,
		    NULL);
	}
	errorq_commit(fmhdl->fh_errorq, eqep, ERRORQ_ASYNC);
}