1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * CMU-CH ECC support 30 */ 31 32 #include <sys/types.h> 33 #include <sys/systm.h> 34 #include <sys/kmem.h> 35 #include <sys/sunddi.h> 36 #include <sys/intr.h> 37 #include <sys/async.h> 38 #include <sys/ddi_impldefs.h> 39 #include <sys/machsystm.h> 40 #include <sys/sysmacros.h> 41 #include <sys/fm/protocol.h> 42 #include <sys/fm/util.h> 43 #include <sys/fm/io/pci.h> 44 #include <sys/fm/io/sun4upci.h> 45 #include <sys/fm/io/ddi.h> 46 #include <sys/pcicmu/pcicmu.h> 47 48 static void pcmu_ecc_disable(pcmu_ecc_t *, int); 49 static uint64_t pcmu_ecc_read_afsr(pcmu_ecc_intr_info_t *); 50 static void pcmu_ecc_ereport_post(dev_info_t *dip, 51 pcmu_ecc_errstate_t *ecc_err); 52 53 clock_t pcmu_pecc_panic_delay = 200; 54 55 void 56 pcmu_ecc_create(pcmu_t *pcmu_p) 57 { 58 uint64_t pcb_base_pa = pcmu_p->pcmu_cb_p->pcb_base_pa; 59 pcmu_ecc_t *pecc_p; 60 /* LINTED variable */ 61 dev_info_t *dip = pcmu_p->pcmu_dip; 62 63 pecc_p = (pcmu_ecc_t *)kmem_zalloc(sizeof (pcmu_ecc_t), KM_SLEEP); 64 pecc_p->pecc_pcmu_p = pcmu_p; 65 pcmu_p->pcmu_pecc_p = pecc_p; 66 67 pecc_p->pecc_ue.pecc_p = pecc_p; 68 pecc_p->pecc_ue.pecc_type = CBNINTR_UE; 69 70 pcmu_ecc_setup(pecc_p); 71 72 /* 73 * Determine the virtual addresses of the streaming cache 74 * control/status and flush registers. 75 */ 76 pecc_p->pecc_csr_pa = pcb_base_pa + PCMU_ECC_CSR_OFFSET; 77 pecc_p->pecc_ue.pecc_afsr_pa = pcb_base_pa + PCMU_UE_AFSR_OFFSET; 78 pecc_p->pecc_ue.pecc_afar_pa = pcb_base_pa + PCMU_UE_AFAR_OFFSET; 79 80 PCMU_DBG1(PCMU_DBG_ATTACH, dip, "pcmu_ecc_create: csr=%x\n", 81 pecc_p->pecc_csr_pa); 82 PCMU_DBG2(PCMU_DBG_ATTACH, dip, 83 "pcmu_ecc_create: ue_afsr=%x, ue_afar=%x\n", 84 pecc_p->pecc_ue.pecc_afsr_pa, pecc_p->pecc_ue.pecc_afar_pa); 85 86 pcmu_ecc_configure(pcmu_p); 87 88 /* 89 * Register routines to be called from system error handling code. 90 */ 91 bus_func_register(BF_TYPE_ERRDIS, 92 (busfunc_t)pcmu_ecc_disable_nowait, pecc_p); 93 } 94 95 int 96 pcmu_ecc_register_intr(pcmu_t *pcmu_p) 97 { 98 pcmu_ecc_t *pecc_p = pcmu_p->pcmu_pecc_p; 99 int ret; 100 101 /* 102 * Install the UE error interrupt handlers. 103 */ 104 ret = pcmu_ecc_add_intr(pcmu_p, CBNINTR_UE, &pecc_p->pecc_ue); 105 return (ret); 106 } 107 108 void 109 pcmu_ecc_destroy(pcmu_t *pcmu_p) 110 { 111 pcmu_ecc_t *pecc_p = pcmu_p->pcmu_pecc_p; 112 113 PCMU_DBG0(PCMU_DBG_DETACH, pcmu_p->pcmu_dip, "pcmu_ecc_destroy:\n"); 114 115 /* 116 * Disable UE ECC error interrupts. 117 */ 118 pcmu_ecc_disable_wait(pecc_p); 119 120 /* 121 * Remove the ECC interrupt handlers. 122 */ 123 pcmu_ecc_rem_intr(pcmu_p, CBNINTR_UE, &pecc_p->pecc_ue); 124 125 /* 126 * Unregister our error handling functions. 127 */ 128 bus_func_unregister(BF_TYPE_ERRDIS, 129 (busfunc_t)pcmu_ecc_disable_nowait, pecc_p); 130 /* 131 * If a timer has been set, unset it. 132 */ 133 (void) untimeout(pecc_p->pecc_tout_id); 134 kmem_free(pecc_p, sizeof (pcmu_ecc_t)); 135 pcmu_p->pcmu_pecc_p = NULL; 136 } 137 138 void 139 pcmu_ecc_configure(pcmu_t *pcmu_p) 140 { 141 pcmu_ecc_t *pecc_p = pcmu_p->pcmu_pecc_p; 142 uint64_t l; 143 /* LINTED variable */ 144 dev_info_t *dip = pcmu_p->pcmu_dip; 145 146 /* 147 * Clear any pending ECC errors. 148 */ 149 PCMU_DBG0(PCMU_DBG_ATTACH, dip, 150 "pcmu_ecc_configure: clearing UE errors\n"); 151 l = (PCMU_ECC_UE_AFSR_E_MASK << PCMU_ECC_UE_AFSR_PE_SHIFT) | 152 (PCMU_ECC_UE_AFSR_E_MASK << PCMU_ECC_UE_AFSR_SE_SHIFT); 153 stdphysio(pecc_p->pecc_ue.pecc_afsr_pa, l); 154 155 /* 156 * Enable ECC error detections via the control register. 157 */ 158 PCMU_DBG0(PCMU_DBG_ATTACH, dip, 159 "pcmu_ecc_configure: enabling UE detection\n"); 160 l = PCMU_ECC_CTRL_ECC_EN; 161 if (ecc_error_intr_enable) 162 l |= PCMU_ECC_CTRL_UE_INTEN; 163 stdphysio(pecc_p->pecc_csr_pa, l); 164 } 165 166 void 167 pcmu_ecc_enable_intr(pcmu_t *pcmu_p) 168 { 169 pcmu_cb_enable_nintr(pcmu_p, CBNINTR_UE); 170 } 171 172 void 173 pcmu_ecc_disable_wait(pcmu_ecc_t *pecc_p) 174 { 175 pcmu_ecc_disable(pecc_p, PCMU_IB_INTR_WAIT); 176 } 177 178 uint_t 179 pcmu_ecc_disable_nowait(pcmu_ecc_t *pecc_p) 180 { 181 pcmu_ecc_disable(pecc_p, PCMU_IB_INTR_NOWAIT); 182 return (BF_NONE); 183 } 184 185 static void 186 pcmu_ecc_disable(pcmu_ecc_t *pecc_p, int wait) 187 { 188 pcmu_cb_t *pcb_p = pecc_p->pecc_pcmu_p->pcmu_cb_p; 189 uint64_t csr_pa = pecc_p->pecc_csr_pa; 190 uint64_t csr = lddphysio(csr_pa); 191 192 csr &= ~(PCMU_ECC_CTRL_UE_INTEN); 193 stdphysio(csr_pa, csr); 194 pcmu_cb_disable_nintr(pcb_p, CBNINTR_UE, wait); 195 } 196 197 /* 198 * I/O ECC error handling: 199 * 200 * Below are the generic functions that handle detected ECC errors. 201 * 202 * The registered interrupt handler is pcmu_ecc_intr(), it's function 203 * is to receive the error, capture some state, and pass that on to 204 * the pcmu_ecc_err_handler() for reporting purposes. 205 * 206 * pcmu_ecc_err_handler() gathers more state(via pcmu_ecc_errstate_get) 207 * and attempts to handle and report the error. pcmu_ecc_err_handler() 208 * must determine if we need to panic due to this error (via 209 * pcmu_ecc_classify, which also decodes the * ECC afsr), and if any 210 * side effects exist that may have caused or are due * to this error. 211 * PBM errors related to the ECC error may exist, to report 212 * them we call pcmu_pbm_err_handler(). 213 * 214 * To report the error we must also get the syndrome and unum, which can not 215 * be done in high level interrupted context. Therefore we have an error 216 * queue(pcmu_ecc_queue) which we dispatch errors to, to report the errors 217 * (pcmu_ecc_err_drain()). 218 * 219 * pcmu_ecc_err_drain() will be called when either the softint is triggered 220 * or the system is panicing. Either way it will gather more information 221 * about the error from the CPU(via ecc_cpu_call(), ecc.c), attempt to 222 * retire the faulty page(if error is a UE), and report the detected error. 223 * 224 */ 225 226 /* 227 * Function used to get ECC AFSR register 228 */ 229 static uint64_t 230 pcmu_ecc_read_afsr(pcmu_ecc_intr_info_t *ecc_ii_p) 231 { 232 ASSERT(ecc_ii_p->pecc_type == CBNINTR_UE); 233 return (lddphysio(ecc_ii_p->pecc_afsr_pa)); 234 } 235 236 /* 237 * IO detected ECC error interrupt handler, calls pcmu_ecc_err_handler to post 238 * error reports and handle the interrupt. Re-entry into pcmu_ecc_err_handler 239 * is protected by the per-chip mutex pcmu_err_mutex. 240 */ 241 uint_t 242 pcmu_ecc_intr(caddr_t a) 243 { 244 pcmu_ecc_intr_info_t *ecc_ii_p = (pcmu_ecc_intr_info_t *)a; 245 pcmu_ecc_t *pecc_p = ecc_ii_p->pecc_p; 246 pcmu_t *pcmu_p = pecc_p->pecc_pcmu_p; 247 pcmu_ecc_errstate_t ecc_err; 248 int ret = DDI_FM_OK; 249 250 bzero(&ecc_err, sizeof (pcmu_ecc_errstate_t)); 251 ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1); /* RAGS */ 252 ecc_err.ecc_ii_p = *ecc_ii_p; 253 ecc_err.pecc_p = pecc_p; 254 ecc_err.ecc_caller = PCI_ECC_CALL; 255 256 mutex_enter(&pcmu_p->pcmu_err_mutex); 257 ret = pcmu_ecc_err_handler(&ecc_err); 258 mutex_exit(&pcmu_p->pcmu_err_mutex); 259 if (ret == DDI_FM_FATAL) { 260 /* 261 * Need delay here to allow CPUs to handle related traps, 262 * such as FRUs for USIIIi systems. 263 */ 264 DELAY(pcmu_pecc_panic_delay); 265 cmn_err(CE_PANIC, "Fatal PCI UE Error"); 266 } 267 268 return (DDI_INTR_CLAIMED); 269 } 270 271 /* 272 * Function used to gather IO ECC error state. 273 */ 274 static void 275 pcmu_ecc_errstate_get(pcmu_ecc_errstate_t *ecc_err_p) 276 { 277 pcmu_ecc_t *pecc_p; 278 uint_t bus_id; 279 280 ASSERT(ecc_err_p); 281 282 pecc_p = ecc_err_p->ecc_ii_p.pecc_p; 283 bus_id = pecc_p->pecc_pcmu_p->pcmu_id; 284 285 ASSERT(MUTEX_HELD(&pecc_p->pecc_pcmu_p->pcmu_err_mutex)); 286 /* 287 * Read the fault registers. 288 */ 289 ecc_err_p->ecc_afsr = pcmu_ecc_read_afsr(&ecc_err_p->ecc_ii_p); 290 ecc_err_p->ecc_afar = lddphysio(ecc_err_p->ecc_ii_p.pecc_afar_pa); 291 292 ecc_err_p->ecc_offset = ((ecc_err_p->ecc_afsr & 293 ecc_err_p->ecc_ii_p.pecc_offset_mask) >> 294 ecc_err_p->ecc_ii_p.pecc_offset_shift) << 295 ecc_err_p->ecc_ii_p.pecc_size_log2; 296 297 ecc_err_p->ecc_aflt.flt_id = gethrtime(); 298 ecc_err_p->ecc_aflt.flt_stat = ecc_err_p->ecc_afsr; 299 ecc_err_p->ecc_aflt.flt_addr = P2ALIGN(ecc_err_p->ecc_afar, 64) + 300 ecc_err_p->ecc_offset; 301 ecc_err_p->ecc_aflt.flt_bus_id = bus_id; 302 ecc_err_p->ecc_aflt.flt_inst = 0; 303 ecc_err_p->ecc_aflt.flt_status = ECC_IOBUS; 304 ecc_err_p->ecc_aflt.flt_in_memory = 0; 305 ecc_err_p->ecc_aflt.flt_class = BUS_FAULT; 306 } 307 308 /* 309 * pcmu_ecc_check: Called by pcmu_ecc_err_handler() this function is responsible 310 * for calling pcmu_pbm_err_handler() and calling their children error 311 * handlers(via ndi_fm_handler_dispatch()). 312 */ 313 static int 314 pcmu_ecc_check(pcmu_ecc_t *pecc_p, uint64_t fme_ena) 315 { 316 ddi_fm_error_t derr; 317 int ret; 318 pcmu_t *pcmu_p; 319 320 321 ASSERT(MUTEX_HELD(&pecc_p->pecc_pcmu_p->pcmu_err_mutex)); 322 323 bzero(&derr, sizeof (ddi_fm_error_t)); 324 derr.fme_version = DDI_FME_VERSION; 325 derr.fme_ena = fme_ena; 326 ret = DDI_FM_NONFATAL; 327 328 /* 329 * Need to report any PBM errors which may have caused or 330 * resulted from this error. 331 */ 332 pcmu_p = pecc_p->pecc_pcmu_p; 333 if (pcmu_pbm_err_handler(pcmu_p->pcmu_dip, &derr, (void *)pcmu_p, 334 PCI_ECC_CALL) == DDI_FM_FATAL) 335 ret = DDI_FM_FATAL; 336 337 if (ret == DDI_FM_FATAL) 338 return (DDI_FM_FATAL); 339 else 340 return (DDI_FM_NONFATAL); 341 } 342 343 /* 344 * Function used to handle and log IO detected ECC errors, can be called by 345 * pcmu_ecc_intr and pcmu_err_callback(trap callback). Protected by 346 * pcmu_err_mutex. 347 */ 348 int 349 pcmu_ecc_err_handler(pcmu_ecc_errstate_t *ecc_err_p) 350 { 351 /* LINTED variable */ 352 uint64_t pri_err, sec_err; 353 pcmu_ecc_intr_info_t *ecc_ii_p = &ecc_err_p->ecc_ii_p; 354 pcmu_ecc_t *pecc_p = ecc_ii_p->pecc_p; 355 /* LINTED variable */ 356 pcmu_t *pcmu_p; 357 pcmu_cb_t *pcb_p; 358 int fatal = 0; 359 int nonfatal = 0; 360 361 ASSERT(MUTEX_HELD(&pecc_p->pecc_pcmu_p->pcmu_err_mutex)); 362 363 pcmu_p = pecc_p->pecc_pcmu_p; 364 pcb_p = pecc_p->pecc_pcmu_p->pcmu_cb_p; 365 366 pcmu_ecc_errstate_get(ecc_err_p); 367 pri_err = (ecc_err_p->ecc_afsr >> PCMU_ECC_UE_AFSR_PE_SHIFT) & 368 PCMU_ECC_UE_AFSR_E_MASK; 369 370 sec_err = (ecc_err_p->ecc_afsr >> PCMU_ECC_UE_AFSR_SE_SHIFT) & 371 PCMU_ECC_UE_AFSR_E_MASK; 372 373 switch (ecc_ii_p->pecc_type) { 374 case CBNINTR_UE: 375 if (pri_err) { 376 ecc_err_p->ecc_aflt.flt_synd = 0; 377 ecc_err_p->pecc_pri = 1; 378 pcmu_ecc_classify(pri_err, ecc_err_p); 379 errorq_dispatch(pcmu_ecc_queue, (void *)ecc_err_p, 380 sizeof (pcmu_ecc_errstate_t), 381 ecc_err_p->ecc_aflt.flt_panic); 382 } 383 if (sec_err) { 384 pcmu_ecc_errstate_t ecc_sec_err; 385 386 ecc_sec_err = *ecc_err_p; 387 ecc_sec_err.pecc_pri = 0; 388 pcmu_ecc_classify(sec_err, &ecc_sec_err); 389 pcmu_ecc_ereport_post(pcmu_p->pcmu_dip, 390 &ecc_sec_err); 391 } 392 /* 393 * Check for PCI bus errors that may have resulted from or 394 * caused this UE. 395 */ 396 if (ecc_err_p->ecc_caller == PCI_ECC_CALL && 397 pcmu_ecc_check(pecc_p, ecc_err_p->ecc_ena) == DDI_FM_FATAL) 398 ecc_err_p->ecc_aflt.flt_panic = 1; 399 400 if (ecc_err_p->ecc_aflt.flt_panic) { 401 /* 402 * Disable all further errors since this will be 403 * treated as a fatal error. 404 */ 405 (void) pcmu_ecc_disable_nowait(pecc_p); 406 fatal++; 407 } 408 break; 409 410 default: 411 return (DDI_FM_OK); 412 } 413 /* Clear the errors */ 414 stdphysio(ecc_ii_p->pecc_afsr_pa, ecc_err_p->ecc_afsr); 415 /* 416 * Clear the interrupt if called by pcmu_ecc_intr and UE error 417 * or if called by pcmu_ecc_intr and CE error and delayed CE 418 * interrupt handling is turned off. 419 */ 420 if (ecc_err_p->ecc_caller == PCI_ECC_CALL && 421 ecc_ii_p->pecc_type == CBNINTR_UE && !fatal) 422 pcmu_cb_clear_nintr(pcb_p, ecc_ii_p->pecc_type); 423 if (!fatal && !nonfatal) 424 return (DDI_FM_OK); 425 else if (fatal) 426 return (DDI_FM_FATAL); 427 return (DDI_FM_NONFATAL); 428 } 429 430 /* 431 * Function used to drain pcmu_ecc_queue, either during panic or after softint 432 * is generated, to log IO detected ECC errors. 433 */ 434 /* ARGSUSED */ 435 void 436 pcmu_ecc_err_drain(void *not_used, pcmu_ecc_errstate_t *ecc_err) 437 { 438 struct async_flt *ecc = &ecc_err->ecc_aflt; 439 pcmu_t *pcmu_p = ecc_err->pecc_p->pecc_pcmu_p; 440 441 ecc_cpu_call(ecc, ecc_err->ecc_unum, ECC_IO_UE); 442 ecc_err->ecc_err_type = "U"; 443 pcmu_ecc_ereport_post(pcmu_p->pcmu_dip, ecc_err); 444 } 445 446 /* 447 * Function used to post IO detected ECC ereports. 448 */ 449 static void 450 pcmu_ecc_ereport_post(dev_info_t *dip, pcmu_ecc_errstate_t *ecc_err) 451 { 452 char *aux_msg; 453 pcmu_t *pcmu_p; 454 int instance = ddi_get_instance(dip); 455 456 pcmu_p = get_pcmu_soft_state(instance); 457 if (ecc_err->pecc_pri) { 458 aux_msg = "PIO primary uncorrectable error"; 459 } else { 460 aux_msg = "PIO secondary uncorrectable error"; 461 } 462 cmn_err(CE_WARN, "%s %s: %s %s=0x%lx, %s=0x%lx, %s=0x%x", 463 (pcmu_p->pcmu_pcbm_p)->pcbm_nameinst_str, 464 (pcmu_p->pcmu_pcbm_p)->pcbm_nameaddr_str, 465 aux_msg, PCI_ECC_AFSR, ecc_err->ecc_afsr, 466 PCI_ECC_AFAR, ecc_err->ecc_aflt.flt_addr, 467 "portid", ecc_err->ecc_aflt.flt_bus_id); 468 } 469