1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 1990-2002 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* 27 * Copyright 2019 Peter Tribble. 28 */ 29 30 #include <sys/types.h> 31 #include <sys/conf.h> 32 #include <sys/ddi.h> 33 #include <sys/sunddi.h> 34 #include <sys/ddi_impldefs.h> 35 #include <sys/cmn_err.h> 36 #include <sys/async.h> 37 #include <sys/sysiosbus.h> 38 #include <sys/sysioerr.h> 39 #include <sys/x_call.h> 40 #include <sys/machsystm.h> 41 #include <sys/sysmacros.h> 42 #include <sys/vmsystm.h> 43 #include <sys/cpu_module.h> 44 45 /* 46 * Set the following variable in /etc/system to tell the kernel 47 * not to shutdown the machine if the temperature reaches 48 * the Thermal Warning limit. 49 */ 50 int oven_test = 0; 51 52 /* 53 * To indicate if the prom has the property of "thermal-interrupt". 54 */ 55 static int thermal_interrupt_enabled = 0; 56 57 /* 58 * adb debug_sysio_errs to 1 if you don't want your system to panic on 59 * sbus ue errors. adb sysio_err_flag to 0 if you don't want your system 60 * to check for sysio errors at all. 61 */ 62 int sysio_err_flag = 1; 63 uint_t debug_sysio_errs = 0; 64 65 /* 66 * bto_cnt = number of bus errors and timeouts allowed within bto_secs 67 * use /etc/system to change the bto_cnt to a very large number if 68 * it's a problem! 69 */ 70 int bto_secs = 10; 71 int bto_cnt = 10; 72 73 static uint_t 74 sysio_ue_intr(struct sbus_soft_state *softsp); 75 76 static uint_t 77 sysio_ce_intr(struct sbus_soft_state *softsp); 78 79 static uint_t 80 sbus_err_intr(struct sbus_soft_state *softsp); 81 82 static void 83 sysio_log_ce_err(struct async_flt *ecc, char *unum); 84 85 static void 86 sysio_log_ue_err(struct async_flt *ecc, char *unum); 87 88 static void 89 sbus_clear_intr(struct sbus_soft_state *softsp, uint64_t *pafsr); 90 91 static void 92 sbus_log_error(struct sbus_soft_state *softsp, uint64_t *pafsr, uint64_t *pafar, 93 ushort_t id, ushort_t inst, int cleared, 94 on_trap_data_t *ontrap_data); 95 96 static int 97 sbus_check_bto(struct sbus_soft_state *softsp); 98 99 static void 100 sbus_log_csr_error(struct async_flt *aflt, char *unum); 101 102 static uint_t 103 sbus_ctrl_ecc_err(struct sbus_soft_state *softsp); 104 105 static uint_t 106 sysio_dis_err(struct sbus_soft_state *softsp); 107 108 static uint_t 109 sysio_init_err(struct sbus_soft_state *softsp); 110 111 static uint_t 112 sysio_thermal_warn_intr(struct sbus_soft_state *softsp); 113 114 static int sbus_pil[] = {SBUS_UE_PIL, SBUS_CE_PIL, SBUS_ERR_PIL, SBUS_PF_PIL, 115 SBUS_THERMAL_PIL, SBUS_PM_PIL}; 116 int 117 sysio_err_init(struct sbus_soft_state *softsp, caddr_t address) 118 { 119 if (sysio_err_flag == 0) { 120 cmn_err(CE_CONT, "Warning: sysio errors not initialized\n"); 121 return (DDI_SUCCESS); 122 } 123 124 /* 125 * Get the address of the already mapped-in sysio/sbus error registers. 126 * Simply add each registers offset to the already mapped in address 127 * that was retrieved from the device node's "address" property, 128 * and passed as an argument to this function. 129 * 130 * Define a macro for the pointer arithmetic ... 131 */ 132 133 #define REG_ADDR(b, o) (uint64_t *)((caddr_t)(b) + (o)) 134 135 softsp->sysio_ecc_reg = REG_ADDR(address, OFF_SYSIO_ECC_REGS); 136 softsp->sysio_ue_reg = REG_ADDR(address, OFF_SYSIO_UE_REGS); 137 softsp->sysio_ce_reg = REG_ADDR(address, OFF_SYSIO_CE_REGS); 138 softsp->sbus_err_reg = REG_ADDR(address, OFF_SBUS_ERR_REGS); 139 140 #undef REG_ADDR 141 142 /* 143 * create the interrupt-priorities property if it doesn't 144 * already exist to provide a hint as to the PIL level for 145 * our interrupt. 146 */ 147 { 148 int len; 149 150 if (ddi_getproplen(DDI_DEV_T_ANY, softsp->dip, 151 DDI_PROP_DONTPASS, "interrupt-priorities", 152 &len) != DDI_PROP_SUCCESS) { 153 /* Create the interrupt-priorities property. */ 154 (void) ddi_prop_update_int_array(DDI_DEV_T_NONE, 155 softsp->dip, "interrupt-priorities", 156 (int *)sbus_pil, sizeof (sbus_pil) / sizeof (int)); 157 } 158 } 159 160 (void) ddi_add_intr(softsp->dip, 0, NULL, NULL, 161 (uint_t (*)())sysio_ue_intr, (caddr_t)softsp); 162 (void) ddi_add_intr(softsp->dip, 1, NULL, NULL, 163 (uint_t (*)())sysio_ce_intr, (caddr_t)softsp); 164 (void) ddi_add_intr(softsp->dip, 2, NULL, NULL, 165 (uint_t (*)())sbus_err_intr, (caddr_t)softsp); 166 /* 167 * If the thermal-interrupt property is in place, 168 * then register the thermal warning interrupt handler and 169 * program its mapping register 170 */ 171 thermal_interrupt_enabled = ddi_getprop(DDI_DEV_T_ANY, softsp->dip, 172 DDI_PROP_DONTPASS, "thermal-interrupt", -1); 173 174 if (thermal_interrupt_enabled == 1) { 175 (void) ddi_add_intr(softsp->dip, 4, NULL, NULL, 176 (uint_t (*)())sysio_thermal_warn_intr, (caddr_t)softsp); 177 } 178 179 bus_func_register(BF_TYPE_UE, (busfunc_t)sbus_ctrl_ecc_err, softsp); 180 bus_func_register(BF_TYPE_ERRDIS, (busfunc_t)sysio_dis_err, softsp); 181 182 (void) sysio_init_err(softsp); 183 184 return (DDI_SUCCESS); 185 } 186 187 int 188 sysio_err_resume_init(struct sbus_soft_state *softsp) 189 { 190 (void) sysio_init_err(softsp); 191 return (DDI_SUCCESS); 192 } 193 194 int 195 sysio_err_uninit(struct sbus_soft_state *softsp) 196 { 197 /* remove the interrupts from the interrupt list */ 198 (void) sysio_dis_err(softsp); 199 200 ddi_remove_intr(softsp->dip, 0, NULL); 201 ddi_remove_intr(softsp->dip, 1, NULL); 202 ddi_remove_intr(softsp->dip, 2, NULL); 203 204 if (thermal_interrupt_enabled == 1) { 205 ddi_remove_intr(softsp->dip, 4, NULL); 206 } 207 208 bus_func_unregister(BF_TYPE_UE, (busfunc_t)sbus_ctrl_ecc_err, softsp); 209 bus_func_unregister(BF_TYPE_ERRDIS, (busfunc_t)sysio_dis_err, softsp); 210 211 return (DDI_SUCCESS); 212 } 213 214 static uint_t 215 sysio_init_err(struct sbus_soft_state *softsp) 216 { 217 volatile uint64_t tmp_mondo_vec, tmpreg; 218 volatile uint64_t *mondo_vec_reg; 219 uint_t cpu_id, acpu_id; 220 221 acpu_id = intr_dist_cpuid(); 222 /* 223 * Program the mondo vector accordingly. This MUST be the 224 * last thing we do. Once we program the mondo, the device 225 * may begin to interrupt. Store it in the hardware reg. 226 */ 227 mondo_vec_reg = (uint64_t *)(softsp->intr_mapping_reg + UE_ECC_MAPREG); 228 cpu_id = acpu_id; 229 tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) | INTERRUPT_VALID; 230 *mondo_vec_reg = tmp_mondo_vec; 231 232 mondo_vec_reg = (uint64_t *)(softsp->intr_mapping_reg + CE_ECC_MAPREG); 233 cpu_id = acpu_id; 234 tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) | INTERRUPT_VALID; 235 *mondo_vec_reg = tmp_mondo_vec; 236 237 mondo_vec_reg = 238 (uint64_t *)(softsp->intr_mapping_reg + SBUS_ERR_MAPREG); 239 cpu_id = acpu_id; 240 241 tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) | INTERRUPT_VALID; 242 *mondo_vec_reg = tmp_mondo_vec; 243 244 if (thermal_interrupt_enabled == 1) { 245 mondo_vec_reg = (softsp->intr_mapping_reg + THERMAL_MAPREG); 246 cpu_id = acpu_id; 247 tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) | 248 INTERRUPT_VALID; 249 *mondo_vec_reg = tmp_mondo_vec; 250 } 251 252 /* Flush store buffers */ 253 tmpreg = *softsp->sbus_ctrl_reg; 254 255 /* 256 * XXX - This may already be set by the OBP. 257 */ 258 tmpreg = SYSIO_APCKEN; 259 *softsp->sysio_ctrl_reg |= tmpreg; 260 tmpreg = (SECR_ECC_EN | SECR_UE_INTEN | SECR_CE_INTEN); 261 *softsp->sysio_ecc_reg = tmpreg; 262 tmpreg = SB_CSR_ERRINT_EN; 263 *softsp->sbus_err_reg |= tmpreg; 264 265 /* Initialize timeout/bus error counter */ 266 softsp->bto_timestamp = 0; 267 softsp->bto_ctr = 0; 268 269 return (0); 270 } 271 272 static uint_t 273 sysio_dis_err(struct sbus_soft_state *softsp) 274 { 275 volatile uint64_t tmpreg; 276 volatile uint64_t *mondo_vec_reg, *clear_vec_reg; 277 278 *softsp->sysio_ctrl_reg &= ~SYSIO_APCKEN; 279 *softsp->sysio_ecc_reg = 0; 280 *softsp->sbus_err_reg &= ~SB_CSR_ERRINT_EN; 281 282 /* Flush store buffers */ 283 tmpreg = *softsp->sbus_ctrl_reg; 284 #ifdef lint 285 tmpreg = tmpreg; 286 #endif 287 288 /* Unmap mapping registers */ 289 mondo_vec_reg = (softsp->intr_mapping_reg + UE_ECC_MAPREG); 290 clear_vec_reg = (softsp->clr_intr_reg + UE_ECC_CLEAR); 291 292 *mondo_vec_reg = 0; 293 294 *clear_vec_reg = 0; 295 296 mondo_vec_reg = (softsp->intr_mapping_reg + CE_ECC_MAPREG); 297 clear_vec_reg = (softsp->clr_intr_reg + CE_ECC_CLEAR); 298 299 *mondo_vec_reg = 0; 300 301 *clear_vec_reg = 0; 302 303 mondo_vec_reg = (softsp->intr_mapping_reg + SBUS_ERR_MAPREG); 304 clear_vec_reg = (softsp->clr_intr_reg + SBUS_ERR_CLEAR); 305 306 *mondo_vec_reg = 0; 307 308 *clear_vec_reg = 0; 309 310 /* Flush store buffers */ 311 tmpreg = *softsp->sbus_ctrl_reg; 312 313 return (BF_NONE); 314 } 315 316 /* 317 * Gather information about the error into an async_flt structure, and then 318 * enqueue the error for reporting and processing and panic. 319 */ 320 static uint_t 321 sysio_ue_intr(struct sbus_soft_state *softsp) 322 { 323 volatile uint64_t t_afsr; 324 volatile uint64_t t_afar; 325 volatile uint64_t *ue_reg, *afar_reg, *clear_reg; 326 struct async_flt ecc; 327 uint64_t offset; 328 329 /* 330 * Disable all further sbus errors, for this sbus instance, for 331 * what is guaranteed to be a fatal error. And grab any other cpus. 332 */ 333 (void) sysio_dis_err(softsp); /* disabled sysio errors */ 334 335 /* 336 * Then read and clear the afsr/afar and clear interrupt regs. 337 */ 338 ue_reg = (uint64_t *)softsp->sysio_ue_reg; 339 t_afsr = *ue_reg; 340 afar_reg = (uint64_t *)ue_reg + 1; 341 t_afar = *afar_reg; 342 *ue_reg = t_afsr; 343 344 clear_reg = (softsp->clr_intr_reg + UE_ECC_CLEAR); 345 *clear_reg = 0; 346 347 /* 348 * The AFSR DW_OFFSET field contains the offset of the doubleword with 349 * the ECC error relative to the 64-byte aligned PA. We multiply by 8 350 * to convert to a byte offset, and then add this to flt_addr. 351 */ 352 offset = ((t_afsr & SB_UE_AFSR_OFF) >> SB_UE_DW_SHIFT) * 8; 353 354 bzero(&ecc, sizeof (ecc)); 355 ecc.flt_id = gethrtime(); 356 ecc.flt_stat = t_afsr; 357 ecc.flt_addr = P2ALIGN(t_afar, 64) + offset; 358 ecc.flt_func = sysio_log_ue_err; 359 ecc.flt_bus_id = softsp->upa_id; 360 ecc.flt_inst = ddi_get_instance(softsp->dip); 361 ecc.flt_status = ECC_IOBUS; 362 ecc.flt_in_memory = (pf_is_memory(t_afar >> MMU_PAGESHIFT)) ? 1: 0; 363 ecc.flt_class = BUS_FAULT; 364 ecc.flt_panic = (debug_sysio_errs == 0); 365 366 errorq_dispatch(ue_queue, &ecc, sizeof (ecc), ecc.flt_panic); 367 368 /* 369 * If the UE is in memory and fatal, save the fault info so the 370 * panic code will know to check for copyback errors. 371 */ 372 if (ecc.flt_panic && ecc.flt_in_memory) 373 panic_aflt = ecc; 374 375 /* 376 * We must also check for other bus UE errors, and panic if 377 * any fatal ones are detected at this point. 378 */ 379 if (bus_func_invoke(BF_TYPE_UE) == BF_FATAL) 380 ecc.flt_panic = 1; 381 382 if (ecc.flt_panic) 383 cmn_err(CE_PANIC, "Fatal Sbus%d UE Error", ecc.flt_inst); 384 385 return (DDI_INTR_CLAIMED); 386 } 387 388 /* 389 * callback logging function from the common error handling code 390 */ 391 static void 392 sysio_log_ue_err(struct async_flt *ecc, char *unum) 393 { 394 uint64_t t_afsr = ecc->flt_stat; 395 uint64_t t_afar = ecc->flt_addr; 396 397 ushort_t id = ecc->flt_bus_id; 398 ushort_t inst = ecc->flt_inst; 399 400 if (t_afsr & SB_UE_AFSR_P_PIO) { 401 cmn_err(CE_WARN, "SBus%d UE Primary Error from PIO: " 402 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d", 403 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 404 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id); 405 } 406 if (t_afsr & SB_UE_AFSR_P_DRD) { 407 cmn_err(CE_WARN, "SBus%d UE Primary Error DMA read: " 408 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d", 409 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 410 (uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id); 411 } 412 if (t_afsr & SB_UE_AFSR_P_DWR) { 413 cmn_err(CE_WARN, "SBus%d UE Primary Error DVMA write: " 414 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d", 415 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 416 (uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id); 417 } 418 /* 419 * We should never hit the secondary error panics. 420 */ 421 if (t_afsr & SB_UE_AFSR_S_PIO) { 422 cmn_err(CE_WARN, "SBus%d UE Secondary Error from PIO: " 423 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d", 424 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 425 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id); 426 } 427 if (t_afsr & SB_UE_AFSR_S_DRD) { 428 cmn_err(CE_WARN, "SBus%d UE Secondary Error DMA read: " 429 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d", 430 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 431 (uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id); 432 } 433 if (t_afsr & SB_UE_AFSR_S_DWR) { 434 cmn_err(CE_WARN, "SBus%d UE Secondary Error DMA write: " 435 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d", 436 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 437 (uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id); 438 } 439 440 if ((debug_sysio_errs) || (aft_verbose)) { 441 (void) read_ecc_data(ecc, 1, 0); 442 cmn_err(CE_CONT, "\tOffset 0x%x, Size %d, UPA MID 0x%x\n", 443 (uint32_t)((t_afsr & SB_UE_AFSR_OFF) >> SB_UE_DW_SHIFT), 444 (uint32_t)((t_afsr & SB_UE_AFSR_SIZE) >> SB_UE_SIZE_SHIFT), 445 (uint32_t)((t_afsr & SB_UE_AFSR_MID) >> SB_UE_MID_SHIFT)); 446 } 447 } 448 449 /* 450 * gather the information about the error, plus a pointer to 451 * the callback logging function, and call the generic ce_error handler. 452 */ 453 static uint_t 454 sysio_ce_intr(struct sbus_soft_state *softsp) 455 { 456 volatile uint64_t t_afsr; 457 volatile uint64_t t_afar; 458 volatile uint64_t *afar_reg, *clear_reg, *ce_reg; 459 struct async_flt ecc; 460 uint64_t offset; 461 462 ce_reg = (uint64_t *)softsp->sysio_ce_reg; 463 t_afsr = *ce_reg; 464 afar_reg = (uint64_t *)ce_reg + 1; 465 t_afar = *afar_reg; 466 *ce_reg = t_afsr; 467 468 clear_reg = (softsp->clr_intr_reg + CE_ECC_CLEAR); 469 *clear_reg = 0; 470 471 /* 472 * The AFSR DW_OFFSET field contains the offset of the doubleword with 473 * the ECC error relative to the 64-byte aligned PA. We multiply by 8 474 * to convert to a byte offset, and then add this to flt_addr. 475 */ 476 offset = ((t_afsr & SB_UE_AFSR_OFF) >> SB_UE_DW_SHIFT) * 8; 477 478 bzero(&ecc, sizeof (ecc)); 479 ecc.flt_id = gethrtime(); 480 ecc.flt_stat = t_afsr; 481 ecc.flt_addr = P2ALIGN(t_afar, 64) + offset; 482 ecc.flt_func = sysio_log_ce_err; 483 ecc.flt_bus_id = softsp->upa_id; 484 ecc.flt_inst = ddi_get_instance(softsp->dip); 485 ecc.flt_status = ECC_IOBUS; 486 487 ecc.flt_synd = (ushort_t)((t_afsr & SB_CE_AFSR_SYND) >> 488 SB_CE_SYND_SHIFT); 489 490 ecc.flt_in_memory = (pf_is_memory(t_afar >> MMU_PAGESHIFT)) ? 1: 0; 491 ecc.flt_class = BUS_FAULT; 492 493 ce_scrub(&ecc); 494 errorq_dispatch(ce_queue, &ecc, sizeof (ecc), ERRORQ_ASYNC); 495 496 return (DDI_INTR_CLAIMED); 497 } 498 499 /* 500 * callback logging function from the common error handling code 501 */ 502 static void 503 sysio_log_ce_err(struct async_flt *ecc, char *unum) 504 { 505 uint64_t t_afsr = ecc->flt_stat; 506 uint64_t t_afar = ecc->flt_addr; 507 ushort_t id = ecc->flt_bus_id; 508 ushort_t inst = ecc->flt_inst; 509 int ce_verbose = ce_verbose_memory; 510 char *syndrome_str = "!\tSyndrome 0x%x, Offset 0x%x, Size %d, " 511 "UPA MID 0x%x\n"; 512 513 if ((!ce_verbose_memory) && (!debug_sysio_errs)) 514 return; 515 516 if (t_afsr & SB_CE_AFSR_P_PIO) { 517 char *fmtstr = "!SBus%d CE Primary Error from PIO: " 518 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d\n"; 519 520 if ((debug_sysio_errs) || (ce_verbose > 1)) 521 fmtstr++; 522 523 cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32), 524 (uint32_t)t_afsr, (uint32_t)(t_afar>>32), 525 (uint32_t)t_afar, id); 526 } 527 if (t_afsr & SB_CE_AFSR_P_DRD) { 528 char *fmtstr = "!SBus%d CE Primary Error DMA read: " 529 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s " 530 "Id %d\n"; 531 532 if ((debug_sysio_errs) || (ce_verbose > 1)) 533 fmtstr++; 534 535 cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32), 536 (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar, 537 unum, id); 538 } 539 if (t_afsr & SB_CE_AFSR_P_DWR) { 540 char *fmtstr = "!SBus%d CE Primary Error DMA write: " 541 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d\n"; 542 543 if ((debug_sysio_errs) || (ce_verbose > 1)) 544 fmtstr++; 545 546 cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32), 547 (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar, 548 unum, id); 549 } 550 551 if (t_afsr & SB_CE_AFSR_S_PIO) { 552 char *fmtstr = "!SBus%d CE Secondary Error from PIO: " 553 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d\n"; 554 555 if ((debug_sysio_errs) || (ce_verbose > 1)) 556 fmtstr++; 557 558 cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32), 559 (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar, 560 id); 561 } 562 if (t_afsr & SB_CE_AFSR_S_DRD) { 563 char *fmtstr = "!SBus%d CE Secondary Error DMA read: " 564 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s " 565 "Id %d\n"; 566 567 if ((debug_sysio_errs) || (ce_verbose > 1)) 568 fmtstr++; 569 570 cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32), 571 (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar, 572 unum, id); 573 } 574 if (t_afsr & SB_CE_AFSR_S_DWR) { 575 char *fmtstr = "!SBus%d CE Secondary Error DMA write: " 576 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s " 577 "Id %d\n"; 578 579 if ((debug_sysio_errs) || (ce_verbose > 1)) 580 fmtstr++; 581 582 cmn_err(CE_CONT, fmtstr, 583 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 584 (uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id); 585 } 586 587 if ((debug_sysio_errs) || (ce_verbose > 1)) 588 syndrome_str++; 589 590 cmn_err(CE_CONT, syndrome_str, 591 (uint32_t)((t_afsr & SB_CE_AFSR_SYND) >> SB_CE_SYND_SHIFT), 592 (uint32_t)((t_afsr & SB_CE_AFSR_OFF) >> SB_CE_OFFSET_SHIFT), 593 (uint32_t)((t_afsr & SB_CE_AFSR_SIZE) >> SB_CE_SIZE_SHIFT), 594 (uint32_t)((t_afsr & SB_CE_AFSR_MID) >> SB_CE_MID_SHIFT)); 595 } 596 597 static uint_t 598 sbus_err_intr(struct sbus_soft_state *softsp) 599 { 600 volatile uint64_t t_afsr; 601 volatile uint64_t t_afar; 602 ushort_t id, inst; 603 int cleared = 0; 604 volatile uint64_t *afar_reg; 605 on_trap_data_t *otp = softsp->ontrap_data; 606 607 t_afsr = *softsp->sbus_err_reg; 608 afar_reg = (uint64_t *)softsp->sbus_err_reg + 1; 609 t_afar = *afar_reg; 610 611 if (otp == NULL || !(otp->ot_prot & OT_DATA_ACCESS)) { 612 sbus_clear_intr(softsp, (uint64_t *)&t_afsr); 613 cleared = 1; 614 } 615 616 id = (ushort_t)softsp->upa_id; 617 inst = (ushort_t)ddi_get_instance(softsp->dip); 618 619 if (debug_sysio_errs) { 620 if (otp != NULL && (otp->ot_prot & OT_DATA_ACCESS)) 621 otp->ot_trap |= OT_DATA_ACCESS; 622 if (!cleared) 623 sbus_clear_intr(softsp, (uint64_t *)&t_afsr); 624 625 cmn_err(CE_CONT, "SBus%d Error: AFSR 0x%08x.%08x " 626 "AFAR 0x%08x.%08x Id %d\n", 627 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 628 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id); 629 630 debug_enter("sbus_err_intr"); 631 } else { 632 sbus_log_error(softsp, (uint64_t *)&t_afsr, 633 (uint64_t *)&t_afar, id, inst, cleared, otp); 634 } 635 if (!cleared) { 636 sbus_clear_intr(softsp, (uint64_t *)&t_afsr); 637 } 638 639 return (DDI_INTR_CLAIMED); 640 } 641 642 static void 643 sbus_clear_intr(struct sbus_soft_state *softsp, uint64_t *pafsr) 644 { 645 volatile uint64_t *clear_reg; 646 647 *softsp->sbus_err_reg = *pafsr; 648 clear_reg = (softsp->clr_intr_reg + SBUS_ERR_CLEAR); 649 *clear_reg = 0; 650 } 651 652 static void 653 sbus_log_error(struct sbus_soft_state *softsp, uint64_t *pafsr, uint64_t *pafar, 654 ushort_t id, ushort_t inst, int cleared, on_trap_data_t *otp) 655 { 656 uint64_t t_afsr; 657 uint64_t t_afar; 658 int level = CE_WARN; 659 660 t_afsr = *pafsr; 661 t_afar = *pafar; 662 if (t_afsr & SB_AFSR_P_LE) { 663 if (!cleared) 664 sbus_clear_intr(softsp, (uint64_t *)&t_afsr); 665 cmn_err(CE_PANIC, "SBus%d Primary Error Late PIO: " 666 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d", 667 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 668 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id); 669 } 670 if (t_afsr & SB_AFSR_P_TO) { 671 if (otp != NULL && (otp->ot_prot & OT_DATA_ACCESS)) { 672 otp->ot_trap |= OT_DATA_ACCESS; 673 return; 674 } 675 if (sbus_check_bto(softsp)) { 676 if (!cleared) 677 sbus_clear_intr(softsp, (uint64_t *)&t_afsr); 678 level = CE_PANIC; 679 } 680 cmn_err(level, "SBus%d Primary Error Timeout: " 681 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d", 682 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 683 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id); 684 } 685 if (t_afsr & SB_AFSR_P_BERR) { 686 if (otp != NULL && (otp->ot_prot & OT_DATA_ACCESS)) { 687 otp->ot_trap |= OT_DATA_ACCESS; 688 return; 689 } 690 if (sbus_check_bto(softsp)) { 691 if (!cleared) 692 sbus_clear_intr(softsp, (uint64_t *)&t_afsr); 693 level = CE_PANIC; 694 } 695 cmn_err(level, "SBus%d Primary Error Bus Error: " 696 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d\n", 697 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 698 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id); 699 } 700 701 if (t_afsr & SB_AFSR_S_LE) { 702 if (!cleared) 703 sbus_clear_intr(softsp, (uint64_t *)&t_afsr); 704 cmn_err(CE_PANIC, "SBus%d Secondary Late PIO Error: " 705 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d", 706 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 707 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id); 708 } 709 if (t_afsr & SB_AFSR_S_TO) { 710 if (sbus_check_bto(softsp)) { 711 if (!cleared) 712 sbus_clear_intr(softsp, (uint64_t *)&t_afsr); 713 level = CE_PANIC; 714 } 715 cmn_err(level, "SBus%d Secondary Timeout Error: " 716 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d", 717 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 718 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id); 719 } 720 if (t_afsr & SB_AFSR_S_BERR) { 721 if (sbus_check_bto(softsp)) { 722 if (!cleared) 723 sbus_clear_intr(softsp, (uint64_t *)&t_afsr); 724 level = CE_PANIC; 725 } 726 cmn_err(level, "SBus%d Secondary Bus Error: " 727 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d", 728 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr, 729 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id); 730 } 731 } 732 733 734 static int 735 sbus_check_bto(struct sbus_soft_state *softsp) 736 { 737 hrtime_t now = gethrtime(); /* high PIL safe */ 738 hrtime_t diff = now - softsp->bto_timestamp; 739 740 if (diff > ((hrtime_t)bto_secs * NANOSEC) || diff < 0LL) { 741 /* 742 * Reset error counter as this bus error has occurred 743 * after more than bto_secs duration. 744 */ 745 softsp->bto_timestamp = now; 746 softsp->bto_ctr = 0; 747 } 748 if (softsp->bto_ctr++ >= bto_cnt) 749 return (1); 750 return (0); 751 } 752 753 static uint_t 754 sbus_ctrl_ecc_err(struct sbus_soft_state *softsp) 755 { 756 uint64_t t_sb_csr; 757 ushort_t id, inst; 758 759 t_sb_csr = *softsp->sbus_ctrl_reg; 760 id = (ushort_t)softsp->upa_id; 761 inst = (ushort_t)ddi_get_instance(softsp->dip); 762 763 if (debug_sysio_errs) { 764 cmn_err(CE_CONT, "sbus_ctrl_ecc_error: SBus%d Control Reg " 765 "0x%016llx Id %d\n", inst, (u_longlong_t)t_sb_csr, id); 766 } 767 768 if (t_sb_csr & (SB_CSR_DPERR_S14|SB_CSR_DPERR_S13|SB_CSR_DPERR_S3| 769 SB_CSR_DPERR_S2|SB_CSR_DPERR_S1|SB_CSR_DPERR_S0|SB_CSR_PIO_PERRS)) { 770 struct async_flt aflt; 771 772 *softsp->sbus_ctrl_reg = t_sb_csr; /* clear error bits */ 773 774 bzero(&aflt, sizeof (aflt)); 775 aflt.flt_id = gethrtime(); 776 aflt.flt_stat = t_sb_csr; 777 aflt.flt_func = sbus_log_csr_error; 778 aflt.flt_bus_id = id; 779 aflt.flt_inst = inst; 780 aflt.flt_status = ECC_IOBUS; 781 aflt.flt_class = BUS_FAULT; 782 aflt.flt_panic = 1; 783 784 errorq_dispatch(ue_queue, &aflt, sizeof (aflt), aflt.flt_panic); 785 return (BF_FATAL); 786 } 787 788 return (BF_NONE); 789 } 790 791 /*ARGSUSED*/ 792 static void 793 sbus_log_csr_error(struct async_flt *aflt, char *unum) 794 { 795 uint64_t t_sb_csr = aflt->flt_stat; 796 uint_t id = aflt->flt_bus_id; 797 uint_t inst = aflt->flt_inst; 798 799 /* 800 * Print out SBus error information. 801 */ 802 if (t_sb_csr & SB_CSR_DPERR_S14) { 803 cmn_err(CE_WARN, 804 "SBus%d Slot 14 DVMA Parity Error: AFSR 0x%08x.%08x Id %d", 805 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id); 806 } 807 if (t_sb_csr & SB_CSR_DPERR_S13) { 808 cmn_err(CE_WARN, 809 "SBus%d Slot 13 DVMA Parity Error: AFSR 0x%08x.%08x Id %d", 810 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id); 811 } 812 if (t_sb_csr & SB_CSR_DPERR_S3) { 813 cmn_err(CE_WARN, 814 "SBus%d Slot 3 DVMA Parity Error: AFSR 0x%08x.%08x Id %d", 815 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id); 816 } 817 if (t_sb_csr & SB_CSR_DPERR_S2) { 818 cmn_err(CE_WARN, 819 "SBus%d Slot 2 DVMA Parity Error: AFSR 0x%08x.%08x Id %d", 820 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id); 821 } 822 if (t_sb_csr & SB_CSR_DPERR_S1) { 823 cmn_err(CE_WARN, 824 "SBus%d Slot 1 DVMA Parity Error: AFSR 0x%08x.%08x Id %d", 825 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id); 826 } 827 if (t_sb_csr & SB_CSR_DPERR_S0) { 828 cmn_err(CE_WARN, 829 "SBus%d Slot 0 DVMA Parity Error: AFSR 0x%08x.%08x Id %d", 830 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id); 831 } 832 if (t_sb_csr & SB_CSR_PPERR_S15) { 833 cmn_err(CE_WARN, 834 "SBus%d Slot 15 PIO Parity Error: AFSR 0x%08x.%08x Id %d", 835 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id); 836 } 837 if (t_sb_csr & SB_CSR_PPERR_S14) { 838 cmn_err(CE_WARN, 839 "SBus%d Slot 14 PIO Parity Error: AFSR 0x%08x.%08x Id %d", 840 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id); 841 } 842 if (t_sb_csr & SB_CSR_PPERR_S13) { 843 cmn_err(CE_WARN, 844 "SBus%d Slot 13 PIO Parity Error: AFSR 0x%08x.%08x Id %d", 845 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id); 846 } 847 if (t_sb_csr & SB_CSR_PPERR_S3) { 848 cmn_err(CE_WARN, 849 "SBus%d Slot 3 PIO Parity Error: AFSR 0x%08x.%08x Id %d", 850 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id); 851 } 852 if (t_sb_csr & SB_CSR_PPERR_S2) { 853 cmn_err(CE_WARN, 854 "SBus%d Slot 2 PIO Parity Error: AFSR 0x%08x.%08x Id %d", 855 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id); 856 } 857 if (t_sb_csr & SB_CSR_PPERR_S1) { 858 cmn_err(CE_WARN, 859 "SBus%d Slot 1 PIO Parity Error: AFSR 0x%08x.%08x Id %d", 860 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id); 861 } 862 if (t_sb_csr & SB_CSR_PPERR_S0) { 863 cmn_err(CE_WARN, 864 "SBus%d Slot 0 PIO Parity Error: AFSR 0x%08x.%08x Id %d", 865 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id); 866 } 867 } 868 869 /* 870 * Sysio Thermal Warning interrupt handler 871 */ 872 static uint_t 873 sysio_thermal_warn_intr(struct sbus_soft_state *softsp) 874 { 875 volatile uint64_t *clear_reg; 876 volatile uint64_t tmp_mondo_vec; 877 volatile uint64_t *mondo_vec_reg; 878 const char thermal_warn_msg[] = 879 "Severe over-temperature condition detected!"; 880 881 /* 882 * Take off the Thermal Warning interrupt and 883 * remove its interrupt handler. 884 */ 885 mondo_vec_reg = (softsp->intr_mapping_reg + THERMAL_MAPREG); 886 tmp_mondo_vec = *mondo_vec_reg; 887 tmp_mondo_vec &= ~INTERRUPT_VALID; 888 *mondo_vec_reg = tmp_mondo_vec; 889 890 ddi_remove_intr(softsp->dip, 4, NULL); 891 892 clear_reg = (softsp->clr_intr_reg + THERMAL_CLEAR); 893 *clear_reg = 0; 894 895 if (oven_test) { 896 cmn_err(CE_NOTE, "OVEN TEST: %s", thermal_warn_msg); 897 return (DDI_INTR_CLAIMED); 898 } 899 900 cmn_err(CE_WARN, "%s", thermal_warn_msg); 901 cmn_err(CE_WARN, "Powering down..."); 902 903 do_shutdown(); 904 905 /* 906 * just in case do_shutdown() fails 907 */ 908 (void) timeout((void(*)(void *))power_down, NULL, 909 thermal_powerdown_delay * hz); 910 911 return (DDI_INTR_CLAIMED); 912 } 913