xref: /titanic_52/usr/src/uts/sun4u/io/sysioerr.c (revision c2580b931007758eab8cb5ae8726ebe1588e259b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 1990-2002 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/conf.h>
31 #include <sys/ddi.h>
32 #include <sys/sunddi.h>
33 #include <sys/ddi_impldefs.h>
34 #include <sys/cmn_err.h>
35 #include <sys/async.h>
36 #include <sys/sysiosbus.h>
37 #include <sys/sysioerr.h>
38 #include <sys/x_call.h>
39 #include <sys/machsystm.h>
40 #include <sys/sysmacros.h>
41 #include <sys/vmsystm.h>
42 #include <sys/cpu_module.h>
43 
/*
 * Set the following variable in /etc/system to tell the kernel
 * not to shut down the machine if the temperature reaches
 * the Thermal Warning limit.  When it is non-zero the thermal warning
 * interrupt handler only logs a notice and returns.
 */
int oven_test = 0;

/*
 * Holds the value of the "thermal-interrupt" property as fetched by
 * sysio_err_init() (the lookup is made with a default of -1).  The
 * thermal warning interrupt is only set up and torn down when this
 * value is exactly 1.  Note that this is a single file-scope variable,
 * not per-instance state.
 */
static int thermal_interrupt_enabled = 0;
55 
#ifdef	_STARFIRE
#include <sys/starfire.h>

/*
 * Starfire interrupt target translation helpers; they are only declared
 * here and are defined outside this file.  pc_translate_tgtid() is used
 * when a mapping register is programmed, pc_ittrans_cleanup() when a
 * mapping register is torn down.
 */
int
pc_translate_tgtid(caddr_t, int, volatile uint64_t *);

void
pc_ittrans_cleanup(caddr_t, volatile uint64_t *);
#endif	/* _STARFIRE */
65 
/*
 * Use adb to set debug_sysio_errs to 1 if you don't want your system to
 * panic on sbus ue errors (it also makes the SBus error interrupt drop
 * into the debugger and enables extra logging).  Set sysio_err_flag to 0
 * if you don't want your system to check for sysio errors at all; in that
 * case sysio_err_init() returns without setting anything up.
 */
int sysio_err_flag = 1;
uint_t debug_sysio_errs = 0;

/*
 * bto_cnt = number of bus errors and timeouts allowed within bto_secs
 * seconds before sbus_check_bto() reports that the limit was exceeded.
 * Use /etc/system to change bto_cnt to a very large number if
 * it's a problem!
 */
int bto_secs = 10;
int bto_cnt = 10;
81 
/*
 * Forward declarations for the file-local routines below.
 */

/* Interrupt handlers added by sysio_err_init(). */
static uint_t
sysio_ue_intr(struct sbus_soft_state *softsp);

static uint_t
sysio_ce_intr(struct sbus_soft_state *softsp);

static uint_t
sbus_err_intr(struct sbus_soft_state *softsp);

/* Logging callbacks stored in async_flt.flt_func. */
static void
sysio_log_ce_err(struct async_flt *ecc, char *unum);

static void
sysio_log_ue_err(struct async_flt *ecc, char *unum);

/* Helpers for the SBus error interrupt. */
static void
sbus_clear_intr(struct sbus_soft_state *softsp, uint64_t *pafsr);

static void
sbus_log_error(struct sbus_soft_state *softsp, uint64_t *pafsr, uint64_t *pafar,
    ushort_t id, ushort_t inst, int cleared,
    on_trap_data_t *ontrap_data);

static int
sbus_check_bto(struct sbus_soft_state *softsp);

static void
sbus_log_csr_error(struct async_flt *aflt, char *unum);

/* Callbacks registered with bus_func_register(). */
static uint_t
sbus_ctrl_ecc_err(struct sbus_soft_state *softsp);

static uint_t
sysio_dis_err(struct sbus_soft_state *softsp);

/* Programs the interrupt mappings and enables error reporting. */
static uint_t
sysio_init_err(struct sbus_soft_state *softsp);

static uint_t
sysio_thermal_warn_intr(struct sbus_soft_state *softsp);

/*
 * PIL values used to create the "interrupt-priorities" property when the
 * device node does not already have one.
 * NOTE(review): presumably indexed by interrupt number (0 = UE, 1 = CE,
 * 2 = SBus error, 4 = thermal, matching the ddi_add_intr() calls) --
 * confirm against the header that defines the SBUS_*_PIL values.
 */
static int sbus_pil[] = {SBUS_UE_PIL, SBUS_CE_PIL, SBUS_ERR_PIL, SBUS_PF_PIL,
	SBUS_THERMAL_PIL, SBUS_PM_PIL};
125 int
126 sysio_err_init(struct sbus_soft_state *softsp, caddr_t address)
127 {
128 	if (sysio_err_flag == 0) {
129 		cmn_err(CE_CONT, "Warning: sysio errors not initialized\n");
130 		return (DDI_SUCCESS);
131 	}
132 
133 	/*
134 	 * Get the address of the already mapped-in sysio/sbus error registers.
135 	 * Simply add each registers offset to the already mapped in address
136 	 * that was retrieved from the device node's "address" property,
137 	 * and passed as an argument to this function.
138 	 *
139 	 * Define a macro for the pointer arithmetic ...
140 	 */
141 
142 #define	REG_ADDR(b, o)	(uint64_t *)((caddr_t)(b) + (o))
143 
144 	softsp->sysio_ecc_reg = REG_ADDR(address, OFF_SYSIO_ECC_REGS);
145 	softsp->sysio_ue_reg = REG_ADDR(address, OFF_SYSIO_UE_REGS);
146 	softsp->sysio_ce_reg = REG_ADDR(address, OFF_SYSIO_CE_REGS);
147 	softsp->sbus_err_reg = REG_ADDR(address, OFF_SBUS_ERR_REGS);
148 
149 #undef	REG_ADDR
150 
151 	/*
152 	 * create the interrupt-priorities property if it doesn't
153 	 * already exist to provide a hint as to the PIL level for
154 	 * our interrupt.
155 	 */
156 	{
157 		int len;
158 
159 		if (ddi_getproplen(DDI_DEV_T_ANY, softsp->dip,
160 		    DDI_PROP_DONTPASS, "interrupt-priorities",
161 		    &len) != DDI_PROP_SUCCESS) {
162 				/* Create the interrupt-priorities property. */
163 			(void) ddi_prop_update_int_array(DDI_DEV_T_NONE,
164 			    softsp->dip, "interrupt-priorities",
165 			    (int *)sbus_pil, sizeof (sbus_pil) / sizeof (int));
166 		}
167 	}
168 
169 	(void) ddi_add_intr(softsp->dip, 0, NULL, NULL,
170 	    (uint_t (*)())sysio_ue_intr, (caddr_t)softsp);
171 	(void) ddi_add_intr(softsp->dip, 1, NULL, NULL,
172 	    (uint_t (*)())sysio_ce_intr, (caddr_t)softsp);
173 	(void) ddi_add_intr(softsp->dip, 2, NULL, NULL,
174 	    (uint_t (*)())sbus_err_intr, (caddr_t)softsp);
175 	/*
176 	 * If the thermal-interrupt property is in place,
177 	 * then register the thermal warning interrupt handler and
178 	 * program its mapping register
179 	 */
180 	thermal_interrupt_enabled = ddi_getprop(DDI_DEV_T_ANY, softsp->dip,
181 		DDI_PROP_DONTPASS, "thermal-interrupt", -1);
182 
183 	if (thermal_interrupt_enabled == 1) {
184 		(void) ddi_add_intr(softsp->dip, 4, NULL, NULL,
185 		    (uint_t (*)())sysio_thermal_warn_intr, (caddr_t)softsp);
186 	}
187 
188 	bus_func_register(BF_TYPE_UE, (busfunc_t)sbus_ctrl_ecc_err, softsp);
189 	bus_func_register(BF_TYPE_ERRDIS, (busfunc_t)sysio_dis_err, softsp);
190 
191 	(void) sysio_init_err(softsp);
192 
193 	return (DDI_SUCCESS);
194 }
195 
196 int
197 sysio_err_resume_init(struct sbus_soft_state *softsp)
198 {
199 	(void) sysio_init_err(softsp);
200 	return (DDI_SUCCESS);
201 }
202 
203 int
204 sysio_err_uninit(struct sbus_soft_state *softsp)
205 {
206 	/* remove the interrupts from the interrupt list */
207 	(void) sysio_dis_err(softsp);
208 
209 	ddi_remove_intr(softsp->dip, 0, NULL);
210 	ddi_remove_intr(softsp->dip, 1, NULL);
211 	ddi_remove_intr(softsp->dip, 2, NULL);
212 
213 	if (thermal_interrupt_enabled == 1) {
214 		ddi_remove_intr(softsp->dip, 4, NULL);
215 	}
216 
217 	bus_func_unregister(BF_TYPE_UE, (busfunc_t)sbus_ctrl_ecc_err, softsp);
218 	bus_func_unregister(BF_TYPE_ERRDIS, (busfunc_t)sysio_dis_err, softsp);
219 
220 	return (DDI_SUCCESS);
221 }
222 
223 static uint_t
224 sysio_init_err(struct sbus_soft_state *softsp)
225 {
226 	volatile uint64_t tmp_mondo_vec, tmpreg;
227 	volatile uint64_t *mondo_vec_reg;
228 	uint_t cpu_id, acpu_id;
229 
230 	acpu_id = intr_dist_cpuid();
231 	/*
232 	 * Program the mondo vector accordingly.  This MUST be the
233 	 * last thing we do.  Once we program the mondo, the device
234 	 * may begin to interrupt. Store it in the hardware reg.
235 	 */
236 	mondo_vec_reg = (uint64_t *)(softsp->intr_mapping_reg + UE_ECC_MAPREG);
237 	cpu_id = acpu_id;
238 #ifdef	_STARFIRE
239 	cpu_id = pc_translate_tgtid(softsp->ittrans_cookie, cpu_id,
240 				mondo_vec_reg);
241 #endif	/* _STARFIRE */
242 	tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) | INTERRUPT_VALID;
243 	*mondo_vec_reg = tmp_mondo_vec;
244 
245 	mondo_vec_reg = (uint64_t *)(softsp->intr_mapping_reg + CE_ECC_MAPREG);
246 	cpu_id = acpu_id;
247 #ifdef	_STARFIRE
248 	cpu_id = pc_translate_tgtid(softsp->ittrans_cookie, cpu_id,
249 				mondo_vec_reg);
250 #endif	/* _STARFIRE */
251 	tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) | INTERRUPT_VALID;
252 	*mondo_vec_reg = tmp_mondo_vec;
253 
254 	mondo_vec_reg =
255 	    (uint64_t *)(softsp->intr_mapping_reg + SBUS_ERR_MAPREG);
256 	cpu_id = acpu_id;
257 #ifdef	_STARFIRE
258 	cpu_id = pc_translate_tgtid(softsp->ittrans_cookie, cpu_id,
259 				mondo_vec_reg);
260 #endif	/* _STARFIRE */
261 
262 	tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) | INTERRUPT_VALID;
263 	*mondo_vec_reg = tmp_mondo_vec;
264 
265 	if (thermal_interrupt_enabled == 1) {
266 		mondo_vec_reg = (softsp->intr_mapping_reg + THERMAL_MAPREG);
267 		cpu_id = acpu_id;
268 		tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) |
269 			INTERRUPT_VALID;
270 		*mondo_vec_reg = tmp_mondo_vec;
271 	}
272 
273 	/* Flush store buffers */
274 	tmpreg = *softsp->sbus_ctrl_reg;
275 
276 	/*
277 	 * XXX - This may already be set by the OBP.
278 	 */
279 	tmpreg = SYSIO_APCKEN;
280 	*softsp->sysio_ctrl_reg |= tmpreg;
281 	tmpreg = (SECR_ECC_EN | SECR_UE_INTEN | SECR_CE_INTEN);
282 	*softsp->sysio_ecc_reg = tmpreg;
283 	tmpreg = SB_CSR_ERRINT_EN;
284 	*softsp->sbus_err_reg |= tmpreg;
285 
286 	/* Initialize timeout/bus error counter */
287 	softsp->bto_timestamp = 0;
288 	softsp->bto_ctr = 0;
289 
290 	return (0);
291 }
292 
293 static uint_t
294 sysio_dis_err(struct sbus_soft_state *softsp)
295 {
296 	volatile uint64_t tmpreg;
297 	volatile uint64_t *mondo_vec_reg, *clear_vec_reg;
298 
299 	*softsp->sysio_ctrl_reg &= ~SYSIO_APCKEN;
300 	*softsp->sysio_ecc_reg = 0;
301 	*softsp->sbus_err_reg &= ~SB_CSR_ERRINT_EN;
302 
303 	/* Flush store buffers */
304 	tmpreg = *softsp->sbus_ctrl_reg;
305 #ifdef lint
306 	tmpreg = tmpreg;
307 #endif
308 
309 	/* Unmap mapping registers */
310 	mondo_vec_reg = (softsp->intr_mapping_reg + UE_ECC_MAPREG);
311 	clear_vec_reg = (softsp->clr_intr_reg + UE_ECC_CLEAR);
312 
313 	*mondo_vec_reg = 0;
314 
315 #ifdef	_STARFIRE
316 	/* do cleanup for starfire interrupt target translation */
317 	pc_ittrans_cleanup(softsp->ittrans_cookie, mondo_vec_reg);
318 #endif	/* _STARFIRE */
319 
320 	*clear_vec_reg = 0;
321 
322 	mondo_vec_reg = (softsp->intr_mapping_reg + CE_ECC_MAPREG);
323 	clear_vec_reg = (softsp->clr_intr_reg + CE_ECC_CLEAR);
324 
325 	*mondo_vec_reg = 0;
326 
327 #ifdef	_STARFIRE
328 	/* Do cleanup for starfire interrupt target translation */
329 	pc_ittrans_cleanup(softsp->ittrans_cookie, mondo_vec_reg);
330 #endif	/* _STARFIRE */
331 
332 	*clear_vec_reg = 0;
333 
334 	mondo_vec_reg = (softsp->intr_mapping_reg + SBUS_ERR_MAPREG);
335 	clear_vec_reg = (softsp->clr_intr_reg + SBUS_ERR_CLEAR);
336 
337 	*mondo_vec_reg = 0;
338 
339 #ifdef	_STARFIRE
340 	/* Do cleanup for starfire interrupt target translation */
341 	pc_ittrans_cleanup(softsp->ittrans_cookie, mondo_vec_reg);
342 #endif	/* _STARFIRE */
343 
344 	*clear_vec_reg = 0;
345 
346 	/* Flush store buffers */
347 	tmpreg = *softsp->sbus_ctrl_reg;
348 
349 	return (BF_NONE);
350 }
351 
352 /*
353  * Gather information about the error into an async_flt structure, and then
354  * enqueue the error for reporting and processing and panic.
355  */
356 static uint_t
357 sysio_ue_intr(struct sbus_soft_state *softsp)
358 {
359 	volatile uint64_t t_afsr;
360 	volatile uint64_t t_afar;
361 	volatile uint64_t *ue_reg, *afar_reg, *clear_reg;
362 	struct async_flt ecc;
363 	uint64_t offset;
364 
365 	/*
366 	 * Disable all further sbus errors, for this sbus instance, for
367 	 * what is guaranteed to be a fatal error. And grab any other cpus.
368 	 */
369 	(void) sysio_dis_err(softsp);		/* disabled sysio errors */
370 
371 	/*
372 	 * Then read and clear the afsr/afar and clear interrupt regs.
373 	 */
374 	ue_reg = (uint64_t *)softsp->sysio_ue_reg;
375 	t_afsr = *ue_reg;
376 	afar_reg = (uint64_t *)ue_reg + 1;
377 	t_afar = *afar_reg;
378 	*ue_reg = t_afsr;
379 
380 	clear_reg = (softsp->clr_intr_reg + UE_ECC_CLEAR);
381 	*clear_reg = 0;
382 
383 	/*
384 	 * The AFSR DW_OFFSET field contains the offset of the doubleword with
385 	 * the ECC error relative to the 64-byte aligned PA.  We multiply by 8
386 	 * to convert to a byte offset, and then add this to flt_addr.
387 	 */
388 	offset = ((t_afsr & SB_UE_AFSR_OFF) >> SB_UE_DW_SHIFT) * 8;
389 
390 	bzero(&ecc, sizeof (ecc));
391 	ecc.flt_id = gethrtime();
392 	ecc.flt_stat = t_afsr;
393 	ecc.flt_addr = P2ALIGN(t_afar, 64) + offset;
394 	ecc.flt_func = sysio_log_ue_err;
395 	ecc.flt_bus_id = softsp->upa_id;
396 	ecc.flt_inst = ddi_get_instance(softsp->dip);
397 	ecc.flt_status = ECC_IOBUS;
398 	ecc.flt_in_memory = (pf_is_memory(t_afar >> MMU_PAGESHIFT)) ? 1: 0;
399 	ecc.flt_class = BUS_FAULT;
400 	ecc.flt_panic = (debug_sysio_errs == 0);
401 
402 	errorq_dispatch(ue_queue, &ecc, sizeof (ecc), ecc.flt_panic);
403 
404 	/*
405 	 * If the UE is in memory and fatal, save the fault info so the
406 	 * panic code will know to check for copyback errors.
407 	 */
408 	if (ecc.flt_panic && ecc.flt_in_memory)
409 		panic_aflt = ecc;
410 
411 	/*
412 	 * We must also check for other bus UE errors, and panic if
413 	 * any fatal ones are detected at this point.
414 	 */
415 	if (bus_func_invoke(BF_TYPE_UE) == BF_FATAL)
416 		ecc.flt_panic = 1;
417 
418 	if (ecc.flt_panic)
419 		cmn_err(CE_PANIC, "Fatal Sbus%d UE Error", ecc.flt_inst);
420 
421 	return (DDI_INTR_CLAIMED);
422 }
423 
424 /*
425  * callback logging function from the common error handling code
426  */
427 static void
428 sysio_log_ue_err(struct async_flt *ecc, char *unum)
429 {
430 	uint64_t t_afsr = ecc->flt_stat;
431 	uint64_t t_afar = ecc->flt_addr;
432 
433 	ushort_t id = ecc->flt_bus_id;
434 	ushort_t inst = ecc->flt_inst;
435 
436 	if (t_afsr & SB_UE_AFSR_P_PIO) {
437 		cmn_err(CE_WARN, "SBus%d UE Primary Error from PIO: "
438 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
439 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
440 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
441 	}
442 	if (t_afsr & SB_UE_AFSR_P_DRD) {
443 		cmn_err(CE_WARN, "SBus%d UE Primary Error DMA read: "
444 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d",
445 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
446 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
447 	}
448 	if (t_afsr & SB_UE_AFSR_P_DWR) {
449 		cmn_err(CE_WARN, "SBus%d UE Primary Error DVMA write: "
450 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d",
451 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
452 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
453 	}
454 	/*
455 	 * We should never hit the secondary error panics.
456 	 */
457 	if (t_afsr & SB_UE_AFSR_S_PIO) {
458 		cmn_err(CE_WARN, "SBus%d UE Secondary Error from PIO: "
459 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
460 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
461 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
462 	}
463 	if (t_afsr & SB_UE_AFSR_S_DRD) {
464 		cmn_err(CE_WARN, "SBus%d UE Secondary Error DMA read: "
465 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d",
466 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
467 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
468 	}
469 	if (t_afsr & SB_UE_AFSR_S_DWR) {
470 		cmn_err(CE_WARN, "SBus%d UE Secondary  Error DMA write: "
471 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d",
472 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
473 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
474 	}
475 
476 	if ((debug_sysio_errs) || (aft_verbose)) {
477 		(void) read_ecc_data(ecc, 1, 0);
478 		cmn_err(CE_CONT, "\tOffset 0x%x, Size %d, UPA MID 0x%x\n",
479 		    (uint32_t)((t_afsr & SB_UE_AFSR_OFF) >> SB_UE_DW_SHIFT),
480 		    (uint32_t)((t_afsr & SB_UE_AFSR_SIZE) >> SB_UE_SIZE_SHIFT),
481 		    (uint32_t)((t_afsr & SB_UE_AFSR_MID) >> SB_UE_MID_SHIFT));
482 	}
483 }
484 
485 /*
486  * gather the information about the error, plus a pointer to
487  * the callback logging function, and call the generic ce_error handler.
488  */
489 static uint_t
490 sysio_ce_intr(struct sbus_soft_state *softsp)
491 {
492 	volatile uint64_t t_afsr;
493 	volatile uint64_t t_afar;
494 	volatile uint64_t *afar_reg, *clear_reg, *ce_reg;
495 	struct async_flt ecc;
496 	uint64_t offset;
497 
498 	ce_reg = (uint64_t *)softsp->sysio_ce_reg;
499 	t_afsr = *ce_reg;
500 	afar_reg = (uint64_t *)ce_reg + 1;
501 	t_afar = *afar_reg;
502 	*ce_reg = t_afsr;
503 
504 	clear_reg = (softsp->clr_intr_reg + CE_ECC_CLEAR);
505 	*clear_reg = 0;
506 
507 	/*
508 	 * The AFSR DW_OFFSET field contains the offset of the doubleword with
509 	 * the ECC error relative to the 64-byte aligned PA.  We multiply by 8
510 	 * to convert to a byte offset, and then add this to flt_addr.
511 	 */
512 	offset = ((t_afsr & SB_UE_AFSR_OFF) >> SB_UE_DW_SHIFT) * 8;
513 
514 	bzero(&ecc, sizeof (ecc));
515 	ecc.flt_id = gethrtime();
516 	ecc.flt_stat = t_afsr;
517 	ecc.flt_addr = P2ALIGN(t_afar, 64) + offset;
518 	ecc.flt_func = sysio_log_ce_err;
519 	ecc.flt_bus_id = softsp->upa_id;
520 	ecc.flt_inst = ddi_get_instance(softsp->dip);
521 	ecc.flt_status = ECC_IOBUS;
522 
523 	ecc.flt_synd = (ushort_t)((t_afsr & SB_CE_AFSR_SYND) >>
524 	    SB_CE_SYND_SHIFT);
525 
526 	ecc.flt_in_memory = (pf_is_memory(t_afar >> MMU_PAGESHIFT)) ? 1: 0;
527 	ecc.flt_class = BUS_FAULT;
528 
529 	ce_scrub(&ecc);
530 	errorq_dispatch(ce_queue, &ecc, sizeof (ecc), ERRORQ_ASYNC);
531 
532 	return (DDI_INTR_CLAIMED);
533 }
534 
535 /*
536  * callback logging function from the common error handling code
537  */
538 static void
539 sysio_log_ce_err(struct async_flt *ecc, char *unum)
540 {
541 	uint64_t t_afsr = ecc->flt_stat;
542 	uint64_t t_afar = ecc->flt_addr;
543 	ushort_t id = ecc->flt_bus_id;
544 	ushort_t inst = ecc->flt_inst;
545 	int ce_verbose = ce_verbose_memory;
546 	char *syndrome_str = "!\tSyndrome 0x%x, Offset 0x%x, Size %d, "
547 	    "UPA MID 0x%x\n";
548 
549 	if ((!ce_verbose_memory) && (!debug_sysio_errs))
550 		return;
551 
552 	if (t_afsr & SB_CE_AFSR_P_PIO) {
553 		char *fmtstr = "!SBus%d CE Primary Error from PIO: "
554 		    "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d\n";
555 
556 		if ((debug_sysio_errs) || (ce_verbose > 1))
557 			fmtstr++;
558 
559 		cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
560 		    (uint32_t)t_afsr, (uint32_t)(t_afar>>32),
561 		    (uint32_t)t_afar, id);
562 	}
563 	if (t_afsr & SB_CE_AFSR_P_DRD) {
564 		char *fmtstr = "!SBus%d CE Primary Error DMA read: "
565 		    "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s "
566 		    "Id %d\n";
567 
568 		if ((debug_sysio_errs) || (ce_verbose > 1))
569 			fmtstr++;
570 
571 		cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
572 		    (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar,
573 		    unum, id);
574 	}
575 	if (t_afsr & SB_CE_AFSR_P_DWR) {
576 		char *fmtstr = "!SBus%d CE Primary Error DMA write: "
577 		    "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d\n";
578 
579 		if ((debug_sysio_errs) || (ce_verbose > 1))
580 			fmtstr++;
581 
582 		cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
583 		    (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar,
584 		    unum, id);
585 	}
586 
587 	if (t_afsr & SB_CE_AFSR_S_PIO) {
588 		char *fmtstr = "!SBus%d CE Secondary Error from PIO: "
589 		    "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d\n";
590 
591 		if ((debug_sysio_errs) || (ce_verbose > 1))
592 			fmtstr++;
593 
594 		cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
595 		    (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar,
596 		    id);
597 	}
598 	if (t_afsr & SB_CE_AFSR_S_DRD) {
599 		char *fmtstr = "!SBus%d CE Secondary Error DMA read: "
600 		    "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s "
601 		    "Id %d\n";
602 
603 		if ((debug_sysio_errs) || (ce_verbose > 1))
604 			fmtstr++;
605 
606 		cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
607 		    (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar,
608 		    unum, id);
609 	}
610 	if (t_afsr & SB_CE_AFSR_S_DWR) {
611 		char *fmtstr = "!SBus%d CE Secondary Error DMA write: "
612 		    "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s "
613 		    "Id %d\n";
614 
615 		if ((debug_sysio_errs) || (ce_verbose > 1))
616 			fmtstr++;
617 
618 		cmn_err(CE_CONT, fmtstr,
619 		    inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
620 		    (uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
621 	}
622 
623 	if ((debug_sysio_errs) || (ce_verbose > 1))
624 		syndrome_str++;
625 
626 	cmn_err(CE_CONT, syndrome_str,
627 	    (uint32_t)((t_afsr & SB_CE_AFSR_SYND) >> SB_CE_SYND_SHIFT),
628 	    (uint32_t)((t_afsr & SB_CE_AFSR_OFF) >> SB_CE_OFFSET_SHIFT),
629 	    (uint32_t)((t_afsr & SB_CE_AFSR_SIZE) >> SB_CE_SIZE_SHIFT),
630 	    (uint32_t)((t_afsr & SB_CE_AFSR_MID) >> SB_CE_MID_SHIFT));
631 }
632 
633 static uint_t
634 sbus_err_intr(struct sbus_soft_state *softsp)
635 {
636 	volatile uint64_t t_afsr;
637 	volatile uint64_t t_afar;
638 	ushort_t id, inst;
639 	int cleared = 0;
640 	volatile uint64_t *afar_reg;
641 	on_trap_data_t *otp = softsp->ontrap_data;
642 
643 	t_afsr = *softsp->sbus_err_reg;
644 	afar_reg = (uint64_t *)softsp->sbus_err_reg + 1;
645 	t_afar = *afar_reg;
646 
647 	if (otp == NULL || !(otp->ot_prot & OT_DATA_ACCESS)) {
648 		sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
649 		cleared = 1;
650 	}
651 
652 	id = (ushort_t)softsp->upa_id;
653 	inst = (ushort_t)ddi_get_instance(softsp->dip);
654 
655 	if (debug_sysio_errs) {
656 		if (otp != NULL && (otp->ot_prot & OT_DATA_ACCESS))
657 			otp->ot_trap |= OT_DATA_ACCESS;
658 		if (!cleared)
659 			sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
660 
661 		cmn_err(CE_CONT, "SBus%d Error: AFSR 0x%08x.%08x "
662 			"AFAR 0x%08x.%08x Id %d\n",
663 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
664 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
665 
666 		debug_enter("sbus_err_intr");
667 	} else {
668 		sbus_log_error(softsp, (uint64_t *)&t_afsr,
669 		    (uint64_t *)&t_afar, id, inst, cleared, otp);
670 	}
671 	if (!cleared) {
672 		sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
673 	}
674 
675 	return (DDI_INTR_CLAIMED);
676 }
677 
678 static void
679 sbus_clear_intr(struct sbus_soft_state *softsp, uint64_t *pafsr)
680 {
681 	volatile uint64_t *clear_reg;
682 
683 	*softsp->sbus_err_reg = *pafsr;
684 	clear_reg = (softsp->clr_intr_reg + SBUS_ERR_CLEAR);
685 	*clear_reg = 0;
686 }
687 
688 static void
689 sbus_log_error(struct sbus_soft_state *softsp, uint64_t *pafsr, uint64_t *pafar,
690     ushort_t id, ushort_t inst, int cleared, on_trap_data_t *otp)
691 {
692 	uint64_t t_afsr;
693 	uint64_t t_afar;
694 	int level = CE_WARN;
695 
696 	t_afsr = *pafsr;
697 	t_afar = *pafar;
698 	if (t_afsr & SB_AFSR_P_LE) {
699 		if (!cleared)
700 			sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
701 		cmn_err(CE_PANIC, "SBus%d Primary Error Late PIO: "
702 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
703 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
704 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
705 	}
706 	if (t_afsr & SB_AFSR_P_TO) {
707 		if (otp != NULL && (otp->ot_prot & OT_DATA_ACCESS)) {
708 			otp->ot_trap |= OT_DATA_ACCESS;
709 			return;
710 		}
711 		if (sbus_check_bto(softsp)) {
712 			if (!cleared)
713 				sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
714 			level = CE_PANIC;
715 		}
716 		cmn_err(level, "SBus%d Primary Error Timeout: "
717 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
718 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
719 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
720 	}
721 	if (t_afsr & SB_AFSR_P_BERR) {
722 		if (otp != NULL && (otp->ot_prot & OT_DATA_ACCESS)) {
723 			otp->ot_trap |= OT_DATA_ACCESS;
724 			return;
725 		}
726 		if (sbus_check_bto(softsp)) {
727 			if (!cleared)
728 				sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
729 			level = CE_PANIC;
730 		}
731 		cmn_err(level, "SBus%d Primary Error Bus Error: "
732 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d\n",
733 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
734 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
735 	}
736 
737 	if (t_afsr & SB_AFSR_S_LE) {
738 		if (!cleared)
739 			sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
740 		cmn_err(CE_PANIC, "SBus%d Secondary Late PIO Error: "
741 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
742 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
743 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
744 	}
745 	if (t_afsr & SB_AFSR_S_TO) {
746 		if (sbus_check_bto(softsp)) {
747 			if (!cleared)
748 				sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
749 			level = CE_PANIC;
750 		}
751 		cmn_err(level, "SBus%d Secondary Timeout Error: "
752 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
753 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
754 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
755 	}
756 	if (t_afsr & SB_AFSR_S_BERR) {
757 		if (sbus_check_bto(softsp)) {
758 			if (!cleared)
759 				sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
760 			level = CE_PANIC;
761 		}
762 		cmn_err(level, "SBus%d Secondary Bus Error: "
763 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
764 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
765 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
766 	}
767 }
768 
769 
770 static int
771 sbus_check_bto(struct sbus_soft_state *softsp)
772 {
773 	hrtime_t now = gethrtime();		/* high PIL safe */
774 	hrtime_t diff = now - softsp->bto_timestamp;
775 
776 	if (diff > ((hrtime_t)bto_secs * NANOSEC) || diff < 0LL) {
777 		/*
778 		 * Reset error counter as this bus error has occurred
779 		 * after more than bto_secs duration.
780 		 */
781 		softsp->bto_timestamp = now;
782 		softsp->bto_ctr = 0;
783 	}
784 	if (softsp->bto_ctr++ >= bto_cnt)
785 		return (1);
786 	return (0);
787 }
788 
789 static uint_t
790 sbus_ctrl_ecc_err(struct sbus_soft_state *softsp)
791 {
792 	uint64_t t_sb_csr;
793 	ushort_t id, inst;
794 
795 	t_sb_csr = *softsp->sbus_ctrl_reg;
796 	id = (ushort_t)softsp->upa_id;
797 	inst = (ushort_t)ddi_get_instance(softsp->dip);
798 
799 	if (debug_sysio_errs) {
800 		cmn_err(CE_CONT, "sbus_ctrl_ecc_error: SBus%d Control Reg "
801 		    "0x%016llx Id %d\n", inst, (u_longlong_t)t_sb_csr, id);
802 	}
803 
804 	if (t_sb_csr & (SB_CSR_DPERR_S14|SB_CSR_DPERR_S13|SB_CSR_DPERR_S3|
805 	    SB_CSR_DPERR_S2|SB_CSR_DPERR_S1|SB_CSR_DPERR_S0|SB_CSR_PIO_PERRS)) {
806 		struct async_flt aflt;
807 
808 		*softsp->sbus_ctrl_reg = t_sb_csr; /* clear error bits */
809 
810 		bzero(&aflt, sizeof (aflt));
811 		aflt.flt_id = gethrtime();
812 		aflt.flt_stat = t_sb_csr;
813 		aflt.flt_func = sbus_log_csr_error;
814 		aflt.flt_bus_id = id;
815 		aflt.flt_inst = inst;
816 		aflt.flt_status = ECC_IOBUS;
817 		aflt.flt_class = BUS_FAULT;
818 		aflt.flt_panic = 1;
819 
820 		errorq_dispatch(ue_queue, &aflt, sizeof (aflt), aflt.flt_panic);
821 		return (BF_FATAL);
822 	}
823 
824 	return (BF_NONE);
825 }
826 
827 /*ARGSUSED*/
828 static void
829 sbus_log_csr_error(struct async_flt *aflt, char *unum)
830 {
831 	uint64_t t_sb_csr = aflt->flt_stat;
832 	uint_t id = aflt->flt_bus_id;
833 	uint_t inst = aflt->flt_inst;
834 
835 	/*
836 	 * Print out SBus error information.
837 	 */
838 	if (t_sb_csr & SB_CSR_DPERR_S14) {
839 		cmn_err(CE_WARN,
840 		"SBus%d Slot 14 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
841 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
842 	}
843 	if (t_sb_csr & SB_CSR_DPERR_S13) {
844 		cmn_err(CE_WARN,
845 		"SBus%d Slot 13 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
846 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
847 	}
848 	if (t_sb_csr & SB_CSR_DPERR_S3) {
849 		cmn_err(CE_WARN,
850 		"SBus%d Slot 3 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
851 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
852 	}
853 	if (t_sb_csr & SB_CSR_DPERR_S2) {
854 		cmn_err(CE_WARN,
855 		"SBus%d Slot 2 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
856 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
857 	}
858 	if (t_sb_csr & SB_CSR_DPERR_S1) {
859 		cmn_err(CE_WARN,
860 		"SBus%d Slot 1 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
861 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
862 	}
863 	if (t_sb_csr & SB_CSR_DPERR_S0) {
864 		cmn_err(CE_WARN,
865 		"SBus%d Slot 0 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
866 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
867 	}
868 	if (t_sb_csr & SB_CSR_PPERR_S15) {
869 		cmn_err(CE_WARN,
870 		"SBus%d Slot 15 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
871 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
872 	}
873 	if (t_sb_csr & SB_CSR_PPERR_S14) {
874 		cmn_err(CE_WARN,
875 		"SBus%d Slot 14 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
876 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
877 	}
878 	if (t_sb_csr & SB_CSR_PPERR_S13) {
879 		cmn_err(CE_WARN,
880 		"SBus%d Slot 13 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
881 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
882 	}
883 	if (t_sb_csr & SB_CSR_PPERR_S3) {
884 		cmn_err(CE_WARN,
885 		"SBus%d Slot 3 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
886 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
887 	}
888 	if (t_sb_csr & SB_CSR_PPERR_S2) {
889 		cmn_err(CE_WARN,
890 		"SBus%d Slot 2 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
891 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
892 	}
893 	if (t_sb_csr & SB_CSR_PPERR_S1) {
894 		cmn_err(CE_WARN,
895 		"SBus%d Slot 1 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
896 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
897 	}
898 	if (t_sb_csr & SB_CSR_PPERR_S0) {
899 		cmn_err(CE_WARN,
900 		"SBus%d Slot 0 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
901 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
902 	}
903 }
904 
905 /*
906  * Sysio Thermal Warning interrupt handler
907  */
908 static uint_t
909 sysio_thermal_warn_intr(struct sbus_soft_state *softsp)
910 {
911 	volatile uint64_t *clear_reg;
912 	volatile uint64_t tmp_mondo_vec;
913 	volatile uint64_t *mondo_vec_reg;
914 	const char thermal_warn_msg[] =
915 	    "Severe over-temperature condition detected!";
916 
917 	/*
918 	 * Take off the Thermal Warning interrupt and
919 	 * remove its interrupt handler.
920 	 */
921 	mondo_vec_reg = (softsp->intr_mapping_reg + THERMAL_MAPREG);
922 	tmp_mondo_vec = *mondo_vec_reg;
923 	tmp_mondo_vec &= ~INTERRUPT_VALID;
924 	*mondo_vec_reg = tmp_mondo_vec;
925 
926 	ddi_remove_intr(softsp->dip, 4, NULL);
927 
928 	clear_reg = (softsp->clr_intr_reg + THERMAL_CLEAR);
929 	*clear_reg = 0;
930 
931 	if (oven_test) {
932 		cmn_err(CE_NOTE, "OVEN TEST: %s", thermal_warn_msg);
933 		return (DDI_INTR_CLAIMED);
934 	}
935 
936 	cmn_err(CE_WARN, "%s", thermal_warn_msg);
937 	cmn_err(CE_WARN, "Powering down...");
938 
939 	do_shutdown();
940 
941 	/*
942 	 * just in case do_shutdown() fails
943 	 */
944 	(void) timeout((void(*)(void *))power_down, NULL,
945 	    thermal_powerdown_delay * hz);
946 
947 	return (DDI_INTR_CLAIMED);
948 }
949