1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 1990-2002 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26 /*
27 * Copyright 2019 Peter Tribble.
28 */
29
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/ddi_impldefs.h>
35 #include <sys/cmn_err.h>
36 #include <sys/async.h>
37 #include <sys/sysiosbus.h>
38 #include <sys/sysioerr.h>
39 #include <sys/x_call.h>
40 #include <sys/machsystm.h>
41 #include <sys/sysmacros.h>
42 #include <sys/vmsystm.h>
43 #include <sys/cpu_module.h>
44
/*
 * Set the following variable in /etc/system to tell the kernel
 * not to shut down the machine if the temperature reaches
 * the Thermal Warning limit.  When it is non-zero the thermal warning
 * handler only logs a notice and returns.
 */
int oven_test = 0;

/*
 * Value of the "thermal-interrupt" property as looked up in
 * sysio_err_init() (the lookup passes -1 as its default).  The thermal
 * warning interrupt is registered and mapped only when this is 1.
 */
static int thermal_interrupt_enabled = 0;

/*
 * Set debug_sysio_errs to 1 (e.g. with adb) if you don't want your
 * system to panic on sbus ue errors.  Set sysio_err_flag to 0 if you
 * don't want your system to check for sysio errors at all.
 */
int sysio_err_flag = 1;
uint_t debug_sysio_errs = 0;

/*
 * bto_cnt = number of bus errors and timeouts allowed within bto_secs
 * seconds.  Use /etc/system to change bto_cnt to a very large number
 * if it's a problem!
 */
int bto_secs = 10;
int bto_cnt = 10;
72
73 static uint_t
74 sysio_ue_intr(struct sbus_soft_state *softsp);
75
76 static uint_t
77 sysio_ce_intr(struct sbus_soft_state *softsp);
78
79 static uint_t
80 sbus_err_intr(struct sbus_soft_state *softsp);
81
82 static void
83 sysio_log_ce_err(struct async_flt *ecc, char *unum);
84
85 static void
86 sysio_log_ue_err(struct async_flt *ecc, char *unum);
87
88 static void
89 sbus_clear_intr(struct sbus_soft_state *softsp, uint64_t *pafsr);
90
91 static void
92 sbus_log_error(struct sbus_soft_state *softsp, uint64_t *pafsr, uint64_t *pafar,
93 ushort_t id, ushort_t inst, int cleared,
94 on_trap_data_t *ontrap_data);
95
96 static int
97 sbus_check_bto(struct sbus_soft_state *softsp);
98
99 static void
100 sbus_log_csr_error(struct async_flt *aflt, char *unum);
101
102 static uint_t
103 sbus_ctrl_ecc_err(struct sbus_soft_state *softsp);
104
105 static uint_t
106 sysio_dis_err(struct sbus_soft_state *softsp);
107
108 static uint_t
109 sysio_init_err(struct sbus_soft_state *softsp);
110
111 static uint_t
112 sysio_thermal_warn_intr(struct sbus_soft_state *softsp);
113
/*
 * Default contents of the "interrupt-priorities" property, created by
 * sysio_err_init() when the device node does not already have one.
 * Slots 0, 1, 2 and 4 line up with the interrupt numbers this file
 * registers (UE, CE, SBus error and thermal warning).  Slots 3 and 5
 * (PF, PM) are presumably used by code outside this file.
 */
static int sbus_pil[] = {SBUS_UE_PIL, SBUS_CE_PIL, SBUS_ERR_PIL, SBUS_PF_PIL,
	SBUS_THERMAL_PIL, SBUS_PM_PIL};
116 int
sysio_err_init(struct sbus_soft_state * softsp,caddr_t address)117 sysio_err_init(struct sbus_soft_state *softsp, caddr_t address)
118 {
119 if (sysio_err_flag == 0) {
120 cmn_err(CE_CONT, "Warning: sysio errors not initialized\n");
121 return (DDI_SUCCESS);
122 }
123
124 /*
125 * Get the address of the already mapped-in sysio/sbus error registers.
126 * Simply add each registers offset to the already mapped in address
127 * that was retrieved from the device node's "address" property,
128 * and passed as an argument to this function.
129 *
130 * Define a macro for the pointer arithmetic ...
131 */
132
133 #define REG_ADDR(b, o) (uint64_t *)((caddr_t)(b) + (o))
134
135 softsp->sysio_ecc_reg = REG_ADDR(address, OFF_SYSIO_ECC_REGS);
136 softsp->sysio_ue_reg = REG_ADDR(address, OFF_SYSIO_UE_REGS);
137 softsp->sysio_ce_reg = REG_ADDR(address, OFF_SYSIO_CE_REGS);
138 softsp->sbus_err_reg = REG_ADDR(address, OFF_SBUS_ERR_REGS);
139
140 #undef REG_ADDR
141
142 /*
143 * create the interrupt-priorities property if it doesn't
144 * already exist to provide a hint as to the PIL level for
145 * our interrupt.
146 */
147 {
148 int len;
149
150 if (ddi_getproplen(DDI_DEV_T_ANY, softsp->dip,
151 DDI_PROP_DONTPASS, "interrupt-priorities",
152 &len) != DDI_PROP_SUCCESS) {
153 /* Create the interrupt-priorities property. */
154 (void) ddi_prop_update_int_array(DDI_DEV_T_NONE,
155 softsp->dip, "interrupt-priorities",
156 (int *)sbus_pil, sizeof (sbus_pil) / sizeof (int));
157 }
158 }
159
160 (void) ddi_add_intr(softsp->dip, 0, NULL, NULL,
161 (uint_t (*)())sysio_ue_intr, (caddr_t)softsp);
162 (void) ddi_add_intr(softsp->dip, 1, NULL, NULL,
163 (uint_t (*)())sysio_ce_intr, (caddr_t)softsp);
164 (void) ddi_add_intr(softsp->dip, 2, NULL, NULL,
165 (uint_t (*)())sbus_err_intr, (caddr_t)softsp);
166 /*
167 * If the thermal-interrupt property is in place,
168 * then register the thermal warning interrupt handler and
169 * program its mapping register
170 */
171 thermal_interrupt_enabled = ddi_getprop(DDI_DEV_T_ANY, softsp->dip,
172 DDI_PROP_DONTPASS, "thermal-interrupt", -1);
173
174 if (thermal_interrupt_enabled == 1) {
175 (void) ddi_add_intr(softsp->dip, 4, NULL, NULL,
176 (uint_t (*)())sysio_thermal_warn_intr, (caddr_t)softsp);
177 }
178
179 bus_func_register(BF_TYPE_UE, (busfunc_t)sbus_ctrl_ecc_err, softsp);
180 bus_func_register(BF_TYPE_ERRDIS, (busfunc_t)sysio_dis_err, softsp);
181
182 (void) sysio_init_err(softsp);
183
184 return (DDI_SUCCESS);
185 }
186
/*
 * Reprogram the error interrupt mappings and error enables for this
 * instance by re-running sysio_init_err().  Judging by the name this is
 * the resume-path entry point; no handlers are (re)registered here.
 * Always returns DDI_SUCCESS.
 */
int
sysio_err_resume_init(struct sbus_soft_state *softsp)
{
	(void) sysio_init_err(softsp);
	return (DDI_SUCCESS);
}
193
194 int
sysio_err_uninit(struct sbus_soft_state * softsp)195 sysio_err_uninit(struct sbus_soft_state *softsp)
196 {
197 /* remove the interrupts from the interrupt list */
198 (void) sysio_dis_err(softsp);
199
200 ddi_remove_intr(softsp->dip, 0, NULL);
201 ddi_remove_intr(softsp->dip, 1, NULL);
202 ddi_remove_intr(softsp->dip, 2, NULL);
203
204 if (thermal_interrupt_enabled == 1) {
205 ddi_remove_intr(softsp->dip, 4, NULL);
206 }
207
208 bus_func_unregister(BF_TYPE_UE, (busfunc_t)sbus_ctrl_ecc_err, softsp);
209 bus_func_unregister(BF_TYPE_ERRDIS, (busfunc_t)sysio_dis_err, softsp);
210
211 return (DDI_SUCCESS);
212 }
213
214 static uint_t
sysio_init_err(struct sbus_soft_state * softsp)215 sysio_init_err(struct sbus_soft_state *softsp)
216 {
217 volatile uint64_t tmp_mondo_vec, tmpreg;
218 volatile uint64_t *mondo_vec_reg;
219 uint_t cpu_id, acpu_id;
220
221 acpu_id = intr_dist_cpuid();
222 /*
223 * Program the mondo vector accordingly. This MUST be the
224 * last thing we do. Once we program the mondo, the device
225 * may begin to interrupt. Store it in the hardware reg.
226 */
227 mondo_vec_reg = (uint64_t *)(softsp->intr_mapping_reg + UE_ECC_MAPREG);
228 cpu_id = acpu_id;
229 tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) | INTERRUPT_VALID;
230 *mondo_vec_reg = tmp_mondo_vec;
231
232 mondo_vec_reg = (uint64_t *)(softsp->intr_mapping_reg + CE_ECC_MAPREG);
233 cpu_id = acpu_id;
234 tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) | INTERRUPT_VALID;
235 *mondo_vec_reg = tmp_mondo_vec;
236
237 mondo_vec_reg =
238 (uint64_t *)(softsp->intr_mapping_reg + SBUS_ERR_MAPREG);
239 cpu_id = acpu_id;
240
241 tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) | INTERRUPT_VALID;
242 *mondo_vec_reg = tmp_mondo_vec;
243
244 if (thermal_interrupt_enabled == 1) {
245 mondo_vec_reg = (softsp->intr_mapping_reg + THERMAL_MAPREG);
246 cpu_id = acpu_id;
247 tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) |
248 INTERRUPT_VALID;
249 *mondo_vec_reg = tmp_mondo_vec;
250 }
251
252 /* Flush store buffers */
253 tmpreg = *softsp->sbus_ctrl_reg;
254
255 /*
256 * XXX - This may already be set by the OBP.
257 */
258 tmpreg = SYSIO_APCKEN;
259 *softsp->sysio_ctrl_reg |= tmpreg;
260 tmpreg = (SECR_ECC_EN | SECR_UE_INTEN | SECR_CE_INTEN);
261 *softsp->sysio_ecc_reg = tmpreg;
262 tmpreg = SB_CSR_ERRINT_EN;
263 *softsp->sbus_err_reg |= tmpreg;
264
265 /* Initialize timeout/bus error counter */
266 softsp->bto_timestamp = 0;
267 softsp->bto_ctr = 0;
268
269 return (0);
270 }
271
272 static uint_t
sysio_dis_err(struct sbus_soft_state * softsp)273 sysio_dis_err(struct sbus_soft_state *softsp)
274 {
275 volatile uint64_t tmpreg;
276 volatile uint64_t *mondo_vec_reg, *clear_vec_reg;
277
278 *softsp->sysio_ctrl_reg &= ~SYSIO_APCKEN;
279 *softsp->sysio_ecc_reg = 0;
280 *softsp->sbus_err_reg &= ~SB_CSR_ERRINT_EN;
281
282 /* Flush store buffers */
283 tmpreg = *softsp->sbus_ctrl_reg;
284 #ifdef lint
285 tmpreg = tmpreg;
286 #endif
287
288 /* Unmap mapping registers */
289 mondo_vec_reg = (softsp->intr_mapping_reg + UE_ECC_MAPREG);
290 clear_vec_reg = (softsp->clr_intr_reg + UE_ECC_CLEAR);
291
292 *mondo_vec_reg = 0;
293
294 *clear_vec_reg = 0;
295
296 mondo_vec_reg = (softsp->intr_mapping_reg + CE_ECC_MAPREG);
297 clear_vec_reg = (softsp->clr_intr_reg + CE_ECC_CLEAR);
298
299 *mondo_vec_reg = 0;
300
301 *clear_vec_reg = 0;
302
303 mondo_vec_reg = (softsp->intr_mapping_reg + SBUS_ERR_MAPREG);
304 clear_vec_reg = (softsp->clr_intr_reg + SBUS_ERR_CLEAR);
305
306 *mondo_vec_reg = 0;
307
308 *clear_vec_reg = 0;
309
310 /* Flush store buffers */
311 tmpreg = *softsp->sbus_ctrl_reg;
312
313 return (BF_NONE);
314 }
315
316 /*
317 * Gather information about the error into an async_flt structure, and then
318 * enqueue the error for reporting and processing and panic.
319 */
320 static uint_t
sysio_ue_intr(struct sbus_soft_state * softsp)321 sysio_ue_intr(struct sbus_soft_state *softsp)
322 {
323 volatile uint64_t t_afsr;
324 volatile uint64_t t_afar;
325 volatile uint64_t *ue_reg, *afar_reg, *clear_reg;
326 struct async_flt ecc;
327 uint64_t offset;
328
329 /*
330 * Disable all further sbus errors, for this sbus instance, for
331 * what is guaranteed to be a fatal error. And grab any other cpus.
332 */
333 (void) sysio_dis_err(softsp); /* disabled sysio errors */
334
335 /*
336 * Then read and clear the afsr/afar and clear interrupt regs.
337 */
338 ue_reg = (uint64_t *)softsp->sysio_ue_reg;
339 t_afsr = *ue_reg;
340 afar_reg = (uint64_t *)ue_reg + 1;
341 t_afar = *afar_reg;
342 *ue_reg = t_afsr;
343
344 clear_reg = (softsp->clr_intr_reg + UE_ECC_CLEAR);
345 *clear_reg = 0;
346
347 /*
348 * The AFSR DW_OFFSET field contains the offset of the doubleword with
349 * the ECC error relative to the 64-byte aligned PA. We multiply by 8
350 * to convert to a byte offset, and then add this to flt_addr.
351 */
352 offset = ((t_afsr & SB_UE_AFSR_OFF) >> SB_UE_DW_SHIFT) * 8;
353
354 bzero(&ecc, sizeof (ecc));
355 ecc.flt_id = gethrtime();
356 ecc.flt_stat = t_afsr;
357 ecc.flt_addr = P2ALIGN(t_afar, 64) + offset;
358 ecc.flt_func = sysio_log_ue_err;
359 ecc.flt_bus_id = softsp->upa_id;
360 ecc.flt_inst = ddi_get_instance(softsp->dip);
361 ecc.flt_status = ECC_IOBUS;
362 ecc.flt_in_memory = (pf_is_memory(t_afar >> MMU_PAGESHIFT)) ? 1: 0;
363 ecc.flt_class = BUS_FAULT;
364 ecc.flt_panic = (debug_sysio_errs == 0);
365
366 errorq_dispatch(ue_queue, &ecc, sizeof (ecc), ecc.flt_panic);
367
368 /*
369 * If the UE is in memory and fatal, save the fault info so the
370 * panic code will know to check for copyback errors.
371 */
372 if (ecc.flt_panic && ecc.flt_in_memory)
373 panic_aflt = ecc;
374
375 /*
376 * We must also check for other bus UE errors, and panic if
377 * any fatal ones are detected at this point.
378 */
379 if (bus_func_invoke(BF_TYPE_UE) == BF_FATAL)
380 ecc.flt_panic = 1;
381
382 if (ecc.flt_panic)
383 cmn_err(CE_PANIC, "Fatal Sbus%d UE Error", ecc.flt_inst);
384
385 return (DDI_INTR_CLAIMED);
386 }
387
388 /*
389 * callback logging function from the common error handling code
390 */
391 static void
sysio_log_ue_err(struct async_flt * ecc,char * unum)392 sysio_log_ue_err(struct async_flt *ecc, char *unum)
393 {
394 uint64_t t_afsr = ecc->flt_stat;
395 uint64_t t_afar = ecc->flt_addr;
396
397 ushort_t id = ecc->flt_bus_id;
398 ushort_t inst = ecc->flt_inst;
399
400 if (t_afsr & SB_UE_AFSR_P_PIO) {
401 cmn_err(CE_WARN, "SBus%d UE Primary Error from PIO: "
402 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
403 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
404 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
405 }
406 if (t_afsr & SB_UE_AFSR_P_DRD) {
407 cmn_err(CE_WARN, "SBus%d UE Primary Error DMA read: "
408 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d",
409 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
410 (uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
411 }
412 if (t_afsr & SB_UE_AFSR_P_DWR) {
413 cmn_err(CE_WARN, "SBus%d UE Primary Error DVMA write: "
414 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d",
415 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
416 (uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
417 }
418 /*
419 * We should never hit the secondary error panics.
420 */
421 if (t_afsr & SB_UE_AFSR_S_PIO) {
422 cmn_err(CE_WARN, "SBus%d UE Secondary Error from PIO: "
423 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
424 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
425 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
426 }
427 if (t_afsr & SB_UE_AFSR_S_DRD) {
428 cmn_err(CE_WARN, "SBus%d UE Secondary Error DMA read: "
429 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d",
430 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
431 (uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
432 }
433 if (t_afsr & SB_UE_AFSR_S_DWR) {
434 cmn_err(CE_WARN, "SBus%d UE Secondary Error DMA write: "
435 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d",
436 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
437 (uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
438 }
439
440 if ((debug_sysio_errs) || (aft_verbose)) {
441 (void) read_ecc_data(ecc, 1, 0);
442 cmn_err(CE_CONT, "\tOffset 0x%x, Size %d, UPA MID 0x%x\n",
443 (uint32_t)((t_afsr & SB_UE_AFSR_OFF) >> SB_UE_DW_SHIFT),
444 (uint32_t)((t_afsr & SB_UE_AFSR_SIZE) >> SB_UE_SIZE_SHIFT),
445 (uint32_t)((t_afsr & SB_UE_AFSR_MID) >> SB_UE_MID_SHIFT));
446 }
447 }
448
449 /*
450 * gather the information about the error, plus a pointer to
451 * the callback logging function, and call the generic ce_error handler.
452 */
453 static uint_t
sysio_ce_intr(struct sbus_soft_state * softsp)454 sysio_ce_intr(struct sbus_soft_state *softsp)
455 {
456 volatile uint64_t t_afsr;
457 volatile uint64_t t_afar;
458 volatile uint64_t *afar_reg, *clear_reg, *ce_reg;
459 struct async_flt ecc;
460 uint64_t offset;
461
462 ce_reg = (uint64_t *)softsp->sysio_ce_reg;
463 t_afsr = *ce_reg;
464 afar_reg = (uint64_t *)ce_reg + 1;
465 t_afar = *afar_reg;
466 *ce_reg = t_afsr;
467
468 clear_reg = (softsp->clr_intr_reg + CE_ECC_CLEAR);
469 *clear_reg = 0;
470
471 /*
472 * The AFSR DW_OFFSET field contains the offset of the doubleword with
473 * the ECC error relative to the 64-byte aligned PA. We multiply by 8
474 * to convert to a byte offset, and then add this to flt_addr.
475 */
476 offset = ((t_afsr & SB_UE_AFSR_OFF) >> SB_UE_DW_SHIFT) * 8;
477
478 bzero(&ecc, sizeof (ecc));
479 ecc.flt_id = gethrtime();
480 ecc.flt_stat = t_afsr;
481 ecc.flt_addr = P2ALIGN(t_afar, 64) + offset;
482 ecc.flt_func = sysio_log_ce_err;
483 ecc.flt_bus_id = softsp->upa_id;
484 ecc.flt_inst = ddi_get_instance(softsp->dip);
485 ecc.flt_status = ECC_IOBUS;
486
487 ecc.flt_synd = (ushort_t)((t_afsr & SB_CE_AFSR_SYND) >>
488 SB_CE_SYND_SHIFT);
489
490 ecc.flt_in_memory = (pf_is_memory(t_afar >> MMU_PAGESHIFT)) ? 1: 0;
491 ecc.flt_class = BUS_FAULT;
492
493 ce_scrub(&ecc);
494 errorq_dispatch(ce_queue, &ecc, sizeof (ecc), ERRORQ_ASYNC);
495
496 return (DDI_INTR_CLAIMED);
497 }
498
499 /*
500 * callback logging function from the common error handling code
501 */
502 static void
sysio_log_ce_err(struct async_flt * ecc,char * unum)503 sysio_log_ce_err(struct async_flt *ecc, char *unum)
504 {
505 uint64_t t_afsr = ecc->flt_stat;
506 uint64_t t_afar = ecc->flt_addr;
507 ushort_t id = ecc->flt_bus_id;
508 ushort_t inst = ecc->flt_inst;
509 int ce_verbose = ce_verbose_memory;
510 char *syndrome_str = "!\tSyndrome 0x%x, Offset 0x%x, Size %d, "
511 "UPA MID 0x%x\n";
512
513 if ((!ce_verbose_memory) && (!debug_sysio_errs))
514 return;
515
516 if (t_afsr & SB_CE_AFSR_P_PIO) {
517 char *fmtstr = "!SBus%d CE Primary Error from PIO: "
518 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d\n";
519
520 if ((debug_sysio_errs) || (ce_verbose > 1))
521 fmtstr++;
522
523 cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
524 (uint32_t)t_afsr, (uint32_t)(t_afar>>32),
525 (uint32_t)t_afar, id);
526 }
527 if (t_afsr & SB_CE_AFSR_P_DRD) {
528 char *fmtstr = "!SBus%d CE Primary Error DMA read: "
529 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s "
530 "Id %d\n";
531
532 if ((debug_sysio_errs) || (ce_verbose > 1))
533 fmtstr++;
534
535 cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
536 (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar,
537 unum, id);
538 }
539 if (t_afsr & SB_CE_AFSR_P_DWR) {
540 char *fmtstr = "!SBus%d CE Primary Error DMA write: "
541 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d\n";
542
543 if ((debug_sysio_errs) || (ce_verbose > 1))
544 fmtstr++;
545
546 cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
547 (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar,
548 unum, id);
549 }
550
551 if (t_afsr & SB_CE_AFSR_S_PIO) {
552 char *fmtstr = "!SBus%d CE Secondary Error from PIO: "
553 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d\n";
554
555 if ((debug_sysio_errs) || (ce_verbose > 1))
556 fmtstr++;
557
558 cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
559 (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar,
560 id);
561 }
562 if (t_afsr & SB_CE_AFSR_S_DRD) {
563 char *fmtstr = "!SBus%d CE Secondary Error DMA read: "
564 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s "
565 "Id %d\n";
566
567 if ((debug_sysio_errs) || (ce_verbose > 1))
568 fmtstr++;
569
570 cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
571 (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar,
572 unum, id);
573 }
574 if (t_afsr & SB_CE_AFSR_S_DWR) {
575 char *fmtstr = "!SBus%d CE Secondary Error DMA write: "
576 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s "
577 "Id %d\n";
578
579 if ((debug_sysio_errs) || (ce_verbose > 1))
580 fmtstr++;
581
582 cmn_err(CE_CONT, fmtstr,
583 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
584 (uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
585 }
586
587 if ((debug_sysio_errs) || (ce_verbose > 1))
588 syndrome_str++;
589
590 cmn_err(CE_CONT, syndrome_str,
591 (uint32_t)((t_afsr & SB_CE_AFSR_SYND) >> SB_CE_SYND_SHIFT),
592 (uint32_t)((t_afsr & SB_CE_AFSR_OFF) >> SB_CE_OFFSET_SHIFT),
593 (uint32_t)((t_afsr & SB_CE_AFSR_SIZE) >> SB_CE_SIZE_SHIFT),
594 (uint32_t)((t_afsr & SB_CE_AFSR_MID) >> SB_CE_MID_SHIFT));
595 }
596
597 static uint_t
sbus_err_intr(struct sbus_soft_state * softsp)598 sbus_err_intr(struct sbus_soft_state *softsp)
599 {
600 volatile uint64_t t_afsr;
601 volatile uint64_t t_afar;
602 ushort_t id, inst;
603 int cleared = 0;
604 volatile uint64_t *afar_reg;
605 on_trap_data_t *otp = softsp->ontrap_data;
606
607 t_afsr = *softsp->sbus_err_reg;
608 afar_reg = (uint64_t *)softsp->sbus_err_reg + 1;
609 t_afar = *afar_reg;
610
611 if (otp == NULL || !(otp->ot_prot & OT_DATA_ACCESS)) {
612 sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
613 cleared = 1;
614 }
615
616 id = (ushort_t)softsp->upa_id;
617 inst = (ushort_t)ddi_get_instance(softsp->dip);
618
619 if (debug_sysio_errs) {
620 if (otp != NULL && (otp->ot_prot & OT_DATA_ACCESS))
621 otp->ot_trap |= OT_DATA_ACCESS;
622 if (!cleared)
623 sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
624
625 cmn_err(CE_CONT, "SBus%d Error: AFSR 0x%08x.%08x "
626 "AFAR 0x%08x.%08x Id %d\n",
627 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
628 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
629
630 debug_enter("sbus_err_intr");
631 } else {
632 sbus_log_error(softsp, (uint64_t *)&t_afsr,
633 (uint64_t *)&t_afar, id, inst, cleared, otp);
634 }
635 if (!cleared) {
636 sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
637 }
638
639 return (DDI_INTR_CLAIMED);
640 }
641
642 static void
sbus_clear_intr(struct sbus_soft_state * softsp,uint64_t * pafsr)643 sbus_clear_intr(struct sbus_soft_state *softsp, uint64_t *pafsr)
644 {
645 volatile uint64_t *clear_reg;
646
647 *softsp->sbus_err_reg = *pafsr;
648 clear_reg = (softsp->clr_intr_reg + SBUS_ERR_CLEAR);
649 *clear_reg = 0;
650 }
651
652 static void
sbus_log_error(struct sbus_soft_state * softsp,uint64_t * pafsr,uint64_t * pafar,ushort_t id,ushort_t inst,int cleared,on_trap_data_t * otp)653 sbus_log_error(struct sbus_soft_state *softsp, uint64_t *pafsr, uint64_t *pafar,
654 ushort_t id, ushort_t inst, int cleared, on_trap_data_t *otp)
655 {
656 uint64_t t_afsr;
657 uint64_t t_afar;
658 int level = CE_WARN;
659
660 t_afsr = *pafsr;
661 t_afar = *pafar;
662 if (t_afsr & SB_AFSR_P_LE) {
663 if (!cleared)
664 sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
665 cmn_err(CE_PANIC, "SBus%d Primary Error Late PIO: "
666 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
667 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
668 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
669 }
670 if (t_afsr & SB_AFSR_P_TO) {
671 if (otp != NULL && (otp->ot_prot & OT_DATA_ACCESS)) {
672 otp->ot_trap |= OT_DATA_ACCESS;
673 return;
674 }
675 if (sbus_check_bto(softsp)) {
676 if (!cleared)
677 sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
678 level = CE_PANIC;
679 }
680 cmn_err(level, "SBus%d Primary Error Timeout: "
681 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
682 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
683 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
684 }
685 if (t_afsr & SB_AFSR_P_BERR) {
686 if (otp != NULL && (otp->ot_prot & OT_DATA_ACCESS)) {
687 otp->ot_trap |= OT_DATA_ACCESS;
688 return;
689 }
690 if (sbus_check_bto(softsp)) {
691 if (!cleared)
692 sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
693 level = CE_PANIC;
694 }
695 cmn_err(level, "SBus%d Primary Error Bus Error: "
696 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d\n",
697 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
698 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
699 }
700
701 if (t_afsr & SB_AFSR_S_LE) {
702 if (!cleared)
703 sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
704 cmn_err(CE_PANIC, "SBus%d Secondary Late PIO Error: "
705 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
706 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
707 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
708 }
709 if (t_afsr & SB_AFSR_S_TO) {
710 if (sbus_check_bto(softsp)) {
711 if (!cleared)
712 sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
713 level = CE_PANIC;
714 }
715 cmn_err(level, "SBus%d Secondary Timeout Error: "
716 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
717 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
718 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
719 }
720 if (t_afsr & SB_AFSR_S_BERR) {
721 if (sbus_check_bto(softsp)) {
722 if (!cleared)
723 sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
724 level = CE_PANIC;
725 }
726 cmn_err(level, "SBus%d Secondary Bus Error: "
727 "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
728 inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
729 (uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
730 }
731 }
732
733
734 static int
sbus_check_bto(struct sbus_soft_state * softsp)735 sbus_check_bto(struct sbus_soft_state *softsp)
736 {
737 hrtime_t now = gethrtime(); /* high PIL safe */
738 hrtime_t diff = now - softsp->bto_timestamp;
739
740 if (diff > ((hrtime_t)bto_secs * NANOSEC) || diff < 0LL) {
741 /*
742 * Reset error counter as this bus error has occurred
743 * after more than bto_secs duration.
744 */
745 softsp->bto_timestamp = now;
746 softsp->bto_ctr = 0;
747 }
748 if (softsp->bto_ctr++ >= bto_cnt)
749 return (1);
750 return (0);
751 }
752
753 static uint_t
sbus_ctrl_ecc_err(struct sbus_soft_state * softsp)754 sbus_ctrl_ecc_err(struct sbus_soft_state *softsp)
755 {
756 uint64_t t_sb_csr;
757 ushort_t id, inst;
758
759 t_sb_csr = *softsp->sbus_ctrl_reg;
760 id = (ushort_t)softsp->upa_id;
761 inst = (ushort_t)ddi_get_instance(softsp->dip);
762
763 if (debug_sysio_errs) {
764 cmn_err(CE_CONT, "sbus_ctrl_ecc_error: SBus%d Control Reg "
765 "0x%016llx Id %d\n", inst, (u_longlong_t)t_sb_csr, id);
766 }
767
768 if (t_sb_csr & (SB_CSR_DPERR_S14|SB_CSR_DPERR_S13|SB_CSR_DPERR_S3|
769 SB_CSR_DPERR_S2|SB_CSR_DPERR_S1|SB_CSR_DPERR_S0|SB_CSR_PIO_PERRS)) {
770 struct async_flt aflt;
771
772 *softsp->sbus_ctrl_reg = t_sb_csr; /* clear error bits */
773
774 bzero(&aflt, sizeof (aflt));
775 aflt.flt_id = gethrtime();
776 aflt.flt_stat = t_sb_csr;
777 aflt.flt_func = sbus_log_csr_error;
778 aflt.flt_bus_id = id;
779 aflt.flt_inst = inst;
780 aflt.flt_status = ECC_IOBUS;
781 aflt.flt_class = BUS_FAULT;
782 aflt.flt_panic = 1;
783
784 errorq_dispatch(ue_queue, &aflt, sizeof (aflt), aflt.flt_panic);
785 return (BF_FATAL);
786 }
787
788 return (BF_NONE);
789 }
790
791 /*ARGSUSED*/
792 static void
sbus_log_csr_error(struct async_flt * aflt,char * unum)793 sbus_log_csr_error(struct async_flt *aflt, char *unum)
794 {
795 uint64_t t_sb_csr = aflt->flt_stat;
796 uint_t id = aflt->flt_bus_id;
797 uint_t inst = aflt->flt_inst;
798
799 /*
800 * Print out SBus error information.
801 */
802 if (t_sb_csr & SB_CSR_DPERR_S14) {
803 cmn_err(CE_WARN,
804 "SBus%d Slot 14 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
805 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
806 }
807 if (t_sb_csr & SB_CSR_DPERR_S13) {
808 cmn_err(CE_WARN,
809 "SBus%d Slot 13 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
810 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
811 }
812 if (t_sb_csr & SB_CSR_DPERR_S3) {
813 cmn_err(CE_WARN,
814 "SBus%d Slot 3 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
815 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
816 }
817 if (t_sb_csr & SB_CSR_DPERR_S2) {
818 cmn_err(CE_WARN,
819 "SBus%d Slot 2 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
820 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
821 }
822 if (t_sb_csr & SB_CSR_DPERR_S1) {
823 cmn_err(CE_WARN,
824 "SBus%d Slot 1 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
825 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
826 }
827 if (t_sb_csr & SB_CSR_DPERR_S0) {
828 cmn_err(CE_WARN,
829 "SBus%d Slot 0 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
830 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
831 }
832 if (t_sb_csr & SB_CSR_PPERR_S15) {
833 cmn_err(CE_WARN,
834 "SBus%d Slot 15 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
835 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
836 }
837 if (t_sb_csr & SB_CSR_PPERR_S14) {
838 cmn_err(CE_WARN,
839 "SBus%d Slot 14 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
840 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
841 }
842 if (t_sb_csr & SB_CSR_PPERR_S13) {
843 cmn_err(CE_WARN,
844 "SBus%d Slot 13 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
845 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
846 }
847 if (t_sb_csr & SB_CSR_PPERR_S3) {
848 cmn_err(CE_WARN,
849 "SBus%d Slot 3 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
850 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
851 }
852 if (t_sb_csr & SB_CSR_PPERR_S2) {
853 cmn_err(CE_WARN,
854 "SBus%d Slot 2 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
855 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
856 }
857 if (t_sb_csr & SB_CSR_PPERR_S1) {
858 cmn_err(CE_WARN,
859 "SBus%d Slot 1 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
860 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
861 }
862 if (t_sb_csr & SB_CSR_PPERR_S0) {
863 cmn_err(CE_WARN,
864 "SBus%d Slot 0 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
865 inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
866 }
867 }
868
869 /*
870 * Sysio Thermal Warning interrupt handler
871 */
872 static uint_t
sysio_thermal_warn_intr(struct sbus_soft_state * softsp)873 sysio_thermal_warn_intr(struct sbus_soft_state *softsp)
874 {
875 volatile uint64_t *clear_reg;
876 volatile uint64_t tmp_mondo_vec;
877 volatile uint64_t *mondo_vec_reg;
878 const char thermal_warn_msg[] =
879 "Severe over-temperature condition detected!";
880
881 /*
882 * Take off the Thermal Warning interrupt and
883 * remove its interrupt handler.
884 */
885 mondo_vec_reg = (softsp->intr_mapping_reg + THERMAL_MAPREG);
886 tmp_mondo_vec = *mondo_vec_reg;
887 tmp_mondo_vec &= ~INTERRUPT_VALID;
888 *mondo_vec_reg = tmp_mondo_vec;
889
890 ddi_remove_intr(softsp->dip, 4, NULL);
891
892 clear_reg = (softsp->clr_intr_reg + THERMAL_CLEAR);
893 *clear_reg = 0;
894
895 if (oven_test) {
896 cmn_err(CE_NOTE, "OVEN TEST: %s", thermal_warn_msg);
897 return (DDI_INTR_CLAIMED);
898 }
899
900 cmn_err(CE_WARN, "%s", thermal_warn_msg);
901 cmn_err(CE_WARN, "Powering down...");
902
903 do_shutdown();
904
905 /*
906 * just in case do_shutdown() fails
907 */
908 (void) timeout((void(*)(void *))power_down, NULL,
909 thermal_powerdown_delay * hz);
910
911 return (DDI_INTR_CLAIMED);
912 }
913