xref: /illumos-gate/usr/src/uts/sun4u/opl/io/pcicmu/pcmu_ecc.c (revision 1a220b56b93ff1dc80855691548503117af4cc10)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * CMU-CH ECC support
30  */
31 
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/kmem.h>
35 #include <sys/sunddi.h>
36 #include <sys/intr.h>
37 #include <sys/async.h>
38 #include <sys/ddi_impldefs.h>
39 #include <sys/machsystm.h>
40 #include <sys/sysmacros.h>
41 #include <sys/fm/protocol.h>
42 #include <sys/fm/util.h>
43 #include <sys/fm/io/pci.h>
44 #include <sys/fm/io/sun4upci.h>
45 #include <sys/fm/io/ddi.h>
46 #include <sys/pcicmu/pcicmu.h>
47 
48 /*LINTLIBRARY*/
49 
50 static void pcmu_ecc_disable(pcmu_ecc_t *, int);
51 static uint64_t pcmu_ecc_read_afsr(pcmu_ecc_intr_info_t *);
52 static void pcmu_ecc_ereport_post(dev_info_t *dip,
53     pcmu_ecc_errstate_t *ecc_err);
54 
55 clock_t pcmu_pecc_panic_delay = 200;
56 
57 void
58 pcmu_ecc_create(pcmu_t *pcmu_p)
59 {
60 	uint64_t pcb_base_pa = pcmu_p->pcmu_cb_p->pcb_base_pa;
61 	pcmu_ecc_t *pecc_p;
62 	/* LINTED variable */
63 	dev_info_t *dip = pcmu_p->pcmu_dip;
64 
65 	pecc_p = (pcmu_ecc_t *)kmem_zalloc(sizeof (pcmu_ecc_t), KM_SLEEP);
66 	pecc_p->pecc_pcmu_p = pcmu_p;
67 	pcmu_p->pcmu_pecc_p = pecc_p;
68 
69 	pecc_p->pecc_ue.pecc_p = pecc_p;
70 	pecc_p->pecc_ue.pecc_type = CBNINTR_UE;
71 
72 	pcmu_ecc_setup(pecc_p);
73 
74 	/*
75 	 * Determine the virtual addresses of the streaming cache
76 	 * control/status and flush registers.
77 	 */
78 	pecc_p->pecc_csr_pa = pcb_base_pa + PCMU_ECC_CSR_OFFSET;
79 	pecc_p->pecc_ue.pecc_afsr_pa = pcb_base_pa + PCMU_UE_AFSR_OFFSET;
80 	pecc_p->pecc_ue.pecc_afar_pa = pcb_base_pa + PCMU_UE_AFAR_OFFSET;
81 
82 	PCMU_DBG1(PCMU_DBG_ATTACH, dip, "pcmu_ecc_create: csr=%x\n",
83 	    pecc_p->pecc_csr_pa);
84 	PCMU_DBG2(PCMU_DBG_ATTACH, dip,
85 	    "pcmu_ecc_create: ue_afsr=%x, ue_afar=%x\n",
86 	    pecc_p->pecc_ue.pecc_afsr_pa, pecc_p->pecc_ue.pecc_afar_pa);
87 
88 	pcmu_ecc_configure(pcmu_p);
89 
90 	/*
91 	 * Register routines to be called from system error handling code.
92 	 */
93 	bus_func_register(BF_TYPE_ERRDIS,
94 	    (busfunc_t)pcmu_ecc_disable_nowait, pecc_p);
95 }
96 
97 int
98 pcmu_ecc_register_intr(pcmu_t *pcmu_p)
99 {
100 	pcmu_ecc_t *pecc_p = pcmu_p->pcmu_pecc_p;
101 	int ret;
102 
103 	/*
104 	 * Install the UE error interrupt handlers.
105 	 */
106 	ret = pcmu_ecc_add_intr(pcmu_p, CBNINTR_UE, &pecc_p->pecc_ue);
107 	return (ret);
108 }
109 
110 void
111 pcmu_ecc_destroy(pcmu_t *pcmu_p)
112 {
113 	pcmu_ecc_t *pecc_p = pcmu_p->pcmu_pecc_p;
114 
115 	PCMU_DBG0(PCMU_DBG_DETACH, pcmu_p->pcmu_dip, "pcmu_ecc_destroy:\n");
116 
117 	/*
118 	 * Disable UE ECC error interrupts.
119 	 */
120 	pcmu_ecc_disable_wait(pecc_p);
121 
122 	/*
123 	 * Remove the ECC interrupt handlers.
124 	 */
125 	pcmu_ecc_rem_intr(pcmu_p, CBNINTR_UE, &pecc_p->pecc_ue);
126 
127 	/*
128 	 * Unregister our error handling functions.
129 	 */
130 	bus_func_unregister(BF_TYPE_ERRDIS,
131 	    (busfunc_t)pcmu_ecc_disable_nowait, pecc_p);
132 	/*
133 	 * If a timer has been set, unset it.
134 	 */
135 	(void) untimeout(pecc_p->pecc_tout_id);
136 	kmem_free(pecc_p, sizeof (pcmu_ecc_t));
137 	pcmu_p->pcmu_pecc_p = NULL;
138 }
139 
140 void
141 pcmu_ecc_configure(pcmu_t *pcmu_p)
142 {
143 	pcmu_ecc_t *pecc_p = pcmu_p->pcmu_pecc_p;
144 	uint64_t l;
145 	/* LINTED variable */
146 	dev_info_t *dip = pcmu_p->pcmu_dip;
147 
148 	/*
149 	 * Clear any pending ECC errors.
150 	 */
151 	PCMU_DBG0(PCMU_DBG_ATTACH, dip,
152 	    "pcmu_ecc_configure: clearing UE errors\n");
153 	l = (PCMU_ECC_UE_AFSR_E_MASK << PCMU_ECC_UE_AFSR_PE_SHIFT) |
154 	    (PCMU_ECC_UE_AFSR_E_MASK << PCMU_ECC_UE_AFSR_SE_SHIFT);
155 	stdphysio(pecc_p->pecc_ue.pecc_afsr_pa, l);
156 
157 	/*
158 	 * Enable ECC error detections via the control register.
159 	 */
160 	PCMU_DBG0(PCMU_DBG_ATTACH, dip,
161 	    "pcmu_ecc_configure: enabling UE detection\n");
162 	l = PCMU_ECC_CTRL_ECC_EN;
163 	if (ecc_error_intr_enable)
164 		l |= PCMU_ECC_CTRL_UE_INTEN;
165 	stdphysio(pecc_p->pecc_csr_pa, l);
166 }
167 
168 void
169 pcmu_ecc_enable_intr(pcmu_t *pcmu_p)
170 {
171 	pcmu_cb_enable_nintr(pcmu_p, CBNINTR_UE);
172 }
173 
174 void
175 pcmu_ecc_disable_wait(pcmu_ecc_t *pecc_p)
176 {
177 	pcmu_ecc_disable(pecc_p, PCMU_IB_INTR_WAIT);
178 }
179 
180 uint_t
181 pcmu_ecc_disable_nowait(pcmu_ecc_t *pecc_p)
182 {
183 	pcmu_ecc_disable(pecc_p, PCMU_IB_INTR_NOWAIT);
184 	return (BF_NONE);
185 }
186 
187 static void
188 pcmu_ecc_disable(pcmu_ecc_t *pecc_p, int wait)
189 {
190 	pcmu_cb_t *pcb_p = pecc_p->pecc_pcmu_p->pcmu_cb_p;
191 	uint64_t csr_pa = pecc_p->pecc_csr_pa;
192 	uint64_t csr = lddphysio(csr_pa);
193 
194 	csr &= ~(PCMU_ECC_CTRL_UE_INTEN);
195 	stdphysio(csr_pa, csr);
196 	pcmu_cb_disable_nintr(pcb_p, CBNINTR_UE, wait);
197 }
198 
199 /*
200  * I/O ECC error handling:
201  *
202  * Below are the generic functions that handle detected ECC errors.
203  *
 * The registered interrupt handler is pcmu_ecc_intr(); its function
 * is to receive the error, capture some state, and pass that on to
 * pcmu_ecc_err_handler() for reporting purposes.
207  *
 * pcmu_ecc_err_handler() gathers more state (via pcmu_ecc_errstate_get())
 * and attempts to handle and report the error. pcmu_ecc_err_handler()
 * must determine if we need to panic due to this error (via
 * pcmu_ecc_classify(), which also decodes the ECC afsr), and if any
 * side effects exist that may have caused or are due to this error.
 * PBM errors related to the ECC error may exist; to report
 * them we call pcmu_pbm_err_handler().
215  *
 * To report the error we must also get the syndrome and unum, which cannot
 * be done in high-level interrupt context. Therefore we have an error
 * queue (pcmu_ecc_queue) to which we dispatch errors so that they can be
 * reported later (pcmu_ecc_err_drain()).
220  *
 * pcmu_ecc_err_drain() will be called when either the softint is triggered
 * or the system is panicking. Either way it will gather more information
 * about the error from the CPU (via ecc_cpu_call(), ecc.c), attempt to
 * retire the faulty page (if error is a UE), and report the detected error.
225  *
226  */
227 
228 /*
229  * Function used to get ECC AFSR register
230  */
231 static uint64_t
232 pcmu_ecc_read_afsr(pcmu_ecc_intr_info_t *ecc_ii_p)
233 {
234 	ASSERT(ecc_ii_p->pecc_type == CBNINTR_UE);
235 	return (lddphysio(ecc_ii_p->pecc_afsr_pa));
236 }
237 
238 /*
239  * IO detected ECC error interrupt handler, calls pcmu_ecc_err_handler to post
240  * error reports and handle the interrupt. Re-entry into pcmu_ecc_err_handler
241  * is protected by the per-chip mutex pcmu_err_mutex.
242  */
243 uint_t
244 pcmu_ecc_intr(caddr_t a)
245 {
246 	pcmu_ecc_intr_info_t *ecc_ii_p = (pcmu_ecc_intr_info_t *)a;
247 	pcmu_ecc_t *pecc_p = ecc_ii_p->pecc_p;
248 	pcmu_t *pcmu_p = pecc_p->pecc_pcmu_p;
249 	pcmu_ecc_errstate_t ecc_err;
250 	int ret = DDI_FM_OK;
251 
252 	bzero(&ecc_err, sizeof (pcmu_ecc_errstate_t));
253 	ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1); /* RAGS */
254 	ecc_err.ecc_ii_p = *ecc_ii_p;
255 	ecc_err.pecc_p = pecc_p;
256 	ecc_err.ecc_caller = PCI_ECC_CALL;
257 
258 	mutex_enter(&pcmu_p->pcmu_err_mutex);
259 	ret = pcmu_ecc_err_handler(&ecc_err);
260 	mutex_exit(&pcmu_p->pcmu_err_mutex);
261 	if (ret == DDI_FM_FATAL) {
262 		/*
263 		 * Need delay here to allow CPUs to handle related traps,
264 		 * such as FRUs for USIIIi systems.
265 		 */
266 		DELAY(pcmu_pecc_panic_delay);
267 		cmn_err(CE_PANIC, "Fatal PCI UE Error");
268 	}
269 
270 	return (DDI_INTR_CLAIMED);
271 }
272 
273 /*
274  * Function used to gather IO ECC error state.
275  */
276 static void
277 pcmu_ecc_errstate_get(pcmu_ecc_errstate_t *ecc_err_p)
278 {
279 	pcmu_ecc_t *pecc_p;
280 	uint_t bus_id;
281 
282 	ASSERT(ecc_err_p);
283 
284 	pecc_p = ecc_err_p->ecc_ii_p.pecc_p;
285 	bus_id = pecc_p->pecc_pcmu_p->pcmu_id;
286 
287 	ASSERT(MUTEX_HELD(&pecc_p->pecc_pcmu_p->pcmu_err_mutex));
288 	/*
289 	 * Read the fault registers.
290 	 */
291 	ecc_err_p->ecc_afsr = pcmu_ecc_read_afsr(&ecc_err_p->ecc_ii_p);
292 	ecc_err_p->ecc_afar = lddphysio(ecc_err_p->ecc_ii_p.pecc_afar_pa);
293 
294 	ecc_err_p->ecc_offset = ((ecc_err_p->ecc_afsr &
295 	    ecc_err_p->ecc_ii_p.pecc_offset_mask) >>
296 	    ecc_err_p->ecc_ii_p.pecc_offset_shift) <<
297 	    ecc_err_p->ecc_ii_p.pecc_size_log2;
298 
299 	ecc_err_p->ecc_aflt.flt_id = gethrtime();
300 	ecc_err_p->ecc_aflt.flt_stat = ecc_err_p->ecc_afsr;
301 	ecc_err_p->ecc_aflt.flt_addr = P2ALIGN(ecc_err_p->ecc_afar, 64) +
302 	    ecc_err_p->ecc_offset;
303 	ecc_err_p->ecc_aflt.flt_bus_id = bus_id;
304 	ecc_err_p->ecc_aflt.flt_inst = 0;
305 	ecc_err_p->ecc_aflt.flt_status = ECC_IOBUS;
306 	ecc_err_p->ecc_aflt.flt_in_memory = 0;
307 	ecc_err_p->ecc_aflt.flt_class = BUS_FAULT;
308 }
309 
310 /*
311  * pcmu_ecc_check: Called by pcmu_ecc_err_handler() this function is responsible
312  * for calling pcmu_pbm_err_handler() and calling their children error
313  * handlers(via ndi_fm_handler_dispatch()).
314  */
315 static int
316 pcmu_ecc_check(pcmu_ecc_t *pecc_p, uint64_t fme_ena)
317 {
318 	ddi_fm_error_t derr;
319 	int ret;
320 	pcmu_t *pcmu_p;
321 
322 
323 	ASSERT(MUTEX_HELD(&pecc_p->pecc_pcmu_p->pcmu_err_mutex));
324 
325 	bzero(&derr, sizeof (ddi_fm_error_t));
326 	derr.fme_version = DDI_FME_VERSION;
327 	derr.fme_ena = fme_ena;
328 	ret = DDI_FM_NONFATAL;
329 
330 	/*
331 	 * Need to report any PBM errors which may have caused or
332 	 * resulted from this error.
333 	 */
334 	pcmu_p = pecc_p->pecc_pcmu_p;
335 	if (pcmu_pbm_err_handler(pcmu_p->pcmu_dip, &derr, (void *)pcmu_p,
336 	    PCI_ECC_CALL) == DDI_FM_FATAL)
337 		ret = DDI_FM_FATAL;
338 
339 	if (ret == DDI_FM_FATAL)
340 		return (DDI_FM_FATAL);
341 	else
342 		return (DDI_FM_NONFATAL);
343 }
344 
345 /*
346  * Function used to handle and log IO detected ECC errors, can be called by
347  * pcmu_ecc_intr and pcmu_err_callback(trap callback). Protected by
348  * pcmu_err_mutex.
349  */
350 int
351 pcmu_ecc_err_handler(pcmu_ecc_errstate_t *ecc_err_p)
352 {
353 	/* LINTED variable */
354 	uint64_t pri_err, sec_err;
355 	pcmu_ecc_intr_info_t *ecc_ii_p = &ecc_err_p->ecc_ii_p;
356 	pcmu_ecc_t *pecc_p = ecc_ii_p->pecc_p;
357 	/* LINTED variable */
358 	pcmu_t *pcmu_p;
359 	pcmu_cb_t *pcb_p;
360 	int fatal = 0;
361 	int nonfatal = 0;
362 
363 	ASSERT(MUTEX_HELD(&pecc_p->pecc_pcmu_p->pcmu_err_mutex));
364 
365 	pcmu_p = pecc_p->pecc_pcmu_p;
366 	pcb_p = pecc_p->pecc_pcmu_p->pcmu_cb_p;
367 
368 	pcmu_ecc_errstate_get(ecc_err_p);
369 	pri_err = (ecc_err_p->ecc_afsr >> PCMU_ECC_UE_AFSR_PE_SHIFT) &
370 		PCMU_ECC_UE_AFSR_E_MASK;
371 
372 	sec_err = (ecc_err_p->ecc_afsr >> PCMU_ECC_UE_AFSR_SE_SHIFT) &
373 		PCMU_ECC_UE_AFSR_E_MASK;
374 
375 	switch (ecc_ii_p->pecc_type) {
376 	case CBNINTR_UE:
377 		if (pri_err) {
378 			ecc_err_p->ecc_aflt.flt_synd = 0;
379 			ecc_err_p->pecc_pri = 1;
380 			pcmu_ecc_classify(pri_err, ecc_err_p);
381 			errorq_dispatch(pcmu_ecc_queue, (void *)ecc_err_p,
382 				sizeof (pcmu_ecc_errstate_t),
383 				ecc_err_p->ecc_aflt.flt_panic);
384 		}
385 		if (sec_err) {
386 			pcmu_ecc_errstate_t ecc_sec_err;
387 
388 			ecc_sec_err = *ecc_err_p;
389 			ecc_sec_err.pecc_pri = 0;
390 			pcmu_ecc_classify(sec_err, &ecc_sec_err);
391 			pcmu_ecc_ereport_post(pcmu_p->pcmu_dip,
392 					&ecc_sec_err);
393 		}
394 		/*
395 		 * Check for PCI bus errors that may have resulted from or
396 		 * caused this UE.
397 		 */
398 		if (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
399 		    pcmu_ecc_check(pecc_p, ecc_err_p->ecc_ena) == DDI_FM_FATAL)
400 			ecc_err_p->ecc_aflt.flt_panic = 1;
401 
402 		if (ecc_err_p->ecc_aflt.flt_panic) {
403 			/*
404 			 * Disable all further errors since this will be
405 			 * treated as a fatal error.
406 			 */
407 			(void) pcmu_ecc_disable_nowait(pecc_p);
408 			fatal++;
409 		}
410 		break;
411 
412 	default:
413 		return (DDI_FM_OK);
414 	}
415 	/* Clear the errors */
416 	stdphysio(ecc_ii_p->pecc_afsr_pa, ecc_err_p->ecc_afsr);
417 	/*
418 	 * Clear the interrupt if called by pcmu_ecc_intr and UE error
419 	 * or if called by pcmu_ecc_intr and CE error and delayed CE
420 	 * interrupt handling is turned off.
421 	 */
422 	if (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
423 	    ecc_ii_p->pecc_type == CBNINTR_UE && !fatal)
424 		pcmu_cb_clear_nintr(pcb_p, ecc_ii_p->pecc_type);
425 	if (!fatal && !nonfatal)
426 		return (DDI_FM_OK);
427 	else if (fatal)
428 		return (DDI_FM_FATAL);
429 	return (DDI_FM_NONFATAL);
430 }
431 
432 /*
433  * Function used to drain pcmu_ecc_queue, either during panic or after softint
434  * is generated, to log IO detected ECC errors.
435  */
436 void
437 pcmu_ecc_err_drain(void *not_used, pcmu_ecc_errstate_t *ecc_err)
438 {
439 	struct async_flt *ecc = &ecc_err->ecc_aflt;
440 	pcmu_t *pcmu_p = ecc_err->pecc_p->pecc_pcmu_p;
441 
442 	ecc_cpu_call(ecc, ecc_err->ecc_unum, ECC_IO_UE);
443 	ecc_err->ecc_err_type = "U";
444 	pcmu_ecc_ereport_post(pcmu_p->pcmu_dip, ecc_err);
445 }
446 
447 /*
448  * Function used to post IO detected ECC ereports.
449  */
450 static void
451 pcmu_ecc_ereport_post(dev_info_t *dip, pcmu_ecc_errstate_t *ecc_err)
452 {
453 	char *aux_msg;
454 	pcmu_t *pcmu_p;
455 	int instance = ddi_get_instance(dip);
456 
457 	pcmu_p = get_pcmu_soft_state(instance);
458 	if (ecc_err->pecc_pri) {
459 		aux_msg = "PIO primary uncorrectable error";
460 	} else {
461 		aux_msg = "PIO secondary uncorrectable error";
462 	}
463 	cmn_err(CE_WARN, "%s %s: %s %s=0x%lx, %s=0x%lx, %s=0x%x",
464 		(pcmu_p->pcmu_pcbm_p)->pcbm_nameinst_str,
465 		(pcmu_p->pcmu_pcbm_p)->pcbm_nameaddr_str,
466 		aux_msg, PCI_ECC_AFSR, ecc_err->ecc_afsr,
467 		PCI_ECC_AFAR, ecc_err->ecc_aflt.flt_addr,
468 		"portid", ecc_err->ecc_aflt.flt_bus_id);
469 }
470