xref: /illumos-gate/usr/src/uts/sun4u/opl/io/pcicmu/pcmu_ecc.c (revision 3ce5372277f4657ad0e52d36c979527c4ca22de2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * CMU-CH ECC support
30  */
31 
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/kmem.h>
35 #include <sys/sunddi.h>
36 #include <sys/intr.h>
37 #include <sys/async.h>
38 #include <sys/ddi_impldefs.h>
39 #include <sys/machsystm.h>
40 #include <sys/sysmacros.h>
41 #include <sys/fm/protocol.h>
42 #include <sys/fm/util.h>
43 #include <sys/fm/io/pci.h>
44 #include <sys/fm/io/sun4upci.h>
45 #include <sys/fm/io/ddi.h>
46 #include <sys/pcicmu/pcicmu.h>
47 
/*
 * Forward declarations for helpers that are private to this file.
 */
static void pcmu_ecc_disable(pcmu_ecc_t *, int);
static uint64_t pcmu_ecc_read_afsr(pcmu_ecc_intr_info_t *);
static void pcmu_ecc_ereport_post(dev_info_t *dip,
    pcmu_ecc_errstate_t *ecc_err);

/*
 * Value handed to DELAY() by pcmu_ecc_intr() just before it panics on a
 * fatal UE, giving CPUs time to handle related traps.
 * NOTE(review): units are whatever DELAY() expects (presumably
 * microseconds) - confirm.
 */
clock_t pcmu_pecc_panic_delay = 200;
54 
55 void
56 pcmu_ecc_create(pcmu_t *pcmu_p)
57 {
58 	uint64_t pcb_base_pa = pcmu_p->pcmu_cb_p->pcb_base_pa;
59 	pcmu_ecc_t *pecc_p;
60 	/* LINTED variable */
61 	dev_info_t *dip = pcmu_p->pcmu_dip;
62 
63 	pecc_p = (pcmu_ecc_t *)kmem_zalloc(sizeof (pcmu_ecc_t), KM_SLEEP);
64 	pecc_p->pecc_pcmu_p = pcmu_p;
65 	pcmu_p->pcmu_pecc_p = pecc_p;
66 
67 	pecc_p->pecc_ue.pecc_p = pecc_p;
68 	pecc_p->pecc_ue.pecc_type = CBNINTR_UE;
69 
70 	pcmu_ecc_setup(pecc_p);
71 
72 	/*
73 	 * Determine the virtual addresses of the streaming cache
74 	 * control/status and flush registers.
75 	 */
76 	pecc_p->pecc_csr_pa = pcb_base_pa + PCMU_ECC_CSR_OFFSET;
77 	pecc_p->pecc_ue.pecc_afsr_pa = pcb_base_pa + PCMU_UE_AFSR_OFFSET;
78 	pecc_p->pecc_ue.pecc_afar_pa = pcb_base_pa + PCMU_UE_AFAR_OFFSET;
79 
80 	PCMU_DBG1(PCMU_DBG_ATTACH, dip, "pcmu_ecc_create: csr=%x\n",
81 	    pecc_p->pecc_csr_pa);
82 	PCMU_DBG2(PCMU_DBG_ATTACH, dip,
83 	    "pcmu_ecc_create: ue_afsr=%x, ue_afar=%x\n",
84 	    pecc_p->pecc_ue.pecc_afsr_pa, pecc_p->pecc_ue.pecc_afar_pa);
85 
86 	pcmu_ecc_configure(pcmu_p);
87 
88 	/*
89 	 * Register routines to be called from system error handling code.
90 	 */
91 	bus_func_register(BF_TYPE_ERRDIS,
92 	    (busfunc_t)pcmu_ecc_disable_nowait, pecc_p);
93 }
94 
95 int
96 pcmu_ecc_register_intr(pcmu_t *pcmu_p)
97 {
98 	pcmu_ecc_t *pecc_p = pcmu_p->pcmu_pecc_p;
99 	int ret;
100 
101 	/*
102 	 * Install the UE error interrupt handlers.
103 	 */
104 	ret = pcmu_ecc_add_intr(pcmu_p, CBNINTR_UE, &pecc_p->pecc_ue);
105 	return (ret);
106 }
107 
108 void
109 pcmu_ecc_destroy(pcmu_t *pcmu_p)
110 {
111 	pcmu_ecc_t *pecc_p = pcmu_p->pcmu_pecc_p;
112 
113 	PCMU_DBG0(PCMU_DBG_DETACH, pcmu_p->pcmu_dip, "pcmu_ecc_destroy:\n");
114 
115 	/*
116 	 * Disable UE ECC error interrupts.
117 	 */
118 	pcmu_ecc_disable_wait(pecc_p);
119 
120 	/*
121 	 * Remove the ECC interrupt handlers.
122 	 */
123 	pcmu_ecc_rem_intr(pcmu_p, CBNINTR_UE, &pecc_p->pecc_ue);
124 
125 	/*
126 	 * Unregister our error handling functions.
127 	 */
128 	bus_func_unregister(BF_TYPE_ERRDIS,
129 	    (busfunc_t)pcmu_ecc_disable_nowait, pecc_p);
130 	/*
131 	 * If a timer has been set, unset it.
132 	 */
133 	(void) untimeout(pecc_p->pecc_tout_id);
134 	kmem_free(pecc_p, sizeof (pcmu_ecc_t));
135 	pcmu_p->pcmu_pecc_p = NULL;
136 }
137 
138 void
139 pcmu_ecc_configure(pcmu_t *pcmu_p)
140 {
141 	pcmu_ecc_t *pecc_p = pcmu_p->pcmu_pecc_p;
142 	uint64_t l;
143 	/* LINTED variable */
144 	dev_info_t *dip = pcmu_p->pcmu_dip;
145 
146 	/*
147 	 * Clear any pending ECC errors.
148 	 */
149 	PCMU_DBG0(PCMU_DBG_ATTACH, dip,
150 	    "pcmu_ecc_configure: clearing UE errors\n");
151 	l = (PCMU_ECC_UE_AFSR_E_MASK << PCMU_ECC_UE_AFSR_PE_SHIFT) |
152 	    (PCMU_ECC_UE_AFSR_E_MASK << PCMU_ECC_UE_AFSR_SE_SHIFT);
153 	stdphysio(pecc_p->pecc_ue.pecc_afsr_pa, l);
154 
155 	/*
156 	 * Enable ECC error detections via the control register.
157 	 */
158 	PCMU_DBG0(PCMU_DBG_ATTACH, dip,
159 	    "pcmu_ecc_configure: enabling UE detection\n");
160 	l = PCMU_ECC_CTRL_ECC_EN;
161 	if (ecc_error_intr_enable)
162 		l |= PCMU_ECC_CTRL_UE_INTEN;
163 	stdphysio(pecc_p->pecc_csr_pa, l);
164 }
165 
/*
 * Enable the UE interrupt for this CMU-CH instance by calling
 * pcmu_cb_enable_nintr() with CBNINTR_UE.
 */
void
pcmu_ecc_enable_intr(pcmu_t *pcmu_p)
{
	pcmu_cb_enable_nintr(pcmu_p, CBNINTR_UE);
}
171 
/*
 * Disable UE error interrupts, passing PCMU_IB_INTR_WAIT to
 * pcmu_ecc_disable().  Used on the detach path (pcmu_ecc_destroy()).
 * NOTE(review): presumably the WAIT flag makes the disable wait for a
 * pending interrupt to finish - confirm in pcmu_cb_disable_nintr().
 */
void
pcmu_ecc_disable_wait(pcmu_ecc_t *pecc_p)
{
	pcmu_ecc_disable(pecc_p, PCMU_IB_INTR_WAIT);
}
177 
/*
 * Disable UE error interrupts, passing PCMU_IB_INTR_NOWAIT to
 * pcmu_ecc_disable().  This routine is registered as a BF_TYPE_ERRDIS
 * bus function by pcmu_ecc_create() and is also called from
 * pcmu_ecc_err_handler() when an error is fatal.  Always returns BF_NONE.
 */
uint_t
pcmu_ecc_disable_nowait(pcmu_ecc_t *pecc_p)
{
	pcmu_ecc_disable(pecc_p, PCMU_IB_INTR_NOWAIT);
	return (BF_NONE);
}
184 
185 static void
186 pcmu_ecc_disable(pcmu_ecc_t *pecc_p, int wait)
187 {
188 	pcmu_cb_t *pcb_p = pecc_p->pecc_pcmu_p->pcmu_cb_p;
189 	uint64_t csr_pa = pecc_p->pecc_csr_pa;
190 	uint64_t csr = lddphysio(csr_pa);
191 
192 	csr &= ~(PCMU_ECC_CTRL_UE_INTEN);
193 	stdphysio(csr_pa, csr);
194 	pcmu_cb_disable_nintr(pcb_p, CBNINTR_UE, wait);
195 }
196 
/*
 * I/O ECC error handling:
 *
 * Below are the generic functions that handle detected ECC errors.
 *
 * The registered interrupt handler is pcmu_ecc_intr(); its function
 * is to receive the error, capture some state, and pass that on to
 * pcmu_ecc_err_handler() for reporting purposes.
 *
 * pcmu_ecc_err_handler() gathers more state (via pcmu_ecc_errstate_get())
 * and attempts to handle and report the error. pcmu_ecc_err_handler()
 * must determine if we need to panic due to this error (via
 * pcmu_ecc_classify(), which also decodes the ECC AFSR), and if any
 * side effects exist that may have caused or are due to this error.
 * PBM errors related to the ECC error may exist; to report
 * them we call pcmu_pbm_err_handler().
 *
 * To report the error we must also get the syndrome and unum, which cannot
 * be done in high-level interrupt context. Therefore we have an error
 * queue (pcmu_ecc_queue) to which we dispatch errors so that they can be
 * reported later (by pcmu_ecc_err_drain()).
 *
 * pcmu_ecc_err_drain() will be called when either the softint is triggered
 * or the system is panicking. Either way it will gather more information
 * about the error from the CPU (via ecc_cpu_call(), ecc.c), attempt to
 * retire the faulty page (if error is a UE), and report the detected error.
 *
 */
225 
/*
 * Read and return the current contents of the ECC AFSR register whose
 * physical address is stored in ecc_ii_p->pecc_afsr_pa.  Only the UE
 * interrupt type is expected here; this is asserted on DEBUG kernels.
 */
static uint64_t
pcmu_ecc_read_afsr(pcmu_ecc_intr_info_t *ecc_ii_p)
{
	ASSERT(ecc_ii_p->pecc_type == CBNINTR_UE);
	return (lddphysio(ecc_ii_p->pecc_afsr_pa));
}
235 
236 /*
237  * IO detected ECC error interrupt handler, calls pcmu_ecc_err_handler to post
238  * error reports and handle the interrupt. Re-entry into pcmu_ecc_err_handler
239  * is protected by the per-chip mutex pcmu_err_mutex.
240  */
241 uint_t
242 pcmu_ecc_intr(caddr_t a)
243 {
244 	pcmu_ecc_intr_info_t *ecc_ii_p = (pcmu_ecc_intr_info_t *)a;
245 	pcmu_ecc_t *pecc_p = ecc_ii_p->pecc_p;
246 	pcmu_t *pcmu_p = pecc_p->pecc_pcmu_p;
247 	pcmu_ecc_errstate_t ecc_err;
248 	int ret = DDI_FM_OK;
249 
250 	bzero(&ecc_err, sizeof (pcmu_ecc_errstate_t));
251 	ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1); /* RAGS */
252 	ecc_err.ecc_ii_p = *ecc_ii_p;
253 	ecc_err.pecc_p = pecc_p;
254 	ecc_err.ecc_caller = PCI_ECC_CALL;
255 
256 	mutex_enter(&pcmu_p->pcmu_err_mutex);
257 	ret = pcmu_ecc_err_handler(&ecc_err);
258 	mutex_exit(&pcmu_p->pcmu_err_mutex);
259 	if (ret == DDI_FM_FATAL) {
260 		/*
261 		 * Need delay here to allow CPUs to handle related traps,
262 		 * such as FRUs for USIIIi systems.
263 		 */
264 		DELAY(pcmu_pecc_panic_delay);
265 		cmn_err(CE_PANIC, "Fatal PCI UE Error");
266 	}
267 
268 	return (DDI_INTR_CLAIMED);
269 }
270 
271 /*
272  * Function used to gather IO ECC error state.
273  */
274 static void
275 pcmu_ecc_errstate_get(pcmu_ecc_errstate_t *ecc_err_p)
276 {
277 	pcmu_ecc_t *pecc_p;
278 	uint_t bus_id;
279 
280 	ASSERT(ecc_err_p);
281 
282 	pecc_p = ecc_err_p->ecc_ii_p.pecc_p;
283 	bus_id = pecc_p->pecc_pcmu_p->pcmu_id;
284 
285 	ASSERT(MUTEX_HELD(&pecc_p->pecc_pcmu_p->pcmu_err_mutex));
286 	/*
287 	 * Read the fault registers.
288 	 */
289 	ecc_err_p->ecc_afsr = pcmu_ecc_read_afsr(&ecc_err_p->ecc_ii_p);
290 	ecc_err_p->ecc_afar = lddphysio(ecc_err_p->ecc_ii_p.pecc_afar_pa);
291 
292 	ecc_err_p->ecc_offset = ((ecc_err_p->ecc_afsr &
293 	    ecc_err_p->ecc_ii_p.pecc_offset_mask) >>
294 	    ecc_err_p->ecc_ii_p.pecc_offset_shift) <<
295 	    ecc_err_p->ecc_ii_p.pecc_size_log2;
296 
297 	ecc_err_p->ecc_aflt.flt_id = gethrtime();
298 	ecc_err_p->ecc_aflt.flt_stat = ecc_err_p->ecc_afsr;
299 	ecc_err_p->ecc_aflt.flt_addr = P2ALIGN(ecc_err_p->ecc_afar, 64) +
300 	    ecc_err_p->ecc_offset;
301 	ecc_err_p->ecc_aflt.flt_bus_id = bus_id;
302 	ecc_err_p->ecc_aflt.flt_inst = 0;
303 	ecc_err_p->ecc_aflt.flt_status = ECC_IOBUS;
304 	ecc_err_p->ecc_aflt.flt_in_memory = 0;
305 	ecc_err_p->ecc_aflt.flt_class = BUS_FAULT;
306 }
307 
308 /*
309  * pcmu_ecc_check: Called by pcmu_ecc_err_handler() this function is responsible
310  * for calling pcmu_pbm_err_handler() and calling their children error
311  * handlers(via ndi_fm_handler_dispatch()).
312  */
313 static int
314 pcmu_ecc_check(pcmu_ecc_t *pecc_p, uint64_t fme_ena)
315 {
316 	ddi_fm_error_t derr;
317 	int ret;
318 	pcmu_t *pcmu_p;
319 
320 
321 	ASSERT(MUTEX_HELD(&pecc_p->pecc_pcmu_p->pcmu_err_mutex));
322 
323 	bzero(&derr, sizeof (ddi_fm_error_t));
324 	derr.fme_version = DDI_FME_VERSION;
325 	derr.fme_ena = fme_ena;
326 	ret = DDI_FM_NONFATAL;
327 
328 	/*
329 	 * Need to report any PBM errors which may have caused or
330 	 * resulted from this error.
331 	 */
332 	pcmu_p = pecc_p->pecc_pcmu_p;
333 	if (pcmu_pbm_err_handler(pcmu_p->pcmu_dip, &derr, (void *)pcmu_p,
334 	    PCI_ECC_CALL) == DDI_FM_FATAL)
335 		ret = DDI_FM_FATAL;
336 
337 	if (ret == DDI_FM_FATAL)
338 		return (DDI_FM_FATAL);
339 	else
340 		return (DDI_FM_NONFATAL);
341 }
342 
343 /*
344  * Function used to handle and log IO detected ECC errors, can be called by
345  * pcmu_ecc_intr and pcmu_err_callback(trap callback). Protected by
346  * pcmu_err_mutex.
347  */
348 int
349 pcmu_ecc_err_handler(pcmu_ecc_errstate_t *ecc_err_p)
350 {
351 	/* LINTED variable */
352 	uint64_t pri_err, sec_err;
353 	pcmu_ecc_intr_info_t *ecc_ii_p = &ecc_err_p->ecc_ii_p;
354 	pcmu_ecc_t *pecc_p = ecc_ii_p->pecc_p;
355 	/* LINTED variable */
356 	pcmu_t *pcmu_p;
357 	pcmu_cb_t *pcb_p;
358 	int fatal = 0;
359 	int nonfatal = 0;
360 
361 	ASSERT(MUTEX_HELD(&pecc_p->pecc_pcmu_p->pcmu_err_mutex));
362 
363 	pcmu_p = pecc_p->pecc_pcmu_p;
364 	pcb_p = pecc_p->pecc_pcmu_p->pcmu_cb_p;
365 
366 	pcmu_ecc_errstate_get(ecc_err_p);
367 	pri_err = (ecc_err_p->ecc_afsr >> PCMU_ECC_UE_AFSR_PE_SHIFT) &
368 		PCMU_ECC_UE_AFSR_E_MASK;
369 
370 	sec_err = (ecc_err_p->ecc_afsr >> PCMU_ECC_UE_AFSR_SE_SHIFT) &
371 		PCMU_ECC_UE_AFSR_E_MASK;
372 
373 	switch (ecc_ii_p->pecc_type) {
374 	case CBNINTR_UE:
375 		if (pri_err) {
376 			ecc_err_p->ecc_aflt.flt_synd = 0;
377 			ecc_err_p->pecc_pri = 1;
378 			pcmu_ecc_classify(pri_err, ecc_err_p);
379 			errorq_dispatch(pcmu_ecc_queue, (void *)ecc_err_p,
380 				sizeof (pcmu_ecc_errstate_t),
381 				ecc_err_p->ecc_aflt.flt_panic);
382 		}
383 		if (sec_err) {
384 			pcmu_ecc_errstate_t ecc_sec_err;
385 
386 			ecc_sec_err = *ecc_err_p;
387 			ecc_sec_err.pecc_pri = 0;
388 			pcmu_ecc_classify(sec_err, &ecc_sec_err);
389 			pcmu_ecc_ereport_post(pcmu_p->pcmu_dip,
390 					&ecc_sec_err);
391 		}
392 		/*
393 		 * Check for PCI bus errors that may have resulted from or
394 		 * caused this UE.
395 		 */
396 		if (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
397 		    pcmu_ecc_check(pecc_p, ecc_err_p->ecc_ena) == DDI_FM_FATAL)
398 			ecc_err_p->ecc_aflt.flt_panic = 1;
399 
400 		if (ecc_err_p->ecc_aflt.flt_panic) {
401 			/*
402 			 * Disable all further errors since this will be
403 			 * treated as a fatal error.
404 			 */
405 			(void) pcmu_ecc_disable_nowait(pecc_p);
406 			fatal++;
407 		}
408 		break;
409 
410 	default:
411 		return (DDI_FM_OK);
412 	}
413 	/* Clear the errors */
414 	stdphysio(ecc_ii_p->pecc_afsr_pa, ecc_err_p->ecc_afsr);
415 	/*
416 	 * Clear the interrupt if called by pcmu_ecc_intr and UE error
417 	 * or if called by pcmu_ecc_intr and CE error and delayed CE
418 	 * interrupt handling is turned off.
419 	 */
420 	if (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
421 	    ecc_ii_p->pecc_type == CBNINTR_UE && !fatal)
422 		pcmu_cb_clear_nintr(pcb_p, ecc_ii_p->pecc_type);
423 	if (!fatal && !nonfatal)
424 		return (DDI_FM_OK);
425 	else if (fatal)
426 		return (DDI_FM_FATAL);
427 	return (DDI_FM_NONFATAL);
428 }
429 
430 /*
431  * Function used to drain pcmu_ecc_queue, either during panic or after softint
432  * is generated, to log IO detected ECC errors.
433  */
434 /* ARGSUSED */
435 void
436 pcmu_ecc_err_drain(void *not_used, pcmu_ecc_errstate_t *ecc_err)
437 {
438 	struct async_flt *ecc = &ecc_err->ecc_aflt;
439 	pcmu_t *pcmu_p = ecc_err->pecc_p->pecc_pcmu_p;
440 
441 	ecc_cpu_call(ecc, ecc_err->ecc_unum, ECC_IO_UE);
442 	ecc_err->ecc_err_type = "U";
443 	pcmu_ecc_ereport_post(pcmu_p->pcmu_dip, ecc_err);
444 }
445 
446 /*
447  * Function used to post IO detected ECC ereports.
448  */
449 static void
450 pcmu_ecc_ereport_post(dev_info_t *dip, pcmu_ecc_errstate_t *ecc_err)
451 {
452 	char *aux_msg;
453 	pcmu_t *pcmu_p;
454 	int instance = ddi_get_instance(dip);
455 
456 	pcmu_p = get_pcmu_soft_state(instance);
457 	if (ecc_err->pecc_pri) {
458 		aux_msg = "PIO primary uncorrectable error";
459 	} else {
460 		aux_msg = "PIO secondary uncorrectable error";
461 	}
462 	cmn_err(CE_WARN, "%s %s: %s %s=0x%lx, %s=0x%lx, %s=0x%x",
463 		(pcmu_p->pcmu_pcbm_p)->pcbm_nameinst_str,
464 		(pcmu_p->pcmu_pcbm_p)->pcbm_nameaddr_str,
465 		aux_msg, PCI_ECC_AFSR, ecc_err->ecc_afsr,
466 		PCI_ECC_AFAR, ecc_err->ecc_aflt.flt_addr,
467 		"portid", ecc_err->ecc_aflt.flt_bus_id);
468 }
469