xref: /illumos-gate/usr/src/uts/common/sys/ib/adapters/hermon/hermon_fm.h (revision 6a634c9dca3093f3922e4b7ab826d7bdf17bf78e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #ifndef	_SYS_IB_ADAPTERS_HERMON_FM_H
27 #define	_SYS_IB_ADAPTERS_HERMON_FM_H
28 
29 /*
30  * hermon_fm.h
31  */
32 #include <sys/ddifm.h>
33 #include <sys/fm/protocol.h>
34 #include <sys/fm/util.h>
35 #include <sys/fm/io/ddi.h>
36 
37 #ifdef __cplusplus
38 extern "C" {
39 #endif
40 
/*
 * HCA FMA compile note.
 *
 * FMA_TEST is used for HCA function tests, and
 * the macro can be turned on by changing the Makefile.
 *
 * in case of DEBUG
 *	FMA_TEST is on
 *
 * in case of non-DEBUG (DEBUG is off)
 *	FMA_TEST is off
 */
53 
54 /*
55  * HCA FM common data structure
56  */
57 
58 /*
59  * HCA FM Structure
60  * This structure is used to catch HCA HW errors.
61  */
62 struct i_hca_fm {
63 	uint32_t ref_cnt;	/* the number of instances referring to this */
64 	kmutex_t lock;		/* protection for last_err & polling thread */
65 	struct i_hca_acc_handle *hdl;	/* HCA FM acc handle structure */
66 	struct kmem_cache *fm_acc_cache; /* HCA acc handle cache */
67 
68 };
69 
70 /*
71  * HCA FM acc handle structure
72  * This structure is holding ddi_acc_handle_t and other members
73  * to deal with HCA PIO FM.
74  */
75 struct i_hca_acc_handle {
76 	struct i_hca_acc_handle *next;	/* next structure */
77 	ddi_acc_handle_t save_hdl;	/* acc handle */
78 	kmutex_t lock;			/* mutex lock for thread count */
79 	uint32_t thread_cnt;		/* number of threads issuing PIOs */
80 };
81 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", i_hca_acc_handle::save_hdl))
/*
 * fm_acc_hdl() treats its argument as a pointer to struct i_hca_acc_handle
 * and yields that structure's save_hdl member (usable as an lvalue).
 */
#define	fm_acc_hdl(hdl)	(((struct i_hca_acc_handle *)(hdl))->save_hdl)
#define	FM_POLL_INTERVAL (10000000)	/* 10 ms, expressed in nanoseconds */
84 
85 /*
86  * HCA FM function test structure
87  * This structure can be used to test the basic fm function test for HCA.
88  * The test code is included if the FMA_TEST macro is defined.
89  */
90 struct i_hca_fm_test {
91 	int num;		/* serial numner */
92 	int type;		/* PIO or Hermon specific errors */
93 #define	HCA_TEST_PIO	0x1
94 #define	HCA_TEST_IBA	0x2
95 	int trigger;		/* how to trigger a HW error */
96 #define	HCA_TEST_TRANSIENT		0x0001
97 #define	HCA_TEST_PERSISTENT		0x0002
98 #define	HCA_TEST_ATTACH			0x0010
99 #define	HCA_TEST_START			0x0100
100 #define	HCA_TEST_END			0x0200
101 	void (*pio_injection)(struct i_hca_fm_test *, ddi_fm_error_t *);
102 	int errcnt;		/* how many transient error occurs */
103 	int line_num;		/* line number in the source code */
104 	char *file_name;	/* source filename */
105 	char *hash_key;		/* hash table for test items */
106 	void *private;		/* private data */
107 };
108 
/*
 * Hermon FM data structure
 *
 * Hermon-specific names for the generic HCA FM structures.
 */
typedef struct i_hca_fm hermon_hca_fm_t;
typedef struct i_hca_acc_handle hermon_acc_handle_t;
typedef struct i_hca_fm_test hermon_test_t;
115 
/*
 * The following defines are to supplement device error reporting.
 * At each place where the planned FMA error matrix specifies that
 * an ereport will be generated, for now there is a HERMON_FMANOTE()
 * call generating an appropriate message string.
 *
 * This has been revised since it has been realized that FMA is only
 * to be used for hardware errors.  HERMON_FMANOTE() is used to report
 * errors that are likely to be hardware, but possibly are not.
 *
 * state:  pointer to a structure with an hs_instance member, which is
 *         printed as the instance number.
 * string: message text, normally one of the HERMON_FMA_* strings.
 */
#define	HERMON_FMANOTE(state, string)					\
	cmn_err(CE_WARN, "hermon%d: Device Error: %s",			\
		(state)->hs_instance, (string))
129 
/*
 * Message strings handed to HERMON_FMANOTE(), grouped by the source file
 * that reports them.
 */

/* CQE Syndrome errors - see hermon_cq.c */

#define	HERMON_FMA_LOCLEN	"CQE local length error"
#define	HERMON_FMA_LOCQPOP	"CQE local qp operation error"
#define	HERMON_FMA_LOCPROT	"CQE local protection error"
#define	HERMON_FMA_WQFLUSH	"CQE wqe flushed in error"
#define	HERMON_FMA_MWBIND	"CQE memory window bind error"
#define	HERMON_FMA_RESP		"CQE bad response"
#define	HERMON_FMA_LOCACC	"CQE local access error"
#define	HERMON_FMA_REMREQ	"CQE remote invalid request error"
#define	HERMON_FMA_REMACC	"CQE remote access error"
#define	HERMON_FMA_REMOP	"CQE remote operation error"
#define	HERMON_FMA_XPORTCNT	"CQE transport retry counter exceeded"
#define	HERMON_FMA_RNRCNT	"CQE RNR retry counter exceeded"
#define	HERMON_FMA_REMABRT	"CQE remote aborted error"
#define	HERMON_FMA_UNKN		"CQE unknown/reserved error returned"

/* event errors - see hermon_event.c */
#define	HERMON_FMA_OVERRUN	"EQE cq overrun or protection error"
#define	HERMON_FMA_LOCCAT	"EQE local work queue catastrophic error"
#define	HERMON_FMA_QPCAT	"EQE local queue pair catastrophic error"
#define	HERMON_FMA_PATHMIG	"EQE path migration failed"
#define	HERMON_FMA_LOCINV	"EQE invalid request - local work queue"
#define	HERMON_FMA_LOCACEQ	"EQE local access violation"
#define	HERMON_FMA_SRQCAT	"EQE shared received queue catastrophic"
#define	HERMON_FMA_INTERNAL	"EQE hca internal error"

/* HCR device failure returns - see hermon_cmd.c */
#define	HERMON_FMA_HCRINT	"HCR internal error processing command"
#define	HERMON_FMA_NVMEM	"HCR NVRAM checksum/CRC failure"
#define	HERMON_FMA_TOTOG	"HCR Timeout waiting for command toggle"
#define	HERMON_FMA_GOBIT	"HCR Timeout waiting for command go bit"
#define	HERMON_FMA_RSRC		"HCR Command insufficient resources"
#define	HERMON_FMA_CMDINV	"HCR Command invalid status returned"

/* HCA initialization errors - see hermon.c */
#define	HERMON_FMA_FWVER	"HCA firmware not at minimum version"
#define	HERMON_FMA_PCIID	"HCA PCIe devid not supported"
#define	HERMON_FMA_MAINT	"HCA device set to memory controller mode"
#define	HERMON_FMA_BADNVMEM	"HCR bad NVMEM error"
170 
/*
 * HCA FM constants
 */

/* HCA FM state */
#define	HCA_NO_FM		0x0000	/* HCA FM is not supported */
/* HCA FM state flags */
#define	HCA_PIO_FM		0x0001	/* PIO is fma-protected */
#define	HCA_DMA_FM		0x0002	/* DMA is fma-protected */
#define	HCA_EREPORT_FM		0x0004	/* FMA ereport is available */
#define	HCA_ERRCB_FM		0x0010	/* FMA error callback is supported */

#define	HCA_ATTCH_FM		0x0100	/* HCA FM attach mode */
#define	HCA_RUNTM_FM		0x0200	/* HCA FM runtime mode */

/* HCA ereport type */
#define	HCA_SYS_ERR		0x001	/* HW error reported by Solaris FMA */
#define	HCA_IBA_ERR		0x002	/* IB specific HW error */

/* HCA ereport detail */
#define	HCA_ERR_TRANSIENT	0x010	/* HCA temporary error */
#define	HCA_ERR_NON_FATAL	0x020	/* HCA persistent error */
#define	HCA_ERR_SRV_LOST	0x040	/* HCA attach failure */
#define	HCA_ERR_DEGRADED	0x080	/* HCA maintenance mode */
#define	HCA_ERR_FATAL		0x100	/* HCA critical situation */
#define	HCA_ERR_IOCTL		0x200	/* EIO */

/* Ignore HCA HW error check */
#define	HCA_SKIP_HW_CHK		(-1)

/* HCA FM pio retry operation state */
#define	HCA_PIO_OK		(0)	/* No HW errors */
#define	HCA_PIO_TRANSIENT	(1)	/* transient error */
#define	HCA_PIO_PERSISTENT	(2)	/* persistent error */
#define	HCA_PIO_RETRY_CNT	(3)	/* initial value of the retry counter */

/* HCA firmware faults */
#define	HCA_FW_MISC		0x1	/* firmware misc faults */
#define	HCA_FW_CORRUPT		0x2	/* firmware corruption */
#define	HCA_FW_MISMATCH		0x3	/* firmware version mismatch */
211 
/*
 * Hermon FM macros
 */

/*
 * Test hooks used by the hermon_pio_*() macros.
 *
 * With FMA_TEST defined, TEST_DECLARE() declares a hermon_test_t pointer,
 * REGISTER_PIO_TEST() assigns it from hermon_test_register() (recording the
 * calling file and line), and that pointer is passed to hermon_PIO_start()
 * and hermon_PIO_end().  Without FMA_TEST the first two expand to nothing
 * and NULL is passed instead.
 *
 * TEST_DECLARE() carries its own trailing semicolon because it is used
 * inside hermon_pio_init() without one.
 * PIO_END() takes the retry counter by name and passes its address.
 */
#ifdef FMA_TEST
#define	TEST_DECLARE(tst)		hermon_test_t *tst;
#define	REGISTER_PIO_TEST(st, tst)					\
    tst = hermon_test_register(st, __FILE__, __LINE__, HCA_TEST_PIO)
#define	PIO_START(st, hdl, tst)		hermon_PIO_start(st, hdl, tst)
#define	PIO_END(st, hdl, cnt, tst)	hermon_PIO_end(st, hdl, &(cnt), tst)
#else
#define	TEST_DECLARE(tst)
#define	REGISTER_PIO_TEST(st, tst)
#define	PIO_START(st, hdl, tst)		hermon_PIO_start(st, hdl, NULL)
#define	PIO_END(st, hdl, cnt, tst)	hermon_PIO_end(st, hdl, &(cnt), NULL)
#endif /* FMA_TEST */
228 
/*
 * hermon_pio_init() is a macro initializing variables.
 *
 * It expands to declarations: the test pointer (only when TEST_DECLARE()
 * expands to something), "status" set to HCA_PIO_OK and "cnt" set to
 * HCA_PIO_RETRY_CNT.  The last declaration has no trailing semicolon, so
 * the caller supplies it; the macro must be placed where declarations are
 * allowed.
 */
#define	hermon_pio_init(cnt, status, tst)				\
	TEST_DECLARE(tst)						\
	int	status = HCA_PIO_OK;					\
	int	cnt = HCA_PIO_RETRY_CNT
236 
/*
 * hermon_pio_start() is one of a pair of macros checking HW errors
 * at PIO requests, which should be called before the requests are issued.
 *
 * The macro ends by opening a "do {" that only hermon_pio_end() closes, so
 * the two must always be used together in the same scope, and neither may
 * be the unbraced body of an if/else/loop.
 *
 *  - If HCA_PIO_FM is not set in st->hs_fm_state, status is set to
 *    HCA_SKIP_HW_CHK and the PIO body runs once with no checking.
 *  - If st->hs_fm_async_fatal is set, an HCA_SYS_ERR/HCA_ERR_NON_FATAL
 *    ereport is posted and control jumps to "label" without running the
 *    PIO body.
 *  - Otherwise cnt is reset to HCA_PIO_RETRY_CNT and PIO_START() is
 *    called; HCA_PIO_PERSISTENT jumps to "label".
 *
 * status is not reset on the checked path; it is expected to come from
 * hermon_pio_init().
 */
#define	hermon_pio_start(st, hdl, label, cnt, status, tst)		\
	if ((st)->hs_fm_state & HCA_PIO_FM) {				\
		if ((st)->hs_fm_async_fatal) {				\
			hermon_fm_ereport(st, HCA_SYS_ERR,		\
			    HCA_ERR_NON_FATAL);				\
			goto label;					\
		} else {						\
			REGISTER_PIO_TEST(st, tst);			\
			(cnt) = HCA_PIO_RETRY_CNT;			\
			if (PIO_START(st, hdl, tst) ==			\
			    HCA_PIO_PERSISTENT) {			\
				goto label;				\
			}						\
		}							\
	} else {							\
		(status) = HCA_SKIP_HW_CHK;				\
	}								\
	do {

/*
 * hermon_pio_end() is the other of a pair of macros checking HW errors
 * at PIO requests, which should be called after the requests end.
 * If a HW error is detected and can be isolated well, these macros
 * retry the operation to determine if the error is persistent or not.
 *
 * Unless status is HCA_SKIP_HW_CHK:
 *  - a set st->hs_fm_async_fatal posts an HCA_SYS_ERR/HCA_ERR_NON_FATAL
 *    ereport and jumps to "label";
 *  - otherwise status takes the result of PIO_END(): HCA_PIO_PERSISTENT
 *    jumps to "label", HCA_PIO_TRANSIENT posts an
 *    HCA_SYS_ERR/HCA_ERR_TRANSIENT ereport.
 * The PIO body opened by hermon_pio_start() is repeated for as long as
 * status is HCA_PIO_TRANSIENT.
 */
#define	hermon_pio_end(st, hdl, label, cnt, status, tst)		\
	if ((status) != HCA_SKIP_HW_CHK) {				\
		if ((st)->hs_fm_async_fatal) {				\
			hermon_fm_ereport(st, HCA_SYS_ERR,		\
			    HCA_ERR_NON_FATAL);				\
			goto label;					\
		}							\
		if (((status) = PIO_END(st, hdl, cnt, tst)) ==		\
		    HCA_PIO_PERSISTENT) {				\
			goto label;					\
		} else if ((status) == HCA_PIO_TRANSIENT) {		\
			hermon_fm_ereport(st, HCA_SYS_ERR,		\
			    HCA_ERR_TRANSIENT);				\
		}							\
	}								\
	} while ((status) == HCA_PIO_TRANSIENT)
282 
283 extern void hermon_fm_init(hermon_state_t *);
284 extern void hermon_fm_fini(hermon_state_t *);
285 extern int hermon_fm_ereport_init(hermon_state_t *);
286 extern void hermon_fm_ereport_fini(hermon_state_t *);
287 extern int hermon_get_state(hermon_state_t *);
288 extern boolean_t hermon_init_failure(hermon_state_t *);
289 extern boolean_t hermon_cmd_retry_ok(hermon_cmd_post_t *, int);
290 extern void hermon_fm_ereport(hermon_state_t *, int, int);
291 extern int hermon_regs_map_setup(hermon_state_t *, uint_t, caddr_t *, offset_t,
292     offset_t, ddi_device_acc_attr_t *, ddi_acc_handle_t *);
293 extern void hermon_regs_map_free(hermon_state_t *, ddi_acc_handle_t *);
294 extern int hermon_pci_config_setup(hermon_state_t *, ddi_acc_handle_t *);
295 extern void hermon_pci_config_teardown(hermon_state_t *, ddi_acc_handle_t *);
296 extern ushort_t hermon_devacc_attr_version(hermon_state_t *);
297 extern uchar_t hermon_devacc_attr_access(hermon_state_t *);
298 extern int hermon_PIO_start(hermon_state_t *, ddi_acc_handle_t,
299     hermon_test_t *);
300 extern int hermon_PIO_end(hermon_state_t *, ddi_acc_handle_t, int *,
301     hermon_test_t *);
302 extern ddi_acc_handle_t hermon_rsrc_alloc_uarhdl(hermon_state_t *);
303 extern ddi_acc_handle_t hermon_get_uarhdl(hermon_state_t *);
304 extern ddi_acc_handle_t hermon_get_cmdhdl(hermon_state_t *);
305 extern ddi_acc_handle_t hermon_get_msix_tblhdl(hermon_state_t *);
306 extern ddi_acc_handle_t hermon_get_msix_pbahdl(hermon_state_t *);
307 extern ddi_acc_handle_t hermon_get_pcihdl(hermon_state_t *);
308 extern void hermon_clr_state_nolock(hermon_state_t *, int);
309 extern void hermon_inter_err_chk(void *);
310 
/*
 * Test-only interfaces, visible when FMA_TEST is defined.
 * hermon_test_register() is what REGISTER_PIO_TEST() calls, passing the
 * caller's __FILE__ and __LINE__ and a test type.
 */
#ifdef FMA_TEST
extern hermon_test_t *hermon_test_register(hermon_state_t *, char *, int, int);
extern void hermon_test_deregister(void);
extern int hermon_test_num;
#endif /* FMA_TEST */
316 
317 #ifdef __cplusplus
318 }
319 #endif
320 
321 #endif	/* _SYS_IB_ADAPTERS_HERMON_FM_H */
322