1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #ifndef _SYS_IB_ADAPTERS_HERMON_FM_H 28 #define _SYS_IB_ADAPTERS_HERMON_FM_H 29 30 /* 31 * hermon_fm.h 32 */ 33 #include <sys/ddifm.h> 34 #include <sys/fm/protocol.h> 35 #include <sys/fm/util.h> 36 #include <sys/fm/io/ddi.h> 37 38 #ifdef __cplusplus 39 extern "C" { 40 #endif 41 42 /* 43 * HCA FMA compile note. 44 * 45 * FMA_TEST is used for HCA function tests, and 46 * the macro can be on by changing Makefile. 47 * 48 * in case of DEBUG 49 * FMA_TEST is on 50 * 51 * in case of non-DEBUG (DEBUG is off) 52 * FMA_TEST is off 53 */ 54 55 /* 56 * HCA FM common data structure 57 */ 58 59 /* 60 * HCA FM Structure 61 * This structure is used to catch HCA HW errors. 62 */ 63 struct i_hca_fm { 64 uint32_t ref_cnt; /* the number of instances referring to this */ 65 kmutex_t lock; /* protection for last_err & polling thread */ 66 struct i_hca_acc_handle *hdl; /* HCA FM acc handle structure */ 67 struct kmem_cache *fm_acc_cache; /* HCA acc handle cache */ 68 69 }; 70 71 /* 72 * HCA FM acc handle structure 73 * This structure is holding ddi_acc_handle_t and other members 74 * to deal with HCA PIO FM. 75 */ 76 struct i_hca_acc_handle { 77 struct i_hca_acc_handle *next; /* next structure */ 78 ddi_acc_handle_t save_hdl; /* acc handle */ 79 kmutex_t lock; /* mutex lock for thread count */ 80 uint32_t thread_cnt; /* number of threads issuing PIOs */ 81 }; 82 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", i_hca_acc_handle::save_hdl)) 83 #define fm_acc_hdl(hdl) (((struct i_hca_acc_handle *)(hdl))->save_hdl) 84 #define FM_POLL_INTERVAL (10000000) /* 10ms (nano) */ 85 86 /* 87 * HCA FM function test structure 88 * This structure can be used to test the basic fm function test for HCA. 89 * The test code is included if the FMA_TEST macro is defined. 90 */ 91 struct i_hca_fm_test { 92 int num; /* serial numner */ 93 int type; /* PIO or Hermon specific errors */ 94 #define HCA_TEST_PIO 0x1 95 #define HCA_TEST_IBA 0x2 96 int trigger; /* how to trigger a HW error */ 97 #define HCA_TEST_TRANSIENT 0x0001 98 #define HCA_TEST_PERSISTENT 0x0002 99 #define HCA_TEST_ATTACH 0x0010 100 #define HCA_TEST_START 0x0100 101 #define HCA_TEST_END 0x0200 102 void (*pio_injection)(struct i_hca_fm_test *, ddi_fm_error_t *); 103 int errcnt; /* how many transient error occurs */ 104 int line_num; /* line number in the source code */ 105 char *file_name; /* source filename */ 106 char *hash_key; /* hash table for test items */ 107 void *private; /* private data */ 108 }; 109 110 /* 111 * Hermon FM data structure 112 */ 113 typedef struct i_hca_fm hermon_hca_fm_t; 114 typedef struct i_hca_acc_handle hermon_acc_handle_t; 115 typedef struct i_hca_fm_test hermon_test_t; 116 117 /* 118 * The following defines are to supplement device error reporting. 119 * At each place where the planned FMA error matrix specifies that 120 * an ereport will be generated, for now there is a HERMON_FMANOTE() 121 * call generating an appropriate message string. 122 */ 123 124 #define HERMON_FMANOTE(state, string) \ 125 cmn_err(CE_NOTE, "hermon%d: Device Error: %s", \ 126 (state)->hs_instance, string) 127 128 /* CQE Syndrome errors - see hermon_cq.c */ 129 130 #define HERMON_FMA_LOCLEN "CQE local length error" 131 #define HERMON_FMA_LOCQPOP "CQE local qp operation error" 132 #define HERMON_FMA_LOCPROT "CQE local protection error" 133 #define HERMON_FMA_WQFLUSH "CQE wqe flushed in error" 134 #define HERMON_FMA_MWBIND "CQE memory window bind error" 135 #define HERMON_FMA_RESP "CQE bad response" 136 #define HERMON_FMA_LOCACC "CQE local access error" 137 #define HERMON_FMA_REMREQ "CQE remote invalid request error" 138 #define HERMON_FMA_REMACC "CQE remote access error" 139 #define HERMON_FMA_REMOP "CQE remote operation error" 140 #define HERMON_FMA_XPORTCNT "CQE transport retry counter exceeded" 141 #define HERMON_FMA_RNRCNT "CQE RNR retry counter exceeded" 142 #define HERMON_FMA_REMABRT "CQE remote aborted error" 143 #define HERMON_FMA_UNKN "CQE unknown/reserved error returned" 144 145 /* event errors - see hermon_event.c */ 146 #define HERMON_FMA_OVERRUN "EQE cq overrun or protection error" 147 #define HERMON_FMA_LOCCAT "EQE local work queue catastrophic error" 148 #define HERMON_FMA_QPCAT "EQE local queue pair catastrophic error" 149 #define HERMON_FMA_PATHMIG "EQE path migration failed" 150 #define HERMON_FMA_LOCINV "EQE invalid request - local work queue" 151 #define HERMON_FMA_LOCACEQ "EQE local access violation" 152 #define HERMON_FMA_SRQCAT "EQE shared received queue catastrophic" 153 #define HERMON_FMA_INTERNAL "EQE hca internal error" 154 155 /* HCR device failure returns - see hermon_cmd.c */ 156 #define HERMON_FMA_HCRINT "HCR internal error processing command" 157 #define HERMON_FMA_NVMEM "HCR NVRAM checksum/CRC failure" 158 #define HERMON_FMA_TOTOG "HCR Timeout waiting for command toggle" 159 #define HERMON_FMA_GOBIT "HCR Timeout waiting for command go bit" 160 #define HERMON_FMA_RSRC "HCR Command insufficient resources" 161 #define HERMON_FMA_CMDINV "HCR Command invalid status returned" 162 163 /* HCA initialization errors - see hermon.c */ 164 #define HERMON_FMA_FWVER "HCA firmware not at minimum version" 165 #define HERMON_FMA_PCIID "HCA PCIe devid not supported" 166 #define HERMON_FMA_MAINT "HCA device set to memory controller mode" 167 #define HERMON_FMA_BADNVMEM "HCR bad NVMEM error" 168 169 /* 170 * HCA FM constants 171 */ 172 173 /* HCA FM state */ 174 #define HCA_NO_FM 0x0000 /* HCA FM is not supported */ 175 /* HCA FM state flags */ 176 #define HCA_PIO_FM 0x0001 /* PIO is fma-protected */ 177 #define HCA_DMA_FM 0x0002 /* DMA is fma-protected */ 178 #define HCA_EREPORT_FM 0x0004 /* FMA ereport is available */ 179 #define HCA_ERRCB_FM 0x0010 /* FMA error callback is supported */ 180 181 #define HCA_ATTCH_FM 0x0100 /* HCA FM attach mode */ 182 #define HCA_RUNTM_FM 0x0200 /* HCA FM runtime mode */ 183 184 /* HCA ererport type */ 185 #define HCA_SYS_ERR 0x001 /* HW error reported by Solaris FMA */ 186 #define HCA_IBA_ERR 0x002 /* IB specific HW error */ 187 188 /* HCA ereport detail */ 189 #define HCA_ERR_TRANSIENT 0x010 /* HCA temporary error */ 190 #define HCA_ERR_NON_FATAL 0x020 /* HCA persistent error */ 191 #define HCA_ERR_SRV_LOST 0x040 /* HCA attach failure */ 192 #define HCA_ERR_DEGRADED 0x080 /* HCA maintenance mode */ 193 #define HCA_ERR_FATAL 0x100 /* HCA critical situation */ 194 #define HCA_ERR_IOCTL 0x200 /* EIO */ 195 196 /* Ignore HCA HW error check */ 197 #define HCA_SKIP_HW_CHK (-1) 198 199 /* HCA FM pio retry operation state */ 200 #define HCA_PIO_OK (0) /* No HW errors */ 201 #define HCA_PIO_TRANSIENT (1) /* transient error */ 202 #define HCA_PIO_PERSISTENT (2) /* persistent error */ 203 #define HCA_PIO_RETRY_CNT (3) 204 205 /* HCA firmware faults */ 206 #define HCA_FW_MISC 0x1 /* firmware misc faults */ 207 #define HCA_FW_CORRUPT 0x2 /* firmware corruption */ 208 #define HCA_FW_MISMATCH 0x3 /* firmware version mismatch */ 209 210 /* 211 * Hermon FM macros 212 */ 213 214 #ifdef FMA_TEST 215 #define TEST_DECLARE(tst) hermon_test_t *tst; 216 #define REGISTER_PIO_TEST(st, tst) \ 217 tst = hermon_test_register(st, __FILE__, __LINE__, HCA_TEST_PIO) 218 #define PIO_START(st, hdl, tst) hermon_PIO_start(st, hdl, tst) 219 #define PIO_END(st, hdl, cnt, tst) hermon_PIO_end(st, hdl, &cnt, tst) 220 #else 221 #define TEST_DECLARE(tst) 222 #define REGISTER_PIO_TEST(st, tst) 223 #define PIO_START(st, hdl, tst) hermon_PIO_start(st, hdl, NULL) 224 #define PIO_END(st, hdl, cnt, tst) hermon_PIO_end(st, hdl, &cnt, NULL) 225 #endif /* FMA_TEST */ 226 227 /* 228 * hermon_pio_init() is a macro initializing variables. 229 */ 230 #define hermon_pio_init(cnt, status, tst) \ 231 TEST_DECLARE(tst) \ 232 int status = HCA_PIO_OK; \ 233 int cnt = HCA_PIO_RETRY_CNT 234 235 /* 236 * hermon_pio_start() is one of a pair of macros checking HW errors 237 * at PIO requests, which should be called before the requests are issued. 238 */ 239 #define hermon_pio_start(st, hdl, label, cnt, status, tst) \ 240 if (st->hs_fm_state & HCA_PIO_FM) { \ 241 if (st->hs_fm_async_fatal) { \ 242 hermon_fm_ereport(st, HCA_SYS_ERR, \ 243 HCA_ERR_NON_FATAL); \ 244 goto label; \ 245 } else { \ 246 REGISTER_PIO_TEST(st, tst); \ 247 cnt = HCA_PIO_RETRY_CNT; \ 248 if (PIO_START(st, hdl, tst) == \ 249 HCA_PIO_PERSISTENT) { \ 250 goto label; \ 251 } \ 252 } \ 253 } else { \ 254 status = HCA_SKIP_HW_CHK; \ 255 } \ 256 do { 257 258 /* 259 * hermon_pio_end() is the other of a pair of macros checking HW errors 260 * at PIO requests, which should be called after the requests end. 261 * If a HW error is detected and can be isolated well, these macros 262 * retry the operation to determine if the error is persistent or not. 263 */ 264 #define hermon_pio_end(st, hdl, label, cnt, status, tst) \ 265 if (status != HCA_SKIP_HW_CHK) { \ 266 if (st->hs_fm_async_fatal) { \ 267 hermon_fm_ereport(st, HCA_SYS_ERR, \ 268 HCA_ERR_NON_FATAL); \ 269 goto label; \ 270 } \ 271 if ((status = PIO_END(st, hdl, cnt, tst)) == \ 272 HCA_PIO_PERSISTENT) { \ 273 goto label; \ 274 } else if (status == HCA_PIO_TRANSIENT) { \ 275 hermon_fm_ereport(st, HCA_SYS_ERR, \ 276 HCA_ERR_TRANSIENT); \ 277 } \ 278 } \ 279 } while (status == HCA_PIO_TRANSIENT) 280 281 extern void hermon_fm_init(hermon_state_t *); 282 extern void hermon_fm_fini(hermon_state_t *); 283 extern int hermon_fm_ereport_init(hermon_state_t *); 284 extern void hermon_fm_ereport_fini(hermon_state_t *); 285 extern int hermon_get_state(hermon_state_t *); 286 extern boolean_t hermon_init_failure(hermon_state_t *); 287 extern boolean_t hermon_cmd_retry_ok(hermon_cmd_post_t *, int); 288 extern void hermon_fm_ereport(hermon_state_t *, int, int); 289 extern int hermon_regs_map_setup(hermon_state_t *, uint_t, caddr_t *, offset_t, 290 offset_t, ddi_device_acc_attr_t *, ddi_acc_handle_t *); 291 extern void hermon_regs_map_free(hermon_state_t *, ddi_acc_handle_t *); 292 extern int hermon_pci_config_setup(hermon_state_t *, ddi_acc_handle_t *); 293 extern void hermon_pci_config_teardown(hermon_state_t *, ddi_acc_handle_t *); 294 extern ushort_t hermon_devacc_attr_version(hermon_state_t *); 295 extern uchar_t hermon_devacc_attr_access(hermon_state_t *); 296 extern int hermon_PIO_start(hermon_state_t *, ddi_acc_handle_t, 297 hermon_test_t *); 298 extern int hermon_PIO_end(hermon_state_t *, ddi_acc_handle_t, int *, 299 hermon_test_t *); 300 extern ddi_acc_handle_t hermon_rsrc_alloc_uarhdl(hermon_state_t *); 301 extern ddi_acc_handle_t hermon_get_uarhdl(hermon_state_t *); 302 extern ddi_acc_handle_t hermon_get_cmdhdl(hermon_state_t *); 303 extern ddi_acc_handle_t hermon_get_msix_tblhdl(hermon_state_t *); 304 extern ddi_acc_handle_t hermon_get_msix_pbahdl(hermon_state_t *); 305 extern ddi_acc_handle_t hermon_get_pcihdl(hermon_state_t *); 306 extern void hermon_clr_state_nolock(hermon_state_t *, int); 307 extern void hermon_inter_err_chk(void *); 308 309 #ifdef FMA_TEST 310 extern hermon_test_t *hermon_test_register(hermon_state_t *, char *, int, int); 311 extern void hermon_test_deregister(void); 312 extern int hermon_test_num; 313 #endif /* FMA_TEST */ 314 315 #ifdef __cplusplus 316 } 317 #endif 318 319 #endif /* _SYS_IB_ADAPTERS_HERMON_FM_H */ 320