xref: /illumos-gate/usr/src/uts/common/sys/ib/clients/eoib/eib_impl.h (revision 679c9deae74d7b935e94eaaff2566ea45c1afe2b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * Copyright 2019, Joyent, Inc.
28  */
29 
30 #ifndef _SYS_IB_EOIB_EIB_IMPL_H
31 #define	_SYS_IB_EOIB_EIB_IMPL_H
32 
33 #ifdef __cplusplus
34 extern "C" {
35 #endif
36 
37 #include <sys/ddi.h>
38 #include <sys/mac.h>
39 #include <sys/sunddi.h>
40 #include <sys/varargs.h>
41 #include <sys/vlan.h>
42 #include <sys/ib/ibtl/ibti.h>
43 #include <sys/ib/ibtl/ibvti.h>
44 #include <sys/ib/ib_pkt_hdrs.h>
45 
46 #include <sys/ib/clients/eoib/fip.h>
47 #include <sys/ib/clients/eoib/eib.h>
48 
49 /*
50  * Driver specific constants
51  */
52 #define	EIB_E_SUCCESS			0
53 #define	EIB_E_FAILURE			(-1)	/* parenthesized on purpose */
54 #define	EIB_MAX_LINE			128
55 #define	EIB_MAX_SGL			59
56 #define	EIB_MAX_POST_MULTIPLE		4
57 #define	EIB_MAX_PAYLOAD_HDR_SZ		160
58 #define	EIB_TX_COPY_THRESH		4096	/* greater than mtu */
59 #define	EIB_MAX_VNICS			64	/* do not change this */
60 #define	EIB_LOGIN_TIMEOUT_USEC		8000000	/* 8 seconds */
61 #define	EIB_RWR_CHUNK_SZ		8
62 #define	EIB_IPHDR_ALIGN_ROOM		32
63 #define	EIB_IP_HDR_ALIGN		2
64 #define	EIB_MAX_RX_PKTS_ONINTR		0x800	/* 2048 */
65 #define	EIB_MAX_LOGIN_ATTEMPTS		3
66 #define	EIB_MAX_VHUB_TBL_ATTEMPTS	3
67 #define	EIB_MAX_KA_ATTEMPTS		3
68 #define	EIB_MAX_ATTEMPTS		10
69 #define	EIB_DELAY_HALF_SECOND		500000	/* in microseconds */
70 #define	EIB_GRH_SZ			(sizeof (ib_grh_t))
71 
72 /*
73  * Debug message classes (bit mask values) and the logging macros
74  */
75 #define	EIB_MSGS_CRIT		0x01
76 #define	EIB_MSGS_ERR		0x02
77 #define	EIB_MSGS_WARN		0x04
78 #define	EIB_MSGS_DEBUG		0x08
79 #define	EIB_MSGS_ARGS		0x10
80 #define	EIB_MSGS_PKT		0x20
81 #define	EIB_MSGS_VERBOSE	0x40
82 #define	EIB_MSGS_DEFAULT	(EIB_MSGS_CRIT | EIB_MSGS_ERR | EIB_MSGS_WARN)
83 
84 #define	EIB_LOGSZ_DEFAULT	0x20000	/* 128 * 1024 */
85 
86 #define	EIB_DPRINTF_CRIT	eib_dprintf_crit
87 #define	EIB_DPRINTF_ERR		eib_dprintf_err
88 #define	EIB_DPRINTF_WARN	eib_dprintf_warn
89 #ifdef EIB_DEBUG
90 #define	EIB_DPRINTF_DEBUG	eib_dprintf_debug
91 #define	EIB_DPRINTF_ARGS	eib_dprintf_args
92 #define	EIB_DPRINTF_PKT		eib_dprintf_pkt
93 #define	EIB_DPRINTF_VERBOSE	eib_dprintf_verbose
94 #else	/* !EIB_DEBUG: these expand to no-ops; arguments are not evaluated */
95 #define	EIB_DPRINTF_DEBUG(...)	(void)(0)
96 #define	EIB_DPRINTF_ARGS(...)	(void)(0)
97 #define	EIB_DPRINTF_PKT(...)	(void)(0)
98 #define	EIB_DPRINTF_VERBOSE(...) (void)(0)
99 #endif	/* EIB_DEBUG */
100 
101 /*
102  * EoIB service threads (strings presumably used as thread names; confirm)
103  */
104 #define	EIB_EVENTS_HDLR		"eib_events_handler"
105 #define	EIB_RWQES_REFILLER	"eib_rwqes_refiller"
106 #define	EIB_VNIC_CREATOR	"eib_vnic_creator"
107 #define	EIB_TXWQES_MONITOR	"eib_txwqe_monitor"
108 #define	EIB_LSOBUFS_MONITOR	"eib_lsobufs_monitor"
109 
110 /*
111  * Find the least significant bit set in v (as unsigned 64-bit; v used twice)
112  */
113 #define	EIB_FIND_LSB_SET(v)	eib_setbit_mod67[(((0ULL - (v)) & (v)) % 67)]
114 
115 /*
116  * LSO buffers
117  *
118  * Under normal circumstances we should never need to use any buffer
119  * that's larger than MTU.  Unfortunately, IB HCA has limitations
120  * on the length of SGL that are much smaller than those for regular
121  * ethernet NICs.  Since the network layer doesn't care to limit the
122  * number of mblk fragments in any send mp chain, we end up having to
123  * use these larger buffers occasionally.
124  */
125 #define	EIB_LSO_MAXLEN			65536	/* 64 * 1024 */
126 #define	EIB_LSO_BUFSZ			8192	/* 8 * 1024 */
127 #define	EIB_LSO_NUM_BUFS		1024
128 #define	EIB_LSO_FREE_BUFS_THRESH	(EIB_LSO_NUM_BUFS >> 5)	/* 1/32: 32 */
129 
130 typedef struct eib_lsobuf_s {
131 	struct eib_lsobuf_s *lb_next;	/* link to another lso buffer entry */
132 	uint8_t		*lb_buf;	/* presumably EIB_LSO_BUFSZ bytes; confirm */
133 	int		lb_isfree;	/* presumably nonzero when free; confirm */
134 } eib_lsobuf_t;
135 
136 typedef struct eib_lsobkt_s {
137 	kmutex_t	bk_lock;
138 	kcondvar_t	bk_cv;
139 	uint_t		bk_status;	/* presumably EIB_LBUF_* bits; confirm */
140 	uint8_t		*bk_mem;
141 	eib_lsobuf_t	*bk_bufl;
142 	eib_lsobuf_t	*bk_free_head;	/* list of eib_lsobuf_t via lb_next */
143 	ibt_mr_hdl_t	bk_mr_hdl;
144 	ibt_lkey_t	bk_lkey;
145 	uint_t		bk_nelem;
146 	uint_t		bk_nfree;
147 } eib_lsobkt_t;
148 
149 #define	EIB_LBUF_SHORT		0x1	/* distinct bits, can be combined */
150 #define	EIB_LBUF_MONITOR_DIE	0x2
151 
152 /*
153  * The admin partition is only used for sending login and logout messages
154  * and receiving login acknowledgements from the gateway.  While packets
155  * going out on several vlans at the same time could result in multiple
156  * vnic creations happening at the same time (and therefore multiple login
157  * packets), we serialize the vnic creation via the vnic creator thread, so
158  * we shouldn't need a lot of send wqes or receive wqes.  Note also that we
159  * keep the cq size request to slightly less than a 2^n boundary to allow
160  * the alloc cq routine to return the closest 2^n boundary as the real cq
161  * size without wasting too much memory (30 + 30 + 1 = 61, just under 64).
162  */
163 #define	EIB_ADMIN_MAX_SWQE	30
164 #define	EIB_ADMIN_MAX_RWQE	30
165 #define	EIB_ADMIN_CQ_SIZE	(EIB_ADMIN_MAX_SWQE + EIB_ADMIN_MAX_RWQE + 1)
166 
167 /*
168  * The control qp is per vhub partition, and is used to send and receive
169  * vhub control messages such as vhub table request/response, vhub
170  * update response and vnic alive messages.  While the vhub table response
171  * and vhub update messages might take a few rwqes, the vhub table request
172  * is made only once per vnic, and the vnic alive message is periodic
173  * and uses a single swqe as well.  Per vnic, we should certainly not need
174  * too many swqes/rwqes.  The cq size request works out to 30 + 30 + 1 = 61.
175  */
176 #define	EIB_CTL_MAX_SWQE	30
177 #define	EIB_CTL_MAX_RWQE	30
178 #define	EIB_CTL_CQ_SIZE		(EIB_CTL_MAX_SWQE + EIB_CTL_MAX_RWQE + 1)
179 
180 /*
181  * For the vNIC's data channel, there are three items that are of importance:
182  * the constraints defined below, the hca_max_chan_sz attribute and the value of
183  * (hca_max_cq_sz - 1).  The maximum limit on swqe/rwqe is set to the minimum
184  * of these three values.
185  *
186  * While the total number of RWQEs posted to the data channel of any vNIC will
187  * not exceed EIB_DATA_MAX_RWQE, we also do not want to acquire and post all of
188  * it during the data channel initialization, since that is a lot of wqes for
189  * one vnic to consume when we don't even know if the vnic will need it at all.
190  * We post an initial set of EIB_DATA_RWQE_BKT rwqes, and slowly post more and
191  * more sets as we see them being consumed, until we hit the hard limit of
192  * EIB_DATA_MAX_RWQE.
193  */
194 #define	EIB_DATA_MAX_SWQE	4000
195 #define	EIB_DATA_MAX_RWQE	4000	/* hard limit on posted rwqes */
196 #define	EIB_DATA_RWQE_BKT	512	/* rwqes posted per set */
197 
198 /*
199  * vNIC data channel CQ moderation parameters (count and usec, per tx/rx)
200  */
201 #define	EIB_TX_COMP_COUNT		10
202 #define	EIB_TX_COMP_USEC		300
203 #define	EIB_RX_COMP_COUNT		4
204 #define	EIB_RX_COMP_USEC		10
205 
206 /*
207  * qe_info layout, 8 bits per field from msb to lsb: blk:ndx:type:flags
208  */
209 #define	EIB_WQEBLK_SHIFT		24
210 #define	EIB_WQEBLK_MASK			0xFF
211 #define	EIB_WQENDX_SHIFT		16
212 #define	EIB_WQENDX_MASK			0xFF
213 #define	EIB_WQETYP_SHIFT		8
214 #define	EIB_WQETYP_MASK			0xFF
215 #define	EIB_WQEFLGS_SHIFT		0
216 #define	EIB_WQEFLGS_MASK		0xFF
217 
218 /*
219  * Macros to get the bit fields from qe_info (each shifts, then masks)
220  */
221 #define	EIB_WQE_BLK(info)	(((info) >> EIB_WQEBLK_SHIFT) & EIB_WQEBLK_MASK)
222 #define	EIB_WQE_NDX(info)	(((info) >> EIB_WQENDX_SHIFT) & EIB_WQENDX_MASK)
223 #define	EIB_WQE_TYPE(info)	(((info) >> EIB_WQETYP_SHIFT) & EIB_WQETYP_MASK)
224 #define	EIB_WQE_FLAGS(info)	((info) & EIB_WQEFLGS_MASK)
225 
226 /*
227  * Values for the type field in qe_info (the flag values follow below)
228  */
229 #define	EIB_WQE_TX			0x1
230 #define	EIB_WQE_RX			0x2
231 
232 /*
233  * Flags for rx wqes/buffers
234  */
235 #define	EIB_WQE_FLG_POSTED_TO_HCA	0x1
236 #define	EIB_WQE_FLG_WITH_NW		0x2
237 
238 /*
239  * Flags for tx wqes/buffers (bits do not overlap the rx flags above)
240  */
241 #define	EIB_WQE_FLG_BUFTYPE_LSO		0x4
242 #define	EIB_WQE_FLG_BUFTYPE_MAPPED	0x8
243 
244 /*
245  * Send/Recv workq entries
246  */
247 typedef struct eib_wqe_s {
248 	struct eib_wqe_pool_s	*qe_pool;	/* presumably the owning pool */
249 	uint8_t			*qe_cpbuf;
250 	uint8_t			*qe_payload_hdr;
251 	uint_t			qe_bufsz;
252 	uint_t			qe_info;	/* blk:ndx:type:flags */
253 	int			qe_vnic_inst;
254 	ibt_ud_dest_hdl_t	qe_dest;
255 	frtn_t			qe_frp;
256 
257 	mblk_t			*qe_mp;
258 	ibt_mi_hdl_t		qe_iov_hdl;
259 	ibt_all_wr_t		qe_wr;
260 	ibt_wr_ds_t		qe_sgl;
261 	ibt_wr_ds_t		qe_big_sgl[EIB_MAX_SGL];
262 	struct eib_wqe_s	*qe_nxt_post;	/* link to another wqe */
263 	struct eib_chan_s	*qe_chan;
264 } eib_wqe_t;
265 
266 /*
267  * The wqe in-use/free status in EoIB is managed via a 2-level bitmap
268  * logic.
269  *
270  * Each set of 64 wqes (a "wqe block") is managed by a single 64-bit
271  * integer bitmap.  The free status of a set of 64 such wqe blocks (a
272  * "wqe pool") is managed by one 64-bit integer bitmap (if any wqe in
273  * the wqe block is free, the bit in the map is 1, otherwise it is 0).
274  *
275  * The maximum pool size is 4096 wqes, but this can easily be extended
276  * to support more wqes using additional pools of wqes.
277  *
278  * Note that an entire pool of wqes is allocated via a single allocation,
279  * the wqe addresses in a pool are all contiguous.  The tx/rx copy buffers
280  * for a wqe pool are also allocated via a single allocation.
281  */
282 #define	EIB_BLKS_PER_POOL	64	/* one bit each in a 64-bit map */
283 #define	EIB_WQES_PER_BLK	64	/* do not change this */
284 #define	EIB_WQES_PER_POOL	(EIB_BLKS_PER_POOL * EIB_WQES_PER_BLK)
285 
286 #define	EIB_WQE_SZ		(sizeof (eib_wqe_t))
287 #define	EIB_WQEBLK_SZ		(EIB_WQES_PER_BLK * EIB_WQE_SZ)	/* per block */
288 
289 typedef struct eib_wqe_pool_s {
290 	struct eib_wqe_pool_s	*wp_next;	/* link to another pool */
291 	struct eib_s		*wp_ss;
292 	ib_vaddr_t		wp_vaddr;
293 	ib_memlen_t		wp_memsz;
294 	ibt_mr_hdl_t		wp_mr;
295 	ibt_lkey_t		wp_lkey;
296 	uint_t			wp_nfree_lwm;
297 	int			wp_type;	/* EIB_WP_TYPE_* */
298 
299 	kmutex_t		wp_lock;
300 	kcondvar_t		wp_cv;
301 	uint_t			wp_status;	/* EIB_{TX,RX}WQE_* bits */
302 	uint_t			wp_nfree;
303 	uint64_t		wp_free_blks;	/* presumably the per-pool map */
304 	uint64_t		wp_free_wqes[EIB_BLKS_PER_POOL]; /* per block */
305 	struct eib_wqe_s	*wp_wqe;
306 } eib_wqe_pool_t;
307 
308 /*
309  * Values for wp_type
310  */
311 #define	EIB_WP_TYPE_TX		0x1
312 #define	EIB_WP_TYPE_RX		0x2
313 
314 /*
315  * Values for wp_status (bit fields); tx and rx values reuse bit 0x1
316  */
317 #define	EIB_TXWQE_SHORT		0x1	/* only for tx wqe pool */
318 #define	EIB_TXWQE_MONITOR_DIE	0x2	/* only for tx wqe pool */
319 
320 #define	EIB_RXWQE_SHORT		0x1	/* only for rx wqe pool */
321 
322 /*
323  * The low-water-mark is an indication of when wqe grabs for low-priority
324  * qps should start to get refused (swqe grabs for control messages such
325  * as keepalives and rwqe grabs for posting back to control qps will still
326  * be allowed).  The high-water-mark is an indication of when normal
327  * behavior should resume.
328  */
329 #define	EIB_NFREE_SWQES_LWM	(EIB_WQES_PER_POOL / 64)	/* 1/64: 64 */
330 #define	EIB_NFREE_SWQES_HWM	(EIB_WQES_PER_POOL / 32)	/* 1/32: 128 */
331 #define	EIB_NFREE_RWQES_LWM	(EIB_WQES_PER_POOL / 10)	/* 10%: 409 */
332 #define	EIB_NFREE_RWQES_HWM	(EIB_WQES_PER_POOL / 5)		/* 20%: 819 */
333 
334 /*
335  * The "rwqes low" is used to determine when we should start using allocb()
336  * to copy and send received mblks in the rx path.  It should be a little
337  * above the rwqes low-water-mark, but less than the high-water-mark.
338  */
339 #define	EIB_NFREE_RWQES_LOW	\
340 	((EIB_NFREE_RWQES_LWM + EIB_NFREE_RWQES_HWM) / 2)	/* 614 */
341 
342 #define	EIB_WPRI_HI		1	/* for keepalive posts */
343 #define	EIB_WPRI_LO		2	/* for all other posts */
344 
345 /*
346  * Multicast GID Layout: the multicast gid is specified in big-endian
347  * representation, as a collection of different-sized fields in the
348  * EoIB specification.  On Solaris, the multicast gid is represented
349  * as a collection of two 8-byte fields (in ib_gid_t).
350  */
351 typedef struct eib_mgid_spec_s {
352 	uint8_t			sp_mgid_prefix[FIP_MGID_PREFIX_LEN];
353 	uint8_t			sp_type;	/* EIB_MGID_VHUB_* */
354 	uint8_t			sp_dmac[ETHERADDRL];
355 	uint8_t			sp_rss_hash;
356 	uint8_t			sp_vhub_id[FIP_VHUBID_LEN];
357 } eib_mgid_spec_t;
358 
359 /*
360  * Values for sp_type in mgid as per EoIB specification
361  */
362 #define	EIB_MGID_VHUB_DATA	0x0
363 #define	EIB_MGID_VHUB_UPDATE	0x2
364 #define	EIB_MGID_VHUB_TABLE	0x3
365 
366 typedef union eib_mgid_s {
367 	eib_mgid_spec_t		gd_spec;	/* EoIB spec field view */
368 	ib_gid_t		gd_sol;		/* same bytes as an ib_gid_t */
369 } eib_mgid_t;
370 
371 /*
372  * Gateway properties handed over to us by the EoIB nexus
373  */
374 typedef struct eib_gw_props_s {
375 	kmutex_t		pp_gw_lock;	/* presumably guards this struct */
376 
377 	ib_guid_t		pp_gw_system_guid;
378 	ib_guid_t		pp_gw_guid;
379 	ib_sn_prefix_t		pp_gw_sn_prefix;
380 
381 	uint_t			pp_gw_adv_period;
382 	uint_t			pp_gw_ka_period;
383 	uint_t			pp_vnic_ka_period;
384 
385 	ib_qpn_t		pp_gw_ctrl_qpn;
386 	ib_lid_t		pp_gw_lid;
387 	uint16_t		pp_gw_portid;
388 
389 	uint16_t		pp_gw_num_net_vnics;
390 	uint8_t			pp_gw_flag_available;
391 	uint8_t			pp_gw_is_host_adm_vnics;
392 	uint8_t			pp_gw_sl;
393 	uint8_t			pp_gw_n_rss_qpn;
394 
395 	uint8_t			*pp_gw_system_name;
396 	uint8_t			*pp_gw_port_name;
397 	uint8_t			*pp_gw_vendor_id;
398 
399 	clock_t			pp_gw_ka_ticks;		/* 2.5 x gw_ka_period */
400 	clock_t			pp_vnic_ka_ticks;	/* vnic_ka_period */
401 } eib_gw_props_t;
402 
403 /*
404  * Port-specific properties (pointed to by ei_props in eib_t)
405  */
406 typedef struct eib_props_s {
407 	uint64_t		ep_ifspeed;
408 	ib_guid_t		ep_hca_guid;
409 	uint8_t			ep_port_num;
410 	ib_gid_t		ep_sgid;
411 	ib_lid_t		ep_blid;
412 	uint16_t		ep_mtu;
413 	ibt_srate_t		ep_srate;
414 } eib_props_t;
415 
416 /*
417  * Capabilities derived from HCA attributes (pointed to by ei_caps in eib_t)
418  */
419 typedef struct eib_caps_s {
420 	uint_t			cp_lso_maxlen;
421 	uint32_t		cp_cksum_flags;
422 	int			cp_resv_lkey_capab;
423 	ibt_lkey_t		cp_resv_lkey;
424 
425 	uint_t			cp_max_swqe;
426 	uint_t			cp_max_rwqe;
427 	uint_t			cp_max_sgl;
428 	uint_t			cp_hiwm_sgl;
429 } eib_caps_t;
430 
431 /*
432  * List of multicast groups the vnic joined (singly linked via mg_next)
433  */
434 typedef struct eib_mcg_s {
435 	struct eib_mcg_s	*mg_next;
436 	ib_gid_t		mg_rgid;
437 	ib_gid_t		mg_mgid;
438 	uint8_t			mg_join_state;
439 	uint8_t			mg_mac[ETHERADDRL];
440 	ibt_mcg_info_t		*mg_mcginfo;
441 } eib_mcg_t;
442 
443 /*
444  * Admin/control/data channel information
445  */
446 typedef struct eib_chan_s {
447 	ibt_channel_hdl_t	ch_chan;
448 	ib_qpn_t		ch_qpn;
449 
450 	ibt_wc_t		*ch_wc;
451 	ibt_cq_hdl_t		ch_cq_hdl;
452 	uint_t			ch_cq_sz;
453 
454 	ibt_wc_t		*ch_rcv_wc;
455 	ibt_cq_hdl_t		ch_rcv_cq_hdl;
456 	uint_t			ch_rcv_cq_sz;
457 
458 	int			ch_vnic_inst;
459 	uint_t			ch_max_swqes;
460 	uint_t			ch_max_rwqes;
461 	uint_t			ch_lwm_rwqes;
462 	uint_t			ch_rwqe_bktsz;
463 	uint_t			ch_ip_hdr_align;
464 	boolean_t		ch_alloc_mp;
465 	boolean_t		ch_tear_down;
466 
467 	kmutex_t		ch_pkey_lock;	/* presumably for ch_pkey* */
468 	ib_pkey_t		ch_pkey;
469 	uint16_t		ch_pkey_ix;
470 
471 	kmutex_t		ch_cep_lock;	/* presumably for ch_cep_state */
472 	kcondvar_t		ch_cep_cv;
473 	ibt_cep_state_t		ch_cep_state;
474 
475 	kmutex_t		ch_tx_lock;	/* presumably for ch_tx* below */
476 	kcondvar_t		ch_tx_cv;
477 	uint_t			ch_tx_posted;
478 	boolean_t		ch_tx_busy;
479 	struct eib_wqe_s	*ch_tx;
480 	struct eib_wqe_s	*ch_tx_tail;
481 
482 	kmutex_t		ch_rx_lock;	/* presumably for ch_rx* below */
483 	kcondvar_t		ch_rx_cv;
484 	uint_t			ch_rx_posted;
485 	boolean_t		ch_rx_refilling;
486 
487 	kmutex_t		ch_vhub_lock;	/* presumably for ch_vhub_* */
488 	struct eib_mcg_s	*ch_vhub_table;
489 	struct eib_mcg_s	*ch_vhub_update;
490 	struct eib_mcg_s	*ch_vhub_data;
491 
492 	struct eib_chan_s	*ch_rxpost_next; /* link to another channel */
493 } eib_chan_t;
494 
495 /*
496  * States for vNIC state machine during login (presumably kept in vn_state)
497  */
498 #define	EIB_LOGIN_INIT		0
499 #define	EIB_LOGIN_ACK_WAIT	1
500 #define	EIB_LOGIN_ACK_RCVD	2
501 #define	EIB_LOGIN_NACK_RCVD	3
502 #define	EIB_LOGIN_TBL_WAIT	4
503 #define	EIB_LOGIN_TBL_INPROG	5
504 #define	EIB_LOGIN_TBL_DONE	6
505 #define	EIB_LOGIN_TBL_FAILED	7
506 #define	EIB_LOGIN_DONE		8
507 #define	EIB_LOGIN_TIMED_OUT	9
508 #define	EIB_LOGOUT_DONE		10
509 
510 typedef struct eib_login_data_s {
511 	ib_guid_t		ld_gw_guid;
512 	ib_lid_t		ld_gw_lid;
513 	uint_t			ld_syndrome;
514 	uint16_t		ld_gw_port_id;
515 	ib_qpn_t		ld_gw_data_qpn;
516 	ib_qpn_t		ld_gw_ctl_qpn;
517 	uint16_t		ld_vnic_id;	/* includes set msbit */
518 	uint16_t		ld_vhub_mtu;
519 	uint16_t		ld_vhub_pkey;
520 	uint16_t		ld_assigned_vlan;
521 	uint8_t			ld_gw_sl;
522 	uint8_t			ld_n_rss_mcgid;
523 	uint8_t			ld_n_mac_mcgid;
524 	uint8_t			ld_vnic_name[FIP_VNIC_NAME_LEN];
525 	uint8_t			ld_assigned_mac[ETHERADDRL];
526 	uint8_t			ld_gw_mgid_prefix[FIP_MGID_PREFIX_LEN];
527 	uint8_t			ld_vlan_in_packets;
528 	uint32_t		ld_vhub_id;
529 } eib_login_data_t;
530 
531 #define	EIB_UNICAST_MAC(mac)		(((mac)[0] & 0x01) == 0) /* lsb 0 */
532 
533 /*
534  * Map to translate between DMAC and {qpn, lid, sl}
535  */
536 typedef struct eib_vhub_map_s {
537 	struct eib_vhub_map_s	*mp_next;	/* link to another map entry */
538 	uint32_t		mp_tusn;
539 	ib_qpn_t		mp_qpn;
540 	ib_lid_t		mp_lid;
541 	uint8_t			mp_mac[ETHERADDRL];	/* the DMAC key */
542 	uint8_t			mp_sl;
543 	uint8_t			mp_v_rss_type;
544 } eib_vhub_map_t;
545 
546 /*
547  * Per-vNIC vHUB Table
548  */
549 #define	EIB_TB_NBUCKETS		13	/* size of both entry arrays below */
550 typedef struct eib_vhub_table_s {
551 	kmutex_t		tb_lock;
552 	struct eib_vhub_map_s	*tb_gateway;
553 	struct eib_vhub_map_s	*tb_unicast_miss;
554 	struct eib_vhub_map_s	*tb_vhub_multicast;
555 	struct eib_vhub_map_s	*tb_vnic_entry[EIB_TB_NBUCKETS];
556 	struct eib_vhub_map_s	*tb_mcast_entry[EIB_TB_NBUCKETS];
557 
558 	uint32_t		tb_tusn;
559 	uint8_t			tb_eport_state;
560 
561 	uint16_t		tb_entries_seen;
562 	uint16_t		tb_entries_in_table;
563 	uint32_t		tb_checksum;
564 } eib_vhub_table_t;
565 
566 typedef struct eib_vhub_update_s {
567 	kmutex_t		up_lock;
568 	eib_vhub_map_t		*up_vnic_entry;	/* list via mp_next */
569 	uint32_t		up_tusn;
570 	uint8_t			up_eport_state;
571 } eib_vhub_update_t;
572 
573 typedef struct eib_ether_hdr_s {
574 	int			eh_tagless;	/* presumably no vlan tag */
575 	uint16_t		eh_ether_type;
576 	uint16_t		eh_vlan;
577 	uint8_t			eh_dmac[ETHERADDRL];
578 	uint8_t			eh_smac[ETHERADDRL];
579 } eib_ether_hdr_t;
580 
581 /*
582  * vNIC Information
583  */
584 typedef struct eib_vnic_s {
585 	struct eib_s		*vn_ss;
586 	eib_chan_t		*vn_ctl_chan;
587 	eib_chan_t		*vn_data_chan;
588 	int			vn_instance;
589 	uint16_t		vn_vlan;
590 	uint16_t		vn_id;
591 	uint8_t			vn_macaddr[ETHERADDRL];
592 	struct eib_login_data_s	vn_login_data;	/* embedded, not a pointer */
593 
594 	kmutex_t		vn_lock;
595 	kcondvar_t		vn_cv;
596 	uint_t			vn_state;	/* presumably EIB_LOGIN_* */
597 	struct eib_vhub_table_s	*vn_vhub_table;
598 	struct eib_vhub_update_s *vn_vhub_update;
599 
600 	ddi_softint_handle_t    vn_ctl_si_hdl;
601 	ddi_softint_handle_t    vn_data_tx_si_hdl;
602 	ddi_softint_handle_t    vn_data_rx_si_hdl;
603 } eib_vnic_t;
604 
605 
606 /*
607  * Base NIC's mac state flags. The lock protects the starting/stopping
608  * bits.  Access to the rest of the mac state is protected by these
609  * two bits.
610  */
611 #define	EIB_NIC_STARTING	0x01
612 #define	EIB_NIC_STOPPING	0x02
613 #define	EIB_NIC_STARTED		0x80
614 #define	EIB_NIC_RESTARTING	(EIB_NIC_STARTING | EIB_NIC_STOPPING)
615 
616 typedef struct eib_node_state_s {
617 	kmutex_t		ns_lock;	/* for starting/stopping bits */
618 	kcondvar_t		ns_cv;
619 	uint_t			ns_nic_state;	/* EIB_NIC_* flags */
620 	link_state_t		ns_link_state;
621 } eib_node_state_t;
622 
623 /*
624  * MIB-II statistics to report to the mac layer (cf. EIB_*_COUNTER below)
625  */
626 typedef struct eib_stats_s {
627 	uint64_t		st_obytes;	/* bytes sent out */
628 	uint64_t		st_opkts;	/* pkts sent out */
629 	uint64_t		st_brdcstxmit;	/* broadcast pkts transmitted */
630 	uint64_t		st_multixmit;	/* multicast pkts transmitted */
631 	uint64_t		st_oerrors;	/* transmit errors */
632 	uint64_t		st_noxmitbuf;	/* transmit pkts discarded */
633 
634 	uint64_t		st_rbytes;	/* bytes received */
635 	uint64_t		st_ipkts;	/* pkts received */
636 	uint64_t		st_brdcstrcv;	/* broadcast pkts received */
637 	uint64_t		st_multircv;	/* multicast pkts received */
638 	uint64_t		st_ierrors;	/* receive errors */
639 	uint64_t		st_norcvbuf;	/* receive pkts discarded */
640 } eib_stats_t;
641 
642 #define	EIB_UPDATE_COUNTER(addr, val)	(atomic_add_64((addr), (val)))
643 #define	EIB_INCR_COUNTER(addr)		(atomic_inc_64((addr)))	/* by one */
644 #define	EIB_DECR_COUNTER(addr)		(atomic_dec_64((addr)))	/* by one */
645 
646 /*
647  * Cache of address vectors with dlid as the key. Currently we use
648  * eib state structure's ei_av_lock to protect the individual address
649  * vector's fields.  This is a lock granularity that's slightly
650  * bigger than ideal, but it should do for now.
651  */
652 #define	EIB_AV_NBUCKETS		17	/* size of ei_av[] in eib_t */
653 typedef struct eib_avect_s {
654 	struct eib_avect_s	*av_next;	/* link to another entry */
655 	ibt_adds_vect_t		av_vect;
656 	uint_t			av_ref;
657 } eib_avect_t;
658 
659 /*
660  * vNIC creation and deletion are serialized by a non-zero value
661  * to the ei_vnic_state member (i.e. only one vnic may be created
662  * or deleted at a time). The code makes sure to access/update
663  * the ei_active_vnics member only after a successful setting of
664  * ei_vnic_state.
665  */
666 #define	EIB_VN_BEING_CREATED	0x01
667 #define	EIB_VN_BEING_DELETED	0x02
668 #define	EIB_VN_BEING_MODIFIED	(EIB_VN_BEING_CREATED | EIB_VN_BEING_DELETED)
669 
670 /*
671  * All possible EoIB event work items that need to be handled
672  */
673 #define	EIB_EV_NONE		0
674 #define	EIB_EV_PORT_DOWN	1
675 #define	EIB_EV_PORT_UP		2
676 #define	EIB_EV_PKEY_CHANGE	3
677 #define	EIB_EV_SGID_CHANGE	4
678 #define	EIB_EV_CLNT_REREG	5
679 #define	EIB_EV_GW_EPORT_DOWN	6
680 #define	EIB_EV_GW_DOWN		7
681 #define	EIB_EV_GW_UP		8
682 #define	EIB_EV_GW_INFO_UPDATE	9
683 #define	EIB_EV_MCG_DELETED	10
684 #define	EIB_EV_MCG_CREATED	11
685 #define	EIB_EV_SHUTDOWN		12
686 
687 typedef struct eib_event_s {
688 	struct eib_event_s	*ev_next;	/* link to another event */
689 	uint_t			ev_code;	/* presumably an EIB_EV_* */
690 	void			*ev_arg;
691 } eib_event_t;
692 
693 /*
694  * Work element for new vnic creation
695  */
696 typedef struct eib_vnic_req_s {
697 	struct eib_vnic_req_s	*vr_next;	/* link to another request */
698 	uint_t			vr_req;		/* EIB_CR_REQ_* */
699 	uint8_t			vr_mac[ETHERADDRL];
700 	uint16_t		vr_vlan;
701 } eib_vnic_req_t;
702 
703 /*
704  * Values for vr_req
705  */
706 #define	EIB_CR_REQ_NEW_VNIC	1
707 #define	EIB_CR_REQ_FLUSH	2
708 #define	EIB_CR_REQ_DIE		3
709 
710 /*
711  * Work element for vnics kept alive by the keepalive manager thread
712  * and bitfield values for ei_ka_vnics_event.
713  */
714 typedef struct eib_ka_vnics_s {
715 	struct eib_ka_vnics_s	*ka_next;	/* link to another element */
716 	struct eib_vnic_s	*ka_vnic;
717 } eib_ka_vnics_t;
718 
719 #define	EIB_KA_VNICS_DIE	0x1	/* bits for ei_ka_vnics_event */
720 #define	EIB_KA_VNICS_TIMED_OUT	0x2
721 
722 /*
723  * EoIB per-instance state
724  */
725 typedef struct eib_s {
726 	ibt_clnt_hdl_t		ei_ibt_hdl;
727 	ibt_hca_hdl_t		ei_hca_hdl;
728 	ibt_pd_hdl_t		ei_pd_hdl;
729 	mac_handle_t		ei_mac_hdl;
730 
731 	ddi_softint_handle_t    ei_admin_si_hdl;
732 	ddi_callback_id_t	ei_login_ack_cb;
733 	ddi_callback_id_t	ei_gw_alive_cb;
734 	ddi_callback_id_t	ei_gw_info_cb;
735 
736 	ibt_hca_attr_t		*ei_hca_attrs;
737 	dev_info_t		*ei_dip;
738 	uint_t			ei_instance;
739 
740 	struct eib_gw_props_s	*ei_gw_props;
741 	struct eib_props_s	*ei_props;
742 	struct eib_caps_s	*ei_caps;
743 	struct eib_stats_s	*ei_stats;
744 
745 	struct eib_node_state_s	*ei_node_state;
746 	struct eib_chan_s	*ei_admin_chan;
747 
748 	struct eib_wqe_pool_s	*ei_tx;
749 	struct eib_wqe_pool_s	*ei_rx;
750 	struct eib_lsobkt_s	*ei_lso;
751 
752 	kmutex_t		ei_vnic_lock;
753 	kcondvar_t		ei_vnic_cv;
754 	uint_t			ei_vnic_state;	/* EIB_VN_BEING_* or zero */
755 	uint64_t		ei_active_vnics;
756 	uint64_t		ei_zombie_vnics;
757 	uint64_t		ei_rejoin_vnics;
758 	struct eib_vnic_s	*ei_vnic[EIB_MAX_VNICS];
759 	struct eib_vnic_s	*ei_vnic_pending;
760 	int64_t			ei_gw_last_heartbeat;
761 	boolean_t		ei_gw_unreachable;
762 	uint8_t			ei_gw_eport_state;
763 
764 	kmutex_t		ei_av_lock;
765 	struct eib_avect_s	*ei_av[EIB_AV_NBUCKETS];
766 
767 	kmutex_t		ei_ev_lock;
768 	kcondvar_t		ei_ev_cv;
769 	struct eib_event_s	*ei_event;	/* list via ev_next */
770 
771 	kmutex_t		ei_rxpost_lock;
772 	kcondvar_t		ei_rxpost_cv;
773 	uint_t			ei_rxpost_die;
774 	struct eib_chan_s	*ei_rxpost;
775 
776 	kmutex_t		ei_vnic_req_lock;
777 	kcondvar_t		ei_vnic_req_cv;
778 	struct eib_vnic_req_s	*ei_vnic_req;
779 	struct eib_vnic_req_s	*ei_failed_vnic_req;
780 	struct eib_vnic_req_s	*ei_pending_vnic_req;
781 
782 	kmutex_t		ei_ka_vnics_lock;
783 	kcondvar_t		ei_ka_vnics_cv;
784 	uint_t			ei_ka_vnics_event;	/* EIB_KA_VNICS_* */
785 	struct eib_ka_vnics_s	*ei_ka_vnics;
786 
787 	kt_did_t		ei_txwqe_monitor;
788 	kt_did_t		ei_lsobufs_monitor;
789 	kt_did_t		ei_rwqes_refiller;
790 	kt_did_t		ei_vnic_creator;
791 	kt_did_t		ei_events_handler;
792 	kt_did_t		ei_keepalives_manager;
793 } eib_t;
794 
795 /*
796  * Private read-only datalink properties (names start with an underscore)
797  */
798 #define	EIB_DLPROP_GW_EPORT_STATE	"_eib_eport_state"
799 #define	EIB_DLPROP_HCA_GUID		"_eib_hca_guid"
800 #define	EIB_DLPROP_PORT_GUID		"_eib_port_guid"
801 
802 /*
803  * FUNCTION PROTOTYPES FOR CROSS-FILE LINKAGE (each eib_rb_X pairs with eib_X)
804  */
805 
806 /*
807  * FIP protocol related
808  */
809 extern int eib_fip_login(eib_t *, eib_vnic_t *, int *);
810 extern int eib_fip_heartbeat(eib_t *, eib_vnic_t *, int *);
811 extern int eib_fip_vhub_table(eib_t *, eib_vnic_t *, int *);
812 extern int eib_fip_logout(eib_t *, eib_vnic_t *, int *);
813 extern int eib_fip_parse_login_ack(eib_t *, uint8_t *, eib_login_data_t *);
814 extern int eib_fip_parse_ctl_pkt(uint8_t *, eib_vnic_t *);
815 
816 /*
817  * Service threads and other handlers (only the lso stop routine returns int)
818  */
819 extern void eib_events_handler(eib_t *);
820 extern void eib_svc_enqueue_event(eib_t *, eib_event_t *);
821 extern void eib_refill_rwqes(eib_t *);
822 extern void eib_vnic_creator(eib_t *);
823 extern void eib_monitor_tx_wqes(eib_t *);
824 extern void eib_monitor_lso_bufs(eib_t *);
825 extern void eib_manage_keepalives(eib_t *);
826 extern void eib_stop_events_handler(eib_t *);
827 extern void eib_stop_refill_rwqes(eib_t *);
828 extern void eib_stop_vnic_creator(eib_t *);
829 extern void eib_stop_monitor_tx_wqes(eib_t *);
830 extern int eib_stop_monitor_lso_bufs(eib_t *, boolean_t);
831 extern void eib_stop_manage_keepalives(eib_t *);
832 extern void eib_flush_vnic_reqs(eib_t *);
833 extern void eib_gw_info_cb(dev_info_t *, ddi_eventcookie_t, void *, void *);
834 extern void eib_gw_alive_cb(dev_info_t *, ddi_eventcookie_t, void *, void *);
835 extern void eib_login_ack_cb(dev_info_t *, ddi_eventcookie_t, void *, void *);
836 
837 /*
838  * Admin QP related (eib_rb_* presumably undoes the setup call; confirm)
839  */
840 extern int eib_adm_setup_qp(eib_t *, int *);
841 extern uint_t eib_adm_comp_handler(caddr_t, caddr_t);
842 extern void eib_rb_adm_setup_qp(eib_t *);
843 
844 /*
845  * Control QP related (same create/handler/eib_rb_ trio as the admin qp)
846  */
847 extern int eib_ctl_create_qp(eib_t *, eib_vnic_t *, int *);
848 extern uint_t eib_ctl_comp_handler(caddr_t, caddr_t);
849 extern void eib_rb_ctl_create_qp(eib_t *, eib_vnic_t *);
850 
851 /*
852  * Data QP related (separate rx and tx completion handlers)
853  */
854 extern int eib_data_create_qp(eib_t *, eib_vnic_t *, int *);
855 extern uint_t eib_data_rx_comp_handler(caddr_t, caddr_t);
856 extern uint_t eib_data_tx_comp_handler(caddr_t, caddr_t);
857 extern void eib_data_rx_recycle(caddr_t);
858 extern void eib_data_post_tx(eib_vnic_t *, eib_wqe_t *);
859 extern void eib_data_parse_ether_hdr(mblk_t *, eib_ether_hdr_t *);
860 extern int eib_data_lookup_vnic(eib_t *, uint8_t *, uint16_t, eib_vnic_t **,
861     boolean_t *);
862 extern int eib_data_prepare_frame(eib_vnic_t *, eib_wqe_t *, mblk_t *,
863     eib_ether_hdr_t *);
864 extern void eib_rb_data_create_qp(eib_t *, eib_vnic_t *);
865 
866 /*
867  * Resource related (swqes, rwqes and lso buffers: grab/return pairs)
868  */
869 extern int eib_rsrc_setup_bufs(eib_t *, int *);
870 extern int eib_rsrc_grab_swqes(eib_t *, eib_wqe_t **, uint_t, uint_t *, int);
871 extern int eib_rsrc_grab_rwqes(eib_t *, eib_wqe_t **, uint_t, uint_t *, int);
872 extern int eib_rsrc_grab_lsobufs(eib_t *, uint_t, ibt_wr_ds_t *, uint32_t *);
873 extern eib_wqe_t *eib_rsrc_grab_swqe(eib_t *, int);
874 extern eib_wqe_t *eib_rsrc_grab_rwqe(eib_t *, int);
875 extern void eib_rsrc_return_swqe(eib_t *, eib_wqe_t *, eib_chan_t *);
876 extern void eib_rsrc_return_rwqe(eib_t *, eib_wqe_t *, eib_chan_t *);
877 extern void eib_rsrc_return_lsobufs(eib_t *, ibt_wr_ds_t *, uint32_t);
878 extern void eib_rsrc_decr_posted_swqe(eib_t *, eib_chan_t *);
879 extern void eib_rsrc_decr_posted_rwqe(eib_t *, eib_chan_t *);
880 extern void eib_rsrc_txwqes_needed(eib_t *);
881 extern void eib_rsrc_lsobufs_needed(eib_t *);
882 extern boolean_t eib_rsrc_rxpool_low(eib_wqe_t *);
883 extern void eib_rb_rsrc_setup_bufs(eib_t *, boolean_t);
884 
885 /*
886  * IBT related
887  */
888 extern int eib_ibt_hca_init(eib_t *);
889 extern void eib_ibt_link_mod(eib_t *);
890 extern int eib_ibt_modify_chan_pkey(eib_t *, eib_chan_t *, ib_pkey_t);
891 extern eib_avect_t *eib_ibt_hold_avect(eib_t *, ib_lid_t, uint8_t);
892 extern void eib_ibt_release_avect(eib_t *, eib_avect_t *);
893 extern void eib_ibt_free_avects(eib_t *);
894 extern void eib_ibt_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
895     ibt_async_event_t *);
896 extern void eib_ibt_record_capab(eib_t *, ibt_hca_attr_t *, eib_caps_t *);
897 extern void eib_rb_ibt_hca_init(eib_t *, uint_t);
898 
899 /*
900  * Chan related (eib_chan_init takes no arguments and returns the channel)
901  */
902 extern eib_chan_t *eib_chan_init(void);
903 extern void eib_chan_fini(eib_chan_t *);
904 extern int eib_chan_post_rx(eib_t *, eib_chan_t *, uint_t *);
905 extern int eib_chan_post_recv(eib_t *, eib_chan_t *, eib_wqe_t *);
906 
907 /*
908  * Mac layer related (nic state helpers take/return EIB_NIC_* style uint_t)
909  */
910 extern void eib_mac_set_nic_state(eib_t *, uint_t);
911 extern void eib_mac_clr_nic_state(eib_t *, uint_t);
912 extern void eib_mac_upd_nic_state(eib_t *, uint_t, uint_t);
913 extern uint_t eib_mac_get_nic_state(eib_t *);
914 extern void eib_mac_link_state(eib_t *, link_state_t, boolean_t);
915 extern void eib_mac_link_down(eib_t *, boolean_t);
916 extern void eib_mac_link_up(eib_t *, boolean_t);
917 extern int eib_mac_start(eib_t *);
918 extern void eib_mac_stop(eib_t *);
919 extern int eib_mac_multicast(eib_t *, boolean_t, uint8_t *);
920 extern int eib_mac_promisc(eib_t *, boolean_t);
921 extern int eib_mac_tx(eib_t *, mblk_t *);
922 extern int eib_mac_hca_portstate(eib_t *, ib_lid_t *, int *);
923 
924 /*
925  * VNIC related (eib_rb_vnic_create pairs with eib_vnic_create)
926  */
927 extern int eib_vnic_create(eib_t *, uint8_t *, uint16_t, eib_vnic_t **, int *);
928 extern void eib_vnic_delete(eib_t *, eib_vnic_t *);
929 extern int eib_vnic_wait_for_login_ack(eib_t *, eib_vnic_t *, int *);
930 extern void eib_vnic_login_ack(eib_t *, eib_login_data_t *);
931 extern int eib_vnic_wait_for_table(eib_t *, eib_vnic_t *, int *);
932 extern void eib_vnic_vhub_table_done(eib_vnic_t *, uint_t);
933 extern int eib_vnic_join_data_mcg(eib_t *, eib_vnic_t *, uint8_t *,
934     boolean_t, int *);
935 extern int eib_vnic_setup_dest(eib_vnic_t *, eib_wqe_t *, uint8_t *, uint16_t);
936 extern void eib_vnic_leave_data_mcg(eib_t *, eib_vnic_t *, uint8_t *);
937 extern void eib_vnic_init_tables(eib_t *, eib_vnic_t *);
938 extern void eib_vnic_fini_tables(eib_t *, eib_vnic_t *, boolean_t);
939 extern eib_chan_t *eib_vnic_get_data_chan(eib_t *, int);
940 extern void eib_vnic_need_new(eib_t *, uint8_t *, uint16_t);
941 extern void eib_vnic_enqueue_req(eib_t *, eib_vnic_req_t *);
942 extern void eib_vnic_resurrect_zombies(eib_t *, uint8_t *);
943 extern void eib_vnic_restart(eib_t *, int, uint8_t *);
944 extern void eib_vnic_rejoin_mcgs(eib_t *);
945 extern void eib_rb_vnic_create(eib_t *, eib_vnic_t *, uint_t);
946 
947 /*
948  * Logging and other stuff (the EIB_DPRINTF_* macros name these functions)
949  */
950 extern void eib_debug_init(void);
951 extern void eib_debug_fini(void);
952 extern void eib_dprintf_crit(int, const char *fmt, ...);
953 extern void eib_dprintf_err(int, const char *fmt, ...);
954 extern void eib_dprintf_warn(int, const char *fmt, ...);
955 #ifdef EIB_DEBUG
956 extern void eib_dprintf_debug(int, const char *fmt, ...);
957 extern void eib_dprintf_args(int, const char *fmt, ...);
958 extern void eib_dprintf_pkt(int, uint8_t *, uint_t);	/* not printf-like */
959 extern void eib_dprintf_verbose(int, const char *fmt, ...);
960 #endif	/* EIB_DEBUG */
961 extern int eib_get_props(eib_t *);
962 extern void eib_update_props(eib_t *, eib_gw_info_t *);
963 extern void eib_rb_get_props(eib_t *);
964 
965 /*
966  * EoIB specific global variables (declared here, defined elsewhere)
967  */
968 extern ib_gid_t eib_reserved_gid;
969 extern uint8_t eib_zero_mac[];
970 extern uint8_t eib_broadcast_mac[];
971 extern int eib_setbit_mod67[];	/* indexed by EIB_FIND_LSB_SET() */
972 extern char *eib_pvt_props[];
973 
974 /*
975  * HW/FW workarounds (int switches; their values are set elsewhere)
976  */
977 extern int eib_wa_no_desc_list_len;
978 extern int eib_wa_no_cksum_offload;
979 extern int eib_wa_no_lso;
980 extern int eib_wa_no_mcast_entries;
981 extern int eib_wa_no_av_discover;
982 extern int eib_wa_no_good_vp_flag;
983 extern int eib_wa_no_good_vhub_cksum;
984 
985 /*
986  * Miscellaneous externs (no eib_ prefix; presumably kernel symbols, confirm)
987  */
988 extern void freemsgchain(mblk_t *);
989 extern pri_t minclsyspri;
990 
991 #ifdef __cplusplus
992 }
993 #endif
994 
995 #endif	/* _SYS_IB_EOIB_EIB_IMPL_H */
996