xref: /illumos-gate/usr/src/uts/common/sys/ib/clients/ibd/ibd.h (revision 82beb6028da8d7d7f8562908ca027bd4a1cc7d37)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #ifndef _SYS_IB_CLIENTS_IBD_H
27 #define	_SYS_IB_CLIENTS_IBD_H
28 
29 #ifdef __cplusplus
30 extern "C" {
31 #endif
32 
33 /* The following macros are used in both ibd.c and ibd_cm.c */
34 
35 /*
36  * Completion queue polling control
37  */
/* Individual bits of the completion queue polling control word. */
#define	IBD_CQ_POLLING			(1 << 0)
#define	IBD_REDO_CQ_POLLING		(1 << 1)
40 
/*
 * Upper bound on the length of an mp chain handed back to crossbow;
 * the same value caps how many rx wc's are polled in one go.
 */
#define	IBD_MAX_RX_MP_LEN		16

/*
 * Number of send work requests handed to a single ibt_post_send()
 * when multiple-send-wr posting is in use.
 */
#define	IBD_MAX_TX_POST_MULTIPLE	4
52 
53 /*
54  * Flag bits for resources to reap
55  */
/* One bit per kind of resource that can be reaped; bits may be OR-ed. */
#define	IBD_RSRC_SWQE			(1 << 0)
#define	IBD_RSRC_LSOBUF			(1 << 1)
#define	IBD_RSRC_RC_SWQE		(1 << 2)
#define	IBD_RSRC_RC_TX_LARGEBUF		(1 << 3)
60 
61 /*
62  * Async operation types
63  */
/* Operation codes, numbered consecutively from 1. */
#define	IBD_ASYNC_GETAH			1
#define	IBD_ASYNC_JOIN			2
#define	IBD_ASYNC_LEAVE			3
#define	IBD_ASYNC_PROMON		4
#define	IBD_ASYNC_PROMOFF		5
#define	IBD_ASYNC_REAP			6
#define	IBD_ASYNC_TRAP			7
#define	IBD_ASYNC_SCHED			8
#define	IBD_ASYNC_LINK			9
#define	IBD_ASYNC_EXIT			10

/* Operation codes specific to Reliable Connected (RC) handling. */
#define	IBD_ASYNC_RC_TOO_BIG		11
#define	IBD_ASYNC_RC_CLOSE_ACT_CHAN	12
#define	IBD_ASYNC_RC_RECYCLE_ACE	13
#define	IBD_ASYNC_RC_CLOSE_PAS_CHAN	14
78 
79 /*
80  * State of IBD driver initialization during attach/m_start
81  */
/*
 * Each initialization step owns one bit, so the progress made so far
 * can be recorded as an OR of these values.  Bit 23 is only defined
 * in DEBUG builds.
 */
#define	IBD_DRV_STATE_INITIALIZED	(1 << 0)
#define	IBD_DRV_RXINTR_ADDED		(1 << 1)
#define	IBD_DRV_TXINTR_ADDED		(1 << 2)
#define	IBD_DRV_IBTL_ATTACH_DONE	(1 << 3)
#define	IBD_DRV_HCA_OPENED		(1 << 4)
#define	IBD_DRV_PD_ALLOCD		(1 << 5)
#define	IBD_DRV_MAC_REGISTERED		(1 << 6)
#define	IBD_DRV_PORT_DETAILS_OBTAINED	(1 << 7)
#define	IBD_DRV_BCAST_GROUP_FOUND	(1 << 8)
#define	IBD_DRV_ACACHE_INITIALIZED	(1 << 9)
#define	IBD_DRV_CQS_ALLOCD		(1 << 10)
#define	IBD_DRV_UD_CHANNEL_SETUP	(1 << 11)
#define	IBD_DRV_TXLIST_ALLOCD		(1 << 12)
#define	IBD_DRV_SCQ_NOTIFY_ENABLED	(1 << 13)
#define	IBD_DRV_RXLIST_ALLOCD		(1 << 14)
#define	IBD_DRV_BCAST_GROUP_JOINED	(1 << 15)
#define	IBD_DRV_ASYNC_THR_CREATED	(1 << 16)
#define	IBD_DRV_RCQ_NOTIFY_ENABLED	(1 << 17)
#define	IBD_DRV_SM_NOTICES_REGISTERED	(1 << 18)
#define	IBD_DRV_STARTED			(1 << 19)
#define	IBD_DRV_RC_SRQ_ALLOCD		(1 << 20)
#define	IBD_DRV_RC_LARGEBUF_ALLOCD	(1 << 21)
#define	IBD_DRV_RC_LISTEN		(1 << 22)
#ifdef DEBUG
#define	IBD_DRV_RC_PRIVATE_STATE	(1 << 23)
#endif
#define	IBD_DRV_IN_DELETION		(1 << 24)
#define	IBD_DRV_IN_LATE_HCA_INIT	(1 << 25)
#define	IBD_DRV_REQ_LIST_INITED		(1 << 26)
#define	IBD_DRV_RC_TIMEOUT		(1 << 27)
112 
113 /*
114  * Miscellaneous constants
115  */
/* Direction selector; IBD_PAD_NSNA() compares its "type" against these. */
#define	IBD_SEND			0
#define	IBD_RECV			1
118 
119 /* Tunables defaults and limits */
/* Link modes */
#define	IBD_LINK_MODE_UD		0
#define	IBD_LINK_MODE_RC		1

/* Defaults: link mode, LSO and broadcast group policy */
#define	IBD_DEF_LINK_MODE		IBD_LINK_MODE_RC
#define	IBD_DEF_LSO_POLICY		B_TRUE
#define	IBD_DEF_NUM_LSO_BUFS		1024
#define	IBD_DEF_CREATE_BCAST_GROUP	B_TRUE

/* Defaults: completion coalescing (count / usec) for UD and RC */
#define	IBD_DEF_COALESCE_COMPLETIONS	B_TRUE
#define	IBD_DEF_UD_RX_COMP_COUNT	4
#define	IBD_DEF_UD_RX_COMP_USEC		10
#define	IBD_DEF_UD_TX_COMP_COUNT	16
#define	IBD_DEF_UD_TX_COMP_USEC		300
#define	IBD_DEF_RC_RX_COMP_COUNT	4
#define	IBD_DEF_RC_RX_COMP_USEC		10
#define	IBD_DEF_RC_TX_COMP_COUNT	10
#define	IBD_DEF_RC_TX_COMP_USEC		300

/* Defaults: copy thresholds */
#define	IBD_DEF_UD_TX_COPY_THRESH	4096
#define	IBD_DEF_RC_RX_COPY_THRESH	4096
#define	IBD_DEF_RC_TX_COPY_THRESH	4096

/* Defaults: work queue element counts (RC counts are smaller on i386) */
#define	IBD_DEF_UD_NUM_RWQE		4000
#define	IBD_DEF_UD_NUM_SWQE		4000
#define	IBD_DEF_RC_ENABLE_SRQ		B_TRUE
#ifdef __i386
#define	IBD_DEF_RC_NUM_RWQE		511
#define	IBD_DEF_RC_NUM_SWQE		255
#else
#define	IBD_DEF_RC_NUM_RWQE		2047
#define	IBD_DEF_RC_NUM_SWQE		511
#endif

/* Defaults: address handle count and hash size */
#define	IBD_DEF_NUM_AH			256
#define	IBD_DEF_HASH_SIZE		32

/* Defaults derived from the RC receive wqe count */
#define	IBD_DEF_RC_NUM_SRQ		(IBD_DEF_RC_NUM_RWQE - 1)
#define	IBD_DEF_RC_RX_RWQE_THRESH	(IBD_DEF_RC_NUM_RWQE >> 2)
153 
154 /* Tunable limits */
/* LSO buffers and UD copy threshold */
#define	IBD_MIN_NUM_LSO_BUFS		512
#define	IBD_MAX_NUM_LSO_BUFS		4096
#define	IBD_MIN_UD_TX_COPY_THRESH	2048
#define	IBD_MAX_UD_TX_COPY_THRESH	65536

/* UD work queue element counts */
#define	IBD_MIN_UD_NUM_SWQE		512
#define	IBD_MAX_UD_NUM_SWQE		8000
#define	IBD_MIN_UD_NUM_RWQE		512
#define	IBD_MAX_UD_NUM_RWQE		8000

/* Address handles and hash size */
#define	IBD_MIN_NUM_AH			32
#define	IBD_MAX_NUM_AH			8192
#define	IBD_MIN_HASH_SIZE		32
#define	IBD_MAX_HASH_SIZE		1024

/* RC work queue element counts (the send minimum is lower on i386) */
#ifdef __i386
#define	IBD_MIN_RC_NUM_SWQE		255
#else
#define	IBD_MIN_RC_NUM_SWQE		511
#endif
#define	IBD_MAX_RC_NUM_SWQE		8000
#define	IBD_MIN_RC_NUM_RWQE		511
#define	IBD_MAX_RC_NUM_RWQE		8000

/* RC copy thresholds */
#define	IBD_MIN_RC_RX_COPY_THRESH	1500
#define	IBD_MAX_RC_RX_COPY_THRESH	65520
#define	IBD_MIN_RC_TX_COPY_THRESH	1500
#define	IBD_MAX_RC_TX_COPY_THRESH	65520

/* Minimums derived from the RC receive wqe minimum */
#define	IBD_MIN_RC_NUM_SRQ		(IBD_MIN_RC_NUM_RWQE - 1)
#define	IBD_MIN_RC_RX_RWQE_THRESH	(IBD_MIN_RC_NUM_RWQE >> 2)
182 
183 /*
184  * Thresholds
185  *
186  * When waiting for resources (swqes or lso buffers) to become available,
187  * the first two thresholds below determine how long to wait before informing
188  * the network layer to start sending packets again. The IBD_TX_POLL_THRESH
189  * determines how low the available swqes should go before we start polling
190  * the completion queue.
191  */
#define	IBD_FREE_LSOS_THRESH		8	/* lso buffers */
#define	IBD_FREE_SWQES_THRESH		20	/* swqes */
#define	IBD_TX_POLL_THRESH		80	/* swqes; start polling CQ */
195 
/*
 * DPRINT(level, fmt, ...): calls debug_print() in DEBUG builds.  In
 * other builds the call site becomes "0 && (args)", so the arguments
 * are never evaluated.
 */
#if defined(DEBUG)
void debug_print(int level, char *format, ...);
#define	DPRINT		debug_print
#else
#define	DPRINT		0 &&
#endif
202 
203 /*
204  * AH and MCE active list manipulation:
205  *
206  * Multicast disable requests and MCG delete traps are two cases
207  * where the active AH entry for the mcg (if any unreferenced one exists)
208  * will be moved to the free list (to force the next Tx to the mcg to
209  * join the MCG in SendOnly mode). Port up handling will also move AHs
210  * from active to free list.
211  *
212  * In the case when some transmits are still pending on an entry
213  * for an mcg, but a multicast disable has already been issued on the
214  * mcg, there are some options to consider to preserve the join state
215  * to ensure the emitted packet is properly routed on the IBA fabric.
216  * For the AH, we can
217  * 1. take out of active list at multicast disable time.
218  * 2. take out of active list only when last pending Tx completes.
219  * For the MCE, we can
220  * 3. take out of active list at multicast disable time.
221  * 4. take out of active list only when last pending Tx completes.
222  * 5. move from active list to stale list at multicast disable time.
223  * We choose to use 2,4. We use option 4 so that if a multicast enable
224  * is tried before the pending Tx completes, the enable code finds the
225  * mce in the active list and just has to make sure it will not be reaped
226  * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
227  * a stale list (#5) that would be checked in the enable code would need
228  * to be implemented. Option 2 is used, because otherwise, a Tx attempt
229  * after the multicast disable would try to put an AH in the active list,
230  * and associate the mce it finds in the active list to this new AH,
231  * whereas the mce is already associated with the previous AH (taken off
232  * the active list), and will be removed once the pending Tx's complete
233  * (unless a reference count on mce's is implemented). One implication of
234  * using 2,4 is that new Tx's posted before the pending Tx's complete will
235  * grab new references on the AH, further delaying the leave.
236  *
 * In the case of mcg delete (or create) trap when the port is sendonly
 * joined, the AH and MCE handling is different: the AH and MCE have to be
 * immediately taken off the active lists (forcing a join and path lookup
 * at the next Tx is the only guaranteed means of ensuring a proper Tx
 * to an mcg as it is repeatedly created and deleted and goes through
 * reincarnations).
243  *
244  * When a port is already sendonly joined, and a multicast enable is
245  * attempted, the same mce structure is promoted; this ensures only a
246  * single mce on the active list tracks the most powerful join state.
247  *
248  * In the case of port up event handling, the MCE for sendonly membership
249  * is freed up, and the ACE is put into the free list as soon as possible
250  * (depending on whether posted Tx's have completed). For fullmembership
251  * MCE's though, the ACE is similarly handled; but the MCE is kept around
252  * (a re-JOIN is attempted) only if the DLPI leave has not already been
253  * done; else the mce is deconstructed (mc_fullreap case).
254  *
255  * MCG creation and deletion trap handling:
256  *
257  * These traps are unreliable (meaning sometimes the trap might never
258  * be delivered to the subscribed nodes) and may arrive out-of-order
259  * since they use UD transport. An alternative to relying on these
260  * unreliable traps is to poll for mcg presence every so often, but
261  * instead of doing that, we try to be as conservative as possible
262  * while handling the traps, and hope that the traps do arrive at
263  * the subscribed nodes soon. Note that if a node is fullmember
264  * joined to an mcg, it can not possibly receive a mcg create/delete
265  * trap for that mcg (by fullmember definition); if it does, it is
266  * an old trap from a previous incarnation of the mcg.
267  *
268  * Whenever a trap is received, the driver cleans up its sendonly
269  * membership to the group; we choose to do a sendonly leave even
270  * on a creation trap to handle the case of a prior deletion of the mcg
271  * having gone unnoticed. Consider an example scenario:
272  * T1: MCG M is deleted, and fires off deletion trap D1.
273  * T2: MCG M is recreated, fires off creation trap C1, which is lost.
274  * T3: Node N tries to transmit to M, joining in sendonly mode.
275  * T4: MCG M is deleted, and fires off deletion trap D2.
276  * T5: N receives a deletion trap, but can not distinguish D1 from D2.
277  *     If the trap is D2, then a LEAVE is not required, since the mcg
278  *     is already deleted; but if it is D1, a LEAVE is required. A safe
279  *     approach is to always LEAVE, but the SM may be confused if it
280  *     receives a LEAVE without a prior JOIN.
281  *
282  * Management of the non-membership to an mcg is similar to the above,
283  * except that if the interface is in promiscuous mode, it is required
284  * to attempt to re-join the mcg after receiving a trap. Unfortunately,
285  * if the re-join attempt fails (in which case a warning message needs
286  * to be printed), it is not clear whether it failed due to the mcg not
287  * existing, or some fabric/hca issues, due to the delayed nature of
288  * trap delivery. Querying the SA to establish presence/absence of the
289  * mcg is also racy at best. Thus, the driver just prints a warning
290  * message when it can not rejoin after receiving a create trap, although
291  * this might be (on rare occasions) a mis-warning if the create trap is
292  * received after the mcg was deleted.
293  */
294 
295 /*
296  * Implementation of atomic "recycle" bits and reference count
297  * on address handles. This utilizes the fact that max reference
298  * count on any handle is limited by number of send wqes, thus
299  * high bits in the ac_ref field can be used as the recycle bits,
300  * and only the low bits hold the number of pending Tx requests.
301  * This atomic AH reference counting allows the Tx completion
302  * handler not to acquire the id_ac_mutex to process every completion,
303  * thus reducing lock contention problems between completion and
304  * the Tx path.
305  */
/*
 * ac_ref layout: CYCLEVAL is the "recycle" bit; the bits below it hold
 * the count of pending Tx requests.  Every macro takes a pointer to an
 * entry with a 32-bit ac_ref field.  The argument is parenthesized in
 * each expansion but may be evaluated more than once, so it must be
 * free of side effects.
 */
#define	CYCLEVAL		0x80000

/* Reset both the reference count and the recycle bit (plain store). */
#define	CLEAR_REFCYCLE(ace)	((ace)->ac_ref = 0)

/* True when the recycle bit is set. */
#define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)

/* Raw ac_ref value, recycle bit included. */
#define	GET_REF(ace)		((ace)->ac_ref)

/* Reference count with the recycle bit masked off; the bit must be set. */
#define	GET_REF_CYCLE(ace) (				\
	ASSERT(CYCLE_SET(ace)),				\
	((ace)->ac_ref & ~(CYCLEVAL))			\
)

/*
 * Atomically add num references.  The brace form (rather than
 * do/while) is retained so existing call sites keep compiling.
 */
#define	INC_REF(ace, num) {				\
	atomic_add_32(&(ace)->ac_ref, (num));		\
}

/*
 * Request recycling of an entry.  Evaluates to B_TRUE when the recycle
 * bit is (now) set, i.e. it was already set or there are outstanding
 * references.  Evaluates to B_FALSE when the reference count was zero,
 * in which case the bit that was just set is cleared again.
 */
#define	SET_CYCLE_IF_REF(ace) (				\
	CYCLE_SET(ace) ? B_TRUE :			\
	    atomic_add_32_nv(&(ace)->ac_ref, CYCLEVAL) ==	\
		CYCLEVAL ?				\
		/*					\
		 * Clear the "cycle" bit we just set;	\
		 * ref count known to be 0 from above.	\
		 */					\
		(CLEAR_REFCYCLE(ace), B_FALSE) :	\
		/*					\
		 * We set "cycle" bit; let caller know.	\
		 */					\
		B_TRUE					\
)

/*
 * Atomically drop one reference.  Evaluates to B_TRUE when that was
 * the last reference and the recycle bit is set, B_FALSE otherwise.
 */
#define	DEC_REF_DO_CYCLE(ace) (				\
	atomic_dec_32_nv(&(ace)->ac_ref) == CYCLEVAL ?	\
		/*					\
		 * Ref count known to be 0 from above.	\
		 */					\
		B_TRUE :				\
		B_FALSE					\
)
342 
343 /*
344  * Address handle entries maintained by the driver are kept in the
345  * free and active lists. Each entry starts out in the free list;
346  * it migrates to the active list when primed using ibt_get_paths()
347  * and ibt_modify_ud_dest() for transmission to a specific destination.
348  * In the active list, the entry has a reference count indicating the
349  * number of ongoing/uncompleted transmits that reference it. The
350  * entry is left in the active list even after the reference count
351  * goes to 0, since successive transmits can find it there and do
352  * not need to set up another entry (ie the path information is
353  * cached using the active list). Entries on the active list are
354  * also hashed using the destination link address as a key for faster
355  * lookups during transmits.
356  *
357  * For any destination address (unicast or multicast, whatever the
358  * join states), there will be at most one entry in the active list.
359  * Entries with a 0 reference count on the active list can be reused
360  * for a transmit to a new destination, if the free list is empty.
361  *
362  * The AH free list insertion/deletion is protected with the id_ac_mutex,
363  * since the async thread and Tx callback handlers insert/delete. The
364  * active list does not need a lock (all operations are done by the
365  * async thread) but updates to the reference count are atomically
366  * done (increments done by Tx path, decrements by the Tx callback handler).
367  */
/*
 * Free/active list helpers.  "state" is a pointer to the per-interface
 * state and "ce" a pointer to an address handle entry.  Both are
 * parenthesized in the expansion but evaluated more than once, so they
 * must be free of side effects.
 */

/* Put ce at the head of the free list. */
#define	IBD_ACACHE_INSERT_FREE(state, ce) \
	list_insert_head(&(state)->id_ah_free, (ce))

/* Fetch from the head of the free list (see list_get_head). */
#define	IBD_ACACHE_GET_FREE(state) \
	list_get_head(&(state)->id_ah_free)

/*
 * Put ce at the head of the active list, enter it into the active hash
 * keyed by the address of its ac_mac, and remember it as the hot entry.
 */
#define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {				\
	int _ret_;							\
	list_insert_head(&(state)->id_ah_active, (ce));			\
	_ret_ = mod_hash_insert((state)->id_ah_active_hash,		\
	    (mod_hash_key_t)&(ce)->ac_mac, (mod_hash_val_t)(ce));	\
	ASSERT(_ret_ == 0);						\
	(state)->id_ac_hot_ace = (ce);					\
}

/*
 * Take ce off the active list and out of the active hash, dropping it
 * as the hot entry if it was one.  mod_hash_remove() hands back the
 * removed value through its last argument, so it is given a scratch
 * variable; passing ce itself would let the value be stored over the
 * start of the entry.
 */
#define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {				\
	mod_hash_val_t _val_;						\
	list_remove(&(state)->id_ah_active, (ce));			\
	if ((state)->id_ac_hot_ace == (ce))				\
		(state)->id_ac_hot_ace = NULL;				\
	(void) mod_hash_remove((state)->id_ah_active_hash,		\
	    (mod_hash_key_t)&(ce)->ac_mac, &_val_);			\
}

/* Fetch from the head of the active list (see list_get_head). */
#define	IBD_ACACHE_GET_ACTIVE(state) \
	list_get_head(&(state)->id_ah_active)
389 
/*
 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
 * the front of the optional src/tgt link layer address. Right now Solaris
 * inserts the padding at the end by default. The routine that does this
 * is nce_xmit() in ip_ndp.c; it copies the nd_lla_addr after the
 * nd_opt_hdr_t. So when the packet comes down from the IP layer to the
 * IBD driver, it is in the following format:
 * [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
 * The OPT_ND_HDR_T is 2 bytes in size and is followed by [22 bytes of
 * ipoib_machdr]. As a result the machdr is not 4 byte aligned and has
 * 2 bytes of padding at the end.
 *
 * The send routine in the IBD driver changes this packet as follows:
 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
 * followed by [22 bytes of ipoib_machdr], resulting in the machdr being
 * 4 byte aligned.
 *
 * On the receiving side, ibd_process_rx takes the above packet, removes
 * the two bytes of front padding and inserts them at the end. This is
 * because the IP layer does not understand padding at the front.
 */
/*
 * IBD_PAD_NSNA(ip6h, len, type)
 *
 * ip6h	pointer to the IPv6 header; the ICMPv6 header is taken to
 *	start at &(ip6h)[1].
 * len	modifiable lvalue; sizeof (nd_neighbor_advert_t) is subtracted
 *	from it unconditionally, and the caller sees the new value.
 * type	IBD_SEND moves the IPOIB_ADDRL-byte link layer address two
 *	bytes towards the end and zeroes the two bytes in front of it;
 *	any other value moves it two bytes towards the front and zeroes
 *	the two bytes after it.
 *
 * Nothing is moved unless the ICMPv6 type is a neighbor solicitation
 * or advertisement and the adjusted len is non-zero.
 *
 * All arguments are parenthesized in the expansion, and the locals
 * carry decorated names so they cannot capture identifiers used in
 * the arguments.
 *
 * NOTE(review): only len != 0 is tested; if len can be smaller than
 * sizeof (nd_neighbor_advert_t) the adjusted value is still non-zero
 * and the bytes are moved anyway -- confirm callers validate len.
 */
#define	IBD_PAD_NSNA(ip6h, len, type) {					\
	uchar_t		*_lla_;						\
	icmp6_t		*_icmp6_;					\
	nd_opt_hdr_t	*_opt_;						\
	int		_i_;						\
									\
	_icmp6_ = (icmp6_t *)&(ip6h)[1];				\
	(len) -= sizeof (nd_neighbor_advert_t);				\
	if (((_icmp6_->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
	    (_icmp6_->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
	    ((len) != 0)) {						\
		_opt_ = (nd_opt_hdr_t *)((uint8_t *)(ip6h)		\
		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
		_lla_ = (uchar_t *)&_opt_[1];				\
		if ((type) == IBD_SEND) {				\
			/* Copy from the tail so bytes don't clobber. */ \
			for (_i_ = IPOIB_ADDRL; _i_ > 0; _i_--)		\
				*(_lla_ + _i_ + 1) =			\
				    *(_lla_ + _i_ - 1);			\
		} else {						\
			for (_i_ = 0; _i_ < IPOIB_ADDRL; _i_++)		\
				*(_lla_ + _i_) =			\
				    *(_lla_ + _i_ + 2);			\
		}							\
		/* _i_ is 0 after send, IPOIB_ADDRL after receive. */	\
		*(_lla_ + _i_) = 0;					\
		*(_lla_ + _i_ + 1) = 0;					\
	}								\
}
437 
438 
439 /*
440  * IETF defined IPoIB encapsulation header, with 2b of ethertype
441  * followed by 2 reserved bytes. This is at the start of the
442  * datagram sent to and received over the wire by the driver.
443  */
444 typedef struct ipoib_header {
445 	ushort_t	ipoib_type;
446 	ushort_t	ipoib_mbz;
447 } ipoib_hdr_t;
448 
449 #define	IPOIB_HDRSIZE	sizeof (struct ipoib_header)
450 
451 /*
452  * IETF defined IPoIB link address; IBA QPN, followed by GID,
453  * which has a prefix and suffix, as reported via ARP.
454  */
typedef struct ipoib_mac {
	uint32_t	ipoib_qpn;		/* IBA QPN */
	uint32_t	ipoib_gidpref[2];	/* GID prefix */
	uint32_t	ipoib_gidsuff[2];	/* GID suffix */
} ipoib_mac_t;

/* Length in bytes of an IPoIB link address. */
#define	IPOIB_ADDRL	sizeof (ipoib_mac_t)
462 
463 /*
464  * Pseudo header prepended to datagram in DLIOCRAW transmit path
465  * and when GLD hands the datagram to the gldm_send entry point.
466  */
467 typedef struct ipoib_ptxhdr {
468 	ipoib_mac_t	ipoib_dest;
469 	ipoib_hdr_t	ipoib_rhdr;
470 } ipoib_ptxhdr_t;
471 
/*
 * View the bytes "offset" bytes past p as an ipoib_ptxhdr_t.  Both
 * arguments are parenthesized so that expressions such as "a + b" or
 * "c ? x : y" can be passed as the offset.
 */
#define	IPOIBDLSAP(p, offset)	\
	((ipoib_ptxhdr_t *)((caddr_t)(p) + (offset)))
473 
474 /*
475  * The pseudo-GRH structure that sits before the data in the
476  * receive buffer, and is overlaid on top of the real GRH.
477  * The driver sets the ipoib_vertcflow to 0 if the pseudo-GRH
478  * does not hold valid information. If it is indicated valid,
479  * the driver must additionally provide the sender's qpn in
480  * network byte order in ipoib_sqpn, and not touch the
481  * remaining parts which were DMA'ed in by the IBA hardware.
482  */
typedef struct ipoib_pgrh {
	uint32_t	ipoib_vertcflow;	/* 0: pseudo-GRH not valid */
	uint32_t	ipoib_sqpn;		/* sender's qpn, net order */
	uint32_t	ipoib_sgid_pref[2];
	uint32_t	ipoib_sgid_suff[2];
	uint32_t	ipoib_dgid_pref[2];
	uint32_t	ipoib_dgid_suff[2];
} ipoib_pgrh_t;

/*
 * Receive buffers must reserve this many bytes, because the GRH is
 * dma'ed into them as well.
 */
#define	IPOIB_GRH_SIZE	sizeof (struct ipoib_pgrh)
497 
/* Supports the RC (reliable connected) mode */
#define	IBD_MAC_ADDR_RC		0x80000000
/* Supports the UC (unreliable connected) mode */
#define	IBD_MAC_ADDR_UC		0x40000000

#define	IBD_RC_SERVICE_ID	0x100000000000000ULL

/*
 * For many years legacy OFED used an incorrect service ID that has one
 * extra zero digit. It is accepted here as well so that we can
 * interoperate with legacy OFED.
 */
#define	IBD_RC_SERVICE_ID_OFED_INTEROP	0x1000000000000000ULL

#define	IBD_RC_MIN_CQ_SIZE	0x7f

/* How many ibt_wc_t are provided for each RC channel */
#define	IBD_RC_MAX_CQ_WC	0x3f
516 
517 #if defined(_KERNEL) && !defined(_BOOT)
518 
519 #include <sys/ib/ibtl/ibti.h>
520 #include <sys/ib/ib_pkt_hdrs.h>
521 #include <sys/list.h>
522 #include <sys/mac_provider.h>
523 #include <sys/mac_ib.h>
524 #include <sys/modhash.h>
525 
/*
 * States of a reliable connected channel, as stored in
 * ibd_rc_chan_t->chan_state.  The numeric values are spelled out.
 */
typedef enum {
	IBD_RC_STATE_INIT		= 0,

	/* Active side */
	IBD_RC_STATE_ACT_REP_RECV	= 1,	/* reply received */
	IBD_RC_STATE_ACT_ESTAB		= 2,	/* established, can send */
	IBD_RC_STATE_ACT_REJECT		= 3,	/* rejected */
	/* Being closed by someone else; do not close it again */
	IBD_RC_STATE_ACT_CLOSING	= 4,
	IBD_RC_STATE_ACT_CLOSED		= 5,
	IBD_RC_STATE_ACT_ERROR		= 6,

	/* Passive side */
	IBD_RC_STATE_PAS_REQ_RECV	= 7,	/* request received */
	IBD_RC_STATE_PAS_ESTAB		= 8,	/* established, can receive */
	IBD_RC_STATE_PAS_REJECT		= 9,	/* rejected */

	IBD_RC_STATE_PAS_CLOSED		= 10
} ibd_rc_chan_state_t;
546 
547 /*
548  * Structure to encapsulate various types of async requests.
549  */
550 typedef struct ibd_acache_rq {
551 	struct list_node 	rq_list; 	/* list of pending work */
552 	int			rq_op;		/* what operation */
553 	ipoib_mac_t		rq_mac;
554 	ib_gid_t		rq_gid;
555 	void			*rq_ptr;
556 	void			*rq_ptr2;
557 } ibd_req_t;
558 
559 typedef struct ibd_mcache {
560 	struct list_node	mc_list;	/* full/non list */
561 	uint8_t			mc_jstate;
562 	boolean_t		mc_fullreap;
563 	ibt_mcg_info_t		mc_info;
564 	ibd_req_t		mc_req;		/* to queue LEAVE req */
565 } ibd_mce_t;
566 
567 typedef struct ibd_acache_s {
568 	struct list_node	ac_list;	/* free/active list */
569 	ibt_ud_dest_hdl_t	ac_dest;
570 	ipoib_mac_t		ac_mac;
571 	uint32_t		ac_ref;
572 	ibd_mce_t		*ac_mce;	/* for MCG AHs */
573 
574 	/* For Reliable Connected mode */
575 	struct ibd_rc_chan_s	*ac_chan;
576 	/* protect tx_too_big_ongoing */
577 	kmutex_t		tx_too_big_mutex;
578 	/* Deal with too big packet */
579 	boolean_t		tx_too_big_ongoing;
580 } ibd_ace_t;
581 
/* IBD_MAX_SQSEG also sizes the w_sgl[] array in ibd_swqe_t. */
#define	IBD_MAX_SQSEG	59
#define	IBD_MAX_RQSEG	1
584 
/* Kind of work queue element. */
typedef enum {
	IBD_WQE_SEND = 0,
	IBD_WQE_RECV = 1
} ibd_wqe_type_t;
589 
/* Buffer type of a send wqe (w_buftype); numbering starts at 1. */
typedef enum {
	IBD_WQE_TXBUF = 1,
	IBD_WQE_LSOBUF,
	IBD_WQE_MAPPED,
	IBD_WQE_RC_COPYBUF
} ibd_wqe_buftype_t;
596 
#ifdef DEBUG
/* Named kstats kept for RC mode; only present in DEBUG builds. */
typedef struct ibd_rc_stat_s {
	/* Receive side */
	kstat_named_t		rc_rcv_trans_byte;
	kstat_named_t		rc_rcv_trans_pkt;
	kstat_named_t		rc_rcv_copy_byte;
	kstat_named_t		rc_rcv_copy_pkt;
	kstat_named_t		rc_rcv_alloc_fail;

	kstat_named_t		rc_rcq_err;	/* failure in rcq handler */

	kstat_named_t		rc_rwqe_short;	/* short of rwqe */

	/* Transmit side */
	kstat_named_t		rc_xmt_bytes;
	kstat_named_t		rc_xmt_small_pkt; /* <= id_rc_tx_copy_thresh */
	kstat_named_t		rc_xmt_fragmented_pkt;
	kstat_named_t		rc_xmt_map_fail_pkt; /* ibt_map_mem_iov() failed */
	kstat_named_t		rc_xmt_map_succ_pkt; /* ibt_map_mem_iov() ok */

	kstat_named_t		rc_ace_not_found;	/* ace not found */
	kstat_named_t		rc_scq_no_swqe;	/* none even after recycle */
	/* no tx large buf even after recycle */
	kstat_named_t		rc_scq_no_largebuf;

	kstat_named_t		rc_swqe_short;	/* short swqe in ibd_send() */
	/* mac_tx_update() called once enough swqe are available */
	kstat_named_t		rc_swqe_mac_update;
	/* short of large buf in ibd_send() */
	kstat_named_t		rc_xmt_buf_short;
	/* mac_tx_update() called once enough Tx large buffers exist */
	kstat_named_t		rc_xmt_buf_mac_update;

	/* Connections */
	kstat_named_t		rc_conn_succ;	/* # of successful connects */
	kstat_named_t		rc_conn_fail;	/* # of failed connects */
	/* ace->ac_chan == NULL for unicast packet */
	kstat_named_t		rc_null_conn;
	/* not in active established state */
	kstat_named_t		rc_no_estab_conn;

	kstat_named_t		rc_act_close;	/* ibd_rc_act_close() calls */
	kstat_named_t		rc_pas_close;	/* ibd_rc_pas_close() calls */
	kstat_named_t		rc_delay_ace_recycle;
	kstat_named_t		rc_act_close_simultaneous;

	kstat_named_t		rc_reset_cnt;	/* # of RC channel resets */
	kstat_named_t		rc_timeout_act;
	kstat_named_t		rc_timeout_pas;
} ibd_rc_stat_t;
#endif
650 
651 typedef struct ibd_rc_chan_list_s {
652 	/* This mutex protects chan_list and ibd_rc_chan_t.next */
653 	kmutex_t		chan_list_mutex;
654 	struct ibd_rc_chan_s	*chan_list;
655 } ibd_rc_chan_list_t;
656 
/* Singly linked node (via lb_next) carrying a buffer pointer. */
typedef struct ibd_rc_tx_largebuf_s {
	struct ibd_rc_tx_largebuf_s	*lb_next;	/* next node */
	uint8_t				*lb_buf;	/* buffer */
} ibd_rc_tx_largebuf_t;
661 
662 /*
663  * Pre-registered copybuf used for send and receive
664  */
665 typedef struct ibd_copybuf_s {
666 	ibt_wr_ds_t		ic_sgl;
667 	uint8_t			*ic_bufaddr;
668 } ibd_copybuf_t;
669 
670 typedef struct ibd_wqe_s {
671 	struct ibd_wqe_s	*w_next;
672 	ibd_copybuf_t		w_copybuf;
673 	mblk_t			*im_mblk;
674 } ibd_wqe_t;
675 
676 /*
677  * Send WQE
678  */
679 typedef struct ibd_swqe_s {
680 	ibd_wqe_t		w_ibd_swqe;
681 	ibd_wqe_buftype_t	w_buftype;
682 	ibt_send_wr_t		w_swr;
683 	ibd_ace_t		*w_ahandle;
684 	ibt_mi_hdl_t		w_mi_hdl;
685 	ibt_wr_ds_t		w_sgl[IBD_MAX_SQSEG];
686 	ibd_rc_tx_largebuf_t	*w_rc_tx_largebuf;
687 } ibd_swqe_t;
688 
/* Shorthand for the members of the embedded common wqe. */
#define	swqe_next		w_ibd_swqe.w_next
#define	swqe_copybuf		w_ibd_swqe.w_copybuf
#define	swqe_im_mblk		w_ibd_swqe.im_mblk

/*
 * Convert between a send wqe and its embedded common wqe.  The wqe is
 * the first member of ibd_swqe_t, which is what makes the plain cast
 * in WQE_TO_SWQE valid.  The results are fully parenthesized so that
 * they can be followed directly by "->" or used in arithmetic.
 */
#define	SWQE_TO_WQE(swqe)	((ibd_wqe_t *)&((swqe)->w_ibd_swqe))
#define	WQE_TO_SWQE(wqe)	((ibd_swqe_t *)(wqe))
694 
695 /*
696  * Receive WQE
697  */
698 typedef struct ibd_rwqe_s {
699 	ibd_wqe_t		w_ibd_rwqe;
700 	struct ibd_state_s	*w_state;
701 	ibt_recv_wr_t		w_rwr;
702 	frtn_t			w_freemsg_cb;
703 	boolean_t		w_freeing_wqe;
704 	struct ibd_rc_chan_s	*w_chan;
705 } ibd_rwqe_t;
706 
/* Shorthand for the members of the embedded common wqe. */
#define	rwqe_next		w_ibd_rwqe.w_next
#define	rwqe_copybuf		w_ibd_rwqe.w_copybuf
#define	rwqe_im_mblk		w_ibd_rwqe.im_mblk

/*
 * Convert between a receive wqe and its embedded common wqe.  The wqe
 * is the first member of ibd_rwqe_t, which is what makes the plain
 * cast in WQE_TO_RWQE valid.  The results are fully parenthesized so
 * that they can be followed directly by "->" or used in arithmetic.
 */
#define	RWQE_TO_WQE(rwqe)	((ibd_wqe_t *)&((rwqe)->w_ibd_rwqe))
#define	WQE_TO_RWQE(wqe)	((ibd_rwqe_t *)(wqe))
712 
713 typedef struct ibd_list_s {
714 	kmutex_t		dl_mutex;
715 	ibd_wqe_t		*dl_head;
716 	union {
717 		boolean_t	pending_sends;
718 		uint32_t	bufs_outstanding;
719 	} ustat;
720 	uint32_t		dl_cnt;
721 } ibd_list_t;
722 
723 #define	dl_pending_sends	ustat.pending_sends
724 #define	dl_bufs_outstanding	ustat.bufs_outstanding
725 
726 /*
727  * LSO buffers
728  *
729  * Under normal circumstances we should never need to use any buffer
730  * that's larger than MTU.  Unfortunately, IB HCA has limitations
731  * on the length of SGL that are much smaller than those for regular
732  * ethernet NICs.  Since the network layer doesn't care to limit the
733  * number of mblk fragments in any send mp chain, we end up having to
734  * use these larger-than-MTU sized (larger than id_tx_buf_sz actually)
735  * buffers occasionally.
736  */
typedef struct ibd_lsobuf_s {
	struct ibd_lsobuf_s	*lb_next;	/* next buffer */
	uint8_t			*lb_buf;	/* buffer memory */
	int			lb_isfree;	/* free indicator */
} ibd_lsobuf_t;
742 
743 typedef struct ibd_lsobkt_s {
744 	uint8_t		*bkt_mem;
745 	ibd_lsobuf_t	*bkt_bufl;
746 	ibd_lsobuf_t	*bkt_free_head;
747 	ibt_mr_hdl_t	bkt_mr_hdl;
748 	ibt_mr_desc_t	bkt_mr_desc;
749 	uint_t		bkt_nelem;
750 	uint_t		bkt_nfree;
751 } ibd_lsobkt_t;
752 
/* NOTE(review): presumably the values stored in id_type -- confirm. */
#define	IBD_PORT_DRIVER		0x1
#define	IBD_PARTITION_OBJ	0x2
755 
756 /*
757  * Posting to a single software rx post queue is contentious,
758  * so break it out to (multiple) an array of queues.
759  *
760  * Try to ensure rx_queue structs fall in different cache lines using a filler.
761  * Note: the RX_QUEUE_CACHE_LINE needs to change if the struct changes.
762  */
763 #define	RX_QUEUE_CACHE_LINE \
764 	(64 - (sizeof (kmutex_t) + sizeof (ibd_wqe_t *) + sizeof (uint_t)))
765 typedef struct ibd_rx_queue_s {
766 	kmutex_t		rx_post_lock;
767 	ibd_wqe_t		*rx_head;
768 	uint_t			rx_cnt;
769 	uint8_t			rx_pad[RX_QUEUE_CACHE_LINE];
770 } ibd_rx_queue_t;
771 
772 /*
773  * This structure maintains information per port per HCA
774  * (per network interface).
775  */
776 typedef struct ibd_state_s {
777 	uint_t			id_type;
778 	dev_info_t		*id_dip;
779 	ibt_clnt_hdl_t		id_ibt_hdl;
780 	ibt_hca_hdl_t		id_hca_hdl;
781 	ibt_pd_hdl_t		id_pd_hdl;
782 	kmem_cache_t		*id_req_kmc;
783 
784 	ibd_list_t		id_tx_rel_list;
785 
786 	uint32_t		id_running;
787 
788 	uint32_t		id_max_sqseg;
789 	uint32_t		id_max_sqseg_hiwm;
790 	ibd_list_t		id_tx_list;
791 	ddi_softintr_t		id_tx;
792 	uint32_t		id_tx_sends;
793 
794 	kmutex_t		id_txpost_lock;
795 	ibd_swqe_t		*id_tx_head;
796 	ibd_swqe_t		*id_tx_tail;
797 	int			id_tx_busy;
798 
799 	uint_t			id_tx_buf_sz;
800 	uint8_t			*id_tx_bufs;
801 	ibd_swqe_t		*id_tx_wqes;
802 	ibt_mr_hdl_t		id_tx_mr_hdl;
803 	ibt_mr_desc_t		id_tx_mr_desc;
804 
805 	kmutex_t		id_lso_lock;
806 	ibd_lsobkt_t		*id_lso;
807 
808 	kmutex_t		id_scq_poll_lock;
809 	int			id_scq_poll_busy;
810 
811 	ibt_cq_hdl_t		id_scq_hdl;
812 	ibt_wc_t		*id_txwcs;
813 	uint32_t		id_txwcs_size;
814 
815 	int			id_rx_nqueues;
816 	ibd_rx_queue_t		*id_rx_queues;
817 	int			id_rx_post_queue_index;
818 	uint32_t		id_rx_post_active;
819 
820 	ibd_rwqe_t		*id_rx_wqes;
821 	uint8_t			*id_rx_bufs;
822 	ibt_mr_hdl_t		id_rx_mr_hdl;
823 	ibt_mr_desc_t		id_rx_mr_desc;
824 	uint_t			id_rx_buf_sz;
825 	/*
826 	 * id_ud_num_rwqe
827 	 * Number of "receive WQE" elements that will be allocated and used
828 	 * by ibd. This parameter is limited by the maximum channel size of
829 	 * the HCA. Each buffer in the receive wqe will be of MTU size.
830 	 */
831 	uint32_t		id_ud_num_rwqe;
832 	ibd_list_t		id_rx_list;
833 	ddi_softintr_t		id_rx;
834 	uint32_t		id_rx_bufs_outstanding_limit;
835 	uint32_t		id_rx_allocb;
836 	uint32_t		id_rx_allocb_failed;
837 	ibd_list_t		id_rx_free_list;
838 
839 	kmutex_t		id_rcq_poll_lock;
840 	int			id_rcq_poll_busy;
841 	uint32_t		id_rxwcs_size;
842 	ibt_wc_t		*id_rxwcs;
843 	ibt_cq_hdl_t		id_rcq_hdl;
844 
845 	ibt_channel_hdl_t	id_chnl_hdl;
846 	ib_pkey_t		id_pkey;
847 	uint16_t		id_pkix;
848 	uint8_t			id_port;
849 	ibt_mcg_info_t		*id_mcinfo;
850 
851 	mac_handle_t		id_mh;
852 	mac_resource_handle_t	id_rh;
853 	ib_gid_t		id_sgid;
854 	ib_qpn_t		id_qpnum;
855 	ipoib_mac_t		id_macaddr;
856 	ib_gid_t		id_mgid;
857 	ipoib_mac_t		id_bcaddr;
858 
859 	int			id_mtu;
860 	uchar_t			id_scope;
861 
862 	kmutex_t		id_acache_req_lock;
863 	kcondvar_t		id_acache_req_cv;
864 	struct list		id_req_list;
865 	kt_did_t		id_async_thrid;
866 
867 	kmutex_t		id_ac_mutex;
868 	ibd_ace_t		*id_ac_hot_ace;
869 	struct list		id_ah_active;
870 	struct list		id_ah_free;
871 	ipoib_mac_t		id_ah_addr;
872 	ibd_req_t		id_ah_req;
873 	char			id_ah_op;
874 	uint64_t		id_ah_error;
875 	ibd_ace_t		*id_ac_list;
876 	mod_hash_t		*id_ah_active_hash;
877 
878 	kmutex_t		id_mc_mutex;
879 	struct list		id_mc_full;
880 	struct list		id_mc_non;
881 
882 	kmutex_t		id_trap_lock;
883 	kcondvar_t		id_trap_cv;
884 	boolean_t		id_trap_stop;
885 	uint32_t		id_trap_inprog;
886 
887 	char			id_prom_op;
888 
889 	kmutex_t		id_sched_lock;
890 	int			id_sched_needed;
891 	int			id_sched_cnt;
892 	int			id_sched_lso_cnt;
893 
894 	kmutex_t		id_link_mutex;
895 	link_state_t		id_link_state;
896 	uint64_t		id_link_speed;
897 
898 	uint64_t		id_num_intrs;
899 	uint64_t		id_tx_short;
900 	/*
901 	 * id_ud_num_swqe
902 	 * Number of "send WQE" elements that will be allocated and used by
903 	 * ibd. When tuning this parameter, the size of pre-allocated, pre-
904 	 * mapped copy buffer in each of these send wqes must be taken into
905 	 * account. This copy buffer size is determined by the value of
906 	 * IBD_TX_BUF_SZ (this is currently set to the same value of
907 	 * ibd_tx_copy_thresh, but may be changed independently if needed).
908 	 */
909 	uint32_t		id_ud_num_swqe;
910 
911 	uint64_t		id_xmt_bytes;
912 	uint64_t		id_rcv_bytes;
913 	uint64_t		id_multi_xmt;
914 	uint64_t		id_brd_xmt;
915 	uint64_t		id_multi_rcv;
916 	uint64_t		id_brd_rcv;
917 	uint64_t		id_xmt_pkt;
918 	uint64_t		id_rcv_pkt;
919 
920 	uint32_t		id_hwcksum_capab;
921 	boolean_t		id_lso_policy;
922 	boolean_t		id_lso_capable;
923 	uint_t			id_lso_maxlen;
924 	int			id_hca_res_lkey_capab;
925 	ibt_lkey_t		id_res_lkey;
926 
927 	boolean_t		id_bgroup_created;
928 	kmutex_t		id_macst_lock;
929 	kcondvar_t		id_macst_cv;
930 	uint32_t		id_mac_state;
931 
932 	/* For Reliable Connected Mode */
933 	boolean_t		id_enable_rc;
934 	boolean_t		rc_enable_srq;
935 
936 	int			rc_mtu;
937 	uint32_t		rc_tx_max_sqseg;
938 	/*
939 	 * In IPoIB over Reliable Connected mode, an "IBD_MAC_ADDR_RC" prefix
940 	 * is added to the mac address. But for the loopback filter in function
941 	 * ibd_process_rx(), the input mac address should not include the
942 	 * "IBD_MAC_ADDR_RC" prefix.
943 	 *
944 	 * So, we introduce the rc_macaddr_loopback for the loopback filter in
945 	 * IPoIB over Reliable Connected mode.
946 	 *
947 	 * rc_macaddr_loopback = id_macaddr excludes "IBD_MAC_ADDR_RC" prefix.
948 	 */
949 	ipoib_mac_t		rc_macaddr_loopback;
950 
951 	ibt_srv_hdl_t		rc_listen_hdl;
952 	ibt_sbind_hdl_t		rc_listen_bind;
953 	ibt_srv_hdl_t		rc_listen_hdl_OFED_interop;
954 	ibt_sbind_hdl_t		rc_listen_bind_OFED_interop;
955 
956 	ibd_rc_chan_list_t	rc_pass_chan_list;
957 	/* obsolete active channel list */
958 	ibd_rc_chan_list_t	rc_obs_act_chan_list;
959 
960 	kmutex_t		rc_ace_recycle_lock;
961 	ibd_ace_t		*rc_ace_recycle;
962 
963 	/* Send */
964 	/*
965 	 * This mutex protects rc_tx_largebuf_free_head, rc_tx_largebuf_nfree
966 	 * and ibd_rc_tx_largebuf_t->lb_next
967 	 */
968 	kmutex_t		rc_tx_large_bufs_lock;
969 	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_free_head;
970 	uint_t			rc_tx_largebuf_nfree;
971 	/* The chunk of whole Tx large buffers */
972 	uint8_t			*rc_tx_mr_bufs;
973 	ibt_mr_hdl_t		rc_tx_mr_hdl;
974 	ibt_mr_desc_t		rc_tx_mr_desc;
975 	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_desc_base;	/* base addr */
976 
977 	boolean_t		rc_enable_iov_map;
978 	uint_t			rc_max_sqseg_hiwm;
979 
980 	/* For SRQ */
981 	uint32_t 		rc_srq_size;
982 	ibt_srq_hdl_t		rc_srq_hdl;
983 	ibd_list_t		rc_srq_rwqe_list;
984 	ibd_list_t		rc_srq_free_list;
985 	ibd_rwqe_t		*rc_srq_rwqes;
986 	uint8_t			*rc_srq_rx_bufs;
987 	ibt_mr_hdl_t		rc_srq_rx_mr_hdl;
988 	ibt_mr_desc_t		rc_srq_rx_mr_desc;
989 
990 	/* For chained receive */
991 	kmutex_t		rc_rx_lock;
992 	mblk_t			*rc_rx_mp;
993 	mblk_t			*rc_rx_mp_tail;
994 	uint32_t		rc_rx_mp_len;
995 
996 	uint32_t		rc_num_tx_chan;
997 	uint32_t		rc_num_rx_chan;
998 
999 	/* Protect rc_timeout_start and rc_timeout */
1000 	kmutex_t		rc_timeout_lock;
1001 	boolean_t		rc_timeout_start;
1002 	timeout_id_t		rc_timeout;
1003 
1004 	/* Counters for RC mode */
1005 	/* RX */
1006 	/*
1007 	 * # of Received packets. These packets are directly transferred to GLD
1008 	 * without copying them
1009 	 */
1010 	uint64_t		rc_rcv_trans_byte;
1011 	uint64_t		rc_rcv_trans_pkt;
1012 	/*
1013 	 * # of Received packets. We will allocate new buffers for these
1014 	 * packets, copy their content into new buffers, then transfer to GLD
1015 	 */
1016 	uint64_t		rc_rcv_copy_byte;
1017 	uint64_t		rc_rcv_copy_pkt;
1018 	uint64_t		rc_rcv_alloc_fail;
1019 
1020 #ifdef DEBUG
1021 	uint64_t		rc_rwqe_short;	/* short rwqe */
1022 #endif
1023 
1024 	/* wc->wc_status != IBT_WC_SUCCESS */
1025 	uint64_t		rc_rcq_err;
1026 
1027 	/* Tx */
1028 	uint64_t		rc_xmt_bytes;
1029 
1030 	/* pkt size <= ibd_rc_tx_copy_thresh */
1031 	uint64_t		rc_xmt_small_pkt;
1032 	uint64_t		rc_xmt_fragmented_pkt;
1033 	/* fail in ibt_map_mem_iov() */
1034 	uint64_t		rc_xmt_map_fail_pkt;
1035 	/* succ in ibt_map_mem_iov() */
1036 	uint64_t		rc_xmt_map_succ_pkt;
1037 
1038 	uint64_t		rc_ace_not_found;
1039 
1040 	uint64_t		rc_xmt_drop_too_long_pkt;
1041 	uint64_t		rc_xmt_icmp_too_long_pkt;
1042 	uint64_t		rc_xmt_reenter_too_long_pkt;
1043 
1044 	/* short swqe in ibd_send() */
1045 	uint64_t		rc_swqe_short;
1046 	/* call mac_tx_update when there is enough swqe */
1047 	uint64_t		rc_swqe_mac_update;
1048 	/* short tx large copy buf in ibd_send() */
1049 	uint64_t		rc_xmt_buf_short;
1050 	/* call mac_tx_update when there is enough Tx copy buf */
1051 	uint64_t		rc_xmt_buf_mac_update;
1052 
1053 	/* No swqe even after call swqe recycle function */
1054 	uint64_t		rc_scq_no_swqe;
1055 	/* No large Tx buf even after call swqe recycle function */
1056 	uint64_t		rc_scq_no_largebuf;
1057 
1058 	/* Connection setup and close */
1059 	uint64_t		rc_conn_succ;	/* time of succ connect */
1060 	uint64_t		rc_conn_fail;	/* time of fail connect */
1061 	/* ace->ac_chan == NULL for unicast packet */
1062 	uint64_t		rc_null_conn;
1063 	/* not in active established state */
1064 	uint64_t		rc_no_estab_conn;
1065 
1066 	uint64_t		rc_act_close;	/* call ibd_rc_act_close() */
1067 	uint64_t		rc_pas_close;	/* call ibd_rc_pas_close() */
1068 	uint64_t		rc_delay_ace_recycle;
1069 	uint64_t		rc_act_close_simultaneous;
1070 	/* Fail to close a channel because someone else is still using it */
1071 	uint64_t		rc_act_close_not_clean;
1072 	/* RCQ is being invoked when closing RC channel */
1073 	uint64_t		rc_pas_close_rcq_invoking;
1074 
1075 	/* the counter of reset RC channel */
1076 	uint64_t		rc_reset_cnt;
1077 
1078 	uint64_t		rc_timeout_act;
1079 	uint64_t		rc_timeout_pas;
1080 
1081 	/*
1082 	 * Fail to stop this port because this port is connecting to a remote
1083 	 * port
1084 	 */
1085 	uint64_t		rc_stop_connect;
1086 
1087 #ifdef DEBUG
1088 	kstat_t 		*rc_ksp;
1089 #endif
1090 	ib_guid_t		id_hca_guid;
1091 	ib_guid_t		id_port_guid;
1092 	datalink_id_t		id_dlinkid;
1093 	datalink_id_t		id_plinkid;
1094 	int			id_port_inst;
1095 	struct ibd_state_s	*id_next;
1096 	boolean_t		id_force_create;
1097 	boolean_t		id_bgroup_present;
1098 	uint_t			id_hca_max_chan_sz;
1099 
1100 	/*
1101 	 * UD Mode Tunables
1102 	 *
1103 	 * id_ud_tx_copy_thresh
1104 	 * This sets the threshold at which ibd will attempt to do a bcopy
1105 	 * of the outgoing data into a pre-mapped buffer. IPoIB driver's
1106 	 * send behavior is restricted by various parameters, so setting of
1107 	 * this value must be made after careful considerations only. For
1108 	 * instance, IB HCAs currently impose a relatively small limit
1109 	 * (when compared to ethernet NICs) on the length of the SGL for
1110 	 * transmit. On the other hand, the ip stack could send down mp
1111 	 * chains that are quite long when LSO is enabled.
1112 	 *
1113 	 * id_num_lso_bufs
1114 	 * Number of "larger-than-MTU" copy buffers to use for cases when the
1115 	 * outgoing mblk chain is too fragmented to be used with
1116 	 * ibt_map_mem_iov() and too large to be used with regular MTU-sized
1117 	 * copy buffers. It is not recommended to tune this variable without
1118 	 * understanding the application environment and/or memory resources.
1119 	 * The size of each of these lso buffers is determined by the value of
1120 	 * IBD_LSO_BUFSZ.
1121 	 *
1122 	 * id_num_ah
1123 	 * Number of AH cache entries to allocate
1124 	 *
1125 	 * id_hash_size
1126 	 * Hash table size for the active AH list
1127 	 *
1128 	 */
1129 	uint_t id_ud_tx_copy_thresh;
1130 	uint_t id_num_lso_bufs;
1131 	uint_t id_num_ah;
1132 	uint_t id_hash_size;
1133 
1134 	boolean_t id_create_broadcast_group;
1135 
1136 	boolean_t id_allow_coalesce_comp_tuning;
1137 	uint_t id_ud_rx_comp_count;
1138 	uint_t id_ud_rx_comp_usec;
1139 	uint_t id_ud_tx_comp_count;
1140 	uint_t id_ud_tx_comp_usec;
1141 
1142 	/* RC Mode Tunables */
1143 
1144 	uint_t id_rc_rx_comp_count;
1145 	uint_t id_rc_rx_comp_usec;
1146 	uint_t id_rc_tx_comp_count;
1147 	uint_t id_rc_tx_comp_usec;
1148 	/*
1149 	 * id_rc_tx_copy_thresh
1150 	 * This sets the threshold at which ibd will attempt to do a bcopy
1151 	 * of the outgoing data into a pre-mapped buffer.
1152 	 *
1153 	 * id_rc_rx_copy_thresh
1154 	 * If (the size of incoming buffer <= id_rc_rx_copy_thresh), ibd
1155 	 * will attempt to allocate a buffer and do a bcopy of the incoming
1156 	 * data into the allocated buffer.
1157 	 *
1158 	 * id_rc_rx_rwqe_thresh
1159 	 * If (the number of available rwqe < id_rc_rx_rwqe_thresh), ibd
1160 	 * will attempt to allocate a buffer and do a bcopy of the incoming
1161 	 * data into the allocated buffer.
1162 	 *
1163 	 * id_rc_num_swqe
1164 	 * 1) Send CQ size = id_rc_num_swqe
1165 	 * 2) The send queue size = id_rc_num_swqe - 1
1166 	 * 3) Number of pre-allocated Tx buffers for ibt_post_send() =
1167 	 * id_rc_num_swqe - 1.
1168 	 *
1169 	 * id_rc_num_rwqe
1170 	 * 1) For non-SRQ, we pre-post id_rc_num_rwqe number of WRs
1171 	 * via ibt_post_receive() for receive queue of each RC channel.
1172 	 * 2) For SRQ and non-SRQ, receive CQ size = id_rc_num_rwqe
1173 	 *
1174 	 * For SRQ
1175 	 * If using SRQ, we allocate id_rc_num_srq number of buffers (the
1176 	 * size of each buffer is equal to RC mtu). And post them by
1177 	 * ibt_post_srq().
1178 	 *
1179 	 * id_rc_num_srq
1180 	 * id_rc_num_srq should not be larger than id_rc_num_rwqe,
1181 	 * otherwise it will cause a bug with the following warnings:
1182 	 * NOTICE: hermon0: Device Error: EQE cq overrun or protection error
1183 	 * NOTICE: hermon0: Device Error: EQE local work queue catastrophic
1184 	 * error
1185 	 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff
1186 	 * catastrophic channel error
1187 	 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff
1188 	 * completion queue error
1189 	 */
1190 	uint_t id_rc_tx_copy_thresh;
1191 	uint_t id_rc_rx_copy_thresh;
1192 	uint_t id_rc_rx_rwqe_thresh;
1193 	uint_t id_rc_num_swqe;
1194 	uint_t id_rc_num_rwqe;
1195 	uint_t id_rc_num_srq;
1196 } ibd_state_t;
1197 
1198 /*
1199  * Structures to track global IBTF data, data that is shared
1200  * among the IBD device instances.  This includes the one ibt_hdl
1201  * and the list of service registrations.
1202  */
1203 typedef struct ibd_service_s {	/* one service registration entry */
1204 	struct ibd_service_s	*is_link;	/* link to another entry */
1205 	ibt_srv_hdl_t		is_srv_hdl;	/* IBT service handle */
1206 	ib_svc_id_t		is_sid;		/* service id */
1207 	uint_t			is_ref_cnt;	/* reference count */
1208 } ibd_service_t;
1209 
1210 typedef struct ibd_global_state_s {	/* shared by all ibd instances */
1211 	kmutex_t	ig_mutex;	/* presumably guards fields below */
1212 	ibt_clnt_hdl_t	ig_ibt_hdl;		/* the one shared ibt_hdl */
1213 	uint_t		ig_ibt_hdl_ref_cnt;	/* references to ig_ibt_hdl */
1214 	ibd_service_t	*ig_service_list;	/* service registrations */
1215 } ibd_global_state_t;
1216 
1217 typedef struct ibd_rc_msg_hello_s {	/* RC-mode "hello" message */
1218 	uint32_t reserved_qpn;	/* a QPN, going by the name; confirm */
1219 	uint32_t rx_mtu;	/* receive MTU; units: TODO confirm */
1220 } ibd_rc_msg_hello_t;
1221 
1222 typedef struct ibd_rc_chan_s {	/* state of one RC-mode channel */
1223 	struct ibd_rc_chan_s	*next;		/* link to another channel */
1224 	/* channel hdl that we'll be using for Reliable Connected Mode */
1225 	ibt_channel_hdl_t	chan_hdl;
1226 	struct ibd_state_s	*state;		/* associated ibd state */
1227 	ibd_ace_t		*ace;		/* ace tied to this channel */
1228 	ibd_rc_chan_state_t	chan_state;	/* current channel state */
1229 
1230 	ibd_list_t		tx_wqe_list;	/* free wqe list */
1231 	ibd_list_t		tx_rel_list;	/* for swqe recycle */
1232 
1233 	ibd_swqe_t		*tx_wqes;	/* presumably the swqe array */
1234 
1235 	/* start address of Tx Buffers */
1236 	uint8_t			*tx_mr_bufs;
1237 	ibt_mr_hdl_t		tx_mr_hdl;
1238 	ibt_mr_desc_t		tx_mr_desc;
1239 
1240 	ibt_cq_hdl_t		scq_hdl;	/* Tx completion queue */
1241 	ibt_wc_t		tx_wc[IBD_RC_MAX_CQ_WC];
1242 	ddi_softintr_t		scq_softintr;	/* soft interrupt for scq */
1243 
1244 	/* For chained send */
1245 	kmutex_t		tx_post_lock;
1246 	ibd_swqe_t		*tx_head;	/* head of the swqe chain */
1247 	ibd_swqe_t		*tx_tail;	/* tail of the swqe chain */
1248 	int			tx_busy;	/* busy flag? TODO confirm */
1249 
1250 	/* For tx buffer recycle */
1251 	kmutex_t		tx_poll_lock;
1252 	int			tx_poll_busy;	/* busy flag? TODO confirm */
1253 
1254 	/* Rx */
1255 	ibd_list_t		rx_wqe_list;	/* used by ibt_post_recv */
1256 	ibd_list_t		rx_free_list;	/* free rwqe list */
1257 
1258 	ibt_cq_hdl_t		rcq_hdl;	/* Rx completion queue */
1259 	ibt_wc_t		rx_wc[IBD_RC_MAX_CQ_WC];
1260 
1261 	ibd_rwqe_t		*rx_rwqes;	/* the chunk of whole rwqes */
1262 	uint8_t			*rx_bufs;	/* the chunk of whole Rx bufs */
1263 	ibt_mr_hdl_t		rx_mr_hdl;	/* ibt_mr_hdl_t for rx_bufs */
1264 	ibt_mr_desc_t		rx_mr_desc;	/* ibt_mr_desc_t for rx_bufs */
1265 
1266 	/* For chained receive */
1267 	kmutex_t		rx_lock;
1268 	mblk_t			*rx_mp;		/* head of the mblk chain */
1269 	mblk_t			*rx_mp_tail;	/* tail of the mblk chain */
1270 	uint32_t		rx_mp_len;	/* length of the chain */
1271 
1272 	uint32_t 		rcq_size;	/* size of the Rx CQ */
1273 	uint32_t 		scq_size;	/* size of the Tx CQ */
1274 	/*
1275 	 * We need two channels for each connection.
1276 	 * One channel for Tx; another channel for Rx.
1277 	 * If "is_tx_chan == B_TRUE", this is a Tx channel.
1278 	 */
1279 	boolean_t		is_tx_chan;
1280 
1281 	/*
1282 	 * For the connection reaper routine ibd_rc_conn_timeout_call().
1283 	 * "is_used == B_FALSE" indicates this RC channel has not been used for
1284 	 * a long (=ibd_rc_conn_timeout) time.
1285 	 */
1286 	boolean_t		is_used;
1287 	/*
1288 	 * When closing this channel, we need to make sure
1289 	 * "chan->rcq_invoking == 0".
1290 	 */
1291 	uint32_t		rcq_invoking;
1292 } ibd_rc_chan_t;
1293 
1294 /*
1295  * The following functions are defined in "ibd.c".
1296  * They are also used by "ibd_cm.c".
 *
 * Every function takes an ibd_state_t pointer as its first argument.
 * The prototypes carry no parameter names, so the meaning of the
 * trailing boolean_t/int arguments has to be looked up at the
 * definitions in "ibd.c".
 *
 * NOTE(review): ibd_print_warn() takes a char * followed by "...", which
 * looks like a printf-style format plus arguments -- confirm.
1297  */
1298 void ibd_print_warn(ibd_state_t *, char *, ...);
1299 void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
1300 void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
1301 boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
1302 void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *);
1303 ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
1304 
1305 /*
1306  * The following functions are defined in "ibd_cm.c".
1307  * They are also used in "ibd.c".
 *
 * The three ibd_async_rc_*() routines share one signature: an
 * ibd_state_t pointer and an ibd_req_t pointer.
 * NOTE(review): presumably these are the handlers run for queued async
 * requests (see the IBD_ASYNC_* operation types) -- confirm in "ibd.c".
1308  */
1309 void ibd_async_rc_process_too_big(ibd_state_t *, ibd_req_t *);
1310 void ibd_async_rc_close_act_chan(ibd_state_t *, ibd_req_t *);
1311 void ibd_async_rc_recycle_ace(ibd_state_t *, ibd_req_t *);
1312 
1313 /*
 * Connection Setup/Close Functions
 *
 * ibd_rc_listen() and ibd_rc_connect() report their result as an
 * ibt_status_t; ibd_rc_pas_close() returns an int.
 * ibd_rc_conn_timeout_call() is the connection reaper routine; it takes
 * a single opaque void * argument.
 * NOTE(review): the reaper is presumably scheduled through the
 * rc_timeout/rc_timeout_start fields of ibd_state_t -- confirm.
 */
1314 ibt_status_t ibd_rc_listen(ibd_state_t *);
1315 void ibd_rc_stop_listen(ibd_state_t *);
1316 ibt_status_t ibd_rc_connect(ibd_state_t *, ibd_ace_t *, ibt_path_info_t *,
1317     uint64_t);
1318 void ibd_rc_try_connect(ibd_state_t *, ibd_ace_t *,  ibt_path_info_t *);
1319 void ibd_rc_signal_act_close(ibd_state_t *, ibd_ace_t *);
1320 void ibd_rc_signal_ace_recycle(ibd_state_t *, ibd_ace_t *);
1321 int ibd_rc_pas_close(ibd_rc_chan_t *, boolean_t, boolean_t);
1322 void ibd_rc_close_all_chan(ibd_state_t *);
1323 void ibd_rc_conn_timeout_call(void *carg);
1324 
1325 /*
 * Receive Functions
 *
 * These deal with the SRQ lists; ibd_state_t keeps the SRQ state in its
 * rc_srq_* fields (rc_srq_rwqe_list, rc_srq_free_list, ...).
 * NOTE(review): the int return convention (0 vs. error code) is not
 * visible in this header -- confirm at the definitions.
 */
1326 int ibd_rc_init_srq_list(ibd_state_t *);
1327 void ibd_rc_fini_srq_list(ibd_state_t *);
1328 int ibd_rc_repost_srq_free_list(ibd_state_t *);
1329 
1330 /*
 * Send Functions
 *
 * The tx_largebuf_list init/fini pair takes the ibd_state_t, which holds
 * the rc_tx_largebuf_* fields; the remaining routines take a single
 * channel (ibd_rc_chan_t) or a single send wqe (ibd_swqe_t).
 */
1331 int ibd_rc_init_tx_largebuf_list(ibd_state_t *);
1332 void ibd_rc_fini_tx_largebuf_list(ibd_state_t *);
1333 ibd_swqe_t *ibd_rc_acquire_swqes(ibd_rc_chan_t *);
1334 void ibd_rc_post_send(ibd_rc_chan_t *, ibd_swqe_t *);
1335 void ibd_rc_drain_scq(ibd_rc_chan_t *, ibt_cq_hdl_t);
1336 void ibd_rc_tx_cleanup(ibd_swqe_t *);
1337 
1338 /*
 * Others
 *
 * NOTE(review): ibd_rc_init_stats() returns int; presumably it sets up
 * the RC counters/kstat (rc_ksp in ibd_state_t, DEBUG only) -- confirm.
 */
1339 void ibd_rc_get_conf(ibd_state_t *);
1340 int ibd_rc_init_stats(ibd_state_t *);
1341 
1342 #endif /* _KERNEL && !_BOOT */
1343 
1344 #ifdef __cplusplus
1345 }
1346 #endif
1347 
1348 #endif	/* _SYS_IB_CLIENTS_IBD_H */
1349