xref: /illumos-gate/usr/src/uts/common/sys/ib/clients/ibd/ibd.h (revision d0f40dc6a997c84bacf5f9ba83d57a95495c399b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifndef _SYS_IB_CLIENTS_IBD_H
28 #define	_SYS_IB_CLIENTS_IBD_H
29 
30 #ifdef __cplusplus
31 extern "C" {
32 #endif
33 
34 /* The following macros are used in both ibd.c and ibd_cm.c */
35 
36 /*
37  * Completion queue polling control
38  */
39 #define	IBD_CQ_POLLING			0x1
40 #define	IBD_REDO_CQ_POLLING		0x2
41 
42 /*
43  * Maximum length for returning chained mps back to crossbow.
44  * Also used as the maximum number of rx wc's polled at a time.
45  */
46 #define	IBD_MAX_RX_MP_LEN		16
47 
48 /*
49  * When doing multiple-send-wr, this value determines how many to do at
50  * a time (in a single ibt_post_send).
51  */
52 #define	IBD_MAX_TX_POST_MULTIPLE	4
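
/*
 * Illustrative sketch, not driver code: posting a small batch of send work
 * requests with a single ibt_post_send() call, bounded by
 * IBD_MAX_TX_POST_MULTIPLE. The variable names (chan, wrs, n_wr, n_posted)
 * are hypothetical.
 *
 *	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
 *	uint_t		n_wr;		(number of wrs filled in, <= 4)
 *	uint_t		n_posted;
 *
 *	(void) ibt_post_send(chan, wrs, n_wr, &n_posted);
 */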
53 
54 /*
55  * Flag bits for resources to reap
56  */
57 #define	IBD_RSRC_SWQE			0x1
58 #define	IBD_RSRC_LSOBUF			0x2
59 #define	IBD_RSRC_RC_SWQE		0x4
60 #define	IBD_RSRC_RC_TX_LARGEBUF		0x8
61 
62 /*
63  * Async operation types
64  */
65 #define	IBD_ASYNC_GETAH			1
66 #define	IBD_ASYNC_JOIN			2
67 #define	IBD_ASYNC_LEAVE			3
68 #define	IBD_ASYNC_PROMON		4
69 #define	IBD_ASYNC_PROMOFF		5
70 #define	IBD_ASYNC_REAP			6
71 #define	IBD_ASYNC_TRAP			7
72 #define	IBD_ASYNC_SCHED			8
73 #define	IBD_ASYNC_LINK			9
74 #define	IBD_ASYNC_EXIT			10
75 #define	IBD_ASYNC_RC_TOO_BIG		11
76 #define	IBD_ASYNC_RC_CLOSE_ACT_CHAN		12
77 #define	IBD_ASYNC_RC_RECYCLE_ACE		13
78 
79 /*
80  * Miscellaneous constants
81  */
82 #define	IBD_SEND			0
83 #define	IBD_RECV			1
84 
85 /*
86  * Thresholds
87  *
88  * When waiting for resources (swqes or lso buffers) to become available,
89  * the first two thresholds below determine how many must become free before
90  * we tell the network layer to start sending packets again. IBD_TX_POLL_THRESH
91  * determines how low the available swqes should go before we start polling
92  * the completion queue.
93  */
94 #define	IBD_FREE_LSOS_THRESH		8
95 #define	IBD_FREE_SWQES_THRESH		20
96 #define	IBD_TX_POLL_THRESH		80
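
/*
 * Illustrative sketch of how these thresholds are meant to be used, assuming
 * the free swqes are counted in id_tx_list.dl_cnt; the helper name
 * ibd_poll_scq() is hypothetical.
 *
 *	Tx path: if available swqes have dropped below IBD_TX_POLL_THRESH,
 *	poll the send completion queue to reclaim swqes.
 *
 *		if (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)
 *			ibd_poll_scq(state);
 *
 *	Reclaim path: once enough swqes are free again, let the network
 *	layer resume transmitting.
 *
 *		if (state->id_tx_list.dl_cnt >= IBD_FREE_SWQES_THRESH)
 *			mac_tx_update(state->id_mh);
 */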
97 
98 #ifdef DEBUG
99 void debug_print(int l, char *fmt, ...);
100 #define	DPRINT		debug_print
101 #else
102 #define	DPRINT		0 &&
103 #endif
104 
105 /*
106  * AH and MCE active list manipulation:
107  *
108  * Multicast disable requests and MCG delete traps are two cases
109  * where the active AH entry for the mcg (if any unreferenced one exists)
110  * will be moved to the free list (to force the next Tx to the mcg to
111  * join the MCG in SendOnly mode). Port up handling will also move AHs
112  * from active to free list.
113  *
114  * In the case when some transmits are still pending on an entry
115  * for an mcg, but a multicast disable has already been issued on the
116  * mcg, there are some options to consider to preserve the join state
117  * to ensure the emitted packet is properly routed on the IBA fabric.
118  * For the AH, we can
119  * 1. take out of active list at multicast disable time.
120  * 2. take out of active list only when last pending Tx completes.
121  * For the MCE, we can
122  * 3. take out of active list at multicast disable time.
123  * 4. take out of active list only when last pending Tx completes.
124  * 5. move from active list to stale list at multicast disable time.
125  * We choose to use 2,4. We use option 4 so that if a multicast enable
126  * is tried before the pending Tx completes, the enable code finds the
127  * mce in the active list and just has to make sure it will not be reaped
128  * (i.e., the mcg leave is done) when the pending Tx does complete. Alternatively,
129  * a stale list (#5) that would be checked in the enable code would need
130  * to be implemented. Option 2 is used, because otherwise, a Tx attempt
131  * after the multicast disable would try to put an AH in the active list,
132  * and associate the mce it finds in the active list to this new AH,
133  * whereas the mce is already associated with the previous AH (taken off
134  * the active list), and will be removed once the pending Tx's complete
135  * (unless a reference count on mce's is implemented). One implication of
136  * using 2,4 is that new Tx's posted before the pending Tx's complete will
137  * grab new references on the AH, further delaying the leave.
138  *
139  * In the case of mcg delete (or create) trap when the port is sendonly
140  * joined, the AH and MCE handling is different: the AH and MCE have to be
141  * immediately taken off the active lists (forcing a join and path lookup
142  * at the next Tx is the only guaranteed means of ensuring a proper Tx
143  * to an mcg as it is repeatedly created and deleted and goes through
144  * reincarnations).
145  *
146  * When a port is already sendonly joined, and a multicast enable is
147  * attempted, the same mce structure is promoted; this ensures only a
148  * single mce on the active list tracks the most powerful join state.
149  *
150  * In the case of port up event handling, the MCE for sendonly membership
151  * is freed up, and the ACE is put into the free list as soon as possible
152  * (depending on whether posted Tx's have completed). For fullmembership
153  * MCE's though, the ACE is similarly handled; but the MCE is kept around
154  * (a re-JOIN is attempted) only if the DLPI leave has not already been
155  * done; else the mce is deconstructed (mc_fullreap case).
156  *
157  * MCG creation and deletion trap handling:
158  *
159  * These traps are unreliable (meaning sometimes the trap might never
160  * be delivered to the subscribed nodes) and may arrive out-of-order
161  * since they use UD transport. An alternative to relying on these
162  * unreliable traps is to poll for mcg presence every so often, but
163  * instead of doing that, we try to be as conservative as possible
164  * while handling the traps, and hope that the traps do arrive at
165  * the subscribed nodes soon. Note that if a node is fullmember
166  * joined to an mcg, it cannot possibly receive an mcg create/delete
167  * trap for that mcg (by fullmember definition); if it does, it is
168  * an old trap from a previous incarnation of the mcg.
169  *
170  * Whenever a trap is received, the driver cleans up its sendonly
171  * membership to the group; we choose to do a sendonly leave even
172  * on a creation trap to handle the case of a prior deletion of the mcg
173  * having gone unnoticed. Consider an example scenario:
174  * T1: MCG M is deleted, and fires off deletion trap D1.
175  * T2: MCG M is recreated, fires off creation trap C1, which is lost.
176  * T3: Node N tries to transmit to M, joining in sendonly mode.
177  * T4: MCG M is deleted, and fires off deletion trap D2.
178  * T5: N receives a deletion trap, but can not distinguish D1 from D2.
179  *     If the trap is D2, then a LEAVE is not required, since the mcg
180  *     is already deleted; but if it is D1, a LEAVE is required. A safe
181  *     approach is to always LEAVE, but the SM may be confused if it
182  *     receives a LEAVE without a prior JOIN.
183  *
184  * Management of the non-membership to an mcg is similar to the above,
185  * except that if the interface is in promiscuous mode, it is required
186  * to attempt to re-join the mcg after receiving a trap. Unfortunately,
187  * if the re-join attempt fails (in which case a warning message needs
188  * to be printed), it is not clear whether it failed due to the mcg not
189  * existing, or some fabric/hca issues, due to the delayed nature of
190  * trap delivery. Querying the SA to establish presence/absence of the
191  * mcg is also racy at best. Thus, the driver just prints a warning
192  * message when it can not rejoin after receiving a create trap, although
193  * this might be (on rare occasions) a mis-warning if the create trap is
194  * received after the mcg was deleted.
195  */
196 
197 /*
198  * Implementation of atomic "recycle" bits and reference count
199  * on address handles. This utilizes the fact that max reference
200  * count on any handle is limited by number of send wqes, thus
201  * high bits in the ac_ref field can be used as the recycle bits,
202  * and only the low bits hold the number of pending Tx requests.
203  * This atomic AH reference counting allows the Tx completion
204  * handler not to acquire the id_ac_mutex to process every completion,
205  * thus reducing lock contention problems between completion and
206  * the Tx path.
207  */
208 #define	CYCLEVAL		0x80000
209 #define	CLEAR_REFCYCLE(ace)	(ace)->ac_ref = 0
210 #define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
211 #define	GET_REF(ace)		((ace)->ac_ref)
212 #define	GET_REF_CYCLE(ace) (				\
213 	/*						\
214 	 * Make sure "cycle" bit is set.		\
215 	 */						\
216 	ASSERT(CYCLE_SET(ace)),				\
217 	((ace)->ac_ref & ~(CYCLEVAL))			\
218 )
219 #define	INC_REF(ace, num) {				\
220 	atomic_add_32(&(ace)->ac_ref, num);		\
221 }
222 #define	SET_CYCLE_IF_REF(ace) (				\
223 	CYCLE_SET(ace) ? B_TRUE :			\
224 	    atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) ==	\
225 		CYCLEVAL ?				\
226 		/*					\
227 		 * Clear the "cycle" bit we just set;	\
228 		 * ref count known to be 0 from above.	\
229 		 */					\
230 		CLEAR_REFCYCLE(ace), B_FALSE :		\
231 		/*					\
232 		 * We set "cycle" bit; let caller know.	\
233 		 */					\
234 		B_TRUE					\
235 )
236 #define	DEC_REF_DO_CYCLE(ace) (				\
237 	atomic_dec_32_nv(&ace->ac_ref) == CYCLEVAL ?	\
238 		/*					\
239 		 * Ref count known to be 0 from above.	\
240 		 */					\
241 		B_TRUE :				\
242 		B_FALSE					\
243 )
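
/*
 * Illustrative sketch of how the reference/recycle macros above pair up;
 * the surrounding logic is simplified and hypothetical.
 *
 *	Tx path, after an active ace has been found for the destination:
 *
 *		INC_REF(ace, 1);	(one more pending Tx on this AH)
 *
 *	Tx completion handler:
 *
 *		if (DEC_REF_DO_CYCLE(ace)) {
 *			(ref count reached 0 with the recycle bit set;
 *			the ace may now be returned to the free list)
 *		}
 *
 *	Recycle attempt (e.g. the async thread wanting to reclaim an ace):
 *
 *		if (!SET_CYCLE_IF_REF(ace)) {
 *			(no Tx references were outstanding; the ace can
 *			be recycled immediately)
 *		}
 */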
244 
245 /*
246  * Address handle entries maintained by the driver are kept in the
247  * free and active lists. Each entry starts out in the free list;
248  * it migrates to the active list when primed using ibt_get_paths()
249  * and ibt_modify_ud_dest() for transmission to a specific destination.
250  * In the active list, the entry has a reference count indicating the
251  * number of ongoing/uncompleted transmits that reference it. The
252  * entry is left in the active list even after the reference count
253  * goes to 0, since successive transmits can find it there and do
254  * not need to set up another entry (ie the path information is
255  * cached using the active list). Entries on the active list are
256  * also hashed using the destination link address as a key for faster
257  * lookups during transmits.
258  *
259  * For any destination address (unicast or multicast, whatever the
260  * join states), there will be at most one entry in the active list.
261  * Entries with a 0 reference count on the active list can be reused
262  * for a transmit to a new destination, if the free list is empty.
263  *
264  * The AH free list insertion/deletion is protected with the id_ac_mutex,
265  * since the async thread and Tx callback handlers insert/delete. The
266  * active list does not need a lock (all operations are done by the
267  * async thread) but updates to the reference count are atomically
268  * done (increments done by Tx path, decrements by the Tx callback handler).
269  */
270 #define	IBD_ACACHE_INSERT_FREE(state, ce) \
271 	list_insert_head(&state->id_ah_free, ce)
272 #define	IBD_ACACHE_GET_FREE(state) \
273 	list_get_head(&state->id_ah_free)
274 #define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
275 	int _ret_;						\
276 	list_insert_head(&state->id_ah_active, ce);		\
277 	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
278 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
279 	ASSERT(_ret_ == 0);					\
280 	state->id_ac_hot_ace = ce;				\
281 }
282 #define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
283 	list_remove(&state->id_ah_active, ce);			\
284 	if (state->id_ac_hot_ace == ce)				\
285 		state->id_ac_hot_ace = NULL;			\
286 	(void) mod_hash_remove(state->id_ah_active_hash,	\
287 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
288 }
289 #define	IBD_ACACHE_GET_ACTIVE(state) \
290 	list_get_head(&state->id_ah_active)
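
/*
 * Illustrative sketch of an entry migrating from the free list to the
 * active list using the macros above (per the comment above, this is done
 * by the async thread; the real code paths live in ibd.c, e.g.
 * ibd_acache_find()). "dest" is a hypothetical ipoib_mac_t pointer.
 *
 *	ibd_ace_t	*ce;
 *
 *	mutex_enter(&state->id_ac_mutex);
 *	ce = IBD_ACACHE_GET_FREE(state);
 *	if (ce != NULL) {
 *		ce->ac_mac = *dest;
 *		IBD_ACACHE_INSERT_ACTIVE(state, ce);
 *	}
 *	mutex_exit(&state->id_ac_mutex);
 */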
291 
292 /*
293  * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at the
294  * front of the optional src/tgt link layer address. Right now Solaris inserts
295  * the padding by default at the end; the routine that does this is nce_xmit()
296  * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when the
297  * packet comes down from the IP layer to the IBD driver, it is in the
298  * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T].
299  * The OPT_ND_HDR_T is 2 bytes, followed by [22 bytes of ipoib_machdr]; as a
300  * result, the machdr is not 4-byte aligned and has 2 bytes of padding at the end.
301  *
302  * The send routine in the IBD driver changes this packet as follows:
303  * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
304  * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
305  * aligned.
306  *
307  * On the receive side, ibd_process_rx takes the above packet, removes the
308  * two bytes of front padding, and inserts them at the end, since the IP
309  * layer does not understand padding at the front.
310  */
311 #define	IBD_PAD_NSNA(ip6h, len, type) {					\
312 	uchar_t 	*nd_lla_ptr;					\
313 	icmp6_t 	*icmp6;						\
314 	nd_opt_hdr_t	*opt;						\
315 	int 		i;						\
316 									\
317 	icmp6 = (icmp6_t *)&ip6h[1];					\
318 	len -= sizeof (nd_neighbor_advert_t);				\
319 	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
320 	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
321 	    (len != 0)) {						\
322 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h			\
323 		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
324 		ASSERT(opt != NULL);					\
325 		nd_lla_ptr = (uchar_t *)&opt[1];			\
326 		if (type == IBD_SEND) {					\
327 			for (i = IPOIB_ADDRL; i > 0; i--)		\
328 				*(nd_lla_ptr + i + 1) =			\
329 				    *(nd_lla_ptr + i - 1);		\
330 		} else {						\
331 			for (i = 0; i < IPOIB_ADDRL; i++)		\
332 				*(nd_lla_ptr + i) =			\
333 				    *(nd_lla_ptr + i + 2);		\
334 		}							\
335 		*(nd_lla_ptr + i) = 0;					\
336 		*(nd_lla_ptr + i + 1) = 0;				\
337 	}								\
338 }
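
/*
 * Worked example of the shuffle above (illustrative only). On the send
 * side, the 20-byte IPoIB link-layer address following the 2-byte
 * nd_opt_hdr_t is shifted forward by two bytes and the first two bytes
 * are zeroed, moving the padding to the front of the address:
 *
 *	before:	[nd_opt_hdr_t (2)][machdr (20)][pad (2)]
 *	after:	[nd_opt_hdr_t (2)][pad (2)][machdr (20)]
 *
 * On the receive side the macro performs the inverse shift, restoring the
 * trailing padding that the IP layer expects.
 */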
339 
340 
341 /*
342  * IETF-defined IPoIB encapsulation header, with 2 bytes of ethertype
343  * followed by 2 reserved bytes. This is at the start of the
344  * datagram sent to and received over the wire by the driver.
345  */
346 typedef struct ipoib_header {
347 	ushort_t	ipoib_type;
348 	ushort_t	ipoib_mbz;
349 } ipoib_hdr_t;
350 
351 #define	IPOIB_HDRSIZE	sizeof (struct ipoib_header)
352 
353 /*
354  * IETF-defined IPoIB link address: the IBA QPN, followed by the GID,
355  * which has a prefix and suffix, as reported via ARP.
356  */
357 typedef struct ipoib_mac {
358 	uint32_t	ipoib_qpn;
359 	uint32_t	ipoib_gidpref[2];
360 	uint32_t	ipoib_gidsuff[2];
361 } ipoib_mac_t;
362 
363 #define	IPOIB_ADDRL	sizeof (struct ipoib_mac)
364 
365 /*
366  * Pseudo header prepended to datagram in DLIOCRAW transmit path
367  * and when GLD hands the datagram to the gldm_send entry point.
368  */
369 typedef struct ipoib_ptxhdr {
370 	ipoib_mac_t	ipoib_dest;
371 	ipoib_hdr_t	ipoib_rhdr;
372 } ipoib_ptxhdr_t;
373 
374 #define	IPOIBDLSAP(p, offset)	((ipoib_ptxhdr_t *)((caddr_t)(p)+offset))
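
/*
 * Illustrative only: in the DLIOCRAW transmit path the destination can be
 * picked out of the pseudo header at the front of the message, e.g.
 * (assuming "mp" is the raw mblk handed to the driver):
 *
 *	ipoib_ptxhdr_t	*ptx = IPOIBDLSAP(mp->b_rptr, 0);
 *	ipoib_mac_t	*dest = &ptx->ipoib_dest;
 */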
375 
376 /*
377  * The pseudo-GRH structure that sits before the data in the
378  * receive buffer, and is overlaid on top of the real GRH.
379  * The driver sets the ipoib_vertcflow to 0 if the pseudo-GRH
380  * does not hold valid information. If it is indicated valid,
381  * the driver must additionally provide the sender's qpn in
382  * network byte order in ipoib_sqpn, and not touch the
383  * remaining parts which were DMA'ed in by the IBA hardware.
384  */
385 typedef struct ipoib_pgrh {
386 	uint32_t	ipoib_vertcflow;
387 	uint32_t	ipoib_sqpn;
388 	uint32_t	ipoib_sgid_pref[2];
389 	uint32_t	ipoib_sgid_suff[2];
390 	uint32_t	ipoib_dgid_pref[2];
391 	uint32_t	ipoib_dgid_suff[2];
392 } ipoib_pgrh_t;
393 
394 /*
395  * The GRH is also DMA'ed into the recv buffers, thus space needs
396  * to be allocated for it.
397  */
398 #define	IPOIB_GRH_SIZE	sizeof (ipoib_pgrh_t)
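
/*
 * Illustrative only: a receive-path sketch of consulting the pseudo-GRH
 * described above before handing the packet upstream ("buf" is a
 * hypothetical pointer to the start of the receive buffer):
 *
 *	ipoib_pgrh_t	*pgrh = (ipoib_pgrh_t *)buf;
 *
 *	if (pgrh->ipoib_vertcflow != 0) {
 *		(the pseudo-GRH is valid; pgrh->ipoib_sqpn holds the
 *		sender's QPN in network byte order, usable e.g. for
 *		loopback filtering)
 *	}
 */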
399 
400 /* support the RC (reliable connected) mode */
401 #define	IBD_MAC_ADDR_RC		0x80000000
402 /* support the UC (unreliable connected) mode */
403 #define	IBD_MAC_ADDR_UC		0x40000000
404 
405 #define	IBD_RC_SERVICE_ID 0x100000000000000ULL
406 
407 /*
408  * Legacy OFED used an incorrect service ID (with one additional zero digit)
409  * for many years. To interoperate with legacy OFED, we also support this
410  * incorrect service ID here.
411  */
412 #define	IBD_RC_SERVICE_ID_OFED_INTEROP 0x1000000000000000ULL
413 
414 #define	IBD_RC_MIN_CQ_SIZE	0x7f
415 
416 /* Number of ibt_wc_t provided for each RC channel */
417 #define	IBD_RC_MAX_CQ_WC	0x3f
418 
419 #if defined(_KERNEL) && !defined(_BOOT)
420 
421 #include <sys/ib/ibtl/ibti.h>
422 #include <sys/ib/ib_pkt_hdrs.h>
423 #include <sys/list.h>
424 #include <sys/mac_provider.h>
425 #include <sys/mac_ib.h>
426 #include <sys/modhash.h>
427 
428 /* State of a reliable connected channel (ibd_rc_chan_t->chan_state) */
429 typedef enum {
430 	IBD_RC_STATE_INIT = 0,
431 
432 	/* Active side */
433 	IBD_RC_STATE_ACT_REP_RECV,	/* reply received */
434 	IBD_RC_STATE_ACT_ESTAB,		/* established, ready to send */
435 	IBD_RC_STATE_ACT_REJECT,	/* rejected */
436 	/* Someone else is closing this channel, please don't re-close it */
437 	IBD_RC_STATE_ACT_CLOSING,
438 	IBD_RC_STATE_ACT_CLOSED,
439 	IBD_RC_STATE_ACT_ERROR,
440 
441 	/* Passive side */
442 	IBD_RC_STATE_PAS_REQ_RECV,	/* request received */
443 	IBD_RC_STATE_PAS_ESTAB,		/* established, ready to receive */
444 	IBD_RC_STATE_PAS_REJECT,	/* rejected */
445 
446 	IBD_RC_STATE_PAS_CLOSED
447 } ibd_rc_chan_state_t;
448 
449 /*
450  * Structure to encapsulate various types of async requests.
451  */
452 typedef struct ibd_acache_rq {
453 	struct list_node 	rq_list; 	/* list of pending work */
454 	int			rq_op;		/* what operation */
455 	ipoib_mac_t		rq_mac;
456 	ib_gid_t		rq_gid;
457 	void			*rq_ptr;
458 	void			*rq_ptr2;
459 } ibd_req_t;
460 
461 typedef struct ibd_mcache {
462 	struct list_node	mc_list;	/* full/non list */
463 	uint8_t			mc_jstate;
464 	boolean_t		mc_fullreap;
465 	ibt_mcg_info_t		mc_info;
466 	ibd_req_t		mc_req;		/* to queue LEAVE req */
467 } ibd_mce_t;
468 
469 typedef struct ibd_acache_s {
470 	struct list_node	ac_list;	/* free/active list */
471 	ibt_ud_dest_hdl_t	ac_dest;
472 	ipoib_mac_t		ac_mac;
473 	uint32_t		ac_ref;
474 	ibd_mce_t		*ac_mce;	/* for MCG AHs */
475 
476 	/* For Reliable Connected mode */
477 	struct ibd_rc_chan_s	*ac_chan;
478 	/* protect tx_too_big_ongoing */
479 	kmutex_t		tx_too_big_mutex;
480 	/* Deal with too big packet */
481 	boolean_t		tx_too_big_ongoing;
482 } ibd_ace_t;
483 
484 #define	IBD_MAX_SQSEG	59
485 #define	IBD_MAX_RQSEG	1
486 
487 typedef enum {
488 	IBD_WQE_SEND,
489 	IBD_WQE_RECV
490 } ibd_wqe_type_t;
491 
492 typedef enum {
493 	IBD_WQE_TXBUF = 1,
494 	IBD_WQE_LSOBUF = 2,
495 	IBD_WQE_MAPPED = 3,
496 	IBD_WQE_RC_COPYBUF = 4
497 } ibd_wqe_buftype_t;
498 
499 #ifdef DEBUG
500 typedef struct ibd_rc_stat_s {
501 	kstat_named_t		rc_rcv_trans_byte;
502 	kstat_named_t		rc_rcv_trans_pkt;
503 	kstat_named_t		rc_rcv_copy_byte;
504 	kstat_named_t		rc_rcv_copy_pkt;
505 	kstat_named_t		rc_rcv_alloc_fail;
506 
507 	kstat_named_t		rc_rcq_invoke;
508 	kstat_named_t		rc_rcq_err;	/* fail in rcq handler */
509 	kstat_named_t		rc_scq_invoke;
510 
511 	kstat_named_t		rc_rwqe_short;	/* short rwqe */
512 
513 	kstat_named_t		rc_xmt_bytes;
514 	/* pkt size <= ibd_rc_tx_copy_thresh */
515 	kstat_named_t		rc_xmt_small_pkt;
516 	kstat_named_t		rc_xmt_fragmented_pkt;
517 	/* fail in ibt_map_mem_iov() */
518 	kstat_named_t		rc_xmt_map_fail_pkt;
519 	/* succ in ibt_map_mem_iov() */
520 	kstat_named_t		rc_xmt_map_succ_pkt;
521 
522 	kstat_named_t		rc_ace_not_found;	/* ace not found */
523 	/* no swqe even after recycle */
524 	kstat_named_t		rc_scq_no_swqe;
525 	/* no tx large buf even after recycle */
526 	kstat_named_t		rc_scq_no_largebuf;
527 
528 	/* short swqe in ibd_send() */
529 	kstat_named_t		rc_swqe_short;
530 	/* call mac_tx_update() when there is enough swqe */
531 	kstat_named_t		rc_swqe_mac_update;
532 	/* short large buf in ibd_send() */
533 	kstat_named_t		rc_xmt_buf_short;
534 	/* call mac_tx_update() when there is enough Tx large buffers */
535 	kstat_named_t		rc_xmt_buf_mac_update;
536 
537 	kstat_named_t		rc_conn_succ;	/* # of successful connects */
538 	kstat_named_t		rc_conn_fail;	/* # of failed connects */
539 	/* ace->ac_chan == NULL for unicast packet */
540 	kstat_named_t		rc_null_conn;
541 	/* not in active established state */
542 	kstat_named_t		rc_no_estab_conn;
543 
544 	kstat_named_t		rc_act_close;	/* call ibd_rc_act_close() */
545 	kstat_named_t		rc_pas_close;	/* call ibd_rc_pas_close() */
546 	kstat_named_t		rc_delay_ace_recycle;
547 	kstat_named_t		rc_act_close_simultaneous;
548 
549 	kstat_named_t		rc_reset_cnt;	/* # of RC channel resets */
550 } ibd_rc_stat_t;
551 #endif
552 
553 typedef struct ibd_rc_chan_list_s {
554 	/* This mutex protects chan_list and ibd_rc_chan_t.next */
555 	kmutex_t		chan_list_mutex;
556 	struct ibd_rc_chan_s	*chan_list;
557 } ibd_rc_chan_list_t;
558 
559 typedef struct ibd_rc_tx_largebuf_s {
560 	struct ibd_rc_tx_largebuf_s	*lb_next;
561 	uint8_t				*lb_buf;
562 } ibd_rc_tx_largebuf_t;
563 
564 /*
565  * Pre-registered copybuf used for send and receive
566  */
567 typedef struct ibd_copybuf_s {
568 	ibt_wr_ds_t		ic_sgl;
569 	uint8_t			*ic_bufaddr;
570 } ibd_copybuf_t;
571 
572 typedef struct ibd_wqe_s {
573 	struct ibd_wqe_s	*w_next;
574 	ibd_copybuf_t		w_copybuf;
575 	mblk_t			*im_mblk;
576 } ibd_wqe_t;
577 
578 /*
579  * Send WQE
580  */
581 typedef struct ibd_swqe_s {
582 	ibd_wqe_t		w_ibd_swqe;
583 	ibd_wqe_buftype_t	w_buftype;
584 	ibt_send_wr_t		w_swr;
585 	ibd_ace_t		*w_ahandle;
586 	ibt_mi_hdl_t		w_mi_hdl;
587 	ibt_wr_ds_t		w_sgl[IBD_MAX_SQSEG];
588 	ibd_rc_tx_largebuf_t	*w_rc_tx_largebuf;
589 } ibd_swqe_t;
590 
591 #define	swqe_next		w_ibd_swqe.w_next
592 #define	swqe_copybuf		w_ibd_swqe.w_copybuf
593 #define	swqe_im_mblk		w_ibd_swqe.im_mblk
594 #define	SWQE_TO_WQE(swqe)	(ibd_wqe_t *)&((swqe)->w_ibd_swqe)
595 #define	WQE_TO_SWQE(wqe)	(ibd_swqe_t *)wqe
596 
597 /*
598  * Receive WQE
599  */
600 typedef struct ibd_rwqe_s {
601 	ibd_wqe_t		w_ibd_rwqe;
602 	struct ibd_state_s	*w_state;
603 	ibt_recv_wr_t		w_rwr;
604 	frtn_t			w_freemsg_cb;
605 	boolean_t		w_freeing_wqe;
606 	struct ibd_rc_chan_s	*w_chan;
607 } ibd_rwqe_t;
608 
609 #define	rwqe_next		w_ibd_rwqe.w_next
610 #define	rwqe_copybuf		w_ibd_rwqe.w_copybuf
611 #define	rwqe_im_mblk		w_ibd_rwqe.im_mblk
612 #define	RWQE_TO_WQE(rwqe)	(ibd_wqe_t *)&((rwqe)->w_ibd_rwqe)
613 #define	WQE_TO_RWQE(wqe)	(ibd_rwqe_t *)wqe
614 
615 typedef struct ibd_list_s {
616 	kmutex_t		dl_mutex;
617 	ibd_wqe_t		*dl_head;
618 	union {
619 		boolean_t	pending_sends;
620 		uint32_t	bufs_outstanding;
621 	} ustat;
622 	uint32_t		dl_cnt;
623 } ibd_list_t;
624 
625 #define	dl_pending_sends	ustat.pending_sends
626 #define	dl_bufs_outstanding	ustat.bufs_outstanding
627 
628 /*
629  * LSO buffers
630  *
631  * Under normal circumstances we should never need to use any buffer
632  * that's larger than MTU.  Unfortunately, IB HCAs limit the SGL length to
633  * far fewer entries than regular Ethernet NICs do.  Since the network layer
634  * doesn't limit the number of mblk fragments in any send mp chain, we
635  * occasionally end up having to use these larger-than-MTU sized buffers
636  * (in fact, larger than id_tx_buf_sz).
638  */
639 typedef struct ibd_lsobuf_s {
640 	struct ibd_lsobuf_s *lb_next;
641 	uint8_t		*lb_buf;
642 	int		lb_isfree;
643 } ibd_lsobuf_t;
644 
645 typedef struct ibd_lsobkt_s {
646 	uint8_t		*bkt_mem;
647 	ibd_lsobuf_t	*bkt_bufl;
648 	ibd_lsobuf_t	*bkt_free_head;
649 	ibt_mr_hdl_t	bkt_mr_hdl;
650 	ibt_mr_desc_t	bkt_mr_desc;
651 	uint_t		bkt_nelem;
652 	uint_t		bkt_nfree;
653 } ibd_lsobkt_t;
654 
655 /*
656  * Posting to a single software rx post queue is contentious,
657  * so it is broken out into an array of multiple queues.
658  *
659  * Try to ensure rx_queue structs fall in different cache lines using a filler.
660  * Note: the RX_QUEUE_CACHE_LINE needs to change if the struct changes.
661  */
662 #define	RX_QUEUE_CACHE_LINE \
663 	(64 - (sizeof (kmutex_t) + sizeof (ibd_wqe_t *) + sizeof (uint_t)))
664 typedef struct ibd_rx_queue_s {
665 	kmutex_t		rx_post_lock;
666 	ibd_wqe_t		*rx_head;
667 	uint_t			rx_cnt;
668 	uint8_t			rx_pad[RX_QUEUE_CACHE_LINE];
669 } ibd_rx_queue_t;
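
/*
 * Illustrative only: if ibd_rx_queue_t ever changes, RX_QUEUE_CACHE_LINE
 * must be revisited. A compile-time check along these lines (not currently
 * in the driver) would catch a size mismatch on a 64-byte cache line:
 *
 *	CTASSERT(sizeof (ibd_rx_queue_t) == 64);
 */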
670 
671 /*
672  * This structure maintains information per port per HCA
673  * (per network interface).
674  */
675 typedef struct ibd_state_s {
676 	dev_info_t		*id_dip;
677 	ibt_clnt_hdl_t		id_ibt_hdl;
678 	ibt_hca_hdl_t		id_hca_hdl;
679 	ibt_pd_hdl_t		id_pd_hdl;
680 	kmem_cache_t		*id_req_kmc;
681 
682 	ibd_list_t		id_tx_rel_list;
683 
684 	uint32_t		id_running;
685 
686 	uint32_t		id_max_sqseg;
687 	uint32_t		id_max_sqseg_hiwm;
688 	ibd_list_t		id_tx_list;
689 	ddi_softintr_t		id_tx;
690 	uint32_t		id_tx_sends;
691 
692 	kmutex_t		id_txpost_lock;
693 	ibd_swqe_t		*id_tx_head;
694 	ibd_swqe_t		*id_tx_tail;
695 	int			id_tx_busy;
696 
697 	uint_t			id_tx_buf_sz;
698 	uint8_t			*id_tx_bufs;
699 	ibd_swqe_t		*id_tx_wqes;
700 	ibt_mr_hdl_t		id_tx_mr_hdl;
701 	ibt_mr_desc_t		id_tx_mr_desc;
702 
703 	kmutex_t		id_lso_lock;
704 	ibd_lsobkt_t		*id_lso;
705 
706 	kmutex_t		id_scq_poll_lock;
707 	int			id_scq_poll_busy;
708 
709 	ibt_cq_hdl_t		id_scq_hdl;
710 	ibt_wc_t		*id_txwcs;
711 	uint32_t		id_txwcs_size;
712 
713 	int			id_rx_nqueues;
714 	ibd_rx_queue_t		*id_rx_queues;
715 	int			id_rx_post_queue_index;
716 	uint32_t		id_rx_post_active;
717 
718 	ibd_rwqe_t		*id_rx_wqes;
719 	uint8_t			*id_rx_bufs;
720 	ibt_mr_hdl_t		id_rx_mr_hdl;
721 	ibt_mr_desc_t		id_rx_mr_desc;
722 	uint_t			id_rx_buf_sz;
723 	uint32_t		id_num_rwqe;
724 	ibd_list_t		id_rx_list;
725 	ddi_softintr_t		id_rx;
726 	uint32_t		id_rx_bufs_outstanding_limit;
727 	uint32_t		id_rx_allocb;
728 	uint32_t		id_rx_allocb_failed;
729 	ibd_list_t		id_rx_free_list;
730 
731 	kmutex_t		id_rcq_poll_lock;
732 	int			id_rcq_poll_busy;
733 	uint32_t		id_rxwcs_size;
734 	ibt_wc_t		*id_rxwcs;
735 	ibt_cq_hdl_t		id_rcq_hdl;
736 
737 	ibt_channel_hdl_t	id_chnl_hdl;
738 	ib_pkey_t		id_pkey;
739 	uint16_t		id_pkix;
740 	uint8_t			id_port;
741 	ibt_mcg_info_t		*id_mcinfo;
742 
743 	mac_handle_t		id_mh;
744 	mac_resource_handle_t	id_rh;
745 	ib_gid_t		id_sgid;
746 	ib_qpn_t		id_qpnum;
747 	ipoib_mac_t		id_macaddr;
748 	ib_gid_t		id_mgid;
749 	ipoib_mac_t		id_bcaddr;
750 
751 	int			id_mtu;
752 	uchar_t			id_scope;
753 
754 	kmutex_t		id_acache_req_lock;
755 	kcondvar_t		id_acache_req_cv;
756 	struct list		id_req_list;
757 	kt_did_t		id_async_thrid;
758 
759 	kmutex_t		id_ac_mutex;
760 	ibd_ace_t		*id_ac_hot_ace;
761 	struct list		id_ah_active;
762 	struct list		id_ah_free;
763 	ipoib_mac_t		id_ah_addr;
764 	ibd_req_t		id_ah_req;
765 	char			id_ah_op;
766 	uint64_t		id_ah_error;
767 	ibd_ace_t		*id_ac_list;
768 	mod_hash_t		*id_ah_active_hash;
769 
770 	kmutex_t		id_mc_mutex;
771 	struct list		id_mc_full;
772 	struct list		id_mc_non;
773 
774 	kmutex_t		id_trap_lock;
775 	kcondvar_t		id_trap_cv;
776 	boolean_t		id_trap_stop;
777 	uint32_t		id_trap_inprog;
778 
779 	char			id_prom_op;
780 
781 	kmutex_t		id_sched_lock;
782 	int			id_sched_needed;
783 	int			id_sched_cnt;
784 	int			id_sched_lso_cnt;
785 
786 	kmutex_t		id_link_mutex;
787 	link_state_t		id_link_state;
788 	uint64_t		id_link_speed;
789 
790 	uint64_t		id_num_intrs;
791 	uint64_t		id_tx_short;
792 	uint32_t		id_num_swqe;
793 
794 	uint64_t		id_xmt_bytes;
795 	uint64_t		id_rcv_bytes;
796 	uint64_t		id_multi_xmt;
797 	uint64_t		id_brd_xmt;
798 	uint64_t		id_multi_rcv;
799 	uint64_t		id_brd_rcv;
800 	uint64_t		id_xmt_pkt;
801 	uint64_t		id_rcv_pkt;
802 
803 	uint32_t		id_hwcksum_capab;
804 	boolean_t		id_lso_policy;
805 	boolean_t		id_lso_capable;
806 	uint_t			id_lso_maxlen;
807 	int			id_hca_res_lkey_capab;
808 	ibt_lkey_t		id_res_lkey;
809 
810 	boolean_t		id_bgroup_created;
811 	kmutex_t		id_macst_lock;
812 	kcondvar_t		id_macst_cv;
813 	uint32_t		id_mac_state;
814 
815 	/* For Reliable Connected Mode */
816 	boolean_t		id_enable_rc;
817 	boolean_t		rc_enable_srq;
818 
819 	int			rc_mtu;
820 	uint32_t		rc_tx_max_sqseg;
821 	/*
822 	 * In IPoIB over Reliable Connected mode, the mac address has the
823 	 * "IBD_MAC_ADDR_RC" prefix added to it. But the loopback filter in
824 	 * ibd_process_rx() expects an input mac address that does not include
825 	 * the "IBD_MAC_ADDR_RC" prefix.
826 	 *
827 	 * So, we introduce rc_macaddr_loopback for the loopback filter in
828 	 * IPoIB over Reliable Connected mode.
829 	 *
830 	 * rc_macaddr_loopback is id_macaddr without the "IBD_MAC_ADDR_RC" prefix.
831 	 */
832 	ipoib_mac_t		rc_macaddr_loopback;
833 
834 	ibt_srv_hdl_t		rc_listen_hdl;
835 	ibt_sbind_hdl_t		rc_listen_bind;
836 	ibt_srv_hdl_t		rc_listen_hdl_OFED_interop;
837 	ibt_sbind_hdl_t		rc_listen_bind_OFED_interop;
838 
839 	ibd_rc_chan_list_t	rc_pass_chan_list;
840 	/* obsolete active channel list */
841 	ibd_rc_chan_list_t	rc_obs_act_chan_list;
842 
843 	kmutex_t		rc_ace_recycle_lock;
844 	ibd_ace_t		*rc_ace_recycle;
845 
846 	/* Send */
847 	/*
848 	 * This mutex protects rc_tx_largebuf_free_head, rc_tx_largebuf_nfree
849 	 * and ibd_rc_tx_largebuf_t->lb_next
850 	 */
851 	kmutex_t		rc_tx_large_bufs_lock;
852 	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_free_head;
853 	uint_t			rc_tx_largebuf_nfree;
854 	/* The chunk of whole Tx large buffers */
855 	uint8_t			*rc_tx_mr_bufs;
856 	ibt_mr_hdl_t		rc_tx_mr_hdl;
857 	ibt_mr_desc_t		rc_tx_mr_desc;
858 	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_desc_base;	/* base addr */
859 
860 	boolean_t		rc_enable_iov_map;
861 	uint_t			rc_max_sqseg_hiwm;
862 
863 	/* For SRQ */
864 	uint32_t 		rc_srq_size;
865 	ibt_srq_hdl_t		rc_srq_hdl;
866 	ibd_list_t		rc_srq_rwqe_list;
867 	ibd_list_t		rc_srq_free_list;
868 	ibd_rwqe_t		*rc_srq_rwqes;
869 	uint8_t			*rc_srq_rx_bufs;
870 	ibt_mr_hdl_t		rc_srq_rx_mr_hdl;
871 	ibt_mr_desc_t		rc_srq_rx_mr_desc;
872 
873 	/* For chained receive */
874 	kmutex_t		rc_rx_lock;
875 	mblk_t			*rc_rx_mp;
876 	mblk_t			*rc_rx_mp_tail;
877 	uint32_t		rc_rx_mp_len;
878 
879 	/* Counters for RC mode */
880 	/* RX */
881 	/*
882 	 * # of received packets that are passed directly to GLD without
883 	 * copying them
884 	 */
885 	uint64_t		rc_rcv_trans_byte;
886 	uint64_t		rc_rcv_trans_pkt;
887 	/*
888 	 * # of received packets for which we allocate new buffers, copy the
889 	 * contents into those buffers, and then pass them to GLD
890 	 */
891 	uint64_t		rc_rcv_copy_byte;
892 	uint64_t		rc_rcv_copy_pkt;
893 	uint64_t		rc_rcv_alloc_fail;
894 
895 #ifdef DEBUG
896 	uint64_t		rc_rwqe_short;	/* short rwqe */
897 #endif
898 
899 	/* # of Receive CQ handler invocations */
900 	uint64_t		rc_rcq_invoke;
901 	/* wc->wc_status != IBT_WC_SUCCESS */
902 	uint64_t		rc_rcq_err;
903 
904 	/* Tx */
905 	uint64_t		rc_xmt_bytes;
906 
907 	/* pkt size <= ibd_rc_tx_copy_thresh */
908 	uint64_t		rc_xmt_small_pkt;
909 	uint64_t		rc_xmt_fragmented_pkt;
910 	/* fail in ibt_map_mem_iov() */
911 	uint64_t		rc_xmt_map_fail_pkt;
912 	/* succ in ibt_map_mem_iov() */
913 	uint64_t		rc_xmt_map_succ_pkt;
914 
915 	uint64_t		rc_ace_not_found;
916 
917 	uint64_t		rc_xmt_drop_too_long_pkt;
918 	uint64_t		rc_xmt_icmp_too_long_pkt;
919 	uint64_t		rc_xmt_reenter_too_long_pkt;
920 
921 	/* short swqe in ibd_send() */
922 	uint64_t		rc_swqe_short;
923 	/* call mac_tx_update when there is enough swqe */
924 	uint64_t		rc_swqe_mac_update;
925 	/* short tx large copy buf in ibd_send() */
926 	uint64_t		rc_xmt_buf_short;
927 	/* call mac_tx_update when there is enough Tx copy buf */
928 	uint64_t		rc_xmt_buf_mac_update;
929 
930 	/* No swqe even after calling the swqe recycle function */
931 	uint64_t		rc_scq_no_swqe;
932 	/* No large Tx buf even after calling the swqe recycle function */
933 	uint64_t		rc_scq_no_largebuf;
934 	/* # of Send CQ handler invocations */
935 	uint64_t		rc_scq_invoke;
936 
937 	/* Connection setup and close */
938 	uint64_t		rc_conn_succ;	/* # of successful connects */
939 	uint64_t		rc_conn_fail;	/* # of failed connects */
940 	/* ace->ac_chan == NULL for unicast packet */
941 	uint64_t		rc_null_conn;
942 	/* not in active established state */
943 	uint64_t		rc_no_estab_conn;
944 
945 	uint64_t		rc_act_close;	/* call ibd_rc_act_close() */
946 	uint64_t		rc_pas_close;	/* call ibd_rc_pas_close() */
947 	uint64_t		rc_delay_ace_recycle;
948 	uint64_t		rc_act_close_simultaneous;
949 
950 	/* # of RC channel resets */
951 	uint64_t		rc_reset_cnt;
952 
953 #ifdef DEBUG
954 	kstat_t 		*rc_ksp;
955 #endif
956 } ibd_state_t;
957 
958 /*
959  * Structures to track global IBTF data, data that is shared
960  * among the IBD device instances.  This includes the one ibt_hdl
961  * and the list of service registrations.
962  */
963 typedef struct ibd_service_s {
964 	struct ibd_service_s	*is_link;
965 	ibt_srv_hdl_t		is_srv_hdl;
966 	ib_svc_id_t		is_sid;
967 	uint_t			is_ref_cnt;
968 } ibd_service_t;
969 
970 typedef struct ibd_global_state_s {
971 	kmutex_t	ig_mutex;
972 	ibt_clnt_hdl_t	ig_ibt_hdl;
973 	uint_t		ig_ibt_hdl_ref_cnt;
974 	ibd_service_t	*ig_service_list;
975 } ibd_global_state_t;
976 
977 typedef struct ibd_rc_msg_hello_s {
978 	uint32_t reserved_qpn;
979 	uint32_t rx_mtu;
980 } ibd_rc_msg_hello_t;
981 
982 typedef struct ibd_rc_chan_s {
983 	struct ibd_rc_chan_s	*next;
984 	/* channel hdl that we'll be using for Reliable Connected Mode */
985 	ibt_channel_hdl_t	chan_hdl;
986 	struct ibd_state_s	*state;
987 	ibd_ace_t		*ace;
988 	ibd_rc_chan_state_t	chan_state;
989 
990 	/* used to detect duplicate connections */
991 	ib_gid_t		requester_gid;
992 	ib_pkey_t		requester_pkey;
993 
994 	ibd_list_t		tx_wqe_list;	/* free wqe list */
995 	ibd_list_t		tx_rel_list;	/* for swqe recycle */
996 
997 	ibd_swqe_t		*tx_wqes;
998 
999 	/* start address of Tx Buffers */
1000 	uint8_t			*tx_mr_bufs;
1001 	ibt_mr_hdl_t		tx_mr_hdl;
1002 	ibt_mr_desc_t		tx_mr_desc;
1003 
1004 	ibt_cq_hdl_t		scq_hdl;	/* Tx completion queue */
1005 	ibt_wc_t		tx_wc[IBD_RC_MAX_CQ_WC];
1006 	ddi_softintr_t		scq_softintr;
1007 
1008 	uint32_t		tx_trans_error_cnt;
1009 
1010 	/* For chained send */
1011 	kmutex_t		tx_post_lock;
1012 	ibd_swqe_t		*tx_head;
1013 	ibd_swqe_t		*tx_tail;
1014 	int			tx_busy;
1015 
1016 	/* For tx buffer recycle */
1017 	kmutex_t		tx_poll_lock;
1018 	int			tx_poll_busy;
1019 
1020 	/* Rx */
1021 	ibd_list_t		rx_wqe_list;	/* used by ibt_post_recv */
1022 	ibd_list_t		rx_free_list;	/* free rwqe list */
1023 
1024 	ibt_cq_hdl_t		rcq_hdl;	/* Rx completion queue */
1025 	ibt_wc_t		rx_wc[IBD_RC_MAX_CQ_WC];
1026 
1027 	ibd_rwqe_t		*rx_rwqes;	/* the chunk of all rwqes */
1028 	uint8_t			*rx_bufs;	/* the chunk of all Rx bufs */
1029 	ibt_mr_hdl_t		rx_mr_hdl;	/* ibt_mr_hdl_t for rx_bufs */
1030 	ibt_mr_desc_t		rx_mr_desc;	/* ibt_mr_desc_t for rx_bufs */
1031 
1032 	/* For chained receive */
1033 	kmutex_t		rx_lock;
1034 	mblk_t			*rx_mp;
1035 	mblk_t			*rx_mp_tail;
1036 	uint32_t		rx_mp_len;
1037 
1038 	uint32_t 		rcq_size;
1039 	uint32_t 		scq_size;
1040 	/*
1041 	 * We need two channels for each connection.
1042 	 * One channel for Tx; another channel for Rx.
1043 	 * If "is_tx_chan == B_TRUE", this is a Tx channel.
1044 	 */
1045 	boolean_t		is_tx_chan;
1046 } ibd_rc_chan_t;
1047 
1048 /*
1049  * The following functions are defined in "ibd.c".
1050  * They are also used by "ibd_cm.c"
1051  */
1052 void ibd_print_warn(ibd_state_t *, char *, ...);
1053 void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
1054 void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
1055 boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
1056 void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *);
1057 ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
1058 
1059 /*
1060  * The following functions are defined in "ibd_cm.c".
1061  * They are also used in "ibd.c".
1062  */
1063 void ibd_async_rc_process_too_big(ibd_state_t *, ibd_req_t *);
1064 void ibd_async_rc_close_act_chan(ibd_state_t *, ibd_req_t *);
1065 void ibd_async_rc_recycle_ace(ibd_state_t *, ibd_req_t *);
1066 
1067 /* Connection Setup/Close Functions */
1068 ibt_status_t ibd_rc_listen(ibd_state_t *);
1069 void ibd_rc_stop_listen(ibd_state_t *);
1070 ibt_status_t ibd_rc_connect(ibd_state_t *, ibd_ace_t *, ibt_path_info_t *,
1071     uint64_t);
1072 void ibd_rc_try_connect(ibd_state_t *, ibd_ace_t *,  ibt_path_info_t *);
1073 void ibd_rc_signal_act_close(ibd_state_t *, ibd_ace_t *);
1074 void ibd_rc_signal_ace_recycle(ibd_state_t *, ibd_ace_t *);
1075 void ibd_rc_close_all_chan(ibd_state_t *);
1076 
1077 /* Receive Functions */
1078 int ibd_rc_init_srq_list(ibd_state_t *);
1079 void ibd_rc_fini_srq_list(ibd_state_t *);
1080 int ibd_rc_repost_srq_free_list(ibd_state_t *);
1081 
1082 /* Send Functions */
1083 int ibd_rc_init_tx_largebuf_list(ibd_state_t *);
1084 void ibd_rc_fini_tx_largebuf_list(ibd_state_t *);
1085 ibd_swqe_t *ibd_rc_acquire_swqes(ibd_rc_chan_t *);
1086 void ibd_rc_post_send(ibd_rc_chan_t *, ibd_swqe_t *);
1087 void ibd_rc_drain_scq(ibd_rc_chan_t *, ibt_cq_hdl_t);
1088 void ibd_rc_tx_cleanup(ibd_swqe_t *);
1089 
1090 /* Others */
1091 void ibd_rc_get_conf(ibd_state_t *);
1092 int ibd_rc_init_stats(ibd_state_t *);
1093 
1094 #endif /* _KERNEL && !_BOOT */
1095 
1096 #ifdef __cplusplus
1097 }
1098 #endif
1099 
1100 #endif	/* _SYS_IB_CLIENTS_IBD_H */
1101