xref: /titanic_51/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c (revision 1a5e258f5471356ca102c7176637cdce45bac147)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 /*
28  * An implementation of the IPoIB-CM standard based on PSARC 2009/593.
29  */
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/modctl.h>
35 #include <sys/stropts.h>
36 #include <sys/stream.h>
37 #include <sys/strsun.h>
38 #include <sys/strsubr.h>
39 #include <sys/dlpi.h>
40 #include <sys/mac_provider.h>
41 
42 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
43 #include <sys/atomic.h>		/* for atomic_add*() */
44 #include <sys/ethernet.h>	/* for ETHERTYPE_IP */
45 #include <netinet/in.h>		/* for netinet/ip.h below */
46 #include <netinet/ip.h>		/* for struct ip */
47 #include <inet/common.h>	/* for inet/ip.h below */
48 #include <inet/ip.h>		/* for ipha_t */
49 #include <inet/ip_if.h>		/* for ETHERTYPE_IPV6 */
50 #include <inet/ip6.h>		/* for ip6_t */
51 #include <netinet/icmp6.h>	/* for icmp6_t */
52 
53 #include <sys/ib/clients/ibd/ibd.h>
54 
/* Driver-wide state; defined outside this file. */
extern ibd_global_state_t ibd_gstate;
/*
 * Interval, in seconds, between runs of the connection reaper
 * ibd_rc_conn_timeout_call(); defined outside this file.
 */
extern int ibd_rc_conn_timeout;
/*
 * When set to 1, ibd_rc_alloc_chan() registers a soft interrupt
 * (ibd_rc_tx_recycle) for every Tx channel, and ibd_rc_free_chan()
 * removes it again.
 */
uint_t ibd_rc_tx_softintr = 1;
/*
 * If the number of WRs in the receive queue of an RC connection is less than
 * IBD_RC_RX_WR_THRESHOLD, we will post more receive WRs into it.
 */
#define	IBD_RC_RX_WR_THRESHOLD		0x20

/*
 * If the number of free SWQEs (or large Tx bufs) is larger than or equal to
 * IBD_RC_TX_FREE_THRESH, we will call mac_tx_update to notify GLD to continue
 * transmitting packets.
 */
#define	IBD_RC_TX_FREE_THRESH		8

/*
 * Build a 64-bit service ID by OR-ing the low 24 bits of a QP number
 * into IBD_RC_SERVICE_ID.
 */
#define	IBD_RC_QPN_TO_SID(qpn) \
	((uint64_t)(IBD_RC_SERVICE_ID | ((qpn) & 0xffffff)))

/* For interop with legacy OFED: same layout, different service ID prefix */
#define	IBD_RC_QPN_TO_SID_OFED_INTEROP(qpn) \
	((uint64_t)(IBD_RC_SERVICE_ID_OFED_INTEROP | ((qpn) & 0xffffff)))

/*
 * Internet Header + 64 bits of Data Datagram. Refer to RFC 792.
 * Note: ibd_async_rc_process_too_big() adds this constant to a length
 * measured in bytes, so up to 64 bytes (not 64 bits) of payload beyond
 * the IP header(s) are returned in the ICMP message.
 */
#define	IBD_RC_IP_ICMP_RETURN_DATA_BYTES	64


/* Functions for Reliable Connected Mode */
/* Connection Setup/Close Functions */
static ibt_cm_status_t ibd_rc_dispatch_pass_mad(void *,
    ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
static ibt_cm_status_t ibd_rc_dispatch_actv_mad(void *,
    ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
static void ibd_rc_act_close(ibd_rc_chan_t *, boolean_t);

/* Channel list helpers; each one takes the list's chan_list_mutex itself */
static inline void ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *,
    ibd_rc_chan_t *);
static inline ibd_rc_chan_t *ibd_rc_rm_header_chan_list(
    ibd_rc_chan_list_t *);
static inline ibd_rc_chan_t *ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *,
    ibd_rc_chan_t *);

/* CQ handlers */
static void ibd_rc_rcq_handler(ibt_cq_hdl_t, void *);
static void ibd_rc_scq_handler(ibt_cq_hdl_t, void *);
static void ibd_rc_poll_rcq(ibd_rc_chan_t *, ibt_cq_hdl_t);

/* Receive Functions */
static int ibd_rc_post_srq(ibd_state_t *, ibd_rwqe_t *);
static void ibd_rc_srq_freemsg_cb(char *);
static void ibd_rc_srq_free_rwqe(ibd_state_t *, ibd_rwqe_t *);

static int ibd_rc_post_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
static void ibd_rc_freemsg_cb(char *);
static void ibd_rc_process_rx(ibd_rc_chan_t *, ibd_rwqe_t *, ibt_wc_t *);
static void ibd_rc_free_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
static void ibd_rc_fini_rxlist(ibd_rc_chan_t *);


/* Send Functions */
static void ibd_rc_release_swqe(ibd_rc_chan_t *, ibd_swqe_t *);
static int ibd_rc_init_txlist(ibd_rc_chan_t *);
static void ibd_rc_fini_txlist(ibd_rc_chan_t *);
static uint_t ibd_rc_tx_recycle(caddr_t);
119 
120 
121 void
122 ibd_async_rc_close_act_chan(ibd_state_t *state, ibd_req_t *req)
123 {
124 	ibd_rc_chan_t *rc_chan = req->rq_ptr;
125 	ibd_ace_t *ace;
126 
127 	while (rc_chan != NULL) {
128 		ace = rc_chan->ace;
129 		ASSERT(ace != NULL);
130 		/* Close old RC channel */
131 		ibd_rc_act_close(rc_chan, B_TRUE);
132 		mutex_enter(&state->id_ac_mutex);
133 		ASSERT(ace->ac_ref != 0);
134 		atomic_dec_32(&ace->ac_ref);
135 		ace->ac_chan = NULL;
136 		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
137 			IBD_ACACHE_INSERT_FREE(state, ace);
138 			ace->ac_ref = 0;
139 		} else {
140 			ace->ac_ref |= CYCLEVAL;
141 			state->rc_delay_ace_recycle++;
142 		}
143 		mutex_exit(&state->id_ac_mutex);
144 		rc_chan = ibd_rc_rm_header_chan_list(
145 		    &state->rc_obs_act_chan_list);
146 	}
147 }
148 
149 void
150 ibd_async_rc_recycle_ace(ibd_state_t *state, ibd_req_t *req)
151 {
152 	ibd_ace_t *ace = req->rq_ptr;
153 	ibd_rc_chan_t *rc_chan;
154 
155 	ASSERT(ace != NULL);
156 	rc_chan = ace->ac_chan;
157 	ASSERT(rc_chan != NULL);
158 	/* Close old RC channel */
159 	ibd_rc_act_close(rc_chan, B_TRUE);
160 	mutex_enter(&state->id_ac_mutex);
161 	ASSERT(ace->ac_ref != 0);
162 	atomic_dec_32(&ace->ac_ref);
163 	ace->ac_chan = NULL;
164 	if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
165 		IBD_ACACHE_INSERT_FREE(state, ace);
166 		ace->ac_ref = 0;
167 	} else {
168 		ace->ac_ref |= CYCLEVAL;
169 		state->rc_delay_ace_recycle++;
170 	}
171 	mutex_exit(&state->id_ac_mutex);
172 	mutex_enter(&state->rc_ace_recycle_lock);
173 	state->rc_ace_recycle = NULL;
174 	mutex_exit(&state->rc_ace_recycle_lock);
175 }
176 
/*
 * Simple ICMP IP Header Template.
 *
 * ibd_async_rc_process_too_big() copies this template into the packet it
 * builds and then fills in the source, destination, TTL, length and header
 * checksum. Only the first positional initializer (IP_SIMPLE_HDR_VERSION)
 * and the seventh (IPPROTO_ICMP) are non-zero; members without an
 * initializer are zero.
 */
static const ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};
181 
182 /* Packet is too big. Send ICMP packet to GLD to request a smaller MTU */
183 void
184 ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req)
185 {
186 	mblk_t *mp = req->rq_ptr;
187 	ibd_ace_t *ace = req->rq_ptr2;
188 	uint16_t mtu = state->id_mtu - IPOIB_HDRSIZE;
189 	uint_t	len_needed;
190 	size_t	msg_len;
191 	mblk_t	*pmtu_mp;
192 	ushort_t	sap;
193 	ib_header_info_t *ibha;	/* ib header for pmtu_pkt */
194 	/*
195 	 * ipha: IP header for pmtu_pkt
196 	 * old_ipha: IP header for old packet
197 	 */
198 	ipha_t *ipha, *old_ipha;
199 	icmph_t	*icmph;
200 
201 	sap = ntohs(((ipoib_hdr_t *)mp->b_rptr)->ipoib_type);
202 
203 	if (!pullupmsg(mp, -1)) {
204 		DPRINT(40, "ibd_async_rc_process_too_big: pullupmsg fail");
205 		goto too_big_fail;
206 	}
207 	/* move to IP header. */
208 	mp->b_rptr += IPOIB_HDRSIZE;
209 	old_ipha = (ipha_t *)mp->b_rptr;
210 
211 	len_needed = IPH_HDR_LENGTH(old_ipha);
212 	if (old_ipha->ipha_protocol == IPPROTO_ENCAP) {
213 		len_needed += IPH_HDR_LENGTH(((uchar_t *)old_ipha +
214 		    len_needed));
215 	} else if (old_ipha->ipha_protocol == IPPROTO_IPV6) {
216 		ip6_t *ip6h = (ip6_t *)((uchar_t *)old_ipha
217 		    + len_needed);
218 		len_needed += ip_hdr_length_v6(mp, ip6h);
219 	}
220 	len_needed += IBD_RC_IP_ICMP_RETURN_DATA_BYTES;
221 	msg_len = msgdsize(mp);
222 	if (msg_len > len_needed) {
223 		(void) adjmsg(mp, len_needed - msg_len);
224 		msg_len = len_needed;
225 	}
226 
227 	if ((pmtu_mp = allocb(sizeof (ib_header_info_t) + sizeof (ipha_t)
228 	    + sizeof (icmph_t), BPRI_MED)) == NULL) {
229 		DPRINT(40, "ibd_async_rc_process_too_big: allocb fail");
230 		goto too_big_fail;
231 	}
232 	pmtu_mp->b_cont = mp;
233 	pmtu_mp->b_wptr = pmtu_mp->b_rptr + sizeof (ib_header_info_t)
234 	    + sizeof (ipha_t) + sizeof (icmph_t);
235 
236 	ibha = (ib_header_info_t *)pmtu_mp->b_rptr;
237 
238 	/* Fill IB header */
239 	bcopy(&state->id_macaddr, &ibha->ib_dst, IPOIB_ADDRL);
240 	/*
241 	 * If the GRH is not valid, indicate to GLDv3 by setting
242 	 * the VerTcFlow field to 0.
243 	 */
244 	ibha->ib_grh.ipoib_vertcflow = 0;
245 	ibha->ipib_rhdr.ipoib_type = htons(sap);
246 	ibha->ipib_rhdr.ipoib_mbz = 0;
247 
248 	/* Fill IP header */
249 	ipha = (ipha_t *)&ibha[1];
250 	*ipha = icmp_ipha;
251 	ipha->ipha_src = old_ipha->ipha_dst;
252 	ipha->ipha_dst = old_ipha->ipha_src;
253 	ipha->ipha_ttl = old_ipha->ipha_ttl;
254 	msg_len += sizeof (icmp_ipha) + sizeof (icmph_t);
255 	if (msg_len > IP_MAXPACKET) {
256 		ibd_print_warn(state, "ibd_rc_process_too_big_pkt: msg_len(%d) "
257 		    "> IP_MAXPACKET", (uint32_t)msg_len);
258 		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
259 		msg_len = IP_MAXPACKET;
260 	}
261 	ipha->ipha_length = htons((uint16_t)msg_len);
262 	ipha->ipha_hdr_checksum = 0;
263 	ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
264 
265 	/* Fill ICMP body */
266 	icmph = (icmph_t *)&ipha[1];
267 	bzero(icmph, sizeof (icmph_t));
268 	icmph->icmph_type = ICMP_DEST_UNREACHABLE;
269 	icmph->icmph_code = ICMP_FRAGMENTATION_NEEDED;
270 	icmph->icmph_du_mtu = htons(mtu);
271 	icmph->icmph_checksum = 0;
272 	icmph->icmph_checksum = IP_CSUM(pmtu_mp,
273 	    (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0);
274 
275 	(void) hcksum_assoc(pmtu_mp, NULL, NULL, 0, 0, 0, 0,
276 	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
277 
278 	DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, "
279 	    "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d",
280 	    sap, ipha->ipha_src, ipha->ipha_dst, ipha->ipha_ttl,
281 	    len_needed, (uint32_t)msg_len);
282 
283 	mac_rx(state->id_mh, state->id_rh, pmtu_mp);
284 
285 	mutex_enter(&ace->tx_too_big_mutex);
286 	ace->tx_too_big_ongoing = B_FALSE;
287 	mutex_exit(&ace->tx_too_big_mutex);
288 	return;
289 
290 too_big_fail:
291 	/* Drop packet */
292 	freemsg(mp);
293 	mutex_enter(&ace->tx_too_big_mutex);
294 	ace->tx_too_big_ongoing = B_FALSE;
295 	mutex_exit(&ace->tx_too_big_mutex);
296 }
297 
298 /*
299  * Check all active/passive channels. If any ative/passive
300  * channel has not been used for a long time, close it.
301  */
302 void
303 ibd_rc_conn_timeout_call(void *carg)
304 {
305 	ibd_state_t *state = carg;
306 	ibd_ace_t *ace, *pre_ace;
307 	ibd_rc_chan_t *chan, *pre_chan, *next_chan;
308 	ibd_req_t *req;
309 
310 	/* Check all active channels. If chan->is_used == B_FALSE, close it */
311 	mutex_enter(&state->id_ac_mutex);
312 	ace = list_head(&state->id_ah_active);
313 	while ((pre_ace = ace) != NULL) {
314 		ace = list_next(&state->id_ah_active, ace);
315 		if (pre_ace->ac_chan != NULL) {
316 			chan = pre_ace->ac_chan;
317 			ASSERT(state->id_enable_rc == B_TRUE);
318 			if (chan->chan_state == IBD_RC_STATE_ACT_ESTAB) {
319 				if (chan->is_used == B_FALSE) {
320 					state->rc_timeout_act++;
321 					INC_REF(pre_ace, 1);
322 					IBD_ACACHE_PULLOUT_ACTIVE(state,
323 					    pre_ace);
324 					chan->chan_state =
325 					    IBD_RC_STATE_ACT_CLOSING;
326 					ibd_rc_signal_act_close(state, pre_ace);
327 				} else {
328 					chan->is_used = B_FALSE;
329 				}
330 			}
331 		}
332 	}
333 	mutex_exit(&state->id_ac_mutex);
334 
335 	/* Check all passive channels. If chan->is_used == B_FALSE, close it */
336 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
337 	next_chan = state->rc_pass_chan_list.chan_list;
338 	pre_chan = NULL;
339 	while ((chan = next_chan) != NULL) {
340 		next_chan = chan->next;
341 		if (chan->is_used == B_FALSE) {
342 			req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
343 			if (req != NULL) {
344 				/* remove it */
345 				state->rc_timeout_pas++;
346 				req->rq_ptr = chan;
347 				ibd_queue_work_slot(state, req,
348 				    IBD_ASYNC_RC_CLOSE_PAS_CHAN);
349 			} else {
350 				ibd_print_warn(state, "ibd_rc_conn_timeout: "
351 				    "alloc ibd_req_t fail");
352 				if (pre_chan == NULL) {
353 					state->rc_pass_chan_list.chan_list =
354 					    chan;
355 				} else {
356 					pre_chan->next = chan;
357 				}
358 				pre_chan = chan;
359 			}
360 		} else {
361 			if (pre_chan == NULL) {
362 				state->rc_pass_chan_list.chan_list = chan;
363 			} else {
364 				pre_chan->next = chan;
365 			}
366 			pre_chan = chan;
367 			chan->is_used = B_FALSE;
368 		}
369 	}
370 	if (pre_chan != NULL) {
371 		pre_chan->next = NULL;
372 	} else {
373 		state->rc_pass_chan_list.chan_list = NULL;
374 	}
375 	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
376 
377 	mutex_enter(&state->rc_timeout_lock);
378 	if (state->rc_timeout_start == B_TRUE) {
379 		state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state,
380 		    SEC_TO_TICK(ibd_rc_conn_timeout));
381 	}
382 	mutex_exit(&state->rc_timeout_lock);
383 }
384 
385 #ifdef DEBUG
386 /*
387  * ibd_rc_update_stats - update driver private kstat counters
388  *
389  * This routine will dump the internal statistics counters for ibd's
390  * Reliable Connected Mode. The current stats dump values will
391  * be sent to the kernel status area.
392  */
393 static int
394 ibd_rc_update_stats(kstat_t *ksp, int rw)
395 {
396 	ibd_state_t *state;
397 	ibd_rc_stat_t *ibd_rc_ksp;
398 
399 	if (rw == KSTAT_WRITE)
400 		return (EACCES);
401 
402 	state = (ibd_state_t *)ksp->ks_private;
403 	ASSERT(state != NULL);
404 	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
405 
406 	ibd_rc_ksp->rc_rcv_trans_byte.value.ul = state->rc_rcv_trans_byte;
407 	ibd_rc_ksp->rc_rcv_trans_pkt.value.ul = state->rc_rcv_trans_pkt;
408 	ibd_rc_ksp->rc_rcv_copy_byte.value.ul = state->rc_rcv_copy_byte;
409 	ibd_rc_ksp->rc_rcv_copy_pkt.value.ul = state->rc_rcv_copy_pkt;
410 	ibd_rc_ksp->rc_rcv_alloc_fail.value.ul = state->rc_rcv_alloc_fail;
411 
412 	ibd_rc_ksp->rc_rcq_err.value.ul = state->rc_rcq_err;
413 
414 	ibd_rc_ksp->rc_rwqe_short.value.ul = state->rc_rwqe_short;
415 
416 	ibd_rc_ksp->rc_xmt_bytes.value.ul = state->rc_xmt_bytes;
417 	ibd_rc_ksp->rc_xmt_small_pkt.value.ul = state->rc_xmt_small_pkt;
418 	ibd_rc_ksp->rc_xmt_fragmented_pkt.value.ul =
419 	    state->rc_xmt_fragmented_pkt;
420 	ibd_rc_ksp->rc_xmt_map_fail_pkt.value.ul = state->rc_xmt_map_fail_pkt;
421 	ibd_rc_ksp->rc_xmt_map_succ_pkt.value.ul = state->rc_xmt_map_succ_pkt;
422 	ibd_rc_ksp->rc_ace_not_found.value.ul = state->rc_ace_not_found;
423 
424 	ibd_rc_ksp->rc_scq_no_swqe.value.ul = state->rc_scq_no_swqe;
425 	ibd_rc_ksp->rc_scq_no_largebuf.value.ul = state->rc_scq_no_largebuf;
426 	ibd_rc_ksp->rc_swqe_short.value.ul = state->rc_swqe_short;
427 	ibd_rc_ksp->rc_swqe_mac_update.value.ul = state->rc_swqe_mac_update;
428 	ibd_rc_ksp->rc_xmt_buf_short.value.ul = state->rc_xmt_buf_short;
429 	ibd_rc_ksp->rc_xmt_buf_mac_update.value.ul =
430 	    state->rc_xmt_buf_mac_update;
431 
432 	ibd_rc_ksp->rc_conn_succ.value.ul = state->rc_conn_succ;
433 	ibd_rc_ksp->rc_conn_fail.value.ul = state->rc_conn_fail;
434 	ibd_rc_ksp->rc_null_conn.value.ul = state->rc_null_conn;
435 	ibd_rc_ksp->rc_no_estab_conn.value.ul = state->rc_no_estab_conn;
436 
437 	ibd_rc_ksp->rc_act_close.value.ul = state->rc_act_close;
438 	ibd_rc_ksp->rc_pas_close.value.ul = state->rc_pas_close;
439 	ibd_rc_ksp->rc_delay_ace_recycle.value.ul = state->rc_delay_ace_recycle;
440 	ibd_rc_ksp->rc_act_close_simultaneous.value.ul =
441 	    state->rc_act_close_simultaneous;
442 	ibd_rc_ksp->rc_reset_cnt.value.ul = state->rc_reset_cnt;
443 	ibd_rc_ksp->rc_timeout_act.value.ul = state->rc_timeout_act;
444 	ibd_rc_ksp->rc_timeout_pas.value.ul = state->rc_timeout_pas;
445 
446 	return (0);
447 }
448 
449 
450 /*
451  * ibd_rc_init_stats - initialize kstat data structures
452  *
453  * This routine will create and initialize the driver private
454  * statistics counters.
455  */
456 int
457 ibd_rc_init_stats(ibd_state_t *state)
458 {
459 	kstat_t *ksp;
460 	ibd_rc_stat_t *ibd_rc_ksp;
461 	char stat_name[KSTAT_STRLEN];
462 	int inst;
463 
464 	/*
465 	 * Create and init kstat
466 	 */
467 	inst = ddi_get_instance(state->id_dip);
468 	(void) snprintf(stat_name, KSTAT_STRLEN, "statistics%d_%x_%u", inst,
469 	    state->id_pkey, state->id_plinkid);
470 	ksp = kstat_create("ibd", 0, stat_name, "net", KSTAT_TYPE_NAMED,
471 	    sizeof (ibd_rc_stat_t) / sizeof (kstat_named_t), 0);
472 
473 	if (ksp == NULL) {
474 		ibd_print_warn(state, "ibd_rc_init_stats: Could not create "
475 		    "kernel statistics");
476 		return (DDI_FAILURE);
477 	}
478 
479 	state->rc_ksp = ksp;	/* Fill in the ksp of ibd over RC mode */
480 
481 	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
482 
483 	/*
484 	 * Initialize all the statistics
485 	 */
486 	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_byte, "RC: Rx Bytes, "
487 	    "transfer mode", KSTAT_DATA_ULONG);
488 	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_pkt, "RC: Rx Pkts, "
489 	    "transfer mode", KSTAT_DATA_ULONG);
490 	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_byte, "RC: Rx Bytes, "
491 	    "copy mode", KSTAT_DATA_ULONG);
492 	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_pkt, "RC: Rx Pkts, "
493 	    "copy mode", KSTAT_DATA_ULONG);
494 	kstat_named_init(&ibd_rc_ksp->rc_rcv_alloc_fail, "RC: Rx alloc fail",
495 	    KSTAT_DATA_ULONG);
496 
497 	kstat_named_init(&ibd_rc_ksp->rc_rcq_err, "RC: fail in Recv CQ handler",
498 	    KSTAT_DATA_ULONG);
499 
500 	kstat_named_init(&ibd_rc_ksp->rc_rwqe_short, "RC: Short rwqe",
501 	    KSTAT_DATA_ULONG);
502 
503 	kstat_named_init(&ibd_rc_ksp->rc_xmt_bytes, "RC: Sent Bytes",
504 	    KSTAT_DATA_ULONG);
505 	kstat_named_init(&ibd_rc_ksp->rc_xmt_small_pkt,
506 	    "RC: Tx pkt small size", KSTAT_DATA_ULONG);
507 	kstat_named_init(&ibd_rc_ksp->rc_xmt_fragmented_pkt,
508 	    "RC: Tx pkt fragmentary", KSTAT_DATA_ULONG);
509 	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_fail_pkt,
510 	    "RC: Tx pkt fail ibt_map_mem_iov()", KSTAT_DATA_ULONG);
511 	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_succ_pkt,
512 	    "RC: Tx pkt succ ibt_map_mem_iov()", KSTAT_DATA_ULONG);
513 	kstat_named_init(&ibd_rc_ksp->rc_ace_not_found, "RC: ace not found",
514 	    KSTAT_DATA_ULONG);
515 
516 	kstat_named_init(&ibd_rc_ksp->rc_scq_no_swqe, "RC: No swqe after "
517 	    "recycle", KSTAT_DATA_ULONG);
518 	kstat_named_init(&ibd_rc_ksp->rc_scq_no_largebuf, "RC: No large tx buf "
519 	    "after recycle", KSTAT_DATA_ULONG);
520 	kstat_named_init(&ibd_rc_ksp->rc_swqe_short, "RC: No swqe in ibd_send",
521 	    KSTAT_DATA_ULONG);
522 	kstat_named_init(&ibd_rc_ksp->rc_swqe_mac_update, "RC: mac_tx_update "
523 	    "#, swqe available", KSTAT_DATA_ULONG);
524 	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_short, "RC: No buf in "
525 	    "ibd_send", KSTAT_DATA_ULONG);
526 	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_mac_update, "RC: "
527 	    "mac_tx_update #, buf available", KSTAT_DATA_ULONG);
528 
529 	kstat_named_init(&ibd_rc_ksp->rc_conn_succ, "RC: succ connected",
530 	    KSTAT_DATA_ULONG);
531 	kstat_named_init(&ibd_rc_ksp->rc_conn_fail, "RC: fail connect",
532 	    KSTAT_DATA_ULONG);
533 	kstat_named_init(&ibd_rc_ksp->rc_null_conn, "RC: null conn for unicast "
534 	    "pkt", KSTAT_DATA_ULONG);
535 	kstat_named_init(&ibd_rc_ksp->rc_no_estab_conn, "RC: not in act estab "
536 	    "state", KSTAT_DATA_ULONG);
537 
538 	kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: call ibd_rc_act_close",
539 	    KSTAT_DATA_ULONG);
540 	kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: call ibd_rc_pas_close",
541 	    KSTAT_DATA_ULONG);
542 	kstat_named_init(&ibd_rc_ksp->rc_delay_ace_recycle, "RC: delay ace "
543 	    "recycle", KSTAT_DATA_ULONG);
544 	kstat_named_init(&ibd_rc_ksp->rc_act_close_simultaneous, "RC: "
545 	    "simultaneous ibd_rc_act_close", KSTAT_DATA_ULONG);
546 	kstat_named_init(&ibd_rc_ksp->rc_reset_cnt, "RC: Reset RC channel",
547 	    KSTAT_DATA_ULONG);
548 	kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: timeout act side",
549 	    KSTAT_DATA_ULONG);
550 	kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: timeout pas side",
551 	    KSTAT_DATA_ULONG);
552 
553 	/*
554 	 * Function to provide kernel stat update on demand
555 	 */
556 	ksp->ks_update = ibd_rc_update_stats;
557 
558 	/*
559 	 * Pointer into provider's raw statistics
560 	 */
561 	ksp->ks_private = (void *)state;
562 
563 	/*
564 	 * Add kstat to systems kstat chain
565 	 */
566 	kstat_install(ksp);
567 
568 	return (DDI_SUCCESS);
569 }
570 #endif
571 
572 static ibt_status_t
573 ibd_rc_alloc_chan(ibd_rc_chan_t **ret_chan, ibd_state_t *state,
574     boolean_t is_tx_chan)
575 {
576 	ibt_status_t result;
577 	ibd_rc_chan_t *chan;
578 	ibt_rc_chan_alloc_args_t alloc_args;
579 	ibt_chan_alloc_flags_t alloc_flags;
580 	ibt_chan_sizes_t sizes;
581 	ibt_cq_attr_t cq_atts;
582 	int rv;
583 
584 	chan = kmem_zalloc(sizeof (ibd_rc_chan_t), KM_SLEEP);
585 
586 	chan->state = state;
587 	mutex_init(&chan->rx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
588 	mutex_init(&chan->rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
589 	mutex_init(&chan->tx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
590 	mutex_init(&chan->tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
591 	mutex_init(&chan->tx_post_lock, NULL, MUTEX_DRIVER, NULL);
592 	mutex_init(&chan->tx_poll_lock, NULL, MUTEX_DRIVER, NULL);
593 
594 	/* Allocate IB structures for a new RC channel. */
595 	if (is_tx_chan) {
596 		chan->scq_size = state->id_rc_num_swqe;
597 		chan->rcq_size = IBD_RC_MIN_CQ_SIZE;
598 	} else {
599 		chan->scq_size = IBD_RC_MIN_CQ_SIZE;
600 		chan->rcq_size = state->id_rc_num_rwqe;
601 	}
602 	cq_atts.cq_size = chan->scq_size;
603 	cq_atts.cq_sched = NULL;
604 	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
605 	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->scq_hdl,
606 	    &chan->scq_size);
607 	if (result != IBT_SUCCESS) {
608 		DPRINT(40, "ibd_rc_alloc_chan: error <%d>"
609 		    "create scq completion queue (size <%d>)",
610 		    result, chan->scq_size);
611 		goto alloc_scq_err;
612 	}	/* if failure to alloc cq */
613 
614 	if (ibt_modify_cq(chan->scq_hdl, state->id_rc_tx_comp_count,
615 	    state->id_rc_tx_comp_usec, 0) != IBT_SUCCESS) {
616 		DPRINT(30, "ibd_rc_alloc_chan: Send CQ "
617 		    "interrupt moderation failed");
618 	}
619 
620 	ibt_set_cq_private(chan->scq_hdl, (void *) (uintptr_t)chan);
621 	ibt_set_cq_handler(chan->scq_hdl, ibd_rc_scq_handler,
622 	    (void *) (uintptr_t)chan);
623 
624 	cq_atts.cq_size = chan->rcq_size;
625 	cq_atts.cq_sched = NULL;
626 	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
627 	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->rcq_hdl,
628 	    &chan->rcq_size);
629 	if (result != IBT_SUCCESS) {
630 		ibd_print_warn(state, "ibd_rc_alloc_chan: error <%d> creating "
631 		    "rx completion queue (size <%d>)", result, chan->rcq_size);
632 		goto alloc_rcq_err;
633 	}	/* if failure to alloc cq */
634 
635 	if (ibt_modify_cq(chan->rcq_hdl, state->id_rc_rx_comp_count,
636 	    state->id_rc_rx_comp_usec, 0) != IBT_SUCCESS) {
637 		DPRINT(30, "ibd_rc_alloc_chan: Receive CQ "
638 		    "interrupt moderation failed");
639 	}
640 
641 	ibt_set_cq_private(chan->rcq_hdl, (void *) (uintptr_t)chan);
642 	ibt_set_cq_handler(chan->rcq_hdl, ibd_rc_rcq_handler,
643 	    (void *)(uintptr_t)chan);
644 
645 	if (is_tx_chan) {
646 		chan->is_tx_chan = B_TRUE;
647 		if (ibd_rc_init_txlist(chan) != DDI_SUCCESS) {
648 			ibd_print_warn(state, "ibd_rc_alloc_chan: "
649 			    "ibd_rc_init_txlist failed");
650 			goto init_txlist_err;
651 		}
652 		if (ibd_rc_tx_softintr == 1) {
653 			if ((rv = ddi_add_softintr(state->id_dip,
654 			    DDI_SOFTINT_LOW, &chan->scq_softintr, NULL, NULL,
655 			    ibd_rc_tx_recycle, (caddr_t)chan)) !=
656 			    DDI_SUCCESS) {
657 				DPRINT(10, "ibd_rc_alloc_chan: failed in "
658 				    "ddi_add_softintr(scq_softintr), ret=%d",
659 				    rv);
660 				goto alloc_softintr_err;
661 			}
662 		}
663 	} else {
664 		chan->is_tx_chan = B_FALSE;
665 	}
666 
667 	/*
668 	 * enable completions
669 	 */
670 	result = ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION);
671 	if (result != IBT_SUCCESS) {
672 		ibd_print_warn(state, "ibd_rc_alloc_chan: ibt_enable_cq_notify"
673 		    "(scq) failed: status %d\n", result);
674 		goto alloc_scq_enable_err;
675 	}
676 
677 	/* We will enable chan->rcq_hdl later. */
678 
679 	/* alloc a RC channel */
680 	bzero(&alloc_args, sizeof (ibt_rc_chan_alloc_args_t));
681 	bzero(&sizes, sizeof (ibt_chan_sizes_t));
682 
683 	alloc_args.rc_flags = IBT_WR_SIGNALED;
684 	alloc_args.rc_control = IBT_CEP_NO_FLAGS;
685 
686 	alloc_args.rc_scq = chan->scq_hdl;
687 	alloc_args.rc_rcq = chan->rcq_hdl;
688 	alloc_args.rc_pd = state->id_pd_hdl;
689 
690 	alloc_args.rc_hca_port_num = state->id_port;
691 	alloc_args.rc_clone_chan = NULL;
692 
693 	/* scatter/gather */
694 	alloc_args.rc_sizes.cs_sq_sgl = state->rc_tx_max_sqseg;
695 
696 	/*
697 	 * For the number of SGL elements in receive side, I think it
698 	 * should be 1. Because ibd driver allocates a whole block memory
699 	 * for each ibt_post_recv().
700 	 */
701 	alloc_args.rc_sizes.cs_rq_sgl = 1;
702 
703 	/* The send queue size and the receive queue size */
704 	alloc_args.rc_sizes.cs_sq = chan->scq_size;
705 	alloc_args.rc_sizes.cs_rq = chan->rcq_size;
706 
707 	if (state->id_hca_res_lkey_capab) {
708 		alloc_args.rc_flags = IBT_FAST_REG_RES_LKEY;
709 	} else {
710 		DPRINT(40, "ibd_rc_alloc_chan: not support reserved lkey");
711 	}
712 
713 	if (state->rc_enable_srq) {
714 		alloc_flags = IBT_ACHAN_USES_SRQ;
715 		alloc_args.rc_srq = state->rc_srq_hdl;
716 	} else {
717 		alloc_flags = IBT_ACHAN_NO_FLAGS;
718 	}
719 
720 	result = ibt_alloc_rc_channel(state->id_hca_hdl,
721 	    alloc_flags, &alloc_args, &chan->chan_hdl, &sizes);
722 	if (result != IBT_SUCCESS) {
723 		ibd_print_warn(state, "ibd_rc_alloc_chan: ibd_rc_open_channel"
724 		    " fail:<%d>", result);
725 		goto alloc_scq_enable_err;
726 	}
727 
728 	if (is_tx_chan)
729 		atomic_inc_32(&state->rc_num_tx_chan);
730 	else
731 		atomic_inc_32(&state->rc_num_rx_chan);
732 
733 	/* For the connection reaper routine ibd_rc_conn_timeout_call() */
734 	chan->is_used = B_TRUE;
735 
736 	*ret_chan = chan;
737 	return (IBT_SUCCESS);
738 
739 alloc_scq_enable_err:
740 	if (is_tx_chan) {
741 		if (ibd_rc_tx_softintr == 1) {
742 			ddi_remove_softintr(chan->scq_softintr);
743 		}
744 	}
745 alloc_softintr_err:
746 	if (is_tx_chan) {
747 		ibd_rc_fini_txlist(chan);
748 	}
749 init_txlist_err:
750 	(void) ibt_free_cq(chan->rcq_hdl);
751 alloc_rcq_err:
752 	(void) ibt_free_cq(chan->scq_hdl);
753 alloc_scq_err:
754 	mutex_destroy(&chan->tx_poll_lock);
755 	mutex_destroy(&chan->tx_post_lock);
756 	mutex_destroy(&chan->tx_rel_list.dl_mutex);
757 	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
758 	mutex_destroy(&chan->rx_free_list.dl_mutex);
759 	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
760 	kmem_free(chan, sizeof (ibd_rc_chan_t));
761 	return (result);
762 }
763 
764 static void
765 ibd_rc_free_chan(ibd_rc_chan_t *chan)
766 {
767 	ibt_status_t ret;
768 
769 	/* DPRINT(30, "ibd_rc_free_chan: chan=%p", chan); */
770 
771 	if (chan->chan_hdl != NULL) {
772 		ret = ibt_free_channel(chan->chan_hdl);
773 		if (ret != IBT_SUCCESS) {
774 			DPRINT(40, "ib_rc_free_chan: ibt_free_channel failed, "
775 			    "chan=%p, returned: %d", chan, ret);
776 			return;
777 		}
778 		chan->chan_hdl = NULL;
779 	}
780 
781 	if (chan->rcq_hdl != NULL) {
782 		ret = ibt_free_cq(chan->rcq_hdl);
783 		if (ret != IBT_SUCCESS) {
784 			DPRINT(40, "ib_rc_free_chan: ibt_free_cq(rcq) failed, "
785 			    "chan=%p, returned: %d", chan, ret);
786 			return;
787 		}
788 		chan->rcq_hdl = NULL;
789 	}
790 
791 	if (chan->scq_hdl != NULL) {
792 		ret = ibt_free_cq(chan->scq_hdl);
793 		if (ret != IBT_SUCCESS) {
794 			DPRINT(40, "ib_rc_free_chan: ibt_free_cq(scq) failed, "
795 			    "chan=%p, returned: %d", chan, ret);
796 			return;
797 		}
798 		chan->scq_hdl = NULL;
799 	}
800 
801 	/* Free buffers */
802 	if (chan->is_tx_chan) {
803 		ibd_rc_fini_txlist(chan);
804 		if (ibd_rc_tx_softintr == 1) {
805 			ddi_remove_softintr(chan->scq_softintr);
806 		}
807 		atomic_dec_32(&chan->state->rc_num_tx_chan);
808 	} else {
809 		if (!chan->state->rc_enable_srq) {
810 			ibd_rc_fini_rxlist(chan);
811 		}
812 		atomic_dec_32(&chan->state->rc_num_rx_chan);
813 	}
814 
815 	mutex_destroy(&chan->tx_poll_lock);
816 	mutex_destroy(&chan->tx_post_lock);
817 	mutex_destroy(&chan->tx_rel_list.dl_mutex);
818 	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
819 	mutex_destroy(&chan->rx_free_list.dl_mutex);
820 	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
821 
822 	/*
823 	 * If it is a passive channel, must make sure it has been removed
824 	 * from chan->state->rc_pass_chan_list
825 	 */
826 	kmem_free(chan, sizeof (ibd_rc_chan_t));
827 }
828 
829 /* Add a RC channel */
830 static inline void
831 ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
832 {
833 	mutex_enter(&list->chan_list_mutex);
834 	if (list->chan_list == NULL) {
835 		list->chan_list = chan;
836 		chan->next = NULL;
837 	} else {
838 		chan->next = list->chan_list;
839 		list->chan_list = chan;
840 	}
841 	mutex_exit(&list->chan_list_mutex);
842 }
843 
844 static boolean_t
845 ibd_rc_re_add_to_pas_chan_list(ibd_rc_chan_t *chan)
846 {
847 	ibd_state_t *state = chan->state;
848 
849 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
850 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0) {
851 		mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
852 		return (B_FALSE);
853 	} else {
854 		if (state->rc_pass_chan_list.chan_list == NULL) {
855 			state->rc_pass_chan_list.chan_list = chan;
856 			chan->next = NULL;
857 		} else {
858 			chan->next = state->rc_pass_chan_list.chan_list;
859 			state->rc_pass_chan_list.chan_list = chan;
860 		}
861 		mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
862 		return (B_TRUE);
863 	}
864 }
865 
866 /* Remove a RC channel */
867 static inline ibd_rc_chan_t *
868 ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
869 {
870 	ibd_rc_chan_t *pre_chan;
871 
872 	mutex_enter(&list->chan_list_mutex);
873 	if (list->chan_list == chan) {
874 		DPRINT(30, "ibd_rc_rm_from_chan_list(first): found chan(%p)"
875 		    " in chan_list", chan);
876 		list->chan_list = chan->next;
877 	} else {
878 		pre_chan = list->chan_list;
879 		while (pre_chan != NULL) {
880 			if (pre_chan->next == chan) {
881 				DPRINT(30, "ibd_rc_rm_from_chan_list"
882 				    "(middle): found chan(%p)", chan);
883 				pre_chan->next = chan->next;
884 				break;
885 			}
886 			pre_chan = pre_chan->next;
887 		}
888 		if (pre_chan == NULL)
889 			chan = NULL;
890 	}
891 	mutex_exit(&list->chan_list_mutex);
892 	return (chan);
893 }
894 
895 static inline ibd_rc_chan_t *
896 ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t *list)
897 {
898 	ibd_rc_chan_t *rc_chan;
899 
900 	mutex_enter(&list->chan_list_mutex);
901 	rc_chan = list->chan_list;
902 	if (rc_chan != NULL) {
903 		list->chan_list = rc_chan->next;
904 	}
905 	mutex_exit(&list->chan_list_mutex);
906 	return (rc_chan);
907 }
908 
909 static int
910 ibd_rc_alloc_srq_copybufs(ibd_state_t *state)
911 {
912 	ibt_mr_attr_t mem_attr;
913 	uint_t rc_rx_bufs_sz;
914 
915 	/*
916 	 * Allocate one big chunk for all regular rx copy bufs
917 	 */
918 	rc_rx_bufs_sz =  (state->rc_mtu + IPOIB_GRH_SIZE) * state->rc_srq_size;
919 
920 	state->rc_srq_rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);
921 
922 	state->rc_srq_rwqes = kmem_zalloc(state->rc_srq_size *
923 	    sizeof (ibd_rwqe_t), KM_SLEEP);
924 
925 	/*
926 	 * Do one memory registration on the entire rxbuf area
927 	 */
928 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_srq_rx_bufs;
929 	mem_attr.mr_len = rc_rx_bufs_sz;
930 	mem_attr.mr_as = NULL;
931 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
932 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
933 	    &state->rc_srq_rx_mr_hdl, &state->rc_srq_rx_mr_desc)
934 	    != IBT_SUCCESS) {
935 		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr() "
936 		    "failed");
937 		kmem_free(state->rc_srq_rwqes,
938 		    state->rc_srq_size * sizeof (ibd_rwqe_t));
939 		kmem_free(state->rc_srq_rx_bufs, rc_rx_bufs_sz);
940 		state->rc_srq_rx_bufs = NULL;
941 		state->rc_srq_rwqes = NULL;
942 		return (DDI_FAILURE);
943 	}
944 
945 	return (DDI_SUCCESS);
946 }
947 
948 static void
949 ibd_rc_free_srq_copybufs(ibd_state_t *state)
950 {
951 	uint_t rc_rx_buf_sz;
952 
953 	/*
954 	 * Don't change the value of state->rc_mtu at the period from call
955 	 * ibd_rc_alloc_srq_copybufs() to call ibd_rc_free_srq_copybufs().
956 	 */
957 	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
958 
959 	/*
960 	 * Unregister rxbuf mr
961 	 */
962 	if (ibt_deregister_mr(state->id_hca_hdl,
963 	    state->rc_srq_rx_mr_hdl) != IBT_SUCCESS) {
964 		DPRINT(40, "ibd_rc_free_srq_copybufs: ibt_deregister_mr()"
965 		    " failed");
966 	}
967 	state->rc_srq_rx_mr_hdl = NULL;
968 
969 	/*
970 	 * Free rxbuf memory
971 	 */
972 	kmem_free(state->rc_srq_rwqes,
973 	    state->rc_srq_size * sizeof (ibd_rwqe_t));
974 	kmem_free(state->rc_srq_rx_bufs, state->rc_srq_size * rc_rx_buf_sz);
975 	state->rc_srq_rwqes = NULL;
976 	state->rc_srq_rx_bufs = NULL;
977 }
978 
979 /*
980  * Allocate and post a certain number of SRQ receive buffers and WRs.
981  */
982 int
983 ibd_rc_init_srq_list(ibd_state_t *state)
984 {
985 	ibd_rwqe_t *rwqe;
986 	ibt_lkey_t lkey;
987 	int i;
988 	uint_t len;
989 	uint8_t *bufaddr;
990 	ibt_srq_sizes_t srq_sizes;
991 	ibt_srq_sizes_t	 srq_real_sizes;
992 	ibt_status_t ret;
993 
994 	srq_sizes.srq_sgl_sz = 1;
995 	srq_sizes.srq_wr_sz = state->id_rc_num_srq;
996 	ret = ibt_alloc_srq(state->id_hca_hdl, IBT_SRQ_NO_FLAGS,
997 	    state->id_pd_hdl, &srq_sizes, &state->rc_srq_hdl, &srq_real_sizes);
998 	if (ret != IBT_SUCCESS) {
999 		/*
1000 		 * The following code is for CR 6932460 (can't configure ibd
1001 		 * interface on 32 bits x86 systems). 32 bits x86 system has
1002 		 * less memory resource than 64 bits x86 system. If current
1003 		 * resource request can't be satisfied, we request less
1004 		 * resource here.
1005 		 */
1006 		len = state->id_rc_num_srq;
1007 		while ((ret == IBT_HCA_WR_EXCEEDED) &&
1008 		    (len >= 2 * IBD_RC_MIN_CQ_SIZE)) {
1009 			len = len/2;
1010 			srq_sizes.srq_sgl_sz = 1;
1011 			srq_sizes.srq_wr_sz = len;
1012 			ret = ibt_alloc_srq(state->id_hca_hdl,
1013 			    IBT_SRQ_NO_FLAGS, state->id_pd_hdl, &srq_sizes,
1014 			    &state->rc_srq_hdl, &srq_real_sizes);
1015 		}
1016 		if (ret != IBT_SUCCESS) {
1017 			DPRINT(10, "ibd_rc_init_srq_list: ibt_alloc_srq failed."
1018 			    "req_sgl_sz=%d, req_wr_sz=0x%x, final_req_wr_sz="
1019 			    "0x%x, ret=%d", srq_sizes.srq_sgl_sz,
1020 			    srq_sizes.srq_wr_sz, len, ret);
1021 			return (DDI_FAILURE);
1022 		}
1023 		state->id_rc_num_srq = len;
1024 		state->id_rc_num_rwqe = state->id_rc_num_srq + 1;
1025 	}
1026 
1027 	state->rc_srq_size = srq_real_sizes.srq_wr_sz;
1028 	if (ibd_rc_alloc_srq_copybufs(state) != DDI_SUCCESS) {
1029 		ret = ibt_free_srq(state->rc_srq_hdl);
1030 		if (ret != IBT_SUCCESS) {
1031 			ibd_print_warn(state, "ibd_rc_init_srq_list: "
1032 			    "ibt_free_srq fail, ret=%d", ret);
1033 		}
1034 		return (DDI_FAILURE);
1035 	}
1036 
1037 	/*
1038 	 * Allocate and setup the rwqe list
1039 	 */
1040 	lkey = state->rc_srq_rx_mr_desc.md_lkey;
1041 	rwqe = state->rc_srq_rwqes;
1042 	bufaddr = state->rc_srq_rx_bufs;
1043 	len = state->rc_mtu + IPOIB_GRH_SIZE;
1044 	state->rc_srq_rwqe_list.dl_cnt = 0;
1045 	state->rc_srq_rwqe_list.dl_bufs_outstanding = 0;
1046 	for (i = 0; i < state->rc_srq_size; i++, rwqe++, bufaddr += len) {
1047 		rwqe->w_state = state;
1048 		rwqe->w_freeing_wqe = B_FALSE;
1049 		rwqe->w_freemsg_cb.free_func = ibd_rc_srq_freemsg_cb;
1050 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1051 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1052 
1053 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1054 		    &rwqe->w_freemsg_cb)) == NULL) {
1055 			DPRINT(40, "ibd_rc_init_srq_list : desballoc() failed");
1056 			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1057 			if (atomic_dec_32_nv(&state->id_running) != 0) {
1058 				cmn_err(CE_WARN, "ibd_rc_init_srq_list: "
1059 				    "id_running was not 1\n");
1060 			}
1061 			ibd_rc_fini_srq_list(state);
1062 			atomic_inc_32(&state->id_running);
1063 			return (DDI_FAILURE);
1064 		}
1065 
1066 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1067 		/* Leave IPOIB_GRH_SIZE space */
1068 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
1069 		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1070 		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1071 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1072 		rwqe->w_rwr.wr_nds = 1;
1073 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1074 		(void) ibd_rc_post_srq(state, rwqe);
1075 	}
1076 
1077 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1078 	state->rc_srq_free_list.dl_head = NULL;
1079 	state->rc_srq_free_list.dl_cnt = 0;
1080 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1081 
1082 	return (DDI_SUCCESS);
1083 }
1084 
1085 /*
1086  * Free the statically allocated Rx buffer list for SRQ.
1087  */
1088 void
1089 ibd_rc_fini_srq_list(ibd_state_t *state)
1090 {
1091 	ibd_rwqe_t *rwqe;
1092 	int i;
1093 	ibt_status_t ret;
1094 
1095 	ASSERT(state->id_running == 0);
1096 	ret = ibt_free_srq(state->rc_srq_hdl);
1097 	if (ret != IBT_SUCCESS) {
1098 		ibd_print_warn(state, "ibd_rc_fini_srq_list: "
1099 		    "ibt_free_srq fail, ret=%d", ret);
1100 	}
1101 
1102 	mutex_enter(&state->rc_srq_rwqe_list.dl_mutex);
1103 	rwqe = state->rc_srq_rwqes;
1104 	for (i = 0; i < state->rc_srq_size; i++, rwqe++) {
1105 		if (rwqe->rwqe_im_mblk != NULL) {
1106 			rwqe->w_freeing_wqe = B_TRUE;
1107 			freemsg(rwqe->rwqe_im_mblk);
1108 		}
1109 	}
1110 	mutex_exit(&state->rc_srq_rwqe_list.dl_mutex);
1111 
1112 	ibd_rc_free_srq_copybufs(state);
1113 }
1114 
1115 /* Repost the elements in state->ib_rc_free_list */
1116 int
1117 ibd_rc_repost_srq_free_list(ibd_state_t *state)
1118 {
1119 	ibd_rwqe_t *rwqe;
1120 	ibd_wqe_t *list;
1121 	uint_t len;
1122 
1123 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1124 	if (state->rc_srq_free_list.dl_head != NULL) {
1125 		/* repost them */
1126 		len = state->rc_mtu + IPOIB_GRH_SIZE;
1127 		list = state->rc_srq_free_list.dl_head;
1128 		state->rc_srq_free_list.dl_head = NULL;
1129 		state->rc_srq_free_list.dl_cnt = 0;
1130 		mutex_exit(&state->rc_srq_free_list.dl_mutex);
1131 		while (list != NULL) {
1132 			rwqe = WQE_TO_RWQE(list);
1133 			if ((rwqe->rwqe_im_mblk == NULL) &&
1134 			    ((rwqe->rwqe_im_mblk = desballoc(
1135 			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
1136 			    &rwqe->w_freemsg_cb)) == NULL)) {
1137 				DPRINT(40, "ibd_rc_repost_srq_free_list: "
1138 				    "failed in desballoc()");
1139 				do {
1140 					ibd_rc_srq_free_rwqe(state, rwqe);
1141 					list = list->w_next;
1142 					rwqe = WQE_TO_RWQE(list);
1143 				} while (list != NULL);
1144 				return (DDI_FAILURE);
1145 			}
1146 			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1147 				ibd_rc_srq_free_rwqe(state, rwqe);
1148 			}
1149 			list = list->w_next;
1150 		}
1151 		return (DDI_SUCCESS);
1152 	}
1153 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1154 	return (DDI_SUCCESS);
1155 }
1156 
1157 /*
1158  * Free an allocated recv wqe.
1159  */
1160 static void
1161 ibd_rc_srq_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
1162 {
1163 	/*
1164 	 * desballoc() failed (no memory) or the posting of rwqe failed.
1165 	 *
1166 	 * This rwqe is placed on a free list so that it
1167 	 * can be reinstated in future.
1168 	 *
1169 	 * NOTE: no code currently exists to reinstate
1170 	 * these "lost" rwqes.
1171 	 */
1172 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1173 	state->rc_srq_free_list.dl_cnt++;
1174 	rwqe->rwqe_next = state->rc_srq_free_list.dl_head;
1175 	state->rc_srq_free_list.dl_head = RWQE_TO_WQE(rwqe);
1176 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1177 }
1178 
1179 static void
1180 ibd_rc_srq_freemsg_cb(char *arg)
1181 {
1182 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1183 	ibd_state_t *state = rwqe->w_state;
1184 
1185 	ASSERT(state->rc_enable_srq);
1186 
1187 	/*
1188 	 * If the driver is stopped, just free the rwqe.
1189 	 */
1190 	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
1191 		if (!rwqe->w_freeing_wqe) {
1192 			atomic_dec_32(
1193 			    &state->rc_srq_rwqe_list.dl_bufs_outstanding);
1194 			DPRINT(6, "ibd_rc_srq_freemsg_cb: wqe being freed");
1195 			rwqe->rwqe_im_mblk = NULL;
1196 			ibd_rc_srq_free_rwqe(state, rwqe);
1197 		}
1198 		return;
1199 	}
1200 
1201 	atomic_dec_32(&state->rc_srq_rwqe_list.dl_bufs_outstanding);
1202 
1203 	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
1204 	ASSERT(!rwqe->w_freeing_wqe);
1205 
1206 	/*
1207 	 * Upper layer has released held mblk, so we have
1208 	 * no more use for keeping the old pointer in
1209 	 * our rwqe.
1210 	 */
1211 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1212 	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1213 	if (rwqe->rwqe_im_mblk == NULL) {
1214 		DPRINT(40, "ibd_rc_srq_freemsg_cb: desballoc failed");
1215 		ibd_rc_srq_free_rwqe(state, rwqe);
1216 		return;
1217 	}
1218 
1219 	if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1220 		ibd_print_warn(state, "ibd_rc_srq_freemsg_cb: ibd_rc_post_srq"
1221 		    " failed");
1222 		ibd_rc_srq_free_rwqe(state, rwqe);
1223 		return;
1224 	}
1225 }
1226 
1227 /*
1228  * Post a rwqe to the hardware and add it to the Rx list.
1229  */
1230 static int
1231 ibd_rc_post_srq(ibd_state_t *state, ibd_rwqe_t *rwqe)
1232 {
1233 	/*
1234 	 * Here we should add dl_cnt before post recv, because
1235 	 * we would have to make sure dl_cnt is updated before
1236 	 * the corresponding ibd_rc_process_rx() is called.
1237 	 */
1238 	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
1239 	atomic_inc_32(&state->rc_srq_rwqe_list.dl_cnt);
1240 	if (ibt_post_srq(state->rc_srq_hdl, &rwqe->w_rwr, 1, NULL) !=
1241 	    IBT_SUCCESS) {
1242 		atomic_dec_32(&state->rc_srq_rwqe_list.dl_cnt);
1243 		DPRINT(40, "ibd_rc_post_srq : ibt_post_srq() failed");
1244 		return (DDI_FAILURE);
1245 	}
1246 
1247 	return (DDI_SUCCESS);
1248 }
1249 
1250 /*
1251  * Post a rwqe to the hardware and add it to the Rx list.
1252  */
1253 static int
1254 ibd_rc_post_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1255 {
1256 	/*
1257 	 * Here we should add dl_cnt before post recv, because we would
1258 	 * have to make sure dl_cnt has already updated before
1259 	 * corresponding ibd_rc_process_rx() is called.
1260 	 */
1261 	atomic_inc_32(&chan->rx_wqe_list.dl_cnt);
1262 	if (ibt_post_recv(chan->chan_hdl, &rwqe->w_rwr, 1, NULL) !=
1263 	    IBT_SUCCESS) {
1264 		atomic_dec_32(&chan->rx_wqe_list.dl_cnt);
1265 		DPRINT(40, "ibd_rc_post_rwqe : failed in ibt_post_recv()");
1266 		return (DDI_FAILURE);
1267 	}
1268 	return (DDI_SUCCESS);
1269 }
1270 
1271 static int
1272 ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t *chan)
1273 {
1274 	ibd_state_t *state = chan->state;
1275 	ibt_mr_attr_t mem_attr;
1276 	uint_t rc_rx_bufs_sz;
1277 
1278 	/*
1279 	 * Allocate one big chunk for all regular rx copy bufs
1280 	 */
1281 	rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * chan->rcq_size;
1282 
1283 	chan->rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);
1284 
1285 	chan->rx_rwqes = kmem_zalloc(chan->rcq_size *
1286 	    sizeof (ibd_rwqe_t), KM_SLEEP);
1287 
1288 	/*
1289 	 * Do one memory registration on the entire rxbuf area
1290 	 */
1291 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->rx_bufs;
1292 	mem_attr.mr_len = rc_rx_bufs_sz;
1293 	mem_attr.mr_as = NULL;
1294 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
1295 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1296 	    &chan->rx_mr_hdl, &chan->rx_mr_desc) != IBT_SUCCESS) {
1297 		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr failed");
1298 		kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1299 		kmem_free(chan->rx_bufs, rc_rx_bufs_sz);
1300 		chan->rx_bufs = NULL;
1301 		chan->rx_rwqes = NULL;
1302 		return (DDI_FAILURE);
1303 	}
1304 
1305 	return (DDI_SUCCESS);
1306 }
1307 
1308 static void
1309 ibd_rc_free_rx_copybufs(ibd_rc_chan_t *chan)
1310 {
1311 	ibd_state_t *state = chan->state;
1312 	uint_t rc_rx_buf_sz;
1313 
1314 	ASSERT(!state->rc_enable_srq);
1315 	ASSERT(chan->rx_rwqes != NULL);
1316 	ASSERT(chan->rx_bufs != NULL);
1317 
1318 	/*
1319 	 * Don't change the value of state->rc_mtu at the period from call
1320 	 * ibd_rc_alloc_rx_copybufs() to call ibd_rc_free_rx_copybufs().
1321 	 */
1322 	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
1323 
1324 	/*
1325 	 * Unregister rxbuf mr
1326 	 */
1327 	if (ibt_deregister_mr(state->id_hca_hdl,
1328 	    chan->rx_mr_hdl) != IBT_SUCCESS) {
1329 		DPRINT(40, "ibd_rc_free_rx_copybufs: ibt_deregister_mr failed");
1330 	}
1331 	chan->rx_mr_hdl = NULL;
1332 
1333 	/*
1334 	 * Free rxbuf memory
1335 	 */
1336 	kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1337 	chan->rx_rwqes = NULL;
1338 
1339 	kmem_free(chan->rx_bufs, chan->rcq_size * rc_rx_buf_sz);
1340 	chan->rx_bufs = NULL;
1341 }
1342 
1343 /*
1344  * Post a certain number of receive buffers and WRs on a RC channel.
1345  */
1346 static int
1347 ibd_rc_init_rxlist(ibd_rc_chan_t *chan)
1348 {
1349 	ibd_state_t *state = chan->state;
1350 	ibd_rwqe_t *rwqe;
1351 	ibt_lkey_t lkey;
1352 	int i;
1353 	uint_t len;
1354 	uint8_t *bufaddr;
1355 
1356 	ASSERT(!state->rc_enable_srq);
1357 	if (ibd_rc_alloc_rx_copybufs(chan) != DDI_SUCCESS)
1358 		return (DDI_FAILURE);
1359 
1360 	/*
1361 	 * Allocate and setup the rwqe list
1362 	 */
1363 	lkey = chan->rx_mr_desc.md_lkey;
1364 	rwqe = chan->rx_rwqes;
1365 	bufaddr = chan->rx_bufs;
1366 	len = state->rc_mtu + IPOIB_GRH_SIZE;
1367 	for (i = 0; i < chan->rcq_size; i++, rwqe++, bufaddr += len) {
1368 		rwqe->w_state = state;
1369 		rwqe->w_chan = chan;
1370 		rwqe->w_freeing_wqe = B_FALSE;
1371 		rwqe->w_freemsg_cb.free_func = ibd_rc_freemsg_cb;
1372 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1373 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1374 
1375 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1376 		    &rwqe->w_freemsg_cb)) == NULL) {
1377 			DPRINT(40, "ibd_rc_init_srq_list: desballoc() failed");
1378 			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1379 			ibd_rc_fini_rxlist(chan);
1380 			return (DDI_FAILURE);
1381 		}
1382 
1383 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1384 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
1385 		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1386 		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1387 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1388 		rwqe->w_rwr.wr_nds = 1;
1389 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1390 		(void) ibd_rc_post_rwqe(chan, rwqe);
1391 	}
1392 
1393 	return (DDI_SUCCESS);
1394 }
1395 
1396 /*
1397  * Free the statically allocated Rx buffer list for SRQ.
1398  */
1399 static void
1400 ibd_rc_fini_rxlist(ibd_rc_chan_t *chan)
1401 {
1402 	ibd_rwqe_t *rwqe;
1403 	int i;
1404 
1405 	if (chan->rx_bufs == NULL) {
1406 		DPRINT(40, "ibd_rc_fini_rxlist: empty chan->rx_bufs, quit");
1407 		return;
1408 	}
1409 
1410 	/* bufs_outstanding must be 0 */
1411 	ASSERT((chan->rx_wqe_list.dl_head == NULL) ||
1412 	    (chan->rx_wqe_list.dl_bufs_outstanding == 0));
1413 
1414 	mutex_enter(&chan->rx_wqe_list.dl_mutex);
1415 	rwqe = chan->rx_rwqes;
1416 	for (i = 0; i < chan->rcq_size; i++, rwqe++) {
1417 		if (rwqe->rwqe_im_mblk != NULL) {
1418 			rwqe->w_freeing_wqe = B_TRUE;
1419 			freemsg(rwqe->rwqe_im_mblk);
1420 		}
1421 	}
1422 	mutex_exit(&chan->rx_wqe_list.dl_mutex);
1423 
1424 	ibd_rc_free_rx_copybufs(chan);
1425 }
1426 
1427 /*
1428  * Free an allocated recv wqe.
1429  */
1430 static void
1431 ibd_rc_free_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1432 {
1433 	/*
1434 	 * desballoc() failed (no memory) or the posting of rwqe failed.
1435 	 *
1436 	 * This rwqe is placed on a free list so that it
1437 	 * can be reinstated in future.
1438 	 *
1439 	 * NOTE: no code currently exists to reinstate
1440 	 * these "lost" rwqes.
1441 	 */
1442 	mutex_enter(&chan->rx_free_list.dl_mutex);
1443 	chan->rx_free_list.dl_cnt++;
1444 	rwqe->rwqe_next = chan->rx_free_list.dl_head;
1445 	chan->rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
1446 	mutex_exit(&chan->rx_free_list.dl_mutex);
1447 }
1448 
1449 /*
1450  * Processing to be done after receipt of a packet; hand off to GLD
1451  * in the format expected by GLD.
1452  */
1453 static void
1454 ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
1455 {
1456 	ibd_state_t *state = chan->state;
1457 	ib_header_info_t *phdr;
1458 	ipoib_hdr_t *ipibp;
1459 	mblk_t *mp;
1460 	mblk_t *mpc;
1461 	int rxcnt;
1462 	ip6_t *ip6h;
1463 	int len;
1464 
1465 	/*
1466 	 * Track number handed to upper layer, and number still
1467 	 * available to receive packets.
1468 	 */
1469 	if (state->rc_enable_srq) {
1470 		rxcnt = atomic_dec_32_nv(&state->rc_srq_rwqe_list.dl_cnt);
1471 	} else {
1472 		rxcnt = atomic_dec_32_nv(&chan->rx_wqe_list.dl_cnt);
1473 	}
1474 
1475 	/*
1476 	 * It can not be a IBA multicast packet.
1477 	 */
1478 	ASSERT(!wc->wc_flags & IBT_WC_GRH_PRESENT);
1479 
1480 	/* For the connection reaper routine ibd_rc_conn_timeout_call() */
1481 	chan->is_used = B_TRUE;
1482 
1483 #ifdef DEBUG
1484 	if (rxcnt < state->id_rc_rx_rwqe_thresh) {
1485 		state->rc_rwqe_short++;
1486 	}
1487 #endif
1488 
1489 	/*
1490 	 * Possibly replenish the Rx pool if needed.
1491 	 */
1492 	if ((rxcnt >= state->id_rc_rx_rwqe_thresh) &&
1493 	    (wc->wc_bytes_xfer > state->id_rc_rx_copy_thresh)) {
1494 		atomic_add_64(&state->rc_rcv_trans_byte, wc->wc_bytes_xfer);
1495 		atomic_inc_64(&state->rc_rcv_trans_pkt);
1496 
1497 		/*
1498 		 * Record how many rwqe has been occupied by upper
1499 		 * network layer
1500 		 */
1501 		if (state->rc_enable_srq) {
1502 			atomic_inc_32(
1503 			    &state->rc_srq_rwqe_list.dl_bufs_outstanding);
1504 		} else {
1505 			atomic_inc_32(&chan->rx_wqe_list.dl_bufs_outstanding);
1506 		}
1507 		mp = rwqe->rwqe_im_mblk;
1508 	} else {
1509 		atomic_add_64(&state->rc_rcv_copy_byte, wc->wc_bytes_xfer);
1510 		atomic_inc_64(&state->rc_rcv_copy_pkt);
1511 
1512 		if ((mp = allocb(wc->wc_bytes_xfer + IPOIB_GRH_SIZE,
1513 		    BPRI_HI)) == NULL) {	/* no memory */
1514 			DPRINT(40, "ibd_rc_process_rx: allocb() failed");
1515 			state->rc_rcv_alloc_fail++;
1516 			if (state->rc_enable_srq) {
1517 				if (ibd_rc_post_srq(state, rwqe) ==
1518 				    DDI_FAILURE) {
1519 					ibd_rc_srq_free_rwqe(state, rwqe);
1520 				}
1521 			} else {
1522 				if (ibd_rc_post_rwqe(chan, rwqe) ==
1523 				    DDI_FAILURE) {
1524 					ibd_rc_free_rwqe(chan, rwqe);
1525 				}
1526 			}
1527 			return;
1528 		}
1529 
1530 		bcopy(rwqe->rwqe_im_mblk->b_rptr + IPOIB_GRH_SIZE,
1531 		    mp->b_wptr + IPOIB_GRH_SIZE, wc->wc_bytes_xfer);
1532 
1533 		if (state->rc_enable_srq) {
1534 			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1535 				ibd_rc_srq_free_rwqe(state, rwqe);
1536 			}
1537 		} else {
1538 			if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1539 				ibd_rc_free_rwqe(chan, rwqe);
1540 			}
1541 		}
1542 	}
1543 
1544 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);
1545 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
1546 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
1547 		len = ntohs(ip6h->ip6_plen);
1548 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
1549 			/* LINTED: E_CONSTANT_CONDITION */
1550 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
1551 		}
1552 	}
1553 
1554 	phdr = (ib_header_info_t *)mp->b_rptr;
1555 	phdr->ib_grh.ipoib_vertcflow = 0;
1556 	ovbcopy(&state->id_macaddr, &phdr->ib_dst,
1557 	    sizeof (ipoib_mac_t));
1558 	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer+ IPOIB_GRH_SIZE;
1559 
1560 	/*
1561 	 * Can RC mode in IB guarantee its checksum correctness?
1562 	 *
1563 	 *	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
1564 	 *	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
1565 	 */
1566 
1567 	/*
1568 	 * Make sure this is NULL or we're in trouble.
1569 	 */
1570 	if (mp->b_next != NULL) {
1571 		ibd_print_warn(state,
1572 		    "ibd_rc_process_rx: got duplicate mp from rcq?");
1573 		mp->b_next = NULL;
1574 	}
1575 
1576 	/*
1577 	 * Add this mp to the list of processed mp's to send to
1578 	 * the nw layer
1579 	 */
1580 	if (state->rc_enable_srq) {
1581 		mutex_enter(&state->rc_rx_lock);
1582 		if (state->rc_rx_mp) {
1583 			ASSERT(state->rc_rx_mp_tail != NULL);
1584 			state->rc_rx_mp_tail->b_next = mp;
1585 		} else {
1586 			ASSERT(state->rc_rx_mp_tail == NULL);
1587 			state->rc_rx_mp = mp;
1588 		}
1589 
1590 		state->rc_rx_mp_tail = mp;
1591 		state->rc_rx_mp_len++;
1592 
1593 		if (state->rc_rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
1594 			mpc = state->rc_rx_mp;
1595 
1596 			state->rc_rx_mp = NULL;
1597 			state->rc_rx_mp_tail = NULL;
1598 			state->rc_rx_mp_len = 0;
1599 			mutex_exit(&state->rc_rx_lock);
1600 			mac_rx(state->id_mh, NULL, mpc);
1601 		} else {
1602 			mutex_exit(&state->rc_rx_lock);
1603 		}
1604 	} else {
1605 		mutex_enter(&chan->rx_lock);
1606 		if (chan->rx_mp) {
1607 			ASSERT(chan->rx_mp_tail != NULL);
1608 			chan->rx_mp_tail->b_next = mp;
1609 		} else {
1610 			ASSERT(chan->rx_mp_tail == NULL);
1611 			chan->rx_mp = mp;
1612 		}
1613 
1614 		chan->rx_mp_tail = mp;
1615 		chan->rx_mp_len++;
1616 
1617 		if (chan->rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
1618 			mpc = chan->rx_mp;
1619 
1620 			chan->rx_mp = NULL;
1621 			chan->rx_mp_tail = NULL;
1622 			chan->rx_mp_len = 0;
1623 			mutex_exit(&chan->rx_lock);
1624 			mac_rx(state->id_mh, NULL, mpc);
1625 		} else {
1626 			mutex_exit(&chan->rx_lock);
1627 		}
1628 	}
1629 }
1630 
1631 /*
1632  * Callback code invoked from STREAMs when the recv data buffer is free
1633  * for recycling.
1634  */
1635 static void
1636 ibd_rc_freemsg_cb(char *arg)
1637 {
1638 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1639 	ibd_rc_chan_t *chan = rwqe->w_chan;
1640 	ibd_state_t *state = rwqe->w_state;
1641 
1642 	/*
1643 	 * If the wqe is being destructed, do not attempt recycling.
1644 	 */
1645 	if (rwqe->w_freeing_wqe == B_TRUE) {
1646 		return;
1647 	}
1648 
1649 	ASSERT(!state->rc_enable_srq);
1650 	ASSERT(chan->rx_wqe_list.dl_cnt < chan->rcq_size);
1651 
1652 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1653 	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1654 	if (rwqe->rwqe_im_mblk == NULL) {
1655 		DPRINT(40, "ibd_rc_freemsg_cb: desballoc() failed");
1656 		ibd_rc_free_rwqe(chan, rwqe);
1657 		return;
1658 	}
1659 
1660 	/*
1661 	 * Post back to h/w. We could actually have more than
1662 	 * id_num_rwqe WQEs on the list if there were multiple
1663 	 * ibd_freemsg_cb() calls outstanding (since the lock is
1664 	 * not held the entire time). This will start getting
1665 	 * corrected over subsequent ibd_freemsg_cb() calls.
1666 	 */
1667 	if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1668 		ibd_rc_free_rwqe(chan, rwqe);
1669 		return;
1670 	}
1671 	atomic_dec_32(&chan->rx_wqe_list.dl_bufs_outstanding);
1672 }
1673 
1674 /*
1675  * Common code for interrupt handling as well as for polling
1676  * for all completed wqe's while detaching.
1677  */
1678 static void
1679 ibd_rc_poll_rcq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
1680 {
1681 	ibd_wqe_t *wqe;
1682 	ibt_wc_t *wc, *wcs;
1683 	uint_t numwcs, real_numwcs;
1684 	int i;
1685 
1686 	wcs = chan->rx_wc;
1687 	numwcs = IBD_RC_MAX_CQ_WC;
1688 
1689 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
1690 		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
1691 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
1692 			if (wc->wc_status != IBT_WC_SUCCESS) {
1693 				chan->state->rc_rcq_err++;
1694 				/*
1695 				 * Channel being torn down.
1696 				 */
1697 				DPRINT(40, "ibd_rc_poll_rcq: wc_status(%d) != "
1698 				    "SUCC, chan=%p", wc->wc_status, chan);
1699 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
1700 					/*
1701 					 * Do not invoke Rx handler because
1702 					 * it might add buffers to the Rx pool
1703 					 * when we are trying to deinitialize.
1704 					 */
1705 					continue;
1706 				}
1707 			}
1708 			ibd_rc_process_rx(chan, WQE_TO_RWQE(wqe), wc);
1709 		}
1710 	}
1711 }
1712 
1713 /* Receive CQ handler */
1714 /* ARGSUSED */
1715 static void
1716 ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1717 {
1718 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
1719 	ibd_state_t *state = chan->state;
1720 
1721 	atomic_inc_32(&chan->rcq_invoking);
1722 	ASSERT(chan->chan_state == IBD_RC_STATE_PAS_ESTAB);
1723 
1724 	/*
1725 	 * Poll for completed entries; the CQ will not interrupt any
1726 	 * more for incoming (or transmitted) packets.
1727 	 */
1728 	ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1729 
1730 	/*
1731 	 * Now enable CQ notifications; all packets that arrive now
1732 	 * (or complete transmission) will cause new interrupts.
1733 	 */
1734 	if (ibt_enable_cq_notify(chan->rcq_hdl, IBT_NEXT_COMPLETION) !=
1735 	    IBT_SUCCESS) {
1736 		/*
1737 		 * We do not expect a failure here.
1738 		 */
1739 		DPRINT(40, "ibd_rc_rcq_handler: ibt_enable_cq_notify() failed");
1740 	}
1741 
1742 	/*
1743 	 * Repoll to catch all packets that might have arrived after
1744 	 * we finished the first poll loop and before interrupts got
1745 	 * armed.
1746 	 */
1747 	ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1748 
1749 	if (state->rc_enable_srq) {
1750 		mutex_enter(&state->rc_rx_lock);
1751 
1752 		if (state->rc_rx_mp != NULL) {
1753 			mblk_t *mpc;
1754 			mpc = state->rc_rx_mp;
1755 
1756 			state->rc_rx_mp = NULL;
1757 			state->rc_rx_mp_tail = NULL;
1758 			state->rc_rx_mp_len = 0;
1759 
1760 			mutex_exit(&state->rc_rx_lock);
1761 			mac_rx(state->id_mh, NULL, mpc);
1762 		} else {
1763 			mutex_exit(&state->rc_rx_lock);
1764 		}
1765 	} else {
1766 		mutex_enter(&chan->rx_lock);
1767 
1768 		if (chan->rx_mp != NULL) {
1769 			mblk_t *mpc;
1770 			mpc = chan->rx_mp;
1771 
1772 			chan->rx_mp = NULL;
1773 			chan->rx_mp_tail = NULL;
1774 			chan->rx_mp_len = 0;
1775 
1776 			mutex_exit(&chan->rx_lock);
1777 			mac_rx(state->id_mh, NULL, mpc);
1778 		} else {
1779 			mutex_exit(&chan->rx_lock);
1780 		}
1781 	}
1782 	atomic_dec_32(&chan->rcq_invoking);
1783 }
1784 
1785 /*
1786  * Allocate the statically allocated Tx buffer list.
1787  */
1788 int
1789 ibd_rc_init_tx_largebuf_list(ibd_state_t *state)
1790 {
1791 	ibd_rc_tx_largebuf_t *lbufp;
1792 	ibd_rc_tx_largebuf_t *tail;
1793 	uint8_t *memp;
1794 	ibt_mr_attr_t mem_attr;
1795 	uint32_t num_swqe;
1796 	size_t  mem_size;
1797 	int i;
1798 
1799 	num_swqe = state->id_rc_num_swqe - 1;
1800 
1801 	/*
1802 	 * Allocate one big chunk for all Tx large copy bufs
1803 	 */
1804 	/* Don't transfer IPOIB_GRH_SIZE bytes (40 bytes) */
1805 	mem_size = num_swqe * state->rc_mtu;
1806 	state->rc_tx_mr_bufs = kmem_zalloc(mem_size, KM_SLEEP);
1807 
1808 	mem_attr.mr_len = mem_size;
1809 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_tx_mr_bufs;
1810 	mem_attr.mr_as = NULL;
1811 	mem_attr.mr_flags = IBT_MR_SLEEP;
1812 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1813 	    &state->rc_tx_mr_hdl, &state->rc_tx_mr_desc) != IBT_SUCCESS) {
1814 		DPRINT(40, "ibd_rc_init_tx_largebuf_list: ibt_register_mr "
1815 		    "failed");
1816 		kmem_free(state->rc_tx_mr_bufs, mem_size);
1817 		state->rc_tx_mr_bufs = NULL;
1818 		return (DDI_FAILURE);
1819 	}
1820 
1821 	state->rc_tx_largebuf_desc_base = kmem_zalloc(num_swqe *
1822 	    sizeof (ibd_rc_tx_largebuf_t), KM_SLEEP);
1823 
1824 	/*
1825 	 * Set up the buf chain
1826 	 */
1827 	memp = state->rc_tx_mr_bufs;
1828 	mutex_enter(&state->rc_tx_large_bufs_lock);
1829 	lbufp = state->rc_tx_largebuf_desc_base;
1830 	for (i = 0; i < num_swqe; i++) {
1831 		lbufp->lb_buf = memp;
1832 		lbufp->lb_next = lbufp + 1;
1833 
1834 		tail = lbufp;
1835 
1836 		memp += state->rc_mtu;
1837 		lbufp++;
1838 	}
1839 	tail->lb_next = NULL;
1840 
1841 	/*
1842 	 * Set up the buffer information in ibd state
1843 	 */
1844 	state->rc_tx_largebuf_free_head = state->rc_tx_largebuf_desc_base;
1845 	state->rc_tx_largebuf_nfree = num_swqe;
1846 	mutex_exit(&state->rc_tx_large_bufs_lock);
1847 	return (DDI_SUCCESS);
1848 }
1849 
1850 void
1851 ibd_rc_fini_tx_largebuf_list(ibd_state_t *state)
1852 {
1853 	uint32_t num_swqe;
1854 
1855 	num_swqe = state->id_rc_num_swqe - 1;
1856 
1857 	if (ibt_deregister_mr(state->id_hca_hdl,
1858 	    state->rc_tx_mr_hdl) != IBT_SUCCESS) {
1859 		DPRINT(40, "ibd_rc_fini_tx_largebuf_list: ibt_deregister_mr() "
1860 		    "failed");
1861 	}
1862 	state->rc_tx_mr_hdl = NULL;
1863 
1864 	kmem_free(state->rc_tx_mr_bufs, num_swqe * state->rc_mtu);
1865 	state->rc_tx_mr_bufs = NULL;
1866 
1867 	kmem_free(state->rc_tx_largebuf_desc_base,
1868 	    num_swqe * sizeof (ibd_rc_tx_largebuf_t));
1869 	state->rc_tx_largebuf_desc_base = NULL;
1870 }
1871 
1872 static int
1873 ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t *chan)
1874 {
1875 	ibt_mr_attr_t mem_attr;
1876 	ibd_state_t *state;
1877 
1878 	state = chan->state;
1879 	ASSERT(state != NULL);
1880 
1881 	/*
1882 	 * Allocate one big chunk for all regular tx copy bufs
1883 	 */
1884 	mem_attr.mr_len = chan->scq_size * state->id_rc_tx_copy_thresh;
1885 
1886 	chan->tx_mr_bufs = kmem_zalloc(mem_attr.mr_len, KM_SLEEP);
1887 
1888 	/*
1889 	 * Do one memory registration on the entire txbuf area
1890 	 */
1891 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->tx_mr_bufs;
1892 	mem_attr.mr_as = NULL;
1893 	mem_attr.mr_flags = IBT_MR_SLEEP;
1894 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1895 	    &chan->tx_mr_hdl, &chan->tx_mr_desc) != IBT_SUCCESS) {
1896 		DPRINT(40, "ibd_rc_alloc_tx_copybufs: ibt_register_mr failed");
1897 		ASSERT(mem_attr.mr_len ==
1898 		    chan->scq_size * state->id_rc_tx_copy_thresh);
1899 		kmem_free(chan->tx_mr_bufs, mem_attr.mr_len);
1900 		chan->tx_mr_bufs = NULL;
1901 		return (DDI_FAILURE);
1902 	}
1903 
1904 	return (DDI_SUCCESS);
1905 }
1906 
1907 /*
1908  * Allocate the statically allocated Tx buffer list.
1909  */
1910 static int
1911 ibd_rc_init_txlist(ibd_rc_chan_t *chan)
1912 {
1913 	ibd_swqe_t *swqe;
1914 	int i;
1915 	ibt_lkey_t lkey;
1916 	ibd_state_t *state = chan->state;
1917 
1918 	if (ibd_rc_alloc_tx_copybufs(chan) != DDI_SUCCESS)
1919 		return (DDI_FAILURE);
1920 
1921 	/*
1922 	 * Allocate and setup the swqe list
1923 	 */
1924 	lkey = chan->tx_mr_desc.md_lkey;
1925 	chan->tx_wqes = kmem_zalloc(chan->scq_size *
1926 	    sizeof (ibd_swqe_t), KM_SLEEP);
1927 	swqe = chan->tx_wqes;
1928 	for (i = 0; i < chan->scq_size; i++, swqe++) {
1929 		swqe->swqe_next = NULL;
1930 		swqe->swqe_im_mblk = NULL;
1931 
1932 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
1933 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
1934 
1935 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
1936 		swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
1937 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
1938 		    (chan->tx_mr_bufs + i * state->id_rc_tx_copy_thresh);
1939 		swqe->w_swr.wr_trans = IBT_RC_SRV;
1940 
1941 		/* Add to list */
1942 		mutex_enter(&chan->tx_wqe_list.dl_mutex);
1943 		chan->tx_wqe_list.dl_cnt++;
1944 		swqe->swqe_next = chan->tx_wqe_list.dl_head;
1945 		chan->tx_wqe_list.dl_head = SWQE_TO_WQE(swqe);
1946 		mutex_exit(&chan->tx_wqe_list.dl_mutex);
1947 	}
1948 
1949 	return (DDI_SUCCESS);
1950 }
1951 
1952 /*
1953  * Free the statically allocated Tx buffer list.
1954  */
1955 static void
1956 ibd_rc_fini_txlist(ibd_rc_chan_t *chan)
1957 {
1958 	ibd_state_t *state = chan->state;
1959 	if (chan->tx_mr_hdl != NULL) {
1960 		if (ibt_deregister_mr(chan->state->id_hca_hdl,
1961 		    chan->tx_mr_hdl) != IBT_SUCCESS) {
1962 			DPRINT(40, "ibd_rc_fini_txlist: ibt_deregister_mr "
1963 			    "failed");
1964 		}
1965 		chan->tx_mr_hdl = NULL;
1966 	}
1967 
1968 	if (chan->tx_mr_bufs != NULL) {
1969 		kmem_free(chan->tx_mr_bufs, chan->scq_size *
1970 		    state->id_rc_tx_copy_thresh);
1971 		chan->tx_mr_bufs = NULL;
1972 	}
1973 
1974 	if (chan->tx_wqes != NULL) {
1975 		kmem_free(chan->tx_wqes, chan->scq_size *
1976 		    sizeof (ibd_swqe_t));
1977 		chan->tx_wqes = NULL;
1978 	}
1979 }
1980 
1981 /*
1982  * Acquire send wqe from free list.
1983  * Returns error number and send wqe pointer.
1984  */
1985 ibd_swqe_t *
1986 ibd_rc_acquire_swqes(ibd_rc_chan_t *chan)
1987 {
1988 	ibd_swqe_t *wqe;
1989 
1990 	mutex_enter(&chan->tx_rel_list.dl_mutex);
1991 	if (chan->tx_rel_list.dl_head != NULL) {
1992 		/* transfer id_tx_rel_list to id_tx_list */
1993 		chan->tx_wqe_list.dl_head =
1994 		    chan->tx_rel_list.dl_head;
1995 		chan->tx_wqe_list.dl_cnt =
1996 		    chan->tx_rel_list.dl_cnt;
1997 		chan->tx_wqe_list.dl_pending_sends = B_FALSE;
1998 
1999 		/* clear id_tx_rel_list */
2000 		chan->tx_rel_list.dl_head = NULL;
2001 		chan->tx_rel_list.dl_cnt = 0;
2002 		mutex_exit(&chan->tx_rel_list.dl_mutex);
2003 
2004 		wqe = WQE_TO_SWQE(chan->tx_wqe_list.dl_head);
2005 		chan->tx_wqe_list.dl_cnt -= 1;
2006 		chan->tx_wqe_list.dl_head = wqe->swqe_next;
2007 	} else {	/* no free swqe */
2008 		mutex_exit(&chan->tx_rel_list.dl_mutex);
2009 		chan->tx_wqe_list.dl_pending_sends = B_TRUE;
2010 		wqe = NULL;
2011 	}
2012 	return (wqe);
2013 }
2014 
2015 /*
2016  * Release send wqe back into free list.
2017  */
2018 static void
2019 ibd_rc_release_swqe(ibd_rc_chan_t *chan, ibd_swqe_t *swqe)
2020 {
2021 	/*
2022 	 * Add back on Tx list for reuse.
2023 	 */
2024 	swqe->swqe_next = NULL;
2025 	mutex_enter(&chan->tx_rel_list.dl_mutex);
2026 	chan->tx_rel_list.dl_pending_sends = B_FALSE;
2027 	swqe->swqe_next = chan->tx_rel_list.dl_head;
2028 	chan->tx_rel_list.dl_head = SWQE_TO_WQE(swqe);
2029 	chan->tx_rel_list.dl_cnt++;
2030 	mutex_exit(&chan->tx_rel_list.dl_mutex);
2031 }
2032 
2033 void
2034 ibd_rc_post_send(ibd_rc_chan_t *chan, ibd_swqe_t *node)
2035 {
2036 	uint_t		i;
2037 	uint_t		num_posted;
2038 	uint_t		n_wrs;
2039 	ibt_status_t	ibt_status;
2040 	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
2041 	ibd_swqe_t	*tx_head, *elem;
2042 	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
2043 
2044 	/* post the one request, then check for more */
2045 	ibt_status = ibt_post_send(chan->chan_hdl,
2046 	    &node->w_swr, 1, NULL);
2047 	if (ibt_status != IBT_SUCCESS) {
2048 		ibd_print_warn(chan->state, "ibd_post_send: "
2049 		    "posting one wr failed: ret=%d", ibt_status);
2050 		ibd_rc_tx_cleanup(node);
2051 	}
2052 
2053 	tx_head = NULL;
2054 	for (;;) {
2055 		if (tx_head == NULL) {
2056 			mutex_enter(&chan->tx_post_lock);
2057 			tx_head = chan->tx_head;
2058 			if (tx_head == NULL) {
2059 				chan->tx_busy = 0;
2060 				mutex_exit(&chan->tx_post_lock);
2061 				return;
2062 			}
2063 			chan->tx_head = NULL;
2064 			mutex_exit(&chan->tx_post_lock);
2065 		}
2066 
2067 		/*
2068 		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
2069 		 * at a time if possible, and keep posting them.
2070 		 */
2071 		for (n_wrs = 0, elem = tx_head;
2072 		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
2073 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
2074 			nodes[n_wrs] = elem;
2075 			wrs[n_wrs] = elem->w_swr;
2076 		}
2077 		tx_head = elem;
2078 
2079 		ASSERT(n_wrs != 0);
2080 
2081 		/*
2082 		 * If posting fails for some reason, we'll never receive
2083 		 * completion intimation, so we'll need to cleanup. But
2084 		 * we need to make sure we don't clean up nodes whose
2085 		 * wrs have been successfully posted. We assume that the
2086 		 * hca driver returns on the first failure to post and
2087 		 * therefore the first 'num_posted' entries don't need
2088 		 * cleanup here.
2089 		 */
2090 		num_posted = 0;
2091 		ibt_status = ibt_post_send(chan->chan_hdl,
2092 		    wrs, n_wrs, &num_posted);
2093 		if (ibt_status != IBT_SUCCESS) {
2094 			ibd_print_warn(chan->state, "ibd_post_send: "
2095 			    "posting multiple wrs failed: "
2096 			    "requested=%d, done=%d, ret=%d",
2097 			    n_wrs, num_posted, ibt_status);
2098 
2099 			for (i = num_posted; i < n_wrs; i++)
2100 				ibd_rc_tx_cleanup(nodes[i]);
2101 		}
2102 	}
2103 }
2104 
2105 /*
2106  * Common code that deals with clean ups after a successful or
2107  * erroneous transmission attempt.
2108  */
2109 void
2110 ibd_rc_tx_cleanup(ibd_swqe_t *swqe)
2111 {
2112 	ibd_ace_t *ace = swqe->w_ahandle;
2113 	ibd_state_t *state;
2114 
2115 	ASSERT(ace != NULL);
2116 	ASSERT(ace->ac_chan != NULL);
2117 
2118 	state = ace->ac_chan->state;
2119 
2120 	/*
2121 	 * If this was a dynamic registration in ibd_send(),
2122 	 * deregister now.
2123 	 */
2124 	if (swqe->swqe_im_mblk != NULL) {
2125 		ASSERT(swqe->w_buftype == IBD_WQE_MAPPED);
2126 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
2127 			ibd_unmap_mem(state, swqe);
2128 		}
2129 		freemsg(swqe->swqe_im_mblk);
2130 		swqe->swqe_im_mblk = NULL;
2131 	} else {
2132 		ASSERT(swqe->w_buftype != IBD_WQE_MAPPED);
2133 	}
2134 
2135 	if (swqe->w_buftype == IBD_WQE_RC_COPYBUF) {
2136 		ibd_rc_tx_largebuf_t *lbufp;
2137 
2138 		lbufp = swqe->w_rc_tx_largebuf;
2139 		ASSERT(lbufp != NULL);
2140 
2141 		mutex_enter(&state->rc_tx_large_bufs_lock);
2142 		lbufp->lb_next = state->rc_tx_largebuf_free_head;
2143 		state->rc_tx_largebuf_free_head = lbufp;
2144 		state->rc_tx_largebuf_nfree ++;
2145 		mutex_exit(&state->rc_tx_large_bufs_lock);
2146 		swqe->w_rc_tx_largebuf = NULL;
2147 	}
2148 
2149 
2150 	/*
2151 	 * Release the send wqe for reuse.
2152 	 */
2153 	ibd_rc_release_swqe(ace->ac_chan, swqe);
2154 
2155 	/*
2156 	 * Drop the reference count on the AH; it can be reused
2157 	 * now for a different destination if there are no more
2158 	 * posted sends that will use it. This can be eliminated
2159 	 * if we can always associate each Tx buffer with an AH.
2160 	 * The ace can be null if we are cleaning up from the
2161 	 * ibd_send() error path.
2162 	 */
2163 	ibd_dec_ref_ace(state, ace);
2164 }
2165 
/*
 * Drain the send completion queue 'cq_hdl' of RC channel 'chan'.
 *
 * Completions are polled in batches of up to IBD_RC_MAX_CQ_WC into
 * chan->tx_wc until ibt_poll_cq() stops returning IBT_SUCCESS. Every
 * completion's swqe (recovered from wc_id) is passed to
 * ibd_rc_tx_cleanup().
 *
 * The first failed completion seen in a call triggers the channel error
 * handling below; later failures in the same call are only cleaned up.
 *
 * After each batch, state->id_sched_needed is examined and, if the
 * resource the Tx path was waiting for is available again above its
 * threshold, the corresponding bit is cleared and mac_tx_update() is
 * called. Only one resource class is handled per batch, in the order
 * RC swqes, RC large Tx buffers, UD swqes.
 */
void
ibd_rc_drain_scq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
{
	ibd_state_t *state = chan->state;
	ibd_wqe_t *wqe;
	ibt_wc_t *wc, *wcs;
	ibd_ace_t *ace;
	uint_t numwcs, real_numwcs;
	int i;
	boolean_t encount_error;

	wcs = chan->tx_wc;
	numwcs = IBD_RC_MAX_CQ_WC;
	encount_error = B_FALSE;

	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
			/* wr_id was set to the swqe address at init time */
			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				if (encount_error == B_FALSE) {
					/*
					 * This RC channel is in error status,
					 * remove it.
					 *
					 * The close is only initiated from
					 * here if the channel is still
					 * established, the link is up, and
					 * the ace is still the one found in
					 * the address cache for its MAC;
					 * otherwise it is counted as being
					 * closed by someone else.
					 */
					encount_error = B_TRUE;
					mutex_enter(&state->id_ac_mutex);
					if ((chan->chan_state ==
					    IBD_RC_STATE_ACT_ESTAB) &&
					    (chan->state->id_link_state ==
					    LINK_STATE_UP) &&
					    ((ace = ibd_acache_find(state,
					    &chan->ace->ac_mac, B_FALSE, 0))
					    != NULL) && (ace == chan->ace)) {
						ASSERT(ace->ac_mce == NULL);
						/*
						 * Take a reference and pull
						 * the ace out of the active
						 * list before handing the
						 * channel to the async close.
						 */
						INC_REF(ace, 1);
						IBD_ACACHE_PULLOUT_ACTIVE(
						    state, ace);
						chan->chan_state =
						    IBD_RC_STATE_ACT_CLOSING;
						mutex_exit(&state->id_ac_mutex);
						state->rc_reset_cnt++;
						DPRINT(30, "ibd_rc_drain_scq: "
						    "wc_status(%d) != SUCC, "
						    "chan=%p, ace=%p, "
						    "link_state=%d"
						    "reset RC channel",
						    wc->wc_status, chan,
						    chan->ace, chan->state->
						    id_link_state);
						ibd_rc_signal_act_close(
						    state, ace);
					} else {
						mutex_exit(&state->id_ac_mutex);
						state->
						    rc_act_close_simultaneous++;
						DPRINT(40, "ibd_rc_drain_scq: "
						    "wc_status(%d) != SUCC, "
						    "chan=%p, chan_state=%d,"
						    "ace=%p, link_state=%d."
						    "other thread is closing "
						    "it", wc->wc_status, chan,
						    chan->chan_state, chan->ace,
						    chan->state->id_link_state);
					}
				}
			}
			/* Successful or not, the swqe is recycled. */
			ibd_rc_tx_cleanup(WQE_TO_SWQE(wqe));
		}

		/*
		 * Resource checks. Lock order in every branch is
		 * id_sched_lock first, then the list/buffer locks; all are
		 * dropped before mac_tx_update() is called.
		 */
		mutex_enter(&state->id_sched_lock);
		if (state->id_sched_needed == 0) {
			mutex_exit(&state->id_sched_lock);
		} else if (state->id_sched_needed & IBD_RSRC_RC_SWQE) {
			/* Waiting for RC swqes on this channel */
			mutex_enter(&chan->tx_wqe_list.dl_mutex);
			mutex_enter(&chan->tx_rel_list.dl_mutex);
			if ((chan->tx_rel_list.dl_cnt +
			    chan->tx_wqe_list.dl_cnt) > IBD_RC_TX_FREE_THRESH) {
				state->id_sched_needed &= ~IBD_RSRC_RC_SWQE;
				mutex_exit(&chan->tx_rel_list.dl_mutex);
				mutex_exit(&chan->tx_wqe_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
				state->rc_swqe_mac_update++;
				mac_tx_update(state->id_mh);
			} else {
				state->rc_scq_no_swqe++;
				mutex_exit(&chan->tx_rel_list.dl_mutex);
				mutex_exit(&chan->tx_wqe_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
			}
		} else if (state->id_sched_needed & IBD_RSRC_RC_TX_LARGEBUF) {
			/* Waiting for RC large Tx buffers */
			mutex_enter(&state->rc_tx_large_bufs_lock);
			if (state->rc_tx_largebuf_nfree >
			    IBD_RC_TX_FREE_THRESH) {
				ASSERT(state->rc_tx_largebuf_free_head != NULL);
				state->id_sched_needed &=
				    ~IBD_RSRC_RC_TX_LARGEBUF;
				mutex_exit(&state->rc_tx_large_bufs_lock);
				mutex_exit(&state->id_sched_lock);
				state->rc_xmt_buf_mac_update++;
				mac_tx_update(state->id_mh);
			} else {
				state->rc_scq_no_largebuf++;
				mutex_exit(&state->rc_tx_large_bufs_lock);
				mutex_exit(&state->id_sched_lock);
			}
		} else if (state->id_sched_needed & IBD_RSRC_SWQE) {
			/* Waiting for swqes on the state-wide Tx lists */
			mutex_enter(&state->id_tx_list.dl_mutex);
			mutex_enter(&state->id_tx_rel_list.dl_mutex);
			if ((state->id_tx_list.dl_cnt +
			    state->id_tx_rel_list.dl_cnt)
			    > IBD_FREE_SWQES_THRESH) {
				state->id_sched_needed &= ~IBD_RSRC_SWQE;
				state->id_sched_cnt++;
				mutex_exit(&state->id_tx_rel_list.dl_mutex);
				mutex_exit(&state->id_tx_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
				mac_tx_update(state->id_mh);
			} else {
				mutex_exit(&state->id_tx_rel_list.dl_mutex);
				mutex_exit(&state->id_tx_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
			}
		} else {
			mutex_exit(&state->id_sched_lock);
		}
	}
}
2293 
2294 /* Send CQ handler, call ibd_rx_tx_cleanup to recycle Tx buffers */
2295 /* ARGSUSED */
2296 static void
2297 ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
2298 {
2299 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2300 
2301 	if (ibd_rc_tx_softintr == 1) {
2302 		mutex_enter(&chan->tx_poll_lock);
2303 		if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2304 			chan->tx_poll_busy |= IBD_REDO_CQ_POLLING;
2305 			mutex_exit(&chan->tx_poll_lock);
2306 			return;
2307 		} else {
2308 			mutex_exit(&chan->tx_poll_lock);
2309 			ddi_trigger_softintr(chan->scq_softintr);
2310 		}
2311 	} else
2312 		(void) ibd_rc_tx_recycle(arg);
2313 }
2314 
2315 static uint_t
2316 ibd_rc_tx_recycle(caddr_t arg)
2317 {
2318 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2319 	ibd_state_t *state = chan->state;
2320 	int flag, redo_flag;
2321 	int redo = 1;
2322 
2323 	flag = IBD_CQ_POLLING;
2324 	redo_flag = IBD_REDO_CQ_POLLING;
2325 
2326 	mutex_enter(&chan->tx_poll_lock);
2327 	if (chan->tx_poll_busy & flag) {
2328 		ibd_print_warn(state, "ibd_rc_tx_recycle: multiple polling "
2329 		    "threads");
2330 		chan->tx_poll_busy |= redo_flag;
2331 		mutex_exit(&chan->tx_poll_lock);
2332 		return (DDI_INTR_CLAIMED);
2333 	}
2334 	chan->tx_poll_busy |= flag;
2335 	mutex_exit(&chan->tx_poll_lock);
2336 
2337 	/*
2338 	 * Poll for completed entries; the CQ will not interrupt any
2339 	 * more for completed packets.
2340 	 */
2341 	ibd_rc_drain_scq(chan, chan->scq_hdl);
2342 
2343 	/*
2344 	 * Now enable CQ notifications; all completions originating now
2345 	 * will cause new interrupts.
2346 	 */
2347 	do {
2348 		if (ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION) !=
2349 		    IBT_SUCCESS) {
2350 			/*
2351 			 * We do not expect a failure here.
2352 			 */
2353 			DPRINT(40, "ibd_rc_scq_handler: ibt_enable_cq_notify()"
2354 			    " failed");
2355 		}
2356 
2357 		ibd_rc_drain_scq(chan, chan->scq_hdl);
2358 
2359 		mutex_enter(&chan->tx_poll_lock);
2360 		if (chan->tx_poll_busy & redo_flag)
2361 			chan->tx_poll_busy &= ~redo_flag;
2362 		else {
2363 			chan->tx_poll_busy &= ~flag;
2364 			redo = 0;
2365 		}
2366 		mutex_exit(&chan->tx_poll_lock);
2367 
2368 	} while (redo);
2369 
2370 	return (DDI_INTR_CLAIMED);
2371 }
2372 
2373 static ibt_status_t
2374 ibd_register_service(ibt_srv_desc_t *srv, ib_svc_id_t sid,
2375     int num_sids, ibt_srv_hdl_t *srv_hdl, ib_svc_id_t *ret_sid)
2376 {
2377 	ibd_service_t *p;
2378 	ibt_status_t status;
2379 
2380 	mutex_enter(&ibd_gstate.ig_mutex);
2381 	for (p = ibd_gstate.ig_service_list; p != NULL; p = p->is_link) {
2382 		if (p->is_sid == sid) {
2383 			p->is_ref_cnt++;
2384 			*srv_hdl = p->is_srv_hdl;
2385 			*ret_sid = sid;
2386 			mutex_exit(&ibd_gstate.ig_mutex);
2387 			return (IBT_SUCCESS);
2388 		}
2389 	}
2390 	status = ibt_register_service(ibd_gstate.ig_ibt_hdl, srv, sid,
2391 	    num_sids, srv_hdl, ret_sid);
2392 	if (status == IBT_SUCCESS) {
2393 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
2394 		p->is_srv_hdl = *srv_hdl;
2395 		p->is_sid = sid;
2396 		p->is_ref_cnt = 1;
2397 		p->is_link = ibd_gstate.ig_service_list;
2398 		ibd_gstate.ig_service_list = p;
2399 	}
2400 	mutex_exit(&ibd_gstate.ig_mutex);
2401 	return (status);
2402 }
2403 
2404 static ibt_status_t
2405 ibd_deregister_service(ibt_srv_hdl_t srv_hdl)
2406 {
2407 	ibd_service_t *p, **pp;
2408 	ibt_status_t status;
2409 
2410 	mutex_enter(&ibd_gstate.ig_mutex);
2411 	for (pp = &ibd_gstate.ig_service_list; *pp != NULL;
2412 	    pp = &((*pp)->is_link)) {
2413 		p = *pp;
2414 		if (p->is_srv_hdl == srv_hdl) {	/* Found it */
2415 			if (--p->is_ref_cnt == 0) {
2416 				status = ibt_deregister_service(
2417 				    ibd_gstate.ig_ibt_hdl, srv_hdl);
2418 				*pp = p->is_link; /* link prev to next */
2419 				kmem_free(p, sizeof (*p));
2420 			} else {
2421 				status = IBT_SUCCESS;
2422 			}
2423 			mutex_exit(&ibd_gstate.ig_mutex);
2424 			return (status);
2425 		}
2426 	}
2427 	/* Should not ever get here */
2428 	mutex_exit(&ibd_gstate.ig_mutex);
2429 	return (IBT_FAILURE);
2430 }
2431 
2432 /* Listen with corresponding service ID */
2433 ibt_status_t
2434 ibd_rc_listen(ibd_state_t *state)
2435 {
2436 	ibt_srv_desc_t srvdesc;
2437 	ib_svc_id_t ret_sid;
2438 	ibt_status_t status;
2439 	ib_gid_t gid;
2440 
2441 	if (state->rc_listen_hdl != NULL) {
2442 		DPRINT(40, "ibd_rc_listen: rc_listen_hdl should be NULL");
2443 		return (IBT_FAILURE);
2444 	}
2445 
2446 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2447 	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2448 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2449 
2450 	/*
2451 	 * Register the service with service id
2452 	 * Incoming connection requests should arrive on this service id.
2453 	 */
2454 	status = ibd_register_service(&srvdesc,
2455 	    IBD_RC_QPN_TO_SID(state->id_qpnum),
2456 	    1, &state->rc_listen_hdl, &ret_sid);
2457 	if (status != IBT_SUCCESS) {
2458 		DPRINT(40, "ibd_rc_listen: Service Registration Failed, "
2459 		    "ret=%d", status);
2460 		return (status);
2461 	}
2462 
2463 	gid = state->id_sgid;
2464 
2465 	/* pass state as cm_private */
2466 	status = ibt_bind_service(state->rc_listen_hdl,
2467 	    gid, NULL, state, &state->rc_listen_bind);
2468 	if (status != IBT_SUCCESS) {
2469 		DPRINT(40, "ibd_rc_listen:"
2470 		    " fail to bind port: <%d>", status);
2471 		(void) ibd_deregister_service(state->rc_listen_hdl);
2472 		return (status);
2473 	}
2474 
2475 	/*
2476 	 * Legacy OFED had used a wrong service ID (one additional zero digit)
2477 	 * for many years. To interop with legacy OFED, we support this wrong
2478 	 * service ID here.
2479 	 */
2480 	ASSERT(state->rc_listen_hdl_OFED_interop == NULL);
2481 
2482 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2483 	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2484 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2485 
2486 	/*
2487 	 * Register the service with service id
2488 	 * Incoming connection requests should arrive on this service id.
2489 	 */
2490 	status = ibd_register_service(&srvdesc,
2491 	    IBD_RC_QPN_TO_SID_OFED_INTEROP(state->id_qpnum),
2492 	    1, &state->rc_listen_hdl_OFED_interop, &ret_sid);
2493 	if (status != IBT_SUCCESS) {
2494 		DPRINT(40,
2495 		    "ibd_rc_listen: Service Registration for Legacy OFED "
2496 		    "Failed %d", status);
2497 		(void) ibt_unbind_service(state->rc_listen_hdl,
2498 		    state->rc_listen_bind);
2499 		(void) ibd_deregister_service(state->rc_listen_hdl);
2500 		return (status);
2501 	}
2502 
2503 	gid = state->id_sgid;
2504 
2505 	/* pass state as cm_private */
2506 	status = ibt_bind_service(state->rc_listen_hdl_OFED_interop,
2507 	    gid, NULL, state, &state->rc_listen_bind_OFED_interop);
2508 	if (status != IBT_SUCCESS) {
2509 		DPRINT(40, "ibd_rc_listen: fail to bind port: <%d> for "
2510 		    "Legacy OFED listener", status);
2511 		(void) ibd_deregister_service(
2512 		    state->rc_listen_hdl_OFED_interop);
2513 		(void) ibt_unbind_service(state->rc_listen_hdl,
2514 		    state->rc_listen_bind);
2515 		(void) ibd_deregister_service(state->rc_listen_hdl);
2516 		return (status);
2517 	}
2518 
2519 	return (IBT_SUCCESS);
2520 }
2521 
2522 void
2523 ibd_rc_stop_listen(ibd_state_t *state)
2524 {
2525 	int ret;
2526 
2527 	/* Disable incoming connection requests */
2528 	if (state->rc_listen_hdl != NULL) {
2529 		ret = ibt_unbind_all_services(state->rc_listen_hdl);
2530 		if (ret != 0) {
2531 			DPRINT(40, "ibd_rc_stop_listen:"
2532 			    "ibt_unbind_all_services() failed, ret=%d", ret);
2533 		}
2534 		ret = ibd_deregister_service(state->rc_listen_hdl);
2535 		if (ret != 0) {
2536 			DPRINT(40, "ibd_rc_stop_listen:"
2537 			    "ibd_deregister_service() failed, ret=%d", ret);
2538 		} else {
2539 			state->rc_listen_hdl = NULL;
2540 		}
2541 	}
2542 
2543 	/* Disable incoming connection requests */
2544 	if (state->rc_listen_hdl_OFED_interop != NULL) {
2545 		ret = ibt_unbind_all_services(
2546 		    state->rc_listen_hdl_OFED_interop);
2547 		if (ret != 0) {
2548 			DPRINT(40, "ibd_rc_stop_listen:"
2549 			    "ibt_unbind_all_services() failed: %d", ret);
2550 		}
2551 		ret = ibd_deregister_service(state->rc_listen_hdl_OFED_interop);
2552 		if (ret != 0) {
2553 			DPRINT(40, "ibd_rc_stop_listen:"
2554 			    "ibd_deregister_service() failed: %d", ret);
2555 		} else {
2556 			state->rc_listen_hdl_OFED_interop = NULL;
2557 		}
2558 	}
2559 }
2560 
/*
 * Close every RC channel of this ibd instance, passive and active.
 *
 * Steps, in order:
 *  1. Remove the receive CQ handler from every passive channel.
 *  2. If SRQ is enabled, wait (up to 10 x 0.1s) for the SRQ receive
 *     buffers that are still outstanding to come back.
 *  3. Take the passive channels off rc_pass_chan_list one by one and
 *     close them.
 *  4. Under id_ac_mutex, pull every active ace that has a channel out of
 *     the active list (holding a reference on it), mark its channel
 *     ACT_CLOSING and queue it on rc_obs_act_chan_list.
 *  5. Close each channel on rc_obs_act_chan_list, then drop the reference
 *     on its ace and either put the ace on the free list or mark it for
 *     delayed recycling.
 *  6. Wait (up to 400 x 0.1s) for rc_num_tx_chan and rc_num_rx_chan to
 *     reach zero, i.e. for channels being closed by other threads.
 */
void
ibd_rc_close_all_chan(ibd_state_t *state)
{
	ibd_rc_chan_t *rc_chan;
	ibd_ace_t *ace, *pre_ace;
	uint_t attempts;

	/* Disable all Rx routines */
	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
	rc_chan = state->rc_pass_chan_list.chan_list;
	while (rc_chan != NULL) {
		ibt_set_cq_handler(rc_chan->rcq_hdl, 0, 0);
		rc_chan = rc_chan->next;
	}
	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);

	if (state->rc_enable_srq) {
		attempts = 10;
		while (state->rc_srq_rwqe_list.dl_bufs_outstanding > 0) {
			DPRINT(30, "ibd_rc_close_all_chan: outstanding > 0");
			delay(drv_usectohz(100000));
			if (--attempts == 0) {
				/*
				 * There are still buffers outstanding after
				 * all the attempts. Stop waiting and carry
				 * on with the close anyway.
				 * NOTE(review): the comment previously here
				 * spoke of reaping completions and returning
				 * failure; this function is void and the
				 * code below only stops waiting.
				 */
				break;
			}
		}
	}

	/* Close all passive RC channels */
	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
	while (rc_chan != NULL) {
		(void) ibd_rc_pas_close(rc_chan, B_TRUE, B_FALSE);
		rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
	}

	/* Close all active RC channels */
	mutex_enter(&state->id_ac_mutex);
	state->id_ac_hot_ace = NULL;
	ace = list_head(&state->id_ah_active);
	while ((pre_ace = ace) != NULL) {
		/* Advance first: pre_ace may be pulled out of the list */
		ace = list_next(&state->id_ah_active, ace);
		if (pre_ace->ac_chan != NULL) {
			INC_REF(pre_ace, 1);
			IBD_ACACHE_PULLOUT_ACTIVE(state, pre_ace);
			pre_ace->ac_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
			ibd_rc_add_to_chan_list(&state->rc_obs_act_chan_list,
			    pre_ace->ac_chan);
		}
	}
	mutex_exit(&state->id_ac_mutex);

	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_obs_act_chan_list);
	while (rc_chan != NULL) {
		/* Read the ace before the channel is closed */
		ace = rc_chan->ace;
		ibd_rc_act_close(rc_chan, B_TRUE);
		if (ace != NULL) {
			mutex_enter(&state->id_ac_mutex);
			ASSERT(ace->ac_ref != 0);
			/* Drop the reference held on this ace */
			atomic_dec_32(&ace->ac_ref);
			ace->ac_chan = NULL;
			if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
				/* No other users: the ace can be freed now */
				IBD_ACACHE_INSERT_FREE(state, ace);
				ace->ac_ref = 0;
			} else {
				/* Still referenced: mark it with CYCLEVAL */
				ace->ac_ref |= CYCLEVAL;
				state->rc_delay_ace_recycle++;
			}
			mutex_exit(&state->id_ac_mutex);
		}
		rc_chan = ibd_rc_rm_header_chan_list(
		    &state->rc_obs_act_chan_list);
	}

	attempts = 400;
	while (((state->rc_num_tx_chan != 0) ||
	    (state->rc_num_rx_chan != 0)) && (attempts > 0)) {
		/* Another thread is closing a CM channel; wait for it */
		delay(drv_usectohz(100000));
		attempts--;
	}
}
2649 
2650 void
2651 ibd_rc_try_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path)
2652 {
2653 	ibt_status_t status;
2654 
2655 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
2656 		return;
2657 
2658 	status = ibd_rc_connect(state, ace, path,
2659 	    IBD_RC_SERVICE_ID_OFED_INTEROP);
2660 
2661 	if (status != IBT_SUCCESS) {
2662 		/* wait peer side remove stale channel */
2663 		delay(drv_usectohz(10000));
2664 		if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
2665 			return;
2666 		status = ibd_rc_connect(state, ace, path,
2667 		    IBD_RC_SERVICE_ID_OFED_INTEROP);
2668 	}
2669 
2670 	if (status != IBT_SUCCESS) {
2671 		/* wait peer side remove stale channel */
2672 		delay(drv_usectohz(10000));
2673 		if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
2674 			return;
2675 		(void) ibd_rc_connect(state, ace, path,
2676 		    IBD_RC_SERVICE_ID);
2677 	}
2678 }
2679 
2680 /*
2681  * Allocates channel and sets the ace->ac_chan to it.
2682  * Opens the channel.
2683  */
2684 ibt_status_t
2685 ibd_rc_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path,
2686     uint64_t ietf_cm_service_id)
2687 {
2688 	ibt_status_t status = 0;
2689 	ibt_rc_returns_t open_returns;
2690 	ibt_chan_open_args_t open_args;
2691 	ibd_rc_msg_hello_t hello_req_msg;
2692 	ibd_rc_msg_hello_t *hello_ack_msg;
2693 	ibd_rc_chan_t *chan;
2694 	ibt_ud_dest_query_attr_t dest_attrs;
2695 
2696 	ASSERT(ace != NULL);
2697 	ASSERT(ace->ac_mce == NULL);
2698 	ASSERT(ace->ac_chan == NULL);
2699 
2700 	if ((status = ibd_rc_alloc_chan(&chan, state, B_TRUE)) != IBT_SUCCESS) {
2701 		DPRINT(10, "ibd_rc_connect: ibd_rc_alloc_chan() failed");
2702 		return (status);
2703 	}
2704 
2705 	ace->ac_chan = chan;
2706 	chan->state = state;
2707 	chan->ace = ace;
2708 
2709 	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)ace);
2710 
2711 	hello_ack_msg = kmem_zalloc(sizeof (ibd_rc_msg_hello_t), KM_SLEEP);
2712 
2713 	/*
2714 	 * open the channels
2715 	 */
2716 	bzero(&open_args, sizeof (ibt_chan_open_args_t));
2717 	bzero(&open_returns, sizeof (ibt_rc_returns_t));
2718 
2719 	open_args.oc_cm_handler = ibd_rc_dispatch_actv_mad;
2720 	open_args.oc_cm_clnt_private = (void *)(uintptr_t)ace;
2721 
2722 	/*
2723 	 * update path record with the SID
2724 	 */
2725 	if ((status = ibt_query_ud_dest(ace->ac_dest, &dest_attrs))
2726 	    != IBT_SUCCESS) {
2727 		DPRINT(40, "ibd_rc_connect: ibt_query_ud_dest() failed, "
2728 		    "ret=%d", status);
2729 		return (status);
2730 	}
2731 
2732 	path->pi_sid =
2733 	    ietf_cm_service_id | ((dest_attrs.ud_dst_qpn) & 0xffffff);
2734 
2735 
2736 	/* pre-allocate memory for hello ack message */
2737 	open_returns.rc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2738 	open_returns.rc_priv_data = hello_ack_msg;
2739 
2740 	open_args.oc_path = path;
2741 
2742 	open_args.oc_path_rnr_retry_cnt	= 1;
2743 	open_args.oc_path_retry_cnt = 1;
2744 
2745 	/* We don't do RDMA */
2746 	open_args.oc_rdma_ra_out = 0;
2747 	open_args.oc_rdma_ra_in	= 0;
2748 
2749 	hello_req_msg.reserved_qpn = htonl(state->id_qpnum);
2750 	hello_req_msg.rx_mtu = htonl(state->rc_mtu);
2751 	open_args.oc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2752 	open_args.oc_priv_data = (void *)(&hello_req_msg);
2753 
2754 	ASSERT(open_args.oc_priv_data_len <= IBT_REQ_PRIV_DATA_SZ);
2755 	ASSERT(open_returns.rc_priv_data_len <= IBT_REP_PRIV_DATA_SZ);
2756 	ASSERT(open_args.oc_cm_handler != NULL);
2757 
2758 	status = ibt_open_rc_channel(chan->chan_hdl, IBT_OCHAN_NO_FLAGS,
2759 	    IBT_BLOCKING, &open_args, &open_returns);
2760 
2761 	if (status == IBT_SUCCESS) {
2762 		/* Success! */
2763 		DPRINT(2, "ibd_rc_connect: call ibt_open_rc_channel succ!");
2764 		state->rc_conn_succ++;
2765 		kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2766 		return (IBT_SUCCESS);
2767 	}
2768 
2769 	/* failure */
2770 	(void) ibt_flush_channel(chan->chan_hdl);
2771 	ibd_rc_free_chan(chan);
2772 	ace->ac_chan = NULL;
2773 
2774 	/* check open_returns report error and exit */
2775 	DPRINT(30, "ibd_rc_connect: call ibt_open_rc_chan fail."
2776 	    "ret status = %d, reason=%d, ace=%p, mtu=0x%x, qpn=0x%x,"
2777 	    " peer qpn=0x%x", status, (int)open_returns.rc_status, ace,
2778 	    hello_req_msg.rx_mtu, hello_req_msg.reserved_qpn,
2779 	    dest_attrs.ud_dst_qpn);
2780 	kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2781 	return (status);
2782 }
2783 
2784 void
2785 ibd_rc_signal_act_close(ibd_state_t *state, ibd_ace_t *ace)
2786 {
2787 	ibd_req_t *req;
2788 
2789 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2790 	if (req == NULL) {
2791 		ibd_print_warn(state, "ibd_rc_signal_act_close: alloc "
2792 		    "ibd_req_t fail");
2793 		mutex_enter(&state->rc_obs_act_chan_list.chan_list_mutex);
2794 		ace->ac_chan->next = state->rc_obs_act_chan_list.chan_list;
2795 		state->rc_obs_act_chan_list.chan_list = ace->ac_chan;
2796 		mutex_exit(&state->rc_obs_act_chan_list.chan_list_mutex);
2797 	} else {
2798 		req->rq_ptr = ace->ac_chan;
2799 		ibd_queue_work_slot(state, req, IBD_ASYNC_RC_CLOSE_ACT_CHAN);
2800 	}
2801 }
2802 
2803 void
2804 ibd_rc_signal_ace_recycle(ibd_state_t *state, ibd_ace_t *ace)
2805 {
2806 	ibd_req_t *req;
2807 
2808 	mutex_enter(&state->rc_ace_recycle_lock);
2809 	if (state->rc_ace_recycle != NULL) {
2810 		mutex_exit(&state->rc_ace_recycle_lock);
2811 		return;
2812 	}
2813 
2814 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2815 	if (req == NULL) {
2816 		mutex_exit(&state->rc_ace_recycle_lock);
2817 		return;
2818 	}
2819 
2820 	state->rc_ace_recycle = ace;
2821 	mutex_exit(&state->rc_ace_recycle_lock);
2822 	ASSERT(ace->ac_mce == NULL);
2823 	INC_REF(ace, 1);
2824 	IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
2825 	req->rq_ptr = ace;
2826 	ibd_queue_work_slot(state, req, IBD_ASYNC_RC_RECYCLE_ACE);
2827 }
2828 
/*
 * Close an active channel
 *
 * is_close_rc_chan: if B_TRUE, we will call ibt_close_rc_channel()
 *
 * What is done depends on chan->chan_state:
 *  - ACT_CLOSING / ACT_ESTAB: mark the channel ACT_CLOSED, remove the
 *    receive CQ handler, wait for the send side to go idle (all swqes
 *    back on the Tx/release lists and only one reference left on the
 *    ace), remove the send CQ handler, optionally close the RC channel,
 *    and free the channel.
 *  - ACT_REP_RECV: mark ACT_CLOSED, flush and free the channel.
 *  - ACT_ERROR and anything else: only logged.
 */
static void
ibd_rc_act_close(ibd_rc_chan_t *chan, boolean_t is_close_rc_chan)
{
	ibd_state_t *state;
	ibd_ace_t *ace;
	uint_t times;
	ibt_status_t ret;

	ASSERT(chan != NULL);

	chan->state->rc_act_close++;
	switch (chan->chan_state) {
	case IBD_RC_STATE_ACT_CLOSING:	/* stale, close it */
	case IBD_RC_STATE_ACT_ESTAB:
		DPRINT(30, "ibd_rc_act_close-1: close and free chan, "
		    "act_state=%d, chan=%p", chan->chan_state, chan);
		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
		/*
		 * Wait for the send queue to become empty. The limit used to
		 * be 50 iterations (5 seconds), but that was found not to be
		 * enough for IBTL to return all buffers and ace->ac_ref;
		 * 250 iterations (25 seconds) worked well. IBTL was observed
		 * to take about 17 seconds to clean a stale RC channel.
		 */
		times = 250;
		ace = chan->ace;
		ASSERT(ace != NULL);
		state = chan->state;
		ASSERT(state != NULL);
		/*
		 * The loop condition is evaluated with id_ac_mutex and both
		 * Tx list mutexes held. All three are dropped at the top of
		 * the loop body and re-taken at the bottom.
		 */
		mutex_enter(&state->id_ac_mutex);
		mutex_enter(&chan->tx_wqe_list.dl_mutex);
		mutex_enter(&chan->tx_rel_list.dl_mutex);
		while (((chan->tx_wqe_list.dl_cnt + chan->tx_rel_list.dl_cnt)
		    != chan->scq_size) || ((ace->ac_ref != 1) &&
		    (ace->ac_ref != (CYCLEVAL+1)))) {
			mutex_exit(&chan->tx_rel_list.dl_mutex);
			mutex_exit(&chan->tx_wqe_list.dl_mutex);
			mutex_exit(&state->id_ac_mutex);
			times--;
			if (times == 0) {
				/*
				 * Give up waiting. Note that the locks are
				 * NOT held when leaving the loop this way,
				 * and the values printed below are read
				 * without them.
				 */
				state->rc_act_close_not_clean++;
				DPRINT(40, "ibd_rc_act_close: dl_cnt(tx_wqe_"
				    "list=%d, tx_rel_list=%d) != chan->"
				    "scq_size=%d, OR ac_ref(=%d) not clean",
				    chan->tx_wqe_list.dl_cnt,
				    chan->tx_rel_list.dl_cnt,
				    chan->scq_size, ace->ac_ref);
				break;
			}
			/*
			 * Help the send side along by draining the send CQ
			 * ourselves, unless somebody else is polling it.
			 */
			mutex_enter(&chan->tx_poll_lock);
			if (chan->tx_poll_busy & IBD_CQ_POLLING) {
				DPRINT(40, "ibd_rc_act_close: multiple "
				    "polling threads");
				mutex_exit(&chan->tx_poll_lock);
			} else {
				chan->tx_poll_busy = IBD_CQ_POLLING;
				mutex_exit(&chan->tx_poll_lock);
				ibd_rc_drain_scq(chan, chan->scq_hdl);
				mutex_enter(&chan->tx_poll_lock);
				chan->tx_poll_busy = 0;
				mutex_exit(&chan->tx_poll_lock);
			}
			delay(drv_usectohz(100000));
			mutex_enter(&state->id_ac_mutex);
			mutex_enter(&chan->tx_wqe_list.dl_mutex);
			mutex_enter(&chan->tx_rel_list.dl_mutex);
		}
		/*
		 * times != 0 means the loop ended through its condition,
		 * i.e. with the three locks still held; release them. On the
		 * timeout path they were already released before the break.
		 */
		if (times != 0) {
			mutex_exit(&chan->tx_rel_list.dl_mutex);
			mutex_exit(&chan->tx_wqe_list.dl_mutex);
			mutex_exit(&state->id_ac_mutex);
		}

		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
		if (is_close_rc_chan) {
			ret = ibt_close_rc_channel(chan->chan_hdl,
			    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL,
			    0);
			if (ret != IBT_SUCCESS) {
				DPRINT(40, "ibd_rc_act_close: ibt_close_rc_"
				    "channel fail, chan=%p, ret=%d",
				    chan, ret);
			} else {
				DPRINT(30, "ibd_rc_act_close: ibt_close_rc_"
				    "channel succ, chan=%p", chan);
			}
		}

		/* The channel is freed whether or not the close succeeded */
		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_ACT_REP_RECV:
		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
		(void) ibt_flush_channel(chan->chan_hdl);
		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_ACT_ERROR:
		DPRINT(40, "ibd_rc_act_close: IBD_RC_STATE_ERROR branch");
		break;
	default:
		DPRINT(40, "ibd_rc_act_close: default branch, act_state=%d, "
		    "chan=%p", chan->chan_state, chan);
	}
}
2938 
2939 /*
2940  * Close a passive channel
2941  *
2942  * is_close_rc_chan: if B_TRUE, we will call ibt_close_rc_channel()
2943  *
2944  * is_timeout_close: if B_TRUE, this function is called by the connection
2945  * reaper (refer to function ibd_rc_conn_timeout_call). When the connection
2946  * reaper calls ibd_rc_pas_close(), and if it finds that dl_bufs_outstanding
2947  * or chan->rcq_invoking is non-zero, then it can simply put that channel back
2948  * on the passive channels list and move on, since it might be an indication
2949  * that the channel became active again by the time we started it's cleanup.
2950  * It is costlier to do the cleanup and then reinitiate the channel
2951  * establishment and hence it will help to be conservative when we do the
2952  * cleanup.
2953  */
2954 int
2955 ibd_rc_pas_close(ibd_rc_chan_t *chan, boolean_t is_close_rc_chan,
2956     boolean_t is_timeout_close)
2957 {
2958 	uint_t times;
2959 	ibt_status_t ret;
2960 
2961 	ASSERT(chan != NULL);
2962 	chan->state->rc_pas_close++;
2963 
2964 	switch (chan->chan_state) {
2965 	case IBD_RC_STATE_PAS_ESTAB:
2966 		if (is_timeout_close) {
2967 			if ((chan->rcq_invoking != 0) ||
2968 			    ((!chan->state->rc_enable_srq) &&
2969 			    (chan->rx_wqe_list.dl_bufs_outstanding > 0))) {
2970 				if (ibd_rc_re_add_to_pas_chan_list(chan)) {
2971 					return (DDI_FAILURE);
2972 				}
2973 			}
2974 		}
2975 		/*
2976 		 * First, stop receive interrupts; this stops the
2977 		 * connection from handing up buffers to higher layers.
2978 		 * Wait for receive buffers to be returned; give up
2979 		 * after 5 seconds.
2980 		 */
2981 		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
2982 		/* Wait 0.01 second to let ibt_set_cq_handler() take effect */
2983 		delay(drv_usectohz(10000));
2984 		if (!chan->state->rc_enable_srq) {
2985 			times = 50;
2986 			while (chan->rx_wqe_list.dl_bufs_outstanding > 0) {
2987 				delay(drv_usectohz(100000));
2988 				if (--times == 0) {
2989 					DPRINT(40, "ibd_rc_pas_close : "
2990 					    "reclaiming failed");
2991 					ibd_rc_poll_rcq(chan, chan->rcq_hdl);
2992 					ibt_set_cq_handler(chan->rcq_hdl,
2993 					    ibd_rc_rcq_handler,
2994 					    (void *)(uintptr_t)chan);
2995 					return (DDI_FAILURE);
2996 				}
2997 			}
2998 		}
2999 		times = 50;
3000 		while (chan->rcq_invoking != 0) {
3001 			delay(drv_usectohz(100000));
3002 			if (--times == 0) {
3003 				DPRINT(40, "ibd_rc_pas_close : "
3004 				    "rcq handler is being invoked");
3005 				chan->state->rc_pas_close_rcq_invoking++;
3006 				break;
3007 			}
3008 		}
3009 		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
3010 		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
3011 		DPRINT(30, "ibd_rc_pas_close-1: close and free chan, "
3012 		    "chan_state=%d, chan=%p", chan->chan_state, chan);
3013 		if (is_close_rc_chan) {
3014 			ret = ibt_close_rc_channel(chan->chan_hdl,
3015 			    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL,
3016 			    0);
3017 			if (ret != IBT_SUCCESS) {
3018 				DPRINT(40, "ibd_rc_pas_close: ibt_close_rc_"
3019 				    "channel() fail, chan=%p, ret=%d", chan,
3020 				    ret);
3021 			} else {
3022 				DPRINT(30, "ibd_rc_pas_close: ibt_close_rc_"
3023 				    "channel() succ, chan=%p", chan);
3024 			}
3025 		}
3026 		ibd_rc_free_chan(chan);
3027 		break;
3028 	case IBD_RC_STATE_PAS_REQ_RECV:
3029 		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
3030 		(void) ibt_flush_channel(chan->chan_hdl);
3031 		ibd_rc_free_chan(chan);
3032 		break;
3033 	default:
3034 		DPRINT(40, "ibd_rc_pas_close: default, chan_state=%d, chan=%p",
3035 		    chan->chan_state, chan);
3036 	}
3037 	return (DDI_SUCCESS);
3038 }
3039 
3040 /*
3041  * Passive Side:
3042  *	Handle an incoming CM REQ from active side.
3043  *
3044  *	If success, this function allocates an ibd_rc_chan_t, then
3045  * assigns it to "*ret_conn".
3046  */
3047 static ibt_cm_status_t
3048 ibd_rc_handle_req(void *arg, ibd_rc_chan_t **ret_conn,
3049     ibt_cm_event_t *ibt_cm_event, ibt_cm_return_args_t *ret_args,
3050     void *ret_priv_data)
3051 {
3052 	ibd_rc_msg_hello_t *hello_msg;
3053 	ibd_state_t *state = (ibd_state_t *)arg;
3054 	ibd_rc_chan_t *chan;
3055 
3056 	if (ibd_rc_alloc_chan(&chan, state, B_FALSE) != IBT_SUCCESS) {
3057 		DPRINT(40, "ibd_rc_handle_req: ibd_rc_alloc_chan() failed");
3058 		return (IBT_CM_REJECT);
3059 	}
3060 
3061 	ibd_rc_add_to_chan_list(&state->rc_pass_chan_list, chan);
3062 
3063 	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)chan);
3064 
3065 	if (!state->rc_enable_srq) {
3066 		if (ibd_rc_init_rxlist(chan) != DDI_SUCCESS) {
3067 			ibd_rc_free_chan(chan);
3068 			DPRINT(40, "ibd_rc_handle_req: ibd_rc_init_rxlist() "
3069 			    "failed");
3070 			return (IBT_CM_REJECT);
3071 		}
3072 	}
3073 
3074 	ret_args->cm_ret.rep.cm_channel = chan->chan_hdl;
3075 
3076 	/* We don't do RDMA */
3077 	ret_args->cm_ret.rep.cm_rdma_ra_out = 0;
3078 	ret_args->cm_ret.rep.cm_rdma_ra_in = 0;
3079 
3080 	ret_args->cm_ret.rep.cm_rnr_retry_cnt = 7;
3081 	ret_args->cm_ret_len = sizeof (ibd_rc_msg_hello_t);
3082 
3083 	hello_msg = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
3084 	DPRINT(30, "ibd_rc_handle_req(): peer qpn=0x%x, peer mtu=0x%x",
3085 	    ntohl(hello_msg->reserved_qpn), ntohl(hello_msg->rx_mtu));
3086 
3087 	hello_msg = (ibd_rc_msg_hello_t *)ret_priv_data;
3088 	hello_msg->reserved_qpn = htonl(state->id_qpnum);
3089 	hello_msg->rx_mtu = htonl(state->rc_mtu);
3090 
3091 	chan->chan_state = IBD_RC_STATE_PAS_REQ_RECV;	/* ready to receive */
3092 	*ret_conn = chan;
3093 
3094 	return (IBT_CM_ACCEPT);
3095 }
3096 
3097 /*
3098  * ibd_rc_handle_act_estab -- handler for connection established completion
3099  * for active side.
3100  */
3101 static ibt_cm_status_t
3102 ibd_rc_handle_act_estab(ibd_ace_t *ace)
3103 {
3104 	ibt_status_t result;
3105 
3106 	switch (ace->ac_chan->chan_state) {
3107 		case IBD_RC_STATE_ACT_REP_RECV:
3108 			ace->ac_chan->chan_state = IBD_RC_STATE_ACT_ESTAB;
3109 			result = ibt_enable_cq_notify(ace->ac_chan->rcq_hdl,
3110 			    IBT_NEXT_COMPLETION);
3111 			if (result != IBT_SUCCESS) {
3112 				DPRINT(40, "ibd_rc_handle_act_estab: "
3113 				    "ibt_enable_cq_notify(rcq) "
3114 				    "failed: status %d", result);
3115 				return (IBT_CM_REJECT);
3116 			}
3117 			break;
3118 		default:
3119 			DPRINT(40, "ibd_rc_handle_act_estab: default "
3120 			    "branch, act_state=%d", ace->ac_chan->chan_state);
3121 			return (IBT_CM_REJECT);
3122 	}
3123 	return (IBT_CM_ACCEPT);
3124 }
3125 
3126 /*
3127  * ibd_rc_handle_pas_estab -- handler for connection established completion
3128  * for passive side.
3129  */
3130 static ibt_cm_status_t
3131 ibd_rc_handle_pas_estab(ibd_rc_chan_t *chan)
3132 {
3133 	ibt_status_t result;
3134 
3135 	switch (chan->chan_state) {
3136 		case IBD_RC_STATE_PAS_REQ_RECV:
3137 			chan->chan_state = IBD_RC_STATE_PAS_ESTAB;
3138 
3139 			result = ibt_enable_cq_notify(chan->rcq_hdl,
3140 			    IBT_NEXT_COMPLETION);
3141 			if (result != IBT_SUCCESS) {
3142 				DPRINT(40, "ibd_rc_handle_pas_estab: "
3143 				    "ibt_enable_cq_notify(rcq) "
3144 				    "failed: status %d", result);
3145 				return (IBT_CM_REJECT);
3146 			}
3147 			break;
3148 		default:
3149 			DPRINT(40, "ibd_rc_handle_pas_estab: default "
3150 			    "branch, chan_state=%d", chan->chan_state);
3151 			return (IBT_CM_REJECT);
3152 	}
3153 	return (IBT_CM_ACCEPT);
3154 }
3155 
3156 /* ARGSUSED */
3157 static ibt_cm_status_t
3158 ibd_rc_dispatch_actv_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
3159     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
3160     ibt_priv_data_len_t ret_len_max)
3161 {
3162 	ibt_cm_status_t result = IBT_CM_ACCEPT;
3163 	ibd_ace_t *ace = (ibd_ace_t *)(uintptr_t)arg;
3164 	ibd_rc_chan_t *rc_chan;
3165 	ibd_state_t *state;
3166 	ibd_rc_msg_hello_t *hello_ack;
3167 
3168 	switch (ibt_cm_event->cm_type) {
3169 	case IBT_CM_EVENT_REP_RCV:
3170 		ASSERT(ace->ac_chan != NULL);
3171 		ASSERT(ace->ac_chan->chan_state == IBD_RC_STATE_INIT);
3172 		hello_ack = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
3173 		DPRINT(30, "ibd_rc_handle_rep: hello_ack->mtu=0x%x, "
3174 		    "hello_ack->qpn=0x%x", ntohl(hello_ack->rx_mtu),
3175 		    ntohl(hello_ack->reserved_qpn));
3176 		ace->ac_chan->chan_state = IBD_RC_STATE_ACT_REP_RECV;
3177 		break;
3178 
3179 	case IBT_CM_EVENT_CONN_EST:
3180 		ASSERT(ace->ac_chan != NULL);
3181 		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_CONN_EST, "
3182 		    "ace=%p, act_state=%d, chan=%p",
3183 		    ace, ace->ac_chan->chan_state, ace->ac_chan);
3184 		result = ibd_rc_handle_act_estab(ace);
3185 		break;
3186 
3187 	case IBT_CM_EVENT_CONN_CLOSED:
3188 		rc_chan = ace->ac_chan;
3189 		if (rc_chan == NULL) {
3190 			DPRINT(40, "ibd_rc_dispatch_actv_mad: "
3191 			    "rc_chan==NULL, IBT_CM_EVENT_CONN_CLOSED");
3192 			return (IBT_CM_ACCEPT);
3193 		}
3194 		state = rc_chan->state;
3195 		mutex_enter(&state->id_ac_mutex);
3196 		if ((rc_chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
3197 		    ((ace = ibd_acache_find(state, &ace->ac_mac, B_FALSE, 0))
3198 		    != NULL) && (ace == rc_chan->ace)) {
3199 			rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
3200 			ASSERT(ace->ac_mce == NULL);
3201 			INC_REF(ace, 1);
3202 			IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
3203 			mutex_exit(&state->id_ac_mutex);
3204 			DPRINT(30, "ibd_rc_dispatch_actv_mad: "
3205 			    "IBT_CM_EVENT_CONN_CLOSED, ace=%p, chan=%p, "
3206 			    "reason=%d", ace, rc_chan,
3207 			    ibt_cm_event->cm_event.closed);
3208 		} else {
3209 			mutex_exit(&state->id_ac_mutex);
3210 			state->rc_act_close_simultaneous++;
3211 			DPRINT(40, "ibd_rc_dispatch_actv_mad: other thread "
3212 			    "is closing it, IBT_CM_EVENT_CONN_CLOSED, "
3213 			    "chan_state=%d", rc_chan->chan_state);
3214 			return (IBT_CM_ACCEPT);
3215 		}
3216 		ibd_rc_act_close(rc_chan, B_FALSE);
3217 		mutex_enter(&state->id_ac_mutex);
3218 		ace->ac_chan = NULL;
3219 		ASSERT(ace->ac_ref != 0);
3220 		atomic_dec_32(&ace->ac_ref);
3221 		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
3222 			IBD_ACACHE_INSERT_FREE(state, ace);
3223 			ace->ac_ref = 0;
3224 		} else {
3225 			ace->ac_ref |= CYCLEVAL;
3226 			state->rc_delay_ace_recycle++;
3227 		}
3228 		mutex_exit(&state->id_ac_mutex);
3229 		break;
3230 
3231 	case IBT_CM_EVENT_FAILURE:
3232 		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_FAILURE,"
3233 		    "ace=%p, chan=%p, code: %d, msg: %d, reason=%d",
3234 		    ace, ace->ac_chan,
3235 		    ibt_cm_event->cm_event.failed.cf_code,
3236 		    ibt_cm_event->cm_event.failed.cf_msg,
3237 		    ibt_cm_event->cm_event.failed.cf_reason);
3238 		/*
3239 		 * Don't need free resource here. The resource is freed
3240 		 * at function ibd_rc_connect()
3241 		 */
3242 		break;
3243 
3244 	case IBT_CM_EVENT_MRA_RCV:
3245 		DPRINT(40, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_MRA_RCV");
3246 		break;
3247 	case IBT_CM_EVENT_LAP_RCV:
3248 		DPRINT(40, "ibd_rc_dispatch_actv_mad: LAP message received");
3249 		break;
3250 	case IBT_CM_EVENT_APR_RCV:
3251 		DPRINT(40, "ibd_rc_dispatch_actv_mad: APR message received");
3252 		break;
3253 	default:
3254 		DPRINT(40, "ibd_rc_dispatch_actv_mad: default branch, "
3255 		    "ibt_cm_event->cm_type=%d", ibt_cm_event->cm_type);
3256 		break;
3257 	}
3258 
3259 	return (result);
3260 }
3261 
3262 /* ARGSUSED */
3263 static ibt_cm_status_t
3264 ibd_rc_dispatch_pass_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
3265     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
3266     ibt_priv_data_len_t ret_len_max)
3267 {
3268 	ibt_cm_status_t result = IBT_CM_ACCEPT;
3269 	ibd_rc_chan_t *chan;
3270 
3271 	if (ibt_cm_event->cm_type == IBT_CM_EVENT_REQ_RCV) {
3272 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_REQ_RCV,"
3273 		    "req_pkey=%x", ibt_cm_event->cm_event.req.req_pkey);
3274 		/* Receive an incoming CM REQ from active side */
3275 		result = ibd_rc_handle_req(arg, &chan, ibt_cm_event, ret_args,
3276 		    ret_priv_data);
3277 		return (result);
3278 	}
3279 
3280 	if (ibt_cm_event->cm_channel == 0) {
3281 		DPRINT(30, "ibd_rc_dispatch_pass_mad: "
3282 		    "ERROR ibt_cm_event->cm_channel == 0");
3283 		return (IBT_CM_REJECT);
3284 	}
3285 
3286 	chan =
3287 	    (ibd_rc_chan_t *)ibt_get_chan_private(ibt_cm_event->cm_channel);
3288 	if (chan == NULL) {
3289 		DPRINT(40, "ibd_rc_dispatch_pass_mad: conn == 0");
3290 		return (IBT_CM_REJECT);
3291 	}
3292 
3293 	switch (ibt_cm_event->cm_type) {
3294 	case IBT_CM_EVENT_CONN_EST:
3295 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_EST, "
3296 		    "chan=%p", chan);
3297 		result = ibd_rc_handle_pas_estab(chan);
3298 		break;
3299 	case IBT_CM_EVENT_CONN_CLOSED:
3300 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_CLOSED,"
3301 		    " chan=%p, reason=%d", chan, ibt_cm_event->cm_event.closed);
3302 		chan = ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list,
3303 		    chan);
3304 		if (chan != NULL)
3305 			(void) ibd_rc_pas_close(chan, B_FALSE, B_FALSE);
3306 		break;
3307 	case IBT_CM_EVENT_FAILURE:
3308 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_FAILURE,"
3309 		    " chan=%p, code: %d, msg: %d, reason=%d", chan,
3310 		    ibt_cm_event->cm_event.failed.cf_code,
3311 		    ibt_cm_event->cm_event.failed.cf_msg,
3312 		    ibt_cm_event->cm_event.failed.cf_reason);
3313 		chan = ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list,
3314 		    chan);
3315 		if (chan != NULL)
3316 			(void) ibd_rc_pas_close(chan, B_FALSE, B_FALSE);
3317 		return (IBT_CM_ACCEPT);
3318 	case IBT_CM_EVENT_MRA_RCV:
3319 		DPRINT(40, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_MRA_RCV");
3320 		break;
3321 	case IBT_CM_EVENT_LAP_RCV:
3322 		DPRINT(40, "ibd_rc_dispatch_pass_mad: LAP message received");
3323 		break;
3324 	case IBT_CM_EVENT_APR_RCV:
3325 		DPRINT(40, "ibd_rc_dispatch_pass_mad: APR message received");
3326 		break;
3327 	default:
3328 		DPRINT(40, "ibd_rc_dispatch_pass_mad: default, type=%d, "
3329 		    "chan=%p", ibt_cm_event->cm_type, chan);
3330 		break;
3331 	}
3332 
3333 	return (result);
3334 }
3335