1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2018 Joyent, Inc.
25 */
26 /* Copyright (c) 1990 Mentat Inc. */
27
28 /*
29 * An implementation of the IPoIB-CM standard based on PSARC 2009/593.
30 */
31 #include <sys/types.h>
32 #include <sys/conf.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 #include <sys/dlpi.h>
41 #include <sys/mac_provider.h>
42
43 #include <sys/pattr.h> /* for HCK_FULLCKSUM */
44 #include <sys/atomic.h> /* for atomic_add*() */
45 #include <sys/ethernet.h> /* for ETHERTYPE_IP */
46 #include <netinet/in.h> /* for netinet/ip.h below */
47 #include <netinet/ip.h> /* for struct ip */
48 #include <inet/common.h> /* for inet/ip.h below */
49 #include <inet/ip.h> /* for ipha_t */
50 #include <inet/ip_if.h> /* for ETHERTYPE_IPV6 */
51 #include <inet/ip6.h> /* for ip6_t */
52 #include <netinet/icmp6.h> /* for icmp6_t */
53
54 #include <sys/ib/clients/ibd/ibd.h>
55
56 extern ibd_global_state_t ibd_gstate;
57 extern int ibd_rc_conn_timeout;
58 uint_t ibd_rc_tx_softintr = 1;
/*
 * If the number of WRs in the receive queue of an RC connection is less
 * than IBD_RC_RX_WR_THRESHOLD, we will post more receive WRs into it.
 */
63 #define IBD_RC_RX_WR_THRESHOLD 0x20
64
65 /*
66 * If the number of free SWQEs (or large Tx buf) is larger than or equal to
67 * IBD_RC_TX_FREE_THRESH, we will call mac_tx_update to notify GLD to continue
68 * transmitting packets.
69 */
70 #define IBD_RC_TX_FREE_THRESH 8
71
72 #define IBD_RC_QPN_TO_SID(qpn) \
73 ((uint64_t)(IBD_RC_SERVICE_ID | ((qpn) & 0xffffff)))
74
75 /* For interop with legacy OFED */
76 #define IBD_RC_QPN_TO_SID_OFED_INTEROP(qpn) \
77 ((uint64_t)(IBD_RC_SERVICE_ID_OFED_INTEROP | ((qpn) & 0xffffff)))
78
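/*
 * Number of bytes of the original datagram, beyond its IP header(s), that
 * are returned in the ICMP error. RFC 792 asks for the Internet Header plus
 * at least 64 bits of the original datagram's data; we return 64 bytes.
 */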
80 #define IBD_RC_IP_ICMP_RETURN_DATA_BYTES 64
81
82
83 /* Functions for Reliable Connected Mode */
84 /* Connection Setup/Close Functions */
85 static ibt_cm_status_t ibd_rc_dispatch_pass_mad(void *,
86 ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
87 static ibt_cm_status_t ibd_rc_dispatch_actv_mad(void *,
88 ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
89 static void ibd_rc_act_close(ibd_rc_chan_t *, boolean_t);
90
91 static inline void ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *,
92 ibd_rc_chan_t *);
93 static inline ibd_rc_chan_t *ibd_rc_rm_header_chan_list(
94 ibd_rc_chan_list_t *);
95 static inline ibd_rc_chan_t *ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *,
96 ibd_rc_chan_t *);
97
98 /* CQ handlers */
99 static void ibd_rc_rcq_handler(ibt_cq_hdl_t, void *);
100 static void ibd_rc_scq_handler(ibt_cq_hdl_t, void *);
101 static void ibd_rc_poll_rcq(ibd_rc_chan_t *, ibt_cq_hdl_t);
102
103 /* Receive Functions */
104 static int ibd_rc_post_srq(ibd_state_t *, ibd_rwqe_t *);
105 static void ibd_rc_srq_freemsg_cb(char *);
106 static void ibd_rc_srq_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
107
108 static int ibd_rc_post_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
109 static void ibd_rc_freemsg_cb(char *);
110 static void ibd_rc_process_rx(ibd_rc_chan_t *, ibd_rwqe_t *, ibt_wc_t *);
111 static void ibd_rc_free_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
112 static void ibd_rc_fini_rxlist(ibd_rc_chan_t *);
113
114
115 /* Send Functions */
116 static void ibd_rc_release_swqe(ibd_rc_chan_t *, ibd_swqe_t *);
117 static int ibd_rc_init_txlist(ibd_rc_chan_t *);
118 static void ibd_rc_fini_txlist(ibd_rc_chan_t *);
119 static uint_t ibd_rc_tx_recycle(caddr_t);
120
121
122 void
ibd_async_rc_close_act_chan(ibd_state_t * state,ibd_req_t * req)123 ibd_async_rc_close_act_chan(ibd_state_t *state, ibd_req_t *req)
124 {
125 ibd_rc_chan_t *rc_chan = req->rq_ptr;
126 ibd_ace_t *ace;
127
128 while (rc_chan != NULL) {
129 ace = rc_chan->ace;
130 ASSERT(ace != NULL);
131 /* Close old RC channel */
132 ibd_rc_act_close(rc_chan, B_TRUE);
133 mutex_enter(&state->id_ac_mutex);
134 ASSERT(ace->ac_ref != 0);
135 atomic_dec_32(&ace->ac_ref);
136 ace->ac_chan = NULL;
137 if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
138 IBD_ACACHE_INSERT_FREE(state, ace);
139 ace->ac_ref = 0;
140 } else {
141 ace->ac_ref |= CYCLEVAL;
142 state->rc_delay_ace_recycle++;
143 }
144 mutex_exit(&state->id_ac_mutex);
145 rc_chan = ibd_rc_rm_header_chan_list(
146 &state->rc_obs_act_chan_list);
147 }
148 }
149
150 void
ibd_async_rc_recycle_ace(ibd_state_t * state,ibd_req_t * req)151 ibd_async_rc_recycle_ace(ibd_state_t *state, ibd_req_t *req)
152 {
153 ibd_ace_t *ace = req->rq_ptr;
154 ibd_rc_chan_t *rc_chan;
155
156 ASSERT(ace != NULL);
157 rc_chan = ace->ac_chan;
158 ASSERT(rc_chan != NULL);
159 /* Close old RC channel */
160 ibd_rc_act_close(rc_chan, B_TRUE);
161 mutex_enter(&state->id_ac_mutex);
162 ASSERT(ace->ac_ref != 0);
163 atomic_dec_32(&ace->ac_ref);
164 ace->ac_chan = NULL;
165 if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
166 IBD_ACACHE_INSERT_FREE(state, ace);
167 ace->ac_ref = 0;
168 } else {
169 ace->ac_ref |= CYCLEVAL;
170 state->rc_delay_ace_recycle++;
171 }
172 mutex_exit(&state->id_ac_mutex);
173 mutex_enter(&state->rc_ace_recycle_lock);
174 state->rc_ace_recycle = NULL;
175 mutex_exit(&state->rc_ace_recycle_lock);
176 }
177
178 /* Simple ICMP IP Header Template */
179 static const ipha_t icmp_ipha = {
180 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
181 };
182
183 /* Packet is too big. Send ICMP packet to GLD to request a smaller MTU */
184 void
ibd_async_rc_process_too_big(ibd_state_t * state,ibd_req_t * req)185 ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req)
186 {
187 mblk_t *mp = req->rq_ptr;
188 ibd_ace_t *ace = req->rq_ptr2;
189 uint16_t mtu = state->id_mtu - IPOIB_HDRSIZE;
190 uint_t len_needed;
191 size_t msg_len;
192 mblk_t *pmtu_mp;
193 ushort_t sap;
194 ib_header_info_t *ibha; /* ib header for pmtu_pkt */
195 /*
196 * ipha: IP header for pmtu_pkt
197 * old_ipha: IP header for old packet
198 */
199 ipha_t *ipha, *old_ipha;
200 icmph_t *icmph;
201
202 sap = ntohs(((ipoib_hdr_t *)mp->b_rptr)->ipoib_type);
203
204 if (!pullupmsg(mp, -1)) {
205 DPRINT(40, "ibd_async_rc_process_too_big: pullupmsg fail");
206 goto too_big_fail;
207 }
208 /* move to IP header. */
209 mp->b_rptr += IPOIB_HDRSIZE;
210 old_ipha = (ipha_t *)mp->b_rptr;
211
212 len_needed = IPH_HDR_LENGTH(old_ipha);
213 if (old_ipha->ipha_protocol == IPPROTO_ENCAP) {
214 len_needed += IPH_HDR_LENGTH(((uchar_t *)old_ipha +
215 len_needed));
216 } else if (old_ipha->ipha_protocol == IPPROTO_IPV6) {
217 ip6_t *ip6h = (ip6_t *)((uchar_t *)old_ipha
218 + len_needed);
219 len_needed += ip_hdr_length_v6(mp, ip6h);
220 }
221 len_needed += IBD_RC_IP_ICMP_RETURN_DATA_BYTES;
222 msg_len = msgdsize(mp);
223 if (msg_len > len_needed) {
224 (void) adjmsg(mp, len_needed - msg_len);
225 msg_len = len_needed;
226 }
227
228 if ((pmtu_mp = allocb(sizeof (ib_header_info_t) + sizeof (ipha_t)
229 + sizeof (icmph_t), BPRI_MED)) == NULL) {
230 DPRINT(40, "ibd_async_rc_process_too_big: allocb fail");
231 goto too_big_fail;
232 }
233 pmtu_mp->b_cont = mp;
234 pmtu_mp->b_wptr = pmtu_mp->b_rptr + sizeof (ib_header_info_t)
235 + sizeof (ipha_t) + sizeof (icmph_t);
236
237 ibha = (ib_header_info_t *)pmtu_mp->b_rptr;
238
239 /* Fill IB header */
240 bcopy(&state->id_macaddr, &ibha->ib_dst, IPOIB_ADDRL);
241 /*
242 * If the GRH is not valid, indicate to GLDv3 by setting
243 * the VerTcFlow field to 0.
244 */
245 ibha->ib_grh.ipoib_vertcflow = 0;
246 ibha->ipib_rhdr.ipoib_type = htons(sap);
247 ibha->ipib_rhdr.ipoib_mbz = 0;
248
249 /* Fill IP header */
250 ipha = (ipha_t *)&ibha[1];
251 *ipha = icmp_ipha;
252 ipha->ipha_src = old_ipha->ipha_dst;
253 ipha->ipha_dst = old_ipha->ipha_src;
254 ipha->ipha_ttl = old_ipha->ipha_ttl;
255 msg_len += sizeof (icmp_ipha) + sizeof (icmph_t);
256 if (msg_len > IP_MAXPACKET) {
257 ibd_print_warn(state, "ibd_rc_process_too_big_pkt: msg_len(%d) "
258 "> IP_MAXPACKET", (uint32_t)msg_len);
259 (void) adjmsg(mp, IP_MAXPACKET - msg_len);
260 msg_len = IP_MAXPACKET;
261 }
262 ipha->ipha_length = htons((uint16_t)msg_len);
263 ipha->ipha_hdr_checksum = 0;
264 ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
265
266 /* Fill ICMP body */
267 icmph = (icmph_t *)&ipha[1];
268 bzero(icmph, sizeof (icmph_t));
269 icmph->icmph_type = ICMP_DEST_UNREACHABLE;
270 icmph->icmph_code = ICMP_FRAGMENTATION_NEEDED;
271 icmph->icmph_du_mtu = htons(mtu);
272 icmph->icmph_checksum = 0;
273 icmph->icmph_checksum = IP_CSUM(pmtu_mp,
274 (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0);
275
276 mac_hcksum_set(pmtu_mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
277
278 DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, "
279 "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d",
280 sap, ipha->ipha_src, ipha->ipha_dst, ipha->ipha_ttl,
281 len_needed, (uint32_t)msg_len);
282
283 mac_rx(state->id_mh, state->id_rh, pmtu_mp);
284
285 mutex_enter(&ace->tx_too_big_mutex);
286 ace->tx_too_big_ongoing = B_FALSE;
287 mutex_exit(&ace->tx_too_big_mutex);
288 return;
289
290 too_big_fail:
291 /* Drop packet */
292 freemsg(mp);
293 mutex_enter(&ace->tx_too_big_mutex);
294 ace->tx_too_big_ongoing = B_FALSE;
295 mutex_exit(&ace->tx_too_big_mutex);
296 }
297
298 /*
299 * Check all active/passive channels. If any ative/passive
300 * channel has not been used for a long time, close it.
301 */
302 void
ibd_rc_conn_timeout_call(void * carg)303 ibd_rc_conn_timeout_call(void *carg)
304 {
305 ibd_state_t *state = carg;
306 ibd_ace_t *ace, *pre_ace;
307 ibd_rc_chan_t *chan, *pre_chan, *next_chan;
308 ibd_req_t *req;
309
310 /* Check all active channels. If chan->is_used == B_FALSE, close it */
311 mutex_enter(&state->id_ac_mutex);
312 ace = list_head(&state->id_ah_active);
313 while ((pre_ace = ace) != NULL) {
314 ace = list_next(&state->id_ah_active, ace);
315 if (pre_ace->ac_chan != NULL) {
316 chan = pre_ace->ac_chan;
317 ASSERT(state->id_enable_rc == B_TRUE);
318 if (chan->chan_state == IBD_RC_STATE_ACT_ESTAB) {
319 if (chan->is_used == B_FALSE) {
320 state->rc_timeout_act++;
321 INC_REF(pre_ace, 1);
322 IBD_ACACHE_PULLOUT_ACTIVE(state,
323 pre_ace);
324 chan->chan_state =
325 IBD_RC_STATE_ACT_CLOSING;
326 ibd_rc_signal_act_close(state, pre_ace);
327 } else {
328 chan->is_used = B_FALSE;
329 }
330 }
331 }
332 }
333 mutex_exit(&state->id_ac_mutex);
334
335 /* Check all passive channels. If chan->is_used == B_FALSE, close it */
336 mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
337 next_chan = state->rc_pass_chan_list.chan_list;
338 pre_chan = NULL;
339 while ((chan = next_chan) != NULL) {
340 next_chan = chan->next;
341 if (chan->is_used == B_FALSE) {
342 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
343 if (req != NULL) {
344 /* remove it */
345 state->rc_timeout_pas++;
346 req->rq_ptr = chan;
347 ibd_queue_work_slot(state, req,
348 IBD_ASYNC_RC_CLOSE_PAS_CHAN);
349 } else {
350 ibd_print_warn(state, "ibd_rc_conn_timeout: "
351 "alloc ibd_req_t fail");
352 if (pre_chan == NULL) {
353 state->rc_pass_chan_list.chan_list =
354 chan;
355 } else {
356 pre_chan->next = chan;
357 }
358 pre_chan = chan;
359 }
360 } else {
361 if (pre_chan == NULL) {
362 state->rc_pass_chan_list.chan_list = chan;
363 } else {
364 pre_chan->next = chan;
365 }
366 pre_chan = chan;
367 chan->is_used = B_FALSE;
368 }
369 }
370 if (pre_chan != NULL) {
371 pre_chan->next = NULL;
372 } else {
373 state->rc_pass_chan_list.chan_list = NULL;
374 }
375 mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
376
377 mutex_enter(&state->rc_timeout_lock);
378 if (state->rc_timeout_start == B_TRUE) {
379 state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state,
380 SEC_TO_TICK(ibd_rc_conn_timeout));
381 }
382 mutex_exit(&state->rc_timeout_lock);
383 }
384
385 #ifdef DEBUG
386 /*
387 * ibd_rc_update_stats - update driver private kstat counters
388 *
389 * This routine will dump the internal statistics counters for ibd's
390 * Reliable Connected Mode. The current stats dump values will
391 * be sent to the kernel status area.
392 */
393 static int
ibd_rc_update_stats(kstat_t * ksp,int rw)394 ibd_rc_update_stats(kstat_t *ksp, int rw)
395 {
396 ibd_state_t *state;
397 ibd_rc_stat_t *ibd_rc_ksp;
398
399 if (rw == KSTAT_WRITE)
400 return (EACCES);
401
402 state = (ibd_state_t *)ksp->ks_private;
403 ASSERT(state != NULL);
404 ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
405
406 ibd_rc_ksp->rc_rcv_trans_byte.value.ul = state->rc_rcv_trans_byte;
407 ibd_rc_ksp->rc_rcv_trans_pkt.value.ul = state->rc_rcv_trans_pkt;
408 ibd_rc_ksp->rc_rcv_copy_byte.value.ul = state->rc_rcv_copy_byte;
409 ibd_rc_ksp->rc_rcv_copy_pkt.value.ul = state->rc_rcv_copy_pkt;
410 ibd_rc_ksp->rc_rcv_alloc_fail.value.ul = state->rc_rcv_alloc_fail;
411
412 ibd_rc_ksp->rc_rcq_err.value.ul = state->rc_rcq_err;
413
414 ibd_rc_ksp->rc_rwqe_short.value.ul = state->rc_rwqe_short;
415
416 ibd_rc_ksp->rc_xmt_bytes.value.ul = state->rc_xmt_bytes;
417 ibd_rc_ksp->rc_xmt_small_pkt.value.ul = state->rc_xmt_small_pkt;
418 ibd_rc_ksp->rc_xmt_fragmented_pkt.value.ul =
419 state->rc_xmt_fragmented_pkt;
420 ibd_rc_ksp->rc_xmt_map_fail_pkt.value.ul = state->rc_xmt_map_fail_pkt;
421 ibd_rc_ksp->rc_xmt_map_succ_pkt.value.ul = state->rc_xmt_map_succ_pkt;
422 ibd_rc_ksp->rc_ace_not_found.value.ul = state->rc_ace_not_found;
423
424 ibd_rc_ksp->rc_scq_no_swqe.value.ul = state->rc_scq_no_swqe;
425 ibd_rc_ksp->rc_scq_no_largebuf.value.ul = state->rc_scq_no_largebuf;
426 ibd_rc_ksp->rc_swqe_short.value.ul = state->rc_swqe_short;
427 ibd_rc_ksp->rc_swqe_mac_update.value.ul = state->rc_swqe_mac_update;
428 ibd_rc_ksp->rc_xmt_buf_short.value.ul = state->rc_xmt_buf_short;
429 ibd_rc_ksp->rc_xmt_buf_mac_update.value.ul =
430 state->rc_xmt_buf_mac_update;
431
432 ibd_rc_ksp->rc_conn_succ.value.ul = state->rc_conn_succ;
433 ibd_rc_ksp->rc_conn_fail.value.ul = state->rc_conn_fail;
434 ibd_rc_ksp->rc_null_conn.value.ul = state->rc_null_conn;
435 ibd_rc_ksp->rc_no_estab_conn.value.ul = state->rc_no_estab_conn;
436
437 ibd_rc_ksp->rc_act_close.value.ul = state->rc_act_close;
438 ibd_rc_ksp->rc_pas_close.value.ul = state->rc_pas_close;
439 ibd_rc_ksp->rc_delay_ace_recycle.value.ul = state->rc_delay_ace_recycle;
440 ibd_rc_ksp->rc_act_close_simultaneous.value.ul =
441 state->rc_act_close_simultaneous;
442 ibd_rc_ksp->rc_reset_cnt.value.ul = state->rc_reset_cnt;
443 ibd_rc_ksp->rc_timeout_act.value.ul = state->rc_timeout_act;
444 ibd_rc_ksp->rc_timeout_pas.value.ul = state->rc_timeout_pas;
445
446 return (0);
447 }
448
449
450 /*
451 * ibd_rc_init_stats - initialize kstat data structures
452 *
453 * This routine will create and initialize the driver private
454 * statistics counters.
455 */
456 int
ibd_rc_init_stats(ibd_state_t * state)457 ibd_rc_init_stats(ibd_state_t *state)
458 {
459 kstat_t *ksp;
460 ibd_rc_stat_t *ibd_rc_ksp;
461 char stat_name[KSTAT_STRLEN];
462 int inst;
463
464 /*
465 * Create and init kstat
466 */
467 inst = ddi_get_instance(state->id_dip);
468 (void) snprintf(stat_name, KSTAT_STRLEN, "statistics%d_%x_%u", inst,
469 state->id_pkey, state->id_plinkid);
470 ksp = kstat_create("ibd", 0, stat_name, "net", KSTAT_TYPE_NAMED,
471 sizeof (ibd_rc_stat_t) / sizeof (kstat_named_t), 0);
472
473 if (ksp == NULL) {
474 ibd_print_warn(state, "ibd_rc_init_stats: Could not create "
475 "kernel statistics");
476 return (DDI_FAILURE);
477 }
478
479 state->rc_ksp = ksp; /* Fill in the ksp of ibd over RC mode */
480
481 ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
482
483 /*
484 * Initialize all the statistics
485 */
486 kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_byte, "RC: Rx Bytes, "
487 "transfer mode", KSTAT_DATA_ULONG);
488 kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_pkt, "RC: Rx Pkts, "
489 "transfer mode", KSTAT_DATA_ULONG);
490 kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_byte, "RC: Rx Bytes, "
491 "copy mode", KSTAT_DATA_ULONG);
492 kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_pkt, "RC: Rx Pkts, "
493 "copy mode", KSTAT_DATA_ULONG);
494 kstat_named_init(&ibd_rc_ksp->rc_rcv_alloc_fail, "RC: Rx alloc fail",
495 KSTAT_DATA_ULONG);
496
497 kstat_named_init(&ibd_rc_ksp->rc_rcq_err, "RC: fail in Recv CQ handler",
498 KSTAT_DATA_ULONG);
499
500 kstat_named_init(&ibd_rc_ksp->rc_rwqe_short, "RC: Short rwqe",
501 KSTAT_DATA_ULONG);
502
503 kstat_named_init(&ibd_rc_ksp->rc_xmt_bytes, "RC: Sent Bytes",
504 KSTAT_DATA_ULONG);
505 kstat_named_init(&ibd_rc_ksp->rc_xmt_small_pkt,
506 "RC: Tx pkt small size", KSTAT_DATA_ULONG);
507 kstat_named_init(&ibd_rc_ksp->rc_xmt_fragmented_pkt,
508 "RC: Tx pkt fragmentary", KSTAT_DATA_ULONG);
509 kstat_named_init(&ibd_rc_ksp->rc_xmt_map_fail_pkt,
510 "RC: Tx pkt fail ibt_map_mem_iov()", KSTAT_DATA_ULONG);
511 kstat_named_init(&ibd_rc_ksp->rc_xmt_map_succ_pkt,
512 "RC: Tx pkt succ ibt_map_mem_iov()", KSTAT_DATA_ULONG);
513 kstat_named_init(&ibd_rc_ksp->rc_ace_not_found, "RC: ace not found",
514 KSTAT_DATA_ULONG);
515
516 kstat_named_init(&ibd_rc_ksp->rc_scq_no_swqe, "RC: No swqe after "
517 "recycle", KSTAT_DATA_ULONG);
518 kstat_named_init(&ibd_rc_ksp->rc_scq_no_largebuf, "RC: No large tx buf "
519 "after recycle", KSTAT_DATA_ULONG);
520 kstat_named_init(&ibd_rc_ksp->rc_swqe_short, "RC: No swqe in ibd_send",
521 KSTAT_DATA_ULONG);
522 kstat_named_init(&ibd_rc_ksp->rc_swqe_mac_update, "RC: mac_tx_update "
523 "#, swqe available", KSTAT_DATA_ULONG);
524 kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_short, "RC: No buf in "
525 "ibd_send", KSTAT_DATA_ULONG);
526 kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_mac_update, "RC: "
527 "mac_tx_update #, buf available", KSTAT_DATA_ULONG);
528
529 kstat_named_init(&ibd_rc_ksp->rc_conn_succ, "RC: succ connected",
530 KSTAT_DATA_ULONG);
531 kstat_named_init(&ibd_rc_ksp->rc_conn_fail, "RC: fail connect",
532 KSTAT_DATA_ULONG);
533 kstat_named_init(&ibd_rc_ksp->rc_null_conn, "RC: null conn for unicast "
534 "pkt", KSTAT_DATA_ULONG);
535 kstat_named_init(&ibd_rc_ksp->rc_no_estab_conn, "RC: not in act estab "
536 "state", KSTAT_DATA_ULONG);
537
538 kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: call ibd_rc_act_close",
539 KSTAT_DATA_ULONG);
540 kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: call ibd_rc_pas_close",
541 KSTAT_DATA_ULONG);
542 kstat_named_init(&ibd_rc_ksp->rc_delay_ace_recycle, "RC: delay ace "
543 "recycle", KSTAT_DATA_ULONG);
544 kstat_named_init(&ibd_rc_ksp->rc_act_close_simultaneous, "RC: "
545 "simultaneous ibd_rc_act_close", KSTAT_DATA_ULONG);
546 kstat_named_init(&ibd_rc_ksp->rc_reset_cnt, "RC: Reset RC channel",
547 KSTAT_DATA_ULONG);
548 kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: timeout act side",
549 KSTAT_DATA_ULONG);
550 kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: timeout pas side",
551 KSTAT_DATA_ULONG);
552
553 /*
554 * Function to provide kernel stat update on demand
555 */
556 ksp->ks_update = ibd_rc_update_stats;
557
558 /*
559 * Pointer into provider's raw statistics
560 */
561 ksp->ks_private = (void *)state;
562
563 /*
564 * Add kstat to systems kstat chain
565 */
566 kstat_install(ksp);
567
568 return (DDI_SUCCESS);
569 }
570 #endif
571
572 static ibt_status_t
ibd_rc_alloc_chan(ibd_rc_chan_t ** ret_chan,ibd_state_t * state,boolean_t is_tx_chan)573 ibd_rc_alloc_chan(ibd_rc_chan_t **ret_chan, ibd_state_t *state,
574 boolean_t is_tx_chan)
575 {
576 ibt_status_t result;
577 ibd_rc_chan_t *chan;
578 ibt_rc_chan_alloc_args_t alloc_args;
579 ibt_chan_alloc_flags_t alloc_flags;
580 ibt_chan_sizes_t sizes;
581 ibt_cq_attr_t cq_atts;
582 int rv;
583
584 chan = kmem_zalloc(sizeof (ibd_rc_chan_t), KM_SLEEP);
585
586 chan->state = state;
587 mutex_init(&chan->rx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
588 mutex_init(&chan->rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
589 mutex_init(&chan->tx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
590 mutex_init(&chan->tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
591 mutex_init(&chan->tx_post_lock, NULL, MUTEX_DRIVER, NULL);
592 mutex_init(&chan->tx_poll_lock, NULL, MUTEX_DRIVER, NULL);
593
594 /* Allocate IB structures for a new RC channel. */
595 if (is_tx_chan) {
596 chan->scq_size = state->id_rc_num_swqe;
597 chan->rcq_size = IBD_RC_MIN_CQ_SIZE;
598 } else {
599 chan->scq_size = IBD_RC_MIN_CQ_SIZE;
600 chan->rcq_size = state->id_rc_num_rwqe;
601 }
602 cq_atts.cq_size = chan->scq_size;
603 cq_atts.cq_sched = NULL;
604 cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
605 result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->scq_hdl,
606 &chan->scq_size);
607 if (result != IBT_SUCCESS) {
608 DPRINT(40, "ibd_rc_alloc_chan: error <%d>"
609 "create scq completion queue (size <%d>)",
610 result, chan->scq_size);
611 goto alloc_scq_err;
612 } /* if failure to alloc cq */
613
614 if (ibt_modify_cq(chan->scq_hdl, state->id_rc_tx_comp_count,
615 state->id_rc_tx_comp_usec, 0) != IBT_SUCCESS) {
616 DPRINT(30, "ibd_rc_alloc_chan: Send CQ "
617 "interrupt moderation failed");
618 }
619
620 ibt_set_cq_private(chan->scq_hdl, (void *) (uintptr_t)chan);
621 ibt_set_cq_handler(chan->scq_hdl, ibd_rc_scq_handler,
622 (void *) (uintptr_t)chan);
623
624 cq_atts.cq_size = chan->rcq_size;
625 cq_atts.cq_sched = NULL;
626 cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
627 result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->rcq_hdl,
628 &chan->rcq_size);
629 if (result != IBT_SUCCESS) {
630 ibd_print_warn(state, "ibd_rc_alloc_chan: error <%d> creating "
631 "rx completion queue (size <%d>)", result, chan->rcq_size);
632 goto alloc_rcq_err;
633 } /* if failure to alloc cq */
634
635 if (ibt_modify_cq(chan->rcq_hdl, state->id_rc_rx_comp_count,
636 state->id_rc_rx_comp_usec, 0) != IBT_SUCCESS) {
637 DPRINT(30, "ibd_rc_alloc_chan: Receive CQ "
638 "interrupt moderation failed");
639 }
640
641 ibt_set_cq_private(chan->rcq_hdl, (void *) (uintptr_t)chan);
642 ibt_set_cq_handler(chan->rcq_hdl, ibd_rc_rcq_handler,
643 (void *)(uintptr_t)chan);
644
645 if (is_tx_chan) {
646 chan->is_tx_chan = B_TRUE;
647 if (ibd_rc_init_txlist(chan) != DDI_SUCCESS) {
648 ibd_print_warn(state, "ibd_rc_alloc_chan: "
649 "ibd_rc_init_txlist failed");
650 goto init_txlist_err;
651 }
652 if (ibd_rc_tx_softintr == 1) {
653 if ((rv = ddi_add_softintr(state->id_dip,
654 DDI_SOFTINT_LOW, &chan->scq_softintr, NULL, NULL,
655 ibd_rc_tx_recycle, (caddr_t)chan)) !=
656 DDI_SUCCESS) {
657 DPRINT(10, "ibd_rc_alloc_chan: failed in "
658 "ddi_add_softintr(scq_softintr), ret=%d",
659 rv);
660 goto alloc_softintr_err;
661 }
662 }
663 } else {
664 chan->is_tx_chan = B_FALSE;
665 }
666
667 /*
668 * enable completions
669 */
670 result = ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION);
671 if (result != IBT_SUCCESS) {
672 ibd_print_warn(state, "ibd_rc_alloc_chan: ibt_enable_cq_notify"
673 "(scq) failed: status %d\n", result);
674 goto alloc_scq_enable_err;
675 }
676
677 /* We will enable chan->rcq_hdl later. */
678
679 /* alloc a RC channel */
680 bzero(&alloc_args, sizeof (ibt_rc_chan_alloc_args_t));
681 bzero(&sizes, sizeof (ibt_chan_sizes_t));
682
683 alloc_args.rc_flags = IBT_WR_SIGNALED;
684 alloc_args.rc_control = IBT_CEP_NO_FLAGS;
685
686 alloc_args.rc_scq = chan->scq_hdl;
687 alloc_args.rc_rcq = chan->rcq_hdl;
688 alloc_args.rc_pd = state->id_pd_hdl;
689
690 alloc_args.rc_hca_port_num = state->id_port;
691 alloc_args.rc_clone_chan = NULL;
692
693 /* scatter/gather */
694 alloc_args.rc_sizes.cs_sq_sgl = state->rc_tx_max_sqseg;
695
696 /*
697 * For the number of SGL elements in receive side, I think it
698 * should be 1. Because ibd driver allocates a whole block memory
699 * for each ibt_post_recv().
700 */
701 alloc_args.rc_sizes.cs_rq_sgl = 1;
702
703 /* The send queue size and the receive queue size */
704 alloc_args.rc_sizes.cs_sq = chan->scq_size;
705 alloc_args.rc_sizes.cs_rq = chan->rcq_size;
706
707 if (state->id_hca_res_lkey_capab) {
708 alloc_args.rc_flags = IBT_FAST_REG_RES_LKEY;
709 } else {
710 DPRINT(40, "ibd_rc_alloc_chan: not support reserved lkey");
711 }
712
713 if (state->rc_enable_srq) {
714 alloc_flags = IBT_ACHAN_USES_SRQ;
715 alloc_args.rc_srq = state->rc_srq_hdl;
716 } else {
717 alloc_flags = IBT_ACHAN_NO_FLAGS;
718 }
719
720 result = ibt_alloc_rc_channel(state->id_hca_hdl,
721 alloc_flags, &alloc_args, &chan->chan_hdl, &sizes);
722 if (result != IBT_SUCCESS) {
723 ibd_print_warn(state, "ibd_rc_alloc_chan: ibd_rc_open_channel"
724 " fail:<%d>", result);
725 goto alloc_scq_enable_err;
726 }
727
728 if (is_tx_chan)
729 atomic_inc_32(&state->rc_num_tx_chan);
730 else
731 atomic_inc_32(&state->rc_num_rx_chan);
732
733 /* For the connection reaper routine ibd_rc_conn_timeout_call() */
734 chan->is_used = B_TRUE;
735
736 *ret_chan = chan;
737 return (IBT_SUCCESS);
738
739 alloc_scq_enable_err:
740 if (is_tx_chan) {
741 if (ibd_rc_tx_softintr == 1) {
742 ddi_remove_softintr(chan->scq_softintr);
743 }
744 }
745 alloc_softintr_err:
746 if (is_tx_chan) {
747 ibd_rc_fini_txlist(chan);
748 }
749 init_txlist_err:
750 (void) ibt_free_cq(chan->rcq_hdl);
751 alloc_rcq_err:
752 (void) ibt_free_cq(chan->scq_hdl);
753 alloc_scq_err:
754 mutex_destroy(&chan->tx_poll_lock);
755 mutex_destroy(&chan->tx_post_lock);
756 mutex_destroy(&chan->tx_rel_list.dl_mutex);
757 mutex_destroy(&chan->tx_wqe_list.dl_mutex);
758 mutex_destroy(&chan->rx_free_list.dl_mutex);
759 mutex_destroy(&chan->rx_wqe_list.dl_mutex);
760 kmem_free(chan, sizeof (ibd_rc_chan_t));
761 return (result);
762 }
763
764 static void
ibd_rc_free_chan(ibd_rc_chan_t * chan)765 ibd_rc_free_chan(ibd_rc_chan_t *chan)
766 {
767 ibt_status_t ret;
768
769 /* DPRINT(30, "ibd_rc_free_chan: chan=%p", chan); */
770
771 if (chan->chan_hdl != NULL) {
772 ret = ibt_free_channel(chan->chan_hdl);
773 if (ret != IBT_SUCCESS) {
774 DPRINT(40, "ib_rc_free_chan: ibt_free_channel failed, "
775 "chan=%p, returned: %d", chan, ret);
776 return;
777 }
778 chan->chan_hdl = NULL;
779 }
780
781 if (chan->rcq_hdl != NULL) {
782 ret = ibt_free_cq(chan->rcq_hdl);
783 if (ret != IBT_SUCCESS) {
784 DPRINT(40, "ib_rc_free_chan: ibt_free_cq(rcq) failed, "
785 "chan=%p, returned: %d", chan, ret);
786 return;
787 }
788 chan->rcq_hdl = NULL;
789 }
790
791 if (chan->scq_hdl != NULL) {
792 ret = ibt_free_cq(chan->scq_hdl);
793 if (ret != IBT_SUCCESS) {
794 DPRINT(40, "ib_rc_free_chan: ibt_free_cq(scq) failed, "
795 "chan=%p, returned: %d", chan, ret);
796 return;
797 }
798 chan->scq_hdl = NULL;
799 }
800
801 /* Free buffers */
802 if (chan->is_tx_chan) {
803 ibd_rc_fini_txlist(chan);
804 if (ibd_rc_tx_softintr == 1) {
805 ddi_remove_softintr(chan->scq_softintr);
806 }
807 atomic_dec_32(&chan->state->rc_num_tx_chan);
808 } else {
809 if (!chan->state->rc_enable_srq) {
810 ibd_rc_fini_rxlist(chan);
811 }
812 atomic_dec_32(&chan->state->rc_num_rx_chan);
813 }
814
815 mutex_destroy(&chan->tx_poll_lock);
816 mutex_destroy(&chan->tx_post_lock);
817 mutex_destroy(&chan->tx_rel_list.dl_mutex);
818 mutex_destroy(&chan->tx_wqe_list.dl_mutex);
819 mutex_destroy(&chan->rx_free_list.dl_mutex);
820 mutex_destroy(&chan->rx_wqe_list.dl_mutex);
821
822 /*
823 * If it is a passive channel, must make sure it has been removed
824 * from chan->state->rc_pass_chan_list
825 */
826 kmem_free(chan, sizeof (ibd_rc_chan_t));
827 }
828
829 /* Add a RC channel */
830 static inline void
ibd_rc_add_to_chan_list(ibd_rc_chan_list_t * list,ibd_rc_chan_t * chan)831 ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
832 {
833 mutex_enter(&list->chan_list_mutex);
834 if (list->chan_list == NULL) {
835 list->chan_list = chan;
836 chan->next = NULL;
837 } else {
838 chan->next = list->chan_list;
839 list->chan_list = chan;
840 }
841 mutex_exit(&list->chan_list_mutex);
842 }
843
844 static boolean_t
ibd_rc_re_add_to_pas_chan_list(ibd_rc_chan_t * chan)845 ibd_rc_re_add_to_pas_chan_list(ibd_rc_chan_t *chan)
846 {
847 ibd_state_t *state = chan->state;
848
849 mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
850 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) {
851 mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
852 return (B_FALSE);
853 } else {
854 if (state->rc_pass_chan_list.chan_list == NULL) {
855 state->rc_pass_chan_list.chan_list = chan;
856 chan->next = NULL;
857 } else {
858 chan->next = state->rc_pass_chan_list.chan_list;
859 state->rc_pass_chan_list.chan_list = chan;
860 }
861 mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
862 return (B_TRUE);
863 }
864 }
865
866 /* Remove a RC channel */
867 static inline ibd_rc_chan_t *
ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t * list,ibd_rc_chan_t * chan)868 ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
869 {
870 ibd_rc_chan_t *pre_chan;
871
872 mutex_enter(&list->chan_list_mutex);
873 if (list->chan_list == chan) {
874 DPRINT(30, "ibd_rc_rm_from_chan_list(first): found chan(%p)"
875 " in chan_list", chan);
876 list->chan_list = chan->next;
877 } else {
878 pre_chan = list->chan_list;
879 while (pre_chan != NULL) {
880 if (pre_chan->next == chan) {
881 DPRINT(30, "ibd_rc_rm_from_chan_list"
882 "(middle): found chan(%p)", chan);
883 pre_chan->next = chan->next;
884 break;
885 }
886 pre_chan = pre_chan->next;
887 }
888 if (pre_chan == NULL)
889 chan = NULL;
890 }
891 mutex_exit(&list->chan_list_mutex);
892 return (chan);
893 }
894
895 static inline ibd_rc_chan_t *
ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t * list)896 ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t *list)
897 {
898 ibd_rc_chan_t *rc_chan;
899
900 mutex_enter(&list->chan_list_mutex);
901 rc_chan = list->chan_list;
902 if (rc_chan != NULL) {
903 list->chan_list = rc_chan->next;
904 }
905 mutex_exit(&list->chan_list_mutex);
906 return (rc_chan);
907 }
908
909 static int
ibd_rc_alloc_srq_copybufs(ibd_state_t * state)910 ibd_rc_alloc_srq_copybufs(ibd_state_t *state)
911 {
912 ibt_mr_attr_t mem_attr;
913 uint_t rc_rx_bufs_sz;
914
915 /*
916 * Allocate one big chunk for all regular rx copy bufs
917 */
918 rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * state->rc_srq_size;
919
920 state->rc_srq_rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);
921
922 state->rc_srq_rwqes = kmem_zalloc(state->rc_srq_size *
923 sizeof (ibd_rwqe_t), KM_SLEEP);
924
925 /*
926 * Do one memory registration on the entire rxbuf area
927 */
928 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_srq_rx_bufs;
929 mem_attr.mr_len = rc_rx_bufs_sz;
930 mem_attr.mr_as = NULL;
931 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
932 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
933 &state->rc_srq_rx_mr_hdl, &state->rc_srq_rx_mr_desc)
934 != IBT_SUCCESS) {
935 DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr() "
936 "failed");
937 kmem_free(state->rc_srq_rwqes,
938 state->rc_srq_size * sizeof (ibd_rwqe_t));
939 kmem_free(state->rc_srq_rx_bufs, rc_rx_bufs_sz);
940 state->rc_srq_rx_bufs = NULL;
941 state->rc_srq_rwqes = NULL;
942 return (DDI_FAILURE);
943 }
944
945 return (DDI_SUCCESS);
946 }
947
948 static void
ibd_rc_free_srq_copybufs(ibd_state_t * state)949 ibd_rc_free_srq_copybufs(ibd_state_t *state)
950 {
951 uint_t rc_rx_buf_sz;
952
953 /*
954 * Don't change the value of state->rc_mtu at the period from call
955 * ibd_rc_alloc_srq_copybufs() to call ibd_rc_free_srq_copybufs().
956 */
957 rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
958
959 /*
960 * Unregister rxbuf mr
961 */
962 if (ibt_deregister_mr(state->id_hca_hdl,
963 state->rc_srq_rx_mr_hdl) != IBT_SUCCESS) {
964 DPRINT(40, "ibd_rc_free_srq_copybufs: ibt_deregister_mr()"
965 " failed");
966 }
967 state->rc_srq_rx_mr_hdl = NULL;
968
969 /*
970 * Free rxbuf memory
971 */
972 kmem_free(state->rc_srq_rwqes,
973 state->rc_srq_size * sizeof (ibd_rwqe_t));
974 kmem_free(state->rc_srq_rx_bufs, state->rc_srq_size * rc_rx_buf_sz);
975 state->rc_srq_rwqes = NULL;
976 state->rc_srq_rx_bufs = NULL;
977 }
978
979 /*
980 * Allocate and post a certain number of SRQ receive buffers and WRs.
981 */
982 int
ibd_rc_init_srq_list(ibd_state_t * state)983 ibd_rc_init_srq_list(ibd_state_t *state)
984 {
985 ibd_rwqe_t *rwqe;
986 ibt_lkey_t lkey;
987 int i;
988 uint_t len;
989 uint8_t *bufaddr;
990 ibt_srq_sizes_t srq_sizes;
991 ibt_srq_sizes_t srq_real_sizes;
992 ibt_status_t ret;
993
994 srq_sizes.srq_sgl_sz = 1;
995 srq_sizes.srq_wr_sz = state->id_rc_num_srq;
996 ret = ibt_alloc_srq(state->id_hca_hdl, IBT_SRQ_NO_FLAGS,
997 state->id_pd_hdl, &srq_sizes, &state->rc_srq_hdl, &srq_real_sizes);
998 if (ret != IBT_SUCCESS) {
999 /*
1000 * The following code is for CR 6932460 (can't configure ibd
1001 * interface on 32 bits x86 systems). 32 bits x86 system has
1002 * less memory resource than 64 bits x86 system. If current
1003 * resource request can't be satisfied, we request less
1004 * resource here.
1005 */
1006 len = state->id_rc_num_srq;
1007 while ((ret == IBT_HCA_WR_EXCEEDED) &&
1008 (len >= 2 * IBD_RC_MIN_CQ_SIZE)) {
1009 len = len/2;
1010 srq_sizes.srq_sgl_sz = 1;
1011 srq_sizes.srq_wr_sz = len;
1012 ret = ibt_alloc_srq(state->id_hca_hdl,
1013 IBT_SRQ_NO_FLAGS, state->id_pd_hdl, &srq_sizes,
1014 &state->rc_srq_hdl, &srq_real_sizes);
1015 }
1016 if (ret != IBT_SUCCESS) {
1017 DPRINT(10, "ibd_rc_init_srq_list: ibt_alloc_srq failed."
1018 "req_sgl_sz=%d, req_wr_sz=0x%x, final_req_wr_sz="
1019 "0x%x, ret=%d", srq_sizes.srq_sgl_sz,
1020 srq_sizes.srq_wr_sz, len, ret);
1021 return (DDI_FAILURE);
1022 }
1023 state->id_rc_num_srq = len;
1024 state->id_rc_num_rwqe = state->id_rc_num_srq + 1;
1025 }
1026
1027 state->rc_srq_size = srq_real_sizes.srq_wr_sz;
1028 if (ibd_rc_alloc_srq_copybufs(state) != DDI_SUCCESS) {
1029 ret = ibt_free_srq(state->rc_srq_hdl);
1030 if (ret != IBT_SUCCESS) {
1031 ibd_print_warn(state, "ibd_rc_init_srq_list: "
1032 "ibt_free_srq fail, ret=%d", ret);
1033 }
1034 return (DDI_FAILURE);
1035 }
1036
1037 /*
1038 * Allocate and setup the rwqe list
1039 */
1040 lkey = state->rc_srq_rx_mr_desc.md_lkey;
1041 rwqe = state->rc_srq_rwqes;
1042 bufaddr = state->rc_srq_rx_bufs;
1043 len = state->rc_mtu + IPOIB_GRH_SIZE;
1044 state->rc_srq_rwqe_list.dl_cnt = 0;
1045 state->rc_srq_rwqe_list.dl_bufs_outstanding = 0;
1046 for (i = 0; i < state->rc_srq_size; i++, rwqe++, bufaddr += len) {
1047 rwqe->w_state = state;
1048 rwqe->w_freeing_wqe = B_FALSE;
1049 rwqe->w_freemsg_cb.free_func = ibd_rc_srq_freemsg_cb;
1050 rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1051 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1052
1053 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1054 &rwqe->w_freemsg_cb)) == NULL) {
1055 DPRINT(40, "ibd_rc_init_srq_list : desballoc() failed");
1056 rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1057 if (atomic_dec_32_nv(&state->id_running) != 0) {
1058 cmn_err(CE_WARN, "ibd_rc_init_srq_list: "
1059 "id_running was not 1\n");
1060 }
1061 ibd_rc_fini_srq_list(state);
1062 atomic_inc_32(&state->id_running);
1063 return (DDI_FAILURE);
1064 }
1065
1066 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1067 /* Leave IPOIB_GRH_SIZE space */
1068 rwqe->rwqe_copybuf.ic_sgl.ds_va =
1069 (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1070 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1071 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1072 rwqe->w_rwr.wr_nds = 1;
1073 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1074 (void) ibd_rc_post_srq(state, rwqe);
1075 }
1076
1077 mutex_enter(&state->rc_srq_free_list.dl_mutex);
1078 state->rc_srq_free_list.dl_head = NULL;
1079 state->rc_srq_free_list.dl_cnt = 0;
1080 mutex_exit(&state->rc_srq_free_list.dl_mutex);
1081
1082 return (DDI_SUCCESS);
1083 }
1084
1085 /*
1086 * Free the statically allocated Rx buffer list for SRQ.
1087 */
1088 void
ibd_rc_fini_srq_list(ibd_state_t * state)1089 ibd_rc_fini_srq_list(ibd_state_t *state)
1090 {
1091 ibd_rwqe_t *rwqe;
1092 int i;
1093 ibt_status_t ret;
1094
1095 ASSERT(state->id_running == 0);
1096 ret = ibt_free_srq(state->rc_srq_hdl);
1097 if (ret != IBT_SUCCESS) {
1098 ibd_print_warn(state, "ibd_rc_fini_srq_list: "
1099 "ibt_free_srq fail, ret=%d", ret);
1100 }
1101
1102 mutex_enter(&state->rc_srq_rwqe_list.dl_mutex);
1103 rwqe = state->rc_srq_rwqes;
1104 for (i = 0; i < state->rc_srq_size; i++, rwqe++) {
1105 if (rwqe->rwqe_im_mblk != NULL) {
1106 rwqe->w_freeing_wqe = B_TRUE;
1107 freemsg(rwqe->rwqe_im_mblk);
1108 }
1109 }
1110 mutex_exit(&state->rc_srq_rwqe_list.dl_mutex);
1111
1112 ibd_rc_free_srq_copybufs(state);
1113 }
1114
1115 /* Repost the elements in state->ib_rc_free_list */
1116 int
ibd_rc_repost_srq_free_list(ibd_state_t * state)1117 ibd_rc_repost_srq_free_list(ibd_state_t *state)
1118 {
1119 ibd_rwqe_t *rwqe;
1120 ibd_wqe_t *list;
1121 uint_t len;
1122
1123 mutex_enter(&state->rc_srq_free_list.dl_mutex);
1124 if (state->rc_srq_free_list.dl_head != NULL) {
1125 /* repost them */
1126 len = state->rc_mtu + IPOIB_GRH_SIZE;
1127 list = state->rc_srq_free_list.dl_head;
1128 state->rc_srq_free_list.dl_head = NULL;
1129 state->rc_srq_free_list.dl_cnt = 0;
1130 mutex_exit(&state->rc_srq_free_list.dl_mutex);
1131 while (list != NULL) {
1132 rwqe = WQE_TO_RWQE(list);
1133 if ((rwqe->rwqe_im_mblk == NULL) &&
1134 ((rwqe->rwqe_im_mblk = desballoc(
1135 rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
1136 &rwqe->w_freemsg_cb)) == NULL)) {
1137 DPRINT(40, "ibd_rc_repost_srq_free_list: "
1138 "failed in desballoc()");
1139 do {
1140 ibd_rc_srq_free_rwqe(state, rwqe);
1141 list = list->w_next;
1142 rwqe = WQE_TO_RWQE(list);
1143 } while (list != NULL);
1144 return (DDI_FAILURE);
1145 }
1146 if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1147 ibd_rc_srq_free_rwqe(state, rwqe);
1148 }
1149 list = list->w_next;
1150 }
1151 return (DDI_SUCCESS);
1152 }
1153 mutex_exit(&state->rc_srq_free_list.dl_mutex);
1154 return (DDI_SUCCESS);
1155 }
1156
1157 /*
1158 * Free an allocated recv wqe.
1159 */
1160 static void
ibd_rc_srq_free_rwqe(ibd_state_t * state,ibd_rwqe_t * rwqe)1161 ibd_rc_srq_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
1162 {
1163 /*
1164 * desballoc() failed (no memory) or the posting of rwqe failed.
1165 *
1166 * This rwqe is placed on a free list so that it
1167 * can be reinstated in future.
1168 *
1169 * NOTE: no code currently exists to reinstate
1170 * these "lost" rwqes.
1171 */
1172 mutex_enter(&state->rc_srq_free_list.dl_mutex);
1173 state->rc_srq_free_list.dl_cnt++;
1174 rwqe->rwqe_next = state->rc_srq_free_list.dl_head;
1175 state->rc_srq_free_list.dl_head = RWQE_TO_WQE(rwqe);
1176 mutex_exit(&state->rc_srq_free_list.dl_mutex);
1177 }
1178
1179 static void
ibd_rc_srq_freemsg_cb(char * arg)1180 ibd_rc_srq_freemsg_cb(char *arg)
1181 {
1182 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1183 ibd_state_t *state = rwqe->w_state;
1184
1185 ASSERT(state->rc_enable_srq);
1186
1187 /*
1188 * If the driver is stopped, just free the rwqe.
1189 */
1190 if (atomic_add_32_nv(&state->id_running, 0) == 0) {
1191 if (!rwqe->w_freeing_wqe) {
1192 atomic_dec_32(
1193 &state->rc_srq_rwqe_list.dl_bufs_outstanding);
1194 DPRINT(6, "ibd_rc_srq_freemsg_cb: wqe being freed");
1195 rwqe->rwqe_im_mblk = NULL;
1196 ibd_rc_srq_free_rwqe(state, rwqe);
1197 }
1198 return;
1199 }
1200
1201 atomic_dec_32(&state->rc_srq_rwqe_list.dl_bufs_outstanding);
1202
1203 ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
1204 ASSERT(!rwqe->w_freeing_wqe);
1205
1206 /*
1207 * Upper layer has released held mblk, so we have
1208 * no more use for keeping the old pointer in
1209 * our rwqe.
1210 */
1211 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1212 state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1213 if (rwqe->rwqe_im_mblk == NULL) {
1214 DPRINT(40, "ibd_rc_srq_freemsg_cb: desballoc failed");
1215 ibd_rc_srq_free_rwqe(state, rwqe);
1216 return;
1217 }
1218
1219 if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1220 ibd_print_warn(state, "ibd_rc_srq_freemsg_cb: ibd_rc_post_srq"
1221 " failed");
1222 ibd_rc_srq_free_rwqe(state, rwqe);
1223 return;
1224 }
1225 }
1226
1227 /*
1228 * Post a rwqe to the hardware and add it to the Rx list.
1229 */
1230 static int
ibd_rc_post_srq(ibd_state_t * state,ibd_rwqe_t * rwqe)1231 ibd_rc_post_srq(ibd_state_t *state, ibd_rwqe_t *rwqe)
1232 {
1233 /*
1234 * Here we should add dl_cnt before post recv, because
1235 * we would have to make sure dl_cnt is updated before
1236 * the corresponding ibd_rc_process_rx() is called.
1237 */
1238 ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
1239 atomic_inc_32(&state->rc_srq_rwqe_list.dl_cnt);
1240 if (ibt_post_srq(state->rc_srq_hdl, &rwqe->w_rwr, 1, NULL) !=
1241 IBT_SUCCESS) {
1242 atomic_dec_32(&state->rc_srq_rwqe_list.dl_cnt);
1243 DPRINT(40, "ibd_rc_post_srq : ibt_post_srq() failed");
1244 return (DDI_FAILURE);
1245 }
1246
1247 return (DDI_SUCCESS);
1248 }
1249
1250 /*
1251 * Post a rwqe to the hardware and add it to the Rx list.
1252 */
1253 static int
ibd_rc_post_rwqe(ibd_rc_chan_t * chan,ibd_rwqe_t * rwqe)1254 ibd_rc_post_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1255 {
1256 /*
1257 * Here we should add dl_cnt before post recv, because we would
1258 * have to make sure dl_cnt has already updated before
1259 * corresponding ibd_rc_process_rx() is called.
1260 */
1261 atomic_inc_32(&chan->rx_wqe_list.dl_cnt);
1262 if (ibt_post_recv(chan->chan_hdl, &rwqe->w_rwr, 1, NULL) !=
1263 IBT_SUCCESS) {
1264 atomic_dec_32(&chan->rx_wqe_list.dl_cnt);
1265 DPRINT(40, "ibd_rc_post_rwqe : failed in ibt_post_recv()");
1266 return (DDI_FAILURE);
1267 }
1268 return (DDI_SUCCESS);
1269 }
1270
1271 static int
ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t * chan)1272 ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t *chan)
1273 {
1274 ibd_state_t *state = chan->state;
1275 ibt_mr_attr_t mem_attr;
1276 uint_t rc_rx_bufs_sz;
1277
1278 /*
1279 * Allocate one big chunk for all regular rx copy bufs
1280 */
1281 rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * chan->rcq_size;
1282
1283 chan->rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);
1284
1285 chan->rx_rwqes = kmem_zalloc(chan->rcq_size *
1286 sizeof (ibd_rwqe_t), KM_SLEEP);
1287
1288 /*
1289 * Do one memory registration on the entire rxbuf area
1290 */
1291 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->rx_bufs;
1292 mem_attr.mr_len = rc_rx_bufs_sz;
1293 mem_attr.mr_as = NULL;
1294 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
1295 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1296 &chan->rx_mr_hdl, &chan->rx_mr_desc) != IBT_SUCCESS) {
1297 DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr failed");
1298 kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1299 kmem_free(chan->rx_bufs, rc_rx_bufs_sz);
1300 chan->rx_bufs = NULL;
1301 chan->rx_rwqes = NULL;
1302 return (DDI_FAILURE);
1303 }
1304
1305 return (DDI_SUCCESS);
1306 }
1307
1308 static void
ibd_rc_free_rx_copybufs(ibd_rc_chan_t * chan)1309 ibd_rc_free_rx_copybufs(ibd_rc_chan_t *chan)
1310 {
1311 ibd_state_t *state = chan->state;
1312 uint_t rc_rx_buf_sz;
1313
1314 ASSERT(!state->rc_enable_srq);
1315 ASSERT(chan->rx_rwqes != NULL);
1316 ASSERT(chan->rx_bufs != NULL);
1317
1318 /*
1319 * Don't change the value of state->rc_mtu at the period from call
1320 * ibd_rc_alloc_rx_copybufs() to call ibd_rc_free_rx_copybufs().
1321 */
1322 rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
1323
1324 /*
1325 * Unregister rxbuf mr
1326 */
1327 if (ibt_deregister_mr(state->id_hca_hdl,
1328 chan->rx_mr_hdl) != IBT_SUCCESS) {
1329 DPRINT(40, "ibd_rc_free_rx_copybufs: ibt_deregister_mr failed");
1330 }
1331 chan->rx_mr_hdl = NULL;
1332
1333 /*
1334 * Free rxbuf memory
1335 */
1336 kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1337 chan->rx_rwqes = NULL;
1338
1339 kmem_free(chan->rx_bufs, chan->rcq_size * rc_rx_buf_sz);
1340 chan->rx_bufs = NULL;
1341 }
1342
1343 /*
1344 * Post a certain number of receive buffers and WRs on a RC channel.
1345 */
1346 static int
ibd_rc_init_rxlist(ibd_rc_chan_t * chan)1347 ibd_rc_init_rxlist(ibd_rc_chan_t *chan)
1348 {
1349 ibd_state_t *state = chan->state;
1350 ibd_rwqe_t *rwqe;
1351 ibt_lkey_t lkey;
1352 int i;
1353 uint_t len;
1354 uint8_t *bufaddr;
1355
1356 ASSERT(!state->rc_enable_srq);
1357 if (ibd_rc_alloc_rx_copybufs(chan) != DDI_SUCCESS)
1358 return (DDI_FAILURE);
1359
1360 /*
1361 * Allocate and setup the rwqe list
1362 */
1363 lkey = chan->rx_mr_desc.md_lkey;
1364 rwqe = chan->rx_rwqes;
1365 bufaddr = chan->rx_bufs;
1366 len = state->rc_mtu + IPOIB_GRH_SIZE;
1367 for (i = 0; i < chan->rcq_size; i++, rwqe++, bufaddr += len) {
1368 rwqe->w_state = state;
1369 rwqe->w_chan = chan;
1370 rwqe->w_freeing_wqe = B_FALSE;
1371 rwqe->w_freemsg_cb.free_func = ibd_rc_freemsg_cb;
1372 rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1373 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1374
1375 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1376 &rwqe->w_freemsg_cb)) == NULL) {
1377 DPRINT(40, "ibd_rc_init_srq_list: desballoc() failed");
1378 rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1379 ibd_rc_fini_rxlist(chan);
1380 return (DDI_FAILURE);
1381 }
1382
1383 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1384 rwqe->rwqe_copybuf.ic_sgl.ds_va =
1385 (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1386 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1387 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1388 rwqe->w_rwr.wr_nds = 1;
1389 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1390 (void) ibd_rc_post_rwqe(chan, rwqe);
1391 }
1392
1393 return (DDI_SUCCESS);
1394 }
1395
1396 /*
1397 * Free the statically allocated Rx buffer list for SRQ.
1398 */
1399 static void
ibd_rc_fini_rxlist(ibd_rc_chan_t * chan)1400 ibd_rc_fini_rxlist(ibd_rc_chan_t *chan)
1401 {
1402 ibd_rwqe_t *rwqe;
1403 int i;
1404
1405 if (chan->rx_bufs == NULL) {
1406 DPRINT(40, "ibd_rc_fini_rxlist: empty chan->rx_bufs, quit");
1407 return;
1408 }
1409
1410 /* bufs_outstanding must be 0 */
1411 ASSERT((chan->rx_wqe_list.dl_head == NULL) ||
1412 (chan->rx_wqe_list.dl_bufs_outstanding == 0));
1413
1414 mutex_enter(&chan->rx_wqe_list.dl_mutex);
1415 rwqe = chan->rx_rwqes;
1416 for (i = 0; i < chan->rcq_size; i++, rwqe++) {
1417 if (rwqe->rwqe_im_mblk != NULL) {
1418 rwqe->w_freeing_wqe = B_TRUE;
1419 freemsg(rwqe->rwqe_im_mblk);
1420 }
1421 }
1422 mutex_exit(&chan->rx_wqe_list.dl_mutex);
1423
1424 ibd_rc_free_rx_copybufs(chan);
1425 }
1426
1427 /*
1428 * Free an allocated recv wqe.
1429 */
1430 static void
ibd_rc_free_rwqe(ibd_rc_chan_t * chan,ibd_rwqe_t * rwqe)1431 ibd_rc_free_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1432 {
1433 /*
1434 * desballoc() failed (no memory) or the posting of rwqe failed.
1435 *
1436 * This rwqe is placed on a free list so that it
1437 * can be reinstated in future.
1438 *
1439 * NOTE: no code currently exists to reinstate
1440 * these "lost" rwqes.
1441 */
1442 mutex_enter(&chan->rx_free_list.dl_mutex);
1443 chan->rx_free_list.dl_cnt++;
1444 rwqe->rwqe_next = chan->rx_free_list.dl_head;
1445 chan->rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
1446 mutex_exit(&chan->rx_free_list.dl_mutex);
1447 }
1448
1449 /*
1450 * Processing to be done after receipt of a packet; hand off to GLD
1451 * in the format expected by GLD.
1452 */
1453 static void
ibd_rc_process_rx(ibd_rc_chan_t * chan,ibd_rwqe_t * rwqe,ibt_wc_t * wc)1454 ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
1455 {
1456 ibd_state_t *state = chan->state;
1457 ib_header_info_t *phdr;
1458 ipoib_hdr_t *ipibp;
1459 mblk_t *mp;
1460 mblk_t *mpc;
1461 int rxcnt;
1462 ip6_t *ip6h;
1463 int len;
1464
1465 /*
1466 * Track number handed to upper layer, and number still
1467 * available to receive packets.
1468 */
1469 if (state->rc_enable_srq) {
1470 rxcnt = atomic_dec_32_nv(&state->rc_srq_rwqe_list.dl_cnt);
1471 } else {
1472 rxcnt = atomic_dec_32_nv(&chan->rx_wqe_list.dl_cnt);
1473 }
1474
1475 /*
1476 * It can not be a IBA multicast packet.
1477 */
1478 ASSERT(!wc->wc_flags & IBT_WC_GRH_PRESENT);
1479
1480 /* For the connection reaper routine ibd_rc_conn_timeout_call() */
1481 chan->is_used = B_TRUE;
1482
1483 #ifdef DEBUG
1484 if (rxcnt < state->id_rc_rx_rwqe_thresh) {
1485 state->rc_rwqe_short++;
1486 }
1487 #endif
1488
1489 /*
1490 * Possibly replenish the Rx pool if needed.
1491 */
1492 if ((rxcnt >= state->id_rc_rx_rwqe_thresh) &&
1493 (wc->wc_bytes_xfer > state->id_rc_rx_copy_thresh)) {
1494 atomic_add_64(&state->rc_rcv_trans_byte, wc->wc_bytes_xfer);
1495 atomic_inc_64(&state->rc_rcv_trans_pkt);
1496
1497 /*
1498 * Record how many rwqe has been occupied by upper
1499 * network layer
1500 */
1501 if (state->rc_enable_srq) {
1502 atomic_inc_32(
1503 &state->rc_srq_rwqe_list.dl_bufs_outstanding);
1504 } else {
1505 atomic_inc_32(&chan->rx_wqe_list.dl_bufs_outstanding);
1506 }
1507 mp = rwqe->rwqe_im_mblk;
1508 } else {
1509 atomic_add_64(&state->rc_rcv_copy_byte, wc->wc_bytes_xfer);
1510 atomic_inc_64(&state->rc_rcv_copy_pkt);
1511
1512 if ((mp = allocb(wc->wc_bytes_xfer + IPOIB_GRH_SIZE,
1513 BPRI_HI)) == NULL) { /* no memory */
1514 DPRINT(40, "ibd_rc_process_rx: allocb() failed");
1515 state->rc_rcv_alloc_fail++;
1516 if (state->rc_enable_srq) {
1517 if (ibd_rc_post_srq(state, rwqe) ==
1518 DDI_FAILURE) {
1519 ibd_rc_srq_free_rwqe(state, rwqe);
1520 }
1521 } else {
1522 if (ibd_rc_post_rwqe(chan, rwqe) ==
1523 DDI_FAILURE) {
1524 ibd_rc_free_rwqe(chan, rwqe);
1525 }
1526 }
1527 return;
1528 }
1529
1530 bcopy(rwqe->rwqe_im_mblk->b_rptr + IPOIB_GRH_SIZE,
1531 mp->b_wptr + IPOIB_GRH_SIZE, wc->wc_bytes_xfer);
1532
1533 if (state->rc_enable_srq) {
1534 if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1535 ibd_rc_srq_free_rwqe(state, rwqe);
1536 }
1537 } else {
1538 if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1539 ibd_rc_free_rwqe(chan, rwqe);
1540 }
1541 }
1542 }
1543
1544 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);
1545 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
1546 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
1547 len = ntohs(ip6h->ip6_plen);
1548 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
1549 /* LINTED: E_CONSTANT_CONDITION */
1550 IBD_PAD_NSNA(ip6h, len, IBD_RECV);
1551 }
1552 }
1553
1554 phdr = (ib_header_info_t *)mp->b_rptr;
1555 phdr->ib_grh.ipoib_vertcflow = 0;
1556 ovbcopy(&state->id_macaddr, &phdr->ib_dst,
1557 sizeof (ipoib_mac_t));
1558 mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer+ IPOIB_GRH_SIZE;
1559
1560 /*
1561 * Can RC mode in IB guarantee its checksum correctness?
1562 *
1563 * mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
1564 */
1565
1566 /*
1567 * Make sure this is NULL or we're in trouble.
1568 */
1569 if (mp->b_next != NULL) {
1570 ibd_print_warn(state,
1571 "ibd_rc_process_rx: got duplicate mp from rcq?");
1572 mp->b_next = NULL;
1573 }
1574
1575 /*
1576 * Add this mp to the list of processed mp's to send to
1577 * the nw layer
1578 */
1579 if (state->rc_enable_srq) {
1580 mutex_enter(&state->rc_rx_lock);
1581 if (state->rc_rx_mp) {
1582 ASSERT(state->rc_rx_mp_tail != NULL);
1583 state->rc_rx_mp_tail->b_next = mp;
1584 } else {
1585 ASSERT(state->rc_rx_mp_tail == NULL);
1586 state->rc_rx_mp = mp;
1587 }
1588
1589 state->rc_rx_mp_tail = mp;
1590 state->rc_rx_mp_len++;
1591
1592 if (state->rc_rx_mp_len >= IBD_MAX_RX_MP_LEN) {
1593 mpc = state->rc_rx_mp;
1594
1595 state->rc_rx_mp = NULL;
1596 state->rc_rx_mp_tail = NULL;
1597 state->rc_rx_mp_len = 0;
1598 mutex_exit(&state->rc_rx_lock);
1599 mac_rx(state->id_mh, NULL, mpc);
1600 } else {
1601 mutex_exit(&state->rc_rx_lock);
1602 }
1603 } else {
1604 mutex_enter(&chan->rx_lock);
1605 if (chan->rx_mp) {
1606 ASSERT(chan->rx_mp_tail != NULL);
1607 chan->rx_mp_tail->b_next = mp;
1608 } else {
1609 ASSERT(chan->rx_mp_tail == NULL);
1610 chan->rx_mp = mp;
1611 }
1612
1613 chan->rx_mp_tail = mp;
1614 chan->rx_mp_len++;
1615
1616 if (chan->rx_mp_len >= IBD_MAX_RX_MP_LEN) {
1617 mpc = chan->rx_mp;
1618
1619 chan->rx_mp = NULL;
1620 chan->rx_mp_tail = NULL;
1621 chan->rx_mp_len = 0;
1622 mutex_exit(&chan->rx_lock);
1623 mac_rx(state->id_mh, NULL, mpc);
1624 } else {
1625 mutex_exit(&chan->rx_lock);
1626 }
1627 }
1628 }
1629
1630 /*
1631 * Callback code invoked from STREAMs when the recv data buffer is free
1632 * for recycling.
1633 */
1634 static void
ibd_rc_freemsg_cb(char * arg)1635 ibd_rc_freemsg_cb(char *arg)
1636 {
1637 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1638 ibd_rc_chan_t *chan = rwqe->w_chan;
1639 ibd_state_t *state = rwqe->w_state;
1640
1641 /*
1642 * If the wqe is being destructed, do not attempt recycling.
1643 */
1644 if (rwqe->w_freeing_wqe == B_TRUE) {
1645 return;
1646 }
1647
1648 ASSERT(!state->rc_enable_srq);
1649 ASSERT(chan->rx_wqe_list.dl_cnt < chan->rcq_size);
1650
1651 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1652 state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1653 if (rwqe->rwqe_im_mblk == NULL) {
1654 DPRINT(40, "ibd_rc_freemsg_cb: desballoc() failed");
1655 ibd_rc_free_rwqe(chan, rwqe);
1656 return;
1657 }
1658
1659 /*
1660 * Post back to h/w. We could actually have more than
1661 * id_num_rwqe WQEs on the list if there were multiple
1662 * ibd_freemsg_cb() calls outstanding (since the lock is
1663 * not held the entire time). This will start getting
1664 * corrected over subsequent ibd_freemsg_cb() calls.
1665 */
1666 if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1667 ibd_rc_free_rwqe(chan, rwqe);
1668 return;
1669 }
1670 atomic_dec_32(&chan->rx_wqe_list.dl_bufs_outstanding);
1671 }
1672
1673 /*
1674 * Common code for interrupt handling as well as for polling
1675 * for all completed wqe's while detaching.
1676 */
1677 static void
ibd_rc_poll_rcq(ibd_rc_chan_t * chan,ibt_cq_hdl_t cq_hdl)1678 ibd_rc_poll_rcq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
1679 {
1680 ibd_wqe_t *wqe;
1681 ibt_wc_t *wc, *wcs;
1682 uint_t numwcs, real_numwcs;
1683 int i;
1684
1685 wcs = chan->rx_wc;
1686 numwcs = IBD_RC_MAX_CQ_WC;
1687
1688 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
1689 for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
1690 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
1691 if (wc->wc_status != IBT_WC_SUCCESS) {
1692 chan->state->rc_rcq_err++;
1693 /*
1694 * Channel being torn down.
1695 */
1696 DPRINT(40, "ibd_rc_poll_rcq: wc_status(%d) != "
1697 "SUCC, chan=%p", wc->wc_status, chan);
1698 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
1699 /*
1700 * Do not invoke Rx handler because
1701 * it might add buffers to the Rx pool
1702 * when we are trying to deinitialize.
1703 */
1704 continue;
1705 }
1706 }
1707 ibd_rc_process_rx(chan, WQE_TO_RWQE(wqe), wc);
1708 }
1709 }
1710 }
1711
1712 /* Receive CQ handler */
1713 /* ARGSUSED */
1714 static void
ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl,void * arg)1715 ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1716 {
1717 ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
1718 ibd_state_t *state = chan->state;
1719
1720 atomic_inc_32(&chan->rcq_invoking);
1721 ASSERT(chan->chan_state == IBD_RC_STATE_PAS_ESTAB);
1722
1723 /*
1724 * Poll for completed entries; the CQ will not interrupt any
1725 * more for incoming (or transmitted) packets.
1726 */
1727 ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1728
1729 /*
1730 * Now enable CQ notifications; all packets that arrive now
1731 * (or complete transmission) will cause new interrupts.
1732 */
1733 if (ibt_enable_cq_notify(chan->rcq_hdl, IBT_NEXT_COMPLETION) !=
1734 IBT_SUCCESS) {
1735 /*
1736 * We do not expect a failure here.
1737 */
1738 DPRINT(40, "ibd_rc_rcq_handler: ibt_enable_cq_notify() failed");
1739 }
1740
1741 /*
1742 * Repoll to catch all packets that might have arrived after
1743 * we finished the first poll loop and before interrupts got
1744 * armed.
1745 */
1746 ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1747
1748 if (state->rc_enable_srq) {
1749 mutex_enter(&state->rc_rx_lock);
1750
1751 if (state->rc_rx_mp != NULL) {
1752 mblk_t *mpc;
1753 mpc = state->rc_rx_mp;
1754
1755 state->rc_rx_mp = NULL;
1756 state->rc_rx_mp_tail = NULL;
1757 state->rc_rx_mp_len = 0;
1758
1759 mutex_exit(&state->rc_rx_lock);
1760 mac_rx(state->id_mh, NULL, mpc);
1761 } else {
1762 mutex_exit(&state->rc_rx_lock);
1763 }
1764 } else {
1765 mutex_enter(&chan->rx_lock);
1766
1767 if (chan->rx_mp != NULL) {
1768 mblk_t *mpc;
1769 mpc = chan->rx_mp;
1770
1771 chan->rx_mp = NULL;
1772 chan->rx_mp_tail = NULL;
1773 chan->rx_mp_len = 0;
1774
1775 mutex_exit(&chan->rx_lock);
1776 mac_rx(state->id_mh, NULL, mpc);
1777 } else {
1778 mutex_exit(&chan->rx_lock);
1779 }
1780 }
1781 atomic_dec_32(&chan->rcq_invoking);
1782 }
1783
1784 /*
1785 * Allocate the statically allocated Tx buffer list.
1786 */
1787 int
ibd_rc_init_tx_largebuf_list(ibd_state_t * state)1788 ibd_rc_init_tx_largebuf_list(ibd_state_t *state)
1789 {
1790 ibd_rc_tx_largebuf_t *lbufp;
1791 ibd_rc_tx_largebuf_t *tail;
1792 uint8_t *memp;
1793 ibt_mr_attr_t mem_attr;
1794 uint32_t num_swqe;
1795 size_t mem_size;
1796 int i;
1797
1798 num_swqe = state->id_rc_num_swqe - 1;
1799
1800 /*
1801 * Allocate one big chunk for all Tx large copy bufs
1802 */
1803 /* Don't transfer IPOIB_GRH_SIZE bytes (40 bytes) */
1804 mem_size = num_swqe * state->rc_mtu;
1805 state->rc_tx_mr_bufs = kmem_zalloc(mem_size, KM_SLEEP);
1806
1807 mem_attr.mr_len = mem_size;
1808 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_tx_mr_bufs;
1809 mem_attr.mr_as = NULL;
1810 mem_attr.mr_flags = IBT_MR_SLEEP;
1811 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1812 &state->rc_tx_mr_hdl, &state->rc_tx_mr_desc) != IBT_SUCCESS) {
1813 DPRINT(40, "ibd_rc_init_tx_largebuf_list: ibt_register_mr "
1814 "failed");
1815 kmem_free(state->rc_tx_mr_bufs, mem_size);
1816 state->rc_tx_mr_bufs = NULL;
1817 return (DDI_FAILURE);
1818 }
1819
1820 state->rc_tx_largebuf_desc_base = kmem_zalloc(num_swqe *
1821 sizeof (ibd_rc_tx_largebuf_t), KM_SLEEP);
1822
1823 /*
1824 * Set up the buf chain
1825 */
1826 memp = state->rc_tx_mr_bufs;
1827 mutex_enter(&state->rc_tx_large_bufs_lock);
1828 lbufp = state->rc_tx_largebuf_desc_base;
1829 for (i = 0; i < num_swqe; i++) {
1830 lbufp->lb_buf = memp;
1831 lbufp->lb_next = lbufp + 1;
1832
1833 tail = lbufp;
1834
1835 memp += state->rc_mtu;
1836 lbufp++;
1837 }
1838 tail->lb_next = NULL;
1839
1840 /*
1841 * Set up the buffer information in ibd state
1842 */
1843 state->rc_tx_largebuf_free_head = state->rc_tx_largebuf_desc_base;
1844 state->rc_tx_largebuf_nfree = num_swqe;
1845 mutex_exit(&state->rc_tx_large_bufs_lock);
1846 return (DDI_SUCCESS);
1847 }
1848
1849 void
ibd_rc_fini_tx_largebuf_list(ibd_state_t * state)1850 ibd_rc_fini_tx_largebuf_list(ibd_state_t *state)
1851 {
1852 uint32_t num_swqe;
1853
1854 num_swqe = state->id_rc_num_swqe - 1;
1855
1856 if (ibt_deregister_mr(state->id_hca_hdl,
1857 state->rc_tx_mr_hdl) != IBT_SUCCESS) {
1858 DPRINT(40, "ibd_rc_fini_tx_largebuf_list: ibt_deregister_mr() "
1859 "failed");
1860 }
1861 state->rc_tx_mr_hdl = NULL;
1862
1863 kmem_free(state->rc_tx_mr_bufs, num_swqe * state->rc_mtu);
1864 state->rc_tx_mr_bufs = NULL;
1865
1866 kmem_free(state->rc_tx_largebuf_desc_base,
1867 num_swqe * sizeof (ibd_rc_tx_largebuf_t));
1868 state->rc_tx_largebuf_desc_base = NULL;
1869 }
1870
1871 static int
ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t * chan)1872 ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t *chan)
1873 {
1874 ibt_mr_attr_t mem_attr;
1875 ibd_state_t *state;
1876
1877 state = chan->state;
1878 ASSERT(state != NULL);
1879
1880 /*
1881 * Allocate one big chunk for all regular tx copy bufs
1882 */
1883 mem_attr.mr_len = chan->scq_size * state->id_rc_tx_copy_thresh;
1884
1885 chan->tx_mr_bufs = kmem_zalloc(mem_attr.mr_len, KM_SLEEP);
1886
1887 /*
1888 * Do one memory registration on the entire txbuf area
1889 */
1890 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->tx_mr_bufs;
1891 mem_attr.mr_as = NULL;
1892 mem_attr.mr_flags = IBT_MR_SLEEP;
1893 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1894 &chan->tx_mr_hdl, &chan->tx_mr_desc) != IBT_SUCCESS) {
1895 DPRINT(40, "ibd_rc_alloc_tx_copybufs: ibt_register_mr failed");
1896 ASSERT(mem_attr.mr_len ==
1897 chan->scq_size * state->id_rc_tx_copy_thresh);
1898 kmem_free(chan->tx_mr_bufs, mem_attr.mr_len);
1899 chan->tx_mr_bufs = NULL;
1900 return (DDI_FAILURE);
1901 }
1902
1903 return (DDI_SUCCESS);
1904 }
1905
1906 /*
1907 * Allocate the statically allocated Tx buffer list.
1908 */
1909 static int
ibd_rc_init_txlist(ibd_rc_chan_t * chan)1910 ibd_rc_init_txlist(ibd_rc_chan_t *chan)
1911 {
1912 ibd_swqe_t *swqe;
1913 int i;
1914 ibt_lkey_t lkey;
1915 ibd_state_t *state = chan->state;
1916
1917 if (ibd_rc_alloc_tx_copybufs(chan) != DDI_SUCCESS)
1918 return (DDI_FAILURE);
1919
1920 /*
1921 * Allocate and setup the swqe list
1922 */
1923 lkey = chan->tx_mr_desc.md_lkey;
1924 chan->tx_wqes = kmem_zalloc(chan->scq_size *
1925 sizeof (ibd_swqe_t), KM_SLEEP);
1926 swqe = chan->tx_wqes;
1927 for (i = 0; i < chan->scq_size; i++, swqe++) {
1928 swqe->swqe_next = NULL;
1929 swqe->swqe_im_mblk = NULL;
1930
1931 swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
1932 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
1933
1934 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
1935 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
1936 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
1937 (chan->tx_mr_bufs + i * state->id_rc_tx_copy_thresh);
1938 swqe->w_swr.wr_trans = IBT_RC_SRV;
1939
1940 /* Add to list */
1941 mutex_enter(&chan->tx_wqe_list.dl_mutex);
1942 chan->tx_wqe_list.dl_cnt++;
1943 swqe->swqe_next = chan->tx_wqe_list.dl_head;
1944 chan->tx_wqe_list.dl_head = SWQE_TO_WQE(swqe);
1945 mutex_exit(&chan->tx_wqe_list.dl_mutex);
1946 }
1947
1948 return (DDI_SUCCESS);
1949 }
1950
1951 /*
1952 * Free the statically allocated Tx buffer list.
1953 */
1954 static void
ibd_rc_fini_txlist(ibd_rc_chan_t * chan)1955 ibd_rc_fini_txlist(ibd_rc_chan_t *chan)
1956 {
1957 ibd_state_t *state = chan->state;
1958 if (chan->tx_mr_hdl != NULL) {
1959 if (ibt_deregister_mr(chan->state->id_hca_hdl,
1960 chan->tx_mr_hdl) != IBT_SUCCESS) {
1961 DPRINT(40, "ibd_rc_fini_txlist: ibt_deregister_mr "
1962 "failed");
1963 }
1964 chan->tx_mr_hdl = NULL;
1965 }
1966
1967 if (chan->tx_mr_bufs != NULL) {
1968 kmem_free(chan->tx_mr_bufs, chan->scq_size *
1969 state->id_rc_tx_copy_thresh);
1970 chan->tx_mr_bufs = NULL;
1971 }
1972
1973 if (chan->tx_wqes != NULL) {
1974 kmem_free(chan->tx_wqes, chan->scq_size *
1975 sizeof (ibd_swqe_t));
1976 chan->tx_wqes = NULL;
1977 }
1978 }
1979
1980 /*
1981 * Acquire send wqe from free list.
1982 * Returns error number and send wqe pointer.
1983 */
1984 ibd_swqe_t *
ibd_rc_acquire_swqes(ibd_rc_chan_t * chan)1985 ibd_rc_acquire_swqes(ibd_rc_chan_t *chan)
1986 {
1987 ibd_swqe_t *wqe;
1988
1989 mutex_enter(&chan->tx_rel_list.dl_mutex);
1990 if (chan->tx_rel_list.dl_head != NULL) {
1991 /* transfer id_tx_rel_list to id_tx_list */
1992 chan->tx_wqe_list.dl_head =
1993 chan->tx_rel_list.dl_head;
1994 chan->tx_wqe_list.dl_cnt =
1995 chan->tx_rel_list.dl_cnt;
1996 chan->tx_wqe_list.dl_pending_sends = B_FALSE;
1997
1998 /* clear id_tx_rel_list */
1999 chan->tx_rel_list.dl_head = NULL;
2000 chan->tx_rel_list.dl_cnt = 0;
2001 mutex_exit(&chan->tx_rel_list.dl_mutex);
2002
2003 wqe = WQE_TO_SWQE(chan->tx_wqe_list.dl_head);
2004 chan->tx_wqe_list.dl_cnt -= 1;
2005 chan->tx_wqe_list.dl_head = wqe->swqe_next;
2006 } else { /* no free swqe */
2007 mutex_exit(&chan->tx_rel_list.dl_mutex);
2008 chan->tx_wqe_list.dl_pending_sends = B_TRUE;
2009 wqe = NULL;
2010 }
2011 return (wqe);
2012 }
2013
2014 /*
2015 * Release send wqe back into free list.
2016 */
2017 static void
ibd_rc_release_swqe(ibd_rc_chan_t * chan,ibd_swqe_t * swqe)2018 ibd_rc_release_swqe(ibd_rc_chan_t *chan, ibd_swqe_t *swqe)
2019 {
2020 /*
2021 * Add back on Tx list for reuse.
2022 */
2023 swqe->swqe_next = NULL;
2024 mutex_enter(&chan->tx_rel_list.dl_mutex);
2025 chan->tx_rel_list.dl_pending_sends = B_FALSE;
2026 swqe->swqe_next = chan->tx_rel_list.dl_head;
2027 chan->tx_rel_list.dl_head = SWQE_TO_WQE(swqe);
2028 chan->tx_rel_list.dl_cnt++;
2029 mutex_exit(&chan->tx_rel_list.dl_mutex);
2030 }
2031
2032 void
ibd_rc_post_send(ibd_rc_chan_t * chan,ibd_swqe_t * node)2033 ibd_rc_post_send(ibd_rc_chan_t *chan, ibd_swqe_t *node)
2034 {
2035 uint_t i;
2036 uint_t num_posted;
2037 uint_t n_wrs;
2038 ibt_status_t ibt_status;
2039 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE];
2040 ibd_swqe_t *tx_head, *elem;
2041 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE];
2042
2043 /* post the one request, then check for more */
2044 ibt_status = ibt_post_send(chan->chan_hdl,
2045 &node->w_swr, 1, NULL);
2046 if (ibt_status != IBT_SUCCESS) {
2047 ibd_print_warn(chan->state, "ibd_post_send: "
2048 "posting one wr failed: ret=%d", ibt_status);
2049 ibd_rc_tx_cleanup(node);
2050 }
2051
2052 tx_head = NULL;
2053 for (;;) {
2054 if (tx_head == NULL) {
2055 mutex_enter(&chan->tx_post_lock);
2056 tx_head = chan->tx_head;
2057 if (tx_head == NULL) {
2058 chan->tx_busy = 0;
2059 mutex_exit(&chan->tx_post_lock);
2060 return;
2061 }
2062 chan->tx_head = NULL;
2063 mutex_exit(&chan->tx_post_lock);
2064 }
2065
2066 /*
2067 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
2068 * at a time if possible, and keep posting them.
2069 */
2070 for (n_wrs = 0, elem = tx_head;
2071 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
2072 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
2073 nodes[n_wrs] = elem;
2074 wrs[n_wrs] = elem->w_swr;
2075 }
2076 tx_head = elem;
2077
2078 ASSERT(n_wrs != 0);
2079
2080 /*
2081 * If posting fails for some reason, we'll never receive
2082 * completion intimation, so we'll need to cleanup. But
2083 * we need to make sure we don't clean up nodes whose
2084 * wrs have been successfully posted. We assume that the
2085 * hca driver returns on the first failure to post and
2086 * therefore the first 'num_posted' entries don't need
2087 * cleanup here.
2088 */
2089 num_posted = 0;
2090 ibt_status = ibt_post_send(chan->chan_hdl,
2091 wrs, n_wrs, &num_posted);
2092 if (ibt_status != IBT_SUCCESS) {
2093 ibd_print_warn(chan->state, "ibd_post_send: "
2094 "posting multiple wrs failed: "
2095 "requested=%d, done=%d, ret=%d",
2096 n_wrs, num_posted, ibt_status);
2097
2098 for (i = num_posted; i < n_wrs; i++)
2099 ibd_rc_tx_cleanup(nodes[i]);
2100 }
2101 }
2102 }
2103
2104 /*
2105 * Common code that deals with clean ups after a successful or
2106 * erroneous transmission attempt.
2107 */
2108 void
ibd_rc_tx_cleanup(ibd_swqe_t * swqe)2109 ibd_rc_tx_cleanup(ibd_swqe_t *swqe)
2110 {
2111 ibd_ace_t *ace = swqe->w_ahandle;
2112 ibd_state_t *state;
2113
2114 ASSERT(ace != NULL);
2115 ASSERT(ace->ac_chan != NULL);
2116
2117 state = ace->ac_chan->state;
2118
2119 /*
2120 * If this was a dynamic registration in ibd_send(),
2121 * deregister now.
2122 */
2123 if (swqe->swqe_im_mblk != NULL) {
2124 ASSERT(swqe->w_buftype == IBD_WQE_MAPPED);
2125 if (swqe->w_buftype == IBD_WQE_MAPPED) {
2126 ibd_unmap_mem(state, swqe);
2127 }
2128 freemsg(swqe->swqe_im_mblk);
2129 swqe->swqe_im_mblk = NULL;
2130 } else {
2131 ASSERT(swqe->w_buftype != IBD_WQE_MAPPED);
2132 }
2133
2134 if (swqe->w_buftype == IBD_WQE_RC_COPYBUF) {
2135 ibd_rc_tx_largebuf_t *lbufp;
2136
2137 lbufp = swqe->w_rc_tx_largebuf;
2138 ASSERT(lbufp != NULL);
2139
2140 mutex_enter(&state->rc_tx_large_bufs_lock);
2141 lbufp->lb_next = state->rc_tx_largebuf_free_head;
2142 state->rc_tx_largebuf_free_head = lbufp;
2143 state->rc_tx_largebuf_nfree ++;
2144 mutex_exit(&state->rc_tx_large_bufs_lock);
2145 swqe->w_rc_tx_largebuf = NULL;
2146 }
2147
2148
2149 /*
2150 * Release the send wqe for reuse.
2151 */
2152 ibd_rc_release_swqe(ace->ac_chan, swqe);
2153
2154 /*
2155 * Drop the reference count on the AH; it can be reused
2156 * now for a different destination if there are no more
2157 * posted sends that will use it. This can be eliminated
2158 * if we can always associate each Tx buffer with an AH.
2159 * The ace can be null if we are cleaning up from the
2160 * ibd_send() error path.
2161 */
2162 ibd_dec_ref_ace(state, ace);
2163 }
2164
/*
 * Drain the send completion queue 'cq_hdl' of RC channel 'chan'.
 *
 * Completions are polled in batches of up to IBD_RC_MAX_CQ_WC entries
 * into chan->tx_wc until ibt_poll_cq() stops returning IBT_SUCCESS.
 * Every completion's wqe (recovered from wc_id) is passed to
 * ibd_rc_tx_cleanup().
 *
 * The first completion in this call whose status is not IBT_WC_SUCCESS
 * triggers error handling, once per call: if the channel is still in
 * IBD_RC_STATE_ACT_ESTAB, the link is up and the address cache still maps
 * the peer to chan->ace, the ace is pulled out of the active list, the
 * channel is moved to IBD_RC_STATE_ACT_CLOSING and an asynchronous close
 * is requested.  Otherwise only the rc_act_close_simultaneous counter is
 * bumped.
 *
 * After each batch, state->id_sched_needed is examined under
 * id_sched_lock.  For the first resource flag that is set (checked in the
 * order RC swqe, RC large Tx buffer, UD swqe) and whose free count has
 * risen above its threshold, the flag is cleared and mac_tx_update() is
 * called after all locks have been dropped.
 */
void
ibd_rc_drain_scq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
{
	ibd_state_t *state = chan->state;
	ibd_wqe_t *wqe;
	ibt_wc_t *wc, *wcs;
	ibd_ace_t *ace;
	uint_t numwcs, real_numwcs;
	int i;
	boolean_t encount_error;

	wcs = chan->tx_wc;
	numwcs = IBD_RC_MAX_CQ_WC;
	encount_error = B_FALSE;

	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				if (encount_error == B_FALSE) {
					/*
					 * This RC channel is in error status,
					 * remove it.
					 */
					encount_error = B_TRUE;
					mutex_enter(&state->id_ac_mutex);
					if ((chan->chan_state ==
					    IBD_RC_STATE_ACT_ESTAB) &&
					    (chan->state->id_link_state ==
					    LINK_STATE_UP) &&
					    ((ace = ibd_acache_find(state,
					    &chan->ace->ac_mac, B_FALSE, 0))
					    != NULL) && (ace == chan->ace)) {
						/*
						 * Take a reference, remove
						 * the ace from the active
						 * list and mark the channel
						 * as closing, all while
						 * id_ac_mutex is held.
						 */
						ASSERT(ace->ac_mce == NULL);
						INC_REF(ace, 1);
						IBD_ACACHE_PULLOUT_ACTIVE(
						    state, ace);
						chan->chan_state =
						    IBD_RC_STATE_ACT_CLOSING;
						mutex_exit(&state->id_ac_mutex);
						state->rc_reset_cnt++;
						DPRINT(30, "ibd_rc_drain_scq: "
						    "wc_status(%d) != SUCC, "
						    "chan=%p, ace=%p, "
						    "link_state=%d"
						    "reset RC channel",
						    wc->wc_status, chan,
						    chan->ace, chan->state->
						    id_link_state);
						/* Request the close itself. */
						ibd_rc_signal_act_close(
						    state, ace);
					} else {
						/*
						 * One of the conditions above
						 * did not hold; only count
						 * the event and log it.
						 */
						mutex_exit(&state->id_ac_mutex);
						state->
						    rc_act_close_simultaneous++;
						DPRINT(40, "ibd_rc_drain_scq: "
						    "wc_status(%d) != SUCC, "
						    "chan=%p, chan_state=%d,"
						    "ace=%p, link_state=%d."
						    "other thread is closing "
						    "it", wc->wc_status, chan,
						    chan->chan_state, chan->ace,
						    chan->state->id_link_state);
					}
				}
			}
			/* Recycle the wqe whether or not the send succeeded. */
			ibd_rc_tx_cleanup(WQE_TO_SWQE(wqe));
		}

		/*
		 * See whether a resource that was reported as exhausted has
		 * become available again.  Lock order in each branch is
		 * id_sched_lock first, then the list/buffer locks; all of
		 * them are dropped before mac_tx_update() is called.
		 */
		mutex_enter(&state->id_sched_lock);
		if (state->id_sched_needed == 0) {
			mutex_exit(&state->id_sched_lock);
		} else if (state->id_sched_needed & IBD_RSRC_RC_SWQE) {
			/* RC send wqes: count both of this channel's lists */
			mutex_enter(&chan->tx_wqe_list.dl_mutex);
			mutex_enter(&chan->tx_rel_list.dl_mutex);
			if ((chan->tx_rel_list.dl_cnt +
			    chan->tx_wqe_list.dl_cnt) > IBD_RC_TX_FREE_THRESH) {
				state->id_sched_needed &= ~IBD_RSRC_RC_SWQE;
				mutex_exit(&chan->tx_rel_list.dl_mutex);
				mutex_exit(&chan->tx_wqe_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
				state->rc_swqe_mac_update++;
				mac_tx_update(state->id_mh);
			} else {
				state->rc_scq_no_swqe++;
				mutex_exit(&chan->tx_rel_list.dl_mutex);
				mutex_exit(&chan->tx_wqe_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
			}
		} else if (state->id_sched_needed & IBD_RSRC_RC_TX_LARGEBUF) {
			/* RC large Tx buffers */
			mutex_enter(&state->rc_tx_large_bufs_lock);
			if (state->rc_tx_largebuf_nfree >
			    IBD_RC_TX_FREE_THRESH) {
				ASSERT(state->rc_tx_largebuf_free_head != NULL);
				state->id_sched_needed &=
				    ~IBD_RSRC_RC_TX_LARGEBUF;
				mutex_exit(&state->rc_tx_large_bufs_lock);
				mutex_exit(&state->id_sched_lock);
				state->rc_xmt_buf_mac_update++;
				mac_tx_update(state->id_mh);
			} else {
				state->rc_scq_no_largebuf++;
				mutex_exit(&state->rc_tx_large_bufs_lock);
				mutex_exit(&state->id_sched_lock);
			}
		} else if (state->id_sched_needed & IBD_RSRC_SWQE) {
			/* Per-state send wqes (id_tx_list + id_tx_rel_list) */
			mutex_enter(&state->id_tx_list.dl_mutex);
			mutex_enter(&state->id_tx_rel_list.dl_mutex);
			if ((state->id_tx_list.dl_cnt +
			    state->id_tx_rel_list.dl_cnt)
			    > IBD_FREE_SWQES_THRESH) {
				state->id_sched_needed &= ~IBD_RSRC_SWQE;
				state->id_sched_cnt++;
				mutex_exit(&state->id_tx_rel_list.dl_mutex);
				mutex_exit(&state->id_tx_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
				mac_tx_update(state->id_mh);
			} else {
				mutex_exit(&state->id_tx_rel_list.dl_mutex);
				mutex_exit(&state->id_tx_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
			}
		} else {
			/* Some other resource flag; nothing to do here. */
			mutex_exit(&state->id_sched_lock);
		}
	}
}
2292
2293 /* Send CQ handler, call ibd_rx_tx_cleanup to recycle Tx buffers */
2294 /* ARGSUSED */
2295 static void
ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl,void * arg)2296 ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
2297 {
2298 ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2299
2300 if (ibd_rc_tx_softintr == 1) {
2301 mutex_enter(&chan->tx_poll_lock);
2302 if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2303 chan->tx_poll_busy |= IBD_REDO_CQ_POLLING;
2304 mutex_exit(&chan->tx_poll_lock);
2305 return;
2306 } else {
2307 mutex_exit(&chan->tx_poll_lock);
2308 ddi_trigger_softintr(chan->scq_softintr);
2309 }
2310 } else
2311 (void) ibd_rc_tx_recycle(arg);
2312 }
2313
2314 static uint_t
ibd_rc_tx_recycle(caddr_t arg)2315 ibd_rc_tx_recycle(caddr_t arg)
2316 {
2317 ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2318 ibd_state_t *state = chan->state;
2319 int flag, redo_flag;
2320 int redo = 1;
2321
2322 flag = IBD_CQ_POLLING;
2323 redo_flag = IBD_REDO_CQ_POLLING;
2324
2325 mutex_enter(&chan->tx_poll_lock);
2326 if (chan->tx_poll_busy & flag) {
2327 ibd_print_warn(state, "ibd_rc_tx_recycle: multiple polling "
2328 "threads");
2329 chan->tx_poll_busy |= redo_flag;
2330 mutex_exit(&chan->tx_poll_lock);
2331 return (DDI_INTR_CLAIMED);
2332 }
2333 chan->tx_poll_busy |= flag;
2334 mutex_exit(&chan->tx_poll_lock);
2335
2336 /*
2337 * Poll for completed entries; the CQ will not interrupt any
2338 * more for completed packets.
2339 */
2340 ibd_rc_drain_scq(chan, chan->scq_hdl);
2341
2342 /*
2343 * Now enable CQ notifications; all completions originating now
2344 * will cause new interrupts.
2345 */
2346 do {
2347 if (ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION) !=
2348 IBT_SUCCESS) {
2349 /*
2350 * We do not expect a failure here.
2351 */
2352 DPRINT(40, "ibd_rc_scq_handler: ibt_enable_cq_notify()"
2353 " failed");
2354 }
2355
2356 ibd_rc_drain_scq(chan, chan->scq_hdl);
2357
2358 mutex_enter(&chan->tx_poll_lock);
2359 if (chan->tx_poll_busy & redo_flag)
2360 chan->tx_poll_busy &= ~redo_flag;
2361 else {
2362 chan->tx_poll_busy &= ~flag;
2363 redo = 0;
2364 }
2365 mutex_exit(&chan->tx_poll_lock);
2366
2367 } while (redo);
2368
2369 return (DDI_INTR_CLAIMED);
2370 }
2371
2372 static ibt_status_t
ibd_register_service(ibt_srv_desc_t * srv,ib_svc_id_t sid,int num_sids,ibt_srv_hdl_t * srv_hdl,ib_svc_id_t * ret_sid)2373 ibd_register_service(ibt_srv_desc_t *srv, ib_svc_id_t sid,
2374 int num_sids, ibt_srv_hdl_t *srv_hdl, ib_svc_id_t *ret_sid)
2375 {
2376 ibd_service_t *p;
2377 ibt_status_t status;
2378
2379 mutex_enter(&ibd_gstate.ig_mutex);
2380 for (p = ibd_gstate.ig_service_list; p != NULL; p = p->is_link) {
2381 if (p->is_sid == sid) {
2382 p->is_ref_cnt++;
2383 *srv_hdl = p->is_srv_hdl;
2384 *ret_sid = sid;
2385 mutex_exit(&ibd_gstate.ig_mutex);
2386 return (IBT_SUCCESS);
2387 }
2388 }
2389 status = ibt_register_service(ibd_gstate.ig_ibt_hdl, srv, sid,
2390 num_sids, srv_hdl, ret_sid);
2391 if (status == IBT_SUCCESS) {
2392 p = kmem_alloc(sizeof (*p), KM_SLEEP);
2393 p->is_srv_hdl = *srv_hdl;
2394 p->is_sid = sid;
2395 p->is_ref_cnt = 1;
2396 p->is_link = ibd_gstate.ig_service_list;
2397 ibd_gstate.ig_service_list = p;
2398 }
2399 mutex_exit(&ibd_gstate.ig_mutex);
2400 return (status);
2401 }
2402
2403 static ibt_status_t
ibd_deregister_service(ibt_srv_hdl_t srv_hdl)2404 ibd_deregister_service(ibt_srv_hdl_t srv_hdl)
2405 {
2406 ibd_service_t *p, **pp;
2407 ibt_status_t status;
2408
2409 mutex_enter(&ibd_gstate.ig_mutex);
2410 for (pp = &ibd_gstate.ig_service_list; *pp != NULL;
2411 pp = &((*pp)->is_link)) {
2412 p = *pp;
2413 if (p->is_srv_hdl == srv_hdl) { /* Found it */
2414 if (--p->is_ref_cnt == 0) {
2415 status = ibt_deregister_service(
2416 ibd_gstate.ig_ibt_hdl, srv_hdl);
2417 *pp = p->is_link; /* link prev to next */
2418 kmem_free(p, sizeof (*p));
2419 } else {
2420 status = IBT_SUCCESS;
2421 }
2422 mutex_exit(&ibd_gstate.ig_mutex);
2423 return (status);
2424 }
2425 }
2426 /* Should not ever get here */
2427 mutex_exit(&ibd_gstate.ig_mutex);
2428 return (IBT_FAILURE);
2429 }
2430
2431 /* Listen with corresponding service ID */
2432 ibt_status_t
ibd_rc_listen(ibd_state_t * state)2433 ibd_rc_listen(ibd_state_t *state)
2434 {
2435 ibt_srv_desc_t srvdesc;
2436 ib_svc_id_t ret_sid;
2437 ibt_status_t status;
2438 ib_gid_t gid;
2439
2440 if (state->rc_listen_hdl != NULL) {
2441 DPRINT(40, "ibd_rc_listen: rc_listen_hdl should be NULL");
2442 return (IBT_FAILURE);
2443 }
2444
2445 bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2446 srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2447 srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2448
2449 /*
2450 * Register the service with service id
2451 * Incoming connection requests should arrive on this service id.
2452 */
2453 status = ibd_register_service(&srvdesc,
2454 IBD_RC_QPN_TO_SID(state->id_qpnum),
2455 1, &state->rc_listen_hdl, &ret_sid);
2456 if (status != IBT_SUCCESS) {
2457 DPRINT(40, "ibd_rc_listen: Service Registration Failed, "
2458 "ret=%d", status);
2459 return (status);
2460 }
2461
2462 gid = state->id_sgid;
2463
2464 /* pass state as cm_private */
2465 status = ibt_bind_service(state->rc_listen_hdl,
2466 gid, NULL, state, &state->rc_listen_bind);
2467 if (status != IBT_SUCCESS) {
2468 DPRINT(40, "ibd_rc_listen:"
2469 " fail to bind port: <%d>", status);
2470 (void) ibd_deregister_service(state->rc_listen_hdl);
2471 return (status);
2472 }
2473
2474 /*
2475 * Legacy OFED had used a wrong service ID (one additional zero digit)
2476 * for many years. To interop with legacy OFED, we support this wrong
2477 * service ID here.
2478 */
2479 ASSERT(state->rc_listen_hdl_OFED_interop == NULL);
2480
2481 bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2482 srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2483 srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2484
2485 /*
2486 * Register the service with service id
2487 * Incoming connection requests should arrive on this service id.
2488 */
2489 status = ibd_register_service(&srvdesc,
2490 IBD_RC_QPN_TO_SID_OFED_INTEROP(state->id_qpnum),
2491 1, &state->rc_listen_hdl_OFED_interop, &ret_sid);
2492 if (status != IBT_SUCCESS) {
2493 DPRINT(40,
2494 "ibd_rc_listen: Service Registration for Legacy OFED "
2495 "Failed %d", status);
2496 (void) ibt_unbind_service(state->rc_listen_hdl,
2497 state->rc_listen_bind);
2498 (void) ibd_deregister_service(state->rc_listen_hdl);
2499 return (status);
2500 }
2501
2502 gid = state->id_sgid;
2503
2504 /* pass state as cm_private */
2505 status = ibt_bind_service(state->rc_listen_hdl_OFED_interop,
2506 gid, NULL, state, &state->rc_listen_bind_OFED_interop);
2507 if (status != IBT_SUCCESS) {
2508 DPRINT(40, "ibd_rc_listen: fail to bind port: <%d> for "
2509 "Legacy OFED listener", status);
2510 (void) ibd_deregister_service(
2511 state->rc_listen_hdl_OFED_interop);
2512 (void) ibt_unbind_service(state->rc_listen_hdl,
2513 state->rc_listen_bind);
2514 (void) ibd_deregister_service(state->rc_listen_hdl);
2515 return (status);
2516 }
2517
2518 return (IBT_SUCCESS);
2519 }
2520
2521 void
ibd_rc_stop_listen(ibd_state_t * state)2522 ibd_rc_stop_listen(ibd_state_t *state)
2523 {
2524 int ret;
2525
2526 /* Disable incoming connection requests */
2527 if (state->rc_listen_hdl != NULL) {
2528 ret = ibt_unbind_all_services(state->rc_listen_hdl);
2529 if (ret != 0) {
2530 DPRINT(40, "ibd_rc_stop_listen:"
2531 "ibt_unbind_all_services() failed, ret=%d", ret);
2532 }
2533 ret = ibd_deregister_service(state->rc_listen_hdl);
2534 if (ret != 0) {
2535 DPRINT(40, "ibd_rc_stop_listen:"
2536 "ibd_deregister_service() failed, ret=%d", ret);
2537 } else {
2538 state->rc_listen_hdl = NULL;
2539 }
2540 }
2541
2542 /* Disable incoming connection requests */
2543 if (state->rc_listen_hdl_OFED_interop != NULL) {
2544 ret = ibt_unbind_all_services(
2545 state->rc_listen_hdl_OFED_interop);
2546 if (ret != 0) {
2547 DPRINT(40, "ibd_rc_stop_listen:"
2548 "ibt_unbind_all_services() failed: %d", ret);
2549 }
2550 ret = ibd_deregister_service(state->rc_listen_hdl_OFED_interop);
2551 if (ret != 0) {
2552 DPRINT(40, "ibd_rc_stop_listen:"
2553 "ibd_deregister_service() failed: %d", ret);
2554 } else {
2555 state->rc_listen_hdl_OFED_interop = NULL;
2556 }
2557 }
2558 }
2559
/*
 * Close every RC channel, passive and active, that belongs to 'state'.
 *
 * The sequence is:
 *   1. Clear the receive CQ handler of every channel on
 *      rc_pass_chan_list.
 *   2. If SRQ is enabled, wait (up to 10 x 100ms) for
 *      rc_srq_rwqe_list.dl_bufs_outstanding to drop to zero.
 *   3. Remove each passive channel from rc_pass_chan_list and close it
 *      with ibd_rc_pas_close().
 *   4. Under id_ac_mutex, pull every ace that has a channel out of the
 *      active list, mark its channel IBD_RC_STATE_ACT_CLOSING and move
 *      the channel to rc_obs_act_chan_list.
 *   5. Close each channel on rc_obs_act_chan_list with
 *      ibd_rc_act_close() and release or defer-recycle its ace.
 *   6. Wait (up to 400 x 100ms) for rc_num_tx_chan and rc_num_rx_chan
 *      to both reach zero.
 *
 * Nothing is returned; a wait that times out is simply abandoned.
 */
void
ibd_rc_close_all_chan(ibd_state_t *state)
{
	ibd_rc_chan_t *rc_chan;
	ibd_ace_t *ace, *pre_ace;
	uint_t attempts;

	/* Disable all Rx routines */
	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
	rc_chan = state->rc_pass_chan_list.chan_list;
	while (rc_chan != NULL) {
		ibt_set_cq_handler(rc_chan->rcq_hdl, 0, 0);
		rc_chan = rc_chan->next;
	}
	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);

	if (state->rc_enable_srq) {
		attempts = 10;
		while (state->rc_srq_rwqe_list.dl_bufs_outstanding > 0) {
			DPRINT(30, "ibd_rc_close_all_chan: outstanding > 0");
			delay(drv_usectohz(100000));
			if (--attempts == 0) {
				/*
				 * There are still buffers outstanding after
				 * ten 100ms waits.  Stop waiting and carry
				 * on with the teardown; no failure is
				 * reported to the caller.
				 */
				break;
			}
		}
	}

	/* Close all passive RC channels */
	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
	while (rc_chan != NULL) {
		(void) ibd_rc_pas_close(rc_chan, B_TRUE, B_FALSE);
		rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
	}

	/* Close all active RC channels */
	mutex_enter(&state->id_ac_mutex);
	state->id_ac_hot_ace = NULL;
	ace = list_head(&state->id_ah_active);
	while ((pre_ace = ace) != NULL) {
		/* Fetch the successor first; pre_ace may leave the list. */
		ace = list_next(&state->id_ah_active, ace);
		if (pre_ace->ac_chan != NULL) {
			/* This reference is dropped in the loop below. */
			INC_REF(pre_ace, 1);
			IBD_ACACHE_PULLOUT_ACTIVE(state, pre_ace);
			pre_ace->ac_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
			ibd_rc_add_to_chan_list(&state->rc_obs_act_chan_list,
			    pre_ace->ac_chan);
		}
	}
	mutex_exit(&state->id_ac_mutex);

	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_obs_act_chan_list);
	while (rc_chan != NULL) {
		/* Save the ace before the channel is closed. */
		ace = rc_chan->ace;
		ibd_rc_act_close(rc_chan, B_TRUE);
		if (ace != NULL) {
			mutex_enter(&state->id_ac_mutex);
			ASSERT(ace->ac_ref != 0);
			atomic_dec_32(&ace->ac_ref);
			ace->ac_chan = NULL;
			/*
			 * With no references left (ignoring the CYCLEVAL
			 * marker) the ace goes back on the free list;
			 * otherwise it is marked with CYCLEVAL and counted
			 * in rc_delay_ace_recycle.
			 */
			if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
				IBD_ACACHE_INSERT_FREE(state, ace);
				ace->ac_ref = 0;
			} else {
				ace->ac_ref |= CYCLEVAL;
				state->rc_delay_ace_recycle++;
			}
			mutex_exit(&state->id_ac_mutex);
		}
		rc_chan = ibd_rc_rm_header_chan_list(
		    &state->rc_obs_act_chan_list);
	}

	attempts = 400;
	while (((state->rc_num_tx_chan != 0) ||
	    (state->rc_num_rx_chan != 0)) && (attempts > 0)) {
		/* Another thread is closing a CM channel; wait for it */
		delay(drv_usectohz(100000));
		attempts--;
	}
}
2648
2649 void
ibd_rc_try_connect(ibd_state_t * state,ibd_ace_t * ace,ibt_path_info_t * path)2650 ibd_rc_try_connect(ibd_state_t *state, ibd_ace_t *ace, ibt_path_info_t *path)
2651 {
2652 ibt_status_t status;
2653
2654 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
2655 return;
2656
2657 status = ibd_rc_connect(state, ace, path,
2658 IBD_RC_SERVICE_ID_OFED_INTEROP);
2659
2660 if (status != IBT_SUCCESS) {
2661 /* wait peer side remove stale channel */
2662 delay(drv_usectohz(10000));
2663 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
2664 return;
2665 status = ibd_rc_connect(state, ace, path,
2666 IBD_RC_SERVICE_ID_OFED_INTEROP);
2667 }
2668
2669 if (status != IBT_SUCCESS) {
2670 /* wait peer side remove stale channel */
2671 delay(drv_usectohz(10000));
2672 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
2673 return;
2674 (void) ibd_rc_connect(state, ace, path,
2675 IBD_RC_SERVICE_ID);
2676 }
2677 }
2678
2679 /*
2680 * Allocates channel and sets the ace->ac_chan to it.
2681 * Opens the channel.
2682 */
2683 ibt_status_t
ibd_rc_connect(ibd_state_t * state,ibd_ace_t * ace,ibt_path_info_t * path,uint64_t ietf_cm_service_id)2684 ibd_rc_connect(ibd_state_t *state, ibd_ace_t *ace, ibt_path_info_t *path,
2685 uint64_t ietf_cm_service_id)
2686 {
2687 ibt_status_t status = 0;
2688 ibt_rc_returns_t open_returns;
2689 ibt_chan_open_args_t open_args;
2690 ibd_rc_msg_hello_t hello_req_msg;
2691 ibd_rc_msg_hello_t *hello_ack_msg;
2692 ibd_rc_chan_t *chan;
2693 ibt_ud_dest_query_attr_t dest_attrs;
2694
2695 ASSERT(ace != NULL);
2696 ASSERT(ace->ac_mce == NULL);
2697 ASSERT(ace->ac_chan == NULL);
2698
2699 if ((status = ibd_rc_alloc_chan(&chan, state, B_TRUE)) != IBT_SUCCESS) {
2700 DPRINT(10, "ibd_rc_connect: ibd_rc_alloc_chan() failed");
2701 return (status);
2702 }
2703
2704 ace->ac_chan = chan;
2705 chan->state = state;
2706 chan->ace = ace;
2707
2708 ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)ace);
2709
2710 hello_ack_msg = kmem_zalloc(sizeof (ibd_rc_msg_hello_t), KM_SLEEP);
2711
2712 /*
2713 * open the channels
2714 */
2715 bzero(&open_args, sizeof (ibt_chan_open_args_t));
2716 bzero(&open_returns, sizeof (ibt_rc_returns_t));
2717
2718 open_args.oc_cm_handler = ibd_rc_dispatch_actv_mad;
2719 open_args.oc_cm_clnt_private = (void *)(uintptr_t)ace;
2720
2721 /*
2722 * update path record with the SID
2723 */
2724 if ((status = ibt_query_ud_dest(ace->ac_dest, &dest_attrs))
2725 != IBT_SUCCESS) {
2726 DPRINT(40, "ibd_rc_connect: ibt_query_ud_dest() failed, "
2727 "ret=%d", status);
2728 return (status);
2729 }
2730
2731 path->pi_sid =
2732 ietf_cm_service_id | ((dest_attrs.ud_dst_qpn) & 0xffffff);
2733
2734
2735 /* pre-allocate memory for hello ack message */
2736 open_returns.rc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2737 open_returns.rc_priv_data = hello_ack_msg;
2738
2739 open_args.oc_path = path;
2740
2741 open_args.oc_path_rnr_retry_cnt = 1;
2742 open_args.oc_path_retry_cnt = 1;
2743
2744 /* We don't do RDMA */
2745 open_args.oc_rdma_ra_out = 0;
2746 open_args.oc_rdma_ra_in = 0;
2747
2748 hello_req_msg.reserved_qpn = htonl(state->id_qpnum);
2749 hello_req_msg.rx_mtu = htonl(state->rc_mtu);
2750 open_args.oc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2751 open_args.oc_priv_data = (void *)(&hello_req_msg);
2752
2753 ASSERT(open_args.oc_priv_data_len <= IBT_REQ_PRIV_DATA_SZ);
2754 ASSERT(open_returns.rc_priv_data_len <= IBT_REP_PRIV_DATA_SZ);
2755 ASSERT(open_args.oc_cm_handler != NULL);
2756
2757 status = ibt_open_rc_channel(chan->chan_hdl, IBT_OCHAN_NO_FLAGS,
2758 IBT_BLOCKING, &open_args, &open_returns);
2759
2760 if (status == IBT_SUCCESS) {
2761 /* Success! */
2762 DPRINT(2, "ibd_rc_connect: call ibt_open_rc_channel succ!");
2763 state->rc_conn_succ++;
2764 kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2765 return (IBT_SUCCESS);
2766 }
2767
2768 /* failure */
2769 (void) ibt_flush_channel(chan->chan_hdl);
2770 ibd_rc_free_chan(chan);
2771 ace->ac_chan = NULL;
2772
2773 /* check open_returns report error and exit */
2774 DPRINT(30, "ibd_rc_connect: call ibt_open_rc_chan fail."
2775 "ret status = %d, reason=%d, ace=%p, mtu=0x%x, qpn=0x%x,"
2776 " peer qpn=0x%x", status, (int)open_returns.rc_status, ace,
2777 hello_req_msg.rx_mtu, hello_req_msg.reserved_qpn,
2778 dest_attrs.ud_dst_qpn);
2779 kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2780 return (status);
2781 }
2782
2783 void
ibd_rc_signal_act_close(ibd_state_t * state,ibd_ace_t * ace)2784 ibd_rc_signal_act_close(ibd_state_t *state, ibd_ace_t *ace)
2785 {
2786 ibd_req_t *req;
2787
2788 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2789 if (req == NULL) {
2790 ibd_print_warn(state, "ibd_rc_signal_act_close: alloc "
2791 "ibd_req_t fail");
2792 mutex_enter(&state->rc_obs_act_chan_list.chan_list_mutex);
2793 ace->ac_chan->next = state->rc_obs_act_chan_list.chan_list;
2794 state->rc_obs_act_chan_list.chan_list = ace->ac_chan;
2795 mutex_exit(&state->rc_obs_act_chan_list.chan_list_mutex);
2796 } else {
2797 req->rq_ptr = ace->ac_chan;
2798 ibd_queue_work_slot(state, req, IBD_ASYNC_RC_CLOSE_ACT_CHAN);
2799 }
2800 }
2801
2802 void
ibd_rc_signal_ace_recycle(ibd_state_t * state,ibd_ace_t * ace)2803 ibd_rc_signal_ace_recycle(ibd_state_t *state, ibd_ace_t *ace)
2804 {
2805 ibd_req_t *req;
2806
2807 mutex_enter(&state->rc_ace_recycle_lock);
2808 if (state->rc_ace_recycle != NULL) {
2809 mutex_exit(&state->rc_ace_recycle_lock);
2810 return;
2811 }
2812
2813 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2814 if (req == NULL) {
2815 mutex_exit(&state->rc_ace_recycle_lock);
2816 return;
2817 }
2818
2819 state->rc_ace_recycle = ace;
2820 mutex_exit(&state->rc_ace_recycle_lock);
2821 ASSERT(ace->ac_mce == NULL);
2822 INC_REF(ace, 1);
2823 IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
2824 req->rq_ptr = ace;
2825 ibd_queue_work_slot(state, req, IBD_ASYNC_RC_RECYCLE_ACE);
2826 }
2827
/*
 * Close an active channel
 *
 * is_close_rc_chan: if B_TRUE, we will call ibt_close_rc_channel()
 *
 * What is done depends on chan->chan_state:
 *   - ACT_CLOSING / ACT_ESTAB: the channel is marked ACT_CLOSED, its
 *     receive CQ handler is cleared, and the send queue is drained until
 *     every send wqe is back on the channel's lists and the ace holds
 *     only the expected single reference (or until the wait times out).
 *     Then the send CQ handler is cleared, the RC channel is optionally
 *     closed and the channel is freed.
 *   - ACT_REP_RECV: the channel is marked ACT_CLOSED, flushed and freed.
 *   - ACT_ERROR and any other state: only a debug message is logged.
 */
static void
ibd_rc_act_close(ibd_rc_chan_t *chan, boolean_t is_close_rc_chan)
{
	ibd_state_t *state;
	ibd_ace_t *ace;
	uint_t times;
	ibt_status_t ret;

	ASSERT(chan != NULL);

	chan->state->rc_act_close++;
	switch (chan->chan_state) {
	case IBD_RC_STATE_ACT_CLOSING:	/* stale, close it */
	case IBD_RC_STATE_ACT_ESTAB:
		DPRINT(30, "ibd_rc_act_close-1: close and free chan, "
		    "act_state=%d, chan=%p", chan->chan_state, chan);
		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
		/*
		 * Wait for the send queue to empty.  The limit used to be
		 * 50 iterations (5 seconds), which was observed not to be
		 * enough for IBTL to return all buffers and ace->ac_ref;
		 * 25 seconds worked well.  IBTL was also seen to take about
		 * 17 seconds each time it cleaned a stale RC channel.
		 */
		times = 250;
		ace = chan->ace;
		ASSERT(ace != NULL);
		state = chan->state;
		ASSERT(state != NULL);
		/*
		 * The loop condition is always evaluated with id_ac_mutex
		 * and both Tx list mutexes held; the body drops all three
		 * before polling and sleeping, and takes them again at the
		 * bottom.
		 */
		mutex_enter(&state->id_ac_mutex);
		mutex_enter(&chan->tx_wqe_list.dl_mutex);
		mutex_enter(&chan->tx_rel_list.dl_mutex);
		while (((chan->tx_wqe_list.dl_cnt + chan->tx_rel_list.dl_cnt)
		    != chan->scq_size) || ((ace->ac_ref != 1) &&
		    (ace->ac_ref != (CYCLEVAL+1)))) {
			mutex_exit(&chan->tx_rel_list.dl_mutex);
			mutex_exit(&chan->tx_wqe_list.dl_mutex);
			mutex_exit(&state->id_ac_mutex);
			times--;
			if (times == 0) {
				/* Timed out; leave with no locks held. */
				state->rc_act_close_not_clean++;
				DPRINT(40, "ibd_rc_act_close: dl_cnt(tx_wqe_"
				    "list=%d, tx_rel_list=%d) != chan->"
				    "scq_size=%d, OR ac_ref(=%d) not clean",
				    chan->tx_wqe_list.dl_cnt,
				    chan->tx_rel_list.dl_cnt,
				    chan->scq_size, ace->ac_ref);
				break;
			}
			/*
			 * Drain the send CQ ourselves unless another thread
			 * is already polling it.
			 */
			mutex_enter(&chan->tx_poll_lock);
			if (chan->tx_poll_busy & IBD_CQ_POLLING) {
				DPRINT(40, "ibd_rc_act_close: multiple "
				    "polling threads");
				mutex_exit(&chan->tx_poll_lock);
			} else {
				chan->tx_poll_busy = IBD_CQ_POLLING;
				mutex_exit(&chan->tx_poll_lock);
				ibd_rc_drain_scq(chan, chan->scq_hdl);
				mutex_enter(&chan->tx_poll_lock);
				chan->tx_poll_busy = 0;
				mutex_exit(&chan->tx_poll_lock);
			}
			delay(drv_usectohz(100000));
			mutex_enter(&state->id_ac_mutex);
			mutex_enter(&chan->tx_wqe_list.dl_mutex);
			mutex_enter(&chan->tx_rel_list.dl_mutex);
		}
		/*
		 * times != 0 means the loop ended because its condition
		 * became false, in which case the three locks are still
		 * held and must be dropped here.
		 */
		if (times != 0) {
			mutex_exit(&chan->tx_rel_list.dl_mutex);
			mutex_exit(&chan->tx_wqe_list.dl_mutex);
			mutex_exit(&state->id_ac_mutex);
		}

		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
		if (is_close_rc_chan) {
			ret = ibt_close_rc_channel(chan->chan_hdl,
			    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL,
			    0);
			if (ret != IBT_SUCCESS) {
				DPRINT(40, "ibd_rc_act_close: ibt_close_rc_"
				    "channel fail, chan=%p, ret=%d",
				    chan, ret);
			} else {
				DPRINT(30, "ibd_rc_act_close: ibt_close_rc_"
				    "channel succ, chan=%p", chan);
			}
		}

		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_ACT_REP_RECV:
		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
		(void) ibt_flush_channel(chan->chan_hdl);
		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_ACT_ERROR:
		DPRINT(40, "ibd_rc_act_close: IBD_RC_STATE_ERROR branch");
		break;
	default:
		DPRINT(40, "ibd_rc_act_close: default branch, act_state=%d, "
		    "chan=%p", chan->chan_state, chan);
	}
}
2937
/*
 * Close a passive channel
 *
 * is_close_rc_chan: if B_TRUE, we will call ibt_close_rc_channel()
 *
 * is_timeout_close: if B_TRUE, this function is called by the connection
 * reaper (refer to function ibd_rc_conn_timeout_call). When the connection
 * reaper calls ibd_rc_pas_close(), and if it finds that dl_bufs_outstanding
 * or chan->rcq_invoking is non-zero, then it can simply put that channel back
 * on the passive channels list and move on, since it might be an indication
 * that the channel became active again by the time we started its cleanup.
 * It is costlier to do the cleanup and then reinitiate the channel
 * establishment and hence it will help to be conservative when we do the
 * cleanup.
 *
 * Returns DDI_FAILURE when the channel was put back on the passive list
 * or when the receive buffers were not returned in time (the channel is
 * left open in both cases); DDI_SUCCESS otherwise, including when the
 * channel state required no action.
 */
int
ibd_rc_pas_close(ibd_rc_chan_t *chan, boolean_t is_close_rc_chan,
    boolean_t is_timeout_close)
{
	uint_t times;
	ibt_status_t ret;

	ASSERT(chan != NULL);
	chan->state->rc_pas_close++;

	switch (chan->chan_state) {
	case IBD_RC_STATE_PAS_ESTAB:
		if (is_timeout_close) {
			/*
			 * The channel looks busy.  If it can be put back on
			 * the passive list, leave it alone; if the re-add
			 * returns zero, fall through and close it anyway.
			 */
			if ((chan->rcq_invoking != 0) ||
			    ((!chan->state->rc_enable_srq) &&
			    (chan->rx_wqe_list.dl_bufs_outstanding > 0))) {
				if (ibd_rc_re_add_to_pas_chan_list(chan)) {
					return (DDI_FAILURE);
				}
			}
		}
		/*
		 * First, stop receive interrupts; this stops the
		 * connection from handing up buffers to higher layers.
		 * Wait for receive buffers to be returned; give up
		 * after 5 seconds.
		 */
		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
		/* Wait 0.01 second to let ibt_set_cq_handler() take effect */
		delay(drv_usectohz(10000));
		if (!chan->state->rc_enable_srq) {
			times = 50;
			while (chan->rx_wqe_list.dl_bufs_outstanding > 0) {
				delay(drv_usectohz(100000));
				if (--times == 0) {
					/*
					 * Buffers are still outstanding:
					 * poll the receive CQ, re-install
					 * the receive handler and leave the
					 * channel open.
					 */
					DPRINT(40, "ibd_rc_pas_close : "
					    "reclaiming failed");
					ibd_rc_poll_rcq(chan, chan->rcq_hdl);
					ibt_set_cq_handler(chan->rcq_hdl,
					    ibd_rc_rcq_handler,
					    (void *)(uintptr_t)chan);
					return (DDI_FAILURE);
				}
			}
		}
		/*
		 * Wait up to 5 seconds for chan->rcq_invoking to clear; on
		 * timeout the close goes ahead regardless and the event is
		 * only counted.
		 */
		times = 50;
		while (chan->rcq_invoking != 0) {
			delay(drv_usectohz(100000));
			if (--times == 0) {
				DPRINT(40, "ibd_rc_pas_close : "
				    "rcq handler is being invoked");
				chan->state->rc_pas_close_rcq_invoking++;
				break;
			}
		}
		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
		DPRINT(30, "ibd_rc_pas_close-1: close and free chan, "
		    "chan_state=%d, chan=%p", chan->chan_state, chan);
		if (is_close_rc_chan) {
			ret = ibt_close_rc_channel(chan->chan_hdl,
			    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL,
			    0);
			if (ret != IBT_SUCCESS) {
				DPRINT(40, "ibd_rc_pas_close: ibt_close_rc_"
				    "channel() fail, chan=%p, ret=%d", chan,
				    ret);
			} else {
				DPRINT(30, "ibd_rc_pas_close: ibt_close_rc_"
				    "channel() succ, chan=%p", chan);
			}
		}
		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_PAS_REQ_RECV:
		/* Marked closed, then flushed and freed. */
		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
		(void) ibt_flush_channel(chan->chan_hdl);
		ibd_rc_free_chan(chan);
		break;
	default:
		/* Any other state: nothing is closed, only logged. */
		DPRINT(40, "ibd_rc_pas_close: default, chan_state=%d, chan=%p",
		    chan->chan_state, chan);
	}
	return (DDI_SUCCESS);
}
3038
3039 /*
3040 * Passive Side:
3041 * Handle an incoming CM REQ from active side.
3042 *
3043 * If success, this function allocates an ibd_rc_chan_t, then
3044 * assigns it to "*ret_conn".
3045 */
3046 static ibt_cm_status_t
ibd_rc_handle_req(void * arg,ibd_rc_chan_t ** ret_conn,ibt_cm_event_t * ibt_cm_event,ibt_cm_return_args_t * ret_args,void * ret_priv_data)3047 ibd_rc_handle_req(void *arg, ibd_rc_chan_t **ret_conn,
3048 ibt_cm_event_t *ibt_cm_event, ibt_cm_return_args_t *ret_args,
3049 void *ret_priv_data)
3050 {
3051 ibd_rc_msg_hello_t *hello_msg;
3052 ibd_state_t *state = (ibd_state_t *)arg;
3053 ibd_rc_chan_t *chan;
3054
3055 if (ibd_rc_alloc_chan(&chan, state, B_FALSE) != IBT_SUCCESS) {
3056 DPRINT(40, "ibd_rc_handle_req: ibd_rc_alloc_chan() failed");
3057 return (IBT_CM_REJECT);
3058 }
3059
3060 ibd_rc_add_to_chan_list(&state->rc_pass_chan_list, chan);
3061
3062 ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)chan);
3063
3064 if (!state->rc_enable_srq) {
3065 if (ibd_rc_init_rxlist(chan) != DDI_SUCCESS) {
3066 ibd_rc_free_chan(chan);
3067 DPRINT(40, "ibd_rc_handle_req: ibd_rc_init_rxlist() "
3068 "failed");
3069 return (IBT_CM_REJECT);
3070 }
3071 }
3072
3073 ret_args->cm_ret.rep.cm_channel = chan->chan_hdl;
3074
3075 /* We don't do RDMA */
3076 ret_args->cm_ret.rep.cm_rdma_ra_out = 0;
3077 ret_args->cm_ret.rep.cm_rdma_ra_in = 0;
3078
3079 ret_args->cm_ret.rep.cm_rnr_retry_cnt = 7;
3080 ret_args->cm_ret_len = sizeof (ibd_rc_msg_hello_t);
3081
3082 hello_msg = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
3083 DPRINT(30, "ibd_rc_handle_req(): peer qpn=0x%x, peer mtu=0x%x",
3084 ntohl(hello_msg->reserved_qpn), ntohl(hello_msg->rx_mtu));
3085
3086 hello_msg = (ibd_rc_msg_hello_t *)ret_priv_data;
3087 hello_msg->reserved_qpn = htonl(state->id_qpnum);
3088 hello_msg->rx_mtu = htonl(state->rc_mtu);
3089
3090 chan->chan_state = IBD_RC_STATE_PAS_REQ_RECV; /* ready to receive */
3091 *ret_conn = chan;
3092
3093 return (IBT_CM_ACCEPT);
3094 }
3095
3096 /*
3097 * ibd_rc_handle_act_estab -- handler for connection established completion
3098 * for active side.
3099 */
3100 static ibt_cm_status_t
ibd_rc_handle_act_estab(ibd_ace_t * ace)3101 ibd_rc_handle_act_estab(ibd_ace_t *ace)
3102 {
3103 ibt_status_t result;
3104
3105 switch (ace->ac_chan->chan_state) {
3106 case IBD_RC_STATE_ACT_REP_RECV:
3107 ace->ac_chan->chan_state = IBD_RC_STATE_ACT_ESTAB;
3108 result = ibt_enable_cq_notify(ace->ac_chan->rcq_hdl,
3109 IBT_NEXT_COMPLETION);
3110 if (result != IBT_SUCCESS) {
3111 DPRINT(40, "ibd_rc_handle_act_estab: "
3112 "ibt_enable_cq_notify(rcq) "
3113 "failed: status %d", result);
3114 return (IBT_CM_REJECT);
3115 }
3116 break;
3117 default:
3118 DPRINT(40, "ibd_rc_handle_act_estab: default "
3119 "branch, act_state=%d", ace->ac_chan->chan_state);
3120 return (IBT_CM_REJECT);
3121 }
3122 return (IBT_CM_ACCEPT);
3123 }
3124
3125 /*
3126 * ibd_rc_handle_pas_estab -- handler for connection established completion
3127 * for passive side.
3128 */
3129 static ibt_cm_status_t
ibd_rc_handle_pas_estab(ibd_rc_chan_t * chan)3130 ibd_rc_handle_pas_estab(ibd_rc_chan_t *chan)
3131 {
3132 ibt_status_t result;
3133
3134 switch (chan->chan_state) {
3135 case IBD_RC_STATE_PAS_REQ_RECV:
3136 chan->chan_state = IBD_RC_STATE_PAS_ESTAB;
3137
3138 result = ibt_enable_cq_notify(chan->rcq_hdl,
3139 IBT_NEXT_COMPLETION);
3140 if (result != IBT_SUCCESS) {
3141 DPRINT(40, "ibd_rc_handle_pas_estab: "
3142 "ibt_enable_cq_notify(rcq) "
3143 "failed: status %d", result);
3144 return (IBT_CM_REJECT);
3145 }
3146 break;
3147 default:
3148 DPRINT(40, "ibd_rc_handle_pas_estab: default "
3149 "branch, chan_state=%d", chan->chan_state);
3150 return (IBT_CM_REJECT);
3151 }
3152 return (IBT_CM_ACCEPT);
3153 }
3154
3155 /* ARGSUSED */
3156 static ibt_cm_status_t
ibd_rc_dispatch_actv_mad(void * arg,ibt_cm_event_t * ibt_cm_event,ibt_cm_return_args_t * ret_args,void * ret_priv_data,ibt_priv_data_len_t ret_len_max)3157 ibd_rc_dispatch_actv_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
3158 ibt_cm_return_args_t *ret_args, void *ret_priv_data,
3159 ibt_priv_data_len_t ret_len_max)
3160 {
3161 ibt_cm_status_t result = IBT_CM_ACCEPT;
3162 ibd_ace_t *ace = (ibd_ace_t *)(uintptr_t)arg;
3163 ibd_rc_chan_t *rc_chan;
3164 ibd_state_t *state;
3165 ibd_rc_msg_hello_t *hello_ack;
3166
3167 switch (ibt_cm_event->cm_type) {
3168 case IBT_CM_EVENT_REP_RCV:
3169 ASSERT(ace->ac_chan != NULL);
3170 ASSERT(ace->ac_chan->chan_state == IBD_RC_STATE_INIT);
3171 hello_ack = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
3172 DPRINT(30, "ibd_rc_handle_rep: hello_ack->mtu=0x%x, "
3173 "hello_ack->qpn=0x%x", ntohl(hello_ack->rx_mtu),
3174 ntohl(hello_ack->reserved_qpn));
3175 ace->ac_chan->chan_state = IBD_RC_STATE_ACT_REP_RECV;
3176 break;
3177
3178 case IBT_CM_EVENT_CONN_EST:
3179 ASSERT(ace->ac_chan != NULL);
3180 DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_CONN_EST, "
3181 "ace=%p, act_state=%d, chan=%p",
3182 ace, ace->ac_chan->chan_state, ace->ac_chan);
3183 result = ibd_rc_handle_act_estab(ace);
3184 break;
3185
3186 case IBT_CM_EVENT_CONN_CLOSED:
3187 rc_chan = ace->ac_chan;
3188 if (rc_chan == NULL) {
3189 DPRINT(40, "ibd_rc_dispatch_actv_mad: "
3190 "rc_chan==NULL, IBT_CM_EVENT_CONN_CLOSED");
3191 return (IBT_CM_ACCEPT);
3192 }
3193 state = rc_chan->state;
3194 mutex_enter(&state->id_ac_mutex);
3195 if ((rc_chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
3196 ((ace = ibd_acache_find(state, &ace->ac_mac, B_FALSE, 0))
3197 != NULL) && (ace == rc_chan->ace)) {
3198 rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
3199 ASSERT(ace->ac_mce == NULL);
3200 INC_REF(ace, 1);
3201 IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
3202 mutex_exit(&state->id_ac_mutex);
3203 DPRINT(30, "ibd_rc_dispatch_actv_mad: "
3204 "IBT_CM_EVENT_CONN_CLOSED, ace=%p, chan=%p, "
3205 "reason=%d", ace, rc_chan,
3206 ibt_cm_event->cm_event.closed);
3207 } else {
3208 mutex_exit(&state->id_ac_mutex);
3209 state->rc_act_close_simultaneous++;
3210 DPRINT(40, "ibd_rc_dispatch_actv_mad: other thread "
3211 "is closing it, IBT_CM_EVENT_CONN_CLOSED, "
3212 "chan_state=%d", rc_chan->chan_state);
3213 return (IBT_CM_ACCEPT);
3214 }
3215 ibd_rc_act_close(rc_chan, B_FALSE);
3216 mutex_enter(&state->id_ac_mutex);
3217 ace->ac_chan = NULL;
3218 ASSERT(ace->ac_ref != 0);
3219 atomic_dec_32(&ace->ac_ref);
3220 if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
3221 IBD_ACACHE_INSERT_FREE(state, ace);
3222 ace->ac_ref = 0;
3223 } else {
3224 ace->ac_ref |= CYCLEVAL;
3225 state->rc_delay_ace_recycle++;
3226 }
3227 mutex_exit(&state->id_ac_mutex);
3228 break;
3229
3230 case IBT_CM_EVENT_FAILURE:
3231 DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_FAILURE,"
3232 "ace=%p, chan=%p, code: %d, msg: %d, reason=%d",
3233 ace, ace->ac_chan,
3234 ibt_cm_event->cm_event.failed.cf_code,
3235 ibt_cm_event->cm_event.failed.cf_msg,
3236 ibt_cm_event->cm_event.failed.cf_reason);
3237 /*
3238 * Don't need free resource here. The resource is freed
3239 * at function ibd_rc_connect()
3240 */
3241 break;
3242
3243 case IBT_CM_EVENT_MRA_RCV:
3244 DPRINT(40, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_MRA_RCV");
3245 break;
3246 case IBT_CM_EVENT_LAP_RCV:
3247 DPRINT(40, "ibd_rc_dispatch_actv_mad: LAP message received");
3248 break;
3249 case IBT_CM_EVENT_APR_RCV:
3250 DPRINT(40, "ibd_rc_dispatch_actv_mad: APR message received");
3251 break;
3252 default:
3253 DPRINT(40, "ibd_rc_dispatch_actv_mad: default branch, "
3254 "ibt_cm_event->cm_type=%d", ibt_cm_event->cm_type);
3255 break;
3256 }
3257
3258 return (result);
3259 }
3260
3261 /* ARGSUSED */
3262 static ibt_cm_status_t
ibd_rc_dispatch_pass_mad(void * arg,ibt_cm_event_t * ibt_cm_event,ibt_cm_return_args_t * ret_args,void * ret_priv_data,ibt_priv_data_len_t ret_len_max)3263 ibd_rc_dispatch_pass_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
3264 ibt_cm_return_args_t *ret_args, void *ret_priv_data,
3265 ibt_priv_data_len_t ret_len_max)
3266 {
3267 ibt_cm_status_t result = IBT_CM_ACCEPT;
3268 ibd_rc_chan_t *chan;
3269
3270 if (ibt_cm_event->cm_type == IBT_CM_EVENT_REQ_RCV) {
3271 DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_REQ_RCV,"
3272 "req_pkey=%x", ibt_cm_event->cm_event.req.req_pkey);
3273 /* Receive an incoming CM REQ from active side */
3274 result = ibd_rc_handle_req(arg, &chan, ibt_cm_event, ret_args,
3275 ret_priv_data);
3276 return (result);
3277 }
3278
3279 if (ibt_cm_event->cm_channel == 0) {
3280 DPRINT(30, "ibd_rc_dispatch_pass_mad: "
3281 "ERROR ibt_cm_event->cm_channel == 0");
3282 return (IBT_CM_REJECT);
3283 }
3284
3285 chan =
3286 (ibd_rc_chan_t *)ibt_get_chan_private(ibt_cm_event->cm_channel);
3287 if (chan == NULL) {
3288 DPRINT(40, "ibd_rc_dispatch_pass_mad: conn == 0");
3289 return (IBT_CM_REJECT);
3290 }
3291
3292 switch (ibt_cm_event->cm_type) {
3293 case IBT_CM_EVENT_CONN_EST:
3294 DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_EST, "
3295 "chan=%p", chan);
3296 result = ibd_rc_handle_pas_estab(chan);
3297 break;
3298 case IBT_CM_EVENT_CONN_CLOSED:
3299 DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_CLOSED,"
3300 " chan=%p, reason=%d", chan, ibt_cm_event->cm_event.closed);
3301 chan = ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list,
3302 chan);
3303 if (chan != NULL)
3304 (void) ibd_rc_pas_close(chan, B_FALSE, B_FALSE);
3305 break;
3306 case IBT_CM_EVENT_FAILURE:
3307 DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_FAILURE,"
3308 " chan=%p, code: %d, msg: %d, reason=%d", chan,
3309 ibt_cm_event->cm_event.failed.cf_code,
3310 ibt_cm_event->cm_event.failed.cf_msg,
3311 ibt_cm_event->cm_event.failed.cf_reason);
3312 chan = ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list,
3313 chan);
3314 if (chan != NULL)
3315 (void) ibd_rc_pas_close(chan, B_FALSE, B_FALSE);
3316 return (IBT_CM_ACCEPT);
3317 case IBT_CM_EVENT_MRA_RCV:
3318 DPRINT(40, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_MRA_RCV");
3319 break;
3320 case IBT_CM_EVENT_LAP_RCV:
3321 DPRINT(40, "ibd_rc_dispatch_pass_mad: LAP message received");
3322 break;
3323 case IBT_CM_EVENT_APR_RCV:
3324 DPRINT(40, "ibd_rc_dispatch_pass_mad: APR message received");
3325 break;
3326 default:
3327 DPRINT(40, "ibd_rc_dispatch_pass_mad: default, type=%d, "
3328 "chan=%p", ibt_cm_event->cm_type, chan);
3329 break;
3330 }
3331
3332 return (result);
3333 }
3334