1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25 /*
26 * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27 *
28 * This software is available to you under a choice of one of two
29 * licenses. You may choose to be licensed under the terms of the GNU
30 * General Public License (GPL) Version 2, available from the file
31 * COPYING in the main directory of this source tree, or the
32 * OpenIB.org BSD license below:
33 *
34 * Redistribution and use in source and binary forms, with or
35 * without modification, are permitted provided that the following
36 * conditions are met:
37 *
38 * - Redistributions of source code must retain the above
39 * copyright notice, this list of conditions and the following
40 * disclaimer.
41 *
42 * - Redistributions in binary form must reproduce the above
43 * copyright notice, this list of conditions and the following
44 * disclaimer in the documentation and/or other materials
45 * provided with the distribution.
46 *
47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54 * SOFTWARE.
55 *
56 */
57 /*
58 * Sun elects to include this software in Sun product
59 * under the OpenIB BSD license.
60 *
61 *
62 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
63 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
66 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72 * POSSIBILITY OF SUCH DAMAGE.
73 */
74
75 #include <sys/ib/clients/rds/rdsib_cm.h>
76 #include <sys/ib/clients/rds/rdsib_ib.h>
77 #include <sys/ib/clients/rds/rdsib_buf.h>
78 #include <sys/ib/clients/rds/rdsib_ep.h>
79
80 /*
81 * This file contains CM related work:
82 *
83 * Service registration/deregistration
84 * Path lookup
85 * CM connection callbacks
86 * CM active and passive connection establishment
87 * Connection failover
88 */
89
/*
 * Shorthand member paths for the IPv4 source and destination addresses;
 * used below as ipcm_info.SRCIP and ipcm_info.DSTIP.
 */
#define	SRCIP	src_addr.un.ip4addr
#define	DSTIP	dst_addr.un.ip4addr
92
93 /*
94 * Handle an incoming CM REQ
95 */
96 /* ARGSUSED */
97 static ibt_cm_status_t
rds_handle_cm_req(rds_state_t * statep,ibt_cm_event_t * evp,ibt_cm_return_args_t * rargsp,void * rcmp,ibt_priv_data_len_t rcmp_len)98 rds_handle_cm_req(rds_state_t *statep, ibt_cm_event_t *evp,
99 ibt_cm_return_args_t *rargsp, void *rcmp, ibt_priv_data_len_t rcmp_len)
100 {
101 ibt_cm_req_rcv_t *reqp;
102 ib_gid_t lgid, rgid;
103 rds_cm_private_data_t cmp;
104 rds_session_t *sp;
105 rds_ep_t *ep;
106 ibt_channel_hdl_t chanhdl;
107 ibt_ip_cm_info_t ipcm_info;
108 uint8_t save_state, save_type;
109 int ret;
110
111 RDS_DPRINTF2("rds_handle_cm_req", "Enter");
112
113 reqp = &evp->cm_event.req;
114 rgid = reqp->req_prim_addr.av_dgid; /* requester gid */
115 lgid = reqp->req_prim_addr.av_sgid; /* receiver gid */
116
117 RDS_DPRINTF2(LABEL, "REQ Received: From: %llx:%llx To: %llx:%llx",
118 rgid.gid_prefix, rgid.gid_guid, lgid.gid_prefix, lgid.gid_guid);
119
120 /*
121 * CM private data brings IP information
122 * Private data received is a stream of bytes and may not be properly
123 * aligned. So, bcopy the data onto the stack before accessing it.
124 */
125 bcopy((uint8_t *)evp->cm_priv_data, &cmp,
126 sizeof (rds_cm_private_data_t));
127
128 /* extract the CM IP info */
129 ret = ibt_get_ip_data(evp->cm_priv_data_len, evp->cm_priv_data,
130 &ipcm_info);
131 if (ret != IBT_SUCCESS) {
132 RDS_DPRINTF2("rds_handle_cm_req", "ibt_get_ip_data failed: %d",
133 ret);
134 return (IBT_CM_REJECT);
135 }
136
137 RDS_DPRINTF2("rds_handle_cm_req",
138 "REQ Received: From IP: 0x%x To IP: 0x%x type: %d",
139 ipcm_info.SRCIP, ipcm_info.DSTIP, cmp.cmp_eptype);
140
141 if (cmp.cmp_version != RDS_VERSION) {
142 RDS_DPRINTF2(LABEL, "Version Mismatch: Local version: %d "
143 "Remote version: %d", RDS_VERSION, cmp.cmp_version);
144 return (IBT_CM_REJECT);
145 }
146
147 /* RDS supports V4 addresses only */
148 if ((ipcm_info.src_addr.family != AF_INET) ||
149 (ipcm_info.dst_addr.family != AF_INET)) {
150 RDS_DPRINTF2(LABEL, "Unsupported Address Family: "
151 "src: %d dst: %d", ipcm_info.src_addr.family,
152 ipcm_info.dst_addr.family);
153 return (IBT_CM_REJECT);
154 }
155
156 if (cmp.cmp_arch != RDS_THIS_ARCH) {
157 RDS_DPRINTF2(LABEL, "ARCH does not match (%d != %d)",
158 cmp.cmp_arch, RDS_THIS_ARCH);
159 return (IBT_CM_REJECT);
160 }
161
162 if ((cmp.cmp_eptype != RDS_EP_TYPE_CTRL) &&
163 (cmp.cmp_eptype != RDS_EP_TYPE_DATA)) {
164 RDS_DPRINTF2(LABEL, "Unknown Channel type: %d", cmp.cmp_eptype);
165 return (IBT_CM_REJECT);
166 }
167
168 /* user_buffer_size should be same on all nodes */
169 if (cmp.cmp_user_buffer_size != UserBufferSize) {
170 RDS_DPRINTF2(LABEL,
171 "UserBufferSize Mismatch, this node: %d remote node: %d",
172 UserBufferSize, cmp.cmp_user_buffer_size);
173 return (IBT_CM_REJECT);
174 }
175
176 /*
177 * RDS needs more time to process a failover REQ so send an MRA.
178 * Otherwise, the remote may retry the REQ and fail the connection.
179 */
180 if ((cmp.cmp_failover) && (cmp.cmp_eptype == RDS_EP_TYPE_DATA)) {
181 RDS_DPRINTF2("rds_handle_cm_req", "Session Failover, send MRA");
182 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
183 10000000 /* 10 sec */, NULL, 0);
184 }
185
186 /* Is there a session to the destination node? */
187 rw_enter(&statep->rds_sessionlock, RW_READER);
188 sp = rds_session_lkup(statep, ipcm_info.SRCIP, rgid.gid_guid);
189 rw_exit(&statep->rds_sessionlock);
190
191 if (sp == NULL) {
192 /*
193 * currently there is no session to the destination
194 * remote ip in the private data is the local ip and vice
195 * versa
196 */
197 sp = rds_session_create(statep, ipcm_info.DSTIP,
198 ipcm_info.SRCIP, reqp, RDS_SESSION_PASSIVE);
199 if (sp == NULL) {
200 /* Check the list anyway. */
201 rw_enter(&statep->rds_sessionlock, RW_READER);
202 sp = rds_session_lkup(statep, ipcm_info.SRCIP,
203 rgid.gid_guid);
204 rw_exit(&statep->rds_sessionlock);
205 if (sp == NULL) {
206 /*
207 * The only way this can fail is due to lack
208 * of kernel resources
209 */
210 return (IBT_CM_REJECT);
211 }
212 }
213 }
214
215 rw_enter(&sp->session_lock, RW_WRITER);
216
217 /* catch peer-to-peer case as soon as possible */
218 if ((sp->session_state == RDS_SESSION_STATE_CREATED) ||
219 (sp->session_state == RDS_SESSION_STATE_INIT)) {
220 /* Check possible peer-to-peer case here */
221 if (sp->session_type != RDS_SESSION_PASSIVE) {
222 RDS_DPRINTF2("rds_handle_cm_req",
223 "SP(%p) Peer-peer connection handling", sp);
224 if (lgid.gid_guid > rgid.gid_guid) {
225 /* this node is active so reject this request */
226 rw_exit(&sp->session_lock);
227 return (IBT_CM_REJECT);
228 } else {
229 /* this node is passive, change the session */
230 sp->session_type = RDS_SESSION_PASSIVE;
231 sp->session_lgid = lgid;
232 sp->session_rgid = rgid;
233 }
234 }
235 }
236
237 RDS_DPRINTF2(LABEL, "SP(%p) state: %d", sp, sp->session_state);
238 save_state = sp->session_state;
239 save_type = sp->session_type;
240
241 switch (sp->session_state) {
242 case RDS_SESSION_STATE_CONNECTED:
243 RDS_DPRINTF2(LABEL, "STALE Session Detected SP(%p)", sp);
244 sp->session_state = RDS_SESSION_STATE_ERROR;
245 RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
246 "RDS_SESSION_STATE_ERROR", sp);
247
248 /* FALLTHRU */
249 case RDS_SESSION_STATE_ERROR:
250 case RDS_SESSION_STATE_PASSIVE_CLOSING:
251 /*
252 * Some other thread must be processing this session,
253 * this thread must wait until the other thread finishes.
254 */
255 sp->session_type = RDS_SESSION_PASSIVE;
256 rw_exit(&sp->session_lock);
257
258 /* Handling this will take some time, so send an MRA */
259 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
260 10000000 /* 10 sec */, NULL, 0);
261
262 /*
263 * Any pending completions don't get flushed until the channel
264 * is closed. So, passing 0 here will not wait for pending
265 * completions in rds_session_close before closing the channel
266 */
267 rds_session_close(sp, IBT_NOCALLBACKS, 0);
268
269 rw_enter(&sp->session_lock, RW_WRITER);
270
271 /*
272 * If the session was in ERROR, then either a failover thread
273 * or event_failure thread would be processing this session.
274 * This thread should wait for event_failure thread to
275 * complete. This need not wait for failover thread.
276 */
277 if ((save_state != RDS_SESSION_STATE_CONNECTED) &&
278 (save_type == RDS_SESSION_PASSIVE)) {
279 /*
280 * The other thread is event_failure thread,
281 * wait until it finishes.
282 */
283 while (!((sp->session_state ==
284 RDS_SESSION_STATE_FAILED) ||
285 (sp->session_state ==
286 RDS_SESSION_STATE_FINI))) {
287 rw_exit(&sp->session_lock);
288 delay(drv_usectohz(1000000));
289 rw_enter(&sp->session_lock, RW_WRITER);
290 }
291 }
292
293 /* move the session to init state */
294 if ((sp->session_state == RDS_SESSION_STATE_ERROR) ||
295 (sp->session_state == RDS_SESSION_STATE_PASSIVE_CLOSING)) {
296 ret = rds_session_reinit(sp, lgid);
297 sp->session_myip = ipcm_info.DSTIP;
298 sp->session_lgid = lgid;
299 sp->session_rgid = rgid;
300 if (ret != 0) {
301 rds_session_fini(sp);
302 sp->session_state = RDS_SESSION_STATE_FAILED;
303 RDS_DPRINTF3("rds_handle_cm_req",
304 "SP(%p) State RDS_SESSION_STATE_FAILED",
305 sp);
306 rw_exit(&sp->session_lock);
307 return (IBT_CM_REJECT);
308 } else {
309 sp->session_state = RDS_SESSION_STATE_INIT;
310 RDS_DPRINTF3("rds_handle_cm_req",
311 "SP(%p) State RDS_SESSION_STATE_INIT", sp);
312 }
313
314 if (cmp.cmp_eptype == RDS_EP_TYPE_CTRL) {
315 ep = &sp->session_ctrlep;
316 } else {
317 ep = &sp->session_dataep;
318 }
319 break;
320 }
321
322 /* FALLTHRU */
323 case RDS_SESSION_STATE_CREATED:
324 case RDS_SESSION_STATE_FAILED:
325 case RDS_SESSION_STATE_FINI:
326 /*
327 * Initialize both channels, we accept this connection
328 * only if both channels are initialized
329 */
330 sp->session_type = RDS_SESSION_PASSIVE;
331 sp->session_lgid = lgid;
332 sp->session_rgid = rgid;
333 sp->session_state = RDS_SESSION_STATE_CREATED;
334 RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
335 "RDS_SESSION_STATE_CREATED", sp);
336 ret = rds_session_init(sp);
337 if (ret != 0) {
338 /* Seems like there are not enough resources */
339 sp->session_state = RDS_SESSION_STATE_FAILED;
340 RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
341 "RDS_SESSION_STATE_FAILED", sp);
342 rw_exit(&sp->session_lock);
343 return (IBT_CM_REJECT);
344 }
345 sp->session_state = RDS_SESSION_STATE_INIT;
346 RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
347 "RDS_SESSION_STATE_INIT", sp);
348
349 /* FALLTHRU */
350 case RDS_SESSION_STATE_INIT:
351 /*
352 * When re-using an existing session, make sure the
353 * session is still through the same HCA. Otherwise, the
354 * memory registrations have to moved to the new HCA.
355 */
356 if (cmp.cmp_eptype == RDS_EP_TYPE_DATA) {
357 if (sp->session_lgid.gid_guid != lgid.gid_guid) {
358 RDS_DPRINTF2("rds_handle_cm_req",
359 "Existing Session but different gid "
360 "existing: 0x%llx, new: 0x%llx, "
361 "sending an MRA",
362 sp->session_lgid.gid_guid, lgid.gid_guid);
363 (void) ibt_cm_delay(IBT_CM_DELAY_REQ,
364 evp->cm_session_id, 10000000 /* 10 sec */,
365 NULL, 0);
366 ret = rds_session_reinit(sp, lgid);
367 if (ret != 0) {
368 rds_session_fini(sp);
369 sp->session_state =
370 RDS_SESSION_STATE_FAILED;
371 sp->session_failover = 0;
372 RDS_DPRINTF3("rds_failover_session",
373 "SP(%p) State "
374 "RDS_SESSION_STATE_FAILED", sp);
375 rw_exit(&sp->session_lock);
376 return (IBT_CM_REJECT);
377 }
378 }
379 ep = &sp->session_dataep;
380 } else {
381 ep = &sp->session_ctrlep;
382 }
383
384 break;
385 default:
386 RDS_DPRINTF2(LABEL, "ERROR: SP(%p) is in an unexpected "
387 "state: %d", sp, sp->session_state);
388 rw_exit(&sp->session_lock);
389 return (IBT_CM_REJECT);
390 }
391
392 sp->session_failover = 0; /* reset any previous value */
393 if (cmp.cmp_failover) {
394 RDS_DPRINTF2("rds_handle_cm_req",
395 "SP(%p) Failover Session (BP %p)", sp, cmp.cmp_last_bufid);
396 sp->session_failover = 1;
397 }
398
399 mutex_enter(&ep->ep_lock);
400 if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
401 ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
402 sp->session_type = RDS_SESSION_PASSIVE;
403 rw_exit(&sp->session_lock);
404 } else if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
405 rw_exit(&sp->session_lock);
406 /*
407 * Peer to peer connection. There is an active
408 * connection pending on this ep. The one with
409 * greater port guid becomes active and the
410 * other becomes passive.
411 */
412 RDS_DPRINTF2("rds_handle_cm_req",
413 "EP(%p) Peer-peer connection handling", ep);
414 if (lgid.gid_guid > rgid.gid_guid) {
415 /* this node is active so reject this request */
416 mutex_exit(&ep->ep_lock);
417 RDS_DPRINTF2(LABEL, "SP(%p) EP(%p): "
418 "Rejecting passive in favor of active", sp, ep);
419 return (IBT_CM_REJECT);
420 } else {
421 /*
422 * This session is not the active end, change it
423 * to passive end.
424 */
425 ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
426
427 rw_enter(&sp->session_lock, RW_WRITER);
428 sp->session_type = RDS_SESSION_PASSIVE;
429 sp->session_lgid = lgid;
430 sp->session_rgid = rgid;
431 rw_exit(&sp->session_lock);
432 }
433 } else {
434 rw_exit(&sp->session_lock);
435 }
436
437 ep->ep_lbufid = cmp.cmp_last_bufid;
438 ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
439 ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
440 cmp.cmp_last_bufid = ep->ep_rbufid;
441 cmp.cmp_ack_addr = ep->ep_ack_addr;
442 cmp.cmp_ack_rkey = ep->ep_ack_rkey;
443 mutex_exit(&ep->ep_lock);
444
445 /* continue with accepting the connection request for this channel */
446 chanhdl = rds_ep_alloc_rc_channel(ep, reqp->req_prim_hca_port);
447 if (chanhdl == NULL) {
448 mutex_enter(&ep->ep_lock);
449 ep->ep_state = RDS_EP_STATE_UNCONNECTED;
450 mutex_exit(&ep->ep_lock);
451 return (IBT_CM_REJECT);
452 }
453
454 /* pre-post recv buffers in the RQ */
455 rds_post_recv_buf((void *)chanhdl);
456
457 rargsp->cm_ret_len = sizeof (rds_cm_private_data_t);
458 bcopy((uint8_t *)&cmp, rcmp, sizeof (rds_cm_private_data_t));
459 rargsp->cm_ret.rep.cm_channel = chanhdl;
460 rargsp->cm_ret.rep.cm_rdma_ra_out = 4;
461 rargsp->cm_ret.rep.cm_rdma_ra_in = 4;
462 rargsp->cm_ret.rep.cm_rnr_retry_cnt = MinRnrRetry;
463
464 RDS_DPRINTF2("rds_handle_cm_req", "Return: SP(%p) EP(%p) Chan (%p)",
465 sp, ep, chanhdl);
466
467 return (IBT_CM_ACCEPT);
468 }
469
470 /*
471 * Handle an incoming CM REP
472 * Pre-post recv buffers for the QP
473 */
474 /* ARGSUSED */
475 static ibt_cm_status_t
rds_handle_cm_rep(ibt_cm_event_t * evp,ibt_cm_return_args_t * rargsp,void * rcmp,ibt_priv_data_len_t rcmp_len)476 rds_handle_cm_rep(ibt_cm_event_t *evp, ibt_cm_return_args_t *rargsp,
477 void *rcmp, ibt_priv_data_len_t rcmp_len)
478 {
479 rds_ep_t *ep;
480 rds_cm_private_data_t cmp;
481
482 RDS_DPRINTF2("rds_handle_cm_rep", "Enter");
483
484 /* pre-post recv buffers in the RQ */
485 rds_post_recv_buf((void *)evp->cm_channel);
486
487 ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
488 bcopy((uint8_t *)evp->cm_priv_data, &cmp,
489 sizeof (rds_cm_private_data_t));
490 ep->ep_lbufid = cmp.cmp_last_bufid;
491 ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
492 ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
493
494 rargsp->cm_ret_len = 0;
495
496 RDS_DPRINTF2("rds_handle_cm_rep", "Return: lbufid: %p", ep->ep_lbufid);
497
498 return (IBT_CM_ACCEPT);
499 }
500
501 /*
502 * Handle CONN EST
503 */
504 static ibt_cm_status_t
rds_handle_cm_conn_est(ibt_cm_event_t * evp)505 rds_handle_cm_conn_est(ibt_cm_event_t *evp)
506 {
507 rds_session_t *sp;
508 rds_ep_t *ep;
509
510 ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
511
512 RDS_DPRINTF2("rds_handle_cm_conn_est", "EP(%p) State: %d", ep,
513 ep->ep_state);
514
515 mutex_enter(&ep->ep_lock);
516 ASSERT((ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) ||
517 (ep->ep_state == RDS_EP_STATE_PASSIVE_PENDING));
518 ep->ep_state = RDS_EP_STATE_CONNECTED;
519 ep->ep_chanhdl = evp->cm_channel;
520 sp = ep->ep_sp;
521 mutex_exit(&ep->ep_lock);
522
523 (void) rds_session_active(sp);
524
525 RDS_DPRINTF2("rds_handle_cm_conn_est", "Return");
526 return (IBT_CM_ACCEPT);
527 }
528
529 /*
530 * Handle CONN CLOSED
531 */
532 static ibt_cm_status_t
rds_handle_cm_conn_closed(ibt_cm_event_t * evp)533 rds_handle_cm_conn_closed(ibt_cm_event_t *evp)
534 {
535 rds_ep_t *ep;
536 rds_session_t *sp;
537
538 /* Catch DREQs but ignore DREPs */
539 if (evp->cm_event.closed != IBT_CM_CLOSED_DREQ_RCVD) {
540 RDS_DPRINTF2("rds_handle_cm_conn_closed",
541 "Ignoring Event: %d received", evp->cm_event.closed);
542 return (IBT_CM_ACCEPT);
543 }
544
545 ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
546 sp = ep->ep_sp;
547 RDS_DPRINTF2("rds_handle_cm_conn_closed", "EP(%p) Chan(%p) Enter",
548 ep, evp->cm_channel);
549
550 mutex_enter(&ep->ep_lock);
551 if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
552 /* Ignore this DREQ */
553 RDS_DPRINTF2("rds_handle_cm_conn_closed",
554 "EP(%p) not connected, state: %d", ep, ep->ep_state);
555 mutex_exit(&ep->ep_lock);
556 return (IBT_CM_ACCEPT);
557 }
558 ep->ep_state = RDS_EP_STATE_CLOSING;
559 mutex_exit(&ep->ep_lock);
560
561 rw_enter(&sp->session_lock, RW_WRITER);
562 RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) - state: %d", sp,
563 sp->session_state);
564
565 switch (sp->session_state) {
566 case RDS_SESSION_STATE_CONNECTED:
567 case RDS_SESSION_STATE_HCA_CLOSING:
568 sp->session_state = RDS_SESSION_STATE_PASSIVE_CLOSING;
569 RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
570 "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
571 break;
572
573 case RDS_SESSION_STATE_PASSIVE_CLOSING:
574 sp->session_state = RDS_SESSION_STATE_CLOSED;
575 RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
576 "RDS_SESSION_STATE_CLOSED", sp);
577 rds_passive_session_fini(sp);
578 sp->session_state = RDS_SESSION_STATE_FINI;
579 RDS_DPRINTF3("rds_handle_cm_conn_closed",
580 "SP(%p) State RDS_SESSION_STATE_FINI", sp);
581 break;
582
583 case RDS_SESSION_STATE_ACTIVE_CLOSING:
584 case RDS_SESSION_STATE_ERROR:
585 case RDS_SESSION_STATE_CLOSED:
586 break;
587
588 case RDS_SESSION_STATE_INIT:
589 sp->session_state = RDS_SESSION_STATE_ERROR;
590 RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
591 "RDS_SESSION_STATE_ERROR", sp);
592 rds_passive_session_fini(sp);
593 sp->session_state = RDS_SESSION_STATE_FAILED;
594 RDS_DPRINTF3("rds_handle_cm_conn_closed",
595 "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
596 break;
597
598 default:
599 RDS_DPRINTF2("rds_handle_cm_conn_closed",
600 "SP(%p) - Unexpected state: %d", sp, sp->session_state);
601 rds_passive_session_fini(sp);
602 sp->session_state = RDS_SESSION_STATE_FAILED;
603 RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
604 "RDS_SESSION_STATE_FAILED", sp);
605 }
606 rw_exit(&sp->session_lock);
607
608 mutex_enter(&ep->ep_lock);
609 ep->ep_state = RDS_EP_STATE_CLOSED;
610 mutex_exit(&ep->ep_lock);
611
612 RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) Return", sp);
613 return (IBT_CM_ACCEPT);
614 }
615
616 /*
617 * Handle EVENT FAILURE
618 */
619 static ibt_cm_status_t
rds_handle_cm_event_failure(ibt_cm_event_t * evp)620 rds_handle_cm_event_failure(ibt_cm_event_t *evp)
621 {
622 rds_ep_t *ep;
623 rds_session_t *sp;
624 int ret;
625
626 RDS_DPRINTF2("rds_handle_cm_event_failure", "Enter: Chan hdl: 0x%p "
627 "Code: %d msg: %d reason: %d", evp->cm_channel,
628 evp->cm_event.failed.cf_code, evp->cm_event.failed.cf_msg,
629 evp->cm_event.failed.cf_reason);
630
631 if (evp->cm_event.failed.cf_reason == IBT_CM_INVALID_SID) {
632 RDS_DPRINTF2(LABEL,
633 "Received REJ with reason IBT_CM_INVALID_SID: "
634 "RDS may not be loaded on the remote system");
635 }
636
637 if (evp->cm_channel == NULL) {
638 return (IBT_CM_ACCEPT);
639 }
640
641 if ((evp->cm_event.failed.cf_code != IBT_CM_FAILURE_STALE) &&
642 (evp->cm_event.failed.cf_msg == IBT_CM_FAILURE_REQ)) {
643 /*
644 * This end is active, just ignore, ibt_open_rc_channel()
645 * caller will take care of cleanup.
646 */
647 RDS_DPRINTF2("rds_handle_cm_event_failure",
648 "Ignoring this event: Chan hdl: 0x%p", evp->cm_channel);
649 return (IBT_CM_ACCEPT);
650 }
651
652 ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
653 sp = ep->ep_sp;
654
655 rw_enter(&sp->session_lock, RW_WRITER);
656 if (sp->session_type == RDS_SESSION_PASSIVE) {
657 RDS_DPRINTF2("rds_handle_cm_event_failure",
658 "SP(%p) - state: %d", sp, sp->session_state);
659 if ((sp->session_state == RDS_SESSION_STATE_INIT) ||
660 (sp->session_state == RDS_SESSION_STATE_CONNECTED)) {
661 sp->session_state = RDS_SESSION_STATE_ERROR;
662 RDS_DPRINTF3("rds_handle_cm_event_failure",
663 "SP(%p) State RDS_SESSION_STATE_ERROR", sp);
664
665 /*
666 * Store the cm_channel for freeing later
667 * Active side frees it on ibt_open_rc_channel
668 * failure
669 */
670 if (ep->ep_chanhdl == NULL) {
671 ep->ep_chanhdl = evp->cm_channel;
672 }
673 rw_exit(&sp->session_lock);
674
675 /*
676 * rds_passive_session_fini should not be called
677 * directly in the CM handler. It will cause a deadlock.
678 */
679 ret = ddi_taskq_dispatch(rds_taskq,
680 rds_cleanup_passive_session, (void *)sp,
681 DDI_NOSLEEP);
682 if (ret != DDI_SUCCESS) {
683 RDS_DPRINTF2("rds_handle_cm_event_failure",
684 "SP(%p) TaskQ dispatch FAILED:%d", sp, ret);
685 }
686 return (IBT_CM_ACCEPT);
687 }
688 }
689 rw_exit(&sp->session_lock);
690
691 RDS_DPRINTF2("rds_handle_cm_event_failure", "SP(%p) Return", sp);
692 return (IBT_CM_ACCEPT);
693 }
694
695 /*
696 * CM Handler
697 *
698 * Called by IBCM
699 * The cm_private type differs for active and passive events.
700 */
701 ibt_cm_status_t
rds_cm_handler(void * cm_private,ibt_cm_event_t * eventp,ibt_cm_return_args_t * ret_args,void * ret_priv_data,ibt_priv_data_len_t ret_len_max)702 rds_cm_handler(void *cm_private, ibt_cm_event_t *eventp,
703 ibt_cm_return_args_t *ret_args, void *ret_priv_data,
704 ibt_priv_data_len_t ret_len_max)
705 {
706 ibt_cm_status_t ret = IBT_CM_ACCEPT;
707
708 RDS_DPRINTF2("rds_cm_handler", "Enter: event: %d", eventp->cm_type);
709
710 switch (eventp->cm_type) {
711 case IBT_CM_EVENT_REQ_RCV:
712 ret = rds_handle_cm_req((rds_state_t *)cm_private, eventp,
713 ret_args, ret_priv_data, ret_len_max);
714 break;
715 case IBT_CM_EVENT_REP_RCV:
716 ret = rds_handle_cm_rep(eventp, ret_args, ret_priv_data,
717 ret_len_max);
718 break;
719 case IBT_CM_EVENT_MRA_RCV:
720 /* Not supported */
721 break;
722 case IBT_CM_EVENT_CONN_EST:
723 ret = rds_handle_cm_conn_est(eventp);
724 break;
725 case IBT_CM_EVENT_CONN_CLOSED:
726 ret = rds_handle_cm_conn_closed(eventp);
727 break;
728 case IBT_CM_EVENT_FAILURE:
729 ret = rds_handle_cm_event_failure(eventp);
730 break;
731 case IBT_CM_EVENT_LAP_RCV:
732 /* Not supported */
733 RDS_DPRINTF2(LABEL, "LAP message received");
734 break;
735 case IBT_CM_EVENT_APR_RCV:
736 /* Not supported */
737 RDS_DPRINTF2(LABEL, "APR message received");
738 break;
739 default:
740 break;
741 }
742
743 RDS_DPRINTF2("rds_cm_handler", "Return");
744
745 return (ret);
746 }
747
/*
 * Port number used to build the RDS service id and as the source port in
 * the CM IP private data. This is based on OFED Linux RDS.
 */
#define	RDS_PORT_NUM	6556
750
751 /*
752 * Register the wellknown service with service id: RDS_SERVICE_ID
753 * Incoming connection requests should arrive on this service id.
754 */
755 ibt_srv_hdl_t
rds_register_service(ibt_clnt_hdl_t rds_ibhdl)756 rds_register_service(ibt_clnt_hdl_t rds_ibhdl)
757 {
758 ibt_srv_hdl_t srvhdl;
759 ibt_srv_desc_t srvdesc;
760 int ret;
761
762 RDS_DPRINTF2("rds_register_service", "Enter: 0x%p", rds_ibhdl);
763
764 bzero(&srvdesc, sizeof (ibt_srv_desc_t));
765 srvdesc.sd_handler = rds_cm_handler;
766 srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
767
768 /*
769 * This is the new service id as per:
770 * Annex A11: RDMA IP CM Service
771 */
772 rdsib_statep->rds_service_id = ibt_get_ip_sid(IPPROTO_TCP,
773 RDS_PORT_NUM);
774 ret = ibt_register_service(rds_ibhdl, &srvdesc,
775 rdsib_statep->rds_service_id, 1, &srvhdl, NULL);
776 if (ret != IBT_SUCCESS) {
777 RDS_DPRINTF2(LABEL,
778 "RDS Service (0x%llx) Registration Failed: %d",
779 rdsib_statep->rds_service_id, ret);
780 return (NULL);
781 }
782
783 RDS_DPRINTF2("rds_register_service", "Return: 0x%p", srvhdl);
784 return (srvhdl);
785 }
786
787 /* Bind the RDS service on all ports */
788 int
rds_bind_service(rds_state_t * statep)789 rds_bind_service(rds_state_t *statep)
790 {
791 rds_hca_t *hcap;
792 ib_gid_t gid;
793 uint_t jx, nbinds = 0, nports = 0;
794 int ret;
795
796 RDS_DPRINTF2("rds_bind_service", "Enter: 0x%p", statep);
797
798 rw_enter(&statep->rds_hca_lock, RW_READER);
799
800 hcap = statep->rds_hcalistp;
801 while (hcap != NULL) {
802
803 /* skip the HCAs that are not fully online */
804 if ((hcap->hca_state != RDS_HCA_STATE_OPEN) &&
805 (hcap->hca_state != RDS_HCA_STATE_MEM_REGISTERED)) {
806 RDS_DPRINTF2("rds_bind_service",
807 "Skipping HCA: 0x%llx, state: %d",
808 hcap->hca_guid, hcap->hca_state);
809 hcap = hcap->hca_nextp;
810 continue;
811 }
812
813 /* currently, we have space for only 4 bindhdls */
814 ASSERT(hcap->hca_nports < 4);
815 for (jx = 0; jx < hcap->hca_nports; jx++) {
816 nports++;
817 if (hcap->hca_pinfop[jx].p_linkstate !=
818 IBT_PORT_ACTIVE) {
819 /*
820 * service bind will be called in the async
821 * handler when the port comes up. Clear any
822 * stale bind handle.
823 */
824 hcap->hca_bindhdl[jx] = NULL;
825 continue;
826 }
827
828 gid = hcap->hca_pinfop[jx].p_sgid_tbl[0];
829 RDS_DPRINTF5(LABEL, "HCA: 0x%llx Port: %d "
830 "gid: %llx:%llx", hcap->hca_guid,
831 hcap->hca_pinfop[jx].p_port_num, gid.gid_prefix,
832 gid.gid_guid);
833
834 /* pass statep as cm_private */
835 ret = ibt_bind_service(statep->rds_srvhdl, gid,
836 NULL, statep, &hcap->hca_bindhdl[jx]);
837 if (ret != IBT_SUCCESS) {
838 RDS_DPRINTF2(LABEL, "Bind service for "
839 "HCA: 0x%llx Port: %d gid %llx:%llx "
840 "failed: %d", hcap->hca_guid,
841 hcap->hca_pinfop[jx].p_port_num,
842 gid.gid_prefix, gid.gid_guid, ret);
843 continue;
844 }
845
846 nbinds++;
847 }
848 hcap = hcap->hca_nextp;
849 }
850
851 rw_exit(&statep->rds_hca_lock);
852
853 RDS_DPRINTF2(LABEL, "RDS Service available on %d/%d ports",
854 nbinds, nports);
855
856 #if 0
857 if (nbinds == 0) {
858 return (-1);
859 }
860 #endif
861
862 RDS_DPRINTF2("rds_bind_service", "Return");
863
864 return (0);
865 }
866
867 /* Open an RC connection */
868 int
rds_open_rc_channel(rds_ep_t * ep,ibt_path_info_t * pinfo,ibt_execution_mode_t mode,ibt_channel_hdl_t * chanhdl)869 rds_open_rc_channel(rds_ep_t *ep, ibt_path_info_t *pinfo,
870 ibt_execution_mode_t mode, ibt_channel_hdl_t *chanhdl)
871 {
872 rds_session_t *sp;
873 ibt_chan_open_args_t ocargs;
874 ibt_rc_returns_t ocrets;
875 rds_cm_private_data_t cmp;
876 uint8_t hca_port;
877 ibt_channel_hdl_t hdl;
878 ibt_status_t ret = 0;
879 ibt_ip_cm_info_t ipcm_info;
880
881 RDS_DPRINTF2("rds_open_rc_channel", "Enter: EP(%p) mode: %d", ep, mode);
882
883 sp = ep->ep_sp;
884
885 bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
886 ipcm_info.src_addr.family = AF_INET;
887 ipcm_info.SRCIP = sp->session_myip;
888 ipcm_info.dst_addr.family = AF_INET;
889 ipcm_info.DSTIP = sp->session_remip;
890 ipcm_info.src_port = RDS_PORT_NUM;
891 ret = ibt_format_ip_private_data(&ipcm_info,
892 sizeof (rds_cm_private_data_t), &cmp);
893 if (ret != IBT_SUCCESS) {
894 RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_format_ip_private_data "
895 "failed: %d", sp, ep, ret);
896 return (-1);
897 }
898
899 hca_port = pinfo->pi_prim_cep_path.cep_hca_port_num;
900
901 hdl = rds_ep_alloc_rc_channel(ep, hca_port);
902 if (hdl == NULL) {
903 return (-1);
904 }
905
906 cmp.cmp_version = RDS_VERSION;
907 cmp.cmp_arch = RDS_THIS_ARCH;
908 cmp.cmp_eptype = ep->ep_type;
909 cmp.cmp_failover = sp->session_failover;
910 cmp.cmp_last_bufid = ep->ep_rbufid;
911 cmp.cmp_user_buffer_size = UserBufferSize;
912 cmp.cmp_ack_addr = ep->ep_ack_addr;
913 cmp.cmp_ack_rkey = ep->ep_ack_rkey;
914
915 bzero(&ocargs, sizeof (ibt_chan_open_args_t));
916 bzero(&ocrets, sizeof (ibt_rc_returns_t));
917 ocargs.oc_path = pinfo;
918 ocargs.oc_cm_handler = rds_cm_handler;
919 ocargs.oc_cm_clnt_private = NULL;
920 ocargs.oc_rdma_ra_out = 4;
921 ocargs.oc_rdma_ra_in = 4;
922 ocargs.oc_priv_data_len = sizeof (rds_cm_private_data_t);
923 ocargs.oc_priv_data = &cmp;
924 ocargs.oc_path_retry_cnt = IBPathRetryCount;
925 ocargs.oc_path_rnr_retry_cnt = MinRnrRetry;
926 ret = ibt_open_rc_channel(hdl, IBT_OCHAN_NO_FLAGS,
927 mode, &ocargs, &ocrets);
928 if (ret != IBT_SUCCESS) {
929 RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_open_rc_channel "
930 "failed: %d", sp, ep, ret);
931 (void) ibt_flush_channel(hdl);
932 (void) ibt_free_channel(hdl);
933
934 mutex_enter(&ep->ep_lock);
935 /* don't cleanup if this failure is due to peer-peer race */
936 if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
937 /* cleanup stuff allocated in rds_ep_alloc_rc_channel */
938 ep->ep_state = RDS_EP_STATE_ERROR;
939 rds_ep_free_rc_channel(ep);
940 }
941 mutex_exit(&ep->ep_lock);
942
943 return (-1);
944 }
945
946 *chanhdl = hdl;
947
948 RDS_DPRINTF2("rds_open_rc_channel", "Return: EP(%p) Chan: %p", ep,
949 *chanhdl);
950
951 return (0);
952 }
953
954 int
rds_close_rc_channel(ibt_channel_hdl_t chanhdl,ibt_execution_mode_t mode)955 rds_close_rc_channel(ibt_channel_hdl_t chanhdl, ibt_execution_mode_t mode)
956 {
957 int ret;
958
959 RDS_DPRINTF2("rds_close_rc_channel", "Enter: Chan(%p) Mode(%d)",
960 chanhdl, mode);
961
962 ret = ibt_close_rc_channel(chanhdl, mode, NULL, 0, NULL, NULL, 0);
963
964 RDS_DPRINTF2("rds_close_rc_channel", "Return Chan(%p)", chanhdl);
965
966 return (ret);
967 }
968