xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsib_cm.c (revision 1b8adde7ba7d5e04395c141c5400dc2cffd7d809)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *	- Redistributions of source code must retain the above
39  *	  copyright notice, this list of conditions and the following
40  *	  disclaimer.
41  *
42  *	- Redistributions in binary form must reproduce the above
43  *	  copyright notice, this list of conditions and the following
44  *	  disclaimer in the documentation and/or other materials
45  *	  provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 /*
58  * Sun elects to include this software in Sun product
59  * under the OpenIB BSD license.
60  *
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72  * POSSIBILITY OF SUCH DAMAGE.
73  */
74 
75 #include <sys/ib/clients/rds/rdsib_cm.h>
76 #include <sys/ib/clients/rds/rdsib_ib.h>
77 #include <sys/ib/clients/rds/rdsib_buf.h>
78 #include <sys/ib/clients/rds/rdsib_ep.h>
79 
80 /*
81  * This file contains CM related work:
82  *
83  * Service registration/deregistration
84  * Path lookup
85  * CM connection callbacks
86  * CM active and passive connection establishment
87  * Connection failover
88  */
89 
90 #define	SRCIP	src_addr.un.ip4addr
91 #define	DSTIP	dst_addr.un.ip4addr
92 
93 /*
94  * Handle an incoming CM REQ
95  */
96 /* ARGSUSED */
97 static ibt_cm_status_t
98 rds_handle_cm_req(rds_state_t *statep, ibt_cm_event_t *evp,
99     ibt_cm_return_args_t *rargsp, void *rcmp, ibt_priv_data_len_t rcmp_len)
100 {
101 	ibt_cm_req_rcv_t	*reqp;
102 	ib_gid_t		lgid, rgid;
103 	rds_cm_private_data_t	cmp;
104 	rds_session_t		*sp;
105 	rds_ep_t		*ep;
106 	ibt_channel_hdl_t	chanhdl;
107 	ibt_ip_cm_info_t	ipcm_info;
108 	int			ret;
109 
110 	RDS_DPRINTF2("rds_handle_cm_req", "Enter");
111 
112 	reqp = &evp->cm_event.req;
113 	rgid = reqp->req_prim_addr.av_dgid; /* requester gid */
114 	lgid = reqp->req_prim_addr.av_sgid; /* receiver gid */
115 
116 	RDS_DPRINTF2(LABEL, "REQ Received: From: %llx:%llx To: %llx:%llx",
117 	    rgid.gid_prefix, rgid.gid_guid, lgid.gid_prefix, lgid.gid_guid);
118 
119 	/* validate service id */
120 	if (reqp->req_service_id == RDS_SERVICE_ID) {
121 		RDS_DPRINTF2(LABEL, "Version Mismatch: Remote system "
122 		    "(GUID: 0x%llx) is running an older version of RDS",
123 		    rgid.gid_guid);
124 		return (IBT_CM_REJECT);
125 	}
126 
127 	/*
128 	 * CM private data brings IP information
129 	 * Private data received is a stream of bytes and may not be properly
130 	 * aligned. So, bcopy the data onto the stack before accessing it.
131 	 */
132 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
133 	    sizeof (rds_cm_private_data_t));
134 
135 	/* extract the CM IP info */
136 	ret = ibt_get_ip_data(evp->cm_priv_data_len, evp->cm_priv_data,
137 	    &ipcm_info);
138 	if (ret != IBT_SUCCESS) {
139 		RDS_DPRINTF2("rds_handle_cm_req", "ibt_get_ip_data failed: %d",
140 		    ret);
141 		return (IBT_CM_REJECT);
142 	}
143 
144 	RDS_DPRINTF2("rds_handle_cm_req",
145 	    "REQ Received: From IP: 0x%x To IP: 0x%x type: %d",
146 	    ipcm_info.SRCIP, ipcm_info.DSTIP, cmp.cmp_eptype);
147 
148 	if (cmp.cmp_version != RDS_VERSION) {
149 		RDS_DPRINTF2(LABEL, "Version Mismatch: Local version: %d "
150 		    "Remote version: %d", RDS_VERSION, cmp.cmp_version);
151 		return (IBT_CM_REJECT);
152 	}
153 
154 	/* RDS supports V4 addresses only */
155 	if ((ipcm_info.src_addr.family != AF_INET) ||
156 	    (ipcm_info.dst_addr.family != AF_INET)) {
157 		RDS_DPRINTF2(LABEL, "Unsupported Address Family: "
158 		    "src: %d dst: %d", ipcm_info.src_addr.family,
159 		    ipcm_info.dst_addr.family);
160 		return (IBT_CM_REJECT);
161 	}
162 
163 	if (cmp.cmp_arch != RDS_THIS_ARCH) {
164 		RDS_DPRINTF2(LABEL, "ARCH does not match (%d != %d)",
165 		    cmp.cmp_arch, RDS_THIS_ARCH);
166 		return (IBT_CM_REJECT);
167 	}
168 
169 	if ((cmp.cmp_eptype != RDS_EP_TYPE_CTRL) &&
170 	    (cmp.cmp_eptype != RDS_EP_TYPE_DATA)) {
171 		RDS_DPRINTF2(LABEL, "Unknown Channel type: %d", cmp.cmp_eptype);
172 		return (IBT_CM_REJECT);
173 	}
174 
175 	/* user_buffer_size should be same on all nodes */
176 	if (cmp.cmp_user_buffer_size != UserBufferSize) {
177 		RDS_DPRINTF2(LABEL,
178 		    "UserBufferSize Mismatch, this node: %d remote node: %d",
179 		    UserBufferSize, cmp.cmp_user_buffer_size);
180 		return (IBT_CM_REJECT);
181 	}
182 
183 	/*
184 	 * RDS needs more time to process a failover REQ so send an MRA.
185 	 * Otherwise, the remote may retry the REQ and fail the connection.
186 	 */
187 	if ((cmp.cmp_failover) && (cmp.cmp_eptype == RDS_EP_TYPE_DATA)) {
188 		RDS_DPRINTF2("rds_handle_cm_req", "Session Failover, send MRA");
189 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
190 		    10000000 /* 10 sec */, NULL, 0);
191 	}
192 
193 	/* Is there a session to the destination node? */
194 	rw_enter(&statep->rds_sessionlock, RW_READER);
195 	sp = rds_session_lkup(statep, ipcm_info.SRCIP, rgid.gid_guid);
196 	rw_exit(&statep->rds_sessionlock);
197 
198 	if (sp == NULL) {
199 		/*
200 		 * currently there is no session to the destination
201 		 * remote ip in the private data is the local ip and vice
202 		 * versa
203 		 */
204 		sp = rds_session_create(statep, ipcm_info.DSTIP,
205 		    ipcm_info.SRCIP, reqp, RDS_SESSION_PASSIVE);
206 		if (sp == NULL) {
207 			/* Check the list anyway. */
208 			rw_enter(&statep->rds_sessionlock, RW_READER);
209 			sp = rds_session_lkup(statep, ipcm_info.SRCIP,
210 			    rgid.gid_guid);
211 			rw_exit(&statep->rds_sessionlock);
212 			if (sp == NULL) {
213 				/*
214 				 * The only way this can fail is due to lack
215 				 * of kernel resources
216 				 */
217 				return (IBT_CM_REJECT);
218 			}
219 		}
220 	}
221 
222 	rw_enter(&sp->session_lock, RW_WRITER);
223 
224 	/* catch peer-to-peer case as soon as possible */
225 	if ((sp->session_state == RDS_SESSION_STATE_CREATED) ||
226 	    (sp->session_state == RDS_SESSION_STATE_INIT)) {
227 		/* Check possible peer-to-peer case here */
228 		if (sp->session_type != RDS_SESSION_PASSIVE) {
229 			RDS_DPRINTF2("rds_handle_cm_req",
230 			    "SP(%p) Peer-peer connection handling", sp);
231 			if (lgid.gid_guid > rgid.gid_guid) {
232 				/* this node is active so reject this request */
233 				rw_exit(&sp->session_lock);
234 				return (IBT_CM_REJECT);
235 			} else {
236 				/* this node is passive, change the session */
237 				sp->session_type = RDS_SESSION_PASSIVE;
238 				sp->session_lgid = lgid;
239 				sp->session_rgid = rgid;
240 			}
241 		}
242 	}
243 
244 	RDS_DPRINTF2(LABEL, "SP(%p) state: %d", sp, sp->session_state);
245 
246 	switch (sp->session_state) {
247 	case RDS_SESSION_STATE_CONNECTED:
248 		RDS_DPRINTF2(LABEL, "STALE Session Detected SP(%p)", sp);
249 		sp->session_state = RDS_SESSION_STATE_ERROR;
250 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
251 		    "RDS_SESSION_STATE_ERROR", sp);
252 
253 		/* FALLTHRU */
254 	case RDS_SESSION_STATE_ERROR:
255 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
256 		sp->session_type = RDS_SESSION_PASSIVE;
257 		rw_exit(&sp->session_lock);
258 
259 		/* Handling this will take some time, so send an MRA */
260 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
261 		    10000000 /* 10 sec */, NULL, 0);
262 
263 		/*
264 		 * Any pending completions don't get flushed until the channel
265 		 * is closed. So, passing 0 here will not wait for pending
266 		 * completions in rds_session_close before closing the channel
267 		 */
268 		rds_session_close(sp, IBT_NOCALLBACKS, 0);
269 
270 		/* move the session to init state */
271 		rw_enter(&sp->session_lock, RW_WRITER);
272 		ret = rds_session_reinit(sp, lgid);
273 		sp->session_myip = ipcm_info.DSTIP;
274 		sp->session_lgid = lgid;
275 		sp->session_rgid = rgid;
276 		if (ret != 0) {
277 			rds_session_fini(sp);
278 			sp->session_state = RDS_SESSION_STATE_FAILED;
279 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
280 			    "RDS_SESSION_STATE_FAILED", sp);
281 			rw_exit(&sp->session_lock);
282 			return (IBT_CM_REJECT);
283 		} else {
284 			sp->session_state = RDS_SESSION_STATE_INIT;
285 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
286 			    "RDS_SESSION_STATE_INIT", sp);
287 		}
288 
289 		if (cmp.cmp_eptype == RDS_EP_TYPE_CTRL) {
290 			ep = &sp->session_ctrlep;
291 		} else {
292 			ep = &sp->session_dataep;
293 		}
294 		break;
295 	case RDS_SESSION_STATE_CREATED:
296 	case RDS_SESSION_STATE_FAILED:
297 	case RDS_SESSION_STATE_FINI:
298 		/*
299 		 * Initialize both channels, we accept this connection
300 		 * only if both channels are initialized
301 		 */
302 		sp->session_type = RDS_SESSION_PASSIVE;
303 		sp->session_lgid = lgid;
304 		sp->session_rgid = rgid;
305 		sp->session_state = RDS_SESSION_STATE_CREATED;
306 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
307 		    "RDS_SESSION_STATE_CREATED", sp);
308 		ret = rds_session_init(sp);
309 		if (ret != 0) {
310 			/* Seems like there are not enough resources */
311 			sp->session_state = RDS_SESSION_STATE_FAILED;
312 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
313 			    "RDS_SESSION_STATE_FAILED", sp);
314 			rw_exit(&sp->session_lock);
315 			return (IBT_CM_REJECT);
316 		}
317 		sp->session_state = RDS_SESSION_STATE_INIT;
318 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
319 		    "RDS_SESSION_STATE_INIT", sp);
320 
321 		/* FALLTHRU */
322 	case RDS_SESSION_STATE_INIT:
323 		/*
324 		 * When re-using an existing session, make sure the
325 		 * session is still through the same HCA. Otherwise, the
326 		 * memory registrations have to moved to the new HCA.
327 		 */
328 		if (cmp.cmp_eptype == RDS_EP_TYPE_DATA) {
329 			if (sp->session_lgid.gid_guid != lgid.gid_guid) {
330 				RDS_DPRINTF2("rds_handle_cm_req",
331 				    "Existing Session but different gid "
332 				    "existing: 0x%llx, new: 0x%llx, "
333 				    "sending an MRA",
334 				    sp->session_lgid.gid_guid, lgid.gid_guid);
335 				(void) ibt_cm_delay(IBT_CM_DELAY_REQ,
336 				    evp->cm_session_id, 10000000 /* 10 sec */,
337 				    NULL, 0);
338 				ret = rds_session_reinit(sp, lgid);
339 				if (ret != 0) {
340 					rds_session_fini(sp);
341 					sp->session_state =
342 					    RDS_SESSION_STATE_FAILED;
343 					sp->session_failover = 0;
344 					RDS_DPRINTF3("rds_failover_session",
345 					    "SP(%p) State "
346 					    "RDS_SESSION_STATE_FAILED", sp);
347 					rw_exit(&sp->session_lock);
348 					return (IBT_CM_REJECT);
349 				}
350 			}
351 			ep = &sp->session_dataep;
352 		} else {
353 			ep = &sp->session_ctrlep;
354 		}
355 
356 		break;
357 	default:
358 		RDS_DPRINTF2(LABEL, "ERROR: SP(%p) is in an unexpected "
359 		    "state: %d", sp, sp->session_state);
360 		rw_exit(&sp->session_lock);
361 		return (IBT_CM_REJECT);
362 	}
363 
364 	sp->session_failover = 0; /* reset any previous value */
365 	if (cmp.cmp_failover) {
366 		RDS_DPRINTF2("rds_handle_cm_req",
367 		    "SP(%p) Failover Session (BP %p)", sp, cmp.cmp_last_bufid);
368 		sp->session_failover = 1;
369 	}
370 
371 	mutex_enter(&ep->ep_lock);
372 	if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
373 		ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
374 		sp->session_type = RDS_SESSION_PASSIVE;
375 		rw_exit(&sp->session_lock);
376 	} else if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
377 		rw_exit(&sp->session_lock);
378 		/*
379 		 * Peer to peer connection. There is an active
380 		 * connection pending on this ep. The one with
381 		 * greater port guid becomes active and the
382 		 * other becomes passive.
383 		 */
384 		RDS_DPRINTF2("rds_handle_cm_req",
385 		    "EP(%p) Peer-peer connection handling", ep);
386 		if (lgid.gid_guid > rgid.gid_guid) {
387 			/* this node is active so reject this request */
388 			mutex_exit(&ep->ep_lock);
389 			RDS_DPRINTF2(LABEL, "SP(%p) EP(%p): "
390 			    "Rejecting passive in favor of active", sp, ep);
391 			return (IBT_CM_REJECT);
392 		} else {
393 			/*
394 			 * This session is not the active end, change it
395 			 * to passive end.
396 			 */
397 			ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
398 
399 			rw_enter(&sp->session_lock, RW_WRITER);
400 			sp->session_type = RDS_SESSION_PASSIVE;
401 			sp->session_lgid = lgid;
402 			sp->session_rgid = rgid;
403 			rw_exit(&sp->session_lock);
404 		}
405 	} else {
406 		rw_exit(&sp->session_lock);
407 	}
408 
409 	ep->ep_lbufid = cmp.cmp_last_bufid;
410 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
411 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
412 	cmp.cmp_last_bufid = ep->ep_rbufid;
413 	cmp.cmp_ack_addr = ep->ep_ack_addr;
414 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
415 	mutex_exit(&ep->ep_lock);
416 
417 	/* continue with accepting the connection request for this channel */
418 	chanhdl = rds_ep_alloc_rc_channel(ep, reqp->req_prim_hca_port);
419 	if (chanhdl == NULL) {
420 		mutex_enter(&ep->ep_lock);
421 		ep->ep_state = RDS_EP_STATE_UNCONNECTED;
422 		mutex_exit(&ep->ep_lock);
423 		return (IBT_CM_REJECT);
424 	}
425 
426 	/* pre-post recv buffers in the RQ */
427 	rds_post_recv_buf((void *)chanhdl);
428 
429 	rargsp->cm_ret_len = sizeof (rds_cm_private_data_t);
430 	bcopy((uint8_t *)&cmp, rcmp, sizeof (rds_cm_private_data_t));
431 	rargsp->cm_ret.rep.cm_channel = chanhdl;
432 	rargsp->cm_ret.rep.cm_rdma_ra_out = 4;
433 	rargsp->cm_ret.rep.cm_rdma_ra_in = 4;
434 	rargsp->cm_ret.rep.cm_rnr_retry_cnt = MinRnrRetry;
435 
436 	RDS_DPRINTF2("rds_handle_cm_req", "Return: SP(%p) EP(%p) Chan (%p)",
437 	    sp, ep, chanhdl);
438 
439 	return (IBT_CM_ACCEPT);
440 }
441 
442 /*
443  * Handle an incoming CM REP
444  * Pre-post recv buffers for the QP
445  */
446 /* ARGSUSED */
447 static ibt_cm_status_t
448 rds_handle_cm_rep(ibt_cm_event_t *evp, ibt_cm_return_args_t *rargsp,
449     void *rcmp, ibt_priv_data_len_t rcmp_len)
450 {
451 	rds_ep_t	*ep;
452 	rds_cm_private_data_t	cmp;
453 
454 	RDS_DPRINTF2("rds_handle_cm_rep", "Enter");
455 
456 	/* pre-post recv buffers in the RQ */
457 	rds_post_recv_buf((void *)evp->cm_channel);
458 
459 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
460 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
461 	    sizeof (rds_cm_private_data_t));
462 	ep->ep_lbufid = cmp.cmp_last_bufid;
463 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
464 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
465 
466 	rargsp->cm_ret_len = 0;
467 
468 	RDS_DPRINTF2("rds_handle_cm_rep", "Return: lbufid: %p", ep->ep_lbufid);
469 
470 	return (IBT_CM_ACCEPT);
471 }
472 
473 /*
474  * Handle CONN EST
475  */
476 static ibt_cm_status_t
477 rds_handle_cm_conn_est(ibt_cm_event_t *evp)
478 {
479 	rds_session_t	*sp;
480 	rds_ep_t	*ep;
481 
482 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
483 
484 	RDS_DPRINTF2("rds_handle_cm_conn_est", "EP(%p) State: %d", ep,
485 	    ep->ep_state);
486 
487 	mutex_enter(&ep->ep_lock);
488 	ASSERT((ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) ||
489 	    (ep->ep_state == RDS_EP_STATE_PASSIVE_PENDING));
490 	ep->ep_state = RDS_EP_STATE_CONNECTED;
491 	ep->ep_chanhdl = evp->cm_channel;
492 	sp = ep->ep_sp;
493 	mutex_exit(&ep->ep_lock);
494 
495 	(void) rds_session_active(sp);
496 
497 	RDS_DPRINTF2("rds_handle_cm_conn_est", "Return");
498 	return (IBT_CM_ACCEPT);
499 }
500 
501 /*
502  * Handle CONN CLOSED
503  */
504 static ibt_cm_status_t
505 rds_handle_cm_conn_closed(ibt_cm_event_t *evp)
506 {
507 	rds_ep_t	*ep;
508 	rds_session_t	*sp;
509 
510 	/* Catch DREQs but ignore DREPs */
511 	if (evp->cm_event.closed != IBT_CM_CLOSED_DREQ_RCVD) {
512 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
513 		    "Ignoring Event: %d received", evp->cm_event.closed);
514 		return (IBT_CM_ACCEPT);
515 	}
516 
517 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
518 	sp = ep->ep_sp;
519 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "EP(%p) Chan(%p) Enter",
520 	    ep, evp->cm_channel);
521 
522 	mutex_enter(&ep->ep_lock);
523 	if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
524 		/* Ignore this DREQ */
525 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
526 		    "EP(%p) not connected, state: %d", ep, ep->ep_state);
527 		mutex_exit(&ep->ep_lock);
528 		return (IBT_CM_ACCEPT);
529 	}
530 	ep->ep_state = RDS_EP_STATE_CLOSING;
531 	mutex_exit(&ep->ep_lock);
532 
533 	rw_enter(&sp->session_lock, RW_WRITER);
534 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) - state: %d", sp,
535 	    sp->session_state);
536 
537 	switch (sp->session_state) {
538 	case RDS_SESSION_STATE_CONNECTED:
539 		sp->session_state = RDS_SESSION_STATE_PASSIVE_CLOSING;
540 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
541 		    "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
542 		break;
543 
544 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
545 		sp->session_state = RDS_SESSION_STATE_CLOSED;
546 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
547 		    "RDS_SESSION_STATE_CLOSED", sp);
548 		rds_passive_session_fini(sp);
549 		sp->session_state = RDS_SESSION_STATE_FINI;
550 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
551 		    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
552 		break;
553 
554 	case RDS_SESSION_STATE_ACTIVE_CLOSING:
555 	case RDS_SESSION_STATE_ERROR:
556 	case RDS_SESSION_STATE_CLOSED:
557 		break;
558 
559 	case RDS_SESSION_STATE_INIT:
560 		sp->session_state = RDS_SESSION_STATE_ERROR;
561 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
562 		    "RDS_SESSION_STATE_ERROR", sp);
563 		rds_passive_session_fini(sp);
564 		sp->session_state = RDS_SESSION_STATE_FAILED;
565 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
566 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
567 		break;
568 
569 	default:
570 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
571 		    "SP(%p) - Unexpected state: %d", sp, sp->session_state);
572 		rds_passive_session_fini(sp);
573 		sp->session_state = RDS_SESSION_STATE_FAILED;
574 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
575 		    "RDS_SESSION_STATE_FAILED", sp);
576 	}
577 	rw_exit(&sp->session_lock);
578 
579 	mutex_enter(&ep->ep_lock);
580 	ep->ep_state = RDS_EP_STATE_CLOSED;
581 	mutex_exit(&ep->ep_lock);
582 
583 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) Return", sp);
584 	return (IBT_CM_ACCEPT);
585 }
586 
587 /*
588  * Handle EVENT FAILURE
589  */
590 static ibt_cm_status_t
591 rds_handle_cm_event_failure(ibt_cm_event_t *evp)
592 {
593 	rds_ep_t	*ep;
594 	rds_session_t	*sp;
595 	int		ret;
596 
597 	RDS_DPRINTF2("rds_handle_cm_event_failure", "Enter: Chan hdl: 0x%p "
598 	    "Code: %d msg: %d reason: %d", evp->cm_channel,
599 	    evp->cm_event.failed.cf_code, evp->cm_event.failed.cf_msg,
600 	    evp->cm_event.failed.cf_reason);
601 
602 	if (evp->cm_event.failed.cf_reason == IBT_CM_INVALID_SID) {
603 		RDS_DPRINTF2(LABEL,
604 		    "Received REJ with reason IBT_CM_INVALID_SID: "
605 		    "RDS may not be loaded on the remote system");
606 	}
607 
608 	if (evp->cm_channel == NULL) {
609 		return (IBT_CM_ACCEPT);
610 	}
611 
612 	if ((evp->cm_event.failed.cf_code != IBT_CM_FAILURE_STALE) &&
613 	    (evp->cm_event.failed.cf_msg == IBT_CM_FAILURE_REQ)) {
614 		/*
615 		 * This end is active, just ignore, ibt_open_rc_channel()
616 		 * caller will take care of cleanup.
617 		 */
618 		RDS_DPRINTF2("rds_handle_cm_event_failure",
619 		    "Ignoring this event: Chan hdl: 0x%p", evp->cm_channel);
620 		return (IBT_CM_ACCEPT);
621 	}
622 
623 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
624 	sp = ep->ep_sp;
625 
626 	rw_enter(&sp->session_lock, RW_WRITER);
627 	if (sp->session_type == RDS_SESSION_PASSIVE) {
628 		RDS_DPRINTF2("rds_handle_cm_event_failure",
629 		    "SP(%p) - state: %d", sp, sp->session_state);
630 		if ((sp->session_state == RDS_SESSION_STATE_INIT) ||
631 		    (sp->session_state == RDS_SESSION_STATE_CONNECTED)) {
632 			sp->session_state = RDS_SESSION_STATE_ERROR;
633 			RDS_DPRINTF3("rds_handle_cm_event_failure",
634 			    "SP(%p) State RDS_SESSION_STATE_ERROR", sp);
635 
636 			/*
637 			 * Store the cm_channel for freeing later
638 			 * Active side frees it on ibt_open_rc_channel
639 			 * failure
640 			 */
641 			if (ep->ep_chanhdl == NULL) {
642 				ep->ep_chanhdl = evp->cm_channel;
643 			}
644 			rw_exit(&sp->session_lock);
645 
646 			/*
647 			 * rds_passive_session_fini should not be called
648 			 * directly in the CM handler. It will cause a deadlock.
649 			 */
650 			ret = ddi_taskq_dispatch(rds_taskq,
651 			    rds_cleanup_passive_session, (void *)sp,
652 			    DDI_NOSLEEP);
653 			if (ret != DDI_SUCCESS) {
654 				RDS_DPRINTF2("rds_handle_cm_event_failure",
655 				    "SP(%p) TaskQ dispatch FAILED:%d", sp, ret);
656 			}
657 			return (IBT_CM_ACCEPT);
658 		}
659 	}
660 	rw_exit(&sp->session_lock);
661 
662 	RDS_DPRINTF2("rds_handle_cm_event_failure", "SP(%p) Return", sp);
663 	return (IBT_CM_ACCEPT);
664 }
665 
666 /*
667  * CM Handler
668  *
669  * Called by IBCM
670  * The cm_private type differs for active and passive events.
671  */
672 ibt_cm_status_t
673 rds_cm_handler(void *cm_private, ibt_cm_event_t *eventp,
674     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
675     ibt_priv_data_len_t ret_len_max)
676 {
677 	ibt_cm_status_t		ret = IBT_CM_ACCEPT;
678 
679 	RDS_DPRINTF2("rds_cm_handler", "Enter: event: %d", eventp->cm_type);
680 
681 	switch (eventp->cm_type) {
682 	case IBT_CM_EVENT_REQ_RCV:
683 		ret = rds_handle_cm_req((rds_state_t *)cm_private, eventp,
684 		    ret_args, ret_priv_data, ret_len_max);
685 		break;
686 	case IBT_CM_EVENT_REP_RCV:
687 		ret = rds_handle_cm_rep(eventp, ret_args, ret_priv_data,
688 		    ret_len_max);
689 		break;
690 	case IBT_CM_EVENT_MRA_RCV:
691 		/* Not supported */
692 		break;
693 	case IBT_CM_EVENT_CONN_EST:
694 		ret = rds_handle_cm_conn_est(eventp);
695 		break;
696 	case IBT_CM_EVENT_CONN_CLOSED:
697 		ret = rds_handle_cm_conn_closed(eventp);
698 		break;
699 	case IBT_CM_EVENT_FAILURE:
700 		ret = rds_handle_cm_event_failure(eventp);
701 		break;
702 	case IBT_CM_EVENT_LAP_RCV:
703 		/* Not supported */
704 		RDS_DPRINTF2(LABEL, "LAP message received");
705 		break;
706 	case IBT_CM_EVENT_APR_RCV:
707 		/* Not supported */
708 		RDS_DPRINTF2(LABEL, "APR message received");
709 		break;
710 	default:
711 		break;
712 	}
713 
714 	RDS_DPRINTF2("rds_cm_handler", "Return");
715 
716 	return (ret);
717 }
718 
719 /* This is based on OFED Linux RDS */
720 #define	RDS_PORT_NUM	6556
721 
722 /*
723  * Register the wellknown service with service id: RDS_SERVICE_ID
724  * Incoming connection requests should arrive on this service id.
725  */
726 ibt_srv_hdl_t
727 rds_register_service(ibt_clnt_hdl_t rds_ibhdl)
728 {
729 	ibt_srv_hdl_t	srvhdl;
730 	ibt_srv_desc_t	srvdesc;
731 	int		ret;
732 
733 	RDS_DPRINTF2("rds_register_service", "Enter: 0x%p", rds_ibhdl);
734 
735 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
736 	srvdesc.sd_handler = rds_cm_handler;
737 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
738 
739 	/*
740 	 * Register the old service id for backward compatibility
741 	 * REQs received on this service id would be rejected
742 	 */
743 	ret = ibt_register_service(rds_ibhdl, &srvdesc, RDS_SERVICE_ID,
744 	    1, &rdsib_statep->rds_old_srvhdl, NULL);
745 	if (ret != IBT_SUCCESS) {
746 		RDS_DPRINTF2(LABEL,
747 		    "RDS Service (0x%llx) Registration Failed: %d",
748 		    RDS_SERVICE_ID, ret);
749 		return (NULL);
750 	}
751 
752 	/*
753 	 * This is the new service id as per:
754 	 * Annex A11: RDMA IP CM Service
755 	 */
756 	rdsib_statep->rds_service_id = ibt_get_ip_sid(IPPROTO_TCP,
757 	    RDS_PORT_NUM);
758 	ret = ibt_register_service(rds_ibhdl, &srvdesc,
759 	    rdsib_statep->rds_service_id, 1, &srvhdl, NULL);
760 	if (ret != IBT_SUCCESS) {
761 		RDS_DPRINTF2(LABEL,
762 		    "RDS Service (0x%llx) Registration Failed: %d",
763 		    rdsib_statep->rds_service_id, ret);
764 		return (NULL);
765 	}
766 
767 	RDS_DPRINTF2("rds_register_service", "Return: 0x%p", srvhdl);
768 	return (srvhdl);
769 }
770 
771 /* Bind the RDS service on all ports */
772 int
773 rds_bind_service(rds_state_t *statep)
774 {
775 	rds_hca_t	*hcap;
776 	ib_gid_t	gid;
777 	uint_t		jx, nbinds = 0, nports = 0;
778 	int		ret;
779 
780 	RDS_DPRINTF2("rds_bind_service", "Enter: 0x%p", statep);
781 
782 	hcap = statep->rds_hcalistp;
783 	while (hcap != NULL) {
784 		for (jx = 0; jx < hcap->hca_nports; jx++) {
785 			nports++;
786 			if (hcap->hca_pinfop[jx].p_linkstate !=
787 			    IBT_PORT_ACTIVE) {
788 				/*
789 				 * service bind will be called in the async
790 				 * handler when the port comes up
791 				 */
792 				continue;
793 			}
794 
795 			gid = hcap->hca_pinfop[jx].p_sgid_tbl[0];
796 			RDS_DPRINTF5(LABEL, "HCA: 0x%llx Port: %d "
797 			    "gid: %llx:%llx", hcap->hca_guid,
798 			    hcap->hca_pinfop[jx].p_port_num, gid.gid_prefix,
799 			    gid.gid_guid);
800 
801 			/* pass statep as cm_private */
802 			ret = ibt_bind_service(statep->rds_srvhdl, gid,
803 			    NULL, statep, NULL);
804 			if (ret != IBT_SUCCESS) {
805 				RDS_DPRINTF2(LABEL, "Bind service for "
806 				    "HCA: 0x%llx Port: %d gid %llx:%llx "
807 				    "failed: %d", hcap->hca_guid,
808 				    hcap->hca_pinfop[jx].p_port_num,
809 				    gid.gid_prefix, gid.gid_guid, ret);
810 				continue;
811 			}
812 
813 			nbinds++;
814 
815 			/* bind the old service, ignore if it fails */
816 			ret = ibt_bind_service(statep->rds_old_srvhdl, gid,
817 			    NULL, statep, NULL);
818 			if (ret != IBT_SUCCESS) {
819 				RDS_DPRINTF2(LABEL, "Bind service for "
820 				    "HCA: 0x%llx Port: %d gid %llx:%llx "
821 				    "failed: %d", hcap->hca_guid,
822 				    hcap->hca_pinfop[jx].p_port_num,
823 				    gid.gid_prefix, gid.gid_guid, ret);
824 			}
825 		}
826 		hcap = hcap->hca_nextp;
827 	}
828 
829 	RDS_DPRINTF2(LABEL, "RDS Service available on %d/%d ports",
830 	    nbinds, nports);
831 
832 #if 0
833 	if (nbinds == 0) {
834 		return (-1);
835 	}
836 #endif
837 
838 	RDS_DPRINTF2("rds_bind_service", "Return");
839 
840 	return (0);
841 }
842 
843 /* Open an RC connection */
844 int
845 rds_open_rc_channel(rds_ep_t *ep, ibt_path_info_t *pinfo,
846     ibt_execution_mode_t mode, ibt_channel_hdl_t *chanhdl)
847 {
848 	rds_session_t		*sp;
849 	ibt_chan_open_args_t	ocargs;
850 	ibt_rc_returns_t	ocrets;
851 	rds_cm_private_data_t	cmp;
852 	uint8_t			hca_port;
853 	ibt_channel_hdl_t	hdl;
854 	ibt_status_t		ret = 0;
855 	ibt_ip_cm_info_t	ipcm_info;
856 
857 	RDS_DPRINTF2("rds_open_rc_channel", "Enter: EP(%p) mode: %d", ep, mode);
858 
859 	sp = ep->ep_sp;
860 
861 	bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
862 	ipcm_info.src_addr.family = AF_INET;
863 	ipcm_info.SRCIP = sp->session_myip;
864 	ipcm_info.dst_addr.family = AF_INET;
865 	ipcm_info.DSTIP = sp->session_remip;
866 	ipcm_info.src_port = RDS_PORT_NUM;
867 	ret = ibt_format_ip_private_data(&ipcm_info,
868 	    sizeof (rds_cm_private_data_t), &cmp);
869 	if (ret != IBT_SUCCESS) {
870 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_format_ip_private_data "
871 		    "failed: %d", sp, ep, ret);
872 		return (-1);
873 	}
874 
875 	hca_port = pinfo->pi_prim_cep_path.cep_hca_port_num;
876 
877 	hdl = rds_ep_alloc_rc_channel(ep, hca_port);
878 	if (hdl == NULL) {
879 		return (-1);
880 	}
881 
882 	cmp.cmp_version = RDS_VERSION;
883 	cmp.cmp_arch = RDS_THIS_ARCH;
884 	cmp.cmp_eptype = ep->ep_type;
885 	cmp.cmp_failover = sp->session_failover;
886 	cmp.cmp_last_bufid = ep->ep_rbufid;
887 	cmp.cmp_user_buffer_size = UserBufferSize;
888 	cmp.cmp_ack_addr = ep->ep_ack_addr;
889 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
890 
891 	bzero(&ocargs, sizeof (ibt_chan_open_args_t));
892 	bzero(&ocrets, sizeof (ibt_rc_returns_t));
893 	ocargs.oc_path = pinfo;
894 	ocargs.oc_cm_handler = rds_cm_handler;
895 	ocargs.oc_cm_clnt_private = NULL;
896 	ocargs.oc_rdma_ra_out = 4;
897 	ocargs.oc_rdma_ra_in = 4;
898 	ocargs.oc_priv_data_len = sizeof (rds_cm_private_data_t);
899 	ocargs.oc_priv_data = &cmp;
900 	ocargs.oc_path_retry_cnt = IBPathRetryCount;
901 	ocargs.oc_path_rnr_retry_cnt = MinRnrRetry;
902 	ret = ibt_open_rc_channel(hdl, IBT_OCHAN_NO_FLAGS,
903 	    mode, &ocargs, &ocrets);
904 	if (ret != IBT_SUCCESS) {
905 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_open_rc_channel "
906 		    "failed: %d", sp, ep, ret);
907 		(void) ibt_flush_channel(hdl);
908 		(void) ibt_free_channel(hdl);
909 
910 		mutex_enter(&ep->ep_lock);
911 		/* don't cleanup if this failure is due to peer-peer race */
912 		if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
913 			/* cleanup stuff allocated in rds_ep_alloc_rc_channel */
914 			ep->ep_state = RDS_EP_STATE_ERROR;
915 			rds_ep_free_rc_channel(ep);
916 		}
917 		mutex_exit(&ep->ep_lock);
918 
919 		return (-1);
920 	}
921 
922 	*chanhdl = hdl;
923 
924 	RDS_DPRINTF2("rds_open_rc_channel", "Return: EP(%p) Chan: %p", ep,
925 	    *chanhdl);
926 
927 	return (0);
928 }
929 
930 int
931 rds_close_rc_channel(ibt_channel_hdl_t chanhdl, ibt_execution_mode_t mode)
932 {
933 	int	ret;
934 
935 	RDS_DPRINTF2("rds_close_rc_channel", "Enter: Chan(%p) Mode(%d)",
936 	    chanhdl, mode);
937 
938 	ret = ibt_close_rc_channel(chanhdl, mode, NULL, 0, NULL, NULL, 0);
939 
940 	RDS_DPRINTF2("rds_close_rc_channel", "Return Chan(%p)", chanhdl);
941 
942 	return (ret);
943 }
944